[docx] split commit for file 4200

Signed-off-by: Ari Archer <ari.web.xyz@gmail.com>
This commit is contained in:
Ari Archer 2024-01-23 19:15:15 +02:00
parent 47a8228a09
commit 8948d714f6
No known key found for this signature in database
GPG Key ID: A50D5B4B599AF8A2
400 changed files with 0 additions and 31949 deletions

View File

@@ -1,279 +0,0 @@
package com.twitter.search.earlybird.archive;
import java.io.IOException;
import java.util.Date;
import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;
import org.apache.commons.lang.time.FastDateFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.common.util.Clock;
import com.twitter.search.common.metrics.SearchRateCounter;
import com.twitter.search.common.metrics.SearchStatsReceiver;
import com.twitter.search.common.metrics.SearchStatsReceiverImpl;
import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent;
import com.twitter.search.common.util.io.recordreader.RecordReader;
import com.twitter.search.common.util.zktrylock.ZooKeeperTryLockFactory;
import com.twitter.search.earlybird.EarlybirdIndexConfig;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
import com.twitter.search.earlybird.document.DocumentFactory;
import com.twitter.search.earlybird.document.TweetDocument;
import com.twitter.search.earlybird.exception.CriticalExceptionHandler;
import com.twitter.search.earlybird.index.EarlybirdSegmentFactory;
import com.twitter.search.earlybird.partition.SearchIndexingMetricSet;
import com.twitter.search.earlybird.partition.SegmentHdfsFlusher;
import com.twitter.search.earlybird.partition.SegmentInfo;
import com.twitter.search.earlybird.partition.SegmentLoader;
import com.twitter.search.earlybird.partition.SegmentOptimizer;
import com.twitter.search.earlybird.partition.SegmentSyncConfig;
import com.twitter.search.earlybird.partition.SimpleSegmentIndexer;
import com.twitter.search.earlybird.stats.EarlybirdSearcherStats;
/**
* Given a segment, this class checks whether the segment has an index built on HDFS:
* if not, it uses SimpleSegmentIndexer to build an index;
* if yes, it loads the HDFS index, builds a new index for the new status data whose dates are
* newer than the HDFS index, and then appends the loaded HDFS index to it.
*/
public class ArchiveSegmentUpdater {
private static final Logger LOG = LoggerFactory.getLogger(ArchiveSegmentUpdater.class);
private final SegmentSyncConfig sync;
private final EarlybirdIndexConfig earlybirdIndexConfig;
private final ZooKeeperTryLockFactory zkTryLockFactory;
private final SearchStatsReceiver statsReceiver = new SearchStatsReceiverImpl();
private final SearchIndexingMetricSet searchIndexingMetricSet =
new SearchIndexingMetricSet(statsReceiver);
private final EarlybirdSearcherStats searcherStats =
new EarlybirdSearcherStats(statsReceiver);
private final SearchRateCounter indexNewSegment =
new SearchRateCounter("index_new_segment");
private final SearchRateCounter updateExistingSegment =
new SearchRateCounter("update_existing_segment");
private final SearchRateCounter skipExistingSegment =
new SearchRateCounter("skip_existing_segment");
private Clock clock;
public ArchiveSegmentUpdater(ZooKeeperTryLockFactory zooKeeperTryLockFactory,
SegmentSyncConfig sync,
EarlybirdIndexConfig earlybirdIndexConfig,
Clock clock) {
this.sync = sync;
this.earlybirdIndexConfig = earlybirdIndexConfig;
this.zkTryLockFactory = zooKeeperTryLockFactory;
this.clock = clock;
}
private boolean canUpdateSegment(SegmentInfo segmentInfo) {
if (!(segmentInfo.getSegment() instanceof ArchiveSegment)) {
LOG.info("only ArchiveSegment is available for updating now: "
+ segmentInfo);
return false;
}
if (!segmentInfo.isEnabled()) {
LOG.debug("Segment is disabled: " + segmentInfo);
return false;
}
if (segmentInfo.isComplete() || segmentInfo.isIndexing()
|| segmentInfo.getSyncInfo().isLoaded()) {
LOG.debug("Cannot update already indexed segment: " + segmentInfo);
return false;
}
return true;
}
/**
* Given a segment, checks whether the segment has an index built on HDFS:
* if not, uses SimpleSegmentIndexer to build an index;
* if yes, loads the HDFS index, builds a new index for the new status data whose dates are
* newer than the HDFS index, and then appends the loaded HDFS index to it.
*
* Returns whether the segment was successfully updated.
*/
public boolean updateSegment(SegmentInfo segmentInfo) {
Preconditions.checkArgument(segmentInfo.getSegment() instanceof ArchiveSegment);
if (!canUpdateSegment(segmentInfo)) {
return false;
}
if (segmentInfo.isIndexing()) {
LOG.error("Segment is already being indexed: " + segmentInfo);
return false;
}
final Date hdfsEndDate = ArchiveHDFSUtils.getSegmentEndDateOnHdfs(sync, segmentInfo);
if (hdfsEndDate == null) {
indexNewSegment.increment();
if (!indexSegment(segmentInfo, ArchiveSegment.MATCH_ALL_DATE_PREDICATE)) {
return false;
}
} else {
final Date curEndDate = ((ArchiveSegment) segmentInfo.getSegment()).getDataEndDate();
if (!hdfsEndDate.before(curEndDate)) {
skipExistingSegment.increment();
LOG.info("Segment is up-to-date: " + segmentInfo.getSegment().getTimeSliceID()
+ " Found flushed segment on HDFS with end date: "
+ FastDateFormat.getInstance("yyyyMMdd").format(hdfsEndDate));
segmentInfo.setComplete(true);
segmentInfo.getSyncInfo().setFlushed(true);
return true;
}
updateExistingSegment.increment();
LOG.info("Updating segment: " + segmentInfo.getSegment().getTimeSliceID()
+ "; new endDate will be " + FastDateFormat.getInstance("yyyyMMdd").format(curEndDate));
if (!updateSegment(segmentInfo, hdfsEndDate)) {
return false;
}
}
boolean success = SegmentOptimizer.optimize(segmentInfo);
if (!success) {
// Clean up the segment dir on local disk
segmentInfo.deleteLocalIndexedSegmentDirectoryImmediately();
LOG.info("Error optimizing segment: " + segmentInfo);
return false;
}
// Verify segment before uploading.
success = ArchiveSegmentVerifier.verifySegment(segmentInfo);
if (!success) {
segmentInfo.deleteLocalIndexedSegmentDirectoryImmediately();
LOG.info("Segment not uploaded to HDFS because it did not pass verification: " + segmentInfo);
return false;
}
// upload the index to HDFS
success = new SegmentHdfsFlusher(zkTryLockFactory, sync, false)
.flushSegmentToDiskAndHDFS(segmentInfo);
if (success) {
ArchiveHDFSUtils.deleteHdfsSegmentDir(sync, segmentInfo, false, true);
} else {
// Clean up the segment dir on hdfs
ArchiveHDFSUtils.deleteHdfsSegmentDir(sync, segmentInfo, true, false);
LOG.info("Error uploading segment to HDFS: " + segmentInfo);
}
segmentInfo.deleteLocalIndexedSegmentDirectoryImmediately();
return success;
}
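// ---- Editor's sketch, not part of the original file: FastDateFormat, used in the log
// ---- messages above, is Commons Lang's thread-safe replacement for SimpleDateFormat,
// ---- so a single "yyyyMMdd" instance can be shared freely across threads.
private static String formatDaySketch(Date day) {
return FastDateFormat.getInstance("yyyyMMdd").format(day); // e.g. "20240123"
}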
/**
* Build index for the given segmentInfo. Only those statuses passing the dateFilter are indexed.
*/
private boolean indexSegment(final SegmentInfo segmentInfo, Predicate<Date> dateFilter) {
Preconditions.checkArgument(segmentInfo.getSegment() instanceof ArchiveSegment);
RecordReader<TweetDocument> documentReader = null;
try {
ArchiveSegment archiveSegment = (ArchiveSegment) segmentInfo.getSegment();
DocumentFactory<ThriftIndexingEvent> documentFactory =
earlybirdIndexConfig.createDocumentFactory();
documentReader = archiveSegment.getStatusRecordReader(documentFactory, dateFilter);
// Read and index the statuses
boolean success = new SimpleSegmentIndexer(documentReader, searchIndexingMetricSet)
.indexSegment(segmentInfo);
if (!success) {
// Clean up segment dir on local disk
segmentInfo.deleteLocalIndexedSegmentDirectoryImmediately();
LOG.info("Error indexing segment: " + segmentInfo);
}
return success;
} catch (IOException e) {
segmentInfo.deleteLocalIndexedSegmentDirectoryImmediately();
LOG.info("Exception while indexing segment: " + segmentInfo, e);
return false;
} finally {
if (documentReader != null) {
documentReader.stop();
}
}
}
/**
* Load the index built on HDFS for the given segmentInfo, index the new data and append the
* HDFS index to the new indexed segment
*/
private boolean updateSegment(final SegmentInfo segmentInfo, final Date hdfsEndDate) {
SegmentInfo hdfsSegmentInfo = loadSegmentFromHdfs(segmentInfo, hdfsEndDate);
if (hdfsSegmentInfo == null) {
return indexSegment(segmentInfo, ArchiveSegment.MATCH_ALL_DATE_PREDICATE);
}
boolean success = indexSegment(segmentInfo, input -> {
// we're updating the segment - only index days after the old end date,
// and we're sure that the previous days have already been indexed.
return input.after(hdfsEndDate);
});
if (!success) {
LOG.error("Error indexing new data: " + segmentInfo);
return indexSegment(segmentInfo, ArchiveSegment.MATCH_ALL_DATE_PREDICATE);
}
// Now, append the index loaded from hdfs
try {
segmentInfo.getIndexSegment().append(hdfsSegmentInfo.getIndexSegment());
hdfsSegmentInfo.deleteLocalIndexedSegmentDirectoryImmediately();
LOG.info("Deleted local segment directories with end date " + hdfsEndDate + " : "
+ segmentInfo);
} catch (IOException e) {
LOG.warn("Caught IOException while appending segment " + hdfsSegmentInfo.getSegmentName(), e);
hdfsSegmentInfo.deleteLocalIndexedSegmentDirectoryImmediately();
segmentInfo.deleteLocalIndexedSegmentDirectoryImmediately();
return false;
}
segmentInfo.setComplete(true);
return true;
}
/**
* Load the index built on HDFS for the given segmentInfo and end date
*/
private SegmentInfo loadSegmentFromHdfs(final SegmentInfo segmentInfo, final Date hdfsEndDate) {
Preconditions.checkArgument(segmentInfo.getSegment() instanceof ArchiveSegment);
ArchiveSegment segment = new ArchiveSegment(
segmentInfo.getTimeSliceID(),
EarlybirdConfig.getMaxSegmentSize(),
segmentInfo.getNumPartitions(),
segmentInfo.getSegment().getHashPartitionID(),
hdfsEndDate);
EarlybirdSegmentFactory factory = new EarlybirdSegmentFactory(
earlybirdIndexConfig,
searchIndexingMetricSet,
searcherStats,
clock);
SegmentInfo hdfsSegmentInfo;
try {
hdfsSegmentInfo = new SegmentInfo(segment, factory, sync);
CriticalExceptionHandler criticalExceptionHandler =
new CriticalExceptionHandler();
boolean success = new SegmentLoader(sync, criticalExceptionHandler)
.load(hdfsSegmentInfo);
if (!success) {
// If not successful, segmentLoader has already cleaned up the local dir.
LOG.info("Error loading hdfs segment " + hdfsSegmentInfo
+ ", building segment from scratch.");
hdfsSegmentInfo = null;
}
} catch (IOException e) {
LOG.error("Exception while loading segment from hdfs: " + segmentInfo, e);
hdfsSegmentInfo = null;
}
return hdfsSegmentInfo;
}
}

View File

@@ -1,75 +0,0 @@
package com.twitter.search.earlybird.archive;
import java.io.IOException;
import java.util.List;
import com.google.common.annotations.VisibleForTesting;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.store.Directory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.search.earlybird.partition.SegmentInfo;
public final class ArchiveSegmentVerifier {
private static final Logger LOG = LoggerFactory.getLogger(ArchiveSegmentVerifier.class);
private ArchiveSegmentVerifier() {
}
@VisibleForTesting
static boolean shouldVerifySegment(SegmentInfo segmentInfo) {
if (segmentInfo.isIndexing()) {
LOG.warn("ArchiveSegmentVerifier got segment still indexing.");
return false;
}
if (!segmentInfo.isComplete()) {
LOG.warn("ArchiveSegmentVerifyer got incomplete segment.");
return false;
}
if (!segmentInfo.isOptimized()) {
LOG.warn("ArchiveSegmentVerifyer got unoptimized segment.");
return false;
}
return true;
}
/**
* Verifies an archive segment has a sane number of leaves.
*/
public static boolean verifySegment(SegmentInfo segmentInfo) {
if (!shouldVerifySegment(segmentInfo)) {
return false;
}
Directory directory = segmentInfo.getIndexSegment().getLuceneDirectory();
return verifyLuceneIndex(directory);
}
private static boolean verifyLuceneIndex(Directory directory) {
try {
DirectoryReader indexerReader = DirectoryReader.open(directory);
List<LeafReaderContext> leaves = indexerReader.getContext().leaves();
if (leaves.size() != 1) {
LOG.warn("Lucene index does not have exactly one segment: " + leaves.size() + " != 1. "
+ "Lucene segments should have been merged during optimization.");
return false;
}
LeafReader reader = leaves.get(0).reader();
if (reader.numDocs() <= 0) {
LOG.warn("Lucene index has no document: " + reader);
return false;
}
return true;
} catch (IOException e) {
LOG.warn("Found bad lucene index at: " + directory);
return false;
}
}
}
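// ---- Editor's sketch, not part of the original file: the "exactly one leaf" invariant
// ---- checked above, demonstrated against a tiny freshly built index. It assumes only
// ---- stock Lucene APIs (ByteBuffersDirectory needs Lucene 8.x or newer); an optimized
// ---- Earlybird segment is force-merged, so a single leaf with documents is expected.
final class LeafCheckSketch {
public static void main(String[] args) throws IOException {
Directory dir = new org.apache.lucene.store.ByteBuffersDirectory();
try (org.apache.lucene.index.IndexWriter writer = new org.apache.lucene.index.IndexWriter(
dir, new org.apache.lucene.index.IndexWriterConfig())) {
org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();
doc.add(new org.apache.lucene.document.StringField(
"id", "1", org.apache.lucene.document.Field.Store.NO));
writer.addDocument(doc);
writer.forceMerge(1); // mirrors segment optimization: merge down to one Lucene segment
}
try (DirectoryReader reader = DirectoryReader.open(dir)) {
List<LeafReaderContext> leaves = reader.getContext().leaves();
System.out.println(leaves.size() == 1 && leaves.get(0).reader().numDocs() > 0
? "verified" : "unexpected index shape");
}
}
}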

View File

@@ -1,322 +0,0 @@
package com.twitter.search.earlybird.archive;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.List;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;
import com.google.common.collect.Lists;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent;
import com.twitter.search.common.util.io.MergingSortedRecordReader;
import com.twitter.search.common.util.io.recordreader.RecordReader;
import com.twitter.search.earlybird.config.TierConfig;
import com.twitter.search.earlybird.document.DocumentFactory;
import com.twitter.search.earlybird.document.ThriftIndexingEventDocumentFactory;
import com.twitter.search.earlybird.document.TweetDocument;
/**
* Responsible for taking a number of daily status batches and partitioning them into time slices
* which will be used to build segments.
*
* We try to put at most N tweets into a time slice.
*/
public class ArchiveTimeSlicer {
private static final Logger LOG = LoggerFactory.getLogger(ArchiveTimeSlicer.class);
private static final Comparator<TweetDocument> ASCENDING =
(o1, o2) -> Long.compare(o1.getTweetID(), o2.getTweetID());
private static final Comparator<TweetDocument> DESCENDING =
(o1, o2) -> Long.compare(o2.getTweetID(), o1.getTweetID());
// Represents a number of daily batches which will go into a segment.
public static final class ArchiveTimeSlice {
private Date startDate;
private Date endDate;
private int statusCount;
private final DailyStatusBatches directory;
private final ArchiveEarlybirdIndexConfig earlybirdIndexConfig;
// This list is always ordered from oldest day, to the newest day.
// For the on-disk archive, we reverse the days in getTweetReaders().
private final List<DailyStatusBatch> batches = Lists.newArrayList();
private ArchiveTimeSlice(DailyStatusBatches directory,
ArchiveEarlybirdIndexConfig earlybirdIndexConfig) {
this.directory = directory;
this.earlybirdIndexConfig = earlybirdIndexConfig;
}
public Date getEndDate() {
return endDate;
}
public int getStatusCount() {
return statusCount;
}
public int getNumHashPartitions() {
return batches.isEmpty() ? 0 : batches.get(0).getNumHashPartitions();
}
/**
* Returns a reader for reading tweets from this timeslice.
*
* @param archiveSegment The segment to which the timeslice belongs.
* @param documentFactory The ThriftIndexingEvent to TweetDocument converter.
* @param filter A filter that determines what dates should be read.
*/
public RecordReader<TweetDocument> getStatusReader(
ArchiveSegment archiveSegment,
DocumentFactory<ThriftIndexingEvent> documentFactory,
Predicate<Date> filter) throws IOException {
// We no longer support ThriftStatus based document factories.
Preconditions.checkState(documentFactory instanceof ThriftIndexingEventDocumentFactory);
final int hashPartitionID = archiveSegment.getHashPartitionID();
List<RecordReader<TweetDocument>> readers = new ArrayList<>(batches.size());
List<DailyStatusBatch> orderedForReading = orderBatchesForReading(batches);
LOG.info("Creating new status reader for hashPartition: "
+ hashPartitionID + " timeslice: " + getDescription());
for (DailyStatusBatch batch : orderedForReading) {
if (filter.apply(batch.getDate())) {
LOG.info("Adding reader for " + batch.getDate() + " " + getDescription());
PartitionedBatch partitionedBatch = batch.getPartition(hashPartitionID);
// Don't even try to create a reader if the partition is empty.
// There does not seem to be any problem in production now, but HDFS FileSystem's javadoc
// does indicate that listStatus() is allowed to throw a FileNotFoundException if the
// partition does not exist. This check makes the code more robust against future
// HDFS FileSystem implementation changes.
if (partitionedBatch.getStatusCount() > 0) {
RecordReader<TweetDocument> tweetReaders = partitionedBatch.getTweetReaders(
archiveSegment,
directory.getStatusPathToUseForDay(batch.getDate()),
documentFactory);
readers.add(tweetReaders);
}
} else {
LOG.info("Filtered reader for " + batch.getDate() + " " + getDescription());
}
}
LOG.info("Creating reader for timeslice: " + getDescription()
+ " with " + readers.size() + " readers");
return new MergingSortedRecordReader<TweetDocument>(getMergingComparator(), readers);
}
private List<DailyStatusBatch> orderBatchesForReading(List<DailyStatusBatch> orderedBatches) {
// For index formats using stock Lucene, we want the most recent days to be indexed first.
// In the Twitter in-memory optimized indexes, older tweets are added first, and
// optimization reverses the documents so that the most recent tweets come first.
return this.earlybirdIndexConfig.isUsingLIFODocumentOrdering()
? orderedBatches : Lists.reverse(orderedBatches);
}
private Comparator<TweetDocument> getMergingComparator() {
// We always want to retrieve larger tweet ids first.
// LIFO means that the smaller ids get inserted first --> ASCENDING order.
// FIFO would mean that we want to first insert the larger ids --> DESCENDING order.
return this.earlybirdIndexConfig.isUsingLIFODocumentOrdering()
? ASCENDING : DESCENDING;
}
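// ---- Editor's sketch, not part of the original file: the comparator contract above,
// ---- shown with plain tweet IDs. LIFO indexing inserts smaller IDs first (ASCENDING);
// ---- FIFO inserts larger IDs first (DESCENDING).
private static void orderingSketch() {
java.util.List<Long> ids = java.util.Arrays.asList(30L, 10L, 20L);
ids.sort(java.util.Comparator.naturalOrder()); // ASCENDING: [10, 20, 30], LIFO insertion order
LOG.info("LIFO insertion order: {}", ids);
ids.sort(java.util.Comparator.reverseOrder()); // DESCENDING: [30, 20, 10], FIFO insertion order
LOG.info("FIFO insertion order: {}", ids);
}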
/**
* Returns the smallest indexed tweet ID in this timeslice for the given partition.
*
* @param hashPartitionID The partition.
*/
public long getMinStatusID(int hashPartitionID) {
if (batches.isEmpty()) {
return 0;
}
for (int i = 0; i < batches.size(); i++) {
long minStatusID = batches.get(i).getPartition(hashPartitionID).getMinStatusID();
if (minStatusID != DailyStatusBatch.EMPTY_BATCH_STATUS_ID) {
return minStatusID;
}
}
return 0;
}
/**
* Returns the highest indexed tweet ID in this timeslice for the given partition.
*
* @param hashPartitionID The partition.
*/
public long getMaxStatusID(int hashPartitionID) {
if (batches.isEmpty()) {
return Long.MAX_VALUE;
}
for (int i = batches.size() - 1; i >= 0; i--) {
long maxStatusID = batches.get(i).getPartition(hashPartitionID).getMaxStatusID();
if (maxStatusID != DailyStatusBatch.EMPTY_BATCH_STATUS_ID) {
return maxStatusID;
}
}
return Long.MAX_VALUE;
}
/**
* Returns a string with some information for this timeslice.
*/
public String getDescription() {
StringBuilder builder = new StringBuilder();
builder.append("TimeSlice[start date=");
builder.append(DailyStatusBatches.DATE_FORMAT.format(startDate));
builder.append(", end date=");
builder.append(DailyStatusBatches.DATE_FORMAT.format(endDate));
builder.append(", status count=");
builder.append(statusCount);
builder.append(", days count=");
builder.append(batches.size());
builder.append("]");
return builder.toString();
}
}
private final int maxSegmentSize;
private final DailyStatusBatches dailyStatusBatches;
private final Date tierStartDate;
private final Date tierEndDate;
private final ArchiveEarlybirdIndexConfig earlybirdIndexConfig;
private List<ArchiveTimeSlice> lastCachedTimeslices = null;
public ArchiveTimeSlicer(int maxSegmentSize,
DailyStatusBatches dailyStatusBatches,
ArchiveEarlybirdIndexConfig earlybirdIndexConfig) {
this(maxSegmentSize, dailyStatusBatches, TierConfig.DEFAULT_TIER_START_DATE,
TierConfig.DEFAULT_TIER_END_DATE, earlybirdIndexConfig);
}
public ArchiveTimeSlicer(int maxSegmentSize,
DailyStatusBatches dailyStatusBatches,
Date tierStartDate,
Date tierEndDate,
ArchiveEarlybirdIndexConfig earlybirdIndexConfig) {
this.maxSegmentSize = maxSegmentSize;
this.dailyStatusBatches = dailyStatusBatches;
this.tierStartDate = tierStartDate;
this.tierEndDate = tierEndDate;
this.earlybirdIndexConfig = earlybirdIndexConfig;
}
private boolean cacheIsValid() throws IOException {
return lastCachedTimeslices != null
&& !lastCachedTimeslices.isEmpty()
&& cacheIsValid(lastCachedTimeslices.get(lastCachedTimeslices.size() - 1).endDate);
}
private boolean cacheIsValid(Date lastDate) throws IOException {
if (lastCachedTimeslices == null || lastCachedTimeslices.isEmpty()) {
return false;
}
// Check if we have a daily batch newer than the last batch used for the newest timeslice.
Calendar cal = Calendar.getInstance();
cal.setTime(lastDate);
cal.add(Calendar.DATE, 1);
Date nextDate = cal.getTime();
boolean foundBatch = dailyStatusBatches.hasValidBatchForDay(nextDate);
LOG.info("Checking cache: Looked for valid batch for day {}. Found: {}",
DailyStatusBatches.DATE_FORMAT.format(nextDate), foundBatch);
return !foundBatch;
}
private boolean timesliceIsFull(ArchiveTimeSlice timeSlice, DailyStatusBatch batch) {
return timeSlice.statusCount + batch.getMaxPerPartitionStatusCount() > maxSegmentSize;
}
private void doTimeSlicing() throws IOException {
dailyStatusBatches.refresh();
lastCachedTimeslices = Lists.newArrayList();
ArchiveTimeSlice currentTimeSlice = null;
// Iterate over each day and add it to the current timeslice, until it gets full.
for (DailyStatusBatch batch : dailyStatusBatches.getStatusBatches()) {
if (!batch.isValid()) {
LOG.warn("Skipping hole: " + batch.getDate());
continue;
}
if (currentTimeSlice == null || timesliceIsFull(currentTimeSlice, batch)) {
if (currentTimeSlice != null) {
LOG.info("Filled timeslice: " + currentTimeSlice.getDescription());
}
currentTimeSlice = new ArchiveTimeSlice(dailyStatusBatches, earlybirdIndexConfig);
currentTimeSlice.startDate = batch.getDate();
lastCachedTimeslices.add(currentTimeSlice);
}
currentTimeSlice.endDate = batch.getDate();
currentTimeSlice.statusCount += batch.getMaxPerPartitionStatusCount();
currentTimeSlice.batches.add(batch);
}
LOG.info("Last timeslice: {}", currentTimeSlice.getDescription());
LOG.info("Done with time slicing. Number of timeslices: {}",
lastCachedTimeslices.size());
}
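// ---- Editor's sketch, not part of the original file: the greedy packing performed by
// ---- doTimeSlicing(), reduced to plain per-day counts standing in for
// ---- DailyStatusBatch.getMaxPerPartitionStatusCount().
private static List<List<Integer>> timeSlicingSketch(int maxSegmentSize, int[] dailyCounts) {
List<List<Integer>> slices = Lists.newArrayList();
List<Integer> current = null;
int currentCount = 0;
for (int count : dailyCounts) {
if (current == null || currentCount + count > maxSegmentSize) {
current = Lists.newArrayList(); // start a new timeslice once the current one would overflow
currentCount = 0;
slices.add(current);
}
current.add(count);
currentCount += count;
}
// e.g. maxSegmentSize=100 and counts {40, 35, 50, 90, 20} -> [[40, 35], [50], [90], [20]]
return slices;
}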
/**
* Returns all timeslices for this earlybird.
*/
public List<ArchiveTimeSlice> getTimeSlices() throws IOException {
if (cacheIsValid()) {
return lastCachedTimeslices;
}
LOG.info("Cache is outdated. Loading new daily batches now...");
doTimeSlicing();
return lastCachedTimeslices != null ? Collections.unmodifiableList(lastCachedTimeslices) : null;
}
/**
* Returns the timeslices that overlap the tier start/end date range, if one is specified.
*/
public List<ArchiveTimeSlice> getTimeSlicesInTierRange() throws IOException {
List<ArchiveTimeSlice> timeSlices = getTimeSlices();
if (tierStartDate == TierConfig.DEFAULT_TIER_START_DATE
&& tierEndDate == TierConfig.DEFAULT_TIER_END_DATE) {
return timeSlices;
}
List<ArchiveTimeSlice> filteredTimeSlice = Lists.newArrayList();
for (ArchiveTimeSlice timeSlice : timeSlices) {
if (timeSlice.startDate.before(tierEndDate) && !timeSlice.endDate.before(tierStartDate)) {
filteredTimeSlice.add(timeSlice);
}
}
return filteredTimeSlice;
}
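// ---- Editor's sketch, not part of the original file: the interval-overlap test used by
// ---- getTimeSlicesInTierRange(). A slice [start, end] overlaps the tier range iff it
// ---- starts before the tier ends and does not end before the tier starts.
private static boolean overlapsSketch(Date start, Date end, Date tierStart, Date tierEnd) {
return start.before(tierEnd) && !end.before(tierStart);
}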
@VisibleForTesting
protected DailyStatusBatches getDailyStatusBatches() {
return dailyStatusBatches;
}
}

View File

@@ -1,166 +0,0 @@
package com.twitter.search.earlybird.archive;
import java.io.IOException;
import java.util.Date;
import java.util.Map;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Maps;
import com.google.gson.Gson;
import com.google.gson.JsonParseException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Represents a day's worth of statuses (tweets) for multiple hash partitions.
*
* Note that this class contains metadata about the data, not the data itself.
*
* A day of tweets will come from:
* - A scrub gen, if the day falls before the scrub gen date.
* - Our daily jobs pipeline, if it falls after that.
*
* This class checks that the _SUCCESS file exists in the "statuses" subdirectory and extracts
* the status count, min status ID and max status ID.
*/
public class DailyStatusBatch implements Comparable<DailyStatusBatch> {
private static final Logger LOG = LoggerFactory.getLogger(DailyStatusBatch.class);
public static final long EMPTY_BATCH_STATUS_ID = -1;
private static final String PARTITION_FORMAT = "p_%d_of_%d";
private static final String SUCCESS_FILE_NAME = "_SUCCESS";
private final Map<Integer, PartitionedBatch> hashPartitionToStatuses = Maps.newHashMap();
private final Date date;
private final int numHashPartitions;
private final boolean hasSuccessFiles;
public DailyStatusBatch(Date date, int numHashPartitions, Path statusPath, FileSystem hdfs) {
this.date = date;
this.numHashPartitions = numHashPartitions;
this.hasSuccessFiles = checkForSuccessFile(hdfs, date, statusPath);
}
public Date getDate() {
return date;
}
/**
* Check for the presence of the _SUCCESS file for the given day's path on HDFS for the statuses
* field group.
*/
private boolean checkForSuccessFile(FileSystem hdfs, Date inputDate, Path statusPath) {
Path dayPath = new Path(statusPath, ArchiveHDFSUtils.dateToPath(inputDate, "/"));
Path successFilePath = new Path(dayPath, SUCCESS_FILE_NAME);
try {
return hdfs.getFileStatus(successFilePath).isFile();
} catch (IOException e) {
LOG.error("Could not verify existence of the _SUCCESS file. Assuming it doesn't exist.", e);
}
return false;
}
/**
* Loads the data for this day for the given partition.
*/
public PartitionedBatch addPartition(FileSystem hdfs, Path dayPath, int hashPartitionID)
throws IOException {
String partitionDir = String.format(PARTITION_FORMAT, hashPartitionID, numHashPartitions);
Path path = new Path(dayPath, partitionDir);
PartitionedBatch batch =
new PartitionedBatch(path, hashPartitionID, numHashPartitions, date);
batch.load(hdfs);
hashPartitionToStatuses.put(hashPartitionID, batch);
return batch;
}
public PartitionedBatch getPartition(int hashPartitionID) {
return hashPartitionToStatuses.get(hashPartitionID);
}
/**
* Returns the greatest status count in all partitions belonging to this batch.
*/
public int getMaxPerPartitionStatusCount() {
int maxPerPartitionStatusCount = 0;
for (PartitionedBatch batch : hashPartitionToStatuses.values()) {
maxPerPartitionStatusCount = Math.max(batch.getStatusCount(), maxPerPartitionStatusCount);
}
return maxPerPartitionStatusCount;
}
public int getNumHashPartitions() {
return numHashPartitions;
}
@VisibleForTesting
boolean hasSuccessFiles() {
return hasSuccessFiles;
}
/**
* Returns true if the _status_counts files could be found in each
* hash partition subfolder that belongs to this timeslice,
* AND the _SUCCESS file can be found at the root folder for the day.
*/
public boolean isValid() {
// make sure we have data for all hash partitions
for (int i = 0; i < numHashPartitions; i++) {
PartitionedBatch day = hashPartitionToStatuses.get(i);
if (day == null || !day.hasStatusCount() || day.isDisallowedEmptyPartition()) {
return false;
}
}
return hasSuccessFiles;
}
@Override
public String toString() {
StringBuilder builder = new StringBuilder();
builder.append("DailyStatusBatch[date=").append(date)
.append(",valid=").append(isValid())
.append(",hasSuccessFiles=").append(hasSuccessFiles)
.append(",numHashPartitions=").append(numHashPartitions)
.append("]:\n");
for (int i = 0; i < numHashPartitions; i++) {
builder.append('\t').append(hashPartitionToStatuses.get(i).toString()).append('\n');
}
return builder.toString();
}
@Override
public int compareTo(DailyStatusBatch o) {
return date.compareTo(o.date);
}
/**
* Serialize DailyStatusBatch to a json string.
*/
public String serializeToJson() {
return serializeToJson(new Gson());
}
@VisibleForTesting
String serializeToJson(Gson gson) {
return gson.toJson(this);
}
/**
* Given a json string, parse its fields and construct a daily status batch.
* @param batchStr the json string representation of a daily status batch.
* @return the daily status batch constructed; if the string is of invalid format, null will be
* returned.
*/
static DailyStatusBatch deserializeFromJson(String batchStr) {
try {
return new Gson().fromJson(batchStr, DailyStatusBatch.class);
} catch (JsonParseException e) {
LOG.error("Error parsing json string: " + batchStr, e);
return null;
}
}
}
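// ---- Editor's sketch, not part of the original file: the Gson round-trip behind
// ---- serializeToJson()/deserializeFromJson(), shown with a tiny hypothetical Batch type.
final class JsonRoundTripSketch {
static final class Batch {
int statusCount;
String date;
}

public static void main(String[] args) {
Gson gson = new Gson();
Batch batch = new Batch();
batch.statusCount = 42;
batch.date = "20120921";
String json = gson.toJson(batch);
System.out.println(json); // {"statusCount":42,"date":"20120921"}
try {
Batch parsed = gson.fromJson(json, Batch.class);
System.out.println(parsed.statusCount + " " + parsed.date); // 42 20120921
} catch (JsonParseException e) {
// deserializeFromJson() above returns null on malformed input rather than throwing
System.out.println("invalid batch line");
}
}
}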

View File

@@ -1,702 +0,0 @@
package com.twitter.search.earlybird.archive;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Calendar;
import java.util.Collection;
import java.util.Date;
import java.util.NavigableMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.base.Stopwatch;
import com.google.common.collect.Maps;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.time.FastDateFormat;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.common.quantity.Amount;
import com.twitter.common.quantity.Time;
import com.twitter.search.common.database.DatabaseConfig;
import com.twitter.search.common.util.date.DateUtil;
import com.twitter.search.common.util.io.LineRecordFileReader;
import com.twitter.search.common.util.zktrylock.TryLock;
import com.twitter.search.common.util.zktrylock.ZooKeeperTryLockFactory;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
import com.twitter.search.earlybird.common.config.EarlybirdProperty;
import com.twitter.search.earlybird.partition.HdfsUtil;
import com.twitter.search.earlybird.partition.StatusBatchFlushVersion;
/**
* Provides access to preprocessed statuses (tweets) to be indexed by archive search earlybirds.
*
* These tweets can be coming from a scrub gen or from the output of the daily jobs.
*/
public class DailyStatusBatches {
private static final Logger LOG = LoggerFactory.getLogger(DailyStatusBatches.class);
// Maximum time to spend on obtaining daily status batches by computing or loading from HDFS
private static final Amount<Long, Time> MAX_TIME_ALLOWED_DAILY_STATUS_BATCHES_MINUTES =
Amount.of(EarlybirdConfig.getLong("daily_status_batches_max_initial_load_time_minutes"),
Time.MINUTES);
// Time to wait before trying again when obtaining daily status batches fails
private static final Amount<Long, Time> DAILY_STATUS_BATCHES_WAITING_TIME_MINUTES =
Amount.of(EarlybirdConfig.getLong("daily_status_batches_waiting_time_minutes"),
Time.MINUTES);
private static final String DAILY_STATUS_BATCHES_SYNC_PATH =
EarlybirdProperty.ZK_APP_ROOT.get() + "/daily_batches_sync";
private static final String DAILY_BATCHES_ZK_LOCK = "daily_batches_zk_lock";
private static final Amount<Long, Time> DAILY_STATUS_BATCHES_ZK_LOCK_EXPIRATION_MINUTES =
Amount.of(EarlybirdConfig.getLong("daily_status_batches_zk_lock_expiration_minutes"),
Time.MINUTES);
static final FastDateFormat DATE_FORMAT = FastDateFormat.getInstance("yyyyMMdd");
// Before this date, there was no Twitter.
private static final Date FIRST_TWITTER_DAY = DateUtil.toDate(2006, 2, 1);
private static final String STATUS_BATCHES_PREFIX = "status_batches";
private final String rootDir =
EarlybirdConfig.getString("hdfs_offline_segment_sync_dir", "top_archive_statuses");
private final String buildGen =
EarlybirdConfig.getString("offline_segment_build_gen", "bg_1");
public static final String STATUS_SUBDIR_NAME = "statuses";
public static final String LAYOUT_SUBDIR_NAME = "layouts";
public static final String SCRUB_GEN_SUFFIX_PATTERN = "scrubbed/%s";
private static final String INTERMEDIATE_COUNTS_SUBDIR_NAME = "counts";
private static final String SUCCESS_FILE_NAME = "_SUCCESS";
private static final Pattern HASH_PARTITION_PATTERN = Pattern.compile("p_(\\d+)_of_(\\d+)");
private static final Date FIRST_TWEET_DAY = DateUtil.toDate(2006, 3, 21);
private final Path rootPath = new Path(rootDir);
private final Path buildGenPath = new Path(rootPath, buildGen);
private final Path statusPath = new Path(buildGenPath, STATUS_SUBDIR_NAME);
private final NavigableMap<Date, DailyStatusBatch> statusBatches = Maps.newTreeMap();
private Date firstValidDay = null;
private Date lastValidDay = null;
private final ZooKeeperTryLockFactory zkTryLockFactory;
private final Date scrubGenDay;
private long numberOfDaysWithValidScrubGenData;
public DailyStatusBatches(
ZooKeeperTryLockFactory zooKeeperTryLockFactory, Date scrubGenDay) throws IOException {
this.zkTryLockFactory = zooKeeperTryLockFactory;
this.scrubGenDay = scrubGenDay;
FileSystem hdfs = null;
try {
hdfs = HdfsUtil.getHdfsFileSystem();
verifyDirectory(hdfs);
} finally {
IOUtils.closeQuietly(hdfs);
}
}
@VisibleForTesting
public Date getScrubGenDay() {
return scrubGenDay;
}
public Collection<DailyStatusBatch> getStatusBatches() {
return statusBatches.values();
}
/**
* Reset the states of the directory
*/
private void resetDirectory() {
statusBatches.clear();
firstValidDay = null;
lastValidDay = null;
}
/**
* Indicate whether the directory has been initialized
*/
private boolean isInitialized() {
return lastValidDay != null;
}
/**
* Load the daily status batches from HDFS; return true if one or more batches could be loaded.
**/
private boolean refreshByLoadingHDFSStatusBatches(final FileSystem fs) throws IOException {
// first find the latest valid end date of statuses
final Date lastValidStatusDay = getLastValidInputDateFromNow(fs);
if (lastValidStatusDay != null) {
if (hasStatusBatchesOnHdfs(fs, lastValidStatusDay)) {
if (loadStatusBatchesFromHdfs(fs, lastValidStatusDay)) {
return true;
}
}
}
resetDirectory();
return false;
}
/**
* Checks the directory for new data and loads any new daily batches that are found.
*/
public void refresh() throws IOException {
final FileSystem hdfs = HdfsUtil.getHdfsFileSystem();
final Stopwatch stopwatch = Stopwatch.createStarted();
try {
if (!isInitialized()) {
if (initializeDailyStatusBatches(hdfs, stopwatch)) {
LOG.info("Successfully obtained daily status batches after {}", stopwatch);
} else {
String errMsg = "Failed to load or compute daily status batches after "
+ stopwatch.toString();
LOG.error(errMsg);
throw new IOException(errMsg);
}
} else {
loadNewDailyBatches(hdfs);
}
} finally {
IOUtils.closeQuietly(hdfs);
}
}
private boolean initializeDailyStatusBatches(final FileSystem hdfs,
final Stopwatch stopwatch) throws IOException {
long timeSpentOnDailyBatches = 0L;
long maxAllowedTimeMs = MAX_TIME_ALLOWED_DAILY_STATUS_BATCHES_MINUTES.as(Time.MILLISECONDS);
long waitingTimeMs = DAILY_STATUS_BATCHES_WAITING_TIME_MINUTES.as(Time.MILLISECONDS);
boolean firstLoop = true;
LOG.info("Starting to load or compute daily status batches for the first time.");
while (timeSpentOnDailyBatches <= maxAllowedTimeMs && !Thread.currentThread().isInterrupted()) {
if (!firstLoop) {
try {
LOG.info("Sleeping " + waitingTimeMs
+ " millis before trying to obtain daily batches again");
Thread.sleep(waitingTimeMs);
} catch (InterruptedException e) {
LOG.warn("Interrupted while waiting to load daily batches", e);
Thread.currentThread().interrupt();
break;
}
}
if (isStatusBatchLoadingEnabled() && refreshByLoadingHDFSStatusBatches(hdfs)) {
LOG.info("Successfully loaded daily status batches after {}", stopwatch);
return true;
}
final AtomicBoolean successRef = new AtomicBoolean(false);
if (computeDailyBatchesWithZKLock(hdfs, successRef, stopwatch)) {
return successRef.get();
}
timeSpentOnDailyBatches = stopwatch.elapsed(TimeUnit.MILLISECONDS);
firstLoop = false;
}
return false;
}
private boolean computeDailyBatchesWithZKLock(final FileSystem hdfs,
final AtomicBoolean successRef,
final Stopwatch stopwatch) throws IOException {
// Use a global lock to coordinate among earlybirds and segment builders so that only
// one instance hits the HDFS name node to query the daily status directories.
TryLock lock = zkTryLockFactory.createTryLock(
DatabaseConfig.getLocalHostname(),
DAILY_STATUS_BATCHES_SYNC_PATH,
DAILY_BATCHES_ZK_LOCK,
DAILY_STATUS_BATCHES_ZK_LOCK_EXPIRATION_MINUTES);
return lock.tryWithLock(() -> {
LOG.info("Obtained ZK lock to compute daily status batches after {}", stopwatch);
successRef.set(initialLoadDailyBatchInfos(hdfs));
if (successRef.get()) {
LOG.info("Successfully computed daily status batches after {}", stopwatch);
if (isStatusBatchFlushingEnabled()) {
LOG.info("Starting to store daily status batches to HDFS");
if (storeStatusBatchesToHdfs(hdfs, lastValidDay)) {
LOG.info("Successfully stored daily status batches to HDFS");
} else {
LOG.warn("Failed storing daily status batches to HDFS");
}
}
} else {
LOG.info("Failed loading daily status info");
}
});
}
private void verifyDirectory(FileSystem hdfs) throws IOException {
if (!hdfs.exists(rootPath)) {
throw new IOException("Root dir '" + rootPath + "' does not exist.");
}
if (!hdfs.exists(buildGenPath)) {
throw new IOException("Build gen dir '" + buildGenPath + "' does not exist.");
}
if (!hdfs.exists(statusPath)) {
throw new IOException("Status dir '" + statusPath + "' does not exist.");
}
}
private void loadNewDailyBatches(FileSystem hdfs) throws IOException {
Preconditions.checkNotNull(lastValidDay);
Calendar day = Calendar.getInstance();
day.setTime(lastValidDay);
day.add(Calendar.DATE, 1);
while (loadDay(hdfs, day.getTime()) != null) {
lastValidDay = day.getTime();
day.add(Calendar.DATE, 1);
}
}
private boolean initialLoadDailyBatchInfos(FileSystem hdfs) throws IOException {
LOG.info("Starting to build timeslice map from scratch.");
final Date lastValidStatusDay = getLastValidInputDateFromNow(hdfs);
if (lastValidStatusDay == null) {
LOG.warn("No data found in " + statusPath + " and scrubbed path");
return false;
}
int mostRecentYear = DateUtil.getCalendar(lastValidStatusDay).get(Calendar.YEAR);
for (int year = 2006; year <= mostRecentYear; ++year) {
// construct path to avoid hdfs.listStatus() calls
Calendar day = Calendar.getInstance();
day.set(year, Calendar.JANUARY, 1, 0, 0, 0);
day.set(Calendar.MILLISECOND, 0);
Calendar yearEnd = Calendar.getInstance();
yearEnd.set(year, Calendar.DECEMBER, 31, 0, 0, 0);
yearEnd.set(Calendar.MILLISECOND, 0);
if (lastValidDay != null) {
// We're updating.
if (lastValidDay.after(yearEnd.getTime())) {
// This year was already loaded.
continue;
}
if (lastValidDay.after(day.getTime())) {
// Start one day after last valid date.
day.setTime(lastValidDay);
day.add(Calendar.DATE, 1);
}
}
for (; !day.after(yearEnd); day.add(Calendar.DATE, 1)) {
loadDay(hdfs, day.getTime());
}
}
boolean updated = false;
numberOfDaysWithValidScrubGenData = 0;
// Iterate batches in sorted order.
for (DailyStatusBatch batch : statusBatches.values()) {
if (!batch.isValid()) {
break;
}
if (batch.getDate().before(scrubGenDay)) {
numberOfDaysWithValidScrubGenData++;
}
if (firstValidDay == null) {
firstValidDay = batch.getDate();
}
if (lastValidDay == null || lastValidDay.before(batch.getDate())) {
lastValidDay = batch.getDate();
updated = true;
}
}
LOG.info("Number of statusBatches: {}", statusBatches.size());
return updated;
}
private static String filesToString(FileStatus[] files) {
if (files == null) {
return "null";
}
StringBuilder b = new StringBuilder();
for (FileStatus s : files) {
b.append(s.getPath().toString()).append(", ");
}
return b.toString();
}
@VisibleForTesting
protected DailyStatusBatch loadDay(FileSystem hdfs, Date day) throws IOException {
Path dayPath = new Path(getStatusPathToUseForDay(day), ArchiveHDFSUtils.dateToPath(day, "/"));
LOG.debug("Looking for batch in " + dayPath.toString());
DailyStatusBatch result = this.statusBatches.get(day);
if (result != null) {
return result;
}
final FileStatus[] files;
try {
files = hdfs.listStatus(dayPath);
LOG.debug("Files found: " + filesToString(files));
} catch (FileNotFoundException e) {
LOG.debug("loadDay() called, but directory does not exist for day: " + day
+ " in: " + dayPath);
return null;
}
if (files != null && files.length > 0) {
for (FileStatus file : files) {
Matcher matcher = HASH_PARTITION_PATTERN.matcher(file.getPath().getName());
if (matcher.matches()) {
int numHashPartitions = Integer.parseInt(matcher.group(2));
result = new DailyStatusBatch(
day, numHashPartitions, getStatusPathToUseForDay(day), hdfs);
for (int partitionID = 0; partitionID < numHashPartitions; partitionID++) {
result.addPartition(hdfs, dayPath, partitionID);
}
if (result.isValid()) {
statusBatches.put(day, result);
return result;
} else {
LOG.info("Invalid batch found for day: " + day + ", batch: " + result);
}
} else {
// skip logging the intermediate count subdirectories or _SUCCESS files.
if (!INTERMEDIATE_COUNTS_SUBDIR_NAME.equals(file.getPath().getName())
&& !SUCCESS_FILE_NAME.equals(file.getPath().getName())) {
LOG.warn("Path does not match hash partition pattern: " + file.getPath());
}
}
}
} else {
LOG.warn("No data found for day: " + day + " in: " + dayPath
+ " files null: " + (files == null));
}
return null;
}
/**
* Determines if this directory has a valid batch for the given day.
*/
public boolean hasValidBatchForDay(Date day) throws IOException {
FileSystem hdfs = null;
try {
hdfs = HdfsUtil.getHdfsFileSystem();
return hasValidBatchForDay(hdfs, day);
} finally {
IOUtils.closeQuietly(hdfs);
}
}
private boolean hasValidBatchForDay(FileSystem fs, Date day) throws IOException {
DailyStatusBatch batch = loadDay(fs, day);
return batch != null && batch.isValid();
}
@VisibleForTesting
Date getFirstValidDay() {
return firstValidDay;
}
@VisibleForTesting
Date getLastValidDay() {
return lastValidDay;
}
private Date getLastValidInputDateFromNow(FileSystem hdfs) throws IOException {
Calendar cal = Calendar.getInstance();
cal.setTime(new Date()); // current date
return getLastValidInputDate(hdfs, cal);
}
/**
* Starting from current date, probe backward till we find a valid input Date
*/
@VisibleForTesting
Date getLastValidInputDate(FileSystem hdfs, Calendar cal) throws IOException {
cal.set(Calendar.MILLISECOND, 0);
cal.set(Calendar.HOUR_OF_DAY, 0);
cal.set(Calendar.MINUTE, 0);
cal.set(Calendar.SECOND, 0);
cal.set(Calendar.MILLISECOND, 0);
Date lastValidInputDate = cal.getTime();
LOG.info("Probing backwards for last valid data date from " + lastValidInputDate);
while (lastValidInputDate.after(FIRST_TWITTER_DAY)) {
if (hasValidBatchForDay(hdfs, lastValidInputDate)) {
LOG.info("Found latest valid data on date " + lastValidInputDate);
LOG.info(" Used path: {}", getStatusPathToUseForDay(lastValidInputDate));
return lastValidInputDate;
}
cal.add(Calendar.DATE, -1);
lastValidInputDate = cal.getTime();
}
return null;
}
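// ---- Editor's sketch, not part of the original file: the backward day-by-day probe in
// ---- getLastValidInputDate(), with a hypothetical predicate standing in for
// ---- hasValidBatchForDay(hdfs, day).
private static Date backwardProbeSketch(Calendar cal, Date floor,
java.util.function.Predicate<Date> hasValidBatch) {
cal.set(Calendar.HOUR_OF_DAY, 0); // truncate to midnight so each probe lands on a whole day
cal.set(Calendar.MINUTE, 0);
cal.set(Calendar.SECOND, 0);
cal.set(Calendar.MILLISECOND, 0);
for (Date day = cal.getTime(); day.after(floor); day = cal.getTime()) {
if (hasValidBatch.test(day)) {
return day;
}
cal.add(Calendar.DATE, -1); // step back one day and retry
}
return null;
}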
/**
* Check if the daily status batches are already on HDFS
*/
@VisibleForTesting
boolean hasStatusBatchesOnHdfs(FileSystem fs, Date lastDataDay) {
String hdfsFileName = getHdfsStatusBatchSyncFileName(lastDataDay);
try {
return fs.exists(new Path(hdfsFileName));
} catch (IOException ex) {
LOG.error("Failed checking status batch file on HDFS: " + hdfsFileName, ex);
return false;
}
}
/**
* Load the daily status batches from HDFS by first copying the file from HDFS to local disk
* and then reading from the local disk.
*
* @param day the latest day of valid statuses.
* @return true if the loading is successful.
*/
@VisibleForTesting
boolean loadStatusBatchesFromHdfs(FileSystem fs, Date day) {
// set the directory state to initial state
resetDirectory();
String fileHdfsPath = getHdfsStatusBatchSyncFileName(day);
String fileLocalPath = getLocalStatusBatchSyncFileName(day);
LOG.info("Using " + fileHdfsPath + " as the HDFS batch summary load path.");
LOG.info("Using " + fileLocalPath + " as the local batch summary sync path.");
LineRecordFileReader lineReader = null;
try {
fs.copyToLocalFile(new Path(fileHdfsPath), new Path(fileLocalPath));
lineReader = new LineRecordFileReader(fileLocalPath);
String batchLine;
while ((batchLine = lineReader.readNext()) != null) {
DailyStatusBatch batch = DailyStatusBatch.deserializeFromJson(batchLine);
if (batch == null) {
LOG.error("Invalid daily status batch constructed from line: " + batchLine);
resetDirectory();
return false;
}
Date date = batch.getDate();
if (firstValidDay == null || firstValidDay.after(date)) {
firstValidDay = date;
}
if (lastValidDay == null || lastValidDay.before(date)) {
lastValidDay = date;
}
statusBatches.put(date, batch);
}
LOG.info("Loaded {} status batches from HDFS: {}",
statusBatches.size(), fileHdfsPath);
LOG.info("First entry: {}", statusBatches.firstEntry().getValue().toString());
LOG.info("Last entry: {}", statusBatches.lastEntry().getValue().toString());
return true;
} catch (IOException ex) {
LOG.error("Failed loading time slices from HDFS: " + fileHdfsPath, ex);
resetDirectory();
return false;
} finally {
if (lineReader != null) {
lineReader.stop();
}
}
}
/**
* Flush the daily status batches to local disk and then upload to HDFS.
*/
private boolean storeStatusBatchesToHdfs(FileSystem fs, Date day) {
Preconditions.checkNotNull(lastValidDay);
if (!StatusBatchFlushVersion.CURRENT_FLUSH_VERSION.isOfficial()) {
LOG.info("Status batch flush version is not official, no batches will be flushed to HDFS");
return true;
}
String fileLocalPath = getLocalStatusBatchSyncFileName(day);
// Flush to local disk
File outputFile = null;
FileWriter fileWriter = null;
try {
LOG.info("Flushing daily status batches into: " + fileLocalPath);
outputFile = new File(fileLocalPath);
outputFile.getParentFile().mkdirs();
if (!outputFile.getParentFile().exists()) {
LOG.error("Cannot create directory: " + outputFile.getParentFile().toString());
return false;
}
fileWriter = new FileWriter(outputFile, false);
for (Date date : statusBatches.keySet()) {
fileWriter.write(statusBatches.get(date).serializeToJson());
fileWriter.write("\n");
}
fileWriter.flush();
// Upload the file to HDFS
return uploadStatusBatchesToHdfs(fs, day);
} catch (IOException e) {
String fileHdfsPath = getHdfsStatusBatchSyncFileName(day);
LOG.error("Failed storing status batches to HDFS: " + fileHdfsPath, e);
return false;
} finally {
try {
if (fileWriter != null) {
fileWriter.close();
}
} catch (IOException e) {
LOG.error("Error to close fileWrite.", e);
}
if (outputFile != null) {
// Delete the local file
outputFile.delete();
}
}
}
/**
* Upload the status batches to HDFS.
*/
@VisibleForTesting
boolean uploadStatusBatchesToHdfs(FileSystem fs, Date day) {
String localFileName = getLocalStatusBatchSyncFileName(day);
String hdfsFileName = getHdfsStatusBatchSyncFileName(day);
LOG.info("Using " + hdfsFileName + " as the HDFS batch summary upload path.");
LOG.info("Using " + localFileName + " as the local batch summary sync path.");
try {
Path hdfsFilePath = new Path(hdfsFileName);
if (fs.exists(hdfsFilePath)) {
LOG.warn("Found status batch file on HDFS: " + hdfsFileName);
return true;
}
String hdfsTempName = getHdfsStatusBatchTempSyncFileName(day);
Path hdfsTempPath = new Path(hdfsTempName);
if (fs.exists(hdfsTempPath)) {
LOG.info("Found existing temporary status batch file on HDFS, removing: " + hdfsTempName);
if (!fs.delete(hdfsTempPath, false)) {
LOG.error("Failed to delete temporary file: " + hdfsTempName);
return false;
}
}
fs.copyFromLocalFile(new Path(localFileName), hdfsTempPath);
if (fs.rename(hdfsTempPath, hdfsFilePath)) {
LOG.debug("Renamed " + hdfsTempName + " on HDFS to: " + hdfsFileName);
return true;
} else {
LOG.error("Failed to rename " + hdfsTempName + " on HDFS to: " + hdfsFileName);
return false;
}
} catch (IOException ex) {
LOG.error("Failed uploading status batch file to HDFS: " + hdfsFileName, ex);
return false;
}
}
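// ---- Editor's sketch, not part of the original file: the copy-to-temp-then-rename publish
// ---- pattern used by uploadStatusBatchesToHdfs(), reduced to its essentials. rename() is
// ---- atomic on HDFS, so readers never observe a half-written summary file.
private static boolean atomicPublishSketch(FileSystem fs, Path local, Path tmp, Path dest)
throws IOException {
if (fs.exists(dest)) {
return true; // another instance already published this file
}
if (fs.exists(tmp) && !fs.delete(tmp, false)) {
return false; // a stale temp file we cannot remove
}
fs.copyFromLocalFile(local, tmp);
return fs.rename(tmp, dest);
}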
private static boolean isStatusBatchFlushingEnabled() {
return EarlybirdProperty.ARCHIVE_DAILY_STATUS_BATCH_FLUSHING_ENABLED.get(false);
}
private static boolean isStatusBatchLoadingEnabled() {
return EarlybirdConfig.getBool("archive_daily_status_batch_loading_enabled", false);
}
private static String getVersionFileExtension() {
return StatusBatchFlushVersion.CURRENT_FLUSH_VERSION.getVersionFileExtension();
}
String getStatusBatchSyncRootDir() {
return EarlybirdConfig.getString("archive_daily_status_batch_sync_dir",
"daily_status_batches") + "/" + scrubGenSuffix();
}
@VisibleForTesting
String getLocalStatusBatchSyncFileName(Date day) {
return getStatusBatchSyncRootDir() + "/" + STATUS_BATCHES_PREFIX + "_"
+ DATE_FORMAT.format(day) + getVersionFileExtension();
}
String getHdfsStatusBatchSyncRootDir() {
return EarlybirdConfig.getString("hdfs_archive_daily_status_batch_sync_dir",
"daily_status_batches") + "/" + scrubGenSuffix();
}
@VisibleForTesting
String getHdfsStatusBatchSyncFileName(Date day) {
return getHdfsStatusBatchSyncRootDir() + "/" + STATUS_BATCHES_PREFIX + "_"
+ DATE_FORMAT.format(day) + getVersionFileExtension();
}
private String getHdfsStatusBatchTempSyncFileName(Date day) {
return getHdfsStatusBatchSyncRootDir() + "/" + DatabaseConfig.getLocalHostname() + "_"
+ STATUS_BATCHES_PREFIX + "_" + DATE_FORMAT.format(day) + getVersionFileExtension();
}
private String scrubGenSuffix() {
return String.format(SCRUB_GEN_SUFFIX_PATTERN, DATE_FORMAT.format(scrubGenDay));
}
/**
* Returns the path to the directory that stores the statuses for the given day.
*/
public Path getStatusPathToUseForDay(Date day) {
if (!day.before(scrubGenDay)) {
return statusPath;
}
String suffix = scrubGenSuffix();
Preconditions.checkArgument(!suffix.isEmpty());
Path scrubPath = new Path(buildGenPath, suffix);
return new Path(scrubPath, STATUS_SUBDIR_NAME);
}
/**
* Determines if the data for the specified scrub gen was fully built, by checking the number of
* days for which data was built against the expected number of days extracted from the specified
* scrub gen date.
*/
public boolean isScrubGenDataFullyBuilt(FileSystem hdfs) throws IOException {
initialLoadDailyBatchInfos(hdfs);
if (numberOfDaysWithValidScrubGenData == 0) {
LOG.warn("numberOfDaysWithValidScrubGenData is 0");
}
long expectedDays = getDiffBetweenDays(scrubGenDay);
return expectedDays == numberOfDaysWithValidScrubGenData;
}
@VisibleForTesting
long getDiffBetweenDays(Date day) {
long diff = day.getTime() - FIRST_TWEET_DAY.getTime();
return TimeUnit.DAYS.convert(diff, TimeUnit.MILLISECONDS);
}
}

View File

@@ -1,333 +0,0 @@
package com.twitter.search.earlybird.archive;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Comparator;
import java.util.Date;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Function;
import com.google.common.base.Predicate;
import com.google.common.collect.ComparisonChain;
import com.google.common.collect.Lists;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.search.common.config.Config;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser;
import com.twitter.search.common.schema.earlybird.EarlybirdThriftDocumentUtil;
import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent;
import com.twitter.search.common.util.date.DateUtil;
import com.twitter.search.common.util.io.EmptyRecordReader;
import com.twitter.search.common.util.io.LzoThriftBlockFileReader;
import com.twitter.search.common.util.io.MergingSortedRecordReader;
import com.twitter.search.common.util.io.TransformingRecordReader;
import com.twitter.search.common.util.io.recordreader.RecordReader;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
import com.twitter.search.earlybird.document.DocumentFactory;
import com.twitter.search.earlybird.document.TweetDocument;
import com.twitter.search.earlybird.partition.HdfsUtil;
/**
* A batch of pre-processed tweets for a single hash partition from a particular day.
*/
public class PartitionedBatch {
private static final Logger LOG = LoggerFactory.getLogger(PartitionedBatch.class);
private static final Date START_DATE_INCLUSIVE = DateUtil.toDate(2006, 03, 21);
private static final String STATUS_COUNT_FILE_PREFIX = "_status_count_";
private static final Pattern STATUS_COUNT_FILE_PATTERN =
Pattern.compile(STATUS_COUNT_FILE_PREFIX + "(\\d+)_minid_(\\d+)_maxid_(\\d+)");
private static final int MAXIMUM_OUT_OF_ORDER_TOLERANCE_HOURS =
EarlybirdConfig.getInt("archive_max_out_of_order_tolerance_hours", 12);
private static final int READER_INIT_IOEXCEPTION_RETRIES = 20;
private static final PathFilter LZO_DATA_FILES_FILTER = file -> file.getName().endsWith(".lzo");
private static final PathFilter TXT_DATA_FILES_FILTER = file -> file.getName().endsWith(".txt");
private static final Comparator<ThriftIndexingEvent> DESC_THRIFT_INDEXING_EVENT_COMPARATOR =
(o1, o2) -> ComparisonChain.start()
.compare(o2.getSortId(), o1.getSortId())
.compare(o2.getUid(), o1.getUid())
.result();
// Number of archive tweets skipped because they are too out-of-order.
private static final SearchCounter OUT_OF_ORDER_STATUSES_SKIPPED =
SearchCounter.export("out_of_order_archive_statuses_skipped");
@VisibleForTesting
protected static final long MAXIMUM_OUT_OF_ORDER_TOLERANCE_MILLIS =
TimeUnit.HOURS.toMillis(MAXIMUM_OUT_OF_ORDER_TOLERANCE_HOURS);
private final Date date;
private final Path path;
private int statusCount;
private long minStatusID;
private long maxStatusID;
private final int hashPartitionID;
private boolean hasStatusCountFile;
private final int numHashPartitions;
@VisibleForTesting
public PartitionedBatch(
Path path,
int hashPartitionID,
int numHashPartitions,
Date date) {
this.path = path;
this.hashPartitionID = hashPartitionID;
this.numHashPartitions = numHashPartitions;
this.date = date;
}
/**
* Loads all the information (tweet count, etc.) for this partition and day from HDFS.
*/
public void load(FileSystem hdfs) throws IOException {
FileStatus[] dailyBatchFiles = null;
try {
// listStatus() javadoc says it throws FileNotFoundException when path does not exist.
// However, the actual implementations return null or an empty array instead.
// We handle all 3 cases: null, empty array, or FileNotFoundException.
dailyBatchFiles = hdfs.listStatus(path);
} catch (FileNotFoundException e) {
// don't do anything here and the day will be handled as empty.
}
if (dailyBatchFiles != null && dailyBatchFiles.length > 0) {
for (FileStatus file : dailyBatchFiles) {
String fileName = file.getPath().getName();
if (fileName.equals(STATUS_COUNT_FILE_PREFIX)) {
// zero tweets in this partition - this can happen for early days in 2006
handleEmptyPartition();
} else {
Matcher matcher = STATUS_COUNT_FILE_PATTERN.matcher(fileName);
if (matcher.matches()) {
try {
statusCount = Integer.parseInt(matcher.group(1));
// Only adjust the min status ID in production; adjusting it in tests would make the
// tests harder to understand.
minStatusID = Config.environmentIsTest() ? Long.parseLong(matcher.group(2))
: adjustMinStatusId(Long.parseLong(matcher.group(2)), date);
maxStatusID = Long.parseLong(matcher.group(3));
hasStatusCountFile = true;
} catch (NumberFormatException e) {
// invalid file - ignore
LOG.warn("Could not parse status count file name.", e);
}
}
}
}
} else {
// Partition folder does not exist. This can happen for early days of Twitter, where some
// partitions are empty. Mark this batch as having a status count file; the validity of the
// parent DailyStatusBatch is still determined by whether there was a _SUCCESS file in the
// day root.
handleEmptyPartition();
if (date.after(getEarliestDenseDay())) {
LOG.error("Unexpected empty directory {} for {}", path, date);
}
}
}
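// ---- Editor's sketch, not part of the original file: how a status count file name such as
// ---- "_status_count_12345_minid_100_maxid_999" is unpacked by the pattern match in load().
private static void statusCountNameSketch(String fileName) {
Matcher matcher = STATUS_COUNT_FILE_PATTERN.matcher(fileName);
if (matcher.matches()) {
int count = Integer.parseInt(matcher.group(1)); // 12345
long minId = Long.parseLong(matcher.group(2)); // 100
long maxId = Long.parseLong(matcher.group(3)); // 999
LOG.info("count={} min={} max={}", count, minId, maxId);
}
}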
private void handleEmptyPartition() {
statusCount = 0;
minStatusID = DailyStatusBatch.EMPTY_BATCH_STATUS_ID;
maxStatusID = DailyStatusBatch.EMPTY_BATCH_STATUS_ID;
hasStatusCountFile = true;
}
/**
* Sometimes tweets are out of order (e.g. a tweet from Sep 2012 got into a
* batch from July 2013). See SEARCH-1750 for more details.
* This adjusts the minStatusID if it is badly out of order.
*/
@VisibleForTesting
protected static long adjustMinStatusId(long minStatusID, Date date) {
long dateTime = date.getTime();
// If the daily batch is for a day before we started using snowflake IDs, never adjust.
if (!SnowflakeIdParser.isUsableSnowflakeTimestamp(dateTime)) {
return minStatusID;
}
long earliestStartTime = dateTime - MAXIMUM_OUT_OF_ORDER_TOLERANCE_MILLIS;
long minStatusTime = SnowflakeIdParser.getTimestampFromTweetId(minStatusID);
if (minStatusTime < earliestStartTime) {
long newMinId = SnowflakeIdParser.generateValidStatusId(earliestStartTime, 0);
LOG.info("Daily batch for " + date + " has badly out of order tweet: " + minStatusID
+ ". The minStatusID for the day this batch is adjusted to " + newMinId);
return newMinId;
} else {
return minStatusID;
}
}
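// Worked example (hypothetical numbers; assume a 12-hour tolerance for illustration): for a
// batch dated 2013-07-01T00:00Z whose minStatusID decodes to a September 2012 timestamp, the
// earliest acceptable time is 2013-06-30T12:00Z, so the returned minStatusID becomes
// SnowflakeIdParser.generateValidStatusId(<millis for 2013-06-30T12:00Z>, 0) rather than the
// badly out-of-order original.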
/**
* Returns a reader that reads tweets from the given directory.
*
* @param archiveSegment Determines the timeslice ID of all read tweets.
* @param tweetsPath The path to the directory where the tweets for this day are stored.
* @param documentFactory The ThriftIndexingEvent to TweetDocument converter.
*/
public RecordReader<TweetDocument> getTweetReaders(
ArchiveSegment archiveSegment,
Path tweetsPath,
DocumentFactory<ThriftIndexingEvent> documentFactory) throws IOException {
RecordReader<TweetDocument> tweetDocumentReader =
new TransformingRecordReader<>(
createTweetReader(tweetsPath), new Function<ThriftIndexingEvent, TweetDocument>() {
@Override
public TweetDocument apply(ThriftIndexingEvent event) {
return new TweetDocument(
event.getSortId(),
archiveSegment.getTimeSliceID(),
EarlybirdThriftDocumentUtil.getCreatedAtMs(event.getDocument()),
documentFactory.newDocument(event)
);
}
});
tweetDocumentReader.setExhaustStream(true);
return tweetDocumentReader;
}
private RecordReader<ThriftIndexingEvent> createTweetReader(Path tweetsPath) throws IOException {
if (date.before(START_DATE_INCLUSIVE)) {
return new EmptyRecordReader<>();
}
List<RecordReader<ThriftIndexingEvent>> readers = Lists.newArrayList();
FileSystem hdfs = HdfsUtil.getHdfsFileSystem();
try {
Path dayPath = new Path(tweetsPath, ArchiveHDFSUtils.dateToPath(date, "/"));
Path partitionPath =
new Path(dayPath, String.format("p_%d_of_%d", hashPartitionID, numHashPartitions));
PathFilter pathFilter =
Config.environmentIsTest() ? TXT_DATA_FILES_FILTER : LZO_DATA_FILES_FILTER;
FileStatus[] files = hdfs.listStatus(partitionPath, pathFilter);
for (FileStatus fileStatus : files) {
String fileStatusPath = fileStatus.getPath().toString().replaceAll("file:/", "/");
RecordReader<ThriftIndexingEvent> reader = createRecordReaderWithRetries(fileStatusPath);
readers.add(reader);
}
} finally {
IOUtils.closeQuietly(hdfs);
}
if (readers.isEmpty()) {
return new EmptyRecordReader<>();
}
return new MergingSortedRecordReader<>(DESC_THRIFT_INDEXING_EVENT_COMPARATOR, readers);
}
private RecordReader<ThriftIndexingEvent> createRecordReaderWithRetries(String filePath)
throws IOException {
Predicate<ThriftIndexingEvent> recordFilter = getRecordFilter();
int numTries = 0;
while (true) {
try {
++numTries;
return new LzoThriftBlockFileReader<>(filePath, ThriftIndexingEvent.class, recordFilter);
} catch (IOException e) {
if (numTries < READER_INIT_IOEXCEPTION_RETRIES) {
LOG.warn("Failed to open LzoThriftBlockFileReader for " + filePath + ". Will retry.", e);
} else {
LOG.error("Failed to open LzoThriftBlockFileReader for " + filePath
+ " after too many retries.", e);
throw e;
}
}
}
}
private Predicate<ThriftIndexingEvent> getRecordFilter() {
return Config.environmentIsTest() ? null : input -> {
if (input == null) {
return false;
}
// We only guard against status IDs that are too small, because it is possible
// for a very old tweet to get into today's batch, but not possible for a very
// large ID (a future tweet ID that has not been published yet) to get into today's
// batch, unless tweet ID generation messed up.
long statusId = input.getSortId();
boolean keep = statusId >= minStatusID;
if (!keep) {
LOG.debug("Out of order documentId: {} minStatusID: {} Date: {} Path: {}",
statusId, minStatusID, date, path);
OUT_OF_ORDER_STATUSES_SKIPPED.increment();
}
return keep;
};
}
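// Snowflake IDs are time-ordered, so the ID comparison above is effectively a timestamp
// comparison: e.g., if minStatusID was adjusted to correspond to 2013-06-30T12:00Z
// (hypothetical value), any event whose sortId was generated earlier is dropped and counted
// in OUT_OF_ORDER_STATUSES_SKIPPED.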
/**
* Returns the number of statuses in this batch
*/
public int getStatusCount() {
return statusCount;
}
/**
 * Returns whether the _status_count file was found in this folder.
 */
public boolean hasStatusCount() {
return hasStatusCountFile;
}
public long getMinStatusID() {
return minStatusID;
}
public long getMaxStatusID() {
return maxStatusID;
}
public Date getDate() {
return date;
}
public Path getPath() {
return path;
}
/**
 * Checks whether the partition is
 * . empty and
 * . disallowed (empty partitions can only happen before 2010).
 * (An empty partition means that the directory was missing when the scan happened.)
 *
 * @return true if the partition has no documents and being empty is not allowed.
 */
public boolean isDisallowedEmptyPartition() {
return hasStatusCountFile
&& statusCount == 0
&& minStatusID == DailyStatusBatch.EMPTY_BATCH_STATUS_ID
&& maxStatusID == DailyStatusBatch.EMPTY_BATCH_STATUS_ID
&& date.after(getEarliestDenseDay());
}
@Override
public String toString() {
return "PartitionedBatch[hashPartitionId=" + hashPartitionID
+ ",numHashPartitions=" + numHashPartitions
+ ",date=" + date
+ ",path=" + path
+ ",hasStatusCountFile=" + hasStatusCountFile
+ ",statusCount=" + statusCount + "]";
}
private Date getEarliestDenseDay() {
return EarlybirdConfig.getDate("archive_search_earliest_dense_day");
}
}

View File

@ -1,64 +0,0 @@
java_library(
name = "segment_builder_lib",
sources = ["**/*.java"],
platform = "java8",
tags = [
"bazel-compatible",
"bazel-only",
],
dependencies = [
"3rdparty/jvm/com/google/guava",
"3rdparty/jvm/com/google/inject:guice",
"3rdparty/jvm/org/apache/bookkeeper:bookkeeper-server",
"3rdparty/jvm/org/apache/bookkeeper:bookkeeper-twitter-science-provider",
"3rdparty/jvm/org/apache/hadoop:hadoop-client-default",
"3rdparty/jvm/org/apache/thrift:libthrift",
"3rdparty/jvm/org/apache/zookeeper:zookeeper-client",
"3rdparty/jvm/org/slf4j:slf4j-api",
"decider/src/main/scala",
"finatra/inject/inject-core/src/main/scala",
"finatra/inject/inject-server/src/main/scala/com/twitter/inject/server",
"src/java/com/twitter/common/base",
"src/java/com/twitter/common/quantity",
"src/java/com/twitter/common/util:system-mocks",
"src/java/com/twitter/common_internal/text/version",
"src/java/com/twitter/search/common/config",
"src/java/com/twitter/search/common/database",
"src/java/com/twitter/search/common/metrics",
"src/java/com/twitter/search/common/partitioning/base",
"src/java/com/twitter/search/common/partitioning/zookeeper",
"src/java/com/twitter/search/common/schema",
"src/java/com/twitter/search/common/schema/base",
"src/java/com/twitter/search/common/util:closeresourceutil",
"src/java/com/twitter/search/common/util:gcutil",
"src/java/com/twitter/search/common/util:kerberos",
"src/java/com/twitter/search/common/util/date",
"src/java/com/twitter/search/common/util/io:flushable",
"src/java/com/twitter/search/common/util/zktrylock",
"src/java/com/twitter/search/common/util/zookeeper",
"src/java/com/twitter/search/earlybird:earlybird-lib",
"src/java/com/twitter/search/earlybird/common",
"src/java/com/twitter/search/earlybird/common/config",
"src/java/com/twitter/search/earlybird/common/userupdates",
"util/util-core:scala",
],
)
# Using hadoop_binary target can automatically exclude hadoop related jars in the built jar
# and load in the right jars based on hadoop config.
hadoop_binary(
name = "segment_builder_binary",
basename = "segment_builder",
main = "com.twitter.search.earlybird.archive.segmentbuilder.SegmentBuilderMain",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":segment_builder_lib",
"src/java/com/twitter/search/common/logging:search-log4j",
],
)

View File

@ -1,29 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;
import com.twitter.search.earlybird.index.EarlybirdSegmentFactory;
import com.twitter.search.earlybird.partition.SegmentInfo;
import com.twitter.search.earlybird.partition.SegmentSyncConfig;
public class BuiltAndFinalizedSegment extends SegmentBuilderSegment {
public BuiltAndFinalizedSegment(
SegmentInfo segmentInfo,
SegmentConfig segmentConfig,
EarlybirdSegmentFactory earlybirdSegmentFactory,
int alreadyRetriedCount,
SegmentSyncConfig sync) {
super(segmentInfo, segmentConfig, earlybirdSegmentFactory, alreadyRetriedCount, sync);
}
@Override
public SegmentBuilderSegment handle() throws SegmentInfoConstructionException,
SegmentUpdaterException {
throw new IllegalStateException("Should not handle a BuiltAndFinalizedSegment.");
}
@Override
public boolean isBuilt() {
return true;
}
}

View File

@ -1,101 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;
import java.util.concurrent.atomic.AtomicBoolean;
import com.google.common.base.Stopwatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.common.util.Clock;
import com.twitter.search.common.util.GCUtil;
import com.twitter.search.common.util.zktrylock.TryLock;
import com.twitter.search.earlybird.archive.ArchiveSegmentUpdater;
import com.twitter.search.earlybird.index.EarlybirdSegmentFactory;
import com.twitter.search.earlybird.partition.SegmentInfo;
import com.twitter.search.earlybird.partition.SegmentSyncConfig;
public class NotYetBuiltSegment extends SegmentBuilderSegment {
private static final Logger LOG = LoggerFactory.getLogger(NotYetBuiltSegment.class);
public NotYetBuiltSegment(
SegmentInfo segmentInfo,
SegmentConfig segmentConfig,
EarlybirdSegmentFactory earlybirdSegmentFactory,
int alreadyRetriedCount,
SegmentSyncConfig sync) {
super(segmentInfo, segmentConfig, earlybirdSegmentFactory, alreadyRetriedCount, sync);
}
/**
 * 1. Grab the ZK lock for this segment.
 * 2a. If the lock fails, another host is updating; return the SOMEONE_ELSE_IS_BUILDING state.
 * 2b. If the lock succeeds, check again whether the updated segment exists on HDFS.
 * 3a. If so, just move on.
 * 3b. If not, update the segment.
 * In both cases, we need to check if the segment can now be marked as BUILT_AND_FINALIZED.
 */
@Override
public SegmentBuilderSegment handle()
throws SegmentUpdaterException, SegmentInfoConstructionException {
LOG.info("Handling a not yet built segment: {}", this.getSegmentName());
Stopwatch stopwatch = Stopwatch.createStarted();
TryLock lock = getZooKeeperTryLock();
// The closure passed to tryWithLock() can only access local variables that are (effectively)
// final. However, we would like to pass the updateSegment() result back to the enclosing
// method, so we use an AtomicBoolean reference instead of a Boolean.
final AtomicBoolean successRef = new AtomicBoolean(false);
boolean gotLock = lock.tryWithLock(() -> {
ArchiveSegmentUpdater updater = new ArchiveSegmentUpdater(
segmentConfig.getTryLockFactory(),
sync,
segmentConfig.getEarlybirdIndexConfig(),
Clock.SYSTEM_CLOCK);
boolean success = updater.updateSegment(segmentInfo);
successRef.set(success);
});
if (!gotLock) {
LOG.info("cannot acquire zookeeper lock for: " + segmentInfo);
return new SomeoneElseIsBuildingSegment(
segmentInfo,
segmentConfig,
earlybirdSegmentFactory,
alreadyRetriedCount,
sync);
}
// 1. We want to make sure the heap is clean right after building a segment, so that it's
// ready for allocations for a new segment; we have seen OOMs while building in the past.
// 2. We also believe the full GC helps with compaction (vs. just organically running CMS,
// which cleans up the heap but may leave it in a fragmented state): running a full GC is
// supposed to compact the remaining tenured space.
GCUtil.runGC();
if (successRef.get()) {
LOG.info("Indexing segment {} took {}", segmentInfo, stopwatch);
LOG.info("Finished building {}", segmentInfo.getSegment().getSegmentName());
return new BuiltAndFinalizedSegment(
segmentInfo, segmentConfig, earlybirdSegmentFactory, 0, sync);
} else {
int alreadyTried = alreadyRetriedCount + 1;
String errMsg = "failed updating segment for: " + segmentInfo
+ " for " + alreadyTried + " times";
LOG.error(errMsg);
if (alreadyTried < segmentConfig.getMaxRetriesOnFailure()) {
return new NotYetBuiltSegment(
createNewSegmentInfo(segmentInfo),
segmentConfig,
earlybirdSegmentFactory,
alreadyTried,
sync);
} else {
throw new SegmentUpdaterException(errMsg);
}
}
}
}

View File

@ -1,39 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;
import java.util.HashMap;
import java.util.Map;
import com.twitter.common.util.Clock;
/**
* A class that prevents handling a given segment more than once every hdfsCheckIntervalMillis.
*/
public class RateLimitingSegmentHandler {
private final long hdfsCheckIntervalMillis;
private final Clock clock;
private final Map<String, Long> segmentNameToLastUpdatedTimeMillis = new HashMap<>();
RateLimitingSegmentHandler(long hdfsCheckIntervalMillis, Clock clock) {
this.hdfsCheckIntervalMillis = hdfsCheckIntervalMillis;
this.clock = clock;
}
SegmentBuilderSegment processSegment(SegmentBuilderSegment segment)
throws SegmentUpdaterException, SegmentInfoConstructionException {
String segmentName = segment.getSegmentName();
Long lastUpdatedMillis = segmentNameToLastUpdatedTimeMillis.get(segmentName);
if (lastUpdatedMillis == null) {
lastUpdatedMillis = 0L;
}
long nowMillis = clock.nowMillis();
if (nowMillis - lastUpdatedMillis < hdfsCheckIntervalMillis) {
return segment;
}
segmentNameToLastUpdatedTimeMillis.put(segmentName, nowMillis);
return segment.handle();
}
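// Usage sketch (hypothetical): the builder loop re-submits every live segment on each pass;
// within hdfsCheckIntervalMillis of the last attempt a segment is returned unchanged, so
// handle() - and the HDFS checks behind it - runs at most once per interval per segment name.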
}

View File

@ -1,540 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.base.Stopwatch;
import com.google.common.collect.ComparisonChain;
import com.google.common.collect.ImmutableList;
import com.google.common.util.concurrent.Uninterruptibles;
import com.google.inject.Inject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.common.quantity.Amount;
import com.twitter.common.quantity.Time;
import com.twitter.common.util.Clock;
import com.twitter.decider.Decider;
import com.twitter.inject.annotations.Flag;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.metrics.SearchLongGauge;
import com.twitter.search.common.metrics.SearchStatsReceiver;
import com.twitter.search.common.metrics.SearchStatsReceiverImpl;
import com.twitter.search.common.partitioning.zookeeper.SearchZkClient;
import com.twitter.search.common.util.Kerberos;
import com.twitter.search.common.util.zktrylock.ZooKeeperTryLockFactory;
import com.twitter.search.earlybird.archive.ArchiveOnDiskEarlybirdIndexConfig;
import com.twitter.search.earlybird.archive.ArchiveSegment;
import com.twitter.search.earlybird.archive.DailyStatusBatches;
import com.twitter.search.earlybird.archive.ArchiveTimeSlicer;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
import com.twitter.search.earlybird.util.ScrubGenUtil;
import com.twitter.search.earlybird.exception.CriticalExceptionHandler;
import com.twitter.search.earlybird.index.EarlybirdSegmentFactory;
import com.twitter.search.earlybird.partition.SearchIndexingMetricSet;
import com.twitter.search.earlybird.partition.SegmentInfo;
import com.twitter.search.earlybird.partition.SegmentSyncConfig;
import com.twitter.search.earlybird.stats.EarlybirdSearcherStats;
/**
 * This class provides the core logic to build segment indices offline.
 * Each server coordinates via ZooKeeper to pick the next segment, builds the indices for it
 * and uploads them to HDFS. A state machine is used to handle the build state transitions.
 * There are three states:
 * NOT_YET_BUILT: a segment that still needs to be built
 * SOMEONE_ELSE_IS_BUILDING: another server is building the segment.
 * BUILT_AND_FINALIZED: the indices of this segment have already been built.
 */
public class SegmentBuilder {
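// State-transition sketch (derived from the segment classes in this package):
//   NOT_YET_BUILT -> BUILT_AND_FINALIZED             on a successful local build
//   NOT_YET_BUILT -> SOMEONE_ELSE_IS_BUILDING        when the ZK lock is held by another host
//   SOMEONE_ELSE_IS_BUILDING -> NOT_YET_BUILT        when the lock frees and no index is on HDFS
//   SOMEONE_ELSE_IS_BUILDING -> BUILT_AND_FINALIZED  when the built segment is found on HDFS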
private static final Logger LOG = LoggerFactory.getLogger(SegmentBuilder.class);
private final boolean onlyRunOnce;
private final int waitBetweenLoopsMins;
private final int startUpBatchSize;
private final int instance;
private final int waitBetweenSegmentsSecs;
private final int waitBeforeQuitMins;
// When multiple segment builders start simultaneously, they might overwhelm the HDFS name
// node and ZooKeeper. So we let some instances sleep before they start, to avoid these
// issues.
private final long startUpSleepMins;
// If there are no more segments to build, wait this interval before checking again.
private final long processWaitingInterval = TimeUnit.MINUTES.toMillis(10);
// The hash partitions for which segments will be built.
private final ImmutableList<Integer> hashPartitions;
private final SearchStatsReceiver statsReceiver = new SearchStatsReceiverImpl();
private final SearchIndexingMetricSet searchIndexingMetricSet =
new SearchIndexingMetricSet(statsReceiver);
private final EarlybirdSearcherStats searcherStats =
new EarlybirdSearcherStats(statsReceiver);
private final ArchiveOnDiskEarlybirdIndexConfig earlybirdIndexConfig;
private final ZooKeeperTryLockFactory zkTryLockFactory;
private final RateLimitingSegmentHandler segmentHandler;
private final Clock clock;
private final int numSegmentBuilderPartitions;
private final int myPartitionId;
private final SegmentConfig segmentConfig;
private final EarlybirdSegmentFactory segmentFactory;
private final SegmentBuilderCoordinator segmentBuilderCoordinator;
private final SegmentSyncConfig segmentSyncConfig;
private final Random random = new Random();
private static final double SLEEP_RANDOMIZATION_RATIO = .2;
// Stats
// The flush version used to build segments
private static final SearchLongGauge CURRENT_FLUSH_VERSION =
SearchLongGauge.export("current_flush_version");
// Accumulated number and time in seconds spent on building segments locally
private static SearchCounter segmentsBuiltLocally =
SearchCounter.export("segments_built_locally");
private static SearchCounter timeSpentOnSuccessfulBuildSecs =
SearchCounter.export("time_spent_on_successful_build_secs");
// The total number of segments to be built
private static final SearchLongGauge SEGMENTS_TO_BUILD =
SearchLongGauge.export("segments_to_build");
// How many segments failed locally
private static final SearchCounter FAILED_SEGMENTS =
SearchCounter.export("failed_segments");
@Inject
protected SegmentBuilder(@Flag("onlyRunOnce") boolean onlyRunOnceFlag,
@Flag("waitBetweenLoopsMins") int waitBetweenLoopsMinsFlag,
@Flag("startup_batch_size") int startUpBatchSizeFlag,
@Flag("instance") int instanceFlag,
@Flag("segmentZkLockExpirationHours")
int segmentZkLockExpirationHoursFlag,
@Flag("startupSleepMins") long startupSleepMinsFlag,
@Flag("maxRetriesOnFailure") int maxRetriesOnFailureFlag,
@Flag("hash_partitions") List<Integer> hashPartitionsFlag,
@Flag("numSegmentBuilderPartitions") int numSegmentBuilderPartitionsFlag,
@Flag("waitBetweenSegmentsSecs") int waitBetweenSegmentsSecsFlag,
@Flag("waitBeforeQuitMins") int waitBeforeQuitMinsFlag,
@Flag("scrubGen") String scrubGen,
Decider decider) {
this(onlyRunOnceFlag,
waitBetweenLoopsMinsFlag,
startUpBatchSizeFlag,
instanceFlag,
segmentZkLockExpirationHoursFlag,
startupSleepMinsFlag,
hashPartitionsFlag,
maxRetriesOnFailureFlag,
waitBetweenSegmentsSecsFlag,
waitBeforeQuitMinsFlag,
SearchZkClient.getSZooKeeperClient().createZooKeeperTryLockFactory(),
new RateLimitingSegmentHandler(TimeUnit.MINUTES.toMillis(10), Clock.SYSTEM_CLOCK),
Clock.SYSTEM_CLOCK,
numSegmentBuilderPartitionsFlag,
decider,
getSyncConfig(scrubGen));
}
@VisibleForTesting
protected SegmentBuilder(boolean onlyRunOnceFlag,
int waitBetweenLoopsMinsFlag,
int startUpBatchSizeFlag,
int instanceFlag,
int segmentZkLockExpirationHoursFlag,
long startupSleepMinsFlag,
List<Integer> hashPartitions,
int maxRetriesOnFailure,
int waitBetweenSegmentsSecsFlag,
int waitBeforeQuitMinsFlag,
ZooKeeperTryLockFactory zooKeeperTryLockFactory,
RateLimitingSegmentHandler segmentHandler,
Clock clock,
int numSegmentBuilderPartitions,
Decider decider,
SegmentSyncConfig syncConfig) {
LOG.info("Creating SegmentBuilder");
LOG.info("Penguin version in use: " + EarlybirdConfig.getPenguinVersion());
// Set command line flag values
this.onlyRunOnce = onlyRunOnceFlag;
this.waitBetweenLoopsMins = waitBetweenLoopsMinsFlag;
this.startUpBatchSize = startUpBatchSizeFlag;
this.instance = instanceFlag;
this.waitBetweenSegmentsSecs = waitBetweenSegmentsSecsFlag;
this.waitBeforeQuitMins = waitBeforeQuitMinsFlag;
this.segmentHandler = segmentHandler;
this.zkTryLockFactory = zooKeeperTryLockFactory;
this.segmentSyncConfig = syncConfig;
this.startUpSleepMins = startupSleepMinsFlag;
if (!hashPartitions.isEmpty()) {
this.hashPartitions = ImmutableList.copyOf(hashPartitions);
} else {
this.hashPartitions = null;
}
Amount<Long, Time> segmentZKLockExpirationTime = Amount.of((long)
segmentZkLockExpirationHoursFlag, Time.HOURS);
this.earlybirdIndexConfig =
new ArchiveOnDiskEarlybirdIndexConfig(decider, searchIndexingMetricSet,
new CriticalExceptionHandler());
this.segmentConfig = new SegmentConfig(
earlybirdIndexConfig,
segmentZKLockExpirationTime,
maxRetriesOnFailure,
zkTryLockFactory);
this.segmentFactory = new EarlybirdSegmentFactory(
earlybirdIndexConfig,
searchIndexingMetricSet,
searcherStats,
clock);
this.segmentBuilderCoordinator = new SegmentBuilderCoordinator(
zkTryLockFactory, syncConfig, clock);
this.clock = clock;
this.numSegmentBuilderPartitions = numSegmentBuilderPartitions;
this.myPartitionId = instance % numSegmentBuilderPartitions;
SearchLongGauge.export("segment_builder_partition_id_" + myPartitionId).set(1);
CURRENT_FLUSH_VERSION.set(earlybirdIndexConfig.getSchema().getMajorVersionNumber());
}
void run() {
LOG.info("Config values: {}", EarlybirdConfig.allValuesAsString());
// Sleep some time uninterruptibly before getting started, so that if multiple instances are
// running, the HDFS name node and ZooKeeper won't be overwhelmed.
// Say we have 100 instances (the instance arg will have a value from 0 - 99), our
// STARTUP_BATCH_SIZE_ARG is 20, and startUpSleepMins is 3 minutes. Then the first 20 instances
// will not sleep but start immediately, instances 20 - 39 will sleep 3 minutes and then start
// to run, instances 40 - 59 will sleep 6 minutes, instances 60 - 79 will sleep 9 minutes, and
// so forth.
long sleepTime = instance / startUpBatchSize * startUpSleepMins;
LOG.info("Instance={}, Start up batch size={}", instance, startUpBatchSize);
LOG.info("Sleep {} minutes to void HDFS name node and ZooKeeper overwhelmed.", sleepTime);
Uninterruptibles.sleepUninterruptibly(sleepTime, TimeUnit.MINUTES);
// Kinit here.
Kerberos.kinit(
EarlybirdConfig.getString("kerberos_user", ""),
EarlybirdConfig.getString("kerberos_keytab_path", "")
);
long waitBetweenLoopsMs = TimeUnit.MINUTES.toMillis(waitBetweenLoopsMins);
if (onlyRunOnce) {
LOG.info("This segment builder will run the full rebuild of all the segments");
} else {
LOG.info("This segment builder will incrementally check for new data and rebuilt "
+ "current segments as needed.");
LOG.info("The waiting interval between two new data checking is: "
+ waitBetweenLoopsMs + " ms.");
}
boolean scrubGenPresent = segmentSyncConfig.getScrubGen().isPresent();
LOG.info("Scrub gen present: {}", scrubGenPresent);
boolean scrubGenDataFullyBuilt = segmentBuilderCoordinator.isScrubGenDataFullyBuilt(instance);
LOG.info("Scrub gen data fully built: {}", scrubGenDataFullyBuilt);
if (!scrubGenPresent || scrubGenDataFullyBuilt) {
LOG.info("Starting segment building loop...");
while (!Thread.currentThread().isInterrupted()) {
try {
indexingLoop();
if (onlyRunOnce) {
LOG.info("only run once is true, breaking");
break;
}
clock.waitFor(waitBetweenLoopsMs);
} catch (InterruptedException e) {
LOG.info("Interrupted, quitting segment builder");
Thread.currentThread().interrupt();
} catch (SegmentInfoConstructionException e) {
LOG.error("Error creating new segmentInfo, quitting segment builder: ", e);
break;
} catch (SegmentUpdaterException e) {
FAILED_SEGMENTS.increment();
// Before the segment builder quits, sleep for WAIT_BEFORE_QUIT_MINS minutes so that the
// FAILED_SEGMENTS stat can be exported.
try {
clock.waitFor(TimeUnit.MINUTES.toMillis(waitBeforeQuitMins));
} catch (InterruptedException ex) {
LOG.info("Interrupted, quitting segment builder");
Thread.currentThread().interrupt();
}
LOG.error("SegmentUpdater processing segment error, quitting segment builder: ", e);
break;
}
}
} else {
LOG.info("Cannot build the segments for scrub gen yet.");
}
}
// The run loop is factored out here for unit testing.
@VisibleForTesting
void indexingLoop()
throws SegmentInfoConstructionException, InterruptedException, SegmentUpdaterException {
// This map contains all the segments to be processed; if a segment is built, it will be removed
// from the map.
Map<String, SegmentBuilderSegment> buildableSegmentInfoMap;
try {
buildableSegmentInfoMap = createSegmentInfoMap();
printSegmentInfoMap(buildableSegmentInfoMap);
} catch (IOException e) {
LOG.error("Error creating segmentInfoMap: ", e);
return;
}
while (!buildableSegmentInfoMap.isEmpty()) {
boolean hasBuiltSegment = processSegments(buildableSegmentInfoMap);
if (!hasBuiltSegment) {
// No segment was built in this pass, so wait before checking again. (If we did build a
// segment, there is no need to sleep, since building a segment already takes a long time.)
clock.waitFor(processWaitingInterval);
}
}
}
// Actual shutdown.
protected void doShutdown() {
LOG.info("doShutdown()...");
try {
earlybirdIndexConfig.getResourceCloser().shutdownExecutor();
} catch (InterruptedException e) {
LOG.error("Interrupted during shutdown. ", e);
}
LOG.info("Segment builder stopped!");
}
private List<ArchiveTimeSlicer.ArchiveTimeSlice> createTimeSlices() throws IOException {
Preconditions.checkState(segmentSyncConfig.getScrubGen().isPresent());
Date scrubGen = ScrubGenUtil.parseScrubGenToDate(segmentSyncConfig.getScrubGen().get());
final DailyStatusBatches dailyStatusBatches =
new DailyStatusBatches(zkTryLockFactory, scrubGen);
final ArchiveTimeSlicer archiveTimeSlicer = new ArchiveTimeSlicer(
EarlybirdConfig.getMaxSegmentSize(), dailyStatusBatches, earlybirdIndexConfig);
Stopwatch stopwatch = Stopwatch.createStarted();
List<ArchiveTimeSlicer.ArchiveTimeSlice> timeSlices = archiveTimeSlicer.getTimeSlices();
if (timeSlices == null) {
LOG.error("Failed to load timeslice map after {}", stopwatch);
return Collections.emptyList();
}
LOG.info("Took {} to get timeslices", stopwatch);
return timeSlices;
}
private static class TimeSliceAndHashPartition implements Comparable<TimeSliceAndHashPartition> {
public final ArchiveTimeSlicer.ArchiveTimeSlice timeSlice;
public final Integer hashPartition;
public TimeSliceAndHashPartition(
ArchiveTimeSlicer.ArchiveTimeSlice timeSlice,
Integer hashPartition) {
this.timeSlice = timeSlice;
this.hashPartition = hashPartition;
}
@Override
public int compareTo(TimeSliceAndHashPartition o) {
Integer myHashPartition = this.hashPartition;
Integer otherHashPartition = o.hashPartition;
long myTimeSliceId = this.timeSlice.getMinStatusID(myHashPartition);
long otherTimeSliceId = o.timeSlice.getMinStatusID(otherHashPartition);
return ComparisonChain.start()
.compare(myHashPartition, otherHashPartition)
.compare(myTimeSliceId, otherTimeSliceId)
.result();
}
}
/**
* For all the timeslices, create the corresponding SegmentInfo and store in a map
*/
@VisibleForTesting
Map<String, SegmentBuilderSegment> createSegmentInfoMap() throws IOException {
final List<ArchiveTimeSlicer.ArchiveTimeSlice> timeSlices = createTimeSlices();
List<TimeSliceAndHashPartition> timeSlicePairs = createPairs(timeSlices);
// Export how many segments should be built
SEGMENTS_TO_BUILD.set(timeSlicePairs.size());
LOG.info("Total number of segments to be built across all segment builders: {}",
timeSlicePairs.size());
List<TimeSliceAndHashPartition> mySegments = getSegmentsForMyPartition(timeSlicePairs);
Map<String, SegmentBuilderSegment> segmentInfoMap = new HashMap<>();
for (TimeSliceAndHashPartition mySegment : mySegments) {
ArchiveSegment segment = new ArchiveSegment(mySegment.timeSlice, mySegment.hashPartition,
EarlybirdConfig.getMaxSegmentSize());
SegmentInfo segmentInfo = new SegmentInfo(segment, segmentFactory, segmentSyncConfig);
segmentInfoMap.put(segmentInfo.getSegment().getSegmentName(), new NotYetBuiltSegment(
segmentInfo, segmentConfig, segmentFactory, 0, segmentSyncConfig));
}
return segmentInfoMap;
}
private List<TimeSliceAndHashPartition> createPairs(
List<ArchiveTimeSlicer.ArchiveTimeSlice> timeSlices) {
List<TimeSliceAndHashPartition> timeSlicePairs = new ArrayList<>();
for (ArchiveTimeSlicer.ArchiveTimeSlice slice : timeSlices) {
List<Integer> localPartitions = hashPartitions;
if (localPartitions == null) {
localPartitions = range(slice.getNumHashPartitions());
}
for (Integer partition : localPartitions) {
timeSlicePairs.add(new TimeSliceAndHashPartition(slice, partition));
}
}
return timeSlicePairs;
}
private List<TimeSliceAndHashPartition> getSegmentsForMyPartition(
List<TimeSliceAndHashPartition> timeSlicePairs) {
Collections.sort(timeSlicePairs);
List<TimeSliceAndHashPartition> myTimeSlices = new ArrayList<>();
for (int i = myPartitionId; i < timeSlicePairs.size(); i += numSegmentBuilderPartitions) {
myTimeSlices.add(timeSlicePairs.get(i));
}
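// E.g., with numSegmentBuilderPartitions = 100, myPartitionId = 7 picks sorted indices
// 7, 107, 207, ..., striping the segments evenly across all builders.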
LOG.info("Getting segments to be built for partition: {}", myPartitionId);
LOG.info("Total number of partitions: {}", numSegmentBuilderPartitions);
LOG.info("Number of segments picked: {}", myTimeSlices.size());
return myTimeSlices;
}
/**
* Print out the segmentInfo Map for debugging
*/
private void printSegmentInfoMap(Map<String, SegmentBuilderSegment> segmentInfoMap) {
LOG.info("SegmentInfoMap: ");
for (Map.Entry<String, SegmentBuilderSegment> entry : segmentInfoMap.entrySet()) {
LOG.info(entry.getValue().toString());
}
LOG.info("Total SegmentInfoMap size: " + segmentInfoMap.size() + ". done.");
}
/**
* Build indices or refresh state for the segments in the specified segmentInfoMap, which only
* contains the segments that need to build or are building. When a segment has not been built,
* it is built here. If built successfully, it will be removed from the map; otherwise, its
* state will be updated in the map.
*
* Returns true iff this process has built a segment.
*/
@VisibleForTesting
boolean processSegments(Map<String, SegmentBuilderSegment> segmentInfoMap)
throws SegmentInfoConstructionException, SegmentUpdaterException, InterruptedException {
boolean hasBuiltSegment = false;
Iterator<Map.Entry<String, SegmentBuilderSegment>> iter =
segmentInfoMap.entrySet().iterator();
while (iter.hasNext()) {
Map.Entry<String, SegmentBuilderSegment> entry = iter.next();
SegmentBuilderSegment originalSegment = entry.getValue();
LOG.info("About to process segment: {}", originalSegment.getSegmentName());
long startMillis = System.currentTimeMillis();
SegmentBuilderSegment updatedSegment = segmentHandler.processSegment(originalSegment);
if (updatedSegment.isBuilt()) {
iter.remove();
hasBuiltSegment = true;
if (originalSegment instanceof NotYetBuiltSegment) {
// Record the total time spent on successfully building a segment, used to compute the
// average segment building time.
long timeSpent = System.currentTimeMillis() - startMillis;
segmentsBuiltLocally.increment();
timeSpentOnSuccessfulBuildSecs.add(timeSpent / 1000);
}
} else {
entry.setValue(updatedSegment);
}
clock.waitFor(getSegmentSleepTime());
}
return hasBuiltSegment;
}
private long getSegmentSleepTime() {
// The Hadoop name node can handle only about 200 requests/sec before it gets overloaded.
// Updating the state of a node that has been built takes about 1 second. In the worst case
// scenario with 800 segment builders, we end up with about 800 requests/sec. Adding a 10
// second sleep lowers the worst case to about 80 requests/sec.
long sleepMillis = TimeUnit.SECONDS.toMillis(waitBetweenSegmentsSecs);
// Use randomization so that the segment builders don't all hit it at the exact same time.
int lowerSleepBoundMillis = (int) (sleepMillis * (1.0 - SLEEP_RANDOMIZATION_RATIO));
int upperSleepBoundMillis = (int) (sleepMillis * (1.0 + SLEEP_RANDOMIZATION_RATIO));
return randRange(lowerSleepBoundMillis, upperSleepBoundMillis);
}
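// E.g., with waitBetweenSegmentsSecs = 10 and SLEEP_RANDOMIZATION_RATIO = 0.2, the sleep is
// drawn uniformly from [8000, 12000] ms.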
/**
* Returns a pseudo-random number between min and max, inclusive.
*/
private int randRange(int min, int max) {
return random.nextInt((max - min) + 1) + min;
}
/**
* Returns list of integers 0, 1, 2, ..., count-1.
*/
private static List<Integer> range(int count) {
List<Integer> nums = new ArrayList<>(count);
for (int i = 0; i < count; i++) {
nums.add(i);
}
return nums;
}
private static SegmentSyncConfig getSyncConfig(String scrubGen) {
if (scrubGen == null || scrubGen.isEmpty()) {
throw new RuntimeException(
"Scrub gen expected, but could not get it from the arguments.");
}
LOG.info("Scrub gen: " + scrubGen);
return new SegmentSyncConfig(Optional.of(scrubGen));
}
}

View File

@ -1,109 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;
import java.util.Collection;
import com.google.common.collect.ImmutableList;
import com.google.inject.Module;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.app.Flaggable;
import com.twitter.inject.server.AbstractTwitterServer;
import com.twitter.util.Future;
import com.twitter.util.Time;
public class SegmentBuilderApp extends AbstractTwitterServer {
private static final Logger LOG = LoggerFactory.getLogger(SegmentBuilderApp.class);
public SegmentBuilderApp() {
createFlag("onlyRunOnce",
true,
"whether to stop segment builder after one loop",
Flaggable.ofBoolean());
createFlag("waitBetweenLoopsMins",
60,
"how many minutes to wait between building loops",
Flaggable.ofInt());
createFlag("startup_batch_size",
30,
"How many instances can start and read timeslice info from HDFS at the same time. "
+ "If you don't know what this parameter is, please do not change this parameter.",
Flaggable.ofInt());
createFlag("instance",
20,
"the job instance number",
Flaggable.ofInt());
createFlag("segmentZkLockExpirationHours",
0,
"max hours to hold the zookeeper lock while building segment",
Flaggable.ofInt());
createFlag("startupSleepMins",
2L,
"sleep multiplier of startupSleepMins before job runs",
Flaggable.ofLong());
createFlag("maxRetriesOnFailure",
3,
"how many times we should try to rebuild a segment when failure happens",
Flaggable.ofInt());
createFlag("hash_partitions",
ImmutableList.of(),
"comma separated hash partition ids, e.g., 0,1,3,4. "
+ "If not specified, all the partitions will be built.",
Flaggable.ofJavaList(Flaggable.ofInt()));
createFlag("numSegmentBuilderPartitions",
100,
"Number of partitions for dividing up all segment builder work",
Flaggable.ofInt());
createFlag("waitBetweenSegmentsSecs",
10,
"Time to sleep between processing segments.",
Flaggable.ofInt());
createFlag("waitBeforeQuitMins",
2,
"How many minutes to sleep before quitting.",
Flaggable.ofInt());
createFlag("scrubGen",
"",
"Scrub gen for which segment builders should be run.",
Flaggable.ofString());
}
@Override
public void start() {
SegmentBuilder segmentBuilder = injector().instance(SegmentBuilder.class);
closeOnExit((Time time) -> {
segmentBuilder.doShutdown();
return Future.Unit();
});
LOG.info("Starting run()");
segmentBuilder.run();
LOG.info("run() complete");
// Now shutdown
shutdown();
}
protected void shutdown() {
LOG.info("Calling close() to initiate shutdown");
close();
}
@Override
public Collection<Module> javaModules() {
return ImmutableList.of(new SegmentBuilderModule());
}
}

View File

@ -1,200 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;
import java.io.IOException;
import java.util.Date;
import java.util.Optional;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.common.quantity.Amount;
import com.twitter.common.quantity.Time;
import com.twitter.common.util.Clock;
import com.twitter.search.common.database.DatabaseConfig;
import com.twitter.search.common.util.zktrylock.TryLock;
import com.twitter.search.common.util.zktrylock.ZooKeeperTryLockFactory;
import com.twitter.search.earlybird.archive.DailyStatusBatches;
import com.twitter.search.earlybird.common.config.EarlybirdProperty;
import com.twitter.search.earlybird.util.ScrubGenUtil;
import com.twitter.search.earlybird.partition.HdfsUtil;
import com.twitter.search.earlybird.partition.SegmentSyncConfig;
import com.twitter.util.Duration;
/**
 * Coordinates between segment builders for the scrubbing pipeline.
 * While the segment builders are running, all of them will try to find an HDFS file indicating
 * that the data is ready. If the file does not exist, only one of them will go through the
 * files and see if the scrubbing pipeline has generated all data for this scrub gen.
 *
 * If the instance that got the lock found all the data, it still exits, because otherwise we
 * would have a single segment builder instance trying to build all the segments, which is not
 * what we want. But now that the file exists, the next time all segment builder instances are
 * scheduled, they will all find it, and they will start building segments.
 */
class SegmentBuilderCoordinator {
private static final Logger LOG = LoggerFactory.getLogger(SegmentBuilderCoordinator.class);
private static final Amount<Long, Time> ZK_LOCK_EXPIRATION_MIN = Amount.of(5L, Time.MINUTES);
private static final String SEGMENT_BUILDER_SYNC_NODE = "scrub_gen_data_sync";
private static final String SEGMENT_BUILDER_SYNC_ZK_PATH =
EarlybirdProperty.ZK_APP_ROOT.get() + "/segment_builder_sync";
private static final String DATA_FULLY_BUILT_FILE = "_data_fully_built";
static final int FIRST_INSTANCE = 0;
private static final long NON_FIRST_INSTANCE_SLEEP_BEFORE_RETRY_DURATION_MS =
Duration.fromHours(1).inMillis();
private final ZooKeeperTryLockFactory zkTryLockFactory;
private final SegmentSyncConfig syncConfig;
private final Optional<Date> scrubGenDayOpt;
private final Optional<String> scrubGenOpt;
private final Clock clock;
SegmentBuilderCoordinator(
ZooKeeperTryLockFactory zkTryLockFactory, SegmentSyncConfig syncConfig, Clock clock) {
this.zkTryLockFactory = zkTryLockFactory;
this.syncConfig = syncConfig;
this.scrubGenOpt = syncConfig.getScrubGen();
this.scrubGenDayOpt = scrubGenOpt.map(ScrubGenUtil::parseScrubGenToDate);
this.clock = clock;
}
public boolean isScrubGenDataFullyBuilt(int instanceNumber) {
// Only segment builders that take a scrub gen should use this method to coordinate.
Preconditions.checkArgument(scrubGenDayOpt.isPresent());
final FileSystem hdfs;
try {
hdfs = HdfsUtil.getHdfsFileSystem();
} catch (IOException e) {
LOG.error("Could not create HDFS file system.", e);
return false;
}
return isScrubGenDataFullyBuilt(
instanceNumber,
scrubGenDayOpt.get(),
NON_FIRST_INSTANCE_SLEEP_BEFORE_RETRY_DURATION_MS,
hdfs
);
}
@VisibleForTesting
boolean isScrubGenDataFullyBuilt(
int instanceNumber,
Date scrubGenDay,
long nonFirstInstanceSleepBeforeRetryDuration,
FileSystem hdfs) {
// Check if the scrub gen has been fully built file exists.
if (checkHaveScrubGenDataFullyBuiltFileOnHdfs(hdfs)) {
return true;
}
// If it doesn't exist, let the first instance check whether the scrub gen has been fully
// built, and create the file if so.
if (instanceNumber == FIRST_INSTANCE) {
// We were missing some data on HDFS for this scrub gen in a previous run,
// but we might have gotten more data in the meantime, so check again.
// Only instance 0 is allowed to do this, mainly for 2 reasons:
// 1) Since instances are scheduled in batches, it's possible that an instance from a later
// batch finds the fully-built file in HDFS and starts processing. We would end up doing
// work with only a subset of the instances.
// 2) If we sleep before we release the lock, it's hard to estimate how long an instance
// will stay scheduled.
// For determinism, we simplify a bit and only allow instance 0 to check and write the
// data-fully-built file to HDFS.
try {
checkIfScrubGenDataIsFullyBuilt(hdfs, scrubGenDay);
} catch (IOException e) {
LOG.error("Failed to grab lock and check scrub gen data.", e);
}
} else {
// For all other instances, sleep for a bit to give the first instance time to check whether
// the scrub gen has been fully built and to create the file, then check again.
try {
LOG.info(
"Sleeping for {} ms before re-checking if scrub gen has been fully built file exists",
nonFirstInstanceSleepBeforeRetryDuration);
clock.waitFor(nonFirstInstanceSleepBeforeRetryDuration);
return checkHaveScrubGenDataFullyBuiltFileOnHdfs(hdfs);
} catch (InterruptedException e) {
LOG.warn("Interrupted when sleeping before re-checking if scrub gen has been fully built "
+ "file exists", e);
}
}
// If the fully-built file was not found above, we always return false in the end; the
// next run will find the file for this scrub gen and move forward.
return false;
}
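// Timeline sketch (hypothetical): on one run the data is incomplete, so no marker file is
// written and every instance returns false. On a later run, instance 0 finds the data complete,
// writes _data_fully_built, and still returns false; the other instances sleep, re-check, and
// may already see the file, and by the next scheduled run every instance finds it and starts
// building.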
private void checkIfScrubGenDataIsFullyBuilt(
FileSystem hdfs, Date scrubGenDay) throws IOException {
// Build the lock, try to acquire it, and check the data on HDFS
TryLock lock = zkTryLockFactory.createTryLock(
DatabaseConfig.getLocalHostname(),
SEGMENT_BUILDER_SYNC_ZK_PATH,
SEGMENT_BUILDER_SYNC_NODE,
ZK_LOCK_EXPIRATION_MIN);
Preconditions.checkState(scrubGenOpt.isPresent());
String scrubGen = scrubGenOpt.get();
lock.tryWithLock(() -> {
LOG.info(String.format(
"Obtained ZK lock to check if data for scrub gen %s is ready.", scrubGen));
final DailyStatusBatches directory =
new DailyStatusBatches(zkTryLockFactory, scrubGenDay);
if (directory.isScrubGenDataFullyBuilt(hdfs)
&& createScrubGenDataFullyBuiltFileOnHdfs(hdfs)) {
LOG.info(String.format("All data for scrub gen %s is ready.", scrubGen));
} else {
LOG.info(String.format("Data for scrub gen %s is not ready yet.", scrubGen));
}
});
}
private boolean createScrubGenDataFullyBuiltFileOnHdfs(FileSystem fs) {
Path path = getScrubGenDataFullyBuiltFilePath();
try {
fs.mkdirs(new Path(statusReadyHDFSPath()));
if (fs.createNewFile(path)) {
LOG.info("Successfully created file " + path + " on HDFS.");
return true;
} else {
LOG.warn("Failed to create file " + path + " on HDFS.");
}
} catch (IOException e) {
LOG.error("Failed to create file on HDFS " + path.toString(), e);
}
return false;
}
private boolean checkHaveScrubGenDataFullyBuiltFileOnHdfs(FileSystem fs) {
Path path = getScrubGenDataFullyBuiltFilePath();
try {
boolean ret = fs.exists(path);
LOG.info("Checking if file exists showing scrubgen is fully built.");
LOG.info("Path checked: {}, Exist check: {}", path, ret);
return ret;
} catch (IOException e) {
LOG.error("Failed to check file on HDFS " + path.toString(), e);
return false;
}
}
@VisibleForTesting
Path getScrubGenDataFullyBuiltFilePath() {
return new Path(statusReadyHDFSPath(), DATA_FULLY_BUILT_FILE);
}
@VisibleForTesting
String statusReadyHDFSPath() {
return syncConfig.getHdfsSegmentSyncRootDir() + "/segment_builder_sync";
}
}

View File

@ -1,10 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;
public final class SegmentBuilderMain {
private SegmentBuilderMain() { }
public static void main(String[] args) {
new SegmentBuilderApp().main(args);
}
}

View File

@ -1,58 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;
import java.io.File;
import com.google.inject.Provides;
import com.google.inject.Singleton;
import com.twitter.app.Flaggable;
import com.twitter.decider.Decider;
import com.twitter.inject.TwitterModule;
import com.twitter.inject.annotations.Flag;
import com.twitter.search.common.config.LoggerConfiguration;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
import com.twitter.search.earlybird.util.EarlybirdDecider;
public class SegmentBuilderModule extends TwitterModule {
private static final String CONFIG_FILE_FLAG_NAME = "config_file";
private static final String SEGMENT_LOG_DIR_FLAG_NAME = "segment_log_dir";
public SegmentBuilderModule() {
createFlag(CONFIG_FILE_FLAG_NAME,
new File("earlybird-search.yml"),
"specify config file",
Flaggable.ofFile());
createFlag(SEGMENT_LOG_DIR_FLAG_NAME,
"",
"override log dir from config file",
Flaggable.ofString());
}
/**
* Initializes the Earlybird config and the log configuration, and returns an EarlybirdDecider
* object, which will be injected into the SegmentBuilder instance.
*
* @param configFile The config file to use to initialize EarlybirdConfig
* @param segmentLogDir If not empty, used to override the log directory from the config file
* @return An initialized EarlybirdDecider
*/
@Provides
@Singleton
public Decider provideDecider(@Flag(CONFIG_FILE_FLAG_NAME) File configFile,
@Flag(SEGMENT_LOG_DIR_FLAG_NAME) String segmentLogDir) {
// By default Guice will build singletons eagerly:
// https://github.com/google/guice/wiki/Scopes#eager-singletons
// So in order to ensure that the EarlybirdConfig and LoggerConfiguration initializations occur
// before the EarlybirdDecider initialization, we place them here.
EarlybirdConfig.init(configFile.getName());
if (!segmentLogDir.isEmpty()) {
EarlybirdConfig.overrideLogDir(segmentLogDir);
}
new LoggerConfiguration(EarlybirdConfig.getLogPropertiesFile(), EarlybirdConfig.getLogDir())
.configure();
return EarlybirdDecider.initialize();
}
}

View File

@ -1,100 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;
import java.io.IOException;
import com.google.common.base.Preconditions;
import com.twitter.common.quantity.Amount;
import com.twitter.common.quantity.Time;
import com.twitter.search.common.database.DatabaseConfig;
import com.twitter.search.common.util.zktrylock.TryLock;
import com.twitter.search.common.util.zktrylock.ZooKeeperTryLockFactory;
import com.twitter.search.earlybird.archive.ArchiveSegment;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
import com.twitter.search.earlybird.index.EarlybirdSegmentFactory;
import com.twitter.search.earlybird.partition.SegmentInfo;
import com.twitter.search.earlybird.partition.SegmentSyncConfig;
public abstract class SegmentBuilderSegment {
protected final SegmentInfo segmentInfo;
protected final SegmentConfig segmentConfig;
protected final EarlybirdSegmentFactory earlybirdSegmentFactory;
protected final int alreadyRetriedCount;
protected final SegmentSyncConfig sync;
public SegmentBuilderSegment(SegmentInfo segmentInfo,
SegmentConfig segmentConfig,
EarlybirdSegmentFactory earlybirdSegmentFactory,
int alreadyRetriedCount,
SegmentSyncConfig segmentSyncConfig) {
this.segmentConfig = segmentConfig;
this.earlybirdSegmentFactory = earlybirdSegmentFactory;
this.alreadyRetriedCount = alreadyRetriedCount;
this.sync = segmentSyncConfig;
Preconditions.checkState(segmentInfo.getSegment() instanceof ArchiveSegment);
this.segmentInfo = Preconditions.checkNotNull(segmentInfo);
}
public SegmentInfo getSegmentInfo() {
return segmentInfo;
}
public String getSegmentName() {
return segmentInfo.getSegmentName();
}
public int getAlreadyRetriedCount() {
return alreadyRetriedCount;
}
/**
* Handle the segment, potentially transitioning to a new state.
* @return The state after handling.
*/
public abstract SegmentBuilderSegment handle()
throws SegmentInfoConstructionException, SegmentUpdaterException;
public boolean isBuilt() {
return false;
}
@Override
public String toString() {
return "SegmentBuilderSegment{"
+ "segmentInfo=" + segmentInfo
+ ", state=" + this.getClass().getSimpleName()
+ ", alreadyRetriedCount=" + alreadyRetriedCount + '}';
}
/**
* Given a SegmentInfo, create a new one with the same time slice and partitionID but clean
* internal state.
*/
protected SegmentInfo createNewSegmentInfo(SegmentInfo oldSegmentInfo)
throws SegmentInfoConstructionException {
Preconditions.checkArgument(oldSegmentInfo.getSegment() instanceof ArchiveSegment);
ArchiveSegment archiveSegment = (ArchiveSegment) oldSegmentInfo.getSegment();
try {
ArchiveSegment segment = new ArchiveSegment(archiveSegment.getArchiveTimeSlice(),
archiveSegment.getHashPartitionID(), EarlybirdConfig.getMaxSegmentSize());
return new SegmentInfo(segment, earlybirdSegmentFactory, sync);
} catch (IOException e) {
throw new SegmentInfoConstructionException("Error creating new segments", e);
}
}
protected TryLock getZooKeeperTryLock() {
ZooKeeperTryLockFactory tryLockFactory = segmentConfig.getTryLockFactory();
String zkRootPath = sync.getZooKeeperSyncFullPath();
String nodeName = segmentInfo.getZkNodeName();
Amount<Long, Time> expirationTime = segmentConfig.getSegmentZKLockExpirationTime();
return tryLockFactory.createTryLock(
DatabaseConfig.getLocalHostname(),
zkRootPath,
nodeName,
expirationTime);
}
}

View File

@ -1,41 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;
import com.twitter.common.quantity.Amount;
import com.twitter.common.quantity.Time;
import com.twitter.search.common.util.zktrylock.ZooKeeperTryLockFactory;
import com.twitter.search.earlybird.archive.ArchiveOnDiskEarlybirdIndexConfig;
public class SegmentConfig {
private final ArchiveOnDiskEarlybirdIndexConfig earlybirdIndexConfig;
private final Amount<Long, Time> segmentZKLockExpirationTime;
private final int maxRetriesOnFailure;
private final ZooKeeperTryLockFactory tryLockFactory;
public SegmentConfig(
ArchiveOnDiskEarlybirdIndexConfig earlybirdIndexConfig,
Amount<Long, Time> segmentZKLockExpirationTime,
int maxRetriesOnFailure,
ZooKeeperTryLockFactory tryLockFactory) {
this.earlybirdIndexConfig = earlybirdIndexConfig;
this.segmentZKLockExpirationTime = segmentZKLockExpirationTime;
this.maxRetriesOnFailure = maxRetriesOnFailure;
this.tryLockFactory = tryLockFactory;
}
public ArchiveOnDiskEarlybirdIndexConfig getEarlybirdIndexConfig() {
return earlybirdIndexConfig;
}
public Amount<Long, Time> getSegmentZKLockExpirationTime() {
return segmentZKLockExpirationTime;
}
public int getMaxRetriesOnFailure() {
return maxRetriesOnFailure;
}
public ZooKeeperTryLockFactory getTryLockFactory() {
return tryLockFactory;
}
}

View File

@ -1,12 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;
import java.io.IOException;
/**
 * Used when exceptions are thrown while creating a new SegmentInfo during the indexing loop.
 */
class SegmentInfoConstructionException extends Exception {
SegmentInfoConstructionException(String msg, IOException e) {
super(msg, e);
}
}

View File

@ -1,13 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;
import com.google.common.annotations.VisibleForTesting;
/**
 * Used when the SegmentUpdater fails while processing segments.
 */
@VisibleForTesting
class SegmentUpdaterException extends Exception {
SegmentUpdaterException(String msg) {
super(msg);
}
}

View File

@ -1,69 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;
import java.util.concurrent.atomic.AtomicBoolean;
import com.google.common.annotations.VisibleForTesting;
import com.twitter.common.base.Command;
import com.twitter.search.common.util.zktrylock.TryLock;
import com.twitter.search.earlybird.archive.ArchiveHDFSUtils;
import com.twitter.search.earlybird.index.EarlybirdSegmentFactory;
import com.twitter.search.earlybird.partition.SegmentInfo;
import com.twitter.search.earlybird.partition.SegmentSyncConfig;
public class SomeoneElseIsBuildingSegment extends SegmentBuilderSegment {
public SomeoneElseIsBuildingSegment(
SegmentInfo segmentInfo,
SegmentConfig segmentConfig,
EarlybirdSegmentFactory earlybirdSegmentFactory,
int alreadyRetriedCount,
SegmentSyncConfig sync) {
super(segmentInfo, segmentConfig, earlybirdSegmentFactory, alreadyRetriedCount, sync);
}
/**
 * This method refreshes the local state of a segment.
 * 1. Try to grab the ZK lock.
 * 2a. If we get the lock, the segment is not being built; mark the segment as NOT_YET_BUILT.
 * 2b. Otherwise, the segment is being built; keep the SOMEONE_ELSE_IS_BUILDING state.
 */
@Override
public SegmentBuilderSegment handle()
throws SegmentInfoConstructionException, SegmentUpdaterException {
TryLock lock = getZooKeeperTryLock();
final AtomicBoolean alreadyBuilt = new AtomicBoolean(false);
boolean gotLock = lock.tryWithLock((Command) () -> {
// The segment might have already been built by another instance.
if (segmentExistsOnHdfs()) {
alreadyBuilt.set(true);
}
});
if (!gotLock) {
return this;
}
if (alreadyBuilt.get()) {
return new BuiltAndFinalizedSegment(
segmentInfo, segmentConfig, earlybirdSegmentFactory, 0, sync);
} else {
// When a segment fails to build, its state might not be clean, so it is necessary to
// create a new SegmentInfo with a clean state.
SegmentInfo newSegmentInfo = createNewSegmentInfo(segmentInfo);
return new NotYetBuiltSegment(
newSegmentInfo,
segmentConfig,
earlybirdSegmentFactory,
alreadyRetriedCount + 1,
sync);
}
}
@VisibleForTesting
boolean segmentExistsOnHdfs() {
return ArchiveHDFSUtils.hasSegmentIndicesOnHDFS(sync, segmentInfo);
}
}

View File

@ -1,37 +0,0 @@
java_library(
sources = ["*.java"],
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
"3rdparty/jvm/com/google/guava",
"3rdparty/jvm/com/twitter/elephantbird:core",
"3rdparty/jvm/commons-codec",
"3rdparty/jvm/commons-httpclient",
"3rdparty/jvm/geo/google:geoGoogle",
"3rdparty/jvm/org/apache/lucene:lucene-core",
"3rdparty/jvm/org/apache/thrift:libthrift",
"3rdparty/jvm/org/apache/zookeeper:zookeeper-client",
"decider/src/main/scala",
"finagle/finagle-core/src/main",
"finagle/finagle-thrift/src/main/java",
"finagle/finagle-thrift/src/main/scala",
"scrooge/scrooge-core/src/main/scala",
"src/java/com/twitter/common/base",
"src/java/com/twitter/common/optional",
"src/java/com/twitter/search/common/decider",
"src/java/com/twitter/search/common/logging",
"src/java/com/twitter/search/common/metrics",
"src/java/com/twitter/search/common/util:finagleutil",
"src/java/com/twitter/search/common/util/earlybird",
"src/java/com/twitter/search/common/util/thrift:thrift-utils",
"src/java/com/twitter/search/queryparser/query:core-query-nodes",
"src/thrift/com/twitter/context:twitter-context-scala",
"src/thrift/com/twitter/search:earlybird-java",
"src/thrift/com/twitter/search/common:caching-java",
"src/thrift/com/twitter/search/common:constants-java",
"src/thrift/com/twitter/search/common:query-java",
"strato/src/main/scala/com/twitter/strato/opcontext",
"twitter-context/src/main/scala",
"util/util-core:scala",
],
)

View File

@ -1,120 +0,0 @@
package com.twitter.search.earlybird.common;
import org.apache.commons.codec.binary.Base64;
import org.apache.thrift.TException;
import org.apache.thrift.TSerializer;
import org.apache.thrift.protocol.TBinaryProtocol;
import org.slf4j.Logger;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;
import com.twitter.search.earlybird.thrift.EarlybirdResponse;
public final class Base64RequestResponseForLogging {
private static final Logger GENERAL_LOG = org.slf4j.LoggerFactory.getLogger(
Base64RequestResponseForLogging.class);
private static final Logger FAILED_REQUEST_LOG = org.slf4j.LoggerFactory.getLogger(
Base64RequestResponseForLogging.class.getName() + ".FailedRequests");
private static final Logger RANDOM_REQUEST_LOG = org.slf4j.LoggerFactory.getLogger(
Base64RequestResponseForLogging.class.getName() + ".RandomRequests");
private static final Logger SLOW_REQUEST_LOG = org.slf4j.LoggerFactory.getLogger(
Base64RequestResponseForLogging.class.getName() + ".SlowRequests");
private enum LogType {
FAILED,
RANDOM,
SLOW,
};
private final LogType logtype;
private final String logLine;
private final EarlybirdRequest request;
private final EarlybirdResponse response;
private final Base64 base64 = new Base64();
// TSerializer is not threadsafe, so create a new one for each request
private final TSerializer serializer = new TSerializer(new TBinaryProtocol.Factory());
private Base64RequestResponseForLogging(
LogType logType, String logLine, EarlybirdRequest request, EarlybirdResponse response) {
this.logtype = logType;
this.logLine = logLine;
this.request = request;
this.response = response;
}
public static Base64RequestResponseForLogging randomRequest(
String logLine, EarlybirdRequest request, EarlybirdResponse response) {
return new Base64RequestResponseForLogging(LogType.RANDOM, logLine, request, response);
}
public static Base64RequestResponseForLogging failedRequest(
String logLine, EarlybirdRequest request, EarlybirdResponse response) {
return new Base64RequestResponseForLogging(LogType.FAILED, logLine, request, response);
}
public static Base64RequestResponseForLogging slowRequest(
String logLine, EarlybirdRequest request, EarlybirdResponse response) {
return new Base64RequestResponseForLogging(LogType.SLOW, logLine, request, response);
}
private String asBase64(EarlybirdRequest clearedRequest) {
try {
// The purpose of this log is to make it easy to re-issue requests in formz to reproduce
// issues. If queries are re-issued as-is, they will be treated as late-arriving queries and
// dropped, because clientRequestTimeMs is set to the original query time. For ease of use,
// we clear clientRequestTimeMs and log it separately for the rare case it is needed.
clearedRequest.unsetClientRequestTimeMs();
return base64.encodeToString(serializer.serialize(clearedRequest));
} catch (TException e) {
GENERAL_LOG.error("Failed to serialize request for logging.", e);
return "failed_to_serialize";
}
}
private String asBase64(EarlybirdResponse earlybirdResponse) {
try {
return base64.encodeToString(serializer.serialize(earlybirdResponse));
} catch (TException e) {
GENERAL_LOG.error("Failed to serialize response for logging.", e);
return "failed_to_serialize";
}
}
private String getFormattedMessage() {
String base64Request = asBase64(
EarlybirdRequestUtil.copyAndClearUnnecessaryValuesForLogging(request));
String base64Response = asBase64(response);
return logLine + ", clientRequestTimeMs: " + request.getClientRequestTimeMs()
+ ", " + base64Request + ", " + base64Response;
}
/**
 * Logs the Base64-encoded request and response to the failed, random, or slow request log,
 * depending on this instance's log type.
 */
public void log() {
// Do the serializing/concatenating this way so it happens on the background thread for
// async logging.
Object logObject = new Object() {
@Override
public String toString() {
return getFormattedMessage();
}
};
switch (logtype) {
case FAILED:
FAILED_REQUEST_LOG.info("{}", logObject);
break;
case RANDOM:
RANDOM_REQUEST_LOG.info("{}", logObject);
break;
case SLOW:
SLOW_REQUEST_LOG.info("{}", logObject);
break;
default:
// Not logging anything for other log types.
break;
}
}
}
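A minimal usage sketch, assuming a log line and a request/response pair are already available from the request logger (logLine, request, and response are hypothetical variables):

// Serialization happens inside toString() on the async logging thread,
// not on the searcher thread.
Base64RequestResponseForLogging.slowRequest(logLine, request, response).log();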

View File

@ -1,55 +0,0 @@
package com.twitter.search.earlybird.common;
import java.util.concurrent.atomic.AtomicBoolean;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.search.common.metrics.SearchCustomGauge;
/**
 * A monitor that tracks whether a single thread's work is caught up, and allows other
 * threads to wait to be notified when the work is complete. An AtomicBoolean ensures the
 * current status is visible to all threads.
 */
public class CaughtUpMonitor {
private static final Logger LOG = LoggerFactory.getLogger(CaughtUpMonitor.class);
protected final AtomicBoolean isCaughtUp = new AtomicBoolean(false);
public CaughtUpMonitor(String statPrefix) {
SearchCustomGauge.export(statPrefix + "_is_caught_up", () -> isCaughtUp() ? 1 : 0);
}
public boolean isCaughtUp() {
return isCaughtUp.get();
}
/**
* Set caught up state, and notify waiting threads if caught up.
*/
public synchronized void setAndNotify(boolean caughtUp) {
isCaughtUp.set(caughtUp);
if (caughtUp) {
// Readers are caught up, notify waiting threads
notifyAll();
}
}
/**
* Wait using Object.wait() until caught up or until thread is interrupted.
*/
public synchronized void resetAndWaitUntilCaughtUp() {
LOG.info("Waiting to catch up.");
// Explicitly set isCaughtUp to false before waiting
isCaughtUp.set(false);
try {
while (!isCaughtUp()) {
wait();
}
} catch (InterruptedException e) {
LOG.error("{} was interrupted while waiting to catch up", Thread.currentThread());
// Restore the interrupt status so callers can observe the interruption.
Thread.currentThread().interrupt();
}
LOG.info("Caught up.");
}
}
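A sketch of the intended handoff between one indexing thread and any number of waiting threads (the thread bodies here are hypothetical):

CaughtUpMonitor monitor = new CaughtUpMonitor("indexing");

// Waiting thread: clears the caught-up flag and blocks until notified.
new Thread(monitor::resetAndWaitUntilCaughtUp).start();

// Indexing thread: after draining its backlog, publishes the state and
// wakes up all waiting threads.
monitor.setAndNotify(true);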

View File

@ -1,85 +0,0 @@
package com.twitter.search.earlybird.common;
import java.util.Optional;
import com.twitter.common.optional.Optionals;
import com.twitter.search.common.util.FinagleUtil;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;
import com.twitter.strato.opcontext.Attribution;
import com.twitter.strato.opcontext.HttpEndpoint;
public final class ClientIdUtil {
// Blenders should always set the EarlybirdRequest.clientId field. It should be set to the Finagle
// client ID of the client that caused the blender to send this request to the roots. If the
// Finagle ID of the blender's client cannot be determined, it will be set to "unknown" (see
// com.twitter.search.common.util.FinagleUtil.UNKNOWN_CLIENT_NAME). However, other services that
// send requests to roots might not set EarlybirdRequest.clientId.
//
// So an "unset" clientId means: EarlybirdRequest.clientId was null.
// An "unknown" clientId means: the client that sent us the request
// tried setting EarlybirdRequest.clientId, but couldn't figure out a good value for it.
public static final String UNSET_CLIENT_ID = "unset";
private static final String CLIENT_ID_FOR_UNKNOWN_CLIENTS = "unknown_client_id";
private static final String CLIENT_ID_PREFIX = "client_id_";
private static final String FINAGLE_CLIENT_ID_AND_CLIENT_ID_PATTERN =
"finagle_id_%s_and_client_id_%s";
private static final String CLIENT_ID_AND_REQUEST_TYPE = "client_id_%s_and_type_%s";
private ClientIdUtil() {
}
/** Returns the ID of the client that initiated this request or UNSET_CLIENT_ID if not set. */
public static String getClientIdFromRequest(EarlybirdRequest request) {
return Optional
.ofNullable(request.getClientId())
.map(String::toLowerCase)
.orElse(UNSET_CLIENT_ID);
}
/**
* Returns the Strato http endpoint attribution as an Optional.
*/
public static Optional<String> getClientIdFromHttpEndpointAttribution() {
return Optionals
.optional(Attribution.httpEndpoint())
.map(HttpEndpoint::name)
.map(String::toLowerCase);
}
/** Formats the given clientId into a string that can be used for stats. */
public static String formatClientId(String clientId) {
return CLIENT_ID_PREFIX + clientId;
}
/**
* Formats the given Finagle clientId and the given clientId into a single string that can be used
* for stats, or other purposes where the two IDs need to be combined.
*/
public static String formatFinagleClientIdAndClientId(String finagleClientId, String clientId) {
return String.format(FINAGLE_CLIENT_ID_AND_CLIENT_ID_PATTERN, finagleClientId, clientId);
}
/**
* Formats the given clientId and requestType into a single string that can be used
* for stats or other purposes.
*/
public static String formatClientIdAndRequestType(
String clientId, String requestType) {
return String.format(CLIENT_ID_AND_REQUEST_TYPE, clientId, requestType);
}
/**
 * Formats the given clientId for quota tracking, mapping unknown and unset client IDs
 * to a single shared bucket.
 */
public static String getQuotaClientId(String clientId) {
if (FinagleUtil.UNKNOWN_CLIENT_NAME.equals(clientId) || UNSET_CLIENT_ID.equals(clientId)) {
return CLIENT_ID_FOR_UNKNOWN_CLIENTS;
}
return clientId;
}
}
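For illustration, a hedged sketch of how these helpers compose into stat names (the request variable is hypothetical):

String clientId = ClientIdUtil.getClientIdFromRequest(request); // e.g. "unset"
String statName = ClientIdUtil.formatClientId(clientId);        // "client_id_unset"
String combined = ClientIdUtil.formatFinagleClientIdAndClientId("blender", clientId);
// combined is "finagle_id_blender_and_client_id_unset"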

View File

@ -1,365 +0,0 @@
package com.twitter.search.earlybird.common;
import java.util.EnumMap;
import java.util.Map;
import scala.Option;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Maps;
import com.twitter.context.TwitterContext;
import com.twitter.context.thriftscala.Viewer;
import com.twitter.decider.Decider;
import com.twitter.finagle.thrift.ClientId;
import com.twitter.finagle.thrift.ClientId$;
import com.twitter.search.TwitterContextPermit;
import com.twitter.search.common.constants.thriftjava.ThriftQuerySource;
import com.twitter.search.common.decider.DeciderUtil;
import com.twitter.search.common.logging.RPCLogger;
import com.twitter.search.common.metrics.FailureRatioCounter;
import com.twitter.search.common.metrics.Timer;
import com.twitter.search.common.util.earlybird.TermStatisticsUtil;
import com.twitter.search.common.util.earlybird.ThriftSearchResultUtil;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;
import com.twitter.search.earlybird.thrift.EarlybirdResponse;
import com.twitter.search.earlybird.thrift.ThriftFacetFieldRequest;
import com.twitter.search.earlybird.thrift.ThriftHistogramSettings;
import com.twitter.search.earlybird.thrift.ThriftSearchQuery;
import com.twitter.search.earlybird.thrift.ThriftTermStatisticsRequest;
import static com.twitter.search.common.util.earlybird.EarlybirdResponseUtil
.responseConsideredFailed;
public class EarlybirdRequestLogger extends RPCLogger {
protected enum ExtraFields {
QUERY_MAX_HITS_TO_PROCESS,
COLLECTOR_PARAMS_MAX_HITS_TO_PROCESS,
RELEVANCE_OPTIONS_MAX_HITS_TO_PROCESS,
NUM_HITS_PROCESSED,
QUERY_COST,
CPU_TOTAL,
QUERY_SOURCE,
CLIENT_ID,
FINAGLE_CLIENT_ID
}
protected enum ShardOnlyExtraFields {
NUM_SEARCHED_SEGMENTS,
SCORING_TIME_NANOS
}
protected enum RootOnlyExtraFields {
CACHING_ALLOWED,
DEBUG_MODE,
CACHE_HIT,
USER_AGENT,
// See JIRA APPSEC-2303 for IP address logging.
}
private static final String LOG_FULL_REQUEST_DETAILS_ON_ERROR_DECIDER_KEY =
"log_full_request_details_on_error";
private static final String LOG_FULL_REQUEST_DETAILS_RANDOM_FRACTION_DECIDER_KEY =
"log_full_request_details_random_fraction";
private static final String LOG_FULL_SLOW_REQUEST_DETAILS_RANDOM_FRACTION_DECIDER_KEY =
"log_full_slow_request_details_random_fraction";
private static final String SLOW_REQUEST_LATENCY_THRESHOLD_MS_DECIDER_KEY =
"slow_request_latency_threshold_ms";
private final Decider decider;
private final boolean enableLogUnknownClientRequests;
private static final Map<ThriftQuerySource, FailureRatioCounter>
FAILURE_RATIO_COUNTER_BY_QUERY_SOURCE = preBuildFailureRatioCounters();
private static final FailureRatioCounter NO_QUERY_SOURCE_FAILURE_RATIO_COUNTER =
new FailureRatioCounter("earlybird_logger", "query_source", "not_set");
static EarlybirdRequestLogger buildForRoot(
String loggerName, int latencyWarnThreshold, Decider decider) {
return new EarlybirdRequestLogger(loggerName, latencyWarnThreshold,
decider, true, RPCLogger.Fields.values(), ExtraFields.values(),
RootOnlyExtraFields.values());
}
static EarlybirdRequestLogger buildForShard(
String loggerName, int latencyWarnThreshold, Decider decider) {
return new EarlybirdRequestLogger(loggerName, latencyWarnThreshold,
decider, false, RPCLogger.Fields.values(), ExtraFields.values(),
ShardOnlyExtraFields.values());
}
@VisibleForTesting
EarlybirdRequestLogger(String loggerName, int latencyWarnThreshold, Decider decider) {
this(loggerName, latencyWarnThreshold, decider, false, RPCLogger.Fields.values(),
ExtraFields.values(), RootOnlyExtraFields.values(), ShardOnlyExtraFields.values());
}
private EarlybirdRequestLogger(String loggerName, int latencyWarnThreshold, Decider decider,
boolean enableLogUnknownClientRequests, Enum[]... fieldEnums) {
super(loggerName, fieldEnums);
this.decider = decider;
this.enableLogUnknownClientRequests = enableLogUnknownClientRequests;
setLatencyWarnThreshold(latencyWarnThreshold);
}
/**
* Logs the given earlybird request and response.
*
* @param request The earlybird request.
* @param response The earlybird response.
* @param timer The time it took to process this request.
*/
public void logRequest(EarlybirdRequest request, EarlybirdResponse response, Timer timer) {
try {
LogEntry entry = newLogEntry();
setRequestLogEntries(entry, request);
setResponseLogEntries(entry, response);
if (timer != null) {
entry.setField(ExtraFields.CPU_TOTAL, Long.toString(timer.getElapsedCpuTotal()));
}
boolean wasError = response != null && responseConsideredFailed(response.getResponseCode());
long responseTime = response != null ? response.getResponseTime() : 0L;
String logLine = writeLogLine(entry, responseTime, wasError);
// This code path is called for both pre- and post-logging. To prevent the same request
// from showing up twice, random full-request logging only happens on post-logging
// (i.e. when response != null).
if (response != null && DeciderUtil.isAvailableForRandomRecipient(
decider, LOG_FULL_REQUEST_DETAILS_RANDOM_FRACTION_DECIDER_KEY)) {
Base64RequestResponseForLogging.randomRequest(logLine, request, response).log();
}
// Unknown client request logging only applies to pre-logging.
if (enableLogUnknownClientRequests && response == null) {
UnknownClientRequestForLogging unknownClientRequestLogger =
UnknownClientRequestForLogging.unknownClientRequest(logLine, request);
if (unknownClientRequestLogger != null) {
unknownClientRequestLogger.log();
}
}
if (wasError
&& DeciderUtil.isAvailableForRandomRecipient(
decider, LOG_FULL_REQUEST_DETAILS_ON_ERROR_DECIDER_KEY)) {
new RequestResponseForLogging(request, response).logFailedRequest();
Base64RequestResponseForLogging.failedRequest(logLine, request, response).log();
}
boolean wasSlow = response != null
&& responseTime >= DeciderUtil.getAvailability(
decider, SLOW_REQUEST_LATENCY_THRESHOLD_MS_DECIDER_KEY);
if (wasSlow
&& DeciderUtil.isAvailableForRandomRecipient(
decider, LOG_FULL_SLOW_REQUEST_DETAILS_RANDOM_FRACTION_DECIDER_KEY)) {
Base64RequestResponseForLogging.slowRequest(logLine, request, response).log();
}
FailureRatioCounter failureRatioCounter =
FAILURE_RATIO_COUNTER_BY_QUERY_SOURCE.get(request.getQuerySource());
if (failureRatioCounter != null) {
failureRatioCounter.requestFinished(!wasError);
} else {
NO_QUERY_SOURCE_FAILURE_RATIO_COUNTER.requestFinished(!wasError);
}
} catch (Exception e) {
LOG.error("Exception building log entry ", e);
}
}
private void setRequestLogEntries(LogEntry entry, EarlybirdRequest request) {
entry.setField(Fields.CLIENT_HOST, request.getClientHost());
entry.setField(Fields.CLIENT_REQUEST_ID, request.getClientRequestID());
entry.setField(Fields.REQUEST_TYPE, requestTypeForLog(request));
if (request.isSetSearchQuery()) {
ThriftSearchQuery searchQuery = request.getSearchQuery();
entry.setField(Fields.QUERY, searchQuery.getSerializedQuery());
if (searchQuery.isSetMaxHitsToProcess()) {
entry.setField(ExtraFields.QUERY_MAX_HITS_TO_PROCESS,
Integer.toString(searchQuery.getMaxHitsToProcess()));
}
if (searchQuery.isSetCollectorParams()
&& searchQuery.getCollectorParams().isSetTerminationParams()
&& searchQuery.getCollectorParams().getTerminationParams().isSetMaxHitsToProcess()) {
entry.setField(ExtraFields.COLLECTOR_PARAMS_MAX_HITS_TO_PROCESS,
Integer.toString(searchQuery.getCollectorParams().getTerminationParams()
.getMaxHitsToProcess()));
}
if (searchQuery.isSetRelevanceOptions()
&& searchQuery.getRelevanceOptions().isSetMaxHitsToProcess()) {
entry.setField(ExtraFields.RELEVANCE_OPTIONS_MAX_HITS_TO_PROCESS,
Integer.toString(searchQuery.getRelevanceOptions().getMaxHitsToProcess()));
}
}
entry.setField(Fields.NUM_REQUESTED, Integer.toString(numRequestedForLog(request)));
if (request.isSetQuerySource()) {
entry.setField(ExtraFields.QUERY_SOURCE, request.getQuerySource().name());
}
if (request.isSetClientId()) {
entry.setField(ExtraFields.CLIENT_ID, request.getClientId());
}
entry.setField(RootOnlyExtraFields.CACHING_ALLOWED,
Boolean.toString(EarlybirdRequestUtil.isCachingAllowed(request)));
entry.setField(RootOnlyExtraFields.DEBUG_MODE, Byte.toString(request.getDebugMode()));
Option<ClientId> clientIdOption = ClientId$.MODULE$.current();
if (clientIdOption.isDefined()) {
entry.setField(ExtraFields.FINAGLE_CLIENT_ID, clientIdOption.get().name());
}
setLogEntriesFromTwitterContext(entry);
}
@VisibleForTesting
Option<Viewer> getTwitterContext() {
return TwitterContext.acquire(TwitterContextPermit.get()).apply();
}
private void setLogEntriesFromTwitterContext(LogEntry entry) {
Option<Viewer> viewerOption = getTwitterContext();
if (viewerOption.nonEmpty()) {
Viewer viewer = viewerOption.get();
if (viewer.userAgent().nonEmpty()) {
String userAgent = viewer.userAgent().get();
// We only replace commas in the user-agent with %2C to keep the log line easily
// parseable, especially with command-line tools like cut/sed/awk.
userAgent = userAgent.replace(",", "%2C");
entry.setField(RootOnlyExtraFields.USER_AGENT, userAgent);
}
}
}
private void setResponseLogEntries(LogEntry entry, EarlybirdResponse response) {
if (response != null) {
entry.setField(Fields.NUM_RETURNED, Integer.toString(numResultsForLog(response)));
entry.setField(Fields.RESPONSE_CODE, String.valueOf(response.getResponseCode()));
entry.setField(Fields.RESPONSE_TIME_MICROS, Long.toString(response.getResponseTimeMicros()));
if (response.isSetSearchResults()) {
entry.setField(ExtraFields.NUM_HITS_PROCESSED,
Integer.toString(response.getSearchResults().getNumHitsProcessed()));
entry.setField(ExtraFields.QUERY_COST,
Double.toString(response.getSearchResults().getQueryCost()));
if (response.getSearchResults().isSetScoringTimeNanos()) {
entry.setField(ShardOnlyExtraFields.SCORING_TIME_NANOS,
Long.toString(response.getSearchResults().getScoringTimeNanos()));
}
}
if (response.isSetCacheHit()) {
entry.setField(RootOnlyExtraFields.CACHE_HIT, String.valueOf(response.isCacheHit()));
}
if (response.isSetNumSearchedSegments()) {
entry.setField(ShardOnlyExtraFields.NUM_SEARCHED_SEGMENTS,
Integer.toString(response.getNumSearchedSegments()));
}
}
}
private static int numRequestedForLog(EarlybirdRequest request) {
int num = 0;
if (request.isSetFacetRequest() && request.getFacetRequest().isSetFacetFields()) {
for (ThriftFacetFieldRequest field : request.getFacetRequest().getFacetFields()) {
num += field.getNumResults();
}
} else if (request.isSetTermStatisticsRequest()) {
num = request.getTermStatisticsRequest().getTermRequestsSize();
} else if (request.isSetSearchQuery()) {
num = request.getSearchQuery().isSetCollectorParams()
? request.getSearchQuery().getCollectorParams().getNumResultsToReturn() : 0;
if (request.getSearchQuery().getSearchStatusIdsSize() > 0) {
num = Math.max(num, request.getSearchQuery().getSearchStatusIdsSize());
}
}
return num;
}
/**
* Returns the number of results in the given response. If the response is a term stats response,
* then the returned value will be the number of term results. If the response is a facet
* response, then the returned value will be the number of facet results. Otherwise, the returned
* value will be the number of search results.
*/
public static int numResultsForLog(EarlybirdResponse response) {
if (response == null) {
return 0;
} else if (response.isSetFacetResults()) {
return ThriftSearchResultUtil.numFacetResults(response.getFacetResults());
} else if (response.isSetTermStatisticsResults()) {
return response.getTermStatisticsResults().getTermResultsSize();
} else {
return ThriftSearchResultUtil.numResults(response.getSearchResults());
}
}
private static String requestTypeForLog(EarlybirdRequest request) {
StringBuilder requestType = new StringBuilder(64);
if (request.isSetFacetRequest()) {
requestType.append("FACETS");
int numFields = request.getFacetRequest().getFacetFieldsSize();
if (numFields > 0) {
// For 1 or 2 fields, just put them in the request type. For more, just log the number.
if (numFields <= 2) {
for (ThriftFacetFieldRequest field : request.getFacetRequest().getFacetFields()) {
requestType.append(":").append(field.getFieldName().toUpperCase());
}
} else {
requestType.append(":MULTI-").append(numFields);
}
}
} else if (request.isSetTermStatisticsRequest()) {
ThriftTermStatisticsRequest termStatsRequest = request.getTermStatisticsRequest();
requestType.append("TERMSTATS-")
.append(termStatsRequest.getTermRequestsSize());
ThriftHistogramSettings histoSettings = termStatsRequest.getHistogramSettings();
if (histoSettings != null) {
String binSizeVal = String.valueOf(TermStatisticsUtil.determineBinSize(histoSettings));
String numBinsVal = String.valueOf(histoSettings.getNumBins());
requestType.append(":NUMBINS-").append(numBinsVal).append(":BINSIZE-").append(binSizeVal);
}
} else if (request.isSetSearchQuery()) {
requestType.append("SEARCH:");
requestType.append(request.getSearchQuery().getRankingMode().name());
// Denote when a from user id is present.
if (request.getSearchQuery().isSetFromUserIDFilter64()) {
requestType.append(":NETWORK-")
.append(request.getSearchQuery().getFromUserIDFilter64Size());
}
// Denote when required status ids are present.
if (request.getSearchQuery().getSearchStatusIdsSize() > 0) {
requestType.append(":IDS-").append(request.getSearchQuery().getSearchStatusIdsSize());
}
}
return requestType.toString();
}
private static Map<ThriftQuerySource, FailureRatioCounter> preBuildFailureRatioCounters() {
Map<ThriftQuerySource, FailureRatioCounter> counterByQuerySource =
new EnumMap<>(ThriftQuerySource.class);
for (ThriftQuerySource thriftQuerySource : ThriftQuerySource.values()) {
FailureRatioCounter counter = new FailureRatioCounter("earlybird_logger", "query_source",
thriftQuerySource.toString());
counterByQuerySource.put(thriftQuerySource, counter);
}
return Maps.immutableEnumMap(counterByQuerySource);
}
}
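A hedged sketch of how a root service might use this logger (decider, request, response, and timer are hypothetical; buildForRoot is package-private, so the caller would live in the same package):

EarlybirdRequestLogger logger =
    EarlybirdRequestLogger.buildForRoot("EarlybirdRequestLogger", 1000, decider);

// Post-logging call: response is non-null, so the decider-gated full-request
// logging paths above can fire.
logger.logRequest(request, response, timer);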

View File

@ -1,37 +0,0 @@
package com.twitter.search.earlybird.common;
import com.twitter.decider.Decider;
import com.twitter.search.common.metrics.Timer;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;
import com.twitter.search.earlybird.thrift.EarlybirdResponse;
public final class EarlybirdRequestPostLogger {
private final EarlybirdRequestLogger logger;
public static EarlybirdRequestPostLogger buildForRoot(
int latencyWarnThreshold, Decider decider) {
EarlybirdRequestLogger requestLogger = EarlybirdRequestLogger.buildForRoot(
EarlybirdRequestPostLogger.class.getName(), latencyWarnThreshold, decider);
return new EarlybirdRequestPostLogger(requestLogger);
}
public static EarlybirdRequestPostLogger buildForShard(
int latencyWarnThreshold, Decider decider) {
EarlybirdRequestLogger requestLogger = EarlybirdRequestLogger.buildForShard(
EarlybirdRequestPostLogger.class.getName(), latencyWarnThreshold, decider);
return new EarlybirdRequestPostLogger(requestLogger);
}
private EarlybirdRequestPostLogger(EarlybirdRequestLogger logger) {
this.logger = logger;
}
public void logRequest(EarlybirdRequest request, EarlybirdResponse response, Timer timer) {
EarlybirdRequestUtil.updateHitsCounters(request);
logger.logRequest(request, response, timer);
}
}

View File

@ -1,32 +0,0 @@
package com.twitter.search.earlybird.common;
import com.twitter.decider.Decider;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;
public final class EarlybirdRequestPreLogger {
private final EarlybirdRequestLogger logger;
public static EarlybirdRequestPreLogger buildForRoot(Decider decider) {
EarlybirdRequestLogger requestLogger = EarlybirdRequestLogger.buildForRoot(
EarlybirdRequestPreLogger.class.getName(), Integer.MAX_VALUE, decider);
return new EarlybirdRequestPreLogger(requestLogger);
}
public static EarlybirdRequestPreLogger buildForShard(
int latencyWarnThreshold, Decider decider) {
EarlybirdRequestLogger requestLogger = EarlybirdRequestLogger.buildForShard(
EarlybirdRequestPreLogger.class.getName(), latencyWarnThreshold, decider);
return new EarlybirdRequestPreLogger(requestLogger);
}
private EarlybirdRequestPreLogger(EarlybirdRequestLogger logger) {
this.logger = logger;
}
public void logRequest(EarlybirdRequest request) {
logger.logRequest(request, null, null);
}
}
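Taken together with the post-logger above, a hedged sketch of the pre/post flow around request handling (decider, latencyWarnThreshold, timer, and the handle() call are hypothetical):

EarlybirdRequestPreLogger preLogger = EarlybirdRequestPreLogger.buildForRoot(decider);
EarlybirdRequestPostLogger postLogger =
    EarlybirdRequestPostLogger.buildForRoot(latencyWarnThreshold, decider);

preLogger.logRequest(request);                   // pre-logging: no response yet
EarlybirdResponse response = handle(request);    // hypothetical handler
postLogger.logRequest(request, response, timer); // post-logging with timing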

View File

@ -1,244 +0,0 @@
package com.twitter.search.earlybird.common;
import java.util.concurrent.TimeUnit;
import com.google.common.annotations.VisibleForTesting;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.metrics.SearchMovingAverage;
import com.twitter.search.common.metrics.SearchRateCounter;
import com.twitter.search.common.metrics.SearchTimerStats;
import com.twitter.search.common.query.thriftjava.CollectorParams;
import com.twitter.search.common.query.thriftjava.CollectorTerminationParams;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;
import com.twitter.search.earlybird.thrift.ThriftSearchQuery;
import com.twitter.search.earlybird.thrift.ThriftSearchRelevanceOptions;
public final class EarlybirdRequestUtil {
// This logger is setup to log to a separate set of log files (request_info) and use an
// async logger so as to not block the searcher thread. See search/earlybird/config/log4j.xml
private static final Logger LOG = LoggerFactory.getLogger(EarlybirdRequestUtil.class);
@VisibleForTesting
static final SearchMovingAverage REQUESTED_NUM_RESULTS_STAT =
SearchMovingAverage.export("requested_num_results");
@VisibleForTesting
static final SearchMovingAverage REQUESTED_MAX_HITS_TO_PROCESS_STAT =
SearchMovingAverage.export("requested_max_hits_to_process");
@VisibleForTesting
static final SearchMovingAverage REQUESTED_COLLECTOR_PARAMS_MAX_HITS_TO_PROCESS_STAT =
SearchMovingAverage.export("requested_collector_params_max_hits_to_process");
@VisibleForTesting
static final SearchMovingAverage REQUESTED_RELEVANCE_OPTIONS_MAX_HITS_TO_PROCESS_STAT =
SearchMovingAverage.export("requested_relevance_options_max_hits_to_process");
@VisibleForTesting
static final SearchCounter REQUESTED_MAX_HITS_TO_PROCESS_ARE_DIFFERENT_STAT =
SearchCounter.export("requested_max_hits_to_process_are_different");
private static final SearchRateCounter REQUEST_WITH_MORE_THAN_2K_NUM_RESULTS_STAT =
SearchRateCounter.export("request_with_more_than_2k_num_result");
private static final SearchRateCounter REQUEST_WITH_MORE_THAN_4K_NUM_RESULTS_STAT =
SearchRateCounter.export("request_with_more_than_4k_num_result");
// Stats for tracking clock skew between earlybird and the client-specified request timestamp.
@VisibleForTesting
public static final SearchTimerStats CLIENT_CLOCK_DIFF_ABS =
SearchTimerStats.export("client_clock_diff_abs", TimeUnit.MILLISECONDS, false, true);
@VisibleForTesting
public static final SearchTimerStats CLIENT_CLOCK_DIFF_POS =
SearchTimerStats.export("client_clock_diff_pos", TimeUnit.MILLISECONDS, false, true);
@VisibleForTesting
public static final SearchTimerStats CLIENT_CLOCK_DIFF_NEG =
SearchTimerStats.export("client_clock_diff_neg", TimeUnit.MILLISECONDS, false, true);
@VisibleForTesting
public static final SearchRateCounter CLIENT_CLOCK_DIFF_MISSING =
SearchRateCounter.export("client_clock_diff_missing");
private static final int MAX_NUM_RESULTS = 4000;
private static final int OLD_MAX_NUM_RESULTS = 2000;
private EarlybirdRequestUtil() {
}
/**
* Logs and fixes some potentially excessive values in the given request.
*/
public static void logAndFixExcessiveValues(EarlybirdRequest request) {
ThriftSearchQuery searchQuery = request.getSearchQuery();
if (searchQuery != null) {
int maxHitsToProcess = 0;
int numResultsToReturn = 0;
if (searchQuery.isSetCollectorParams()) {
numResultsToReturn = searchQuery.getCollectorParams().getNumResultsToReturn();
if (searchQuery.getCollectorParams().isSetTerminationParams()) {
maxHitsToProcess =
searchQuery.getCollectorParams().getTerminationParams().getMaxHitsToProcess();
}
}
if (maxHitsToProcess > 50000) {
LOG.warn("Excessive max hits in " + request.toString());
}
// We used to limit number of results to 2000. These two counters help us track if we receive
// too many requests with large number of results set.
String warningMessageTemplate = "Exceeded %d num results in %s";
if (numResultsToReturn > MAX_NUM_RESULTS) {
LOG.warn(String.format(warningMessageTemplate, MAX_NUM_RESULTS, request.toString()));
REQUEST_WITH_MORE_THAN_4K_NUM_RESULTS_STAT.increment();
searchQuery.getCollectorParams().setNumResultsToReturn(MAX_NUM_RESULTS);
} else if (numResultsToReturn > OLD_MAX_NUM_RESULTS) {
LOG.warn(String.format(warningMessageTemplate, OLD_MAX_NUM_RESULTS, request.toString()));
REQUEST_WITH_MORE_THAN_2K_NUM_RESULTS_STAT.increment();
}
ThriftSearchRelevanceOptions options = searchQuery.getRelevanceOptions();
if (options != null) {
if (options.getMaxHitsToProcess() > 50000) {
LOG.warn("Excessive max hits in " + request.toString());
}
}
}
}
/**
* Sets {@code request.searchQuery.collectorParams} if they are not already set.
*/
public static void checkAndSetCollectorParams(EarlybirdRequest request) {
ThriftSearchQuery searchQuery = request.getSearchQuery();
if (searchQuery == null) {
return;
}
if (!searchQuery.isSetCollectorParams()) {
searchQuery.setCollectorParams(new CollectorParams());
}
if (!searchQuery.getCollectorParams().isSetNumResultsToReturn()) {
searchQuery.getCollectorParams().setNumResultsToReturn(searchQuery.getNumResults());
}
if (!searchQuery.getCollectorParams().isSetTerminationParams()) {
CollectorTerminationParams terminationParams = new CollectorTerminationParams();
if (request.isSetTimeoutMs()) {
terminationParams.setTimeoutMs(request.getTimeoutMs());
}
if (request.isSetMaxQueryCost()) {
terminationParams.setMaxQueryCost(request.getMaxQueryCost());
}
searchQuery.getCollectorParams().setTerminationParams(terminationParams);
}
setMaxHitsToProcess(searchQuery);
}
// Earlybirds will only look for maxHitsToProcess in CollectorParams.TerminationParams.
// The priority order for setting CollectorParams.TerminationParams.maxHitsToProcess is:
// 1. CollectorParams
// 2. RelevanceOptions
// 3. ThriftSearchQuery.maxHitsToProcess
private static void setMaxHitsToProcess(ThriftSearchQuery thriftSearchQuery) {
CollectorTerminationParams terminationParams = thriftSearchQuery
.getCollectorParams().getTerminationParams();
if (!terminationParams.isSetMaxHitsToProcess()) {
if (thriftSearchQuery.isSetRelevanceOptions()
&& thriftSearchQuery.getRelevanceOptions().isSetMaxHitsToProcess()) {
terminationParams.setMaxHitsToProcess(
thriftSearchQuery.getRelevanceOptions().getMaxHitsToProcess());
} else {
terminationParams.setMaxHitsToProcess(thriftSearchQuery.getMaxHitsToProcess());
}
}
}
/**
* Creates a copy of the given request and unsets the binary fields to make the logged line for
* this request look nicer.
*/
public static EarlybirdRequest copyAndClearUnnecessaryValuesForLogging(EarlybirdRequest request) {
EarlybirdRequest copiedRequest = request.deepCopy();
if (copiedRequest.isSetSearchQuery()) {
// These fields are very large and the binary data doesn't play well with formz
copiedRequest.getSearchQuery().unsetTrustedFilter();
copiedRequest.getSearchQuery().unsetDirectFollowFilter();
}
return copiedRequest;
}
/**
* Updates some hit-related stats based on the parameters in the given request.
*/
public static void updateHitsCounters(EarlybirdRequest request) {
if ((request == null) || !request.isSetSearchQuery()) {
return;
}
ThriftSearchQuery searchQuery = request.getSearchQuery();
if (searchQuery.isSetNumResults()) {
REQUESTED_NUM_RESULTS_STAT.addSample(searchQuery.getNumResults());
}
if (searchQuery.isSetMaxHitsToProcess()) {
REQUESTED_MAX_HITS_TO_PROCESS_STAT.addSample(searchQuery.getMaxHitsToProcess());
}
Integer collectorParamsMaxHitsToProcess = null;
if (searchQuery.isSetCollectorParams()
&& searchQuery.getCollectorParams().isSetTerminationParams()
&& searchQuery.getCollectorParams().getTerminationParams().isSetMaxHitsToProcess()) {
collectorParamsMaxHitsToProcess =
searchQuery.getCollectorParams().getTerminationParams().getMaxHitsToProcess();
REQUESTED_COLLECTOR_PARAMS_MAX_HITS_TO_PROCESS_STAT
.addSample(collectorParamsMaxHitsToProcess);
}
Integer relevanceOptionsMaxHitsToProcess = null;
if (searchQuery.isSetRelevanceOptions()
&& searchQuery.getRelevanceOptions().isSetMaxHitsToProcess()) {
relevanceOptionsMaxHitsToProcess = searchQuery.getRelevanceOptions().getMaxHitsToProcess();
REQUESTED_RELEVANCE_OPTIONS_MAX_HITS_TO_PROCESS_STAT
.addSample(relevanceOptionsMaxHitsToProcess);
}
if ((collectorParamsMaxHitsToProcess != null)
&& (relevanceOptionsMaxHitsToProcess != null)
// Compare with equals(): these are boxed Integers, so != would compare references.
&& !collectorParamsMaxHitsToProcess.equals(relevanceOptionsMaxHitsToProcess)) {
REQUESTED_MAX_HITS_TO_PROCESS_ARE_DIFFERENT_STAT.increment();
}
}
public static boolean isCachingAllowed(EarlybirdRequest request) {
return !request.isSetCachingParams() || request.getCachingParams().isCache();
}
/**
* Track the clock difference between this server and its client's specified request time.
* When there is no clock drift between machines, this will record the in-flight time between
* the client and this server.
*
* @param request the incoming earlybird request.
*/
public static void recordClientClockDiff(EarlybirdRequest request) {
if (request.isSetClientRequestTimeMs()) {
final long timeDiff = System.currentTimeMillis() - request.getClientRequestTimeMs();
final long timeDiffAbs = Math.abs(timeDiff);
if (timeDiff >= 0) {
CLIENT_CLOCK_DIFF_POS.timerIncrement(timeDiffAbs);
} else {
CLIENT_CLOCK_DIFF_NEG.timerIncrement(timeDiffAbs);
}
CLIENT_CLOCK_DIFF_ABS.timerIncrement(timeDiffAbs);
} else {
CLIENT_CLOCK_DIFF_MISSING.increment();
}
}
}
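A hedged sketch of how these helpers might run together when a request first arrives (the request variable is hypothetical):

// Normalize the request so downstream code can rely on collectorParams being set.
EarlybirdRequestUtil.checkAndSetCollectorParams(request);

// Clamp and count suspicious values, then export hit and clock-skew stats.
EarlybirdRequestUtil.logAndFixExcessiveValues(request);
EarlybirdRequestUtil.updateHitsCounters(request);
EarlybirdRequestUtil.recordClientClockDiff(request);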

View File

@ -1,28 +0,0 @@
package com.twitter.search.earlybird.common;
import javax.inject.Inject;
import javax.inject.Singleton;
import org.apache.thrift.protocol.TProtocolFactory;
import com.twitter.finagle.Service;
import com.twitter.search.common.util.thrift.ThriftToBytesFilter;
import com.twitter.search.earlybird.thrift.EarlybirdService;
@Singleton
public class EarlybirdThriftBackend extends EarlybirdService.ServiceToClient {
/**
 * Wraps the byte-level service back into an EarlybirdService.ServiceToClient, which
 * is again an EarlybirdService.ServiceIface.
 */
@Inject
public EarlybirdThriftBackend(
ThriftToBytesFilter thriftToBytesFilter,
Service<byte[], byte[]> byteService,
TProtocolFactory protocolFactory) {
super(thriftToBytesFilter.andThen(byteService), protocolFactory);
}
}
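A hedged wiring sketch without the injector (the three constructor arguments are hypothetical bindings that Guice would normally provide):

EarlybirdService.ServiceIface backend =
    new EarlybirdThriftBackend(thriftToBytesFilter, byteService, protocolFactory);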

View File

@ -1,34 +0,0 @@
package com.twitter.search.earlybird.common;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.search.common.metrics.SearchRateCounter;
/**
 * When incremented, this counter triggers a non-paging alert. Use it to assert bad
 * conditions that should generally never happen.
 */
public class NonPagingAssert {
private static final Logger LOG = LoggerFactory.getLogger(NonPagingAssert.class);
private static final String ASSERT_STAT_PREFIX = "non_paging_assert_";
private final String name;
private final SearchRateCounter assertCounter;
public NonPagingAssert(String name) {
this.name = name;
this.assertCounter = SearchRateCounter.export(ASSERT_STAT_PREFIX + name);
}
public void assertFailed() {
LOG.error("NonPagingAssert failed: {}", name);
assertCounter.increment();
}
public static void assertFailed(String name) {
NonPagingAssert nonPagingAssert = new NonPagingAssert(name);
nonPagingAssert.assertFailed();
}
}
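A usage sketch (the assert name and the triggering condition are hypothetical):

NonPagingAssert unexpectedEmptySegment = new NonPagingAssert("unexpected_empty_segment");
if (segmentIsUnexpectedlyEmpty) {
  // Logs an error and bumps non_paging_assert_unexpected_empty_segment.
  unexpectedEmptySegment.assertFailed();
}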

View File

@ -1,55 +0,0 @@
package com.twitter.search.earlybird.common;
import org.apache.thrift.TException;
import org.apache.thrift.TSerializer;
import org.apache.thrift.protocol.TSimpleJSONProtocol;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;
import com.twitter.search.earlybird.thrift.EarlybirdResponse;
public class RequestResponseForLogging {
private static final Logger LOG = LoggerFactory.getLogger(
RequestResponseForLogging.class);
private static final Logger FAILED_REQUEST_LOG = LoggerFactory.getLogger(
RequestResponseForLogging.class.getName() + ".FailedRequests");
private final EarlybirdRequest request;
private final EarlybirdResponse response;
public RequestResponseForLogging(EarlybirdRequest request,
EarlybirdResponse response) {
this.request = request;
this.response = response;
}
private String serialize(EarlybirdRequest clearedRequest, EarlybirdResponse theResponse) {
TSerializer serializer = new TSerializer(new TSimpleJSONProtocol.Factory());
try {
String requestJson = serializer.toString(clearedRequest);
String responseJson = serializer.toString(theResponse);
return "{\"request\":" + requestJson + ", \"response\":" + responseJson + "}";
} catch (TException e) {
LOG.error("Failed to serialize request/response for logging.", e);
return "";
}
}
/**
* Logs the request and response stored in this instance to the failure log file.
*/
public void logFailedRequest() {
// Do the serializing/concatenating this way so it happens on the background thread for
// async logging.
FAILED_REQUEST_LOG.info("{}", new Object() {
@Override
public String toString() {
return serialize(
EarlybirdRequestUtil.copyAndClearUnnecessaryValuesForLogging(request), response);
}
});
}
}
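A hedged sketch of the call site (request and response are hypothetical; the request logger above drives this when the on-error decider is enabled):

new RequestResponseForLogging(request, response).logFailedRequest();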

View File

@ -1,44 +0,0 @@
package com.twitter.search.earlybird.common;
import org.apache.lucene.search.Query;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;
import com.twitter.search.earlybird.thrift.EarlybirdResponse;
public class RequestResponsePair {
private final EarlybirdRequest request;
private final EarlybirdResponse response;
private final org.apache.lucene.search.Query luceneQuery;
// The serialized query in its final form, after various modifications have been applied to it.
// Note: some code paths can leave this null, though they do not appear to be triggered in
// production right now.
private final com.twitter.search.queryparser.query.Query finalSerializedQuery;
public RequestResponsePair(
EarlybirdRequest request,
com.twitter.search.queryparser.query.Query finalSerializedQuery,
org.apache.lucene.search.Query luceneQuery,
EarlybirdResponse response) {
this.request = request;
this.luceneQuery = luceneQuery;
this.response = response;
this.finalSerializedQuery = finalSerializedQuery;
}
public String getFinalSerializedQuery() {
return finalSerializedQuery != null ? finalSerializedQuery.serialize() : "N/A";
}
public EarlybirdRequest getRequest() {
return request;
}
public EarlybirdResponse getResponse() {
return response;
}
public Query getLuceneQuery() {
return luceneQuery;
}
}
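A construction sketch (all four arguments are hypothetical objects produced earlier in the search path):

RequestResponsePair pair =
    new RequestResponsePair(request, finalSerializedQuery, luceneQuery, response);
String serialized = pair.getFinalSerializedQuery(); // "N/A" when the query was null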

View File

@ -1,77 +0,0 @@
package com.twitter.search.earlybird.common;
import org.apache.commons.codec.binary.Base64;
import org.apache.thrift.TException;
import org.apache.thrift.TSerializer;
import org.apache.thrift.protocol.TBinaryProtocol;
import org.slf4j.Logger;
import com.twitter.search.common.util.FinagleUtil;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;
/**
 * This class logs all requests that are missing either the Finagle ID or the client ID.
 */
public final class UnknownClientRequestForLogging {
private static final Logger GENERAL_LOG = org.slf4j.LoggerFactory.getLogger(
UnknownClientRequestForLogging.class);
private static final Logger LOG = org.slf4j.LoggerFactory.getLogger(
UnknownClientRequestForLogging.class.getName() + ".unknownClientRequests");
private final String logLine;
private final EarlybirdRequest request;
private final String clientId;
private final String finagleId;
private final Base64 base64 = new Base64();
private final TSerializer serializer = new TSerializer(new TBinaryProtocol.Factory());
private UnknownClientRequestForLogging(
String logLine,
EarlybirdRequest request,
String clientId,
String finagleId) {
this.logLine = logLine;
this.request = request;
this.clientId = clientId;
this.finagleId = finagleId;
}
/**
* Returns an UnknownClientRequestForLogging instance if a client ID is not set on the given
* earlybird request. If the request has a client ID set, {@code null} is returned.
*
* @param logLine Additional information to propagate to the log file, when logging this request.
* @param request The earlybird request.
*/
public static UnknownClientRequestForLogging unknownClientRequest(
String logLine, EarlybirdRequest request) {
String clientId = ClientIdUtil.getClientIdFromRequest(request);
String finagleId = FinagleUtil.getFinagleClientName();
if (clientId.equals(ClientIdUtil.UNSET_CLIENT_ID)) {
return new UnknownClientRequestForLogging(logLine, request, clientId, finagleId);
} else {
return null;
}
}
private String asBase64() {
try {
// Need to make a deepCopy() here, because the request may still be in use (e.g. if we are
// doing this in the pre-logger), and we should not be modifying crucial fields on the
// EarlybirdRequest in place.
EarlybirdRequest clearedRequest = request.deepCopy();
clearedRequest.unsetClientRequestTimeMs();
return base64.encodeToString(serializer.serialize(clearedRequest));
} catch (TException e) {
GENERAL_LOG.error("Failed to serialize request for logging.", e);
return "failed_to_serialize";
}
}
public void log() {
LOG.info("{},{},{},{}", clientId, finagleId, logLine, asBase64());
}
}
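As the request logger above shows, the null return value doubles as the "client ID was set" signal; a hedged sketch of the call site (logLine and request are hypothetical):

UnknownClientRequestForLogging unknownClientRequestLogger =
    UnknownClientRequestForLogging.unknownClientRequest(logLine, request);
if (unknownClientRequestLogger != null) {
  unknownClientRequestLogger.log(); // only reached when the clientId was "unset"
}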

View File

@ -1,21 +0,0 @@
java_library(
sources = ["*.java"],
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
"3rdparty/jvm/com/google/code/findbugs:jsr305",
"3rdparty/jvm/com/google/guava",
"3rdparty/jvm/org/apache/commons:commons-lang3",
"3rdparty/jvm/org/apache/thrift:libthrift",
"3rdparty/jvm/org/apache/zookeeper:zookeeper-client",
"3rdparty/jvm/org/slf4j:slf4j-api",
"3rdparty/jvm/org/yaml:snakeyaml",
"finagle-internal/mtls/src/main/scala/com/twitter/finagle/mtls/authentication",
"src/java/com/twitter/common/base",
"src/java/com/twitter/common_internal/text/version",
"src/java/com/twitter/search/common/aurora",
"src/java/com/twitter/search/common/config",
"src/java/com/twitter/search/common/metrics",
"src/java/com/twitter/search/common/util/zookeeper",
],
)

View File

@ -1,363 +0,0 @@
package com.twitter.search.earlybird.common.config;
import java.util.Date;
import java.util.List;
import java.util.Map;
import javax.annotation.Nullable;
import com.google.common.collect.ImmutableMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.common_internal.text.version.PenguinVersion;
import com.twitter.search.common.aurora.AuroraInstanceKey;
import com.twitter.search.common.config.Config;
import com.twitter.search.common.config.ConfigFile;
import com.twitter.search.common.config.ConfigurationException;
import com.twitter.search.common.config.SearchPenguinVersionsConfig;
public final class EarlybirdConfig {
private static final Logger LOG = LoggerFactory.getLogger(EarlybirdConfig.class);
private static final String DEFAULT_CONFIG_FILE = "earlybird-search.yml";
private static final String LATE_TWEET_BUFFER_KEY = "late_tweet_buffer";
public static final String EARLYBIRD_ZK_CONFIG_DIR = "/twitter/search/production/earlybird/";
public static final String EARLYBIRD_CONFIG_DIR = "earlybird/config";
public static final String USER_SNAPSHOT_BASE_DIR = "user_snapshot_base_dir";
private static volatile ConfigFile earlybirdConfig = null;
private static volatile Map<String, Object> overrideValueMap = ImmutableMap.of();
private static String logDirOverride = null;
private static AuroraInstanceKey auroraInstanceKey = null;
private static int adminPort;
private EarlybirdConfig() { }
private static final class PenguinVersionHolder {
private static final PenguinVersion PENGUIN_VERSION_SINGLETON =
SearchPenguinVersionsConfig.getSingleSupportedVersion(
EarlybirdProperty.PENGUIN_VERSION.get());
private static final byte PENGUIN_VERSION_BYTE_VALUE =
PENGUIN_VERSION_SINGLETON.getByteValue();
}
public static byte getPenguinVersionByte() {
return PenguinVersionHolder.PENGUIN_VERSION_BYTE_VALUE;
}
public static PenguinVersion getPenguinVersion() {
return PenguinVersionHolder.PENGUIN_VERSION_SINGLETON;
}
/**
* Reads the earlybird configuration from the given file.
*/
public static synchronized void init(@Nullable String configFile) {
if (earlybirdConfig == null) {
String file = configFile == null ? DEFAULT_CONFIG_FILE : configFile;
earlybirdConfig = new ConfigFile(EARLYBIRD_CONFIG_DIR, file);
}
}
public static synchronized void setOverrideValues(Map<String, Object> overrideValues) {
overrideValueMap = ImmutableMap.copyOf(overrideValues);
}
/**
 * Packs all config values into a string that can be printed for informational purposes.
 * @return the string.
 */
public static String allValuesAsString() {
Map<String, String> stringMap = earlybirdConfig.getStringMap();
StringBuilder stringBuilder = new StringBuilder();
stringBuilder.append("Config environment: " + Config.getEnvironment() + "\n\n");
stringBuilder.append(
String.format("Values from earlybird-search.yml (total %d):\n", stringMap.size()));
stringMap.forEach((key, value) -> {
stringBuilder.append(String.format(" %s: %s\n", key, value.toString()));
if (overrideValueMap.containsKey(key)) {
stringBuilder.append(String.format(
" override value: %s\n", overrideValueMap.get(key).toString()));
}
});
stringBuilder.append(String.format(
"\n\nAll command-line overrides (total: %d):\n", overrideValueMap.size()));
overrideValueMap.forEach((key, value) -> {
stringBuilder.append(String.format(" %s: %s\n", key, value.toString()));
});
return stringBuilder.toString();
}
/**
* Returns the value of the given property as a string. If the property is not set, a runtime
* exception is thrown.
*/
public static String getString(String property) {
Object overrideValue = overrideValueMap.get(property);
if (overrideValue != null) {
return (String) overrideValue;
}
try {
return earlybirdConfig.getString(property);
} catch (ConfigurationException e) {
LOG.error("Fatal error: could not get config string " + property, e);
throw new RuntimeException(e);
}
}
/**
* Returns the value of the given property as a string.
*/
public static String getString(String property, String defaultValue) {
Object overrideValue = overrideValueMap.get(property);
if (overrideValue != null) {
return (String) overrideValue;
}
return earlybirdConfig.getString(property, defaultValue);
}
/**
* Returns the value of the given property as an integer. If the property is not set, a runtime
* exception is thrown.
*/
public static int getInt(String property) {
Object overrideValue = overrideValueMap.get(property);
if (overrideValue != null) {
return (int) overrideValue;
}
try {
return earlybirdConfig.getInt(property);
} catch (ConfigurationException e) {
LOG.error("Fatal error: could not get config int " + property, e);
throw new RuntimeException(e);
}
}
/**
* Returns the value of the given property as an integer.
*/
public static int getInt(String property, int defaultValue) {
Object overrideValue = overrideValueMap.get(property);
if (overrideValue != null) {
return (int) overrideValue;
}
return earlybirdConfig.getInt(property, defaultValue);
}
/**
* Returns the value of the given property as a double.
*/
public static double getDouble(String property, double defaultValue) {
Object overrideValue = overrideValueMap.get(property);
if (overrideValue != null) {
return (double) overrideValue;
}
return earlybirdConfig.getDouble(property, defaultValue);
}
/**
* Returns the value of the given property as a long. If the property is not set, a runtime
* exception is thrown.
*/
public static long getLong(String property) {
Object overrideValue = overrideValueMap.get(property);
if (overrideValue != null) {
return (long) overrideValue;
}
try {
return earlybirdConfig.getLong(property);
} catch (ConfigurationException e) {
LOG.error("Fatal error: could not get config long " + property, e);
throw new RuntimeException(e);
}
}
/**
* Returns the value of the given property as a long.
*/
public static long getLong(String property, long defaultValue) {
Object overrideValue = overrideValueMap.get(property);
if (overrideValue != null) {
return (long) overrideValue;
}
return earlybirdConfig.getLong(property, defaultValue);
}
/**
* Returns the value of the given property as a boolean. If the property is not set, a runtime
* exception is thrown.
*/
public static boolean getBool(String property) {
Object overrideValue = overrideValueMap.get(property);
if (overrideValue != null) {
return (boolean) overrideValue;
}
try {
return earlybirdConfig.getBool(property);
} catch (ConfigurationException e) {
LOG.error("Fatal error: could not get config boolean " + property, e);
throw new RuntimeException(e);
}
}
/**
* Returns the value of the given property as a boolean.
*/
public static boolean getBool(String property, boolean defaultValue) {
Object overrideValue = overrideValueMap.get(property);
if (overrideValue != null) {
return (boolean) overrideValue;
}
return earlybirdConfig.getBool(property, defaultValue);
}
/**
* Returns the value of the given property as a date.
*/
public static Date getDate(String property) {
Object overrideValue = overrideValueMap.get(property);
if (overrideValue != null) {
return (Date) overrideValue;
}
Date date = (Date) earlybirdConfig.getObject(property, null);
if (date == null) {
throw new RuntimeException("Could not get config date: " + property);
}
return date;
}
/**
* Returns the value of the given property as a list of strings.
*/
public static List<String> getListOfStrings(String property) {
Object overrideValue = overrideValueMap.get(property);
if (overrideValue != null) {
return (List<String>) overrideValue;
}
List<String> list = (List<String>) earlybirdConfig.getObject(property, null);
if (list == null) {
throw new RuntimeException("Could not get list of strings: " + property);
}
return list;
}
/**
* Returns the value of the given property as a map.
*/
@SuppressWarnings("unchecked")
public static Map<String, Object> getMap(String property) {
Map<String, Object> map = (Map<String, Object>) earlybirdConfig.getObject(property, null);
if (map == null) {
throw new RuntimeException("Could not find config property: " + property);
}
return map;
}
public static int getMaxSegmentSize() {
return EarlybirdConfig.getInt("max_segment_size", 1 << 16);
}
/**
* Returns the log properties file.
*/
public static String getLogPropertiesFile() {
try {
String filename = earlybirdConfig.getString("log_properties_filename");
return earlybirdConfig.getConfigFilePath(filename);
} catch (ConfigurationException e) {
// Note: the logging system itself may not be fully initialized yet when this fails.
LOG.error("Fatal error: could not get log properties file", e);
throw new RuntimeException(e);
}
}
/**
* Returns the log directory.
*/
public static String getLogDir() {
if (logDirOverride != null) {
return logDirOverride;
} else {
return EarlybirdConfig.getString("log_dir");
}
}
public static void overrideLogDir(String logDir) {
EarlybirdConfig.logDirOverride = logDir;
}
public static int getThriftPort() {
return EarlybirdProperty.THRIFT_PORT.get();
}
public static int getWarmUpThriftPort() {
return EarlybirdProperty.WARMUP_THRIFT_PORT.get();
}
public static int getSearcherThreads() {
return EarlybirdProperty.SEARCHER_THREADS.get();
}
public static int getLateTweetBuffer() {
return getInt(LATE_TWEET_BUFFER_KEY);
}
public static int getAdminPort() {
return adminPort;
}
public static void setAdminPort(int adminPort) {
EarlybirdConfig.adminPort = adminPort;
}
public static boolean isRealtimeOrProtected() {
String earlybirdName = EarlybirdProperty.EARLYBIRD_NAME.get();
return earlybirdName.contains("realtime") || earlybirdName.contains("protected");
}
public static boolean consumeUserScrubGeoEvents() {
return EarlybirdProperty.CONSUME_GEO_SCRUB_EVENTS.get();
}
@Nullable
public static AuroraInstanceKey getAuroraInstanceKey() {
return auroraInstanceKey;
}
public static void setAuroraInstanceKey(AuroraInstanceKey auroraInstanceKey) {
EarlybirdConfig.auroraInstanceKey = auroraInstanceKey;
}
public static boolean isAurora() {
return auroraInstanceKey != null;
}
public static void setForTests(String property, Object value) {
earlybirdConfig.setForTests(DEFAULT_CONFIG_FILE, property, value);
}
public static synchronized void clearForTests() {
earlybirdConfig = new ConfigFile(EARLYBIRD_CONFIG_DIR, DEFAULT_CONFIG_FILE);
}
}
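A hedged sketch of startup wiring (the override map is hypothetical; reads fall back to earlybird-search.yml when a key is not overridden):

EarlybirdConfig.init(null); // loads the default earlybird-search.yml
EarlybirdConfig.setOverrideValues(ImmutableMap.of("max_segment_size", 1 << 17));

int maxSegmentSize = EarlybirdConfig.getMaxSegmentSize();   // 131072, from the override
int lateTweetBuffer = EarlybirdConfig.getLateTweetBuffer(); // throws if late_tweet_buffer is unset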

View File

@ -1,390 +0,0 @@
package com.twitter.search.earlybird.common.config;
import java.lang.reflect.Modifier;
import java.util.Arrays;
import java.util.List;
import java.util.function.BiFunction;
import java.util.function.Function;
import java.util.stream.Collectors;
import com.google.common.collect.ImmutableList;
import com.twitter.app.Flag;
import com.twitter.app.Flaggable;
import com.twitter.app.Flags;
import com.twitter.finagle.mtls.authentication.ServiceIdentifier;
/**
* Stateless class that represents an Earlybird property that can be specified by a command line
* flag.
* <p>
* This is a regular Java class instead of an enum so that it can have a generic type.
*
* @param <T>
*/
public final class EarlybirdProperty<T> {
private static final class PropertyType<T> {
private static final PropertyType<Boolean> BOOLEAN = new PropertyType<>(
Flaggable.ofJavaBoolean(), EarlybirdConfig::getBool, EarlybirdConfig::getBool);
private static final PropertyType<Integer> INT = new PropertyType<>(
Flaggable.ofJavaInteger(), EarlybirdConfig::getInt, EarlybirdConfig::getInt);
private static final PropertyType<String> STRING = new PropertyType<>(
Flaggable.ofString(), EarlybirdConfig::getString, EarlybirdConfig::getString);
private final Flaggable<T> flaggable;
private final Function<String, T> getter;
private final BiFunction<String, T, T> getterWithDefault;
private PropertyType(Flaggable<T> flaggable, Function<String, T> getter,
BiFunction<String, T, T> getterWithDefault) {
this.flaggable = flaggable;
this.getter = getter;
this.getterWithDefault = getterWithDefault;
}
}
public static final EarlybirdProperty<String> PENGUIN_VERSION =
new EarlybirdProperty<>(
"penguin_version",
"The penguin version to index.",
PropertyType.STRING,
false);
public static final EarlybirdProperty<Integer> THRIFT_PORT = new EarlybirdProperty<>(
"thrift_port",
"override thrift port from config file",
PropertyType.INT,
false);
public static final EarlybirdProperty<Integer> WARMUP_THRIFT_PORT = new EarlybirdProperty<>(
"warmup_thrift_port",
"override warmup thrift port from config file",
PropertyType.INT,
false);
public static final EarlybirdProperty<Integer> SEARCHER_THREADS = new EarlybirdProperty<>(
"searcher_threads",
"override number of searcher threads from config file",
PropertyType.INT,
false);
public static final EarlybirdProperty<String> EARLYBIRD_TIER = new EarlybirdProperty<>(
"earlybird_tier",
"the earlybird tier (e.g. tier1), used on Aurora",
PropertyType.STRING,
true);
public static final EarlybirdProperty<Integer> REPLICA_ID = new EarlybirdProperty<>(
"replica_id",
"the ID in a partition, used on Aurora",
PropertyType.INT,
true);
public static final EarlybirdProperty<Integer> PARTITION_ID = new EarlybirdProperty<>(
"partition_id",
"partition ID, used on Aurora",
PropertyType.INT,
true);
public static final EarlybirdProperty<Integer> NUM_PARTITIONS = new EarlybirdProperty<>(
"num_partitions",
"number of partitions, used on Aurora",
PropertyType.INT,
true);
public static final EarlybirdProperty<Integer> NUM_INSTANCES = new EarlybirdProperty<>(
"num_instances",
"number of instances in the job, used on Aurora",
PropertyType.INT,
true);
public static final EarlybirdProperty<Integer> SERVING_TIMESLICES = new EarlybirdProperty<>(
"serving_timeslices",
"number of time slices to serve, used on Aurora",
PropertyType.INT,
true);
public static final EarlybirdProperty<String> ROLE = new EarlybirdProperty<>(
"role",
"Role in the service path of Earlybird",
PropertyType.STRING,
true,
true);
public static final EarlybirdProperty<String> EARLYBIRD_NAME = new EarlybirdProperty<>(
"earlybird_name",
"Name in the service path of Earlybird without hash partition suffix",
PropertyType.STRING,
true,
true);
public static final EarlybirdProperty<String> ENV = new EarlybirdProperty<>(
"env",
"Environment in the service path of Earlybird",
PropertyType.STRING,
true,
true);
public static final EarlybirdProperty<String> ZONE = new EarlybirdProperty<>(
"zone",
"Zone (data center) in the service path of Earlybird",
PropertyType.STRING,
true,
true);
public static final EarlybirdProperty<String> DL_URI = new EarlybirdProperty<>(
"dl_uri",
"DistributedLog URI for default DL reader",
PropertyType.STRING,
false);
public static final EarlybirdProperty<String> USER_UPDATES_DL_URI = new EarlybirdProperty<>(
"user_updates_dl_uri",
"DistributedLog URI for user updates DL reader",
PropertyType.STRING,
false);
public static final EarlybirdProperty<String> ANTISOCIAL_USERUPDATES_DL_STREAM =
new EarlybirdProperty<>(
"antisocial_userupdates_dl_stream",
"DL stream name for antisocial user updates without DL version suffix",
PropertyType.STRING,
false);
public static final EarlybirdProperty<String> ZK_APP_ROOT = new EarlybirdProperty<>(
"zk_app_root",
"SZooKeeper base root path for this application",
PropertyType.STRING,
true);
public static final EarlybirdProperty<Boolean> SEGMENT_LOAD_FROM_HDFS_ENABLED =
new EarlybirdProperty<>(
"segment_load_from_hdfs_enabled",
"Whether to load segment data from HDFS",
PropertyType.BOOLEAN,
false);
public static final EarlybirdProperty<Boolean> SEGMENT_FLUSH_TO_HDFS_ENABLED =
new EarlybirdProperty<>(
"segment_flush_to_hdfs_enabled",
"Whether to flush segment data to HDFS",
PropertyType.BOOLEAN,
false);
public static final EarlybirdProperty<String> HDFS_SEGMENT_SYNC_DIR = new EarlybirdProperty<>(
"hdfs_segment_sync_dir",
"HDFS directory to sync segment data",
PropertyType.STRING,
false);
public static final EarlybirdProperty<String> HDFS_SEGMENT_UPLOAD_DIR = new EarlybirdProperty<>(
"hdfs_segment_upload_dir",
"HDFS directory to upload segment data",
PropertyType.STRING,
false);
public static final EarlybirdProperty<Boolean> ARCHIVE_DAILY_STATUS_BATCH_FLUSHING_ENABLED =
new EarlybirdProperty<>(
"archive_daily_status_batch_flushing_enabled",
"Whether to enable archive daily status batch flushing",
PropertyType.BOOLEAN,
false);
public static final EarlybirdProperty<String> HDFS_INDEX_SYNC_DIR = new EarlybirdProperty<>(
"hdfs_index_sync_dir",
"HDFS directory to sync index data",
PropertyType.STRING,
true);
public static final EarlybirdProperty<Boolean> READ_INDEX_FROM_PROD_LOCATION =
new EarlybirdProperty<>(
"read_index_from_prod_location",
"Read index from prod to speed up startup on staging / loadtest",
PropertyType.BOOLEAN,
false);
public static final EarlybirdProperty<Boolean> USE_DECIDER_OVERLAY = new EarlybirdProperty<>(
"use_decider_overlay",
"Whether to use decider overlay",
PropertyType.BOOLEAN,
false);
public static final EarlybirdProperty<String> DECIDER_OVERLAY_CONFIG = new EarlybirdProperty<>(
"decider_overlay_config",
"Path to decider overlay config",
PropertyType.STRING,
false);
public static final EarlybirdProperty<Integer> MAX_CONCURRENT_SEGMENT_INDEXERS =
new EarlybirdProperty<>(
"max_concurrent_segment_indexers",
"Maximum number of segments indexed concurrently",
PropertyType.INT,
false);
public static final EarlybirdProperty<Boolean> TF_MODELS_ENABLED =
new EarlybirdProperty<>(
"tf_models_enabled",
"Whether tensorflow models should be loaded",
PropertyType.BOOLEAN,
false);
public static final EarlybirdProperty<String> TF_MODELS_CONFIG_PATH =
new EarlybirdProperty<>(
"tf_models_config_path",
"The configuration path of the yaml file containing the list of tensorflow models to load.",
PropertyType.STRING,
false);
public static final EarlybirdProperty<Integer> TF_INTER_OP_THREADS =
new EarlybirdProperty<>(
"tf_inter_op_threads",
"How many tensorflow inter op threads to use. See TF documentation for more information.",
PropertyType.INT,
false);
public static final EarlybirdProperty<Integer> TF_INTRA_OP_THREADS =
new EarlybirdProperty<>(
"tf_intra_op_threads",
"How many tensorflow intra op threads to use. See TF documentation for more information.",
PropertyType.INT,
false);
public static final EarlybirdProperty<Integer> MAX_ALLOWED_REPLICAS_NOT_IN_SERVER_SET =
new EarlybirdProperty<>(
"max_allowed_replicas_not_in_server_set",
"How many replicas are allowed to be missing from the Earlybird server set.",
PropertyType.INT,
false);
public static final EarlybirdProperty<Boolean> CHECK_NUM_REPLICAS_IN_SERVER_SET =
new EarlybirdProperty<>(
"check_num_replicas_in_server_set",
"Whether CoordinatedEarlybirdActions should check the number of alive replicas",
PropertyType.BOOLEAN,
false);
public static final EarlybirdProperty<Integer> MAX_QUEUE_SIZE =
new EarlybirdProperty<>(
"max_queue_size",
"Maximum size of searcher worker executor queue. If <= 0 queue is unbounded.",
PropertyType.INT,
false);
public static final EarlybirdProperty<String> KAFKA_ENV =
new EarlybirdProperty<>(
"kafka_env",
"The environment to use for kafka topics.",
PropertyType.STRING,
false);
public static final EarlybirdProperty<String> KAFKA_PATH =
new EarlybirdProperty<>(
"kafka_path",
"Wily path to the Search kafka cluster.",
PropertyType.STRING,
false);
public static final EarlybirdProperty<String> TWEET_EVENTS_KAFKA_PATH =
new EarlybirdProperty<>(
"tweet_events_kafka_path",
"Wily path to the tweet-events kafka cluster.",
PropertyType.STRING,
false);
public static final EarlybirdProperty<String> USER_UPDATES_KAFKA_TOPIC =
new EarlybirdProperty<>(
"user_updates_topic",
"Name of the Kafka topic that contain user updates.",
PropertyType.STRING,
false);
public static final EarlybirdProperty<String> USER_SCRUB_GEO_KAFKA_TOPIC =
new EarlybirdProperty<>(
"user_scrub_geo_topic",
"Name of the Kafka topic that contain UserScrubGeoEvents.",
PropertyType.STRING,
false);
public static final EarlybirdProperty<String> EARLYBIRD_SCRUB_GEN =
new EarlybirdProperty<>(
"earlybird_scrub_gen",
"SCRUB_GEN TO DEPLOY",
PropertyType.STRING,
false);
public static final EarlybirdProperty<Boolean> CONSUME_GEO_SCRUB_EVENTS =
new EarlybirdProperty<>(
"consume_geo_scrub_events",
"Whether to consume user scrub geo events or not",
PropertyType.BOOLEAN,
false);
private static final List<EarlybirdProperty<?>> ALL_PROPERTIES =
Arrays.stream(EarlybirdProperty.class.getDeclaredFields())
.filter(field ->
(field.getModifiers() & Modifier.STATIC) > 0
&& field.getType() == EarlybirdProperty.class)
.map(field -> {
try {
return (EarlybirdProperty<?>) field.get(EarlybirdProperty.class);
} catch (Exception e) {
throw new RuntimeException(e);
}
})
.collect(Collectors.collectingAndThen(Collectors.toList(), ImmutableList::copyOf));
public static ServiceIdentifier getServiceIdentifier() {
return new ServiceIdentifier(
ROLE.get(),
EARLYBIRD_NAME.get(),
ENV.get(),
ZONE.get());
}
private final String name;
private final String help;
private final PropertyType<T> type;
private final boolean requiredOnAurora;
private final boolean requiredOnDedicated;
private EarlybirdProperty(String name, String help, PropertyType<T> type,
boolean requiredOnAurora) {
this(name, help, type, requiredOnAurora, false);
}
private EarlybirdProperty(String name, String help, PropertyType<T> type,
boolean requiredOnAurora, boolean requiredOnDedicated) {
this.name = name;
this.help = help;
this.type = type;
this.requiredOnAurora = requiredOnAurora;
this.requiredOnDedicated = requiredOnDedicated;
}
public String name() {
return name;
}
public boolean isRequiredOnAurora() {
return requiredOnAurora;
}
public boolean isRequiredOnDedicated() {
return requiredOnDedicated;
}
public Flag<T> createFlag(Flags flags) {
return flags.createMandatory(name, help, null, type.flaggable);
}
public T get() {
return type.getter.apply(name);
}
public T get(T defaultValue) {
return type.getterWithDefault.apply(name, defaultValue);
}
public static EarlybirdProperty[] values() {
return ALL_PROPERTIES.toArray(new EarlybirdProperty[0]);
}
}
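
The accessors above are simple, but the intended call pattern may be easier to see in a short sketch. This is a hypothetical usage example, not part of the file: it assumes the EarlybirdProperty class above is on the classpath (the import path is a guess based on the neighboring common/config package) and that the underlying EarlybirdConfig has already been loaded.

import com.twitter.search.earlybird.common.config.EarlybirdProperty; // assumed package

final class EarlybirdPropertyUsageSketch {
  public static void main(String[] args) {
    // Optional property: fall back to a default when the flag is unset.
    int maxQueueSize = EarlybirdProperty.MAX_QUEUE_SIZE.get(0);

    // Enumerate properties that must be set when running on Aurora.
    for (EarlybirdProperty<?> property : EarlybirdProperty.values()) {
      if (property.isRequiredOnAurora()) {
        System.out.println("required on Aurora: " + property.name());
      }
    }
    System.out.println("max_queue_size=" + maxQueueSize);
  }
}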

View File

@ -1,45 +0,0 @@
java_library(
sources = ["*.java"],
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
"3rdparty/jvm/com/google/guava",
"3rdparty/jvm/com/google/inject:guice",
"3rdparty/jvm/commons-io",
"3rdparty/jvm/geo/google:geoGoogle",
"3rdparty/jvm/org/apache/bookkeeper:bookkeeper-server",
"3rdparty/jvm/org/apache/bookkeeper:bookkeeper-twitter-science-provider",
"3rdparty/jvm/org/apache/hadoop:hadoop-client-default",
"3rdparty/jvm/org/apache/lucene:lucene-analyzers-common",
"3rdparty/jvm/org/apache/lucene:lucene-analyzers-smartcn",
"3rdparty/jvm/org/apache/lucene:lucene-core",
"3rdparty/jvm/org/apache/lucene:lucene-facet",
"3rdparty/jvm/org/apache/thrift:libthrift",
"3rdparty/jvm/org/apache/zookeeper:zookeeper-client",
"3rdparty/jvm/org/slf4j:slf4j-api",
"3rdparty/src/jvm/com/twitter/scalding:core",
"3rdparty/src/jvm/com/twitter/scalding:date",
"3rdparty/src/jvm/com/twitter/scalding:parquet",
"decider/src/main/scala",
"src/java/com/twitter/common/base",
"src/java/com/twitter/common/util:system-mocks",
"src/java/com/twitter/common_internal/hadoop",
"src/java/com/twitter/search/common/logging",
"src/java/com/twitter/search/common/metrics",
"src/java/com/twitter/search/common/partitioning/snowflakeparser",
"src/java/com/twitter/search/common/schema/earlybird",
"src/java/com/twitter/search/common/util/hash",
"src/java/com/twitter/search/common/util/io",
"src/java/com/twitter/search/common/util/io:dl-reader-writer",
"src/java/com/twitter/search/common/util/io:flushable",
"src/java/com/twitter/search/common/util/io:record-reader-api",
"src/java/com/twitter/search/earlybird/common/config",
"src/scala/com/twitter/scalding_internal/error_handling",
"src/scala/com/twitter/scalding_internal/multiformat",
"src/scala/com/twitter/scalding_internal/source",
"src/scala/com/twitter/search/user_table/sources",
"src/thrift/com/twitter/search/common:indexing-java",
"src/thrift/com/twitter/tweetypie:events-java",
"util/util-core:scala",
],
)

View File

@ -1,100 +0,0 @@
package com.twitter.search.earlybird.common.userupdates;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.metrics.SearchCustomGauge;
import com.twitter.search.common.metrics.SearchTimerStats;
import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser;
import com.twitter.tweetypie.thriftjava.UserScrubGeoEvent;
/**
* Map of users who have taken action to delete location data from their tweets. User IDs are
* mapped to the maxTweetId that will eventually be scrubbed from the index (userId -> maxTweetId).
*
* ConcurrentHashMap is thread safe without synchronizing the whole map. Reads can happen very fast
* while writes are done with a lock. This is ideal since many Earlybird Searcher threads could
* be reading from the map at once, whereas we will only be adding to the map via Kafka.
*
* This map is consulted to filter out tweets that should not be returned for geo queries.
* See: go/realtime-geo-filtering
*/
public class UserScrubGeoMap {
// The number of geo events that contain a user ID already present in the map. This count is used
// to verify the number of users in the map against the number of events consumed from kafka.
private static final SearchCounter USER_SCRUB_GEO_EVENT_EXISTING_USER_COUNT =
SearchCounter.export("user_scrub_geo_event_existing_user_count");
public static final SearchTimerStats USER_SCRUB_GEO_EVENT_LAG_STAT =
SearchTimerStats.export("user_scrub_geo_event_lag",
TimeUnit.MILLISECONDS,
false,
true);
private ConcurrentHashMap<Long, Long> map;
public UserScrubGeoMap() {
map = new ConcurrentHashMap<>();
SearchCustomGauge.export("num_users_in_geo_map", this::getNumUsersInMap);
}
/**
* Ensure that the max_tweet_id in the userScrubGeoEvent is greater than the one already stored
* in the map for the given user id (if any) before updating the entry for this user.
* This will protect Earlybirds from potential issues where out of date UserScrubGeoEvents
* appear in the incoming Kafka stream.
*
* @param userScrubGeoEvent
*/
public void indexUserScrubGeoEvent(UserScrubGeoEvent userScrubGeoEvent) {
long userId = userScrubGeoEvent.getUser_id();
long newMaxTweetId = userScrubGeoEvent.getMax_tweet_id();
long oldMaxTweetId = map.getOrDefault(userId, 0L);
if (map.containsKey(userId)) {
USER_SCRUB_GEO_EVENT_EXISTING_USER_COUNT.increment();
}
map.put(userId, Math.max(oldMaxTweetId, newMaxTweetId));
USER_SCRUB_GEO_EVENT_LAG_STAT.timerIncrement(computeEventLag(newMaxTweetId));
}
/**
* A tweet is geo scrubbed if it is older than the max tweet id that is scrubbed for the tweet's
* author.
* If there is no entry for the tweet's author in the map, then the tweet is not geo scrubbed.
*
* @param tweetId
* @param fromUserId
* @return
*/
public boolean isTweetGeoScrubbed(long tweetId, long fromUserId) {
return tweetId <= map.getOrDefault(fromUserId, 0L);
}
/**
* The lag (in milliseconds) from when a UserScrubGeoEvent is created, until it is applied to the
* UserScrubGeoMap. Take the maxTweetId found in the current event and convert it to a timestamp.
* The maxTweetId will give us a timestamp closest to when Tweetypie processes macaw-geo requests.
*
* @param maxTweetId
* @return
*/
private long computeEventLag(long maxTweetId) {
long eventCreatedAtTime = SnowflakeIdParser.getTimestampFromTweetId(maxTweetId);
return System.currentTimeMillis() - eventCreatedAtTime;
}
public long getNumUsersInMap() {
return map.size();
}
public ConcurrentHashMap<Long, Long> getMap() {
return map;
}
public boolean isEmpty() {
return map.isEmpty();
}
public boolean isSet(long userId) {
return map.containsKey(userId);
}
}
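
A minimal sketch of how this map is meant to be driven, assuming the class above is on the classpath. The setUser_id/setMax_tweet_id setters are assumed to be the thrift-generated counterparts of the getters used in indexUserScrubGeoEvent, and the tweet ID is an arbitrary snowflake-era value so the lag computation has a valid timestamp to parse.

import com.twitter.tweetypie.thriftjava.UserScrubGeoEvent;

final class UserScrubGeoMapSketch {
  public static void main(String[] args) {
    UserScrubGeoMap map = new UserScrubGeoMap();

    long maxScrubbed = 1500000000000000000L; // arbitrary snowflake-era tweet ID

    UserScrubGeoEvent event = new UserScrubGeoEvent();
    event.setUser_id(12L);               // assumed thrift setter
    event.setMax_tweet_id(maxScrubbed);  // assumed thrift setter
    map.indexUserScrubGeoEvent(event);

    // Tweets at or below the scrubbed max tweet ID are filtered from geo queries.
    System.out.println(map.isTweetGeoScrubbed(maxScrubbed - 1, 12L)); // true
    System.out.println(map.isTweetGeoScrubbed(maxScrubbed + 1, 12L)); // false
  }
}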

View File

@ -1,572 +0,0 @@
package com.twitter.search.earlybird.common.userupdates;
import java.util.Iterator;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Predicate;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.search.common.metrics.SearchLongGauge;
import com.twitter.search.common.metrics.SearchRateCounter;
import com.twitter.search.common.util.hash.GeneralLongHashFunction;
/**
* Table containing metadata about users, like NSFW or Antisocial status.
* Used for result filtering.
*/
public class UserTable {
private static final Logger LOG = LoggerFactory.getLogger(UserTable.class);
@VisibleForTesting // Not final for testing.
protected static long userUpdateTableMaxCapacity = 1L << 30;
private static final int DEFAULT_INITIAL_CAPACITY = 1024;
private static final int BYTE_WIDTH = 8;
private static final String USER_TABLE_CAPACITY = "user_table_capacity";
private static final String USER_TABLE_SIZE = "user_table_size";
private static final String
USER_NUM_USERS_WITH_NO_BITS_SET = "user_table_users_with_no_bits_set";
private static final String USER_TABLE_ANTISOCIAL_USERS = "user_table_antisocial_users";
private static final String USER_TABLE_OFFENSIVE_USERS = "user_table_offensive_users";
private static final String USER_TABLE_NSFW_USERS = "user_table_nsfw_users";
private static final String USER_TABLE_IS_PROTECTED_USERS = "user_table_is_protected_users";
/**
* number of users filtered
*/
private static final SearchRateCounter USER_TABLE_USERS_FILTERED_COUNTER =
new SearchRateCounter("user_table_users_filtered");
private SearchLongGauge userTableCapacity;
private SearchLongGauge userTableSize;
private SearchLongGauge userTableNumUsersWithNoBitsSet;
private SearchLongGauge userTableAntisocialUsers;
private SearchLongGauge userTableOffensiveUsers;
private SearchLongGauge userTableNsfwUsers;
private SearchLongGauge userTableIsProtectedUsers;
private final Predicate<Long> userIdFilter;
private long lastRecordTimestamp;
private static final class HashTable {
private int numUsersInTable;
private int numUsersWithNoBitsSet;
// size 8 array contains the number of users who have the bit set at the index (0-7) position
// e.g. setBitCounts[0] stores the number of users who have the 0 bit set in their bytes
private long[] setBitCounts;
private final long[] hash;
private final byte[] bits;
private final int hashMask;
HashTable(int size) {
this.hash = new long[size];
this.bits = new byte[size];
this.hashMask = size - 1;
this.numUsersInTable = 0;
this.setBitCounts = new long[BYTE_WIDTH];
}
protected int hashSize() {
return hash.length;
}
// If we want to decrease the number of users in the table, we can delete up to this many
// users (those with no bits set) by calling filterTableAndCountValidItems.
public void setCountOfNumUsersWithNoBitsSet() {
int count = 0;
for (int i = 0; i < hash.length; i++) {
if ((hash[i] > 0) && (bits[i] == 0)) {
count++;
}
}
numUsersWithNoBitsSet = count;
}
public void setSetBitCounts() {
long[] counts = new long[BYTE_WIDTH];
for (int i = 0; i < hash.length; i++) {
if (hash[i] > 0) {
int tempBits = bits[i] & 0xff;
int curBitPos = 0;
while (tempBits != 0) {
if ((tempBits & 1) != 0) {
counts[curBitPos]++;
}
tempBits = tempBits >>> 1;
curBitPos++;
}
}
}
setBitCounts = counts;
}
}
public static final int ANTISOCIAL_BIT = 1;
public static final int OFFENSIVE_BIT = 1 << 1;
public static final int NSFW_BIT = 1 << 2;
public static final int IS_PROTECTED_BIT = 1 << 3;
public long getLastRecordTimestamp() {
return this.lastRecordTimestamp;
}
public void setLastRecordTimestamp(long lastRecordTimestamp) {
this.lastRecordTimestamp = lastRecordTimestamp;
}
public void setOffensive(long userID, boolean offensive) {
set(userID, OFFENSIVE_BIT, offensive);
}
public void setAntisocial(long userID, boolean antisocial) {
set(userID, ANTISOCIAL_BIT, antisocial);
}
public void setNSFW(long userID, boolean nsfw) {
set(userID, NSFW_BIT, nsfw);
}
public void setIsProtected(long userID, boolean isProtected) {
set(userID, IS_PROTECTED_BIT, isProtected);
}
/**
* Adds the given user update to this table.
*/
public boolean indexUserUpdate(UserUpdatesChecker checker, UserUpdate userUpdate) {
if (checker.skipUserUpdate(userUpdate)) {
return false;
}
switch (userUpdate.updateType) {
case ANTISOCIAL:
setAntisocial(userUpdate.twitterUserID, userUpdate.updateValue != 0);
break;
case NSFW:
setNSFW(userUpdate.twitterUserID, userUpdate.updateValue != 0);
break;
case OFFENSIVE:
setOffensive(userUpdate.twitterUserID, userUpdate.updateValue != 0);
break;
case PROTECTED:
setIsProtected(userUpdate.twitterUserID, userUpdate.updateValue != 0);
break;
default:
return false;
}
return true;
}
private final AtomicReference<HashTable> hashTable = new AtomicReference<>();
private int hashCode(long userID) {
return (int) GeneralLongHashFunction.hash(userID);
}
/**
* Returns an iterator for user IDs that have at least one of the bits set.
*/
public Iterator<Long> getFlaggedUserIdIterator() {
HashTable table = hashTable.get();
final long[] currUserIdTable = table.hash;
final byte[] currBitsTable = table.bits;
return new Iterator<Long>() {
private int index = findNext(0);
private int findNext(int index) {
int startingIndex = index;
while (startingIndex < currUserIdTable.length) {
if (currUserIdTable[startingIndex] != 0 && currBitsTable[startingIndex] != 0) {
break;
}
++startingIndex;
}
return startingIndex;
}
@Override
public boolean hasNext() {
return index < currUserIdTable.length;
}
@Override
public Long next() {
Long r = currUserIdTable[index];
index = findNext(index + 1);
return r;
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
};
}
/**
* Constructs a UserTable with a given HashTable instance.
* Use <code>userIdFilter</code> as a Predicate that returns true for the elements
* that should be kept in the table.
* Use shouldRehash to force a rehash of the given HashTable.
*/
private UserTable(HashTable hashTable, Predicate<Long> userIdFilter,
boolean shouldRehash) {
Preconditions.checkNotNull(userIdFilter);
this.hashTable.set(hashTable);
this.userIdFilter = userIdFilter;
exportUserUpdatesTableStats();
LOG.info("User table num users: {}. Users with no bits set: {}. "
+ "Antisocial users: {}. Offensive users: {}. Nsfw users: {}. IsProtected users: {}.",
this.getNumUsersInTable(),
this.getNumUsersWithNoBitsSet(),
this.getSetBitCount(ANTISOCIAL_BIT),
this.getSetBitCount(OFFENSIVE_BIT),
this.getSetBitCount(NSFW_BIT),
this.getSetBitCount(IS_PROTECTED_BIT));
if (shouldRehash) {
int filteredTableSize = filterTableAndCountValidItems();
// Having exactly 100% usage can impact lookup. Maintain the table at under 50% usage.
int newTableCapacity = computeDesiredHashTableCapacity(filteredTableSize * 2);
rehash(newTableCapacity);
LOG.info("User table num users after rehash: {}. Users with no bits set: {}. "
+ "Antisocial users: {}. Offensive users: {}. Nsfw users: {}. IsProtected users: {}.",
this.getNumUsersInTable(),
this.getNumUsersWithNoBitsSet(),
this.getSetBitCount(ANTISOCIAL_BIT),
this.getSetBitCount(OFFENSIVE_BIT),
this.getSetBitCount(NSFW_BIT),
this.getSetBitCount(IS_PROTECTED_BIT));
}
}
private UserTable(int initialSize, Predicate<Long> userIdFilter) {
this(new HashTable(computeDesiredHashTableCapacity(initialSize)), userIdFilter, false);
}
@VisibleForTesting
public UserTable(int initialSize) {
this(initialSize, userId -> true);
}
public static UserTable
newTableWithDefaultCapacityAndPredicate(Predicate<Long> userIdFilter) {
return new UserTable(DEFAULT_INITIAL_CAPACITY, userIdFilter);
}
public static UserTable newTableNonFilteredWithDefaultCapacity() {
return newTableWithDefaultCapacityAndPredicate(userId -> true);
}
private void exportUserUpdatesTableStats() {
userTableSize = SearchLongGauge.export(USER_TABLE_SIZE);
userTableCapacity = SearchLongGauge.export(USER_TABLE_CAPACITY);
userTableNumUsersWithNoBitsSet = SearchLongGauge.export(
USER_NUM_USERS_WITH_NO_BITS_SET
);
userTableAntisocialUsers = SearchLongGauge.export(USER_TABLE_ANTISOCIAL_USERS);
userTableOffensiveUsers = SearchLongGauge.export(USER_TABLE_OFFENSIVE_USERS);
userTableNsfwUsers = SearchLongGauge.export(USER_TABLE_NSFW_USERS);
userTableIsProtectedUsers = SearchLongGauge.export(USER_TABLE_IS_PROTECTED_USERS);
LOG.info(
"Exporting stats for user table. Starting with numUsersInTable={}, usersWithZeroBits={}, "
+ "antisocialUsers={}, offensiveUsers={}, nsfwUsers={}, isProtectedUsers={}.",
getNumUsersInTable(),
getNumUsersWithNoBitsSet(),
getSetBitCount(ANTISOCIAL_BIT),
getSetBitCount(OFFENSIVE_BIT),
getSetBitCount(NSFW_BIT),
getSetBitCount(IS_PROTECTED_BIT));
updateStats();
}
private void updateStats() {
HashTable table = this.hashTable.get();
userTableSize.set(table.numUsersInTable);
userTableNumUsersWithNoBitsSet.set(table.numUsersWithNoBitsSet);
userTableCapacity.set(table.hashSize());
userTableAntisocialUsers.set(getSetBitCount(ANTISOCIAL_BIT));
userTableOffensiveUsers.set(getSetBitCount(OFFENSIVE_BIT));
userTableNsfwUsers.set(getSetBitCount(NSFW_BIT));
userTableIsProtectedUsers.set(getSetBitCount(IS_PROTECTED_BIT));
}
/**
* Computes the size of the hashtable as the first power of two greater than or equal to initialSize
*/
private static int computeDesiredHashTableCapacity(int initialSize) {
long powerOfTwoSize = 2;
while (initialSize > powerOfTwoSize) {
powerOfTwoSize *= 2;
}
if (powerOfTwoSize > Integer.MAX_VALUE) {
LOG.error("Error: powerOfTwoSize overflowed Integer.MAX_VALUE! Initial size: " + initialSize);
powerOfTwoSize = 1 << 30; // max power of 2
}
return (int) powerOfTwoSize;
}
public int getNumUsersInTable() {
return hashTable.get().numUsersInTable;
}
/**
* Get the number of users who have the bit set at the `userStateBit` position
*/
public long getSetBitCount(int userStateBit) {
int bit = userStateBit;
int bitPosition = 0;
while (bit != 0 && (bit & 1) == 0) {
bit = bit >>> 1;
bitPosition++;
}
return hashTable.get().setBitCounts[bitPosition];
}
public Predicate<Long> getUserIdFilter() {
return userIdFilter::test;
}
/**
* Updates a user flag in this table.
*/
public final void set(long userID, int bit, boolean value) {
// if userID is filtered return immediately
if (!shouldKeepUser(userID)) {
USER_TABLE_USERS_FILTERED_COUNTER.increment();
return;
}
HashTable table = this.hashTable.get();
int hashPos = findHashPosition(table, userID);
long item = table.hash[hashPos];
byte bits = 0;
int bitsDiff = 0;
if (item != 0) {
byte bitsOriginally = bits = table.bits[hashPos];
if (value) {
bits |= bit;
} else {
// AND'ing with the inverse map clears the desired bit, but
// doesn't change any of the other bits
bits &= ~bit;
}
// Find the changed bits after the above operation, it is possible that no bit is changed if
// the input 'bit' is already set/unset in the table.
// Since bitwise operators cannot be directly applied on Byte, Byte is promoted into int to
// apply the operators. When that happens, if the most significant bit of the Byte is set,
// the promoted int has all significant bits set to 1. 0xff bitmask is applied here to make
// sure only the last 8 bits are considered.
bitsDiff = (bitsOriginally & 0xff) ^ (bits & 0xff);
if (bitsOriginally > 0 && bits == 0) {
table.numUsersWithNoBitsSet++;
} else if (bitsOriginally == 0 && bits > 0) {
table.numUsersWithNoBitsSet--;
}
} else {
if (!value) {
// no need to add this user, since all bits would be false anyway
return;
}
// New user entry.
if (table.numUsersInTable + 1 >= (table.hashSize() >> 1)
&& table.hashSize() != userUpdateTableMaxCapacity) {
if (2L * (long) table.hashSize() < userUpdateTableMaxCapacity) {
rehash(2 * table.hashSize());
table = this.hashTable.get();
} else {
if (table.hashSize() < (int) userUpdateTableMaxCapacity) {
rehash((int) userUpdateTableMaxCapacity);
table = this.hashTable.get();
LOG.warn("User update table size reached Integer.MAX_VALUE, performance will degrade.");
}
}
// Must repeat this operation with the resized hashTable.
hashPos = findHashPosition(table, userID);
}
item = userID;
bits |= bit;
bitsDiff = bit & 0xff;
table.numUsersInTable++;
}
table.hash[hashPos] = item;
table.bits[hashPos] = bits;
// update setBitCounts for the changed bits after applying the input 'bit'
int curBitsDiffPos = 0;
while (bitsDiff != 0) {
if ((bitsDiff & 1) != 0) {
if (value) {
table.setBitCounts[curBitsDiffPos]++;
} else {
table.setBitCounts[curBitsDiffPos]--;
}
}
bitsDiff = bitsDiff >>> 1;
curBitsDiffPos++;
}
updateStats();
}
public final boolean isSet(long userID, int bits) {
HashTable table = hashTable.get();
int hashPos = findHashPosition(table, userID);
return table.hash[hashPos] != 0 && (table.bits[hashPos] & bits) != 0;
}
/**
* Returns true when userIdFilter condition is being met.
* If filter is not present returns true
*/
private boolean shouldKeepUser(long userID) {
return userIdFilter.test(userID);
}
private int findHashPosition(final HashTable table, final long userID) {
int code = hashCode(userID);
int hashPos = code & table.hashMask;
// Locate user in hash
long item = table.hash[hashPos];
if (item != 0 && item != userID) {
// Conflict: keep searching different locations in
// the hash table.
final int inc = ((code >> 8) + code) | 1;
do {
code += inc;
hashPos = code & table.hashMask;
item = table.hash[hashPos];
} while (item != 0 && item != userID);
}
return hashPos;
}
/**
* Applies the filtering predicate and returns the size of the filtered table.
*/
private synchronized int filterTableAndCountValidItems() {
final HashTable oldTable = this.hashTable.get();
int newSize = 0;
int clearNoItemSet = 0;
int clearNoBitsSet = 0;
int clearDontKeepUser = 0;
for (int i = 0; i < oldTable.hashSize(); i++) {
final long item = oldTable.hash[i]; // this is the userID
final byte bits = oldTable.bits[i];
boolean clearSlot = false;
if (item == 0) {
clearSlot = true;
clearNoItemSet++;
} else if (bits == 0) {
clearSlot = true;
clearNoBitsSet++;
} else if (!shouldKeepUser(item)) {
clearSlot = true;
clearDontKeepUser++;
}
if (clearSlot) {
oldTable.hash[i] = 0;
oldTable.bits[i] = 0;
} else {
newSize += 1;
}
}
oldTable.setCountOfNumUsersWithNoBitsSet();
oldTable.setSetBitCounts();
LOG.info("Done filtering table: clearNoItemSet={}, clearNoBitsSet={}, clearDontKeepUser={}",
clearNoItemSet, clearNoBitsSet, clearDontKeepUser);
return newSize;
}
/**
* Called when hash is too small (> 50% occupied)
*/
private void rehash(final int newSize) {
final HashTable oldTable = this.hashTable.get();
final HashTable newTable = new HashTable(newSize);
final int newMask = newTable.hashMask;
final long[] newHash = newTable.hash;
final byte[] newBits = newTable.bits;
for (int i = 0; i < oldTable.hashSize(); i++) {
final long item = oldTable.hash[i];
final byte bits = oldTable.bits[i];
if (item != 0 && bits != 0) {
int code = hashCode(item);
int hashPos = code & newMask;
assert hashPos >= 0;
if (newHash[hashPos] != 0) {
final int inc = ((code >> 8) + code) | 1;
do {
code += inc;
hashPos = code & newMask;
} while (newHash[hashPos] != 0);
}
newHash[hashPos] = item;
newBits[hashPos] = bits;
newTable.numUsersInTable++;
}
}
newTable.setCountOfNumUsersWithNoBitsSet();
newTable.setSetBitCounts();
this.hashTable.set(newTable);
updateStats();
}
public void setTable(UserTable newTable) {
hashTable.set(newTable.hashTable.get());
updateStats();
}
@VisibleForTesting
protected int getHashTableCapacity() {
return hashTable.get().hashSize();
}
@VisibleForTesting
protected int getNumUsersWithNoBitsSet() {
return hashTable.get().numUsersWithNoBitsSet;
}
}
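
As a quick illustration of the bit-flag API above, here is a minimal sketch, assuming the UserTable class above is available. Note that isSet answers whether any of the requested bits is set for a user.

final class UserTableSketch {
  public static void main(String[] args) {
    UserTable table = UserTable.newTableNonFilteredWithDefaultCapacity();

    table.setAntisocial(42L, true);
    table.setNSFW(42L, true);
    table.setNSFW(7L, true);

    // True if ANY of the requested bits is set for the user.
    boolean shouldFilter =
        table.isSet(42L, UserTable.ANTISOCIAL_BIT | UserTable.NSFW_BIT);

    System.out.println(shouldFilter);                              // true
    System.out.println(table.getSetBitCount(UserTable.NSFW_BIT));  // 2
    System.out.println(table.getNumUsersInTable());                // 2
  }
}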

View File

@ -1,263 +0,0 @@
package com.twitter.search.earlybird.common.userupdates;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.concurrent.TimeUnit;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import javax.annotation.Nullable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.common_internal.hadoop.HdfsUtils;
import com.twitter.scalding.DateRange;
import com.twitter.scalding.Hours;
import com.twitter.scalding.RichDate;
import com.twitter.search.user_table.sources.MostRecentGoodSafetyUserStateSource;
import com.twitter.search.common.indexing.thriftjava.SafetyUserState;
import com.twitter.search.common.util.io.LzoThriftBlockFileReader;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
import com.twitter.util.Duration;
import com.twitter.util.Time;
/**
* Builds a user table from a user safety snapshot on HDFS.
*/
public class UserTableBuilderFromSnapshot {
private static final Logger LOG = LoggerFactory.getLogger(UserTableBuilderFromSnapshot.class);
private static final int MAX_DAYS_TO_CHECK = 7;
public static final String DATA_DIR = "user_states";
public static final String METADATA_DIR = "last_updated_ms";
private final String snapshotBaseDir;
private String snapshotDataPath;
private String snapshotMetaDataPath;
private UserTable userTable;
private long nsfwCount;
private long antisocialCount;
private long isProtectedCount;
public UserTableBuilderFromSnapshot() {
snapshotBaseDir =
EarlybirdConfig.getString(EarlybirdConfig.USER_SNAPSHOT_BASE_DIR, null);
LOG.info("Configured user snapshot directory: " + snapshotBaseDir);
}
private static final class UserUpdate {
public final long userId;
@Nullable public final Boolean antisocial;
@Nullable public final Boolean nsfw;
@Nullable public final Boolean isProtected;
private UserUpdate(long userId,
@Nullable Boolean antisocial,
@Nullable Boolean nsfw,
@Nullable Boolean isProtected) {
this.userId = userId;
this.antisocial = antisocial;
this.nsfw = nsfw;
this.isProtected = isProtected;
}
public static UserUpdate fromUserState(SafetyUserState safetyUserState) {
long userId = safetyUserState.getUserID();
@Nullable Boolean antisocial = null;
@Nullable Boolean nsfw = null;
@Nullable Boolean isProtected = null;
if (safetyUserState.isIsAntisocial()) {
antisocial = true;
}
if (safetyUserState.isIsNsfw()) {
nsfw = true;
}
if (safetyUserState.isSetIsProtected() && safetyUserState.isIsProtected()) {
isProtected = true;
}
return new UserUpdate(userId, antisocial, nsfw, isProtected);
}
}
/**
* Builds a user table from an HDFS user snapshot.
* @return The table, or nothing if something went wrong.
*/
public Optional<UserTable> build(Predicate<Long> userFilter) {
userTable = UserTable.newTableWithDefaultCapacityAndPredicate(userFilter);
nsfwCount = 0;
antisocialCount = 0;
isProtectedCount = 0;
if (snapshotBaseDir == null || snapshotBaseDir.isEmpty()) {
LOG.info("No snapshot directory. Can't build user table.");
return Optional.empty();
}
LOG.info("Starting to build user table.");
Stream<UserUpdate> stream = null;
try {
setSnapshotPath();
stream = getUserUpdates();
stream.forEach(this::insertUser);
} catch (IOException e) {
LOG.error("IOException while building table: {}", e.getMessage(), e);
return Optional.empty();
} finally {
if (stream != null) {
stream.close();
}
}
LOG.info("Built user table with {} users, {} nsfw, {} antisocial and {} protected.",
userTable.getNumUsersInTable(),
nsfwCount,
antisocialCount,
isProtectedCount);
try {
userTable.setLastRecordTimestamp(readTimestampOfLastSeenUpdateFromSnapshot());
} catch (IOException e) {
LOG.error("IOException reading timestamp of last update: {}", e.getMessage(), e);
return Optional.empty();
}
LOG.info("Setting last record timestamp to {}.", userTable.getLastRecordTimestamp());
return Optional.of(userTable);
}
private void setSnapshotPath() {
snapshotDataPath =
new MostRecentGoodSafetyUserStateSource(
snapshotBaseDir,
DATA_DIR,
METADATA_DIR,
DateRange.apply(
RichDate.now().$minus(Hours.apply(MAX_DAYS_TO_CHECK * 24)),
RichDate.now())
).partitionHdfsPaths(new HdfsConfiguration())
._1()
.head()
.replaceAll("\\*$", "");
snapshotMetaDataPath = snapshotDataPath.replace(DATA_DIR, METADATA_DIR);
LOG.info("Snapshot data path: {}", snapshotDataPath);
LOG.info("Snapshot metadata path: {}", snapshotMetaDataPath);
}
private Stream<UserUpdate> getUserUpdates() throws IOException {
FileSystem fs = FileSystem.get(new Configuration());
List<String> lzoFiles =
Arrays.stream(fs.listStatus(new Path(snapshotDataPath),
path -> path.getName().startsWith("part-")))
.map(fileStatus -> Path.getPathWithoutSchemeAndAuthority(fileStatus.getPath())
.toString())
.collect(Collectors.toList());
final LzoThriftBlockFileReader<SafetyUserState> thriftReader =
new LzoThriftBlockFileReader<>(lzoFiles, SafetyUserState.class, null);
Iterator<UserUpdate> iter = new Iterator<UserUpdate>() {
private SafetyUserState next;
@Override
public boolean hasNext() {
if (next != null) {
return true;
}
do {
try {
next = thriftReader.readNext();
} catch (IOException e) {
throw new RuntimeException(e);
}
} while (next == null && !thriftReader.isExhausted());
return next != null;
}
@Override
public UserUpdate next() {
if (next != null || hasNext()) {
UserUpdate userUpdate = UserUpdate.fromUserState(next);
next = null;
return userUpdate;
}
throw new NoSuchElementException();
}
};
return StreamSupport
.stream(
Spliterators.spliteratorUnknownSize(iter, Spliterator.ORDERED | Spliterator.NONNULL),
false)
.onClose(thriftReader::stop);
}
private long readTimestampOfLastSeenUpdateFromSnapshot() throws IOException {
String timestampFile = snapshotMetaDataPath + "part-00000";
// try-with-resources so the reader is closed even if parsing fails
try (BufferedReader buffer = new BufferedReader(new InputStreamReader(
HdfsUtils.getInputStreamSupplier(timestampFile).openStream()))) {
long timestampMillis = Long.parseLong(buffer.readLine());
LOG.info("read timestamp {} from HDFS: {}", timestampMillis, timestampFile);
Time time = Time.fromMilliseconds(timestampMillis)
.minus(Duration.fromTimeUnit(10, TimeUnit.MINUTES));
return time.inMilliseconds();
}
}
private void insertUser(UserUpdate userUpdate) {
if (userUpdate == null) {
return;
}
if (userUpdate.antisocial != null) {
userTable.set(
userUpdate.userId,
UserTable.ANTISOCIAL_BIT,
userUpdate.antisocial);
antisocialCount++;
}
if (userUpdate.nsfw != null) {
userTable.set(
userUpdate.userId,
UserTable.NSFW_BIT,
userUpdate.nsfw);
nsfwCount++;
}
if (userUpdate.isProtected != null) {
userTable.set(
userUpdate.userId,
UserTable.IS_PROTECTED_BIT,
userUpdate.isProtected);
isProtectedCount++;
}
}
}
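
A hypothetical invocation sketch; it assumes EarlybirdConfig has been initialized and USER_SNAPSHOT_BASE_DIR points at a readable HDFS snapshot, otherwise build(...) returns Optional.empty(). The even-ID predicate is purely illustrative; in production the filter would typically keep the users belonging to this partition.

import java.util.Optional;

final class SnapshotBuildSketch {
  public static void main(String[] args) {
    UserTableBuilderFromSnapshot builder = new UserTableBuilderFromSnapshot();

    // Illustrative filter: keep only even user IDs.
    Optional<UserTable> table = builder.build(userId -> userId % 2 == 0);

    table.ifPresent(t ->
        System.out.println("Loaded " + t.getNumUsersInTable() + " users"));
  }
}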

View File

@ -1,38 +0,0 @@
package com.twitter.search.earlybird.common.userupdates;
import java.util.Date;
import com.twitter.search.common.indexing.thriftjava.UserUpdateType;
/**
* Contains an update for a user.
*/
public class UserUpdate {
public final long twitterUserID;
public final UserUpdateType updateType;
public final int updateValue;
private final Date updatedAt;
public UserUpdate(long twitterUserID,
UserUpdateType updateType,
int updateValue,
Date updatedAt) {
this.twitterUserID = twitterUserID;
this.updateType = updateType;
this.updateValue = updateValue;
this.updatedAt = (Date) updatedAt.clone();
}
@Override public String toString() {
return "UserInfoUpdate[userID=" + twitterUserID + ",updateType=" + updateType
+ ",updateValue=" + updateValue + ",updatedAt=" + getUpdatedAt() + "]";
}
/**
* Returns a copy of the updated-at date.
*/
public Date getUpdatedAt() {
return (Date) updatedAt.clone();
}
}

View File

@ -1,70 +0,0 @@
package com.twitter.search.earlybird.common.userupdates;
import java.util.Date;
import java.util.concurrent.TimeUnit;
import com.twitter.common.util.Clock;
import com.twitter.decider.Decider;
import com.twitter.search.common.indexing.thriftjava.UserUpdateType;
import com.twitter.search.common.schema.earlybird.EarlybirdCluster;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
/**
* Contains logic for deciding whether to apply a certain user update to the {@link UserTable}.
*/
public class UserUpdatesChecker {
private final Date antisocialStartDate;
private final Decider decider;
private final boolean isFullArchiveCluster;
public UserUpdatesChecker(Clock clock, Decider decider, EarlybirdCluster cluster) {
// How many days of antisocial users to keep. A value of -1 means keeping all user updates.
long antisocialRecordDays =
EarlybirdConfig.getLong("keep_recent_antisocial_user_updates_days", 30);
this.antisocialStartDate = antisocialRecordDays > 0
? new Date(clock.nowMillis() - TimeUnit.DAYS.toMillis(antisocialRecordDays)) : null;
this.decider = decider;
this.isFullArchiveCluster = cluster == EarlybirdCluster.FULL_ARCHIVE;
}
/**
* Decides whether to skip the given UserUpdate.
*/
public boolean skipUserUpdate(UserUpdate userUpdate) {
if (userUpdate == null) { // always skip null updates
return true;
}
UserUpdateType type = userUpdate.updateType;
if (type == UserUpdateType.PROTECTED && skipProtectedUserUpdate()) {
return true;
}
if (type == UserUpdateType.ANTISOCIAL && skipAntisocialUserUpdate(userUpdate)) {
return true;
}
// NSFW users can continue to tweet even after they are marked as NSFW. That means
// that the snapshot needs to have all NSFW users from the beginning of time. Hence, no NSFW
// users updates check here.
// pass all checks, do not skip this user update
return false;
}
// Antisocial/suspended users can't tweet after they are suspended. Thus if our index stores
// tweets from the last 10 days, and they were suspended 60 days ago, we don't need them since
// there will be no tweets from them. We can save space by not storing info about those users.
// (For archive, at rebuild time we filter out all suspended users tweets, so for a user that
// was suspended before a rebuild, no need to use space to store that the user is suspended)
private boolean skipAntisocialUserUpdate(UserUpdate userUpdate) {
return antisocialStartDate != null && userUpdate.getUpdatedAt().before(antisocialStartDate);
}
// skip protected user updates for realtime and protected clusters
private boolean skipProtectedUserUpdate() {
return !isFullArchiveCluster;
}
}
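
A small sketch of the skip logic above, under stated assumptions: EarlybirdConfig is assumed to be initialized, EarlybirdCluster.REALTIME is assumed to exist alongside the FULL_ARCHIVE constant referenced above, and the decider argument is passed as null only because the code shown here never dereferences it.

import java.util.Date;
import com.twitter.common.util.Clock;
import com.twitter.search.common.indexing.thriftjava.UserUpdateType;
import com.twitter.search.common.schema.earlybird.EarlybirdCluster;

final class UserUpdatesCheckerSketch {
  public static void main(String[] args) {
    // On a non-full-archive cluster, PROTECTED updates are skipped.
    UserUpdatesChecker checker = new UserUpdatesChecker(
        Clock.SYSTEM_CLOCK,
        null /* decider unused in the code above */,
        EarlybirdCluster.REALTIME /* assumed enum constant */);

    UserUpdate update = new UserUpdate(
        42L, UserUpdateType.PROTECTED, 1, new Date());

    System.out.println(checker.skipUserUpdate(update)); // true on realtime
  }
}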

View File

@ -1,21 +0,0 @@
java_library(
sources = ["**/*.java"],
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
"3rdparty/jvm/com/google/code/findbugs:jsr305",
"3rdparty/jvm/com/google/guava",
"3rdparty/jvm/com/google/inject:guice",
"3rdparty/jvm/org/apache/thrift:libthrift",
"3rdparty/jvm/org/apache/zookeeper:zookeeper-client",
"3rdparty/jvm/org/slf4j:slf4j-api",
"src/java/com/twitter/common/base",
"src/java/com/twitter/common/util:system-mocks",
"src/java/com/twitter/search/common/config",
"src/java/com/twitter/search/common/metrics",
"src/java/com/twitter/search/common/partitioning/snowflakeparser",
"src/java/com/twitter/search/common/util/date",
"src/java/com/twitter/search/common/util/zookeeper",
"src/java/com/twitter/search/earlybird/common/config",
],
)

View File

@ -1,26 +0,0 @@
package com.twitter.search.earlybird.config;
/**
* An interface for abstracting a tier's serving range.
*/
public interface ServingRange {
/**
* Returns the serving range's lowest tweet ID.
*/
long getServingRangeSinceId();
/**
* Returns the serving range's highest tweet ID.
*/
long getServingRangeMaxId();
/**
* Returns the serving range's earliest time, in seconds since epoch.
*/
long getServingRangeSinceTimeSecondsFromEpoch();
/**
* Returns the serving range's latest time, in seconds since epoch.
*/
long getServingRangeUntilTimeSecondsFromEpoch();
}

View File

@ -1,175 +0,0 @@
package com.twitter.search.earlybird.config;
import java.util.Date;
import java.util.Map;
import java.util.Set;
import javax.annotation.Nullable;
import com.google.common.base.Preconditions;
import com.twitter.common.util.Clock;
import com.twitter.search.common.config.Config;
import com.twitter.search.common.config.ConfigFile;
import com.twitter.search.common.config.ConfigurationException;
import com.twitter.search.common.metrics.SearchLongGauge;
import com.twitter.search.common.util.date.DateUtil;
/**
* This class provides APIs to access the tier configurations for a cluster.
* Each tier has tier name, number of partitions, tier start time and end time.
*/
public final class TierConfig {
private static final org.slf4j.Logger LOG = org.slf4j.LoggerFactory.getLogger(TierConfig.class);
private static final String DEFAULT_CONFIG_DIR = "common/config";
public static final String DEFAULT_TIER_FILE = "earlybird-tiers.yml";
public static final Date DEFAULT_TIER_START_DATE = DateUtil.toDate(2006, 3, 21);
// It's convenient for DEFAULT_TIER_END_DATE to be before ~2100, because then the output of
// FieldTermCounter.getHourValue(DEFAULT_TIER_END_DATE) can still fit into an integer.
public static final Date DEFAULT_TIER_END_DATE = DateUtil.toDate(2099, 1, 1);
public static final String DEFAULT_TIER_NAME = "all";
public static final boolean DEFAULT_ENABLED = true;
public static final TierInfo.RequestReadType DEFAULT_READ_TYPE = TierInfo.RequestReadType.LIGHT;
private static ConfigFile tierConfigFile = null;
private static ConfigSource tierConfigSource = null;
public enum ConfigSource {
LOCAL,
ZOOKEEPER
}
private TierConfig() { }
private static synchronized void init() {
if (tierConfigFile == null) {
tierConfigFile = new ConfigFile(DEFAULT_CONFIG_DIR, DEFAULT_TIER_FILE);
tierConfigSource = ConfigSource.LOCAL;
SearchLongGauge.export("tier_config_source_" + tierConfigSource.name()).set(1);
LOG.info("Tier config file " + DEFAULT_TIER_FILE + " is successfully loaded from bundle.");
}
}
public static ConfigFile getConfigFile() {
init();
return tierConfigFile;
}
public static String getConfigFileName() {
return getConfigFile().getConfigFileName();
}
/**
* Return all the tier names specified in the config file.
*/
public static Set<String> getTierNames() {
return Config.getConfig().getMapCopy(getConfigFileName()).keySet();
}
/**
* Sets the value of the given tier config property to the given value.
*/
public static void setForTests(String property, Object value) {
Config.getConfig().setForTests(DEFAULT_TIER_FILE, property, value);
}
/**
* Returns the config info for the specified tier.
*/
public static TierInfo getTierInfo(String tierName) {
return getTierInfo(tierName, null /* use current environment */);
}
/**
* Returns the config info for the specified tier and environment.
*/
public static TierInfo getTierInfo(String tierName, @Nullable String environment) {
String tierConfigFileType = getConfigFileName();
Map<String, Object> tierInfo;
try {
tierInfo = (Map<String, Object>) Config.getConfig()
.getFromEnvironment(environment, tierConfigFileType, tierName);
} catch (ConfigurationException e) {
throw new RuntimeException(e);
}
if (tierInfo == null) {
LOG.error("Cannot find tier config for "
+ tierName + "in config file: " + tierConfigFileType);
throw new RuntimeException("Configuration error: " + tierConfigFileType);
}
Long partitions = (Long) tierInfo.get("number_of_partitions");
if (partitions == null) {
LOG.error("No number of partition is specified for tier "
+ tierName + " in tier config file " + tierConfigFileType);
throw new RuntimeException("Configuration error: " + tierConfigFileType);
}
Long numTimeslices = (Long) tierInfo.get("serving_timeslices");
if (numTimeslices == null) {
LOG.info("No max timeslices is specified for tier "
+ tierName + " in tier config file " + tierConfigFileType
+ ", not setting a cap on number of serving timeslices");
// NOTE: we use max int32 here because it will ultimately be cast to an int, but the config
// map expects Longs for all integral types. Using Long.MAX_VALUE leads to max serving
// timeslices being set to -1 when it is truncated to an int.
numTimeslices = (long) Integer.MAX_VALUE;
}
Date tierStartDate = (Date) tierInfo.get("data_range_start_date_inclusive");
if (tierStartDate == null) {
tierStartDate = DEFAULT_TIER_START_DATE;
}
Date tierEndDate = (Date) tierInfo.get("data_range_end_date_exclusive");
if (tierEndDate == null) {
tierEndDate = DEFAULT_TIER_END_DATE;
}
Boolean tierEnabled = (Boolean) tierInfo.get("tier_enabled");
if (tierEnabled == null) {
tierEnabled = DEFAULT_ENABLED;
}
TierInfo.RequestReadType readType =
getRequestReadType((String) tierInfo.get("tier_read_type"), DEFAULT_READ_TYPE);
TierInfo.RequestReadType readTypeOverride =
getRequestReadType((String) tierInfo.get("tier_read_type_override"), readType);
return new TierInfo(
tierName,
tierStartDate,
tierEndDate,
partitions.intValue(),
numTimeslices.intValue(),
tierEnabled,
(String) tierInfo.get("serving_range_since_id_exclusive"),
(String) tierInfo.get("serving_range_max_id_inclusive"),
(Date) tierInfo.get("serving_range_start_date_inclusive_override"),
(Date) tierInfo.get("serving_range_end_date_exclusive_override"),
readType,
readTypeOverride,
Clock.SYSTEM_CLOCK);
}
public static synchronized void clear() {
tierConfigFile = null;
tierConfigSource = null;
}
protected static synchronized ConfigSource getTierConfigSource() {
return tierConfigSource;
}
private static TierInfo.RequestReadType getRequestReadType(
String readTypeEnumName, TierInfo.RequestReadType defaultReadType) {
TierInfo.RequestReadType readType = defaultReadType;
if (readTypeEnumName != null) {
readType = TierInfo.RequestReadType.valueOf(readTypeEnumName.trim().toUpperCase());
Preconditions.checkState(readType != null);
}
return readType;
}
}
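
A hypothetical read-through of the config API above; it assumes earlybird-tiers.yml is present under common/config in the bundle, since TierConfig lazily loads it on first access.

final class TierConfigSketch {
  public static void main(String[] args) {
    for (String tierName : TierConfig.getTierNames()) {
      TierInfo info = TierConfig.getTierInfo(tierName);
      System.out.println(tierName
          + ": partitions=" + info.getNumPartitions()
          + ", enabled=" + info.isEnabled()
          + ", readType=" + info.getReadType());
    }
  }
}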

View File

@ -1,180 +0,0 @@
package com.twitter.search.earlybird.config;
import java.util.Date;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.twitter.common.util.Clock;
/**
* Properties of a single tier.
*/
public class TierInfo implements ServingRange {
// Historically, this has been used when adding a new tier: first the tier is added and receives
// dark traffic, then possibly grey, and finally it is launched by turning on light traffic.
public static enum RequestReadType {
// Light read: send request, wait for results, and results are returned
LIGHT,
// Dark read: send request, do not wait for results, and results are discarded
DARK,
// Grey read: send request, wait for results, but discard after results come back.
// Same results as dark read; similar latency as light read.
GREY,
}
private final String tierName;
private final Date dataStartDate;
private final Date dataEndDate;
private final int numPartitions;
private final int maxTimeslices;
private final TierServingBoundaryEndPoint servingRangeSince;
private final TierServingBoundaryEndPoint servingRangeMax;
private final TierServingBoundaryEndPoint servingRangeSinceOverride;
private final TierServingBoundaryEndPoint servingRangeMaxOverride;
// These two properties are only used by clients of Earlybird (E.g. roots),
// but not by Earlybirds.
private final boolean enabled;
private final RequestReadType readType;
private final RequestReadType readTypeOverride;
public TierInfo(String tierName,
Date dataStartDate,
Date dataEndDate,
int numPartitions,
int maxTimeslices,
boolean enabled,
String sinceIdString,
String maxIdString,
Date servingStartDateOverride,
Date servingEndDateOverride,
RequestReadType readType,
RequestReadType readTypeOverride,
Clock clock) {
Preconditions.checkArgument(numPartitions > 0);
Preconditions.checkArgument(maxTimeslices > 0);
this.tierName = tierName;
this.dataStartDate = dataStartDate;
this.dataEndDate = dataEndDate;
this.numPartitions = numPartitions;
this.maxTimeslices = maxTimeslices;
this.enabled = enabled;
this.readType = readType;
this.readTypeOverride = readTypeOverride;
this.servingRangeSince = TierServingBoundaryEndPoint
.newTierServingBoundaryEndPoint(sinceIdString, dataStartDate, clock);
this.servingRangeMax = TierServingBoundaryEndPoint
.newTierServingBoundaryEndPoint(maxIdString, dataEndDate, clock);
if (servingStartDateOverride != null) {
this.servingRangeSinceOverride = TierServingBoundaryEndPoint.newTierServingBoundaryEndPoint(
TierServingBoundaryEndPoint.INFERRED_FROM_DATA_RANGE, servingStartDateOverride, clock);
} else {
this.servingRangeSinceOverride = servingRangeSince;
}
if (servingEndDateOverride != null) {
this.servingRangeMaxOverride = TierServingBoundaryEndPoint.newTierServingBoundaryEndPoint(
TierServingBoundaryEndPoint.INFERRED_FROM_DATA_RANGE, servingEndDateOverride, clock);
} else {
this.servingRangeMaxOverride = servingRangeMax;
}
}
@VisibleForTesting
public TierInfo(String tierName,
Date dataStartDate,
Date dataEndDate,
int numPartitions,
int maxTimeslices,
boolean enabled,
String sinceIdString,
String maxIdString,
RequestReadType readType,
Clock clock) {
// No overrides:
// servingRangeSinceOverride == servingRangeSince
// servingRangeMaxOverride == servingRangeMax
// readTypeOverride == readType
this(tierName, dataStartDate, dataEndDate, numPartitions, maxTimeslices, enabled, sinceIdString,
maxIdString, null, null, readType, readType, clock);
}
@Override
public String toString() {
return tierName;
}
public String getTierName() {
return tierName;
}
public Date getDataStartDate() {
return dataStartDate;
}
public Date getDataEndDate() {
return dataEndDate;
}
public int getNumPartitions() {
return numPartitions;
}
public int getMaxTimeslices() {
return maxTimeslices;
}
public TierConfig.ConfigSource getSource() {
return TierConfig.getTierConfigSource();
}
public boolean isEnabled() {
return enabled;
}
public boolean isDarkRead() {
return readType == RequestReadType.DARK;
}
public RequestReadType getReadType() {
return readType;
}
public RequestReadType getReadTypeOverride() {
return readTypeOverride;
}
public long getServingRangeSinceId() {
return servingRangeSince.getBoundaryTweetId();
}
public long getServingRangeMaxId() {
return servingRangeMax.getBoundaryTweetId();
}
long getServingRangeOverrideSinceId() {
return servingRangeSinceOverride.getBoundaryTweetId();
}
long getServingRangeOverrideMaxId() {
return servingRangeMaxOverride.getBoundaryTweetId();
}
public long getServingRangeSinceTimeSecondsFromEpoch() {
return servingRangeSince.getBoundaryTimeSecondsFromEpoch();
}
public long getServingRangeUntilTimeSecondsFromEpoch() {
return servingRangeMax.getBoundaryTimeSecondsFromEpoch();
}
long getServingRangeOverrideSinceTimeSecondsFromEpoch() {
return servingRangeSinceOverride.getBoundaryTimeSecondsFromEpoch();
}
long getServingRangeOverrideUntilTimeSecondsFromEpoch() {
return servingRangeMaxOverride.getBoundaryTimeSecondsFromEpoch();
}
}
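
To make the inferred serving range concrete, here is a minimal sketch using the @VisibleForTesting constructor above. The dates are arbitrary snowflake-era values chosen for illustration; passing null boundary strings triggers the inferred_from_data_range path.

import java.util.Date;
import com.twitter.common.util.Clock;

final class TierInfoSketch {
  public static void main(String[] args) {
    Date start = new Date(1357776000000L); // 2013-01-10T00:00:00Z, arbitrary
    Date end = new Date(1360454400000L);   // 2013-02-10T00:00:00Z, arbitrary

    TierInfo tier = new TierInfo("tier_example", start, end,
        12 /* numPartitions */, 4 /* maxTimeslices */, true /* enabled */,
        null /* infer since_id from start date */,
        null /* infer max_id from end date */,
        TierInfo.RequestReadType.LIGHT, Clock.SYSTEM_CLOCK);

    // Both IDs are derived from the dates via SnowflakeIdParser, minus one.
    System.out.println(tier.getServingRangeSinceId());
    System.out.println(tier.getServingRangeMaxId());
  }
}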

View File

@ -1,39 +0,0 @@
package com.twitter.search.earlybird.config;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import javax.inject.Inject;
import com.twitter.search.common.util.zookeeper.ZooKeeperProxy;
public class TierInfoSource {
private final ZooKeeperProxy zkClient;
@Inject
public TierInfoSource(ZooKeeperProxy sZooKeeperClient) {
this.zkClient = sZooKeeperClient;
}
public List<TierInfo> getTierInformation() {
return getTierInfoWithPrefix("tier");
}
public String getConfigFileType() {
return TierConfig.getConfigFileName();
}
private List<TierInfo> getTierInfoWithPrefix(String tierPrefix) {
Set<String> tierNames = TierConfig.getTierNames();
List<TierInfo> tierInfos = new ArrayList<>();
for (String name : tierNames) {
if (name.startsWith(tierPrefix)) {
TierInfo tierInfo = TierConfig.getTierInfo(name);
tierInfos.add(tierInfo);
}
}
return tierInfos;
}
}

View File

@ -1,78 +0,0 @@
package com.twitter.search.earlybird.config;
import java.util.Comparator;
import java.util.SortedSet;
import com.google.common.base.Preconditions;
public final class TierInfoUtil {
public static final Comparator<TierInfo> TIER_COMPARATOR = (t1, t2) -> {
// Reverse sort order based on date.
return t2.getDataStartDate().compareTo(t1.getDataStartDate());
};
private TierInfoUtil() {
}
/**
* Checks that the serving ranges and the override serving ranges of the given tiers do not
* overlap, and do not have gaps. Dark reads tiers are ignored.
*/
public static void checkTierServingRanges(SortedSet<TierInfo> tierInfos) {
boolean tierServingRangesOverlap = false;
boolean tierOverrideServingRangesOverlap = false;
boolean tierServingRangesHaveGaps = false;
boolean tierOverrideServingRangesHaveGaps = false;
TierInfoWrapper previousTierInfoWrapper = null;
TierInfoWrapper previousOverrideTierInfoWrapper = null;
for (TierInfo tierInfo : tierInfos) {
TierInfoWrapper tierInfoWrapper = new TierInfoWrapper(tierInfo, false);
TierInfoWrapper overrideTierInfoWrapper = new TierInfoWrapper(tierInfo, true);
// Check only the tiers to which we send light reads.
if (!tierInfoWrapper.isDarkRead()) {
if (previousTierInfoWrapper != null) {
if (TierInfoWrapper.servingRangesOverlap(previousTierInfoWrapper, tierInfoWrapper)) {
// In case of rebalancing, we may have overlapping data ranges while
// overriding with a good serving range.
if (previousOverrideTierInfoWrapper == null
|| TierInfoWrapper.servingRangesOverlap(
previousOverrideTierInfoWrapper, overrideTierInfoWrapper)) {
tierServingRangesOverlap = true;
}
}
if (TierInfoWrapper.servingRangesHaveGap(previousTierInfoWrapper, tierInfoWrapper)) {
tierServingRangesHaveGaps = true;
}
}
previousTierInfoWrapper = tierInfoWrapper;
}
if (!overrideTierInfoWrapper.isDarkRead()) {
if (previousOverrideTierInfoWrapper != null) {
if (TierInfoWrapper.servingRangesOverlap(previousOverrideTierInfoWrapper,
overrideTierInfoWrapper)) {
tierOverrideServingRangesOverlap = true;
}
if (TierInfoWrapper.servingRangesHaveGap(previousOverrideTierInfoWrapper,
overrideTierInfoWrapper)) {
tierOverrideServingRangesHaveGaps = true;
}
}
previousOverrideTierInfoWrapper = overrideTierInfoWrapper;
}
}
Preconditions.checkState(!tierServingRangesOverlap,
"Serving ranges of light reads tiers must not overlap.");
Preconditions.checkState(!tierServingRangesHaveGaps,
"Serving ranges of light reads tiers must not have gaps.");
Preconditions.checkState(!tierOverrideServingRangesOverlap,
"Override serving ranges of light reads tiers must not overlap.");
Preconditions.checkState(!tierOverrideServingRangesHaveGaps,
"Override serving ranges of light reads tiers must not have gaps.");
}
}
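
The contiguity rule above can be demonstrated with two tiers that share a boundary date: both map that date to the same adjusted tweet ID, so the strict inequalities report neither an overlap nor a gap. A hypothetical sketch with arbitrary snowflake-era dates:

import java.util.Date;
import java.util.SortedSet;
import java.util.TreeSet;
import com.twitter.common.util.Clock;

final class ServingRangeCheckSketch {
  public static void main(String[] args) {
    Date jan = new Date(1357776000000L); // 2013-01-10, arbitrary
    Date feb = new Date(1360454400000L); // 2013-02-10, arbitrary
    Date mar = new Date(1362873600000L); // 2013-03-10, arbitrary

    // Contiguous tiers: the newer tier starts where the older one ends.
    TierInfo older = new TierInfo("tier_a", jan, feb, 1, 1, true,
        null, null, TierInfo.RequestReadType.LIGHT, Clock.SYSTEM_CLOCK);
    TierInfo newer = new TierInfo("tier_b", feb, mar, 1, 1, true,
        null, null, TierInfo.RequestReadType.LIGHT, Clock.SYSTEM_CLOCK);

    SortedSet<TierInfo> tiers = new TreeSet<>(TierInfoUtil.TIER_COMPARATOR);
    tiers.add(older);
    tiers.add(newer);

    // Throws IllegalStateException on overlap or gap; passes here.
    TierInfoUtil.checkTierServingRanges(tiers);
    System.out.println("serving ranges are contiguous");
  }
}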

View File

@ -1,89 +0,0 @@
package com.twitter.search.earlybird.config;
import java.util.Date;
import com.google.common.base.Preconditions;
/**
* A simple wrapper around TierInfo that returns the "real" or the "overridden" values from the given
* {@code TierInfo} instance, based on the given {@code useOverrideTierConfig} flag.
*/
public class TierInfoWrapper implements ServingRange {
private final TierInfo tierInfo;
private final boolean useOverrideTierConfig;
public TierInfoWrapper(TierInfo tierInfo, boolean useOverrideTierConfig) {
this.tierInfo = Preconditions.checkNotNull(tierInfo);
this.useOverrideTierConfig = useOverrideTierConfig;
}
public String getTierName() {
return tierInfo.getTierName();
}
public Date getDataStartDate() {
return tierInfo.getDataStartDate();
}
public Date getDataEndDate() {
return tierInfo.getDataEndDate();
}
public int getNumPartitions() {
return tierInfo.getNumPartitions();
}
public int getMaxTimeslices() {
return tierInfo.getMaxTimeslices();
}
public TierConfig.ConfigSource getSource() {
return tierInfo.getSource();
}
public boolean isEnabled() {
return tierInfo.isEnabled();
}
public boolean isDarkRead() {
return getReadType() == TierInfo.RequestReadType.DARK;
}
public TierInfo.RequestReadType getReadType() {
return useOverrideTierConfig ? tierInfo.getReadTypeOverride() : tierInfo.getReadType();
}
public long getServingRangeSinceId() {
return useOverrideTierConfig
? tierInfo.getServingRangeOverrideSinceId()
: tierInfo.getServingRangeSinceId();
}
public long getServingRangeMaxId() {
return useOverrideTierConfig
? tierInfo.getServingRangeOverrideMaxId()
: tierInfo.getServingRangeMaxId();
}
public long getServingRangeSinceTimeSecondsFromEpoch() {
return useOverrideTierConfig
? tierInfo.getServingRangeOverrideSinceTimeSecondsFromEpoch()
: tierInfo.getServingRangeSinceTimeSecondsFromEpoch();
}
public long getServingRangeUntilTimeSecondsFromEpoch() {
return useOverrideTierConfig
? tierInfo.getServingRangeOverrideUntilTimeSecondsFromEpoch()
: tierInfo.getServingRangeUntilTimeSecondsFromEpoch();
}
public static boolean servingRangesOverlap(TierInfoWrapper tier1, TierInfoWrapper tier2) {
return (tier1.getServingRangeMaxId() > tier2.getServingRangeSinceId())
&& (tier2.getServingRangeMaxId() > tier1.getServingRangeSinceId());
}
public static boolean servingRangesHaveGap(TierInfoWrapper tier1, TierInfoWrapper tier2) {
return (tier1.getServingRangeMaxId() < tier2.getServingRangeSinceId())
|| (tier2.getServingRangeMaxId() < tier1.getServingRangeSinceId());
}
}

View File

@ -1,146 +0,0 @@
package com.twitter.search.earlybird.config;
import java.util.Date;
import javax.annotation.Nullable;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.twitter.common.util.Clock;
import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser;
/**
* The start or end boundary of a tier's serving range.
* This is used to add since_id and max_id operators onto search queries.
*/
public class TierServingBoundaryEndPoint {
@VisibleForTesting
public static final String INFERRED_FROM_DATA_RANGE = "inferred_from_data_range";
public static final String RELATIVE_TO_CURRENT_TIME_MS = "relative_to_current_time_ms";
// Either offsetToCurrentTimeMillis is set or (absoluteTweetId and timeBoundarySecondsFromEpoch)
// are set.
@Nullable
private final Long offsetToCurrentTimeMillis;
@Nullable
private final Long absoluteTweetId;
@Nullable
private final Long timeBoundarySecondsFromEpoch;
private final Clock clock;
TierServingBoundaryEndPoint(Long absoluteTweetId,
Long timeBoundarySecondsFromEpoch,
Long offsetToCurrentTimeMillis,
Clock clock) {
this.offsetToCurrentTimeMillis = offsetToCurrentTimeMillis;
this.absoluteTweetId = absoluteTweetId;
this.timeBoundarySecondsFromEpoch = timeBoundarySecondsFromEpoch;
this.clock = clock;
}
/**
* Parse the boundary string and construct a TierServingBoundaryEndPoint instance.
* @param boundaryString boundary configuration string. Valid values are:
* <li>
* "inferred_from_data_range" infers serving range from data range. This only works after
* Nov 2010 when Twitter switched to snowflake IDs.
* This is the default value.
* </li>
* <li>
* "absolute_tweet_id_and_timestamp_millis:id:timestamp" a tweet ID/timestamp is given
* explicitly as the serving range
* boundary.
* </li>
* <li>
* "relative_to_current_time_ms:offset" adds offset onto current timestamp in millis to
* compute serving range.
* </li>
*
* @param boundaryDate the data boundary. This is used in conjunction with
* inferred_from_data_range to determine the serving boundary.
* @param clock Clock used to obtain current time, when relative_to_current_time_ms is used.
* Tests pass in a FakeClock.
*/
public static TierServingBoundaryEndPoint newTierServingBoundaryEndPoint(String boundaryString,
Date boundaryDate,
Clock clock) {
if (boundaryString == null || boundaryString.trim().equals(
INFERRED_FROM_DATA_RANGE)) {
return inferBoundaryFromDataRange(boundaryDate, clock);
} else if (boundaryString.trim().startsWith(RELATIVE_TO_CURRENT_TIME_MS)) {
return getRelativeBoundary(boundaryString, clock);
} else {
throw new IllegalStateException("Cannot parse serving range string: " + boundaryString);
}
}
private static TierServingBoundaryEndPoint inferBoundaryFromDataRange(Date boundaryDate,
Clock clock) {
// infer from data range
// handle default start date and end date, in case the dates are not specified in the config
if (boundaryDate.equals(TierConfig.DEFAULT_TIER_START_DATE)) {
return new TierServingBoundaryEndPoint(
-1L, TierConfig.DEFAULT_TIER_START_DATE.getTime() / 1000, null, clock);
} else if (boundaryDate.equals(TierConfig.DEFAULT_TIER_END_DATE)) {
return new TierServingBoundaryEndPoint(
Long.MAX_VALUE, TierConfig.DEFAULT_TIER_END_DATE.getTime() / 1000, null, clock);
} else {
// convert data start / end dates into since / max ID.
long boundaryTimeMillis = boundaryDate.getTime();
if (!SnowflakeIdParser.isUsableSnowflakeTimestamp(boundaryTimeMillis)) {
throw new IllegalStateException("Serving time range can not be determined, because "
+ boundaryDate + " is before Twitter switched to snowflake tweet IDs.");
}
// Earlybird since_id is exclusive and max_id is inclusive (see the
// serving_range_since_id_exclusive / serving_range_max_id_inclusive config keys).
// Consider example:
// full0: 5000 (inclusive) - 6000 (exclusive)
// full1: 6000 (inclusive) - 7000 (exclusive)
// For tier full0, we should use max_id 5999 instead of 6000.
// For tier full1, we should use since_id 5999 instead of 6000.
// Hence we subtract 1 here.
long adjustedTweetId =
SnowflakeIdParser.generateValidStatusId(boundaryTimeMillis, 0) - 1;
Preconditions.checkState(adjustedTweetId >= 0, "boundary tweet ID must be non-negative");
return new TierServingBoundaryEndPoint(
adjustedTweetId, boundaryTimeMillis / 1000, null, clock);
}
}
private static TierServingBoundaryEndPoint getRelativeBoundary(String boundaryString,
Clock clock) {
// An offset relative to current time is given
String[] parts = boundaryString.split(":");
Preconditions.checkState(parts.length == 2);
long offset = Long.parseLong(parts[1]);
return new TierServingBoundaryEndPoint(null, null, offset, clock);
}
/**
* Returns the tweet ID for this tier boundary. If the tier boundary was created using a tweet ID,
* that tweet ID is returned. Otherwise, a tweet ID is derived from the time boundary.
*/
@VisibleForTesting
public long getBoundaryTweetId() {
// If absoluteTweetId is available, use it.
if (absoluteTweetId != null) {
return absoluteTweetId;
} else {
Preconditions.checkNotNull(offsetToCurrentTimeMillis);
long boundaryTime = clock.nowMillis() + offsetToCurrentTimeMillis;
return SnowflakeIdParser.generateValidStatusId(boundaryTime, 0);
}
}
/**
* Returns the time boundary for this tier boundary, in seconds since epoch.
*/
public long getBoundaryTimeSecondsFromEpoch() {
if (timeBoundarySecondsFromEpoch != null) {
return timeBoundarySecondsFromEpoch;
} else {
Preconditions.checkNotNull(offsetToCurrentTimeMillis);
return (clock.nowMillis() + offsetToCurrentTimeMillis) / 1000;
}
}
}
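
Finally, a minimal sketch of the relative boundary form, assuming the class above is available: the offset is parsed after the colon and applied to the clock at call time, so the derived tweet ID advances as time passes. The values are hypothetical throughout.

import java.util.Date;
import com.twitter.common.util.Clock;

final class BoundarySketch {
  public static void main(String[] args) {
    // A boundary trailing "now" by one hour (offset of -3,600,000 ms).
    TierServingBoundaryEndPoint boundary =
        TierServingBoundaryEndPoint.newTierServingBoundaryEndPoint(
            "relative_to_current_time_ms:-3600000",
            new Date() /* ignored for relative boundaries */,
            Clock.SYSTEM_CLOCK);

    System.out.println(boundary.getBoundaryTweetId());
    System.out.println(boundary.getBoundaryTimeSecondsFromEpoch());
  }
}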

Some files were not shown because too many files have changed in this diff.