the-algorithm/src/java/com/twitter/search/earlybird_root/mergers/StrictRecencyResponseMerger.java
2023-03-31 22:16:43 -04:00

298 lines
13 KiB
Java

package com.twitter.search.earlybird_root.mergers;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.TimeUnit;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.twitter.search.common.metrics.SearchTimerStats;
import com.twitter.search.common.schema.earlybird.EarlybirdCluster;
import com.twitter.search.earlybird.thrift.EarlybirdResponse;
import com.twitter.search.earlybird.thrift.ThriftSearchResult;
import com.twitter.search.earlybird.thrift.ThriftSearchResults;
import com.twitter.search.earlybird_root.common.EarlybirdFeatureSchemaMerger;
import com.twitter.search.earlybird_root.common.EarlybirdRequestContext;
import com.twitter.util.Future;
/**
* A RecencyResponseMerger that prioritizes not losing results during pagination.
* As of now, this merger is used by Gnip to make sure that scrolling returns all results.
*
* The logic used for merging partitions is a bit tricky, because on one hand, we want to make sure
* that we do miss results on the next pagination request; on the other hand, we want to return as
* many results as we can, and we want to set the minSearchedStatusID of the merged response as low
* as we can, in order to minimize the number of pagination requests.
*
* The merging logic is:
*
* Realtime cluster:
* 1. merge results from all partitions
* 2. if at least one partition response is early-terminated, set earlyTerminated = true
* on the merged response
* 3. set trimmingMinId = max(minSearchedStatusIDs of all partition responses)
* 4. trim all results to trimmingMinId
* 5. set minSearchedStatusID on the merged response to trimmingMinId
* 6. if we have more than numRequested results:
* - keep only the newest numRequested results
* - set minSearchedStatusID of the merged response to the lowest tweet ID in the response
* 7. if at least one partition response is not early-terminated, set
* tierBottomId = max(minSearchedStatusIDs of all non-early-terminated responses)
* (otherwise, set tierBottomId to some undefined value: -1, Long.MAX_VALUE, etc.)
* 8. if minSearchedStatusID of the merged response is the same as tierBottomId,
* clear the early-termination flag on the merged response
*
* The logic in steps 7 and 8 can be a little tricky to understand. They basically say: when we've
* exhausted the "least deep" partition in the realtime cluster, it's time to move to the full
* archive cluster (if we keep going past the "least deep" partition, we might miss results).
*
* Full archive cluster:
* 1. merge results from all partitions
* 2. if at least one partition response is early-terminated, set earlyTerminated = true
* on the merged response
* 3. set trimmingMinId to:
* - max(minSearchedStatusIDs of early-terminated responses), if at least one partition response
* is early-terminated
* - min(minSearchedStatusIDs of all responses), if all partition responses are not
* early-terminated
* 4. trim all results to trimmingMinId
* 5. set minSearchedStatusID of the merged response to trimmingMinId
* 6. if we have more than numRequested results:
* - keep only the newest numRequested results
* - set minSearchedStatusID of the merged response to the lowest tweet ID in the response
*
* The logic in step 3 can be a little tricky to understand. On one hand, if we always set
* trimmingMinId to the highest minSearchedStatusID, then some tweets at the very bottom of some
* partitions will never be returned. Consider the case:
*
* partition 1 has tweets 10, 8, 6
* partition 2 has tweets 9, 7, 5
*
* In this case, we would always trim all results to minId = 6, and tweet 5 would never be returned.
*
* On the other hand, if we always set trimmingMinId to the lowest minSearchedStatusID, then we
* might miss tweets from partitions that early-terminated. Consider the case:
*
* partition 1 has tweets 10, 5, 3, 1 that match our query
* partition 2 has tweets 9, 8, 7, 6, 2 that match our query
*
* If we ask for 3 results, than partition 1 will return tweets 10, 5, 3, and partition 2 will
* return tweets 9, 8, 7. If we set trimmingMinId = min(minSearchedStatusIDs), then the next
* pagination request will have [max_id = 2], and we will miss tweet 6.
*
* So the intuition here is that if we have an early-terminated response, we cannot set
* trimmingMinId to something lower than the minSearchedStatusID returned by that partition
* (otherwise we might miss results from that partition). However, if we've exhausted all
* partitions, then it's OK to not trim any result, because tiers do not intersect, so we will not
* miss any result from the next tier once we get there.
*/
public class StrictRecencyResponseMerger extends RecencyResponseMerger {
private static final SearchTimerStats STRICT_RECENCY_TIMER_AVG =
SearchTimerStats.export("merge_recency_strict", TimeUnit.NANOSECONDS, false, true);
@VisibleForTesting
static final EarlyTerminationTrimmingStats PARTITION_MERGING_EARLY_TERMINATION_TRIMMING_STATS =
new EarlyTerminationTrimmingStats("strict_recency_partition_merging");
@VisibleForTesting
static final EarlyTerminationTrimmingStats TIER_MERGING_EARLY_TERMINATION_TRIMMING_STATS =
new EarlyTerminationTrimmingStats("strict_recency_tier_merging");
private final EarlybirdCluster cluster;
public StrictRecencyResponseMerger(EarlybirdRequestContext requestContext,
List<Future<EarlybirdResponse>> responses,
ResponseAccumulator mode,
EarlybirdFeatureSchemaMerger featureSchemaMerger,
EarlybirdCluster cluster) {
super(requestContext, responses, mode, featureSchemaMerger);
this.cluster = cluster;
}
@Override
protected SearchTimerStats getMergedResponseTimer() {
return STRICT_RECENCY_TIMER_AVG;
}
/**
* Unlike {@link com.twitter.search.earlybird_root.mergers.RecencyResponseMerger}, this method
* takes a much simpler approach by just taking the max of the maxSearchedStatusIds.
*
* Also, when no maxSearchedStatusId is available at all, Long.MIN_VALUE is used instead of
* Long.MAX_VALUE. This ensures that we don't return any result in these cases.
*/
@Override
protected long findMaxFullySearchedStatusID() {
return accumulatedResponses.getMaxIds().isEmpty()
? Long.MIN_VALUE : Collections.max(accumulatedResponses.getMaxIds());
}
/**
* This method is subtly different from the base class version: when no minSearchedStatusId is
* available at all, Long.MAX_VALUE is used instead of Long.MIN_VALUE. This ensures that we
* don't return any result in these cases.
*/
@Override
protected long findMinFullySearchedStatusID() {
List<Long> minIds = accumulatedResponses.getMinIds();
if (minIds.isEmpty()) {
return Long.MAX_VALUE;
}
if (accumulatedResponses.isMergingPartitionsWithinATier()) {
return getTrimmingMinId();
}
// When merging tiers, the min ID should be the smallest among the min IDs.
return Collections.min(minIds);
}
@Override
protected TrimStats trimResults(
ThriftSearchResults searchResults, long mergedMin, long mergedMax) {
if (!searchResults.isSetResults() || searchResults.getResultsSize() == 0) {
// no results, no trimming needed
return TrimStats.EMPTY_STATS;
}
TrimStats trimStats = new TrimStats();
trimExactDups(searchResults, trimStats);
filterResultsByMergedMinMaxIds(searchResults, mergedMax, mergedMin, trimStats);
int numResults = computeNumResultsToKeep();
if (searchResults.getResultsSize() > numResults) {
trimStats.setResultsTruncatedFromTailCount(searchResults.getResultsSize() - numResults);
searchResults.setResults(searchResults.getResults().subList(0, numResults));
}
return trimStats;
}
/**
* This method is different from the base class version because when minResultId is bigger
* than currentMergedMin, we always take minResultId.
* If we don't do this, we would lose results.
*
* Illustration with an example. Assuming we are outside of the lag threshold.
* Num results requested: 3
* Response 1: min: 100 max: 900 results: 400, 500, 600
* Response 2: min: 300 max: 700 results: 350, 450, 550
*
* Merged results: 600, 550, 500
* Merged max: 900
* Merged min: we could take 300 (minId), or take 500 (minResultId).
*
* If we take minId, and use 300 as the pagination cursor, we'd lose results
* 350 and 450 when we paginate. So we have to take minResultId here.
*/
@Override
protected void setMergedMinSearchedStatusId(
ThriftSearchResults searchResults,
long currentMergedMin,
boolean resultsWereTrimmed) {
if (accumulatedResponses.getMinIds().isEmpty()) {
return;
}
long minId = currentMergedMin;
if (resultsWereTrimmed
&& (searchResults != null)
&& searchResults.isSetResults()
&& (searchResults.getResultsSize() > 0)) {
List<ThriftSearchResult> results = searchResults.getResults();
minId = results.get(results.size() - 1).getId();
}
searchResults.setMinSearchedStatusID(minId);
}
@Override
protected boolean clearEarlyTerminationIfReachingTierBottom(EarlybirdResponse mergedResponse) {
if (EarlybirdCluster.isArchive(cluster)) {
// We don't need to worry about the tier bottom when merging partition responses in the full
// archive cluster: if all partitions were exhausted and we didn't trim the results, then
// the early-terminated flag on the merged response will be false. If at least one partition
// is early-terminated, or we trimmed some results, then the early-terminated flag on the
// merged response will be true, and we should continue getting results from this tier before
// we move to the next one.
return false;
}
ThriftSearchResults searchResults = mergedResponse.getSearchResults();
if (searchResults.getMinSearchedStatusID() == getTierBottomId()) {
mergedResponse.getEarlyTerminationInfo().setEarlyTerminated(false);
mergedResponse.getEarlyTerminationInfo().unsetMergedEarlyTerminationReasons();
responseMessageBuilder.debugVerbose(
"Set earlytermination to false because minSearchedStatusId is tier bottom");
return true;
}
return false;
}
@Override
protected boolean shouldEarlyTerminateWhenEnoughTrimmedResults() {
return false;
}
@Override
protected final EarlyTerminationTrimmingStats getEarlyTerminationTrimmingStatsForPartitions() {
return PARTITION_MERGING_EARLY_TERMINATION_TRIMMING_STATS;
}
@Override
protected final EarlyTerminationTrimmingStats getEarlyTerminationTrimmingStatsForTiers() {
return TIER_MERGING_EARLY_TERMINATION_TRIMMING_STATS;
}
/** Determines the bottom of the realtime cluster, based on the partition responses. */
private long getTierBottomId() {
Preconditions.checkState(!EarlybirdCluster.isArchive(cluster));
long tierBottomId = -1;
for (EarlybirdResponse response : accumulatedResponses.getSuccessResponses()) {
if (!isEarlyTerminated(response)
&& response.isSetSearchResults()
&& response.getSearchResults().isSetMinSearchedStatusID()
&& (response.getSearchResults().getMinSearchedStatusID() > tierBottomId)) {
tierBottomId = response.getSearchResults().getMinSearchedStatusID();
}
}
return tierBottomId;
}
/** Determines the minId to which all results should be trimmed. */
private long getTrimmingMinId() {
List<Long> minIds = accumulatedResponses.getMinIds();
Preconditions.checkArgument(!minIds.isEmpty());
if (!EarlybirdCluster.isArchive(cluster)) {
return Collections.max(minIds);
}
long maxOfEarlyTerminatedMins = -1;
long minOfAllMins = Long.MAX_VALUE;
for (EarlybirdResponse response : accumulatedResponses.getSuccessResponses()) {
if (response.isSetSearchResults()
&& response.getSearchResults().isSetMinSearchedStatusID()) {
long minId = response.getSearchResults().getMinSearchedStatusID();
minOfAllMins = Math.min(minOfAllMins, minId);
if (isEarlyTerminated(response)) {
maxOfEarlyTerminatedMins = Math.max(maxOfEarlyTerminatedMins, minId);
}
}
}
if (maxOfEarlyTerminatedMins >= 0) {
return maxOfEarlyTerminatedMins;
} else {
return minOfAllMins;
}
}
/** Determines if the given earlybird response is early terminated. */
private boolean isEarlyTerminated(EarlybirdResponse response) {
return response.isSetEarlyTerminationInfo()
&& response.getEarlyTerminationInfo().isEarlyTerminated();
}
}