diff --git a/src/java/com/twitter/search/earlybird_root/filters/EarlybirdTimeRangeFilter.docx b/src/java/com/twitter/search/earlybird_root/filters/EarlybirdTimeRangeFilter.docx new file mode 100644 index 000000000..13998018a Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/filters/EarlybirdTimeRangeFilter.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/filters/EarlybirdTimeRangeFilter.java b/src/java/com/twitter/search/earlybird_root/filters/EarlybirdTimeRangeFilter.java deleted file mode 100644 index bd5eda6de..000000000 --- a/src/java/com/twitter/search/earlybird_root/filters/EarlybirdTimeRangeFilter.java +++ /dev/null @@ -1,205 +0,0 @@ -package com.twitter.search.earlybird_root.filters; - -import java.util.Collections; -import java.util.Map; -import java.util.Optional; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.collect.Maps; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.finagle.Service; -import com.twitter.finagle.SimpleFilter; -import com.twitter.search.common.decider.SearchDecider; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.util.earlybird.EarlybirdResponseUtil; -import com.twitter.search.earlybird.config.ServingRange; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.EarlybirdResponseCode; -import com.twitter.search.earlybird.thrift.ThriftSearchResults; -import com.twitter.search.earlybird_root.common.EarlybirdRequestContext; -import com.twitter.search.earlybird_root.common.EarlybirdRequestType; -import com.twitter.search.queryparser.query.Query; -import com.twitter.search.queryparser.query.QueryParserException; -import com.twitter.search.queryparser.util.IdTimeRanges; -import com.twitter.util.Future; - -/** - * A Finagle filter used to filter requests to tiers. 
- * Parses serialized query on Earlybird request, and extracts since / until / since_id / max_id - * operators. This filter then tests whether the request overlaps with the given tier. If there - * is no overlap, an empty response is returned without actually forwarding the requests to the - * underlying service. - */ -public class EarlybirdTimeRangeFilter extends - SimpleFilter { - - private static final Logger LOG = LoggerFactory.getLogger(EarlybirdTimeRangeFilter.class); - - private static final EarlybirdResponse ERROR_RESPONSE = - new EarlybirdResponse(EarlybirdResponseCode.PERSISTENT_ERROR, 0) - .setSearchResults(new ThriftSearchResults()); - - private final ServingRangeProvider servingRangeProvider; - private final Optional queryRewriter; - - private static final Map FAILED_REQUESTS; - static { - final Map tempMap = - Maps.newEnumMap(EarlybirdRequestType.class); - for (EarlybirdRequestType requestType : EarlybirdRequestType.values()) { - tempMap.put(requestType, SearchCounter.export( - "time_range_filter_" + requestType.getNormalizedName() + "_failed_requests")); - } - FAILED_REQUESTS = Collections.unmodifiableMap(tempMap); - } - - public static EarlybirdTimeRangeFilter newTimeRangeFilterWithQueryRewriter( - ServingRangeProvider servingRangeProvider, - SearchDecider decider) { - - return new EarlybirdTimeRangeFilter(servingRangeProvider, - Optional.of(new EarlybirdTimeFilterQueryRewriter(servingRangeProvider, decider))); - } - - public static EarlybirdTimeRangeFilter newTimeRangeFilterWithoutQueryRewriter( - ServingRangeProvider servingRangeProvider) { - - return new EarlybirdTimeRangeFilter(servingRangeProvider, Optional.empty()); - } - - /** - * Construct a filter that avoids forwarding requests to unrelated tiers - * based on requests' since / until / since_id / max_id. - * @param provider Holds the boundary information. 
- */ - EarlybirdTimeRangeFilter( - ServingRangeProvider provider, - Optional rewriter) { - - this.servingRangeProvider = provider; - this.queryRewriter = rewriter; - } - - public ServingRangeProvider getServingRangeProvider() { - return servingRangeProvider; - } - - @Override - public Future apply( - EarlybirdRequestContext requestContext, - Service service) { - - Query parsedQuery = requestContext.getParsedQuery(); - if (parsedQuery != null) { - // Only perform filtering if serialized query is set. - try { - IdTimeRanges queryRanges = IdTimeRanges.fromQuery(parsedQuery); - if (queryRanges == null) { - // No time ranges in query. - return issueServiceRequest(service, requestContext); - } - - ServingRange servingRange = - servingRangeProvider.getServingRange( - requestContext, requestContext.useOverrideTierConfig()); - - if (queryDoesNotOverlapWithServingRange(queryRanges, servingRange)) { - return Future.value(tierSkippedResponse(requestContext.getEarlybirdRequestType(), - servingRange)); - } else { - return issueServiceRequest(service, requestContext); - } - } catch (QueryParserException e) { - LOG.warn("Unable to get IdTimeRanges from query: " + parsedQuery.serialize()); - // The failure here is not due to a miss-formed query from the client, since we already - // were able to successfully get a parsed Query from the request. - // If we can't determine the time ranges, pass the query along to the tier, and just - // restrict it to the timeranges of the tier. - return issueServiceRequest(service, requestContext); - } - } else { - // There's no serialized query. Just pass through like an identity filter. - return issueServiceRequest(service, requestContext); - } - } - - private boolean queryDoesNotOverlapWithServingRange(IdTimeRanges queryRanges, - ServingRange servingRange) { - // As long as a query overlaps with the tier serving range on either side, - // the request is not filtered. I.e. 
we want to be conservative when doing this filtering, - // because it is just an optimization. We ignore the inclusiveness / exclusiveness of the - // boundaries. If the tier boundary and the query boundry happen to be the same, we do not - // filter the request. - return queryRanges.getSinceIDExclusive().or(0L) - > servingRange.getServingRangeMaxId() - || queryRanges.getMaxIDInclusive().or(Long.MAX_VALUE) - < servingRange.getServingRangeSinceId() - || queryRanges.getSinceTimeInclusive().or(0) - > servingRange.getServingRangeUntilTimeSecondsFromEpoch() - || queryRanges.getUntilTimeExclusive().or(Integer.MAX_VALUE) - < servingRange.getServingRangeSinceTimeSecondsFromEpoch(); - } - - private Future issueServiceRequest( - Service service, - EarlybirdRequestContext requestContext) { - - try { - EarlybirdRequestContext request = requestContext; - if (queryRewriter.isPresent()) { - request = queryRewriter.get().rewriteRequest(requestContext); - } - return service.apply(request); - } catch (QueryParserException e) { - FAILED_REQUESTS.get(requestContext.getEarlybirdRequestType()).increment(); - String msg = "Failed to add time filter operators"; - LOG.error(msg, e); - - // Note that in this case it is not clear whether the error is the client's fault or our - // fault, so we don't necessarily return a CLIENT_ERROR here. - // Currently this actually returns a PERSISTENT_ERROR. - if (requestContext.getRequest().getDebugMode() > 0) { - return Future.value( - ERROR_RESPONSE.deepCopy().setDebugString(msg + ": " + e.getMessage())); - } else { - return Future.value(ERROR_RESPONSE); - } - } - } - - /** - * Creates a tier skipped response, based on the given request type. - * - * For recency, relevance, facets and top tweets requests, this method returns a SUCCESS response - * with no search results and the minSearchedStatusID and maxSearchedStatusID appropriately set. - * For term stats response, it returns a TIER_SKIPPED response, but we need to revisit this. 
- * - * @param requestType The type of the request. - * @param servingRange The serving range of the tier that we're skipping. - */ - @VisibleForTesting - public static EarlybirdResponse tierSkippedResponse( - EarlybirdRequestType requestType, - ServingRange servingRange) { - String debugMessage = - "Tier skipped because it does not intersect with query time boundaries."; - if (requestType == EarlybirdRequestType.TERM_STATS) { - // If it's a term stats request, return a TIER_SKIPPED response for now. - // But we need to figure out the right thing to do here. - return new EarlybirdResponse(EarlybirdResponseCode.TIER_SKIPPED, 0) - .setDebugString(debugMessage); - } else { - // minIds in ServingRange instances are set to tierLowerBoundary - 1, because the - // since_id operator is exclusive. The max_id operator on the other hand is inclusive, - // so maxIds in ServingRange instances are also set to tierUpperBoundary - 1. - // Here we want both of them to be inclusive, so we need to increment the minId by 1. 
- return EarlybirdResponseUtil.tierSkippedRootResponse( - servingRange.getServingRangeSinceId() + 1, - servingRange.getServingRangeMaxId(), - debugMessage); - } - } -} diff --git a/src/java/com/twitter/search/earlybird_root/filters/FullArchiveProtectedOperatorFilter.docx b/src/java/com/twitter/search/earlybird_root/filters/FullArchiveProtectedOperatorFilter.docx new file mode 100644 index 000000000..388c361ff Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/filters/FullArchiveProtectedOperatorFilter.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/filters/FullArchiveProtectedOperatorFilter.java b/src/java/com/twitter/search/earlybird_root/filters/FullArchiveProtectedOperatorFilter.java deleted file mode 100644 index 778f118e4..000000000 --- a/src/java/com/twitter/search/earlybird_root/filters/FullArchiveProtectedOperatorFilter.java +++ /dev/null @@ -1,167 +0,0 @@ -package com.twitter.search.earlybird_root.filters; - -import java.util.List; - -import javax.inject.Inject; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.finagle.Service; -import com.twitter.finagle.SimpleFilter; -import com.twitter.search.common.decider.SearchDecider; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.earlybird.thrift.EarlybirdDebugInfo; -import com.twitter.search.earlybird.thrift.EarlybirdRequest; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.EarlybirdResponseCode; -import com.twitter.search.earlybird_root.common.EarlybirdRequestContext; -import com.twitter.search.queryparser.query.Query; -import com.twitter.search.queryparser.query.QueryNodeUtils; -import com.twitter.search.queryparser.query.QueryParserException; -import com.twitter.search.queryparser.query.search.SearchOperator; -import com.twitter.search.queryparser.query.search.SearchOperatorConstants; -import 
com.twitter.search.queryparser.visitors.DropAllProtectedOperatorVisitor; -import com.twitter.search.queryparser.visitors.QueryTreeIndex; -import com.twitter.util.Future; - -/** - * Full archive service filter validates requests with a protected operator, appends the - * '[exclude protected]' operator by default, and appends '[filter protected]' operator instead if - * 'getProtectedTweetsOnly' request param is set. A client error response is returned if any of the - * following rules is violated. - * 1. There is at most one 'protected' operator in the query. - * 2. If there is a 'protected' operator, it must be in the query root node. - * 3. The parent node of the 'protected' operator must not be negated and must be a conjunction. - * 4. If there is a positive 'protected' operator, 'followedUserIds' and 'searcherId' request - * params must be set. - */ -public class FullArchiveProtectedOperatorFilter extends - SimpleFilter { - private static final Logger LOG = - LoggerFactory.getLogger(FullArchiveProtectedOperatorFilter.class); - private static final SearchOperator EXCLUDE_PROTECTED_OPERATOR = - new SearchOperator(SearchOperator.Type.EXCLUDE, SearchOperatorConstants.PROTECTED); - private static final SearchOperator FILTER_PROTECTED_OPERATOR = - new SearchOperator(SearchOperator.Type.FILTER, SearchOperatorConstants.PROTECTED); - private static final SearchCounter QUERY_PARSER_FAILURE_COUNT = - SearchCounter.export("protected_operator_filter_query_parser_failure_count"); - - private final DropAllProtectedOperatorVisitor dropProtectedOperatorVisitor; - private final SearchDecider decider; - - @Inject - public FullArchiveProtectedOperatorFilter( - DropAllProtectedOperatorVisitor dropProtectedOperatorVisitor, - SearchDecider decider) { - this.dropProtectedOperatorVisitor = dropProtectedOperatorVisitor; - this.decider = decider; - } - - @Override - public Future apply( - EarlybirdRequestContext requestContext, - Service service) { - Query query = 
requestContext.getParsedQuery(); - if (query == null) { - return service.apply(requestContext); - } - - QueryTreeIndex queryTreeIndex = QueryTreeIndex.buildFor(query); - List nodeList = queryTreeIndex.getNodeList(); - // try to find a protected operator, returns error response if more than one protected - // operator is detected - SearchOperator protectedOperator = null; - for (Query node : nodeList) { - if (node instanceof SearchOperator) { - SearchOperator searchOp = (SearchOperator) node; - if (SearchOperatorConstants.PROTECTED.equals(searchOp.getOperand())) { - if (protectedOperator == null) { - protectedOperator = searchOp; - } else { - return createErrorResponse("Only one 'protected' operator is expected."); - } - } - } - } - - Query processedQuery; - if (protectedOperator == null) { - // no protected operator is detected, append '[exclude protected]' by default - processedQuery = QueryNodeUtils.appendAsConjunction(query, EXCLUDE_PROTECTED_OPERATOR); - } else { - // protected operator must be in the query root node - if (queryTreeIndex.getParentOf(protectedOperator) != query) { - return createErrorResponse("'protected' operator must be in the query root node"); - } - // the query node that contains protected operator must not be negated - if (query.mustNotOccur()) { - return createErrorResponse("The query node that contains a 'protected' operator must not" - + " be negated."); - } - // the query node that contains protected operator must be a conjunction - if (!query.isTypeOf(Query.QueryType.CONJUNCTION)) { - return createErrorResponse("The query node that contains a 'protected' operator must" - + " be a conjunction."); - } - // check the existence of 'followedUserIds' and 'searcherId' if it is a positive operator - if (isPositive(protectedOperator)) { - if (!validateRequestParam(requestContext.getRequest())) { - return createErrorResponse("'followedUserIds' and 'searcherId' are required " - + "by positive 'protected' operator."); - } - } - processedQuery = 
query; - } - // update processedQuery if 'getProtectedTweetsOnly' is set to true, it takes precedence over - // the existing protected operators - if (requestContext.getRequest().isGetProtectedTweetsOnly()) { - if (!validateRequestParam(requestContext.getRequest())) { - return createErrorResponse("'followedUserIds' and 'searcherId' are required " - + "when 'getProtectedTweetsOnly' is set to true."); - } - try { - processedQuery = processedQuery.accept(dropProtectedOperatorVisitor); - } catch (QueryParserException e) { - // this should not happen since we already have a parsed query - QUERY_PARSER_FAILURE_COUNT.increment(); - LOG.warn( - "Failed to drop protected operator for serialized query: " + query.serialize(), e); - } - processedQuery = - QueryNodeUtils.appendAsConjunction(processedQuery, FILTER_PROTECTED_OPERATOR); - } - - if (processedQuery == query) { - return service.apply(requestContext); - } else { - EarlybirdRequestContext clonedRequestContext = - EarlybirdRequestContext.copyRequestContext(requestContext, processedQuery); - return service.apply(clonedRequestContext); - } - } - - private boolean validateRequestParam(EarlybirdRequest request) { - List followedUserIds = request.followedUserIds; - Long searcherId = (request.searchQuery != null && request.searchQuery.isSetSearcherId()) - ? 
request.searchQuery.getSearcherId() : null; - return followedUserIds != null && !followedUserIds.isEmpty() && searcherId != null; - } - - private boolean isPositive(SearchOperator searchOp) { - boolean isNegateExclude = searchOp.mustNotOccur() - && searchOp.getOperatorType() == SearchOperator.Type.EXCLUDE; - boolean isPositive = !searchOp.mustNotOccur() - && (searchOp.getOperatorType() == SearchOperator.Type.INCLUDE - || searchOp.getOperatorType() == SearchOperator.Type.FILTER); - return isNegateExclude || isPositive; - } - - private Future createErrorResponse(String errorMsg) { - EarlybirdResponse response = new EarlybirdResponse(EarlybirdResponseCode.CLIENT_ERROR, 0); - response.setDebugInfo(new EarlybirdDebugInfo().setHost("full_archive_root")); - response.setDebugString(errorMsg); - return Future.value(response); - } - -} diff --git a/src/java/com/twitter/search/earlybird_root/filters/FullArchiveServingRangeProvider.docx b/src/java/com/twitter/search/earlybird_root/filters/FullArchiveServingRangeProvider.docx new file mode 100644 index 000000000..681e2fc99 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/filters/FullArchiveServingRangeProvider.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/filters/FullArchiveServingRangeProvider.java b/src/java/com/twitter/search/earlybird_root/filters/FullArchiveServingRangeProvider.java deleted file mode 100644 index e7ec96963..000000000 --- a/src/java/com/twitter/search/earlybird_root/filters/FullArchiveServingRangeProvider.java +++ /dev/null @@ -1,64 +0,0 @@ -package com.twitter.search.earlybird_root.filters; - -import java.util.Date; -import java.util.concurrent.TimeUnit; - -import com.twitter.search.common.decider.SearchDecider; -import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser; -import com.twitter.search.common.util.date.DateUtil; -import com.twitter.search.earlybird.config.ServingRange; -import 
com.twitter.search.earlybird_root.common.EarlybirdRequestContext; - -public class FullArchiveServingRangeProvider implements ServingRangeProvider { - - public static final Date FULL_ARCHIVE_START_DATE = DateUtil.toDate(2006, 3, 21); - private static final int DEFAULT_SERVING_RANGE_BOUNDARY_HOURS_AGO = 48; - - private final SearchDecider decider; - private final String deciderKey; - - public FullArchiveServingRangeProvider( - SearchDecider decider, String deciderKey) { - this.decider = decider; - this.deciderKey = deciderKey; - } - - @Override - public ServingRange getServingRange( - final EarlybirdRequestContext requestContext, boolean useBoundaryOverride) { - return new ServingRange() { - @Override - public long getServingRangeSinceId() { - // we use 1 instead of 0, because the since_id operator is inclusive in earlybirds. - return 1L; - } - - @Override - public long getServingRangeMaxId() { - long servingRangeEndMillis = TimeUnit.HOURS.toMillis( - (decider.featureExists(deciderKey)) - ? decider.getAvailability(deciderKey) - : DEFAULT_SERVING_RANGE_BOUNDARY_HOURS_AGO); - - long boundaryTime = requestContext.getCreatedTimeMillis() - servingRangeEndMillis; - return SnowflakeIdParser.generateValidStatusId(boundaryTime, 0); - } - - @Override - public long getServingRangeSinceTimeSecondsFromEpoch() { - return FULL_ARCHIVE_START_DATE.getTime() / 1000; - } - - @Override - public long getServingRangeUntilTimeSecondsFromEpoch() { - long servingRangeEndMillis = TimeUnit.HOURS.toMillis( - (decider.featureExists(deciderKey)) - ? 
decider.getAvailability(deciderKey) - : DEFAULT_SERVING_RANGE_BOUNDARY_HOURS_AGO); - - long boundaryTime = requestContext.getCreatedTimeMillis() - servingRangeEndMillis; - return boundaryTime / 1000; - } - }; - } -} diff --git a/src/java/com/twitter/search/earlybird_root/filters/InitializeRequestContextFilter.docx b/src/java/com/twitter/search/earlybird_root/filters/InitializeRequestContextFilter.docx new file mode 100644 index 000000000..fed999a15 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/filters/InitializeRequestContextFilter.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/filters/InitializeRequestContextFilter.java b/src/java/com/twitter/search/earlybird_root/filters/InitializeRequestContextFilter.java deleted file mode 100644 index a1f5bafa2..000000000 --- a/src/java/com/twitter/search/earlybird_root/filters/InitializeRequestContextFilter.java +++ /dev/null @@ -1,66 +0,0 @@ -package com.twitter.search.earlybird_root.filters; - -import javax.inject.Inject; - -import com.google.common.annotations.VisibleForTesting; - -import com.twitter.common.util.Clock; -import com.twitter.finagle.Filter; -import com.twitter.finagle.Service; -import com.twitter.search.common.decider.SearchDecider; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.earlybird.common.EarlybirdRequestUtil; -import com.twitter.search.earlybird.thrift.EarlybirdRequest; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird_root.common.EarlybirdRequestContext; -import com.twitter.search.earlybird_root.common.QueryParsingUtils; -import com.twitter.search.earlybird_root.common.TwitterContextProvider; -import com.twitter.search.queryparser.query.QueryParserException; -import com.twitter.util.Future; - -/** - * Creates a new RequestContext from an EarlybirdRequest, and passes the RequestContext down to - * the rest of the filter/service chain. 
- */ -public class InitializeRequestContextFilter extends - Filter { - - @VisibleForTesting - static final SearchCounter FAILED_QUERY_PARSING = - SearchCounter.export("initialize_request_context_filter_query_parsing_failure"); - - private final SearchDecider decider; - private final TwitterContextProvider twitterContextProvider; - private final Clock clock; - - /** - * The constructor of the filter. - */ - @Inject - public InitializeRequestContextFilter(SearchDecider decider, - TwitterContextProvider twitterContextProvider, - Clock clock) { - this.decider = decider; - this.twitterContextProvider = twitterContextProvider; - this.clock = clock; - } - - @Override - public Future apply( - EarlybirdRequest request, - Service service) { - - EarlybirdRequestUtil.recordClientClockDiff(request); - - EarlybirdRequestContext requestContext; - try { - requestContext = EarlybirdRequestContext.newContext( - request, decider, twitterContextProvider.get(), clock); - } catch (QueryParserException e) { - FAILED_QUERY_PARSING.increment(); - return QueryParsingUtils.newClientErrorResponse(request, e); - } - - return service.apply(requestContext); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/filters/IsUserProtectedMetadataTrackingFilter.docx b/src/java/com/twitter/search/earlybird_root/filters/IsUserProtectedMetadataTrackingFilter.docx new file mode 100644 index 000000000..58570b913 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/filters/IsUserProtectedMetadataTrackingFilter.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/filters/IsUserProtectedMetadataTrackingFilter.java b/src/java/com/twitter/search/earlybird_root/filters/IsUserProtectedMetadataTrackingFilter.java deleted file mode 100644 index 9d7b2c0fe..000000000 --- a/src/java/com/twitter/search/earlybird_root/filters/IsUserProtectedMetadataTrackingFilter.java +++ /dev/null @@ -1,80 +0,0 @@ -package com.twitter.search.earlybird_root.filters; - -import java.util.EnumMap; 
-import java.util.List; -import java.util.Map; - -import com.google.common.annotations.VisibleForTesting; - -import com.twitter.finagle.Service; -import com.twitter.finagle.SimpleFilter; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.ThriftSearchResult; -import com.twitter.search.earlybird.thrift.ThriftSearchResultExtraMetadata; -import com.twitter.search.earlybird_root.common.EarlybirdRequestContext; -import com.twitter.search.earlybird_root.common.EarlybirdRequestType; -import com.twitter.util.Future; -import com.twitter.util.FutureEventListener; - -/** - * Filter tracks the isUserProtected metadata stats returned from Earlybirds. - */ -public class IsUserProtectedMetadataTrackingFilter - extends SimpleFilter { - private static final String COUNTER_PREFIX = "is_user_protected_metadata_count_filter_"; - @VisibleForTesting - final Map totalCounterByRequestTypeMap; - @VisibleForTesting - final Map isProtectedCounterByRequestTypeMap; - - public IsUserProtectedMetadataTrackingFilter() { - this.totalCounterByRequestTypeMap = new EnumMap<>(EarlybirdRequestType.class); - this.isProtectedCounterByRequestTypeMap = new EnumMap<>(EarlybirdRequestType.class); - for (EarlybirdRequestType requestType : EarlybirdRequestType.values()) { - this.totalCounterByRequestTypeMap.put(requestType, - SearchCounter.export(COUNTER_PREFIX + requestType.getNormalizedName() + "_total")); - this.isProtectedCounterByRequestTypeMap.put(requestType, - SearchCounter.export(COUNTER_PREFIX + requestType.getNormalizedName() + "_is_protected")); - } - } - - @Override - public Future apply( - EarlybirdRequestContext request, - Service service) { - Future response = service.apply(request); - - EarlybirdRequestType requestType = request.getEarlybirdRequestType(); - response.addEventListener(new FutureEventListener() { - @Override - public void onSuccess(EarlybirdResponse response) { - if 
(!response.isSetSearchResults() || response.getSearchResults().getResults().isEmpty()) { - return; - } - List searchResults = response.getSearchResults().getResults(); - int totalCount = searchResults.size(); - int isUserProtectedCount = 0; - for (ThriftSearchResult searchResult : searchResults) { - if (searchResult.isSetMetadata() && searchResult.getMetadata().isSetExtraMetadata()) { - ThriftSearchResultExtraMetadata extraMetadata = - searchResult.getMetadata().getExtraMetadata(); - if (extraMetadata.isIsUserProtected()) { - isUserProtectedCount++; - } - } - } - IsUserProtectedMetadataTrackingFilter.this - .totalCounterByRequestTypeMap.get(requestType).add(totalCount); - IsUserProtectedMetadataTrackingFilter.this - .isProtectedCounterByRequestTypeMap.get(requestType).add(isUserProtectedCount); - } - - @Override - public void onFailure(Throwable cause) { } - }); - - return response; - } - -} diff --git a/src/java/com/twitter/search/earlybird_root/filters/MarkTweetSourceFilter.docx b/src/java/com/twitter/search/earlybird_root/filters/MarkTweetSourceFilter.docx new file mode 100644 index 000000000..3a08263eb Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/filters/MarkTweetSourceFilter.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/filters/MarkTweetSourceFilter.java b/src/java/com/twitter/search/earlybird_root/filters/MarkTweetSourceFilter.java deleted file mode 100644 index 2a6321089..000000000 --- a/src/java/com/twitter/search/earlybird_root/filters/MarkTweetSourceFilter.java +++ /dev/null @@ -1,49 +0,0 @@ -package com.twitter.search.earlybird_root.filters; - -import com.twitter.finagle.Service; -import com.twitter.finagle.SimpleFilter; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.EarlybirdResponseCode; -import com.twitter.search.earlybird.thrift.ThriftSearchResult; -import 
com.twitter.search.earlybird.thrift.ThriftTweetSource; -import com.twitter.search.earlybird_root.common.EarlybirdRequestContext; -import com.twitter.search.earlybird_root.common.EarlybirdRequestType; -import com.twitter.util.Function; -import com.twitter.util.Future; - -public class MarkTweetSourceFilter - extends SimpleFilter { - private final SearchCounter searchResultsNotSet; - - private final ThriftTweetSource tweetSource; - - public MarkTweetSourceFilter(ThriftTweetSource tweetSource) { - this.tweetSource = tweetSource; - searchResultsNotSet = SearchCounter.export( - tweetSource.name().toLowerCase() + "_mark_tweet_source_filter_search_results_not_set"); - } - - @Override - public Future apply( - final EarlybirdRequestContext requestContext, - Service service) { - return service.apply(requestContext).map(new Function() { - @Override - public EarlybirdResponse apply(EarlybirdResponse response) { - if (response.getResponseCode() == EarlybirdResponseCode.SUCCESS - && requestContext.getEarlybirdRequestType() != EarlybirdRequestType.TERM_STATS) { - if (!response.isSetSearchResults()) { - searchResultsNotSet.increment(); - } else { - for (ThriftSearchResult searchResult : response.getSearchResults().getResults()) { - searchResult.setTweetSource(tweetSource); - } - } - } - return response; - } - } - ); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/filters/MetadataTrackingFilter.docx b/src/java/com/twitter/search/earlybird_root/filters/MetadataTrackingFilter.docx new file mode 100644 index 000000000..8c8f5a20e Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/filters/MetadataTrackingFilter.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/filters/MetadataTrackingFilter.java b/src/java/com/twitter/search/earlybird_root/filters/MetadataTrackingFilter.java deleted file mode 100644 index 8a1b29fc6..000000000 --- a/src/java/com/twitter/search/earlybird_root/filters/MetadataTrackingFilter.java +++ /dev/null @@ -1,119 
+0,0 @@ -package com.twitter.search.earlybird_root.filters; - -import java.util.List; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.cache.CacheBuilder; -import com.google.common.cache.CacheLoader; -import com.google.common.cache.LoadingCache; - -import com.twitter.finagle.Service; -import com.twitter.finagle.SimpleFilter; -import com.twitter.search.common.metrics.SearchMovingAverage; -import com.twitter.search.earlybird.common.ClientIdUtil; -import com.twitter.search.earlybird.thrift.EarlybirdRequest; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.EarlybirdResponseCode; -import com.twitter.search.earlybird.thrift.ThriftSearchResult; -import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadata; -import com.twitter.search.earlybird_root.common.EarlybirdRequestType; -import com.twitter.util.Future; -import com.twitter.util.FutureEventListener; - -/** - * Filter that is tracking the engagement stats returned from Earlybirds. 
- */ -public class MetadataTrackingFilter extends SimpleFilter { - - private static final String SCORING_SIGNAL_STAT_PREFIX = "scoring_signal_"; - private static final String SCORE_STAT_PATTERN = "client_id_score_tracker_for_%s_x100"; - - @VisibleForTesting - static final SearchMovingAverage SCORING_SIGNAL_FAV_COUNT = - SearchMovingAverage.export(SCORING_SIGNAL_STAT_PREFIX + "fav_count"); - - @VisibleForTesting - static final SearchMovingAverage SCORING_SIGNAL_REPLY_COUNT = - SearchMovingAverage.export(SCORING_SIGNAL_STAT_PREFIX + "reply_count"); - - @VisibleForTesting - static final SearchMovingAverage SCORING_SIGNAL_RETWEET_COUNT = - SearchMovingAverage.export(SCORING_SIGNAL_STAT_PREFIX + "retweet_count"); - - @VisibleForTesting - static final LoadingCache CLIENT_SCORE_METRICS_LOADING_CACHE = - CacheBuilder.newBuilder().build(new CacheLoader() { - public SearchMovingAverage load(String clientId) { - return SearchMovingAverage.export(String.format(SCORE_STAT_PATTERN, clientId)); - } - }); - - @Override - public Future apply(final EarlybirdRequest request, - Service service) { - - Future response = service.apply(request); - - response.addEventListener(new FutureEventListener() { - @Override - public void onSuccess(EarlybirdResponse earlybirdResponse) { - EarlybirdRequestType type = EarlybirdRequestType.of(request); - - if (earlybirdResponse.responseCode == EarlybirdResponseCode.SUCCESS - && type == EarlybirdRequestType.RELEVANCE - && earlybirdResponse.isSetSearchResults() - && earlybirdResponse.getSearchResults().isSetResults()) { - - List searchResults = earlybirdResponse.getSearchResults() - .getResults(); - - long totalFavoriteAmount = 0; - long totalReplyAmount = 0; - long totalRetweetAmount = 0; - double totalScoreX100 = 0; - - for (ThriftSearchResult result : searchResults) { - if (!result.isSetMetadata()) { - continue; - } - - ThriftSearchResultMetadata metadata = result.getMetadata(); - - if (metadata.isSetFavCount()) { - totalFavoriteAmount += 
metadata.getFavCount(); - } - - if (metadata.isSetReplyCount()) { - totalReplyAmount += metadata.getReplyCount(); - } - - if (metadata.isSetRetweetCount()) { - totalRetweetAmount += metadata.getRetweetCount(); - } - - if (metadata.isSetScore()) { - // Scale up the score by 100 so that scores are at least 1 and visible on viz graph - totalScoreX100 += metadata.getScore() * 100; - } - } - - // We only count present engagement counts but report the full size of the search results. - // This means that we consider the missing counts as being 0. - SCORING_SIGNAL_FAV_COUNT.addSamples(totalFavoriteAmount, searchResults.size()); - SCORING_SIGNAL_REPLY_COUNT.addSamples(totalReplyAmount, searchResults.size()); - SCORING_SIGNAL_RETWEET_COUNT.addSamples(totalRetweetAmount, searchResults.size()); - // Export per client id average scores. - String requestClientId = ClientIdUtil.getClientIdFromRequest(request); - String quotaClientId = ClientIdUtil.getQuotaClientId(requestClientId); - CLIENT_SCORE_METRICS_LOADING_CACHE.getUnchecked(quotaClientId) - .addSamples((long) totalScoreX100, searchResults.size()); - } - } - - @Override - public void onFailure(Throwable cause) { } - }); - - return response; - } -} diff --git a/src/java/com/twitter/search/earlybird_root/filters/NamedMultiTermDisjunctionStatsFilter.docx b/src/java/com/twitter/search/earlybird_root/filters/NamedMultiTermDisjunctionStatsFilter.docx new file mode 100644 index 000000000..ed87a0cce Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/filters/NamedMultiTermDisjunctionStatsFilter.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/filters/NamedMultiTermDisjunctionStatsFilter.java b/src/java/com/twitter/search/earlybird_root/filters/NamedMultiTermDisjunctionStatsFilter.java deleted file mode 100644 index c75864124..000000000 --- a/src/java/com/twitter/search/earlybird_root/filters/NamedMultiTermDisjunctionStatsFilter.java +++ /dev/null @@ -1,45 +0,0 @@ -package 
com.twitter.search.earlybird_root.filters; - -import java.util.List; -import java.util.Map; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.ConcurrentMap; - -import com.twitter.finagle.Service; -import com.twitter.finagle.SimpleFilter; -import com.twitter.search.common.metrics.Percentile; -import com.twitter.search.common.metrics.PercentileUtil; -import com.twitter.search.earlybird.thrift.EarlybirdRequest; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.util.Future; - -public class NamedMultiTermDisjunctionStatsFilter extends - SimpleFilter { - - private static final String STAT_FORMAT = "named_disjunction_size_client_%s_key_%s"; - // ClientID -> disjunction name -> operand count - private static final ConcurrentMap>> - NAMED_MULTI_TERM_DISJUNCTION_IDS_COUNT = new ConcurrentHashMap<>(); - - @Override - public Future apply(EarlybirdRequest request, - Service service) { - - if (request.getSearchQuery().isSetNamedDisjunctionMap()) { - for (Map.Entry> entry - : request.getSearchQuery().getNamedDisjunctionMap().entrySet()) { - - Map> statsForClient = - NAMED_MULTI_TERM_DISJUNCTION_IDS_COUNT.computeIfAbsent( - request.getClientId(), clientId -> new ConcurrentHashMap<>()); - Percentile stats = statsForClient.computeIfAbsent(entry.getKey(), - keyName -> PercentileUtil.createPercentile( - String.format(STAT_FORMAT, request.getClientId(), keyName))); - - stats.record(entry.getValue().size()); - } - } - - return service.apply(request); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/filters/NullcastTrackingFilter.docx b/src/java/com/twitter/search/earlybird_root/filters/NullcastTrackingFilter.docx new file mode 100644 index 000000000..ceba0fb57 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/filters/NullcastTrackingFilter.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/filters/NullcastTrackingFilter.java 
b/src/java/com/twitter/search/earlybird_root/filters/NullcastTrackingFilter.java deleted file mode 100644 index d7003533d..000000000 --- a/src/java/com/twitter/search/earlybird_root/filters/NullcastTrackingFilter.java +++ /dev/null @@ -1,81 +0,0 @@ -package com.twitter.search.earlybird_root.filters; - -import java.util.HashSet; -import java.util.Set; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.collect.ImmutableSet; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.util.earlybird.EarlybirdResponseUtil; -import com.twitter.search.earlybird.thrift.EarlybirdRequest; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird_root.common.EarlybirdRequestContext; -import com.twitter.search.queryparser.query.search.SearchOperatorConstants; -import com.twitter.search.queryparser.visitors.DetectPositiveOperatorVisitor; - -/** - * Filter that is tracking the unexpected nullcast results from Earlybirds. 
- */ -public class NullcastTrackingFilter extends SensitiveResultsTrackingFilter { - public NullcastTrackingFilter() { - super("unexpected nullcast tweets", true); - } - - private static final Logger LOG = LoggerFactory.getLogger(NullcastTrackingFilter.class); - - @VisibleForTesting - static final SearchCounter BAD_NULLCAST_QUERY_COUNT = - SearchCounter.export("unexpected_nullcast_query_count"); - - @VisibleForTesting - static final SearchCounter BAD_NULLCAST_RESULT_COUNT = - SearchCounter.export("unexpected_nullcast_result_count"); - - @Override - protected Logger getLogger() { - return LOG; - } - - @Override - protected SearchCounter getSensitiveQueryCounter() { - return BAD_NULLCAST_QUERY_COUNT; - } - - @Override - protected SearchCounter getSensitiveResultsCounter() { - return BAD_NULLCAST_RESULT_COUNT; - } - - @Override - protected Set getSensitiveResults(EarlybirdRequestContext requestContext, - EarlybirdResponse earlybirdResponse) throws Exception { - if (!requestContext.getParsedQuery().accept( - new DetectPositiveOperatorVisitor(SearchOperatorConstants.NULLCAST))) { - return EarlybirdResponseUtil.findUnexpectedNullcastStatusIds( - earlybirdResponse.getSearchResults(), requestContext.getRequest()); - } else { - return new HashSet<>(); - } - } - - /** - * Some Earlybird requests are not searches, instead, they are scoring requests. - * These requests supply a list of IDs to be scored. - * It is OK to return nullcast tweet result if the ID is supplied in the request. - * This extracts the scoring request tweet IDs. 
- */ - @Override - protected Set getExceptedResults(EarlybirdRequestContext requestContext) { - EarlybirdRequest request = requestContext.getRequest(); - if (request == null - || !request.isSetSearchQuery() - || request.getSearchQuery().getSearchStatusIdsSize() == 0) { - return ImmutableSet.of(); - } - return request.getSearchQuery().getSearchStatusIds(); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/filters/PostCacheRequestTypeCountFilter.docx b/src/java/com/twitter/search/earlybird_root/filters/PostCacheRequestTypeCountFilter.docx new file mode 100644 index 000000000..aeacf6c01 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/filters/PostCacheRequestTypeCountFilter.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/filters/PostCacheRequestTypeCountFilter.java b/src/java/com/twitter/search/earlybird_root/filters/PostCacheRequestTypeCountFilter.java deleted file mode 100644 index d83fd1227..000000000 --- a/src/java/com/twitter/search/earlybird_root/filters/PostCacheRequestTypeCountFilter.java +++ /dev/null @@ -1,10 +0,0 @@ -package com.twitter.search.earlybird_root.filters; - -import javax.inject.Inject; - -public class PostCacheRequestTypeCountFilter extends RequestTypeCountFilter { - @Inject - public PostCacheRequestTypeCountFilter() { - super("post_cache"); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/filters/PreCacheRequestTypeCountFilter.docx b/src/java/com/twitter/search/earlybird_root/filters/PreCacheRequestTypeCountFilter.docx new file mode 100644 index 000000000..ff88f508a Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/filters/PreCacheRequestTypeCountFilter.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/filters/PreCacheRequestTypeCountFilter.java b/src/java/com/twitter/search/earlybird_root/filters/PreCacheRequestTypeCountFilter.java deleted file mode 100644 index e5d2b00c7..000000000 --- 
a/src/java/com/twitter/search/earlybird_root/filters/PreCacheRequestTypeCountFilter.java +++ /dev/null @@ -1,10 +0,0 @@ -package com.twitter.search.earlybird_root.filters; - -import javax.inject.Inject; - -public class PreCacheRequestTypeCountFilter extends RequestTypeCountFilter { - @Inject - public PreCacheRequestTypeCountFilter() { - super("pre_cache"); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/filters/QueryLangStatFilter.docx b/src/java/com/twitter/search/earlybird_root/filters/QueryLangStatFilter.docx new file mode 100644 index 000000000..f0a69f1ca Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/filters/QueryLangStatFilter.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/filters/QueryLangStatFilter.java b/src/java/com/twitter/search/earlybird_root/filters/QueryLangStatFilter.java deleted file mode 100644 index dbbc3d23a..000000000 --- a/src/java/com/twitter/search/earlybird_root/filters/QueryLangStatFilter.java +++ /dev/null @@ -1,114 +0,0 @@ -package com.twitter.search.earlybird_root.filters; - -import java.util.Map; -import java.util.concurrent.ConcurrentHashMap; -import javax.inject.Inject; -import javax.inject.Singleton; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; - -import com.twitter.common.text.language.LocaleUtil; -import com.twitter.finagle.Service; -import com.twitter.finagle.SimpleFilter; -import com.twitter.search.common.constants.thriftjava.ThriftLanguage; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.util.lang.ThriftLanguageUtil; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.ThriftSearchQuery; -import com.twitter.search.earlybird_root.common.EarlybirdRequestContext; -import com.twitter.util.Future; - -/** - * Export stats for query languages. 
- */ -@Singleton -public class QueryLangStatFilter - extends SimpleFilter { - - public static class Config { - // We put a limit here in case an error in the client are sending us random lang codes. - private int maxNumberOfLangs; - - public Config(int maxNumberOfLangs) { - this.maxNumberOfLangs = maxNumberOfLangs; - } - - public int getMaxNumberOfLangs() { - return maxNumberOfLangs; - } - } - - @VisibleForTesting - protected static final String LANG_STATS_PREFIX = "num_queries_in_lang_"; - - private final Config config; - private final SearchCounter allCountsForLangsOverMaxNumLang = - SearchCounter.export(LANG_STATS_PREFIX + "overflow"); - - private final ConcurrentHashMap langCounters = - new ConcurrentHashMap<>(); - - @Inject - public QueryLangStatFilter(Config config) { - this.config = config; - } - - private SearchCounter getCounter(String lang) { - Preconditions.checkNotNull(lang); - - SearchCounter counter = langCounters.get(lang); - if (counter == null) { - if (langCounters.size() >= config.getMaxNumberOfLangs()) { - return allCountsForLangsOverMaxNumLang; - } - synchronized (langCounters) { // This double-checked locking is safe, - // since we're using a ConcurrentHashMap - counter = langCounters.get(lang); - if (counter == null) { - counter = SearchCounter.export(LANG_STATS_PREFIX + lang); - langCounters.put(lang, counter); - } - } - } - - return counter; - } - - @Override - public Future apply( - EarlybirdRequestContext requestContext, - Service service) { - - String lang = null; - - ThriftSearchQuery searchQuery = requestContext.getRequest().getSearchQuery(); - - lang = searchQuery.getQueryLang(); - - if (lang == null) { - // fallback to ui lang - lang = searchQuery.getUiLang(); - } - - if (lang == null && searchQuery.isSetUserLangs()) { - // fallback to the user lang with the highest confidence - double maxConfidence = Double.MIN_VALUE; - - for (Map.Entry entry : searchQuery.getUserLangs().entrySet()) { - if (entry.getValue() > maxConfidence) { - lang 
= ThriftLanguageUtil.getLanguageCodeOf(entry.getKey()); - maxConfidence = entry.getValue(); - } - } - } - - if (lang == null) { - lang = LocaleUtil.UNDETERMINED_LANGUAGE; - } - - getCounter(lang).increment(); - - return service.apply(requestContext); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/filters/QueryOperatorStatFilter.docx b/src/java/com/twitter/search/earlybird_root/filters/QueryOperatorStatFilter.docx new file mode 100644 index 000000000..d93169cf1 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/filters/QueryOperatorStatFilter.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/filters/QueryOperatorStatFilter.java b/src/java/com/twitter/search/earlybird_root/filters/QueryOperatorStatFilter.java deleted file mode 100644 index 1b17299f9..000000000 --- a/src/java/com/twitter/search/earlybird_root/filters/QueryOperatorStatFilter.java +++ /dev/null @@ -1,194 +0,0 @@ -package com.twitter.search.earlybird_root.filters; - -import java.util.EnumSet; -import java.util.Set; -import java.util.concurrent.TimeUnit; - -import scala.runtime.BoxedUnit; - -import com.google.common.collect.ImmutableMap; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.finagle.Service; -import com.twitter.finagle.SimpleFilter; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchTimer; -import com.twitter.search.common.metrics.SearchTimerStats; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird_root.common.EarlybirdRequestContext; -import com.twitter.search.queryparser.query.Query; -import com.twitter.search.queryparser.query.QueryParserException; -import com.twitter.search.queryparser.query.annotation.Annotation; -import com.twitter.search.queryparser.query.search.SearchOperator; -import com.twitter.search.queryparser.query.search.SearchOperatorConstants; -import 
com.twitter.search.queryparser.visitors.DetectAnnotationVisitor; -import com.twitter.search.queryparser.visitors.DetectVisitor; -import com.twitter.util.Future; - -/** - * For a given query, increments counters if that query has a number of search operators or - * annotations applied to it. Used to detect unusual traffic patterns. - */ -public class QueryOperatorStatFilter - extends SimpleFilter { - private static final Logger LOG = LoggerFactory.getLogger(QueryOperatorStatFilter.class); - - private final SearchCounter numQueryOperatorDetectionErrors = - SearchCounter.export("query_operator_detection_errors"); - - private final SearchCounter numQueryOperatorConsideredRequests = - SearchCounter.export("query_operator_requests_considered"); - - private final ImmutableMap filterOperatorStats; - - // Keeps track of the number of queries with a filter applied, whose type we don't care about. - private final SearchCounter numUnknownFilterOperatorRequests = - SearchCounter.export("query_operator_filter_unknown_requests"); - - private final ImmutableMap includeOperatorStats; - - // Keeps track of the number of queries with an include operator applied, whose type we don't - // know about. - private final SearchCounter numUnknownIncludeOperatorRequests = - SearchCounter.export("query_operator_include_unknown_requests"); - - private final ImmutableMap operatorTypeStats; - - private final SearchCounter numVariantRequests = - SearchCounter.export("query_operator_variant_requests"); - - /** - * Construct this QueryOperatorStatFilter by getting the complete set of possible filters a query - * might have and associating each with a counter. 
- */ - public QueryOperatorStatFilter() { - - ImmutableMap.Builder filterBuilder = new ImmutableMap.Builder<>(); - for (String operand : SearchOperatorConstants.VALID_FILTER_OPERANDS) { - filterBuilder.put( - operand, - SearchTimerStats.export( - "query_operator_filter_" + operand + "_requests", - TimeUnit.MILLISECONDS, - false, - true)); - } - filterOperatorStats = filterBuilder.build(); - - ImmutableMap.Builder includeBuilder = new ImmutableMap.Builder<>(); - for (String operand : SearchOperatorConstants.VALID_INCLUDE_OPERANDS) { - includeBuilder.put( - operand, - SearchTimerStats.export( - "query_operator_include_" + operand + "_requests", - TimeUnit.MILLISECONDS, - false, - true)); - } - includeOperatorStats = includeBuilder.build(); - - ImmutableMap.Builder operatorBuilder = - new ImmutableMap.Builder<>(); - for (SearchOperator.Type operatorType : SearchOperator.Type.values()) { - operatorBuilder.put( - operatorType, - SearchTimerStats.export( - "query_operator_" + operatorType.name().toLowerCase() + "_requests", - TimeUnit.MILLISECONDS, - false, - true - )); - } - operatorTypeStats = operatorBuilder.build(); - } - - @Override - public Future apply( - EarlybirdRequestContext requestContext, - Service service) { - numQueryOperatorConsideredRequests.increment(); - Query parsedQuery = requestContext.getParsedQuery(); - - if (parsedQuery == null) { - return service.apply(requestContext); - } - - SearchTimer timer = new SearchTimer(); - timer.start(); - - return service.apply(requestContext).ensure(() -> { - timer.stop(); - - try { - updateTimersForOperatorsAndOperands(parsedQuery, timer); - updateCountersIfVariantAnnotation(parsedQuery); - } catch (QueryParserException e) { - LOG.warn("Unable to test if query has operators defined", e); - numQueryOperatorDetectionErrors.increment(); - } - return BoxedUnit.UNIT; - }); - } - - /** - * Tracks request stats for operators and operands. - * - * @param parsedQuery the query to check. 
- */ - private void updateTimersForOperatorsAndOperands(Query parsedQuery, SearchTimer timer) - throws QueryParserException { - final DetectVisitor detectVisitor = new DetectVisitor(false, SearchOperator.Type.values()); - parsedQuery.accept(detectVisitor); - - Set detectedOperatorTypes = EnumSet.noneOf(SearchOperator.Type.class); - for (Query query : detectVisitor.getDetectedQueries()) { - // This detectVisitor only matches on SearchOperators. - SearchOperator operator = (SearchOperator) query; - SearchOperator.Type operatorType = operator.getOperatorType(); - detectedOperatorTypes.add(operatorType); - - if (operatorType == SearchOperator.Type.INCLUDE) { - updateOperandStats( - operator, - includeOperatorStats, - timer, - numUnknownIncludeOperatorRequests); - } - if (operatorType == SearchOperator.Type.FILTER) { - updateOperandStats( - operator, - filterOperatorStats, - timer, - numUnknownFilterOperatorRequests); - } - } - - for (SearchOperator.Type type : detectedOperatorTypes) { - operatorTypeStats.get(type).stoppedTimerIncrement(timer); - } - } - - private void updateOperandStats( - SearchOperator operator, - ImmutableMap operandRequestStats, - SearchTimer timer, - SearchCounter unknownOperandStat) { - String operand = operator.getOperand(); - SearchTimerStats stats = operandRequestStats.get(operand); - - if (stats != null) { - stats.stoppedTimerIncrement(timer); - } else { - unknownOperandStat.increment(); - } - } - - private void updateCountersIfVariantAnnotation(Query parsedQuery) throws QueryParserException { - DetectAnnotationVisitor visitor = new DetectAnnotationVisitor(Annotation.Type.VARIANT); - if (parsedQuery.accept(visitor)) { - numVariantRequests.increment(); - } - } -} diff --git a/src/java/com/twitter/search/earlybird_root/filters/QueryTokenizerFilter.docx b/src/java/com/twitter/search/earlybird_root/filters/QueryTokenizerFilter.docx new file mode 100644 index 000000000..1e3528b7d Binary files /dev/null and 
b/src/java/com/twitter/search/earlybird_root/filters/QueryTokenizerFilter.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/filters/QueryTokenizerFilter.java b/src/java/com/twitter/search/earlybird_root/filters/QueryTokenizerFilter.java deleted file mode 100644 index e7c8a2c54..000000000 --- a/src/java/com/twitter/search/earlybird_root/filters/QueryTokenizerFilter.java +++ /dev/null @@ -1,92 +0,0 @@ -package com.twitter.search.earlybird_root.filters; - -import java.util.concurrent.TimeUnit; -import javax.inject.Inject; - -import com.twitter.common_internal.text.version.PenguinVersion; -import com.twitter.common_internal.text.version.PenguinVersionConfig; -import com.twitter.finagle.Service; -import com.twitter.finagle.SimpleFilter; -import com.twitter.finagle.tracing.Trace; -import com.twitter.finagle.tracing.Tracing; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.common.metrics.SearchTimer; -import com.twitter.search.common.metrics.SearchTimerStats; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird_root.common.EarlybirdRequestContext; -import com.twitter.search.earlybird_root.common.QueryParsingUtils; -import com.twitter.search.queryparser.parser.SerializedQueryParser; -import com.twitter.search.queryparser.parser.SerializedQueryParser.TokenizationOption; -import com.twitter.search.queryparser.query.Query; -import com.twitter.search.queryparser.query.QueryParserException; -import com.twitter.util.Duration; -import com.twitter.util.Future; - -public class QueryTokenizerFilter extends SimpleFilter { - private static final String PREFIX = "query_tokenizer_"; - private static final SearchRateCounter SUCCESS_COUNTER = - SearchRateCounter.export(PREFIX + "success"); - private static final SearchRateCounter FAILURE_COUNTER = - SearchRateCounter.export(PREFIX + "error"); - private static final SearchRateCounter SKIPPED_COUNTER = - SearchRateCounter.export(PREFIX + 
"skipped"); - private static final SearchTimerStats QUERY_TOKENIZER_TIME = - SearchTimerStats.export(PREFIX + "time", TimeUnit.MILLISECONDS, false); - - private final TokenizationOption tokenizationOption; - - @Inject - public QueryTokenizerFilter(PenguinVersionConfig penguinversions) { - PenguinVersion[] supportedVersions = penguinversions - .getSupportedVersions().toArray(new PenguinVersion[0]); - tokenizationOption = new TokenizationOption(true, supportedVersions); - } - - @Override - public Future apply( - EarlybirdRequestContext requestContext, - Service service) { - - if (!requestContext.getRequest().isRetokenizeSerializedQuery() - || !requestContext.getRequest().isSetSearchQuery() - || !requestContext.getRequest().getSearchQuery().isSetSerializedQuery()) { - SKIPPED_COUNTER.increment(); - return service.apply(requestContext); - } - - SearchTimer timer = QUERY_TOKENIZER_TIME.startNewTimer(); - try { - String serializedQuery = requestContext.getRequest().getSearchQuery().getSerializedQuery(); - Query parsedQuery = reparseQuery(serializedQuery); - SUCCESS_COUNTER.increment(); - return service.apply(EarlybirdRequestContext.copyRequestContext(requestContext, parsedQuery)); - } catch (QueryParserException e) { - FAILURE_COUNTER.increment(); - return QueryParsingUtils.newClientErrorResponse(requestContext.getRequest(), e); - } finally { - long elapsed = timer.stop(); - QUERY_TOKENIZER_TIME.timerIncrement(elapsed); - Tracing trace = Trace.apply(); - if (trace.isActivelyTracing()) { - trace.record(PREFIX + "time", Duration.fromMilliseconds(elapsed)); - } - } - } - - public Query reparseQuery(String serializedQuery) throws QueryParserException { - SerializedQueryParser parser = new SerializedQueryParser(tokenizationOption); - return parser.parse(serializedQuery); - } - - /** - * Initializing the query parser can take many seconds. We initialize it at warmup so that - * requests don't time out after we join the serverset. 
SEARCH-28801 - */ - public void performExpensiveInitialization() throws QueryParserException { - SerializedQueryParser queryParser = new SerializedQueryParser(tokenizationOption); - - // The Korean query parser takes a few seconds on it's own to initialize. - String koreanQuery = "스포츠"; - queryParser.parse(koreanQuery); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/filters/RealtimeServingRangeProvider.docx b/src/java/com/twitter/search/earlybird_root/filters/RealtimeServingRangeProvider.docx new file mode 100644 index 000000000..f3347a831 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/filters/RealtimeServingRangeProvider.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/filters/RealtimeServingRangeProvider.java b/src/java/com/twitter/search/earlybird_root/filters/RealtimeServingRangeProvider.java deleted file mode 100644 index 856afc2bb..000000000 --- a/src/java/com/twitter/search/earlybird_root/filters/RealtimeServingRangeProvider.java +++ /dev/null @@ -1,60 +0,0 @@ -package com.twitter.search.earlybird_root.filters; - -import java.util.concurrent.TimeUnit; - -import com.twitter.search.common.decider.SearchDecider; -import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser; -import com.twitter.search.earlybird.config.ServingRange; -import com.twitter.search.earlybird_root.common.EarlybirdRequestContext; - -public class RealtimeServingRangeProvider implements ServingRangeProvider { - - private static final int DEFAULT_SERVING_RANGE_BOUNDARY_HOURS_AGO = 240; - - private final SearchDecider decider; - private final String deciderKey; - - public RealtimeServingRangeProvider(SearchDecider decider, String deciderKey) { - this.decider = decider; - this.deciderKey = deciderKey; - } - - @Override - public ServingRange getServingRange( - final EarlybirdRequestContext requestContext, boolean useBoundaryOverride) { - return new ServingRange() { - @Override - public long getServingRangeSinceId() 
{ - long servingRangeStartMillis = TimeUnit.HOURS.toMillis( - (decider.featureExists(deciderKey)) - ? decider.getAvailability(deciderKey) - : DEFAULT_SERVING_RANGE_BOUNDARY_HOURS_AGO); - - long boundaryTime = requestContext.getCreatedTimeMillis() - servingRangeStartMillis; - return SnowflakeIdParser.generateValidStatusId(boundaryTime, 0); - } - - @Override - public long getServingRangeMaxId() { - return SnowflakeIdParser.generateValidStatusId( - requestContext.getCreatedTimeMillis(), 0); - } - - @Override - public long getServingRangeSinceTimeSecondsFromEpoch() { - long servingRangeStartMillis = TimeUnit.HOURS.toMillis( - (decider.featureExists(deciderKey)) - ? decider.getAvailability(deciderKey) - : DEFAULT_SERVING_RANGE_BOUNDARY_HOURS_AGO); - - long boundaryTime = requestContext.getCreatedTimeMillis() - servingRangeStartMillis; - return boundaryTime / 1000; - } - - @Override - public long getServingRangeUntilTimeSecondsFromEpoch() { - return requestContext.getCreatedTimeMillis() / 1000; - } - }; - } -} diff --git a/src/java/com/twitter/search/earlybird_root/filters/RejectRequestsByQuerySourceFilter.docx b/src/java/com/twitter/search/earlybird_root/filters/RejectRequestsByQuerySourceFilter.docx new file mode 100644 index 000000000..f279bd32c Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/filters/RejectRequestsByQuerySourceFilter.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/filters/RejectRequestsByQuerySourceFilter.java b/src/java/com/twitter/search/earlybird_root/filters/RejectRequestsByQuerySourceFilter.java deleted file mode 100644 index fb346c7a1..000000000 --- a/src/java/com/twitter/search/earlybird_root/filters/RejectRequestsByQuerySourceFilter.java +++ /dev/null @@ -1,94 +0,0 @@ -package com.twitter.search.earlybird_root.filters; - -import java.util.HashMap; -import java.util.Map; -import javax.annotation.Nullable; -import javax.inject.Inject; - -import com.google.common.annotations.VisibleForTesting; - -import 
com.twitter.finagle.Service; -import com.twitter.finagle.SimpleFilter; -import com.twitter.search.common.constants.thriftjava.ThriftQuerySource; -import com.twitter.search.common.decider.SearchDecider; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.common.schema.earlybird.EarlybirdCluster; -import com.twitter.search.earlybird.thrift.EarlybirdRequest; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.EarlybirdResponseCode; -import com.twitter.search.earlybird.thrift.ThriftSearchResults; -import com.twitter.util.Future; - -/** - * Rejects requests based on the query source of the request. Intended to be used at super-root - * or archive-root. If used to reject client request at super-root, the client will get a response - * with empty results and a REQUEST_BLOCKED_ERROR status code. If used at archive-root the client - * will get a response which might contain some results from realtime and protected and the status - * code of the response will depend on how super-root combines responses from the three downstream - * roots. - */ -public class RejectRequestsByQuerySourceFilter extends - SimpleFilter { - - @VisibleForTesting - protected static final String NUM_REJECTED_REQUESTS_STAT_NAME_PATTERN = - "num_root_%s_rejected_requests_with_query_source_%s"; - @VisibleForTesting - protected static final String REJECT_REQUESTS_DECIDER_KEY_PATTERN = - "root_%s_reject_requests_with_query_source_%s"; - private final Map rejectedRequestsCounterPerQuerySource = - new HashMap<>(); - private final Map rejectRequestsDeciderKeyPerQuerySource = - new HashMap<>(); - private final SearchDecider searchDecider; - - - @Inject - public RejectRequestsByQuerySourceFilter( - @Nullable EarlybirdCluster cluster, - SearchDecider searchDecider) { - - this.searchDecider = searchDecider; - - String clusterName = cluster != null - ? 
cluster.getNameForStats() - : EarlybirdCluster.SUPERROOT.getNameForStats(); - - for (ThriftQuerySource querySource : ThriftQuerySource.values()) { - String querySourceName = querySource.name().toLowerCase(); - - rejectedRequestsCounterPerQuerySource.put(querySource, - SearchRateCounter.export( - String.format( - NUM_REJECTED_REQUESTS_STAT_NAME_PATTERN, clusterName, querySourceName))); - - rejectRequestsDeciderKeyPerQuerySource.put(querySource, - String.format( - REJECT_REQUESTS_DECIDER_KEY_PATTERN, clusterName, querySourceName)); - } - } - - @Override - public Future apply(EarlybirdRequest request, - Service service) { - - ThriftQuerySource querySource = request.isSetQuerySource() - ? request.getQuerySource() - : ThriftQuerySource.UNKNOWN; - - String deciderKey = rejectRequestsDeciderKeyPerQuerySource.get(querySource); - if (searchDecider.isAvailable(deciderKey)) { - rejectedRequestsCounterPerQuerySource.get(querySource).increment(); - return Future.value(getRejectedRequestResponse(querySource, deciderKey)); - } - return service.apply(request); - } - - private static EarlybirdResponse getRejectedRequestResponse( - ThriftQuerySource querySource, String deciderKey) { - return new EarlybirdResponse(EarlybirdResponseCode.REQUEST_BLOCKED_ERROR, 0) - .setSearchResults(new ThriftSearchResults()) - .setDebugString(String.format( - "Request with query source %s is blocked by decider %s", querySource, deciderKey)); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/filters/RequestContextToEarlybirdRequestFilter.docx b/src/java/com/twitter/search/earlybird_root/filters/RequestContextToEarlybirdRequestFilter.docx new file mode 100644 index 000000000..db9a4a4a4 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/filters/RequestContextToEarlybirdRequestFilter.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/filters/RequestContextToEarlybirdRequestFilter.java 
b/src/java/com/twitter/search/earlybird_root/filters/RequestContextToEarlybirdRequestFilter.java deleted file mode 100644 index 1059b4d30..000000000 --- a/src/java/com/twitter/search/earlybird_root/filters/RequestContextToEarlybirdRequestFilter.java +++ /dev/null @@ -1,33 +0,0 @@ -package com.twitter.search.earlybird_root.filters; - -import java.util.concurrent.TimeUnit; - -import com.twitter.finagle.Filter; -import com.twitter.finagle.Service; -import com.twitter.search.common.metrics.SearchTimerStats; -import com.twitter.search.earlybird.thrift.EarlybirdRequest; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird_root.common.EarlybirdRequestContext; -import com.twitter.util.Future; - -/** - * A filter for transforming a RequestContext to an EarlybirdRequest. - */ -public class RequestContextToEarlybirdRequestFilter extends - Filter { - - private static final SearchTimerStats REQUEST_CONTEXT_TRIP_TIME = - SearchTimerStats.export("request_context_trip_time", TimeUnit.MILLISECONDS, false, - true); - - @Override - public Future apply( - EarlybirdRequestContext requestContext, - Service service) { - - long tripTime = System.currentTimeMillis() - requestContext.getCreatedTimeMillis(); - REQUEST_CONTEXT_TRIP_TIME.timerIncrement(tripTime); - - return service.apply(requestContext.getRequest()); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/filters/RequestResultStatsFilter.docx b/src/java/com/twitter/search/earlybird_root/filters/RequestResultStatsFilter.docx new file mode 100644 index 000000000..6ff1f3ba1 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/filters/RequestResultStatsFilter.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/filters/RequestResultStatsFilter.java b/src/java/com/twitter/search/earlybird_root/filters/RequestResultStatsFilter.java deleted file mode 100644 index 95f0f44b5..000000000 --- 
a/src/java/com/twitter/search/earlybird_root/filters/RequestResultStatsFilter.java +++ /dev/null @@ -1,185 +0,0 @@ -package com.twitter.search.earlybird_root.filters; - -import java.util.List; -import java.util.Map; -import java.util.concurrent.ConcurrentHashMap; -import javax.inject.Inject; - -import scala.runtime.BoxedUnit; - -import com.twitter.common.util.Clock; -import com.twitter.finagle.Service; -import com.twitter.finagle.SimpleFilter; -import com.twitter.search.common.metrics.Percentile; -import com.twitter.search.common.metrics.PercentileUtil; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.query.thriftjava.CollectorParams; -import com.twitter.search.common.query.thriftjava.CollectorTerminationParams; -import com.twitter.search.earlybird.common.ClientIdUtil; -import com.twitter.search.earlybird.thrift.EarlybirdRequest; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.ThriftSearchQuery; -import com.twitter.search.earlybird.thrift.ThriftSearchResult; -import com.twitter.search.earlybird.thrift.ThriftSearchResults; -import com.twitter.snowflake.id.SnowflakeId; -import com.twitter.util.Function; -import com.twitter.util.Future; - -public class RequestResultStatsFilter - extends SimpleFilter { - private final Clock clock; - private final RequestResultStats stats; - - static class RequestResultStats { - private static final String PREFIX = "request_result_properties_"; - - private final SearchCounter resultsRequestedCount; - private final SearchCounter resultsReturnedCount; - private final SearchCounter maxHitsToProcessCount; - private final SearchCounter hitsProcessedCount; - private final SearchCounter docsProcessedCount; - private final SearchCounter timeoutMsCount; - private Map> requestedNumResultsPercentileByClientId; - private Map> returnedNumResultsPercentileByClientId; - private Map> oldestResultPercentileByClientId; - - RequestResultStats() { - // 
Request properties - resultsRequestedCount = SearchCounter.export(PREFIX + "results_requested_cnt"); - maxHitsToProcessCount = SearchCounter.export(PREFIX + "max_hits_to_process_cnt"); - timeoutMsCount = SearchCounter.export(PREFIX + "timeout_ms_cnt"); - requestedNumResultsPercentileByClientId = new ConcurrentHashMap<>(); - - // Result properties - resultsReturnedCount = SearchCounter.export(PREFIX + "results_returned_cnt"); - hitsProcessedCount = SearchCounter.export(PREFIX + "hits_processed_cnt"); - docsProcessedCount = SearchCounter.export(PREFIX + "docs_processed_cnt"); - returnedNumResultsPercentileByClientId = new ConcurrentHashMap<>(); - oldestResultPercentileByClientId = new ConcurrentHashMap<>(); - } - - SearchCounter getResultsRequestedCount() { - return resultsRequestedCount; - } - - SearchCounter getResultsReturnedCount() { - return resultsReturnedCount; - } - - SearchCounter getMaxHitsToProcessCount() { - return maxHitsToProcessCount; - } - - SearchCounter getHitsProcessedCount() { - return hitsProcessedCount; - } - - SearchCounter getDocsProcessedCount() { - return docsProcessedCount; - } - - SearchCounter getTimeoutMsCount() { - return timeoutMsCount; - } - - Percentile getOldestResultPercentile(String clientId) { - return oldestResultPercentileByClientId.computeIfAbsent(clientId, - key -> PercentileUtil.createPercentile(statName(clientId, "oldest_result_age_seconds"))); - } - - Percentile getRequestedNumResultsPercentile(String clientId) { - return requestedNumResultsPercentileByClientId.computeIfAbsent(clientId, - key -> PercentileUtil.createPercentile(statName(clientId, "requested_num_results"))); - } - - Percentile getReturnedNumResultsPercentile(String clientId) { - return returnedNumResultsPercentileByClientId.computeIfAbsent(clientId, - key -> PercentileUtil.createPercentile(statName(clientId, "returned_num_results"))); - } - - private String statName(String clientId, String suffix) { - return String.format("%s%s_%s", PREFIX, 
ClientIdUtil.formatClientId(clientId), suffix); - } - } - - @Inject - RequestResultStatsFilter(Clock clock, RequestResultStats stats) { - this.clock = clock; - this.stats = stats; - } - - private void updateRequestStats(EarlybirdRequest request) { - ThriftSearchQuery searchQuery = request.getSearchQuery(); - CollectorParams collectorParams = searchQuery.getCollectorParams(); - - if (collectorParams != null) { - stats.getResultsRequestedCount().add(collectorParams.numResultsToReturn); - if (request.isSetClientId()) { - stats.getRequestedNumResultsPercentile(request.getClientId()) - .record(collectorParams.numResultsToReturn); - } - CollectorTerminationParams terminationParams = collectorParams.getTerminationParams(); - if (terminationParams != null) { - if (terminationParams.isSetMaxHitsToProcess()) { - stats.getMaxHitsToProcessCount().add(terminationParams.maxHitsToProcess); - } - if (terminationParams.isSetTimeoutMs()) { - stats.getTimeoutMsCount().add(terminationParams.timeoutMs); - } - } - } else { - if (searchQuery.isSetNumResults()) { - stats.getResultsRequestedCount().add(searchQuery.numResults); - if (request.isSetClientId()) { - stats.getRequestedNumResultsPercentile(request.getClientId()) - .record(searchQuery.numResults); - } - } - if (searchQuery.isSetMaxHitsToProcess()) { - stats.getMaxHitsToProcessCount().add(searchQuery.maxHitsToProcess); - } - if (request.isSetTimeoutMs()) { - stats.getTimeoutMsCount().add(request.timeoutMs); - } - } - } - - private void updateResultsStats(String clientId, ThriftSearchResults results) { - stats.getResultsReturnedCount().add(results.getResultsSize()); - if (results.isSetNumHitsProcessed()) { - stats.getHitsProcessedCount().add(results.numHitsProcessed); - } - - if (clientId != null) { - if (results.getResultsSize() > 0) { - List resultsList = results.getResults(); - - long lastId = resultsList.get(resultsList.size() - 1).getId(); - long tweetTime = SnowflakeId.timeFromId(lastId).inLongSeconds(); - long tweetAge = 
(clock.nowMillis() / 1000) - tweetTime; - stats.getOldestResultPercentile(clientId).record(tweetAge); - } - - stats.getReturnedNumResultsPercentile(clientId).record(results.getResultsSize()); - } - } - - @Override - public Future apply( - EarlybirdRequest request, - Service service) { - - updateRequestStats(request); - - return service.apply(request).onSuccess( - new Function() { - @Override - public BoxedUnit apply(EarlybirdResponse response) { - if (response.isSetSearchResults()) { - updateResultsStats(request.getClientId(), response.searchResults); - } - return BoxedUnit.UNIT; - } - }); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/filters/RequestSuccessStatsFilter.docx b/src/java/com/twitter/search/earlybird_root/filters/RequestSuccessStatsFilter.docx new file mode 100644 index 000000000..e0218ea23 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/filters/RequestSuccessStatsFilter.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/filters/RequestSuccessStatsFilter.java b/src/java/com/twitter/search/earlybird_root/filters/RequestSuccessStatsFilter.java deleted file mode 100644 index 7a942d05a..000000000 --- a/src/java/com/twitter/search/earlybird_root/filters/RequestSuccessStatsFilter.java +++ /dev/null @@ -1,79 +0,0 @@ -package com.twitter.search.earlybird_root.filters; - -import java.util.concurrent.TimeUnit; -import javax.inject.Inject; - -import com.twitter.finagle.Service; -import com.twitter.finagle.SimpleFilter; -import com.twitter.search.common.root.RequestSuccessStats; -import com.twitter.search.common.util.FinagleUtil; -import com.twitter.search.earlybird.thrift.EarlybirdRequest; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.EarlybirdResponseCode; -import com.twitter.util.Future; -import com.twitter.util.FutureEventListener; - -import static com.twitter.search.common.util.earlybird.EarlybirdResponseUtil.responseConsideredFailed; - - -/** 
- * Records cancellations, timeouts, and failures for requests that do not go through - * ScatterGatherService (which also updates these stats, but for different requests). - */ -public class RequestSuccessStatsFilter - extends SimpleFilter { - - private final RequestSuccessStats stats; - - @Inject - RequestSuccessStatsFilter(RequestSuccessStats stats) { - this.stats = stats; - } - - - @Override - public Future apply( - EarlybirdRequest request, - Service service) { - - final long startTime = System.nanoTime(); - - return service.apply(request).addEventListener( - new FutureEventListener() { - @Override - public void onSuccess(EarlybirdResponse response) { - boolean success = true; - - if (response.getResponseCode() == EarlybirdResponseCode.CLIENT_CANCEL_ERROR) { - success = false; - stats.getCancelledRequestCount().increment(); - } else if (response.getResponseCode() == EarlybirdResponseCode.SERVER_TIMEOUT_ERROR) { - success = false; - stats.getTimedoutRequestCount().increment(); - } else if (responseConsideredFailed(response.getResponseCode())) { - success = false; - stats.getErroredRequestCount().increment(); - } - - long latencyNanos = System.nanoTime() - startTime; - stats.getRequestLatencyStats().requestComplete( - TimeUnit.NANOSECONDS.toMillis(latencyNanos), 0, success); - } - - @Override - public void onFailure(Throwable cause) { - long latencyNanos = System.nanoTime() - startTime; - stats.getRequestLatencyStats().requestComplete( - TimeUnit.NANOSECONDS.toMillis(latencyNanos), 0, false); - - if (FinagleUtil.isCancelException(cause)) { - stats.getCancelledRequestCount().increment(); - } else if (FinagleUtil.isTimeoutException(cause)) { - stats.getTimedoutRequestCount().increment(); - } else { - stats.getErroredRequestCount().increment(); - } - } - }); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/filters/RequestTypeCountFilter.docx b/src/java/com/twitter/search/earlybird_root/filters/RequestTypeCountFilter.docx new file mode 100644 index 
000000000..e5fe491d1 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/filters/RequestTypeCountFilter.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/filters/RequestTypeCountFilter.java b/src/java/com/twitter/search/earlybird_root/filters/RequestTypeCountFilter.java deleted file mode 100644 index 477a74ed4..000000000 --- a/src/java/com/twitter/search/earlybird_root/filters/RequestTypeCountFilter.java +++ /dev/null @@ -1,105 +0,0 @@ -package com.twitter.search.earlybird_root.filters; - -import com.google.common.base.Preconditions; -import com.google.common.cache.CacheBuilder; -import com.google.common.cache.CacheLoader; -import com.google.common.cache.LoadingCache; -import com.google.common.collect.ImmutableMap; - -import com.twitter.common.util.Clock; -import com.twitter.finagle.Service; -import com.twitter.finagle.SimpleFilter; -import com.twitter.search.common.clientstats.RequestCounters; -import com.twitter.search.common.clientstats.RequestCountersEventListener; -import com.twitter.search.common.util.FinagleUtil; -import com.twitter.search.earlybird.common.ClientIdUtil; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird_root.common.EarlybirdRequestContext; -import com.twitter.search.earlybird_root.common.EarlybirdRequestType; -import com.twitter.util.Future; - -public class RequestTypeCountFilter - extends SimpleFilter { - private final ImmutableMap typeCounters; - private final RequestCounters allRequestTypesCounter; - private final ImmutableMap> - perTypePerClientCounters; - - /** - * Constructs the filter. 
- */ - public RequestTypeCountFilter(final String statSuffix) { - ImmutableMap.Builder perTypeBuilder = - ImmutableMap.builder(); - for (EarlybirdRequestType type : EarlybirdRequestType.values()) { - perTypeBuilder.put(type, new RequestCounters( - "request_type_count_filter_" + type.getNormalizedName() + "_" + statSuffix)); - } - typeCounters = perTypeBuilder.build(); - - allRequestTypesCounter = - new RequestCounters("request_type_count_filter_all_" + statSuffix, true); - - ImmutableMap.Builder> - perTypePerClientBuilder = ImmutableMap.builder(); - - // No point in setting any kind of expiration policy for the cache, since the stats will - // continue to be exported, so the objects will not be GCed anyway. - CacheBuilder cacheBuilder = CacheBuilder.newBuilder(); - for (final EarlybirdRequestType requestType : EarlybirdRequestType.values()) { - CacheLoader cacheLoader = - new CacheLoader() { - @Override - public RequestCounters load(String clientId) { - return new RequestCounters("request_type_count_filter_for_" + clientId + "_" - + requestType.getNormalizedName() + "_" + statSuffix); - } - }; - perTypePerClientBuilder.put(requestType, cacheBuilder.build(cacheLoader)); - } - perTypePerClientCounters = perTypePerClientBuilder.build(); - } - - @Override - public Future apply( - EarlybirdRequestContext requestContext, - Service service) { - EarlybirdRequestType requestType = requestContext.getEarlybirdRequestType(); - RequestCounters requestCounters = typeCounters.get(requestType); - Preconditions.checkNotNull(requestCounters); - - // Update the per-type and "all" counters. 
- RequestCountersEventListener requestCountersEventListener = - new RequestCountersEventListener<>( - requestCounters, Clock.SYSTEM_CLOCK, EarlybirdSuccessfulResponseHandler.INSTANCE); - RequestCountersEventListener allRequestTypesEventListener = - new RequestCountersEventListener<>( - allRequestTypesCounter, Clock.SYSTEM_CLOCK, - EarlybirdSuccessfulResponseHandler.INSTANCE); - - RequestCountersEventListener perTypePerClientEventListener = - updatePerTypePerClientCountersListener(requestContext); - - return service.apply(requestContext) - .addEventListener(requestCountersEventListener) - .addEventListener(allRequestTypesEventListener) - .addEventListener(perTypePerClientEventListener); - } - - private RequestCountersEventListener updatePerTypePerClientCountersListener( - EarlybirdRequestContext earlybirdRequestContext) { - EarlybirdRequestType requestType = earlybirdRequestContext.getEarlybirdRequestType(); - LoadingCache perClientCounters = - perTypePerClientCounters.get(requestType); - Preconditions.checkNotNull(perClientCounters); - - String clientId = ClientIdUtil.formatFinagleClientIdAndClientId( - FinagleUtil.getFinagleClientName(), - ClientIdUtil.getClientIdFromRequest(earlybirdRequestContext.getRequest())); - RequestCounters clientCounters = perClientCounters.getUnchecked(clientId); - Preconditions.checkNotNull(clientCounters); - - return new RequestCountersEventListener<>( - clientCounters, Clock.SYSTEM_CLOCK, EarlybirdSuccessfulResponseHandler.INSTANCE); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/filters/ResponseCodeStatFilter.docx b/src/java/com/twitter/search/earlybird_root/filters/ResponseCodeStatFilter.docx new file mode 100644 index 000000000..d8e12e43e Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/filters/ResponseCodeStatFilter.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/filters/ResponseCodeStatFilter.java 
b/src/java/com/twitter/search/earlybird_root/filters/ResponseCodeStatFilter.java deleted file mode 100644 index 50fa78299..000000000 --- a/src/java/com/twitter/search/earlybird_root/filters/ResponseCodeStatFilter.java +++ /dev/null @@ -1,50 +0,0 @@ -package com.twitter.search.earlybird_root.filters; - -import java.util.Map; - -import com.google.common.collect.Maps; - -import com.twitter.finagle.Service; -import com.twitter.finagle.SimpleFilter; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.earlybird.thrift.EarlybirdRequest; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.EarlybirdResponseCode; -import com.twitter.util.Future; -import com.twitter.util.FutureEventListener; - -public class ResponseCodeStatFilter - extends SimpleFilter { - - private final Map responseCodeCounters; - - /** - * Create ResponseCodeStatFilter - */ - public ResponseCodeStatFilter() { - responseCodeCounters = Maps.newEnumMap(EarlybirdResponseCode.class); - for (EarlybirdResponseCode code : EarlybirdResponseCode.values()) { - SearchCounter stat = SearchCounter.export("response_code_" + code.name().toLowerCase()); - responseCodeCounters.put(code, stat); - } - } - - @Override - public Future apply( - final EarlybirdRequest request, - final Service service) { - - return service.apply(request).addEventListener( - new FutureEventListener() { - - @Override - public void onSuccess(final EarlybirdResponse response) { - responseCodeCounters.get(response.getResponseCode()).increment(); - } - - @Override - public void onFailure(final Throwable cause) { } - }); - - } -} diff --git a/src/java/com/twitter/search/earlybird_root/filters/ResultTierCountFilter.docx b/src/java/com/twitter/search/earlybird_root/filters/ResultTierCountFilter.docx new file mode 100644 index 000000000..81e14458b Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/filters/ResultTierCountFilter.docx differ diff --git 
a/src/java/com/twitter/search/earlybird_root/filters/ResultTierCountFilter.java b/src/java/com/twitter/search/earlybird_root/filters/ResultTierCountFilter.java deleted file mode 100644 index 088ab07e7..000000000 --- a/src/java/com/twitter/search/earlybird_root/filters/ResultTierCountFilter.java +++ /dev/null @@ -1,114 +0,0 @@ -package com.twitter.search.earlybird_root.filters; - -import java.util.Collection; -import java.util.Collections; -import java.util.Comparator; -import java.util.List; -import java.util.NavigableMap; - -import javax.inject.Inject; -import javax.inject.Singleton; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.collect.ImmutableSortedMap; - -import com.twitter.finagle.Service; -import com.twitter.finagle.SimpleFilter; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchCustomGauge; -import com.twitter.search.earlybird.config.TierInfo; -import com.twitter.search.earlybird.config.TierInfoSource; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.ThriftSearchResult; -import com.twitter.search.earlybird_root.common.EarlybirdRequestContext; -import com.twitter.snowflake.id.SnowflakeId; -import com.twitter.util.Future; -import com.twitter.util.FutureEventListener; - -/** - * A filter to count the tier to which the oldest tweet in the results belong. 
- */ -@Singleton -public class ResultTierCountFilter - extends SimpleFilter { - - private static final String COUNTER_PREFIX = "result_tier_count"; - private final long firstTweetTimeSinceEpochSec; - private final NavigableMap tierBuckets; - private final SearchCounter allCounter = SearchCounter.export(COUNTER_PREFIX + "_all"); - private final SearchCounter noResultsCounter = - SearchCounter.export(COUNTER_PREFIX + "_no_results"); - - @Inject - @SuppressWarnings("unused") - ResultTierCountFilter(TierInfoSource tierInfoSource) { - List tierInfos = tierInfoSource.getTierInformation(); - tierInfos.sort(Comparator.comparing(TierInfo::getDataStartDate)); - - firstTweetTimeSinceEpochSec = tierInfos.get(0).getServingRangeSinceTimeSecondsFromEpoch(); - - ImmutableSortedMap.Builder builder = ImmutableSortedMap.naturalOrder(); - Collections.reverse(tierInfos); - - for (TierInfo tierInfo : tierInfos) { - SearchCounter searchCounter = SearchCounter.export( - String.format("%s_%s", COUNTER_PREFIX, tierInfo.getTierName())); - builder.put(tierInfo.getServingRangeSinceTimeSecondsFromEpoch(), searchCounter); - - // export cumulative metrics to sum from the latest to a lower tier - Collection counters = builder.build().values(); - SearchCustomGauge.export( - String.format("%s_down_to_%s", COUNTER_PREFIX, tierInfo.getTierName()), - () -> counters.stream() - .mapToLong(SearchCounter::get) - .sum()); - } - - tierBuckets = builder.build(); - } - - @Override - public Future apply( - EarlybirdRequestContext context, - Service service) { - return service.apply(context).addEventListener( - new FutureEventListener() { - @Override - public void onFailure(Throwable cause) { - // do nothing - } - - @Override - public void onSuccess(EarlybirdResponse response) { - record(response); - } - }); - } - - @VisibleForTesting - void record(EarlybirdResponse response) { - if (response.isSetSearchResults()) { - long minResultsStatusId = response.getSearchResults().getResults().stream() - 
.mapToLong(ThriftSearchResult::getId) - .min() - .orElse(-1); - getBucket(minResultsStatusId).increment(); - } - allCounter.increment(); - } - - private SearchCounter getBucket(long statusId) { - if (statusId < 0) { - return noResultsCounter; - } - - // If non-negative statusId is not a SnowflakeId, the tweet must have been created before - // Twepoch (2010-11-04T01:42:54Z) and thus belongs to full1. - long timeSinceEpochSec = firstTweetTimeSinceEpochSec; - if (SnowflakeId.isSnowflakeId(statusId)) { - timeSinceEpochSec = SnowflakeId.timeFromId(statusId).inSeconds(); - } - - return tierBuckets.floorEntry(timeSinceEpochSec).getValue(); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/filters/ScatterGatherWithExperimentRedirectsService.docx b/src/java/com/twitter/search/earlybird_root/filters/ScatterGatherWithExperimentRedirectsService.docx new file mode 100644 index 000000000..c425fb7b1 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/filters/ScatterGatherWithExperimentRedirectsService.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/filters/ScatterGatherWithExperimentRedirectsService.java b/src/java/com/twitter/search/earlybird_root/filters/ScatterGatherWithExperimentRedirectsService.java deleted file mode 100644 index 179aa259e..000000000 --- a/src/java/com/twitter/search/earlybird_root/filters/ScatterGatherWithExperimentRedirectsService.java +++ /dev/null @@ -1,59 +0,0 @@ -package com.twitter.search.earlybird_root.filters; - -import java.util.Map; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.finagle.Service; -import com.twitter.search.common.root.ScatterGatherService; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.EarlybirdResponseCode; -import com.twitter.search.earlybird.thrift.ExperimentCluster; -import com.twitter.search.earlybird_root.common.EarlybirdRequestContext; -import com.twitter.util.Future; - -public 
class ScatterGatherWithExperimentRedirectsService - extends Service { - private final Service - controlScatterGatherService; - - private final Map> - experimentScatterGatherServices; - - private static final Logger LOG = - LoggerFactory.getLogger(ScatterGatherWithExperimentRedirectsService.class); - - public ScatterGatherWithExperimentRedirectsService( - Service controlScatterGatherService, - Map> - experimentScatterGatherServices - ) { - this.controlScatterGatherService = controlScatterGatherService; - this.experimentScatterGatherServices = experimentScatterGatherServices; - } - - @Override - public Future apply(EarlybirdRequestContext request) { - if (request.getRequest().isSetExperimentClusterToUse()) { - ExperimentCluster cluster = request.getRequest().getExperimentClusterToUse(); - - if (!experimentScatterGatherServices.containsKey(cluster)) { - String error = String.format( - "Received invalid experiment cluster: %s", cluster.name()); - - LOG.error("{} Request: {}", error, request.getRequest()); - - return Future.value(new EarlybirdResponse() - .setResponseCode(EarlybirdResponseCode.CLIENT_ERROR) - .setDebugString(error)); - } - - return experimentScatterGatherServices.get(cluster).apply(request); - } - - return controlScatterGatherService.apply(request); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/filters/SearchPayloadSizeLocalContextFilter.docx b/src/java/com/twitter/search/earlybird_root/filters/SearchPayloadSizeLocalContextFilter.docx new file mode 100644 index 000000000..921ea2fb5 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/filters/SearchPayloadSizeLocalContextFilter.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/filters/SearchPayloadSizeLocalContextFilter.java b/src/java/com/twitter/search/earlybird_root/filters/SearchPayloadSizeLocalContextFilter.java deleted file mode 100644 index 0ce99bd42..000000000 --- 
a/src/java/com/twitter/search/earlybird_root/filters/SearchPayloadSizeLocalContextFilter.java +++ /dev/null @@ -1,43 +0,0 @@ -package com.twitter.search.earlybird_root.filters; - -import java.util.concurrent.atomic.AtomicReference; - -import scala.Option; - -import com.google.common.base.Preconditions; - -import com.twitter.finagle.Service; -import com.twitter.finagle.SimpleFilter; -import com.twitter.finagle.context.Contexts; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.root.SearchPayloadSizeFilter; -import com.twitter.search.earlybird.thrift.EarlybirdRequest; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.util.Future; - -/** - * A filter that sets the clientId in the local context, to be usd later by SearchPayloadSizeFilter. - */ -public class SearchPayloadSizeLocalContextFilter - extends SimpleFilter { - private static final SearchCounter CLIENT_ID_CONTEXT_KEY_NOT_SET_COUNTER = SearchCounter.export( - "search_payload_size_local_context_filter_client_id_context_key_not_set"); - - @Override - public Future apply(EarlybirdRequest request, - Service service) { - // In production, the SearchPayloadSizeFilter.CLIENT_ID_CONTEXT_KEY should always be set - // (by ThriftServer). However, it's not set in tests, because tests do not start a ThriftServer. 
- Option> clientIdOption = - Contexts.local().get(SearchPayloadSizeFilter.CLIENT_ID_CONTEXT_KEY); - if (clientIdOption.isDefined()) { - AtomicReference clientIdReference = clientIdOption.get(); - Preconditions.checkArgument(clientIdReference.get() == null); - clientIdReference.set(request.getClientId()); - } else { - CLIENT_ID_CONTEXT_KEY_NOT_SET_COUNTER.increment(); - } - - return service.apply(request); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/filters/SensitiveResultsTrackingFilter.docx b/src/java/com/twitter/search/earlybird_root/filters/SensitiveResultsTrackingFilter.docx new file mode 100644 index 000000000..3f2694b69 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/filters/SensitiveResultsTrackingFilter.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/filters/SensitiveResultsTrackingFilter.java b/src/java/com/twitter/search/earlybird_root/filters/SensitiveResultsTrackingFilter.java deleted file mode 100644 index 082e52dde..000000000 --- a/src/java/com/twitter/search/earlybird_root/filters/SensitiveResultsTrackingFilter.java +++ /dev/null @@ -1,140 +0,0 @@ -package com.twitter.search.earlybird_root.filters; - -import java.util.Set; - -import com.google.common.base.Joiner; - -import org.apache.thrift.TException; -import org.slf4j.Logger; - -import com.twitter.finagle.Service; -import com.twitter.finagle.SimpleFilter; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.util.thrift.ThriftUtils; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.EarlybirdResponseCode; -import com.twitter.search.earlybird_root.common.EarlybirdRequestContext; -import com.twitter.util.Future; -import com.twitter.util.FutureEventListener; - -/** - * The general framework for earlybird root to track sensitive results. 
- */ -public abstract class SensitiveResultsTrackingFilter - extends SimpleFilter { - - /** - * The type name is used to distinguish different kinds of sensitive results in log. - */ - private final String typeName; - - /** - * The mark is to control whether to log expensive information. - */ - private final boolean logDetails; - - /** - * Constructor helps distinguish different sensitive content trackers. - * @param typeName The sensitive content's name (e.g. nullcast) - * @param logDetails Whether to log details such as serialized requests and responses - */ - public SensitiveResultsTrackingFilter(final String typeName, boolean logDetails) { - super(); - this.typeName = typeName; - this.logDetails = logDetails; - } - - /** - * Get the LOG that the sensitive results can write to. - */ - protected abstract Logger getLogger(); - - /** - * The counter which counts the number of queries with sensitive results. - */ - protected abstract SearchCounter getSensitiveQueryCounter(); - - /** - * The counter which counts the number of sensitive results. - */ - protected abstract SearchCounter getSensitiveResultsCounter(); - - /** - * The method defines how the sensitive results are identified. - */ - protected abstract Set getSensitiveResults( - EarlybirdRequestContext requestContext, - EarlybirdResponse earlybirdResponse) throws Exception; - - /** - * Get a set of tweets which should be exclude from the sensitive results set. 
- */ - protected abstract Set getExceptedResults(EarlybirdRequestContext requestContext); - - @Override - public final Future apply( - final EarlybirdRequestContext requestContext, - Service service) { - Future response = service.apply(requestContext); - - response.addEventListener(new FutureEventListener() { - @Override - public void onSuccess(EarlybirdResponse earlybirdResponse) { - try { - if (earlybirdResponse.responseCode == EarlybirdResponseCode.SUCCESS - && earlybirdResponse.isSetSearchResults() - && requestContext.getParsedQuery() != null) { - Set statusIds = getSensitiveResults(requestContext, earlybirdResponse); - Set exceptedIds = getExceptedResults(requestContext); - statusIds.removeAll(exceptedIds); - - if (statusIds.size() > 0) { - getSensitiveQueryCounter().increment(); - getSensitiveResultsCounter().add(statusIds.size()); - logContent(requestContext, earlybirdResponse, statusIds); - } - } - } catch (Exception e) { - getLogger().error("Caught exception while trying to log sensitive results for query: {}", - requestContext.getParsedQuery().serialize(), e); - } - } - - @Override - public void onFailure(Throwable cause) { - } - }); - - return response; - } - - private void logContent( - final EarlybirdRequestContext requestContext, - final EarlybirdResponse earlybirdResponse, - final Set statusIds) { - - if (logDetails) { - String base64Request; - try { - base64Request = ThriftUtils.toBase64EncodedString(requestContext.getRequest()); - } catch (TException e) { - base64Request = "Failed to parse base 64 request"; - } - getLogger().error("Found " + typeName - + ": {} | " - + "parsedQuery: {} | " - + "request: {} | " - + "base 64 request: {} | " - + "response: {}", - Joiner.on(",").join(statusIds), - requestContext.getParsedQuery().serialize(), - requestContext.getRequest(), - base64Request, - earlybirdResponse); - } else { - getLogger().error("Found " + typeName + ": {} for parsedQuery {}", - Joiner.on(",").join(statusIds), - 
requestContext.getParsedQuery().serialize()); - } - } -} diff --git a/src/java/com/twitter/search/earlybird_root/filters/ServiceExceptionHandlingFilter.docx b/src/java/com/twitter/search/earlybird_root/filters/ServiceExceptionHandlingFilter.docx new file mode 100644 index 000000000..292f0f5a5 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/filters/ServiceExceptionHandlingFilter.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/filters/ServiceExceptionHandlingFilter.java b/src/java/com/twitter/search/earlybird_root/filters/ServiceExceptionHandlingFilter.java deleted file mode 100644 index 4594aa289..000000000 --- a/src/java/com/twitter/search/earlybird_root/filters/ServiceExceptionHandlingFilter.java +++ /dev/null @@ -1,27 +0,0 @@ -package com.twitter.search.earlybird_root.filters; - -import com.twitter.finagle.Service; -import com.twitter.finagle.SimpleFilter; -import com.twitter.search.common.schema.earlybird.EarlybirdCluster; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird_root.common.EarlybirdRequestContext; -import com.twitter.util.Future; - -/** A per-service filter for handling exceptions. */ -public class ServiceExceptionHandlingFilter - extends SimpleFilter { - private final EarlybirdResponseExceptionHandler exceptionHandler; - - /** Creates a new ServiceExceptionHandlingFilter instance. 
*/ - public ServiceExceptionHandlingFilter(EarlybirdCluster cluster) { - this.exceptionHandler = new EarlybirdResponseExceptionHandler(cluster.getNameForStats()); - } - - @Override - public Future apply( - EarlybirdRequestContext requestContext, - Service service) { - return exceptionHandler.handleException( - requestContext.getRequest(), service.apply(requestContext)); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/filters/ServiceResponseValidationFilter.docx b/src/java/com/twitter/search/earlybird_root/filters/ServiceResponseValidationFilter.docx new file mode 100644 index 000000000..85ba3015e Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/filters/ServiceResponseValidationFilter.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/filters/ServiceResponseValidationFilter.java b/src/java/com/twitter/search/earlybird_root/filters/ServiceResponseValidationFilter.java deleted file mode 100644 index 2464be534..000000000 --- a/src/java/com/twitter/search/earlybird_root/filters/ServiceResponseValidationFilter.java +++ /dev/null @@ -1,81 +0,0 @@ -package com.twitter.search.earlybird_root.filters; - -import java.util.HashMap; -import java.util.Map; - -import com.twitter.finagle.Service; -import com.twitter.finagle.SimpleFilter; -import com.twitter.search.common.schema.earlybird.EarlybirdCluster; -import com.twitter.search.common.util.earlybird.EarlybirdResponseMergeUtil; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.EarlybirdResponseCode; -import com.twitter.search.earlybird_root.common.EarlybirdRequestContext; -import com.twitter.search.earlybird_root.common.EarlybirdRequestType; -import com.twitter.search.earlybird_root.validators.FacetsResponseValidator; -import com.twitter.search.earlybird_root.validators.PassThroughResponseValidator; -import com.twitter.search.earlybird_root.validators.ServiceResponseValidator; -import 
com.twitter.search.earlybird_root.validators.TermStatsResultsValidator; -import com.twitter.search.earlybird_root.validators.TopTweetsResultsValidator; -import com.twitter.util.Function; -import com.twitter.util.Future; - -/** - * Filter responsible for handling invalid response returned by downstream services, and - * translating them into EarlybirdResponseExceptions. - */ -public class ServiceResponseValidationFilter - extends SimpleFilter { - - private final Map> - requestTypeToResponseValidators = new HashMap<>(); - private final EarlybirdCluster cluster; - - /** - * Creates a new filter for handling invalid response - */ - public ServiceResponseValidationFilter(EarlybirdCluster cluster) { - this.cluster = cluster; - - ServiceResponseValidator passThroughValidator = - new PassThroughResponseValidator(); - - requestTypeToResponseValidators - .put(EarlybirdRequestType.FACETS, new FacetsResponseValidator(cluster)); - requestTypeToResponseValidators - .put(EarlybirdRequestType.RECENCY, passThroughValidator); - requestTypeToResponseValidators - .put(EarlybirdRequestType.RELEVANCE, passThroughValidator); - requestTypeToResponseValidators - .put(EarlybirdRequestType.STRICT_RECENCY, passThroughValidator); - requestTypeToResponseValidators - .put(EarlybirdRequestType.TERM_STATS, new TermStatsResultsValidator(cluster)); - requestTypeToResponseValidators - .put(EarlybirdRequestType.TOP_TWEETS, new TopTweetsResultsValidator(cluster)); - } - - @Override - public Future apply( - final EarlybirdRequestContext requestContext, - Service service) { - return service.apply(requestContext).flatMap( - new Function>() { - @Override - public Future apply(EarlybirdResponse response) { - if (response == null) { - return Future.exception(new IllegalStateException( - cluster + " returned null response")); - } - - if (response.getResponseCode() == EarlybirdResponseCode.SUCCESS) { - return requestTypeToResponseValidators - .get(requestContext.getEarlybirdRequestType()) - 
.validate(response); - } - - return Future.value(EarlybirdResponseMergeUtil.transformInvalidResponse( - response, - String.format("Failure from %s (%s)", cluster, response.getResponseCode()))); - } - }); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/filters/ServingRangeProvider.docx b/src/java/com/twitter/search/earlybird_root/filters/ServingRangeProvider.docx new file mode 100644 index 000000000..8be3a915a Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/filters/ServingRangeProvider.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/filters/ServingRangeProvider.java b/src/java/com/twitter/search/earlybird_root/filters/ServingRangeProvider.java deleted file mode 100644 index fb26bd2d7..000000000 --- a/src/java/com/twitter/search/earlybird_root/filters/ServingRangeProvider.java +++ /dev/null @@ -1,12 +0,0 @@ -package com.twitter.search.earlybird_root.filters; - -import com.twitter.search.earlybird.config.ServingRange; -import com.twitter.search.earlybird_root.common.EarlybirdRequestContext; - -public interface ServingRangeProvider { - /** - * Get a ServingRange implementation. - * Usually backed by either TierInfoWrapper or RootClusterBoundaryInfo. 
- */ - ServingRange getServingRange(EarlybirdRequestContext requestContext, boolean useBoundaryOverride); -} diff --git a/src/java/com/twitter/search/earlybird_root/filters/StratoAttributionClientIdFilter.docx b/src/java/com/twitter/search/earlybird_root/filters/StratoAttributionClientIdFilter.docx new file mode 100644 index 000000000..7d2a3769d Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/filters/StratoAttributionClientIdFilter.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/filters/StratoAttributionClientIdFilter.java b/src/java/com/twitter/search/earlybird_root/filters/StratoAttributionClientIdFilter.java deleted file mode 100644 index aff0c44e1..000000000 --- a/src/java/com/twitter/search/earlybird_root/filters/StratoAttributionClientIdFilter.java +++ /dev/null @@ -1,30 +0,0 @@ -package com.twitter.search.earlybird_root.filters; - -import com.twitter.finagle.Service; -import com.twitter.finagle.SimpleFilter; -import com.twitter.search.earlybird.common.ClientIdUtil; -import com.twitter.search.earlybird.thrift.EarlybirdRequest; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.util.Future; - -/** - * A filter that will set the clientId of the request to the strato HttpEndpoint Attribution. - *

- * If the clientId is already set to something non-null then that value is used. - * If the clientId is null but Attribution.httpEndpoint() contains a value it will be set as - * the clientId. - */ -public class StratoAttributionClientIdFilter extends - SimpleFilter { - @Override - public Future apply( - EarlybirdRequest request, Service service - ) { - if (request.getClientId() == null) { - ClientIdUtil.getClientIdFromHttpEndpointAttribution().ifPresent(request::setClientId); - } - - return service.apply(request); - } -} - diff --git a/src/java/com/twitter/search/earlybird_root/filters/TopLevelExceptionHandlingFilter.docx b/src/java/com/twitter/search/earlybird_root/filters/TopLevelExceptionHandlingFilter.docx new file mode 100644 index 000000000..f68f6877f Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/filters/TopLevelExceptionHandlingFilter.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/filters/TopLevelExceptionHandlingFilter.java b/src/java/com/twitter/search/earlybird_root/filters/TopLevelExceptionHandlingFilter.java deleted file mode 100644 index f3db830fd..000000000 --- a/src/java/com/twitter/search/earlybird_root/filters/TopLevelExceptionHandlingFilter.java +++ /dev/null @@ -1,24 +0,0 @@ -package com.twitter.search.earlybird_root.filters; - -import com.twitter.finagle.Service; -import com.twitter.finagle.SimpleFilter; -import com.twitter.search.earlybird.thrift.EarlybirdRequest; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.util.Future; - -/** A top level filter for handling exceptions. */ -public class TopLevelExceptionHandlingFilter - extends SimpleFilter { - private final EarlybirdResponseExceptionHandler exceptionHandler; - - /** Creates a new TopLevelExceptionHandlingFilter instance. 
*/ - public TopLevelExceptionHandlingFilter() { - this.exceptionHandler = new EarlybirdResponseExceptionHandler("top_level"); - } - - @Override - public Future apply(EarlybirdRequest request, - Service service) { - return exceptionHandler.handleException(request, service.apply(request)); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/filters/UnsetSuperRootFieldsFilter.docx b/src/java/com/twitter/search/earlybird_root/filters/UnsetSuperRootFieldsFilter.docx new file mode 100644 index 000000000..af8b36afb Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/filters/UnsetSuperRootFieldsFilter.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/filters/UnsetSuperRootFieldsFilter.java b/src/java/com/twitter/search/earlybird_root/filters/UnsetSuperRootFieldsFilter.java deleted file mode 100644 index a3f24b7b2..000000000 --- a/src/java/com/twitter/search/earlybird_root/filters/UnsetSuperRootFieldsFilter.java +++ /dev/null @@ -1,30 +0,0 @@ -package com.twitter.search.earlybird_root.filters; - -import com.twitter.finagle.Service; -import com.twitter.finagle.SimpleFilter; -import com.twitter.search.earlybird.thrift.EarlybirdRequest; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird_root.common.EarlybirdRequestUtil; -import com.twitter.util.Future; - -/** - * A filter that unsets some request fields that make sense only on the SuperRoot, before sending - * them to the individual roots. 
- */ -public class UnsetSuperRootFieldsFilter extends SimpleFilter { - private final boolean unsetFollowedUserIds; - - public UnsetSuperRootFieldsFilter() { - this(true); - } - - public UnsetSuperRootFieldsFilter(boolean unsetFollowedUserIds) { - this.unsetFollowedUserIds = unsetFollowedUserIds; - } - - @Override - public Future apply(EarlybirdRequest request, - Service service) { - return service.apply(EarlybirdRequestUtil.unsetSuperRootFields(request, unsetFollowedUserIds)); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/filters/VeryRecentTweetsFilter.docx b/src/java/com/twitter/search/earlybird_root/filters/VeryRecentTweetsFilter.docx new file mode 100644 index 000000000..a5de18c93 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/filters/VeryRecentTweetsFilter.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/filters/VeryRecentTweetsFilter.java b/src/java/com/twitter/search/earlybird_root/filters/VeryRecentTweetsFilter.java deleted file mode 100644 index 6f0678a1e..000000000 --- a/src/java/com/twitter/search/earlybird_root/filters/VeryRecentTweetsFilter.java +++ /dev/null @@ -1,44 +0,0 @@ -package com.twitter.search.earlybird_root.filters; - -import javax.inject.Inject; - -import com.twitter.finagle.Service; -import com.twitter.finagle.SimpleFilter; -import com.twitter.search.common.decider.SearchDecider; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.earlybird.thrift.EarlybirdRequest; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.util.Future; - -public class VeryRecentTweetsFilter - extends SimpleFilter { - private static final String DECIDER_KEY = "enable_very_recent_tweets"; - private static final SearchRateCounter VERY_RECENT_TWEETS_NOT_MODIFIED = - SearchRateCounter.export("very_recent_tweets_not_modified"); - private static final SearchRateCounter VERY_RECENT_TWEETS_ENABLED = - 
SearchRateCounter.export("very_recent_tweets_enabled"); - - private final SearchDecider decider; - - @Inject - public VeryRecentTweetsFilter( - SearchDecider decider - ) { - this.decider = decider; - } - - @Override - public Future apply( - EarlybirdRequest request, - Service service - ) { - if (decider.isAvailable(DECIDER_KEY)) { - VERY_RECENT_TWEETS_ENABLED.increment(); - request.setSkipVeryRecentTweets(false); - } else { - VERY_RECENT_TWEETS_NOT_MODIFIED.increment(); - } - - return service.apply(request); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/img/serving.png b/src/java/com/twitter/search/earlybird_root/img/serving.png deleted file mode 100644 index aca60b55e..000000000 Binary files a/src/java/com/twitter/search/earlybird_root/img/serving.png and /dev/null differ diff --git a/src/java/com/twitter/search/earlybird_root/mergers/AccumulatedResponses.docx b/src/java/com/twitter/search/earlybird_root/mergers/AccumulatedResponses.docx new file mode 100644 index 000000000..abf46761e Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/mergers/AccumulatedResponses.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/mergers/AccumulatedResponses.java b/src/java/com/twitter/search/earlybird_root/mergers/AccumulatedResponses.java deleted file mode 100644 index abfebf20d..000000000 --- a/src/java/com/twitter/search/earlybird_root/mergers/AccumulatedResponses.java +++ /dev/null @@ -1,176 +0,0 @@ -package com.twitter.search.earlybird_root.mergers; - -import java.util.List; -import java.util.Map; - -import com.google.common.base.Preconditions; -import com.google.common.collect.Maps; - -import com.twitter.search.common.query.thriftjava.EarlyTerminationInfo; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.EarlybirdResponseCode; -import com.twitter.search.earlybird.thrift.TierResponse; - -/** - * Collection of EarlybirdResponses and associated stats to be merged. 
- */ -public class AccumulatedResponses { - // The list of the successful responses from all earlybird futures. This does not include empty - // responses resulted from null requests. - private final List successResponses; - // The list of the unsuccessful responses from all earlybird futures. - private final List errorResponses; - // the list of max statusIds seen in each earlybird. - private final List maxIds; - // the list of min statusIds seen in each earlybird. - private final List minIds; - - private final EarlyTerminationInfo mergedEarlyTerminationInfo; - private final boolean isMergingAcrossTiers; - private final PartitionCounts partitionCounts; - private final int numSearchedSegments; - - public static final class PartitionCounts { - private final int numPartitions; - private final int numSuccessfulPartitions; - private final List perTierResponse; - - public PartitionCounts(int numPartitions, int numSuccessfulPartitions, List - perTierResponse) { - this.numPartitions = numPartitions; - this.numSuccessfulPartitions = numSuccessfulPartitions; - this.perTierResponse = perTierResponse; - } - - public int getNumPartitions() { - return numPartitions; - } - - public int getNumSuccessfulPartitions() { - return numSuccessfulPartitions; - } - - public List getPerTierResponse() { - return perTierResponse; - } - } - - /** - * Create AccumulatedResponses - */ - public AccumulatedResponses(List successResponses, - List errorResponses, - List maxIds, - List minIds, - EarlyTerminationInfo mergedEarlyTerminationInfo, - boolean isMergingAcrossTiers, - PartitionCounts partitionCounts, - int numSearchedSegments) { - this.successResponses = successResponses; - this.errorResponses = errorResponses; - this.maxIds = maxIds; - this.minIds = minIds; - this.mergedEarlyTerminationInfo = mergedEarlyTerminationInfo; - this.isMergingAcrossTiers = isMergingAcrossTiers; - this.partitionCounts = partitionCounts; - this.numSearchedSegments = numSearchedSegments; - } - - public List 
getSuccessResponses() { - return successResponses; - } - - public List getErrorResponses() { - return errorResponses; - } - - public List getMaxIds() { - return maxIds; - } - - public List getMinIds() { - return minIds; - } - - public EarlyTerminationInfo getMergedEarlyTerminationInfo() { - return mergedEarlyTerminationInfo; - } - - public boolean foundError() { - return !errorResponses.isEmpty(); - } - - /** - * Tries to return a merged EarlybirdResponse that propagates as much information from the error - * responses as possible. - * - * If all error responses have the same error response code, the merged response will have the - * same error response code, and the debugString/debugInfo on the merged response will be set to - * the debugString/debugInfo of one of the merged responses. - * - * If the error responses have at least 2 different response codes, TRANSIENT_ERROR will be set - * on the merged response. Also, we will look for the most common error response code, and will - * propagate the debugString/debugInfo from an error response with that response code. - */ - public EarlybirdResponse getMergedErrorResponse() { - Preconditions.checkState(!errorResponses.isEmpty()); - - // Find a response that has the most common error response code. - int maxCount = 0; - EarlybirdResponse errorResponseWithMostCommonErrorResponseCode = null; - Map responseCodeCounts = Maps.newHashMap(); - for (EarlybirdResponse errorResponse : errorResponses) { - EarlybirdResponseCode responseCode = errorResponse.getResponseCode(); - Integer responseCodeCount = responseCodeCounts.get(responseCode); - if (responseCodeCount == null) { - responseCodeCount = 0; - } - ++responseCodeCount; - responseCodeCounts.put(responseCode, responseCodeCount); - if (responseCodeCount > maxCount) { - errorResponseWithMostCommonErrorResponseCode = errorResponse; - } - } - - // If all error responses have the same response code, set it on the merged response. 
- // Otherwise, set TRANSIENT_ERROR on the merged response. - EarlybirdResponseCode mergedResponseCode = EarlybirdResponseCode.TRANSIENT_ERROR; - if (responseCodeCounts.size() == 1) { - mergedResponseCode = responseCodeCounts.keySet().iterator().next(); - } - - EarlybirdResponse mergedResponse = new EarlybirdResponse() - .setResponseCode(mergedResponseCode); - - // Propagate the debugString/debugInfo of the selected error response to the merged response. - Preconditions.checkNotNull(errorResponseWithMostCommonErrorResponseCode); - if (errorResponseWithMostCommonErrorResponseCode.isSetDebugString()) { - mergedResponse.setDebugString(errorResponseWithMostCommonErrorResponseCode.getDebugString()); - } - if (errorResponseWithMostCommonErrorResponseCode.isSetDebugInfo()) { - mergedResponse.setDebugInfo(errorResponseWithMostCommonErrorResponseCode.getDebugInfo()); - } - - // Set the numPartitions and numPartitionsSucceeded on the mergedResponse - mergedResponse.setNumPartitions(partitionCounts.getNumPartitions()); - mergedResponse.setNumSuccessfulPartitions(partitionCounts.getNumSuccessfulPartitions()); - - return mergedResponse; - } - - public boolean isMergingAcrossTiers() { - return isMergingAcrossTiers; - } - - public boolean isMergingPartitionsWithinATier() { - return !isMergingAcrossTiers; - } - - public PartitionCounts getPartitionCounts() { - return partitionCounts; - } - - public int getNumSearchedSegments() { - return numSearchedSegments; - } -} diff --git a/src/java/com/twitter/search/earlybird_root/mergers/BUILD b/src/java/com/twitter/search/earlybird_root/mergers/BUILD deleted file mode 100644 index cd818b753..000000000 --- a/src/java/com/twitter/search/earlybird_root/mergers/BUILD +++ /dev/null @@ -1,26 +0,0 @@ -java_library( - sources = ["*.java"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/log4j", - "3rdparty/jvm/org/slf4j:slf4j-api", - 
"src/java/com/twitter/common/collections", - "src/java/com/twitter/common/quantity", - "src/java/com/twitter/search/common/futures", - "src/java/com/twitter/search/common/logging", - "src/java/com/twitter/search/common/metrics", - "src/java/com/twitter/search/common/partitioning/snowflakeparser", - "src/java/com/twitter/search/common/relevance:utils", - "src/java/com/twitter/search/common/schema/earlybird", - "src/java/com/twitter/search/common/search", - "src/java/com/twitter/search/common/util:finagleutil", - "src/java/com/twitter/search/common/util/earlybird", - "src/java/com/twitter/search/earlybird_root/collectors", - "src/java/com/twitter/search/earlybird_root/common", - "src/java/com/twitter/search/queryparser/query:core-query-nodes", - "src/thrift/com/twitter/search:earlybird-java", - "src/thrift/com/twitter/search/common:query-java", - ], -) diff --git a/src/java/com/twitter/search/earlybird_root/mergers/BUILD.docx b/src/java/com/twitter/search/earlybird_root/mergers/BUILD.docx new file mode 100644 index 000000000..23ef81ced Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/mergers/BUILD.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/mergers/EarlyTerminateTierMergePredicate.docx b/src/java/com/twitter/search/earlybird_root/mergers/EarlyTerminateTierMergePredicate.docx new file mode 100644 index 000000000..2e82c1b87 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/mergers/EarlyTerminateTierMergePredicate.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/mergers/EarlyTerminateTierMergePredicate.java b/src/java/com/twitter/search/earlybird_root/mergers/EarlyTerminateTierMergePredicate.java deleted file mode 100644 index 9bde1eb03..000000000 --- a/src/java/com/twitter/search/earlybird_root/mergers/EarlyTerminateTierMergePredicate.java +++ /dev/null @@ -1,9 +0,0 @@ -package com.twitter.search.earlybird_root.mergers; - -public interface EarlyTerminateTierMergePredicate { - /** - 
* Do we have enough results so far that we can early terminate and not continue onto next tier? - */ - boolean shouldEarlyTerminateTierMerge(int totalResultsFromSuccessfulShards, - boolean foundEarlyTermination); -} diff --git a/src/java/com/twitter/search/earlybird_root/mergers/EarlybirdResponseDebugMessageBuilder.docx b/src/java/com/twitter/search/earlybird_root/mergers/EarlybirdResponseDebugMessageBuilder.docx new file mode 100644 index 000000000..e4ff96e5b Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/mergers/EarlybirdResponseDebugMessageBuilder.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/mergers/EarlybirdResponseDebugMessageBuilder.java b/src/java/com/twitter/search/earlybird_root/mergers/EarlybirdResponseDebugMessageBuilder.java deleted file mode 100644 index f27f7214b..000000000 --- a/src/java/com/twitter/search/earlybird_root/mergers/EarlybirdResponseDebugMessageBuilder.java +++ /dev/null @@ -1,176 +0,0 @@ -package com.twitter.search.earlybird_root.mergers; - - -import javax.annotation.Nullable; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Function; -import com.google.common.base.Joiner; -import com.google.common.collect.Iterables; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.logging.DebugMessageBuilder; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.earlybird.thrift.EarlybirdRequest; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.EarlybirdResponseCode; -import com.twitter.search.earlybird.thrift.ThriftSearchQuery; -import com.twitter.search.earlybird.thrift.ThriftSearchResult; - -/** - * Collects debug messages to attach to EarlybirdResponse - */ -class EarlybirdResponseDebugMessageBuilder { - private static final Logger LOG = - LoggerFactory.getLogger(EarlybirdResponseDebugMessageBuilder.class); - - private static 
final Logger TOO_MANY_FAILED_PARTITIONS_LOG = - LoggerFactory.getLogger(String.format("%s_too_many_failed_partitions", - EarlybirdResponseDebugMessageBuilder.class.getName())); - - @VisibleForTesting - protected final SearchCounter insufficientValidResponseCounter = - SearchCounter.export("insufficient_valid_partition_responses_count"); - @VisibleForTesting - protected final SearchCounter validPartitionResponseCounter = - SearchCounter.export("valid_partition_response_count"); - - // the combined debug string for all earlybird responses - private final StringBuilder debugString; - /** - * A message builder backed by the same {@link #debugString} above. - */ - private final DebugMessageBuilder debugMessageBuilder; - - private static final Joiner JOINER = Joiner.on(", "); - - EarlybirdResponseDebugMessageBuilder(EarlybirdRequest request) { - this(getDebugLevel(request)); - } - - EarlybirdResponseDebugMessageBuilder(DebugMessageBuilder.Level level) { - this.debugString = new StringBuilder(); - this.debugMessageBuilder = new DebugMessageBuilder(debugString, level); - } - - private static DebugMessageBuilder.Level getDebugLevel(EarlybirdRequest request) { - if (request.isSetDebugMode() && request.getDebugMode() > 0) { - return DebugMessageBuilder.getDebugLevel(request.getDebugMode()); - } else if (request.isSetDebugOptions()) { - return DebugMessageBuilder.Level.DEBUG_BASIC; - } else { - return DebugMessageBuilder.Level.DEBUG_NONE; - } - } - - protected boolean isDebugMode() { - return debugMessageBuilder.getDebugLevel() > 0; - } - - void append(String msg) { - debugString.append(msg); - } - - void debugAndLogWarning(String msg) { - if (isDebugMode()) { - debugString.append(msg).append('\n'); - } - LOG.warn(msg); - } - - void debugDetailed(String format, Object... args) { - debugAtLevel(DebugMessageBuilder.Level.DEBUG_DETAILED, format, args); - } - - void debugVerbose(String format, Object... 
args) { - debugAtLevel(DebugMessageBuilder.Level.DEBUG_VERBOSE, format, args); - } - - void debugVerbose2(String format, Object... args) { - debugAtLevel(DebugMessageBuilder.Level.DEBUG_VERBOSE_2, format, args); - } - - void debugAtLevel(DebugMessageBuilder.Level level, String format, Object... args) { - boolean levelOK = debugMessageBuilder.isAtLeastLevel(level); - if (levelOK || LOG.isDebugEnabled()) { - // We check both modes here in order to build the formatted message only once. - String message = String.format(format, args); - - LOG.debug(message); - - if (levelOK) { - debugString.append(message).append('\n'); - } - } - } - - String debugString() { - return debugString.toString(); - } - - DebugMessageBuilder getDebugMessageBuilder() { - return debugMessageBuilder; - } - - void logBelowSuccessThreshold(ThriftSearchQuery searchQuery, int numSuccessResponses, - int numPartitions, double successThreshold) { - String rawQuery = (searchQuery != null && searchQuery.isSetRawQuery()) - ? "[" + searchQuery.getRawQuery() + "]" : "null"; - String serializedQuery = (searchQuery != null && searchQuery.isSetSerializedQuery()) - ? "[" + searchQuery.getSerializedQuery() + "]" : "null"; - // Not enough successful responses from partitions. - String errorMessage = String.format( - "Only %d valid responses returned out of %d partitions for raw query: %s" - + " serialized query: %s. 
Lower than threshold of %s", - numSuccessResponses, numPartitions, rawQuery, serializedQuery, successThreshold); - - TOO_MANY_FAILED_PARTITIONS_LOG.warn(errorMessage); - - insufficientValidResponseCounter.increment(); - validPartitionResponseCounter.add(numSuccessResponses); - debugString.append(errorMessage); - } - - - @VisibleForTesting - void logResponseDebugInfo(EarlybirdRequest earlybirdRequest, - String partitionTierName, - EarlybirdResponse response) { - if (response.isSetDebugString() && !response.getDebugString().isEmpty()) { - debugString.append(String.format("Received response from [%s] with debug string [%s]", - partitionTierName, response.getDebugString())).append("\n"); - } - - if (!response.isSetResponseCode()) { - debugAndLogWarning(String.format( - "Received Earlybird null response code for query [%s] from [%s]", - earlybirdRequest, partitionTierName)); - } else if (response.getResponseCode() != EarlybirdResponseCode.SUCCESS - && response.getResponseCode() != EarlybirdResponseCode.PARTITION_SKIPPED - && response.getResponseCode() != EarlybirdResponseCode.PARTITION_DISABLED - && response.getResponseCode() != EarlybirdResponseCode.TIER_SKIPPED) { - debugAndLogWarning(String.format( - "Received Earlybird response error [%s] for query [%s] from [%s]", - response.getResponseCode(), earlybirdRequest, partitionTierName)); - } - - if (debugMessageBuilder.isVerbose2()) { - debugVerbose2("Earlybird [%s] returned response: %s", partitionTierName, response); - } else if (debugMessageBuilder.isVerbose()) { - if (response.isSetSearchResults() && response.getSearchResults().getResultsSize() > 0) { - String ids = JOINER.join(Iterables.transform( - response.getSearchResults().getResults(), - new Function() { - @Nullable - @Override - public Long apply(ThriftSearchResult result) { - return result.getId(); - } - })); - debugVerbose("Earlybird [%s] returned TweetIDs: %s", partitionTierName, ids); - } - } - } -} diff --git 
a/src/java/com/twitter/search/earlybird_root/mergers/EarlybirdResponseMerger.docx b/src/java/com/twitter/search/earlybird_root/mergers/EarlybirdResponseMerger.docx new file mode 100644 index 000000000..a8c64c3d5 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/mergers/EarlybirdResponseMerger.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/mergers/EarlybirdResponseMerger.java b/src/java/com/twitter/search/earlybird_root/mergers/EarlybirdResponseMerger.java deleted file mode 100644 index e52e70b29..000000000 --- a/src/java/com/twitter/search/earlybird_root/mergers/EarlybirdResponseMerger.java +++ /dev/null @@ -1,604 +0,0 @@ -package com.twitter.search.earlybird_root.mergers; - -import java.util.Collections; -import java.util.HashSet; -import java.util.List; -import java.util.Map; - -import scala.runtime.BoxedUnit; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Optional; -import com.google.common.base.Preconditions; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; -import com.google.common.collect.Sets; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchTimerStats; -import com.twitter.search.common.schema.earlybird.EarlybirdCluster; -import com.twitter.search.common.util.FinagleUtil; -import com.twitter.search.common.util.earlybird.EarlybirdResponseMergeUtil; -import com.twitter.search.common.util.earlybird.ResultsUtil; -import com.twitter.search.earlybird.thrift.EarlybirdDebugInfo; -import com.twitter.search.earlybird.thrift.EarlybirdRequest; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.EarlybirdResponseCode; -import com.twitter.search.earlybird.thrift.ThriftSearchResult; -import 
com.twitter.search.earlybird.thrift.ThriftSearchResults; -import com.twitter.search.earlybird_root.collectors.MultiwayMergeCollector; -import com.twitter.search.earlybird_root.common.EarlybirdFeatureSchemaMerger; -import com.twitter.search.earlybird_root.common.EarlybirdRequestContext; -import com.twitter.search.earlybird_root.common.EarlybirdRequestType; -import com.twitter.search.earlybird_root.common.EarlybirdRequestUtil; -import com.twitter.util.Function; -import com.twitter.util.Future; - -/** - * Base EarlybirdResponseMerger containing basic logic to merge EarlybirdResponse objects - */ -public abstract class EarlybirdResponseMerger implements EarlyTerminateTierMergePredicate { - private static final Logger LOG = LoggerFactory.getLogger(EarlybirdResponseMerger.class); - private static final Logger MIN_SEARCHED_STATUS_ID_LOGGER = - LoggerFactory.getLogger("MinSearchedStatusIdLogger"); - - private static final SearchCounter NO_SEARCH_RESULT_COUNTER = - SearchCounter.export("no_search_result_count"); - private static final SearchCounter NO_RESPONSES_TO_MERGE = - SearchCounter.export("no_responses_to_merge"); - private static final SearchCounter EARLYBIRD_RESPONSE_NO_MORE_RESULTS = - SearchCounter.export("merger_earlybird_response_no_more_results"); - private static final String PARTITION_OR_TIER_COUNTER_NAME_FORMAT = - "merger_waited_for_response_from_%s_counter"; - private static final String PARTITION_OR_TIER_ERROR_COUNTER_NAME_FORMAT = - "merger_num_error_responses_from_%s"; - private static final String PARTITION_OR_TIER_RESPONSE_CODE_COUNTER_NAME_FORMAT = - "merger_earlybird_response_code_from_%s_%s"; - - protected final EarlybirdResponseDebugMessageBuilder responseMessageBuilder; - protected final EarlybirdRequestContext requestContext; - protected final ImmutableList> responses; - protected AccumulatedResponses accumulatedResponses; - - - @VisibleForTesting - static final Map MERGER_CREATED_STATS = - 
perRequestTypeCounterImmutableMap("earlybird_response_merger_%s_created_count"); - - @VisibleForTesting - static final Map - MIN_SEARCHED_STATUS_ID_LARGER_THAN_REQUEST_MAX_ID = perRequestTypeCounterImmutableMap( - "merger_%s_min_searched_status_id_larger_than_request_max_id"); - - @VisibleForTesting - static final Map - MIN_SEARCHED_STATUS_ID_LARGER_THAN_REQUEST_UNTIL_TIME = perRequestTypeCounterImmutableMap( - "merger_%s_min_searched_status_id_larger_than_request_until_time"); - - private static Map perRequestTypeCounterImmutableMap( - String statPattern) { - Map statsMap = Maps.newEnumMap(EarlybirdRequestType.class); - for (EarlybirdRequestType earlybirdRequestType : EarlybirdRequestType.values()) { - String statName = String.format(statPattern, earlybirdRequestType.getNormalizedName()); - statsMap.put(earlybirdRequestType, SearchCounter.export(statName)); - } - - return Maps.immutableEnumMap(statsMap); - } - - public static final com.google.common.base.Function> - HIT_COUNT_GETTER = - response -> response.getSearchResults() == null - ? null - : response.getSearchResults().getHitCounts(); - - private final ChainMerger chainMerger; - - private class ChainMerger { - private final EarlybirdRequestContext requestContext; - private final ResponseAccumulator responseAccumulator; - private final List> responses; - private final EarlybirdResponseDebugMessageBuilder responseMessageBuilder; - private int currentFutureIndex = -1; - - public ChainMerger(EarlybirdRequestContext requestContext, - ResponseAccumulator responseAccumulator, - List> responses, - EarlybirdResponseDebugMessageBuilder responseMessageBuilder) { - this.requestContext = requestContext; - this.responseAccumulator = responseAccumulator; - this.responses = responses; - this.responseMessageBuilder = responseMessageBuilder; - } - - public Future merge() { - // 'responseFutures' should always be sorted. - // When returned by EarlybirdScatterGather service, the responses are sorted by partition ID. 
- // When returned by EarlybirdChainedScatterGatherService, - // responses are sorted descending by tier start date. See: - // com.twitter.search.earlybird_root.EarlybirdChainedScatterGatherService.TIER_COMPARATOR. - // - // When merging responses from partitions, we want to wait for responses from all partitions, - // so the order in which we wait for those results does not matter. When merging responses - // from tiers, we want to wait for the response from the latest. If we don't need any more - // responses to compute the final response, then we don't need to wait for the responses from - // other tiers. If we cannot terminate early, then we want to wait for the responses from the - // second tier, and so on. - // - // We do not need to have any explicit synchronization, because: - // 1. The callbacks for future_i are set by the flatMap() callback on future_{i-1} (when - // recursively calling merge() inside the flatMap()). - // 2. Before setting the callbacks on future_i, future_{i-1}.flatMap() adds the response - // results to mergeHelper. - // 3. When the callbacks on future_i are set, the memory barrier between - // thread_running_future_{i-1} and thread_running_future_i is crossed. This guarantees - // that thread_running_future_i will see the updates to mergeHelper before it sees the - // callbacks. (Or thread_running_future_{i-1} == thread_running_future_i, in which case - // synchronization is not an issue, and correctness is guarateed by the order in which - // things will run.) - // 4. The same reasoning applies to currentFutureIndex. 
- - ++currentFutureIndex; - if (currentFutureIndex >= responses.size()) { - return Future.value(getTimedMergedResponse(responseAccumulator.getAccumulatedResults())); - } - - final String partitionTierName = - responseAccumulator.getNameForLogging(currentFutureIndex, responses.size()); - final String nameForEarlybirdResponseCodeStats = - responseAccumulator.getNameForEarlybirdResponseCodeStats( - currentFutureIndex, responses.size()); - - // If a tier in the chain throws an exception, convert it to a null response, and let the - // mergeHelper handle it appropriately. - return responses.get(currentFutureIndex) - .handle(Function.func(t -> { - if (FinagleUtil.isCancelException(t)) { - return new EarlybirdResponse() - .setResponseCode(EarlybirdResponseCode.CLIENT_CANCEL_ERROR); - } else if (FinagleUtil.isTimeoutException(t)) { - return new EarlybirdResponse() - .setResponseCode(EarlybirdResponseCode.SERVER_TIMEOUT_ERROR); - } else { - SearchCounter.export( - String.format(PARTITION_OR_TIER_ERROR_COUNTER_NAME_FORMAT, partitionTierName)) - .increment(); - if (responseMessageBuilder.isDebugMode()) { - responseMessageBuilder.debugAndLogWarning( - String.format("[%s] failed, exception [%s]", - partitionTierName, t.toString())); - } - LOG.warn("exception response from: " + partitionTierName, t); - return new EarlybirdResponse() - .setResponseCode(EarlybirdResponseCode.TRANSIENT_ERROR); - } - })) - .flatMap(Function.func(response -> { - Preconditions.checkNotNull(response); - - SearchCounter.export( - String.format(PARTITION_OR_TIER_RESPONSE_CODE_COUNTER_NAME_FORMAT, - nameForEarlybirdResponseCodeStats, - response.getResponseCode().name().toLowerCase())) - .increment(); - - if ((response.getResponseCode() != EarlybirdResponseCode.PARTITION_SKIPPED) - && (response.getResponseCode() != EarlybirdResponseCode.TIER_SKIPPED)) { - SearchCounter.export( - String.format(PARTITION_OR_TIER_COUNTER_NAME_FORMAT, partitionTierName)) - .increment(); - } - - if (response.getResponseCode() 
== EarlybirdResponseCode.CLIENT_CANCEL_ERROR) { - // the request has been cancelled, no need to proceed - return Future.value(response); - } - - rewriteResponseCodeIfSearchResultsMissing(requestContext, partitionTierName, response); - responseMessageBuilder.logResponseDebugInfo( - requestContext.getRequest(), - partitionTierName, - response); - responseAccumulator.addResponse( - responseMessageBuilder, - requestContext.getRequest(), - response); - - if (responseAccumulator.shouldEarlyTerminateMerge(EarlybirdResponseMerger.this)) { - return Future.value(getTimedMergedResponse( - responseAccumulator.getAccumulatedResults())); - } - return merge(); - })); - } - } - - private void rewriteResponseCodeIfSearchResultsMissing( - EarlybirdRequestContext earlybirdRequestContext, - String partitionTierName, - EarlybirdResponse response) { - // We always require searchResults to be set, even for term stats and facet requests. - // This is because searchResults contains important info such as pagination cursors - // like minSearchStatusId and minSearchedTimeSinceEpoch. - // We expect all successful responses to have searchResults set. - if (response.isSetResponseCode() - && response.getResponseCode() == EarlybirdResponseCode.SUCCESS - && response.getSearchResults() == null) { - NO_SEARCH_RESULT_COUNTER.increment(); - LOG.warn("Received Earlybird response with null searchResults from [{}]" - + " EarlybirdRequest [{}] EarlybirdResponse [{}] ", - partitionTierName, earlybirdRequestContext.getRequest(), response); - response.setResponseCode(EarlybirdResponseCode.TRANSIENT_ERROR); - } - } - - /** - * Construct a EarlybirdResponseMerger to merge responses from multiple partitions or tiers - * based on mode. 
- */ - EarlybirdResponseMerger(EarlybirdRequestContext requestContext, - List> responses, - ResponseAccumulator responseAccumulator) { - this.requestContext = requestContext; - this.responses = ImmutableList.copyOf(responses); - this.responseMessageBuilder = - new EarlybirdResponseDebugMessageBuilder(requestContext.getRequest()); - this.chainMerger = new ChainMerger(requestContext, responseAccumulator, responses, - responseMessageBuilder); - } - - /** - * Get a response merger to merge the given responses. - */ - public static EarlybirdResponseMerger getResponseMerger( - EarlybirdRequestContext requestContext, - List> responses, - ResponseAccumulator helper, - EarlybirdCluster cluster, - EarlybirdFeatureSchemaMerger featureSchemaMerger, - int numPartitions) { - EarlybirdRequestType type = requestContext.getEarlybirdRequestType(); - MERGER_CREATED_STATS.get(type).increment(); - switch (type) { - case FACETS: - return new FacetResponseMerger(requestContext, responses, helper); - case TERM_STATS: - return new TermStatisticsResponseMerger(requestContext, responses, helper); - case RECENCY: - return new RecencyResponseMerger(requestContext, responses, helper, featureSchemaMerger); - case STRICT_RECENCY: - return new StrictRecencyResponseMerger( - requestContext, responses, helper, featureSchemaMerger, cluster); - case RELEVANCE: - return new RelevanceResponseMerger( - requestContext, responses, helper, featureSchemaMerger, numPartitions); - case TOP_TWEETS: - return new TopTweetsResponseMerger(requestContext, responses, helper); - default: - throw new RuntimeException("EarlybirdRequestType " + type + "is not supported by merge"); - } - } - - /** - * This method can perform two types of merges: - * 1. merge responses within a tier from different partitions. - * 2. merge responses from multiple tiers. 
- */ - public final Future merge() { - return chainMerger.merge() - .onSuccess(checkMinSearchedStatusIdFunction( - "max_id", - EarlybirdRequestUtil.getRequestMaxId(requestContext.getParsedQuery()), - MIN_SEARCHED_STATUS_ID_LARGER_THAN_REQUEST_MAX_ID.get( - requestContext.getEarlybirdRequestType()))) - .onSuccess(checkMinSearchedStatusIdFunction( - "until_time", - EarlybirdRequestUtil.getRequestMaxIdFromUntilTime(requestContext.getParsedQuery()), - MIN_SEARCHED_STATUS_ID_LARGER_THAN_REQUEST_UNTIL_TIME.get( - requestContext.getEarlybirdRequestType()))); - } - - /** - * Returns the function that checks if the minSearchedStatusID on the merged response is higher - * than the max ID in the request. - */ - private Function checkMinSearchedStatusIdFunction( - final String operator, final Optional requestMaxId, final SearchCounter stat) { - return Function.cons(mergedResponse -> { - if (requestMaxId.isPresent() - && requestMaxId.get() != Long.MAX_VALUE - && (mergedResponse.getResponseCode() == EarlybirdResponseCode.SUCCESS) - && mergedResponse.isSetSearchResults() - && mergedResponse.getSearchResults().isSetMinSearchedStatusID()) { - long minSearchedStatusId = mergedResponse.getSearchResults().getMinSearchedStatusID(); - // We sometimes set minSearchedStatusId = max_id + 1 when a request times out even - // before any search happens. - // Check SEARCH-10134 for more details. - if (minSearchedStatusId > requestMaxId.get() + 1) { - stat.increment(); - String logMessage = "Response has a minSearchedStatusID ({}) larger than request " - + operator + " ({})." 
- + "\nrequest type: {}" - + "\nrequest: {}" - + "\nmerged response: {}" - + "\nSuccessful accumulated responses:"; - List logMessageParams = Lists.newArrayList(); - logMessageParams.add(minSearchedStatusId); - logMessageParams.add(requestMaxId.get()); - logMessageParams.add(requestContext.getEarlybirdRequestType()); - logMessageParams.add(requestContext.getRequest()); - logMessageParams.add(mergedResponse); - for (EarlybirdResponse response : accumulatedResponses.getSuccessResponses()) { - logMessage += "\naccumulated response: {}"; - logMessageParams.add(response); - } - MIN_SEARCHED_STATUS_ID_LOGGER.warn(logMessage, logMessageParams.toArray()); - } - } - }); - } - - private EarlybirdResponse getTimedMergedResponse(AccumulatedResponses accResponses) { - long start = System.nanoTime(); - try { - return getMergedResponse(accResponses); - } finally { - long totalTime = System.nanoTime() - start; - getMergedResponseTimer().timerIncrement(totalTime); - } - } - - private EarlybirdResponse initializeMergedSuccessResponseFromAccumulatedResponses() { - EarlybirdResponse mergedResponse = new EarlybirdResponse(); - - AccumulatedResponses.PartitionCounts partitionCounts = - accumulatedResponses.getPartitionCounts(); - - mergedResponse.setNumPartitions(partitionCounts.getNumPartitions()) - .setNumSuccessfulPartitions(partitionCounts.getNumSuccessfulPartitions()) - .setPerTierResponse(partitionCounts.getPerTierResponse()) - .setNumSearchedSegments(accumulatedResponses.getNumSearchedSegments()); - - mergedResponse.setEarlyTerminationInfo(accumulatedResponses.getMergedEarlyTerminationInfo()); - mergedResponse.setResponseCode(EarlybirdResponseCode.SUCCESS); - - return mergedResponse; - } - - private EarlybirdResponse getMergedResponse(AccumulatedResponses accResponses) { - accumulatedResponses = accResponses; - EarlybirdResponse mergedResponse; - - if (accumulatedResponses.getSuccessResponses().isEmpty() - && !accumulatedResponses.foundError()) { - // No successful or error 
responses. This means that all tiers / partitions are intentionally - // skipped. Return a blank successful response. - NO_RESPONSES_TO_MERGE.increment(); - mergedResponse = new EarlybirdResponse() - .setResponseCode(EarlybirdResponseCode.SUCCESS) - .setSearchResults(new ThriftSearchResults()) - .setDebugString("No responses to merge, probably because all tiers/partitions " - + "were skipped."); - } else if (accumulatedResponses.isMergingAcrossTiers()) { - mergedResponse = getMergedResponseAcrossTiers(); - } else { - mergedResponse = getMergedResponseAcrossPartitions(); - } - - saveMergedDebugString(mergedResponse); - return mergedResponse; - } - - private EarlybirdResponse getMergedResponseAcrossTiers() { - Preconditions.checkState( - !accumulatedResponses.getSuccessResponses().isEmpty() - || accumulatedResponses.foundError()); - - // When merging across tiers, if we have one failed tier, we should fail the whole - // response. Note that due to early termination, if a tier that is old fails - // but the newer tiers return enough results, the failed tier won't show up - // here in accumulatedResponses -- the only tiers that show up here - // will be successful. - if (accumulatedResponses.foundError()) { - // The TierResponseAccumulator early terminates on the first error, so we should - // never get more than one error. This means that the getMergedErrorResponse will - // return an error response with the error code of that one error, and will never - // have to decide which error response to return if the error responses are all - // different. 
- - // Perhaps we should just return accumulatedResponses.getErrorResponses().get(0); - Preconditions.checkState(accumulatedResponses.getErrorResponses().size() == 1); - return accumulatedResponses.getMergedErrorResponse(); - } else { - EarlybirdResponse mergedResponse = initializeMergedSuccessResponseFromAccumulatedResponses(); - return internalMerge(mergedResponse); - } - } - - private EarlybirdResponse getMergedResponseAcrossPartitions() { - Preconditions.checkState( - !accumulatedResponses.getSuccessResponses().isEmpty() - || accumulatedResponses.foundError()); - - EarlybirdResponse mergedResponse; - - // Unlike tier merging, one failed response doesn't mean the merged response should - // fail. If we have successful responses we can check the success ratio and if its - // good we can still return a successful merge. - if (!accumulatedResponses.getSuccessResponses().isEmpty()) { - // We have at least one successful response, but still need to check the success ratio. - // mergedResponse is a SUCCESS response after this call, but we will - // set it to failure below if necessary. - mergedResponse = initializeMergedSuccessResponseFromAccumulatedResponses(); - - int numSuccessResponses = mergedResponse.getNumSuccessfulPartitions(); - int numPartitions = mergedResponse.getNumPartitions(); - double successThreshold = getSuccessResponseThreshold(); - if (checkSuccessPartitionRatio(numSuccessResponses, numPartitions, successThreshold)) { - // Success! Proceed with merging. 
- mergedResponse.setResponseCode(EarlybirdResponseCode.SUCCESS); - mergedResponse = internalMerge(mergedResponse); - } else { - responseMessageBuilder.logBelowSuccessThreshold( - requestContext.getRequest().getSearchQuery(), numSuccessResponses, numPartitions, - successThreshold); - mergedResponse.setResponseCode(EarlybirdResponseCode.TOO_MANY_PARTITIONS_FAILED_ERROR); - } - } else { - mergedResponse = accumulatedResponses.getMergedErrorResponse(); - } - - return mergedResponse; - } - - /** - * Derive class should implement the logic to merge the specific type of results (recency, - * relevance, Top Tweets, etc..) - */ - protected abstract EarlybirdResponse internalMerge(EarlybirdResponse response); - - protected abstract SearchTimerStats getMergedResponseTimer(); - - /** - * Do we have enough results so far that we can early terminate and not continue onto next tier? - */ - public boolean shouldEarlyTerminateTierMerge(int totalResultsFromSuccessfulShards, - boolean foundEarlyTermination) { - // We are taking the most conservative tier response merging. - // This is the most conservative merge logic --- as long as we have some results, we should - // not return anything from the next tier. This may cause not ideal experience where a - // page is not full, but the use can still scroll further. 
- - return foundEarlyTermination || totalResultsFromSuccessfulShards >= 1; - } - - private void saveMergedDebugString(EarlybirdResponse mergedResponse) { - if (responseMessageBuilder.isDebugMode()) { - String message = responseMessageBuilder.debugString(); - mergedResponse.setDebugString(message); - if (!accumulatedResponses.getSuccessResponses().isEmpty() - && accumulatedResponses.getSuccessResponses().get(0).isSetDebugInfo()) { - - EarlybirdDebugInfo debugInfo = - accumulatedResponses.getSuccessResponses().get(0).getDebugInfo(); - mergedResponse.setDebugInfo(debugInfo); - } - } - } - - private double getSuccessResponseThreshold() { - EarlybirdRequest request = requestContext.getRequest(); - if (request.isSetSuccessfulResponseThreshold()) { - double successfulResponseThreshold = request.getSuccessfulResponseThreshold(); - Preconditions.checkArgument(successfulResponseThreshold > 0, - "Invalid successfulResponseThreshold %s", successfulResponseThreshold); - Preconditions.checkArgument(successfulResponseThreshold <= 1.0, - "Invalid successfulResponseThreshold %s", successfulResponseThreshold); - return successfulResponseThreshold; - } else { - return getDefaultSuccessResponseThreshold(); - } - } - - protected abstract double getDefaultSuccessResponseThreshold(); - - private static boolean checkSuccessPartitionRatio( - int numSuccessResponses, - int numPartitions, - double goodResponseThreshold) { - Preconditions.checkArgument(goodResponseThreshold > 0.0, - "Invalid goodResponseThreshold %s", goodResponseThreshold); - return numSuccessResponses >= (numPartitions * goodResponseThreshold); - } - - /** - * Merge hit counts from all results. 
- */ - protected Map aggregateHitCountMap() { - Map hitCounts = ResultsUtil - .aggregateCountMap(accumulatedResponses.getSuccessResponses(), HIT_COUNT_GETTER); - if (hitCounts.size() > 0) { - if (responseMessageBuilder.isDebugMode()) { - responseMessageBuilder.append("Hit counts:\n"); - for (Map.Entry entry : hitCounts.entrySet()) { - responseMessageBuilder.append(String.format(" %10s seconds: %d hits\n", - entry.getKey() / 1000, entry.getValue())); - } - } - return hitCounts; - } - return null; - } - - /** - * Returns the number of results to keep as part of merge-collection. - */ - protected final int computeNumResultsToKeep() { - return EarlybirdResponseMergeUtil.computeNumResultsToKeep(requestContext.getRequest()); - } - - /** - * Remove exact duplicates (same id) from the result set. - */ - protected static void trimExactDups(ThriftSearchResults searchResults, TrimStats trimStats) { - int numResults = searchResults.getResultsSize(); - List oldResults = searchResults.getResults(); - List newResults = Lists.newArrayListWithCapacity(numResults); - HashSet resultSet = Sets.newHashSetWithExpectedSize(numResults); - - for (ThriftSearchResult result : oldResults) { - if (resultSet.contains(result.getId())) { - trimStats.increaseRemovedDupsCount(); - continue; - } - - newResults.add(result); - resultSet.add(result.getId()); - } - - searchResults.setResults(newResults); - } - - protected final int addResponsesToCollector(MultiwayMergeCollector collector) { - int totalResultSize = 0; - for (EarlybirdResponse response : accumulatedResponses.getSuccessResponses()) { - if (response.isSetSearchResults()) { - totalResultSize += response.getSearchResults().getResultsSize(); - } - collector.addResponse(response); - } - return totalResultSize; - } - - /** - * Given a sorted searchResults (for recency, sorted by ID; for relevance, sorted by score), - * returns the first 'computeNumResultsToKeep()' number of results. - * - * @param searchResults the searchResults to be truncated. 
- */ - protected final void truncateResults(ThriftSearchResults searchResults, TrimStats trimStats) { - int numResultsRequested = computeNumResultsToKeep(); - - int to = numResultsRequested == Integer.MAX_VALUE ? searchResults.getResultsSize() - : Math.min(numResultsRequested, searchResults.getResultsSize()); - if (searchResults.getResultsSize() > to) { - trimStats.setResultsTruncatedFromTailCount(searchResults.getResultsSize() - to); - - if (to > 0) { - searchResults.setResults(searchResults.getResults().subList(0, to)); - } else { - // No more results for the next page - EARLYBIRD_RESPONSE_NO_MORE_RESULTS.increment(); - searchResults.setResults(Collections.emptyList()); - } - } - } - - EarlybirdRequest getEarlybirdRequest() { - return requestContext.getRequest(); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/mergers/FacetResponseMerger.docx b/src/java/com/twitter/search/earlybird_root/mergers/FacetResponseMerger.docx new file mode 100644 index 000000000..2262d6076 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/mergers/FacetResponseMerger.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/mergers/FacetResponseMerger.java b/src/java/com/twitter/search/earlybird_root/mergers/FacetResponseMerger.java deleted file mode 100644 index 06fc76d18..000000000 --- a/src/java/com/twitter/search/earlybird_root/mergers/FacetResponseMerger.java +++ /dev/null @@ -1,353 +0,0 @@ -package com.twitter.search.earlybird_root.mergers; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.concurrent.TimeUnit; - -import com.google.common.collect.Sets; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.logging.DebugMessageBuilder; -import com.twitter.search.common.metrics.SearchTimerStats; -import 
com.twitter.search.common.ranking.thriftjava.ThriftFacetRankingOptions; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant; -import com.twitter.search.common.util.earlybird.FacetsResultsUtils; -import com.twitter.search.earlybird.thrift.EarlybirdRequest; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.ThriftFacetCount; -import com.twitter.search.earlybird.thrift.ThriftFacetCountMetadata; -import com.twitter.search.earlybird.thrift.ThriftFacetFieldResults; -import com.twitter.search.earlybird.thrift.ThriftFacetResults; -import com.twitter.search.earlybird.thrift.ThriftSearchResults; -import com.twitter.search.earlybird_root.common.EarlybirdRequestContext; -import com.twitter.util.Future; - -/** - * Merger class to merge facets EarlybirdResponse objects - */ -public class FacetResponseMerger extends EarlybirdResponseMerger { - private static final Logger LOG = LoggerFactory.getLogger(FacetResponseMerger.class); - - private static final SearchTimerStats TIMER = - SearchTimerStats.export("merge_facets", TimeUnit.NANOSECONDS, false, true); - - private static final double SUCCESSFUL_RESPONSE_THRESHOLD = 0.9; - private final DebugMessageBuilder debugMessageBuilder; - - - /** - * Constructor to create the merger - */ - public FacetResponseMerger(EarlybirdRequestContext requestContext, - List> responses, - ResponseAccumulator mode) { - super(requestContext, responses, mode); - debugMessageBuilder = responseMessageBuilder.getDebugMessageBuilder(); - debugMessageBuilder.verbose("--- Request Received: %s", requestContext.getRequest()); - } - - @Override - protected SearchTimerStats getMergedResponseTimer() { - return TIMER; - } - - @Override - protected double getDefaultSuccessResponseThreshold() { - return SUCCESSFUL_RESPONSE_THRESHOLD; - } - - @Override - protected EarlybirdResponse internalMerge(EarlybirdResponse facetsResponse) { - - final Map facetFieldInfoMap = - new 
HashMap<>(); - final Set userIDWhitelist = new HashSet<>(); - - // First, parse the responses and build up our facet info map. - boolean termStatsFilteringMode = FacetsResultsUtils.prepareFieldInfoMap( - requestContext.getRequest().getFacetRequest(), facetFieldInfoMap); - // Iterate through all futures and get results. - collectResponsesAndPopulateMap(facetFieldInfoMap, userIDWhitelist); - - // Next, aggregate the top facets and update the blender response. - facetsResponse - .setFacetResults(new ThriftFacetResults() - .setFacetFields(new HashMap<>()) - .setUserIDWhitelist(userIDWhitelist)); - - // keep track of how many facets a user contributed - this map gets reset for every field - Map perFieldAntiGamingMap = new HashMap<>(); - - // this one is used for images and twimges - Map imagesAntiGamingMap = new HashMap<>(); - - Set twimgDedupSet = null; - - for (final Map.Entry entry - : facetFieldInfoMap.entrySet()) { - // reset for each field - String field = entry.getKey(); - final Map antiGamingMap; - if (field.equals(EarlybirdFieldConstant.IMAGES_FACET) - || field.equals(EarlybirdFieldConstant.TWIMG_FACET)) { - antiGamingMap = imagesAntiGamingMap; - } else { - perFieldAntiGamingMap.clear(); - antiGamingMap = perFieldAntiGamingMap; - } - - ThriftFacetFieldResults results = new ThriftFacetFieldResults(); - FacetsResultsUtils.FacetFieldInfo info = entry.getValue(); - results.setTotalCount(info.totalCounts); - results.setTopFacets(new ArrayList<>()); - FacetsResultsUtils.fillTopLanguages(info, results); - if (info.topFacets != null && !info.topFacets.isEmpty()) { - fillFacetFieldResults(info, antiGamingMap, results); - } - - if (field.equals(EarlybirdFieldConstant.TWIMG_FACET)) { - if (twimgDedupSet == null) { - twimgDedupSet = Sets.newHashSet(); - } - FacetsResultsUtils.dedupTwimgFacet(twimgDedupSet, results, debugMessageBuilder); - } - - facetsResponse.getFacetResults().putToFacetFields(entry.getKey(), results); - } - - if (!termStatsFilteringMode) { - // in term 
stats filtering mode, if doing it here would break term stats filtering - FacetsResultsUtils.mergeTwimgResults( - facetsResponse.getFacetResults(), - Collections.reverseOrder( - FacetsResultsUtils.getFacetCountComparator( - requestContext.getRequest().getFacetRequest()))); - } - - // Update the numHitsProcessed on ThriftSearchResults. - int numHitsProcessed = 0; - int numPartitionsEarlyTerminated = 0; - for (EarlybirdResponse earlybirdResponse: accumulatedResponses.getSuccessResponses()) { - ThriftSearchResults searchResults = earlybirdResponse.getSearchResults(); - if (searchResults != null) { - numHitsProcessed += searchResults.getNumHitsProcessed(); - numPartitionsEarlyTerminated += searchResults.getNumPartitionsEarlyTerminated(); - } - } - ThriftSearchResults searchResults = new ThriftSearchResults(); - searchResults.setResults(new ArrayList<>()); // required field - searchResults.setNumHitsProcessed(numHitsProcessed); - searchResults.setNumPartitionsEarlyTerminated(numPartitionsEarlyTerminated); - facetsResponse.setSearchResults(searchResults); - - LOG.debug("Facets call completed successfully: {}", facetsResponse); - - FacetsResultsUtils.fixNativePhotoUrl(facetsResponse); - return facetsResponse; - } - - private void fillFacetFieldResults(FacetsResultsUtils.FacetFieldInfo facetFieldInfo, - Map antiGamingMap, - ThriftFacetFieldResults results) { - int minWeightedCount = 0; - int minSimpleCount = 0; - int maxPenaltyCount = Integer.MAX_VALUE; - double maxPenaltyCountRatio = 1; - boolean excludePossiblySensitiveFacets = false; - boolean onlyReturnFacetsWithDisplayTweet = false; - int maxHitsPerUser = -1; - - EarlybirdRequest request = requestContext.getRequest(); - if (request.getFacetRequest() != null) { - ThriftFacetRankingOptions rankingOptions = request.getFacetRequest().getFacetRankingOptions(); - - if (request.getSearchQuery() != null) { - maxHitsPerUser = request.getSearchQuery().getMaxHitsPerUser(); - } - - if (rankingOptions != null) { - 
LOG.debug("FacetsResponseMerger: Using rankingOptions={}", rankingOptions); - - if (rankingOptions.isSetMinCount()) { - minWeightedCount = rankingOptions.getMinCount(); - } - if (rankingOptions.isSetMinSimpleCount()) { - minSimpleCount = rankingOptions.getMinSimpleCount(); - } - if (rankingOptions.isSetMaxPenaltyCount()) { - maxPenaltyCount = rankingOptions.getMaxPenaltyCount(); - } - if (rankingOptions.isSetMaxPenaltyCountRatio()) { - maxPenaltyCountRatio = rankingOptions.getMaxPenaltyCountRatio(); - } - if (rankingOptions.isSetExcludePossiblySensitiveFacets()) { - excludePossiblySensitiveFacets = rankingOptions.isExcludePossiblySensitiveFacets(); - } - if (rankingOptions.isSetOnlyReturnFacetsWithDisplayTweet()) { - onlyReturnFacetsWithDisplayTweet = rankingOptions.isOnlyReturnFacetsWithDisplayTweet(); - } - } - } else { - LOG.warn("earlybirdRequest.getFacetRequest() is null"); - } - - ThriftFacetCount[] topFacetsArray = new ThriftFacetCount[facetFieldInfo.topFacets.size()]; - - facetFieldInfo.topFacets.values().toArray(topFacetsArray); - Arrays.sort(topFacetsArray, Collections.reverseOrder( - FacetsResultsUtils.getFacetCountComparator(request.getFacetRequest()))); - - int numResults = capFacetFieldWidth(facetFieldInfo.fieldRequest.numResults); - - if (topFacetsArray.length < numResults) { - numResults = topFacetsArray.length; - } - - int collected = 0; - for (int i = 0; i < topFacetsArray.length; ++i) { - ThriftFacetCount count = topFacetsArray[i]; - - if (onlyReturnFacetsWithDisplayTweet - && (!count.isSetMetadata() || !count.getMetadata().isSetStatusId() - || count.getMetadata().getStatusId() == -1)) { - // status id must be set - continue; - } - - if (excludePossiblySensitiveFacets && count.isSetMetadata() - && count.getMetadata().isStatusPossiblySensitive()) { - // the display tweet may be offensive or NSFW - if (DebugMessageBuilder.DEBUG_VERBOSE <= debugMessageBuilder.getDebugLevel()) { - debugMessageBuilder.verbose2("[%d] FacetsResponseMerger EXCLUDED: 
offensive or NSFW %s, " - + "explanation: %s", - i, facetCountSummary(count), - count.getMetadata().getExplanation()); - } - continue; - } - - boolean filterOutUser = false; - if (maxHitsPerUser != -1 && count.isSetMetadata()) { - ThriftFacetCountMetadata metadata = count.getMetadata(); - if (!metadata.dontFilterUser) { - long twitterUserId = metadata.getTwitterUserId(); - int numResultsFromUser = 1; - if (twitterUserId != -1) { - Integer perUser = antiGamingMap.get(twitterUserId); - if (perUser != null) { - numResultsFromUser = perUser + 1; - filterOutUser = numResultsFromUser > maxHitsPerUser; - } - antiGamingMap.put(twitterUserId, numResultsFromUser); - } - } - } - - // Filter facets those don't meet the basic criteria. - if (count.getSimpleCount() < minSimpleCount) { - if (DebugMessageBuilder.DEBUG_VERBOSE <= debugMessageBuilder.getDebugLevel()) { - debugMessageBuilder.verbose2( - "[%d] FacetsResponseMerger EXCLUDED: simpleCount:%d < minSimpleCount:%d, %s", - i, count.getSimpleCount(), minSimpleCount, facetCountSummary(count)); - } - continue; - } - if (count.getWeightedCount() < minWeightedCount) { - if (DebugMessageBuilder.DEBUG_VERBOSE <= debugMessageBuilder.getDebugLevel()) { - debugMessageBuilder.verbose2( - "[%d] FacetsResponseMerger EXCLUDED: weightedCount:%d < minWeightedCount:%d, %s", - i, count.getWeightedCount(), minWeightedCount, facetCountSummary(count)); - } - continue; - } - if (filterOutUser) { - if (DebugMessageBuilder.DEBUG_VERBOSE <= debugMessageBuilder.getDebugLevel()) { - debugMessageBuilder.verbose2( - "[%d] FacetsResponseMerger EXCLUDED: antiGaming filterd user: %d: %s", - i, count.getMetadata().getTwitterUserId(), facetCountSummary(count)); - } - continue; - } - if (count.getPenaltyCount() > maxPenaltyCount) { - if (DebugMessageBuilder.DEBUG_VERBOSE <= debugMessageBuilder.getDebugLevel()) { - debugMessageBuilder.verbose2( - "[%d] FacetsResponseMerger EXCLUCED: penaltyCount:%.3f > maxPenaltyCount:%.3f, %s", - i, count.getPenaltyCount(), 
maxPenaltyCount, facetCountSummary(count)); - } - continue; - } - if (((double) count.getPenaltyCount() / count.getSimpleCount()) > maxPenaltyCountRatio) { - if (DebugMessageBuilder.DEBUG_VERBOSE <= debugMessageBuilder.getDebugLevel()) { - debugMessageBuilder.verbose2( - "[%d] FacetsResponseMerger EXCLUDED: penaltyCountRatio: %.3f > " - + "maxPenaltyCountRatio:%.3f, %s", - i, (double) count.getPenaltyCount() / count.getSimpleCount(), maxPenaltyCountRatio, - facetCountSummary(count)); - } - continue; - } - results.addToTopFacets(count); - - collected++; - if (collected >= numResults) { - break; - } - } - } - - private static int capFacetFieldWidth(int numResults) { - int ret = numResults; - if (numResults <= 0) { - // this in theory should not be allowed, but for now we issue the request with goodwill length - ret = 10; // default to 10 for future merge code to terminate correctly - } - if (numResults >= 100) { - ret = 100; - } - return ret; - } - - private static String facetCountSummary(final ThriftFacetCount count) { - if (count.isSetMetadata()) { - return String.format("Label: %s (s:%d, w:%d, p:%d, score:%.2f, sid:%d (%s))", - count.getFacetLabel(), count.getSimpleCount(), count.getWeightedCount(), - count.getPenaltyCount(), count.getScore(), count.getMetadata().getStatusId(), - count.getMetadata().getStatusLanguage()); - } else { - return String.format("Label: %s (s:%d, w:%d, p:%d, score:%.2f)", count.getFacetLabel(), - count.getSimpleCount(), count.getWeightedCount(), count.getPenaltyCount(), - count.getScore()); - } - } - - // Iterate through the backend responses and fill up the FacetFieldInfo map. - private void collectResponsesAndPopulateMap( - final Map facetFieldInfoMap, - final Set userIDWhitelist) { - // Next, iterate through the backend responses. 
- int i = 0; - for (EarlybirdResponse facetsResponse : accumulatedResponses.getSuccessResponses()) { - if (facetsResponse.isSetFacetResults()) { - LOG.debug("Facet response from earlybird {} is {} ", i, facetsResponse.getFacetResults()); - i++; - ThriftFacetResults facetResults = facetsResponse.getFacetResults(); - if (facetResults.isSetUserIDWhitelist()) { - userIDWhitelist.addAll(facetResults.getUserIDWhitelist()); - } - FacetsResultsUtils.fillFacetFieldInfo( - facetResults, facetFieldInfoMap, - userIDWhitelist); - } - } - LOG.debug("Earlybird facet response total size {}", i); - } -} - diff --git a/src/java/com/twitter/search/earlybird_root/mergers/PartitionResponseAccumulator.docx b/src/java/com/twitter/search/earlybird_root/mergers/PartitionResponseAccumulator.docx new file mode 100644 index 000000000..6404f6cff Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/mergers/PartitionResponseAccumulator.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/mergers/PartitionResponseAccumulator.java b/src/java/com/twitter/search/earlybird_root/mergers/PartitionResponseAccumulator.java deleted file mode 100644 index 22fcb101c..000000000 --- a/src/java/com/twitter/search/earlybird_root/mergers/PartitionResponseAccumulator.java +++ /dev/null @@ -1,44 +0,0 @@ -package com.twitter.search.earlybird_root.mergers; - -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.EarlybirdResponseCode; - - -public final class PartitionResponseAccumulator extends ResponseAccumulator { - private static final String TARGET_TYPE_PARTITION = "partition"; - - @Override - public String getNameForLogging(int responseIndex, int numTotalResponses) { - return TARGET_TYPE_PARTITION + responseIndex; - } - - @Override - public String getNameForEarlybirdResponseCodeStats(int responseIndex, int numTotalResponses) { - // We do not need to differentiate between partitions: we just want to get the number of - // 
responses returned by Earlybirds, for each EarlybirdResponseCode. - return TARGET_TYPE_PARTITION; - } - - @Override - boolean shouldEarlyTerminateMerge(EarlyTerminateTierMergePredicate merger) { - return false; - } - - @Override - public void handleSkippedResponse(EarlybirdResponseCode responseCode) { } - - @Override - public void handleErrorResponse(EarlybirdResponse response) { - } - - @Override - public AccumulatedResponses.PartitionCounts getPartitionCounts() { - return new AccumulatedResponses.PartitionCounts(getNumResponses(), - getSuccessResponses().size() + getSuccessfulEmptyResponseCount(), null); - } - - @Override - protected boolean isMergingAcrossTiers() { - return false; - } -} diff --git a/src/java/com/twitter/search/earlybird_root/mergers/RecencyResponseMerger.docx b/src/java/com/twitter/search/earlybird_root/mergers/RecencyResponseMerger.docx new file mode 100644 index 000000000..964b27433 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/mergers/RecencyResponseMerger.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/mergers/RecencyResponseMerger.java b/src/java/com/twitter/search/earlybird_root/mergers/RecencyResponseMerger.java deleted file mode 100644 index bc4742493..000000000 --- a/src/java/com/twitter/search/earlybird_root/mergers/RecencyResponseMerger.java +++ /dev/null @@ -1,638 +0,0 @@ -package com.twitter.search.earlybird_root.mergers; - -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.concurrent.TimeUnit; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchTimerStats; -import 
com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser; -import com.twitter.search.common.query.thriftjava.EarlyTerminationInfo; -import com.twitter.search.common.relevance.utils.ResultComparators; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.ThriftSearchResult; -import com.twitter.search.earlybird.thrift.ThriftSearchResults; -import com.twitter.search.earlybird_root.collectors.RecencyMergeCollector; -import com.twitter.search.earlybird_root.common.EarlybirdFeatureSchemaMerger; -import com.twitter.search.earlybird_root.common.EarlybirdRequestContext; -import com.twitter.util.Future; - -import static com.twitter.search.earlybird_root.mergers.RecencyResponseMerger - .EarlyTerminationTrimmingStats.Type.ALREADY_EARLY_TERMINATED; -import static com.twitter.search.earlybird_root.mergers.RecencyResponseMerger - .EarlyTerminationTrimmingStats.Type.FILTERED; -import static com.twitter.search.earlybird_root.mergers.RecencyResponseMerger - .EarlyTerminationTrimmingStats.Type.FILTERED_AND_TRUNCATED; -import static com.twitter.search.earlybird_root.mergers.RecencyResponseMerger - .EarlyTerminationTrimmingStats.Type.NOT_EARLY_TERMINATED; -import static com.twitter.search.earlybird_root.mergers.RecencyResponseMerger - .EarlyTerminationTrimmingStats.Type.TERMINATED_GOT_EXACT_NUM_RESULTS; -import static com.twitter.search.earlybird_root.mergers.RecencyResponseMerger - .EarlyTerminationTrimmingStats.Type.TRUNCATED; - -/** - * Merger class to merge recency search EarlybirdResponse objects. 
- */ -public class RecencyResponseMerger extends EarlybirdResponseMerger { - private static final Logger LOG = LoggerFactory.getLogger(RecencyResponseMerger.class); - - private static final SearchTimerStats RECENCY_TIMER = - SearchTimerStats.export("merge_recency", TimeUnit.NANOSECONDS, false, true); - - @VisibleForTesting - static final String TERMINATED_COLLECTED_ENOUGH_RESULTS = - "terminated_collected_enough_results"; - - // Allowed replication lag relative to all replicas. Replication lag exceeding - // this amount may result in some tweets from the replica not returned in search. - private static final long ALLOWED_REPLICATION_LAG_MS = 10000; - - private static final double SUCCESSFUL_RESPONSE_THRESHOLD = 0.9; - - @VisibleForTesting - static final SearchCounter RECENCY_ZERO_RESULT_COUNT_AFTER_FILTERING_MAX_MIN_IDS = - SearchCounter.export("merger_recency_zero_result_count_after_filtering_max_min_ids"); - - @VisibleForTesting - static final SearchCounter RECENCY_TRIMMED_TOO_MANY_RESULTS_COUNT = - SearchCounter.export("merger_recency_trimmed_too_many_results_count"); - - private static final SearchCounter RECENCY_TIER_MERGE_EARLY_TERMINATED_WITH_NOT_ENOUGH_RESULTS = - SearchCounter.export("merger_recency_tier_merge_early_terminated_with_not_enough_results"); - - private static final SearchCounter RECENCY_CLEARED_EARLY_TERMINATION_COUNT = - SearchCounter.export("merger_recency_cleared_early_termination_count"); - - /** - * Results were truncated because merged results exceeded the requested numResults. - */ - @VisibleForTesting - static final String MERGING_EARLY_TERMINATION_REASON_TRUNCATED = - "root_merging_truncated_results"; - - /** - * Results that were were filtered smaller than merged minSearchedStatusId were filtered out. 
- */ - @VisibleForTesting - static final String MERGING_EARLY_TERMINATION_REASON_FILTERED = - "root_merging_filtered_results"; - - @VisibleForTesting - static final EarlyTerminationTrimmingStats PARTITION_MERGING_EARLY_TERMINATION_TRIMMING_STATS = - new EarlyTerminationTrimmingStats("recency_partition_merging"); - - @VisibleForTesting - static final EarlyTerminationTrimmingStats TIER_MERGING_EARLY_TERMINATION_TRIMMING_STATS = - new EarlyTerminationTrimmingStats("recency_tier_merging"); - - @VisibleForTesting - static class EarlyTerminationTrimmingStats { - - enum Type { - /** - * The whole result was not terminated at all. - */ - NOT_EARLY_TERMINATED, - /** - * Was terminated before we did any trimming. - */ - ALREADY_EARLY_TERMINATED, - /** - * Was not terminated when merged, but results were filtered due to min/max ranges. - */ - FILTERED, - /** - * Was not terminated when merged, but results were truncated. - */ - TRUNCATED, - /** - * Was not terminated when merged, but results were filtered due to min/max ranges and - * truncated. - */ - FILTERED_AND_TRUNCATED, - /** - * When the search asks for X result, and we get exactly X results back, without trimming - * or truncating on the tail side (min_id side), we still mark the search as early terminated. - * This is because later tiers possibly has more results. - */ - TERMINATED_GOT_EXACT_NUM_RESULTS, - } - - /** - * A counter tracking merged responses for each {@link EarlyTerminationTrimmingStats.Type} - * define above. 
- */ - private final ImmutableMap searchCounterMap; - - EarlyTerminationTrimmingStats(String prefix) { - Map tempMap = Maps.newEnumMap(Type.class); - - tempMap.put(NOT_EARLY_TERMINATED, - SearchCounter.export(prefix + "_not_early_terminated_after_merging")); - tempMap.put(ALREADY_EARLY_TERMINATED, - SearchCounter.export(prefix + "_early_terminated_before_merge_trimming")); - tempMap.put(TRUNCATED, - SearchCounter.export(prefix + "_early_terminated_after_merging_truncated")); - tempMap.put(FILTERED, - SearchCounter.export(prefix + "_early_terminated_after_merging_filtered")); - tempMap.put(FILTERED_AND_TRUNCATED, - SearchCounter.export(prefix + "_early_terminated_after_merging_filtered_and_truncated")); - tempMap.put(TERMINATED_GOT_EXACT_NUM_RESULTS, - SearchCounter.export(prefix + "_early_terminated_after_merging_got_exact_num_results")); - - searchCounterMap = Maps.immutableEnumMap(tempMap); - } - - public SearchCounter getCounterFor(Type type) { - return searchCounterMap.get(type); - } - } - - private final EarlybirdFeatureSchemaMerger featureSchemaMerger; - - public RecencyResponseMerger(EarlybirdRequestContext requestContext, - List> responses, - ResponseAccumulator mode, - EarlybirdFeatureSchemaMerger featureSchemaMerger) { - super(requestContext, responses, mode); - this.featureSchemaMerger = featureSchemaMerger; - } - - @Override - protected double getDefaultSuccessResponseThreshold() { - return SUCCESSFUL_RESPONSE_THRESHOLD; - } - - @Override - protected SearchTimerStats getMergedResponseTimer() { - return RECENCY_TIMER; - } - - @Override - protected EarlybirdResponse internalMerge(EarlybirdResponse mergedResponse) { - // The merged maxSearchedStatusId and minSearchedStatusId - long maxId = findMaxFullySearchedStatusID(); - long minId = findMinFullySearchedStatusID(); - - RecencyMergeCollector collector = new RecencyMergeCollector(responses.size()); - int totalResultSize = addResponsesToCollector(collector); - ThriftSearchResults searchResults = 
collector.getAllSearchResults(); - - TrimStats trimStats = trimResults(searchResults, minId, maxId); - setMergedMaxSearchedStatusId(searchResults, maxId); - setMergedMinSearchedStatusId( - searchResults, minId, trimStats.getResultsTruncatedFromTailCount() > 0); - - mergedResponse.setSearchResults(searchResults); - - // Override some components of the response as appropriate to real-time. - searchResults.setHitCounts(aggregateHitCountMap()); - if (accumulatedResponses.isMergingPartitionsWithinATier() - && clearEarlyTerminationIfReachingTierBottom(mergedResponse)) { - RECENCY_CLEARED_EARLY_TERMINATION_COUNT.increment(); - } else { - setEarlyTerminationForTrimmedResults(mergedResponse, trimStats); - } - - responseMessageBuilder.debugVerbose("Hits: %s %s", totalResultSize, trimStats); - responseMessageBuilder.debugVerbose( - "Hash Partitioned Earlybird call completed successfully: %s", mergedResponse); - - featureSchemaMerger.collectAndSetFeatureSchemaInResponse( - searchResults, - requestContext, - "merger_recency_tier", - accumulatedResponses.getSuccessResponses()); - - return mergedResponse; - } - - /** - * When we reached tier bottom, pagination can stop working even though we haven't got - * all results. e.g. - * Results from partition 1: [101 91 81], minSearchedStatusId is 81 - * Results from Partition 2: [102 92], minSearchedStatusId is 92, not early terminated. - * - * After merge, we get [102, 101, 92], with minResultId == 92. Since results from - * partition 2 is not early terminated, 92 is the tier bottom here. Since results are - * filtered, early termination for merged result is set to true, so blender will call again, - * with maxDocId == 91. This time we get result: - * Results from partition 1: [91 81], minSearchedStatusId is 81 - * Results from partition 2: [], minSearchedStatusId is still 92 - * After merge we get [] and minSearchedStatusId is still 92. No progress can be made on - * pagination and clients get stuck. 
- * - * So in this case, we clear the early termination flag to tell blender there is no more - * result in this tier. Tweets below tier bottom will be missed, but that also happens - * without this step, as the next pagination call will return empty results anyway. - * So even if there is NOT overlap between tiers, this is still better. - * - * Return true if early termination is cleared due to this, otherwise return false. - * To be safe, we do nothing here to keep existing behavior and only override it in - * StrictRecencyResponseMerger. - */ - protected boolean clearEarlyTerminationIfReachingTierBottom(EarlybirdResponse mergedResponse) { - return false; - } - - /** - * Determines if the merged response should be early-terminated when it has exactly as many - * trimmed results as requested, as is not early-terminated because of other reasons. - */ - protected boolean shouldEarlyTerminateWhenEnoughTrimmedResults() { - return true; - } - - /** - * If the end results were trimmed in any way, reflect that in the response as a query that was - * early terminated. A response can be either (1) truncated because we merged more results than - * what was asked for with numResults, or (2) we filtered results that were smaller than the - * merged minSearchedStatusId. - * - * @param mergedResponse the merged response. - * @param trimStats trim stats for this merge. 
- */ - private void setEarlyTerminationForTrimmedResults( - EarlybirdResponse mergedResponse, - TrimStats trimStats) { - - responseMessageBuilder.debugVerbose("Checking for merge trimming, trimStats %s", trimStats); - - EarlyTerminationTrimmingStats stats = getEarlyTerminationTrimmingStats(); - - EarlyTerminationInfo earlyTerminationInfo = mergedResponse.getEarlyTerminationInfo(); - Preconditions.checkNotNull(earlyTerminationInfo); - - if (!earlyTerminationInfo.isEarlyTerminated()) { - if (trimStats.getMinIdFilterCount() > 0 || trimStats.getResultsTruncatedFromTailCount() > 0) { - responseMessageBuilder.debugVerbose("Setting early termination, trimStats: %s, results: %s", - trimStats, mergedResponse); - - earlyTerminationInfo.setEarlyTerminated(true); - addEarlyTerminationReasons(earlyTerminationInfo, trimStats); - - if (trimStats.getMinIdFilterCount() > 0 - && trimStats.getResultsTruncatedFromTailCount() > 0) { - stats.getCounterFor(FILTERED_AND_TRUNCATED).increment(); - } else if (trimStats.getMinIdFilterCount() > 0) { - stats.getCounterFor(FILTERED).increment(); - } else if (trimStats.getResultsTruncatedFromTailCount() > 0) { - stats.getCounterFor(TRUNCATED).increment(); - } else { - Preconditions.checkState(false, "Invalid TrimStats: %s", trimStats); - } - } else if ((computeNumResultsToKeep() == mergedResponse.getSearchResults().getResultsSize()) - && shouldEarlyTerminateWhenEnoughTrimmedResults()) { - earlyTerminationInfo.setEarlyTerminated(true); - earlyTerminationInfo.addToMergedEarlyTerminationReasons( - TERMINATED_COLLECTED_ENOUGH_RESULTS); - stats.getCounterFor(TERMINATED_GOT_EXACT_NUM_RESULTS).increment(); - } else { - stats.getCounterFor(NOT_EARLY_TERMINATED).increment(); - } - } else { - stats.getCounterFor(ALREADY_EARLY_TERMINATED).increment(); - // Even if the results were already marked as early terminated, we can add additional - // reasons for debugging (if the merged results were filtered or truncated). 
- addEarlyTerminationReasons(earlyTerminationInfo, trimStats); - } - } - - private void addEarlyTerminationReasons( - EarlyTerminationInfo earlyTerminationInfo, - TrimStats trimStats) { - - if (trimStats.getMinIdFilterCount() > 0) { - earlyTerminationInfo.addToMergedEarlyTerminationReasons( - MERGING_EARLY_TERMINATION_REASON_FILTERED); - } - - if (trimStats.getResultsTruncatedFromTailCount() > 0) { - earlyTerminationInfo.addToMergedEarlyTerminationReasons( - MERGING_EARLY_TERMINATION_REASON_TRUNCATED); - } - } - - private EarlyTerminationTrimmingStats getEarlyTerminationTrimmingStats() { - if (accumulatedResponses.isMergingPartitionsWithinATier()) { - return getEarlyTerminationTrimmingStatsForPartitions(); - } else { - return getEarlyTerminationTrimmingStatsForTiers(); - } - } - - protected EarlyTerminationTrimmingStats getEarlyTerminationTrimmingStatsForPartitions() { - return PARTITION_MERGING_EARLY_TERMINATION_TRIMMING_STATS; - } - - protected EarlyTerminationTrimmingStats getEarlyTerminationTrimmingStatsForTiers() { - return TIER_MERGING_EARLY_TERMINATION_TRIMMING_STATS; - } - - /** - * If we get enough results, no need to go on. - * If one of the partitions early terminated, we can't go on or else there could be a gap. - */ - @Override - public boolean shouldEarlyTerminateTierMerge(int totalResultsFromSuccessfulShards, - boolean foundEarlyTermination) { - - - int resultsRequested = computeNumResultsToKeep(); - - boolean shouldEarlyTerminate = foundEarlyTermination - || totalResultsFromSuccessfulShards >= resultsRequested; - - if (shouldEarlyTerminate && totalResultsFromSuccessfulShards < resultsRequested) { - RECENCY_TIER_MERGE_EARLY_TERMINATED_WITH_NOT_ENOUGH_RESULTS.increment(); - } - - return shouldEarlyTerminate; - } - - /** - * Find the min status id that has been _completely_ searched across all partitions. The - * largest min status id across all partitions. 
- * - * @return the min searched status id found - */ - protected long findMinFullySearchedStatusID() { - List minIds = accumulatedResponses.getMinIds(); - if (minIds.isEmpty()) { - return Long.MIN_VALUE; - } - - if (accumulatedResponses.isMergingPartitionsWithinATier()) { - // When merging partitions, the min ID should be the largest among the min IDs. - return Collections.max(accumulatedResponses.getMinIds()); - } else { - // When merging tiers, the min ID should be the smallest among the min IDs. - return Collections.min(accumulatedResponses.getMinIds()); - } - } - - /** - * Find the max status id that has been _completely_ searched across all partitions. The - * smallest max status id across all partitions. - * - * This is where we reconcile replication lag by selecting the oldest maxid from the - * partitions searched. - * - * @return the max searched status id found - */ - protected long findMaxFullySearchedStatusID() { - List maxIDs = accumulatedResponses.getMaxIds(); - if (maxIDs.isEmpty()) { - return Long.MAX_VALUE; - } - Collections.sort(maxIDs); - - final long newest = maxIDs.get(maxIDs.size() - 1); - final long newestTimestamp = SnowflakeIdParser.getTimestampFromTweetId(newest); - - for (int i = 0; i < maxIDs.size(); i++) { - long oldest = maxIDs.get(i); - long oldestTimestamp = SnowflakeIdParser.getTimestampFromTweetId(oldest); - long deltaMs = newestTimestamp - oldestTimestamp; - - if (i == 0) { - LOG.debug("Max delta is {}", deltaMs); - } - - if (deltaMs < ALLOWED_REPLICATION_LAG_MS) { - if (i != 0) { - LOG.debug("{} partition replicas lagging more than {} ms", i, ALLOWED_REPLICATION_LAG_MS); - } - return oldest; - } - } - - // Can't get here - by this point oldest == newest, and delta is 0. - return newest; - } - - /** - * Trim the ThriftSearchResults if we have enough results, to return the first - * 'computeNumResultsToKeep()' number of results. 
- * - * If we don't have enough results after trimming, this function will first try to back fill - * older results, then newer results - * - * @param searchResults ThriftSearchResults that hold the to be trimmed List - * @return TrimStats containing statistics about how many results being removed - */ - protected TrimStats trimResults( - ThriftSearchResults searchResults, - long mergedMin, - long mergedMax) { - if (!searchResults.isSetResults() || searchResults.getResultsSize() == 0) { - // no results, no trimming needed - return TrimStats.EMPTY_STATS; - } - - if (requestContext.getRequest().getSearchQuery().isSetSearchStatusIds()) { - // Not a normal search, no trimming needed - return TrimStats.EMPTY_STATS; - } - - TrimStats trimStats = new TrimStats(); - trimExactDups(searchResults, trimStats); - - int numResultsRequested = computeNumResultsToKeep(); - if (shouldSkipTrimmingWhenNotEnoughResults(searchResults, numResultsRequested)) { - ////////////////////////////////////////////////////////// - // We don't have enough results, let's not do trimming - ////////////////////////////////////////////////////////// - return trimStats; - } - - if (accumulatedResponses.isMergingPartitionsWithinATier()) { - trimResultsBasedSearchedRange( - searchResults, trimStats, numResultsRequested, mergedMin, mergedMax); - } - - // Respect "computeNumResultsToKeep()" here, only keep "computeNumResultsToKeep()" results. - truncateResults(searchResults, trimStats); - - return trimStats; - } - - /** - * When there's not enough results, we don't remove results based on the searched range. - * This has a tradeoff: with this, we don't reduce our recall when we already don't have enough - * results. However, with this, we can lose results while paginating because we return results - * outside of the valid searched range. 
- */ - protected boolean shouldSkipTrimmingWhenNotEnoughResults( - ThriftSearchResults searchResults, int numResultsRequested) { - return searchResults.getResultsSize() <= numResultsRequested; - } - - - /** - * Trim results based on search range. The search range [x, y] is determined by: - * x is the maximun of the minimun search IDs; - * y is the minimun of the maximum search IDs. - * - * Ids out side of this range are removed. - * If we do not get enough results after the removal, we add IDs back until we get enough results. - * We first add IDs back from the older side back. If there's still not enough results, - * we start adding IDs from the newer side back. - */ - private void trimResultsBasedSearchedRange(ThriftSearchResults searchResults, - TrimStats trimStats, - int numResultsRequested, - long mergedMin, - long mergedMax) { - /////////////////////////////////////////////////////////////////// - // we have more results than requested, let's do some trimming - /////////////////////////////////////////////////////////////////// - - // Save the original results before trimming - List originalResults = searchResults.getResults(); - - filterResultsByMergedMinMaxIds(searchResults, mergedMax, mergedMin, trimStats); - - // This does happen. It is hard to say what we should do here so we just return the original - // result here. - if (searchResults.getResultsSize() == 0) { - RECENCY_ZERO_RESULT_COUNT_AFTER_FILTERING_MAX_MIN_IDS.increment(); - searchResults.setResults(originalResults); - - // Clean up min/mix filtered count, since we're bringing back whatever we just filtered. - trimStats.clearMaxIdFilterCount(); - trimStats.clearMinIdFilterCount(); - - if (LOG.isDebugEnabled() || responseMessageBuilder.isDebugMode()) { - String errMsg = "No trimming is done as filtered results is empty. " - + "maxId=" + mergedMax + ",minId=" + mergedMin; - LOG.debug(errMsg); - responseMessageBuilder.append(errMsg + "\n"); - } - } else { - // oops! we're trimming too many results. 
Let's put some back - if (searchResults.getResultsSize() < numResultsRequested) { - RECENCY_TRIMMED_TOO_MANY_RESULTS_COUNT.increment(); - - List trimmedResults = searchResults.getResults(); - long firstTrimmedResultId = trimmedResults.get(0).getId(); - long lastTrimmedResultId = trimmedResults.get(trimmedResults.size() - 1).getId(); - - // First, try to back fill with older results - int i = 0; - for (; i < originalResults.size(); ++i) { - ThriftSearchResult result = originalResults.get(i); - if (result.getId() < lastTrimmedResultId) { - trimmedResults.add(result); - trimStats.decreaseMinIdFilterCount(); - if (trimmedResults.size() >= numResultsRequested) { - break; - } - } - } - - // still not enough results? back fill with newer results - // find the oldest of the newer results - if (trimmedResults.size() < numResultsRequested) { - // still not enough results? back fill with newer results - // find the oldest of the newer results - for (i = originalResults.size() - 1; i >= 0; --i) { - ThriftSearchResult result = originalResults.get(i); - if (result.getId() > firstTrimmedResultId) { - trimmedResults.add(result); - trimStats.decreaseMaxIdFilterCount(); - if (trimmedResults.size() >= numResultsRequested) { - break; - } - } - } - - // newer results were added to the back of the list, re-sort - Collections.sort(trimmedResults, ResultComparators.ID_COMPARATOR); - } - } - } - } - - protected void setMergedMinSearchedStatusId( - ThriftSearchResults searchResults, - long currentMergedMin, - boolean resultsWereTrimmed) { - if (accumulatedResponses.getMinIds().isEmpty()) { - return; - } - - long merged; - if (searchResults == null - || !searchResults.isSetResults() - || searchResults.getResultsSize() == 0) { - merged = currentMergedMin; - } else { - List results = searchResults.getResults(); - long firstResultId = results.get(0).getId(); - long lastResultId = results.get(results.size() - 1).getId(); - merged = Math.min(firstResultId, lastResultId); - if 
(!resultsWereTrimmed) { - // If the results were trimmed, we want to set minSearchedStatusID to the smallest - // tweet ID in the response. Otherwise, we want to take the min between that, and - // the current minSearchedStatusID. - merged = Math.min(merged, currentMergedMin); - } - } - - searchResults.setMinSearchedStatusID(merged); - } - - private void setMergedMaxSearchedStatusId( - ThriftSearchResults searchResults, - long currentMergedMax) { - if (accumulatedResponses.getMaxIds().isEmpty()) { - return; - } - - long merged; - if (searchResults == null - || !searchResults.isSetResults() - || searchResults.getResultsSize() == 0) { - merged = currentMergedMax; - } else { - List results = searchResults.getResults(); - long firstResultId = results.get(0).getId(); - long lastResultId = results.get(results.size() - 1).getId(); - long maxResultId = Math.max(firstResultId, lastResultId); - merged = Math.max(maxResultId, currentMergedMax); - } - - searchResults.setMaxSearchedStatusID(merged); - } - - protected static void filterResultsByMergedMinMaxIds( - ThriftSearchResults results, long maxStatusId, long minStatusId, TrimStats trimStats) { - List trimedResults = - Lists.newArrayListWithCapacity(results.getResultsSize()); - - for (ThriftSearchResult result : results.getResults()) { - long statusId = result.getId(); - - if (statusId > maxStatusId) { - trimStats.increaseMaxIdFilterCount(); - } else if (statusId < minStatusId) { - trimStats.increaseMinIdFilterCount(); - } else { - trimedResults.add(result); - } - } - - results.setResults(trimedResults); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/mergers/RelevanceResponseMerger.docx b/src/java/com/twitter/search/earlybird_root/mergers/RelevanceResponseMerger.docx new file mode 100644 index 000000000..ccf9b429c Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/mergers/RelevanceResponseMerger.docx differ diff --git 
a/src/java/com/twitter/search/earlybird_root/mergers/RelevanceResponseMerger.java b/src/java/com/twitter/search/earlybird_root/mergers/RelevanceResponseMerger.java deleted file mode 100644 index e58e79951..000000000 --- a/src/java/com/twitter/search/earlybird_root/mergers/RelevanceResponseMerger.java +++ /dev/null @@ -1,268 +0,0 @@ -package com.twitter.search.earlybird_root.mergers; - -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.TreeMap; -import java.util.concurrent.TimeUnit; -import java.util.stream.Collectors; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Function; -import com.google.common.base.Preconditions; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.constants.thriftjava.ThriftLanguage; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchTimerStats; -import com.twitter.search.common.util.earlybird.EarlybirdResponseUtil; -import com.twitter.search.common.util.earlybird.ResultsUtil; -import com.twitter.search.earlybird.thrift.EarlybirdRequest; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.ThriftSearchQuery; -import com.twitter.search.earlybird.thrift.ThriftSearchRankingMode; -import com.twitter.search.earlybird.thrift.ThriftSearchResult; -import com.twitter.search.earlybird.thrift.ThriftSearchResults; -import com.twitter.search.earlybird_root.collectors.RelevanceMergeCollector; -import com.twitter.search.earlybird_root.common.EarlybirdFeatureSchemaMerger; -import com.twitter.search.earlybird_root.common.EarlybirdRequestContext; -import com.twitter.util.Future; - -/** - * Merger class to merge relevance search EarlybirdResponse objects - */ -public class RelevanceResponseMerger extends EarlybirdResponseMerger { - private static final Logger LOG = 
LoggerFactory.getLogger(RelevanceResponseMerger.class); - - private static final SearchTimerStats TIMER = - SearchTimerStats.export("merge_relevance", TimeUnit.NANOSECONDS, false, true); - - private static final SearchCounter RELVEANCE_TIER_MERGE_EARLY_TERMINATED_WITH_NOT_ENOUGH_RESULTS = - SearchCounter.export("merger_relevance_tier_merge_early_terminated_with_not_enough_results"); - - private static final String PARTITION_NUM_RESULTS_COUNTER_SKIP_STATS = - "merger_relevance_post_trimmed_results_skip_stat_tier_%s_partition_%d"; - - @VisibleForTesting - public static final String PARTITION_NUM_RESULTS_COUNTER_NAME_FORMAT = - "merger_relevance_post_trimmed_results_from_tier_%s_partition_%d"; - - protected static final Function> LANG_MAP_GETTER = - response -> response.getSearchResults() == null - ? null - : response.getSearchResults().getLanguageHistogram(); - - private static final double SUCCESSFUL_RESPONSE_THRESHOLD = 0.8; - - private final EarlybirdFeatureSchemaMerger featureSchemaMerger; - - // The number of partitions are not meaningful when it is invoked through multi-tier merging. 
- private final int numPartitions; - - public RelevanceResponseMerger(EarlybirdRequestContext requestContext, - List> responses, - ResponseAccumulator mode, - EarlybirdFeatureSchemaMerger featureSchemaMerger, - int numPartitions) { - super(requestContext, responses, mode); - this.featureSchemaMerger = Preconditions.checkNotNull(featureSchemaMerger); - this.numPartitions = numPartitions; - } - - @Override - protected double getDefaultSuccessResponseThreshold() { - return SUCCESSFUL_RESPONSE_THRESHOLD; - } - - @Override - protected SearchTimerStats getMergedResponseTimer() { - return TIMER; - } - - @Override - protected EarlybirdResponse internalMerge(EarlybirdResponse mergedResponse) { - final ThriftSearchQuery searchQuery = requestContext.getRequest().getSearchQuery(); - long maxId = findMaxFullySearchedStatusID(); - long minId = findMinFullySearchedStatusID(); - - Preconditions.checkNotNull(searchQuery); - Preconditions.checkState(searchQuery.isSetRankingMode()); - Preconditions.checkState(searchQuery.getRankingMode() == ThriftSearchRankingMode.RELEVANCE); - - // First get the results in score order (the default comparator for this merge collector). 
- RelevanceMergeCollector collector = new RelevanceMergeCollector(responses.size()); - int totalResultSize = addResponsesToCollector(collector); - ThriftSearchResults searchResults = collector.getAllSearchResults(); - - TrimStats trimStats = trimResults(searchResults); - featureSchemaMerger.collectAndSetFeatureSchemaInResponse( - searchResults, - requestContext, - "merger_relevance_tier", - accumulatedResponses.getSuccessResponses()); - - mergedResponse.setSearchResults(searchResults); - - searchResults = mergedResponse.getSearchResults(); - searchResults - .setHitCounts(aggregateHitCountMap()) - .setLanguageHistogram(aggregateLanguageHistograms()); - - if (!accumulatedResponses.getMaxIds().isEmpty()) { - searchResults.setMaxSearchedStatusID(maxId); - } - - if (!accumulatedResponses.getMinIds().isEmpty()) { - searchResults.setMinSearchedStatusID(minId); - } - - LOG.debug("Hits: {} Removed duplicates: {}", totalResultSize, trimStats.getRemovedDupsCount()); - LOG.debug("Hash Partition'ed Earlybird call completed successfully: {}", mergedResponse); - - publishNumResultsFromPartitionStatistics(mergedResponse); - - return mergedResponse; - } - - /** - * If any of the partitions has an early termination, the tier merge must also early terminate. - * - * If a partition early terminated (we haven't fully searched that partition), and we instead - * moved onto the next tier, there will be a gap of unsearched results. - * - * If our early termination condition was only if we had enough results, we could get bad quality - * results by only looking at 20 hits when asking for 20 results. 
- */ - @Override - public boolean shouldEarlyTerminateTierMerge(int totalResultsFromSuccessfulShards, - boolean foundEarlyTermination) { - - // Don't use computeNumResultsToKeep because if returnAllResults is true, it will be - // Integer.MAX_VALUE and we will always log a stat that we didn't get enough results - int resultsRequested; - EarlybirdRequest request = requestContext.getRequest(); - if (request.isSetNumResultsToReturnAtRoot()) { - resultsRequested = request.getNumResultsToReturnAtRoot(); - } else { - resultsRequested = request.getSearchQuery().getCollectorParams().getNumResultsToReturn(); - } - if (foundEarlyTermination && totalResultsFromSuccessfulShards < resultsRequested) { - RELVEANCE_TIER_MERGE_EARLY_TERMINATED_WITH_NOT_ENOUGH_RESULTS.increment(); - } - - return foundEarlyTermination; - } - - /** - * Merge language histograms from all queries. - * - * @return Merge per-language count map. - */ - private Map aggregateLanguageHistograms() { - Map totalLangCounts = new TreeMap<>( - ResultsUtil.aggregateCountMap( - accumulatedResponses.getSuccessResponses(), LANG_MAP_GETTER)); - if (totalLangCounts.size() > 0) { - if (responseMessageBuilder.isDebugMode()) { - responseMessageBuilder.append("Language Distrbution:\n"); - int count = 0; - for (Map.Entry entry : totalLangCounts.entrySet()) { - responseMessageBuilder.append( - String.format(" %10s:%6d", entry.getKey(), entry.getValue())); - if (++count % 5 == 0) { - responseMessageBuilder.append("\n"); - } - } - responseMessageBuilder.append("\n"); - } - } - return totalLangCounts; - } - - /** - * Find the min status id that has been searched. Since no results are trimmed for Relevance mode, - * it should be the smallest among the min IDs. - */ - private long findMinFullySearchedStatusID() { - // The min ID should be the smallest among the min IDs - return accumulatedResponses.getMinIds().isEmpty() ? 
0 - : Collections.min(accumulatedResponses.getMinIds()); - } - - /** - * Find the max status id that has been searched. Since no results are trimmed for Relevance mode, - * it should be the largest among the max IDs. - */ - private long findMaxFullySearchedStatusID() { - // The max ID should be the largest among the max IDs - return accumulatedResponses.getMaxIds().isEmpty() ? 0 - : Collections.max(accumulatedResponses.getMaxIds()); - } - - /** - * Return all the searchResults except duplicates. - * - * @param searchResults ThriftSearchResults that hold the to be trimmed List - * @return TrimStats containing statistics about how many results being removed - */ - private TrimStats trimResults(ThriftSearchResults searchResults) { - if (!searchResults.isSetResults() || searchResults.getResultsSize() == 0) { - // no results, no trimming needed - return TrimStats.EMPTY_STATS; - } - - if (requestContext.getRequest().getSearchQuery().isSetSearchStatusIds()) { - // Not a normal search, no trimming needed - return TrimStats.EMPTY_STATS; - } - - TrimStats trimStats = new TrimStats(); - trimExactDups(searchResults, trimStats); - - truncateResults(searchResults, trimStats); - - return trimStats; - } - - private void publishNumResultsFromPartitionStatistics(EarlybirdResponse mergedResponse) { - - // Keep track of all of the results that were kept after merging - Set mergedResults = - EarlybirdResponseUtil.getResults(mergedResponse).getResults() - .stream() - .map(result -> result.getId()) - .collect(Collectors.toSet()); - - // For each successful response (pre merge), count how many of its results were kept post merge. - // Increment the appropriate stat. 
- for (EarlybirdResponse response : accumulatedResponses.getSuccessResponses()) { - if (!response.isSetEarlybirdServerStats()) { - continue; - } - int numResultsKept = 0; - for (ThriftSearchResult result - : EarlybirdResponseUtil.getResults(response).getResults()) { - if (mergedResults.contains(result.getId())) { - ++numResultsKept; - } - } - - // We only update partition stats when the partition ID looks sane. - String tierName = response.getEarlybirdServerStats().getTierName(); - int partition = response.getEarlybirdServerStats().getPartition(); - if (partition >= 0 && partition < numPartitions) { - SearchCounter.export(String.format(PARTITION_NUM_RESULTS_COUNTER_NAME_FORMAT, - tierName, - partition)) - .add(numResultsKept); - } else { - SearchCounter.export(String.format(PARTITION_NUM_RESULTS_COUNTER_SKIP_STATS, - tierName, - partition)).increment(); - } - } - } -} diff --git a/src/java/com/twitter/search/earlybird_root/mergers/ResponseAccumulator.docx b/src/java/com/twitter/search/earlybird_root/mergers/ResponseAccumulator.docx new file mode 100644 index 000000000..3d386c5cd Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/mergers/ResponseAccumulator.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/mergers/ResponseAccumulator.java b/src/java/com/twitter/search/earlybird_root/mergers/ResponseAccumulator.java deleted file mode 100644 index ad0daa5f3..000000000 --- a/src/java/com/twitter/search/earlybird_root/mergers/ResponseAccumulator.java +++ /dev/null @@ -1,356 +0,0 @@ -package com.twitter.search.earlybird_root.mergers; - -import java.util.ArrayList; -import java.util.EnumMap; -import java.util.List; -import java.util.Map; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; -import com.google.common.collect.Maps; - -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.util.earlybird.ResponseMergerUtils; -import 
com.twitter.search.earlybird.thrift.EarlybirdRequest; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.EarlybirdResponseCode; -import com.twitter.search.earlybird.thrift.ThriftSearchResults; -import com.twitter.search.earlybird_root.common.EarlybirdRequestType; - -/** - * Accumulates EarlybirdResponse's and determines when to early terminate. - */ -public abstract class ResponseAccumulator { - - @VisibleForTesting - static class MinMaxSearchedIdStats { - /** How many results did we actually check */ - private final SearchCounter checkedMaxMinSearchedStatusId; - private final SearchCounter unsetMaxSearchedStatusId; - private final SearchCounter unsetMinSearchedStatusId; - private final SearchCounter unsetMaxAndMinSearchedStatusId; - private final SearchCounter sameMinMaxSearchedIdWithoutResults; - private final SearchCounter sameMinMaxSearchedIdWithOneResult; - private final SearchCounter sameMinMaxSearchedIdWithResults; - private final SearchCounter flippedMinMaxSearchedId; - - MinMaxSearchedIdStats(EarlybirdRequestType requestType) { - String statPrefix = "merge_helper_" + requestType.getNormalizedName(); - - checkedMaxMinSearchedStatusId = SearchCounter.export(statPrefix - + "_max_min_searched_id_checks"); - unsetMaxSearchedStatusId = SearchCounter.export(statPrefix - + "_unset_max_searched_status_id"); - unsetMinSearchedStatusId = SearchCounter.export(statPrefix - + "_unset_min_searched_status_id"); - unsetMaxAndMinSearchedStatusId = SearchCounter.export(statPrefix - + "_unset_max_and_min_searched_status_id"); - sameMinMaxSearchedIdWithoutResults = SearchCounter.export(statPrefix - + "_same_min_max_searched_id_without_results"); - sameMinMaxSearchedIdWithOneResult = SearchCounter.export(statPrefix - + "_same_min_max_searched_id_with_one_results"); - sameMinMaxSearchedIdWithResults = SearchCounter.export(statPrefix - + "_same_min_max_searched_id_with_results"); - flippedMinMaxSearchedId = 
SearchCounter.export(statPrefix - + "_flipped_min_max_searched_id"); - } - - @VisibleForTesting - SearchCounter getCheckedMaxMinSearchedStatusId() { - return checkedMaxMinSearchedStatusId; - } - - @VisibleForTesting - SearchCounter getFlippedMinMaxSearchedId() { - return flippedMinMaxSearchedId; - } - - @VisibleForTesting - SearchCounter getUnsetMaxSearchedStatusId() { - return unsetMaxSearchedStatusId; - } - - @VisibleForTesting - SearchCounter getUnsetMinSearchedStatusId() { - return unsetMinSearchedStatusId; - } - - @VisibleForTesting - SearchCounter getUnsetMaxAndMinSearchedStatusId() { - return unsetMaxAndMinSearchedStatusId; - } - - @VisibleForTesting - SearchCounter getSameMinMaxSearchedIdWithoutResults() { - return sameMinMaxSearchedIdWithoutResults; - } - - @VisibleForTesting - SearchCounter getSameMinMaxSearchedIdWithOneResult() { - return sameMinMaxSearchedIdWithOneResult; - } - - @VisibleForTesting - SearchCounter getSameMinMaxSearchedIdWithResults() { - return sameMinMaxSearchedIdWithResults; - } - } - - @VisibleForTesting - static final Map MIN_MAX_SEARCHED_ID_STATS_MAP; - static { - EnumMap statsMap - = Maps.newEnumMap(EarlybirdRequestType.class); - for (EarlybirdRequestType earlybirdRequestType : EarlybirdRequestType.values()) { - statsMap.put(earlybirdRequestType, new MinMaxSearchedIdStats(earlybirdRequestType)); - } - - MIN_MAX_SEARCHED_ID_STATS_MAP = Maps.immutableEnumMap(statsMap); - } - - // Merge has encountered at least one early terminated response. - private boolean foundEarlyTermination = false; - // Empty but successful response counter (E.g. when a tier or partition is skipped) - private int successfulEmptyResponseCount = 0; - // The list of the successful responses from all earlybird futures. This does not include empty - // responses resulted from null requests. - private final List successResponses = new ArrayList<>(); - // The list of the error responses from all earlybird futures. 
- private final List errorResponses = new ArrayList<>(); - // the list of max statusIds seen in each earlybird. - private final List maxIds = new ArrayList<>(); - // the list of min statusIds seen in each earlybird. - private final List minIds = new ArrayList<>(); - - private int numResponses = 0; - - private int numResultsAccumulated = 0; - private int numSearchedSegments = 0; - - /** - * Returns a string that can be used for logging to identify a single response out of all the - * responses that are being merged. - * - * @param responseIndex the index of a response's partition or tier, depending on the type of - * responses being accumulated. - * @param numTotalResponses the total number of partitions or tiers that are being merged. - */ - public abstract String getNameForLogging(int responseIndex, int numTotalResponses); - - /** - * Returns a string that is used to export per-EarlybirdResponseCode stats for partitions and tiers. - * - * @param responseIndex the index of of a response's partition or tier. - * @param numTotalResponses the total number of partitions or tiers that are being merged. - * @return a string that is used to export per-EarlybirdResponseCode stats for partitions and tiers. - */ - public abstract String getNameForEarlybirdResponseCodeStats( - int responseIndex, int numTotalResponses); - - abstract boolean shouldEarlyTerminateMerge(EarlyTerminateTierMergePredicate merger); - - /** - * Add a EarlybirdResponse - */ - public void addResponse(EarlybirdResponseDebugMessageBuilder responseMessageBuilder, - EarlybirdRequest request, - EarlybirdResponse response) { - numResponses++; - numSearchedSegments += response.getNumSearchedSegments(); - - if (isSkippedResponse(response)) { - // This is an empty response, no processing is required, just need to update statistics. 
- successfulEmptyResponseCount++; - handleSkippedResponse(response.getResponseCode()); - } else if (isErrorResponse(response)) { - errorResponses.add(response); - handleErrorResponse(response); - } else { - handleSuccessfulResponse(responseMessageBuilder, request, response); - } - } - - private boolean isErrorResponse(EarlybirdResponse response) { - return !response.isSetResponseCode() - || response.getResponseCode() != EarlybirdResponseCode.SUCCESS; - } - - private boolean isSkippedResponse(EarlybirdResponse response) { - return response.isSetResponseCode() - && (response.getResponseCode() == EarlybirdResponseCode.PARTITION_SKIPPED - || response.getResponseCode() == EarlybirdResponseCode.TIER_SKIPPED); - } - - /** - * Record a response corresponding to a skipped partition or skipped tier. - */ - protected abstract void handleSkippedResponse(EarlybirdResponseCode responseCode); - - /** - * Handle an error response - */ - protected abstract void handleErrorResponse(EarlybirdResponse response); - - /** - * Subclasses can override this to perform more successful response handling. - */ - protected void extraSuccessfulResponseHandler(EarlybirdResponse response) { } - - /** - * Whether the helper is for merging results from partitions within a single tier. - */ - protected final boolean isMergingPartitionsWithinATier() { - return !isMergingAcrossTiers(); - } - - /** - * Whether the helper is for merging results across different tiers. - */ - protected abstract boolean isMergingAcrossTiers(); - - - /** - * Record a successful response. 
- */ - public final void handleSuccessfulResponse( - EarlybirdResponseDebugMessageBuilder responseMessageBuilder, - EarlybirdRequest request, - EarlybirdResponse response) { - successResponses.add(response); - if (response.isSetSearchResults()) { - ThriftSearchResults searchResults = response.getSearchResults(); - numResultsAccumulated += searchResults.getResultsSize(); - - recordMinMaxSearchedIdsAndUpdateStats(responseMessageBuilder, request, response, - searchResults); - } - if (response.isSetEarlyTerminationInfo() - && response.getEarlyTerminationInfo().isEarlyTerminated()) { - foundEarlyTermination = true; - } - extraSuccessfulResponseHandler(response); - } - - private void recordMinMaxSearchedIdsAndUpdateStats( - EarlybirdResponseDebugMessageBuilder responseMessageBuidler, - EarlybirdRequest request, - EarlybirdResponse response, - ThriftSearchResults searchResults) { - - boolean isMaxIdSet = searchResults.isSetMaxSearchedStatusID(); - boolean isMinIdSet = searchResults.isSetMinSearchedStatusID(); - - if (isMaxIdSet) { - maxIds.add(searchResults.getMaxSearchedStatusID()); - } - if (isMinIdSet) { - minIds.add(searchResults.getMinSearchedStatusID()); - } - - updateMinMaxIdStats(responseMessageBuidler, request, response, searchResults, isMaxIdSet, - isMinIdSet); - } - - private void updateMinMaxIdStats( - EarlybirdResponseDebugMessageBuilder responseMessageBuilder, - EarlybirdRequest request, - EarlybirdResponse response, - ThriftSearchResults searchResults, - boolean isMaxIdSet, - boolean isMinIdSet) { - // Now just track the stats. - EarlybirdRequestType requestType = EarlybirdRequestType.of(request); - MinMaxSearchedIdStats minMaxSearchedIdStats = MIN_MAX_SEARCHED_ID_STATS_MAP.get(requestType); - - minMaxSearchedIdStats.checkedMaxMinSearchedStatusId.increment(); - if (isMaxIdSet && isMinIdSet) { - if (searchResults.getMinSearchedStatusID() > searchResults.getMaxSearchedStatusID()) { - // We do not expect this case to happen in production. 
- minMaxSearchedIdStats.flippedMinMaxSearchedId.increment(); - } else if (searchResults.getResultsSize() == 0 - && searchResults.getMaxSearchedStatusID() == searchResults.getMinSearchedStatusID()) { - minMaxSearchedIdStats.sameMinMaxSearchedIdWithoutResults.increment(); - responseMessageBuilder.debugVerbose( - "Got no results, and same min/max searched ids. Request: %s, Response: %s", - request, response); - } else if (searchResults.getResultsSize() == 1 - && searchResults.getMaxSearchedStatusID() == searchResults.getMinSearchedStatusID()) { - minMaxSearchedIdStats.sameMinMaxSearchedIdWithOneResult.increment(); - responseMessageBuilder.debugVerbose( - "Got one results, and same min/max searched ids. Request: %s, Response: %s", - request, response); - } else if (searchResults.getMaxSearchedStatusID() - == searchResults.getMinSearchedStatusID()) { - minMaxSearchedIdStats.sameMinMaxSearchedIdWithResults.increment(); - responseMessageBuilder.debugVerbose( - "Got multiple results, and same min/max searched ids. Request: %s, Response: %s", - request, response); - } - } else if (!isMaxIdSet && isMinIdSet) { - // We do not expect this case to happen in production. - minMaxSearchedIdStats.unsetMaxSearchedStatusId.increment(); - responseMessageBuilder.debugVerbose( - "Got unset maxSearchedStatusID. Request: %s, Response: %s", request, response); - } else if (isMaxIdSet && !isMinIdSet) { - // We do not expect this case to happen in production. - minMaxSearchedIdStats.unsetMinSearchedStatusId.increment(); - responseMessageBuilder.debugVerbose( - "Got unset minSearchedStatusID. Request: %s, Response: %s", request, response); - } else { - Preconditions.checkState(!isMaxIdSet && !isMinIdSet); - minMaxSearchedIdStats.unsetMaxAndMinSearchedStatusId.increment(); - responseMessageBuilder.debugVerbose( - "Got unset maxSearchedStatusID and minSearchedStatusID. 
Request: %s, Response: %s", - request, response); - } - } - - - /** - * Return partition counts with number of partitions, number of successful responses, and list of - * responses per tier. - */ - public abstract AccumulatedResponses.PartitionCounts getPartitionCounts(); - - public final AccumulatedResponses getAccumulatedResults() { - return new AccumulatedResponses(successResponses, - errorResponses, - maxIds, - minIds, - ResponseMergerUtils.mergeEarlyTerminationInfo(successResponses), - isMergingAcrossTiers(), - getPartitionCounts(), - getNumSearchedSegments()); - } - - // Getters are only intended to be used by subclasses. Other users should get data from - // AccumulatedResponses - - int getNumResponses() { - return numResponses; - } - - int getNumSearchedSegments() { - return numSearchedSegments; - } - - List getSuccessResponses() { - return successResponses; - } - - int getNumResultsAccumulated() { - return numResultsAccumulated; - } - - int getSuccessfulEmptyResponseCount() { - return successfulEmptyResponseCount; - } - - boolean foundError() { - return !errorResponses.isEmpty(); - } - - boolean foundEarlyTermination() { - return foundEarlyTermination; - } -} diff --git a/src/java/com/twitter/search/earlybird_root/mergers/StrictRecencyResponseMerger.docx b/src/java/com/twitter/search/earlybird_root/mergers/StrictRecencyResponseMerger.docx new file mode 100644 index 000000000..19147218f Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/mergers/StrictRecencyResponseMerger.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/mergers/StrictRecencyResponseMerger.java b/src/java/com/twitter/search/earlybird_root/mergers/StrictRecencyResponseMerger.java deleted file mode 100644 index 4ea72717e..000000000 --- a/src/java/com/twitter/search/earlybird_root/mergers/StrictRecencyResponseMerger.java +++ /dev/null @@ -1,297 +0,0 @@ -package com.twitter.search.earlybird_root.mergers; - -import java.util.Collections; -import 
java.util.List; -import java.util.concurrent.TimeUnit; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; - -import com.twitter.search.common.metrics.SearchTimerStats; -import com.twitter.search.common.schema.earlybird.EarlybirdCluster; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.ThriftSearchResult; -import com.twitter.search.earlybird.thrift.ThriftSearchResults; -import com.twitter.search.earlybird_root.common.EarlybirdFeatureSchemaMerger; -import com.twitter.search.earlybird_root.common.EarlybirdRequestContext; -import com.twitter.util.Future; - -/** - * A RecencyResponseMerger that prioritizes not losing results during pagination. - * As of now, this merger is used by Gnip to make sure that scrolling returns all results. - * - * The logic used for merging partitions is a bit tricky, because on one hand, we want to make sure - * that we do miss results on the next pagination request; on the other hand, we want to return as - * many results as we can, and we want to set the minSearchedStatusID of the merged response as low - * as we can, in order to minimize the number of pagination requests. - * - * The merging logic is: - * - * Realtime cluster: - * 1. merge results from all partitions - * 2. if at least one partition response is early-terminated, set earlyTerminated = true - * on the merged response - * 3. set trimmingMinId = max(minSearchedStatusIDs of all partition responses) - * 4. trim all results to trimmingMinId - * 5. set minSearchedStatusID on the merged response to trimmingMinId - * 6. if we have more than numRequested results: - * - keep only the newest numRequested results - * - set minSearchedStatusID of the merged response to the lowest tweet ID in the response - * 7. 
if at least one partition response is not early-terminated, set - * tierBottomId = max(minSearchedStatusIDs of all non-early-terminated responses) - * (otherwise, set tierBottomId to some undefined value: -1, Long.MAX_VALUE, etc.) - * 8. if minSearchedStatusID of the merged response is the same as tierBottomId, - * clear the early-termination flag on the merged response - * - * The logic in steps 7 and 8 can be a little tricky to understand. They basically say: when we've - * exhausted the "least deep" partition in the realtime cluster, it's time to move to the full - * archive cluster (if we keep going past the "least deep" partition, we might miss results). - * - * Full archive cluster: - * 1. merge results from all partitions - * 2. if at least one partition response is early-terminated, set earlyTerminated = true - * on the merged response - * 3. set trimmingMinId to: - * - max(minSearchedStatusIDs of early-terminated responses), if at least one partition response - * is early-terminated - * - min(minSearchedStatusIDs of all responses), if all partition responses are not - * early-terminated - * 4. trim all results to trimmingMinId - * 5. set minSearchedStatusID of the merged response to trimmingMinId - * 6. if we have more than numRequested results: - * - keep only the newest numRequested results - * - set minSearchedStatusID of the merged response to the lowest tweet ID in the response - * - * The logic in step 3 can be a little tricky to understand. On one hand, if we always set - * trimmingMinId to the highest minSearchedStatusID, then some tweets at the very bottom of some - * partitions will never be returned. Consider the case: - * - * partition 1 has tweets 10, 8, 6 - * partition 2 has tweets 9, 7, 5 - * - * In this case, we would always trim all results to minId = 6, and tweet 5 would never be returned. 
- * - * On the other hand, if we always set trimmingMinId to the lowest minSearchedStatusID, then we - * might miss tweets from partitions that early-terminated. Consider the case: - * - * partition 1 has tweets 10, 5, 3, 1 that match our query - * partition 2 has tweets 9, 8, 7, 6, 2 that match our query - * - * If we ask for 3 results, than partition 1 will return tweets 10, 5, 3, and partition 2 will - * return tweets 9, 8, 7. If we set trimmingMinId = min(minSearchedStatusIDs), then the next - * pagination request will have [max_id = 2], and we will miss tweet 6. - * - * So the intuition here is that if we have an early-terminated response, we cannot set - * trimmingMinId to something lower than the minSearchedStatusID returned by that partition - * (otherwise we might miss results from that partition). However, if we've exhausted all - * partitions, then it's OK to not trim any result, because tiers do not intersect, so we will not - * miss any result from the next tier once we get there. 
- */ -public class StrictRecencyResponseMerger extends RecencyResponseMerger { - private static final SearchTimerStats STRICT_RECENCY_TIMER_AVG = - SearchTimerStats.export("merge_recency_strict", TimeUnit.NANOSECONDS, false, true); - - @VisibleForTesting - static final EarlyTerminationTrimmingStats PARTITION_MERGING_EARLY_TERMINATION_TRIMMING_STATS = - new EarlyTerminationTrimmingStats("strict_recency_partition_merging"); - - @VisibleForTesting - static final EarlyTerminationTrimmingStats TIER_MERGING_EARLY_TERMINATION_TRIMMING_STATS = - new EarlyTerminationTrimmingStats("strict_recency_tier_merging"); - - private final EarlybirdCluster cluster; - - public StrictRecencyResponseMerger(EarlybirdRequestContext requestContext, - List> responses, - ResponseAccumulator mode, - EarlybirdFeatureSchemaMerger featureSchemaMerger, - EarlybirdCluster cluster) { - super(requestContext, responses, mode, featureSchemaMerger); - this.cluster = cluster; - } - - @Override - protected SearchTimerStats getMergedResponseTimer() { - return STRICT_RECENCY_TIMER_AVG; - } - - /** - * Unlike {@link com.twitter.search.earlybird_root.mergers.RecencyResponseMerger}, this method - * takes a much simpler approach by just taking the max of the maxSearchedStatusIds. - * - * Also, when no maxSearchedStatusId is available at all, Long.MIN_VALUE is used instead of - * Long.MAX_VALUE. This ensures that we don't return any result in these cases. - */ - @Override - protected long findMaxFullySearchedStatusID() { - return accumulatedResponses.getMaxIds().isEmpty() - ? Long.MIN_VALUE : Collections.max(accumulatedResponses.getMaxIds()); - } - - /** - * This method is subtly different from the base class version: when no minSearchedStatusId is - * available at all, Long.MAX_VALUE is used instead of Long.MIN_VALUE. This ensures that we - * don't return any result in these cases. 
- */ - @Override - protected long findMinFullySearchedStatusID() { - List minIds = accumulatedResponses.getMinIds(); - if (minIds.isEmpty()) { - return Long.MAX_VALUE; - } - - if (accumulatedResponses.isMergingPartitionsWithinATier()) { - return getTrimmingMinId(); - } - - // When merging tiers, the min ID should be the smallest among the min IDs. - return Collections.min(minIds); - } - - @Override - protected TrimStats trimResults( - ThriftSearchResults searchResults, long mergedMin, long mergedMax) { - if (!searchResults.isSetResults() || searchResults.getResultsSize() == 0) { - // no results, no trimming needed - return TrimStats.EMPTY_STATS; - } - - TrimStats trimStats = new TrimStats(); - trimExactDups(searchResults, trimStats); - filterResultsByMergedMinMaxIds(searchResults, mergedMax, mergedMin, trimStats); - int numResults = computeNumResultsToKeep(); - if (searchResults.getResultsSize() > numResults) { - trimStats.setResultsTruncatedFromTailCount(searchResults.getResultsSize() - numResults); - searchResults.setResults(searchResults.getResults().subList(0, numResults)); - } - - return trimStats; - } - - /** - * This method is different from the base class version because when minResultId is bigger - * than currentMergedMin, we always take minResultId. - * If we don't do this, we would lose results. - * - * Illustration with an example. Assuming we are outside of the lag threshold. - * Num results requested: 3 - * Response 1: min: 100 max: 900 results: 400, 500, 600 - * Response 2: min: 300 max: 700 results: 350, 450, 550 - * - * Merged results: 600, 550, 500 - * Merged max: 900 - * Merged min: we could take 300 (minId), or take 500 (minResultId). - * - * If we take minId, and use 300 as the pagination cursor, we'd lose results - * 350 and 450 when we paginate. So we have to take minResultId here. 
- */ - @Override - protected void setMergedMinSearchedStatusId( - ThriftSearchResults searchResults, - long currentMergedMin, - boolean resultsWereTrimmed) { - if (accumulatedResponses.getMinIds().isEmpty()) { - return; - } - - long minId = currentMergedMin; - if (resultsWereTrimmed - && (searchResults != null) - && searchResults.isSetResults() - && (searchResults.getResultsSize() > 0)) { - List results = searchResults.getResults(); - minId = results.get(results.size() - 1).getId(); - } - - searchResults.setMinSearchedStatusID(minId); - } - - @Override - protected boolean clearEarlyTerminationIfReachingTierBottom(EarlybirdResponse mergedResponse) { - if (EarlybirdCluster.isArchive(cluster)) { - // We don't need to worry about the tier bottom when merging partition responses in the full - // archive cluster: if all partitions were exhausted and we didn't trim the results, then - // the early-terminated flag on the merged response will be false. If at least one partition - // is early-terminated, or we trimmed some results, then the ealry-terminated flag on the - // merged response will be true, and we should continue getting results from this tier before - // we move to the next one. 
- return false; - } - - ThriftSearchResults searchResults = mergedResponse.getSearchResults(); - if (searchResults.getMinSearchedStatusID() == getTierBottomId()) { - mergedResponse.getEarlyTerminationInfo().setEarlyTerminated(false); - mergedResponse.getEarlyTerminationInfo().unsetMergedEarlyTerminationReasons(); - responseMessageBuilder.debugVerbose( - "Set earlytermination to false because minSearchedStatusId is tier bottom"); - return true; - } - return false; - } - - @Override - protected boolean shouldEarlyTerminateWhenEnoughTrimmedResults() { - return false; - } - - @Override - protected final EarlyTerminationTrimmingStats getEarlyTerminationTrimmingStatsForPartitions() { - return PARTITION_MERGING_EARLY_TERMINATION_TRIMMING_STATS; - } - - @Override - protected final EarlyTerminationTrimmingStats getEarlyTerminationTrimmingStatsForTiers() { - return TIER_MERGING_EARLY_TERMINATION_TRIMMING_STATS; - } - - /** Determines the bottom of the realtime cluster, based on the partition responses. */ - private long getTierBottomId() { - Preconditions.checkState(!EarlybirdCluster.isArchive(cluster)); - - long tierBottomId = -1; - for (EarlybirdResponse response : accumulatedResponses.getSuccessResponses()) { - if (!isEarlyTerminated(response) - && response.isSetSearchResults() - && response.getSearchResults().isSetMinSearchedStatusID() - && (response.getSearchResults().getMinSearchedStatusID() > tierBottomId)) { - tierBottomId = response.getSearchResults().getMinSearchedStatusID(); - } - } - - return tierBottomId; - } - - /** Determines the minId to which all results should be trimmed. 
*/ - private long getTrimmingMinId() { - List minIds = accumulatedResponses.getMinIds(); - Preconditions.checkArgument(!minIds.isEmpty()); - - if (!EarlybirdCluster.isArchive(cluster)) { - return Collections.max(minIds); - } - - long maxOfEarlyTerminatedMins = -1; - long minOfAllMins = Long.MAX_VALUE; - for (EarlybirdResponse response : accumulatedResponses.getSuccessResponses()) { - if (response.isSetSearchResults() - && response.getSearchResults().isSetMinSearchedStatusID()) { - long minId = response.getSearchResults().getMinSearchedStatusID(); - minOfAllMins = Math.min(minOfAllMins, minId); - if (isEarlyTerminated(response)) { - maxOfEarlyTerminatedMins = Math.max(maxOfEarlyTerminatedMins, minId); - } - } - } - if (maxOfEarlyTerminatedMins >= 0) { - return maxOfEarlyTerminatedMins; - } else { - return minOfAllMins; - } - } - - /** Determines if the given earlybird response is early terminated. */ - private boolean isEarlyTerminated(EarlybirdResponse response) { - return response.isSetEarlyTerminationInfo() - && response.getEarlyTerminationInfo().isEarlyTerminated(); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/mergers/SuperRootResponseMerger.docx b/src/java/com/twitter/search/earlybird_root/mergers/SuperRootResponseMerger.docx new file mode 100644 index 000000000..956fb8200 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/mergers/SuperRootResponseMerger.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/mergers/SuperRootResponseMerger.java b/src/java/com/twitter/search/earlybird_root/mergers/SuperRootResponseMerger.java deleted file mode 100644 index 5f1c7aa87..000000000 --- a/src/java/com/twitter/search/earlybird_root/mergers/SuperRootResponseMerger.java +++ /dev/null @@ -1,688 +0,0 @@ -package com.twitter.search.earlybird_root.mergers; - -import java.util.Collections; -import java.util.List; -import javax.annotation.Nullable; - -import com.google.common.annotations.VisibleForTesting; -import 
com.google.common.base.Preconditions; -import com.google.common.cache.CacheBuilder; -import com.google.common.cache.CacheLoader; -import com.google.common.cache.LoadingCache; -import com.google.common.collect.Lists; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.collections.Pair; -import com.twitter.common.quantity.Amount; -import com.twitter.common.quantity.Time; -import com.twitter.common.util.Clock; -import com.twitter.search.common.futures.Futures; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser; -import com.twitter.search.common.query.thriftjava.EarlyTerminationInfo; -import com.twitter.search.common.relevance.utils.ResultComparators; -import com.twitter.search.common.search.EarlyTerminationState; -import com.twitter.search.common.util.FinagleUtil; -import com.twitter.search.common.util.earlybird.EarlybirdResponseMergeUtil; -import com.twitter.search.common.util.earlybird.EarlybirdResponseUtil; -import com.twitter.search.earlybird.thrift.EarlybirdRequest; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.EarlybirdResponseCode; -import com.twitter.search.earlybird.thrift.ThriftSearchQuery; -import com.twitter.search.earlybird.thrift.ThriftSearchRankingMode; -import com.twitter.search.earlybird.thrift.ThriftSearchResult; -import com.twitter.search.earlybird.thrift.ThriftSearchResults; -import com.twitter.search.earlybird.thrift.ThriftTweetSource; -import com.twitter.search.earlybird_root.common.EarlybirdFeatureSchemaMerger; -import com.twitter.search.earlybird_root.common.EarlybirdRequestContext; -import com.twitter.search.earlybird_root.common.EarlybirdServiceResponse; -import com.twitter.util.Function; -import com.twitter.util.Function0; -import com.twitter.util.Future; - -/** Utility functions for merging recency and relevance results. 
*/ -public class SuperRootResponseMerger { - private static final Logger LOG = LoggerFactory.getLogger(SuperRootResponseMerger.class); - private static final String ALL_STATS_PREFIX = "superroot_response_merger_"; - - private static final SearchCounter FULL_ARCHIVE_MIN_ID_GREATER_THAN_REALTIME_MIN_ID = - SearchCounter.export("full_archive_min_id_greater_than_realtime_min_id"); - - private static final String ERROR_FORMAT = "%s%s_errors_from_cluster_%s_%s"; - - private final ThriftSearchRankingMode rankingMode; - private final EarlybirdFeatureSchemaMerger featureSchemaMerger; - private final String featureStatPrefix; - private final Clock clock; - private final String rankingModeStatPrefix; - - private final SearchCounter mergedResponseSearchResultsNotSet; - private final SearchCounter invalidMinStatusId; - private final SearchCounter invalidMaxStatusId; - private final SearchCounter noMinIds; - private final SearchCounter noMaxIds; - private final SearchCounter mergedResponses; - private final SearchCounter mergedResponsesWithExactDups; - private final LoadingCache, SearchCounter> dupsStats; - - private static final EarlybirdResponse EMPTY_RESPONSE = - new EarlybirdResponse(EarlybirdResponseCode.SUCCESS, 0) - .setSearchResults(new ThriftSearchResults() - .setResults(Lists.newArrayList())); - - /** - * Creates a new SuperRootResponseMerger instance. - * @param rankingMode The ranking mode to use when merging results. - * @param featureSchemaMerger The merger that can merge feature schema from different tiers. - * @param clock The clock that will be used to merge results. 
- */ - public SuperRootResponseMerger(ThriftSearchRankingMode rankingMode, - EarlybirdFeatureSchemaMerger featureSchemaMerger, - Clock clock) { - this.rankingModeStatPrefix = rankingMode.name().toLowerCase(); - - this.rankingMode = rankingMode; - this.featureSchemaMerger = featureSchemaMerger; - this.clock = clock; - this.featureStatPrefix = "superroot_" + rankingMode.name().toLowerCase(); - - mergedResponseSearchResultsNotSet = SearchCounter.export( - ALL_STATS_PREFIX + rankingModeStatPrefix + "_merged_response_search_results_not_set"); - invalidMinStatusId = - SearchCounter.export(ALL_STATS_PREFIX + rankingModeStatPrefix + "_invalid_min_status_id"); - invalidMaxStatusId = - SearchCounter.export(ALL_STATS_PREFIX + rankingModeStatPrefix + "_invalid_max_status_id"); - noMinIds = SearchCounter.export(ALL_STATS_PREFIX + rankingModeStatPrefix + "_no_min_ids"); - noMaxIds = SearchCounter.export(ALL_STATS_PREFIX + rankingModeStatPrefix + "_no_max_ids"); - mergedResponses = SearchCounter.export(ALL_STATS_PREFIX + rankingModeStatPrefix - + "_merged_responses"); - mergedResponsesWithExactDups = - SearchCounter.export(ALL_STATS_PREFIX + rankingModeStatPrefix - + "_merged_responses_with_exact_dups"); - dupsStats = CacheBuilder.newBuilder() - .build(new CacheLoader, SearchCounter>() { - @Override - public SearchCounter load(Pair key) { - return SearchCounter.export( - ALL_STATS_PREFIX + rankingModeStatPrefix + "_merged_responses_with_exact_dups_" - + key.getFirst().name() + "_" + key.getSecond().name()); - } - }); - } - - private void incrErrorCount(String cluster, @Nullable EarlybirdResponse response) { - String cause; - if (response != null) { - cause = response.getResponseCode().name().toLowerCase(); - } else { - cause = "null_response"; - } - String statName = String.format( - ERROR_FORMAT, ALL_STATS_PREFIX, rankingModeStatPrefix, cluster, cause - ); - - SearchCounter.export(statName).increment(); - } - - /** - * Merges the given response futures. 
- * - * @param earlybirdRequestContext The earlybird request. - * @param realtimeResponseFuture The response from the realtime cluster. - * @param protectedResponseFuture The response from the protected cluster. - * @param fullArchiveResponseFuture The response from the full archive cluster. - * @return A future with the merged results. - */ - public Future mergeResponseFutures( - final EarlybirdRequestContext earlybirdRequestContext, - final Future realtimeResponseFuture, - final Future protectedResponseFuture, - final Future fullArchiveResponseFuture) { - Future mergedResponseFuture = Futures.map( - realtimeResponseFuture, protectedResponseFuture, fullArchiveResponseFuture, - new Function0() { - @Override - public EarlybirdResponse apply() { - // If the realtime response is not valid, return an error response. - // Also, the realtime service should always be called. - EarlybirdServiceResponse realtimeResponse = Futures.get(realtimeResponseFuture); - - if (realtimeResponse.getServiceState().serviceWasRequested() - && (!realtimeResponse.getServiceState().serviceWasCalled() - || !EarlybirdResponseMergeUtil.isValidResponse( - realtimeResponse.getResponse()))) { - - incrErrorCount("realtime", realtimeResponse.getResponse()); - return EarlybirdResponseMergeUtil.transformInvalidResponse( - realtimeResponse.getResponse(), "realtime"); - } - - // If we have a protected response and it's not valid, return an error response. - EarlybirdServiceResponse protectedResponse = Futures.get(protectedResponseFuture); - if (protectedResponse.getServiceState().serviceWasCalled()) { - if (!EarlybirdResponseMergeUtil.isValidResponse(protectedResponse.getResponse())) { - incrErrorCount("protected", protectedResponse.getResponse()); - - return EarlybirdResponseMergeUtil.transformInvalidResponse( - protectedResponse.getResponse(), "protected"); - } - } - - // If we have a full archive response, check if it's valid. 
- EarlybirdServiceResponse fullArchiveResponse = Futures.get(fullArchiveResponseFuture); - boolean archiveHasError = - fullArchiveResponse.getServiceState().serviceWasCalled() - && !EarlybirdResponseMergeUtil.isValidResponse(fullArchiveResponse.getResponse()); - - // Merge the responses. - EarlybirdResponse mergedResponse = mergeResponses( - earlybirdRequestContext, - realtimeResponse.getResponse(), - protectedResponse.getResponse(), - fullArchiveResponse.getResponse()); - - // If the realtime clusters didn't return any results, and the full archive cluster - // returned an error response, return an error merged response. - if (archiveHasError && !EarlybirdResponseUtil.hasResults(mergedResponse)) { - incrErrorCount("full_archive", fullArchiveResponse.getResponse()); - - return EarlybirdResponseMergeUtil.failedEarlybirdResponse( - fullArchiveResponse.getResponse().getResponseCode(), - "realtime clusters had no results and archive cluster response had error"); - } - - // Corner case: the realtime response could have exactly numRequested results, and could - // be exhausted (not early-terminated). In this case, the request should not have been - // sent to the full archive cluster. - // - If the full archive cluster is not available, or was not requested, then we don't - // need to change anything. - // - If the full archive cluster is available and was requested (but wasn't hit - // because we found enough results in the realtime cluster), then we should set the - // early-termination flag on the merged response, to indicate that we potentially - // have more results for this query in our index. 
- if ((fullArchiveResponse.getServiceState() - == EarlybirdServiceResponse.ServiceState.SERVICE_NOT_CALLED) - && !EarlybirdResponseUtil.isEarlyTerminated(realtimeResponse.getResponse())) { - EarlyTerminationInfo earlyTerminationInfo = new EarlyTerminationInfo(true); - earlyTerminationInfo.setEarlyTerminationReason( - EarlyTerminationState.TERMINATED_NUM_RESULTS_EXCEEDED.getTerminationReason()); - mergedResponse.setEarlyTerminationInfo(earlyTerminationInfo); - } - - // If we've exhausted all clusters, set the minSearchedStatusID to 0. - if (!EarlybirdResponseUtil.isEarlyTerminated(mergedResponse)) { - mergedResponse.getSearchResults().setMinSearchedStatusID(0); - } - - return mergedResponse; - } - }); - - // Handle all merging exceptions. - return handleResponseException(mergedResponseFuture, - "Exception thrown while merging responses."); - } - - /** - * Merge the results in the given responses. - * - * @param earlybirdRequestContext The earlybird request context. - * @param realtimeResponse The response from the realtime cluster. - * @param protectedResponse The response from the protected cluster. - * @param fullArchiveResponse The response from the full archive cluster. - * @return The merged response. 
- */ - private EarlybirdResponse mergeResponses( - EarlybirdRequestContext earlybirdRequestContext, - @Nullable EarlybirdResponse realtimeResponse, - @Nullable EarlybirdResponse protectedResponse, - @Nullable EarlybirdResponse fullArchiveResponse) { - - EarlybirdRequest request = earlybirdRequestContext.getRequest(); - ThriftSearchQuery searchQuery = request.getSearchQuery(); - int numResultsRequested; - - if (request.isSetNumResultsToReturnAtRoot()) { - numResultsRequested = request.getNumResultsToReturnAtRoot(); - } else { - numResultsRequested = searchQuery.getNumResults(); - } - - Preconditions.checkState(numResultsRequested > 0); - - EarlybirdResponse mergedResponse = EMPTY_RESPONSE.deepCopy(); - if ((realtimeResponse != null) - && (realtimeResponse.getResponseCode() != EarlybirdResponseCode.TIER_SKIPPED)) { - mergedResponse = realtimeResponse.deepCopy(); - } - - if (!mergedResponse.isSetSearchResults()) { - mergedResponseSearchResultsNotSet.increment(); - mergedResponse.setSearchResults( - new ThriftSearchResults(Lists.newArrayList())); - } - - // If either the realtime or the full archive response is early-terminated, we want the merged - // response to be early-terminated too. The early-termination flag from the realtime response - // carries over to the merged response, because mergedResponse is just a deep copy of the - // realtime response. So we only need to check the early-termination flag of the full archive - // response. 
- if ((fullArchiveResponse != null) - && EarlybirdResponseUtil.isEarlyTerminated(fullArchiveResponse)) { - mergedResponse.setEarlyTerminationInfo(fullArchiveResponse.getEarlyTerminationInfo()); - } - - // If realtime has empty results and protected has some results then we copy the early - // termination information if that is present - if (protectedResponse != null - && mergedResponse.getSearchResults().getResults().isEmpty() - && !protectedResponse.getSearchResults().getResults().isEmpty() - && EarlybirdResponseUtil.isEarlyTerminated(protectedResponse)) { - mergedResponse.setEarlyTerminationInfo(protectedResponse.getEarlyTerminationInfo()); - } - - // Merge the results. - List mergedResults = mergeResults( - numResultsRequested, realtimeResponse, protectedResponse, fullArchiveResponse); - - // Trim the merged results if necessary. - boolean resultsTrimmed = false; - if (mergedResults.size() > numResultsRequested - && !(searchQuery.isSetRelevanceOptions() - && searchQuery.getRelevanceOptions().isReturnAllResults())) { - // If we have more results than requested, trim the result list and re-adjust - // minSearchedStatusID. - mergedResults = mergedResults.subList(0, numResultsRequested); - - // Mark early termination in merged response - if (!EarlybirdResponseUtil.isEarlyTerminated(mergedResponse)) { - EarlyTerminationInfo earlyTerminationInfo = new EarlyTerminationInfo(true); - earlyTerminationInfo.setEarlyTerminationReason( - EarlyTerminationState.TERMINATED_NUM_RESULTS_EXCEEDED.getTerminationReason()); - mergedResponse.setEarlyTerminationInfo(earlyTerminationInfo); - } - - resultsTrimmed = true; - } - - mergedResponse.getSearchResults().setResults(mergedResults); - featureSchemaMerger.mergeFeatureSchemaAcrossClusters( - earlybirdRequestContext, - mergedResponse, - featureStatPrefix, - realtimeResponse, - protectedResponse, - fullArchiveResponse); - - // Set the minSearchedStatusID and maxSearchedStatusID fields on the merged response. 
- setMinSearchedStatusId(mergedResponse, realtimeResponse, protectedResponse, fullArchiveResponse, - resultsTrimmed); - setMaxSearchedStatusId(mergedResponse, realtimeResponse, protectedResponse, - fullArchiveResponse); - - int numRealtimeSearchedSegments = - (realtimeResponse != null && realtimeResponse.isSetNumSearchedSegments()) - ? realtimeResponse.getNumSearchedSegments() - : 0; - - int numProtectedSearchedSegments = - (protectedResponse != null && protectedResponse.isSetNumSearchedSegments()) - ? protectedResponse.getNumSearchedSegments() - : 0; - - int numArchiveSearchedSegments = - (fullArchiveResponse != null && fullArchiveResponse.isSetNumSearchedSegments()) - ? fullArchiveResponse.getNumSearchedSegments() - : 0; - - mergedResponse.setNumSearchedSegments( - numRealtimeSearchedSegments + numProtectedSearchedSegments + numArchiveSearchedSegments); - - if (earlybirdRequestContext.getRequest().getDebugMode() > 0) { - mergedResponse.setDebugString( - mergeClusterDebugStrings(realtimeResponse, protectedResponse, fullArchiveResponse)); - } - - return mergedResponse; - } - - /** - * Merges the given responses. 
- * - * @param numResults the number of results requested - * @param realtimeResponse the response from the realtime response - * @param protectedResponse the response from the protected response - * @param fullArchiveResponse the response from the full archive response - * @return the list of merged results - */ - private List mergeResults(int numResults, - @Nullable EarlybirdResponse realtimeResponse, - @Nullable EarlybirdResponse protectedResponse, - @Nullable EarlybirdResponse fullArchiveResponse) { - mergedResponses.increment(); - // We first merge the results from the two realtime clusters, Realtime cluster and - // Realtime Protected Tweets cluster - List mergedResults = mergePublicAndProtectedRealtimeResults( - numResults, - realtimeResponse, - protectedResponse, - fullArchiveResponse, - clock); - - EarlybirdResponseMergeUtil.addResultsToList(mergedResults, fullArchiveResponse, - ThriftTweetSource.FULL_ARCHIVE_CLUSTER); - - List distinctMergedResults = - EarlybirdResponseMergeUtil.distinctByStatusId(mergedResults, dupsStats); - if (mergedResults != distinctMergedResults) { - mergedResponsesWithExactDups.increment(); - } - - if (rankingMode == ThriftSearchRankingMode.RELEVANCE - || rankingMode == ThriftSearchRankingMode.TOPTWEETS) { - distinctMergedResults.sort(ResultComparators.SCORE_COMPARATOR); - } else { - distinctMergedResults.sort(ResultComparators.ID_COMPARATOR); - } - - return distinctMergedResults; - } - - /** - * Method for merging tweets from protected and realtime clusters - * - realtime, guaranteed newer than any archive tweets - * - protected, also realtime, but with a potentially larger window (optional) - * - archive, public, guaranteed older than any public realtime tweets (optional, used for - * id limits, *not added to results*) - * It adds the ThriftSearchResults from protected tweets to the realtimeResponse - * - * Algorithm diagram: (with newer tweets at the top) - * ------------------------------------ <--- protected 
maxSearchedStatusID - * |C:Newest protected realtime tweets| - * | (does not exist if realtime | - * | maxID >= protected maxID) | - * - * | ------------------------ | <--- 60 seconds ago - * |D:Newer protected realtime tweets | - * | (does not exist if realtime | - * | maxID >= 60 seconds ago) | - * ---------- | ------------------------ | <--- public realtime maxSearchedStatusID - * |A:Public| |E:Automatically valid protected | - * |realtime| |realtime tweets | - * ---------- | ------------------------ | <--- public realtime minSearchedStatusID - * | | - * ---------- | E if archive is present | <--- public archive maxSearchedStatusID - * ---------- | E if archive is present | <--- public archive maxSearchedStatusID - * |B:Public| | F is archive is not present | - * |archive | | | - * ---------- | ------------------------ | <--- public archive minSearchedStatusID - * |F:Older protected realtime tweets | - * | (does not exist if protected | - * | minID >= public minID) | - * ------------------------------------ <--- protected minSearchedStatusID - * Step 1: Select tweets from groups A, and E. If this is enough, return them - * Step 2: Select tweets from groups A, E, and F. If this is enough, return them - * Step 3: Select tweets from groups A, D, E, and F and return them - * - * There are two primary tradeoffs, both of which favor public tweets: - * (1) Benefit: While public indexing latency is < 60s, auto-updating never misses public tweets - * Cost: Absence of public tweets may delay protected tweets from being searchable for 60s - * (2) Benefit: No failure or delay from the protected cluster will affect realtime results - * Cost: If the protected cluster indexes more slowly, auto-update may miss its tweets - * - * @param fullArchiveTweets - used solely for generating anchor points, not merged in. 
- */ - @VisibleForTesting - static List mergePublicAndProtectedRealtimeResults( - int numRequested, - EarlybirdResponse realtimeTweets, - EarlybirdResponse realtimeProtectedTweets, - @Nullable EarlybirdResponse fullArchiveTweets, - Clock clock) { - // See which results will actually be used - boolean isRealtimeUsable = EarlybirdResponseUtil.hasResults(realtimeTweets); - boolean isArchiveUsable = EarlybirdResponseUtil.hasResults(fullArchiveTweets); - boolean isProtectedUsable = EarlybirdResponseUtil.hasResults(realtimeProtectedTweets); - - long minId = Long.MIN_VALUE; - long maxId = Long.MAX_VALUE; - if (isRealtimeUsable) { - // Determine the actual upper/lower bounds on the tweet id - if (realtimeTweets.getSearchResults().isSetMinSearchedStatusID()) { - minId = realtimeTweets.getSearchResults().getMinSearchedStatusID(); - } - if (realtimeTweets.getSearchResults().isSetMaxSearchedStatusID()) { - maxId = realtimeTweets.getSearchResults().getMaxSearchedStatusID(); - } - - int justRight = realtimeTweets.getSearchResults().getResultsSize(); - if (isArchiveUsable) { - justRight += fullArchiveTweets.getSearchResults().getResultsSize(); - if (fullArchiveTweets.getSearchResults().isSetMinSearchedStatusID()) { - long fullArchiveMinId = fullArchiveTweets.getSearchResults().getMinSearchedStatusID(); - if (fullArchiveMinId <= minId) { - minId = fullArchiveMinId; - } else { - FULL_ARCHIVE_MIN_ID_GREATER_THAN_REALTIME_MIN_ID.increment(); - } - } - } - if (isProtectedUsable) { - for (ThriftSearchResult result : realtimeProtectedTweets.getSearchResults().getResults()) { - if (result.getId() >= minId && result.getId() <= maxId) { - justRight++; - } - } - } - if (justRight < numRequested) { - // Since this is only used as an upper bound, old (pre-2010) ids are still handled correctly - maxId = Math.max( - maxId, - SnowflakeIdParser.generateValidStatusId( - clock.nowMillis() - Amount.of(60, Time.SECONDS).as(Time.MILLISECONDS), 0)); - } - } - - List mergedSearchResults = 
Lists.newArrayListWithCapacity(numRequested * 2); - - // Add valid tweets in order of priority: protected, then realtime - // Only add results that are within range (that check only matters for protected) - if (isProtectedUsable) { - EarlybirdResponseMergeUtil.markWithTweetSource( - realtimeProtectedTweets.getSearchResults().getResults(), - ThriftTweetSource.REALTIME_PROTECTED_CLUSTER); - for (ThriftSearchResult result : realtimeProtectedTweets.getSearchResults().getResults()) { - if (result.getId() <= maxId && result.getId() >= minId) { - mergedSearchResults.add(result); - } - } - } - - if (isRealtimeUsable) { - EarlybirdResponseMergeUtil.addResultsToList( - mergedSearchResults, realtimeTweets, ThriftTweetSource.REALTIME_CLUSTER); - } - - // Set the minSearchedStatusID and maxSearchedStatusID on the protected response to the - // minId and maxId that were used to trim the protected results. - // This is needed in order to correctly set these IDs on the merged response. - ThriftSearchResults protectedResults = - EarlybirdResponseUtil.getResults(realtimeProtectedTweets); - if ((protectedResults != null) - && protectedResults.isSetMinSearchedStatusID() - && (protectedResults.getMinSearchedStatusID() < minId)) { - protectedResults.setMinSearchedStatusID(minId); - } - if ((protectedResults != null) - && protectedResults.isSetMaxSearchedStatusID() - && (protectedResults.getMaxSearchedStatusID() > maxId)) { - realtimeProtectedTweets.getSearchResults().setMaxSearchedStatusID(maxId); - } - - return mergedSearchResults; - } - - /** - * Merges the debug strings of the given cluster responses. - * - * @param realtimeResponse The response from the realtime cluster. - * @param protectedResponse The response from the protected cluster. - * @param fullArchiveResponse The response from the full archive cluster. - * @return The merged debug string. 
- */ - public static String mergeClusterDebugStrings(@Nullable EarlybirdResponse realtimeResponse, - @Nullable EarlybirdResponse protectedResponse, - @Nullable EarlybirdResponse fullArchiveResponse) { - StringBuilder sb = new StringBuilder(); - if ((realtimeResponse != null) && realtimeResponse.isSetDebugString()) { - sb.append("Realtime response: ").append(realtimeResponse.getDebugString()); - } - if ((protectedResponse != null) && protectedResponse.isSetDebugString()) { - if (sb.length() > 0) { - sb.append("\n"); - } - sb.append("Protected response: ").append(protectedResponse.getDebugString()); - } - if ((fullArchiveResponse != null) && fullArchiveResponse.isSetDebugString()) { - if (sb.length() > 0) { - sb.append("\n"); - } - sb.append("Full archive response: ").append(fullArchiveResponse.getDebugString()); - } - - if (sb.length() == 0) { - return null; - } - return sb.toString(); - } - - /** - * Sets the minSearchedStatusID field on the merged response. - * - * @param mergedResponse The merged response. - * @param fullArchiveResponse The full archive response. - * @param resultsTrimmed Whether the merged response results were trimmed. - */ - private void setMinSearchedStatusId(EarlybirdResponse mergedResponse, - EarlybirdResponse realtimeResponse, - EarlybirdResponse protectedResponse, - EarlybirdResponse fullArchiveResponse, - boolean resultsTrimmed) { - Preconditions.checkNotNull(mergedResponse.getSearchResults()); - if (resultsTrimmed) { - // We got more results that we asked for and we trimmed them. - // Set minSearchedStatusID to the ID of the oldest result. - ThriftSearchResults searchResults = mergedResponse.getSearchResults(); - if (searchResults.getResultsSize() > 0) { - List results = searchResults.getResults(); - long lastResultId = results.get(results.size() - 1).getId(); - searchResults.setMinSearchedStatusID(lastResultId); - } - return; - } - - // We did not get more results that we asked for. 
Get the min of the minSearchedStatusIDs of - // the merged responses. - List minIDs = Lists.newArrayList(); - if (fullArchiveResponse != null - && fullArchiveResponse.isSetSearchResults() - && fullArchiveResponse.getSearchResults().isSetMinSearchedStatusID()) { - minIDs.add(fullArchiveResponse.getSearchResults().getMinSearchedStatusID()); - if (mergedResponse.getSearchResults().isSetMinSearchedStatusID() - && mergedResponse.getSearchResults().getMinSearchedStatusID() - < fullArchiveResponse.getSearchResults().getMinSearchedStatusID()) { - invalidMinStatusId.increment(); - } - } - - if (protectedResponse != null - && !EarlybirdResponseUtil.hasResults(realtimeResponse) - && EarlybirdResponseUtil.hasResults(protectedResponse) - && protectedResponse.getSearchResults().isSetMinSearchedStatusID()) { - minIDs.add(protectedResponse.getSearchResults().getMinSearchedStatusID()); - } - - if (mergedResponse.getSearchResults().isSetMinSearchedStatusID()) { - minIDs.add(mergedResponse.getSearchResults().getMinSearchedStatusID()); - } - - if (!minIDs.isEmpty()) { - mergedResponse.getSearchResults().setMinSearchedStatusID(Collections.min(minIDs)); - } else { - noMinIds.increment(); - } - } - - /** - * Sets the maxSearchedStatusID field on the merged response. - * - * @param mergedResponse The merged response. - * @param fullArchiveResponse The full archive response. 
- */ - private void setMaxSearchedStatusId(EarlybirdResponse mergedResponse, - EarlybirdResponse realtimeResponse, - EarlybirdResponse protectedResponse, - EarlybirdResponse fullArchiveResponse) { - - Preconditions.checkNotNull(mergedResponse.getSearchResults()); - List maxIDs = Lists.newArrayList(); - if (fullArchiveResponse != null - && fullArchiveResponse.isSetSearchResults() - && fullArchiveResponse.getSearchResults().isSetMaxSearchedStatusID()) { - maxIDs.add(fullArchiveResponse.getSearchResults().getMaxSearchedStatusID()); - if (mergedResponse.getSearchResults().isSetMaxSearchedStatusID() - && fullArchiveResponse.getSearchResults().getMaxSearchedStatusID() - > mergedResponse.getSearchResults().getMaxSearchedStatusID()) { - invalidMaxStatusId.increment(); - } - } - - if (protectedResponse != null - && !EarlybirdResponseUtil.hasResults(realtimeResponse) - && EarlybirdResponseUtil.hasResults(protectedResponse) - && protectedResponse.getSearchResults().isSetMaxSearchedStatusID()) { - - maxIDs.add(protectedResponse.getSearchResults().getMaxSearchedStatusID()); - } - - if (mergedResponse.getSearchResults().isSetMaxSearchedStatusID()) { - maxIDs.add(mergedResponse.getSearchResults().getMaxSearchedStatusID()); - } - - ThriftSearchResults searchResults = mergedResponse.getSearchResults(); - if (searchResults.getResultsSize() > 0) { - List results = searchResults.getResults(); - maxIDs.add(results.get(0).getId()); - } - - if (!maxIDs.isEmpty()) { - mergedResponse.getSearchResults().setMaxSearchedStatusID(Collections.max(maxIDs)); - } else { - noMaxIds.increment(); - } - } - - /** - * Handles exceptions thrown while merging responses. Timeout exceptions are converted to - * SERVER_TIMEOUT_ERROR responses. All other exceptions are converted to PERSISTENT_ERROR - * responses. 
- */ - private Future handleResponseException( - Future responseFuture, final String debugMsg) { - return responseFuture.handle( - new Function() { - @Override - public EarlybirdResponse apply(Throwable t) { - EarlybirdResponseCode responseCode = EarlybirdResponseCode.PERSISTENT_ERROR; - if (FinagleUtil.isTimeoutException(t)) { - responseCode = EarlybirdResponseCode.SERVER_TIMEOUT_ERROR; - } - EarlybirdResponse response = new EarlybirdResponse(responseCode, 0); - response.setDebugString(debugMsg + "\n" + t); - return response; - } - }); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/mergers/TermStatisticsResponseMerger.docx b/src/java/com/twitter/search/earlybird_root/mergers/TermStatisticsResponseMerger.docx new file mode 100644 index 000000000..d5a717c62 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/mergers/TermStatisticsResponseMerger.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/mergers/TermStatisticsResponseMerger.java b/src/java/com/twitter/search/earlybird_root/mergers/TermStatisticsResponseMerger.java deleted file mode 100644 index d23fff64b..000000000 --- a/src/java/com/twitter/search/earlybird_root/mergers/TermStatisticsResponseMerger.java +++ /dev/null @@ -1,90 +0,0 @@ -package com.twitter.search.earlybird_root.mergers; - -import java.util.Collection; -import java.util.List; -import java.util.concurrent.TimeUnit; - -import com.google.common.collect.Collections2; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.metrics.SearchTimerStats; -import com.twitter.search.common.util.earlybird.FacetsResultsUtils; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.ThriftTermStatisticsRequest; -import com.twitter.search.earlybird.thrift.ThriftTermStatisticsResults; -import com.twitter.search.earlybird_root.common.EarlybirdRequestContext; -import com.twitter.util.Future; - -/** - * Merger class to 
merge termstats EarlybirdResponse objects - */ -public class TermStatisticsResponseMerger extends EarlybirdResponseMerger { - private static final Logger LOG = LoggerFactory.getLogger(TermStatisticsResponseMerger.class); - - private static final SearchTimerStats TIMER = - SearchTimerStats.export("merge_term_stats", TimeUnit.NANOSECONDS, false, true); - - private static final double SUCCESSFUL_RESPONSE_THRESHOLD = 0.9; - - public TermStatisticsResponseMerger(EarlybirdRequestContext requestContext, - List> responses, - ResponseAccumulator mode) { - super(requestContext, responses, mode); - } - - @Override - protected SearchTimerStats getMergedResponseTimer() { - return TIMER; - } - - @Override - protected double getDefaultSuccessResponseThreshold() { - return SUCCESSFUL_RESPONSE_THRESHOLD; - } - - @Override - protected EarlybirdResponse internalMerge(EarlybirdResponse termStatsResponse) { - ThriftTermStatisticsRequest termStatisticsRequest = - requestContext.getRequest().getTermStatisticsRequest(); - - Collection termStatsResults = - Collections2.filter(accumulatedResponses.getSuccessResponses(), - earlybirdResponse -> earlybirdResponse.isSetTermStatisticsResults()); - - ThriftTermStatisticsResults results = - new ThriftTermResultsMerger( - termStatsResults, - termStatisticsRequest.getHistogramSettings()) - .merge(); - - if (results.getTermResults().isEmpty()) { - final String line = "No results returned from any backend for term statistics request: {}"; - - // If the termstats request was not empty and we got empty results. log it as a warning - // otherwise log is as a debug. 
- if (termStatisticsRequest.getTermRequestsSize() > 0) { - LOG.warn(line, termStatisticsRequest); - } else { - LOG.debug(line, termStatisticsRequest); - } - } - - termStatsResponse.setTermStatisticsResults(results); - termStatsResponse.setSearchResults(ThriftTermResultsMerger.mergeSearchStats(termStatsResults)); - - FacetsResultsUtils.fixNativePhotoUrl(results.getTermResults().values()); - - LOG.debug("TermStats call completed successfully: {}", termStatsResponse); - - return termStatsResponse; - } - - @Override - public boolean shouldEarlyTerminateTierMerge(int totalResultsFromSuccessfulShards, - boolean foundEarlyTermination) { - // To get accurate term stats, must never early terminate - return false; - } -} diff --git a/src/java/com/twitter/search/earlybird_root/mergers/ThriftTermResultsMerger.docx b/src/java/com/twitter/search/earlybird_root/mergers/ThriftTermResultsMerger.docx new file mode 100644 index 000000000..51357daa8 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/mergers/ThriftTermResultsMerger.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/mergers/ThriftTermResultsMerger.java b/src/java/com/twitter/search/earlybird_root/mergers/ThriftTermResultsMerger.java deleted file mode 100644 index ccfa54aff..000000000 --- a/src/java/com/twitter/search/earlybird_root/mergers/ThriftTermResultsMerger.java +++ /dev/null @@ -1,472 +0,0 @@ -package com.twitter.search.earlybird_root.mergers; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import java.util.Map; - -import javax.annotation.Nonnull; -import javax.annotation.Nullable; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.metrics.SearchCounter; -import 
com.twitter.search.common.util.earlybird.FacetsResultsUtils; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.EarlybirdResponseCode; -import com.twitter.search.earlybird.thrift.ThriftHistogramSettings; -import com.twitter.search.earlybird.thrift.ThriftSearchResults; -import com.twitter.search.earlybird.thrift.ThriftTermRequest; -import com.twitter.search.earlybird.thrift.ThriftTermResults; -import com.twitter.search.earlybird.thrift.ThriftTermStatisticsResults; - -/** - * Takes multiple successful EarlybirdResponses and merges them. - */ -public class ThriftTermResultsMerger { - private static final Logger LOG = LoggerFactory.getLogger(ThriftTermResultsMerger.class); - - private static final SearchCounter BIN_ID_GAP_COUNTER = - SearchCounter.export("thrift_term_results_merger_found_gap_in_bin_ids"); - private static final SearchCounter MIN_COMPLETE_BIN_ID_ADJUSTED_NULL = - SearchCounter.export("thrift_term_results_merger_min_complete_bin_id_adjusted_null"); - private static final SearchCounter MIN_COMPLETE_BIN_ID_NULL_WITHOUT_BINS = - SearchCounter.export("thrift_term_results_merger_min_complete_bin_id_null_without_bins"); - private static final SearchCounter MIN_COMPLETE_BIN_ID_OUT_OF_RANGE = - SearchCounter.export("thrift_term_results_merger_min_complete_bin_id_out_of_range"); - private static final SearchCounter RESPONSE_WITHOUT_DRIVING_QUERY_HIT = - SearchCounter.export("response_without_driving_query_hit"); - - private static final ThriftTermRequest GLOBAL_COUNT_REQUEST = - new ThriftTermRequest().setFieldName("").setTerm(""); - - /** - * Sorted list of the most recent (and contiguous) numBins binIds across all responses. - * Expected to be an empty list if this request did not ask for histograms, or if it - * did ask for histograms for 0 numBins. - */ - @Nonnull - private final List mostRecentBinIds; - /** - * The first binId in the {@link #mostRecentBinIds} list. 
This value is not meant to be used in - * case mostRecentBinIds is an empty list. - */ - private final int firstBinId; - - /** - * For each unique ThriftTermRequest, stores an array of the total counts for all the binIds - * that we will return, summed up across all earlybird responses. - * - * The values in each totalCounts array correspond to the binIds in the - * {@link #mostRecentBinIds} list. - * - * Key: thrift term request. - * Value: array of the total counts summed up across all earlybird responses for the key's - * term request, corresponding to the binIds in {@link #mostRecentBinIds}. - */ - private final Map mergedTermRequestTotalCounts = Maps.newHashMap(); - /** - * The set of all unique binIds that we are merging. - */ - private final Map termResultsMap = Maps.newHashMap(); - private final ThriftHistogramSettings histogramSettings; - - /** - * Only relevant for merging responses with histogram settings. - * This will be null either if (1) the request is not asking for histograms at all, or if - * (2) numBins was set to 0 (and no bin can be considered complete). - * If not null, the minCompleteBinId will be computed as the max over all merged responses' - * minCompleteBinId's. - */ - @Nullable - private final Integer minCompleteBinId; - - /** - * Create merger with collections of results to merge - */ - public ThriftTermResultsMerger(Collection termStatsResults, - ThriftHistogramSettings histogramSettings) { - this.histogramSettings = histogramSettings; - - Collection filteredTermStatsResults = - filterOutEmptyEarlybirdResponses(termStatsResults); - - this.mostRecentBinIds = findMostRecentBinIds(histogramSettings, filteredTermStatsResults); - this.firstBinId = mostRecentBinIds.isEmpty() - ? Integer.MAX_VALUE // Should not be used if mostRecentBinIds is empty. 
- : mostRecentBinIds.get(0); - - List minCompleteBinIds = - Lists.newArrayListWithCapacity(filteredTermStatsResults.size()); - for (EarlybirdResponse response : filteredTermStatsResults) { - Preconditions.checkState(response.getResponseCode() == EarlybirdResponseCode.SUCCESS, - "Unsuccessful responses should not be given to ThriftTermResultsMerger."); - Preconditions.checkState(response.getTermStatisticsResults() != null, - "Response given to ThriftTermResultsMerger has no termStatisticsResults."); - - ThriftTermStatisticsResults termStatisticsResults = response.getTermStatisticsResults(); - List binIds = termStatisticsResults.getBinIds(); - - for (Map.Entry entry - : termStatisticsResults.getTermResults().entrySet()) { - ThriftTermRequest termRequest = entry.getKey(); - ThriftTermResults termResults = entry.getValue(); - - adjustTotalCount(termResults, binIds); - addTotalCountData(termRequest, termResults); - - if (histogramSettings != null) { - Preconditions.checkState(termStatisticsResults.isSetBinIds()); - addHistogramData(termRequest, termResults, termStatisticsResults.getBinIds()); - } - } - - if (histogramSettings != null) { - addMinCompleteBinId(minCompleteBinIds, response); - } - } - - minCompleteBinId = minCompleteBinIds.isEmpty() ? null : Collections.max(minCompleteBinIds); - } - - /** - * Take out any earlybird responses that we know did not match anything relevant to the query, - * and may have erroneous binIds. - */ - private Collection filterOutEmptyEarlybirdResponses( - Collection termStatsResults) { - List emptyResponses = Lists.newArrayList(); - List nonEmptyResponses = Lists.newArrayList(); - for (EarlybirdResponse response : termStatsResults) { - // Guard against erroneously merging and returning 0 counts when we actually have data to - // return from other partitions. 
- // When a query doesn't match anything at all on an earlybird, the binIds that are returned - // do not correspond at all to the actual query, and are just based on the data range on the - // earlybird itself. - // We can identify these responses as (1) being non-early terminated, and (2) having 0 - // hits processed. - if (isTermStatResponseEmpty(response)) { - emptyResponses.add(response); - } else { - nonEmptyResponses.add(response); - } - } - - // If all responses were "empty", we will just use those to merge into a new set of empty - // responses, using the binIds provided. - return nonEmptyResponses.isEmpty() ? emptyResponses : nonEmptyResponses; - } - - private boolean isTermStatResponseEmpty(EarlybirdResponse response) { - return response.isSetSearchResults() - && (response.getSearchResults().getNumHitsProcessed() == 0 - || drivingQueryHasNoHits(response)) - && response.isSetEarlyTerminationInfo() - && !response.getEarlyTerminationInfo().isEarlyTerminated(); - } - - /** - * If the global count bins are all 0, then we know the driving query has no hits. - * This check is added as a short term solution for SEARCH-5476. This short term fix requires - * the client to set the includeGlobalCounts to kick in. - */ - private boolean drivingQueryHasNoHits(EarlybirdResponse response) { - ThriftTermStatisticsResults termStatisticsResults = response.getTermStatisticsResults(); - if (termStatisticsResults == null || termStatisticsResults.getTermResults() == null) { - // If there's no term stats response, be conservative and return false. - return false; - } else { - ThriftTermResults globalCounts = - termStatisticsResults.getTermResults().get(GLOBAL_COUNT_REQUEST); - if (globalCounts == null) { - // We cannot tell if driving query has no hits, be conservative and return false. 
- return false; - } else { - for (Integer i : globalCounts.getHistogramBins()) { - if (i > 0) { - return false; - } - } - RESPONSE_WITHOUT_DRIVING_QUERY_HIT.increment(); - return true; - } - } - } - - private static List findMostRecentBinIds( - ThriftHistogramSettings histogramSettings, - Collection filteredTermStatsResults) { - Integer largestFirstBinId = null; - List binIdsToUse = null; - - if (histogramSettings != null) { - int numBins = histogramSettings.getNumBins(); - for (EarlybirdResponse response : filteredTermStatsResults) { - ThriftTermStatisticsResults termStatisticsResults = response.getTermStatisticsResults(); - Preconditions.checkState(termStatisticsResults.getBinIds().size() == numBins, - "expected all results to have the same numBins. " - + "request numBins: %s, response numBins: %s", - numBins, termStatisticsResults.getBinIds().size()); - - if (termStatisticsResults.getBinIds().size() > 0) { - Integer firstBinId = termStatisticsResults.getBinIds().get(0); - if (largestFirstBinId == null - || largestFirstBinId.intValue() < firstBinId.intValue()) { - largestFirstBinId = firstBinId; - binIdsToUse = termStatisticsResults.getBinIds(); - } - } - } - } - return binIdsToUse == null - ? Collections.emptyList() - // Just in case, make a copy of the binIds so that we don't reuse the same list from one - // of the responses we're merging. - : Lists.newArrayList(binIdsToUse); - } - - private void addMinCompleteBinId(List minCompleteBinIds, - EarlybirdResponse response) { - Preconditions.checkNotNull(histogramSettings); - ThriftTermStatisticsResults termStatisticsResults = response.getTermStatisticsResults(); - - if (termStatisticsResults.isSetMinCompleteBinId()) { - // This is the base case. Early terminated or not, this is the proper minCompleteBinId - // that we're told to use for this response. 
- minCompleteBinIds.add(termStatisticsResults.getMinCompleteBinId()); - } else if (termStatisticsResults.getBinIds().size() > 0) { - // This is the case where no bins were complete. For the purposes of merging, we need to - // mark all the binIds in this response as non-complete by marking the "max(binId)+1" as the - // last complete bin. - // When returning the merged response, we still have a guard for the resulting - // minCompleteBinId being outside of the binIds range, and will set the returned - // minCompleteBinId value to null, if this response's binIds end up being used as the most - // recent ones, and we need to signify that none of the bins are complete. - int binSize = termStatisticsResults.getBinIds().size(); - Integer maxBinId = termStatisticsResults.getBinIds().get(binSize - 1); - minCompleteBinIds.add(maxBinId + 1); - - LOG.debug("Adjusting null minCompleteBinId for response: {}, histogramSettings {}", - response, histogramSettings); - MIN_COMPLETE_BIN_ID_ADJUSTED_NULL.increment(); - } else { - // This should only happen in the case where numBins is set to 0. - Preconditions.checkState(histogramSettings.getNumBins() == 0, - "Expected numBins set to 0. 
response: %s", response); - Preconditions.checkState(minCompleteBinIds.isEmpty(), - "minCompleteBinIds: %s", minCompleteBinIds); - - LOG.debug("Got null minCompleteBinId with no bins for response: {}, histogramSettings {}", - response, histogramSettings); - MIN_COMPLETE_BIN_ID_NULL_WITHOUT_BINS.increment(); - } - } - - private void addTotalCountData(ThriftTermRequest request, ThriftTermResults results) { - ThriftTermResults termResults = termResultsMap.get(request); - if (termResults == null) { - termResultsMap.put(request, results); - } else { - termResults.setTotalCount(termResults.getTotalCount() + results.getTotalCount()); - if (termResults.isSetMetadata()) { - termResults.setMetadata( - FacetsResultsUtils.mergeFacetMetadata(termResults.getMetadata(), - results.getMetadata(), null)); - } - } - } - - /** - * Set results.totalCount to the sum of hits in only the bins that will be returned in - * the merged response. - */ - private void adjustTotalCount(ThriftTermResults results, List binIds) { - int adjustedTotalCount = 0; - List histogramBins = results.getHistogramBins(); - if ((binIds != null) && (histogramBins != null)) { - Preconditions.checkState( - histogramBins.size() == binIds.size(), - "Expected ThriftTermResults to have the same number of histogramBins as binIds set in " - + " ThriftTermStatisticsResults. 
ThriftTermResults.histogramBins: %s, " - + " ThriftTermStatisticsResults.binIds: %s.", - histogramBins, binIds); - for (int i = 0; i < binIds.size(); ++i) { - if (binIds.get(i) >= firstBinId) { - adjustedTotalCount += histogramBins.get(i); - } - } - } - - results.setTotalCount(adjustedTotalCount); - } - - private void addHistogramData(ThriftTermRequest request, - ThriftTermResults results, - List binIds) { - - int[] requestTotalCounts = mergedTermRequestTotalCounts.get(request); - if (requestTotalCounts == null) { - requestTotalCounts = new int[mostRecentBinIds.size()]; - mergedTermRequestTotalCounts.put(request, requestTotalCounts); - } - - // Only consider these results if they fall into the mostRecentBinIds range. - // - // The list of returned binIds is expected to be both sorted (in ascending order), and - // contiguous, which allows us to use firstBinId to check if it overlaps with the - // mostRecentBinIds range. - if (binIds.size() > 0 && binIds.get(binIds.size() - 1) >= firstBinId) { - int firstBinIndex; - if (binIds.get(0) == firstBinId) { - // This should be the common case when all partitions have the same binIds, - // no need to do a binary search. - firstBinIndex = 0; - } else { - // The firstBinId must be in the binIds range. We can find it using binary search since - // binIds are sorted. - firstBinIndex = Collections.binarySearch(binIds, firstBinId); - Preconditions.checkState(firstBinIndex >= 0, - "Expected to find firstBinId (%s) in the result binIds: %s, " - + "histogramSettings: %s, termRequest: %s", - firstBinId, binIds, histogramSettings, request); - } - - // Skip binIds that are before the smallest binId that we will use in the merged results. 
- for (int i = firstBinIndex; i < binIds.size(); i++) { - final Integer currentBinValue = results.getHistogramBins().get(i); - requestTotalCounts[i - firstBinIndex] += currentBinValue.intValue(); - } - } - } - - /** - * Return a new ThriftTermStatisticsResults with the total counts merged, and if enabled, - * histogram bins merged. - */ - public ThriftTermStatisticsResults merge() { - ThriftTermStatisticsResults results = new ThriftTermStatisticsResults(termResultsMap); - - if (histogramSettings != null) { - mergeHistogramBins(results); - } - - return results; - } - - - /** - * Takes multiple histogram results and merges them so: - * 1) Counts for the same binId (represents the time) and term are summed - * 2) All results are re-indexed to use the most recent bins found from the union of all bins - */ - private void mergeHistogramBins(ThriftTermStatisticsResults mergedResults) { - - mergedResults.setBinIds(mostRecentBinIds); - mergedResults.setHistogramSettings(histogramSettings); - - setMinCompleteBinId(mergedResults); - - useMostRecentBinsForEachThriftTermResults(); - } - - private void setMinCompleteBinId(ThriftTermStatisticsResults mergedResults) { - if (mostRecentBinIds.isEmpty()) { - Preconditions.checkState(minCompleteBinId == null); - // This is the case where the requested numBins is set to 0. We don't have any binIds, - // and the minCompleteBinId has to be unset. - LOG.debug("Empty binIds returned for mergedResults: {}", mergedResults); - } else { - Preconditions.checkNotNull(minCompleteBinId); - - Integer maxBinId = mostRecentBinIds.get(mostRecentBinIds.size() - 1); - if (minCompleteBinId <= maxBinId) { - mergedResults.setMinCompleteBinId(minCompleteBinId); - } else { - // Leaving the minCompleteBinId unset as it is outside the range of the returned binIds. 
- LOG.debug("Computed minCompleteBinId: {} is out of maxBinId: {} for mergedResults: {}", - minCompleteBinId, mergedResults); - MIN_COMPLETE_BIN_ID_OUT_OF_RANGE.increment(); - } - } - } - - /** - * Check that the binIds we are using are contiguous. Increment the provided stat if we find - * a gap, as we don't expect to find any. - * See: SEARCH-4362 - * - * @param sortedBinIds most recent numBins sorted binIds. - * @param binIdGapCounter stat to increment if we see a gap in the binId range. - */ - @VisibleForTesting - static void checkForBinIdGaps(List sortedBinIds, SearchCounter binIdGapCounter) { - for (int i = sortedBinIds.size() - 1; i > 0; i--) { - final Integer currentBinId = sortedBinIds.get(i); - final Integer previousBinId = sortedBinIds.get(i - 1); - - if (previousBinId < currentBinId - 1) { - binIdGapCounter.increment(); - break; - } - } - } - - /** - * Returns a view containing only the last N items from the list - */ - private static List takeLastN(List lst, int n) { - Preconditions.checkArgument(n <= lst.size(), - "Attempting to take more elements than the list has. List size: %s, n: %s", lst.size(), n); - return lst.subList(lst.size() - n, lst.size()); - } - - private void useMostRecentBinsForEachThriftTermResults() { - for (Map.Entry entry : termResultsMap.entrySet()) { - ThriftTermRequest request = entry.getKey(); - ThriftTermResults results = entry.getValue(); - - List histogramBins = Lists.newArrayList(); - results.setHistogramBins(histogramBins); - - int[] requestTotalCounts = mergedTermRequestTotalCounts.get(request); - Preconditions.checkNotNull(requestTotalCounts); - - for (int totalCount : requestTotalCounts) { - histogramBins.add(totalCount); - } - } - } - - /** - * Merges search stats from several earlybird responses and puts them in - * {@link ThriftSearchResults} structure. 
- * - * @param responses earlybird responses to merge the search stats from - * @return merged search stats inside of {@link ThriftSearchResults} structure - */ - public static ThriftSearchResults mergeSearchStats(Collection responses) { - int numHitsProcessed = 0; - int numPartitionsEarlyTerminated = 0; - - for (EarlybirdResponse response : responses) { - ThriftSearchResults searchResults = response.getSearchResults(); - - if (searchResults != null) { - numHitsProcessed += searchResults.getNumHitsProcessed(); - numPartitionsEarlyTerminated += searchResults.getNumPartitionsEarlyTerminated(); - } - } - - ThriftSearchResults searchResults = new ThriftSearchResults(new ArrayList<>()); - searchResults.setNumHitsProcessed(numHitsProcessed); - searchResults.setNumPartitionsEarlyTerminated(numPartitionsEarlyTerminated); - return searchResults; - } -} diff --git a/src/java/com/twitter/search/earlybird_root/mergers/TierResponseAccumulator.docx b/src/java/com/twitter/search/earlybird_root/mergers/TierResponseAccumulator.docx new file mode 100644 index 000000000..8a32b636a Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/mergers/TierResponseAccumulator.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/mergers/TierResponseAccumulator.java b/src/java/com/twitter/search/earlybird_root/mergers/TierResponseAccumulator.java deleted file mode 100644 index 58b7cb877..000000000 --- a/src/java/com/twitter/search/earlybird_root/mergers/TierResponseAccumulator.java +++ /dev/null @@ -1,97 +0,0 @@ -package com.twitter.search.earlybird_root.mergers; - -import java.util.ArrayList; -import java.util.List; - -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.EarlybirdResponseCode; -import com.twitter.search.earlybird.thrift.TierResponse; - -public final class TierResponseAccumulator extends ResponseAccumulator { - private static final String TARGET_TYPE_TIER = "tier"; - - private final List 
tierResponses = new ArrayList<>(); - // Total number of partitions the request was sent to, across all tiers. - private int totalPartitionsQueriedInAllTiers = 0; - // Among the above partitions, the number of them that returned successful responses. - private int totalSuccessfulPartitionsInAllTiers = 0; - - @Override - public String getNameForLogging(int responseIndex, int numTotalResponses) { - return TARGET_TYPE_TIER + (numTotalResponses - responseIndex); - } - - @Override - public String getNameForEarlybirdResponseCodeStats(int responseIndex, int numTotalResponses) { - return TARGET_TYPE_TIER + (numTotalResponses - responseIndex); - } - - @Override - protected boolean isMergingAcrossTiers() { - return true; - } - - @Override - public boolean shouldEarlyTerminateMerge(EarlyTerminateTierMergePredicate merger) { - if (foundError()) { - return true; - } - - int numResults = 0; - for (EarlybirdResponse resp : getSuccessResponses()) { - if (resp.isSetSearchResults()) { - numResults += resp.getSearchResults().getResultsSize(); - } - } - - return merger.shouldEarlyTerminateTierMerge(numResults, foundEarlyTermination()); - } - - @Override - public void handleSkippedResponse(EarlybirdResponseCode responseCode) { - tierResponses.add(new TierResponse() - .setNumPartitions(0) - .setNumSuccessfulPartitions(0) - .setTierResponseCode(responseCode)); - } - - @Override - public void handleErrorResponse(EarlybirdResponse response) { - // TierResponse, which is only returned if merging results from different tiers. 
- TierResponse tr = new TierResponse(); - if (response != null) { - if (response.isSetResponseCode()) { - tr.setTierResponseCode(response.getResponseCode()); - } else { - tr.setTierResponseCode(EarlybirdResponseCode.TRANSIENT_ERROR); - } - tr.setNumPartitions(response.getNumPartitions()); - tr.setNumSuccessfulPartitions(0); - totalPartitionsQueriedInAllTiers += response.getNumPartitions(); - } else { - tr.setTierResponseCode(EarlybirdResponseCode.TRANSIENT_ERROR) - .setNumPartitions(0) - .setNumSuccessfulPartitions(0); - } - - tierResponses.add(tr); - } - - @Override - public AccumulatedResponses.PartitionCounts getPartitionCounts() { - return new AccumulatedResponses.PartitionCounts(totalPartitionsQueriedInAllTiers, - totalSuccessfulPartitionsInAllTiers, tierResponses); - } - - @Override - public void extraSuccessfulResponseHandler(EarlybirdResponse response) { - // Record tier stats. - totalPartitionsQueriedInAllTiers += response.getNumPartitions(); - totalSuccessfulPartitionsInAllTiers += response.getNumSuccessfulPartitions(); - - tierResponses.add(new TierResponse() - .setNumPartitions(response.getNumPartitions()) - .setNumSuccessfulPartitions(response.getNumSuccessfulPartitions()) - .setTierResponseCode(EarlybirdResponseCode.SUCCESS)); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/mergers/TopTweetsResponseMerger.docx b/src/java/com/twitter/search/earlybird_root/mergers/TopTweetsResponseMerger.docx new file mode 100644 index 000000000..48e53cbc1 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/mergers/TopTweetsResponseMerger.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/mergers/TopTweetsResponseMerger.java b/src/java/com/twitter/search/earlybird_root/mergers/TopTweetsResponseMerger.java deleted file mode 100644 index 5d76ab4cd..000000000 --- a/src/java/com/twitter/search/earlybird_root/mergers/TopTweetsResponseMerger.java +++ /dev/null @@ -1,65 +0,0 @@ -package 
com.twitter.search.earlybird_root.mergers; - -import java.util.List; -import java.util.concurrent.TimeUnit; - -import com.google.common.base.Preconditions; - -import com.twitter.search.common.metrics.SearchTimerStats; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.ThriftSearchQuery; -import com.twitter.search.earlybird.thrift.ThriftSearchRankingMode; -import com.twitter.search.earlybird.thrift.ThriftSearchResults; -import com.twitter.search.earlybird_root.collectors.RelevanceMergeCollector; -import com.twitter.search.earlybird_root.common.EarlybirdRequestContext; -import com.twitter.util.Future; - -/** - * Merger class to merge toptweets EarlybirdResponse objects - */ -public class TopTweetsResponseMerger extends EarlybirdResponseMerger { - - private static final double SUCCESSFUL_RESPONSE_THRESHOLD = 0.9; - - private static final SearchTimerStats TIMER = - SearchTimerStats.export("merge_top_tweets", TimeUnit.NANOSECONDS, false, true); - - public TopTweetsResponseMerger(EarlybirdRequestContext requestContext, - List> responses, - ResponseAccumulator mode) { - super(requestContext, responses, mode); - } - - @Override - protected SearchTimerStats getMergedResponseTimer() { - return TIMER; - } - - @Override - protected double getDefaultSuccessResponseThreshold() { - return SUCCESSFUL_RESPONSE_THRESHOLD; - } - - @Override - protected EarlybirdResponse internalMerge(EarlybirdResponse mergedResponse) { - final ThriftSearchQuery searchQuery = requestContext.getRequest().getSearchQuery(); - - Preconditions.checkNotNull(searchQuery); - Preconditions.checkState(searchQuery.isSetRankingMode()); - Preconditions.checkState(searchQuery.getRankingMode() == ThriftSearchRankingMode.TOPTWEETS); - - int numResultsRequested = computeNumResultsToKeep(); - - RelevanceMergeCollector collector = new RelevanceMergeCollector(responses.size()); - - addResponsesToCollector(collector); - ThriftSearchResults searchResults = 
collector.getAllSearchResults(); - if (numResultsRequested < searchResults.getResults().size()) { - searchResults.setResults(searchResults.getResults().subList(0, numResultsRequested)); - } - - mergedResponse.setSearchResults(searchResults); - - return mergedResponse; - } -} diff --git a/src/java/com/twitter/search/earlybird_root/mergers/TrimStats.docx b/src/java/com/twitter/search/earlybird_root/mergers/TrimStats.docx new file mode 100644 index 000000000..445201c85 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/mergers/TrimStats.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/mergers/TrimStats.java b/src/java/com/twitter/search/earlybird_root/mergers/TrimStats.java deleted file mode 100644 index 284f3bc1b..000000000 --- a/src/java/com/twitter/search/earlybird_root/mergers/TrimStats.java +++ /dev/null @@ -1,71 +0,0 @@ -package com.twitter.search.earlybird_root.mergers; - -/** - * Tracks what situations are encountered when trimming results - */ -class TrimStats { - protected static final TrimStats EMPTY_STATS = new TrimStats(); - - private int maxIdFilterCount = 0; - private int minIdFilterCount = 0; - private int removedDupsCount = 0; - private int resultsTruncatedFromTailCount = 0; - - int getMinIdFilterCount() { - return minIdFilterCount; - } - - int getRemovedDupsCount() { - return removedDupsCount; - } - - int getResultsTruncatedFromTailCount() { - return resultsTruncatedFromTailCount; - } - - void decreaseMaxIdFilterCount() { - maxIdFilterCount--; - } - - void decreaseMinIdFilterCount() { - minIdFilterCount--; - } - - public void clearMaxIdFilterCount() { - this.maxIdFilterCount = 0; - } - - public void clearMinIdFilterCount() { - this.minIdFilterCount = 0; - } - - void increaseMaxIdFilterCount() { - maxIdFilterCount++; - } - - void increaseMinIdFilterCount() { - minIdFilterCount++; - } - - void increaseRemovedDupsCount() { - removedDupsCount++; - } - - void setResultsTruncatedFromTailCount(int 
resultsTruncatedFromTailCount) { - this.resultsTruncatedFromTailCount = resultsTruncatedFromTailCount; - } - - @Override - public String toString() { - StringBuilder builder = new StringBuilder(); - - builder.append("TrimStats{"); - builder.append("maxIdFilterCount=").append(maxIdFilterCount); - builder.append(", minIdFilterCount=").append(minIdFilterCount); - builder.append(", removedDupsCount=").append(removedDupsCount); - builder.append(", resultsTruncatedFromTailCount=").append(resultsTruncatedFromTailCount); - builder.append("}"); - - return builder.toString(); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/quota/BUILD b/src/java/com/twitter/search/earlybird_root/quota/BUILD deleted file mode 100644 index 8f81a89fa..000000000 --- a/src/java/com/twitter/search/earlybird_root/quota/BUILD +++ /dev/null @@ -1,15 +0,0 @@ -java_library( - sources = ["*.java"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/commons-io", - "3rdparty/jvm/org/json", - "src/java/com/twitter/common/util:system-mocks", - "src/java/com/twitter/search/common/dark", - "src/java/com/twitter/search/common/metrics", - "src/java/com/twitter/search/common/util/io/periodic", - "src/java/com/twitter/search/common/util/json", - ], -) diff --git a/src/java/com/twitter/search/earlybird_root/quota/BUILD.docx b/src/java/com/twitter/search/earlybird_root/quota/BUILD.docx new file mode 100644 index 000000000..761e07937 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/quota/BUILD.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/quota/ClientIdQuotaManager.docx b/src/java/com/twitter/search/earlybird_root/quota/ClientIdQuotaManager.docx new file mode 100644 index 000000000..4c33b40a2 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/quota/ClientIdQuotaManager.docx differ diff --git 
a/src/java/com/twitter/search/earlybird_root/quota/ClientIdQuotaManager.java b/src/java/com/twitter/search/earlybird_root/quota/ClientIdQuotaManager.java deleted file mode 100644 index 2a5723a3d..000000000 --- a/src/java/com/twitter/search/earlybird_root/quota/ClientIdQuotaManager.java +++ /dev/null @@ -1,22 +0,0 @@ -package com.twitter.search.earlybird_root.quota; - -import java.util.Optional; - -/** A manager that determines how quota restrictions should be applied for each client. */ -public interface ClientIdQuotaManager { - /** - * Returns the quota for the given client, if one is set. - * - * @param clientId The ID of the client. - * @return The quota for the given client (in requests per second), if one is set. - */ - Optional getQuotaForClient(String clientId); - - /** - * Returns the common pool quota. A common pool quota must always be set. - * - * @return The common pool quota (in requests per second). - */ - QuotaInfo getCommonPoolQuota(); - -} diff --git a/src/java/com/twitter/search/earlybird_root/quota/ConfigBasedQuotaConfig.docx b/src/java/com/twitter/search/earlybird_root/quota/ConfigBasedQuotaConfig.docx new file mode 100644 index 000000000..d110bc1b6 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/quota/ConfigBasedQuotaConfig.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/quota/ConfigBasedQuotaConfig.java b/src/java/com/twitter/search/earlybird_root/quota/ConfigBasedQuotaConfig.java deleted file mode 100644 index 6565fdae6..000000000 --- a/src/java/com/twitter/search/earlybird_root/quota/ConfigBasedQuotaConfig.java +++ /dev/null @@ -1,161 +0,0 @@ -package com.twitter.search.earlybird_root.quota; - -import java.io.IOException; -import java.io.InputStream; -import java.nio.charset.StandardCharsets; -import java.util.Iterator; -import java.util.Map; -import java.util.Optional; -import java.util.concurrent.ScheduledExecutorService; -import java.util.concurrent.atomic.AtomicReference; - -import 
com.google.common.annotations.VisibleForTesting; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.Maps; - -import org.apache.commons.io.IOUtils; -import org.json.JSONException; -import org.json.JSONObject; - -import com.twitter.common.util.Clock; -import com.twitter.search.common.metrics.SearchLongGauge; -import com.twitter.search.common.util.io.periodic.PeriodicFileLoader; -import com.twitter.search.common.util.json.JSONParsingUtil; - -/** - * Periodically loads a json serialized map that contains the quota information indexed by - * client id. - * - * Each json object from the map is required to have an int property that represents a client's quota. - * The key for the quota property is passed to this class. - * - * Optionally it can have a should_enforce property of type boolean - * - * If this two properties are not present an exception will be thrown. - */ -public class ConfigBasedQuotaConfig extends PeriodicFileLoader { - private static final String UNSET_EMAIL = "unset"; - - private static final String PER_CLIENT_QUOTA_GAUGE_NAME_PATTERN = - "config_based_quota_for_client_id_%s"; - private static final String PER_EMAIL_QUOTA_GAUGE_NAME_PATTERN = - "config_based_quota_for_email_%s"; - - @VisibleForTesting - static final SearchLongGauge TOTAL_QUOTA = - SearchLongGauge.export("total_config_based_quota"); - - @VisibleForTesting - static final SearchLongGauge ENTRIES_COUNT = - SearchLongGauge.export("config_repo_quota_config_entries_count"); - - private final AtomicReference> clientQuotas = - new AtomicReference<>(); - - private String clientQuotaKey; - private boolean requireQuotaConfigForClients; - - /** - * Creates the object that manages loads the config from: quotaConfigPath. It periodically - * reloads the config file using the given executor service. - * - * @param quotaConfigPath Path to configuration file. - * @param executorService ScheduledExecutorService to be used for periodically reloading the file. 
- * @param clientQuotaKey The key that will be used to extract client quotas. - * @param requireQuotaConfigForClients Determines whether a client can be skipped - * if the associated object is missing the quota key - * (ie a client that is a SuperRoot client but the current service is Archive) - */ - public static ConfigBasedQuotaConfig newConfigBasedQuotaConfig( - String quotaConfigPath, - String clientQuotaKey, - boolean requireQuotaConfigForClients, - ScheduledExecutorService executorService, - Clock clock - ) throws Exception { - ConfigBasedQuotaConfig configLoader = new ConfigBasedQuotaConfig( - quotaConfigPath, - clientQuotaKey, - requireQuotaConfigForClients, - executorService, - clock - ); - configLoader.init(); - return configLoader; - } - - public ConfigBasedQuotaConfig( - String quotaConfigPath, - String clientQuotaKey, - boolean requireQuotaConfigForClients, - ScheduledExecutorService executorService, - Clock clock - ) throws Exception { - super("quotaConfig", quotaConfigPath, executorService, clock); - this.clientQuotaKey = clientQuotaKey; - this.requireQuotaConfigForClients = requireQuotaConfigForClients; - } - - /** - * Returns the quota information for a specific client id. - */ - public Optional getQuotaForClient(String clientId) { - return Optional.ofNullable(clientQuotas.get().get(clientId)); - } - - /** - * Load the json format and store it in a map. 
- */ - @Override - protected void accept(InputStream fileStream) throws JSONException, IOException { - String fileContents = IOUtils.toString(fileStream, StandardCharsets.UTF_8); - JSONObject quotaConfig = new JSONObject(JSONParsingUtil.stripComments(fileContents)); - - Map perEmailQuotas = Maps.newHashMap(); - ImmutableMap.Builder quotasBuilder = new ImmutableMap.Builder<>(); - Iterator clientIds = quotaConfig.keys(); - - long totalQuota = 0; - while (clientIds.hasNext()) { - String clientId = clientIds.next(); - JSONObject clientQuota = quotaConfig.getJSONObject(clientId); - - // Skip clients that don't send requests to this service. - // (ie some SuperRoot clients are not Archive clients) - if (!requireQuotaConfigForClients && !clientQuota.has(clientQuotaKey)) { - continue; - } - - int quotaValue = clientQuota.getInt(clientQuotaKey); - boolean shouldEnforce = clientQuota.optBoolean("should_enforce", false); - String tierValue = clientQuota.optString("tier", QuotaInfo.DEFAULT_TIER_VALUE); - boolean archiveAccess = clientQuota.optBoolean("archive_access", - QuotaInfo.DEFAULT_ARCHIVE_ACCESS_VALUE); - String email = clientQuota.optString("email", UNSET_EMAIL); - - quotasBuilder.put( - clientId, - new QuotaInfo(clientId, email, quotaValue, shouldEnforce, tierValue, archiveAccess)); - - SearchLongGauge perClientQuota = SearchLongGauge.export( - String.format(PER_CLIENT_QUOTA_GAUGE_NAME_PATTERN, clientId)); - perClientQuota.set(quotaValue); - totalQuota += quotaValue; - - Integer emailQuota = perEmailQuotas.get(email); - if (emailQuota == null) { - emailQuota = 0; - } - perEmailQuotas.put(email, emailQuota + quotaValue); - } - - clientQuotas.set(quotasBuilder.build()); - TOTAL_QUOTA.set(totalQuota); - ENTRIES_COUNT.set(clientQuotas.get().size()); - - for (String email : perEmailQuotas.keySet()) { - SearchLongGauge.export(String.format(PER_EMAIL_QUOTA_GAUGE_NAME_PATTERN, email)).set( - perEmailQuotas.get(email)); - } - } -} diff --git 
a/src/java/com/twitter/search/earlybird_root/quota/ConfigRepoBasedQuotaManager.docx b/src/java/com/twitter/search/earlybird_root/quota/ConfigRepoBasedQuotaManager.docx new file mode 100644 index 000000000..d458799a2 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/quota/ConfigRepoBasedQuotaManager.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/quota/ConfigRepoBasedQuotaManager.java b/src/java/com/twitter/search/earlybird_root/quota/ConfigRepoBasedQuotaManager.java deleted file mode 100644 index a2f3b6e7e..000000000 --- a/src/java/com/twitter/search/earlybird_root/quota/ConfigRepoBasedQuotaManager.java +++ /dev/null @@ -1,65 +0,0 @@ -package com.twitter.search.earlybird_root.quota; - -import java.util.Optional; - -import javax.inject.Inject; - -import com.google.common.base.Preconditions; - -import com.twitter.search.common.dark.ServerSetResolver.SelfServerSetResolver; - -/** - * A config based implementation of the {@code ClientIdQuotaManager} interface. - * It uses a ConfigBasedQuotaConfig object to load the contents of the config. - */ -public class ConfigRepoBasedQuotaManager implements ClientIdQuotaManager { - - public static final String COMMON_POOL_CLIENT_ID = "common_pool"; - - private final ConfigBasedQuotaConfig quotaConfig; - private final SelfServerSetResolver serverSetResolver; - - /** Creates a new ConfigRepoBasedQuotaManager instance. 
*/ - @Inject - public ConfigRepoBasedQuotaManager( - SelfServerSetResolver serverSetResolver, - ConfigBasedQuotaConfig quotaConfig) { - Preconditions.checkNotNull(quotaConfig); - - this.quotaConfig = quotaConfig; - this.serverSetResolver = serverSetResolver; - } - - @Override - public Optional getQuotaForClient(String clientId) { - Optional quotaForClient = quotaConfig.getQuotaForClient(clientId); - - if (!quotaForClient.isPresent()) { - return Optional.empty(); - } - - QuotaInfo quota = quotaForClient.get(); - - int quotaValue = quota.getQuota(); - int rootInstanceCount = serverSetResolver.getServerSetSize(); - if (rootInstanceCount > 0) { - quotaValue = (int) Math.ceil((double) quotaValue / rootInstanceCount); - } - - return Optional.of( - new QuotaInfo( - quota.getQuotaClientId(), - quota.getQuotaEmail(), - quotaValue, - quota.shouldEnforceQuota(), - quota.getClientTier(), - quota.hasArchiveAccess())); - } - - @Override - public QuotaInfo getCommonPoolQuota() { - Optional commonPoolQuota = getQuotaForClient(COMMON_POOL_CLIENT_ID); - Preconditions.checkState(commonPoolQuota.isPresent()); - return commonPoolQuota.get(); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/quota/QuotaInfo.docx b/src/java/com/twitter/search/earlybird_root/quota/QuotaInfo.docx new file mode 100644 index 000000000..eed9ac48e Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/quota/QuotaInfo.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/quota/QuotaInfo.java b/src/java/com/twitter/search/earlybird_root/quota/QuotaInfo.java deleted file mode 100644 index d672f602b..000000000 --- a/src/java/com/twitter/search/earlybird_root/quota/QuotaInfo.java +++ /dev/null @@ -1,78 +0,0 @@ -package com.twitter.search.earlybird_root.quota; - -import com.google.common.base.Preconditions; - -/** - * Simple container of quota related information. 
- */ -public class QuotaInfo { - public static final String DEFAULT_TIER_VALUE = "no_tier"; - public static final boolean DEFAULT_ARCHIVE_ACCESS_VALUE = false; - - private final String quotaClientId; - private final String quotaEmail; - private final int quota; - private final boolean shouldEnforceQuota; - private final String clientTier; - private final boolean archiveAccess; - - /** - * Creates a new QuotaInfo object with the given clientId, quota and shouldEnforceQuota. - */ - public QuotaInfo( - String quotaClientId, - String quotaEmail, - int quota, - boolean shouldEnforceQuota, - String clientTier, - boolean archiveAccess) { - this.quotaClientId = Preconditions.checkNotNull(quotaClientId); - this.quotaEmail = Preconditions.checkNotNull(quotaEmail); - this.quota = quota; - this.shouldEnforceQuota = shouldEnforceQuota; - this.clientTier = Preconditions.checkNotNull(clientTier); - this.archiveAccess = archiveAccess; - } - - /** - * Returns the clientId for which we have the QuotaInfo. - */ - public String getQuotaClientId() { - return quotaClientId; - } - - /** - * Returns the email associated with this clientId. - */ - public String getQuotaEmail() { - return quotaEmail; - } - - /** - * Returns the integer based quota for the stored client id. - */ - public int getQuota() { - return quota; - } - - /** - * Returns whether the quota should be enforced or not. - */ - public boolean shouldEnforceQuota() { - return shouldEnforceQuota; - } - - /** - * Return tier info about the client. - */ - public String getClientTier() { - return clientTier; - } - - /** - * Returns whether the client has access to the full archive. 
- */ - public boolean hasArchiveAccess() { - return archiveAccess; - } -} diff --git a/src/java/com/twitter/search/earlybird_root/routers/AbstractRecencyAndRelevanceRequestRouter.docx b/src/java/com/twitter/search/earlybird_root/routers/AbstractRecencyAndRelevanceRequestRouter.docx new file mode 100644 index 000000000..58fc13096 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/routers/AbstractRecencyAndRelevanceRequestRouter.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/routers/AbstractRecencyAndRelevanceRequestRouter.java b/src/java/com/twitter/search/earlybird_root/routers/AbstractRecencyAndRelevanceRequestRouter.java deleted file mode 100644 index bf4154e1a..000000000 --- a/src/java/com/twitter/search/earlybird_root/routers/AbstractRecencyAndRelevanceRequestRouter.java +++ /dev/null @@ -1,442 +0,0 @@ -package com.twitter.search.earlybird_root.routers; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - -import com.google.common.base.Preconditions; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.util.Clock; -import com.twitter.finagle.Service; -import com.twitter.search.common.decider.SearchDecider; -import com.twitter.search.common.futures.Futures; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.util.earlybird.EarlybirdResponseMergeUtil; -import com.twitter.search.earlybird.thrift.AdjustedRequestParams; -import com.twitter.search.earlybird.thrift.EarlybirdRequest; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.EarlybirdResponseCode; -import com.twitter.search.earlybird.thrift.ThriftSearchQuery; -import com.twitter.search.earlybird.thrift.ThriftSearchRankingMode; -import com.twitter.search.earlybird.thrift.ThriftSearchResults; -import com.twitter.search.earlybird_root.common.ClientErrorException; -import 
com.twitter.search.earlybird_root.common.EarlybirdFeatureSchemaMerger; -import com.twitter.search.earlybird_root.common.EarlybirdRequestContext; -import com.twitter.search.earlybird_root.common.EarlybirdRequestUtil; -import com.twitter.search.earlybird_root.common.EarlybirdServiceResponse; -import com.twitter.search.earlybird_root.filters.EarlybirdTimeRangeFilter; -import com.twitter.search.earlybird_root.mergers.SuperRootResponseMerger; -import com.twitter.search.queryparser.util.QueryUtil; -import com.twitter.util.Function; -import com.twitter.util.Function0; -import com.twitter.util.Future; - -/** - * For Recency traffic SuperRoot hits realtime and/or protected realtime first and then archive - */ -public abstract class AbstractRecencyAndRelevanceRequestRouter extends RequestRouter { - public static final String FULL_ARCHIVE_AVAILABLE_FOR_GET_PROTECTED_TWEETS_ONLY_DECIDER_KEY = - "superroot_full_archive_cluster_available_for_get_protected_tweets_only_requests"; - public static final String FULL_ARCHIVE_AVAILABLE_FOR_NOT_ENOUGH_PROTECTED_RESULTS_DECIDER_KEY = - "superroot_full_archive_cluster_available_for_requests_without_enough_protected_results"; - - private static final Logger LOG = - LoggerFactory.getLogger(AbstractRecencyAndRelevanceRequestRouter.class); - - private final String skipProtectedClusterDeciderKey; - private final String skipFullArchiveClusterDeciderKey; - - private final SearchCounter realtimeResponseInvalidCounter; - private final SearchCounter realtimeResponseSearchResultsNotSetCounter; - private final SearchCounter minSearchedStatusIdLargerThanRequestMaxIdCounter; - private final SearchCounter minSearchedStatusIdLargerThanRequestUntilTimeCounter; - - private final Service realtime; - private final Service protectedRealtime; - private final Service fullArchive; - private final SuperRootResponseMerger responseMerger; - private final SearchDecider decider; - - AbstractRecencyAndRelevanceRequestRouter( - Service realtime, - Service 
protectedRealtime, - Service fullArchive, - EarlybirdTimeRangeFilter realtimeTimeRangeFilter, - EarlybirdTimeRangeFilter protectedTimeRangeFilter, - EarlybirdTimeRangeFilter fullArchiveTimeRangeFilter, - ThriftSearchRankingMode rankingMode, - Clock clock, - SearchDecider decider, - EarlybirdFeatureSchemaMerger featureSchemaMerger) { - LOG.info("Instantiating AbstractRecencyAndRelevanceRequestRouter"); - this.realtime = realtimeTimeRangeFilter.andThen(realtime); - this.protectedRealtime = protectedTimeRangeFilter.andThen(protectedRealtime); - this.fullArchive = fullArchiveTimeRangeFilter.andThen(fullArchive); - this.responseMerger = new SuperRootResponseMerger(rankingMode, featureSchemaMerger, clock); - this.decider = decider; - - String rankingModeForStats = rankingMode.name().toLowerCase(); - skipProtectedClusterDeciderKey = - String.format("superroot_skip_protected_cluster_for_%s_requests", rankingModeForStats); - skipFullArchiveClusterDeciderKey = - String.format("superroot_skip_full_archive_cluster_for_%s_requests", rankingModeForStats); - - realtimeResponseInvalidCounter = - SearchCounter.export(rankingModeForStats + "_realtime_response_invalid"); - realtimeResponseSearchResultsNotSetCounter = - SearchCounter.export(rankingModeForStats + "_realtime_response_search_results_not_set"); - minSearchedStatusIdLargerThanRequestMaxIdCounter = SearchCounter.export( - rankingModeForStats + "_min_searched_status_id_larger_than_request_max_id"); - minSearchedStatusIdLargerThanRequestUntilTimeCounter = SearchCounter.export( - rankingModeForStats + "_min_searched_status_id_larger_than_request_until_time"); - } - - private void checkRequestPreconditions(EarlybirdRequest request) { - // CollectorParams should be set in EarlybirdRequestUtil.checkAndSetCollectorParams(). 
- Preconditions.checkNotNull(request.getSearchQuery().getCollectorParams()); - - // return a Client error if the num results are less than 0 - if (request.getSearchQuery().getNumResults() < 0) { - throw new ClientErrorException("The request.searchQuery.numResults field can't be negative"); - } - - if (request.getSearchQuery().getCollectorParams().getNumResultsToReturn() < 0) { - throw new ClientErrorException("The request.searchQuery.collectorParams.numResultsToReturn " - + "field can't be negative"); - } - } - - /** - * Hit realtime and/or protected realtime first, if not enough results, then hit archive, - * merge the results. - */ - @Override - public Future route(final EarlybirdRequestContext requestContext) { - EarlybirdRequest request = requestContext.getRequest(); - - this.checkRequestPreconditions(request); - - ArrayList savedRequestResponses = new ArrayList<>(); - - // If clients do not define numResults to return or the numResults requested are 0 - // return an empty EarlyBirdResponse without hitting any service. - if (request.getSearchQuery().getNumResults() == 0 - || request.getSearchQuery().getCollectorParams().getNumResultsToReturn() == 0) { - return Future.value(successNoResultsResponse()); - } - - // Realtime earlybird response is already required. Even if the service is not called - // the result passed to the mergers should be a valid one. - EarlybirdServiceResponse.ServiceState realtimeServiceState = - getRealtimeServiceState(requestContext); - final Future realtimeResponseFuture = - realtimeServiceState.serviceWasCalled() - ? getRealtimeResponse(savedRequestResponses, requestContext) - : Future.value(EarlybirdServiceResponse.serviceNotCalled(realtimeServiceState)); - - // If no flock response (followedUserIds) is set, request wont be sent to protected. - EarlybirdServiceResponse.ServiceState protectedServiceState = - getProtectedServiceState(requestContext); - final Future protectedResponseFuture = - protectedServiceState.serviceWasCalled() - ? 
getProtectedResponse(savedRequestResponses, requestContext) - : Future.value(EarlybirdServiceResponse.serviceNotCalled(protectedServiceState)); - - final Future archiveResponseFuture = - Futures.flatMap(realtimeResponseFuture, protectedResponseFuture, - new Function0>() { - @Override - public Future apply() { - EarlybirdServiceResponse realtimeResponse = Futures.get(realtimeResponseFuture); - EarlybirdServiceResponse protectedResponse = Futures.get(protectedResponseFuture); - EarlybirdServiceResponse.ServiceState fullArchiveServiceState = - getFullArchiveServiceState(requestContext, realtimeResponse, protectedResponse); - return fullArchiveServiceState.serviceWasCalled() - ? getFullArchiveResponse(savedRequestResponses, requestContext, - realtimeResponse.getResponse(), protectedResponse.getResponse()) - : Future.value( - EarlybirdServiceResponse.serviceNotCalled(fullArchiveServiceState)); - } - } - ); - - Future mergedResponse = responseMerger.mergeResponseFutures( - requestContext, realtimeResponseFuture, protectedResponseFuture, archiveResponseFuture); - mergedResponse = mergedResponse - .map(RequestRouterUtil.checkMinSearchedStatusId( - requestContext, - "max_id", - EarlybirdRequestUtil.getRequestMaxId(requestContext.getParsedQuery()), - realtimeResponseFuture, - protectedResponseFuture, - archiveResponseFuture, - minSearchedStatusIdLargerThanRequestMaxIdCounter)) - .map(RequestRouterUtil.checkMinSearchedStatusId( - requestContext, - "until_time", - EarlybirdRequestUtil.getRequestMaxIdFromUntilTime(requestContext.getParsedQuery()), - realtimeResponseFuture, - protectedResponseFuture, - archiveResponseFuture, - minSearchedStatusIdLargerThanRequestUntilTimeCounter)); - - return this.maybeAttachSentRequestsToDebugInfo( - savedRequestResponses, - requestContext, - mergedResponse - ); - } - - private EarlybirdResponse successNoResultsResponse() { - return new EarlybirdResponse(EarlybirdResponseCode.SUCCESS, 0) - .setSearchResults(new 
ThriftSearchResults().setResults(Collections.emptyList())); - } - - protected abstract boolean shouldSendRequestToFullArchiveCluster( - EarlybirdRequest request, EarlybirdResponse realtimeResponse); - - /** Determines if the protected service is available and if a request should be sent to it. */ - private EarlybirdServiceResponse.ServiceState getProtectedServiceState( - EarlybirdRequestContext requestContext) { - if (!requestContext.getRequest().isSetFollowedUserIds() - || requestContext.getRequest().getFollowedUserIds().isEmpty()) { - return EarlybirdServiceResponse.ServiceState.SERVICE_NOT_REQUESTED; - } - - if (decider.isAvailable(skipProtectedClusterDeciderKey)) { - return EarlybirdServiceResponse.ServiceState.SERVICE_NOT_AVAILABLE; - } - - return EarlybirdServiceResponse.ServiceState.SERVICE_CALLED; - } - - /** Determines if the realtime service is available and if a request should be sent to it. */ - private EarlybirdServiceResponse.ServiceState getRealtimeServiceState( - EarlybirdRequestContext requestContext) { - EarlybirdRequest request = requestContext.getRequest(); - - // SERVICE_NOT_REQUESTED should always be returned before other states as - // SuperRootResponseMerger has special logic for this case. - if (request.isSetGetProtectedTweetsOnly() && request.isGetProtectedTweetsOnly()) { - return EarlybirdServiceResponse.ServiceState.SERVICE_NOT_REQUESTED; - } - - return EarlybirdServiceResponse.ServiceState.SERVICE_CALLED; - } - - /** Determines if the full archive service is available and if a request should be sent to it. */ - private EarlybirdServiceResponse.ServiceState getFullArchiveServiceState( - EarlybirdRequestContext requestContext, - EarlybirdServiceResponse publicServiceResponse, - EarlybirdServiceResponse protectedServiceResponse) { - - // SERVICE_NOT_REQUESTED should be always be returned before other states as - // SuperRootResponseMerger has special logic for this case. 
- if (!requestContext.getRequest().isSetGetOlderResults() - || !requestContext.getRequest().isGetOlderResults()) { - return EarlybirdServiceResponse.ServiceState.SERVICE_NOT_REQUESTED; - } - - // allow requesting full archive service when decider is enabled - if (!decider.isAvailable(FULL_ARCHIVE_AVAILABLE_FOR_GET_PROTECTED_TWEETS_ONLY_DECIDER_KEY) - && requestContext.getRequest().isSetGetProtectedTweetsOnly() - && requestContext.getRequest().isGetProtectedTweetsOnly()) { - return EarlybirdServiceResponse.ServiceState.SERVICE_NOT_REQUESTED; - } - - if (decider.isAvailable(skipFullArchiveClusterDeciderKey)) { - return EarlybirdServiceResponse.ServiceState.SERVICE_NOT_AVAILABLE; - } - - boolean serviceWasCalledForPublic = - getFullArchiveServiceState(requestContext, publicServiceResponse).serviceWasCalled(); - boolean serviceWasCalledForProtected = - decider.isAvailable(FULL_ARCHIVE_AVAILABLE_FOR_NOT_ENOUGH_PROTECTED_RESULTS_DECIDER_KEY) - && getFullArchiveServiceState(requestContext, protectedServiceResponse).serviceWasCalled(); - if (!serviceWasCalledForPublic && !serviceWasCalledForProtected) { - return EarlybirdServiceResponse.ServiceState.SERVICE_NOT_CALLED; - } - - return EarlybirdServiceResponse.ServiceState.SERVICE_CALLED; - } - - private EarlybirdServiceResponse.ServiceState getFullArchiveServiceState( - EarlybirdRequestContext requestContext, - EarlybirdServiceResponse realtimeServiceResponse) { - EarlybirdResponse realtimeResponse = realtimeServiceResponse.getResponse(); - - if (!EarlybirdResponseMergeUtil.isValidResponse(realtimeResponse)) { - realtimeResponseInvalidCounter.increment(); - return EarlybirdServiceResponse.ServiceState.SERVICE_NOT_CALLED; - } - - if (!realtimeResponse.isSetSearchResults()) { - realtimeResponseSearchResultsNotSetCounter.increment(); - return EarlybirdServiceResponse.ServiceState.SERVICE_NOT_CALLED; - } - - if (!shouldSendRequestToFullArchiveCluster(requestContext.getRequest(), realtimeResponse)) { - return 
EarlybirdServiceResponse.ServiceState.SERVICE_NOT_CALLED; - } - - return EarlybirdServiceResponse.ServiceState.SERVICE_CALLED; - } - - /** - * Modify the original request context based on the followedUserId field and then send the - * request to the protected cluster. - */ - private Future getProtectedResponse( - ArrayList savedRequestResponses, - final EarlybirdRequestContext requestContext) { - EarlybirdRequestContext protectedRequestContext = - EarlybirdRequestContext.newContextWithRestrictFromUserIdFilter64(requestContext); - Preconditions.checkArgument( - protectedRequestContext.getRequest().getSearchQuery().isSetFromUserIDFilter64()); - - // SERVICE_NOT_REQUESTED should be always be returned before other states as - // SuperRootResponseMerger has special logic for this case. - if (protectedRequestContext.getRequest().getSearchQuery().getFromUserIDFilter64().isEmpty()) { - return Future.value(EarlybirdServiceResponse.serviceNotCalled( - EarlybirdServiceResponse.ServiceState.SERVICE_NOT_REQUESTED)); - } - - if (requestContext.getRequest().isSetAdjustedProtectedRequestParams()) { - adjustRequestParams(protectedRequestContext.getRequest(), - requestContext.getRequest().getAdjustedProtectedRequestParams()); - } - - LOG.debug("Request sent to the protected cluster: {}", protectedRequestContext.getRequest()); - return toEarlybirdServiceResponseFuture( - savedRequestResponses, - protectedRequestContext, - "protected", - this.protectedRealtime - ); - } - - private Future getRealtimeResponse( - ArrayList savedRequestResponses, - EarlybirdRequestContext requestContext) { - return toEarlybirdServiceResponseFuture( - savedRequestResponses, - requestContext, - "realtime", - this.realtime); - } - - /** - * Modifying the existing max id filter of the request or appending a new - * max id filter and then send the request to the full archive cluster. 
- */ - private Future getFullArchiveResponse( - ArrayList savedRequestResponses, - EarlybirdRequestContext requestContext, - EarlybirdResponse realtimeResponse, - EarlybirdResponse protectedResponse) { - long realtimeMinId = getMinSearchedId(realtimeResponse); - long protectedMinId = getMinSearchedId(protectedResponse); - // if both realtime and protected min searched ids are available, the larger(newer) one is used - // to make sure no tweets are left out. However, this means it might introduce duplicates for - // the other response. The response merger will dedup the response. This logic is enabled - // when full archive cluster is available for requests without enough protected results. - long minId = - decider.isAvailable(FULL_ARCHIVE_AVAILABLE_FOR_NOT_ENOUGH_PROTECTED_RESULTS_DECIDER_KEY) - ? Math.max(realtimeMinId, protectedMinId) : realtimeMinId; - - if (minId <= 0) { - // If the realtime response doesn't have a minSearchedStatusID set, get all results from - // the full archive cluster. - minId = Long.MAX_VALUE; - } - - // The [max_id] operator is inclusive in earlybirds. This means that a query with [max_id X] - // will return tweet X, if X matches the rest of the query. So we should add a [max_id (X - 1)] - // operator to the full archive query (instead of [max_id X]). Otherwise, we could end up with - // duplicates. For example: - // - // realtime response: results = [ 100, 90, 80 ], minSearchedStatusID = 80 - // full archive request: [max_id 80] - // full archive response: results = [ 80, 70, 60 ] - // - // In this case, tweet 80 would be returned from both the realtime and full archive clusters. 
- EarlybirdRequestContext archiveRequestContext = - EarlybirdRequestContext.copyRequestContext( - requestContext, - QueryUtil.addOrReplaceMaxIdFilter( - requestContext.getParsedQuery(), - minId - 1)); - - if (requestContext.getRequest().isSetAdjustedFullArchiveRequestParams()) { - adjustRequestParams(archiveRequestContext.getRequest(), - requestContext.getRequest().getAdjustedFullArchiveRequestParams()); - } - - LOG.debug("Request sent to the full archive cluster: {},", archiveRequestContext.getRequest()); - return toEarlybirdServiceResponseFuture( - savedRequestResponses, - archiveRequestContext, - "archive", - this.fullArchive - ); - } - - private long getMinSearchedId(EarlybirdResponse response) { - return response != null && response.isSetSearchResults() - ? response.getSearchResults().getMinSearchedStatusID() : 0; - } - - private void adjustRequestParams(EarlybirdRequest request, - AdjustedRequestParams adjustedRequestParams) { - ThriftSearchQuery searchQuery = request.getSearchQuery(); - - if (adjustedRequestParams.isSetNumResults()) { - searchQuery.setNumResults(adjustedRequestParams.getNumResults()); - if (searchQuery.isSetCollectorParams()) { - searchQuery.getCollectorParams().setNumResultsToReturn( - adjustedRequestParams.getNumResults()); - } - } - - if (adjustedRequestParams.isSetMaxHitsToProcess()) { - searchQuery.setMaxHitsToProcess(adjustedRequestParams.getMaxHitsToProcess()); - if (searchQuery.isSetRelevanceOptions()) { - searchQuery.getRelevanceOptions().setMaxHitsToProcess( - adjustedRequestParams.getMaxHitsToProcess()); - } - if (searchQuery.isSetCollectorParams() - && searchQuery.getCollectorParams().isSetTerminationParams()) { - searchQuery.getCollectorParams().getTerminationParams().setMaxHitsToProcess( - adjustedRequestParams.getMaxHitsToProcess()); - } - } - - if (adjustedRequestParams.isSetReturnAllResults()) { - if (searchQuery.isSetRelevanceOptions()) { - searchQuery.getRelevanceOptions().setReturnAllResults( - 
adjustedRequestParams.isReturnAllResults()); - } - } - } - - private Future toEarlybirdServiceResponseFuture( - List savedRequestResponses, - EarlybirdRequestContext requestContext, - String sentTo, - Service service) { - Future responseFuture = service.apply(requestContext); - this.saveRequestResponse( - savedRequestResponses, sentTo, requestContext, responseFuture - ); - - return responseFuture.map(new Function() { - @Override - public EarlybirdServiceResponse apply(EarlybirdResponse response) { - return EarlybirdServiceResponse.serviceCalled(response); - } - }); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/routers/BUILD b/src/java/com/twitter/search/earlybird_root/routers/BUILD deleted file mode 100644 index 1f9f71b60..000000000 --- a/src/java/com/twitter/search/earlybird_root/routers/BUILD +++ /dev/null @@ -1,25 +0,0 @@ -java_library( - sources = ["*.java"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/com/google/inject:guice", - "3rdparty/jvm/org/slf4j:slf4j-api", - "finatra/inject/inject-core/src/main/scala", - "src/java/com/twitter/common/collections", - "src/java/com/twitter/search/common/decider", - "src/java/com/twitter/search/common/futures", - "src/java/com/twitter/search/common/metrics", - "src/java/com/twitter/search/common/partitioning/snowflakeparser", - "src/java/com/twitter/search/common/util/earlybird", - "src/java/com/twitter/search/earlybird/common", - "src/java/com/twitter/search/earlybird/config", - "src/java/com/twitter/search/earlybird_root/common", - "src/java/com/twitter/search/earlybird_root/filters", - "src/java/com/twitter/search/earlybird_root/mergers", - "src/java/com/twitter/search/queryparser", - "src/java/com/twitter/search/queryparser/query:core-query-nodes", - "src/thrift/com/twitter/search:earlybird-java", - ], -) diff --git a/src/java/com/twitter/search/earlybird_root/routers/BUILD.docx 
b/src/java/com/twitter/search/earlybird_root/routers/BUILD.docx new file mode 100644 index 000000000..aec56289d Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/routers/BUILD.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/routers/FacetsRequestRouter.docx b/src/java/com/twitter/search/earlybird_root/routers/FacetsRequestRouter.docx new file mode 100644 index 000000000..7f71d83eb Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/routers/FacetsRequestRouter.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/routers/FacetsRequestRouter.java b/src/java/com/twitter/search/earlybird_root/routers/FacetsRequestRouter.java deleted file mode 100644 index 9883853f3..000000000 --- a/src/java/com/twitter/search/earlybird_root/routers/FacetsRequestRouter.java +++ /dev/null @@ -1,35 +0,0 @@ -package com.twitter.search.earlybird_root.routers; - -import javax.inject.Inject; -import javax.inject.Named; - -import com.twitter.finagle.Service; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird_root.common.EarlybirdRequestContext; -import com.twitter.search.earlybird_root.common.InjectionNames; -import com.twitter.search.earlybird_root.filters.EarlybirdTimeRangeFilter; -import com.twitter.util.Future; - -/** - * For Facets traffic SuperRoot forwards all traffic to the realtime cluster. - */ -public class FacetsRequestRouter extends RequestRouter { - - private final Service realtime; - - /** Creates a new FacetsRequestRouter instance to be used by the SuperRoot. 
*/ - @Inject - public FacetsRequestRouter( - @Named(InjectionNames.REALTIME) - Service realtime, - @Named(FacetsRequestRouterModule.TIME_RANGE_FILTER) - EarlybirdTimeRangeFilter timeRangeFilter) { - - this.realtime = timeRangeFilter.andThen(realtime); - } - - @Override - public Future route(EarlybirdRequestContext requestContext) { - return realtime.apply(requestContext); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/routers/FacetsRequestRouterModule.docx b/src/java/com/twitter/search/earlybird_root/routers/FacetsRequestRouterModule.docx new file mode 100644 index 000000000..88e371e02 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/routers/FacetsRequestRouterModule.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/routers/FacetsRequestRouterModule.java b/src/java/com/twitter/search/earlybird_root/routers/FacetsRequestRouterModule.java deleted file mode 100644 index 87aa5852e..000000000 --- a/src/java/com/twitter/search/earlybird_root/routers/FacetsRequestRouterModule.java +++ /dev/null @@ -1,33 +0,0 @@ -package com.twitter.search.earlybird_root.routers; - -import javax.inject.Named; -import javax.inject.Singleton; - -import com.google.inject.Provides; - -import com.twitter.inject.TwitterModule; -import com.twitter.search.common.decider.SearchDecider; -import com.twitter.search.earlybird_root.filters.EarlybirdTimeRangeFilter; -import com.twitter.search.earlybird_root.filters.RealtimeServingRangeProvider; -import com.twitter.search.earlybird_root.filters.ServingRangeProvider; - -public class FacetsRequestRouterModule extends TwitterModule { - public static final String TIME_RANGE_FILTER = "facets_time_range_filter"; - - public static final String SERVING_RANGE_BOUNDARY_HOURS_AGO_DECIDER_KEY = - "superroot_facets_serving_range_boundary_hours_ago"; - - private ServingRangeProvider getServingRangeProvider(final SearchDecider decider) - throws Exception { - return new RealtimeServingRangeProvider( - decider, 
SERVING_RANGE_BOUNDARY_HOURS_AGO_DECIDER_KEY); - } - - @Provides - @Singleton - @Named(TIME_RANGE_FILTER) - private EarlybirdTimeRangeFilter providesTimeRangeFilter(SearchDecider decider) throws Exception { - return EarlybirdTimeRangeFilter.newTimeRangeFilterWithoutQueryRewriter( - getServingRangeProvider(decider)); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/routers/RecencyRequestRouter.docx b/src/java/com/twitter/search/earlybird_root/routers/RecencyRequestRouter.docx new file mode 100644 index 000000000..d27c22a69 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/routers/RecencyRequestRouter.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/routers/RecencyRequestRouter.java b/src/java/com/twitter/search/earlybird_root/routers/RecencyRequestRouter.java deleted file mode 100644 index f870c2e68..000000000 --- a/src/java/com/twitter/search/earlybird_root/routers/RecencyRequestRouter.java +++ /dev/null @@ -1,73 +0,0 @@ -package com.twitter.search.earlybird_root.routers; - -import javax.inject.Inject; -import javax.inject.Named; - -import com.twitter.common.util.Clock; -import com.twitter.finagle.Service; -import com.twitter.search.common.decider.SearchDecider; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.earlybird.thrift.EarlybirdRequest; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.ThriftSearchRankingMode; -import com.twitter.search.earlybird_root.common.EarlybirdFeatureSchemaMerger; -import com.twitter.search.earlybird_root.common.EarlybirdRequestContext; -import com.twitter.search.earlybird_root.common.InjectionNames; -import com.twitter.search.earlybird_root.filters.EarlybirdTimeRangeFilter; - -public class RecencyRequestRouter extends AbstractRecencyAndRelevanceRequestRouter { - private static final SearchCounter SKIPPED_ARCHIVE_DUE_TO_REALTIME_EARLY_TERMINATION_COUNTER = - 
SearchCounter.export("recency_skipped_archive_due_to_realtime_early_termination"); - private static final SearchCounter SKIPPED_ARCHIVE_DUE_TO_REALTIME_ENOUGH_RESULTS_COUNTER = - SearchCounter.export("recency_skipped_archive_due_to_realtime_enough_results"); - - @Inject - public RecencyRequestRouter( - @Named(InjectionNames.REALTIME) - Service realtime, - @Named(InjectionNames.PROTECTED) - Service protectedRealtime, - @Named(InjectionNames.FULL_ARCHIVE) - Service fullArchive, - @Named(RecencyRequestRouterModule.REALTIME_TIME_RANGE_FILTER) - EarlybirdTimeRangeFilter realtimeTimeRangeFilter, - @Named(RecencyRequestRouterModule.PROTECTED_TIME_RANGE_FILTER) - EarlybirdTimeRangeFilter protectedTimeRangeFilter, - @Named(RecencyRequestRouterModule.FULL_ARCHIVE_TIME_RANGE_FILTER) - EarlybirdTimeRangeFilter fullArchiveTimeRangeFilter, - Clock clock, - SearchDecider decider, - EarlybirdFeatureSchemaMerger featureSchemaMerger) { - super(realtime, - protectedRealtime, - fullArchive, - realtimeTimeRangeFilter, - protectedTimeRangeFilter, - fullArchiveTimeRangeFilter, - ThriftSearchRankingMode.RECENCY, - clock, - decider, - featureSchemaMerger); - } - - @Override - protected boolean shouldSendRequestToFullArchiveCluster( - EarlybirdRequest request, EarlybirdResponse realtimeResponse) { - boolean isEarlyTerminated = realtimeResponse.isSetEarlyTerminationInfo() - && realtimeResponse.getEarlyTerminationInfo().isEarlyTerminated(); - if (isEarlyTerminated) { - SKIPPED_ARCHIVE_DUE_TO_REALTIME_EARLY_TERMINATION_COUNTER.increment(); - return false; - } - - // Check if we have the minimum number of results to fulfill the original request. 
- int numResultsRequested = request.getSearchQuery().getNumResults(); - int actualNumResults = realtimeResponse.getSearchResults().getResultsSize(); - if (actualNumResults >= numResultsRequested) { - SKIPPED_ARCHIVE_DUE_TO_REALTIME_ENOUGH_RESULTS_COUNTER.increment(); - return false; - } - - return true; - } -} diff --git a/src/java/com/twitter/search/earlybird_root/routers/RecencyRequestRouterModule.docx b/src/java/com/twitter/search/earlybird_root/routers/RecencyRequestRouterModule.docx new file mode 100644 index 000000000..52e05ecde Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/routers/RecencyRequestRouterModule.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/routers/RecencyRequestRouterModule.java b/src/java/com/twitter/search/earlybird_root/routers/RecencyRequestRouterModule.java deleted file mode 100644 index 009c04a68..000000000 --- a/src/java/com/twitter/search/earlybird_root/routers/RecencyRequestRouterModule.java +++ /dev/null @@ -1,74 +0,0 @@ -package com.twitter.search.earlybird_root.routers; - -import javax.inject.Named; -import javax.inject.Singleton; - -import com.google.inject.Provides; - -import com.twitter.inject.TwitterModule; -import com.twitter.search.common.decider.SearchDecider; -import com.twitter.search.earlybird_root.filters.EarlybirdTimeRangeFilter; -import com.twitter.search.earlybird_root.filters.FullArchiveServingRangeProvider; -import com.twitter.search.earlybird_root.filters.RealtimeServingRangeProvider; -import com.twitter.search.earlybird_root.filters.ServingRangeProvider; - -public class RecencyRequestRouterModule extends TwitterModule { - public static final String FULL_ARCHIVE_TIME_RANGE_FILTER = - "recency_full_archive_time_range_filter"; - public static final String REALTIME_TIME_RANGE_FILTER = - "recency_realtime_time_range_filter"; - public static final String PROTECTED_TIME_RANGE_FILTER = - "recency_protected_time_range_filter"; - - public static final String 
REALTIME_RANGE_SERVING_RANGE_BOUNDARY_HOURS_AGO_DECIDER_KEY = - "superroot_recency_realtime_serving_range_boundary_hours_ago"; - public static final String PROTECTED_SERVING_RANGE_BOUNDARY_HOURS_AGO_DECIDER_KEY = - "superroot_recency_protected_serving_range_boundary_hours_ago"; - public static final String FULL_ARCHIVE_SERVING_RANGE_BOUNDARY_HOURS_AGO_DECIDER_KEY = - "superroot_recency_full_archive_serving_range_boundary_hours_ago"; - - private ServingRangeProvider getFullArchiveServingRangeProvider(final SearchDecider decider) - throws Exception { - return new FullArchiveServingRangeProvider( - decider, FULL_ARCHIVE_SERVING_RANGE_BOUNDARY_HOURS_AGO_DECIDER_KEY); - } - - private ServingRangeProvider getRealtimeServingRangeProvider(final SearchDecider decider) - throws Exception { - return new RealtimeServingRangeProvider( - decider, REALTIME_RANGE_SERVING_RANGE_BOUNDARY_HOURS_AGO_DECIDER_KEY); - } - - private ServingRangeProvider getProtectedServingRangeProvider(final SearchDecider decider) - throws Exception { - return new RealtimeServingRangeProvider( - decider, PROTECTED_SERVING_RANGE_BOUNDARY_HOURS_AGO_DECIDER_KEY); - } - - @Provides - @Singleton - @Named(FULL_ARCHIVE_TIME_RANGE_FILTER) - private EarlybirdTimeRangeFilter providesFullArchiveTimeRangeFilter(SearchDecider decider) - throws Exception { - return EarlybirdTimeRangeFilter.newTimeRangeFilterWithoutQueryRewriter( - getFullArchiveServingRangeProvider(decider)); - } - - @Provides - @Singleton - @Named(REALTIME_TIME_RANGE_FILTER) - private EarlybirdTimeRangeFilter providesRealtimeTimeRangeFilter(SearchDecider decider) - throws Exception { - return EarlybirdTimeRangeFilter.newTimeRangeFilterWithoutQueryRewriter( - getRealtimeServingRangeProvider(decider)); - } - - @Provides - @Singleton - @Named(PROTECTED_TIME_RANGE_FILTER) - private EarlybirdTimeRangeFilter providesProtectedTimeRangeFilter(SearchDecider decider) - throws Exception { - return EarlybirdTimeRangeFilter.newTimeRangeFilterWithoutQueryRewriter( 
- getProtectedServingRangeProvider(decider)); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/routers/RelevanceRequestRouter.docx b/src/java/com/twitter/search/earlybird_root/routers/RelevanceRequestRouter.docx new file mode 100644 index 000000000..ac5ce3c59 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/routers/RelevanceRequestRouter.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/routers/RelevanceRequestRouter.java b/src/java/com/twitter/search/earlybird_root/routers/RelevanceRequestRouter.java deleted file mode 100644 index cb7d10504..000000000 --- a/src/java/com/twitter/search/earlybird_root/routers/RelevanceRequestRouter.java +++ /dev/null @@ -1,100 +0,0 @@ -package com.twitter.search.earlybird_root.routers; - -import java.util.concurrent.TimeUnit; - -import javax.inject.Inject; -import javax.inject.Named; - -import com.google.common.base.Preconditions; - -import com.twitter.common.util.Clock; -import com.twitter.finagle.Service; -import com.twitter.search.common.decider.SearchDecider; -import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser; -import com.twitter.search.common.query.thriftjava.CollectorTerminationParams; -import com.twitter.search.earlybird.thrift.EarlybirdRequest; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.ThriftSearchRankingMode; -import com.twitter.search.earlybird.thrift.ThriftSearchResult; -import com.twitter.search.earlybird_root.common.EarlybirdFeatureSchemaMerger; -import com.twitter.search.earlybird_root.common.EarlybirdRequestContext; -import com.twitter.search.earlybird_root.common.InjectionNames; -import com.twitter.search.earlybird_root.filters.EarlybirdTimeRangeFilter; - -public class RelevanceRequestRouter extends AbstractRecencyAndRelevanceRequestRouter { - private static final long MILLIS_IN_ONE_DAY = TimeUnit.DAYS.toMillis(1); - - @Inject - public RelevanceRequestRouter( - 
@Named(InjectionNames.REALTIME) - Service realtime, - @Named(InjectionNames.PROTECTED) - Service protectedRealtime, - @Named(InjectionNames.FULL_ARCHIVE) - Service fullArchive, - @Named(RelevanceRequestRouterModule.REALTIME_TIME_RANGE_FILTER) - EarlybirdTimeRangeFilter realtimeTimeRangeFilter, - @Named(RelevanceRequestRouterModule.PROTECTED_TIME_RANGE_FILTER) - EarlybirdTimeRangeFilter protectedTimeRangeFilter, - @Named(RelevanceRequestRouterModule.FULL_ARCHIVE_TIME_RANGE_FILTER) - EarlybirdTimeRangeFilter fullArchiveTimeRangeFilter, - Clock clock, - SearchDecider decider, - EarlybirdFeatureSchemaMerger featureSchemaMerger) { - super(realtime, - protectedRealtime, - fullArchive, - realtimeTimeRangeFilter, - protectedTimeRangeFilter, - fullArchiveTimeRangeFilter, - ThriftSearchRankingMode.RELEVANCE, - clock, - decider, - featureSchemaMerger); - } - - @Override - protected boolean shouldSendRequestToFullArchiveCluster( - EarlybirdRequest request, EarlybirdResponse realtimeResponse) { - int numResultsRequested = request.getSearchQuery().getNumResults(); - int numHitsProcessed = realtimeResponse.getSearchResults().isSetNumHitsProcessed() - ? realtimeResponse.getSearchResults().getNumHitsProcessed() - : -1; - if (numHitsProcessed < numResultsRequested) { - // Send query to the full archive cluster, if we went through fewer hits in the realtime - // cluster than the requested number of results. - return true; - } - - // If we have enough hits, don't query the full archive cluster yet. 
- int numSuccessfulPartitions = realtimeResponse.getNumSuccessfulPartitions(); - CollectorTerminationParams terminationParams = - request.getSearchQuery().getCollectorParams().getTerminationParams(); - - Preconditions.checkArgument(terminationParams.isSetMaxHitsToProcess()); - int maxHits = terminationParams.getMaxHitsToProcess() * numSuccessfulPartitions; - - if (numHitsProcessed >= maxHits) { - return false; - } - - // Check if there is a gap between the last result and the min status ID of current search. - // If the difference is larger than one day, then we can still get more tweets from the realtime - // cluster, so there's no need to query the full archive cluster just yet. If we don't check - // this, then we might end up with a big gap in the returned results. - int numReturnedResults = realtimeResponse.getSearchResults().getResultsSize(); - if (numReturnedResults > 0) { - ThriftSearchResult lastResult = - realtimeResponse.getSearchResults().getResults().get(numReturnedResults - 1); - long lastResultTimeMillis = SnowflakeIdParser.getTimestampFromTweetId(lastResult.getId()); - long minSearchedStatusID = realtimeResponse.getSearchResults().getMinSearchedStatusID(); - long minSearchedStatusIDTimeMillis = - SnowflakeIdParser.getTimestampFromTweetId(minSearchedStatusID); - if (lastResultTimeMillis - minSearchedStatusIDTimeMillis > MILLIS_IN_ONE_DAY) { - return false; - } - } - - return true; - } -} diff --git a/src/java/com/twitter/search/earlybird_root/routers/RelevanceRequestRouterModule.docx b/src/java/com/twitter/search/earlybird_root/routers/RelevanceRequestRouterModule.docx new file mode 100644 index 000000000..05c1c4288 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/routers/RelevanceRequestRouterModule.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/routers/RelevanceRequestRouterModule.java b/src/java/com/twitter/search/earlybird_root/routers/RelevanceRequestRouterModule.java deleted file mode 100644 index 
eaed2c25e..000000000 --- a/src/java/com/twitter/search/earlybird_root/routers/RelevanceRequestRouterModule.java +++ /dev/null @@ -1,74 +0,0 @@ -package com.twitter.search.earlybird_root.routers; - -import javax.inject.Named; -import javax.inject.Singleton; - -import com.google.inject.Provides; - -import com.twitter.inject.TwitterModule; -import com.twitter.search.common.decider.SearchDecider; -import com.twitter.search.earlybird_root.filters.EarlybirdTimeRangeFilter; -import com.twitter.search.earlybird_root.filters.FullArchiveServingRangeProvider; -import com.twitter.search.earlybird_root.filters.RealtimeServingRangeProvider; -import com.twitter.search.earlybird_root.filters.ServingRangeProvider; - -public class RelevanceRequestRouterModule extends TwitterModule { - public static final String FULL_ARCHIVE_TIME_RANGE_FILTER = - "relevance_full_archive_time_range_filter"; - public static final String REALTIME_TIME_RANGE_FILTER = - "relevance_realtime_time_range_filter"; - public static final String PROTECTED_TIME_RANGE_FILTER = - "relevance_protected_time_range_filter"; - - public static final String REALTIME_SERVING_RANGE_BOUNDARY_HOURS_AGO_DECIDER_KEY = - "superroot_relevance_realtime_serving_range_boundary_hours_ago"; - public static final String FULL_ARCHIVE_SERVING_RANGE_BOUNDARY_HOURS_AGO_DECIDER_KEY = - "superroot_relevance_full_archive_serving_range_boundary_hours_ago"; - public static final String PROTECTED_SERVING_RANGE_BOUNDARY_HOURS_AGO_DECIDER_KEY = - "superroot_relevance_protected_serving_range_boundary_hours_ago"; - - private ServingRangeProvider getFullArchiveServingRangeProvider(final SearchDecider decider) - throws Exception { - return new FullArchiveServingRangeProvider( - decider, FULL_ARCHIVE_SERVING_RANGE_BOUNDARY_HOURS_AGO_DECIDER_KEY); - } - - private ServingRangeProvider getRealtimeServingRangeProvider(final SearchDecider decider) - throws Exception { - return new RealtimeServingRangeProvider( - decider, 
REALTIME_SERVING_RANGE_BOUNDARY_HOURS_AGO_DECIDER_KEY); - } - - private ServingRangeProvider getProtectedServingRangeProvider(final SearchDecider decider) - throws Exception { - return new RealtimeServingRangeProvider( - decider, PROTECTED_SERVING_RANGE_BOUNDARY_HOURS_AGO_DECIDER_KEY); - } - - @Provides - @Singleton - @Named(FULL_ARCHIVE_TIME_RANGE_FILTER) - private EarlybirdTimeRangeFilter providesFullArchiveTimeRangeFilter(SearchDecider decider) - throws Exception { - return EarlybirdTimeRangeFilter.newTimeRangeFilterWithoutQueryRewriter( - getFullArchiveServingRangeProvider(decider)); - } - - @Provides - @Singleton - @Named(REALTIME_TIME_RANGE_FILTER) - private EarlybirdTimeRangeFilter providesRealtimeTimeRangeFilter(SearchDecider decider) - throws Exception { - return EarlybirdTimeRangeFilter.newTimeRangeFilterWithoutQueryRewriter( - getRealtimeServingRangeProvider(decider)); - } - - @Provides - @Singleton - @Named(PROTECTED_TIME_RANGE_FILTER) - private EarlybirdTimeRangeFilter providesProtectedTimeRangeFilter(SearchDecider decider) - throws Exception { - return EarlybirdTimeRangeFilter.newTimeRangeFilterWithoutQueryRewriter( - getProtectedServingRangeProvider(decider)); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/routers/RequestRouter.docx b/src/java/com/twitter/search/earlybird_root/routers/RequestRouter.docx new file mode 100644 index 000000000..12e9c8f54 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/routers/RequestRouter.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/routers/RequestRouter.java b/src/java/com/twitter/search/earlybird_root/routers/RequestRouter.java deleted file mode 100644 index e8d01b42c..000000000 --- a/src/java/com/twitter/search/earlybird_root/routers/RequestRouter.java +++ /dev/null @@ -1,144 +0,0 @@ -package com.twitter.search.earlybird_root.routers; - -import java.util.ArrayList; -import java.util.List; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - 
-import com.twitter.search.common.futures.Futures; -import com.twitter.search.earlybird.thrift.EarlybirdDebugInfo; -import com.twitter.search.earlybird.thrift.EarlybirdRequestResponse; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird_root.common.EarlybirdRequestContext; -import com.twitter.util.Future; -import com.twitter.util.Try; - -/** - * Responsible for handling requests in superroot. - */ -public abstract class RequestRouter { - private static final Logger LOG = LoggerFactory.getLogger(RequestRouter.class); - - /** - * Saved request and response, to be included in debug info. - */ - class RequestResponse { - // Where is this request sent to. Freeform text like "realtime", "archive", etc. - private String sentTo; - private EarlybirdRequestContext requestContext; - private Future earlybirdResponseFuture; - - RequestResponse(String sentTo, - EarlybirdRequestContext requestContext, - Future earlybirdResponseFuture) { - this.sentTo = sentTo; - this.requestContext = requestContext; - this.earlybirdResponseFuture = earlybirdResponseFuture; - } - - String getSentTo() { - return sentTo; - } - - public EarlybirdRequestContext getRequestContext() { - return requestContext; - } - - Future getEarlybirdResponseFuture() { - return earlybirdResponseFuture; - } - } - - /** - * Forward a request to different clusters and merge the responses back into one response. - * @param requestContext - */ - public abstract Future route(EarlybirdRequestContext requestContext); - - /** - * Save a request (and its response future) to be included in debug info. 
- */ - void saveRequestResponse( - List requestResponses, - String sentTo, - EarlybirdRequestContext earlybirdRequestContext, - Future earlybirdResponseFuture - ) { - requestResponses.add( - new RequestResponse( - sentTo, - earlybirdRequestContext, - earlybirdResponseFuture - ) - ); - } - - Future maybeAttachSentRequestsToDebugInfo( - List requestResponses, - EarlybirdRequestContext requestContext, - Future response - ) { - if (requestContext.getRequest().getDebugMode() >= 4) { - return this.attachSentRequestsToDebugInfo( - response, - requestResponses - ); - } else { - return response; - } - } - - /** - * Attaches saved client requests and their responses to the debug info within the - * main EarlybirdResponse. - */ - Future attachSentRequestsToDebugInfo( - Future currentResponse, - List requestResponses) { - - // Get all the response futures that we're waiting on. - List> allResponseFutures = new ArrayList<>(); - for (RequestResponse rr : requestResponses) { - allResponseFutures.add(rr.getEarlybirdResponseFuture()); - } - - // Pack all the futures into a single future. - Future>> allResponsesFuture = - Futures.collectAll(allResponseFutures); - - return currentResponse.flatMap(mainResponse -> { - if (!mainResponse.isSetDebugInfo()) { - mainResponse.setDebugInfo(new EarlybirdDebugInfo()); - } - - Future responseWithRequests = allResponsesFuture.map(allResponses -> { - // Get all individual response "Trys" and see if we can extract something from them - // that we can attach to the debugInfo. - for (int i = 0; i < allResponses.size(); i++) { - - Try responseTry = allResponses.get(i); - - if (responseTry.isReturn()) { - EarlybirdResponse attachedResponse = responseTry.get(); - - // Don't include the debug string, it's already a part of the main response's - // debug string. 
- attachedResponse.unsetDebugString(); - - EarlybirdRequestResponse reqResp = new EarlybirdRequestResponse(); - reqResp.setSentTo(requestResponses.get(i).getSentTo()); - reqResp.setRequest(requestResponses.get(i).getRequestContext().getRequest()); - reqResp.setResponse(attachedResponse.toString()); - - mainResponse.debugInfo.addToSentRequests(reqResp); - } - } - - return mainResponse; - }); - - return responseWithRequests; - }); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/routers/RequestRouterUtil.docx b/src/java/com/twitter/search/earlybird_root/routers/RequestRouterUtil.docx new file mode 100644 index 000000000..835043bca Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/routers/RequestRouterUtil.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/routers/RequestRouterUtil.java b/src/java/com/twitter/search/earlybird_root/routers/RequestRouterUtil.java deleted file mode 100644 index 785704982..000000000 --- a/src/java/com/twitter/search/earlybird_root/routers/RequestRouterUtil.java +++ /dev/null @@ -1,107 +0,0 @@ -package com.twitter.search.earlybird_root.routers; - -import java.util.List; - -import com.google.common.base.Optional; -import com.google.common.collect.Lists; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.EarlybirdResponseCode; -import com.twitter.search.earlybird_root.common.EarlybirdRequestContext; -import com.twitter.search.earlybird_root.common.EarlybirdRequestType; -import com.twitter.search.earlybird_root.common.EarlybirdServiceResponse; -import com.twitter.util.Await; -import com.twitter.util.Function; -import com.twitter.util.Future; - -public final class RequestRouterUtil { - private static final Logger LOG = LoggerFactory.getLogger(RequestRouterUtil.class); - - private RequestRouterUtil() { - } - - /** - * 
Returns the function that checks if the minSearchedStatusID on the merged response is higher - * than the max ID in the request. - * - * @param requestContext The request context that stores the request. - * @param operator The operator that we're checking against (max_id or until_time). - * @param requestMaxId The maxId specified in the request (in the given operator). - * @param realtimeResponseFuture The response from the realtime cluster. - * @param protectedResponseFuture The response from the protected cluster. - * @param fullArchiveResponseFuture The response from the full archive cluster. - * @param stat The stat to increment if minSearchedStatusID on the merged response is higher than - * the max ID in the request. - * @return A function that checks if the minSearchedStatusID on the merged response is higher than - * the max ID in the request. - */ - public static Function checkMinSearchedStatusId( - final EarlybirdRequestContext requestContext, - final String operator, - final Optional requestMaxId, - final Future realtimeResponseFuture, - final Future protectedResponseFuture, - final Future fullArchiveResponseFuture, - final SearchCounter stat) { - return new Function() { - @Override - public EarlybirdResponse apply(EarlybirdResponse mergedResponse) { - if (requestMaxId.isPresent() - && (mergedResponse.getResponseCode() == EarlybirdResponseCode.SUCCESS) - && mergedResponse.isSetSearchResults() - && mergedResponse.getSearchResults().isSetMinSearchedStatusID()) { - long minSearchedStatusId = mergedResponse.getSearchResults().getMinSearchedStatusID(); - if (minSearchedStatusId > requestMaxId.get()) { - stat.increment(); - // We're logging this only for STRICT RECENCY as it was very spammy for all types of - // request. 
We don't expect this to happen for STRICT RECENCY but we're tracking - // with the stat when it happens for RELEVANCE and RECENCY - if (requestContext.getEarlybirdRequestType() == EarlybirdRequestType.STRICT_RECENCY) { - String logMessage = "Response has a minSearchedStatusID ({}) larger than request " - + operator + " ({})." - + "\nrequest type: {}" - + "\nrequest: {}" - + "\nmerged response: {}" - + "\nrealtime response: {}" - + "\nprotected response: {}" - + "\nfull archive response: {}"; - List logMessageParams = Lists.newArrayList(); - logMessageParams.add(minSearchedStatusId); - logMessageParams.add(requestMaxId.get()); - logMessageParams.add(requestContext.getEarlybirdRequestType()); - logMessageParams.add(requestContext.getRequest()); - logMessageParams.add(mergedResponse); - - // The realtime, protected and full archive response futures are "done" at this point: - // we have to wait for them in order to build the merged response. So it's ok to call - // Await.result() here to get the responses: it's a no-op. 
- try { - logMessageParams.add(Await.result(realtimeResponseFuture).getResponse()); - } catch (Exception e) { - logMessageParams.add(e); - } - try { - logMessageParams.add(Await.result(protectedResponseFuture).getResponse()); - } catch (Exception e) { - logMessageParams.add(e); - } - try { - logMessageParams.add(Await.result(fullArchiveResponseFuture).getResponse()); - } catch (Exception e) { - logMessageParams.add(e); - } - - LOG.warn(logMessage, logMessageParams.toArray()); - } - } - } - - return mergedResponse; - } - }; - } -} diff --git a/src/java/com/twitter/search/earlybird_root/routers/TermStatsRequestRouter.docx b/src/java/com/twitter/search/earlybird_root/routers/TermStatsRequestRouter.docx new file mode 100644 index 000000000..6f3f9e143 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/routers/TermStatsRequestRouter.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/routers/TermStatsRequestRouter.java b/src/java/com/twitter/search/earlybird_root/routers/TermStatsRequestRouter.java deleted file mode 100644 index efc568748..000000000 --- a/src/java/com/twitter/search/earlybird_root/routers/TermStatsRequestRouter.java +++ /dev/null @@ -1,238 +0,0 @@ -package com.twitter.search.earlybird_root.routers; - -import java.util.ArrayList; -import java.util.List; -import javax.inject.Inject; -import javax.inject.Named; - -import com.google.common.base.Preconditions; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.Lists; - - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.finagle.Service; -import com.twitter.search.common.decider.SearchDecider; -import com.twitter.search.common.util.earlybird.EarlybirdResponseUtil; -import com.twitter.search.earlybird.config.ServingRange; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.EarlybirdResponseCode; -import com.twitter.search.earlybird.thrift.ThriftSearchResults; 
-import com.twitter.search.earlybird_root.common.EarlybirdRequestContext; -import com.twitter.search.earlybird_root.common.InjectionNames; -import com.twitter.search.earlybird_root.filters.EarlybirdTimeRangeFilter; -import com.twitter.search.earlybird_root.filters.ServingRangeProvider; -import com.twitter.search.earlybird_root.mergers.EarlybirdResponseMerger; -import com.twitter.search.earlybird_root.mergers.SuperRootResponseMerger; -import com.twitter.search.earlybird_root.mergers.TermStatisticsResponseMerger; -import com.twitter.search.earlybird_root.mergers.TierResponseAccumulator; -import com.twitter.util.Function; -import com.twitter.util.Future; - -import static com.twitter.search.common.util.earlybird.TermStatisticsUtil.determineBinSize; - -/** - * For TermStats traffic SuperRoot hits both realtime and archive in parallel, and then merges - * the results. - */ -public class TermStatsRequestRouter extends RequestRouter { - private static final Logger LOG = LoggerFactory.getLogger(TermStatsRequestRouter.class); - - private static final String SUPERROOT_SKIP_FULL_ARCHIVE_CLUSTER_FOR_TERM_STATS_REQUESTS = - "superroot_skip_full_archive_cluster_for_term_stats_requests"; - - private final Service realtimeService; - private final Service fullArchiveService; - - private final SearchDecider decider; - - private final ServingRangeProvider realtimeServingRangeProvider; - - @Inject - public TermStatsRequestRouter( - @Named(InjectionNames.REALTIME) - Service realtime, - @Named(TermStatsRequestRouterModule.REALTIME_TIME_RANGE_FILTER) - EarlybirdTimeRangeFilter realtimeTimeRangeFilter, - @Named(InjectionNames.FULL_ARCHIVE) - Service fullArchive, - @Named(TermStatsRequestRouterModule.FULL_ARCHIVE_TIME_RANGE_FILTER) - EarlybirdTimeRangeFilter fullArchiveTimeRangeFilter, - SearchDecider decider) { - LOG.info("Instantiating a TermStatsRequestRouter"); - - this.realtimeService = realtimeTimeRangeFilter - .andThen(realtime); - - this.fullArchiveService = 
fullArchiveTimeRangeFilter - .andThen(fullArchive); - - this.decider = decider; - this.realtimeServingRangeProvider = realtimeTimeRangeFilter.getServingRangeProvider(); - } - - /** - * Hit both realtime and full-archive clusters then merges term stat request. - */ - @Override - public Future route(EarlybirdRequestContext requestContext) { - List requestResponses = new ArrayList<>(); - - Future realtimeResponseFuture = realtimeService.apply(requestContext); - this.saveRequestResponse(requestResponses, "realtime", requestContext, realtimeResponseFuture); - - Future archiveResponseFuture = - requestContext.getRequest().isGetOlderResults() - && !decider.isAvailable(SUPERROOT_SKIP_FULL_ARCHIVE_CLUSTER_FOR_TERM_STATS_REQUESTS) - ? fullArchiveService.apply(requestContext) - : Future.value(emptyResponse()); - this.saveRequestResponse(requestResponses, "archive", requestContext, archiveResponseFuture); - - Future mergedResponse = - merge(realtimeResponseFuture, archiveResponseFuture, requestContext); - - return this.maybeAttachSentRequestsToDebugInfo( - requestResponses, - requestContext, - mergedResponse - ); - } - - /** - * Merge responses from realtime and full archive clusters. 
- */ - private Future merge( - final Future realtimeResponseFuture, - final Future archiveResponseFuture, - final EarlybirdRequestContext requestContext) { - - return realtimeResponseFuture.flatMap( - new Function>() { - @Override - public Future apply(final EarlybirdResponse realtimeResponse) { - if (!EarlybirdResponseUtil.isSuccessfulResponse(realtimeResponse)) { - return Future.value(realtimeResponse); - } - - return archiveResponseFuture.flatMap( - new Function>() { - @Override - public Future apply(EarlybirdResponse archiveResponse) { - if (!EarlybirdResponseUtil.isSuccessfulResponse(archiveResponse)) { - return Future.value( - mergeWithUnsuccessfulArchiveResponse( - requestContext, realtimeResponse, archiveResponse)); - } - - List> responses = - ImmutableList.>builder() - .add(realtimeResponseFuture) - .add(archiveResponseFuture) - .build(); - - EarlybirdResponseMerger merger = new TermStatisticsResponseMerger( - requestContext, responses, new TierResponseAccumulator()); - - return merger.merge().map(new Function() { - @Override - public EarlybirdResponse apply(EarlybirdResponse mergedResponse) { - if (requestContext.getRequest().getDebugMode() > 0) { - mergedResponse.setDebugString( - SuperRootResponseMerger.mergeClusterDebugStrings( - realtimeResponse, null, archiveResponse)); - } - return mergedResponse; - } - }); - } - }); - } - }); - } - - private EarlybirdResponse mergeWithUnsuccessfulArchiveResponse( - EarlybirdRequestContext requestContext, - EarlybirdResponse realtimeResponse, - EarlybirdResponse archiveResponse) { - // If the realtime cluster was skipped, and the full archive returned an error - // response, return the full archive response. - if (isTierSkippedResponse(realtimeResponse)) { - return archiveResponse; - } - - // If the realtime response has results and the full archive cluster returned an error - // response, we return the realtime response. 
If the client needs more results, it can paginate, - // and on the next request it will get the error response from the full archive cluster. - if (realtimeResponse.isSetTermStatisticsResults() - && !realtimeResponse.getTermStatisticsResults().getTermResults().isEmpty()) { - realtimeResponse.setDebugString( - "Full archive cluster returned an error response (" - + archiveResponse.getResponseCode() + "). " - + SuperRootResponseMerger.mergeClusterDebugStrings( - realtimeResponse, null, archiveResponse)); - return updateMinCompleteBinId(requestContext, realtimeResponse); - } - - // If the realtime response has no results, and the full archive cluster returned an error - // response, return a PERSISTENT_ERROR response, and merge the debug strings from the two - // responses. - EarlybirdResponse mergedResponse = - new EarlybirdResponse(EarlybirdResponseCode.PERSISTENT_ERROR, 0); - mergedResponse.setDebugString( - "Full archive cluster returned an error response (" - + archiveResponse.getResponseCode() - + "), and the realtime response had no results. " - + SuperRootResponseMerger.mergeClusterDebugStrings( - realtimeResponse, null, archiveResponse)); - return mergedResponse; - } - - /** - * If we get a completed realtime response but a failed archive response, the minCompleteBinId we - * return will be incorrect -- the realtime minCompleteBinId is assumed to be the oldest bin - * returned, rather than the bin that intersects the realtime serving boundary. In these cases, we - * need to move the minCompleteBinId forward. - *

- * Note that we cannot always set the minCompleteBinId for the realtime results to the bin - * intersecting the realtime serving boundary: somewhere in the guts of the merging logic, we set - * the minCompleteBinId of the merged response to the max of the minCompleteBinIds of the original - * responses. :-( - */ - private EarlybirdResponse updateMinCompleteBinId( - EarlybirdRequestContext requestContext, EarlybirdResponse realtimeResponse) { - Preconditions.checkArgument( - realtimeResponse.getTermStatisticsResults().isSetMinCompleteBinId()); - int roundedServingRange = roundServingRangeUpToNearestBinId(requestContext, realtimeResponse); - int minCompleteBinId = Math.max( - roundedServingRange, - realtimeResponse.getTermStatisticsResults().getMinCompleteBinId()); - realtimeResponse.getTermStatisticsResults().setMinCompleteBinId(minCompleteBinId); - return realtimeResponse; - } - - private static EarlybirdResponse emptyResponse() { - return new EarlybirdResponse(EarlybirdResponseCode.SUCCESS, 0) - .setSearchResults(new ThriftSearchResults() - .setResults(Lists.newArrayList())) - .setDebugString("Full archive cluster not requested or not available."); - } - - private static boolean isTierSkippedResponse(EarlybirdResponse response) { - return response.getResponseCode() == EarlybirdResponseCode.TIER_SKIPPED; - } - - /** - * Given a termstats request/response pair, round the serving range for the appropriate cluster up - * to the nearest binId at the appropriate resolution. 
- */ - private int roundServingRangeUpToNearestBinId( - EarlybirdRequestContext request, EarlybirdResponse response) { - ServingRange servingRange = realtimeServingRangeProvider.getServingRange( - request, request.useOverrideTierConfig()); - long servingRangeStartSecs = servingRange.getServingRangeSinceTimeSecondsFromEpoch(); - int binSize = determineBinSize(response.getTermStatisticsResults().getHistogramSettings()); - return (int) Math.ceil((double) servingRangeStartSecs / binSize); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/routers/TermStatsRequestRouterModule.docx b/src/java/com/twitter/search/earlybird_root/routers/TermStatsRequestRouterModule.docx new file mode 100644 index 000000000..bd2d99746 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/routers/TermStatsRequestRouterModule.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/routers/TermStatsRequestRouterModule.java b/src/java/com/twitter/search/earlybird_root/routers/TermStatsRequestRouterModule.java deleted file mode 100644 index 6b11f5d43..000000000 --- a/src/java/com/twitter/search/earlybird_root/routers/TermStatsRequestRouterModule.java +++ /dev/null @@ -1,60 +0,0 @@ -package com.twitter.search.earlybird_root.routers; - -import javax.inject.Named; -import javax.inject.Singleton; - -import com.google.inject.Provides; - -import com.twitter.inject.TwitterModule; -import com.twitter.search.common.decider.SearchDecider; -import com.twitter.search.earlybird_root.filters.EarlybirdTimeRangeFilter; -import com.twitter.search.earlybird_root.filters.FullArchiveServingRangeProvider; -import com.twitter.search.earlybird_root.filters.RealtimeServingRangeProvider; -import com.twitter.search.earlybird_root.filters.ServingRangeProvider; - -public class TermStatsRequestRouterModule extends TwitterModule { - public static final String FULL_ARCHIVE_TIME_RANGE_FILTER = - "term_stats_full_archive_time_range_filter"; - public static final String 
REALTIME_TIME_RANGE_FILTER = - "term_stats_realtime_time_range_filter"; - - private static final String SUPERROOT_TERM_STATS_SERVING_RANGE_BOUNDARY_HOURS_AGO_DECIDER_KEY = - "superroot_term_stats_serving_range_boundary_hours_ago"; - - private ServingRangeProvider getFullArchiveTimeRangeProvider(final SearchDecider decider) - throws Exception { - return new FullArchiveServingRangeProvider( - decider, SUPERROOT_TERM_STATS_SERVING_RANGE_BOUNDARY_HOURS_AGO_DECIDER_KEY); - } - - private ServingRangeProvider getRealtimeTimeRangeProvider(final SearchDecider decider) - throws Exception { - return new RealtimeServingRangeProvider( - decider, SUPERROOT_TERM_STATS_SERVING_RANGE_BOUNDARY_HOURS_AGO_DECIDER_KEY); - } - - /** - * For term stats full archive cluster spans from 21 March to 2006 to 6 days ago from current time - */ - @Provides - @Singleton - @Named(FULL_ARCHIVE_TIME_RANGE_FILTER) - private EarlybirdTimeRangeFilter providesFullArchiveTimeRangeFilter(final SearchDecider decider) - throws Exception { - return EarlybirdTimeRangeFilter.newTimeRangeFilterWithQueryRewriter( - getFullArchiveTimeRangeProvider(decider), decider); - } - - /** - * For term stats realtime cluster spans from 6 days ago from current time to a far away date - * into the future - */ - @Provides - @Singleton - @Named(REALTIME_TIME_RANGE_FILTER) - private EarlybirdTimeRangeFilter providesRealtimeTimeRangeFilter(final SearchDecider decider) - throws Exception { - return EarlybirdTimeRangeFilter.newTimeRangeFilterWithQueryRewriter( - getRealtimeTimeRangeProvider(decider), decider); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/routers/TopTweetsRequestRouter.docx b/src/java/com/twitter/search/earlybird_root/routers/TopTweetsRequestRouter.docx new file mode 100644 index 000000000..04a36b2b4 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/routers/TopTweetsRequestRouter.docx differ diff --git 
a/src/java/com/twitter/search/earlybird_root/routers/TopTweetsRequestRouter.java b/src/java/com/twitter/search/earlybird_root/routers/TopTweetsRequestRouter.java deleted file mode 100644 index 20c2411b1..000000000 --- a/src/java/com/twitter/search/earlybird_root/routers/TopTweetsRequestRouter.java +++ /dev/null @@ -1,35 +0,0 @@ -package com.twitter.search.earlybird_root.routers; - -import javax.inject.Inject; -import javax.inject.Named; - -import com.twitter.finagle.Service; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird_root.common.EarlybirdRequestContext; -import com.twitter.search.earlybird_root.common.InjectionNames; -import com.twitter.search.earlybird_root.filters.EarlybirdTimeRangeFilter; -import com.twitter.util.Future; - -/** - * For TopTweets traffic SuperRoot forwards all traffic to the realtime cluster. - */ -public class TopTweetsRequestRouter extends RequestRouter { - - private final Service realtime; - - /** Creates a new TopTweetsRequestRouter instance to be used by the SuperRoot. 
*/ - @Inject - public TopTweetsRequestRouter( - @Named(InjectionNames.REALTIME) - Service realtime, - @Named(TopTweetsRequestRouterModule.TIME_RANGE_FILTER) - EarlybirdTimeRangeFilter timeRangeFilter) { - - this.realtime = timeRangeFilter.andThen(realtime); - } - - @Override - public Future route(EarlybirdRequestContext requestContext) { - return realtime.apply(requestContext); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/routers/TopTweetsRequestRouterModule.docx b/src/java/com/twitter/search/earlybird_root/routers/TopTweetsRequestRouterModule.docx new file mode 100644 index 000000000..1ccdb3b3d Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/routers/TopTweetsRequestRouterModule.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/routers/TopTweetsRequestRouterModule.java b/src/java/com/twitter/search/earlybird_root/routers/TopTweetsRequestRouterModule.java deleted file mode 100644 index 03a247afb..000000000 --- a/src/java/com/twitter/search/earlybird_root/routers/TopTweetsRequestRouterModule.java +++ /dev/null @@ -1,32 +0,0 @@ -package com.twitter.search.earlybird_root.routers; - -import javax.inject.Named; -import javax.inject.Singleton; - -import com.google.inject.Provides; - -import com.twitter.inject.TwitterModule; -import com.twitter.search.common.decider.SearchDecider; -import com.twitter.search.earlybird_root.filters.EarlybirdTimeRangeFilter; -import com.twitter.search.earlybird_root.filters.RealtimeServingRangeProvider; -import com.twitter.search.earlybird_root.filters.ServingRangeProvider; - -public class TopTweetsRequestRouterModule extends TwitterModule { - public static final String TIME_RANGE_FILTER = "top_tweets_time_range_filter"; - - public static final String SERVING_RANGE_BOUNDARY_HOURS_AGO_DECIDER_KEY = - "superroot_top_tweets_serving_range_boundary_hours_ago"; - - private ServingRangeProvider getServingRangeProvider(final SearchDecider decider) - throws Exception { - return new 
RealtimeServingRangeProvider(decider, SERVING_RANGE_BOUNDARY_HOURS_AGO_DECIDER_KEY); - } - - @Provides - @Singleton - @Named(TIME_RANGE_FILTER) - private EarlybirdTimeRangeFilter providesTimeRangeFilter(SearchDecider decider) throws Exception { - return EarlybirdTimeRangeFilter.newTimeRangeFilterWithoutQueryRewriter( - getServingRangeProvider(decider)); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/validators/BUILD b/src/java/com/twitter/search/earlybird_root/validators/BUILD deleted file mode 100644 index 3a39026c1..000000000 --- a/src/java/com/twitter/search/earlybird_root/validators/BUILD +++ /dev/null @@ -1,9 +0,0 @@ -java_library( - sources = ["*.java"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "src/java/com/twitter/search/common/schema/earlybird", - "src/thrift/com/twitter/search:earlybird-java", - ], -) diff --git a/src/java/com/twitter/search/earlybird_root/validators/BUILD.docx b/src/java/com/twitter/search/earlybird_root/validators/BUILD.docx new file mode 100644 index 000000000..a69f7f087 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/validators/BUILD.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/validators/FacetsResponseValidator.docx b/src/java/com/twitter/search/earlybird_root/validators/FacetsResponseValidator.docx new file mode 100644 index 000000000..6c62e52e4 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/validators/FacetsResponseValidator.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/validators/FacetsResponseValidator.java b/src/java/com/twitter/search/earlybird_root/validators/FacetsResponseValidator.java deleted file mode 100644 index a40c17a42..000000000 --- a/src/java/com/twitter/search/earlybird_root/validators/FacetsResponseValidator.java +++ /dev/null @@ -1,39 +0,0 @@ -package com.twitter.search.earlybird_root.validators; - -import com.twitter.search.common.schema.earlybird.EarlybirdCluster; 
-import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.util.Future; - -public class FacetsResponseValidator implements ServiceResponseValidator { - - private final EarlybirdCluster cluster; - - /** - * Validator for facets responses - */ - public FacetsResponseValidator(EarlybirdCluster cluster) { - this.cluster = cluster; - } - - @Override - public Future validate(EarlybirdResponse response) { - if (!response.isSetSearchResults() || !response.getSearchResults().isSetResults()) { - return Future.exception( - new IllegalStateException(cluster + " didn't set search results.")); - } - - if (!response.isSetFacetResults()) { - return Future.exception( - new IllegalStateException( - cluster + " facets response does not have the facetResults field set.")); - } - - if (response.getFacetResults().getFacetFields().isEmpty()) { - return Future.exception( - new IllegalStateException( - cluster + " facets response does not have any facet fields set.")); - } - - return Future.value(response); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/validators/PassThroughResponseValidator.docx b/src/java/com/twitter/search/earlybird_root/validators/PassThroughResponseValidator.docx new file mode 100644 index 000000000..c569aae5d Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/validators/PassThroughResponseValidator.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/validators/PassThroughResponseValidator.java b/src/java/com/twitter/search/earlybird_root/validators/PassThroughResponseValidator.java deleted file mode 100644 index af4de0cec..000000000 --- a/src/java/com/twitter/search/earlybird_root/validators/PassThroughResponseValidator.java +++ /dev/null @@ -1,12 +0,0 @@ -package com.twitter.search.earlybird_root.validators; - -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.util.Future; - -/** A no-op ServiceResponseValidator. 
*/ -public class PassThroughResponseValidator implements ServiceResponseValidator { - @Override - public Future validate(EarlybirdResponse response) { - return Future.value(response); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/validators/SearchResultsValidator.docx b/src/java/com/twitter/search/earlybird_root/validators/SearchResultsValidator.docx new file mode 100644 index 000000000..f429c7b0d Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/validators/SearchResultsValidator.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/validators/SearchResultsValidator.java b/src/java/com/twitter/search/earlybird_root/validators/SearchResultsValidator.java deleted file mode 100644 index 39d4f2392..000000000 --- a/src/java/com/twitter/search/earlybird_root/validators/SearchResultsValidator.java +++ /dev/null @@ -1,37 +0,0 @@ -package com.twitter.search.earlybird_root.validators; - -import com.twitter.search.common.schema.earlybird.EarlybirdCluster; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.util.Future; - -public class SearchResultsValidator - implements ServiceResponseValidator { - - private final EarlybirdCluster cluster; - - public SearchResultsValidator(EarlybirdCluster cluster) { - this.cluster = cluster; - } - - @Override - public Future validate(EarlybirdResponse response) { - if (!response.isSetSearchResults() - || !response.getSearchResults().isSetResults()) { - return Future.exception( - new IllegalStateException(cluster + " didn't set search results")); - } else if (!response.getSearchResults().isSetMaxSearchedStatusID()) { - return Future.exception( - new IllegalStateException(cluster + " didn't set max searched status id")); - } else { - boolean isEarlyTerminated = response.isSetEarlyTerminationInfo() - && response.getEarlyTerminationInfo().isEarlyTerminated(); - if (!isEarlyTerminated && !response.getSearchResults().isSetMinSearchedStatusID()) { - return 
Future.exception( - new IllegalStateException( - cluster + " neither early terminated nor set min searched status id")); - } else { - return Future.value(response); - } - } - } -} diff --git a/src/java/com/twitter/search/earlybird_root/validators/ServiceResponseValidator.docx b/src/java/com/twitter/search/earlybird_root/validators/ServiceResponseValidator.docx new file mode 100644 index 000000000..ca9e3fba6 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/validators/ServiceResponseValidator.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/validators/ServiceResponseValidator.java b/src/java/com/twitter/search/earlybird_root/validators/ServiceResponseValidator.java deleted file mode 100644 index b025d6476..000000000 --- a/src/java/com/twitter/search/earlybird_root/validators/ServiceResponseValidator.java +++ /dev/null @@ -1,10 +0,0 @@ -package com.twitter.search.earlybird_root.validators; - -import com.twitter.util.Future; - -public interface ServiceResponseValidator { - /** - * Interface for validating Service responses - */ - Future validate(R response); -} diff --git a/src/java/com/twitter/search/earlybird_root/validators/TermStatsResultsValidator.docx b/src/java/com/twitter/search/earlybird_root/validators/TermStatsResultsValidator.docx new file mode 100644 index 000000000..d9fcd5599 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/validators/TermStatsResultsValidator.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/validators/TermStatsResultsValidator.java b/src/java/com/twitter/search/earlybird_root/validators/TermStatsResultsValidator.java deleted file mode 100644 index 01324f3c5..000000000 --- a/src/java/com/twitter/search/earlybird_root/validators/TermStatsResultsValidator.java +++ /dev/null @@ -1,23 +0,0 @@ -package com.twitter.search.earlybird_root.validators; - -import com.twitter.search.common.schema.earlybird.EarlybirdCluster; -import 
com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.util.Future; - -public class TermStatsResultsValidator implements ServiceResponseValidator { - private final EarlybirdCluster cluster; - - public TermStatsResultsValidator(EarlybirdCluster cluster) { - this.cluster = cluster; - } - - @Override - public Future validate(EarlybirdResponse response) { - if (!response.isSetTermStatisticsResults() - || !response.getTermStatisticsResults().isSetTermResults()) { - return Future.exception( - new IllegalStateException(cluster + " returned null term statistics results.")); - } - return Future.value(response); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/validators/TopTweetsResultsValidator.docx b/src/java/com/twitter/search/earlybird_root/validators/TopTweetsResultsValidator.docx new file mode 100644 index 000000000..836e0a862 Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/validators/TopTweetsResultsValidator.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/validators/TopTweetsResultsValidator.java b/src/java/com/twitter/search/earlybird_root/validators/TopTweetsResultsValidator.java deleted file mode 100644 index a0ad8eb89..000000000 --- a/src/java/com/twitter/search/earlybird_root/validators/TopTweetsResultsValidator.java +++ /dev/null @@ -1,22 +0,0 @@ -package com.twitter.search.earlybird_root.validators; - -import com.twitter.search.common.schema.earlybird.EarlybirdCluster; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.util.Future; - -public class TopTweetsResultsValidator implements ServiceResponseValidator { - private final EarlybirdCluster cluster; - - public TopTweetsResultsValidator(EarlybirdCluster cluster) { - this.cluster = cluster; - } - - @Override - public Future validate(EarlybirdResponse response) { - if (!response.isSetSearchResults() || !response.getSearchResults().isSetResults()) { - return Future.exception( - new 
IllegalStateException(cluster + " didn't set search results.")); - } - return Future.value(response); - } -} diff --git a/src/java/com/twitter/search/earlybird_root/visitors/BUILD b/src/java/com/twitter/search/earlybird_root/visitors/BUILD deleted file mode 100644 index d82aaf4c7..000000000 --- a/src/java/com/twitter/search/earlybird_root/visitors/BUILD +++ /dev/null @@ -1,13 +0,0 @@ -java_library( - sources = ["*.java"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/guava", - "src/java/com/twitter/search/common/partitioning/base", - "src/java/com/twitter/search/common/schema/base", - "src/java/com/twitter/search/common/schema/earlybird", - "src/java/com/twitter/search/queryparser/query:core-query-nodes", - "src/java/com/twitter/search/queryparser/query/search:search-query-nodes", - ], -) diff --git a/src/java/com/twitter/search/earlybird_root/visitors/BUILD.docx b/src/java/com/twitter/search/earlybird_root/visitors/BUILD.docx new file mode 100644 index 000000000..c09ab13fb Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/visitors/BUILD.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/visitors/MultiTermDisjunctionPerPartitionVisitor.docx b/src/java/com/twitter/search/earlybird_root/visitors/MultiTermDisjunctionPerPartitionVisitor.docx new file mode 100644 index 000000000..d1603452d Binary files /dev/null and b/src/java/com/twitter/search/earlybird_root/visitors/MultiTermDisjunctionPerPartitionVisitor.docx differ diff --git a/src/java/com/twitter/search/earlybird_root/visitors/MultiTermDisjunctionPerPartitionVisitor.java b/src/java/com/twitter/search/earlybird_root/visitors/MultiTermDisjunctionPerPartitionVisitor.java deleted file mode 100644 index 646b46e2c..000000000 --- a/src/java/com/twitter/search/earlybird_root/visitors/MultiTermDisjunctionPerPartitionVisitor.java +++ /dev/null @@ -1,136 +0,0 @@ -package com.twitter.search.earlybird_root.visitors; - -import 
java.util.Collections; -import java.util.List; -import java.util.stream.Collectors; - -import com.google.common.collect.ImmutableList; -import com.google.common.collect.Lists; - -import com.twitter.search.common.partitioning.base.PartitionDataType; -import com.twitter.search.common.partitioning.base.PartitionMappingManager; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants; -import com.twitter.search.queryparser.query.Conjunction; -import com.twitter.search.queryparser.query.Disjunction; -import com.twitter.search.queryparser.query.Query; -import com.twitter.search.queryparser.query.Query.Occur; -import com.twitter.search.queryparser.query.QueryParserException; -import com.twitter.search.queryparser.query.search.SearchOperator; -import com.twitter.search.queryparser.query.search.SearchQueryTransformer; - -/** - * Truncate user id or id lists in [multi_term_disjunction from_user_id/id] queries. - * Return null if query has incorrect operators or looked at wrong field. - */ -public class MultiTermDisjunctionPerPartitionVisitor extends SearchQueryTransformer { - private final PartitionMappingManager partitionMappingManager; - private final int partitionId; - private final String targetFieldName; - - public static final Conjunction NO_MATCH_CONJUNCTION = - new Conjunction(Occur.MUST_NOT, Collections.emptyList(), Collections.emptyList()); - - public MultiTermDisjunctionPerPartitionVisitor( - PartitionMappingManager partitionMappingManager, - int partitionId) { - this.partitionMappingManager = partitionMappingManager; - this.partitionId = partitionId; - this.targetFieldName = - partitionMappingManager.getPartitionDataType() == PartitionDataType.USER_ID - ? 
EarlybirdFieldConstants.EarlybirdFieldConstant.FROM_USER_ID_FIELD.getFieldName() - : EarlybirdFieldConstants.EarlybirdFieldConstant.ID_FIELD.getFieldName(); - } - - private boolean isTargetedQuery(Query query) { - if (query instanceof SearchOperator) { - SearchOperator operator = (SearchOperator) query; - return operator.getOperatorType() == SearchOperator.Type.MULTI_TERM_DISJUNCTION - && operator.getOperand().equals(targetFieldName); - } else { - return false; - } - } - - @Override - public Query visit(Conjunction query) throws QueryParserException { - boolean modified = false; - ImmutableList.Builder children = ImmutableList.builder(); - for (Query child : query.getChildren()) { - Query newChild = child.accept(this); - if (newChild != null) { - // For conjunction case, if any child is "multi_term_disjunction from_user_id" and returns - // Conjunction.NO_MATCH_CONJUNCTION, it should be considered same as match no docs. And - // caller should decide how to deal with it. - if (isTargetedQuery(child) && newChild == NO_MATCH_CONJUNCTION) { - return NO_MATCH_CONJUNCTION; - } - if (newChild != Conjunction.EMPTY_CONJUNCTION - && newChild != Disjunction.EMPTY_DISJUNCTION) { - children.add(newChild); - } - } - if (newChild != child) { - modified = true; - } - } - return modified ? query.newBuilder().setChildren(children.build()).build() : query; - } - - @Override - public Query visit(Disjunction disjunction) throws QueryParserException { - boolean modified = false; - ImmutableList.Builder children = ImmutableList.builder(); - for (Query child : disjunction.getChildren()) { - Query newChild = child.accept(this); - if (newChild != null - && newChild != Conjunction.EMPTY_CONJUNCTION - && newChild != Disjunction.EMPTY_DISJUNCTION - && newChild != NO_MATCH_CONJUNCTION) { - children.add(newChild); - } - if (newChild != child) { - modified = true; - } - } - return modified ? 
disjunction.newBuilder().setChildren(children.build()).build() : disjunction; - } - - @Override - public Query visit(SearchOperator operator) throws QueryParserException { - if (isTargetedQuery(operator)) { - List ids = extractIds(operator); - if (ids.size() > 0) { - List operands = Lists.newArrayList(targetFieldName); - for (long id : ids) { - operands.add(String.valueOf(id)); - } - return operator.newBuilder().setOperands(operands).build(); - } else { - // If the [multi_term_disjunction from_user_id] is a negation (i.e., occur == MUST_NOT), - // and there is no user id left, the whole sub query node does not do anything; if it is - // NOT a negation, then sub query matches nothing. - if (operator.getOccur() == Query.Occur.MUST_NOT) { - return Conjunction.EMPTY_CONJUNCTION; - } else { - return NO_MATCH_CONJUNCTION; - } - } - } - return operator; - } - - private List extractIds(SearchOperator operator) throws QueryParserException { - if (EarlybirdFieldConstants.EarlybirdFieldConstant.ID_FIELD - .getFieldName().equals(targetFieldName)) { - return operator.getOperands().subList(1, operator.getNumOperands()).stream() - .map(Long::valueOf) - .filter(id -> partitionMappingManager.getPartitionIdForTweetId(id) == partitionId) - .collect(Collectors.toList()); - } else { - return operator.getOperands().subList(1, operator.getNumOperands()).stream() - .map(Long::valueOf) - .filter(id -> partitionMappingManager.getPartitionIdForUserId(id) == partitionId) - .collect(Collectors.toList()); - } - } -} diff --git a/src/java/com/twitter/search/feature_update_service/BUILD b/src/java/com/twitter/search/feature_update_service/BUILD deleted file mode 100644 index 449f39d9b..000000000 --- a/src/java/com/twitter/search/feature_update_service/BUILD +++ /dev/null @@ -1,86 +0,0 @@ -java_library( - name = "feature_update_service-lib", - sources = ["*.java"], - compiler_option_sets = ["fatal_warnings"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - 
"3rdparty/jvm/com/fasterxml/jackson/core:jackson-annotations", - "3rdparty/jvm/com/google/inject:guice", - "3rdparty/jvm/javax/inject:javax.inject", - "3rdparty/jvm/org/apache/kafka:kafka-clients", - "3rdparty/jvm/org/apache/thrift:libthrift", - "decider/src/main/scala", - "finagle/finagle-core/src/main", - "finagle/finagle-exp/src/main/scala", - "finagle/finagle-http/src/main/scala", - "finagle/finagle-thrift/src/main/scala", - "finagle/finagle-thriftmux/src/main/scala", - "finatra-internal/decider/src/main/scala", - "finatra-internal/diffy/src/main/scala", - "finatra-internal/mtls-thriftmux/src/main/scala", - "finatra/inject/inject-app/src/main/scala", - "finatra/inject/inject-core/src/main/scala", - "finatra/inject/inject-server/src/main/scala", - "finatra/inject/inject-slf4j/src/main/scala", - "finatra/inject/inject-slf4j/src/main/scala/com/twitter/inject", - "finatra/inject/inject-thrift-client/src/main/scala", - "finatra/inject/inject-utils/src/main/scala", - "finatra/thrift/src/main/scala/com/twitter/finatra/thrift", - "finatra/thrift/src/main/scala/com/twitter/finatra/thrift:controller", - "finatra/thrift/src/main/scala/com/twitter/finatra/thrift/exceptions", - "finatra/thrift/src/main/scala/com/twitter/finatra/thrift/filters", - "finatra/thrift/src/main/scala/com/twitter/finatra/thrift/modules", - "finatra/thrift/src/main/scala/com/twitter/finatra/thrift/response", - "finatra/thrift/src/main/scala/com/twitter/finatra/thrift/routing", - "kafka/finagle-kafka/finatra-kafka/src/main/scala", - "science/search/feature_update_service/resources", - "src/java/com/twitter/common/base", - "src/java/com/twitter/common/util:system-mocks", - "src/java/com/twitter/common_internal/text/version", - "src/java/com/twitter/search/common/constants", - "src/java/com/twitter/search/common/debug", - "src/java/com/twitter/search/common/decider", - "src/java/com/twitter/search/common/metrics", - "src/java/com/twitter/search/common/partitioning/base", - 
"src/java/com/twitter/search/common/schema/base", - "src/java/com/twitter/search/common/util:platform_stats_exporter", - "src/java/com/twitter/search/common/util/io/periodic", - "src/java/com/twitter/search/common/util/thrift:text-protocol", - "src/java/com/twitter/search/feature_update_service/filters", - "src/java/com/twitter/search/feature_update_service/modules", - "src/java/com/twitter/search/feature_update_service/stats", - "src/java/com/twitter/search/feature_update_service/util", - "src/java/com/twitter/search/ingester/model", - "src/thrift/com/twitter/search/common:indexing-java", - "src/thrift/com/twitter/search/common:schema-java", - "src/thrift/com/twitter/search/feature_update_service/thrift:thrift-java", - "src/thrift/com/twitter/tweetypie:service-java", - "src/thrift/com/twitter/tweetypie:tweet-java", - "thrift-web-forms/src/main/java/com/twitter/thriftwebforms", - "thrift-web-forms/src/main/scala/com/twitter/thriftwebforms", - "thrift-web-forms/src/main/scala/com/twitter/thriftwebforms/model", - "twitter-server-internal/src/main/scala", - "twitter-server/server/src/main/scala", - "util/util-app/src/main/scala", - "util/util-core:scala", - "util/util-function/src/main/java", - "util/util-lint/src/main/scala", - "util/util-slf4j-api/src/main/scala", - "util/util-stats/src/main/scala", - ], -) - -jvm_binary( - name = "feature_update_service", - basename = "feature_update_service", - main = "com.twitter.search.feature_update_service.FeatureUpdateServiceThriftServerMain", - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":feature_update_service-lib", - "3rdparty/jvm/ch/qos/logback:logback-classic", - "loglens/loglens-logback/src/main/scala/com/twitter/loglens/logback", - "twitter-server-internal/src/main/scala", - ], -) diff --git a/src/java/com/twitter/search/feature_update_service/BUILD.docx b/src/java/com/twitter/search/feature_update_service/BUILD.docx new file mode 100644 index 000000000..1f9126908 Binary files /dev/null and 
b/src/java/com/twitter/search/feature_update_service/BUILD.docx differ diff --git a/src/java/com/twitter/search/feature_update_service/FeatureUpdateController.docx b/src/java/com/twitter/search/feature_update_service/FeatureUpdateController.docx new file mode 100644 index 000000000..0cd5accdc Binary files /dev/null and b/src/java/com/twitter/search/feature_update_service/FeatureUpdateController.docx differ diff --git a/src/java/com/twitter/search/feature_update_service/FeatureUpdateController.java b/src/java/com/twitter/search/feature_update_service/FeatureUpdateController.java deleted file mode 100644 index 1613bee3c..000000000 --- a/src/java/com/twitter/search/feature_update_service/FeatureUpdateController.java +++ /dev/null @@ -1,245 +0,0 @@ -package com.twitter.search.feature_update_service; - -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import javax.inject.Inject; -import javax.inject.Named; - -import scala.runtime.BoxedUnit; - -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.Lists; - -import org.apache.kafka.clients.producer.ProducerRecord; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.util.Clock; -import com.twitter.common_internal.text.version.PenguinVersion; -import com.twitter.decider.Decider; -import com.twitter.finagle.mux.ClientDiscardedRequestException; -import com.twitter.finagle.thrift.ClientId; -import com.twitter.finatra.kafka.producers.BlockingFinagleKafkaProducer; -import com.twitter.inject.annotations.Flag; -import com.twitter.search.common.decider.DeciderUtil; -import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent; -import com.twitter.search.common.schema.thriftjava.ThriftIndexingEventType; -import 
com.twitter.search.feature_update_service.modules.EarlybirdUtilModule; -import com.twitter.search.feature_update_service.modules.FinagleKafkaProducerModule; -import com.twitter.search.feature_update_service.stats.FeatureUpdateStats; -import com.twitter.search.feature_update_service.thriftjava.FeatureUpdateRequest; -import com.twitter.search.feature_update_service.thriftjava.FeatureUpdateResponse; -import com.twitter.search.feature_update_service.thriftjava.FeatureUpdateResponseCode; -import com.twitter.search.feature_update_service.thriftjava.FeatureUpdateService; -import com.twitter.search.feature_update_service.util.FeatureUpdateValidator; -import com.twitter.search.ingester.model.IngesterThriftVersionedEvents; -import com.twitter.tweetypie.thriftjava.GetTweetFieldsOptions; -import com.twitter.tweetypie.thriftjava.GetTweetFieldsRequest; -import com.twitter.tweetypie.thriftjava.TweetInclude; -import com.twitter.tweetypie.thriftjava.TweetService; -import com.twitter.tweetypie.thriftjava.TweetVisibilityPolicy; -import com.twitter.util.ExecutorServiceFuturePool; -import com.twitter.util.Function; -import com.twitter.util.Future; -import com.twitter.util.Futures; - -import static com.twitter.tweetypie.thriftjava.Tweet._Fields.CORE_DATA; - -public class FeatureUpdateController implements FeatureUpdateService.ServiceIface { - private static final Logger LOG = LoggerFactory.getLogger(FeatureUpdateController.class); - private static final Logger REQUEST_LOG = - LoggerFactory.getLogger("feature_update_service_requests"); - private static final String KAFKA_SEND_COUNT_FORMAT = "kafka_%s_partition_%d_send_count"; - private static final String WRITE_TO_KAFKA_DECIDER_KEY = "write_events_to_kafka_update_events"; - private static final String WRITE_TO_KAFKA_DECIDER_KEY_REALTIME_CG = - "write_events_to_kafka_update_events_realtime_cg"; - - private final SearchRateCounter droppedKafkaUpdateEvents = - SearchRateCounter.export("dropped_kafka_update_events"); - - private final 
SearchRateCounter droppedKafkaUpdateEventsRealtimeCg = - SearchRateCounter.export("dropped_kafka_update_events_realtime_cg"); - private final Clock clock; - private final Decider decider; - private final BlockingFinagleKafkaProducer kafkaProducer; - private final BlockingFinagleKafkaProducer kafkaProducerRealtimeCg; - - private final List penguinVersions; - private final FeatureUpdateStats stats; - private final String kafkaUpdateEventsTopicName; - private final String kafkaUpdateEventsTopicNameRealtimeCg; - private final ExecutorServiceFuturePool futurePool; - private final TweetService.ServiceIface tweetService; - - @Inject - public FeatureUpdateController( - Clock clock, - Decider decider, - @Named("KafkaProducer") - BlockingFinagleKafkaProducer kafkaProducer, - @Named("KafkaProducerRealtimeCg") - BlockingFinagleKafkaProducer kafkaProducerRealtimeCg, - @Flag(EarlybirdUtilModule.PENGUIN_VERSIONS_FLAG) String penguinVersions, - FeatureUpdateStats stats, - @Flag(FinagleKafkaProducerModule.KAFKA_TOPIC_NAME_UPDATE_EVENTS_FLAG) - String kafkaUpdateEventsTopicName, - @Flag(FinagleKafkaProducerModule.KAFKA_TOPIC_NAME_UPDATE_EVENTS_FLAG_REALTIME_CG) - String kafkaUpdateEventsTopicNameRealtimeCg, - ExecutorServiceFuturePool futurePool, - TweetService.ServiceIface tweetService - ) { - this.clock = clock; - this.decider = decider; - this.kafkaProducer = kafkaProducer; - this.kafkaProducerRealtimeCg = kafkaProducerRealtimeCg; - this.penguinVersions = getPenguinVersions(penguinVersions); - this.stats = stats; - this.kafkaUpdateEventsTopicName = kafkaUpdateEventsTopicName; - this.kafkaUpdateEventsTopicNameRealtimeCg = kafkaUpdateEventsTopicNameRealtimeCg; - this.futurePool = futurePool; - this.tweetService = tweetService; - } - - @Override - public Future process(FeatureUpdateRequest featureUpdate) { - long requestStartTimeMillis = clock.nowMillis(); - - // Export overall and per-client request rate stats - final String requestClientId; - if (featureUpdate.getRequestClientId() 
!= null - && !featureUpdate.getRequestClientId().isEmpty()) { - requestClientId = featureUpdate.getRequestClientId(); - } else if (ClientId.current().nonEmpty()) { - requestClientId = ClientId.current().get().name(); - } else { - requestClientId = "unknown"; - } - stats.clientRequest(requestClientId); - REQUEST_LOG.info("{} {}", requestClientId, featureUpdate); - - FeatureUpdateResponse errorResponse = FeatureUpdateValidator.validate(featureUpdate); - if (errorResponse != null) { - stats.clientResponse(requestClientId, errorResponse.getResponseCode()); - LOG.warn("client error: clientID {} - reason: {}", - requestClientId, errorResponse.getDetailMessage()); - return Future.value(errorResponse); - } - - ThriftIndexingEvent event = featureUpdate.getEvent(); - return writeToKafka(event, requestStartTimeMillis) - .map(responsesList -> { - stats.clientResponse(requestClientId, FeatureUpdateResponseCode.SUCCESS); - // only when both Realtime & RealtimeCG succeed, then it will return a success flag - return new FeatureUpdateResponse(FeatureUpdateResponseCode.SUCCESS); - }) - .handle(Function.func(throwable -> { - FeatureUpdateResponseCode responseCode; - // if either Realtime or RealtimeCG throws an exception, it will return a failure - if (throwable instanceof ClientDiscardedRequestException) { - responseCode = FeatureUpdateResponseCode.CLIENT_CANCEL_ERROR; - LOG.info("ClientDiscardedRequestException received from client: " + requestClientId, - throwable); - } else { - responseCode = FeatureUpdateResponseCode.TRANSIENT_ERROR; - LOG.error("Error occurred while writing to output stream: " - + kafkaUpdateEventsTopicName + ", " - + kafkaUpdateEventsTopicNameRealtimeCg, throwable); - } - stats.clientResponse(requestClientId, responseCode); - return new FeatureUpdateResponse(responseCode) - .setDetailMessage(throwable.getMessage()); - })); - } - - /** - * In writeToKafka(), we use Futures.collect() to aggregate results for two RPC calls - * Futures.collect() means that if 
either one of the Future fails then it will return an Exception - * only when both Realtime & RealtimeCG succeed, then it will return a success flag - * The FeatureUpdateResponse is more like an ACK message, and the upstream (feature update ingester) - * will not be affected much even if it failed (as long as the kafka message is written) - */ - private Future> writeToKafka(ThriftIndexingEvent event, - long requestStartTimeMillis) { - return Futures.collect(Lists.newArrayList( - writeToKafkaInternal(event, WRITE_TO_KAFKA_DECIDER_KEY, droppedKafkaUpdateEvents, - kafkaUpdateEventsTopicName, -1, kafkaProducer), - Futures.flatten(getUserId(event.getUid()).map( - userId -> writeToKafkaInternal(event, WRITE_TO_KAFKA_DECIDER_KEY_REALTIME_CG, - droppedKafkaUpdateEventsRealtimeCg, - kafkaUpdateEventsTopicNameRealtimeCg, userId, kafkaProducerRealtimeCg))))); - - } - - private Future writeToKafkaInternal(ThriftIndexingEvent event, String deciderKey, - SearchRateCounter droppedStats, String topicName, long userId, - BlockingFinagleKafkaProducer producer) { - if (!DeciderUtil.isAvailableForRandomRecipient(decider, deciderKey)) { - droppedStats.increment(); - return Future.Unit(); - } - - ProducerRecord producerRecord = new ProducerRecord<>( - topicName, - convertToThriftVersionedEvents(userId, event)); - - try { - return Futures.flatten(futurePool.apply(() -> - producer.send(producerRecord) - .map(record -> { - SearchCounter.export(String.format( - KAFKA_SEND_COUNT_FORMAT, record.topic(), record.partition())).increment(); - return BoxedUnit.UNIT; - }))); - } catch (Exception e) { - return Future.exception(e); - } - } - - private List getPenguinVersions(String penguinVersionsStr) { - String[] tokens = penguinVersionsStr.split("\\s*,\\s*"); - List listOfPenguinVersions = Lists.newArrayListWithCapacity(tokens.length); - for (String token : tokens) { - listOfPenguinVersions.add(PenguinVersion.valueOf(token.toUpperCase())); - } - LOG.info(String.format("Using Penguin Versions: %s", 
listOfPenguinVersions)); - return listOfPenguinVersions; - } - - private Future getUserId(long tweetId) { - TweetInclude tweetInclude = new TweetInclude(); - tweetInclude.setTweetFieldId(CORE_DATA.getThriftFieldId()); - GetTweetFieldsOptions getTweetFieldsOptions = new GetTweetFieldsOptions().setTweet_includes( - Collections.singleton(tweetInclude)).setVisibilityPolicy( - TweetVisibilityPolicy.NO_FILTERING); - GetTweetFieldsRequest getTweetFieldsRequest = new GetTweetFieldsRequest().setTweetIds( - Arrays.asList(tweetId)).setOptions(getTweetFieldsOptions); - try { - return tweetService.get_tweet_fields(getTweetFieldsRequest).map( - tweetFieldsResults -> tweetFieldsResults.get( - 0).tweetResult.getFound().tweet.core_data.user_id); - } catch (Exception e) { - return Future.exception(e); - } - } - - private ThriftVersionedEvents convertToThriftVersionedEvents( - long userId, ThriftIndexingEvent event) { - ThriftIndexingEvent thriftIndexingEvent = event.deepCopy() - .setEventType(ThriftIndexingEventType.PARTIAL_UPDATE); - - ImmutableMap.Builder versionedEventsBuilder = - new ImmutableMap.Builder<>(); - for (PenguinVersion penguinVersion : penguinVersions) { - versionedEventsBuilder.put(penguinVersion.getByteValue(), thriftIndexingEvent); - } - - IngesterThriftVersionedEvents thriftVersionedEvents = - new IngesterThriftVersionedEvents(userId, versionedEventsBuilder.build()); - thriftVersionedEvents.setId(thriftIndexingEvent.getUid()); - return thriftVersionedEvents; - } -} diff --git a/src/java/com/twitter/search/feature_update_service/FeatureUpdateResponseClassifier.docx b/src/java/com/twitter/search/feature_update_service/FeatureUpdateResponseClassifier.docx new file mode 100644 index 000000000..2d7c75939 Binary files /dev/null and b/src/java/com/twitter/search/feature_update_service/FeatureUpdateResponseClassifier.docx differ diff --git a/src/java/com/twitter/search/feature_update_service/FeatureUpdateResponseClassifier.java 
b/src/java/com/twitter/search/feature_update_service/FeatureUpdateResponseClassifier.java deleted file mode 100644 index c63e81f46..000000000 --- a/src/java/com/twitter/search/feature_update_service/FeatureUpdateResponseClassifier.java +++ /dev/null @@ -1,43 +0,0 @@ -package com.twitter.search.feature_update_service; - -import scala.runtime.AbstractPartialFunction; - -import com.twitter.finagle.service.ReqRep; -import com.twitter.finagle.service.ResponseClass; -import com.twitter.finagle.service.ResponseClassifier; -import com.twitter.search.feature_update_service.thriftjava.FeatureUpdateResponse; -import com.twitter.search.feature_update_service.thriftjava.FeatureUpdateResponseCode; -import com.twitter.util.Try; - -public class FeatureUpdateResponseClassifier - extends AbstractPartialFunction { - @Override - public boolean isDefinedAt(ReqRep tuple) { - return true; - } - - @Override - public ResponseClass apply(ReqRep reqRep) { - Try finagleResponse = reqRep.response(); - if (finagleResponse.isThrow()) { - return ResponseClassifier.Default().apply(reqRep); - } - FeatureUpdateResponse response = (FeatureUpdateResponse) finagleResponse.apply(); - FeatureUpdateResponseCode responseCode = response.getResponseCode(); - switch (responseCode) { - case TRANSIENT_ERROR: - case SERVER_TIMEOUT_ERROR: - return ResponseClass.RetryableFailure(); - case PERSISTENT_ERROR: - return ResponseClass.NonRetryableFailure(); - // Client cancellations don't necessarily mean failures on our end. The client decided to - // cancel the request (for example we timed out, so they sent a duplicate request etc.), - // so let's treat them as successes. - case CLIENT_CANCEL_ERROR: - default: - // The other response codes are client errors, and success, and in those cases the server - // behaved correctly, so we classify it as a success. 
- return ResponseClass.Success(); - } - } -} diff --git a/src/java/com/twitter/search/feature_update_service/FeatureUpdateServiceThriftServer.docx b/src/java/com/twitter/search/feature_update_service/FeatureUpdateServiceThriftServer.docx new file mode 100644 index 000000000..346d090ee Binary files /dev/null and b/src/java/com/twitter/search/feature_update_service/FeatureUpdateServiceThriftServer.docx differ diff --git a/src/java/com/twitter/search/feature_update_service/FeatureUpdateServiceThriftServer.java b/src/java/com/twitter/search/feature_update_service/FeatureUpdateServiceThriftServer.java deleted file mode 100644 index 7f2730560..000000000 --- a/src/java/com/twitter/search/feature_update_service/FeatureUpdateServiceThriftServer.java +++ /dev/null @@ -1,149 +0,0 @@ -package com.twitter.search.feature_update_service; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; -import java.util.List; -import java.util.concurrent.TimeUnit; - -import com.google.common.base.Preconditions; -import com.google.inject.Module; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.app.Flag; -import com.twitter.app.Flaggable; -import com.twitter.finagle.Filter; -import com.twitter.finagle.Service; -import com.twitter.finagle.ThriftMux; -import com.twitter.finatra.annotations.DarkTrafficFilterType; -import com.twitter.finatra.decider.modules.DeciderModule$; -import com.twitter.finatra.mtls.thriftmux.modules.MtlsThriftWebFormsModule; -import com.twitter.finatra.mtls.thriftmux.AbstractMtlsThriftServer; -import com.twitter.finatra.thrift.filters.AccessLoggingFilter; -import com.twitter.finatra.thrift.filters.LoggingMDCFilter; -import com.twitter.finatra.thrift.filters.StatsFilter; -import com.twitter.finatra.thrift.filters.ThriftMDCFilter; -import com.twitter.finatra.thrift.filters.TraceIdMDCFilter; -import com.twitter.finatra.thrift.routing.JavaThriftRouter; -import 
com.twitter.inject.thrift.modules.ThriftClientIdModule$; -import com.twitter.search.common.constants.SearchThriftWebFormsAccess; -import com.twitter.search.common.metrics.BuildInfoStats; -import com.twitter.search.common.util.PlatformStatsExporter; -import com.twitter.search.feature_update_service.filters.ClientIdWhitelistFilter; -import com.twitter.search.feature_update_service.modules.ClientIdWhitelistModule; -import com.twitter.search.feature_update_service.modules.EarlybirdUtilModule; -import com.twitter.search.feature_update_service.modules.FeatureUpdateServiceDiffyModule; -import com.twitter.search.feature_update_service.modules.FinagleKafkaProducerModule; -import com.twitter.search.feature_update_service.modules.FuturePoolModule; -import com.twitter.search.feature_update_service.modules.TweetypieModule; -import com.twitter.search.feature_update_service.thriftjava.FeatureUpdateService; -import com.twitter.thriftwebforms.MethodOptionsAccessConfig; -import com.twitter.util.ExecutorServiceFuturePool; - -public class FeatureUpdateServiceThriftServer extends AbstractMtlsThriftServer { - private static final Logger LOG = - LoggerFactory.getLogger(FeatureUpdateServiceThriftServer.class); - - // Ideally we would not have to access the "environment" flag here and we could instead pass - // a flag to the ThriftWebFormsModule that would either enable or disable thrift web forms. - // However, it is not simple to create our own TwitterModule that both extends the - // ThriftWebFormsModule and consumes an injected flag. 
- private Flag envFlag = flag().create("environment", - "", - "Environment for service (prod, staging, staging1, devel)", - Flaggable.ofString()); - - FeatureUpdateServiceThriftServer(String[] args) { - BuildInfoStats.export(); - PlatformStatsExporter.exportPlatformStats(); - - flag().parseArgs(args, true); - } - - @Override - @SuppressWarnings("unchecked") - public Collection javaModules() { - List modules = new ArrayList<>(); - modules.addAll(Arrays.asList( - ThriftClientIdModule$.MODULE$, - DeciderModule$.MODULE$, - new ClientIdWhitelistModule(), - new FinagleKafkaProducerModule(), - new EarlybirdUtilModule(), - new FuturePoolModule(), - new FeatureUpdateServiceDiffyModule(), - new TweetypieModule())); - - // Only add the Thrift Web Forms module for non-prod services because we should - // not allow write access to production data through Thrift Web Forms. - String environment = envFlag.apply(); - if ("prod".equals(environment)) { - LOG.info("Not including Thrift Web Forms because the environment is prod"); - } else { - LOG.info("Including Thrift Web Forms because the environment is " + environment); - modules.add( - MtlsThriftWebFormsModule.create( - this, - FeatureUpdateService.ServiceIface.class, - MethodOptionsAccessConfig.byLdapGroup(SearchThriftWebFormsAccess.WRITE_LDAP_GROUP) - ) - ); - } - - return modules; - } - - @Override - public void configureThrift(JavaThriftRouter router) { - router - // Initialize Mapped Diagnostic Context (MDC) for logging - // (see https://logback.qos.ch/manual/mdc.html) - .filter(LoggingMDCFilter.class) - // Inject trace ID in MDC for logging - .filter(TraceIdMDCFilter.class) - // Inject request method and client ID in MDC for logging - .filter(ThriftMDCFilter.class) - // Log client access - .filter(AccessLoggingFilter.class) - // Export basic service stats - .filter(StatsFilter.class) - .filter(ClientIdWhitelistFilter.class) - .add(FeatureUpdateController.class); - } - - @Override - public Service configureService(Service 
service) { - // Add the DarkTrafficFilter in "front" of the service being served. - return injector() - .instance(Filter.TypeAgnostic.class, DarkTrafficFilterType.class) - .andThen(service); - } - - @Override - public ThriftMux.Server configureThriftServer(ThriftMux.Server server) { - // This cast looks redundant, but it is required for pants to compile this file. - return (ThriftMux.Server) server.withResponseClassifier(new FeatureUpdateResponseClassifier()); - } - - @Override - public void postWarmup() { - super.postWarmup(); - - ExecutorServiceFuturePool futurePool = injector().instance(ExecutorServiceFuturePool.class); - Preconditions.checkNotNull(futurePool); - - onExit(() -> { - try { - futurePool.executor().shutdownNow(); - - futurePool.executor().awaitTermination(10L, TimeUnit.SECONDS); - } catch (InterruptedException e) { - LOG.error("Interrupted while awaiting future pool termination", e); - } - - return null; - }); - } -} diff --git a/src/java/com/twitter/search/feature_update_service/FeatureUpdateServiceThriftServerMain.docx b/src/java/com/twitter/search/feature_update_service/FeatureUpdateServiceThriftServerMain.docx new file mode 100644 index 000000000..3342c35d2 Binary files /dev/null and b/src/java/com/twitter/search/feature_update_service/FeatureUpdateServiceThriftServerMain.docx differ diff --git a/src/java/com/twitter/search/feature_update_service/FeatureUpdateServiceThriftServerMain.java b/src/java/com/twitter/search/feature_update_service/FeatureUpdateServiceThriftServerMain.java deleted file mode 100644 index d19c102a9..000000000 --- a/src/java/com/twitter/search/feature_update_service/FeatureUpdateServiceThriftServerMain.java +++ /dev/null @@ -1,12 +0,0 @@ -package com.twitter.search.feature_update_service; - -final class FeatureUpdateServiceThriftServerMain { - private FeatureUpdateServiceThriftServerMain() { - // Private constructor to satisfy checkstyle error: - // "Utility classes should not have a public or default constructor)." 
- } - - public static void main(String[] args) { - new FeatureUpdateServiceThriftServer(args).main(args); - } -} diff --git a/src/java/com/twitter/search/feature_update_service/README.docx b/src/java/com/twitter/search/feature_update_service/README.docx new file mode 100644 index 000000000..be167050a Binary files /dev/null and b/src/java/com/twitter/search/feature_update_service/README.docx differ diff --git a/src/java/com/twitter/search/feature_update_service/README.md b/src/java/com/twitter/search/feature_update_service/README.md deleted file mode 100644 index ed28acbc8..000000000 --- a/src/java/com/twitter/search/feature_update_service/README.md +++ /dev/null @@ -1,6 +0,0 @@ -## Feature Update Service -Feature update service is a service that sends tweet feature updates e.g number of retweets, replies and favorites to Earlybird. Earlybird then indexes and uses these features to rank in-network Home Timeline tweets. - - - - diff --git a/src/java/com/twitter/search/feature_update_service/filters/BUILD b/src/java/com/twitter/search/feature_update_service/filters/BUILD deleted file mode 100644 index 267acdcff..000000000 --- a/src/java/com/twitter/search/feature_update_service/filters/BUILD +++ /dev/null @@ -1,22 +0,0 @@ -java_library( - sources = ["*.java"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/inject:guice", - "decider/src/main/scala", - "finatra-internal/thrift/src/main/thrift:thrift-java", - "finatra/thrift/src/main/scala/com/twitter/finatra/thrift", - "finatra/thrift/src/main/scala/com/twitter/finatra/thrift:controller", - "finatra/thrift/src/main/scala/com/twitter/finatra/thrift/exceptions", - "finatra/thrift/src/main/scala/com/twitter/finatra/thrift/filters", - "finatra/thrift/src/main/scala/com/twitter/finatra/thrift/modules", - "finatra/thrift/src/main/scala/com/twitter/finatra/thrift/response", - "finatra/thrift/src/main/scala/com/twitter/finatra/thrift/routing", - 
"src/java/com/twitter/search/common/metrics", - "src/java/com/twitter/search/common/util/io/periodic", - "src/java/com/twitter/search/common/util/thrift:text-protocol", - "src/java/com/twitter/search/feature_update_service/whitelist", - "src/thrift/com/twitter/search/feature_update_service/thrift:thrift-java", - ], -) diff --git a/src/java/com/twitter/search/feature_update_service/filters/BUILD.docx b/src/java/com/twitter/search/feature_update_service/filters/BUILD.docx new file mode 100644 index 000000000..7405e75cb Binary files /dev/null and b/src/java/com/twitter/search/feature_update_service/filters/BUILD.docx differ diff --git a/src/java/com/twitter/search/feature_update_service/filters/ClientIdWhitelistFilter.docx b/src/java/com/twitter/search/feature_update_service/filters/ClientIdWhitelistFilter.docx new file mode 100644 index 000000000..e2c8db70e Binary files /dev/null and b/src/java/com/twitter/search/feature_update_service/filters/ClientIdWhitelistFilter.docx differ diff --git a/src/java/com/twitter/search/feature_update_service/filters/ClientIdWhitelistFilter.java b/src/java/com/twitter/search/feature_update_service/filters/ClientIdWhitelistFilter.java deleted file mode 100644 index 077c45067..000000000 --- a/src/java/com/twitter/search/feature_update_service/filters/ClientIdWhitelistFilter.java +++ /dev/null @@ -1,60 +0,0 @@ -package com.twitter.search.feature_update_service.filters; - -import com.google.inject.Inject; -import com.google.inject.Singleton; - -import com.twitter.finagle.Service; -import com.twitter.finatra.thrift.AbstractThriftFilter; -import com.twitter.finatra.thrift.ThriftRequest; -import com.twitter.inject.annotations.Flag; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.feature_update_service.thriftjava.FeatureUpdateResponse; -import com.twitter.search.feature_update_service.thriftjava.FeatureUpdateResponseCode; -import com.twitter.search.feature_update_service.whitelist.ClientIdWhitelist; 
-import com.twitter.util.Future; - -@Singleton -public class ClientIdWhitelistFilter extends AbstractThriftFilter { - private final boolean enabled; - private final ClientIdWhitelist whitelist; - - private final SearchRateCounter unknownClientIdStat = - SearchRateCounter.export("unknown_client_id"); - private final SearchRateCounter noClientIdStat = - SearchRateCounter.export("no_client_id"); - - @Inject - public ClientIdWhitelistFilter( - ClientIdWhitelist whitelist, - @Flag("client.whitelist.enable") Boolean enabled - ) { - this.whitelist = whitelist; - this.enabled = enabled; - } - - @Override - @SuppressWarnings("unchecked") - public Future apply(ThriftRequest request, Service, R> svc) { - if (!enabled) { - return svc.apply(request); - } - if (request.clientId().isEmpty()) { - noClientIdStat.increment(); - return (Future) Future.value( - new FeatureUpdateResponse(FeatureUpdateResponseCode.MISSING_CLIENT_ERROR) - .setDetailMessage("finagle clientId is required in request")); - - } else if (!whitelist.isClientAllowed(request.clientId().get())) { - // It's safe to use get() in the above condition because - // clientId was already checked for emptiness - unknownClientIdStat.increment(); - return (Future) Future.value( - new FeatureUpdateResponse(FeatureUpdateResponseCode.UNKNOWN_CLIENT_ERROR) - .setDetailMessage(String.format( - "request contains unknown finagle clientId: %s", request.clientId().toString()))); - } else { - return svc.apply(request); - } - } -} - diff --git a/src/java/com/twitter/search/feature_update_service/modules/BUILD b/src/java/com/twitter/search/feature_update_service/modules/BUILD deleted file mode 100644 index f7ee145be..000000000 --- a/src/java/com/twitter/search/feature_update_service/modules/BUILD +++ /dev/null @@ -1,48 +0,0 @@ -java_library( - sources = ["*.java"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/inject:guice", - "3rdparty/jvm/commons-io", - 
"3rdparty/jvm/org/apache/kafka:kafka-clients", - "3rdparty/jvm/org/yaml:snakeyaml", - "decider/src/main/scala", - "finagle/finagle-core/src/main", - "finagle/finagle-exp/src/main/scala", - "finagle/finagle-thriftmux/src/main/scala", - "finagle/finagle-zipkin-core/src/main/scala", - "finagle/finagle-zipkin-scribe/src/main/scala", - "finatra-internal/mtls-thriftmux/src/main/scala", - "finatra/inject/inject-app/src/main/java/com/twitter/inject/annotations", - "finatra/inject/inject-core/src/main/scala", - "finatra/inject/inject-modules/src/main/scala", - "finatra/inject/inject-modules/src/main/scala/com/twitter/inject/modules", - "finatra/inject/inject-thrift-client/src/main/scala", - "finatra/thrift/src/main/scala/com/twitter/finatra/thrift", - "finatra/thrift/src/main/scala/com/twitter/finatra/thrift:controller", - "finatra/thrift/src/main/scala/com/twitter/finatra/thrift/exceptions", - "finatra/thrift/src/main/scala/com/twitter/finatra/thrift/filters", - "finatra/thrift/src/main/scala/com/twitter/finatra/thrift/modules", - "finatra/thrift/src/main/scala/com/twitter/finatra/thrift/response", - "finatra/thrift/src/main/scala/com/twitter/finatra/thrift/routing", - "kafka/finagle-kafka/finatra-kafka/src/main/scala", - "src/java/com/twitter/common/base", - "src/java/com/twitter/common/util:system-mocks", - "src/java/com/twitter/search/common/decider", - "src/java/com/twitter/search/common/metrics", - "src/java/com/twitter/search/common/partitioning/base", - "src/java/com/twitter/search/common/util/io/kafka", - "src/java/com/twitter/search/common/util/io/periodic", - "src/java/com/twitter/search/feature_update_service/filters", - "src/java/com/twitter/search/feature_update_service/stats", - "src/java/com/twitter/search/feature_update_service/whitelist", - "src/java/com/twitter/spam/finagle", - "src/thrift/com/twitter/search/common:indexing-java", - "src/thrift/com/twitter/search/common:schema-java", - 
"src/thrift/com/twitter/search/feature_update_service/thrift:thrift-java", - "src/thrift/com/twitter/tweetypie:service-java", - "src/thrift/com/twitter/tweetypie:tweet-java", - "util/util-core/src/main/java", - ], -) diff --git a/src/java/com/twitter/search/feature_update_service/modules/BUILD.docx b/src/java/com/twitter/search/feature_update_service/modules/BUILD.docx new file mode 100644 index 000000000..416ca01e5 Binary files /dev/null and b/src/java/com/twitter/search/feature_update_service/modules/BUILD.docx differ diff --git a/src/java/com/twitter/search/feature_update_service/modules/ClientIdWhitelistModule.docx b/src/java/com/twitter/search/feature_update_service/modules/ClientIdWhitelistModule.docx new file mode 100644 index 000000000..3273ea3b0 Binary files /dev/null and b/src/java/com/twitter/search/feature_update_service/modules/ClientIdWhitelistModule.docx differ diff --git a/src/java/com/twitter/search/feature_update_service/modules/ClientIdWhitelistModule.java b/src/java/com/twitter/search/feature_update_service/modules/ClientIdWhitelistModule.java deleted file mode 100644 index 705de435d..000000000 --- a/src/java/com/twitter/search/feature_update_service/modules/ClientIdWhitelistModule.java +++ /dev/null @@ -1,30 +0,0 @@ -package com.twitter.search.feature_update_service.modules; - -import com.google.inject.Provides; -import com.google.inject.Singleton; - -import com.twitter.app.Flaggable; -import com.twitter.inject.TwitterModule; -import com.twitter.inject.annotations.Flag; - -import com.twitter.search.feature_update_service.whitelist.ClientIdWhitelist; - -/** - * Provides a ClientIdWhitelist, which periodically loads the - * Feature Update Service client whitelist from ConfigBus - */ -public class ClientIdWhitelistModule extends TwitterModule { - public ClientIdWhitelistModule() { - flag("client.whitelist.path", "", - "Path to client id white list.", Flaggable.ofString()); - flag("client.whitelist.enable", true, - "Enable client whitelist for 
production.", Flaggable.ofBoolean()); - } - - @Provides - @Singleton - public ClientIdWhitelist provideClientWhitelist( - @Flag("client.whitelist.path") String clientIdWhiteListPath) throws Exception { - return ClientIdWhitelist.initWhitelist(clientIdWhiteListPath); - } - } diff --git a/src/java/com/twitter/search/feature_update_service/modules/EarlybirdUtilModule.docx b/src/java/com/twitter/search/feature_update_service/modules/EarlybirdUtilModule.docx new file mode 100644 index 000000000..f78869f34 Binary files /dev/null and b/src/java/com/twitter/search/feature_update_service/modules/EarlybirdUtilModule.docx differ diff --git a/src/java/com/twitter/search/feature_update_service/modules/EarlybirdUtilModule.java b/src/java/com/twitter/search/feature_update_service/modules/EarlybirdUtilModule.java deleted file mode 100644 index 1f5bc495f..000000000 --- a/src/java/com/twitter/search/feature_update_service/modules/EarlybirdUtilModule.java +++ /dev/null @@ -1,13 +0,0 @@ -package com.twitter.search.feature_update_service.modules; - -import com.twitter.app.Flaggable; -import com.twitter.inject.TwitterModule; - -public class EarlybirdUtilModule extends TwitterModule { - public static final String PENGUIN_VERSIONS_FLAG = "penguin.versions"; - - public EarlybirdUtilModule() { - flag(PENGUIN_VERSIONS_FLAG, "penguin_6", - "Comma-separated list of supported Penguin versions.", Flaggable.ofString()); - } -} diff --git a/src/java/com/twitter/search/feature_update_service/modules/FeatureUpdateServiceDiffyModule.docx b/src/java/com/twitter/search/feature_update_service/modules/FeatureUpdateServiceDiffyModule.docx new file mode 100644 index 000000000..b9840fa69 Binary files /dev/null and b/src/java/com/twitter/search/feature_update_service/modules/FeatureUpdateServiceDiffyModule.docx differ diff --git a/src/java/com/twitter/search/feature_update_service/modules/FeatureUpdateServiceDiffyModule.java 
b/src/java/com/twitter/search/feature_update_service/modules/FeatureUpdateServiceDiffyModule.java deleted file mode 100644 index d38665624..000000000 --- a/src/java/com/twitter/search/feature_update_service/modules/FeatureUpdateServiceDiffyModule.java +++ /dev/null @@ -1,35 +0,0 @@ -package com.twitter.search.feature_update_service.modules; - -import com.twitter.decider.Decider; -import com.twitter.inject.Injector; -import com.twitter.finatra.mtls.thriftmux.modules.MtlsJavaDarkTrafficFilterModule; -import com.twitter.search.common.decider.DeciderUtil; -import com.twitter.util.Function; - - -/** - * Provide a filter that sends dark traffic to diffy, if the diffy.dest command-line parameter - * is non-empty. If diffy.dest is empty, just provide a no-op filter. - */ -public class FeatureUpdateServiceDiffyModule extends MtlsJavaDarkTrafficFilterModule { - @Override - public String destFlagName() { - return "diffy.dest"; - } - - @Override - public String defaultClientId() { - return "feature_update_service.origin"; - } - - @Override - public Function enableSampling(Injector injector) { - Decider decider = injector.instance(Decider.class); - return new Function() { - @Override - public Object apply(byte[] v1) { - return DeciderUtil.isAvailableForRandomRecipient(decider, "dark_traffic_filter"); - } - }; - } -} diff --git a/src/java/com/twitter/search/feature_update_service/modules/FinagleKafkaProducerModule.docx b/src/java/com/twitter/search/feature_update_service/modules/FinagleKafkaProducerModule.docx new file mode 100644 index 000000000..e4c4e8949 Binary files /dev/null and b/src/java/com/twitter/search/feature_update_service/modules/FinagleKafkaProducerModule.docx differ diff --git a/src/java/com/twitter/search/feature_update_service/modules/FinagleKafkaProducerModule.java b/src/java/com/twitter/search/feature_update_service/modules/FinagleKafkaProducerModule.java deleted file mode 100644 index b35177099..000000000 --- 
a/src/java/com/twitter/search/feature_update_service/modules/FinagleKafkaProducerModule.java +++ /dev/null @@ -1,62 +0,0 @@ -package com.twitter.search.feature_update_service.modules; - -import javax.inject.Named; -import javax.inject.Singleton; - -import com.google.inject.Provides; - -import com.twitter.app.Flaggable; -import com.twitter.common.util.Clock; -import com.twitter.finatra.kafka.producers.BlockingFinagleKafkaProducer; -import com.twitter.inject.TwitterModule; -import com.twitter.inject.annotations.Flag; -import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents; -import com.twitter.search.common.util.io.kafka.CompactThriftSerializer; -import com.twitter.search.common.util.io.kafka.FinagleKafkaClientUtils; -import com.twitter.search.common.util.io.kafka.SearchPartitioner; -import com.twitter.search.common.util.io.kafka.SearchPartitionerRealtimeCg; - -public class FinagleKafkaProducerModule extends TwitterModule { - public static final String KAFKA_DEST_FLAG = "kafka.dest"; - public static final String KAFKA_TOPIC_NAME_UPDATE_EVENTS_FLAG = - "kafka.topic.name.update_events"; - public static final String KAFKA_TOPIC_NAME_UPDATE_EVENTS_FLAG_REALTIME_CG = - "kafka.topic.name.update_events_realtime_cg"; - public static final String KAFKA_ENABLE_S2S_AUTH_FLAG = "kafka.enable_s2s_auth"; - - public FinagleKafkaProducerModule() { - flag(KAFKA_DEST_FLAG, "Kafka cluster destination", "", Flaggable.ofString()); - flag(KAFKA_TOPIC_NAME_UPDATE_EVENTS_FLAG, "", - "Topic name for update events", Flaggable.ofString()); - flag(KAFKA_TOPIC_NAME_UPDATE_EVENTS_FLAG_REALTIME_CG, "", - "Topic name for update events", Flaggable.ofString()); - flag(KAFKA_ENABLE_S2S_AUTH_FLAG, true, "enable s2s authentication configs", - Flaggable.ofBoolean()); - } - - @Provides - @Named("KafkaProducer") - public BlockingFinagleKafkaProducer kafkaProducer( - @Flag(KAFKA_DEST_FLAG) String kafkaDest, - @Flag(KAFKA_ENABLE_S2S_AUTH_FLAG) boolean enableKafkaAuth) { - return 
FinagleKafkaClientUtils.newFinagleKafkaProducer( - kafkaDest, enableKafkaAuth, new CompactThriftSerializer(), - "search_cluster", SearchPartitioner.class); - } - - @Provides - @Named("KafkaProducerRealtimeCg") - public BlockingFinagleKafkaProducer kafkaProducerRealtimeCg( - @Flag(KAFKA_DEST_FLAG) String kafkaDest, - @Flag(KAFKA_ENABLE_S2S_AUTH_FLAG) boolean enableKafkaAuth) { - return FinagleKafkaClientUtils.newFinagleKafkaProducer( - kafkaDest, enableKafkaAuth, new CompactThriftSerializer(), - "search_cluster", SearchPartitionerRealtimeCg.class); - } - - @Provides - @Singleton - public Clock clock() { - return Clock.SYSTEM_CLOCK; - } -} diff --git a/src/java/com/twitter/search/feature_update_service/modules/FuturePoolModule.docx b/src/java/com/twitter/search/feature_update_service/modules/FuturePoolModule.docx new file mode 100644 index 000000000..08be45a25 Binary files /dev/null and b/src/java/com/twitter/search/feature_update_service/modules/FuturePoolModule.docx differ diff --git a/src/java/com/twitter/search/feature_update_service/modules/FuturePoolModule.java b/src/java/com/twitter/search/feature_update_service/modules/FuturePoolModule.java deleted file mode 100644 index 537f67559..000000000 --- a/src/java/com/twitter/search/feature_update_service/modules/FuturePoolModule.java +++ /dev/null @@ -1,57 +0,0 @@ -package com.twitter.search.feature_update_service.modules; - -import java.util.concurrent.LinkedBlockingQueue; -import java.util.concurrent.ThreadPoolExecutor; -import java.util.concurrent.TimeUnit; - -import com.google.common.annotations.VisibleForTesting; -import com.google.inject.Provides; -import com.google.inject.Singleton; - -import com.twitter.inject.TwitterModule; -import com.twitter.search.common.metrics.SearchCustomGauge; -import com.twitter.search.feature_update_service.stats.FeatureUpdateStats; -import com.twitter.util.ExecutorServiceFuturePool; -import com.twitter.util.InterruptibleExecutorServiceFuturePool; - -public class FuturePoolModule 
extends TwitterModule { - /** - * Provide future pool backed by executor service, with bounded thread pool and bounded backing - * queue. - */ - @Provides - @Singleton - public ExecutorServiceFuturePool futurePool() { - // These limits are based on service capacity estimates and testing on staging, - // attempting to give the pool as many resources as possible without overloading anything. - // 100-200 threads is manageable, and the 2000 queue size is based on a conservative upper - // limit that tasks in the queue take 1 MB each, meaning queue maxes out at 2 GB, which should - // be okay given 4 GB RAM with 3 GB reserved heap. - return createFuturePool(100, 200, 2000); - } - - /** - * Create a future pool backed by executor service, with bounded thread pool and bounded backing - * queue. ONLY VISIBILE FOR TESTING; don't invoke outside this class. - */ - @VisibleForTesting - public static ExecutorServiceFuturePool createFuturePool( - int corePoolSize, int maximumPoolSize, int queueCapacity) { - final LinkedBlockingQueue queue = new LinkedBlockingQueue<>(queueCapacity); - - ExecutorServiceFuturePool futurePool = new InterruptibleExecutorServiceFuturePool( - new ThreadPoolExecutor( - corePoolSize, - maximumPoolSize, - 60L, - TimeUnit.SECONDS, - queue)); - - SearchCustomGauge.export(FeatureUpdateStats.PREFIX + "thread_pool_size", - futurePool::poolSize); - SearchCustomGauge.export(FeatureUpdateStats.PREFIX + "work_queue_size", - queue::size); - - return futurePool; - } -} diff --git a/src/java/com/twitter/search/feature_update_service/modules/TweetypieModule.docx b/src/java/com/twitter/search/feature_update_service/modules/TweetypieModule.docx new file mode 100644 index 000000000..2d61963f1 Binary files /dev/null and b/src/java/com/twitter/search/feature_update_service/modules/TweetypieModule.docx differ diff --git a/src/java/com/twitter/search/feature_update_service/modules/TweetypieModule.java 
b/src/java/com/twitter/search/feature_update_service/modules/TweetypieModule.java deleted file mode 100644 index 6fd041cd4..000000000 --- a/src/java/com/twitter/search/feature_update_service/modules/TweetypieModule.java +++ /dev/null @@ -1,62 +0,0 @@ -package com.twitter.search.feature_update_service.modules; - -import javax.inject.Singleton; - -import com.google.inject.Provides; - -import com.twitter.finagle.Service; -import com.twitter.finagle.ThriftMux; -import com.twitter.finagle.builder.ClientBuilder; -import com.twitter.finagle.mtls.authentication.ServiceIdentifier; -import com.twitter.finagle.mtls.client.MtlsThriftMuxClient; -import com.twitter.finagle.stats.StatsReceiver; -import com.twitter.finagle.thrift.ClientId; -import com.twitter.finagle.thrift.ThriftClientRequest; -import com.twitter.finagle.zipkin.thrift.ZipkinTracer; -import com.twitter.inject.TwitterModule; -import com.twitter.spam.finagle.FinagleUtil; -import com.twitter.tweetypie.thriftjava.TweetService; -import com.twitter.util.Duration; - -public class TweetypieModule extends TwitterModule { - @Provides - @Singleton - private ThriftMux.Client providesThriftMuxClient(ServiceIdentifier serviceIdentifier) { - return new MtlsThriftMuxClient(ThriftMux.client()) - .withMutualTls(serviceIdentifier) - .withClientId(new ClientId("feature_update_service.prod")); - } - private static final Duration DEFAULT_CONN_TIMEOUT = Duration.fromSeconds(2); - - private static final Duration TWEET_SERVICE_REQUEST_TIMEOUT = Duration.fromMilliseconds(500); - - private static final int TWEET_SERVICE_RETRIES = 5; - @Provides @Singleton - private TweetService.ServiceIface provideTweetServiceClient( - ThriftMux.Client thriftMux, - StatsReceiver statsReceiver) throws InterruptedException { - // TweetService is TweetService (tweetypie) with different api - // Since TweetService will be primarly used for interacting with - // tweetypie's flexible schema (MH), we will increase request - // timeout and retries but share other 
settings from TweetService. - @SuppressWarnings("unchecked") - ClientBuilder clientBuilder = FinagleUtil.getClientBuilder() - .name("tweet_service") - .stack(thriftMux) - .tcpConnectTimeout(DEFAULT_CONN_TIMEOUT) - .requestTimeout(TWEET_SERVICE_REQUEST_TIMEOUT) - .retries(TWEET_SERVICE_RETRIES) - .reportTo(statsReceiver) - .tracer(ZipkinTracer.mk(statsReceiver)); - - @SuppressWarnings("unchecked") - final Service finagleClient = - FinagleUtil.createResolvedFinagleClient( - "tweetypie", - "prod", - "tweetypie", - clientBuilder); - - return new TweetService.ServiceToClient(finagleClient); - } -} diff --git a/src/java/com/twitter/search/feature_update_service/stats/BUILD b/src/java/com/twitter/search/feature_update_service/stats/BUILD deleted file mode 100644 index 001463400..000000000 --- a/src/java/com/twitter/search/feature_update_service/stats/BUILD +++ /dev/null @@ -1,11 +0,0 @@ -java_library( - sources = ["*.java"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/inject:guice", - "src/java/com/twitter/common/base", - "src/java/com/twitter/search/common/metrics", - "src/thrift/com/twitter/search/feature_update_service/thrift:thrift-java", - ], -) diff --git a/src/java/com/twitter/search/feature_update_service/stats/BUILD.docx b/src/java/com/twitter/search/feature_update_service/stats/BUILD.docx new file mode 100644 index 000000000..3b8fdd409 Binary files /dev/null and b/src/java/com/twitter/search/feature_update_service/stats/BUILD.docx differ diff --git a/src/java/com/twitter/search/feature_update_service/stats/FeatureUpdateStats.docx b/src/java/com/twitter/search/feature_update_service/stats/FeatureUpdateStats.docx new file mode 100644 index 000000000..e0f4825ea Binary files /dev/null and b/src/java/com/twitter/search/feature_update_service/stats/FeatureUpdateStats.docx differ diff --git a/src/java/com/twitter/search/feature_update_service/stats/FeatureUpdateStats.java 
b/src/java/com/twitter/search/feature_update_service/stats/FeatureUpdateStats.java deleted file mode 100644 index aa607e85e..000000000 --- a/src/java/com/twitter/search/feature_update_service/stats/FeatureUpdateStats.java +++ /dev/null @@ -1,111 +0,0 @@ -package com.twitter.search.feature_update_service.stats; - -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.ConcurrentMap; - -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.feature_update_service.thriftjava.FeatureUpdateResponseCode; - -/** Stat tracking for the feature update ingester service. */ -public class FeatureUpdateStats { - public static final String PREFIX = "feature_update_service_"; - - private final SearchRateCounter requestRate = SearchRateCounter.export( - PREFIX + "requests"); - - private ConcurrentMap perClientRequestRate = - new ConcurrentHashMap<>(); - - private ConcurrentMap responseCodeRate = - new ConcurrentHashMap<>(); - - private ConcurrentMap preClientResponseCodeRate = - new ConcurrentHashMap<>(); - - /** - * Record metrics for a single incoming request. - */ - public void clientRequest(String clientID) { - // 1. Track total request rate. It's better to precompute than compute the per client sum at - // query time. - requestRate.increment(); - - // 2. Track request rate per client. - incrementPerClientCounter(perClientRequestRate, clientRequestRateKey(clientID)); - } - - /** - * Record metrics for a single response. - */ - public void clientResponse(String clientID, FeatureUpdateResponseCode responseCode) { - String code = responseCode.toString().toLowerCase(); - - // 1. Track rates per response code. - incrementPerClientCounter(responseCodeRate, responseCodeKey(code)); - - // 2. Track rates per client per response code. - incrementPerClientCounter(preClientResponseCodeRate, clientResponseCodeKey(clientID, code)); - } - - /** - * Returns the total number of requests. 
- */ - public long getRequestRateCount() { - return requestRate.getCount(); - } - - /** - * Returns the total number of requests for the specified client. - */ - public long getClientRequestCount(String clientID) { - String key = clientRequestRateKey(clientID); - if (perClientRequestRate.containsKey(key)) { - return perClientRequestRate.get(key).getCount(); - } - return 0; - } - - /** - * Returns the total number of responses with the specified code. - */ - public long getResponseCodeCount(FeatureUpdateResponseCode responseCode) { - String code = responseCode.toString().toLowerCase(); - String key = responseCodeKey(code); - if (responseCodeRate.containsKey(key)) { - return responseCodeRate.get(key).getCount(); - } - return 0; - } - - /** - * Returns the total number of responses to the specified client with the specified code. - */ - public long getClientResponseCodeCount(String clientID, FeatureUpdateResponseCode responseCode) { - String code = responseCode.toString().toLowerCase(); - String key = clientResponseCodeKey(clientID, code); - if (preClientResponseCodeRate.containsKey(key)) { - return preClientResponseCodeRate.get(key).getCount(); - } - return 0; - } - - private static String clientRequestRateKey(String clientID) { - return String.format(PREFIX + "requests_for_client_id_%s", clientID); - } - - private static String responseCodeKey(String responseCode) { - return String.format(PREFIX + "response_code_%s", responseCode); - } - - private static String clientResponseCodeKey(String clientID, String responseCode) { - return String.format(PREFIX + "response_for_client_id_%s_code_%s", clientID, responseCode); - } - - private void incrementPerClientCounter( - ConcurrentMap rates, - String key - ) { - rates.putIfAbsent(key, SearchRateCounter.export(key)); - rates.get(key).increment(); - } -} diff --git a/src/java/com/twitter/search/feature_update_service/util/BUILD b/src/java/com/twitter/search/feature_update_service/util/BUILD deleted file mode 100644 index 
0baf9e722..000000000 --- a/src/java/com/twitter/search/feature_update_service/util/BUILD +++ /dev/null @@ -1,11 +0,0 @@ -java_library( - sources = ["*.java"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/code/findbugs:jsr305", - "src/java/com/twitter/search/common/schema/base", - "src/thrift/com/twitter/search/common:schema-java", - "src/thrift/com/twitter/search/feature_update_service/thrift:thrift-java", - ], -) diff --git a/src/java/com/twitter/search/feature_update_service/util/BUILD.docx b/src/java/com/twitter/search/feature_update_service/util/BUILD.docx new file mode 100644 index 000000000..03ebad52c Binary files /dev/null and b/src/java/com/twitter/search/feature_update_service/util/BUILD.docx differ diff --git a/src/java/com/twitter/search/feature_update_service/util/FeatureUpdateValidator.docx b/src/java/com/twitter/search/feature_update_service/util/FeatureUpdateValidator.docx new file mode 100644 index 000000000..deb99817d Binary files /dev/null and b/src/java/com/twitter/search/feature_update_service/util/FeatureUpdateValidator.docx differ diff --git a/src/java/com/twitter/search/feature_update_service/util/FeatureUpdateValidator.java b/src/java/com/twitter/search/feature_update_service/util/FeatureUpdateValidator.java deleted file mode 100644 index c523a083e..000000000 --- a/src/java/com/twitter/search/feature_update_service/util/FeatureUpdateValidator.java +++ /dev/null @@ -1,41 +0,0 @@ -package com.twitter.search.feature_update_service.util; - - -import javax.annotation.Nullable; - -import com.twitter.search.common.schema.base.ThriftDocumentUtil; -import com.twitter.search.feature_update_service.thriftjava.FeatureUpdateRequest; -import com.twitter.search.feature_update_service.thriftjava.FeatureUpdateResponse; -import com.twitter.search.feature_update_service.thriftjava.FeatureUpdateResponseCode; - -public final class FeatureUpdateValidator { - - private FeatureUpdateValidator() { } - - /** - * 
Validates FeatureUpdateRequest - * @param featureUpdate instance of FeatureUpdateRequest with ThriftIndexingEvent - * @return null if valid, instance of FeatureUpdateResponse if not. - * Response will have appropriate error code and message set. - */ - @Nullable - public static FeatureUpdateResponse validate(FeatureUpdateRequest featureUpdate) { - - if (ThriftDocumentUtil.hasDuplicateFields(featureUpdate.getEvent().getDocument())) { - return createResponse( - String.format("duplicate document fields: %s", featureUpdate.toString())); - } - if (!featureUpdate.getEvent().isSetUid()) { - return createResponse(String.format("unset uid: %s", featureUpdate.toString())); - } - - return null; - } - - private static FeatureUpdateResponse createResponse(String errorMsg) { - FeatureUpdateResponseCode responseCode = FeatureUpdateResponseCode.CLIENT_ERROR; - FeatureUpdateResponse response = new FeatureUpdateResponse(responseCode); - response.setDetailMessage(errorMsg); - return response; - } -} diff --git a/src/java/com/twitter/search/feature_update_service/whitelist/BUILD b/src/java/com/twitter/search/feature_update_service/whitelist/BUILD deleted file mode 100644 index 9bd13cf87..000000000 --- a/src/java/com/twitter/search/feature_update_service/whitelist/BUILD +++ /dev/null @@ -1,13 +0,0 @@ -java_library( - sources = ["*.java"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/inject:guice", - "3rdparty/jvm/commons-io", - "3rdparty/jvm/org/yaml:snakeyaml", - "finagle/finagle-core/src/main", - "src/java/com/twitter/common/util:system-mocks", - "src/java/com/twitter/search/common/util/io/periodic", - ], -) diff --git a/src/java/com/twitter/search/feature_update_service/whitelist/BUILD.docx b/src/java/com/twitter/search/feature_update_service/whitelist/BUILD.docx new file mode 100644 index 000000000..51d5e5ce4 Binary files /dev/null and b/src/java/com/twitter/search/feature_update_service/whitelist/BUILD.docx differ diff --git 
a/src/java/com/twitter/search/feature_update_service/whitelist/ClientIdWhitelist.docx b/src/java/com/twitter/search/feature_update_service/whitelist/ClientIdWhitelist.docx new file mode 100644 index 000000000..27131e0d7 Binary files /dev/null and b/src/java/com/twitter/search/feature_update_service/whitelist/ClientIdWhitelist.docx differ diff --git a/src/java/com/twitter/search/feature_update_service/whitelist/ClientIdWhitelist.java b/src/java/com/twitter/search/feature_update_service/whitelist/ClientIdWhitelist.java deleted file mode 100644 index 4718c547e..000000000 --- a/src/java/com/twitter/search/feature_update_service/whitelist/ClientIdWhitelist.java +++ /dev/null @@ -1,77 +0,0 @@ -package com.twitter.search.feature_update_service.whitelist; - -import java.io.InputStream; -import java.util.Set; -import java.util.concurrent.Executors; -import java.util.concurrent.ScheduledExecutorService; -import java.util.concurrent.atomic.AtomicReference; - -import com.google.common.collect.ImmutableSet; -import com.google.common.util.concurrent.ThreadFactoryBuilder; - -import org.yaml.snakeyaml.Yaml; - -import com.twitter.common.util.Clock; -import com.twitter.finagle.thrift.ClientId; -import com.twitter.search.common.util.io.periodic.PeriodicFileLoader; - -/** - * ClientIdWhitelist extends PeriodicFileLoader to load client whitelist - * from configbus and checks to see if current clientId is allowed - */ -public class ClientIdWhitelist extends PeriodicFileLoader { - - private final AtomicReference> clientIdSet = new AtomicReference<>(); - - - public ClientIdWhitelist(String clientIdWhitelistPath, ScheduledExecutorService executorService, - Clock clock) { - super("ClientIdWhitelist", clientIdWhitelistPath, executorService, clock); - } - - /** - * Creates the object that manages loads from the clientIdWhitelistpath in config. - * It periodically reloads the client whitelist file using the given executor service. 
- */ - public static ClientIdWhitelist initWhitelist( - String clientIdWhitelistPath, ScheduledExecutorService executorService, - Clock clock) throws Exception { - ClientIdWhitelist clientIdWhitelist = new ClientIdWhitelist( - clientIdWhitelistPath, executorService, clock); - clientIdWhitelist.init(); - return clientIdWhitelist; - } - - /** - * Creates clock and executor service needed to create a periodic file loading object - * then returns object that accpets file. - * @param clientWhitelistPath - * @return ClientIdWhitelist - * @throws Exception - */ - public static ClientIdWhitelist initWhitelist(String clientWhitelistPath) throws Exception { - Clock clock = Clock.SYSTEM_CLOCK; - ScheduledExecutorService executorService = Executors.newSingleThreadScheduledExecutor( - new ThreadFactoryBuilder() - .setNameFormat("client-whitelist-reloader") - .setDaemon(true) - .build()); - - return initWhitelist(clientWhitelistPath, executorService, clock); - } - @Override - protected void accept(InputStream fileStream) { - ImmutableSet.Builder clientIdBuilder = new ImmutableSet.Builder<>(); - Yaml yaml = new Yaml(); - Set set = yaml.loadAs(fileStream, Set.class); - for (String id : set) { - clientIdBuilder.add(ClientId.apply(id)); - } - clientIdSet.set(clientIdBuilder.build()); - } - - // checks to see if clientId is in set of whitelisted clients - public boolean isClientAllowed(ClientId clientId) { - return clientIdSet.get().contains(clientId); - } -} diff --git a/src/java/com/twitter/search/img/foryou.png b/src/java/com/twitter/search/img/foryou.png deleted file mode 100644 index 6d08febde..000000000 Binary files a/src/java/com/twitter/search/img/foryou.png and /dev/null differ diff --git a/src/java/com/twitter/search/img/in-network.png b/src/java/com/twitter/search/img/in-network.png deleted file mode 100644 index 09caa3df2..000000000 Binary files a/src/java/com/twitter/search/img/in-network.png and /dev/null differ diff --git a/src/java/com/twitter/search/img/indexing.png 
b/src/java/com/twitter/search/img/indexing.png deleted file mode 100644 index 2704854ab..000000000 Binary files a/src/java/com/twitter/search/img/indexing.png and /dev/null differ diff --git a/src/java/com/twitter/search/img/serving.png b/src/java/com/twitter/search/img/serving.png deleted file mode 100644 index aca60b55e..000000000 Binary files a/src/java/com/twitter/search/img/serving.png and /dev/null differ diff --git a/src/java/com/twitter/search/img/top-search.png b/src/java/com/twitter/search/img/top-search.png deleted file mode 100644 index 267c3aaf2..000000000 Binary files a/src/java/com/twitter/search/img/top-search.png and /dev/null differ diff --git a/src/java/com/twitter/search/ingester/BUILD b/src/java/com/twitter/search/ingester/BUILD deleted file mode 100644 index 391184356..000000000 --- a/src/java/com/twitter/search/ingester/BUILD +++ /dev/null @@ -1,30 +0,0 @@ -target( - name = "ingester-lib", - dependencies = [ - "src/java/com/twitter/search/common/converter/earlybird", - "src/java/com/twitter/search/ingester/model", - "src/java/com/twitter/search/ingester/pipeline/app", - "src/java/com/twitter/search/ingester/pipeline/twitter", - "src/java/com/twitter/search/ingester/pipeline/twitter/engagements", - "src/java/com/twitter/search/ingester/pipeline/twitter/filters", - "src/java/com/twitter/search/ingester/pipeline/twitter/kafka", - "src/java/com/twitter/search/ingester/pipeline/twitter/thriftparse", - "src/java/com/twitter/search/ingester/pipeline/twitter/userupdates", - "src/java/com/twitter/search/ingester/pipeline/util", - "src/java/com/twitter/search/ingester/util/jndi", - ], -) - -jvm_binary( - name = "ingester-binary", - basename = "ingester", - main = "com.twitter.search.ingester.pipeline.app.IngesterPipelineApplication\\$Main", - platform = "java8", - tags = [ - "bazel-compatible", - ], - dependencies = [ - ":ingester-lib", - "src/java/com/twitter/search/common/logging:search-log4j", - ], -) diff --git 
a/src/java/com/twitter/search/ingester/BUILD.docx b/src/java/com/twitter/search/ingester/BUILD.docx new file mode 100644 index 000000000..ee7f78f1f Binary files /dev/null and b/src/java/com/twitter/search/ingester/BUILD.docx differ diff --git a/src/java/com/twitter/search/ingester/README.docx b/src/java/com/twitter/search/ingester/README.docx new file mode 100644 index 000000000..a005dcbdf Binary files /dev/null and b/src/java/com/twitter/search/ingester/README.docx differ diff --git a/src/java/com/twitter/search/ingester/README.md b/src/java/com/twitter/search/ingester/README.md deleted file mode 100644 index ee0a2b15a..000000000 --- a/src/java/com/twitter/search/ingester/README.md +++ /dev/null @@ -1,10 +0,0 @@ -## Ingesters -Ingesters are services that consume raw tweets and user updates, process them through a series of transformations and write them to kafka topics for Earlybird to consume and subsequently index. - -There are two types of ingesters: -1. Tweet ingesters -2. UserUpdates ingesters - -Tweet ingesters consume raw tweets and extract different fields and features for Earlybird to index. User updates ingester produces user safety information such as whether the user is deactivated, suspended or off-boarded. The user and tweet features produced by ingesters are then used by Earlybird during tweet retieval and ranking. - -Ingesters are made up of a pipeline of stages with each stage performing a different field/feature extraction. 
The pipeline configuration of the ingesters can be found at science/search/ingester/config diff --git a/src/java/com/twitter/search/ingester/model/BUILD b/src/java/com/twitter/search/ingester/model/BUILD deleted file mode 100644 index 4225e7ff5..000000000 --- a/src/java/com/twitter/search/ingester/model/BUILD +++ /dev/null @@ -1,28 +0,0 @@ -java_library( - sources = ["*.java"], - compiler_option_sets = ["fatal_warnings"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/code/findbugs:jsr305", - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/org/apache/thrift:libthrift", - "cuad/projects/ner/thrift/src/main/thrift:thrift-java", - "src/java/com/twitter/common/text/token", - "src/java/com/twitter/common_internal/text/version", - "src/java/com/twitter/search/common/debug", - "src/java/com/twitter/search/common/encoding/features", - "src/java/com/twitter/search/common/partitioning/base", - "src/java/com/twitter/search/common/relevance:entities_and_filters", - "src/java/com/twitter/search/common/relevance:text", - "src/java/com/twitter/search/common/relevance/features", - "src/java/com/twitter/search/common/schema/base", - "src/java/com/twitter/search/common/schema/earlybird", - "src/thrift/com/twitter/search/common:indexing-java", - "src/thrift/com/twitter/search/common:schema-java", - "src/thrift/com/twitter/search/common/debug:debug-java", - "src/thrift/com/twitter/service/spiderduck/gen:metadata-store-java", - "src/thrift/com/twitter/tweetypie:events-java", - "util/util-core:scala", - ], -) diff --git a/src/java/com/twitter/search/ingester/model/BUILD.docx b/src/java/com/twitter/search/ingester/model/BUILD.docx new file mode 100644 index 000000000..4dcc99ce2 Binary files /dev/null and b/src/java/com/twitter/search/ingester/model/BUILD.docx differ diff --git a/src/java/com/twitter/search/ingester/model/IndexerStatus.docx b/src/java/com/twitter/search/ingester/model/IndexerStatus.docx new file mode 100644 index 
000000000..372dfeb35 Binary files /dev/null and b/src/java/com/twitter/search/ingester/model/IndexerStatus.docx differ diff --git a/src/java/com/twitter/search/ingester/model/IndexerStatus.java b/src/java/com/twitter/search/ingester/model/IndexerStatus.java deleted file mode 100644 index 6893bbc67..000000000 --- a/src/java/com/twitter/search/ingester/model/IndexerStatus.java +++ /dev/null @@ -1,13 +0,0 @@ -package com.twitter.search.ingester.model; - -import com.twitter.search.common.debug.DebugEventAccumulator; - -/** - * Interface used for stages that process both TwitterMessages and ThriftVersionedEvents. - */ -public interface IndexerStatus extends DebugEventAccumulator { - /** - * Needed by the SortStage. - */ - long getId(); -} diff --git a/src/java/com/twitter/search/ingester/model/IngesterThriftVersionedEvents.docx b/src/java/com/twitter/search/ingester/model/IngesterThriftVersionedEvents.docx new file mode 100644 index 000000000..df0a14ba8 Binary files /dev/null and b/src/java/com/twitter/search/ingester/model/IngesterThriftVersionedEvents.docx differ diff --git a/src/java/com/twitter/search/ingester/model/IngesterThriftVersionedEvents.java b/src/java/com/twitter/search/ingester/model/IngesterThriftVersionedEvents.java deleted file mode 100644 index b6dd985a8..000000000 --- a/src/java/com/twitter/search/ingester/model/IngesterThriftVersionedEvents.java +++ /dev/null @@ -1,50 +0,0 @@ -package com.twitter.search.ingester.model; - -import java.util.Map; - -import com.google.common.primitives.Longs; - -import com.twitter.search.common.debug.DebugEventAccumulator; -import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents; -import com.twitter.search.common.partitioning.base.Partitionable; -import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent; - -/** - * Wrap of ThriftVersionedEvents, make it partitionable for the queue writer. 
- */ -public class IngesterThriftVersionedEvents extends ThriftVersionedEvents - implements Comparable, Partitionable, DebugEventAccumulator { - - // Make userId field easier to be accessed to calculate partition number - private final long userId; - - public IngesterThriftVersionedEvents(long userId) { - this.userId = userId; - } - - public IngesterThriftVersionedEvents(long userId, - Map versionedEvents) { - super(versionedEvents); - this.userId = userId; - } - - public IngesterThriftVersionedEvents(long userId, ThriftVersionedEvents original) { - super(original); - this.userId = userId; - } - - @Override - public int compareTo(ThriftVersionedEvents o) { - return Longs.compare(getId(), o.getId()); - } - - @Override - public long getTweetId() { - return this.getId(); - } - - @Override - public long getUserId() { - return this.userId; - } -} diff --git a/src/java/com/twitter/search/ingester/model/IngesterTweetEvent.docx b/src/java/com/twitter/search/ingester/model/IngesterTweetEvent.docx new file mode 100644 index 000000000..43215a216 Binary files /dev/null and b/src/java/com/twitter/search/ingester/model/IngesterTweetEvent.docx differ diff --git a/src/java/com/twitter/search/ingester/model/IngesterTweetEvent.java b/src/java/com/twitter/search/ingester/model/IngesterTweetEvent.java deleted file mode 100644 index 1d5fae1b9..000000000 --- a/src/java/com/twitter/search/ingester/model/IngesterTweetEvent.java +++ /dev/null @@ -1,19 +0,0 @@ -package com.twitter.search.ingester.model; - -import com.twitter.search.common.debug.DebugEventAccumulator; -import com.twitter.search.common.debug.thriftjava.DebugEvents; -import com.twitter.tweetypie.thriftjava.TweetEvent; - -public class IngesterTweetEvent extends TweetEvent implements DebugEventAccumulator { - // Used for propagating DebugEvents through the ingester stages. 
- private final DebugEvents debugEvents; - - public IngesterTweetEvent() { - this.debugEvents = new DebugEvents(); - } - - @Override - public DebugEvents getDebugEvents() { - return debugEvents; - } -} diff --git a/src/java/com/twitter/search/ingester/model/IngesterTwitterMessage.docx b/src/java/com/twitter/search/ingester/model/IngesterTwitterMessage.docx new file mode 100644 index 000000000..db274e5b1 Binary files /dev/null and b/src/java/com/twitter/search/ingester/model/IngesterTwitterMessage.docx differ diff --git a/src/java/com/twitter/search/ingester/model/IngesterTwitterMessage.java b/src/java/com/twitter/search/ingester/model/IngesterTwitterMessage.java deleted file mode 100644 index e89fef845..000000000 --- a/src/java/com/twitter/search/ingester/model/IngesterTwitterMessage.java +++ /dev/null @@ -1,73 +0,0 @@ -package com.twitter.search.ingester.model; - -import java.util.List; - -import javax.annotation.Nullable; - -import com.google.common.base.Preconditions; -import com.google.common.primitives.Longs; - -import com.twitter.common_internal.text.version.PenguinVersion; -import com.twitter.search.common.debug.thriftjava.DebugEvents; -import com.twitter.search.common.partitioning.base.HashPartitionFunction; -import com.twitter.search.common.partitioning.base.Partitionable; -import com.twitter.search.common.relevance.entities.TwitterMessage; - -/** - * A Twitter "status" object (e.g. a message) - * - */ -public class IngesterTwitterMessage extends TwitterMessage - implements Comparable, IndexerStatus, Partitionable { - private final DebugEvents debugEvents; - - public IngesterTwitterMessage(Long twitterId, List supportedPenguinVersions) { - this(twitterId, supportedPenguinVersions, null); - } - - public IngesterTwitterMessage( - Long twitterId, - List penguinVersions, - @Nullable DebugEvents debugEvents) { - super(twitterId, penguinVersions); - this.debugEvents = debugEvents == null ? 
new DebugEvents() : debugEvents.deepCopy(); - } - - @Override - public int compareTo(IndexerStatus o) { - return Longs.compare(getId(), o.getId()); - } - - @Override - public boolean equals(Object o) { - return (o instanceof IngesterTwitterMessage) - && compareTo((IngesterTwitterMessage) o) == 0; - } - - @Override - public int hashCode() { - return HashPartitionFunction.hashCode(getId()); - } - - public boolean isIndexable(boolean indexProtectedTweets) { - return getFromUserScreenName().isPresent() - && getId() != INT_FIELD_NOT_PRESENT - && (indexProtectedTweets || !isUserProtected()); - } - - @Override - public long getTweetId() { - return this.getId(); - } - - @Override - public long getUserId() { - Preconditions.checkState(getFromUserTwitterId().isPresent(), "The author user ID is missing"); - return getFromUserTwitterId().get(); - } - - @Override - public DebugEvents getDebugEvents() { - return debugEvents; - } -} diff --git a/src/java/com/twitter/search/ingester/model/KafkaRawRecord.docx b/src/java/com/twitter/search/ingester/model/KafkaRawRecord.docx new file mode 100644 index 000000000..7f566a496 Binary files /dev/null and b/src/java/com/twitter/search/ingester/model/KafkaRawRecord.docx differ diff --git a/src/java/com/twitter/search/ingester/model/KafkaRawRecord.java b/src/java/com/twitter/search/ingester/model/KafkaRawRecord.java deleted file mode 100644 index 85ea70fa7..000000000 --- a/src/java/com/twitter/search/ingester/model/KafkaRawRecord.java +++ /dev/null @@ -1,22 +0,0 @@ -package com.twitter.search.ingester.model; - -/** - * The raw data in a Kafka record. 
- */ -public class KafkaRawRecord { - private final byte[] data; - private final long readAtTimestampMs; - - public KafkaRawRecord(byte[] data, long readAtTimestampMs) { - this.data = data; - this.readAtTimestampMs = readAtTimestampMs; - } - - public byte[] getData() { - return data; - } - - public long getReadAtTimestampMs() { - return readAtTimestampMs; - } -} diff --git a/src/java/com/twitter/search/ingester/model/PromiseContainer.docx b/src/java/com/twitter/search/ingester/model/PromiseContainer.docx new file mode 100644 index 000000000..9e614de82 Binary files /dev/null and b/src/java/com/twitter/search/ingester/model/PromiseContainer.docx differ diff --git a/src/java/com/twitter/search/ingester/model/PromiseContainer.java b/src/java/com/twitter/search/ingester/model/PromiseContainer.java deleted file mode 100644 index 7d9b2ead9..000000000 --- a/src/java/com/twitter/search/ingester/model/PromiseContainer.java +++ /dev/null @@ -1,21 +0,0 @@ -package com.twitter.search.ingester.model; - -import com.twitter.util.Promise; - -public class PromiseContainer { - private final Promise promise; - private final U obj; - - public PromiseContainer(Promise promise, U obj) { - this.promise = promise; - this.obj = obj; - } - - public Promise getPromise() { - return promise; - } - - public U getObj() { - return obj; - } -} diff --git a/src/java/com/twitter/search/ingester/model/VisibleTokenRatioUtil.docx b/src/java/com/twitter/search/ingester/model/VisibleTokenRatioUtil.docx new file mode 100644 index 000000000..ee477d237 Binary files /dev/null and b/src/java/com/twitter/search/ingester/model/VisibleTokenRatioUtil.docx differ diff --git a/src/java/com/twitter/search/ingester/model/VisibleTokenRatioUtil.java b/src/java/com/twitter/search/ingester/model/VisibleTokenRatioUtil.java deleted file mode 100644 index 52c8654a5..000000000 --- a/src/java/com/twitter/search/ingester/model/VisibleTokenRatioUtil.java +++ /dev/null @@ -1,42 +0,0 @@ -package com.twitter.search.ingester.model; 
- -import com.twitter.common.text.token.TokenizedCharSequenceStream; -import com.twitter.common.text.token.attribute.CharSequenceTermAttribute; -import com.twitter.search.common.relevance.text.VisibleTokenRatioNormalizer; - -public class VisibleTokenRatioUtil { - - private static final int TOKEN_DEMARCATION = 140; - - private static final VisibleTokenRatioNormalizer NORMALIZER = - VisibleTokenRatioNormalizer.createInstance(); - - /** - * Take the number of visible tokens and divide by number of total tokens to get the - * visible token percentage (pretending 140 chars is visible as that is old typical tweet - * size). Then normalize it down to 4 bits(round it basically) - */ - public int extractAndNormalizeTokenPercentage(TokenizedCharSequenceStream tokenSeqStream) { - - CharSequenceTermAttribute attr = tokenSeqStream.addAttribute(CharSequenceTermAttribute.class); - - int totalTokens = 0; - int numTokensBelowThreshold = 0; - while (tokenSeqStream.incrementToken()) { - totalTokens++; - int offset = attr.getOffset(); - if (offset <= TOKEN_DEMARCATION) { - numTokensBelowThreshold++; - } - } - - double percent; - if (totalTokens > 0) { - percent = numTokensBelowThreshold / (double) totalTokens; - } else { - percent = 1; - } - - return NORMALIZER.normalize(percent); - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/app/BUILD b/src/java/com/twitter/search/ingester/pipeline/app/BUILD deleted file mode 100644 index d28a18bd7..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/app/BUILD +++ /dev/null @@ -1,31 +0,0 @@ -java_library( - sources = ["*.java"], - compiler_option_sets = ["fatal_warnings"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/commons-logging", - "3rdparty/jvm/org/slf4j:slf4j-api", - "decider/src/main/scala", - "finagle/finagle-core/src/main", - "finagle/finagle-http/src/main/scala", - "servo/decider/src/main/scala", - 
"src/java/com/twitter/search/common/debug", - "src/java/com/twitter/search/common/logging", - "src/java/com/twitter/search/common/metrics", - "src/java/com/twitter/search/ingester/model", - "src/java/com/twitter/search/ingester/pipeline/twitter", - "src/java/com/twitter/search/ingester/pipeline/twitter/kafka", - "src/java/com/twitter/search/ingester/pipeline/util", - "src/java/com/twitter/search/ingester/pipeline/wire", - "src/java/com/twitter/search/ingester/util/jndi", - "src/java/org/apache/commons/pipeline", - "src/thrift/com/twitter/tweetypie:events-java", - "twitter-server/server/src/main/scala", - "util/util-app/src/main/scala", - "util/util-core:scala", - "util/util-lint/src/main/scala", - "util/util-stats/src/main/scala", - ], -) diff --git a/src/java/com/twitter/search/ingester/pipeline/app/BUILD.docx b/src/java/com/twitter/search/ingester/pipeline/app/BUILD.docx new file mode 100644 index 000000000..af1bfdd8a Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/app/BUILD.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/app/IngesterPipelineApplication.docx b/src/java/com/twitter/search/ingester/pipeline/app/IngesterPipelineApplication.docx new file mode 100644 index 000000000..bc2408aff Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/app/IngesterPipelineApplication.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/app/IngesterPipelineApplication.java b/src/java/com/twitter/search/ingester/pipeline/app/IngesterPipelineApplication.java deleted file mode 100644 index 2c7d9c952..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/app/IngesterPipelineApplication.java +++ /dev/null @@ -1,195 +0,0 @@ -package com.twitter.search.ingester.pipeline.app; - -import java.io.File; -import java.net.URL; -import java.util.concurrent.CountDownLatch; -import java.util.concurrent.atomic.AtomicBoolean; - -import com.google.common.annotations.VisibleForTesting; - -import 
org.apache.commons.pipeline.Pipeline; -import org.apache.commons.pipeline.PipelineCreationException; -import org.apache.commons.pipeline.StageException; -import org.apache.commons.pipeline.config.DigesterPipelineFactory; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import com.twitter.app.Flag; -import com.twitter.app.Flaggable; -import com.twitter.search.common.metrics.BuildInfoStats; -import com.twitter.search.ingester.pipeline.wire.ProductionWireModule; -import com.twitter.search.ingester.pipeline.wire.WireModule; -import com.twitter.search.ingester.util.jndi.JndiUtil; -import com.twitter.server.AbstractTwitterServer; -import com.twitter.server.handler.DeciderHandler$; - -/** Starts the ingester/indexer pipeline. */ -public class IngesterPipelineApplication extends AbstractTwitterServer { - private static final Logger LOG = LoggerFactory.getLogger(IngesterPipelineApplication.class); - private static final String VERSION_2 = "v2"; - private final Flag pipelineConfigFile = flag().create( - "config_file", - "", - "xml file to load pipeline config from. Required.", - Flaggable.ofString()); - - private final Flag pipelineVersion = flag().create( - "version", - "", - "Specifies if we want to run the acp pipeline or non acp pipeline.", - Flaggable.ofString()); - - private final Flag partitionArg = flag().create( - "shard", - -1, - "The partition this indexer is responsible for.", - Flaggable.ofJavaInteger()); - - private final Flag deciderOverlay = flag().create( - "decider_overlay", - "", - "Decider overlay", - Flaggable.ofString()); - - private final Flag serviceIdentifierFlag = flag().create( - "service_identifier", - "", - "Service identifier for mutual TLS authentication", - Flaggable.ofString()); - - private final Flag environment = flag().create( - "environment", - "", - "Specifies the environment the app is running in. Valid values : prod, staging, " - + "staging1. 
Required if pipelineVersion == 'v2'", - Flaggable.ofString() - ); - - private final Flag cluster = flag().create( - "cluster", - "", - "Specifies the cluster the app is running in. Valid values : realtime, protected, " - + "realtime_cg, user_updates. Required if pipelineVersion == 'v2'", - Flaggable.ofString() - ); - - private final Flag cores = flag().create( - "cores", - 1F, - "Specifies the number of cores this cluster is using. ", - Flaggable.ofJavaFloat() - ); - - private final CountDownLatch shutdownLatch = new CountDownLatch(1); - - public void shutdown() { - shutdownLatch.countDown(); - } - - private Pipeline pipeline; - - private final AtomicBoolean started = new AtomicBoolean(false); - - private final AtomicBoolean finished = new AtomicBoolean(false); - - /** - * Boilerplate for the Java-friendly AbstractTwitterServer - */ - public static class Main { - public static void main(String[] args) { - new IngesterPipelineApplication().main(args); - } - } - - /** - * Code is based on DigesterPipelineFactory.main. We only require reading in one config file. - */ - @Override - public void main() { - try { - JndiUtil.loadJNDI(); - - ProductionWireModule wireModule = new ProductionWireModule( - deciderOverlay.get().get(), - partitionArg.getWithDefault().get(), - serviceIdentifierFlag.get()); - WireModule.bindWireModule(wireModule); - - addAdminRoute(DeciderHandler$.MODULE$.route( - "ingester", - wireModule.getMutableDecisionMaker(), - wireModule.getDecider())); - - BuildInfoStats.export(); - if (pipelineVersion.get().get().equals(VERSION_2)) { - runPipelineV2(wireModule); - } else { - runPipelineV1(wireModule); - } - LOG.info("Pipeline terminated. Ingester is DOWN."); - } catch (Exception e) { - LOG.error("Exception in pipeline. 
Ingester is DOWN.", e); - throw new RuntimeException(e); - } - } - - @VisibleForTesting - boolean isFinished() { - return finished.get(); - } - - @VisibleForTesting - Pipeline createPipeline(URL pipelineConfigFileURL) throws PipelineCreationException { - DigesterPipelineFactory factory = new DigesterPipelineFactory(pipelineConfigFileURL); - LOG.info("Pipeline created from {}, about to begin processing...", pipelineConfigFileURL); - return factory.createPipeline(); - } - - void runPipelineV1(ProductionWireModule wireModule) throws Exception { - LOG.info("Running Pipeline V1"); - final File pipelineFile = new File(pipelineConfigFile.get().get()); - URL pipelineConfigFileUrl = pipelineFile.toURI().toURL(); - wireModule.setPipelineExceptionHandler(new PipelineExceptionImpl(this)); - runPipelineV1(pipelineConfigFileUrl); - shutdownLatch.await(); - } - - @VisibleForTesting - void runPipelineV1(URL pipelineConfigFileUrl) throws Exception { - pipeline = createPipeline(pipelineConfigFileUrl); - pipeline.start(); - started.set(true); - } - - void runPipelineV2(ProductionWireModule wireModule) throws Exception { - LOG.info("Running Pipeline V2"); - int threadsToSpawn = cores.get().get().intValue() - 1; - RealtimeIngesterPipelineV2 realtimePipeline = new RealtimeIngesterPipelineV2( - environment.get().get(), cluster.get().get(), threadsToSpawn); - wireModule.setPipelineExceptionHandler(new PipelineExceptionImplV2(realtimePipeline)); - realtimePipeline.run(); - } - - @Override - public void onExit() { - try { - LOG.info("Attempting to shutdown gracefully."); - /* - * Iterates over each Stage and calls finish(). The Stage is considered finished when - * its queue is empty. If there is a backup, finish() waits for the queues to empty. 
- */ - - // We don't call finish() unless the pipeline exists and has started because if any stage - // fails to initialize, no processing is started and not only is calling finish() unnecessary, - // but it will also deadlock any DedicatedThreadStageDriver. - if (pipeline != null && started.get()) { - pipeline.finish(); - finished.set(true); - LOG.info("Pipeline exited cleanly."); - } else { - LOG.info("Pipeline not yet started."); - } - } catch (StageException e) { - LOG.error("Unable to shutdown pipeline.", e); - } - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/app/PipelineExceptionImpl.docx b/src/java/com/twitter/search/ingester/pipeline/app/PipelineExceptionImpl.docx new file mode 100644 index 000000000..af159856c Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/app/PipelineExceptionImpl.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/app/PipelineExceptionImpl.java b/src/java/com/twitter/search/ingester/pipeline/app/PipelineExceptionImpl.java deleted file mode 100644 index 5ce4892af..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/app/PipelineExceptionImpl.java +++ /dev/null @@ -1,30 +0,0 @@ -package com.twitter.search.ingester.pipeline.app; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.ingester.pipeline.util.PipelineExceptionHandler; -import com.twitter.util.Duration; - -public class PipelineExceptionImpl implements PipelineExceptionHandler { - private static final Logger LOG = LoggerFactory.getLogger(PipelineExceptionImpl.class); - - private final IngesterPipelineApplication app; - - public PipelineExceptionImpl(IngesterPipelineApplication app) { - this.app = app; - } - - @Override - public void logAndWait(String msg, Duration waitTime) throws InterruptedException { - LOG.info(msg); - long waitTimeInMilliSecond = waitTime.inMilliseconds(); - Thread.sleep(waitTimeInMilliSecond); - } - - @Override - public void logAndShutdown(String 
msg) { - LOG.error(msg); - app.shutdown(); - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/app/PipelineExceptionImplV2.docx b/src/java/com/twitter/search/ingester/pipeline/app/PipelineExceptionImplV2.docx new file mode 100644 index 000000000..6e9e2971e Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/app/PipelineExceptionImplV2.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/app/PipelineExceptionImplV2.java b/src/java/com/twitter/search/ingester/pipeline/app/PipelineExceptionImplV2.java deleted file mode 100644 index 1b576ebdf..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/app/PipelineExceptionImplV2.java +++ /dev/null @@ -1,29 +0,0 @@ -package com.twitter.search.ingester.pipeline.app; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.ingester.pipeline.util.PipelineExceptionHandler; -import com.twitter.util.Duration; - -public class PipelineExceptionImplV2 implements PipelineExceptionHandler { - private static final Logger LOG = LoggerFactory.getLogger(PipelineExceptionImplV2.class); - private RealtimeIngesterPipelineV2 pipeline; - - public PipelineExceptionImplV2(RealtimeIngesterPipelineV2 pipeline) { - this.pipeline = pipeline; - } - - @Override - public void logAndWait(String msg, Duration waitTime) throws InterruptedException { - LOG.info(msg); - long waitTimeInMilliSecond = waitTime.inMilliseconds(); - Thread.sleep(waitTimeInMilliSecond); - } - - @Override - public void logAndShutdown(String msg) { - LOG.info(msg); - pipeline.shutdown(); - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/app/RealtimeIngesterPipelineV2.docx b/src/java/com/twitter/search/ingester/pipeline/app/RealtimeIngesterPipelineV2.docx new file mode 100644 index 000000000..5499c7242 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/app/RealtimeIngesterPipelineV2.docx differ diff --git 
a/src/java/com/twitter/search/ingester/pipeline/app/RealtimeIngesterPipelineV2.java b/src/java/com/twitter/search/ingester/pipeline/app/RealtimeIngesterPipelineV2.java deleted file mode 100644 index b3669305c..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/app/RealtimeIngesterPipelineV2.java +++ /dev/null @@ -1,111 +0,0 @@ -package com.twitter.search.ingester.pipeline.app; -import java.util.List; -import java.util.concurrent.CompletableFuture; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.SynchronousQueue; -import java.util.concurrent.ThreadPoolExecutor; -import java.util.concurrent.TimeUnit; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.ingester.model.IngesterTweetEvent; -import com.twitter.search.ingester.model.KafkaRawRecord; -import com.twitter.search.ingester.pipeline.twitter.TweetEventDeserializerStage; -import com.twitter.search.ingester.pipeline.twitter.kafka.KafkaConsumerStage; -import com.twitter.search.ingester.pipeline.twitter.kafka.KafkaRawRecordConsumerStage; -import com.twitter.search.ingester.pipeline.util.PipelineV2CreationException; -import com.twitter.search.ingester.pipeline.util.PipelineStageException; - -public class RealtimeIngesterPipelineV2 { - private static final Logger LOG = LoggerFactory.getLogger(RealtimeIngesterPipelineV2.class); - private static final String PROD_ENV = "prod"; - private static final String STAGING_ENV = "staging"; - private static final String STAGING1_ENV = "staging1"; - private static final String REALTIME_CLUSTER = "realtime"; - private static final String PROTECTED_CLUSTER = "protected"; - private static final String REALTIME_CG_CLUSTER = "realtime_cg"; - private static final String KAFKA_CLIENT_ID = ""; - private static final String KAFKA_TOPIC_NAME = ""; - private static final String KAFKA_CONSUMER_GROUP_ID = ""; - private static final String KAFKA_CLUSTER_PATH = ""; - 
private static final String KAFKA_DECIDER_KEY = "ingester_tweets_consume_from_kafka"; - private static final String STATS_PREFIX = "realtimeingesterpipelinev2"; - private SearchCounter kafkaErrorCount = SearchCounter.create(STATS_PREFIX - + "_kafka_error_count"); - private Boolean running; - private String environment; - private String cluster; - private ExecutorService threadPool; - private KafkaConsumerStage kafkaConsumer; - private TweetEventDeserializerStage tweetEventDeserializerStage; - - public RealtimeIngesterPipelineV2(String environment, String cluster, int threadsToSpawn) throws - PipelineV2CreationException, PipelineStageException { - if (!environment.equals(PROD_ENV) && !environment.equals(STAGING_ENV) - && !environment.equals(STAGING1_ENV)) { - throw new PipelineV2CreationException("invalid value for environment"); - } - - if (!cluster.equals(REALTIME_CLUSTER) - && !cluster.equals(PROTECTED_CLUSTER) && !cluster.equals(REALTIME_CG_CLUSTER)) { - throw new PipelineV2CreationException("invalid value for cluster."); - } - - int numberOfThreads = Math.max(1, threadsToSpawn); - this.environment = environment; - this.cluster = cluster; - this.threadPool = new ThreadPoolExecutor(numberOfThreads, numberOfThreads, 0L, - TimeUnit.MILLISECONDS, new SynchronousQueue<>(), new ThreadPoolExecutor.CallerRunsPolicy()); - initStages(); - } - - private void initStages() throws PipelineStageException { - kafkaConsumer = new KafkaRawRecordConsumerStage(KAFKA_CLIENT_ID, KAFKA_TOPIC_NAME, - KAFKA_CONSUMER_GROUP_ID, KAFKA_CLUSTER_PATH, KAFKA_DECIDER_KEY); - kafkaConsumer.setupStageV2(); - tweetEventDeserializerStage = new TweetEventDeserializerStage(); - tweetEventDeserializerStage.setupStageV2(); - } - - /*** - * Starts the pipeline by starting the polling from Kafka and passing the events to the first - * stage of the pipeline. 
- */ - public void run() { - running = true; - while (running) { - pollFromKafkaAndSendToPipeline(); - } - } - - private void pollFromKafkaAndSendToPipeline() { - try { - List records = kafkaConsumer.pollFromTopic(); - for (KafkaRawRecord record : records) { - processKafkaRecord(record); - } - } catch (PipelineStageException e) { - kafkaErrorCount.increment(); - LOG.error("Error polling from Kafka", e); - } - } - - private void processKafkaRecord(KafkaRawRecord record) { - CompletableFuture stage1 = CompletableFuture.supplyAsync(() -> record, - threadPool); - - CompletableFuture stage2 = stage1.thenApplyAsync((KafkaRawRecord r) -> - tweetEventDeserializerStage.runStageV2(r), threadPool); - - } - - /*** - * Stop the pipeline from processing any further events. - */ - public void shutdown() { - running = false; - kafkaConsumer.cleanupStageV2(); - tweetEventDeserializerStage.cleanupStageV2(); - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/strato_fetchers/AudioSpaceCoreFetcher.docx b/src/java/com/twitter/search/ingester/pipeline/strato_fetchers/AudioSpaceCoreFetcher.docx new file mode 100644 index 000000000..28907f066 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/strato_fetchers/AudioSpaceCoreFetcher.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/strato_fetchers/AudioSpaceCoreFetcher.java b/src/java/com/twitter/search/ingester/pipeline/strato_fetchers/AudioSpaceCoreFetcher.java deleted file mode 100644 index b80cd93cf..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/strato_fetchers/AudioSpaceCoreFetcher.java +++ /dev/null @@ -1,56 +0,0 @@ -package com.twitter.search.ingester.pipeline.strato_fetchers; - -import java.util.List; -import java.util.Set; -import java.util.stream.Collectors; - -import com.twitter.periscope.api.thriftjava.AudioSpacesLookupContext; -import com.twitter.stitch.Stitch; -import com.twitter.strato.catalog.Fetch; -import com.twitter.strato.client.Client; -import 
com.twitter.strato.client.Fetcher; -import com.twitter.strato.data.Conv; -import com.twitter.strato.thrift.TBaseConv; -import com.twitter.ubs.thriftjava.AudioSpace; -import com.twitter.util.Future; -import com.twitter.util.Try; - -/** - * Fetches from the audio space core strato column. - */ -public class AudioSpaceCoreFetcher { - private static final String CORE_STRATO_COLUMN = ""; - - private static final AudioSpacesLookupContext - EMPTY_AUDIO_LOOKUP_CONTEXT = new AudioSpacesLookupContext(); - - private final Fetcher fetcher; - - public AudioSpaceCoreFetcher(Client stratoClient) { - fetcher = stratoClient.fetcher( - CORE_STRATO_COLUMN, - true, // enables checking types against catalog - Conv.stringConv(), - TBaseConv.forClass(AudioSpacesLookupContext.class), - TBaseConv.forClass(AudioSpace.class)); - } - - public Future> fetch(String spaceId) { - return Stitch.run(fetcher.fetch(spaceId, EMPTY_AUDIO_LOOKUP_CONTEXT)); - } - - /** - * Use stitch to fetch mulitiple AudioSpace Objects at once - */ - public Future>>> fetchBulkSpaces(Set spaceIds) { - return Stitch.run( - Stitch.collectToTry( - spaceIds - .stream() - .map(spaceId -> fetcher.fetch(spaceId, EMPTY_AUDIO_LOOKUP_CONTEXT)) - .collect(Collectors.toList()) - ) - ); - } - -} diff --git a/src/java/com/twitter/search/ingester/pipeline/strato_fetchers/AudioSpaceParticipantsFetcher.docx b/src/java/com/twitter/search/ingester/pipeline/strato_fetchers/AudioSpaceParticipantsFetcher.docx new file mode 100644 index 000000000..69ef34237 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/strato_fetchers/AudioSpaceParticipantsFetcher.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/strato_fetchers/AudioSpaceParticipantsFetcher.java b/src/java/com/twitter/search/ingester/pipeline/strato_fetchers/AudioSpaceParticipantsFetcher.java deleted file mode 100644 index 591c4e541..000000000 --- 
a/src/java/com/twitter/search/ingester/pipeline/strato_fetchers/AudioSpaceParticipantsFetcher.java +++ /dev/null @@ -1,36 +0,0 @@ -package com.twitter.search.ingester.pipeline.strato_fetchers; - -import com.twitter.periscope.api.thriftjava.AudioSpacesLookupContext; -import com.twitter.stitch.Stitch; -import com.twitter.strato.catalog.Fetch; -import com.twitter.strato.client.Client; -import com.twitter.strato.client.Fetcher; -import com.twitter.strato.data.Conv; -import com.twitter.strato.thrift.TBaseConv; -import com.twitter.ubs.thriftjava.Participants; -import com.twitter.util.Future; - -/** - * Fetches from the audio space participants strato column. - */ -public class AudioSpaceParticipantsFetcher { - private static final String PARTICIPANTS_STRATO_COLUMN = ""; - - private static final AudioSpacesLookupContext - EMPTY_AUDIO_LOOKUP_CONTEXT = new AudioSpacesLookupContext(); - - private final Fetcher fetcher; - - public AudioSpaceParticipantsFetcher(Client stratoClient) { - fetcher = stratoClient.fetcher( - PARTICIPANTS_STRATO_COLUMN, - true, // enables checking types against catalog - Conv.stringConv(), - TBaseConv.forClass(AudioSpacesLookupContext.class), - TBaseConv.forClass(Participants.class)); - } - - public Future> fetch(String spaceId) { - return Stitch.run(fetcher.fetch(spaceId, EMPTY_AUDIO_LOOKUP_CONTEXT)); - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/strato_fetchers/BUILD b/src/java/com/twitter/search/ingester/pipeline/strato_fetchers/BUILD deleted file mode 100644 index 57f38483a..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/strato_fetchers/BUILD +++ /dev/null @@ -1,20 +0,0 @@ -java_library( - sources = ["*.java"], - compiler_option_sets = ["fatal_warnings"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/commons-lang", - "cuad/projects/ner/thrift:thrift-java", - "periscope/api-proxy-thrift/thrift/src/main/thrift:thrift-java", - "scrooge/scrooge-core/src/main/scala", - 
"src/java/com/twitter/common/collections", - "stitch/stitch-core", - "strato/src/main/scala/com/twitter/strato/catalog", - "strato/src/main/scala/com/twitter/strato/client", - "strato/src/main/scala/com/twitter/strato/thrift", - "twitter-server-internal/src/main/scala", - "ubs/common/src/main/thrift/com/twitter/ubs:broadcast-thrift-java", - "util/util-core:util-core-util", - ], -) diff --git a/src/java/com/twitter/search/ingester/pipeline/strato_fetchers/BUILD.docx b/src/java/com/twitter/search/ingester/pipeline/strato_fetchers/BUILD.docx new file mode 100644 index 000000000..a178b2fc5 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/strato_fetchers/BUILD.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/strato_fetchers/NamedEntityFetcher.docx b/src/java/com/twitter/search/ingester/pipeline/strato_fetchers/NamedEntityFetcher.docx new file mode 100644 index 000000000..77b15e66f Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/strato_fetchers/NamedEntityFetcher.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/strato_fetchers/NamedEntityFetcher.java b/src/java/com/twitter/search/ingester/pipeline/strato_fetchers/NamedEntityFetcher.java deleted file mode 100644 index fb5cbefeb..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/strato_fetchers/NamedEntityFetcher.java +++ /dev/null @@ -1,47 +0,0 @@ -package com.twitter.search.ingester.pipeline.strato_fetchers; - -import scala.Option; - -import com.twitter.cuad.ner.plain.thriftjava.NamedEntities; -import com.twitter.cuad.ner.plain.thriftjava.NamedEntitiesRequestOptions; -import com.twitter.cuad.ner.thriftjava.ModelFamily; -import com.twitter.cuad.ner.thriftjava.NERCalibrateRequest; -import com.twitter.cuad.thriftjava.CalibrationLevel; -import com.twitter.cuad.thriftjava.NERCandidateSource; -import com.twitter.stitch.Stitch; -import com.twitter.strato.catalog.Fetch; -import com.twitter.strato.client.Client; -import 
com.twitter.strato.client.Fetcher; -import com.twitter.strato.data.Conv; -import com.twitter.strato.opcontext.ServeWithin; -import com.twitter.strato.thrift.TBaseConv; -import com.twitter.util.Duration; -import com.twitter.util.Future; - -public class NamedEntityFetcher { - private static final String NAMED_ENTITY_STRATO_COLUMN = ""; - - private static final ServeWithin SERVE_WITHIN = new ServeWithin( - Duration.fromMilliseconds(100), Option.empty()); - - private static final NamedEntitiesRequestOptions REQUEST_OPTIONS = - new NamedEntitiesRequestOptions( - new NERCalibrateRequest(CalibrationLevel.HIGH_PRECISION, NERCandidateSource.NER_CRF) - .setModel_family(ModelFamily.CFB)) - .setDisplay_entity_info(false); - - private final Fetcher fetcher; - - public NamedEntityFetcher(Client stratoClient) { - fetcher = stratoClient.fetcher( - NAMED_ENTITY_STRATO_COLUMN, - true, // enables checking types against catalog - Conv.longConv(), - TBaseConv.forClass(NamedEntitiesRequestOptions.class), - TBaseConv.forClass(NamedEntities.class)).serveWithin(SERVE_WITHIN); - } - - public Future> fetch(long tweetId) { - return Stitch.run(fetcher.fetch(tweetId, REQUEST_OPTIONS)); - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/AsyncPinkUrlsResolver.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/AsyncPinkUrlsResolver.docx new file mode 100644 index 000000000..f2b16043c Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/AsyncPinkUrlsResolver.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/AsyncPinkUrlsResolver.java b/src/java/com/twitter/search/ingester/pipeline/twitter/AsyncPinkUrlsResolver.java deleted file mode 100644 index 0b1ae2187..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/AsyncPinkUrlsResolver.java +++ /dev/null @@ -1,67 +0,0 @@ -package com.twitter.search.ingester.pipeline.twitter; - -import java.util.Collection; -import java.util.List; -import 
java.util.Map; - -import com.google.common.collect.ImmutableList; -import com.google.common.collect.Maps; - -import com.twitter.pink_floyd.thrift.ClientIdentifier; -import com.twitter.pink_floyd.thrift.Mask; -import com.twitter.pink_floyd.thrift.Storer; -import com.twitter.pink_floyd.thrift.UrlData; -import com.twitter.pink_floyd.thrift.UrlReadRequest; -import com.twitter.util.Function; -import com.twitter.util.Future; - -/** - * Resolve compressed URL via Pink - */ -public class AsyncPinkUrlsResolver { - private final Storer.ServiceIface storerClient; - private final ClientIdentifier pinkClientId; - private final Mask requestMask; - - // Use ServerSet to construct a metadata store client - public AsyncPinkUrlsResolver(Storer.ServiceIface storerClient, String pinkClientId) { - this.storerClient = storerClient; - this.pinkClientId = ClientIdentifier.valueOf(pinkClientId); - - requestMask = new Mask(); - requestMask.setResolution(true); - requestMask.setHtmlBasics(true); - requestMask.setUrlDirectInfo(true); - } - - /** - * resolve urls calling pink asynchronously - * @param urls urls to resolve - * @return Future map of resolved urls - */ - public Future> resolveUrls( - Collection urls) { - if (urls == null || urls.size() == 0) { - Future.value(Maps.newHashMap()); - } - - List urlsList = ImmutableList.copyOf(urls); - - UrlReadRequest request = new UrlReadRequest(); - request.setUrls(urlsList); - request.setClientId(pinkClientId); - request.setMask(requestMask); - - return storerClient.read(request).map(Function.func( - response -> { - Map resultMap = Maps.newHashMap(); - for (UrlData urlData : response.getData()) { - if (ResolveCompressedUrlsUtils.isResolved(urlData)) { - resultMap.put(urlData.url, ResolveCompressedUrlsUtils.getUrlInfo(urlData)); - } - } - return resultMap; - } - )); - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/BUILD b/src/java/com/twitter/search/ingester/pipeline/twitter/BUILD deleted file mode 100644 index 
5fd578ba8..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/BUILD +++ /dev/null @@ -1,74 +0,0 @@ -java_library( - sources = ["*.java"], - compiler_option_sets = ["fatal_warnings"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/commons-io", - "3rdparty/jvm/commons-logging", - "3rdparty/jvm/org/apache/hadoop:hadoop-client-default", - "3rdparty/jvm/org/apache/thrift:libthrift", - "cuad/projects/ner/client/src/main/scala/com/twitter/cuad/ner/client", - "cuad/projects/ner/thrift/src/main/thrift:thrift-java", - "decider/src/main/scala", - "eventbus/client/src/main/scala/com/twitter/eventbus/client", - "finagle/finagle-core/src/main", - "finagle/finagle-thriftmux/src/main/scala", - "pink-floyd/pink-common/src/main/java/com/twitter/spiderduck/common", - "scrooge/scrooge-core", - "scrooge/scrooge-serializer/src/main/scala", - "servo/util/src/main/scala", - "src/java/com/twitter/common/text/language:locale-util", - "src/java/com/twitter/common/util:system-mocks", - "src/java/com/twitter/common_internal/text/version", - "src/java/com/twitter/metastore/client_v2", - "src/java/com/twitter/search/common/config", - "src/java/com/twitter/search/common/converter/earlybird", - "src/java/com/twitter/search/common/debug", - "src/java/com/twitter/search/common/decider", - "src/java/com/twitter/search/common/encoding/features", - "src/java/com/twitter/search/common/logging", - "src/java/com/twitter/search/common/metrics", - "src/java/com/twitter/search/common/partitioning:timeslice-manager", - "src/java/com/twitter/search/common/partitioning/base", - "src/java/com/twitter/search/common/partitioning/snowflakeparser", - "src/java/com/twitter/search/common/relevance:classifiers", - "src/java/com/twitter/search/common/relevance:entities_and_filters", - "src/java/com/twitter/search/common/relevance:scorers", - "src/java/com/twitter/search/common/relevance:text", - 
"src/java/com/twitter/search/common/schema", - "src/java/com/twitter/search/common/schema/base", - "src/java/com/twitter/search/common/schema/earlybird", - "src/java/com/twitter/search/common/util/text", - "src/java/com/twitter/search/common/util/thrift:text-protocol", - "src/java/com/twitter/search/ingester/model", - "src/java/com/twitter/search/ingester/model/engagements", - "src/java/com/twitter/search/ingester/pipeline/strato_fetchers", - "src/java/com/twitter/search/ingester/pipeline/twitter/filters", - "src/java/com/twitter/search/ingester/pipeline/twitter/thriftparse", - "src/java/com/twitter/search/ingester/pipeline/util", - "src/java/com/twitter/search/ingester/pipeline/wire", - "src/java/org/apache/commons/pipeline", - "src/thrift/com/twitter/expandodo:cards-java", - "src/thrift/com/twitter/gizmoduck:thrift-java", - "src/thrift/com/twitter/pink-floyd/thrift:derivatives-java", - "src/thrift/com/twitter/pink-floyd/thrift:thrift-java", - "src/thrift/com/twitter/search/common:indexing-java", - "src/thrift/com/twitter/search/common:schema-java", - "src/thrift/com/twitter/search/common/debug:debug-java", - "src/thrift/com/twitter/service/spiderduck/gen:metadata-store-java", - "src/thrift/com/twitter/timelineservice/server/internal:thrift-java", - "src/thrift/com/twitter/tweetypie:events-java", - "src/thrift/com/twitter/tweetypie:events-scala", - "src/thrift/com/twitter/tweetypie:service-java", - "src/thrift/com/twitter/tweetypie:tweet-java", - "stitch/stitch-core", - "storage/clients/manhattan/client/src/main/scala", - "strato/src/main/scala/com/twitter/strato/catalog", - "ubs/common/src/main/thrift/com/twitter/ubs:broadcast-thrift-java", - "util/util-core:scala", - "util/util-core/src/main/java", - "util/util-function/src/main/java", - ], -) diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/BUILD.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/BUILD.docx new file mode 100644 index 000000000..3f6b59be3 Binary files /dev/null and 
b/src/java/com/twitter/search/ingester/pipeline/twitter/BUILD.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/CollectComparableObjectsStage.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/CollectComparableObjectsStage.docx new file mode 100644 index 000000000..62113aa86 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/CollectComparableObjectsStage.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/CollectComparableObjectsStage.java b/src/java/com/twitter/search/ingester/pipeline/twitter/CollectComparableObjectsStage.java deleted file mode 100644 index f8d98723f..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/CollectComparableObjectsStage.java +++ /dev/null @@ -1,176 +0,0 @@ -/** - * © Copyright 2008, Summize, Inc. All rights reserved. - */ -package com.twitter.search.ingester.pipeline.twitter; - -import java.util.Collections; -import java.util.NavigableSet; -import java.util.TreeSet; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicLong; - -import org.apache.commons.pipeline.StageException; -import org.apache.commons.pipeline.validation.ConsumedTypes; -import org.apache.commons.pipeline.validation.ProducedTypes; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.debug.DebugEventUtil; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchCustomGauge; -import com.twitter.search.common.metrics.SearchTimerStats; - -/** - * Collect incoming objects into batches of the configured size and then - * emit the Collection of objects. Internally uses a TreeSet - * to remove duplicates. Incoming objects MUST implement the Comparable - * interface. 
- */ -@ConsumedTypes(Comparable.class) -@ProducedTypes(NavigableSet.class) -public class CollectComparableObjectsStage extends TwitterBaseStage { - private static final Logger LOG = LoggerFactory.getLogger(CollectComparableObjectsStage.class); - - // Batch size of the collections we are emitting. - private int batchSize = -1; - - // Top tweets sorts the tweets in reverse order. - private Boolean reverseOrder = false; - - // Batch being constructed. - private TreeSet currentCollection = null; - - // Timestamp (ms) of last batch emission. - private final AtomicLong lastEmitTimeMillis = new AtomicLong(-1); - // If set, will emit a batch (only upon arrival of a new element), if time since last emit has - // exceeded this threshold. - private long emitAfterMillis = -1; - - private SearchCounter sizeBasedEmitCount; - private SearchCounter timeBasedEmitCount; - private SearchCounter sizeAndTimeBasedEmitCount; - private SearchTimerStats batchEmitTimeStats; - - @Override - protected void initStats() { - super.initStats(); - - SearchCustomGauge.export(getStageNamePrefix() + "_last_emit_time", - () -> lastEmitTimeMillis.get()); - - sizeBasedEmitCount = SearchCounter.export(getStageNamePrefix() + "_size_based_emit_count"); - timeBasedEmitCount = SearchCounter.export(getStageNamePrefix() + "_time_based_emit_count"); - sizeAndTimeBasedEmitCount = SearchCounter.export( - getStageNamePrefix() + "_size_and_time_based_emit_count"); - - batchEmitTimeStats = SearchTimerStats.export( - getStageNamePrefix() + "_batch_emit_time", - TimeUnit.MILLISECONDS, - false, // no cpu timers - true); // with percentiles - } - - @Override - protected void doInnerPreprocess() throws StageException { - // We have to initialize this stat here, because initStats() is called before - // doInnerPreprocess(), so at that point the 'clock' is not set yet. 
- SearchCustomGauge.export(getStageNamePrefix() + "_millis_since_last_emit", - () -> clock.nowMillis() - lastEmitTimeMillis.get()); - - currentCollection = newBatchCollection(); - if (batchSize <= 0) { - throw new StageException(this, "Must set the batchSize parameter to a value >0"); - } - } - - private TreeSet newBatchCollection() { - return new TreeSet<>(reverseOrder ? Collections.reverseOrder() : null); - } - - @Override - public void innerProcess(Object obj) throws StageException { - if (!Comparable.class.isAssignableFrom(obj.getClass())) { - throw new StageException( - this, "Attempt to add a non-comparable object to a sorted collection"); - } - - currentCollection.add(obj); - if (shouldEmit()) { - // We want to trace here when we actually emit the batch, as tweets sit in this stage until - // a batch is full, and we want to see how long they actually stick around. - DebugEventUtil.addDebugEventToCollection( - currentCollection, "CollectComparableObjectsStage.outgoing", clock.nowMillis()); - emitAndCount(currentCollection); - updateLastEmitTime(); - - currentCollection = newBatchCollection(); - } - } - - private boolean shouldEmit() { - if (lastEmitTimeMillis.get() < 0) { - // Initialize lastEmit at the first tweet seen by this stage. 
- lastEmitTimeMillis.set(clock.nowMillis()); - } - - final boolean sizeBasedEmit = currentCollection.size() >= batchSize; - final boolean timeBasedEmit = - emitAfterMillis > 0 && lastEmitTimeMillis.get() + emitAfterMillis <= clock.nowMillis(); - - if (sizeBasedEmit && timeBasedEmit) { - sizeAndTimeBasedEmitCount.increment(); - return true; - } else if (sizeBasedEmit) { - sizeBasedEmitCount.increment(); - return true; - } else if (timeBasedEmit) { - timeBasedEmitCount.increment(); - return true; - } else { - return false; - } - } - - @Override - public void innerPostprocess() throws StageException { - if (!currentCollection.isEmpty()) { - emitAndCount(currentCollection); - updateLastEmitTime(); - currentCollection = newBatchCollection(); - } - } - - private void updateLastEmitTime() { - long currentEmitTime = clock.nowMillis(); - long previousEmitTime = lastEmitTimeMillis.getAndSet(currentEmitTime); - - // Also stat how long each emit takes. - batchEmitTimeStats.timerIncrement(currentEmitTime - previousEmitTime); - } - - public void setBatchSize(Integer size) { - LOG.info("Updating all CollectComparableObjectsStage batchSize to {}.", size); - this.batchSize = size; - } - - public Boolean getReverseOrder() { - return reverseOrder; - } - - public void setReverseOrder(Boolean reverseOrder) { - this.reverseOrder = reverseOrder; - } - - public void setEmitAfterMillis(long emitAfterMillis) { - LOG.info("Setting emitAfterMillis to {}.", emitAfterMillis); - this.emitAfterMillis = emitAfterMillis; - } - - public long getSizeBasedEmitCount() { - return sizeBasedEmitCount.get(); - } - - public long getTimeBasedEmitCount() { - return timeBasedEmitCount.get(); - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/ComputeTweetSignatureStage.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/ComputeTweetSignatureStage.docx new file mode 100644 index 000000000..bcbb6bd12 Binary files /dev/null and 
b/src/java/com/twitter/search/ingester/pipeline/twitter/ComputeTweetSignatureStage.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/ComputeTweetSignatureStage.java b/src/java/com/twitter/search/ingester/pipeline/twitter/ComputeTweetSignatureStage.java deleted file mode 100644 index 960cb6f86..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/ComputeTweetSignatureStage.java +++ /dev/null @@ -1,38 +0,0 @@ -package com.twitter.search.ingester.pipeline.twitter; - -import org.apache.commons.pipeline.StageException; -import org.apache.commons.pipeline.validation.ConsumedTypes; -import org.apache.commons.pipeline.validation.ProducesConsumed; - -import com.twitter.search.common.relevance.classifiers.TweetQualityFeatureExtractor; -import com.twitter.search.ingester.model.IngesterTwitterMessage; - -@ConsumedTypes(IngesterTwitterMessage.class) -@ProducesConsumed -public class ComputeTweetSignatureStage extends TwitterBaseStage - { - private final TweetQualityFeatureExtractor tweetSignatureExtractor = - new TweetQualityFeatureExtractor(); - - @Override - public void innerProcess(Object obj) throws StageException { - if (!(obj instanceof IngesterTwitterMessage)) { - throw new StageException(this, "Object is not a TwitterMessage instance: " + obj); - } - - IngesterTwitterMessage message = IngesterTwitterMessage.class.cast(obj); - extract(message); - emitAndCount(message); - } - - private void extract(IngesterTwitterMessage message) { - tweetSignatureExtractor.extractTweetTextFeatures(message); - } - - @Override - protected IngesterTwitterMessage innerRunStageV2(IngesterTwitterMessage message) { - extract(message); - return message; - } -} - diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/ConvertDelayedMessageToThriftStage.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/ConvertDelayedMessageToThriftStage.docx new file mode 100644 index 000000000..64e8d0dbd Binary files /dev/null and 
b/src/java/com/twitter/search/ingester/pipeline/twitter/ConvertDelayedMessageToThriftStage.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/ConvertDelayedMessageToThriftStage.java b/src/java/com/twitter/search/ingester/pipeline/twitter/ConvertDelayedMessageToThriftStage.java deleted file mode 100644 index 9a1d61bfa..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/ConvertDelayedMessageToThriftStage.java +++ /dev/null @@ -1,95 +0,0 @@ -package com.twitter.search.ingester.pipeline.twitter; - -import java.util.List; - -import javax.naming.NamingException; - -import com.google.common.base.Preconditions; -import com.google.common.collect.Lists; - -import org.apache.commons.pipeline.StageException; -import org.apache.commons.pipeline.validation.ConsumedTypes; -import org.apache.commons.pipeline.validation.ProducedTypes; - -import com.twitter.common_internal.text.version.PenguinVersion; -import com.twitter.search.common.converter.earlybird.DelayedIndexingConverter; -import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents; -import com.twitter.search.common.relevance.entities.TwitterMessage; -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.common.schema.earlybird.EarlybirdSchemaCreateTool; -import com.twitter.search.ingester.model.IngesterThriftVersionedEvents; -import com.twitter.search.ingester.model.IngesterTwitterMessage; - -@ConsumedTypes(IngesterTwitterMessage.class) -@ProducedTypes(IngesterThriftVersionedEvents.class) -public class ConvertDelayedMessageToThriftStage extends TwitterBaseStage - { - private List penguinVersionList; - private FieldStatExporter fieldStatExporter; - private DelayedIndexingConverter messageConverter; - - @Override - protected void doInnerPreprocess() throws StageException, NamingException { - Schema schema; - try { - schema = EarlybirdSchemaCreateTool.buildSchema(Preconditions.checkNotNull(earlybirdCluster)); - } catch 
(Schema.SchemaValidationException e) { - throw new StageException(this, e); - } - - penguinVersionList = wireModule.getPenguinVersions(); - messageConverter = new DelayedIndexingConverter(schema, decider); - fieldStatExporter = new FieldStatExporter("unsorted_urls", schema, penguinVersionList); - } - - @Override - public void innerProcess(Object obj) throws StageException { - if (!(obj instanceof IngesterTwitterMessage)) { - throw new StageException(this, "Object is not an IngesterTwitterMessage instance: " + obj); - } - - penguinVersionList = wireModule.getCurrentlyEnabledPenguinVersions(); - fieldStatExporter.updatePenguinVersions(penguinVersionList); - - IngesterTwitterMessage message = IngesterTwitterMessage.class.cast(obj); - for (IngesterThriftVersionedEvents events : buildVersionedEvents(message)) { - fieldStatExporter.addFieldStats(events); - emitAndCount(events); - } - } - - /** - * Method that converts all URL and card related fields and features of a TwitterMessage to a - * ThriftVersionedEvents instance. - * - * @param twitterMessage An IngesterThriftVersionedEvents instance to be converted. - * @return The corresponding ThriftVersionedEvents instance. 
- */ - private List buildVersionedEvents( - IngesterTwitterMessage twitterMessage) { - List versionedEvents = - messageConverter.convertMessageToOutOfOrderAppendAndFeatureUpdate( - twitterMessage, penguinVersionList); - Preconditions.checkArgument( - versionedEvents.size() == 2, - "DelayedIndexingConverter produced an incorrect number of ThriftVersionedEvents."); - return Lists.newArrayList( - toIngesterThriftVersionedEvents(versionedEvents.get(0), twitterMessage), - toIngesterThriftVersionedEvents(versionedEvents.get(1), twitterMessage)); - } - - private IngesterThriftVersionedEvents toIngesterThriftVersionedEvents( - ThriftVersionedEvents versionedEvents, IngesterTwitterMessage twitterMessage) { - // We don't want to propagate the same DebugEvents instance to multiple - // IngesterThriftVersionedEvents instances, because future stages might want to add new events - // to this list for multiple events at the same time, which would result in a - // ConcurrentModificationException. So we need to create a DebugEvents deep copy. 
- IngesterThriftVersionedEvents ingesterThriftVersionedEvents = - new IngesterThriftVersionedEvents(twitterMessage.getUserId()); - ingesterThriftVersionedEvents.setDarkWrite(false); - ingesterThriftVersionedEvents.setId(twitterMessage.getTweetId()); - ingesterThriftVersionedEvents.setVersionedEvents(versionedEvents.getVersionedEvents()); - ingesterThriftVersionedEvents.setDebugEvents(twitterMessage.getDebugEvents().deepCopy()); - return ingesterThriftVersionedEvents; - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/ConvertMessageToThriftStage.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/ConvertMessageToThriftStage.docx new file mode 100644 index 000000000..d7d156c34 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/ConvertMessageToThriftStage.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/ConvertMessageToThriftStage.java b/src/java/com/twitter/search/ingester/pipeline/twitter/ConvertMessageToThriftStage.java deleted file mode 100644 index 9b4fb6fd9..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/ConvertMessageToThriftStage.java +++ /dev/null @@ -1,117 +0,0 @@ -package com.twitter.search.ingester.pipeline.twitter; - -import java.io.IOException; -import java.util.List; -import java.util.Optional; - -import javax.naming.NamingException; - -import com.google.common.base.Preconditions; - -import org.apache.commons.lang.StringUtils; -import org.apache.commons.pipeline.StageException; -import org.apache.commons.pipeline.validation.ConsumedTypes; -import org.apache.commons.pipeline.validation.ProducesConsumed; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common_internal.text.version.PenguinVersion; -import com.twitter.search.common.converter.earlybird.BasicIndexingConverter; -import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents; -import com.twitter.search.common.metrics.SearchCounter; 
-import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.common.schema.earlybird.EarlybirdSchemaCreateTool; -import com.twitter.search.ingester.model.IngesterThriftVersionedEvents; -import com.twitter.search.ingester.model.IngesterTwitterMessage; - -@ConsumedTypes(IngesterTwitterMessage.class) -@ProducesConsumed -public class ConvertMessageToThriftStage extends TwitterBaseStage - { - private static final Logger LOG = LoggerFactory.getLogger(ConvertMessageToThriftStage.class); - - private List penguinVersionList; - private String thriftVersionedEventsBranchName; - private FieldStatExporter fieldStatExporter; - private BasicIndexingConverter messageConverter; - - private SearchCounter twitterMessageToTveErrorCount; - - @Override - public void initStats() { - super.initStats(); - twitterMessageToTveErrorCount = SearchCounter.export( - getStageNamePrefix() + "_ingester_convert_twitter_message_to_tve_error_count"); - } - - @Override - protected void doInnerPreprocess() throws StageException, NamingException { - Schema schema; - try { - schema = EarlybirdSchemaCreateTool.buildSchema(Preconditions.checkNotNull(earlybirdCluster)); - } catch (Schema.SchemaValidationException e) { - throw new StageException(this, e); - } - - penguinVersionList = wireModule.getPenguinVersions(); - Preconditions.checkState(StringUtils.isNotBlank(thriftVersionedEventsBranchName)); - messageConverter = new BasicIndexingConverter(schema, earlybirdCluster); - fieldStatExporter = new FieldStatExporter("unsorted_tweets", schema, penguinVersionList); - } - - @Override - public void innerProcess(Object obj) throws StageException { - if (!(obj instanceof IngesterTwitterMessage)) { - throw new StageException(this, "Object is not an IngesterTwitterMessage instance: " + obj); - } - - penguinVersionList = wireModule.getCurrentlyEnabledPenguinVersions(); - fieldStatExporter.updatePenguinVersions(penguinVersionList); - - IngesterTwitterMessage message = 
IngesterTwitterMessage.class.cast(obj); - - Optional maybeEvents = buildVersionedEvents(message); - if (maybeEvents.isPresent()) { - IngesterThriftVersionedEvents events = maybeEvents.get(); - fieldStatExporter.addFieldStats(events); - emitToBranchAndCount(thriftVersionedEventsBranchName, events); - } - - emitAndCount(message); - } - - /** - * Method that converts a TwitterMessage to a ThriftVersionedEvents. - * - * @param twitterMessage An IngesterThriftVersionedEvents instance to be converted. - * @return The corresponding ThriftVersionedEvents. - */ - private Optional buildVersionedEvents( - IngesterTwitterMessage twitterMessage) { - IngesterThriftVersionedEvents ingesterEvents = - new IngesterThriftVersionedEvents(twitterMessage.getUserId()); - ingesterEvents.setDarkWrite(false); - ingesterEvents.setId(twitterMessage.getTweetId()); - - // We will emit both the original TwitterMessage, and the ThriftVersionedEvents instance, so we - // need to make sure they have separate DebugEvents copies. 
- ingesterEvents.setDebugEvents(twitterMessage.getDebugEvents().deepCopy()); - - try { - ThriftVersionedEvents versionedEvents = - messageConverter.convertMessageToThrift(twitterMessage, true, penguinVersionList); - ingesterEvents.setVersionedEvents(versionedEvents.getVersionedEvents()); - return Optional.of(ingesterEvents); - } catch (IOException e) { - LOG.error("Failed to convert tweet " + twitterMessage.getTweetId() + " from TwitterMessage " - + "to ThriftVersionedEvents for Penguin versions " + penguinVersionList, - e); - twitterMessageToTveErrorCount.increment(); - } - return Optional.empty(); - } - - public void setThriftVersionedEventsBranchName(String thriftVersionedEventsBranchName) { - this.thriftVersionedEventsBranchName = thriftVersionedEventsBranchName; - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/ConvertToThriftVersionedEventsStage.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/ConvertToThriftVersionedEventsStage.docx new file mode 100644 index 000000000..725142438 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/ConvertToThriftVersionedEventsStage.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/ConvertToThriftVersionedEventsStage.java b/src/java/com/twitter/search/ingester/pipeline/twitter/ConvertToThriftVersionedEventsStage.java deleted file mode 100644 index a8b52418f..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/ConvertToThriftVersionedEventsStage.java +++ /dev/null @@ -1,83 +0,0 @@ -package com.twitter.search.ingester.pipeline.twitter; - -import javax.naming.NamingException; - -import org.apache.commons.pipeline.StageException; -import org.apache.commons.pipeline.validation.ConsumedTypes; -import org.apache.commons.pipeline.validation.ProducedTypes; - -import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants; -import 
com.twitter.search.ingester.model.IngesterThriftVersionedEvents; -import com.twitter.search.ingester.model.IngesterTwitterMessage; -import com.twitter.search.ingester.pipeline.util.PipelineStageRuntimeException; - -@ConsumedTypes(IngesterTwitterMessage.class) -@ProducedTypes(ThriftVersionedEvents.class) -public class ConvertToThriftVersionedEventsStage extends TwitterBaseStage - { - private ThriftVersionedEventsConverter converter; - - @Override - public void doInnerPreprocess() throws StageException, NamingException { - super.doInnerPreprocess(); - innerSetup(); - } - - @Override - protected void innerSetup() throws NamingException { - converter = new ThriftVersionedEventsConverter(wireModule.getPenguinVersions()); - } - - @Override - public void innerProcess(Object obj) throws StageException { - if (!(obj instanceof IngesterTwitterMessage)) { - throw new StageException(this, "Object is not an IngesterTwitterMessage: " + obj); - } - - IngesterTwitterMessage ingesterTwitterMessage = (IngesterTwitterMessage) obj; - IngesterThriftVersionedEvents maybeEvents = tryToConvert(ingesterTwitterMessage); - - if (maybeEvents == null) { - throw new StageException( - this, "Object is not a retweet or a reply: " + ingesterTwitterMessage); - } - - emitAndCount(maybeEvents); - - } - - @Override - protected IngesterThriftVersionedEvents innerRunStageV2(IngesterTwitterMessage message) { - IngesterThriftVersionedEvents maybeEvents = tryToConvert(message); - - if (maybeEvents == null) { - throw new PipelineStageRuntimeException("Object is not a retweet or reply, does not have to" - + " pass to next stage"); - } - - return maybeEvents; - } - - private IngesterThriftVersionedEvents tryToConvert(IngesterTwitterMessage message) { - converter.updatePenguinVersions(wireModule.getCurrentlyEnabledPenguinVersions()); - - if (!message.isRetweet() && !message.isReplyToTweet()) { - return null; - } - - if (message.isRetweet()) { - return converter.toOutOfOrderAppend( - 
message.getRetweetMessage().getSharedId(), - EarlybirdFieldConstants.EarlybirdFieldConstant.RETWEETED_BY_USER_ID, - message.getUserId(), - message.getDebugEvents().deepCopy()); - } - - return converter.toOutOfOrderAppend( - message.getInReplyToStatusId().get(), - EarlybirdFieldConstants.EarlybirdFieldConstant.REPLIED_TO_BY_USER_ID, - message.getUserId(), - message.getDebugEvents().deepCopy()); - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/EventBusReaderStage.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/EventBusReaderStage.docx new file mode 100644 index 000000000..c53517663 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/EventBusReaderStage.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/EventBusReaderStage.java b/src/java/com/twitter/search/ingester/pipeline/twitter/EventBusReaderStage.java deleted file mode 100644 index b42828ce5..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/EventBusReaderStage.java +++ /dev/null @@ -1,185 +0,0 @@ -package com.twitter.search.ingester.pipeline.twitter; - -import java.util.concurrent.TimeUnit; - -import javax.naming.NamingException; - -import scala.runtime.BoxedUnit; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; - -import org.apache.commons.pipeline.Pipeline; -import org.apache.commons.pipeline.StageDriver; -import org.apache.thrift.TBase; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.eventbus.client.EventBusSubscriber; -import com.twitter.search.common.decider.SearchDecider; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.ingester.model.PromiseContainer; -import com.twitter.search.ingester.pipeline.util.PipelineUtil; -import com.twitter.util.Await; -import com.twitter.util.Function; -import com.twitter.util.Future; -import com.twitter.util.Promise; - -public abstract 
class EventBusReaderStage> extends TwitterBaseStage - { - private static final Logger LOG = LoggerFactory.getLogger(EventBusReaderStage.class); - - private static final int DECIDER_POLL_INTERVAL_IN_SECS = 5; - - private SearchCounter totalEventsCount; - - private String environment = null; - private String eventBusReaderEnabledDeciderKey; - - private StageDriver stageDriver; - - private EventBusSubscriber eventBusSubscriber = null; - - // XML configuration options - private String eventBusSubscriberId; - private int maxConcurrentEvents; - private SearchDecider searchDecider; - - protected EventBusReaderStage() { - } - - @Override - protected void initStats() { - super.initStats(); - totalEventsCount = SearchCounter.export(getStageNamePrefix() + "_total_events_count"); - } - - @Override - protected void doInnerPreprocess() throws NamingException { - searchDecider = new SearchDecider(decider); - - if (stageDriver == null) { - stageDriver = ((Pipeline) stageContext).getStageDriver(this); - } - - eventBusReaderEnabledDeciderKey = String.format( - getDeciderKeyTemplate(), - earlybirdCluster.getNameForStats(), - environment); - - PipelineUtil.feedStartObjectToStage(this); - } - - protected abstract PromiseContainer eventAndPromiseToContainer( - T incomingEvent, - Promise p); - - private Future processEvent(T incomingEvent) { - Promise p = new Promise<>(); - PromiseContainer promiseContainer = eventAndPromiseToContainer(incomingEvent, p); - totalEventsCount.increment(); - emitAndCount(promiseContainer); - return p; - } - - private void closeEventBusSubscriber() throws Exception { - if (eventBusSubscriber != null) { - Await.result(eventBusSubscriber.close()); - eventBusSubscriber = null; - } - } - - protected abstract Class getThriftClass(); - - protected abstract String getDeciderKeyTemplate(); - - private void startUpEventBusSubscriber() { - // Start reading from eventbus if it is null - if (eventBusSubscriber == null) { - //noinspection unchecked - eventBusSubscriber = 
wireModule.createEventBusSubscriber( - Function.func(this::processEvent), - getThriftClass(), - eventBusSubscriberId, - maxConcurrentEvents); - - } - Preconditions.checkNotNull(eventBusSubscriber); - } - - /** - * This is only kicked off once with a start object which is ignored. Then we loop - * checking the decider. If it turns off then we close the eventbus reader, - * and if it turns on, then we create a new eventbus reader. - * - * @param obj ignored - */ - @Override - public void innerProcess(Object obj) { - boolean interrupted = false; - - Preconditions.checkNotNull("The environment is not set.", environment); - - int previousEventBusReaderEnabledAvailability = 0; - while (stageDriver.getState() == StageDriver.State.RUNNING) { - int eventBusReaderEnabledAvailability = - searchDecider.getAvailability(eventBusReaderEnabledDeciderKey); - if (previousEventBusReaderEnabledAvailability != eventBusReaderEnabledAvailability) { - LOG.info("EventBusReaderStage availability decider changed from {} to {}.", - previousEventBusReaderEnabledAvailability, eventBusReaderEnabledAvailability); - - // If the availability is 0 then disable the reader, otherwise read from EventBus. 
- if (eventBusReaderEnabledAvailability == 0) { - try { - closeEventBusSubscriber(); - } catch (Exception e) { - LOG.warn("Exception while closing eventbus subscriber", e); - } - } else { - startUpEventBusSubscriber(); - } - } - previousEventBusReaderEnabledAvailability = eventBusReaderEnabledAvailability; - - try { - clock.waitFor(TimeUnit.SECONDS.toMillis(DECIDER_POLL_INTERVAL_IN_SECS)); - } catch (InterruptedException e) { - interrupted = true; - } - } - LOG.info("StageDriver is not RUNNING anymore, closing EventBus subscriber"); - try { - closeEventBusSubscriber(); - } catch (InterruptedException e) { - interrupted = true; - } catch (Exception e) { - LOG.warn("Exception while closing eventbus subscriber", e); - } finally { - if (interrupted) { - Thread.currentThread().interrupt(); - } - } - } - - // This is needed to set the value from XML config. - public void setEventBusSubscriberId(String eventBusSubscriberId) { - this.eventBusSubscriberId = eventBusSubscriberId; - LOG.info("EventBusReaderStage with eventBusSubscriberId: {}", eventBusSubscriberId); - } - - // This is needed to set the value from XML config. - public void setEnvironment(String environment) { - this.environment = environment; - LOG.info("Ingester is running in {}", environment); - } - - // This is needed to set the value from XML config. 
- public void setMaxConcurrentEvents(int maxConcurrentEvents) { - this.maxConcurrentEvents = maxConcurrentEvents; - } - - @VisibleForTesting - public void setStageDriver(StageDriver stageDriver) { - this.stageDriver = stageDriver; - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/FieldStatExporter.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/FieldStatExporter.docx new file mode 100644 index 000000000..7689a9944 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/FieldStatExporter.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/FieldStatExporter.java b/src/java/com/twitter/search/ingester/pipeline/twitter/FieldStatExporter.java deleted file mode 100644 index 10ea03c29..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/FieldStatExporter.java +++ /dev/null @@ -1,150 +0,0 @@ -package com.twitter.search.ingester.pipeline.twitter; - -import java.util.List; -import java.util.Set; - -import com.google.common.base.Preconditions; -import com.google.common.collect.HashBasedTable; -import com.google.common.collect.Sets; -import com.google.common.collect.Table; - -import com.twitter.common_internal.text.version.PenguinVersion; -import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.common.schema.SchemaBuilder; -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.common.schema.earlybird.EarlybirdEncodedFeatures; -import com.twitter.search.common.schema.earlybird.EarlybirdEncodedFeaturesUtil; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant; -import com.twitter.search.common.schema.thriftjava.ThriftField; -import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent; - -/** - * This class 
exports counts of fields that are present on processed tweets. It is used to ensure - * that we are not missing important fields. It is not threadsafe. - */ -public class FieldStatExporter { - private static final String STAT_FORMAT = "%s_penguin_%d_documents_with_field_%s"; - private static final String UNKNOWN_FIELD = "%s_penguin_%d_documents_with_unknown_field_%d"; - private final String statPrefix; - private final Schema schema; - private final Table fieldCounters - = HashBasedTable.create(); - private final Set encodedTweetFeaturesFields; - private final Set extendedEncodedTweetFeaturesFields; - - private List penguinVersions; - - FieldStatExporter(String statPrefix, Schema schema, List penguinVersions) { - this.statPrefix = statPrefix; - this.schema = schema; - this.penguinVersions = penguinVersions; - this.encodedTweetFeaturesFields = - getEncodedTweetFeaturesFields(EarlybirdFieldConstant.ENCODED_TWEET_FEATURES_FIELD); - this.extendedEncodedTweetFeaturesFields = - getEncodedTweetFeaturesFields(EarlybirdFieldConstant.EXTENDED_ENCODED_TWEET_FEATURES_FIELD); - - for (PenguinVersion version : penguinVersions) { - for (Schema.FieldInfo info : schema.getFieldInfos()) { - String name = - String.format(STAT_FORMAT, statPrefix, version.getByteValue(), info.getName()); - SearchRateCounter counter = SearchRateCounter.export(name); - fieldCounters.put(version, info.getFieldId(), counter); - } - } - } - - /** - * Exports stats counting the number of fields that are present on each document. - */ - public void addFieldStats(ThriftVersionedEvents event) { - for (PenguinVersion penguinVersion : penguinVersions) { - byte version = penguinVersion.getByteValue(); - ThriftIndexingEvent indexingEvent = event.getVersionedEvents().get(version); - Preconditions.checkNotNull(indexingEvent); - - // We only want to count each field once per tweet. 
- Set seenFields = Sets.newHashSet(); - for (ThriftField field : indexingEvent.getDocument().getFields()) { - int fieldId = field.getFieldConfigId(); - if (seenFields.add(fieldId)) { - if (fieldId == EarlybirdFieldConstant.ENCODED_TWEET_FEATURES_FIELD.getFieldId()) { - exportEncodedFeaturesStats(EarlybirdFieldConstant.ENCODED_TWEET_FEATURES_FIELD, - encodedTweetFeaturesFields, - penguinVersion, - field); - } else if (fieldId - == EarlybirdFieldConstant.EXTENDED_ENCODED_TWEET_FEATURES_FIELD.getFieldId()) { - exportEncodedFeaturesStats(EarlybirdFieldConstant.EXTENDED_ENCODED_TWEET_FEATURES_FIELD, - extendedEncodedTweetFeaturesFields, - penguinVersion, - field); - } else if (isFeatureField(field)) { - updateCounterForFeatureField( - field.getFieldConfigId(), field.getFieldData().getIntValue(), penguinVersion); - } else { - SearchRateCounter counter = fieldCounters.get(penguinVersion, fieldId); - if (counter == null) { - counter = SearchRateCounter.export( - String.format(UNKNOWN_FIELD, statPrefix, version, fieldId)); - fieldCounters.put(penguinVersion, fieldId, counter); - } - counter.increment(); - } - } - } - } - } - - private boolean isFeatureField(ThriftField field) { - String fieldName = - EarlybirdFieldConstants.getFieldConstant(field.getFieldConfigId()).getFieldName(); - return fieldName.startsWith(EarlybirdFieldConstants.ENCODED_TWEET_FEATURES_FIELD_NAME - + SchemaBuilder.CSF_VIEW_NAME_SEPARATOR) - || fieldName.startsWith(EarlybirdFieldConstants.EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME - + SchemaBuilder.CSF_VIEW_NAME_SEPARATOR); - } - - private Set getEncodedTweetFeaturesFields( - EarlybirdFieldConstant featuresField) { - Set schemaFeatureFields = Sets.newHashSet(); - String baseFieldNamePrefix = - featuresField.getFieldName() + SchemaBuilder.CSF_VIEW_NAME_SEPARATOR; - for (EarlybirdFieldConstant field : EarlybirdFieldConstant.values()) { - if (field.getFieldName().startsWith(baseFieldNamePrefix)) { - schemaFeatureFields.add(field); - } - } - return 
schemaFeatureFields; - } - - private void exportEncodedFeaturesStats(EarlybirdFieldConstant featuresField, - Set schemaFeatureFields, - PenguinVersion penguinVersion, - ThriftField thriftField) { - byte[] encodedFeaturesBytes = thriftField.getFieldData().getBytesValue(); - EarlybirdEncodedFeatures encodedTweetFeatures = EarlybirdEncodedFeaturesUtil.fromBytes( - schema.getSchemaSnapshot(), featuresField, encodedFeaturesBytes, 0); - for (EarlybirdFieldConstant field : schemaFeatureFields) { - updateCounterForFeatureField( - field.getFieldId(), encodedTweetFeatures.getFeatureValue(field), penguinVersion); - } - } - - private void updateCounterForFeatureField(int fieldId, int value, PenguinVersion penguinVersion) { - if (value != 0) { - SearchRateCounter counter = fieldCounters.get(penguinVersion, fieldId); - if (counter == null) { - counter = SearchRateCounter.export( - String.format(UNKNOWN_FIELD, statPrefix, penguinVersion, fieldId)); - fieldCounters.put(penguinVersion, fieldId, counter); - } - counter.increment(); - } - } - - public void updatePenguinVersions(List updatedPenguinVersions) { - penguinVersions = updatedPenguinVersions; - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/FilterEventsBySafetyTypeStage.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/FilterEventsBySafetyTypeStage.docx new file mode 100644 index 000000000..514d9d2b9 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/FilterEventsBySafetyTypeStage.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/FilterEventsBySafetyTypeStage.java b/src/java/com/twitter/search/ingester/pipeline/twitter/FilterEventsBySafetyTypeStage.java deleted file mode 100644 index 2f8ba9928..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/FilterEventsBySafetyTypeStage.java +++ /dev/null @@ -1,279 +0,0 @@ -package com.twitter.search.ingester.pipeline.twitter; - -import java.util.Map; -import 
java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.TimeUnit; -import javax.annotation.Nonnull; - -import com.google.common.annotations.VisibleForTesting; - -import org.apache.commons.pipeline.StageException; -import org.apache.commons.pipeline.validation.ConsumedTypes; -import org.apache.commons.pipeline.validation.ProducedTypes; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchDelayStats; -import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser; -import com.twitter.search.ingester.model.IngesterTweetEvent; -import com.twitter.search.ingester.pipeline.util.PipelineStageRuntimeException; -import com.twitter.tweetypie.thriftjava.Tweet; -import com.twitter.tweetypie.thriftjava.TweetCreateEvent; -import com.twitter.tweetypie.thriftjava.TweetEvent; -import com.twitter.tweetypie.thriftjava.TweetEventData; -import com.twitter.tweetypie.thriftjava.TweetEventFlags; - -/** - * Only lets through the create events that match the specified safety type. - * Also lets through all delete events. 
- */ -@ConsumedTypes(IngesterTweetEvent.class) -@ProducedTypes(IngesterTweetEvent.class) -public class FilterEventsBySafetyTypeStage extends TwitterBaseStage - { - private static final Logger LOG = LoggerFactory.getLogger(FilterEventsBySafetyTypeStage.class); - - private SearchCounter totalEventsCount; - private SearchCounter createEventsCount; - private SearchCounter createPublicEventsCount; - private SearchCounter createProtectedEventsCount; - private SearchCounter createRestrictedEventsCount; - private SearchCounter createInvalidSafetyTypeCount; - private SearchCounter deleteEventsCount; - private SearchCounter deletePublicEventsCount; - private SearchCounter deleteProtectedEventsCount; - private SearchCounter deleteRestrictedEventsCount; - private SearchCounter deleteInvalidSafetyTypeCount; - private SearchCounter otherEventsCount; - - private SearchDelayStats tweetCreateDelayStats; - - private long tweetCreateLatencyLogThresholdMillis = -1; - private SafetyType safetyType = null; - private Map> invalidSafetyTypeByEventTypeStatMap = - new ConcurrentHashMap<>(); - - public FilterEventsBySafetyTypeStage() { } - - public FilterEventsBySafetyTypeStage(String safetyType, long tweetCreateLatencyThresholdMillis) { - setSafetyType(safetyType); - this.tweetCreateLatencyLogThresholdMillis = tweetCreateLatencyThresholdMillis; - } - - /** - * To be called by XML config. Can be made private after we delete ACP code. 
- */ - public void setSafetyType(@Nonnull String safetyTypeString) { - this.safetyType = SafetyType.valueOf(safetyTypeString); - if (this.safetyType == SafetyType.INVALID) { - throw new UnsupportedOperationException( - "Can't create a stage that permits 'INVALID' safetytypes"); - } - } - - @Override - protected void initStats() { - super.initStats(); - innerSetupStats(); - } - - @Override - protected void innerSetupStats() { - totalEventsCount = SearchCounter.export(getStageNamePrefix() + "_total_events_count"); - createEventsCount = SearchCounter.export(getStageNamePrefix() + "_create_events_count"); - createPublicEventsCount = - SearchCounter.export(getStageNamePrefix() + "_create_public_events_count"); - createProtectedEventsCount = - SearchCounter.export(getStageNamePrefix() + "_create_protected_events_count"); - createRestrictedEventsCount = - SearchCounter.export(getStageNamePrefix() + "_create_restricted_events_count"); - createInvalidSafetyTypeCount = - SearchCounter.export(getStageNamePrefix() + "_create_missing_or_unknown_safetytype"); - deleteEventsCount = - SearchCounter.export(getStageNamePrefix() + "_delete_events_count"); - deletePublicEventsCount = - SearchCounter.export(getStageNamePrefix() + "_delete_public_events_count"); - deleteProtectedEventsCount = - SearchCounter.export(getStageNamePrefix() + "_delete_protected_events_count"); - deleteRestrictedEventsCount = - SearchCounter.export(getStageNamePrefix() + "_delete_restricted_events_count"); - deleteInvalidSafetyTypeCount = - SearchCounter.export(getStageNamePrefix() + "_delete_missing_or_unknown_safetytype"); - otherEventsCount = - SearchCounter.export(getStageNamePrefix() + "_other_events_count"); - - tweetCreateDelayStats = SearchDelayStats.export( - "create_histogram_" + getStageNamePrefix(), 90, - TimeUnit.SECONDS, TimeUnit.MILLISECONDS); - } - - @Override - public void innerProcess(Object obj) throws StageException { - if (obj instanceof IngesterTweetEvent) { - IngesterTweetEvent 
tweetEvent = (IngesterTweetEvent) obj; - if (tryToRecordCreateLatency(tweetEvent)) { - emitAndCount(tweetEvent); - } - } else { - throw new StageException(this, "Object is not a IngesterTweetEvent: " + obj); - } - } - - @Override - protected IngesterTweetEvent innerRunStageV2(IngesterTweetEvent tweetEvent) { - if (!tryToRecordCreateLatency(tweetEvent)) { - throw new PipelineStageRuntimeException("Event does not have to pass to the next stage."); - } - return tweetEvent; - } - - private boolean tryToRecordCreateLatency(IngesterTweetEvent tweetEvent) { - incrementCounters(tweetEvent); - boolean shouldEmit = shouldEmit(tweetEvent); - if (shouldEmit) { - if (isCreateEvent(tweetEvent.getData())) { - recordCreateLatency(tweetEvent.getData().getTweet_create_event()); - } - } - return shouldEmit; - } - - private void incrementCounters(@Nonnull TweetEvent tweetEvent) { - totalEventsCount.increment(); - SafetyType eventSafetyType = getEventSafetyType(tweetEvent); - - if (isCreateEvent(tweetEvent.getData())) { - createEventsCount.increment(); - switch (eventSafetyType) { - case PUBLIC: - createPublicEventsCount.increment(); - break; - case PROTECTED: - createProtectedEventsCount.increment(); - break; - case RESTRICTED: - createRestrictedEventsCount.increment(); - break; - default: - createInvalidSafetyTypeCount.increment(); - incrementInvalidSafetyTypeStatMap(tweetEvent, "create"); - } - } else if (isDeleteEvent(tweetEvent.getData())) { - deleteEventsCount.increment(); - switch (eventSafetyType) { - case PUBLIC: - deletePublicEventsCount.increment(); - break; - case PROTECTED: - deleteProtectedEventsCount.increment(); - break; - case RESTRICTED: - deleteRestrictedEventsCount.increment(); - break; - default: - deleteInvalidSafetyTypeCount.increment(); - incrementInvalidSafetyTypeStatMap(tweetEvent, "delete"); - } - } else { - otherEventsCount.increment(); - } - } - - private void incrementInvalidSafetyTypeStatMap(TweetEvent tweetEvent, String eventType) { - 
com.twitter.tweetypie.thriftjava.SafetyType thriftSafetyType = - tweetEvent.getFlags().getSafety_type(); - String safetyTypeString = - thriftSafetyType == null ? "null" : thriftSafetyType.toString().toLowerCase(); - invalidSafetyTypeByEventTypeStatMap.putIfAbsent(eventType, new ConcurrentHashMap<>()); - SearchCounter stat = invalidSafetyTypeByEventTypeStatMap.get(eventType).computeIfAbsent( - safetyTypeString, - safetyTypeStr -> SearchCounter.export( - getStageNamePrefix() - + String.format("_%s_missing_or_unknown_safetytype_%s", - eventType, safetyTypeStr))); - stat.increment(); - } - - @VisibleForTesting - boolean shouldEmit(@Nonnull TweetEvent tweetEvent) { - // Do not emit any undelete events. - if (isUndeleteEvent(tweetEvent.getData())) { - return false; - } - - SafetyType eventSafetyType = getEventSafetyType(tweetEvent); - // Custom logic for REALTIME_CG cluster - if (safetyType == SafetyType.PUBLIC_OR_PROTECTED) { - return eventSafetyType == SafetyType.PUBLIC || eventSafetyType == SafetyType.PROTECTED; - } else { - return eventSafetyType == safetyType; - } - } - - private SafetyType getEventSafetyType(@Nonnull TweetEvent tweetEvent) { - TweetEventFlags tweetEventFlags = tweetEvent.getFlags(); - return SafetyType.fromThriftSafetyType(tweetEventFlags.getSafety_type()); - } - - private boolean isCreateEvent(@Nonnull TweetEventData tweetEventData) { - return tweetEventData.isSet(TweetEventData._Fields.TWEET_CREATE_EVENT); - } - - private boolean isDeleteEvent(@Nonnull TweetEventData tweetEventData) { - return tweetEventData.isSet(TweetEventData._Fields.TWEET_DELETE_EVENT); - } - - private boolean isUndeleteEvent(@Nonnull TweetEventData tweetEventData) { - return tweetEventData.isSet(TweetEventData._Fields.TWEET_UNDELETE_EVENT); - } - - private void recordCreateLatency(TweetCreateEvent tweetCreateEvent) { - Tweet tweet = tweetCreateEvent.getTweet(); - if (tweet != null) { - long tweetCreateLatency = - clock.nowMillis() - 
SnowflakeIdParser.getTimestampFromTweetId(tweet.getId()); - tweetCreateDelayStats.recordLatency(tweetCreateLatency, TimeUnit.MILLISECONDS); - if (tweetCreateLatency < 0) { - LOG.warn("Received a tweet created in the future: {}", tweet); - } else if (tweetCreateLatencyLogThresholdMillis > 0 - && tweetCreateLatency > tweetCreateLatencyLogThresholdMillis) { - LOG.debug("Found late incoming tweet: {}. Create latency: {}ms. Tweet: {}", - tweet.getId(), tweetCreateLatency, tweet); - } - } - } - - public void setTweetCreateLatencyLogThresholdMillis(long tweetCreateLatencyLogThresholdMillis) { - LOG.info("Setting tweetCreateLatencyLogThresholdMillis to {}.", - tweetCreateLatencyLogThresholdMillis); - this.tweetCreateLatencyLogThresholdMillis = tweetCreateLatencyLogThresholdMillis; - } - - public enum SafetyType { - PUBLIC, - PROTECTED, - RESTRICTED, - PUBLIC_OR_PROTECTED, - INVALID; - - /** Converts a tweetypie SafetyType instance to an instance of this enum. */ - @Nonnull - public static SafetyType fromThriftSafetyType( - com.twitter.tweetypie.thriftjava.SafetyType safetyType) { - if (safetyType == null) { - return INVALID; - } - switch(safetyType) { - case PRIVATE: - return PROTECTED; - case PUBLIC: - return PUBLIC; - case RESTRICTED: - return RESTRICTED; - default: - return INVALID; - } - } - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/FilterRetweetsAndRepliesStage.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/FilterRetweetsAndRepliesStage.docx new file mode 100644 index 000000000..e0d7bdb46 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/FilterRetweetsAndRepliesStage.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/FilterRetweetsAndRepliesStage.java b/src/java/com/twitter/search/ingester/pipeline/twitter/FilterRetweetsAndRepliesStage.java deleted file mode 100644 index 7da7178ba..000000000 --- 
a/src/java/com/twitter/search/ingester/pipeline/twitter/FilterRetweetsAndRepliesStage.java +++ /dev/null @@ -1,79 +0,0 @@ -package com.twitter.search.ingester.pipeline.twitter; - -import org.apache.commons.pipeline.StageException; -import org.apache.commons.pipeline.validation.ConsumedTypes; -import org.apache.commons.pipeline.validation.ProducedTypes; - -import com.twitter.search.common.decider.DeciderUtil; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.ingester.model.IngesterTwitterMessage; -import com.twitter.search.ingester.pipeline.util.PipelineStageRuntimeException; - -/** - * Filters out tweets that are not retweets or replies. - */ -@ConsumedTypes(IngesterTwitterMessage.class) -@ProducedTypes(IngesterTwitterMessage.class) -public class FilterRetweetsAndRepliesStage extends TwitterBaseStage - { - private static final String EMIT_RETWEET_AND_REPLY_ENGAGEMENTS_DECIDER_KEY = - "ingester_realtime_emit_retweet_and_reply_engagements"; - - private SearchRateCounter filteredRetweetsCount; - private SearchRateCounter filteredRepliesToTweetsCount; - private SearchRateCounter incomingRetweetsAndRepliesToTweetsCount; - - @Override - public void initStats() { - super.initStats(); - innerSetupStats(); - } - - @Override - protected void innerSetupStats() { - filteredRetweetsCount = - SearchRateCounter.export(getStageNamePrefix() + "_filtered_retweets_count"); - filteredRepliesToTweetsCount = - SearchRateCounter.export(getStageNamePrefix() + "_filtered_replies_to_tweets_count"); - incomingRetweetsAndRepliesToTweetsCount = - SearchRateCounter.export( - getStageNamePrefix() + "_incoming_retweets_and_replies_to_tweets_count"); - } - - @Override - public void innerProcess(Object obj) throws StageException { - if (!(obj instanceof IngesterTwitterMessage)) { - throw new StageException(this, "Object is not an IngesterTwitterMessage: " + obj); - } - - IngesterTwitterMessage status = (IngesterTwitterMessage) obj; - if (tryToFilter(status)) { 
- emitAndCount(status); - } - } - - @Override - public IngesterTwitterMessage runStageV2(IngesterTwitterMessage message) { - if (!tryToFilter(message)) { - throw new PipelineStageRuntimeException("Does not have to pass to the next stage."); - } - return message; - } - - private boolean tryToFilter(IngesterTwitterMessage status) { - boolean shouldEmit = false; - if (status.isRetweet() || status.isReplyToTweet()) { - incomingRetweetsAndRepliesToTweetsCount.increment(); - if (DeciderUtil.isAvailableForRandomRecipient( - decider, EMIT_RETWEET_AND_REPLY_ENGAGEMENTS_DECIDER_KEY)) { - if (status.isRetweet()) { - filteredRetweetsCount.increment(); - } else { - filteredRepliesToTweetsCount.increment(); - } - shouldEmit = true; - } - } - return shouldEmit; - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/FilterTwitterMessageStage.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/FilterTwitterMessageStage.docx new file mode 100644 index 000000000..6233dab58 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/FilterTwitterMessageStage.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/FilterTwitterMessageStage.java b/src/java/com/twitter/search/ingester/pipeline/twitter/FilterTwitterMessageStage.java deleted file mode 100644 index 61a52c200..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/FilterTwitterMessageStage.java +++ /dev/null @@ -1,77 +0,0 @@ -package com.twitter.search.ingester.pipeline.twitter; - -import org.apache.commons.pipeline.StageException; -import org.apache.commons.pipeline.validation.ConsumedTypes; -import org.apache.commons.pipeline.validation.ProducesConsumed; - -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.common.relevance.entities.TwitterMessage; -import com.twitter.search.ingester.pipeline.twitter.filters.IngesterValidMessageFilter; -import 
com.twitter.search.ingester.pipeline.util.PipelineStageRuntimeException; - -/** - * Filter out Twitter messages meeting some filtering rule. - */ -@ConsumedTypes(TwitterMessage.class) -@ProducesConsumed -public class FilterTwitterMessageStage extends TwitterBaseStage - { - private IngesterValidMessageFilter filter = null; - private SearchRateCounter validMessages; - private SearchRateCounter invalidMessages; - - @Override - protected void initStats() { - super.initStats(); - innerSetupStats(); - } - - @Override - protected void innerSetupStats() { - validMessages = SearchRateCounter.export(getStageNamePrefix() + "_valid_messages"); - invalidMessages = SearchRateCounter.export(getStageNamePrefix() + "_filtered_messages"); - } - - @Override - protected void doInnerPreprocess() { - innerSetup(); - } - - @Override - protected void innerSetup() { - filter = new IngesterValidMessageFilter(decider); - } - - @Override - public void innerProcess(Object obj) throws StageException { - if (!(obj instanceof TwitterMessage)) { - throw new StageException(this, "Object is not a IngesterTwitterMessage: " - + obj); - } - - TwitterMessage message = (TwitterMessage) obj; - if (tryToFilter(message)) { - emitAndCount(message); - } - } - - @Override - protected TwitterMessage innerRunStageV2(TwitterMessage message) { - if (!tryToFilter(message)) { - throw new PipelineStageRuntimeException("Failed to filter, does not have to " - + "pass to the next stage"); - } - return message; - } - - private boolean tryToFilter(TwitterMessage message) { - boolean ableToFilter = false; - if (message != null && filter.accepts(message)) { - validMessages.increment(); - ableToFilter = true; - } else { - invalidMessages.increment(); - } - return ableToFilter; - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/LookupUserPropertiesBatchedStage.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/LookupUserPropertiesBatchedStage.docx new file mode 100644 index 
000000000..cc961b771 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/LookupUserPropertiesBatchedStage.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/LookupUserPropertiesBatchedStage.java b/src/java/com/twitter/search/ingester/pipeline/twitter/LookupUserPropertiesBatchedStage.java deleted file mode 100644 index 9e0184d6f..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/LookupUserPropertiesBatchedStage.java +++ /dev/null @@ -1,60 +0,0 @@ -package com.twitter.search.ingester.pipeline.twitter; - -import java.util.Collection; -import javax.naming.NamingException; - -import org.apache.commons.pipeline.StageException; -import org.apache.commons.pipeline.validation.ConsumedTypes; -import org.apache.commons.pipeline.validation.ProducesConsumed; - -import com.twitter.search.ingester.model.IngesterTwitterMessage; -import com.twitter.search.ingester.pipeline.util.BatchedElement; -import com.twitter.search.ingester.pipeline.util.PipelineStageException; -import com.twitter.search.ingester.pipeline.util.UserPropertiesManager; -import com.twitter.util.Future; - -@ConsumedTypes(IngesterTwitterMessage.class) -@ProducesConsumed -public class LookupUserPropertiesBatchedStage extends TwitterBatchedBaseStage - { - - protected UserPropertiesManager userPropertiesManager; - - @Override - protected Class getQueueObjectType() { - return IngesterTwitterMessage.class; - } - - @Override - protected Future> innerProcessBatch(Collection> batch) { - Collection batchedElements = extractOnlyElementsFromBatch(batch); - return userPropertiesManager.populateUserProperties(batchedElements); - } - - @Override - protected boolean needsToBeBatched(IngesterTwitterMessage element) { - return true; - } - - @Override - protected IngesterTwitterMessage transform(IngesterTwitterMessage element) { - return element; - } - - @Override - public synchronized void doInnerPreprocess() throws StageException, NamingException { - 
super.doInnerPreprocess(); - commonInnerSetup(); - } - - @Override - protected void innerSetup() throws PipelineStageException, NamingException { - super.innerSetup(); - commonInnerSetup(); - } - - private void commonInnerSetup() throws NamingException { - userPropertiesManager = new UserPropertiesManager(wireModule.getMetastoreClient()); - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/NamedEntityHandler.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/NamedEntityHandler.docx new file mode 100644 index 000000000..60080166f Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/NamedEntityHandler.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/NamedEntityHandler.java b/src/java/com/twitter/search/ingester/pipeline/twitter/NamedEntityHandler.java deleted file mode 100644 index 617b8183c..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/NamedEntityHandler.java +++ /dev/null @@ -1,101 +0,0 @@ -package com.twitter.search.ingester.pipeline.twitter; - -import java.util.Set; - -import scala.Option; - -import com.google.common.collect.ImmutableSet; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.cuad.ner.plain.thriftjava.NamedEntities; -import com.twitter.cuad.ner.plain.thriftjava.NamedEntity; -import com.twitter.decider.Decider; -import com.twitter.search.common.decider.DeciderUtil; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.ingester.model.IngesterTwitterMessage; -import com.twitter.search.ingester.pipeline.strato_fetchers.NamedEntityFetcher; -import com.twitter.search.ingester.pipeline.util.IngesterStageTimer; -import com.twitter.strato.catalog.Fetch; -import com.twitter.util.Future; - -/** - * Handles the retrieval and population of named entities in TwitterMessages performed - * by ingesters. 
- */ -class NamedEntityHandler { - private static final Logger LOG = LoggerFactory.getLogger(NamedEntityHandler.class); - - private static final String RETRIEVE_NAMED_ENTITIES_DECIDER_KEY = - "ingester_all_retrieve_named_entities_%s"; - - // Named entities are only extracted in English, Spanish, and Japanese - private static final Set NAMED_ENTITY_LANGUAGES = ImmutableSet.of("en", "es", "ja"); - - private final NamedEntityFetcher namedEntityFetcher; - private final Decider decider; - private final String deciderKey; - - private SearchRateCounter lookupStat; - private SearchRateCounter successStat; - private SearchRateCounter namedEntityCountStat; - private SearchRateCounter errorStat; - private SearchRateCounter emptyResponseStat; - private SearchRateCounter deciderSkippedStat; - private IngesterStageTimer retrieveNamedEntitiesTimer; - - NamedEntityHandler( - NamedEntityFetcher namedEntityFetcher, Decider decider, String statsPrefix, - String deciderSuffix) { - this.namedEntityFetcher = namedEntityFetcher; - this.decider = decider; - this.deciderKey = String.format(RETRIEVE_NAMED_ENTITIES_DECIDER_KEY, deciderSuffix); - - lookupStat = SearchRateCounter.export(statsPrefix + "_lookups"); - successStat = SearchRateCounter.export(statsPrefix + "_success"); - namedEntityCountStat = SearchRateCounter.export(statsPrefix + "_named_entity_count"); - errorStat = SearchRateCounter.export(statsPrefix + "_error"); - emptyResponseStat = SearchRateCounter.export(statsPrefix + "_empty_response"); - deciderSkippedStat = SearchRateCounter.export(statsPrefix + "_decider_skipped"); - retrieveNamedEntitiesTimer = new IngesterStageTimer(statsPrefix + "_request_timer"); - } - - Future> retrieve(IngesterTwitterMessage message) { - lookupStat.increment(); - return namedEntityFetcher.fetch(message.getTweetId()); - } - - void addEntitiesToMessage(IngesterTwitterMessage message, Fetch.Result result) { - retrieveNamedEntitiesTimer.start(); - Option response = result.v(); - if 
(response.isDefined()) { - successStat.increment(); - for (NamedEntity namedEntity : response.get().getEntities()) { - namedEntityCountStat.increment(); - message.addNamedEntity(namedEntity); - } - } else { - emptyResponseStat.increment(); - LOG.debug("Empty NERResponse for named entity query on tweet {}", message.getId()); - } - retrieveNamedEntitiesTimer.stop(); - } - - void incrementErrorCount() { - errorStat.increment(); - } - - boolean shouldRetrieve(IngesterTwitterMessage message) { - // Use decider to control retrieval of named entities. This allows us to shut off retrieval - // if it causes problems. - if (!DeciderUtil.isAvailableForRandomRecipient(decider, deciderKey)) { - deciderSkippedStat.increment(); - return false; - } - - // Named entities are only extracted in certain languages, so we can skip tweets - // in other languages - return NAMED_ENTITY_LANGUAGES.contains(message.getLanguage()); - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/PopulateCodedLocationsBatchedStage.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/PopulateCodedLocationsBatchedStage.docx new file mode 100644 index 000000000..6527a8d28 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/PopulateCodedLocationsBatchedStage.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/PopulateCodedLocationsBatchedStage.java b/src/java/com/twitter/search/ingester/pipeline/twitter/PopulateCodedLocationsBatchedStage.java deleted file mode 100644 index 89bb803f5..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/PopulateCodedLocationsBatchedStage.java +++ /dev/null @@ -1,79 +0,0 @@ -package com.twitter.search.ingester.pipeline.twitter; - -import java.util.Collection; -import javax.naming.NamingException; - -import org.apache.commons.pipeline.StageException; -import org.apache.commons.pipeline.validation.ConsumedTypes; -import org.apache.commons.pipeline.validation.ProducesConsumed; - 
-import com.twitter.search.ingester.model.IngesterTwitterMessage; -import com.twitter.search.ingester.pipeline.util.BatchedElement; -import com.twitter.search.ingester.pipeline.util.ManhattanCodedLocationProvider; -import com.twitter.search.ingester.pipeline.util.PipelineStageException; -import com.twitter.util.Future; - -/** - * Read-only stage for looking up location info and populating it onto messages. - */ -@ConsumedTypes(IngesterTwitterMessage.class) -@ProducesConsumed -public final class PopulateCodedLocationsBatchedStage - extends TwitterBatchedBaseStage { - private static final String GEOCODE_DATASET_NAME = "ingester_geocode_profile_location"; - - private ManhattanCodedLocationProvider manhattanCodedLocationProvider = null; - - /** - * Require lat/lon from TwitterMessage instead of lookup from coded_locations, - * do not batch sql, and simply emit messages passed in with regions populated on them - * rather than emitting to indexing queues. - */ - @Override - protected void doInnerPreprocess() throws StageException, NamingException { - super.doInnerPreprocess(); - commonInnerSetup(); - } - - @Override - protected void innerSetup() throws PipelineStageException, NamingException { - super.innerSetup(); - commonInnerSetup(); - } - - private void commonInnerSetup() throws NamingException { - this.manhattanCodedLocationProvider = ManhattanCodedLocationProvider.createWithEndpoint( - wireModule.getJavaManhattanKVEndpoint(), - getStageNamePrefix(), - GEOCODE_DATASET_NAME); - } - - @Override - public void initStats() { - super.initStats(); - } - - @Override - protected Class getQueueObjectType() { - return IngesterTwitterMessage.class; - } - - @Override - protected Future> innerProcessBatch(Collection> batch) { - - Collection batchedElements = extractOnlyElementsFromBatch(batch); - return manhattanCodedLocationProvider.populateCodedLatLon(batchedElements); - } - - @Override - protected boolean needsToBeBatched(IngesterTwitterMessage message) { - return 
!message.hasGeoLocation() && (message.getLocation() != null) - && !message.getLocation().isEmpty(); - } - - @Override - protected IngesterTwitterMessage transform(IngesterTwitterMessage element) { - return element; - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/ResolveCompressedUrlsBatchedStage.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/ResolveCompressedUrlsBatchedStage.docx new file mode 100644 index 000000000..d29d0acd9 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/ResolveCompressedUrlsBatchedStage.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/ResolveCompressedUrlsBatchedStage.java b/src/java/com/twitter/search/ingester/pipeline/twitter/ResolveCompressedUrlsBatchedStage.java deleted file mode 100644 index 3bf2ebe7f..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/ResolveCompressedUrlsBatchedStage.java +++ /dev/null @@ -1,387 +0,0 @@ -package com.twitter.search.ingester.pipeline.twitter; - -import java.net.URI; -import java.net.URISyntaxException; -import java.util.Collection; -import java.util.Collections; -import java.util.HashSet; -import java.util.Locale; -import java.util.Map; -import java.util.Set; -import javax.naming.NamingException; - -import com.google.common.base.Preconditions; -import com.google.common.collect.Maps; -import com.google.common.collect.Sets; - -import org.apache.commons.lang.StringUtils; -import org.apache.commons.pipeline.StageException; -import org.apache.commons.pipeline.validation.ConsumedTypes; -import org.apache.commons.pipeline.validation.ProducesConsumed; - -import com.twitter.common.text.language.LocaleUtil; -import com.twitter.search.common.decider.SearchDecider; -import com.twitter.search.common.indexing.thriftjava.ThriftExpandedUrl; -import com.twitter.search.common.metrics.Percentile; -import com.twitter.search.common.metrics.PercentileUtil; -import 
/**
 * Batched pipeline stage that resolves shortened/compressed URLs on tweets through the Pink
 * (PinkFloyd) URL resolution service. Tweets whose URLs all resolved are passed downstream;
 * tweets with unresolved URLs are re-enqueued and retried until they exceed a configured
 * maximum age, at which point they are passed downstream anyway.
 */
@ConsumedTypes(IngesterTwitterMessage.class)
@ProducesConsumed
public class ResolveCompressedUrlsBatchedStage
    extends TwitterBatchedBaseStage<IngesterTwitterMessage, IngesterTwitterMessage> {

  // Pink request tuning: per-request timeout and retry count used when building the resolver.
  private static final int PINK_REQUEST_TIMEOUT_MILLIS = 500;
  private static final int PINK_REQUEST_RETRIES = 2;
  private static final String PINK_REQUESTS_BATCH_SIZE_DECIDER_KEY = "pink_requests_batch_size";

  private AsyncPinkUrlsResolver urlResolver;
  // Percentage of tweets (keyed on tweet id modulo 100) whose URLs we attempt to resolve.
  private int resolveUrlPercentage = 100;
  private String pinkClientId;
  private SearchDecider searchDecider;

  // The number of URLs that we attempted to resolve.
  // NOTE(review): incremented by batch.size() (tweet count), not URL count — confirm intent.
  private SearchRateCounter linksAttempted;
  // The number of URLs that were successfully resolved.
  private SearchRateCounter linksSucceeded;
  // The number of URLs ignored because they are too long.
  private SearchRateCounter linksTooLong;
  // The number of URLs truncated because they are too long.
  private SearchRateCounter linksTruncated;

  // The number of resolved URLs without a media type.
  private SearchRateCounter urlsWithoutMediaType;
  // The number of resolved URLs with a specific media type.
  private final Map<MediaTypes, SearchRateCounter> urlsWithMediaTypeMap =
      Maps.newEnumMap(MediaTypes.class);

  // The number of tweets for which all URLs were resolved.
  private SearchRateCounter tweetsWithResolvedURLs;
  // The number of tweets for which some URLs were not resolved.
  private SearchRateCounter tweetsWithUnresolvedURLs;

  // How long it takes to fully resolve all URLs in a tweet.
  private Percentile millisToResolveAllTweetURLs;

  // Max age (millis, relative to the tweet's creation date) that a tweet can reach before it
  // is passed down the pipeline even with unresolved URLs.
  private long tweetMaxAgeToResolve;

  // Number of times an element is within quota.
  private SearchRateCounter numberOfElementsWithinQuota;

  // Number of times element is not within quota. If element not within quota, we dont batch.
  private SearchRateCounter numberOfElementsNotWithinQuota;

  // Number of times element has urls.
  private SearchRateCounter numberOfElementsWithUrls;

  // Number of times element does not have urls. If element does not have URL, we dont batch.
  private SearchRateCounter numberOfElementsWithoutUrls;

  // Number of calls to needsToBeBatched method.
  private SearchRateCounter numberOfCallsToNeedsToBeBatched;

  public void setTweetMaxAgeToResolve(long tweetMaxAgeToResolve) {
    this.tweetMaxAgeToResolve = tweetMaxAgeToResolve;
  }

  @Override
  protected Class<IngesterTwitterMessage> getQueueObjectType() {
    return IngesterTwitterMessage.class;
  }

  /**
   * A tweet enters the batched resolution path only when it both falls within the
   * resolve-percentage quota (by tweet id mod 100) AND actually carries expanded URLs.
   * Each outcome is counted separately for observability.
   */
  @Override
  protected boolean needsToBeBatched(IngesterTwitterMessage element) {
    numberOfCallsToNeedsToBeBatched.increment();
    boolean isWithinQuota = (element.getId() % 100) < resolveUrlPercentage;

    if (isWithinQuota) {
      this.numberOfElementsWithinQuota.increment();
    } else {
      this.numberOfElementsNotWithinQuota.increment();
    }

    boolean hasUrls = !element.getExpandedUrlMap().isEmpty();

    if (hasUrls) {
      this.numberOfElementsWithUrls.increment();
    } else {
      this.numberOfElementsWithoutUrls.increment();
    }

    return hasUrls && isWithinQuota;
  }

  // Identity transformation. T and U types are the same.
  @Override
  protected IngesterTwitterMessage transform(IngesterTwitterMessage element) {
    return element;
  }

  @Override
  public void initStats() {
    super.initStats();
    commonInnerSetupStats();
  }

  @Override
  protected void innerSetupStats() {
    super.innerSetupStats();
    commonInnerSetupStats();
  }

  /** Exports all stage counters/percentiles; shared by both stats-lifecycle paths. */
  private void commonInnerSetupStats() {
    linksAttempted = RelevanceStats.exportRate(getStageNamePrefix() + "_num_links_attempted");
    linksSucceeded = RelevanceStats.exportRate(getStageNamePrefix() + "_num_links_succeeded");
    linksTooLong = RelevanceStats.exportRate(getStageNamePrefix() + "_num_links_toolong");
    linksTruncated = RelevanceStats.exportRate(getStageNamePrefix() + "_num_links_truncated");

    urlsWithoutMediaType = RelevanceStats.exportRate(
        getStageNamePrefix() + "_urls_without_media_type");

    for (MediaTypes mediaType : MediaTypes.values()) {
      urlsWithMediaTypeMap.put(
          mediaType,
          RelevanceStats.exportRate(
              getStageNamePrefix() + "_urls_with_media_type_" + mediaType.name().toLowerCase()));
    }

    tweetsWithResolvedURLs = RelevanceStats.exportRate(
        getStageNamePrefix() + "_num_tweets_with_resolved_urls");
    tweetsWithUnresolvedURLs = RelevanceStats.exportRate(
        getStageNamePrefix() + "_num_tweets_with_unresolved_urls");

    millisToResolveAllTweetURLs = PercentileUtil.createPercentile(
        getStageNamePrefix() + "_millis_to_resolve_all_tweet_urls");

    numberOfCallsToNeedsToBeBatched = SearchRateCounter.export(getStageNamePrefix()
        + "_calls_to_needsToBeBatched");

    numberOfElementsWithinQuota = SearchRateCounter.export(getStageNamePrefix()
        + "_is_within_quota");

    numberOfElementsNotWithinQuota = SearchRateCounter.export(getStageNamePrefix()
        + "_is_not_within_quota");

    numberOfElementsWithUrls = SearchRateCounter.export(getStageNamePrefix()
        + "_has_urls");

    numberOfElementsWithoutUrls = SearchRateCounter.export(getStageNamePrefix()
        + "_does_not_have_urls");
  }

  @Override
  protected void doInnerPreprocess() throws StageException, NamingException {
    searchDecider = new SearchDecider(decider);
    // We need to call this after assigning searchDecider because our updateBatchSize function
    // depends on the searchDecider.
    super.doInnerPreprocess();
    commonInnerSetup();
  }

  @Override
  protected void innerSetup() throws PipelineStageException, NamingException {
    searchDecider = new SearchDecider(decider);
    // We need to call this after assigning searchDecider because our updateBatchSize function
    // depends on the searchDecider.
    super.innerSetup();
    commonInnerSetup();
  }

  /** Builds the async Pink resolver; requires pinkClientId to have been injected first. */
  private void commonInnerSetup() throws NamingException {
    Preconditions.checkNotNull(pinkClientId);
    urlResolver = new AsyncPinkUrlsResolver(
        WireModule
            .getWireModule()
            .getStorer(Duration.fromMilliseconds(PINK_REQUEST_TIMEOUT_MILLIS),
                PINK_REQUEST_RETRIES),
        pinkClientId);
  }

  /**
   * Collects the distinct URLs across the batch, refreshes the batch size from the decider,
   * and issues one async resolution request whose result is post-processed per tweet.
   */
  @Override
  protected Future<Collection<IngesterTwitterMessage>> innerProcessBatch(
      Collection<BatchedElement<IngesterTwitterMessage, IngesterTwitterMessage>> batch) {
    // Batch urls: map each distinct URL to the set of tweets containing it.
    Map<String, Set<IngesterTwitterMessage>> urlToTweetsMap = createUrlToTweetMap(batch);

    Set<String> urlsToResolve = urlToTweetsMap.keySet();

    updateBatchSize();

    linksAttempted.increment(batch.size());
    // Do the lookup.
    return urlResolver.resolveUrls(urlsToResolve).map(processResolvedUrlsFunction(batch));
  }

  /** Refreshes the stage batch size from the decider, falling back to the current value. */
  @Override
  protected void updateBatchSize() {
    int decidedBatchSize = searchDecider.featureExists(PINK_REQUESTS_BATCH_SIZE_DECIDER_KEY)
        ? searchDecider.getAvailability(PINK_REQUESTS_BATCH_SIZE_DECIDER_KEY)
        : batchSize;

    setBatchedStageBatchSize(decidedBatchSize);
  }

  /**
   * Builds the post-resolution callback. For each tweet: enriches every resolved URL; if all
   * URLs resolved, or the tweet is older than tweetMaxAgeToResolve, the tweet is emitted;
   * otherwise it is re-enqueued for another attempt.
   */
  private Function<Map<String, ResolveCompressedUrlsUtils.UrlInfo>,
      Collection<IngesterTwitterMessage>>
      processResolvedUrlsFunction(
          Collection<BatchedElement<IngesterTwitterMessage, IngesterTwitterMessage>> batch) {
    return Function.func(resolvedUrls -> {
      linksSucceeded.increment(resolvedUrls.size());

      for (ResolveCompressedUrlsUtils.UrlInfo urlInfo : resolvedUrls.values()) {
        if (urlInfo.mediaType != null) {
          urlsWithMediaTypeMap.get(urlInfo.mediaType).increment();
        } else {
          urlsWithoutMediaType.increment();
        }
      }

      Set<IngesterTwitterMessage> successfulTweets = Sets.newHashSet();

      for (BatchedElement<IngesterTwitterMessage, IngesterTwitterMessage> batchedElement
          : batch) {
        IngesterTwitterMessage message = batchedElement.getItem();
        Set<String> tweetUrls = message.getExpandedUrlMap().keySet();

        int resolvedUrlCounter = 0;

        for (String url : tweetUrls) {
          ResolveCompressedUrlsUtils.UrlInfo urlInfo = resolvedUrls.get(url);

          // If the url didn't resolve move on to the next one, this might trigger a re-enqueue
          // if the tweet is still kind of new. But we want to process the rest for when that
          // is not the case and we are going to end up passing it to the next stage.
          if (urlInfo == null) {
            continue;
          }

          String resolvedUrl = urlInfo.resolvedUrl;
          Locale locale = urlInfo.language == null ? null
              : LocaleUtil.getLocaleOf(urlInfo.language);

          if (StringUtils.isNotBlank(resolvedUrl)) {
            ThriftExpandedUrl expandedUrl = message.getExpandedUrlMap().get(url);
            resolvedUrlCounter += 1;
            enrichTweetWithUrlInfo(message, expandedUrl, urlInfo, locale);
          }
        }
        long tweetMessageAge = clock.nowMillis() - message.getDate().getTime();

        if (resolvedUrlCounter == tweetUrls.size()) {
          // All URLs resolved: record latency and emit downstream.
          millisToResolveAllTweetURLs.record(tweetMessageAge);
          tweetsWithResolvedURLs.increment();
          successfulTweets.add(message);
        } else if (tweetMessageAge > tweetMaxAgeToResolve) {
          // Give up on old tweets: emit downstream with whatever resolved.
          tweetsWithUnresolvedURLs.increment();
          successfulTweets.add(message);
        } else {
          // Re-enqueue if all urls weren't resolved and the tweet is younger than maxAge.
          reEnqueueAndRetry(batchedElement);
        }
      }
      return successfulTweets;
    });
  }

  /** Inverts the batch into a URL -> {tweets containing it} map (returned unmodifiable). */
  private Map<String, Set<IngesterTwitterMessage>> createUrlToTweetMap(
      Collection<BatchedElement<IngesterTwitterMessage, IngesterTwitterMessage>> batch) {
    Map<String, Set<IngesterTwitterMessage>> urlToTweetsMap = Maps.newHashMap();
    for (BatchedElement<IngesterTwitterMessage, IngesterTwitterMessage> batchedElement : batch) {
      IngesterTwitterMessage message = batchedElement.getItem();
      for (String originalUrl : message.getExpandedUrlMap().keySet()) {
        Set<IngesterTwitterMessage> messages = urlToTweetsMap.get(originalUrl);
        if (messages == null) {
          messages = new HashSet<>();
          urlToTweetsMap.put(originalUrl, messages);
        }
        messages.add(message);
      }
    }
    return Collections.unmodifiableMap(urlToTweetsMap);
  }

  /**
   * Copies resolution results (canonical last-hop URL, media type, link category, link
   * locale, description, title) from urlInfo onto the message's expanded URL entry.
   */
  private void enrichTweetWithUrlInfo(IngesterTwitterMessage message,
                                      ThriftExpandedUrl expandedUrl,
                                      ResolveCompressedUrlsUtils.UrlInfo urlInfo,
                                      Locale locale) {
    String truncatedUrl = maybeTruncate(urlInfo.resolvedUrl);
    if (truncatedUrl == null) {
      // URL was too long even after stripping query/fragment; skip enrichment entirely.
      return;
    }

    expandedUrl.setCanonicalLastHopUrl(truncatedUrl);
    if (urlInfo.mediaType != null) {
      // Overwrite url media type with media type from resolved url only if the media type from
      // resolved url is not Unknown.
      if (!expandedUrl.isSetMediaType() || urlInfo.mediaType != MediaTypes.UNKNOWN) {
        expandedUrl.setMediaType(urlInfo.mediaType);
      }
    }
    if (urlInfo.linkCategory != null) {
      expandedUrl.setLinkCategory(urlInfo.linkCategory);
    }
    // Note that if there are multiple links in one tweet message, the language of the
    // link that got examined later in this for loop will overwrite the values that were
    // written before. This is not an optimal design but considering most tweets have
    // only one link, or same-language links, this shouldn't be a big issue.
    if (locale != null) {
      message.setLinkLocale(locale);
    }

    if (urlInfo.description != null) {
      expandedUrl.setDescription(urlInfo.description);
    }

    if (urlInfo.title != null) {
      expandedUrl.setTitle(urlInfo.title);
    }
  }

  // Test methods.
  public void setResolveUrlPercentage(int percentage) {
    this.resolveUrlPercentage = percentage;
  }

  public void setPinkClientId(String pinkClientId) {
    this.pinkClientId = pinkClientId;
  }

  public static final int MAX_URL_LENGTH = 1000;

  /**
   * Returns the URL unchanged if it fits MAX_URL_LENGTH; otherwise tries to shorten it by
   * dropping the query and fragment. Returns null when the URL is still too long or
   * unparseable, in which case the caller skips enrichment.
   */
  private String maybeTruncate(String fullUrl) {
    if (fullUrl.length() <= MAX_URL_LENGTH) {
      return fullUrl;
    }

    try {
      URI parsed = new URI(fullUrl);

      // Create a URL with an empty query and fragment.
      String simplified = new URI(parsed.getScheme(),
          parsed.getAuthority(),
          parsed.getPath(),
          null,
          null).toString();
      if (simplified.length() < MAX_URL_LENGTH) {
        linksTruncated.increment();
        return simplified;
      }
    } catch (URISyntaxException e) {
      // Deliberate fall-through: an unparseable URL is treated the same as a too-long one.
    }

    linksTooLong.increment();
    return null;
  }
}
/**
 * Resolve compressed URL via Pink (PinkFloyd), the URL resolution service. Splits the
 * input URL set into decider-sized batches, issues the read requests in parallel, and
 * collects the fully-resolved results into a single map.
 */
public class ResolveCompressedUrlsPink {
  private static final Logger LOG = LoggerFactory.getLogger(ResolveCompressedUrlsPink.class);
  private static final String PINK_REQUESTS_BATCH_SIZE_DECIDER_KEY = "pink_requests_batch_size";

  private final Storer.ServiceIface storerClient;
  private final ClientIdentifier pinkClientId;
  // Request mask selecting which fields Pink should return (resolution chain, HTML
  // basics, direct URL info).
  private final Mask requestMask;
  private final SearchDecider decider;

  // Use ServerSet to construct a metadata store client.
  public ResolveCompressedUrlsPink(Storer.ServiceIface storerClient,
                                   String pinkClientId,
                                   Decider decider) {
    this.storerClient = storerClient;
    this.pinkClientId = ClientIdentifier.valueOf(pinkClientId);
    this.decider = new SearchDecider(Preconditions.checkNotNull(decider));

    requestMask = new Mask();
    requestMask.setResolution(true);
    requestMask.setHtmlBasics(true);
    requestMask.setUrlDirectInfo(true);
  }

  /**
   * Resolve a set of URLs using PinkFloyd.
   *
   * @param urls URLs to resolve.
   * @return map from original URL to its resolved info, containing only URLs that fully
   *     resolved (per ResolveCompressedUrlsUtils.isResolved). Returns null — not an empty
   *     map — when the input is null or empty; callers must null-check. Failed batch
   *     requests are logged and their URLs silently omitted.
   */
  public Map<String, ResolveCompressedUrlsUtils.UrlInfo> resolveUrls(Set<String> urls) {
    if (urls == null || urls.size() == 0) {
      return null;
    }

    List<String> urlsList = ImmutableList.copyOf(urls);
    // Batch size comes from the decider when present; 10000 otherwise.
    int batchSize = decider.featureExists(PINK_REQUESTS_BATCH_SIZE_DECIDER_KEY)
        ? decider.getAvailability(PINK_REQUESTS_BATCH_SIZE_DECIDER_KEY)
        : 10000;
    int numRequests = (int) Math.ceil(1.0 * urlsList.size() / batchSize);

    List<Future<UrlReadResponse>> responseFutures = Lists.newArrayList();
    for (int i = 0; i < numRequests; ++i) {
      UrlReadRequest request = new UrlReadRequest();
      request.setUrls(
          urlsList.subList(i * batchSize, Math.min(urlsList.size(), (i + 1) * batchSize)));
      request.setMask(requestMask);
      request.setClientId(pinkClientId);

      // Send all requests in parallel; results are awaited below.
      responseFutures.add(storerClient.read(request));
    }

    Map<String, ResolveCompressedUrlsUtils.UrlInfo> resultMap = Maps.newHashMap();
    for (Future<UrlReadResponse> responseFuture : responseFutures) {
      Try<UrlReadResponse> tryResponse = getResponseTry(responseFuture);
      if (tryResponse.isThrow()) {
        // Failure already logged in getResponseTry; skip this batch's URLs.
        continue;
      }

      UrlReadResponse response = tryResponse.get();
      for (UrlData urlData : response.getData()) {
        // Only keep URLs that Pink fully resolved; partially-resolved entries are dropped.
        if (ResolveCompressedUrlsUtils.isResolved(urlData)) {
          resultMap.put(urlData.url, getUrlInfo(urlData));
        }
      }
    }

    return resultMap;
  }

  /**
   * Awaits one batch response and wraps the outcome in a Try, logging failures.
   * Never throws: exceptions from the await itself are converted to a Throw.
   */
  private Try<UrlReadResponse> getResponseTry(Future<UrlReadResponse> responseFuture) {
    try {
      Try<UrlReadResponse> tryResponse = Await.result(responseFuture.liftToTry());
      if (tryResponse.isThrow()) {
        Throwable throwable = ((Throw) tryResponse).e();
        LOG.warn("Failed to resolve URLs with Pink Storer.", throwable);
      }
      return tryResponse;
    } catch (Exception e) {
      return Throwables.unchecked(e);
    }
  }
}
// Helper class with UrlInfo helper functions: decides whether a Pink response is fully
// resolved and converts responses into the lightweight UrlInfo value object.
public final class ResolveCompressedUrlsUtils {

  private ResolveCompressedUrlsUtils() { }

  // Simple value holder for the subset of Pink's response that the ingester cares about.
  // All fields except originalUrl may be null.
  static class UrlInfo {
    public String originalUrl;
    @Nullable public String resolvedUrl;
    @Nullable public String language;
    @Nullable public MediaTypes mediaType;
    @Nullable public LinkCategory linkCategory;
    @Nullable public String description;
    @Nullable public String title;
  }

  /**
   * Determines if the given UrlData instance is fully resolved.
   *
   * Based on discussions with the URL services team, we decided that the most correct way to
   * determine that a URL was fully resolved is to look at a few response fields:
   * - urlDirectInfo: both the media type and link category must be set.
   * - htmlBasics: Pink has successfully parsed the resolved link's metadata.
   * - resolution: Pink was able to successfully get to the last hop in the redirect chain.
   *   This is especially important, because some sites have a robots.txt file, which
   *   prevents Pink from following the redirect chain once it gets to that site.
   *   In that case, we end up with a "last hop" URL, but the FetchStatusCode is not
   *   set to OK. We need to ignore these URLs because we don't know if they're really
   *   the last hop URLs.
   *   Also, Pink has some restrictions on the page size. For example, it does not
   *   parse text pages that are larger than 2MB. So if the redirect chain leads Pink
   *   to one of these pages, it will stop there. And again, we don't know if this is
   *   the last hop URL or not, so we have to ignore that URL.
   *
   * @param urlData The UrlData instance.
   * @return true if the URL data is fully resolved; false otherwise.
   */
  public static boolean isResolved(UrlData urlData) {
    // Make sure the mediaType and linkCategory fields are set.
    boolean isInfoReady = urlData.isSetUrlDirectInfo()
        && urlData.getUrlDirectInfo().isSetMediaType()
        && urlData.getUrlDirectInfo().isSetLinkCategory();

    // The individual HtmlBasics fields might or might not be set, depending on each website.
    // However, all fields should be set at the same time, if they are present. Consider the
    // resolution complete if at least one of the title, description or language fields is set.
    boolean isHtmlReady = urlData.isSetHtmlBasics()
        && (StringUtils.isNotEmpty(urlData.getHtmlBasics().getTitle())
            || StringUtils.isNotEmpty(urlData.getHtmlBasics().getDescription())
            || StringUtils.isNotEmpty(urlData.getHtmlBasics().getLang()));

    // resolution may be null here; it is only dereferenced after the isSetResolution()
    // check below short-circuits when unset.
    Resolution resolution = urlData.getResolution();
    boolean isResolutionReady = urlData.isSetResolution()
        && StringUtils.isNotEmpty(resolution.getLastHopCanonicalUrl())
        && resolution.getStatus() == FetchStatusCode.OK
        && resolution.getLastHopHttpResponseStatusCode() == 200;

    return isHtmlReady && isInfoReady && isResolutionReady;
  }

  /**
   * Creates a UrlInfo instance from the given URL data.
   *
   * @param urlData urlData from a resolver response; must have the resolution field set.
   * @return the UrlInfo instance.
   */
  public static UrlInfo getUrlInfo(UrlData urlData) {
    Preconditions.checkArgument(urlData.isSetResolution());

    UrlInfo urlInfo = new UrlInfo();
    urlInfo.originalUrl = urlData.url;
    Resolution resolution = urlData.getResolution();
    if (resolution.isSetLastHopCanonicalUrl()) {
      urlInfo.resolvedUrl = resolution.lastHopCanonicalUrl;
    } else {
      // Just in case lastHopCanonicalUrl is not available (which shouldn't happen):
      // fall back to the last entry of the redirect chain, or the original URL itself,
      // and canonicalize the fallback ourselves.
      if (resolution.isSetRedirectionChain()) {
        urlInfo.resolvedUrl = Iterables.getLast(resolution.redirectionChain);
      } else {
        urlInfo.resolvedUrl = urlData.url;
      }
      urlInfo.resolvedUrl = URLUtils.canonicalizeUrl(urlInfo.resolvedUrl);
    }
    if (urlData.isSetUrlDirectInfo()) {
      urlInfo.mediaType = urlData.urlDirectInfo.mediaType;
      urlInfo.linkCategory = urlData.urlDirectInfo.linkCategory;
    }
    if (urlData.isSetHtmlBasics()) {
      HtmlBasics htmlBasics = urlData.getHtmlBasics();
      urlInfo.language = htmlBasics.getLang();
      if (htmlBasics.isSetDescription()) {
        urlInfo.description = htmlBasics.getDescription();
      }
      if (htmlBasics.isSetTitle()) {
        urlInfo.title = htmlBasics.getTitle();
      }
    }
    return urlInfo;
  }
}
java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Set; -import javax.naming.NamingException; - -import com.google.common.collect.Maps; - -import org.apache.commons.pipeline.StageException; -import org.apache.commons.pipeline.stage.StageTimer; -import org.apache.commons.pipeline.validation.ConsumedTypes; -import org.apache.commons.pipeline.validation.ProducesConsumed; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.text.language.LocaleUtil; -import com.twitter.expandodo.thriftjava.Card2; -import com.twitter.mediaservices.commons.tweetmedia.thrift_java.MediaInfo; -import com.twitter.search.common.indexing.thriftjava.ThriftExpandedUrl; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.ingester.model.IngesterTwitterMessage; -import com.twitter.search.ingester.pipeline.util.BatchingClient; -import com.twitter.search.ingester.pipeline.util.CardFieldUtil; -import com.twitter.search.ingester.pipeline.util.IngesterStageTimer; -import com.twitter.search.ingester.pipeline.util.ResponseNotReturnedException; -import com.twitter.spiderduck.common.URLUtils; -import com.twitter.tweetypie.thriftjava.GetTweetOptions; -import com.twitter.tweetypie.thriftjava.GetTweetResult; -import com.twitter.tweetypie.thriftjava.GetTweetsRequest; -import com.twitter.tweetypie.thriftjava.MediaEntity; -import com.twitter.tweetypie.thriftjava.StatusState; -import com.twitter.tweetypie.thriftjava.Tweet; -import com.twitter.tweetypie.thriftjava.TweetService; -import com.twitter.util.Function; -import com.twitter.util.Future; - -@ConsumedTypes(IngesterTwitterMessage.class) -@ProducesConsumed -public class RetrieveCardBatchedStage extends TwitterBaseStage - { - private static final Logger LOG = LoggerFactory.getLogger(RetrieveCardBatchedStage.class); - - private static final String CARDS_PLATFORM_KEY = "iPhone-13"; - private int 
batchSize = 10; - - private SearchRateCounter totalTweets; - private SearchRateCounter tweetsWithCards; - private SearchRateCounter tweetsWithoutCards; - private SearchRateCounter tweetsWithAnimatedGifMediaInfo; - private SearchRateCounter cardsWithName; - private SearchRateCounter cardsWithDomain; - private SearchRateCounter cardsWithTitles; - private SearchRateCounter cardsWithDescriptions; - private SearchRateCounter cardsWithUnknownLanguage; - private SearchRateCounter tweetsNotFound; - private SearchRateCounter malformedUrls; - private SearchRateCounter urlMismatches; - private SearchRateCounter cardExceptions; - private SearchRateCounter cardExceptionTweets; - private StageTimer retrieveCardsTimer; - - private String cardNamePrefix; - // Since there is only one thread executing this stage (although that could potentially be - // changed in the pipeline config), no need to be thread safe. - private static final Map CARD_NAME_STATS = new HashMap<>(); - - private static TweetService.ServiceToClient tweetyPieService; - private BatchingClient cardsClient; - - private String tweetypieClientId = null; - - // Can be overridden in the corresponding pipeline-ingester.*.xml config. - // By default protected tweets are filtered out. - // Only in the protected ingester pipeline is this set to false. 
- private boolean filterProtected = true; - - @Override - public void initStats() { - super.initStats(); - cardNamePrefix = getStageNamePrefix() + "_card_name_"; - totalTweets = SearchRateCounter.export(getStageNamePrefix() + "_total_tweets"); - tweetsWithCards = SearchRateCounter.export(getStageNamePrefix() + "_tweets_with_cards"); - tweetsWithoutCards = SearchRateCounter.export(getStageNamePrefix() + "_tweets_without_cards"); - tweetsWithAnimatedGifMediaInfo = - SearchRateCounter.export(getStageNamePrefix() + "_tweets_with_animated_gif_media_info"); - cardsWithName = SearchRateCounter.export(getStageNamePrefix() + "_tweets_with_card_name"); - cardsWithDomain = SearchRateCounter.export(getStageNamePrefix() + "_tweets_with_card_domain"); - cardsWithTitles = SearchRateCounter.export(getStageNamePrefix() + "_tweets_with_card_titles"); - cardsWithDescriptions = - SearchRateCounter.export(getStageNamePrefix() + "_tweets_with_card_descriptions"); - cardsWithUnknownLanguage = - SearchRateCounter.export(getStageNamePrefix() + "_tweets_with_unknown_card_lanuage"); - tweetsNotFound = SearchRateCounter.export(getStageNamePrefix() + "_tweets_not_found"); - malformedUrls = SearchRateCounter.export(getStageNamePrefix() + "_malformed_urls"); - urlMismatches = SearchRateCounter.export(getStageNamePrefix() + "_url_mismatches"); - cardExceptions = SearchRateCounter.export(getStageNamePrefix() + "_card_exceptions"); - cardExceptionTweets = - SearchRateCounter.export(getStageNamePrefix() + "_card_exception_tweets"); - retrieveCardsTimer = new IngesterStageTimer(getStageNamePrefix() + "_request_timer"); - } - - @Override - protected void doInnerPreprocess() throws StageException, NamingException { - super.doInnerPreprocess(); - tweetyPieService = wireModule.getTweetyPieClient(tweetypieClientId); - cardsClient = new BatchingClient<>(this::batchRetrieveURLs, batchSize); - } - - @Override - public void innerProcess(Object obj) throws StageException { - if (!(obj instanceof 
IngesterTwitterMessage)) { - throw new StageException(this, - "Received object of incorrect type: " + obj.getClass().getName()); - } - - IngesterTwitterMessage message = (IngesterTwitterMessage) obj; - - cardsClient.call(message.getTweetId()) - .onSuccess(Function.cons(card -> { - updateMessage(message, card); - emitAndCount(message); - })) - .onFailure(Function.cons(exception -> { - if (!(exception instanceof ResponseNotReturnedException)) { - cardExceptionTweets.increment(); - } - - emitAndCount(message); - })); - } - - private Future> batchRetrieveURLs(Set keys) { - retrieveCardsTimer.start(); - totalTweets.increment(keys.size()); - - GetTweetOptions options = new GetTweetOptions() - .setInclude_cards(true) - .setCards_platform_key(CARDS_PLATFORM_KEY) - .setBypass_visibility_filtering(!filterProtected); - - GetTweetsRequest request = new GetTweetsRequest() - .setOptions(options) - .setTweet_ids(new ArrayList<>(keys)); - - return tweetyPieService.get_tweets(request) - .onFailure(throwable -> { - cardExceptions.increment(); - LOG.error("TweetyPie server threw an exception while requesting tweetIds: " - + request.getTweet_ids(), throwable); - return null; - }) - .map(this::createIdToCardMap); - } - - private void updateMessage(IngesterTwitterMessage message, Card2 card) { - tweetsWithCards.increment(); - - String cardName = card.getName().toLowerCase(); - addCardNameToStats(cardName); - message.setCardName(cardName); - cardsWithName.increment(); - message.setCardUrl(card.getUrl()); - - String url = getLastHop(message, card.getUrl()); - if (url != null) { - try { - String domain = URLUtils.getDomainFromURL(url); - message.setCardDomain(domain.toLowerCase()); - cardsWithDomain.increment(); - } catch (MalformedURLException e) { - malformedUrls.increment(); - if (LOG.isDebugEnabled()) { - LOG.debug("Tweet ID {} has a malformed card last hop URL: {}", message.getId(), url); - } - } - } else { - // This happens with retweet. 
Basically when retrieve card for a retweet, we - // get a card associated with the original tweet, so the tco won't match. - // As of Sep 2014, this seems to be the intended behavior and has been running - // like this for over a year. - urlMismatches.increment(); - } - - message.setCardTitle( - CardFieldUtil.extractBindingValue(CardFieldUtil.TITLE_BINDING_KEY, card)); - if (message.getCardTitle() != null) { - cardsWithTitles.increment(); - } - message.setCardDescription( - CardFieldUtil.extractBindingValue(CardFieldUtil.DESCRIPTION_BINDING_KEY, card)); - if (message.getCardDescription() != null) { - cardsWithDescriptions.increment(); - } - CardFieldUtil.deriveCardLang(message); - if (LocaleUtil.UNKNOWN.getLanguage().equals(message.getCardLang())) { - cardsWithUnknownLanguage.increment(); - } - } - - private Map createIdToCardMap(List listResult) { - Map responseMap = Maps.newHashMap(); - for (GetTweetResult entry : listResult) { - if (entry.isSetTweet() - && entry.isSetTweet_state() - && (entry.getTweet_state() == StatusState.FOUND)) { - long id = entry.getTweet_id(); - if (entry.getTweet().isSetCard2()) { - responseMap.put(id, entry.getTweet().getCard2()); - } else { - // Short-term fix for removal of animated GIF cards -- - // if the tweet contains an animated GIF, create a card based on media entity data - Card2 card = createCardForAnimatedGif(entry.getTweet()); - if (card != null) { - responseMap.put(id, card); - tweetsWithAnimatedGifMediaInfo.increment(); - } else { - tweetsWithoutCards.increment(); - } - } - } else { - tweetsNotFound.increment(); - } - } - return responseMap; - } - - private Card2 createCardForAnimatedGif(Tweet tweet) { - if (tweet.getMediaSize() > 0) { - for (MediaEntity mediaEntity : tweet.getMedia()) { - MediaInfo mediaInfo = mediaEntity.getMedia_info(); - if (mediaInfo != null && mediaInfo.getSetField() == MediaInfo._Fields.ANIMATED_GIF_INFO) { - Card2 card = new Card2(); - card.setName("animated_gif"); - // Use the original compressed 
URL for the media entity to match existing card URLs - card.setUrl(mediaEntity.getUrl()); - card.setBinding_values(Collections.emptyList()); - - return card; - } - } - } - return null; - } - - // Unfortunately the url returned in the card data is not the last hop - private String getLastHop(IngesterTwitterMessage message, String url) { - if (message.getExpandedUrlMap() != null) { - ThriftExpandedUrl expanded = message.getExpandedUrlMap().get(url); - if ((expanded != null) && expanded.isSetCanonicalLastHopUrl()) { - return expanded.getCanonicalLastHopUrl(); - } - } - return null; - } - - // Used by commons-pipeline and set via the xml config - public void setFilterProtected(boolean filterProtected) { - LOG.info("Filtering protected tweets: {}", filterProtected); - this.filterProtected = filterProtected; - } - - public void setTweetypieClientId(String tweetypieClientId) { - LOG.info("Using tweetypieClientId: {}", tweetypieClientId); - this.tweetypieClientId = tweetypieClientId; - } - - public void setInternalBatchSize(int internalBatchSize) { - this.batchSize = internalBatchSize; - } - - /** - * For each card name, we add a rate counter to observe what kinds of card we're actually - * indexing, and with what rate. 
- */ - private void addCardNameToStats(String cardName) { - SearchRateCounter cardNameCounter = CARD_NAME_STATS.get(cardName); - if (cardNameCounter == null) { - cardNameCounter = SearchRateCounter.export(cardNamePrefix + cardName); - CARD_NAME_STATS.put(cardName, cardNameCounter); - } - cardNameCounter.increment(); - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/RetrieveNamedEntitiesSingleTweetStage.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/RetrieveNamedEntitiesSingleTweetStage.docx new file mode 100644 index 000000000..b3f5ca396 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/RetrieveNamedEntitiesSingleTweetStage.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/RetrieveNamedEntitiesSingleTweetStage.java b/src/java/com/twitter/search/ingester/pipeline/twitter/RetrieveNamedEntitiesSingleTweetStage.java deleted file mode 100644 index 762abefc2..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/RetrieveNamedEntitiesSingleTweetStage.java +++ /dev/null @@ -1,75 +0,0 @@ -package com.twitter.search.ingester.pipeline.twitter; - -import java.util.concurrent.CompletableFuture; -import javax.naming.NamingException; - -import org.apache.commons.pipeline.StageException; -import org.apache.commons.pipeline.validation.ConsumedTypes; -import org.apache.commons.pipeline.validation.ProducesConsumed; - -import com.twitter.search.ingester.model.IngesterTwitterMessage; -import com.twitter.util.Function; - -@ConsumedTypes(IngesterTwitterMessage.class) -@ProducesConsumed -public class RetrieveNamedEntitiesSingleTweetStage extends TwitterBaseStage - > { - - private NamedEntityHandler namedEntityHandler; - - @Override - protected void doInnerPreprocess() throws StageException, NamingException { - innerSetup(); - } - - @Override - protected void innerSetup() { - namedEntityHandler = new NamedEntityHandler( - wireModule.getNamedEntityFetcher(), decider, 
getStageNamePrefix(), - "single_tweet"); - } - - @Override - public void innerProcess(Object obj) throws StageException { - if (!(obj instanceof IngesterTwitterMessage)) { - throw new StageException(this, "Object is not a IngesterTwitterMessage object: " + obj); - } - IngesterTwitterMessage twitterMessage = (IngesterTwitterMessage) obj; - - if (namedEntityHandler.shouldRetrieve(twitterMessage)) { - namedEntityHandler.retrieve(twitterMessage) - .onSuccess(Function.cons(result -> { - namedEntityHandler.addEntitiesToMessage(twitterMessage, result); - emitAndCount(twitterMessage); - })) - .onFailure(Function.cons(throwable -> { - namedEntityHandler.incrementErrorCount(); - emitAndCount(twitterMessage); - })); - } else { - emitAndCount(twitterMessage); - } - } - - @Override - protected CompletableFuture innerRunStageV2(IngesterTwitterMessage - message) { - CompletableFuture cf = new CompletableFuture<>(); - - if (namedEntityHandler.shouldRetrieve(message)) { - namedEntityHandler.retrieve(message) - .onSuccess(Function.cons(result -> { - namedEntityHandler.addEntitiesToMessage(message, result); - cf.complete(message); - })) - .onFailure(Function.cons(throwable -> { - namedEntityHandler.incrementErrorCount(); - cf.complete(message); - })); - } else { - cf.complete(message); - } - - return cf; - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/RetrieveSpaceAdminsAndTitleStage.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/RetrieveSpaceAdminsAndTitleStage.docx new file mode 100644 index 000000000..19b4b048b Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/RetrieveSpaceAdminsAndTitleStage.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/RetrieveSpaceAdminsAndTitleStage.java b/src/java/com/twitter/search/ingester/pipeline/twitter/RetrieveSpaceAdminsAndTitleStage.java deleted file mode 100644 index 66918274c..000000000 --- 
a/src/java/com/twitter/search/ingester/pipeline/twitter/RetrieveSpaceAdminsAndTitleStage.java +++ /dev/null @@ -1,246 +0,0 @@ -package com.twitter.search.ingester.pipeline.twitter; - -import java.util.List; -import java.util.Optional; -import java.util.Set; -import java.util.concurrent.CompletableFuture; - -import scala.Option; -import scala.Tuple2; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.collect.Lists; - -import org.apache.commons.lang.StringUtils; -import org.apache.commons.pipeline.StageException; -import org.apache.commons.pipeline.validation.ConsumedTypes; -import org.apache.commons.pipeline.validation.ProducesConsumed; - -import com.twitter.search.common.decider.DeciderUtil; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.common.relevance.entities.TwitterMessageUser; -import com.twitter.search.ingester.model.IngesterTwitterMessage; -import com.twitter.search.ingester.pipeline.strato_fetchers.AudioSpaceCoreFetcher; -import com.twitter.search.ingester.pipeline.strato_fetchers.AudioSpaceParticipantsFetcher; -import com.twitter.strato.catalog.Fetch; -import com.twitter.ubs.thriftjava.AudioSpace; -import com.twitter.ubs.thriftjava.ParticipantUser; -import com.twitter.ubs.thriftjava.Participants; -import com.twitter.util.Function; -import com.twitter.util.Future; -import com.twitter.util.Futures; -import com.twitter.util.Try; - -@ConsumedTypes(IngesterTwitterMessage.class) -@ProducesConsumed -public class RetrieveSpaceAdminsAndTitleStage extends TwitterBaseStage - > { - - @VisibleForTesting - protected static final String RETRIEVE_SPACE_ADMINS_AND_TITLE_DECIDER_KEY = - "ingester_all_retrieve_space_admins_and_title"; - - private AudioSpaceCoreFetcher coreFetcher; - private AudioSpaceParticipantsFetcher participantsFetcher; - - private SearchRateCounter tweetsWithSpaceAdmins; - private SearchRateCounter tweetsWithSpaceTitle; - private SearchRateCounter coreFetchSuccess; - private 
SearchRateCounter coreFetchFailure; - private SearchRateCounter participantsFetchSuccess; - private SearchRateCounter participantsFetchFailure; - private SearchRateCounter emptyCore; - private SearchRateCounter emptyParticipants; - private SearchRateCounter emptySpaceTitle; - private SearchRateCounter emptySpaceAdmins; - private SearchRateCounter parallelFetchAttempts; - private SearchRateCounter parallelFetchFailure; - - - @Override - protected void doInnerPreprocess() { - innerSetup(); - } - - @Override - protected void innerSetup() { - coreFetcher = wireModule.getAudioSpaceCoreFetcher(); - participantsFetcher = wireModule.getAudioSpaceParticipantsFetcher(); - - tweetsWithSpaceAdmins = getStageStat("tweets_with_audio_space_admins"); - tweetsWithSpaceTitle = getStageStat("tweets_with_audio_space_title"); - coreFetchSuccess = getStageStat("core_fetch_success"); - coreFetchFailure = getStageStat("core_fetch_failure"); - participantsFetchSuccess = getStageStat("participants_fetch_success"); - participantsFetchFailure = getStageStat("participants_fetch_failure"); - emptyCore = getStageStat("empty_core"); - emptyParticipants = getStageStat("empty_participants"); - emptySpaceTitle = getStageStat("empty_space_title"); - emptySpaceAdmins = getStageStat("empty_space_admins"); - parallelFetchAttempts = getStageStat("parallel_fetch_attempts"); - parallelFetchFailure = getStageStat("parallel_fetch_failure"); - } - - private SearchRateCounter getStageStat(String statSuffix) { - return SearchRateCounter.export(getStageNamePrefix() + "_" + statSuffix); - } - - private Future>, Try>>> - tryRetrieveSpaceAdminAndTitle(IngesterTwitterMessage twitterMessage) { - Set spaceIds = twitterMessage.getSpaceIds(); - - if (spaceIds.isEmpty()) { - return null; - } - - if (!(DeciderUtil.isAvailableForRandomRecipient(decider, - RETRIEVE_SPACE_ADMINS_AND_TITLE_DECIDER_KEY))) { - return null; - } - - String spaceId = spaceIds.iterator().next(); - - // Query both columns in parallel. 
- parallelFetchAttempts.increment(); - Future> core = coreFetcher.fetch(spaceId); - Future> participants = participantsFetcher.fetch(spaceId); - - return Futures.join(core.liftToTry(), participants.liftToTry()); - } - - @Override - protected CompletableFuture innerRunStageV2(IngesterTwitterMessage - twitterMessage) { - Future>, Try>>> - tryRetrieveSpaceAdminAndTitle = tryRetrieveSpaceAdminAndTitle(twitterMessage); - - CompletableFuture cf = new CompletableFuture<>(); - - if (tryRetrieveSpaceAdminAndTitle == null) { - cf.complete(twitterMessage); - } else { - tryRetrieveSpaceAdminAndTitle.onSuccess(Function.cons(tries -> { - handleFutureOnSuccess(tries, twitterMessage); - cf.complete(twitterMessage); - })).onFailure(Function.cons(throwable -> { - handleFutureOnFailure(); - cf.complete(twitterMessage); - })); - } - - return cf; - } - - @Override - public void innerProcess(Object obj) throws StageException { - if (!(obj instanceof IngesterTwitterMessage)) { - throw new StageException(this, "Object is not a IngesterTwitterMessage object: " + obj); - } - IngesterTwitterMessage twitterMessage = (IngesterTwitterMessage) obj; - Future>, Try>>> - tryRetrieveSpaceAdminAndTitle = tryRetrieveSpaceAdminAndTitle(twitterMessage); - - if (tryRetrieveSpaceAdminAndTitle == null) { - emitAndCount(twitterMessage); - return; - } - - tryRetrieveSpaceAdminAndTitle.onSuccess(Function.cons(tries -> { - handleFutureOnSuccess(tries, twitterMessage); - emitAndCount(twitterMessage); - })).onFailure(Function.cons(throwable -> { - handleFutureOnFailure(); - emitAndCount(twitterMessage); - })); - } - - private void handleFutureOnSuccess(Tuple2>, - Try>> tries, IngesterTwitterMessage twitterMessage) { - handleCoreFetchTry(tries._1(), twitterMessage); - handleParticipantsFetchTry(tries._2(), twitterMessage); - } - - private void handleFutureOnFailure() { - parallelFetchFailure.increment(); - } - - private void handleCoreFetchTry( - Try> fetchTry, - IngesterTwitterMessage twitterMessage) { - - if 
(fetchTry.isReturn()) { - coreFetchSuccess.increment(); - addSpaceTitleToMessage(twitterMessage, fetchTry.get().v()); - } else { - coreFetchFailure.increment(); - } - } - - private void handleParticipantsFetchTry( - Try> fetchTry, - IngesterTwitterMessage twitterMessage) { - - if (fetchTry.isReturn()) { - participantsFetchSuccess.increment(); - addSpaceAdminsToMessage(twitterMessage, fetchTry.get().v()); - } else { - participantsFetchFailure.increment(); - } - } - - private void addSpaceTitleToMessage( - IngesterTwitterMessage twitterMessage, - Option audioSpace) { - - if (audioSpace.isDefined()) { - String audioSpaceTitle = audioSpace.get().getTitle(); - if (StringUtils.isNotEmpty(audioSpaceTitle)) { - twitterMessage.setSpaceTitle(audioSpaceTitle); - tweetsWithSpaceTitle.increment(); - } else { - emptySpaceTitle.increment(); - } - } else { - emptyCore.increment(); - } - } - - private void addSpaceAdminsToMessage( - IngesterTwitterMessage twitterMessage, - Option participants) { - - if (participants.isDefined()) { - List admins = getAdminsFromParticipants(participants.get()); - if (!admins.isEmpty()) { - for (ParticipantUser admin : admins) { - addSpaceAdminToMessage(twitterMessage, admin); - } - tweetsWithSpaceAdmins.increment(); - } else { - emptySpaceAdmins.increment(); - } - } else { - emptyParticipants.increment(); - } - } - - private List getAdminsFromParticipants(Participants participants) { - if (!participants.isSetAdmins()) { - return Lists.newArrayList(); - } - return participants.getAdmins(); - } - - private void addSpaceAdminToMessage(IngesterTwitterMessage twitterMessage, - ParticipantUser admin) { - TwitterMessageUser.Builder userBuilder = new TwitterMessageUser.Builder(); - if (admin.isSetTwitter_screen_name() - && StringUtils.isNotEmpty(admin.getTwitter_screen_name())) { - userBuilder.withScreenName(Optional.of(admin.getTwitter_screen_name())); - } - if (admin.isSetDisplay_name() && StringUtils.isNotEmpty(admin.getDisplay_name())) { - 
userBuilder.withDisplayName(Optional.of(admin.getDisplay_name())); - } - twitterMessage.addSpaceAdmin(userBuilder.build()); - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/RetrieveSpaceIdsStage.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/RetrieveSpaceIdsStage.docx new file mode 100644 index 000000000..faddfc33f Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/RetrieveSpaceIdsStage.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/RetrieveSpaceIdsStage.java b/src/java/com/twitter/search/ingester/pipeline/twitter/RetrieveSpaceIdsStage.java deleted file mode 100644 index 112e6b875..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/RetrieveSpaceIdsStage.java +++ /dev/null @@ -1,99 +0,0 @@ -package com.twitter.search.ingester.pipeline.twitter; - -import java.util.Set; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.collect.Sets; - -import org.apache.commons.lang.StringUtils; -import org.apache.commons.pipeline.StageException; -import org.apache.commons.pipeline.validation.ConsumedTypes; -import org.apache.commons.pipeline.validation.ProducesConsumed; - -import com.twitter.search.common.decider.DeciderUtil; -import com.twitter.search.common.indexing.thriftjava.ThriftExpandedUrl; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.common.relevance.entities.TwitterMessage; - -@ConsumedTypes(TwitterMessage.class) -@ProducesConsumed -public class RetrieveSpaceIdsStage extends TwitterBaseStage - { - - @VisibleForTesting - protected static final Pattern SPACES_URL_REGEX = - Pattern.compile("^https://twitter\\.com/i/spaces/([a-zA-Z0-9]+)\\S*$"); - - @VisibleForTesting - protected static final String PARSE_SPACE_ID_DECIDER_KEY = "ingester_all_parse_space_id_from_url"; - - private static SearchRateCounter 
numTweetsWithSpaceIds; - private static SearchRateCounter numTweetsWithMultipleSpaceIds; - - @Override - protected void initStats() { - super.initStats(); - innerSetupStats(); - } - - @Override - protected void innerSetupStats() { - numTweetsWithSpaceIds = SearchRateCounter.export( - getStageNamePrefix() + "_tweets_with_space_ids"); - numTweetsWithMultipleSpaceIds = SearchRateCounter.export( - getStageNamePrefix() + "_tweets_with_multiple_space_ids"); - } - - @Override - public void innerProcess(Object obj) throws StageException { - TwitterMessage message = (TwitterMessage) obj; - tryToRetrieveSpaceId(message); - emitAndCount(message); - } - - private void tryToRetrieveSpaceId(TwitterMessage message) { - if (DeciderUtil.isAvailableForRandomRecipient(decider, PARSE_SPACE_ID_DECIDER_KEY)) { - Set spaceIds = parseSpaceIdsFromMessage(message); - int spaceIdCount = spaceIds.size(); - if (spaceIdCount > 0) { - numTweetsWithSpaceIds.increment(); - if (spaceIdCount > 1) { - numTweetsWithMultipleSpaceIds.increment(); - } - message.setSpaceIds(spaceIds); - } - } - } - - @Override - protected TwitterMessage innerRunStageV2(TwitterMessage message) { - tryToRetrieveSpaceId(message); - return message; - } - - private String parseSpaceIdsFromUrl(String url) { - String spaceId = null; - - if (StringUtils.isNotEmpty(url)) { - Matcher matcher = SPACES_URL_REGEX.matcher(url); - if (matcher.matches()) { - spaceId = matcher.group(1); - } - } - return spaceId; - } - - private Set parseSpaceIdsFromMessage(TwitterMessage message) { - Set spaceIds = Sets.newHashSet(); - - for (ThriftExpandedUrl expandedUrl : message.getExpandedUrls()) { - String spaceId = parseSpaceIdsFromUrl(expandedUrl.getExpandedUrl()); - if (StringUtils.isNotEmpty(spaceId)) { - spaceIds.add(spaceId); - } - } - return spaceIds; - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/SingleTweetExtractAndGeocodeLatLonStage.docx 
b/src/java/com/twitter/search/ingester/pipeline/twitter/SingleTweetExtractAndGeocodeLatLonStage.docx new file mode 100644 index 000000000..3625b6d2d Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/SingleTweetExtractAndGeocodeLatLonStage.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/SingleTweetExtractAndGeocodeLatLonStage.java b/src/java/com/twitter/search/ingester/pipeline/twitter/SingleTweetExtractAndGeocodeLatLonStage.java deleted file mode 100644 index 5a40020bb..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/SingleTweetExtractAndGeocodeLatLonStage.java +++ /dev/null @@ -1,99 +0,0 @@ -package com.twitter.search.ingester.pipeline.twitter; - -import org.apache.commons.pipeline.StageException; -import org.apache.commons.pipeline.validation.ConsumedTypes; -import org.apache.commons.pipeline.validation.ProducesConsumed; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.indexing.thriftjava.ThriftGeoLocationSource; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.common.relevance.entities.GeoObject; -import com.twitter.search.common.relevance.entities.TwitterMessage; -import com.twitter.search.common.relevance.text.LocationUtils; -import com.twitter.search.ingester.model.IngesterTwitterMessage; -import com.twitter.search.ingester.pipeline.util.PipelineStageRuntimeException; - -/** - * Read-only stage to extract lat/lon pairs from the tweet text and populate - * the geoLocation field. - *

- * If the tweet is geotagged by mobile devices, the geo coordinates extracted from the JSON - * is used. - */ -@ConsumedTypes(IngesterTwitterMessage.class) -@ProducesConsumed -public class SingleTweetExtractAndGeocodeLatLonStage extends TwitterBaseStage - { - private static final Logger LOG = - LoggerFactory.getLogger(SingleTweetExtractAndGeocodeLatLonStage.class); - - private SearchRateCounter extractedLatLons; - private SearchRateCounter badLatLons; - - @Override - public void initStats() { - super.initStats(); - innerSetupStats(); - } - - @Override - protected void innerSetupStats() { - extractedLatLons = SearchRateCounter.export(getStageNamePrefix() + "_extracted_lat_lons"); - badLatLons = SearchRateCounter.export(getStageNamePrefix() + "_invalid_lat_lons"); - } - - @Override - public void innerProcess(Object obj) throws StageException { - if (!(obj instanceof IngesterTwitterMessage)) { - throw new StageException(this, "Object is not IngesterTwitterMessage object: " + obj); - } - - IngesterTwitterMessage message = IngesterTwitterMessage.class.cast(obj); - tryToSetGeoLocation(message); - emitAndCount(message); - } - - @Override - protected IngesterTwitterMessage innerRunStageV2(TwitterMessage message) { - // Previous stage takes in a TwitterMessage and returns a TwitterMessage. I think it was - // done to simplify testing. From this stage onwards, we only count the message that are of type - // IngesterTwitterMessage. 
- if (!(message instanceof IngesterTwitterMessage)) { - throw new PipelineStageRuntimeException("Message needs to be of type IngesterTwitterMessage"); - } - - IngesterTwitterMessage ingesterTwitterMessage = IngesterTwitterMessage.class.cast(message); - tryToSetGeoLocation(ingesterTwitterMessage); - return ingesterTwitterMessage; - } - - private void tryToSetGeoLocation(IngesterTwitterMessage message) { - if (message.getGeoTaggedLocation() != null) { - message.setGeoLocation(message.getGeoTaggedLocation()); - } else if (message.hasGeoLocation()) { - LOG.warn("Message {} already contains geoLocation", message.getId()); - } else { - try { - GeoObject extracted = extractLatLon(message); - if (extracted != null) { - message.setGeoLocation(extracted); - extractedLatLons.increment(); - } - } catch (NumberFormatException e) { - LOG.debug("Message contains bad latitude and longitude: " + message.getOrigLocation(), e); - badLatLons.increment(); - } catch (Exception e) { - LOG.error("Failed to extract geo location from " + message.getOrigLocation() + " for tweet " - + message.getId(), e); - } - } - } - - private GeoObject extractLatLon(IngesterTwitterMessage message) throws NumberFormatException { - double[] latlon = LocationUtils.extractLatLon(message); - return latlon == null - ? 
null - : new GeoObject(latlon[0], latlon[1], ThriftGeoLocationSource.TWEET_TEXT); - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/TextFeatureExtractionWorkersStage.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/TextFeatureExtractionWorkersStage.docx new file mode 100644 index 000000000..d060ed37c Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/TextFeatureExtractionWorkersStage.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/TextFeatureExtractionWorkersStage.java b/src/java/com/twitter/search/ingester/pipeline/twitter/TextFeatureExtractionWorkersStage.java deleted file mode 100644 index 45e967d43..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/TextFeatureExtractionWorkersStage.java +++ /dev/null @@ -1,148 +0,0 @@ -package com.twitter.search.ingester.pipeline.twitter; - -import java.util.concurrent.BlockingQueue; -import java.util.concurrent.ExecutorService; -import javax.naming.NamingException; - -import com.google.common.collect.Queues; - -import org.apache.commons.pipeline.StageException; -import org.apache.commons.pipeline.validation.ConsumedTypes; -import org.apache.commons.pipeline.validation.ProducesConsumed; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.metrics.SearchCustomGauge; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.common.relevance.entities.TwitterMessage; -import com.twitter.search.common.relevance.text.TweetParser; -import com.twitter.search.ingester.pipeline.util.PipelineStageRuntimeException; - -@ConsumedTypes(TwitterMessage.class) -@ProducesConsumed -public class TextFeatureExtractionWorkersStage extends TwitterBaseStage - { - private static final Logger LOG = - LoggerFactory.getLogger(TextFeatureExtractionWorkersStage.class); - - private static final int NUM_THREADS = 5; - private static final int MAX_QUEUE_SIZE = 100; - 
private static final long SLOW_TWEET_TIME_MILLIS = 1000; - private ExecutorService executorService = null; - - // define as static so that FeatureExtractorWorker thread can use it - private static SearchRateCounter slowTweetCounter; - private SearchRateCounter threadErrorCounter; - private SearchRateCounter threadInterruptionCounter; - private final BlockingQueue messageQueue = - Queues.newLinkedBlockingQueue(MAX_QUEUE_SIZE); - private TweetParser tweetParser; - - @Override - public void initStats() { - super.initStats(); - innerSetupStats(); - } - - @Override - protected void innerSetupStats() { - slowTweetCounter = SearchRateCounter.export( - getStageNamePrefix() + "_text_feature_extraction_slow_tweet_count"); - SearchCustomGauge.export(getStageNamePrefix() + "_queue_size", - messageQueue::size); - threadErrorCounter = SearchRateCounter.export( - getStageNamePrefix() + "_text_quality_evaluation_thread_error"); - threadInterruptionCounter = SearchRateCounter.export( - getStageNamePrefix() + "_text_quality_evaluation_thread_interruption"); - } - - @Override - protected void doInnerPreprocess() throws StageException, NamingException { - innerSetup(); - // anything threading related, we don't need in V2 as of yet. 
- executorService = wireModule.getThreadPool(NUM_THREADS); - for (int i = 0; i < NUM_THREADS; ++i) { - executorService.submit(new FeatureExtractorWorker()); - } - LOG.info("Initialized {} parsers.", NUM_THREADS); - } - - @Override - protected void innerSetup() { - tweetParser = new TweetParser(); - } - - @Override - public void innerProcess(Object obj) throws StageException { - if (!(obj instanceof TwitterMessage)) { - LOG.error("Object is not a TwitterMessage object: {}", obj); - return; - } - - TwitterMessage message = TwitterMessage.class.cast(obj); - try { - messageQueue.put(message); - } catch (InterruptedException ie) { - LOG.error("Interrupted exception adding to the queue", ie); - } - } - - private boolean tryToParse(TwitterMessage message) { - boolean isAbleToParse = false; - long startTime = clock.nowMillis(); - // Parse tweet and merge the parsed out features into what we already have in the message. - try { - synchronized (this) { - tweetParser.parseTweet(message, false, false); - } - // If parsing failed we don't need to pass the tweet down the pipeline. 
- isAbleToParse = true; - } catch (Exception e) { - threadErrorCounter.increment(); - LOG.error("Uncaught exception from tweetParser.parseTweet()", e); - } finally { - long elapsedTime = clock.nowMillis() - startTime; - if (elapsedTime > SLOW_TWEET_TIME_MILLIS) { - LOG.debug("Took {}ms to parse tweet {}: {}", elapsedTime, message.getId(), message); - slowTweetCounter.increment(); - } - } - return isAbleToParse; - } - - @Override - protected TwitterMessage innerRunStageV2(TwitterMessage message) { - if (!tryToParse(message)) { - throw new PipelineStageRuntimeException("Failed to parse, not passing to next stage."); - } - - return message; - } - - @Override - public void innerPostprocess() { - if (executorService != null) { - executorService.shutdownNow(); - } - executorService = null; - } - - private class FeatureExtractorWorker implements Runnable { - public void run() { - while (!Thread.currentThread().isInterrupted()) { - TwitterMessage message = null; - try { - message = messageQueue.take(); - } catch (InterruptedException ie) { - threadInterruptionCounter.increment(); - LOG.error("Interrupted exception polling from the queue", ie); - continue; - } finally { - if (tryToParse(message)) { - emitAndCount(message); - } - } - } - } - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/TextQualityEvaluationWorkerStage.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/TextQualityEvaluationWorkerStage.docx new file mode 100644 index 000000000..128dbef0d Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/TextQualityEvaluationWorkerStage.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/TextQualityEvaluationWorkerStage.java b/src/java/com/twitter/search/ingester/pipeline/twitter/TextQualityEvaluationWorkerStage.java deleted file mode 100644 index 27e5d5c0c..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/TextQualityEvaluationWorkerStage.java +++ /dev/null @@ 
-1,181 +0,0 @@ -package com.twitter.search.ingester.pipeline.twitter; -import java.util.List; -import java.util.concurrent.BlockingQueue; -import java.util.concurrent.ExecutorService; -import javax.naming.NamingException; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.Queues; -import org.apache.commons.pipeline.StageException; -import org.apache.commons.pipeline.validation.ConsumedTypes; -import org.apache.commons.pipeline.validation.ProducesConsumed; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import com.twitter.common_internal.text.version.PenguinVersion; -import com.twitter.search.common.metrics.SearchCustomGauge; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.common.relevance.classifiers.TweetEvaluator; -import com.twitter.search.common.relevance.classifiers.TweetOffensiveEvaluator; -import com.twitter.search.common.relevance.classifiers.TweetTextClassifier; -import com.twitter.search.common.relevance.classifiers.TweetTextEvaluator; -import com.twitter.search.common.relevance.entities.TwitterMessage; -import com.twitter.search.common.relevance.scorers.TweetTextScorer; - -@ConsumedTypes(TwitterMessage.class) -@ProducesConsumed -public class TextQualityEvaluationWorkerStage extends TwitterBaseStage - { - private static final Logger LOG = LoggerFactory.getLogger(TextQualityEvaluationWorkerStage.class); - - private static final int NUM_THREADS = 5; - private static final long SLOW_TWEET_TIME_MILLIS = 1000; - // based on the batched branch 3 elements in the queue times 200 tweets per batch. 
- private static final int MAX_QUEUE_SIZE = 100; - private final BlockingQueue messages = - Queues.newLinkedBlockingQueue(MAX_QUEUE_SIZE); - - private static final String DO_TEXT_QUALITY_EVALUATION_DECIDER_KEY_TEMPLATE = - "ingester_%s_do_text_quality_evaluation"; - - private ExecutorService executorService = null; - private SearchRateCounter unscoredTweetCounter; - private TweetTextClassifier classifier; - private final TweetTextScorer scorer = new TweetTextScorer(null); - // Defined as static so that ClassifierWorker thread can use it - private static SearchRateCounter slowTweetCounter; - private SearchRateCounter threadErrorCounter; - private SearchRateCounter threadInterruptionCounter; - private String deciderKey; - - @Override - public void initStats() { - super.initStats(); - innerSetupStats(); - } - - public SearchRateCounter getUnscoredTweetCounter() { - return unscoredTweetCounter; - } - - @Override - protected void innerSetupStats() { - threadErrorCounter = SearchRateCounter.export( - getStageNamePrefix() + "_text_quality_evaluation_thread_error"); - threadInterruptionCounter = SearchRateCounter.export( - getStageNamePrefix() + "_text_quality_evaluation_thread_interruption"); - unscoredTweetCounter = SearchRateCounter.export( - getStageNamePrefix() + "_text_quality_evaluation_tweets_unscored_count"); - slowTweetCounter = SearchRateCounter.export( - getStageNamePrefix() + "_text_quality_evaluation_slow_tweet_count"); - SearchCustomGauge.export(getStageNamePrefix() + "_queue_size", messages::size); - } - - @Override - protected void doInnerPreprocess() throws StageException, NamingException { - innerSetup(); - executorService = wireModule.getThreadPool(NUM_THREADS); - for (int i = 0; i < NUM_THREADS; i++) { - executorService.submit( - new ClassifierWorker()); - } - LOG.info("Initialized {} classfiers and scorers.", NUM_THREADS); - } - - @Override - protected void innerSetup() throws NamingException { - deciderKey = 
String.format(DO_TEXT_QUALITY_EVALUATION_DECIDER_KEY_TEMPLATE, - earlybirdCluster.getNameForStats()); - List supportedPenguinVersions = wireModule.getPenguinVersions(); - TweetOffensiveEvaluator tweetOffensiveEvaluator = wireModule.getTweetOffensiveEvaluator(); - - ImmutableList evaluators = - ImmutableList.of(tweetOffensiveEvaluator, new TweetTextEvaluator()); - classifier = new TweetTextClassifier( - evaluators, - wireModule.getServiceIdentifier(), - supportedPenguinVersions); - } - - @Override - public void innerProcess(Object obj) throws StageException { - if (!(obj instanceof TwitterMessage)) { - LOG.error("Object is not a TwitterMessage object: {}", obj); - return; - } - - if (decider.isAvailable(deciderKey)) { - TwitterMessage message = TwitterMessage.class.cast(obj); - try { - messages.put(message); - } catch (InterruptedException ie) { - LOG.error("Interrupted exception adding to the queue", ie); - } - } else { - unscoredTweetCounter.increment(); - emitAndCount(obj); - } - } - - @Override - protected TwitterMessage innerRunStageV2(TwitterMessage message) { - if (decider.isAvailable(deciderKey)) { - classifyAndScore(message); - } else { - unscoredTweetCounter.increment(); - } - - return message; - } - - private void classifyAndScore(TwitterMessage message) { - long startTime = clock.nowMillis(); - try { - // The tweet signature computed here might not be correct, since we did not resolve the - // tweet URLs yet. This is why BasicIndexingConverter does not set the tweet signature - // feature on the event it builds. - // - // We correct the tweet signature later in the ComputeTweetSignatureStage, and - // DelayedIndexingConverter sets this feature on the URL update event it creates. 
- synchronized (this) { - scorer.classifyAndScoreTweet(classifier, message); - } - } catch (Exception e) { - threadErrorCounter.increment(); - LOG.error("Uncaught exception from classifyAndScoreTweet", e); - } finally { - long elapsedTime = clock.nowMillis() - startTime; - if (elapsedTime > SLOW_TWEET_TIME_MILLIS) { - LOG.warn("Took {}ms to classify and score tweet {}: {}", - elapsedTime, message.getId(), message); - slowTweetCounter.increment(); - } - } - } - - @Override - public void innerPostprocess() { - if (executorService != null) { - executorService.shutdownNow(); - } - executorService = null; - } - - private class ClassifierWorker implements Runnable { - public void run() { - while (!Thread.currentThread().isInterrupted()) { - TwitterMessage message; - try { - message = messages.take(); - } catch (InterruptedException ie) { - threadInterruptionCounter.increment(); - LOG.error("Interrupted exception polling from the queue", ie); - continue; - } - - // We want to emit even if we couldn't score the tweet. 
- classifyAndScore(message); - emitAndCount(message); - } - } - } -} - diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/TextUrlsFeatureExtractionStage.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/TextUrlsFeatureExtractionStage.docx new file mode 100644 index 000000000..aebf75024 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/TextUrlsFeatureExtractionStage.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/TextUrlsFeatureExtractionStage.java b/src/java/com/twitter/search/ingester/pipeline/twitter/TextUrlsFeatureExtractionStage.java deleted file mode 100644 index 3843223d8..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/TextUrlsFeatureExtractionStage.java +++ /dev/null @@ -1,53 +0,0 @@ -package com.twitter.search.ingester.pipeline.twitter; - -import org.apache.commons.pipeline.StageException; -import org.apache.commons.pipeline.validation.ConsumedTypes; -import org.apache.commons.pipeline.validation.ProducesConsumed; - -import com.twitter.search.common.relevance.classifiers.TweetOffensiveEvaluator; -import com.twitter.search.common.relevance.entities.TwitterMessage; -import com.twitter.search.common.relevance.scorers.TweetTextScorer; -import com.twitter.search.common.relevance.text.TweetParser; -import com.twitter.search.ingester.model.IngesterTwitterMessage; - -@ConsumedTypes(TwitterMessage.class) -@ProducesConsumed -public class TextUrlsFeatureExtractionStage extends TwitterBaseStage - { - private final TweetParser tweetParser = new TweetParser(); - private TweetOffensiveEvaluator offensiveEvaluator; - private final TweetTextScorer tweetTextScorer = new TweetTextScorer(null); - - @Override - protected void doInnerPreprocess() { - innerSetup(); - } - - @Override - protected void innerSetup() { - offensiveEvaluator = wireModule.getTweetOffensiveEvaluator(); - } - - @Override - public void innerProcess(Object obj) throws StageException { - if (!(obj 
instanceof IngesterTwitterMessage)) { - throw new StageException(this, "Object is not a TwitterMessage instance: " + obj); - } - - IngesterTwitterMessage message = IngesterTwitterMessage.class.cast(obj); - extract(message); - emitAndCount(message); - } - - private void extract(IngesterTwitterMessage message) { - tweetParser.parseUrls(message); - offensiveEvaluator.evaluate(message); - tweetTextScorer.scoreTweet(message); - } - - @Override - protected IngesterTwitterMessage innerRunStageV2(IngesterTwitterMessage message) { - extract(message); - return message; - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/ThriftTweetParserStage.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/ThriftTweetParserStage.docx new file mode 100644 index 000000000..7055af60a Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/ThriftTweetParserStage.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/ThriftTweetParserStage.java b/src/java/com/twitter/search/ingester/pipeline/twitter/ThriftTweetParserStage.java deleted file mode 100644 index 6a9d4369d..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/ThriftTweetParserStage.java +++ /dev/null @@ -1,178 +0,0 @@ -package com.twitter.search.ingester.pipeline.twitter; - -import java.util.List; -import java.util.Map; -import javax.annotation.Nonnull; -import javax.annotation.Nullable; -import javax.naming.NamingException; - -import com.google.common.base.Preconditions; -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; - -import org.apache.commons.pipeline.StageException; -import org.apache.commons.pipeline.validation.ConsumedTypes; -import org.apache.commons.pipeline.validation.ProducedTypes; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common_internal.text.version.PenguinVersion; -import com.twitter.search.common.debug.thriftjava.DebugEvents; -import 
com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.relevance.entities.TwitterMessage; -import com.twitter.search.common.schema.earlybird.EarlybirdCluster; -import com.twitter.search.ingester.model.IngesterTweetEvent; -import com.twitter.search.ingester.model.IngesterTwitterMessage; -import com.twitter.search.ingester.pipeline.twitter.thriftparse.ThriftTweetParsingException; -import com.twitter.search.ingester.pipeline.twitter.thriftparse.TweetEventParseHelper; -import com.twitter.tweetypie.thriftjava.TweetCreateEvent; -import com.twitter.tweetypie.thriftjava.TweetDeleteEvent; -import com.twitter.tweetypie.thriftjava.TweetEventData; - -@ConsumedTypes(IngesterTweetEvent.class) -@ProducedTypes(IngesterTwitterMessage.class) -public class ThriftTweetParserStage extends TwitterBaseStage { - private static final Logger LOG = LoggerFactory.getLogger(ThriftTweetParserStage.class); - - // TweetEventData is a union of all possible tweet event types. TweetEventData._Fields is an enum - // that corresponds to the fields in that union. So essentially, TweetEventData._Fields tells us - // which tweet event we're getting inside TweetEventData. We want to keep track of how many tweet - // events of each type we're getting. 
- private final Map tweetEventCounters = - Maps.newEnumMap(TweetEventData._Fields.class); - - private final List tweetCreateEventBranches = Lists.newArrayList(); - private final List tweetDeleteEventBranches = Lists.newArrayList(); - - private boolean shouldIndexProtectedTweets; - private SearchCounter totalEventsCount; - private SearchCounter thriftParsingErrorsCount; - - private List supportedPenguinVersions; - - @Override - protected void initStats() { - super.initStats(); - - for (TweetEventData._Fields field : TweetEventData._Fields.values()) { - tweetEventCounters.put( - field, - this.makeStageCounter(field.name().toLowerCase() + "_count")); - } - totalEventsCount = this.makeStageCounter("total_events_count"); - thriftParsingErrorsCount = this.makeStageCounter("thrift_parsing_errors_count"); - } - - @Override - protected void doInnerPreprocess() throws StageException, NamingException { - supportedPenguinVersions = wireModule.getPenguinVersions(); - LOG.info("Supported penguin versions: {}", supportedPenguinVersions); - - shouldIndexProtectedTweets = earlybirdCluster == EarlybirdCluster.PROTECTED - || earlybirdCluster == EarlybirdCluster.REALTIME_CG; - - Preconditions.checkState(!tweetDeleteEventBranches.isEmpty(), - "At least one delete branch must be specified."); - } - - @Override - public void innerProcess(Object obj) throws StageException { - if (!(obj instanceof TweetEventData || obj instanceof IngesterTweetEvent)) { - LOG.error("Object is not a TweetEventData or IngesterTweetEvent: {}", obj); - throw new StageException(this, "Object is not a TweetEventData or IngesterTweetEvent"); - } - - supportedPenguinVersions = wireModule.getCurrentlyEnabledPenguinVersions(); - - try { - IngesterTweetEvent ingesterTweetEvent = (IngesterTweetEvent) obj; - TweetEventData tweetEventData = ingesterTweetEvent.getData(); - DebugEvents debugEvents = ingesterTweetEvent.getDebugEvents(); - - // Determine if the message is a tweet delete event before the next stages mutate 
it. - IngesterTwitterMessage message = getTwitterMessage(tweetEventData, debugEvents); - boolean shouldEmitMessage = message != null - && message.isIndexable(shouldIndexProtectedTweets); - - if (shouldEmitMessage) { - if (!message.isDeleted()) { - emitAndCount(message); - - for (String tweetCreateEventBranch : tweetCreateEventBranches) { - // If we need to send the message to another branch, we need to make a copy. - // Otherwise, we'll have multiple stages mutating the same object in parallel. - IngesterTwitterMessage tweetCreateEventBranchMessage = - getTwitterMessage(tweetEventData, debugEvents); - emitToBranchAndCount(tweetCreateEventBranch, tweetCreateEventBranchMessage); - } - } else { - for (String tweetDeleteEventBranch : tweetDeleteEventBranches) { - // If we need to send the message to another branch, we need to make a copy. - // Otherwise, we'll have multiple stages mutating the same object in parallel. - IngesterTwitterMessage tweetDeleteEventBranchMessage = - getTwitterMessage(tweetEventData, debugEvents); - emitToBranchAndCount(tweetDeleteEventBranch, tweetDeleteEventBranchMessage); - } - } - } - } catch (ThriftTweetParsingException e) { - thriftParsingErrorsCount.increment(); - LOG.error("Failed to parse Thrift tweet event: " + obj, e); - throw new StageException(this, e); - } - } - - @Nullable - private IngesterTwitterMessage getTwitterMessage( - @Nonnull TweetEventData tweetEventData, - @Nullable DebugEvents debugEvents) - throws ThriftTweetParsingException { - totalEventsCount.increment(); - - // TweetEventData is a union of all possible tweet event types. TweetEventData._Fields is an - // enum that corresponds to all TweetEventData fields. By calling TweetEventData.getSetField(), - // we can determine which field is set. 
- TweetEventData._Fields tweetEventDataField = tweetEventData.getSetField(); - Preconditions.checkNotNull(tweetEventDataField); - tweetEventCounters.get(tweetEventDataField).increment(); - - if (tweetEventDataField == TweetEventData._Fields.TWEET_CREATE_EVENT) { - TweetCreateEvent tweetCreateEvent = tweetEventData.getTweet_create_event(); - return TweetEventParseHelper.getTwitterMessageFromCreationEvent( - tweetCreateEvent, supportedPenguinVersions, debugEvents); - } - if (tweetEventDataField == TweetEventData._Fields.TWEET_DELETE_EVENT) { - TweetDeleteEvent tweetDeleteEvent = tweetEventData.getTweet_delete_event(); - return TweetEventParseHelper.getTwitterMessageFromDeletionEvent( - tweetDeleteEvent, supportedPenguinVersions, debugEvents); - } - return null; - } - - /** - * Sets the branches to which all TweetDeleteEvents should be emitted. - * - * @param tweetDeleteEventBranchNames A comma-separated list of branches. - */ - public void setTweetDeleteEventBranchNames(String tweetDeleteEventBranchNames) { - parseBranches(tweetDeleteEventBranchNames, tweetDeleteEventBranches); - } - - /** - * Sets the additional branches to which all TweetCreateEvents should be emitted. - * - * @param tweetCreateEventBranchNames A comma-separated list of branches. 
- */ - public void setTweetCreateEventBranchNames(String tweetCreateEventBranchNames) { - parseBranches(tweetCreateEventBranchNames, tweetCreateEventBranches); - } - - private void parseBranches(String branchNames, List branches) { - branches.clear(); - for (String branch : branchNames.split(",")) { - String trimmedBranch = branch.trim(); - Preconditions.checkState(!trimmedBranch.isEmpty(), "Branches cannot be empty strings."); - branches.add(trimmedBranch); - } - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/ThriftVersionedEventsConverter.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/ThriftVersionedEventsConverter.docx new file mode 100644 index 000000000..f0857296c Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/ThriftVersionedEventsConverter.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/ThriftVersionedEventsConverter.java b/src/java/com/twitter/search/ingester/pipeline/twitter/ThriftVersionedEventsConverter.java deleted file mode 100644 index 1cb21e188..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/ThriftVersionedEventsConverter.java +++ /dev/null @@ -1,132 +0,0 @@ -package com.twitter.search.ingester.pipeline.twitter; - -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -import com.google.common.collect.Lists; - -import com.twitter.common_internal.text.version.PenguinVersion; -import com.twitter.search.common.debug.thriftjava.DebugEvents; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants; -import com.twitter.search.common.schema.thriftjava.ThriftDocument; -import com.twitter.search.common.schema.thriftjava.ThriftField; -import com.twitter.search.common.schema.thriftjava.ThriftFieldData; -import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent; -import com.twitter.search.common.schema.thriftjava.ThriftIndexingEventType; -import 
com.twitter.search.ingester.model.IngesterThriftVersionedEvents; - -/** - * Converter for {@code ThriftVersionedEvents}. - * - */ -public class ThriftVersionedEventsConverter { - private static final long UNUSED_USER_ID = -1L; - - private Iterable penguinVersions; - - public ThriftVersionedEventsConverter(Iterable penguinVersions) { - this.penguinVersions = penguinVersions; - } - - /** - * Creates a DELETE IngesterThriftVersionedEvents instance for the given tweet ID and user ID. - * - * @param tweetId The tweet ID. - * @param userId The user ID. - * @param debugEvents The DebugEvents to propagate to the returned IngesterThriftVersionedEvents - * instance. - * @return A DELETE IngesterThriftVersionedEvents instance with the given tweet and user IDs. - */ - public IngesterThriftVersionedEvents toDelete( - long tweetId, long userId, DebugEvents debugEvents) { - ThriftIndexingEvent thriftIndexingEvent = new ThriftIndexingEvent() - .setEventType(ThriftIndexingEventType.DELETE) - .setUid(tweetId); - return toThriftVersionedEvents(tweetId, userId, thriftIndexingEvent, debugEvents); - } - - /** - * Creates an OUT_OF_ORDER_APPEND IngesterThriftVersionedEvents instance for the given tweet ID - * and the given value for the given field. - * - * @param tweetId The tweet ID. - * @param field The updated field. - * @param value The new field value. - * @param debugEvents The DebugEvents to propagate to the returned IngesterThriftVersionedEvents - * instance. - * @return An OUT_OF_ORDER_APPEND IngesterThriftVersionedEvents instance with the given tweet ID - * and value for the field. 
- */ - public IngesterThriftVersionedEvents toOutOfOrderAppend( - long tweetId, - EarlybirdFieldConstants.EarlybirdFieldConstant field, - long value, - DebugEvents debugEvents) { - ThriftField updateField = new ThriftField() - .setFieldConfigId(field.getFieldId()) - .setFieldData(new ThriftFieldData().setLongValue(value)); - ThriftDocument document = new ThriftDocument() - .setFields(Lists.newArrayList(updateField)); - ThriftIndexingEvent thriftIndexingEvent = new ThriftIndexingEvent() - .setEventType(ThriftIndexingEventType.OUT_OF_ORDER_APPEND) - .setUid(tweetId) - .setDocument(document); - return toThriftVersionedEvents(tweetId, UNUSED_USER_ID, thriftIndexingEvent, debugEvents); - } - - - /** - * Creates a PARTIAL_UPDATE IngesterThriftVersionedEvents instance for the given tweet ID and the - * given value for the given feature. - * - * @param tweetId The tweet ID. - * @param feature The updated feature. - * @param value The new feature value. - * @param debugEvents The DebugEvents to propagate to the returned IngesterThriftVersionedEvents - * instance. - * @return A PARTIAL_UPDATE IngesterThriftVersionedEvents instance with the given tweet ID and - * value for the feature. - */ - public IngesterThriftVersionedEvents toPartialUpdate( - long tweetId, - EarlybirdFieldConstants.EarlybirdFieldConstant feature, - int value, - DebugEvents debugEvents) { - ThriftField updateField = new ThriftField() - .setFieldConfigId(feature.getFieldId()) - .setFieldData(new ThriftFieldData().setIntValue(value)); - ThriftDocument document = new ThriftDocument() - .setFields(Lists.newArrayList(updateField)); - ThriftIndexingEvent thriftIndexingEvent = new ThriftIndexingEvent() - .setEventType(ThriftIndexingEventType.PARTIAL_UPDATE) - .setUid(tweetId) - .setDocument(document); - return toThriftVersionedEvents(tweetId, UNUSED_USER_ID, thriftIndexingEvent, debugEvents); - } - - // Wraps the given ThriftIndexingEvent into a ThriftVersionedEvents instance. 
- private IngesterThriftVersionedEvents toThriftVersionedEvents( - long tweetId, long userId, ThriftIndexingEvent thriftIndexingEvent, DebugEvents debugEvents) { - if (!thriftIndexingEvent.isSetCreateTimeMillis() - && (debugEvents != null) - && debugEvents.isSetCreatedAt()) { - thriftIndexingEvent.setCreateTimeMillis(debugEvents.getCreatedAt().getEventTimestampMillis()); - } - - Map versionedEvents = new HashMap<>(); - for (PenguinVersion penguinVersion : penguinVersions) { - versionedEvents.put(penguinVersion.getByteValue(), thriftIndexingEvent); - } - - IngesterThriftVersionedEvents events = - new IngesterThriftVersionedEvents(userId, versionedEvents); - events.setId(tweetId); - events.setDebugEvents(debugEvents); - return events; - } - - public void updatePenguinVersions(List updatePenguinVersions) { - penguinVersions = updatePenguinVersions; - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/TweetEventDeserializerStage.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/TweetEventDeserializerStage.docx new file mode 100644 index 000000000..93c6d6393 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/TweetEventDeserializerStage.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/TweetEventDeserializerStage.java b/src/java/com/twitter/search/ingester/pipeline/twitter/TweetEventDeserializerStage.java deleted file mode 100644 index 96d7c2018..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/TweetEventDeserializerStage.java +++ /dev/null @@ -1,137 +0,0 @@ -package com.twitter.search.ingester.pipeline.twitter; -import com.google.common.annotations.VisibleForTesting; -import org.apache.commons.pipeline.StageException; -import org.apache.commons.pipeline.validation.ConsumedTypes; -import org.apache.commons.pipeline.validation.ProducedTypes; -import org.apache.thrift.TDeserializer; -import org.apache.thrift.TException; -import org.slf4j.Logger; -import 
org.slf4j.LoggerFactory; -import com.twitter.search.common.debug.DebugEventUtil; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.ingester.model.IngesterTweetEvent; -import com.twitter.search.ingester.model.KafkaRawRecord; -import com.twitter.search.ingester.pipeline.util.PipelineStageRuntimeException; - -/** - * Deserializes {@link KafkaRawRecord} into IngesterTweetEvent and emits those. - */ -@ConsumedTypes(KafkaRawRecord.class) -@ProducedTypes(IngesterTweetEvent.class) -public class TweetEventDeserializerStage extends TwitterBaseStage - { - private static final Logger LOG = LoggerFactory.getLogger(TweetEventDeserializerStage.class); - - // Limit how much the logs get polluted - private static final int MAX_OOM_SERIALIZED_BYTES_LOGGED = 5000; - private static final char[] HEX_ARRAY = "0123456789ABCDEF".toCharArray(); - - private final TDeserializer deserializer = new TDeserializer(); - - private SearchCounter outOfMemoryErrors; - private SearchCounter outOfMemoryErrors2; - private SearchCounter totalEventsCount; - private SearchCounter validEventsCount; - private SearchCounter deserializationErrorsCount; - - @Override - public void initStats() { - super.initStats(); - innerSetupStats(); - } - - @Override - protected void innerSetupStats() { - outOfMemoryErrors = SearchCounter.export(getStageNamePrefix() + "_out_of_memory_errors"); - outOfMemoryErrors2 = SearchCounter.export(getStageNamePrefix() + "_out_of_memory_errors_2"); - totalEventsCount = SearchCounter.export(getStageNamePrefix() + "_total_events_count"); - validEventsCount = SearchCounter.export(getStageNamePrefix() + "_valid_events_count"); - deserializationErrorsCount = - SearchCounter.export(getStageNamePrefix() + "_deserialization_errors_count"); - } - - @Override - public void innerProcess(Object obj) throws StageException { - if (!(obj instanceof KafkaRawRecord)) { - throw new StageException(this, "Object is not a KafkaRawRecord: " + obj); - } - - KafkaRawRecord 
kafkaRecord = (KafkaRawRecord) obj; - IngesterTweetEvent tweetEvent = tryDeserializeRecord(kafkaRecord); - - if (tweetEvent != null) { - emitAndCount(tweetEvent); - } - } - - @Override - protected IngesterTweetEvent innerRunStageV2(KafkaRawRecord kafkaRawRecord) { - IngesterTweetEvent ingesterTweetEvent = tryDeserializeRecord(kafkaRawRecord); - if (ingesterTweetEvent == null) { - throw new PipelineStageRuntimeException("failed to deserialize KafkaRawRecord : " - + kafkaRawRecord); - } - return ingesterTweetEvent; - } - - private IngesterTweetEvent tryDeserializeRecord(KafkaRawRecord kafkaRecord) { - try { - totalEventsCount.increment(); - IngesterTweetEvent tweetEvent = deserialize(kafkaRecord); - validEventsCount.increment(); - return tweetEvent; - } catch (OutOfMemoryError e) { - try { - outOfMemoryErrors.increment(); - byte[] bytes = kafkaRecord.getData(); - int limit = Math.min(bytes.length, MAX_OOM_SERIALIZED_BYTES_LOGGED); - StringBuilder sb = new StringBuilder(2 * limit + 100) - .append("OutOfMemoryError deserializing ").append(bytes.length).append(" bytes: "); - appendBytesAsHex(sb, bytes, MAX_OOM_SERIALIZED_BYTES_LOGGED); - LOG.error(sb.toString(), e); - } catch (OutOfMemoryError e2) { - outOfMemoryErrors2.increment(); - } - } - - return null; - - } - - private IngesterTweetEvent deserialize(KafkaRawRecord kafkaRecord) { - try { - IngesterTweetEvent ingesterTweetEvent = new IngesterTweetEvent(); - synchronized (this) { - deserializer.deserialize(ingesterTweetEvent, kafkaRecord.getData()); - } - // Record the created_at time and then we first saw this tweet in the ingester for tracking - // down the ingestion pipeline. 
- addDebugEventsToIncomingTweet(ingesterTweetEvent, kafkaRecord.getReadAtTimestampMs()); - return ingesterTweetEvent; - } catch (TException e) { - LOG.error("Unable to deserialize TweetEventData", e); - deserializationErrorsCount.increment(); - } - return null; - } - - private void addDebugEventsToIncomingTweet( - IngesterTweetEvent ingesterTweetEvent, long readAtTimestampMs) { - DebugEventUtil.setCreatedAtDebugEvent( - ingesterTweetEvent, ingesterTweetEvent.getFlags().getTimestamp_ms()); - DebugEventUtil.setProcessingStartedAtDebugEvent(ingesterTweetEvent, readAtTimestampMs); - - // The TweetEventDeserializerStage takes in a byte[] representation of a tweet, so debug events - // are not automatically appended by TwitterBaseStage. We do that explicitly here. - DebugEventUtil.addDebugEvent(ingesterTweetEvent, getFullStageName(), clock.nowMillis()); - } - - @VisibleForTesting - static void appendBytesAsHex(StringBuilder sb, byte[] bytes, int maxLength) { - int limit = Math.min(bytes.length, maxLength); - for (int j = 0; j < limit; j++) { - sb.append(HEX_ARRAY[(bytes[j] >>> 4) & 0x0F]); - sb.append(HEX_ARRAY[bytes[j] & 0x0F]); - } - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/TwitterBaseStage.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/TwitterBaseStage.docx new file mode 100644 index 000000000..a0e3ccba5 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/TwitterBaseStage.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/TwitterBaseStage.java b/src/java/com/twitter/search/ingester/pipeline/twitter/TwitterBaseStage.java deleted file mode 100644 index ba3787bd0..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/TwitterBaseStage.java +++ /dev/null @@ -1,360 +0,0 @@ -package com.twitter.search.ingester.pipeline.twitter; - -import java.util.Arrays; -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import 
java.util.Optional; -import java.util.concurrent.ConcurrentMap; -import java.util.concurrent.TimeUnit; - -import javax.naming.NamingException; - -import com.google.common.base.Preconditions; -import com.google.common.collect.Maps; - -import org.apache.commons.lang.StringUtils; -import org.apache.commons.pipeline.StageException; -import org.apache.commons.pipeline.stage.InstrumentedBaseStage; - -import com.twitter.common.metrics.Metrics; -import com.twitter.common.util.Clock; -import com.twitter.decider.Decider; -import com.twitter.search.common.debug.DebugEventAccumulator; -import com.twitter.search.common.debug.DebugEventUtil; -import com.twitter.search.common.decider.DeciderUtil; -import com.twitter.search.common.metrics.Percentile; -import com.twitter.search.common.metrics.PercentileUtil; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchCustomGauge; -import com.twitter.search.common.metrics.SearchLongGauge; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.common.metrics.SearchTimerStats; -import com.twitter.search.common.schema.earlybird.EarlybirdCluster; -import com.twitter.search.ingester.pipeline.util.PipelineStageException; -import com.twitter.search.ingester.pipeline.util.PipelineStageRuntimeException; -import com.twitter.search.ingester.pipeline.wire.WireModule; - -/** - * Common functionality for all stages. - */ -public class TwitterBaseStage extends InstrumentedBaseStage { - // Currently, all stages run in separate threads, so we could use simple maps here. - // However, it seems safer to use concurrent maps, in case we ever change our stage set up. - // The performance impact should be negligible. 
- private final ConcurrentMap, SearchRateCounter> branchEmitObjectsRateCounters = - Maps.newConcurrentMap(); - private final ConcurrentMap, SearchRateCounter> - branchEmitBatchObjectsRateCounters = Maps.newConcurrentMap(); - - private String stageNamePrefix = null; - - protected WireModule wireModule; - protected Decider decider; - protected Clock clock; - protected EarlybirdCluster earlybirdCluster; - - private String fullStageName = null; - private Percentile processPercentile = null; - private SearchTimerStats processTimerStats = null; - private SearchRateCounter droppedItems = null; - private SearchLongGauge stageExceptions = null; - - private SearchRateCounter incomingBatchesRateCounter; - private SearchRateCounter incomingBatchObjectsRateCounter; - - private List passThroughToBranches = Collections.emptyList(); - private List additionalEmitToBranches = Collections.emptyList(); - - private boolean passThroughDownstream = false; - private boolean emitDownstream = true; - - private String dropItemsDeciderKey; - - // From XML config. - public void setPassThroughToBranches(String passThroughToBranchesString) { - // This is a comma-delimited string which is a list of branches to which we just - // pass through the incoming object without any processing/filtering. - this.passThroughToBranches = Arrays.asList(passThroughToBranchesString.split(",")); - } - - // From XML config. - public void setAdditionalEmitToBranches(String emitToBranchesString) { - // This is a comma-delimited string which is a list of branches to which we - // will emit when we call actuallyEmitAndCount(obj). - this.additionalEmitToBranches = Arrays.asList(emitToBranchesString.split(",")); - } - - // From XML config. - public void setPassThroughDownstream(boolean passThroughDownstream) { - // If true, we emit the raw object downstream - this.passThroughDownstream = passThroughDownstream; - } - - // From XML config. 
- public void setEmitDownstream(boolean emitDownstream) { - // If true, we emit the processed object downstream. - this.emitDownstream = emitDownstream; - } - - @Override - public final void innerPreprocess() throws StageException { - try { - setupEssentialObjects(); - doInnerPreprocess(); - } catch (NamingException e) { - throw new StageException(this, "Failed to initialize stage.", e); - } - } - - /*** - * Sets up all necessary objects for this stage of the Pipeline. Previously, this task was done - * by the preprocess() method provided by the ACP library. - * @throws PipelineStageException - */ - public void setupStageV2() throws PipelineStageException { - try { - setupCommonStats(); - innerSetupStats(); - setupEssentialObjects(); - innerSetup(); - } catch (NamingException e) { - throw new PipelineStageException(this, "Failed to initialize stage", e); - } - } - - protected void innerSetup() throws PipelineStageException, NamingException { } - - /*** - * Takes in an argument of type T, processes it and returns an argument of Type R. This is the - * main method of a pipeline stage. - */ - public R runStageV2(T arg) { - long startingTime = startProcessing(); - R processed = innerRunStageV2(arg); - endProcessing(startingTime); - return processed; - } - - /*** - * Takes in an argument of type T, processes it and pushes the processed element to some place. - * This method does not return anything as any time this method is called on a stage, it means - * there is no stage after this one. An example stage is any KafkaProducerStage. - */ - public void runFinalStageOfBranchV2(T arg) { - long startingTime = startProcessing(); - innerRunFinalStageOfBranchV2(arg); - endProcessing(startingTime); - } - - protected R innerRunStageV2(T arg) { - return null; - } - - protected void innerRunFinalStageOfBranchV2(T arg) { } - - /*** - * called at the end of a pipeline. Cleans up all resources of the stage. 
- */ - public void cleanupStageV2() { } - - private void setupEssentialObjects() throws NamingException { - wireModule = WireModule.getWireModule(); - decider = wireModule.getDecider(); - clock = wireModule.getClock(); - earlybirdCluster = wireModule.getEarlybirdCluster(); - dropItemsDeciderKey = - "drop_items_" + earlybirdCluster.getNameForStats() + "_" + fullStageName; - } - - protected void doInnerPreprocess() throws StageException, NamingException { } - - @Override - protected void initStats() { - super.initStats(); - setupCommonStats(); - // Export stage timers - SearchCustomGauge.export(stageNamePrefix + "_queue_size", - () -> Optional.ofNullable(getQueueSizeAverage()).orElse(0.0)); - SearchCustomGauge.export(stageNamePrefix + "_queue_percentage_full", - () -> Optional.ofNullable(getQueuePercentFull()).orElse(0.0)); - - // This only called once on startup - // In some unit tests, getQueueCapacity can return null. Hence this guard is added. - // getQueueCapacity() does not return null here in prod. - SearchLongGauge.export(stageNamePrefix + "_queue_capacity") - .set(getQueueCapacity() == null ? 0 : getQueueCapacity()); - } - - private void setupCommonStats() { - // If the stage is instantiated only once, the class name is used for stats export - // If the stage is instantiated multiple times, the "stageName" specified in the - // pipeline definition xml file is also included. 
- if (StringUtils.isBlank(this.getStageName())) { - fullStageName = this.getClass().getSimpleName(); - } else { - fullStageName = String.format( - "%s_%s", - this.getClass().getSimpleName(), - this.getStageName()); - } - - stageNamePrefix = Metrics.normalizeName(fullStageName).toLowerCase(); - - droppedItems = SearchRateCounter.export(stageNamePrefix + "_dropped_messages"); - stageExceptions = SearchLongGauge.export(stageNamePrefix + "_stage_exceptions"); - - processTimerStats = SearchTimerStats.export(stageNamePrefix, TimeUnit.NANOSECONDS, - true); - processPercentile = PercentileUtil.createPercentile(stageNamePrefix); - - incomingBatchesRateCounter = SearchRateCounter.export(stageNamePrefix + "_incoming_batches"); - incomingBatchObjectsRateCounter = - SearchRateCounter.export(stageNamePrefix + "_incoming_batch_objects"); - } - - protected void innerSetupStats() { - - } - - protected SearchCounter makeStageCounter(String counterName) { - return SearchCounter.export(getStageNamePrefix() + "_" + counterName); - } - - private SearchRateCounter getEmitObjectsRateCounterFor(Optional maybeBranch) { - return getRateCounterFor(maybeBranch, "emit_objects", branchEmitObjectsRateCounters); - } - - private SearchRateCounter getEmitBatchObjectsRateCounterFor(Optional maybeBranch) { - return getRateCounterFor(maybeBranch, "emit_batch_objects", branchEmitBatchObjectsRateCounters); - } - - private SearchRateCounter getRateCounterFor( - Optional maybeBranch, - String statSuffix, - ConcurrentMap, SearchRateCounter> rateCountersMap) { - SearchRateCounter rateCounter = rateCountersMap.get(maybeBranch); - if (rateCounter == null) { - String branchSuffix = maybeBranch.map(b -> "_" + b.toLowerCase()).orElse(""); - rateCounter = SearchRateCounter.export(stageNamePrefix + branchSuffix + "_" + statSuffix); - SearchRateCounter existingRateCounter = rateCountersMap.putIfAbsent(maybeBranch, rateCounter); - if (existingRateCounter != null) { - Preconditions.checkState( - existingRateCounter == 
rateCounter, - "SearchRateCounter.export() should always return the same stat instance."); - } - } - return rateCounter; - } - - public String getStageNamePrefix() { - return stageNamePrefix; - } - - public String getFullStageName() { - return fullStageName; - } - - @Override - public void process(Object obj) throws StageException { - long startTime = System.nanoTime(); - try { - // this needs to be updated before calling super.process() so that innerProcess can actually - // use the updated incoming rates - updateIncomingBatchStats(obj); - // Track timing events for when tweets enter each stage. - captureStageDebugEvents(obj); - - if (DeciderUtil.isAvailableForRandomRecipient(decider, dropItemsDeciderKey)) { - droppedItems.increment(); - return; - } - - super.process(obj); - - // Now emit the object raw to wherever we need to - emitToPassThroughBranches(obj); - } finally { - long processTime = System.nanoTime() - startTime; - processTimerStats.timerIncrement(processTime); - processPercentile.record(processTime); - stageExceptions.set(stats.getExceptionCount()); - } - } - - protected long startProcessing() { - long startingTime = System.nanoTime(); - checkIfObjectShouldBeEmittedOrThrowRuntimeException(); - return startingTime; - } - - protected void endProcessing(long startingTime) { - long processTime = System.nanoTime() - startingTime; - processTimerStats.timerIncrement(processTime); - processPercentile.record(processTime); - } - - private void checkIfObjectShouldBeEmittedOrThrowRuntimeException() { - if (DeciderUtil.isAvailableForRandomRecipient(decider, dropItemsDeciderKey)) { - droppedItems.increment(); - throw new PipelineStageRuntimeException("Object does not have to be processed and passed" - + " to the next stage"); - } - } - - private void emitToPassThroughBranches(Object obj) { - for (String branch : passThroughToBranches) { - actuallyEmitAndCount(Optional.of(branch), obj); - } - if (passThroughDownstream) { - actuallyEmitAndCount(Optional.empty(), obj); 
- } - } - - private void updateIncomingBatchStats(Object obj) { - incomingBatchesRateCounter.increment(); - incomingBatchObjectsRateCounter.increment(getBatchSizeForStats(obj)); - } - - protected void captureStageDebugEvents(Object obj) { - if (obj instanceof DebugEventAccumulator) { - DebugEventUtil.addDebugEvent( - (DebugEventAccumulator) obj, getFullStageName(), clock.nowMillis()); - } else if (obj instanceof Collection) { - DebugEventUtil.addDebugEventToCollection( - (Collection) obj, getFullStageName(), clock.nowMillis()); - } else { - SearchCounter debugEventsNotSupportedCounter = SearchCounter.export( - stageNamePrefix + "_debug_events_not_supported_for_" + obj.getClass()); - debugEventsNotSupportedCounter.increment(); - } - } - - protected int getBatchSizeForStats(Object obj) { - return (obj instanceof Collection) ? ((Collection) obj).size() : 1; - } - - protected void emitAndCount(Object obj) { - for (String branch : additionalEmitToBranches) { - actuallyEmitAndCount(Optional.of(branch), obj); - } - if (emitDownstream) { - actuallyEmitAndCount(Optional.empty(), obj); - } - } - - protected void emitToBranchAndCount(String branch, Object obj) { - actuallyEmitAndCount(Optional.of(branch), obj); - } - - // If the branch is none, emit downstream - private void actuallyEmitAndCount(Optional maybeBranch, Object obj) { - if (maybeBranch.isPresent()) { - emit(maybeBranch.get(), obj); - } else { - emit(obj); - } - getEmitObjectsRateCounterFor(maybeBranch).increment(); - getEmitBatchObjectsRateCounterFor(maybeBranch).increment(getBatchSizeForStats(obj)); - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/TwitterBatchedBaseStage.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/TwitterBatchedBaseStage.docx new file mode 100644 index 000000000..c557a9d4d Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/TwitterBatchedBaseStage.docx differ diff --git 
a/src/java/com/twitter/search/ingester/pipeline/twitter/TwitterBatchedBaseStage.java b/src/java/com/twitter/search/ingester/pipeline/twitter/TwitterBatchedBaseStage.java deleted file mode 100644 index fda5b6166..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/TwitterBatchedBaseStage.java +++ /dev/null @@ -1,309 +0,0 @@ -package com.twitter.search.ingester.pipeline.twitter; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Iterator; -import java.util.Optional; -import java.util.Queue; -import java.util.concurrent.CompletableFuture; -import java.util.concurrent.TimeUnit; -import java.util.stream.Collectors; -import javax.naming.NamingException; - -import scala.runtime.BoxedUnit; - -import com.google.common.collect.Lists; -import com.google.common.collect.Queues; - -import org.apache.commons.pipeline.StageException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.metrics.SearchCustomGauge; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.common.metrics.SearchTimerStats; -import com.twitter.search.ingester.pipeline.util.BatchedElement; -import com.twitter.search.ingester.pipeline.util.PipelineStageException; -import com.twitter.util.Function; -import com.twitter.util.Future; - -public abstract class TwitterBatchedBaseStage extends - TwitterBaseStage> { - private static final Logger LOG = LoggerFactory.getLogger(TwitterBatchedBaseStage.class); - - protected final Queue> queue = - Queues.newLinkedBlockingQueue(MAX_BATCHING_QUEUE_SIZE); - - private int batchedStageBatchSize = 100; - private int forceProcessAfterMs = 500; - - private long lastProcessingTime; - - private SearchRateCounter timeBasedQueueFlush; - private SearchRateCounter sizeBasedQueueFlush; - private SearchRateCounter eventsFailed; - private SearchRateCounter numberOfCallsToNextBatchIfReady; - private SearchTimerStats batchExecutionTime; - private SearchTimerStats 
batchFailedExecutionTime; - private SearchRateCounter validElements; - private SearchRateCounter batchedElements; - private SearchRateCounter emittedElements; - private static final int MAX_BATCHING_QUEUE_SIZE = 10000; - - // force the implementing class to set type correctly to avoid catching issues at runtime - protected abstract Class getQueueObjectType(); - - // up to the developer on how each batch is processed. - protected abstract Future> innerProcessBatch(Collection> - batch); - - // classes that need to update their batch e.g after a decider change - // can override this - protected void updateBatchSize() { - } - - protected Collection extractOnlyElementsFromBatch(Collection> batch) { - Collection elementsOnly = new ArrayList<>(); - - for (BatchedElement batchedElement : batch) { - elementsOnly.add(batchedElement.getItem()); - } - return elementsOnly; - } - /** - * This function is used to filter the elements that we want to batch. - * e.g. if a tweet has urls batch it to resolve the urls, if it doesn't contain urls - * do not batch. - * - * @param element to be evaluated - */ - protected abstract boolean needsToBeBatched(T element); - - /** - * Tranform from type T to U element. 
- * T and U might be different types so this function will help with the transformation - * if the incoming T element is filtered out and is bypass directly to the next stage - * that takes incoming objects of type U - * - * @param element incoming element - */ - protected abstract R transform(T element); - - protected void reEnqueueAndRetry(BatchedElement batchedElement) { - queue.add(batchedElement); - } - - @Override - protected void initStats() { - super.initStats(); - commonInnerSetupStats(); - } - - private void commonInnerSetupStats() { - timeBasedQueueFlush = SearchRateCounter.export(getStageNamePrefix() - + "_time_based_queue_flush"); - sizeBasedQueueFlush = SearchRateCounter.export(getStageNamePrefix() - + "_size_based_queue_flush"); - batchExecutionTime = SearchTimerStats.export(getStageNamePrefix() - + "_batch_execution_time", TimeUnit.MILLISECONDS, false, true); - batchFailedExecutionTime = SearchTimerStats.export(getStageNamePrefix() - + "_batch_failed_execution_time", TimeUnit.MILLISECONDS, false, true); - eventsFailed = SearchRateCounter.export(getStageNamePrefix() + "_events_dropped"); - SearchCustomGauge.export(getStageNamePrefix() + "_batched_stage_queue_size", queue::size); - numberOfCallsToNextBatchIfReady = SearchRateCounter.export(getStageNamePrefix() - + "_calls_to_nextBatchIfReady"); - validElements = SearchRateCounter.export(getStageNamePrefix() + "_valid_elements"); - batchedElements = SearchRateCounter.export(getStageNamePrefix() + "_batched_elements"); - emittedElements = SearchRateCounter.export(getStageNamePrefix() + "_emitted_elements"); - } - - @Override - protected void innerSetupStats() { - commonInnerSetupStats(); - } - - // return a possible batch of elements to process. 
If we have enough for one batch - protected Optional>> nextBatchIfReady() { - numberOfCallsToNextBatchIfReady.increment(); - Optional>> batch = Optional.empty(); - - if (!queue.isEmpty()) { - long elapsed = clock.nowMillis() - lastProcessingTime; - if (elapsed > forceProcessAfterMs) { - batch = Optional.of(Lists.newArrayList(queue)); - timeBasedQueueFlush.increment(); - queue.clear(); - } else if (queue.size() >= batchedStageBatchSize) { - batch = Optional.of(queue.stream() - .limit(batchedStageBatchSize) - .map(element -> queue.remove()) - .collect(Collectors.toList())); - sizeBasedQueueFlush.increment(); - } - } - return batch; - } - - @Override - public void innerProcess(Object obj) throws StageException { - T element; - if (getQueueObjectType().isInstance(obj)) { - element = getQueueObjectType().cast(obj); - } else { - throw new StageException(this, "Trying to add an object of the wrong type to a queue. " - + getQueueObjectType().getSimpleName() - + " is the expected type"); - } - - if (!tryToAddElementToBatch(element)) { - emitAndCount(transform(element)); - } - - tryToSendBatchedRequest(); - } - - @Override - protected CompletableFuture innerRunStageV2(T element) { - CompletableFuture completableFuture = new CompletableFuture<>(); - if (!tryToAddElementToBatch(element, completableFuture)) { - completableFuture.complete(transform(element)); - } - - tryToSendBatchedRequestV2(); - - return completableFuture; - } - - private boolean tryToAddElementToBatch(T element, CompletableFuture cf) { - boolean needsToBeBatched = needsToBeBatched(element); - if (needsToBeBatched) { - queue.add(new BatchedElement<>(element, cf)); - } - - return needsToBeBatched; - } - - private boolean tryToAddElementToBatch(T element) { - return tryToAddElementToBatch(element, CompletableFuture.completedFuture(null)); - } - - private void tryToSendBatchedRequest() { - Optional>> maybeToProcess = nextBatchIfReady(); - if (maybeToProcess.isPresent()) { - Collection> batch = 
maybeToProcess.get(); - lastProcessingTime = clock.nowMillis(); - processBatch(batch, getOnSuccessFunction(lastProcessingTime), - getOnFailureFunction(batch, lastProcessingTime)); - } - } - - private void tryToSendBatchedRequestV2() { - Optional>> maybeToProcess = nextBatchIfReady(); - if (maybeToProcess.isPresent()) { - Collection> batch = maybeToProcess.get(); - lastProcessingTime = clock.nowMillis(); - processBatch(batch, getOnSuccessFunctionV2(batch, lastProcessingTime), - getOnFailureFunctionV2(batch, lastProcessingTime)); - } - } - - private void processBatch(Collection> batch, - Function, BoxedUnit> onSuccess, - Function onFailure) { - updateBatchSize(); - - Future> futureComputation = innerProcessBatch(batch); - - futureComputation.onSuccess(onSuccess); - - futureComputation.onFailure(onFailure); - } - - private Function, BoxedUnit> getOnSuccessFunction(long started) { - return Function.cons((elements) -> { - elements.forEach(this::emitAndCount); - batchExecutionTime.timerIncrement(clock.nowMillis() - started); - }); - } - - private Function, BoxedUnit> getOnSuccessFunctionV2(Collection> - batch, long started) { - return Function.cons((elements) -> { - Iterator> iterator = batch.iterator(); - for (R element : elements) { - if (iterator.hasNext()) { - iterator.next().getCompletableFuture().complete(element); - } else { - LOG.error("Getting Response from Batched Request, but no CompleteableFuture object" - + " to complete."); - } - } - batchExecutionTime.timerIncrement(clock.nowMillis() - started); - - }); - } - - private Function getOnFailureFunction(Collection> - batch, long started) { - return Function.cons((throwable) -> { - batch.forEach(batchedElement -> { - eventsFailed.increment(); - // pass the tweet event down better to index an incomplete event than nothing at all - emitAndCount(transform(batchedElement.getItem())); - }); - batchFailedExecutionTime.timerIncrement(clock.nowMillis() - started); - LOG.error("Failed processing batch", throwable); - }); 
- } - - private Function getOnFailureFunctionV2(Collection> - batch, long started) { - return Function.cons((throwable) -> { - batch.forEach(batchedElement -> { - eventsFailed.increment(); - R itemTransformed = transform(batchedElement.getItem()); - // complete the future, its better to index an incomplete event than nothing at all - batchedElement.getCompletableFuture().complete(itemTransformed); - }); - batchFailedExecutionTime.timerIncrement(clock.nowMillis() - started); - LOG.error("Failed processing batch", throwable); - }); - } - - @Override - protected void doInnerPreprocess() throws StageException, NamingException { - try { - commonInnerSetup(); - } catch (PipelineStageException e) { - throw new StageException(this, e); - } - } - - private void commonInnerSetup() throws PipelineStageException, NamingException { - updateBatchSize(); - - if (batchedStageBatchSize < 1) { - throw new PipelineStageException(this, - "Batch size must be set at least to 1 for batched stages but is set to" - + batchedStageBatchSize); - } - - if (forceProcessAfterMs < 1) { - throw new PipelineStageException(this, "forceProcessAfterMs needs to be at least 1 " - + "ms but is set to " + forceProcessAfterMs); - } - } - - @Override - protected void innerSetup() throws PipelineStageException, NamingException { - commonInnerSetup(); - } - - // Setters for configuration parameters - public void setBatchedStageBatchSize(int maxElementsToWaitFor) { - this.batchedStageBatchSize = maxElementsToWaitFor; - } - - public void setForceProcessAfter(int forceProcessAfterMS) { - this.forceProcessAfterMs = forceProcessAfterMS; - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/filters/BUILD b/src/java/com/twitter/search/ingester/pipeline/twitter/filters/BUILD deleted file mode 100644 index e5349558e..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/filters/BUILD +++ /dev/null @@ -1,13 +0,0 @@ -java_library( - sources = ["*.java"], - compiler_option_sets = 
["fatal_warnings"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/code/findbugs:jsr305", - "decider/src/main/scala", - "src/java/com/twitter/search/common/decider", - "src/java/com/twitter/search/common/relevance:entities_and_filters", - "util/util-core:scala", - ], -) diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/filters/BUILD.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/filters/BUILD.docx new file mode 100644 index 000000000..c9b0a63d6 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/filters/BUILD.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/filters/IngesterValidMessageFilter.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/filters/IngesterValidMessageFilter.docx new file mode 100644 index 000000000..7f53d290f Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/filters/IngesterValidMessageFilter.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/filters/IngesterValidMessageFilter.java b/src/java/com/twitter/search/ingester/pipeline/twitter/filters/IngesterValidMessageFilter.java deleted file mode 100644 index 8f32521a4..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/filters/IngesterValidMessageFilter.java +++ /dev/null @@ -1,50 +0,0 @@ -package com.twitter.search.ingester.pipeline.twitter.filters; - -import java.util.EnumSet; -import java.util.Set; - -import com.twitter.decider.Decider; -import com.twitter.search.common.decider.DeciderUtil; -import com.twitter.search.common.relevance.entities.TwitterMessage; -import com.twitter.search.common.relevance.entities.TwitterMessageUtil; - -public class IngesterValidMessageFilter { - public static final String KEEP_NULLCAST_DECIDER_KEY = - "ingester_all_keep_nullcasts"; - public static final String STRIP_SUPPLEMENTARY_EMOJIS_DECIDER_KEY_PREFIX = - 
"valid_message_filter_strip_supplementary_emojis_"; - - protected final Decider decider; - - public IngesterValidMessageFilter(Decider decider) { - this.decider = decider; - } - - /** - * Evaluate a message to see if it matches the filter or not. - * - * @param message to evaluate - * @return true if this message should be emitted. - */ - public boolean accepts(TwitterMessage message) { - return TwitterMessageUtil.validateTwitterMessage( - message, getStripEmojisFields(), acceptNullcast()); - } - - private Set getStripEmojisFields() { - Set stripEmojisFields = - EnumSet.noneOf(TwitterMessageUtil.Field.class); - for (TwitterMessageUtil.Field field : TwitterMessageUtil.Field.values()) { - if (DeciderUtil.isAvailableForRandomRecipient( - decider, - STRIP_SUPPLEMENTARY_EMOJIS_DECIDER_KEY_PREFIX + field.getNameForStats())) { - stripEmojisFields.add(field); - } - } - return stripEmojisFields; - } - - protected final boolean acceptNullcast() { - return DeciderUtil.isAvailableForRandomRecipient(decider, KEEP_NULLCAST_DECIDER_KEY); - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/BUILD b/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/BUILD deleted file mode 100644 index fc981f0f7..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/BUILD +++ /dev/null @@ -1,32 +0,0 @@ -java_library( - sources = ["*.java"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/commons-logging", - "3rdparty/jvm/org/apache/kafka:kafka-clients", - "3rdparty/jvm/org/slf4j:slf4j-api", - "decider/src/main/scala", - "kafka/finagle-kafka/finatra-kafka/src/main/scala", - "src/java/com/twitter/common/base", - "src/java/com/twitter/common/util:system-mocks", - "src/java/com/twitter/common_internal/text/version", - "src/java/com/twitter/search/common/debug", - "src/java/com/twitter/search/common/decider", - "src/java/com/twitter/search/common/metrics", - 
"src/java/com/twitter/search/common/partitioning/base", - "src/java/com/twitter/search/common/relevance:entities_and_filters", - "src/java/com/twitter/search/common/util/io/kafka", - "src/java/com/twitter/search/common/util/thrift:text-protocol", - "src/java/com/twitter/search/ingester/model", - "src/java/com/twitter/search/ingester/pipeline/twitter", - "src/java/com/twitter/search/ingester/pipeline/util", - "src/java/com/twitter/search/ingester/pipeline/wire", - "src/java/org/apache/commons/pipeline", - "src/thrift/com/twitter/search/common:indexing-java", - "src/thrift/com/twitter/search/common:schema-java", - "src/thrift/com/twitter/search/common/debug:debug-java", - "util/util-core:util-core-util", - "util/util-core/src/main/java/com/twitter/util/javainterop", - ], -) diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/BUILD.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/BUILD.docx new file mode 100644 index 000000000..ca48e4125 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/BUILD.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/DeleteUpdateEventsKafkaProducerStage.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/DeleteUpdateEventsKafkaProducerStage.docx new file mode 100644 index 000000000..9a973b519 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/DeleteUpdateEventsKafkaProducerStage.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/DeleteUpdateEventsKafkaProducerStage.java b/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/DeleteUpdateEventsKafkaProducerStage.java deleted file mode 100644 index 37ecff5bb..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/DeleteUpdateEventsKafkaProducerStage.java +++ /dev/null @@ -1,66 +0,0 @@ -package com.twitter.search.ingester.pipeline.twitter.kafka; - -import 
javax.naming.NamingException; - -import com.google.common.base.Preconditions; - -import org.apache.commons.pipeline.StageException; -import org.apache.commons.pipeline.validation.ConsumedTypes; - -import com.twitter.search.ingester.model.IngesterTwitterMessage; -import com.twitter.search.ingester.pipeline.twitter.ThriftVersionedEventsConverter; -import com.twitter.search.ingester.pipeline.util.PipelineStageException; - -@ConsumedTypes(IngesterTwitterMessage.class) -public class DeleteUpdateEventsKafkaProducerStage extends KafkaProducerStage - { - private ThriftVersionedEventsConverter converter; - - public DeleteUpdateEventsKafkaProducerStage() { - super(); - } - - public DeleteUpdateEventsKafkaProducerStage(String topicName, String clientId, - String clusterPath) { - super(topicName, clientId, clusterPath); - } - - @Override - protected void innerSetup() throws PipelineStageException, NamingException { - super.innerSetup(); - commonInnerSetup(); - } - - @Override - protected void doInnerPreprocess() throws StageException, NamingException { - super.doInnerPreprocess(); - commonInnerSetup(); - } - - private void commonInnerSetup() throws NamingException { - converter = new ThriftVersionedEventsConverter(wireModule.getPenguinVersions()); - - } - @Override - public void innerProcess(Object obj) throws StageException { - if (!(obj instanceof IngesterTwitterMessage)) { - throw new StageException(this, "Object is not an IngesterTwitterMessage: " + obj); - } - - IngesterTwitterMessage message = (IngesterTwitterMessage) obj; - innerRunFinalStageOfBranchV2(message); - } - - @Override - protected void innerRunFinalStageOfBranchV2(IngesterTwitterMessage message) { - converter.updatePenguinVersions(wireModule.getCurrentlyEnabledPenguinVersions()); - - Preconditions.checkArgument(message.getFromUserTwitterId().isPresent(), - "Missing user ID."); - - super.tryToSendEventsToKafka(converter.toDelete( - message.getTweetId(), message.getUserId(), message.getDebugEvents())); - } - - 
-} diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/KafkaConsumerStage.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/KafkaConsumerStage.docx new file mode 100644 index 000000000..c9cc12dc1 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/KafkaConsumerStage.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/KafkaConsumerStage.java b/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/KafkaConsumerStage.java deleted file mode 100644 index 55675fc3c..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/KafkaConsumerStage.java +++ /dev/null @@ -1,245 +0,0 @@ -package com.twitter.search.ingester.pipeline.twitter.kafka; - -import java.time.Duration; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; - -import org.apache.commons.pipeline.Pipeline; -import org.apache.commons.pipeline.StageDriver; -import org.apache.commons.pipeline.StageException; -import org.apache.kafka.clients.consumer.ConsumerRecords; -import org.apache.kafka.clients.consumer.KafkaConsumer; -import org.apache.kafka.common.TopicPartition; -import org.apache.kafka.common.errors.SaslAuthenticationException; -import org.apache.kafka.common.errors.SerializationException; -import org.apache.kafka.common.serialization.Deserializer; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import com.twitter.search.common.decider.DeciderUtil; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.ingester.pipeline.twitter.TwitterBaseStage; -import com.twitter.search.ingester.pipeline.util.PipelineStageException; -import com.twitter.search.ingester.pipeline.util.PipelineUtil; - -/** - * A stage to read Thrift payloads from a Kafka topic. 
- */ -public abstract class KafkaConsumerStage extends TwitterBaseStage { - private static final Logger LOG = LoggerFactory.getLogger(KafkaConsumerStage.class); - private static final String SHUT_DOWN_ON_AUTH_FAIL = "shut_down_on_authentication_fail"; - private String kafkaClientId; - private String kafkaTopicName; - private String kafkaConsumerGroupId; - private String kafkaClusterPath; - private int maxPollRecords = 1; - private int pollTimeoutMs = 1000; - private boolean partitioned; - private String deciderKey; - private final Deserializer deserializer; - private SearchCounter pollCount; - private SearchCounter deserializationErrorCount; - private SearchRateCounter droppedMessages; - - private KafkaConsumer kafkaConsumer; - - protected KafkaConsumerStage(String kafkaClientId, String kafkaTopicName, - String kafkaConsumerGroupId, String kafkaClusterPath, - String deciderKey, Deserializer deserializer) { - - this.kafkaClientId = kafkaClientId; - this.kafkaTopicName = kafkaTopicName; - this.kafkaConsumerGroupId = kafkaConsumerGroupId; - this.kafkaClusterPath = kafkaClusterPath; - this.deciderKey = deciderKey; - this.deserializer = deserializer; - } - - protected KafkaConsumerStage(Deserializer deserializer) { - this.deserializer = deserializer; - } - - @Override - protected void initStats() { - super.initStats(); - commonInnerSetupStats(); - } - - private void commonInnerSetupStats() { - pollCount = SearchCounter.export(getStageNamePrefix() + "_poll_count"); - deserializationErrorCount = - SearchCounter.export(getStageNamePrefix() + "_deserialization_error_count"); - droppedMessages = - SearchRateCounter.export(getStageNamePrefix() + "_dropped_messages"); - } - - @Override - protected void innerSetupStats() { - commonInnerSetupStats(); - } - - @Override - protected void doInnerPreprocess() { - commonInnerSetup(); - PipelineUtil.feedStartObjectToStage(this); - } - - private void commonInnerSetup() { - Preconditions.checkNotNull(kafkaClientId); - 
Preconditions.checkNotNull(kafkaClusterPath); - Preconditions.checkNotNull(kafkaTopicName); - - kafkaConsumer = wireModule.newKafkaConsumer( - kafkaClusterPath, - deserializer, - kafkaClientId, - kafkaConsumerGroupId, - maxPollRecords); - if (partitioned) { - kafkaConsumer.assign(Collections.singletonList( - new TopicPartition(kafkaTopicName, wireModule.getPartition()))); - } else { - kafkaConsumer.subscribe(Collections.singleton(kafkaTopicName)); - } - } - - @Override - protected void innerSetup() { - commonInnerSetup(); - } - - @Override - public void innerProcess(Object obj) throws StageException { - StageDriver driver = ((Pipeline) stageContext).getStageDriver(this); - while (driver.getState() == StageDriver.State.RUNNING) { - pollAndEmit(); - } - - LOG.info("StageDriver state is no longer RUNNING, closing Kafka consumer."); - closeKafkaConsumer(); - } - - @VisibleForTesting - void pollAndEmit() throws StageException { - try { - List records = poll(); - for (R record : records) { - emitAndCount(record); - } - } catch (PipelineStageException e) { - throw new StageException(this, e); - } - } - - /*** - * Poll Kafka and get the items from the topic. Record stats. 
- * @return - * @throws PipelineStageException - */ - public List pollFromTopic() throws PipelineStageException { - long startingTime = startProcessing(); - List polledItems = poll(); - endProcessing(startingTime); - return polledItems; - } - - private List poll() throws PipelineStageException { - List recordsFromKafka = new ArrayList<>(); - try { - ConsumerRecords records = kafkaConsumer.poll(Duration.ofMillis(pollTimeoutMs)); - pollCount.increment(); - records.iterator().forEachRemaining(record -> { - if (deciderKey == null || DeciderUtil.isAvailableForRandomRecipient(decider, deciderKey)) { - recordsFromKafka.add(record.value()); - } else { - droppedMessages.increment(); - } - }); - - } catch (SerializationException e) { - deserializationErrorCount.increment(); - LOG.error("Failed to deserialize the value.", e); - } catch (SaslAuthenticationException e) { - if (DeciderUtil.isAvailableForRandomRecipient(decider, SHUT_DOWN_ON_AUTH_FAIL)) { - wireModule.getPipelineExceptionHandler() - .logAndShutdown("Authentication error connecting to Kafka broker: " + e); - } else { - throw new PipelineStageException(this, "Kafka Authentication Error", e); - } - } catch (Exception e) { - throw new PipelineStageException(e); - } - - return recordsFromKafka; - } - - @VisibleForTesting - void closeKafkaConsumer() { - try { - kafkaConsumer.close(); - LOG.info("Kafka kafkaConsumer for {} was closed", getFullStageName()); - } catch (Exception e) { - log.error("Failed to close Kafka kafkaConsumer", e); - } - } - - @Override - public void release() { - closeKafkaConsumer(); - super.release(); - } - - @Override - public void cleanupStageV2() { - closeKafkaConsumer(); - } - - @SuppressWarnings("unused") // set from pipeline config - public void setKafkaClientId(String kafkaClientId) { - this.kafkaClientId = kafkaClientId; - } - - @SuppressWarnings("unused") // set from pipeline config - public void setKafkaTopicName(String kafkaTopicName) { - this.kafkaTopicName = kafkaTopicName; - } - - 
@SuppressWarnings("unused") // set from pipeline config - public void setKafkaConsumerGroupId(String kafkaConsumerGroupId) { - this.kafkaConsumerGroupId = kafkaConsumerGroupId; - } - - @SuppressWarnings("unused") // set from pipeline config - public void setMaxPollRecords(int maxPollRecords) { - this.maxPollRecords = maxPollRecords; - } - - @SuppressWarnings("unused") // set from pipeline config - public void setPollTimeoutMs(int pollTimeoutMs) { - this.pollTimeoutMs = pollTimeoutMs; - } - - @SuppressWarnings("unused") // set from pipeline config - public void setPartitioned(boolean partitioned) { - this.partitioned = partitioned; - } - - @SuppressWarnings("unused") // set from pipeline config - public void setDeciderKey(String deciderKey) { - this.deciderKey = deciderKey; - } - - @VisibleForTesting - KafkaConsumer getKafkaConsumer() { - return kafkaConsumer; - } - - @SuppressWarnings("unused") // set from pipeline config - public void setKafkaClusterPath(String kafkaClusterPath) { - this.kafkaClusterPath = kafkaClusterPath; - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/KafkaProducerStage.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/KafkaProducerStage.docx new file mode 100644 index 000000000..b1802209f Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/KafkaProducerStage.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/KafkaProducerStage.java b/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/KafkaProducerStage.java deleted file mode 100644 index 84252d0da..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/KafkaProducerStage.java +++ /dev/null @@ -1,259 +0,0 @@ -package com.twitter.search.ingester.pipeline.twitter.kafka; - -import java.util.Collection; -import java.util.Map; - -import javax.naming.NamingException; - -import scala.runtime.BoxedUnit; - -import 
com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; -import com.google.common.collect.Maps; - -import org.apache.commons.pipeline.StageException; -import org.apache.kafka.clients.producer.ProducerRecord; -import org.apache.kafka.clients.producer.RecordMetadata; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.finatra.kafka.producers.BlockingFinagleKafkaProducer; -import com.twitter.search.common.debug.DebugEventUtil; -import com.twitter.search.common.debug.thriftjava.DebugEvents; -import com.twitter.search.common.decider.DeciderUtil; -import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents; -import com.twitter.search.common.metrics.Percentile; -import com.twitter.search.common.metrics.PercentileUtil; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent; -import com.twitter.search.common.schema.thriftjava.ThriftIndexingEventType; -import com.twitter.search.common.util.io.kafka.CompactThriftSerializer; -import com.twitter.search.ingester.model.IngesterThriftVersionedEvents; -import com.twitter.search.ingester.pipeline.twitter.TwitterBaseStage; -import com.twitter.search.ingester.pipeline.util.PipelineStageException; -import com.twitter.search.ingester.pipeline.wire.IngesterPartitioner; -import com.twitter.util.Await; -import com.twitter.util.Future; - -public class KafkaProducerStage extends TwitterBaseStage { - private static final Logger LOG = LoggerFactory.getLogger(KafkaProducerStage.class); - - private static final Logger LATE_EVENTS_LOG = LoggerFactory.getLogger( - KafkaProducerStage.class.getName() + ".LateEvents"); - - private final Map> processingLatenciesStats = - Maps.newEnumMap(ThriftIndexingEventType.class); - - private String kafkaClientId; - private String kafkaTopicName; - private String kafkaClusterPath; - private SearchCounter sendCount; - private String perPartitionSendCountFormat; 
- private String deciderKey; - - protected BlockingFinagleKafkaProducer kafkaProducer; - - private int processingLatencyThresholdMillis = 10000; - - public KafkaProducerStage() { } - - public KafkaProducerStage(String topicName, String clientId, String clusterPath) { - this.kafkaTopicName = topicName; - this.kafkaClientId = clientId; - this.kafkaClusterPath = clusterPath; - } - - @Override - protected void initStats() { - super.initStats(); - setupCommonStats(); - } - - private void setupCommonStats() { - sendCount = SearchCounter.export(getStageNamePrefix() + "_send_count"); - perPartitionSendCountFormat = getStageNamePrefix() + "_partition_%d_send_count"; - for (ThriftIndexingEventType eventType : ThriftIndexingEventType.values()) { - processingLatenciesStats.put( - eventType, - PercentileUtil.createPercentile( - getStageNamePrefix() + "_" + eventType.name().toLowerCase() - + "_processing_latency_ms")); - } - } - - @Override - protected void innerSetupStats() { - setupCommonStats(); - } - - private boolean isEnabled() { - if (this.deciderKey != null) { - return DeciderUtil.isAvailableForRandomRecipient(decider, deciderKey); - } else { - // No decider means it's enabled. 
- return true; - } - } - - @Override - protected void doInnerPreprocess() throws StageException, NamingException { - try { - innerSetup(); - } catch (PipelineStageException e) { - throw new StageException(this, e); - } - } - - @Override - protected void innerSetup() throws PipelineStageException, NamingException { - Preconditions.checkNotNull(kafkaClientId); - Preconditions.checkNotNull(kafkaClusterPath); - Preconditions.checkNotNull(kafkaTopicName); - - kafkaProducer = wireModule.newFinagleKafkaProducer( - kafkaClusterPath, - new CompactThriftSerializer(), - kafkaClientId, - IngesterPartitioner.class); - - int numPartitions = wireModule.getPartitionMappingManager().getNumPartitions(); - int numKafkaPartitions = kafkaProducer.partitionsFor(kafkaTopicName).size(); - if (numPartitions != numKafkaPartitions) { - throw new PipelineStageException(String.format( - "Number of partitions for Kafka topic %s (%d) != number of expected partitions (%d)", - kafkaTopicName, numKafkaPartitions, numPartitions)); - } - } - - - @Override - public void innerProcess(Object obj) throws StageException { - if (!(obj instanceof IngesterThriftVersionedEvents)) { - throw new StageException(this, "Object is not IngesterThriftVersionedEvents: " + obj); - } - - IngesterThriftVersionedEvents events = (IngesterThriftVersionedEvents) obj; - tryToSendEventsToKafka(events); - } - - protected void tryToSendEventsToKafka(IngesterThriftVersionedEvents events) { - if (!isEnabled()) { - return; - } - - DebugEvents debugEvents = events.getDebugEvents(); - // We don't propagate debug events to Kafka, because they take about 50% - // of the storage space. 
- events.unsetDebugEvents(); - - ProducerRecord record = new ProducerRecord<>( - kafkaTopicName, - null, - clock.nowMillis(), - null, - events); - - sendRecordToKafka(record).ensure(() -> { - updateEventProcessingLatencyStats(events, debugEvents); - return null; - }); - } - - private Future sendRecordToKafka( - ProducerRecord record) { - Future result; - try { - result = kafkaProducer.send(record); - } catch (Exception e) { - // Even though KafkaProducer.send() returns a Future, it can throw a synchronous exception, - // so we translate synchronous exceptions into a Future.exception so we handle all exceptions - // consistently. - result = Future.exception(e); - } - - return result.onSuccess(recordMetadata -> { - sendCount.increment(); - SearchCounter.export( - String.format(perPartitionSendCountFormat, recordMetadata.partition())).increment(); - return BoxedUnit.UNIT; - }).onFailure(e -> { - stats.incrementExceptions(); - LOG.error("Sending a record failed.", e); - return BoxedUnit.UNIT; - }); - } - - private void updateEventProcessingLatencyStats(IngesterThriftVersionedEvents events, - DebugEvents debugEvents) { - if ((debugEvents != null) && debugEvents.isSetProcessingStartedAt()) { - // Get the one indexing event out of all events we're sending. - Collection indexingEvents = events.getVersionedEvents().values(); - Preconditions.checkState(!indexingEvents.isEmpty()); - ThriftIndexingEventType eventType = indexingEvents.iterator().next().getEventType(); - - // Check if the event took too much time to get to this current point. 
- long processingLatencyMillis = - clock.nowMillis() - debugEvents.getProcessingStartedAt().getEventTimestampMillis(); - processingLatenciesStats.get(eventType).record(processingLatencyMillis); - - if (processingLatencyMillis >= processingLatencyThresholdMillis) { - LATE_EVENTS_LOG.warn("Event of type {} for tweet {} was processed in {}ms: {}", - eventType.name(), - events.getTweetId(), - processingLatencyMillis, - DebugEventUtil.debugEventsToString(debugEvents)); - } - } - } - - public void setProcessingLatencyThresholdMillis(int processingLatencyThresholdMillis) { - this.processingLatencyThresholdMillis = processingLatencyThresholdMillis; - } - - @Override - public void innerPostprocess() throws StageException { - try { - commonCleanup(); - } catch (Exception e) { - throw new StageException(this, e); - } - } - - @Override - public void cleanupStageV2() { - try { - commonCleanup(); - } catch (Exception e) { - LOG.error("Error trying to clean up KafkaProducerStage.", e); - } - } - - private void commonCleanup() throws Exception { - Await.result(kafkaProducer.close()); - } - - @SuppressWarnings("unused") // set from pipeline config - public void setKafkaClientId(String kafkaClientId) { - this.kafkaClientId = kafkaClientId; - } - - @SuppressWarnings("unused") // set from pipeline config - public void setKafkaTopicName(String kafkaTopicName) { - this.kafkaTopicName = kafkaTopicName; - } - - @VisibleForTesting - public BlockingFinagleKafkaProducer getKafkaProducer() { - return kafkaProducer; - } - - @SuppressWarnings("unused") // set from pipeline config - public void setDeciderKey(String deciderKey) { - this.deciderKey = deciderKey; - } - - @SuppressWarnings("unused") // set from pipeline config - public void setKafkaClusterPath(String kafkaClusterPath) { - this.kafkaClusterPath = kafkaClusterPath; - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/KafkaRawRecordConsumerStage.docx 
b/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/KafkaRawRecordConsumerStage.docx new file mode 100644 index 000000000..7affd0592 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/KafkaRawRecordConsumerStage.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/KafkaRawRecordConsumerStage.java b/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/KafkaRawRecordConsumerStage.java deleted file mode 100644 index 2cb777090..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/KafkaRawRecordConsumerStage.java +++ /dev/null @@ -1,36 +0,0 @@ -package com.twitter.search.ingester.pipeline.twitter.kafka; - -import org.apache.commons.pipeline.validation.ConsumedTypes; -import org.apache.commons.pipeline.validation.ProducedTypes; -import org.apache.kafka.common.serialization.Deserializer; - -import com.twitter.finatra.kafka.serde.internal.BaseDeserializer; -import com.twitter.search.ingester.model.KafkaRawRecord; -import com.twitter.util.Time; - -/** - * Kafka consumer stage that emits the binary payload wrapped in {@code ByteArray}. 
- */ -@ConsumedTypes(String.class) -@ProducedTypes(KafkaRawRecord.class) -public class KafkaRawRecordConsumerStage extends KafkaConsumerStage { - public KafkaRawRecordConsumerStage() { - super(getDeserializer()); - } - - private static Deserializer getDeserializer() { - return new BaseDeserializer() { - @Override - public KafkaRawRecord deserialize(String topic, byte[] data) { - return new KafkaRawRecord(data, Time.now().inMillis()); - } - }; - } - - public KafkaRawRecordConsumerStage(String kafkaClientId, String kafkaTopicName, - String kafkaConsumerGroupId, String kafkaClusterPath, - String deciderKey) { - super(kafkaClientId, kafkaTopicName, kafkaConsumerGroupId, kafkaClusterPath, deciderKey, - getDeserializer()); - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/RetweetAndReplyUpdateEventsKafkaProducerStage.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/RetweetAndReplyUpdateEventsKafkaProducerStage.docx new file mode 100644 index 000000000..5bc48f917 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/RetweetAndReplyUpdateEventsKafkaProducerStage.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/RetweetAndReplyUpdateEventsKafkaProducerStage.java b/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/RetweetAndReplyUpdateEventsKafkaProducerStage.java deleted file mode 100644 index 4227617cb..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/RetweetAndReplyUpdateEventsKafkaProducerStage.java +++ /dev/null @@ -1,24 +0,0 @@ -package com.twitter.search.ingester.pipeline.twitter.kafka; - -import org.apache.commons.pipeline.validation.ConsumedTypes; - -import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents; -import com.twitter.search.ingester.model.IngesterThriftVersionedEvents; - -@ConsumedTypes(ThriftVersionedEvents.class) -public class RetweetAndReplyUpdateEventsKafkaProducerStage extends 
KafkaProducerStage - { - public RetweetAndReplyUpdateEventsKafkaProducerStage(String kafkaTopic, String clientId, - String clusterPath) { - super(kafkaTopic, clientId, clusterPath); - } - - public RetweetAndReplyUpdateEventsKafkaProducerStage() { - super(); - } - - @Override - protected void innerRunFinalStageOfBranchV2(IngesterThriftVersionedEvents events) { - super.tryToSendEventsToKafka(events); - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/TweetThriftVersionedEventsKafkaProducerStage.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/TweetThriftVersionedEventsKafkaProducerStage.docx new file mode 100644 index 000000000..0e5dd8879 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/TweetThriftVersionedEventsKafkaProducerStage.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/TweetThriftVersionedEventsKafkaProducerStage.java b/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/TweetThriftVersionedEventsKafkaProducerStage.java deleted file mode 100644 index 9e96471d7..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/kafka/TweetThriftVersionedEventsKafkaProducerStage.java +++ /dev/null @@ -1,108 +0,0 @@ -package com.twitter.search.ingester.pipeline.twitter.kafka; - -import javax.naming.NamingException; - -import org.apache.commons.pipeline.StageException; -import org.apache.commons.pipeline.validation.ConsumedTypes; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.debug.DebugEventUtil; -import com.twitter.search.common.debug.thriftjava.DebugEvents; -import com.twitter.search.common.metrics.SearchLongGauge; -import com.twitter.search.ingester.model.IngesterThriftVersionedEvents; -import com.twitter.search.ingester.pipeline.util.PipelineStageException; - -/** - * Kafka producer stage to write tweet indexing data as {@code ThriftVersionedEvents}. 
This stage - * also handles extra debug event processing. - */ -@ConsumedTypes(IngesterThriftVersionedEvents.class) -public class TweetThriftVersionedEventsKafkaProducerStage extends KafkaProducerStage - { - private static final int PROCESSING_LATENCY_THRESHOLD_FOR_UPDATES_MILLIS = 30000; - - private static final Logger LOG = - LoggerFactory.getLogger(TweetThriftVersionedEventsKafkaProducerStage.class); - - private long processedTweetCount = 0; - - private SearchLongGauge kafkaProducerLag; - - private int debugEventLogPeriod = -1; - - public TweetThriftVersionedEventsKafkaProducerStage(String kafkaTopic, String clientId, - String clusterPath) { - super(kafkaTopic, clientId, clusterPath); - } - - public TweetThriftVersionedEventsKafkaProducerStage() { - super(); - } - - @Override - protected void initStats() { - super.initStats(); - setupCommonStats(); - } - - @Override - protected void innerSetupStats() { - super.innerSetupStats(); - setupCommonStats(); - } - - private void setupCommonStats() { - kafkaProducerLag = SearchLongGauge.export( - getStageNamePrefix() + "_kafka_producer_lag_millis"); - } - - @Override - protected void innerSetup() throws PipelineStageException, NamingException { - super.innerSetup(); - } - - @Override - protected void doInnerPreprocess() throws StageException, NamingException { - super.doInnerPreprocess(); - commonInnerSetup(); - } - - private void commonInnerSetup() { - setProcessingLatencyThresholdMillis(PROCESSING_LATENCY_THRESHOLD_FOR_UPDATES_MILLIS); - } - - @Override - public void innerProcess(Object obj) throws StageException { - if (!(obj instanceof IngesterThriftVersionedEvents)) { - throw new StageException(this, "Object is not IngesterThriftVersionedEvents: " + obj); - } - - IngesterThriftVersionedEvents events = (IngesterThriftVersionedEvents) obj; - innerRunFinalStageOfBranchV2(events); - } - - @Override - protected void innerRunFinalStageOfBranchV2(IngesterThriftVersionedEvents events) { - if ((debugEventLogPeriod > 0) - && 
(processedTweetCount % debugEventLogPeriod == 0) - && (events.getDebugEvents() != null)) { - LOG.info("DebugEvents for tweet {}: {}", - events.getTweetId(), DebugEventUtil.debugEventsToString(events.getDebugEvents())); - } - processedTweetCount++; - - DebugEvents debugEvents = events.getDebugEvents(); - if ((debugEvents != null) && debugEvents.isSetProcessingStartedAt()) { - kafkaProducerLag.set( - clock.nowMillis() - debugEvents.getProcessingStartedAt().getEventTimestampMillis()); - } - - super.tryToSendEventsToKafka(events); - } - - @SuppressWarnings("unused") // set from pipeline config - public void setDebugEventLogPeriod(int debugEventLogPeriod) { - this.debugEventLogPeriod = debugEventLogPeriod; - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/thriftparse/BUILD b/src/java/com/twitter/search/ingester/pipeline/twitter/thriftparse/BUILD deleted file mode 100644 index bd8f88b26..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/thriftparse/BUILD +++ /dev/null @@ -1,32 +0,0 @@ -java_library( - sources = ["*.java"], - compiler_option_sets = ["fatal_warnings"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/code/findbugs:jsr305", - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/com/google/inject:guice", - "3rdparty/jvm/commons-lang", - "3rdparty/jvm/org/apache/thrift:libthrift", - "mediaservices/commons/src/main/thrift:thrift-java", - "src/java/com/twitter/common/text/language:locale-util", - "src/java/com/twitter/common_internal/text/version", - "src/java/com/twitter/search/common/debug", - "src/java/com/twitter/search/common/metrics", - "src/java/com/twitter/search/common/partitioning/base", - "src/java/com/twitter/search/common/partitioning/snowflakeparser", - "src/java/com/twitter/search/common/relevance:entities_and_filters", - "src/java/com/twitter/search/common/util/text", - "src/java/com/twitter/search/ingester/model", - 
"src/java/com/twitter/search/ingester/pipeline/util", - "src/thrift/com/twitter/dataproducts:enrichments_profilegeo-java", - "src/thrift/com/twitter/escherbird:tweet-annotation-java", - "src/thrift/com/twitter/gizmoduck:user-thrift-java", - "src/thrift/com/twitter/search/common/debug:debug-java", - "src/thrift/com/twitter/service/spiderduck/gen:metadata-store-java", - "src/thrift/com/twitter/tweetypie:events-java", - "src/thrift/com/twitter/tweetypie:tweet-java", - "tweetypie/src/scala/com/twitter/tweetypie/tweettext", - ], -) diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/thriftparse/BUILD.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/thriftparse/BUILD.docx new file mode 100644 index 000000000..0df3c3c19 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/thriftparse/BUILD.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/thriftparse/ThriftTweetParsingException.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/thriftparse/ThriftTweetParsingException.docx new file mode 100644 index 000000000..44690e2b2 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/thriftparse/ThriftTweetParsingException.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/thriftparse/ThriftTweetParsingException.java b/src/java/com/twitter/search/ingester/pipeline/twitter/thriftparse/ThriftTweetParsingException.java deleted file mode 100644 index a986eec58..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/thriftparse/ThriftTweetParsingException.java +++ /dev/null @@ -1,7 +0,0 @@ -package com.twitter.search.ingester.pipeline.twitter.thriftparse; - -public final class ThriftTweetParsingException extends Exception { - public ThriftTweetParsingException(String message) { - super(message); - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/thriftparse/TweetEventParseHelper.docx 
b/src/java/com/twitter/search/ingester/pipeline/twitter/thriftparse/TweetEventParseHelper.docx new file mode 100644 index 000000000..caae7d494 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/thriftparse/TweetEventParseHelper.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/thriftparse/TweetEventParseHelper.java b/src/java/com/twitter/search/ingester/pipeline/twitter/thriftparse/TweetEventParseHelper.java deleted file mode 100644 index d644898aa..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/thriftparse/TweetEventParseHelper.java +++ /dev/null @@ -1,727 +0,0 @@ -package com.twitter.search.ingester.pipeline.twitter.thriftparse; - -import java.util.Date; -import java.util.List; -import java.util.Optional; -import javax.annotation.Nonnull; -import javax.annotation.Nullable; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; -import com.google.common.collect.Lists; - -import org.apache.commons.lang.StringEscapeUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common_internal.text.version.PenguinVersion; -import com.twitter.dataproducts.enrichments.thriftjava.GeoEntity; -import com.twitter.dataproducts.enrichments.thriftjava.PotentialLocation; -import com.twitter.dataproducts.enrichments.thriftjava.ProfileGeoEnrichment; -import com.twitter.escherbird.thriftjava.TweetEntityAnnotation; -import com.twitter.expandodo.thriftjava.Card2; -import com.twitter.gizmoduck.thriftjava.User; -import com.twitter.mediaservices.commons.tweetmedia.thrift_java.MediaInfo; -import com.twitter.search.common.debug.thriftjava.DebugEvents; -import com.twitter.search.common.metrics.Percentile; -import com.twitter.search.common.metrics.PercentileUtil; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser; -import 
com.twitter.search.common.relevance.entities.GeoObject; -import com.twitter.search.common.relevance.entities.PotentialLocationObject; -import com.twitter.search.common.relevance.entities.TwitterMessage; -import com.twitter.search.common.relevance.entities.TwitterMessage.EscherbirdAnnotation; -import com.twitter.search.common.relevance.entities.TwitterMessageUser; -import com.twitter.search.common.relevance.entities.TwitterMessageUtil; -import com.twitter.search.common.relevance.entities.TwitterQuotedMessage; -import com.twitter.search.common.relevance.entities.TwitterRetweetMessage; -import com.twitter.search.ingester.model.IngesterTwitterMessage; -import com.twitter.search.ingester.pipeline.util.CardFieldUtil; -import com.twitter.service.spiderduck.gen.MediaTypes; -import com.twitter.tweetypie.thriftjava.DeviceSource; -import com.twitter.tweetypie.thriftjava.DirectedAtUser; -import com.twitter.tweetypie.thriftjava.EscherbirdEntityAnnotations; -import com.twitter.tweetypie.thriftjava.ExclusiveTweetControl; -import com.twitter.tweetypie.thriftjava.GeoCoordinates; -import com.twitter.tweetypie.thriftjava.HashtagEntity; -import com.twitter.tweetypie.thriftjava.MediaEntity; -import com.twitter.tweetypie.thriftjava.MentionEntity; -import com.twitter.tweetypie.thriftjava.Place; -import com.twitter.tweetypie.thriftjava.QuotedTweet; -import com.twitter.tweetypie.thriftjava.Reply; -import com.twitter.tweetypie.thriftjava.Tweet; -import com.twitter.tweetypie.thriftjava.TweetCoreData; -import com.twitter.tweetypie.thriftjava.TweetCreateEvent; -import com.twitter.tweetypie.thriftjava.TweetDeleteEvent; -import com.twitter.tweetypie.thriftjava.UrlEntity; -import com.twitter.tweetypie.tweettext.PartialHtmlEncoding; - -/** - * This is an utility class for converting Thrift TweetEvent messages sent by TweetyPie - * into ingester internal representation, IngesterTwitterMessage. 
- */ -public final class TweetEventParseHelper { - private static final Logger LOG = LoggerFactory.getLogger(TweetEventParseHelper.class); - - @VisibleForTesting - static final SearchCounter NUM_TWEETS_WITH_NULL_TEXT = - SearchCounter.export("tweets_with_null_text_from_thrift_cnt"); - - @VisibleForTesting - static final SearchCounter TWEET_SIZE = SearchCounter.export("tweet_size_from_thrift"); - - @VisibleForTesting - static final Percentile TWEET_SIZE_PERCENTILES = - PercentileUtil.createPercentile("tweet_size_from_thrift"); - - @VisibleForTesting - static final SearchCounter NUM_TWEETS_WITH_CONVERSATION_ID = - SearchCounter.export("tweets_with_conversation_id_from_thrift_cnt"); - - @VisibleForTesting - static final SearchCounter NUM_TWEETS_WITH_QUOTE = - SearchCounter.export("tweets_with_quote_from_thrift_cnt"); - - @VisibleForTesting - static final SearchCounter NUM_TWEETS_WITH_ANNOTATIONS = - SearchCounter.export("tweets_with_annotation_from_thrift_cnt"); - - @VisibleForTesting - static final SearchCounter NUM_ANNOTATIONS_ADDED = - SearchCounter.export("num_annotations_from_thrift_cnt"); - - @VisibleForTesting - static final SearchCounter NUM_TWEETS_WITH_COORDINATE_FIELD = - SearchCounter.export("tweets_with_coordinate_field_from_thrift_cnt"); - - @VisibleForTesting - static final SearchCounter NUM_PLACE_ADDED = - SearchCounter.export("num_places_from_thrift_cnt"); - - @VisibleForTesting - static final SearchCounter NUM_TWEETS_WITH_PLACE_FIELD = - SearchCounter.export("tweets_with_place_field_from_thrift_cnt"); - - @VisibleForTesting - static final SearchCounter NUM_TWEETS_WITH_PLACE_COUNTRY_CODE = - SearchCounter.export("tweets_with_place_country_code_from_thrift_cnt"); - - @VisibleForTesting - static final SearchCounter NUM_TWEETS_USE_PLACE_FIELD = - SearchCounter.export("tweets_use_place_field_from_thrift_cnt"); - - @VisibleForTesting - static final SearchCounter NUM_TWEETS_CANNOT_PARSE_PLACE_FIELD = - 
SearchCounter.export("tweets_cannot_parse_place_field_from_thrift_cnt"); - - @VisibleForTesting - static final SearchCounter NUM_TWEETS_WITH_PROFILE_GEO_ENRICHMENT = - SearchCounter.export("tweets_with_profile_geo_enrichment_from_thrift_cnt"); - - @VisibleForTesting - static final SearchCounter NUM_TWEETS_WITH_MENTIONS = - SearchCounter.export("tweets_with_mentions_from_thrift_cnt"); - - @VisibleForTesting - static final SearchCounter NUM_MENTIONS_ADDED = - SearchCounter.export("num_mentions_from_thrift_cnt"); - - @VisibleForTesting - static final SearchCounter NUM_TWEETS_WITH_HASHTAGS = - SearchCounter.export("tweets_with_hashtags_from_thrift_cnt"); - - @VisibleForTesting - static final SearchCounter NUM_HASHTAGS_ADDED = - SearchCounter.export("num_hashtags_from_thrift_cnt"); - - @VisibleForTesting - static final SearchCounter NUM_TWEETS_WITH_MEDIA_URL = - SearchCounter.export("tweets_with_media_url_from_thrift_cnt"); - - @VisibleForTesting - static final SearchCounter NUM_MEDIA_URLS_ADDED = - SearchCounter.export("num_media_urls_from_thrift_cnt"); - - @VisibleForTesting - static final SearchCounter NUM_TWEETS_WITH_PHOTO_MEDIA_URL = - SearchCounter.export("tweets_with_photo_media_url_from_thrift_cnt"); - - @VisibleForTesting - static final SearchCounter NUM_TWEETS_WITH_VIDEO_MEDIA_URL = - SearchCounter.export("tweets_with_video_media_url_from_thrift_cnt"); - - @VisibleForTesting - static final SearchCounter NUM_TWEETS_WITH_NON_MEDIA_URL = - SearchCounter.export("tweets_with_non_media_url_from_thrift_cnt"); - - @VisibleForTesting - static final SearchCounter NUM_NON_MEDIA_URLS_ADDED = - SearchCounter.export("num_non_media_urls_from_thrift_cnt"); - - @VisibleForTesting - static final SearchCounter NUM_TWEETS_MISSING_QUOTE_URLS = - SearchCounter.export("num_tweets_missing_quote_urls_cnt"); - - // Utility class, disallow instantiation. - private TweetEventParseHelper() { - } - - /** Builds an IngesterTwitterMessage instance from a TweetCreateEvent. 
*/ - @Nonnull - public static IngesterTwitterMessage getTwitterMessageFromCreationEvent( - @Nonnull TweetCreateEvent createEvent, - @Nonnull List supportedPenguinVersions, - @Nullable DebugEvents debugEvents) throws ThriftTweetParsingException { - - Tweet tweet = createEvent.getTweet(); - if (tweet == null) { - throw new ThriftTweetParsingException("No tweet field in TweetCreateEvent"); - } - - TweetCoreData coreData = tweet.getCore_data(); - if (coreData == null) { - throw new ThriftTweetParsingException("No core_data field in Tweet in TweetCreateEvent"); - } - - User user = createEvent.getUser(); - if (user == null) { - throw new ThriftTweetParsingException("No user field in TweetCreateEvent"); - } - if (!user.isSetProfile()) { - throw new ThriftTweetParsingException("No profile field in User in TweetCreateEvent"); - } - if (!user.isSetSafety()) { - throw new ThriftTweetParsingException("No safety field in User in TweetCreateEvent"); - } - - long twitterId = tweet.getId(); - IngesterTwitterMessage message = new IngesterTwitterMessage( - twitterId, - supportedPenguinVersions, - debugEvents); - - // Set the creation time based on the tweet ID, because it has millisecond granularity, - // and coreData.created_at_secs has only second granularity. 
- message.setDate(new Date(SnowflakeIdParser.getTimestampFromTweetId(twitterId))); - - boolean isNsfw = coreData.isNsfw_admin() || coreData.isNsfw_user(); - boolean hasMediaOrUrlsOrCards = - tweet.getMediaSize() > 0 - || tweet.getUrlsSize() > 0 - || tweet.getCardsSize() > 0 - || tweet.isSetCard2(); - - message.setIsSensitiveContent(isNsfw && hasMediaOrUrlsOrCards); - - message.setFromUser(getFromUser(user)); - if (user.isSetCounts()) { - message.setFollowersCount((int) user.getCounts().getFollowers()); - } - message.setUserProtected(user.getSafety().isIs_protected()); - message.setUserVerified(user.getSafety().isVerified()); - message.setUserBlueVerified(user.getSafety().isIs_blue_verified()); - - if (tweet.isSetLanguage()) { - message.setLanguage(tweet.getLanguage().getLanguage()); // language ID like "en" - } - - if (tweet.isSetSelf_thread_metadata()) { - message.setSelfThread(true); - } - - ExclusiveTweetControl exclusiveTweetControl = tweet.getExclusive_tweet_control(); - if (exclusiveTweetControl != null) { - if (exclusiveTweetControl.isSetConversation_author_id()) { - message.setExclusiveConversationAuthorId( - exclusiveTweetControl.getConversation_author_id()); - } - } - - setDirectedAtUser(message, coreData); - addMentionsToMessage(message, tweet); - addHashtagsToMessage(message, tweet); - addMediaEntitiesToMessage(message, tweet.getId(), tweet.getMedia()); - addUrlsToMessage(message, tweet.getUrls()); - addEscherbirdAnnotationsToMessage(message, tweet); - message.setNullcast(coreData.isNullcast()); - - if (coreData.isSetConversation_id()) { - message.setConversationId(coreData.getConversation_id()); - NUM_TWEETS_WITH_CONVERSATION_ID.increment(); - } - - // quotes - if (tweet.isSetQuoted_tweet()) { - QuotedTweet quotedTweet = tweet.getQuoted_tweet(); - if (quotedTweet.getTweet_id() > 0 && quotedTweet.getUser_id() > 0) { - if (quotedTweet.isSetPermalink()) { - String quotedURL = quotedTweet.getPermalink().getLong_url(); - UrlEntity quotedURLEntity = new 
UrlEntity(); - quotedURLEntity.setExpanded(quotedURL); - quotedURLEntity.setUrl(quotedTweet.getPermalink().getShort_url()); - quotedURLEntity.setDisplay(quotedTweet.getPermalink().getDisplay_text()); - addUrlsToMessage(message, Lists.newArrayList(quotedURLEntity)); - } else { - LOG.warn("Tweet {} has quoted tweet, but is missing quoted tweet URL: {}", - tweet.getId(), quotedTweet); - NUM_TWEETS_MISSING_QUOTE_URLS.increment(); - } - TwitterQuotedMessage quotedMessage = - new TwitterQuotedMessage( - quotedTweet.getTweet_id(), - quotedTweet.getUser_id()); - message.setQuotedMessage(quotedMessage); - NUM_TWEETS_WITH_QUOTE.increment(); - } - } - - // card fields - if (createEvent.getTweet().isSetCard2()) { - Card2 card = createEvent.getTweet().getCard2(); - message.setCardName(card.getName()); - message.setCardTitle( - CardFieldUtil.extractBindingValue(CardFieldUtil.TITLE_BINDING_KEY, card)); - message.setCardDescription( - CardFieldUtil.extractBindingValue(CardFieldUtil.DESCRIPTION_BINDING_KEY, card)); - CardFieldUtil.deriveCardLang(message); - message.setCardUrl(card.getUrl()); - } - - // Some fields should be set based on the "original" tweet. So if this tweet is a retweet, - // we want to extract those fields from the retweeted tweet. 
- Tweet retweetOrTweet = tweet; - TweetCoreData retweetOrTweetCoreData = coreData; - User retweetOrTweetUser = user; - - // retweets - boolean isRetweet = coreData.isSetShare(); - if (isRetweet) { - retweetOrTweet = createEvent.getSource_tweet(); - retweetOrTweetCoreData = retweetOrTweet.getCore_data(); - retweetOrTweetUser = createEvent.getSource_user(); - - TwitterRetweetMessage retweetMessage = new TwitterRetweetMessage(); - retweetMessage.setRetweetId(twitterId); - - if (retweetOrTweetUser != null) { - if (retweetOrTweetUser.isSetProfile()) { - retweetMessage.setSharedUserDisplayName(retweetOrTweetUser.getProfile().getName()); - } - retweetMessage.setSharedUserTwitterId(retweetOrTweetUser.getId()); - } - - retweetMessage.setSharedDate(new Date(retweetOrTweetCoreData.getCreated_at_secs() * 1000)); - retweetMessage.setSharedId(retweetOrTweet.getId()); - - addMediaEntitiesToMessage(message, retweetOrTweet.getId(), retweetOrTweet.getMedia()); - addUrlsToMessage(message, retweetOrTweet.getUrls()); - - // If a tweet's text is longer than 140 characters, the text for any retweet of that tweet - // will be truncated. And if the original tweet has hashtags or mentions after character 140, - // the Tweetypie event for the retweet will not include those hashtags/mentions, which will - // make the retweet unsearchable by those hashtags/mentions. So in order to avoid this - // problem, we add to the retweet all hashtags/mentions set on the original tweet. - addMentionsToMessage(message, retweetOrTweet); - addHashtagsToMessage(message, retweetOrTweet); - - message.setRetweetMessage(retweetMessage); - } - - // Some fields should be set based on the "original" tweet. 
- // Only set geo fields if this is not a retweet - if (!isRetweet) { - setGeoFields(message, retweetOrTweetCoreData, retweetOrTweetUser); - setPlacesFields(message, retweetOrTweet); - } - setText(message, retweetOrTweetCoreData); - setInReplyTo(message, retweetOrTweetCoreData, isRetweet); - setDeviceSourceField(message, retweetOrTweet); - - // Profile geo enrichment fields should be set based on this tweet, even if it's a retweet. - setProfileGeoEnrichmentFields(message, tweet); - - // The composer used to create this tweet: standard tweet creator or the camera flow. - setComposerSource(message, tweet); - - return message; - } - - private static void setGeoFields( - TwitterMessage message, TweetCoreData coreData, User user) { - - if (coreData.isSetCoordinates()) { - NUM_TWEETS_WITH_COORDINATE_FIELD.increment(); - GeoCoordinates coords = coreData.getCoordinates(); - message.setGeoTaggedLocation( - GeoObject.createForIngester(coords.getLatitude(), coords.getLongitude())); - - String location = - String.format("GeoAPI:%.4f,%.4f", coords.getLatitude(), coords.getLongitude()); - TwitterMessageUtil.setAndTruncateLocationOnMessage(message, location); - } - - // If the location was not set from the coordinates. 
- if ((message.getOrigLocation() == null) && (user != null) && user.isSetProfile()) { - TwitterMessageUtil.setAndTruncateLocationOnMessage(message, user.getProfile().getLocation()); - } - } - - private static void setPlacesFields(TwitterMessage message, Tweet tweet) { - if (!tweet.isSetPlace()) { - return; - } - - Place place = tweet.getPlace(); - - if (place.isSetContainers() && place.getContainersSize() > 0) { - NUM_TWEETS_WITH_PLACE_FIELD.increment(); - NUM_PLACE_ADDED.add(place.getContainersSize()); - - for (String placeId : place.getContainers()) { - message.addPlace(placeId); - } - } - - Preconditions.checkArgument(place.isSetId(), "Tweet.Place without id."); - message.setPlaceId(place.getId()); - Preconditions.checkArgument(place.isSetFull_name(), "Tweet.Place without full_name."); - message.setPlaceFullName(place.getFull_name()); - if (place.isSetCountry_code()) { - message.setPlaceCountryCode(place.getCountry_code()); - NUM_TWEETS_WITH_PLACE_COUNTRY_CODE.increment(); - } - - if (message.getGeoTaggedLocation() == null) { - Optional location = GeoObject.fromPlace(place); - - if (location.isPresent()) { - NUM_TWEETS_USE_PLACE_FIELD.increment(); - message.setGeoTaggedLocation(location.get()); - } else { - NUM_TWEETS_CANNOT_PARSE_PLACE_FIELD.increment(); - } - } - } - - private static void setText(TwitterMessage message, TweetCoreData coreData) { - /** - * TweetyPie doesn't do a full HTML escaping of the text, only a partial escaping - * so we use their code to unescape it first, then we do - * a second unescaping because when the tweet text itself has HTML escape - * sequences, we want to index the unescaped version, not the escape sequence itself. - * -- - * Yes, we *double* unescape html. About 1-2 tweets per second are double escaped, - * and we probably want to index the real text and not things like '★'. - * Unescaping already unescaped text seems safe in practice. 
- * -- - * - * This may seem wrong, because one thinks we should index whatever the user posts, - * but given punctuation stripping this creates odd behavior: - * - * If someone tweets & they won't be able to find it by searching for '&' because - * the tweet will be indexed as 'amp' - * - * It would also prevent some tweets from surfacing for certain searches, for example: - * - * User Tweets: John Mayer & Dave Chappelle - * We Unescape To: John Mayer & Dave Chappelle - * We Strip/Normalize To: john mayer dave chappelle - * - * A user searching for 'John Mayer Dave Chappelle' would get the above tweet. - * - * If we didn't double unescape - * - * User Tweets: John Mayer & Dave Chappelle - * We Strip/Normalize To: john mayer amp dave chappelle - * - * A user searching for 'John Mayer Dave Chappelle' would miss the above tweet. - * - * Second example - * - * User Tweets: L'Humanité - * We Unescape To: L'Humanité - * We Strip/Normalize To: l humanite - * - * If we didn't double escape - * - * User Tweets: L'Humanité - * We Strip/Normalize To: l humanit eacute - * - */ - - String text = coreData.isSetText() - ? 
StringEscapeUtils.unescapeHtml(PartialHtmlEncoding.decode(coreData.getText())) - : coreData.getText(); - message.setText(text); - if (text != null) { - long tweetLength = text.length(); - TWEET_SIZE.add(tweetLength); - TWEET_SIZE_PERCENTILES.record(tweetLength); - } else { - NUM_TWEETS_WITH_NULL_TEXT.increment(); - } - } - - private static void setInReplyTo( - TwitterMessage message, TweetCoreData coreData, boolean isRetweet) { - Reply reply = coreData.getReply(); - if (!isRetweet && reply != null) { - String inReplyToScreenName = reply.getIn_reply_to_screen_name(); - long inReplyToUserId = reply.getIn_reply_to_user_id(); - message.replaceToUserWithInReplyToUserIfNeeded(inReplyToScreenName, inReplyToUserId); - } - - if ((reply != null) && reply.isSetIn_reply_to_status_id()) { - message.setInReplyToStatusId(reply.getIn_reply_to_status_id()); - } - } - - private static void setProfileGeoEnrichmentFields(TwitterMessage message, Tweet tweet) { - if (!tweet.isSetProfile_geo_enrichment()) { - return; - } - - ProfileGeoEnrichment profileGeoEnrichment = tweet.getProfile_geo_enrichment(); - List thriftPotentialLocations = - profileGeoEnrichment.getPotential_locations(); - if (!thriftPotentialLocations.isEmpty()) { - NUM_TWEETS_WITH_PROFILE_GEO_ENRICHMENT.increment(); - List potentialLocations = Lists.newArrayList(); - for (PotentialLocation potentialLocation : thriftPotentialLocations) { - GeoEntity geoEntity = potentialLocation.getGeo_entity(); - potentialLocations.add(new PotentialLocationObject(geoEntity.getCountry_code(), - geoEntity.getRegion(), - geoEntity.getLocality())); - } - - message.setPotentialLocations(potentialLocations); - } - } - - private static void setDeviceSourceField(TwitterMessage message, Tweet tweet) { - DeviceSource deviceSource = tweet.getDevice_source(); - TwitterMessageUtil.setSourceOnMessage(message, modifyDeviceSourceWithNofollow(deviceSource)); - } - - /** Builds an IngesterTwitterMessage instance from a TweetDeleteEvent. 
*/ - @Nonnull - public static IngesterTwitterMessage getTwitterMessageFromDeletionEvent( - @Nonnull TweetDeleteEvent deleteEvent, - @Nonnull List supportedPenguinVersions, - @Nullable DebugEvents debugEvents) throws ThriftTweetParsingException { - - Tweet tweet = deleteEvent.getTweet(); - if (tweet == null) { - throw new ThriftTweetParsingException("No tweet field in TweetDeleteEvent"); - } - long tweetId = tweet.getId(); - - TweetCoreData coreData = tweet.getCore_data(); - if (coreData == null) { - throw new ThriftTweetParsingException("No TweetCoreData in TweetDeleteEvent"); - } - long userId = coreData.getUser_id(); - - IngesterTwitterMessage message = new IngesterTwitterMessage( - tweetId, - supportedPenguinVersions, - debugEvents); - message.setDeleted(true); - message.setText("delete"); - message.setFromUser(TwitterMessageUser.createWithNamesAndId("delete", "delete", userId)); - - return message; - } - - private static TwitterMessageUser getFromUser(User user) { - String screenName = user.getProfile().getScreen_name(); - long id = user.getId(); - String displayName = user.getProfile().getName(); - return TwitterMessageUser.createWithNamesAndId(screenName, displayName, id); - } - - private static void addMentionsToMessage(IngesterTwitterMessage message, Tweet tweet) { - List mentions = tweet.getMentions(); - if (mentions != null) { - NUM_TWEETS_WITH_MENTIONS.increment(); - NUM_MENTIONS_ADDED.add(mentions.size()); - for (MentionEntity mention : mentions) { - addMention(message, mention); - } - } - } - - private static void addMention(IngesterTwitterMessage message, MentionEntity mention) { - // Default values. They are weird, but are consistent with JSON parsing behavior. 
- Optional id = Optional.of(-1L); - Optional screenName = Optional.of(""); - Optional displayName = Optional.of(""); - - if (mention.isSetUser_id()) { - id = Optional.of(mention.getUser_id()); - } - - if (mention.isSetScreen_name()) { - screenName = Optional.of(mention.getScreen_name()); - } - - if (mention.isSetName()) { - displayName = Optional.of(mention.getName()); - } - - TwitterMessageUser mentionedUser = TwitterMessageUser - .createWithOptionalNamesAndId(screenName, displayName, id); - - if (isToUser(mention, message.getToUserObject())) { - message.setToUserObject(mentionedUser); - } - message.addUserToMentions(mentionedUser); - } - - private static boolean isToUser( - MentionEntity mention, Optional optionalToUser) { - if (mention.getFrom_index() == 0) { - return true; - } - if (optionalToUser.isPresent()) { - TwitterMessageUser toUser = optionalToUser.get(); - if (toUser.getId().isPresent()) { - long toUserId = toUser.getId().get(); - return mention.getUser_id() == toUserId; - } - } - return false; - } - - private static void addHashtagsToMessage(IngesterTwitterMessage message, Tweet tweet) { - List hashtags = tweet.getHashtags(); - if (hashtags != null) { - NUM_TWEETS_WITH_HASHTAGS.increment(); - NUM_HASHTAGS_ADDED.add(hashtags.size()); - for (HashtagEntity hashtag : hashtags) { - addHashtag(message, hashtag); - } - } - } - - private static void addHashtag(IngesterTwitterMessage message, HashtagEntity hashtag) { - String hashtagString = hashtag.getText(); - message.addHashtag(hashtagString); - } - - /** Add the given media entities to the given message. 
*/ - public static void addMediaEntitiesToMessage( - IngesterTwitterMessage message, - long photoStatusId, - @Nullable List medias) { - - if (medias != null) { - NUM_TWEETS_WITH_MEDIA_URL.increment(); - NUM_MEDIA_URLS_ADDED.add(medias.size()); - - boolean hasPhotoMediaUrl = false; - boolean hasVideoMediaUrl = false; - for (MediaEntity media : medias) { - MediaTypes mediaType = null; - if (media.isSetMedia_info()) { - MediaInfo mediaInfo = media.getMedia_info(); - if (mediaInfo != null) { - if (mediaInfo.isSet(MediaInfo._Fields.IMAGE_INFO)) { - mediaType = MediaTypes.NATIVE_IMAGE; - String mediaUrl = media.getMedia_url_https(); - if (mediaUrl != null) { - hasPhotoMediaUrl = true; - message.addPhotoUrl(photoStatusId, mediaUrl); - // Add this link to the expanded URLs too, so that the HAS_NATIVE_IMAGE_FLAG is set - // correctly too. See EncodedFeatureBuilder.updateLinkEncodedFeatures(). - } - } else if (mediaInfo.isSet(MediaInfo._Fields.VIDEO_INFO)) { - mediaType = MediaTypes.VIDEO; - hasVideoMediaUrl = true; - } - } - } - String originalUrl = media.getUrl(); - String expandedUrl = media.getExpanded_url(); - message.addExpandedMediaUrl(originalUrl, expandedUrl, mediaType); - } - - if (hasPhotoMediaUrl) { - NUM_TWEETS_WITH_PHOTO_MEDIA_URL.increment(); - } - if (hasVideoMediaUrl) { - NUM_TWEETS_WITH_VIDEO_MEDIA_URL.increment(); - } - } - } - - /** Adds the given urls to the given message. 
*/ - public static void addUrlsToMessage( - IngesterTwitterMessage message, - @Nullable List urls) { - - if (urls != null) { - NUM_TWEETS_WITH_NON_MEDIA_URL.increment(); - NUM_NON_MEDIA_URLS_ADDED.add(urls.size()); - for (UrlEntity url : urls) { - String originalUrl = url.getUrl(); - String expandedUrl = url.getExpanded(); - message.addExpandedNonMediaUrl(originalUrl, expandedUrl); - } - } - } - - private static void addEscherbirdAnnotationsToMessage( - IngesterTwitterMessage message, Tweet tweet) { - if (tweet.isSetEscherbird_entity_annotations()) { - EscherbirdEntityAnnotations entityAnnotations = tweet.getEscherbird_entity_annotations(); - if (entityAnnotations.isSetEntity_annotations()) { - NUM_TWEETS_WITH_ANNOTATIONS.increment(); - NUM_ANNOTATIONS_ADDED.add(entityAnnotations.getEntity_annotationsSize()); - for (TweetEntityAnnotation entityAnnotation : entityAnnotations.getEntity_annotations()) { - EscherbirdAnnotation escherbirdAnnotation = - new EscherbirdAnnotation(entityAnnotation.getGroupId(), - entityAnnotation.getDomainId(), - entityAnnotation.getEntityId()); - message.addEscherbirdAnnotation(escherbirdAnnotation); - } - } - } - } - - private static void setComposerSource(IngesterTwitterMessage message, Tweet tweet) { - if (tweet.isSetComposer_source()) { - message.setComposerSource(tweet.getComposer_source()); - } - } - - private static String modifyDeviceSourceWithNofollow(@Nullable DeviceSource deviceSource) { - if (deviceSource != null) { - String source = deviceSource.getDisplay(); - int i = source.indexOf("\">"); - if (i == -1) { - return source; - } else { - return source.substring(0, i) + "\" rel=\"nofollow\">" + source.substring(i + 2); - } - } else { - return "Twitter"; - } - } - - private static void setDirectedAtUser( - IngesterTwitterMessage message, - TweetCoreData tweetCoreData) { - if (!tweetCoreData.isSetDirected_at_user()) { - return; - } - - DirectedAtUser directedAtUser = tweetCoreData.getDirected_at_user(); - - if 
(!directedAtUser.isSetUser_id()) { - return; - } - - message.setDirectedAtUserId(Optional.of(directedAtUser.getUser_id())); - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/userupdates/BUILD b/src/java/com/twitter/search/ingester/pipeline/twitter/userupdates/BUILD deleted file mode 100644 index 3bbba39ea..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/userupdates/BUILD +++ /dev/null @@ -1,34 +0,0 @@ -java_library( - sources = ["*.java"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/commons-logging", - "3rdparty/jvm/org/apache/bookkeeper:bookkeeper-twitter-finagle-provider", - "3rdparty/jvm/org/apache/commons:commons-text", - "3rdparty/jvm/org/apache/kafka:kafka-clients", - "3rdparty/jvm/org/slf4j:slf4j-api", - "decider/src/main/scala", - "kafka/finagle-kafka/finatra-kafka/src/main/scala", - "src/java/com/twitter/common/base", - "src/java/com/twitter/common/collections", - "src/java/com/twitter/common/util:system-mocks", - "src/java/com/twitter/search/common/decider", - "src/java/com/twitter/search/common/metrics", - "src/java/com/twitter/search/common/util/io:dl-reader-writer", - "src/java/com/twitter/search/common/util/io:record-reader-api", - "src/java/com/twitter/search/common/util/io/kafka", - "src/java/com/twitter/search/common/util/thrift:text-protocol", - "src/java/com/twitter/search/ingester/model", - "src/java/com/twitter/search/ingester/pipeline/twitter", - "src/java/com/twitter/search/ingester/pipeline/twitter/kafka", - "src/java/com/twitter/search/ingester/pipeline/util", - "src/java/com/twitter/search/ingester/pipeline/wire", - "src/java/org/apache/commons/pipeline", - "src/thrift/com/twitter/gizmoduck:modified_user-gizmoduck_scala", - "src/thrift/com/twitter/gizmoduck:thrift-java", - "src/thrift/com/twitter/search/common:indexing-java", - "util/util-core:util-core-util", - "util/util-core/src/main/java/com/twitter/util/javainterop", - ], -) diff --git 
a/src/java/com/twitter/search/ingester/pipeline/twitter/userupdates/BUILD.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/userupdates/BUILD.docx new file mode 100644 index 000000000..161f939ac Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/userupdates/BUILD.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/userupdates/UserUpdateIngester.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/userupdates/UserUpdateIngester.docx new file mode 100644 index 000000000..4e97e66f6 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/userupdates/UserUpdateIngester.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/userupdates/UserUpdateIngester.java b/src/java/com/twitter/search/ingester/pipeline/twitter/userupdates/UserUpdateIngester.java deleted file mode 100644 index 03ff94ff2..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/userupdates/UserUpdateIngester.java +++ /dev/null @@ -1,292 +0,0 @@ -package com.twitter.search.ingester.pipeline.twitter.userupdates; - -import java.util.AbstractMap; -import java.util.Collection; -import java.util.Collections; -import java.util.EnumSet; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import java.util.Set; -import java.util.function.Function; -import java.util.stream.Collectors; - -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.Sets; - -import org.apache.commons.text.CaseUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.collections.Pair; -import com.twitter.decider.Decider; -import com.twitter.finagle.util.DefaultTimer; -import com.twitter.gizmoduck.thriftjava.LifecycleChangeReason; -import com.twitter.gizmoduck.thriftjava.LookupContext; -import com.twitter.gizmoduck.thriftjava.QueryFields; -import com.twitter.gizmoduck.thriftjava.Safety; -import 
com.twitter.gizmoduck.thriftjava.UpdateDiffItem; -import com.twitter.gizmoduck.thriftjava.User; -import com.twitter.gizmoduck.thriftjava.UserModification; -import com.twitter.gizmoduck.thriftjava.UserService; -import com.twitter.gizmoduck.thriftjava.UserType; -import com.twitter.search.common.indexing.thriftjava.AntisocialUserUpdate; -import com.twitter.search.common.indexing.thriftjava.UserUpdateType; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchLongGauge; -import com.twitter.util.Duration; -import com.twitter.util.Future; -import com.twitter.util.TimeoutException; - -/** - * This class ingests {@link UserModification} events and transforms them into a possibly empty list - * of {@link AntisocialUserUpdate}s to be indexed by Earlybirds. - */ -public class UserUpdateIngester { - private static final Logger LOG = LoggerFactory.getLogger(UserUpdateIngester.class); - private static final Duration RESULT_TIMEOUT = Duration.fromSeconds(3); - - private static final List NO_UPDATE = Collections.emptyList(); - - // Map from UserUpdateType to a set of Safety fields to examine. - private static final Map> SAFETY_FIELDS_MAP = - ImmutableMap.of( - UserUpdateType.ANTISOCIAL, - Sets.immutableEnumSet( - Safety._Fields.SUSPENDED, Safety._Fields.DEACTIVATED, Safety._Fields.OFFBOARDED), - UserUpdateType.NSFW, - Sets.immutableEnumSet(Safety._Fields.NSFW_USER, Safety._Fields.NSFW_ADMIN), - UserUpdateType.PROTECTED, Sets.immutableEnumSet(Safety._Fields.IS_PROTECTED)); - - private static final Function FIELD_TO_FIELD_NAME_FUNCTION = - field -> "safety." 
+ CaseUtils.toCamelCase(field.name(), false, '_'); - - private static final Map FIELD_NAME_TO_TYPE_MAP = - SAFETY_FIELDS_MAP.entrySet().stream() - .flatMap( - entry -> entry.getValue().stream() - .map(field -> new AbstractMap.SimpleEntry<>( - FIELD_TO_FIELD_NAME_FUNCTION.apply(field), - entry.getKey()))) - .collect(Collectors.toMap( - AbstractMap.SimpleEntry::getKey, - AbstractMap.SimpleEntry::getValue)); - - private static final Map FIELD_NAME_TO_FIELD_MAP = - SAFETY_FIELDS_MAP.values().stream() - .flatMap(Collection::stream) - .collect(Collectors.toMap( - FIELD_TO_FIELD_NAME_FUNCTION, - Function.identity())); - - private static final LookupContext LOOKUP_CONTEXT = new LookupContext() - .setInclude_deactivated(true) - .setInclude_erased(true) - .setInclude_suspended(true) - .setInclude_offboarded(true) - .setInclude_protected(true); - - private final UserService.ServiceToClient userService; - private final Decider decider; - - private final SearchLongGauge userModificationLatency; - private final SearchCounter unsuccessfulUserModificationCount; - private final SearchCounter byInactiveAccountDeactivationUserModificationCount; - private final SearchCounter irrelevantUserModificationCount; - private final SearchCounter notNormalUserCount; - private final SearchCounter missingSafetyCount; - private final SearchCounter userServiceRequests; - private final SearchCounter userServiceSuccesses; - private final SearchCounter userServiceNoResults; - private final SearchCounter userServiceFailures; - private final SearchCounter userServiceTimeouts; - private final Map, SearchCounter> counterMap; - - public UserUpdateIngester( - String statPrefix, - UserService.ServiceToClient userService, - Decider decider - ) { - this.userService = userService; - this.decider = decider; - - userModificationLatency = - SearchLongGauge.export(statPrefix + "_user_modification_latency_ms"); - unsuccessfulUserModificationCount = - SearchCounter.export(statPrefix + 
"_unsuccessful_user_modification_count"); - byInactiveAccountDeactivationUserModificationCount = - SearchCounter.export(statPrefix - + "_by_inactive_account_deactivation_user_modification_count"); - irrelevantUserModificationCount = - SearchCounter.export(statPrefix + "_irrelevant_user_modification_count"); - notNormalUserCount = - SearchCounter.export(statPrefix + "_not_normal_user_count"); - missingSafetyCount = - SearchCounter.export(statPrefix + "_missing_safety_count"); - userServiceRequests = - SearchCounter.export(statPrefix + "_user_service_requests"); - userServiceSuccesses = - SearchCounter.export(statPrefix + "_user_service_successes"); - userServiceNoResults = - SearchCounter.export(statPrefix + "_user_service_no_results"); - userServiceFailures = - SearchCounter.export(statPrefix + "_user_service_failures"); - userServiceTimeouts = - SearchCounter.export(statPrefix + "_user_service_timeouts"); - counterMap = ImmutableMap., SearchCounter>builder() - .put(Pair.of(UserUpdateType.ANTISOCIAL, true), - SearchCounter.export(statPrefix + "_antisocial_set_count")) - .put(Pair.of(UserUpdateType.ANTISOCIAL, false), - SearchCounter.export(statPrefix + "_antisocial_unset_count")) - .put(Pair.of(UserUpdateType.NSFW, true), - SearchCounter.export(statPrefix + "_nsfw_set_count")) - .put(Pair.of(UserUpdateType.NSFW, false), - SearchCounter.export(statPrefix + "_nsfw_unset_count")) - .put(Pair.of(UserUpdateType.PROTECTED, true), - SearchCounter.export(statPrefix + "_protected_set_count")) - .put(Pair.of(UserUpdateType.PROTECTED, false), - SearchCounter.export(statPrefix + "_protected_unset_count")) - .build(); - } - - /** - * Convert a UserModification event into a (possibly empty) list of antisocial updates for - * Earlybird. 
- */ - public Future> transform(UserModification userModification) { - userModificationLatency.set(System.currentTimeMillis() - userModification.getUpdated_at_msec()); - - if (!userModification.isSuccess()) { - unsuccessfulUserModificationCount.increment(); - return Future.value(NO_UPDATE); - } - - // To avoid UserTable gets overflowed, we exclude traffic from ByInactiveAccountDeactivation - if (userModification.getUser_audit_data() != null - && userModification.getUser_audit_data().getReason() != null - && userModification.getUser_audit_data().getReason() - == LifecycleChangeReason.BY_INACTIVE_ACCOUNT_DEACTIVATION) { - byInactiveAccountDeactivationUserModificationCount.increment(); - return Future.value(NO_UPDATE); - } - - long userId = userModification.getUser_id(); - Set userUpdateTypes = getUserUpdateTypes(userModification); - if (userUpdateTypes.isEmpty()) { - irrelevantUserModificationCount.increment(); - return Future.value(NO_UPDATE); - } - - Future userFuture = userModification.isSetCreate() - ? Future.value(userModification.getCreate()) - : getUser(userId); - - return userFuture - .map(user -> { - if (user == null) { - return NO_UPDATE; - } else if (user.getUser_type() != UserType.NORMAL) { - LOG.info("User with id={} is not a normal user.", userId); - notNormalUserCount.increment(); - return NO_UPDATE; - } else if (!user.isSetSafety()) { - LOG.info("Safety for User with id={} is missing.", userId); - missingSafetyCount.increment(); - return NO_UPDATE; - } - - if (userModification.isSetUpdate()) { - // Apply relevant updates from UserModification as User returned from Gizmoduck may not - // have reflected them yet. 
- applyUpdates(user, userModification); - } - - return userUpdateTypes.stream() - .map(userUpdateType -> - convertToAntiSocialUserUpdate( - user, userUpdateType, userModification.getUpdated_at_msec())) - .peek(update -> - counterMap.get(Pair.of(update.getType(), update.isValue())).increment()) - .collect(Collectors.toList()); - }) - .onFailure(com.twitter.util.Function.cons(exception -> { - if (exception instanceof UserNotFoundException) { - userServiceNoResults.increment(); - } else if (exception instanceof TimeoutException) { - userServiceTimeouts.increment(); - LOG.error("UserService.get timed out for user id=" + userId, exception); - } else { - userServiceFailures.increment(); - LOG.error("UserService.get failed for user id=" + userId, exception); - } - })); - } - - private static Set getUserUpdateTypes(UserModification userModification) { - Set types = EnumSet.noneOf(UserUpdateType.class); - - if (userModification.isSetUpdate()) { - userModification.getUpdate().stream() - .map(UpdateDiffItem::getField_name) - .map(FIELD_NAME_TO_TYPE_MAP::get) - .filter(Objects::nonNull) - .collect(Collectors.toCollection(() -> types)); - } else if (userModification.isSetCreate() && userModification.getCreate().isSetSafety()) { - Safety safety = userModification.getCreate().getSafety(); - if (safety.isSuspended()) { - types.add(UserUpdateType.ANTISOCIAL); - } - if (safety.isNsfw_admin() || safety.isNsfw_user()) { - types.add(UserUpdateType.NSFW); - } - if (safety.isIs_protected()) { - types.add(UserUpdateType.PROTECTED); - } - } - - return types; - } - - private Future getUser(long userId) { - userServiceRequests.increment(); - return userService.get( - LOOKUP_CONTEXT, - Collections.singletonList(userId), - Collections.singleton(QueryFields.SAFETY)) - .within(DefaultTimer.getInstance(), RESULT_TIMEOUT) - .flatMap(userResults -> { - if (userResults.size() != 1 || !userResults.get(0).isSetUser()) { - return Future.exception(new UserNotFoundException(userId)); - } - - 
userServiceSuccesses.increment(); - return Future.value(userResults.get(0).getUser()); - }); - } - - private static void applyUpdates(User user, UserModification userModification) { - userModification.getUpdate().stream() - .filter(update -> FIELD_NAME_TO_FIELD_MAP.containsKey(update.getField_name())) - .filter(UpdateDiffItem::isSetAfter) - .forEach(update -> - user.getSafety().setFieldValue( - FIELD_NAME_TO_FIELD_MAP.get(update.getField_name()), - Boolean.valueOf(update.getAfter())) - ); - } - - private AntisocialUserUpdate convertToAntiSocialUserUpdate( - User user, - UserUpdateType userUpdateType, - long updatedAt) { - boolean value = SAFETY_FIELDS_MAP.get(userUpdateType).stream() - .anyMatch(safetyField -> (boolean) user.getSafety().getFieldValue(safetyField)); - return new AntisocialUserUpdate(user.getId(), userUpdateType, value, updatedAt); - } - - class UserNotFoundException extends Exception { - UserNotFoundException(long userId) { - super("User " + userId + " not found."); - } - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/userupdates/UserUpdatesPipeline.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/userupdates/UserUpdatesPipeline.docx new file mode 100644 index 000000000..f4d3e9679 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/userupdates/UserUpdatesPipeline.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/userupdates/UserUpdatesPipeline.java b/src/java/com/twitter/search/ingester/pipeline/twitter/userupdates/UserUpdatesPipeline.java deleted file mode 100644 index 5cbf009d2..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/userupdates/UserUpdatesPipeline.java +++ /dev/null @@ -1,222 +0,0 @@ -package com.twitter.search.ingester.pipeline.twitter.userupdates; - -import java.time.Duration; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.concurrent.Semaphore; -import 
java.util.function.Supplier; - -import scala.runtime.BoxedUnit; - -import com.google.common.base.Preconditions; - -import org.apache.kafka.clients.consumer.ConsumerRecord; -import org.apache.kafka.clients.consumer.KafkaConsumer; -import org.apache.kafka.clients.producer.ProducerRecord; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.util.Clock; -import com.twitter.finatra.kafka.producers.BlockingFinagleKafkaProducer; -import com.twitter.gizmoduck.thriftjava.UserModification; -import com.twitter.search.common.indexing.thriftjava.AntisocialUserUpdate; -import com.twitter.search.common.metrics.SearchCustomGauge; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.common.util.io.kafka.CompactThriftSerializer; -import com.twitter.search.common.util.io.kafka.ThriftDeserializer; -import com.twitter.search.ingester.pipeline.wire.WireModule; -import com.twitter.util.Future; -import com.twitter.util.Futures; - -/** - * This class reads UserModification events from Kafka, transforms them into AntisocialUserUpdates, - * and writes them to Kafka. - */ -public final class UserUpdatesPipeline { - private static final Logger LOG = LoggerFactory.getLogger(UserUpdatesPipeline.class); - private static final Duration POLL_TIMEOUT = Duration.ofSeconds(1); - private static final int MAX_PENDING_EVENTS = 100; - private static final String KAFKA_CLIENT_ID = ""; - private static final int MAX_POLL_RECORDS = 1; - private static final String USER_MODIFICATIONS_KAFKA_TOPIC = ""; - private static final String USER_UPDATES_KAFKA_TOPIC_PREFIX = ""; - private static final String KAFKA_PRODUCER_DEST = ""; - private static final String KAFKA_CONSUMER_DEST = ""; - - // This semaphore stops us from having more than MAX_PENDING_EVENTS in the pipeline at any point - // in time. 
- private final Semaphore pendingEvents = new Semaphore(MAX_PENDING_EVENTS); - private final Supplier isRunning; - private final KafkaConsumer userModificationConsumer; - private final UserUpdateIngester userUpdateIngester; - private final SearchRateCounter records; - private final SearchRateCounter success; - private final SearchRateCounter failure; - - private final String userUpdatesKafkaTopic; - private final BlockingFinagleKafkaProducer userUpdatesProducer; - private final Clock clock; - - /** - * Builds the pipeline. - */ - public static UserUpdatesPipeline buildPipeline( - String environment, - WireModule wireModule, - String statsPrefix, - Supplier isRunning, - Clock clock) throws Exception { - - // We only have Gizmoduck clients for staging and prod. - String gizmoduckClient; - if (environment.startsWith("staging")) { - gizmoduckClient = ""; - } else { - Preconditions.checkState("prod".equals(environment)); - gizmoduckClient = ""; - } - LOG.info("Gizmoduck client: {}", gizmoduckClient); - - String kafkaConsumerGroup = "" + environment; - KafkaConsumer userModificationConsumer = wireModule.newKafkaConsumer( - KAFKA_CONSUMER_DEST, - new ThriftDeserializer<>(UserModification.class), - KAFKA_CLIENT_ID, - kafkaConsumerGroup, - MAX_POLL_RECORDS); - userModificationConsumer.subscribe(Collections.singleton(USER_MODIFICATIONS_KAFKA_TOPIC)); - LOG.info("User modifications topic: {}", USER_MODIFICATIONS_KAFKA_TOPIC); - LOG.info("User updates Kafka topic prefix: {}", USER_UPDATES_KAFKA_TOPIC_PREFIX); - LOG.info("Kafka consumer group: {}", kafkaConsumerGroup); - LOG.info("Kafka client id: {}", KAFKA_CLIENT_ID); - - UserUpdateIngester userUpdateIngester = new UserUpdateIngester( - statsPrefix, - wireModule.getGizmoduckClient(gizmoduckClient), - wireModule.getDecider()); - - String userUpdatesKafkaTopic = USER_UPDATES_KAFKA_TOPIC_PREFIX + environment; - BlockingFinagleKafkaProducer userUpdatesProducer = - wireModule.newFinagleKafkaProducer( - KAFKA_PRODUCER_DEST, - new 
CompactThriftSerializer(), - KAFKA_CLIENT_ID, - null); - - return new UserUpdatesPipeline( - isRunning, - userModificationConsumer, - userUpdateIngester, - userUpdatesProducer, - userUpdatesKafkaTopic, - clock); - } - - private UserUpdatesPipeline( - Supplier isRunning, - KafkaConsumer userModificationConsumer, - UserUpdateIngester userUpdateIngester, - BlockingFinagleKafkaProducer userUpdatesProducer, - String userUpdatesKafkaTopic, - Clock clock) { - this.isRunning = isRunning; - this.userModificationConsumer = userModificationConsumer; - this.userUpdateIngester = userUpdateIngester; - this.userUpdatesProducer = userUpdatesProducer; - this.userUpdatesKafkaTopic = userUpdatesKafkaTopic; - this.clock = clock; - - String statPrefix = "user_updates_pipeline_"; - SearchCustomGauge.export(statPrefix + "semaphore_permits", pendingEvents::availablePermits); - - records = SearchRateCounter.export(statPrefix + "records_processed_total"); - success = SearchRateCounter.export(statPrefix + "records_processed_success"); - failure = SearchRateCounter.export(statPrefix + "records_processed_failure"); - } - - /** - * Start the user updates pipeline. - */ - public void run() { - while (isRunning.get()) { - try { - pollFromKafka(); - } catch (Throwable e) { - LOG.error("Exception processing event.", e); - } - } - close(); - } - /** - * Polls records from Kafka and handles timeouts, back-pressure, and error handling. - * All consumed messages are passed to the messageHandler. - */ - private void pollFromKafka() throws Exception { - for (ConsumerRecord record - : userModificationConsumer.poll(POLL_TIMEOUT)) { - pendingEvents.acquire(); - records.increment(); - - handleUserModification(record.value()) - .onFailure(e -> { - failure.increment(); - return null; - }) - .onSuccess(u -> { - success.increment(); - return null; - }) - .ensure(() -> { - pendingEvents.release(); - return null; - }); - } - } - - /** - * Handles the business logic for the user updates pipeline: - * 1. 
Converts incoming event into possibly empty set of AntisocialUserUpdates - * 2. Writes the result to Kafka so that Earlybird can consume it. - */ - private Future handleUserModification(UserModification event) { - return userUpdateIngester - .transform(event) - .flatMap(this::writeListToKafka); - } - - private Future writeListToKafka(List updates) { - List> futures = new ArrayList<>(); - for (AntisocialUserUpdate update : updates) { - futures.add(writeToKafka(update)); - } - return Futures.join(futures).onFailure(e -> { - LOG.info("Exception while writing to kafka", e); - return null; - }); - } - - private Future writeToKafka(AntisocialUserUpdate update) { - ProducerRecord record = new ProducerRecord<>( - userUpdatesKafkaTopic, - null, - clock.nowMillis(), - null, - update); - try { - return userUpdatesProducer.send(record).unit(); - } catch (Exception e) { - return Future.exception(e); - } - } - - private void close() { - userModificationConsumer.close(); - try { - // Acquire all of the permits, so we know all pending events have been written. 
- pendingEvents.acquire(MAX_PENDING_EVENTS); - } catch (Exception e) { - LOG.error("Error shutting down stage", e); - } - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/userupdates/UserUpdatesPipelineStage.docx b/src/java/com/twitter/search/ingester/pipeline/twitter/userupdates/UserUpdatesPipelineStage.docx new file mode 100644 index 000000000..c3c1517b9 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/twitter/userupdates/UserUpdatesPipelineStage.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/twitter/userupdates/UserUpdatesPipelineStage.java b/src/java/com/twitter/search/ingester/pipeline/twitter/userupdates/UserUpdatesPipelineStage.java deleted file mode 100644 index 77ba0acf0..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/twitter/userupdates/UserUpdatesPipelineStage.java +++ /dev/null @@ -1,51 +0,0 @@ -package com.twitter.search.ingester.pipeline.twitter.userupdates; - -import java.util.function.Supplier; - -import org.apache.commons.pipeline.Pipeline; -import org.apache.commons.pipeline.StageDriver; -import org.apache.commons.pipeline.StageException; - -import com.twitter.search.ingester.pipeline.twitter.TwitterBaseStage; -import com.twitter.search.ingester.pipeline.util.PipelineUtil; - -/** - * This stage is a shim for the UserUpdatesPipeline. - * - * Eventually the UserUpdatesPipeline will be called directly from a TwitterServer, but this exists - * as a bridge while we migrate. - */ -public class UserUpdatesPipelineStage extends TwitterBaseStage { - // This is 'prod', 'staging', or 'staging1'. 
- private String environment; - private UserUpdatesPipeline userUpdatesPipeline; - - @Override - protected void doInnerPreprocess() throws StageException { - StageDriver driver = ((Pipeline) stageContext).getStageDriver(this); - Supplier booleanSupplier = () -> driver.getState() == StageDriver.State.RUNNING; - try { - userUpdatesPipeline = UserUpdatesPipeline.buildPipeline( - environment, - wireModule, - getStageNamePrefix(), - booleanSupplier, - clock); - - } catch (Exception e) { - throw new StageException(this, e); - } - PipelineUtil.feedStartObjectToStage(this); - } - - @Override - public void innerProcess(Object obj) throws StageException { - userUpdatesPipeline.run(); - } - - @SuppressWarnings("unused") // populated from pipeline config - public void setEnvironment(String environment) { - this.environment = environment; - } - -} diff --git a/src/java/com/twitter/search/ingester/pipeline/util/BUILD b/src/java/com/twitter/search/ingester/pipeline/util/BUILD deleted file mode 100644 index 916b58636..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/util/BUILD +++ /dev/null @@ -1,41 +0,0 @@ -java_library( - sources = ["*.java"], - compiler_option_sets = ["fatal_warnings"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/code/findbugs:jsr305", - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/com/google/inject:guice", - "3rdparty/jvm/commons-lang", - "3rdparty/jvm/commons-logging", - "3rdparty/jvm/org/apache/commons:commons-math3", - "3rdparty/jvm/org/apache/thrift:libthrift", - "decider/src/main/scala", - "src/java/com/twitter/common_internal/text/version", - "src/java/com/twitter/metastore/client_v2", - "src/java/com/twitter/metastore/data", - "src/java/com/twitter/search/common/debug", - "src/java/com/twitter/search/common/encoding/features", - "src/java/com/twitter/search/common/metrics", - "src/java/com/twitter/search/common/partitioning/base", - 
"src/java/com/twitter/search/common/relevance:entities_and_filters", - "src/java/com/twitter/search/common/relevance/features", - "src/java/com/twitter/search/common/schema", - "src/java/com/twitter/search/common/schema/base", - "src/java/com/twitter/search/common/schema/earlybird", - "src/java/com/twitter/search/common/util/geocoding", - "src/java/com/twitter/search/common/util/text", - "src/java/com/twitter/search/ingester/model", - "src/java/org/apache/commons/pipeline", - "src/scala/com/twitter/common_internal/analytics/test_user_filter", - "src/thrift/com/twitter/expandodo:cards-java", - "src/thrift/com/twitter/manhattan:internal-scala", - "src/thrift/com/twitter/search/common:indexing-java", - "src/thrift/com/twitter/search/common:schema-java", - "src/thrift/com/twitter/service/metastore/gen:thrift-java", - "stitch/stitch-core", - "storage/clients/manhattan", - "util/util-core:scala", - ], -) diff --git a/src/java/com/twitter/search/ingester/pipeline/util/BUILD.docx b/src/java/com/twitter/search/ingester/pipeline/util/BUILD.docx new file mode 100644 index 000000000..2a59439ac Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/util/BUILD.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/util/BatchedElement.docx b/src/java/com/twitter/search/ingester/pipeline/util/BatchedElement.docx new file mode 100644 index 000000000..3e5f98887 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/util/BatchedElement.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/util/BatchedElement.java b/src/java/com/twitter/search/ingester/pipeline/util/BatchedElement.java deleted file mode 100644 index 7b78a1fc5..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/util/BatchedElement.java +++ /dev/null @@ -1,21 +0,0 @@ -package com.twitter.search.ingester.pipeline.util; - -import java.util.concurrent.CompletableFuture; - -public class BatchedElement { - private CompletableFuture 
completableFuture; - private T item; - - public BatchedElement(T item, CompletableFuture completableFuture) { - this.item = item; - this.completableFuture = completableFuture; - } - - public T getItem() { - return item; - } - - public CompletableFuture getCompletableFuture() { - return completableFuture; - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/util/BatchingClient.docx b/src/java/com/twitter/search/ingester/pipeline/util/BatchingClient.docx new file mode 100644 index 000000000..326285765 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/util/BatchingClient.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/util/BatchingClient.java b/src/java/com/twitter/search/ingester/pipeline/util/BatchingClient.java deleted file mode 100644 index 222c6f544..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/util/BatchingClient.java +++ /dev/null @@ -1,105 +0,0 @@ -package com.twitter.search.ingester.pipeline.util; - -import java.util.HashSet; -import java.util.Map; -import java.util.Set; -import java.util.concurrent.ConcurrentHashMap; - -import com.google.common.collect.Sets; - -import com.twitter.util.Future; -import com.twitter.util.Promise; - -/** - * Batches single requests of type RQ -> Future to an underlying client that supports batch - * calls with multiple values of type RQ. Threadsafe. - */ -public class BatchingClient { - @FunctionalInterface - public interface BatchClient { - /** - * Issue a request to the underlying store which supports batches of requests. - */ - Future> batchGet(Set requests); - } - - /** - * unsentRequests is not threadsafe, and so it must be externally synchronized. 
- */ - private final HashSet unsentRequests = new HashSet<>(); - - private final ConcurrentHashMap> promises = new ConcurrentHashMap<>(); - - private final BatchClient batchClient; - private final int batchSize; - - public BatchingClient( - BatchClient batchClient, - int batchSize - ) { - this.batchClient = batchClient; - this.batchSize = batchSize; - } - - /** - * Send a request and receive a Future. The future will not be resolved until at there at - * least batchSize requests ready to send. - */ - public Future call(RQ request) { - Promise promise = promises.computeIfAbsent(request, r -> new Promise<>()); - - maybeBatchCall(request); - - return promise; - } - - private void maybeBatchCall(RQ request) { - Set frozenRequests; - synchronized (unsentRequests) { - unsentRequests.add(request); - if (unsentRequests.size() < batchSize) { - return; - } - - // Make a copy of requests so we can modify it inside executeBatchCall without additional - // synchronization. - frozenRequests = new HashSet<>(unsentRequests); - unsentRequests.clear(); - } - - executeBatchCall(frozenRequests); - } - - private void executeBatchCall(Set requests) { - batchClient.batchGet(requests) - .onSuccess(responseMap -> { - for (Map.Entry entry : responseMap.entrySet()) { - Promise promise = promises.remove(entry.getKey()); - if (promise != null) { - promise.become(Future.value(entry.getValue())); - } - } - - Set outstandingRequests = Sets.difference(requests, responseMap.keySet()); - for (RQ request : outstandingRequests) { - Promise promise = promises.remove(request); - if (promise != null) { - promise.become(Future.exception(new ResponseNotReturnedException(request))); - } - } - - return null; - }) - .onFailure(exception -> { - for (RQ request : requests) { - Promise promise = promises.remove(request); - if (promise != null) { - promise.become(Future.exception(exception)); - } - } - - return null; - }); - } -} - diff --git a/src/java/com/twitter/search/ingester/pipeline/util/CardFieldUtil.docx 
b/src/java/com/twitter/search/ingester/pipeline/util/CardFieldUtil.docx new file mode 100644 index 000000000..3d3862001 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/util/CardFieldUtil.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/util/CardFieldUtil.java b/src/java/com/twitter/search/ingester/pipeline/util/CardFieldUtil.java deleted file mode 100644 index ae82f8764..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/util/CardFieldUtil.java +++ /dev/null @@ -1,48 +0,0 @@ -package com.twitter.search.ingester.pipeline.util; - -import com.google.common.base.Strings; - -import com.twitter.expandodo.thriftjava.BindingValue; -import com.twitter.expandodo.thriftjava.BindingValueType; -import com.twitter.expandodo.thriftjava.Card2; -import com.twitter.search.common.util.text.LanguageIdentifierHelper; -import com.twitter.search.ingester.model.IngesterTwitterMessage; - -public final class CardFieldUtil { - - private CardFieldUtil() { - /* prevent instantiation */ - } - - /** - * Binding Keys for card fields - */ - public static final String TITLE_BINDING_KEY = "title"; - public static final String DESCRIPTION_BINDING_KEY = "description"; - - /** - * given a bindingKey and card, will return the bindingValue of the given bindingKey - * if present in card.getBinding_values(). If no match is found return null. - */ - public static String extractBindingValue(String bindingKey, Card2 card) { - for (BindingValue bindingValue : card.getBinding_values()) { - if ((bindingValue != null) - && bindingValue.isSetType() - && (bindingValue.getType() == BindingValueType.STRING) - && bindingKey.equals(bindingValue.getKey())) { - return bindingValue.getString_value(); - } - } - return null; - } - - /** - * derives card lang from title + description and sets it in TwitterMessage. 
- */ - public static void deriveCardLang(IngesterTwitterMessage message) { - message.setCardLang(LanguageIdentifierHelper.identifyLanguage(String.format("%s %s", - Strings.nullToEmpty(message.getCardTitle()), - Strings.nullToEmpty(message.getCardDescription()))).getLanguage()); - } -} - diff --git a/src/java/com/twitter/search/ingester/pipeline/util/IngesterStageTimer.docx b/src/java/com/twitter/search/ingester/pipeline/util/IngesterStageTimer.docx new file mode 100644 index 000000000..c755098f9 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/util/IngesterStageTimer.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/util/IngesterStageTimer.java b/src/java/com/twitter/search/ingester/pipeline/util/IngesterStageTimer.java deleted file mode 100644 index a20db70a6..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/util/IngesterStageTimer.java +++ /dev/null @@ -1,35 +0,0 @@ -package com.twitter.search.ingester.pipeline.util; -import java.util.concurrent.TimeUnit; -import com.twitter.common.base.MorePreconditions; -import com.twitter.search.common.metrics.SearchTimerStats; -import org.apache.commons.pipeline.stage.StageTimer; -/** - * Adds science stats export to StageTimer - */ -public class IngesterStageTimer extends StageTimer { - private final String name; - private final SearchTimerStats timer; - - public IngesterStageTimer(String statName) { - name = MorePreconditions.checkNotBlank(statName); - timer = SearchTimerStats.export(name, TimeUnit.NANOSECONDS, true); - } - - public String getName() { - return name; - } - - @Override - public void start() { - // This override is not necessary; it is added for code readability. 
- // super.start puts the current time in startTime - super.start(); - } - - @Override - public void stop() { - super.stop(); - long runTime = System.nanoTime() - startTime.get(); - timer.timerIncrement(runTime); - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/util/ManhattanCodedLocationProvider.docx b/src/java/com/twitter/search/ingester/pipeline/util/ManhattanCodedLocationProvider.docx new file mode 100644 index 000000000..6b5bfe620 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/util/ManhattanCodedLocationProvider.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/util/ManhattanCodedLocationProvider.java b/src/java/com/twitter/search/ingester/pipeline/util/ManhattanCodedLocationProvider.java deleted file mode 100644 index cc569a939..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/util/ManhattanCodedLocationProvider.java +++ /dev/null @@ -1,110 +0,0 @@ -package com.twitter.search.ingester.pipeline.util; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.Iterator; -import java.util.List; -import java.util.Optional; - -import com.google.common.base.Preconditions; - -import com.twitter.search.common.indexing.thriftjava.ThriftGeoLocationSource; -import com.twitter.search.common.indexing.thriftjava.ThriftGeoPoint; -import com.twitter.search.common.indexing.thriftjava.ThriftGeocodeRecord; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.relevance.entities.GeoObject; -import com.twitter.search.common.util.geocoding.ManhattanGeocodeRecordStore; -import com.twitter.search.ingester.model.IngesterTwitterMessage; -import com.twitter.stitch.Stitch; -import com.twitter.storage.client.manhattan.kv.JavaManhattanKVEndpoint; -import com.twitter.storage.client.manhattan.kv.ManhattanValue; -import com.twitter.util.Function; -import com.twitter.util.Future; - - -public final class ManhattanCodedLocationProvider { - - private final 
ManhattanGeocodeRecordStore store; - private final SearchCounter locationsCounter; - - private static final String LOCATIONS_POPULATED_STAT_NAME = "_locations_populated_count"; - - public static ManhattanCodedLocationProvider createWithEndpoint( - JavaManhattanKVEndpoint endpoint, String metricsPrefix, String datasetName) { - return new ManhattanCodedLocationProvider( - ManhattanGeocodeRecordStore.create(endpoint, datasetName), metricsPrefix); - } - - private ManhattanCodedLocationProvider(ManhattanGeocodeRecordStore store, String metricPrefix) { - this.locationsCounter = SearchCounter.export(metricPrefix + LOCATIONS_POPULATED_STAT_NAME); - this.store = store; - } - - /** - * Iterates through all given messages, and for each message that has a location set, retrieves - * the coordinates of that location from Manhattan and sets them back on that message. - */ - public Future> populateCodedLatLon( - Collection messages) { - if (messages.isEmpty()) { - return Future.value(messages); - } - - // Batch read requests - List>>> readRequests = - new ArrayList<>(messages.size()); - for (IngesterTwitterMessage message : messages) { - readRequests.add(store.asyncReadFromManhattan(message.getLocation())); - } - Future>>> batchedRequest = - Stitch.run(Stitch.collect(readRequests)); - - return batchedRequest.map(Function.func(optGeoLocations -> { - // Iterate over messages and responses simultaneously - Preconditions.checkState(messages.size() == optGeoLocations.size()); - Iterator messageIterator = messages.iterator(); - Iterator>> optGeoLocationIterator = - optGeoLocations.iterator(); - while (messageIterator.hasNext() && optGeoLocationIterator.hasNext()) { - IngesterTwitterMessage message = messageIterator.next(); - Optional> optGeoLocation = - optGeoLocationIterator.next(); - if (setGeoLocationForMessage(message, optGeoLocation)) { - locationsCounter.increment(); - } - } - return messages; - })); - } - - /** - * Returns whether a valid geolocation was successfully found and 
saved in the message. - */ - private boolean setGeoLocationForMessage( - IngesterTwitterMessage message, - Optional> optGeoLocation) { - if (optGeoLocation.isPresent()) { - ThriftGeocodeRecord geoLocation = optGeoLocation.get().contents(); - ThriftGeoPoint geoTags = geoLocation.getGeoPoint(); - - if ((geoTags.getLatitude() == GeoObject.DOUBLE_FIELD_NOT_PRESENT) - && (geoTags.getLongitude() == GeoObject.DOUBLE_FIELD_NOT_PRESENT)) { - // This case indicates that we have "negative cache" in coded_locations table, so - // don't try to geocode again. - message.setUncodeableLocation(); - return false; - } else { - GeoObject code = new GeoObject( - geoTags.getLatitude(), - geoTags.getLongitude(), - geoTags.getAccuracy(), - ThriftGeoLocationSource.USER_PROFILE); - message.setGeoLocation(code); - return true; - } - } else { - message.setGeocodeRequired(); - return false; - } - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/util/PenguinVersionsUtil.docx b/src/java/com/twitter/search/ingester/pipeline/util/PenguinVersionsUtil.docx new file mode 100644 index 000000000..1b221cdc8 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/util/PenguinVersionsUtil.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/util/PenguinVersionsUtil.java b/src/java/com/twitter/search/ingester/pipeline/util/PenguinVersionsUtil.java deleted file mode 100644 index 323dd201d..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/util/PenguinVersionsUtil.java +++ /dev/null @@ -1,47 +0,0 @@ -package com.twitter.search.ingester.pipeline.util; - -import java.util.ArrayList; -import java.util.List; - -import com.google.common.base.Preconditions; - -import com.twitter.common_internal.text.version.PenguinVersion; -import com.twitter.decider.Decider; - -public final class PenguinVersionsUtil { - - private PenguinVersionsUtil() { /* prevent instantiation */ } - - /** - * Utility method for updating penguinVersions lists via decider 
availability. We must have - * at least one version available. - * @param penguinVersions - * @param decider - * @return - */ - public static List filterPenguinVersionsWithDeciders( - List penguinVersions, - Decider decider) { - List updatedPenguinVersions = new ArrayList<>(); - for (PenguinVersion penguinVersion : penguinVersions) { - if (isPenguinVersionAvailable(penguinVersion, decider)) { - updatedPenguinVersions.add(penguinVersion); - } - } - Preconditions.checkArgument(penguinVersions.size() > 0, - "At least one penguin version must be specified."); - - return updatedPenguinVersions; - } - - /** - * Checks penguinVersion decider for availability. - * @param penguinVersion - * @param decider - * @return - */ - public static boolean isPenguinVersionAvailable(PenguinVersion penguinVersion, Decider decider) { - return decider.isAvailable( - String.format("enable_penguin_version_%d", penguinVersion.getByteValue())); - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/util/PipelineExceptionHandler.docx b/src/java/com/twitter/search/ingester/pipeline/util/PipelineExceptionHandler.docx new file mode 100644 index 000000000..5e5a585c6 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/util/PipelineExceptionHandler.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/util/PipelineExceptionHandler.java b/src/java/com/twitter/search/ingester/pipeline/util/PipelineExceptionHandler.java deleted file mode 100644 index fc9dd2a72..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/util/PipelineExceptionHandler.java +++ /dev/null @@ -1,15 +0,0 @@ -package com.twitter.search.ingester.pipeline.util; - -import com.twitter.util.Duration; - -public interface PipelineExceptionHandler { - /** - * Logs the given message and waits the given duration. - */ - void logAndWait(String msg, Duration waitTime) throws InterruptedException; - - /** - * Logs the given message and shutdowns the application. 
- */ - void logAndShutdown(String msg); -} diff --git a/src/java/com/twitter/search/ingester/pipeline/util/PipelineStageException.docx b/src/java/com/twitter/search/ingester/pipeline/util/PipelineStageException.docx new file mode 100644 index 000000000..99b695062 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/util/PipelineStageException.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/util/PipelineStageException.java b/src/java/com/twitter/search/ingester/pipeline/util/PipelineStageException.java deleted file mode 100644 index 4f4dcddbf..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/util/PipelineStageException.java +++ /dev/null @@ -1,19 +0,0 @@ -package com.twitter.search.ingester.pipeline.util; - -public class PipelineStageException extends Exception { - public PipelineStageException(Object location, String message, Throwable cause) { - super(message + " In Stage : " + location.getClass(), cause); - } - - public PipelineStageException(Throwable cause) { - super(cause); - } - - public PipelineStageException(String message) { - super(message); - } - - public PipelineStageException(Object location, String message) { - super(message + " In Stage : " + location.getClass()); - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/util/PipelineStageRuntimeException.docx b/src/java/com/twitter/search/ingester/pipeline/util/PipelineStageRuntimeException.docx new file mode 100644 index 000000000..e8101717d Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/util/PipelineStageRuntimeException.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/util/PipelineStageRuntimeException.java b/src/java/com/twitter/search/ingester/pipeline/util/PipelineStageRuntimeException.java deleted file mode 100644 index 32d237804..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/util/PipelineStageRuntimeException.java +++ /dev/null @@ -1,7 +0,0 @@ -package 
com.twitter.search.ingester.pipeline.util; - -public class PipelineStageRuntimeException extends RuntimeException { - public PipelineStageRuntimeException(String msg) { - super(msg); - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/util/PipelineUtil.docx b/src/java/com/twitter/search/ingester/pipeline/util/PipelineUtil.docx new file mode 100644 index 000000000..9c1943cad Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/util/PipelineUtil.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/util/PipelineUtil.java b/src/java/com/twitter/search/ingester/pipeline/util/PipelineUtil.java deleted file mode 100644 index 58159347b..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/util/PipelineUtil.java +++ /dev/null @@ -1,26 +0,0 @@ -package com.twitter.search.ingester.pipeline.util; - -import com.google.common.base.Preconditions; - -import org.apache.commons.pipeline.Feeder; -import org.apache.commons.pipeline.stage.InstrumentedBaseStage; - -public final class PipelineUtil { - - /** - * Feed an object to a specified stage. Used for stages that follow the pattern of - * looping indefinitely in the first call to process() and don't care what the object passed - * in is, but still needs at least one item fed to the stage to start processing. - * - * Examples of stages like this are: EventBusReaderStage and KafkaBytesReaderStage - * - * @param stage stage to enqueue an arbitrary object to. 
- */ - public static void feedStartObjectToStage(InstrumentedBaseStage stage) { - Feeder stageFeeder = stage.getStageContext().getStageFeeder(stage); - Preconditions.checkNotNull(stageFeeder); - stageFeeder.feed("off to the races"); - } - - private PipelineUtil() { /* prevent instantiation */ } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/util/PipelineV2CreationException.docx b/src/java/com/twitter/search/ingester/pipeline/util/PipelineV2CreationException.docx new file mode 100644 index 000000000..909492c0f Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/util/PipelineV2CreationException.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/util/PipelineV2CreationException.java b/src/java/com/twitter/search/ingester/pipeline/util/PipelineV2CreationException.java deleted file mode 100644 index 9248050c4..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/util/PipelineV2CreationException.java +++ /dev/null @@ -1,7 +0,0 @@ -package com.twitter.search.ingester.pipeline.util; - -public class PipelineV2CreationException extends Exception { - public PipelineV2CreationException(String message) { - super(message); - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/util/ResponseNotReturnedException.docx b/src/java/com/twitter/search/ingester/pipeline/util/ResponseNotReturnedException.docx new file mode 100644 index 000000000..90ce15ff9 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/util/ResponseNotReturnedException.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/util/ResponseNotReturnedException.java b/src/java/com/twitter/search/ingester/pipeline/util/ResponseNotReturnedException.java deleted file mode 100644 index ad58148cf..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/util/ResponseNotReturnedException.java +++ /dev/null @@ -1,7 +0,0 @@ -package com.twitter.search.ingester.pipeline.util; - -public class 
ResponseNotReturnedException extends Exception { - ResponseNotReturnedException(Object request) { - super("Response not returned in batch for request: " + request); - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/util/UserPropertiesManager.docx b/src/java/com/twitter/search/ingester/pipeline/util/UserPropertiesManager.docx new file mode 100644 index 000000000..b7595900f Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/util/UserPropertiesManager.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/util/UserPropertiesManager.java b/src/java/com/twitter/search/ingester/pipeline/util/UserPropertiesManager.java deleted file mode 100644 index d11932289..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/util/UserPropertiesManager.java +++ /dev/null @@ -1,446 +0,0 @@ -package com.twitter.search.ingester.pipeline.util; - -import java.util.Collection; -import java.util.Collections; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.Set; -import javax.annotation.Nullable; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; - -import org.apache.thrift.TBase; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common_internal.analytics.test_user_filter.TestUserFilter; -import com.twitter.common_internal.text.version.PenguinVersion; -import com.twitter.metastore.client_v2.MetastoreClient; -import com.twitter.metastore.data.MetastoreColumn; -import com.twitter.metastore.data.MetastoreException; -import com.twitter.metastore.data.MetastoreRow; -import com.twitter.metastore.data.MetastoreValue; -import com.twitter.search.common.metrics.RelevanceStats; -import com.twitter.search.common.metrics.SearchCounter; -import 
com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.common.metrics.SearchRequestStats; -import com.twitter.search.common.relevance.entities.TwitterMessage; -import com.twitter.search.common.relevance.features.RelevanceSignalConstants; -import com.twitter.search.ingester.model.IngesterTwitterMessage; -import com.twitter.service.metastore.gen.ResponseCode; -import com.twitter.service.metastore.gen.TweepCred; -import com.twitter.util.Function; -import com.twitter.util.Future; - -public class UserPropertiesManager { - private static final Logger LOG = LoggerFactory.getLogger(UserPropertiesManager.class); - - @VisibleForTesting - protected static final List>> COLUMNS = - ImmutableList.of(MetastoreColumn.TWEEPCRED); // contains tweepcred value - - // same spam threshold that is use in tweeypie to spread user level spam to tweets, all tweets - // from user with spam score above such are marked so and removed from search results - @VisibleForTesting - public static final double SPAM_SCORE_THRESHOLD = 4.5; - - @VisibleForTesting - static final SearchRequestStats MANHATTAN_METASTORE_STATS = - SearchRequestStats.export("manhattan_metastore_get", true); - - private static final MetastoreGetColumnStats GET_TWEEP_CRED - = new MetastoreGetColumnStats("tweep_cred"); - - @VisibleForTesting - static final SearchRateCounter MISSING_REPUTATION_COUNTER = RelevanceStats.exportRate( - "num_missing_reputation"); - @VisibleForTesting - static final SearchRateCounter INVALID_REPUTATION_COUNTER = RelevanceStats.exportRate( - "num_invalid_reputation"); - @VisibleForTesting - static final SearchRateCounter ACCEPTED_REPUTATION_COUNTER = RelevanceStats.exportRate( - "num_accepted_reputation"); - @VisibleForTesting - static final SearchRateCounter SKIPPED_REPUTATION_CHECK_COUNTER = RelevanceStats.exportRate( - "num_skipped_reputation_check_for_test_user"); - @VisibleForTesting - static final SearchCounter DEFAULT_REPUTATION_COUNTER = SearchCounter.export( - 
"messages_default_reputation_count"); - @VisibleForTesting - static final SearchCounter MESSAGE_FROM_TEST_USER = - SearchCounter.export("messages_from_test_user"); - - // User level bits that are spread onto tweets - private static final SearchRateCounter IS_USER_NSFW_COUNTER = RelevanceStats.exportRate( - "num_is_nsfw"); - private static final SearchRateCounter IS_USER_SPAM_COUNTER = RelevanceStats.exportRate( - "num_is_spam"); - - // count how many tweets has "possibly_sensitive" set to true in the original json message - private static final SearchRateCounter IS_SENSITIVE_FROM_JSON_COUNTER = RelevanceStats.exportRate( - "num_is_sensitive_in_json"); - - private static final SearchCounter SENSITIVE_BITS_COUNTER = - SearchCounter.export("messages_sensitive_bits_set_count"); - - private final MetastoreClient metastoreClient; - private final UserPropertiesManager.MetastoreGetColumnStats tweepCredStats; - - /** - * Stats for keeping track of multiGet requests to metastore for a specific data column. - */ - @VisibleForTesting static class MetastoreGetColumnStats { - /** - * No data was returned from metastore for a specific user. - */ - private final SearchCounter notReturned; - /** - * Metastore returned a successful OK response. - */ - private final SearchCounter metastoreSuccess; - /** - * Metastore returned a NOT_FOUND response for a user. - */ - private final SearchCounter metastoreNotFound; - /** - * Metastore returned a BAD_INPUT response for a user. - */ - private final SearchCounter metastoreBadInput; - /** - * Metastore returned a TRANSIENT_ERROR response for a user. - */ - private final SearchCounter metastoreTransientError; - /** - * Metastore returned a PERMANENT_ERROR response for a user. - */ - private final SearchCounter metastorePermanentError; - /** - * Metastore returned an unknown response code for a user. - */ - private final SearchCounter metastoreUnknownResponseCode; - /** - * Total number of users that we asked data for in metastore. 
- */ - private final SearchCounter totalRequests; - - @VisibleForTesting MetastoreGetColumnStats(String columnName) { - String prefix = "manhattan_metastore_get_" + columnName; - notReturned = SearchCounter.export(prefix + "_response_not_returned"); - metastoreSuccess = SearchCounter.export(prefix + "_response_success"); - metastoreNotFound = SearchCounter.export(prefix + "_response_not_found"); - metastoreBadInput = SearchCounter.export(prefix + "_response_bad_input"); - metastoreTransientError = SearchCounter.export(prefix + "_response_transient_error"); - metastorePermanentError = SearchCounter.export(prefix + "_response_permanent_error"); - metastoreUnknownResponseCode = - SearchCounter.export(prefix + "_response_unknown_response_code"); - // Have a distinguishable prefix for the total requests stat so that we can use it to get - // a viz rate against wild-carded "prefix_response_*" stats. - totalRequests = SearchCounter.export(prefix + "_requests"); - } - - /** - * Tracks metastore get column stats for an individual user's response. - * @param responseCode the response code received from metastore. Expected to be null if no - * response came back at all. 
- */ - private void trackMetastoreResponseCode(@Nullable ResponseCode responseCode) { - totalRequests.increment(); - - if (responseCode == null) { - notReturned.increment(); - } else if (responseCode == ResponseCode.OK) { - metastoreSuccess.increment(); - } else if (responseCode == ResponseCode.NOT_FOUND) { - metastoreNotFound.increment(); - } else if (responseCode == ResponseCode.BAD_INPUT) { - metastoreBadInput.increment(); - } else if (responseCode == ResponseCode.TRANSIENT_ERROR) { - metastoreTransientError.increment(); - } else if (responseCode == ResponseCode.PERMANENT_ERROR) { - metastorePermanentError.increment(); - } else { - metastoreUnknownResponseCode.increment(); - } - } - - @VisibleForTesting long getTotalRequests() { - return totalRequests.get(); - } - - @VisibleForTesting long getNotReturnedCount() { - return notReturned.get(); - } - - @VisibleForTesting long getMetastoreSuccessCount() { - return metastoreSuccess.get(); - } - - @VisibleForTesting long getMetastoreNotFoundCount() { - return metastoreNotFound.get(); - } - - @VisibleForTesting long getMetastoreBadInputCount() { - return metastoreBadInput.get(); - } - - @VisibleForTesting long getMetastoreTransientErrorCount() { - return metastoreTransientError.get(); - } - - @VisibleForTesting long getMetastorePermanentErrorCount() { - return metastorePermanentError.get(); - } - - @VisibleForTesting long getMetastoreUnknownResponseCodeCount() { - return metastoreUnknownResponseCode.get(); - } - } - - /** Class that holds all user properties from Manhattan. 
*/ - @VisibleForTesting - protected static class ManhattanUserProperties { - private double spamScore = 0; - private float tweepcred = RelevanceSignalConstants.UNSET_REPUTATION_SENTINEL; // default - - public ManhattanUserProperties setSpamScore(double newSpamScore) { - this.spamScore = newSpamScore; - return this; - } - - public float getTweepcred() { - return tweepcred; - } - - public ManhattanUserProperties setTweepcred(float newTweepcred) { - this.tweepcred = newTweepcred; - return this; - } - } - - public UserPropertiesManager(MetastoreClient metastoreClient) { - this(metastoreClient, GET_TWEEP_CRED); - } - - @VisibleForTesting - UserPropertiesManager( - MetastoreClient metastoreClient, - MetastoreGetColumnStats tweepCredStats) { - this.metastoreClient = metastoreClient; - this.tweepCredStats = tweepCredStats; - } - - /** - * Gets user properties including TWEEPCRED, SpamScore values/flags from metastore for the - * given userids. - * - * @param userIds the list of users for which to get the properties. - * @return mapping from userId to UserProperties. If a user's twepcred score is not present in the - * metastore, of if there was a problem retrieving it, that user's score will not be set in the - * returned map. 
- */ - @VisibleForTesting - Future> getManhattanUserProperties(final List userIds) { - Preconditions.checkArgument(userIds != null); - if (metastoreClient == null || userIds.isEmpty()) { - return Future.value(Collections.emptyMap()); - } - - final long start = System.currentTimeMillis(); - - return metastoreClient.multiGet(userIds, COLUMNS) - .map(new Function, Map>() { - @Override - public Map apply(Map response) { - long latencyMs = System.currentTimeMillis() - start; - Map resultMap = - Maps.newHashMapWithExpectedSize(userIds.size()); - - for (Long userId : userIds) { - MetastoreRow row = response.get(userId); - processTweepCredColumn(userId, row, resultMap); - } - - MANHATTAN_METASTORE_STATS.requestComplete(latencyMs, resultMap.size(), true); - return resultMap; - } - }) - .handle(new Function>() { - @Override - public Map apply(Throwable t) { - long latencyMs = System.currentTimeMillis() - start; - LOG.error("Exception talking to metastore after " + latencyMs + " ms.", t); - - MANHATTAN_METASTORE_STATS.requestComplete(latencyMs, 0, false); - return Collections.emptyMap(); - } - }); - } - - - /** - * Process the TweepCred column data returned from metastore, takes TweepCred, fills in the - * the resultMap as appropriate. - */ - private void processTweepCredColumn( - Long userId, - MetastoreRow metastoreRow, - Map resultMap) { - MetastoreValue tweepCredValue = - metastoreRow == null ? null : metastoreRow.getValue(MetastoreColumn.TWEEPCRED); - ResponseCode responseCode = tweepCredValue == null ? 
null : tweepCredValue.getResponseCode(); - tweepCredStats.trackMetastoreResponseCode(responseCode); - - if (responseCode == ResponseCode.OK) { - try { - TweepCred tweepCred = tweepCredValue.getValue(); - if (tweepCred != null && tweepCred.isSetScore()) { - ManhattanUserProperties manhattanUserProperties = - getOrCreateManhattanUserProperties(userId, resultMap); - manhattanUserProperties.setTweepcred(tweepCred.getScore()); - } - } catch (MetastoreException e) { - // guaranteed not to be thrown if ResponseCode.OK - LOG.warn("Unexpected MetastoreException parsing userinfo column!", e); - } - } - } - - private static ManhattanUserProperties getOrCreateManhattanUserProperties( - Long userId, Map resultMap) { - - ManhattanUserProperties manhattanUserProperties = resultMap.get(userId); - if (manhattanUserProperties == null) { - manhattanUserProperties = new ManhattanUserProperties(); - resultMap.put(userId, manhattanUserProperties); - } - - return manhattanUserProperties; - } - - /** - * Populates the user properties from the given batch. 
- */ - public Future> populateUserProperties( - Collection batch) { - Set userIds = new HashSet<>(); - for (IngesterTwitterMessage message : batch) { - if ((message.getUserReputation() == IngesterTwitterMessage.DOUBLE_FIELD_NOT_PRESENT) - && !message.isDeleted()) { - Optional userId = message.getFromUserTwitterId(); - if (userId.isPresent()) { - userIds.add(userId.get()); - } else { - LOG.error("No user id present for tweet {}", message.getId()); - } - } - } - List uniqIds = Lists.newArrayList(userIds); - Collections.sort(uniqIds); // for testing predictability - - Future> manhattanUserPropertiesMap = - getManhattanUserProperties(uniqIds); - - return manhattanUserPropertiesMap.map(Function.func(map -> { - for (IngesterTwitterMessage message : batch) { - if (((message.getUserReputation() != IngesterTwitterMessage.DOUBLE_FIELD_NOT_PRESENT) - && RelevanceSignalConstants.isValidUserReputation( - (int) Math.floor(message.getUserReputation()))) - || message.isDeleted()) { - continue; - } - Optional optionalUserId = message.getFromUserTwitterId(); - if (optionalUserId.isPresent()) { - long userId = optionalUserId.get(); - ManhattanUserProperties manhattanUserProperties = map.get(userId); - - final boolean isTestUser = TestUserFilter.isTestUserId(userId); - if (isTestUser) { - MESSAGE_FROM_TEST_USER.increment(); - } - - // legacy setting of tweepcred - setTweepCred(isTestUser, manhattanUserProperties, message); - - // set additional fields - if (setSensitiveBits(manhattanUserProperties, message)) { - SENSITIVE_BITS_COUNTER.increment(); - } - } - } - return batch; - })); - } - - // good old tweepcred - private void setTweepCred( - boolean isTestUser, - ManhattanUserProperties manhattanUserProperties, - TwitterMessage message) { - float score = RelevanceSignalConstants.UNSET_REPUTATION_SENTINEL; - if (manhattanUserProperties == null) { - if (isTestUser) { - SKIPPED_REPUTATION_CHECK_COUNTER.increment(); - } else { - MISSING_REPUTATION_COUNTER.increment(); - 
DEFAULT_REPUTATION_COUNTER.increment(); - } - } else if (!RelevanceSignalConstants.isValidUserReputation( - (int) Math.floor(manhattanUserProperties.tweepcred))) { - if (!isTestUser) { - INVALID_REPUTATION_COUNTER.increment(); - DEFAULT_REPUTATION_COUNTER.increment(); - } - } else { - score = manhattanUserProperties.tweepcred; - ACCEPTED_REPUTATION_COUNTER.increment(); - } - message.setUserReputation(score); - } - - // Sets sensitive content, nsfw, and spam flags in TwitterMessage, further - // sets the following bits in encoded features: - // EarlybirdFeatureConfiguration.IS_SENSITIVE_FLAG - // EarlybirdFeatureConfiguration.IS_USER_NSFW_FLAG - // EarlybirdFeatureConfiguration.IS_USER_SPAM_FLAG - private boolean setSensitiveBits( - ManhattanUserProperties manhattanUserProperties, - TwitterMessage message) { - if (manhattanUserProperties == null) { - return false; - } - - final boolean isUserSpam = manhattanUserProperties.spamScore > SPAM_SCORE_THRESHOLD; - // SEARCH-17413: Compute the field with gizmoduck data. 
- final boolean isUserNSFW = false; - final boolean anySensitiveBitSet = isUserSpam || isUserNSFW; - - if (message.isSensitiveContent()) { - // original json has possibly_sensitive = true, count it - IS_SENSITIVE_FROM_JSON_COUNTER.increment(); - } - - if (isUserNSFW) { - // set EarlybirdFeatureConfiguration.IS_USER_NSFW_FLAG - for (PenguinVersion penguinVersion : message.getSupportedPenguinVersions()) { - message.getTweetUserFeatures(penguinVersion).setNsfw(isUserNSFW); - } - IS_USER_NSFW_COUNTER.increment(); - } - if (isUserSpam) { - // set EarlybirdFeatureConfiguration.IS_USER_SPAM_FLAG - for (PenguinVersion penguinVersion : message.getSupportedPenguinVersions()) { - message.getTweetUserFeatures(penguinVersion).setSpam(isUserSpam); - } - IS_USER_SPAM_COUNTER.increment(); - } - - // if any of the sensitive bits are set, we return true - return anySensitiveBitSet; - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/wire/BUILD b/src/java/com/twitter/search/ingester/pipeline/wire/BUILD deleted file mode 100644 index 042d214f2..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/wire/BUILD +++ /dev/null @@ -1,52 +0,0 @@ -java_library( - sources = ["*.java"], - compiler_option_sets = ["fatal_warnings"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/twitter/distributedlog:distributedlog-core", - "3rdparty/jvm/commons-logging", - "3rdparty/jvm/org/apache/hadoop:hadoop-client-default", - "3rdparty/jvm/org/apache/kafka:kafka-clients", - "3rdparty/jvm/org/apache/thrift:libthrift", - "cuad/projects/ner/client/src/main/scala/com/twitter/cuad/ner/client", - "decider/src/main/scala", - "eventbus/client/src/main/scala/com/twitter/eventbus/client", - "finagle-internal/mtls/src/main/scala/com/twitter/finagle/mtls/authentication", - "finagle-internal/mtls/src/main/scala/com/twitter/finagle/mtls/authorization", - "finagle-internal/mtls/src/main/scala/com/twitter/finagle/mtls/client", - 
"finagle/finagle-core/src/main", - "finagle/finagle-thrift/src/main/java", - "finagle/finagle-thrift/src/main/scala", - "finagle/finagle-thriftmux/src/main/scala", - "kafka/finagle-kafka/finatra-kafka/src/main/scala", - "servo/util/src/main/scala", - "src/java/com/twitter/common/quantity", - "src/java/com/twitter/common/util:system-mocks", - "src/java/com/twitter/common_internal/manhattan", - "src/java/com/twitter/common_internal/text/version", - "src/java/com/twitter/common_internal/zookeeper", - "src/java/com/twitter/metastore/client_v2", - "src/java/com/twitter/search/common/decider", - "src/java/com/twitter/search/common/partitioning:timeslice-manager", - "src/java/com/twitter/search/common/partitioning/base", - "src/java/com/twitter/search/common/relevance:classifiers", - "src/java/com/twitter/search/common/schema/earlybird", - "src/java/com/twitter/search/common/util/io:dl-reader-writer", - "src/java/com/twitter/search/common/util/io:record-reader-api", - "src/java/com/twitter/search/common/util/io/kafka", - "src/java/com/twitter/search/common/util/thrift:text-protocol", - "src/java/com/twitter/search/ingester/pipeline/strato_fetchers", - "src/java/com/twitter/search/ingester/pipeline/util", - "src/thrift/com/twitter/gizmoduck:thrift-java", - "src/thrift/com/twitter/manhattan:internal-scala", - "src/thrift/com/twitter/manhattan:v1-java", - "src/thrift/com/twitter/pink-floyd/thrift:thrift-java", - "src/thrift/com/twitter/tweetypie:service-java", - "storage/clients/manhattan/client/src/main/scala", - "strato/src/main/scala/com/twitter/strato/client", - "util/util-core:scala", - "util/util-function/src/main/java", - "util/util-stats/src/main/scala", - ], -) diff --git a/src/java/com/twitter/search/ingester/pipeline/wire/BUILD.docx b/src/java/com/twitter/search/ingester/pipeline/wire/BUILD.docx new file mode 100644 index 000000000..d2baf7d75 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/wire/BUILD.docx differ diff --git 
a/src/java/com/twitter/search/ingester/pipeline/wire/IngesterPartitioner.docx b/src/java/com/twitter/search/ingester/pipeline/wire/IngesterPartitioner.docx new file mode 100644 index 000000000..2db41b7a9 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/wire/IngesterPartitioner.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/wire/IngesterPartitioner.java b/src/java/com/twitter/search/ingester/pipeline/wire/IngesterPartitioner.java deleted file mode 100644 index f126a7370..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/wire/IngesterPartitioner.java +++ /dev/null @@ -1,27 +0,0 @@ -package com.twitter.search.ingester.pipeline.wire; - -import javax.naming.NamingException; - -import com.twitter.search.common.partitioning.base.PartitionMappingManager; -import com.twitter.search.common.util.io.kafka.SearchPartitioner; - -/** - * A variant of {@code SearchPartitioner} which retrieves {@code PartitionMappingManager} from - * {@code WireModule}. - * - * Note that the value object has to implement {@code Partitionable}. 
- */ -public class IngesterPartitioner extends SearchPartitioner { - - public IngesterPartitioner() { - super(getPartitionMappingManager()); - } - - private static PartitionMappingManager getPartitionMappingManager() { - try { - return WireModule.getWireModule().getPartitionMappingManager(); - } catch (NamingException e) { - throw new RuntimeException(e); - } - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/wire/ProductionWireModule.docx b/src/java/com/twitter/search/ingester/pipeline/wire/ProductionWireModule.docx new file mode 100644 index 000000000..14d1f8b2f Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/wire/ProductionWireModule.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/wire/ProductionWireModule.java b/src/java/com/twitter/search/ingester/pipeline/wire/ProductionWireModule.java deleted file mode 100644 index b50962297..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/wire/ProductionWireModule.java +++ /dev/null @@ -1,363 +0,0 @@ -package com.twitter.search.ingester.pipeline.wire; - -import java.util.ArrayList; -import java.util.List; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import javax.annotation.Nullable; -import javax.naming.Context; -import javax.naming.InitialContext; -import javax.naming.NamingException; - -import scala.Option; -import scala.collection.JavaConversions$; - -import com.google.common.base.Preconditions; -import com.google.common.collect.ImmutableList; - -import org.apache.kafka.clients.consumer.KafkaConsumer; -import org.apache.kafka.clients.producer.Partitioner; -import org.apache.kafka.common.serialization.Deserializer; -import org.apache.kafka.common.serialization.Serializer; -import org.apache.thrift.TBase; -import org.apache.thrift.protocol.TBinaryProtocol; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.util.Clock; -import 
com.twitter.common_internal.text.version.PenguinVersion; -import com.twitter.decider.Decider; -import com.twitter.decider.DeciderFactory; -import com.twitter.decider.DeciderFactory$; -import com.twitter.decider.decisionmaker.DecisionMaker; -import com.twitter.decider.decisionmaker.MutableDecisionMaker; -import com.twitter.eventbus.client.EventBusSubscriber; -import com.twitter.eventbus.client.EventBusSubscriberBuilder; -import com.twitter.finagle.Service; -import com.twitter.finagle.ThriftMux; -import com.twitter.finagle.builder.ClientBuilder; -import com.twitter.finagle.builder.ClientConfig; -import com.twitter.finagle.mtls.authentication.ServiceIdentifier; -import com.twitter.finagle.mtls.client.MtlsThriftMuxClient; -import com.twitter.finagle.mux.transport.OpportunisticTls; -import com.twitter.finagle.service.RetryPolicy; -import com.twitter.finagle.stats.DefaultStatsReceiver; -import com.twitter.finagle.thrift.ClientId; -import com.twitter.finagle.thrift.ThriftClientRequest; -import com.twitter.finatra.kafka.producers.BlockingFinagleKafkaProducer; -import com.twitter.gizmoduck.thriftjava.UserService; -import com.twitter.metastore.client_v2.MetastoreClient; -import com.twitter.pink_floyd.thrift.Storer; -import com.twitter.search.common.partitioning.base.PartitionMappingManager; -import com.twitter.search.common.relevance.classifiers.TweetOffensiveEvaluator; -import com.twitter.search.common.schema.earlybird.EarlybirdCluster; -import com.twitter.search.common.util.io.kafka.FinagleKafkaClientUtils; -import com.twitter.search.ingester.pipeline.strato_fetchers.AudioSpaceCoreFetcher; -import com.twitter.search.ingester.pipeline.strato_fetchers.AudioSpaceParticipantsFetcher; -import com.twitter.search.ingester.pipeline.strato_fetchers.NamedEntityFetcher; -import com.twitter.search.ingester.pipeline.util.PenguinVersionsUtil; -import com.twitter.search.ingester.pipeline.util.PipelineExceptionHandler; -import 
com.twitter.storage.client.manhattan.kv.JavaManhattanKVEndpoint; -import com.twitter.storage.client.manhattan.kv.ManhattanKVClient; -import com.twitter.storage.client.manhattan.kv.ManhattanKVClientMtlsParams; -import com.twitter.storage.client.manhattan.kv.ManhattanKVEndpointBuilder; -import com.twitter.strato.client.Client; -import com.twitter.strato.client.Strato; -import com.twitter.tweetypie.thriftjava.TweetService; -import com.twitter.util.Duration; -import com.twitter.util.Function; -import com.twitter.util.Future; - -/** - * The injection module that provides all production bindings. - */ -public class ProductionWireModule extends WireModule { - private static final Logger LOG = LoggerFactory.getLogger(ProductionWireModule.class); - - private static final String DECIDER_BASE = "config/ingester-indexer-decider.yml"; - private static final String GEOCODE_APP_ID = "search_ingester_readonly"; - private static final String CLUSTER_DEST_NAME = ""; - - private static final String JNDI_GIZMODUCK_DEST = JNDI_PIPELINE_ROOT + "gizmoduckDest"; - - private static final String PENGUIN_VERSIONS_JNDI_NAME = JNDI_PIPELINE_ROOT + "penguinVersions"; - private static final String SEGMENT_BUFFER_SIZE_JNDI_NAME = - JNDI_PIPELINE_ROOT + "segmentBufferSize"; - private static final String SEGMENT_SEAL_DELAY_TIME_MS_JNDI_NAME = - JNDI_PIPELINE_ROOT + "segmentSealDelayTimeMs"; - private static final String JNDI_DL_URI = JNDI_PIPELINE_ROOT + "distributedlog/dlUri"; - private static final String JNDI_DL_CONFIG_FILE = - JNDI_PIPELINE_ROOT + "distributedlog/configFile"; - private static final String CLUSTER_JNDI_NAME = JNDI_PIPELINE_ROOT + "cluster"; - - private static final String TIME_SLICE_MANAGER_ROOT_PATH = ""; - private static final String MAX_TIMESLICES_JNDI_NAME = - TIME_SLICE_MANAGER_ROOT_PATH + "hashPartition/maxTimeSlices"; - private static final String MAX_SEGMENT_SIZE_JNDI_NAME = - TIME_SLICE_MANAGER_ROOT_PATH + "hashPartition/maxSegmentSize"; - private static final String 
NUM_PARTITIONS_JNDI_NAME = - TIME_SLICE_MANAGER_ROOT_PATH + "hashPartition/numPartitions"; - - private static final String PINK_CLIENT_ID = "search_ingester"; - - private final Decider decider; - private final MutableDecisionMaker mutableDecisionMaker; - private final int partition; - private PipelineExceptionHandler pipelineExceptionHandler; - private final StratoMetaStoreWireModule stratoMetaStoreWireModule; - - private final Client stratoClient; - - private ServiceIdentifier serviceIdentifier = ServiceIdentifier.empty(); - - private List penguinVersions; - - public ProductionWireModule(String deciderOverlay, int partition, Option - serviceIdentifierFlag) { - mutableDecisionMaker = new MutableDecisionMaker(); - decider = DeciderFactory.get() - .withBaseConfig(DECIDER_BASE) - .withOverlayConfig(deciderOverlay) - .withRefreshBase(false) - .withDecisionMakers( - ImmutableList.builder() - .add(mutableDecisionMaker) - .addAll(JavaConversions$.MODULE$.asJavaCollection( - DeciderFactory$.MODULE$.DefaultDecisionMakers())) - .build()) - .apply(); - this.partition = partition; - this.stratoMetaStoreWireModule = new StratoMetaStoreWireModule(this); - if (serviceIdentifierFlag.isDefined()) { - this.serviceIdentifier = - ServiceIdentifier.flagOfServiceIdentifier().parse(serviceIdentifierFlag.get()); - } - - this.stratoClient = Strato.client() - .withMutualTls(serviceIdentifier) - .withRequestTimeout(Duration.fromMilliseconds(500)) - .build(); - } - - public ProductionWireModule(String deciderOverlay, - int partition, - PipelineExceptionHandler pipelineExceptionHandler, - Option serviceIdentifierFlag) { - this(deciderOverlay, partition, serviceIdentifierFlag); - this.pipelineExceptionHandler = pipelineExceptionHandler; - } - - public void setPipelineExceptionHandler(PipelineExceptionHandler pipelineExceptionHandler) { - this.pipelineExceptionHandler = pipelineExceptionHandler; - } - - @Override - public ServiceIdentifier getServiceIdentifier() { - return serviceIdentifier; - } 
- - @Override - public PartitionMappingManager getPartitionMappingManager() { - return PartitionMappingManager.getInstance(); - } - - @Override - public JavaManhattanKVEndpoint getJavaManhattanKVEndpoint() { - Preconditions.checkNotNull(serviceIdentifier, - "Can't create Manhattan client with S2S authentication because Service Identifier is null"); - LOG.info(String.format("Service identifier for Manhattan client: %s", - ServiceIdentifier.asString(serviceIdentifier))); - ManhattanKVClientMtlsParams mtlsParams = ManhattanKVClientMtlsParams.apply(serviceIdentifier, - ManhattanKVClientMtlsParams.apply$default$2(), - OpportunisticTls.Required() - ); - return ManhattanKVEndpointBuilder - .apply(ManhattanKVClient.apply(GEOCODE_APP_ID, CLUSTER_DEST_NAME, mtlsParams)) - .buildJava(); - } - - @Override - public Decider getDecider() { - return decider; - } - - // Since MutableDecisionMaker is needed only for production TwitterServer, this method is defined - // only in ProductionWireModule. - public MutableDecisionMaker getMutableDecisionMaker() { - return mutableDecisionMaker; - } - - @Override - public int getPartition() { - return partition; - } - - @Override - public PipelineExceptionHandler getPipelineExceptionHandler() { - return pipelineExceptionHandler; - } - - @Override - public Storer.ServiceIface getStorer(Duration requestTimeout, int retries) { - TBinaryProtocol.Factory factory = new TBinaryProtocol.Factory(); - - MtlsThriftMuxClient mtlsThriftMuxClient = new MtlsThriftMuxClient( - ThriftMux.client().withClientId(new ClientId(PINK_CLIENT_ID))); - ThriftMux.Client tmuxClient = mtlsThriftMuxClient - .withMutualTls(serviceIdentifier) - .withOpportunisticTls(OpportunisticTls.Required()); - - ClientBuilder< - ThriftClientRequest, - byte[], - ClientConfig.Yes, - ClientConfig.Yes, - ClientConfig.Yes> builder = ClientBuilder.get() - .dest("") - .requestTimeout(requestTimeout) - .retries(retries) - .timeout(requestTimeout.mul(retries)) - .stack(tmuxClient) - 
.name("pinkclient") - .reportTo(DefaultStatsReceiver.get()); - return new Storer.ServiceToClient(ClientBuilder.safeBuild(builder), factory); - } - - @Override - public MetastoreClient getMetastoreClient() throws NamingException { - return stratoMetaStoreWireModule.getMetastoreClient(this.serviceIdentifier); - } - - @Override - public ExecutorService getThreadPool(int numThreads) { - return Executors.newFixedThreadPool(numThreads); - } - - @Override - public TweetService.ServiceToClient getTweetyPieClient(String tweetypieClientId) - throws NamingException { - return TweetyPieWireModule.getTweetyPieClient(tweetypieClientId, serviceIdentifier); - } - - @Override - public UserService.ServiceToClient getGizmoduckClient(String clientId) - throws NamingException { - Context context = new InitialContext(); - String dest = (String) context.lookup(JNDI_GIZMODUCK_DEST); - - MtlsThriftMuxClient mtlsThriftMuxClient = new MtlsThriftMuxClient( - ThriftMux.client().withClientId(new ClientId(clientId))); - - Service clientBuilder = - ClientBuilder.safeBuild( - ClientBuilder - .get() - .requestTimeout(Duration.fromMilliseconds(800)) - .retryPolicy(RetryPolicy.tries(3)) - .name("search_ingester_gizmoduck_client") - .reportTo(DefaultStatsReceiver.get()) - .daemon(true) - .dest(dest) - .stack(mtlsThriftMuxClient.withMutualTls(serviceIdentifier) - .withOpportunisticTls(OpportunisticTls.Required()))); - return new UserService.ServiceToClient(clientBuilder, new TBinaryProtocol.Factory()); - } - - @Override - public > EventBusSubscriber createEventBusSubscriber( - Function> process, - Class thriftStructClass, - String eventBusSubscriberId, - int maxConcurrentEvents) { - Preconditions.checkNotNull(serviceIdentifier, - "Can't create EventBusSubscriber with S2S auth because Service Identifier is null"); - LOG.info(String.format("Service identifier for EventBusSubscriber Manhattan client: %s", - ServiceIdentifier.asString(serviceIdentifier))); - // We set the processTimeoutMs parameter here to 
be Duration.Top because we do not want to read - // more events from EventBus if we are experiencing back pressure and cannot write them to the - // downstream queue. - return EventBusSubscriberBuilder.apply() - .subscriberId(eventBusSubscriberId) - .skipToLatest(false) - .fromAllZones(true) - .statsReceiver(DefaultStatsReceiver.get().scope("eventbus")) - .thriftStruct(thriftStructClass) - .serviceIdentifier(serviceIdentifier) - .maxConcurrentEvents(maxConcurrentEvents) - .processTimeout(Duration.Top()) - .build(process); - } - - @Override - public Clock getClock() { - return Clock.SYSTEM_CLOCK; - } - - @Override - public TweetOffensiveEvaluator getTweetOffensiveEvaluator() { - return new TweetOffensiveEvaluator(); - } - - @Override - public EarlybirdCluster getEarlybirdCluster() throws NamingException { - Context jndiContext = new InitialContext(); - String clusterName = (String) jndiContext.lookup(CLUSTER_JNDI_NAME); - return EarlybirdCluster.valueOf(clusterName.toUpperCase()); - } - - @Override - public List getPenguinVersions() throws NamingException { - Context context = new InitialContext(); - String penguinVersionsStr = (String) context.lookup(PENGUIN_VERSIONS_JNDI_NAME); - penguinVersions = new ArrayList<>(); - - for (String penguinVersion : penguinVersionsStr.split(",")) { - PenguinVersion pv = PenguinVersion.versionFromByteValue(Byte.parseByte(penguinVersion)); - if (PenguinVersionsUtil.isPenguinVersionAvailable(pv, decider)) { - penguinVersions.add(pv); - } - } - - Preconditions.checkArgument(penguinVersions.size() > 0, - "At least one penguin version must be specified."); - - return penguinVersions; - } - - // We update penguin versions via deciders in order to disable one in case of an emergency. 
- @Override - public List getCurrentlyEnabledPenguinVersions() { - return PenguinVersionsUtil.filterPenguinVersionsWithDeciders(penguinVersions, decider); - } - - @Override - public NamedEntityFetcher getNamedEntityFetcher() { - return new NamedEntityFetcher(stratoClient); - } - - @Override - public AudioSpaceParticipantsFetcher getAudioSpaceParticipantsFetcher() { - return new AudioSpaceParticipantsFetcher(stratoClient); - } - - @Override - public AudioSpaceCoreFetcher getAudioSpaceCoreFetcher() { - return new AudioSpaceCoreFetcher(stratoClient); - } - - @Override - public KafkaConsumer newKafkaConsumer( - String kafkaClusterPath, Deserializer deserializer, String clientId, String groupId, - int maxPollRecords) { - return FinagleKafkaClientUtils.newKafkaConsumer( - kafkaClusterPath, deserializer, clientId, groupId, maxPollRecords); - } - - @Override - public BlockingFinagleKafkaProducer newFinagleKafkaProducer( - String kafkaClusterPath, Serializer serializer, String clientId, - @Nullable Class partitionerClass) { - return FinagleKafkaClientUtils.newFinagleKafkaProducer( - kafkaClusterPath, true, serializer, clientId, partitionerClass); - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/wire/StratoMetaStoreWireModule.docx b/src/java/com/twitter/search/ingester/pipeline/wire/StratoMetaStoreWireModule.docx new file mode 100644 index 000000000..eb1dbc2d1 Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/wire/StratoMetaStoreWireModule.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/wire/StratoMetaStoreWireModule.java b/src/java/com/twitter/search/ingester/pipeline/wire/StratoMetaStoreWireModule.java deleted file mode 100644 index 0f3f5833b..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/wire/StratoMetaStoreWireModule.java +++ /dev/null @@ -1,119 +0,0 @@ -package com.twitter.search.ingester.pipeline.wire; - -import java.util.concurrent.TimeUnit; -import javax.naming.Context; -import 
javax.naming.InitialContext; -import javax.naming.NamingException; - -import com.google.common.base.Preconditions; - -import org.apache.thrift.protocol.TBinaryProtocol; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.quantity.Amount; -import com.twitter.common.quantity.Time; -import com.twitter.common_internal.manhattan.ManhattanClient; -import com.twitter.common_internal.manhattan.ManhattanClientImpl; -import com.twitter.finagle.Service; -import com.twitter.finagle.ThriftMux; -import com.twitter.finagle.builder.ClientBuilder; -import com.twitter.finagle.builder.ClientConfig.Yes; -import com.twitter.finagle.mtls.authentication.ServiceIdentifier; -import com.twitter.finagle.mtls.client.MtlsThriftMuxClient; -import com.twitter.finagle.mux.transport.OpportunisticTls; -import com.twitter.finagle.stats.DefaultStatsReceiver; -import com.twitter.finagle.thrift.ClientId; -import com.twitter.finagle.thrift.ThriftClientRequest; -import com.twitter.manhattan.thriftv1.ConsistencyLevel; -import com.twitter.manhattan.thriftv1.ManhattanCoordinator; -import com.twitter.metastore.client_v2.MetastoreClient; -import com.twitter.metastore.client_v2.MetastoreClientImpl; -import com.twitter.util.Duration; - -public class StratoMetaStoreWireModule { - private WireModule wireModule; - private static final Logger LOG = LoggerFactory.getLogger(StratoMetaStoreWireModule.class); - - public StratoMetaStoreWireModule(WireModule wireModule) { - this.wireModule = wireModule; - } - - private static final String MANHATTAN_SD_ZK_ROLE = - WireModule.JNDI_PIPELINE_ROOT + "manhattanSDZKRole"; - private static final String MANHATTAN_SD_ZK_ENV = - WireModule.JNDI_PIPELINE_ROOT + "manhattanSDZKEnv"; - private static final String MANHATTAN_SD_ZK_NAME = - WireModule.JNDI_PIPELINE_ROOT + "manhattanSDZKName"; - private static final String MANHATTAN_APPLICATION_ID = "ingester_starbuck"; - - private static class Options { - // The client id as a string - private final 
String clientId = "ingester"; - - // The connection timeout in millis - private final long connectTimeout = 50; - - // The request timeout im millis - private final long requestTimeout = 300; - - // Total timeout per call (including retries) - private final long totalTimeout = 500; - - // The maximum number of retries per call - private final int retries = 2; - } - - private final Options options = new Options(); - - private ClientBuilder getClientBuilder( - String name, - ServiceIdentifier serviceIdentifier) { - return getClientBuilder(name, new ClientId(options.clientId), serviceIdentifier); - } - - private ClientBuilder getClientBuilder( - String name, - ClientId clientId, - ServiceIdentifier serviceIdentifier) { - Preconditions.checkNotNull(serviceIdentifier, - "Can't create Metastore Manhattan client with S2S auth because Service Identifier is null"); - LOG.info(String.format("Service identifier for Metastore Manhattan client: %s", - ServiceIdentifier.asString(serviceIdentifier))); - return ClientBuilder.get() - .name(name) - .tcpConnectTimeout(new Duration(TimeUnit.MILLISECONDS.toNanos(options.connectTimeout))) - .requestTimeout(new Duration(TimeUnit.MILLISECONDS.toNanos(options.requestTimeout))) - .timeout(new Duration(TimeUnit.MILLISECONDS.toNanos(options.totalTimeout))) - .retries(options.retries) - .reportTo(DefaultStatsReceiver.get()) - .stack(new MtlsThriftMuxClient(ThriftMux.client()) - .withMutualTls(serviceIdentifier) - .withClientId(clientId) - .withOpportunisticTls(OpportunisticTls.Required())); - } - - /** - * Returns the Metastore client. 
- */ - public MetastoreClient getMetastoreClient(ServiceIdentifier serviceIdentifier) - throws NamingException { - Context jndiContext = new InitialContext(); - String destString = String.format("/cluster/local/%s/%s/%s", - jndiContext.lookup(MANHATTAN_SD_ZK_ROLE), - jndiContext.lookup(MANHATTAN_SD_ZK_ENV), - jndiContext.lookup(MANHATTAN_SD_ZK_NAME)); - LOG.info("Manhattan serverset Name: {}", destString); - - Service service = - ClientBuilder.safeBuild(getClientBuilder("metastore", serviceIdentifier).dest(destString)); - - ManhattanClient manhattanClient = new ManhattanClientImpl( - new ManhattanCoordinator.ServiceToClient(service, new TBinaryProtocol.Factory()), - MANHATTAN_APPLICATION_ID, - Amount.of((int) options.requestTimeout, Time.MILLISECONDS), - ConsistencyLevel.ONE); - - return new MetastoreClientImpl(manhattanClient); - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/wire/TweetyPieWireModule.docx b/src/java/com/twitter/search/ingester/pipeline/wire/TweetyPieWireModule.docx new file mode 100644 index 000000000..e9de0199b Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/wire/TweetyPieWireModule.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/wire/TweetyPieWireModule.java b/src/java/com/twitter/search/ingester/pipeline/wire/TweetyPieWireModule.java deleted file mode 100644 index 3f3d67158..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/wire/TweetyPieWireModule.java +++ /dev/null @@ -1,110 +0,0 @@ -package com.twitter.search.ingester.pipeline.wire; - -import java.util.concurrent.TimeoutException; -import javax.naming.Context; -import javax.naming.InitialContext; -import javax.naming.NamingException; - -import org.apache.thrift.protocol.TBinaryProtocol; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common_internal.zookeeper.TwitterServerSet; -import com.twitter.finagle.Name; -import com.twitter.finagle.Resolvers; -import 
com.twitter.finagle.Service; -import com.twitter.finagle.ThriftMux; -import com.twitter.finagle.builder.ClientBuilder; -import com.twitter.finagle.builder.ClientConfig; -import com.twitter.finagle.mtls.authentication.ServiceIdentifier; -import com.twitter.finagle.mtls.client.MtlsThriftMuxClient; -import com.twitter.finagle.mux.transport.OpportunisticTls; -import com.twitter.finagle.service.RetryPolicy; -import com.twitter.finagle.stats.DefaultStatsReceiver; -import com.twitter.finagle.thrift.ClientId; -import com.twitter.finagle.thrift.ThriftClientRequest; -import com.twitter.servo.util.WaitForServerSets; -import com.twitter.tweetypie.thriftjava.TweetService; -import com.twitter.util.Await; -import com.twitter.util.Duration; - -final class TweetyPieWireModule { - private static final Logger LOG = LoggerFactory.getLogger(ProductionWireModule.class); - - private static final int TWEETYPIE_CONNECT_TIMEOUT_MS = 100; - private static final int TWEETYPIE_REQUEST_TIMEOUT_MS = 500; - - // This is actually the total tries count, so one initial try, and one more retry (if needed). 
- private static final int TWEETYPIE_REQUEST_NUM_TRIES = 3; - private static final int TWEETYPIE_TOTAL_TIMEOUT_MS = - TWEETYPIE_REQUEST_TIMEOUT_MS * TWEETYPIE_REQUEST_NUM_TRIES; - - private static final String TWEETYPIE_SD_ZK_ROLE = - WireModule.JNDI_PIPELINE_ROOT + "tweetypieSDZKRole"; - private static final String TWEETYPIE_SD_ZK_ENV = - WireModule.JNDI_PIPELINE_ROOT + "tweetypieSDZKEnv"; - private static final String TWEETYPIE_SD_ZK_NAME = - WireModule.JNDI_PIPELINE_ROOT + "tweetypieSDZKName"; - - private TweetyPieWireModule() { - } - - private static TwitterServerSet.Service getTweetyPieZkServerSetService() - throws NamingException { - Context jndiContext = new InitialContext(); - TwitterServerSet.Service service = new TwitterServerSet.Service( - (String) jndiContext.lookup(TWEETYPIE_SD_ZK_ROLE), - (String) jndiContext.lookup(TWEETYPIE_SD_ZK_ENV), - (String) jndiContext.lookup(TWEETYPIE_SD_ZK_NAME)); - LOG.info("TweetyPie ZK path: {}", TwitterServerSet.getPath(service)); - return service; - } - - static TweetService.ServiceToClient getTweetyPieClient( - String clientIdString, ServiceIdentifier serviceIdentifier) throws NamingException { - TwitterServerSet.Service service = getTweetyPieZkServerSetService(); - - // Use explicit Name types so we can force a wait on resolution (COORD-479) - String destString = String.format("/cluster/local/%s/%s/%s", - service.getRole(), service.getEnv(), service.getName()); - Name destination = Resolvers.eval(destString); - try { - Await.ready(WaitForServerSets.ready(destination, Duration.fromMilliseconds(10000))); - } catch (TimeoutException e) { - LOG.warn("Timed out while resolving Zookeeper ServerSet", e); - } catch (InterruptedException e) { - LOG.warn("Interrupted while resolving Zookeeper ServerSet", e); - Thread.currentThread().interrupt(); - } - - LOG.info("Creating Tweetypie client with ID: {}", clientIdString); - ClientId clientId = new ClientId(clientIdString); - - MtlsThriftMuxClient mtlsThriftMuxClient = new 
MtlsThriftMuxClient( - ThriftMux.client().withClientId(clientId)); - ThriftMux.Client tmuxClient = mtlsThriftMuxClient - .withMutualTls(serviceIdentifier) - .withOpportunisticTls(OpportunisticTls.Required()); - - ClientBuilder< - ThriftClientRequest, - byte[], - ClientConfig.Yes, - ClientConfig.Yes, - ClientConfig.Yes> builder = ClientBuilder.get() - .stack(tmuxClient) - .name("retrieve_cards_tweetypie_client") - .dest(destination) - .reportTo(DefaultStatsReceiver.get()) - .connectTimeout(Duration.fromMilliseconds(TWEETYPIE_CONNECT_TIMEOUT_MS)) - .requestTimeout(Duration.fromMilliseconds(TWEETYPIE_REQUEST_TIMEOUT_MS)) - .timeout(Duration.fromMilliseconds(TWEETYPIE_TOTAL_TIMEOUT_MS)) - .retryPolicy(RetryPolicy.tries( - TWEETYPIE_REQUEST_NUM_TRIES, - RetryPolicy.TimeoutAndWriteExceptionsOnly())); - - Service clientBuilder = ClientBuilder.safeBuild(builder); - - return new TweetService.ServiceToClient(clientBuilder, new TBinaryProtocol.Factory()); - } -} diff --git a/src/java/com/twitter/search/ingester/pipeline/wire/WireModule.docx b/src/java/com/twitter/search/ingester/pipeline/wire/WireModule.docx new file mode 100644 index 000000000..e822c43bc Binary files /dev/null and b/src/java/com/twitter/search/ingester/pipeline/wire/WireModule.docx differ diff --git a/src/java/com/twitter/search/ingester/pipeline/wire/WireModule.java b/src/java/com/twitter/search/ingester/pipeline/wire/WireModule.java deleted file mode 100644 index c6c5f198f..000000000 --- a/src/java/com/twitter/search/ingester/pipeline/wire/WireModule.java +++ /dev/null @@ -1,226 +0,0 @@ -package com.twitter.search.ingester.pipeline.wire; - -import java.util.List; -import java.util.concurrent.ExecutorService; -import javax.annotation.Nullable; -import javax.naming.Context; -import javax.naming.InitialContext; -import javax.naming.NamingException; - -import org.apache.kafka.clients.consumer.KafkaConsumer; -import org.apache.kafka.clients.producer.Partitioner; -import 
org.apache.kafka.common.serialization.Deserializer; -import org.apache.kafka.common.serialization.Serializer; -import org.apache.thrift.TBase; - -import com.twitter.common.util.Clock; -import com.twitter.common_internal.text.version.PenguinVersion; -import com.twitter.decider.Decider; -import com.twitter.eventbus.client.EventBusSubscriber; -import com.twitter.finagle.mtls.authentication.ServiceIdentifier; -import com.twitter.finatra.kafka.producers.BlockingFinagleKafkaProducer; -import com.twitter.gizmoduck.thriftjava.UserService; -import com.twitter.metastore.client_v2.MetastoreClient; -import com.twitter.pink_floyd.thrift.Storer; -import com.twitter.search.common.partitioning.base.PartitionMappingManager; -import com.twitter.search.common.relevance.classifiers.TweetOffensiveEvaluator; -import com.twitter.search.common.schema.earlybird.EarlybirdCluster; -import com.twitter.search.ingester.pipeline.strato_fetchers.AudioSpaceCoreFetcher; -import com.twitter.search.ingester.pipeline.strato_fetchers.AudioSpaceParticipantsFetcher; -import com.twitter.search.ingester.pipeline.strato_fetchers.NamedEntityFetcher; -import com.twitter.search.ingester.pipeline.util.PipelineExceptionHandler; -import com.twitter.storage.client.manhattan.kv.JavaManhattanKVEndpoint; -import com.twitter.tweetypie.thriftjava.TweetService; -import com.twitter.util.Duration; -import com.twitter.util.Function; -import com.twitter.util.Future; - -/** - * An "injection module" that provides bindings for all ingester endpoints that we want to mock out - * in tests. - */ -public abstract class WireModule { - /** The JNDI property to which this module will be bound. */ - private static final String WIRE_MODULE_NAME = ""; - - /** The root name of all properties specified in the twitter-naming-production.*.xml files. */ - public static final String JNDI_PIPELINE_ROOT = ""; - - /** - * (Re)binds the given wire module in JNDI. - * - * @param wireModule The wire module to bind in JNDI. 
- * @throws NamingException If the wire module cannot be bound in JNDI for some reason. - */ - public static void bindWireModule(WireModule wireModule) throws NamingException { - Context jndiContext = new InitialContext(); - jndiContext.rebind(WIRE_MODULE_NAME, wireModule); - } - - /** - * Returns the wire module bound in JNDI. - * - * @return The wire module bound in JNDI. - * @throws NamingException If there's no wire module bound in JNDI. - */ - public static WireModule getWireModule() throws NamingException { - Context jndiContext = new InitialContext(); - return (WireModule) jndiContext.lookup(WIRE_MODULE_NAME); - } - - /** - * Retrieves the service identifier needed for making mtls requests. - * @return The service identifier for the current running service. - */ - public abstract ServiceIdentifier getServiceIdentifier(); - - /** - * Creates a new {@code FinagleKafkaConsumer} with a specified consumer group ID. - */ - public abstract KafkaConsumer newKafkaConsumer( - String kafkaClusterPath, Deserializer deserializer, String clientId, String groupId, - int maxPollRecords); - - /** - * Creates a new {@code FinagleKafkaConsumer} with a specified consumer group ID. - */ - public abstract BlockingFinagleKafkaProducer newFinagleKafkaProducer( - String kafkaClusterPath, Serializer serializer, String clientId, - @Nullable Class partitionerClass); - - /** - * Gets a TweetyPie client. - * - * @param tweetypieClientId Use this string as the client id. - * @return A TweetyPie client - * @throws NamingException - */ - public abstract TweetService.ServiceToClient getTweetyPieClient(String tweetypieClientId) - throws NamingException; - - /** - * Gets a Gizmoduck client. 
- * - * @param clientId - * @throws NamingException - */ - public abstract UserService.ServiceToClient getGizmoduckClient(String clientId) - throws NamingException; - - /** - * Gets the ManhattanKVEndpoint that should be used for the ManhattanCodedLocationProvider - * - * @return the JavaManhattanKVEndpoint that we need for the ManhattanCodedLocationProvider - * @throws NamingException - */ - public abstract JavaManhattanKVEndpoint getJavaManhattanKVEndpoint() - throws NamingException; - - /** - * Returns the decider to be used by all stages. - * - * @return The decider to be used by all stages. - */ - public abstract Decider getDecider(); - - /** - * Returns the partition ID to be used by all stages. - * - * @return The partition ID to be used by all stages. - */ - public abstract int getPartition(); - - - /** - * Returns the PipelineExceptionHandler instance to be used by all stages. - * - * @return The PipelineExceptionHandler instance to be used by all stages. - * @throws NamingException If building the PipelineExceptionHandler instance requires some - * parameters, and those parameters were not bound in JNDI. - */ - public abstract PipelineExceptionHandler getPipelineExceptionHandler(); - - /** - * Gets the PartitionMappingManager for the Kafka writer. - * - * @return a PartitionMappingManager - */ - public abstract PartitionMappingManager getPartitionMappingManager(); - - /** - * Returns the Metastore client used by the UserPropertiesManager. - * - * @return A Metastore client. - * @throws NamingException - */ - public abstract MetastoreClient getMetastoreClient() throws NamingException; - - /** - * Returns an ExecutorService potentially backed by the specified number of threads. - * - * @param numThreads An advisory value with a suggestion for how large the threadpool should be. - * @return an ExecutorService that might be backed by some threads. 
- * @throws NamingException - */ - public abstract ExecutorService getThreadPool(int numThreads) throws NamingException; - - /** - * Returns the Storer interface to connect to Pink. - * - * @param requestTimeout The request timeout for the Pink client. - * @param retries The number of Finagle retries. - * @return a Storer.ServiceIface to connect to pink. - * - */ - public abstract Storer.ServiceIface getStorer(Duration requestTimeout, int retries) - throws NamingException; - - /** - * Returns an EventBusSubscriber - */ - public abstract > EventBusSubscriber createEventBusSubscriber( - Function> process, - Class thriftStructClass, - String eventBusSubscriberId, - int maxConcurrentEvents); - - /** - * Returns a Clock. - */ - public abstract Clock getClock(); - - /** - * Returns a TweetOffensiveEvaluator. - */ - public abstract TweetOffensiveEvaluator getTweetOffensiveEvaluator(); - - /** - * Returns the cluster. - */ - public abstract EarlybirdCluster getEarlybirdCluster() throws NamingException; - - /** - * Returns the current penguin version(s). - */ - public abstract List getPenguinVersions() throws NamingException; - - /** - * Returns updated penguin version(s) depending on decider availability. - */ - public abstract List getCurrentlyEnabledPenguinVersions(); - - /** - * Returns a named entities strato column fetcher. - */ - public abstract NamedEntityFetcher getNamedEntityFetcher(); - - /** - * Returns audio space participants strato column fetcher. - */ - public abstract AudioSpaceParticipantsFetcher getAudioSpaceParticipantsFetcher(); - - /** - * Returns audio space core strato column fetcher. 
- */ - public abstract AudioSpaceCoreFetcher getAudioSpaceCoreFetcher(); -} diff --git a/src/java/com/twitter/search/ingester/util/jndi/BUILD b/src/java/com/twitter/search/ingester/util/jndi/BUILD deleted file mode 100644 index 4eaf908dd..000000000 --- a/src/java/com/twitter/search/ingester/util/jndi/BUILD +++ /dev/null @@ -1,9 +0,0 @@ -java_library( - sources = ["*.java"], - compiler_option_sets = ["fatal_warnings"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/twitter/search/ingester:legacy", - ], -) diff --git a/src/java/com/twitter/search/ingester/util/jndi/BUILD.docx b/src/java/com/twitter/search/ingester/util/jndi/BUILD.docx new file mode 100644 index 000000000..da2dbdf34 Binary files /dev/null and b/src/java/com/twitter/search/ingester/util/jndi/BUILD.docx differ diff --git a/src/java/com/twitter/search/ingester/util/jndi/JndiUtil.docx b/src/java/com/twitter/search/ingester/util/jndi/JndiUtil.docx new file mode 100644 index 000000000..18aa981d8 Binary files /dev/null and b/src/java/com/twitter/search/ingester/util/jndi/JndiUtil.docx differ diff --git a/src/java/com/twitter/search/ingester/util/jndi/JndiUtil.java b/src/java/com/twitter/search/ingester/util/jndi/JndiUtil.java deleted file mode 100644 index 8f50870cf..000000000 --- a/src/java/com/twitter/search/ingester/util/jndi/JndiUtil.java +++ /dev/null @@ -1,70 +0,0 @@ -package com.twitter.search.ingester.util.jndi; - -import java.util.Hashtable; -import javax.naming.Context; -import javax.naming.InitialContext; -import javax.naming.NameNotFoundException; - -import org.apache.naming.config.XmlConfigurator; - -public abstract class JndiUtil { - // This is different from the search repo---twitter-naming-devtest.xml is - // checked in as a resource in src/resources/com/twitter/search/ingester. 
- public static final String DEFAULT_JNDI_XML = - System.getProperty("jndiXml", "/com/twitter/search/ingester/twitter-naming-devtest.xml"); - protected static String jndiXml = DEFAULT_JNDI_XML; - protected static boolean testingMode = false; - - static { - System.setProperty("javax.xml.parsers.SAXParserFactory", - "org.apache.xerces.jaxp.SAXParserFactoryImpl"); - System.setProperty("javax.xml.parsers.DocumentBuilderFactory", - "com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderFactoryImpl"); - } - - public static void loadJNDI() { - loadJNDI(jndiXml); - } - - protected static void loadJNDI(String jndiXmlFile) { - try { - Hashtable props = new Hashtable<>(); - props.put(Context.INITIAL_CONTEXT_FACTORY, "org.apache.naming.java.javaURLContextFactory"); - Context jndiContext = new InitialContext(props); - try { - jndiContext.lookup("java:comp"); - setTestingModeFromJndiContext(jndiContext); - } catch (NameNotFoundException e) { - // No context. - XmlConfigurator.loadConfiguration(JndiUtil.class.getResourceAsStream(jndiXmlFile)); - } - } catch (Exception e) { - throw new RuntimeException(String.format("Failed to load JNDI configuration file=%s %s", - jndiXmlFile, e.getMessage()), e); - } - } - - public static void setJndiXml(String jndiXml) { - JndiUtil.jndiXml = jndiXml; - } - - public static String getJndiXml() { - return jndiXml; - } - - public static void setTestingMode(Boolean testingMode) { - JndiUtil.testingMode = testingMode; - } - - public static boolean isTestingMode() { - return testingMode; - } - - private static void setTestingModeFromJndiContext(Context jndiContext) { - try { - setTestingMode((Boolean) jndiContext.lookup("java:comp/env/testingMode")); - } catch (Exception e) { - setTestingMode(false); - } - } -} diff --git a/src/python/twitter/deepbird/projects/timelines/configs/recap_earlybird/feature_config.docx b/src/python/twitter/deepbird/projects/timelines/configs/recap_earlybird/feature_config.docx new file mode 100644 index 
000000000..6b0fc67e5 Binary files /dev/null and b/src/python/twitter/deepbird/projects/timelines/configs/recap_earlybird/feature_config.docx differ diff --git a/src/python/twitter/deepbird/projects/timelines/configs/recap_earlybird/feature_config.py b/src/python/twitter/deepbird/projects/timelines/configs/recap_earlybird/feature_config.py deleted file mode 100644 index 167756c01..000000000 --- a/src/python/twitter/deepbird/projects/timelines/configs/recap_earlybird/feature_config.py +++ /dev/null @@ -1,83 +0,0 @@ -# checkstyle: noqa -from twml.feature_config import FeatureConfigBuilder - - -def get_feature_config(data_spec_path, label): - return ( - FeatureConfigBuilder(data_spec_path=data_spec_path, debug=True) - .batch_add_features( - [ - ("ebd.author_specific_score", "A"), - ("ebd.has_diff_lang", "A"), - ("ebd.has_english_tweet_diff_ui_lang", "A"), - ("ebd.has_english_ui_diff_tweet_lang", "A"), - ("ebd.is_self_tweet", "A"), - ("ebd.tweet_age_in_secs", "A"), - ("encoded_tweet_features.favorite_count", "A"), - ("encoded_tweet_features.from_verified_account_flag", "A"), - ("encoded_tweet_features.has_card_flag", "A"), - # ("encoded_tweet_features.has_consumer_video_flag", "A"), - ("encoded_tweet_features.has_image_url_flag", "A"), - ("encoded_tweet_features.has_link_flag", "A"), - ("encoded_tweet_features.has_multiple_hashtags_or_trends_flag", "A"), - # ("encoded_tweet_features.has_multiple_media_flag", "A"), - ("encoded_tweet_features.has_native_image_flag", "A"), - ("encoded_tweet_features.has_news_url_flag", "A"), - ("encoded_tweet_features.has_periscope_flag", "A"), - ("encoded_tweet_features.has_pro_video_flag", "A"), - ("encoded_tweet_features.has_quote_flag", "A"), - ("encoded_tweet_features.has_trend_flag", "A"), - ("encoded_tweet_features.has_video_url_flag", "A"), - ("encoded_tweet_features.has_vine_flag", "A"), - ("encoded_tweet_features.has_visible_link_flag", "A"), - ("encoded_tweet_features.is_offensive_flag", "A"), - 
("encoded_tweet_features.is_reply_flag", "A"), - ("encoded_tweet_features.is_retweet_flag", "A"), - ("encoded_tweet_features.is_sensitive_content", "A"), - # ("encoded_tweet_features.is_user_new_flag", "A"), - ("encoded_tweet_features.language", "A"), - ("encoded_tweet_features.link_language", "A"), - ("encoded_tweet_features.num_hashtags", "A"), - ("encoded_tweet_features.num_mentions", "A"), - # ("encoded_tweet_features.profile_is_egg_flag", "A"), - ("encoded_tweet_features.reply_count", "A"), - ("encoded_tweet_features.retweet_count", "A"), - ("encoded_tweet_features.text_score", "A"), - ("encoded_tweet_features.user_reputation", "A"), - ("extended_encoded_tweet_features.embeds_impression_count", "A"), - ("extended_encoded_tweet_features.embeds_impression_count_v2", "A"), - ("extended_encoded_tweet_features.embeds_url_count", "A"), - ("extended_encoded_tweet_features.embeds_url_count_v2", "A"), - ("extended_encoded_tweet_features.favorite_count_v2", "A"), - ("extended_encoded_tweet_features.label_abusive_hi_rcl_flag", "A"), - ("extended_encoded_tweet_features.label_dup_content_flag", "A"), - ("extended_encoded_tweet_features.label_nsfw_hi_prc_flag", "A"), - ("extended_encoded_tweet_features.label_nsfw_hi_rcl_flag", "A"), - ("extended_encoded_tweet_features.label_spam_flag", "A"), - ("extended_encoded_tweet_features.label_spam_hi_rcl_flag", "A"), - ("extended_encoded_tweet_features.quote_count", "A"), - ("extended_encoded_tweet_features.reply_count_v2", "A"), - ("extended_encoded_tweet_features.retweet_count_v2", "A"), - ("extended_encoded_tweet_features.weighted_favorite_count", "A"), - ("extended_encoded_tweet_features.weighted_quote_count", "A"), - ("extended_encoded_tweet_features.weighted_reply_count", "A"), - ("extended_encoded_tweet_features.weighted_retweet_count", "A"), - ] - ) - .add_labels( - [ - label, # Tensor index: 0 - "recap.engagement.is_clicked", # Tensor index: 1 - "recap.engagement.is_favorited", # Tensor index: 2 - 
"recap.engagement.is_open_linked", # Tensor index: 3 - "recap.engagement.is_photo_expanded", # Tensor index: 4 - "recap.engagement.is_profile_clicked", # Tensor index: 5 - "recap.engagement.is_replied", # Tensor index: 6 - "recap.engagement.is_retweeted", # Tensor index: 7 - "recap.engagement.is_video_playback_50", # Tensor index: 8 - "timelines.earlybird_score", # Tensor index: 9 - ] - ) - .define_weight("meta.record_weight/type=earlybird") - .build() - ) diff --git a/src/python/twitter/deepbird/projects/timelines/configs/rectweet_earlybird/feature_config.docx b/src/python/twitter/deepbird/projects/timelines/configs/rectweet_earlybird/feature_config.docx new file mode 100644 index 000000000..f3395f0de Binary files /dev/null and b/src/python/twitter/deepbird/projects/timelines/configs/rectweet_earlybird/feature_config.docx differ diff --git a/src/python/twitter/deepbird/projects/timelines/configs/rectweet_earlybird/feature_config.py b/src/python/twitter/deepbird/projects/timelines/configs/rectweet_earlybird/feature_config.py deleted file mode 100644 index 85b7d7f10..000000000 --- a/src/python/twitter/deepbird/projects/timelines/configs/rectweet_earlybird/feature_config.py +++ /dev/null @@ -1,74 +0,0 @@ -# checkstyle: noqa -from twml.feature_config import FeatureConfigBuilder - - -def get_feature_config(data_spec_path, label): - return FeatureConfigBuilder(data_spec_path=data_spec_path, debug=True) \ - .batch_add_features( - [ - ("ebd.has_diff_lang", "A"), - ("ebd.tweet_age_in_secs", "A"), - ("encoded_tweet_features.composer_source_is_camera_flag", "A"), - ("encoded_tweet_features.favorite_count", "A"), - ("encoded_tweet_features.has_card_flag", "A"), - ("encoded_tweet_features.has_image_url_flag", "A"), - ("encoded_tweet_features.has_native_image_flag", "A"), - ("encoded_tweet_features.has_news_url_flag", "A"), - ("encoded_tweet_features.has_periscope_flag", "A"), - ("encoded_tweet_features.has_pro_video_flag", "A"), - ("encoded_tweet_features.has_quote_flag", 
"A"), - ("encoded_tweet_features.has_video_url_flag", "A"), - ("encoded_tweet_features.has_vine_flag", "A"), - ("encoded_tweet_features.has_visible_link_flag", "A"), - ("encoded_tweet_features.is_sensitive_content", "A"), - ("encoded_tweet_features.is_user_spam_flag", "A"), - ("encoded_tweet_features.link_language", "A"), - ("encoded_tweet_features.num_hashtags", "A"), - ("encoded_tweet_features.num_mentions", "A"), - ("encoded_tweet_features.reply_count", "A"), - ("encoded_tweet_features.retweet_count", "A"), - ("encoded_tweet_features.text_score", "A"), - ("encoded_tweet_features.user_reputation", "A"), - ("extended_encoded_tweet_features.decayed_favorite_count", "A"), - ("extended_encoded_tweet_features.decayed_quote_count", "A"), - ("extended_encoded_tweet_features.decayed_reply_count", "A"), - ("extended_encoded_tweet_features.decayed_retweet_count", "A"), - ("extended_encoded_tweet_features.embeds_impression_count_v2", "A"), - ("extended_encoded_tweet_features.embeds_url_count_v2", "A"), - ("extended_encoded_tweet_features.fake_favorite_count", "A"), - ("extended_encoded_tweet_features.fake_quote_count", "A"), - ("extended_encoded_tweet_features.fake_reply_count", "A"), - ("extended_encoded_tweet_features.fake_retweet_count", "A"), - ("extended_encoded_tweet_features.favorite_count_v2", "A"), - ("extended_encoded_tweet_features.label_dup_content_flag", "A"), - ("extended_encoded_tweet_features.label_nsfw_hi_prc_flag", "A"), - ("extended_encoded_tweet_features.label_nsfw_hi_rcl_flag", "A"), - ("extended_encoded_tweet_features.label_spam_hi_rcl_flag", "A"), - ("extended_encoded_tweet_features.periscope_exists", "A"), - ("extended_encoded_tweet_features.periscope_has_been_featured", "A"), - ("extended_encoded_tweet_features.periscope_is_currently_featured", "A"), - ("extended_encoded_tweet_features.periscope_is_from_quality_source", "A"), - ("extended_encoded_tweet_features.periscope_is_live", "A"), - ("extended_encoded_tweet_features.quote_count", "A"), - 
("extended_encoded_tweet_features.reply_count_v2", "A"), - ("extended_encoded_tweet_features.retweet_count_v2", "A"), - ("extended_encoded_tweet_features.weighted_favorite_count", "A"), - ("extended_encoded_tweet_features.weighted_quote_count", "A"), - ("extended_encoded_tweet_features.weighted_reply_count", "A"), - ("extended_encoded_tweet_features.weighted_retweet_count", "A"), - ("timelines.earlybird.visible_token_ratio", "A") - ] - ).add_labels([ - label, # Tensor index: 0 - "itl.engagement.is_clicked", # Tensor index: 1 - "itl.engagement.is_favorited", # Tensor index: 2 - "itl.engagement.is_open_linked", # Tensor index: 3 - "itl.engagement.is_photo_expanded", # Tensor index: 4 - "itl.engagement.is_profile_clicked", # Tensor index: 5 - "itl.engagement.is_replied", # Tensor index: 6 - "itl.engagement.is_retweeted", # Tensor index: 7 - "itl.engagement.is_video_playback_50", # Tensor index: 8 - "timelines.earlybird_score", # Tensor index: 9 - ]) \ - .define_weight("meta.record_weight/type=earlybird") \ - .build() diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/BUILD b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/BUILD deleted file mode 100644 index 0e889392e..000000000 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/BUILD +++ /dev/null @@ -1,23 +0,0 @@ -python3_library( - name = "libs_py3", - sources = ["*.py"], - tags = ["no-mypy"], - dependencies = [ - "src/python/twitter/deepbird/io", - "src/python/twitter/deepbird/projects/timelines/configs:all_configs", - "twml:twml-nodeps", - ], -) - -python37_binary( - name = "model_earlybird", - source = "train.py", - tags = ["no-mypy"], - dependencies = [ - ":libs_py3", - "3rdparty/python/_closures/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird:model_earlybird", - "src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly:libs_py3", - 
"src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model:libs_py3", - "twml", - ], -) diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/BUILD.docx b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/BUILD.docx new file mode 100644 index 000000000..19e6d5151 Binary files /dev/null and b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/BUILD.docx differ diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/README.docx b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/README.docx new file mode 100644 index 000000000..371925c73 Binary files /dev/null and b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/README.docx differ diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/README.md b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/README.md deleted file mode 100644 index 3eb9e6c74..000000000 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/README.md +++ /dev/null @@ -1,63 +0,0 @@ -# Earlybird Light Ranker - -*Note: the light ranker is an old part of the stack which we are currently in the process of replacing. -The current model was last trained several years ago, and uses some very strange features. -We are working on training a new model, and eventually rebuilding this part of the stack entirely.* - -The Earlybird light ranker is a logistic regression model which predicts the likelihood that the user will engage with a -tweet. -It is intended to be a simplified version of the heavy ranker which can run on a greater amount of tweets. - -There are currently 2 main light ranker models in use: one for ranking in network tweets (`recap_earlybird`), and -another for -out of network (UTEG) tweets (`rectweet_earlybird`). Both models are trained using the `train.py` script which is -included in this directory. 
They differ mainly in the set of features -used by the model. -The in network model uses -the `src/python/twitter/deepbird/projects/timelines/configs/recap/feature_config.py` file to define the -feature configuration, while the -out of network model uses `src/python/twitter/deepbird/projects/timelines/configs/rectweet_earlybird/feature_config.py`. - -The `train.py` script is essentially a series of hooks provided to for Twitter's `twml` framework to execute, -which is included under `twml/`. - -### Features - -The light ranker features pipeline is as follows: -![earlybird_features.png](earlybird_features.png) - -Some of these components are explained below: - -- Index Ingester: an indexing pipeline that handles the tweets as they are generated. This is the main input of - Earlybird, it produces Tweet Data (the basic information about the tweet, the text, the urls, media entities, facets, - etc) and Static Features (the features you can compute directly from a tweet right now, like whether it has URL, has - Cards, has quotes, etc); All information computed here are stored in index and flushed as each realtime index segments - become full. They are loaded back later from disk when Earlybird restarts. Note that the features may be computed in a - non-trivial way (like deciding the value of hasUrl), they could be computed and combined from some more "raw" - information in the tweet and from other services. - Signal Ingester: the ingester for Realtime Features, per-tweet features that can change after the tweet has been - indexed, mostly social engagements like retweetCount, favCount, replyCount, etc, along with some (future) spam signals - that's computed with later activities. These were collected and computed in a Heron topology by processing multiple - event streams and can be extended to support more features. -- User Table Features is another set of features per user. 
They are from User Table Updater, a different input that - processes a stream written by our user service. It's used to store sparse realtime user - information. These per-user features are propagated to the tweet being scored by - looking up the author of the tweet. -- Search Context Features are basically the information of current searcher, like their UI language, their own - produced/consumed language, and the current time (implied). They are combined with Tweet Data to compute some of the - features used in scoring. - -The scoring function in Earlybird uses both static and realtime features. Examples of static features used are: - -- Whether the tweet is a retweet -- Whether the tweet contains a link -- Whether this tweet has any trend words at ingestion time -- Whether the tweet is a reply -- A score for the static quality of the text, computed in TweetTextScorer.java in the Ingester. Based on the factors - such as offensiveness, content entropy, "shout" score, length, and readability. 
-- tweepcred, see top-level README.md - -Examples of realtime features used are: - -- Number of tweet likes/replies/retweets -- pToxicity and pBlock scores provided by health models diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/__init__.docx b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/__init__.docx new file mode 100644 index 000000000..e7973f7f4 Binary files /dev/null and b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/__init__.docx differ diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/__init__.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/constants.docx b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/constants.docx new file mode 100644 index 000000000..9297e8b9f Binary files /dev/null and b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/constants.docx differ diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/constants.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/constants.py deleted file mode 100644 index 57178b92c..000000000 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/constants.py +++ /dev/null @@ -1,21 +0,0 @@ -# checkstyle: noqa - -INDEX_BY_LABEL = { - "is_clicked": 1, - "is_favorited": 2, - "is_open_linked": 3, - "is_photo_expanded": 4, - "is_profile_clicked": 5, - "is_replied": 6, - "is_retweeted": 7, - "is_video_playback_50": 8 -} - -TARGET_LABEL_IDX = 0 -EB_SCORE_IDX = 9 - -LABEL_NAMES = [label_name for label_name, _ in sorted(INDEX_BY_LABEL.items(), key=lambda item: item[1])] - -PREDICTED_CLASSES = \ - ["tf_target"] + ["tf_" + label_name for label_name in LABEL_NAMES] + 
["tf_timelines.earlybird_score"] + \ - ["lolly_target"] + ["lolly_" + label_name for label_name in LABEL_NAMES] + ["lolly_timelines.earlybird_score"] diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/earlybird_features.png b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/earlybird_features.png deleted file mode 100644 index abba44ef1..000000000 Binary files a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/earlybird_features.png and /dev/null differ diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/example_weights.docx b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/example_weights.docx new file mode 100644 index 000000000..89daaeb3e Binary files /dev/null and b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/example_weights.docx differ diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/example_weights.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/example_weights.py deleted file mode 100644 index cf0c38ecc..000000000 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/example_weights.py +++ /dev/null @@ -1,43 +0,0 @@ -# checkstyle: noqa -import tensorflow.compat.v1 as tf -from .constants import INDEX_BY_LABEL, LABEL_NAMES - -# TODO: Read these from command line arguments, since they specify the existing example weights in the input data. 
-DEFAULT_WEIGHT_BY_LABEL = { - "is_clicked": 0.3, - "is_favorited": 1.0, - "is_open_linked": 0.1, - "is_photo_expanded": 0.03, - "is_profile_clicked": 1.0, - "is_replied": 9.0, - "is_retweeted": 1.0, - "is_video_playback_50": 0.01 -} - -def add_weight_arguments(parser): - for label_name in LABEL_NAMES: - parser.add_argument( - _make_weight_cli_argument_name(label_name), - type=float, - default=DEFAULT_WEIGHT_BY_LABEL[label_name], - dest=_make_weight_param_name(label_name) - ) - -def make_weights_tensor(input_weights, label, params): - ''' - Replaces the weights for each positive engagement and keeps the input weights for negative examples. - ''' - weight_tensors = [input_weights] - for label_name in LABEL_NAMES: - index, default_weight = INDEX_BY_LABEL[label_name], DEFAULT_WEIGHT_BY_LABEL[label_name] - weight_param_name =_make_weight_param_name(label_name) - weight_tensors.append( - tf.reshape(tf.math.scalar_mul(getattr(params, weight_param_name) - default_weight, label[:, index]), [-1, 1]) - ) - return tf.math.accumulate_n(weight_tensors) - -def _make_weight_cli_argument_name(label_name): - return f"--weight.{label_name}" - -def _make_weight_param_name(label_name): - return f"weight_{label_name}"