diff --git a/src/java/com/twitter/search/common/converter/earlybird/BasicIndexingConverter.java b/src/java/com/twitter/search/common/converter/earlybird/BasicIndexingConverter.java index afde8a84e..94168a795 100644 --- a/src/java/com/twitter/search/common/converter/earlybird/BasicIndexingConverter.java +++ b/src/java/com/twitter/search/common/converter/earlybird/BasicIndexingConverter.java @@ -500,6 +500,58 @@ public class BasicIndexingConverter { /** * Build the correct ThriftIndexingEvent's fields based on retweet and reply status. + * + *
+   *
+   * We have six combinations here. A tweet can be
+   *   1) a reply to another tweet (then it has both in-reply-to-user-id and
+   *      in-reply-to-status-id set),
+   *   2) directed-at a user (then it only has in-reply-to-user-id set),
+   *   3) not a reply at all.
+   * Additionally, it may or may not be a retweet (if it is, then it has retweet-user-id and
+   * retweet-status-id set).
+   *
+   * We want to set some fields unconditionally, and some fields (reference-author-id and
+   * shared-status-id) depending on the reply/retweet combination.
+   *
+   * 1. Normal tweet (not a reply, not a retweet). None of the fields should be set.
+   *
+   * 2. Reply to a tweet (both in-reply-to-user-id and in-reply-to-status-id set).
+   *   IN_REPLY_TO_USER_ID_FIELD    should be set to in-reply-to-user-id
+   *   SHARED_STATUS_ID_CSF         should be set to in-reply-to-status-id
+   *   IS_REPLY_FLAG                should be set
+   *
+   * 3. Directed-at a user (only in-reply-to-user-id is set).
+   *   IN_REPLY_TO_USER_ID_FIELD    should be set to in-reply-to-user-id
+   *   IS_REPLY_FLAG                should be set
+   *
+   * 4. Retweet of a normal tweet (retweet-user-id and retweet-status-id are set).
+   *   RETWEET_SOURCE_USER_ID_FIELD should be set to retweet-user-id
+   *   SHARED_STATUS_ID_CSF         should be set to retweet-status-id
+   *   IS_RETWEET_FLAG              should be set
+   *
+   * 5. Retweet of a reply (both in-reply-to-user-id and in-reply-to-status-id set,
+   * retweet-user-id and retweet-status-id are set).
+   *   RETWEET_SOURCE_USER_ID_FIELD should be set to retweet-user-id
+   *   SHARED_STATUS_ID_CSF         should be set to retweet-status-id (retweet beats reply!)
+   *   IS_RETWEET_FLAG              should be set
+   *   IN_REPLY_TO_USER_ID_FIELD    should be set to in-reply-to-user-id
+   *   IS_REPLY_FLAG                should NOT be set
+   *
+   * 6. Retweet of a directed-at tweet (only in-reply-to-user-id is set,
+   * retweet-user-id and retweet-status-id are set).
+   *   RETWEET_SOURCE_USER_ID_FIELD should be set to retweet-user-id
+   *   SHARED_STATUS_ID_CSF         should be set to retweet-status-id
+   *   IS_RETWEET_FLAG              should be set
+   *   IN_REPLY_TO_USER_ID_FIELD    should be set to in-reply-to-user-id
+   *   IS_REPLY_FLAG                should NOT be set
+   *
+   * In other words:
+   * SHARED_STATUS_ID_CSF logic: if this is a retweet SHARED_STATUS_ID_CSF should be set to
+   * retweet-status-id, otherwise if it's a reply to a tweet, it should be set to
+   * in-reply-to-status-id.
+   *
+   * 
*/ public static void buildRetweetAndReplyFields( long retweetUserIdVal, @@ -508,68 +560,19 @@ public class BasicIndexingConverter { long inReplyToUserIdVal, boolean strict, EarlybirdThriftDocumentBuilder builder) { - Optional retweetUserId = Optional.of(retweetUserIdVal).filter(x -> x > 0); - Optional sharedStatusId = Optional.of(sharedStatusIdVal).filter(x -> x > 0); - Optional inReplyToUserId = Optional.of(inReplyToUserIdVal).filter(x -> x > 0); - Optional inReplyToStatusId = Optional.of(inReplyToStatusIdVal).filter(x -> x > 0); - - // We have six combinations here. A Tweet can be - // 1) a reply to another tweet (then it has both in-reply-to-user-id and - // in-reply-to-status-id set), - // 2) directed-at a user (then it only has in-reply-to-user-id set), - // 3) not a reply at all. - // Additionally, it may or may not be a Retweet (if it is, then it has retweet-user-id and - // retweet-status-id set). - // - // We want to set some fields unconditionally, and some fields (reference-author-id and - // shared-status-id) depending on the reply/retweet combination. - // - // 1. Normal tweet (not a reply, not a retweet). None of the fields should be set. - // - // 2. Reply to a tweet (both in-reply-to-user-id and in-reply-to-status-id set). - // IN_REPLY_TO_USER_ID_FIELD should be set to in-reply-to-user-id - // SHARED_STATUS_ID_CSF should be set to in-reply-to-status-id - // IS_REPLY_FLAG should be set - // - // 3. Directed-at a user (only in-reply-to-user-id is set). - // IN_REPLY_TO_USER_ID_FIELD should be set to in-reply-to-user-id - // IS_REPLY_FLAG should be set - // - // 4. Retweet of a normal tweet (retweet-user-id and retweet-status-id are set). - // RETWEET_SOURCE_USER_ID_FIELD should be set to retweet-user-id - // SHARED_STATUS_ID_CSF should be set to retweet-status-id - // IS_RETWEET_FLAG should be set - // - // 5. Retweet of a reply (both in-reply-to-user-id and in-reply-to-status-id set, - // retweet-user-id and retweet-status-id are set). - // RETWEET_SOURCE_USER_ID_FIELD should be set to retweet-user-id - // SHARED_STATUS_ID_CSF should be set to retweet-status-id (retweet beats reply!) - // IS_RETWEET_FLAG should be set - // IN_REPLY_TO_USER_ID_FIELD should be set to in-reply-to-user-id - // IS_REPLY_FLAG should NOT be set - // - // 6. Retweet of a directed-at tweet (only in-reply-to-user-id is set, - // retweet-user-id and retweet-status-id are set). - // RETWEET_SOURCE_USER_ID_FIELD should be set to retweet-user-id - // SHARED_STATUS_ID_CSF should be set to retweet-status-id - // IS_RETWEET_FLAG should be set - // IN_REPLY_TO_USER_ID_FIELD should be set to in-reply-to-user-id - // IS_REPLY_FLAG should NOT be set - // - // In other words: - // SHARED_STATUS_ID_CSF logic: if this is a retweet SHARED_STATUS_ID_CSF should be set to - // retweet-status-id, otherwise if it's a reply to a tweet, it should be set to - // in-reply-to-status-id. + Predicate isGreaterThanZero = id -> id > 0; + Optional retweetUserId = Optional.of(retweetUserIdVal).filter(isGreaterThanZero); + Optional sharedStatusId = Optional.of(sharedStatusIdVal).filter(isGreaterThanZero); + Optional inReplyToUserId = Optional.of(inReplyToUserIdVal).filter(isGreaterThanZero); + Optional inReplyToStatusId = Optional.of(inReplyToStatusIdVal).filter(isGreaterThanZero); Preconditions.checkState(retweetUserId.isPresent() == sharedStatusId.isPresent()); if (retweetUserId.isPresent()) { builder.withNativeRetweet(retweetUserId.get(), sharedStatusId.get()); - if (inReplyToUserId.isPresent()) { - // Set IN_REPLY_TO_USER_ID_FIELD even if this is a retweet of a reply. - builder.withInReplyToUserID(inReplyToUserId.get()); - } + // Set IN_REPLY_TO_USER_ID_FIELD even if this is a retweet of a reply. + inReplyToUserId.ifPresent(builder::withInReplyToUserID); } else { // If this is a retweet of a reply, we don't want to mark it as a reply, or override fields // set by the retweet logic. diff --git a/src/java/com/twitter/search/common/converter/earlybird/EncodedFeatureBuilder.java b/src/java/com/twitter/search/common/converter/earlybird/EncodedFeatureBuilder.java index c5d6b1c76..92823e132 100644 --- a/src/java/com/twitter/search/common/converter/earlybird/EncodedFeatureBuilder.java +++ b/src/java/com/twitter/search/common/converter/earlybird/EncodedFeatureBuilder.java @@ -5,6 +5,7 @@ import java.util.HashSet; import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.Objects; import java.util.Optional; import java.util.Set; import java.util.regex.Matcher; @@ -140,7 +141,7 @@ public class EncodedFeatureBuilder { // Extract some extra information from the message text. // Index stock symbols with $ prepended textFeatures.getStocks().stream() - .filter(stock -> stock != null) + .filter(Objects::nonNull) .forEach(stock -> versionedTweetFeatures.addToStocks(stock.toLowerCase())); // Question marks @@ -173,26 +174,23 @@ public class EncodedFeatureBuilder { } // User name features - if (message.getFromUserDisplayName().isPresent()) { - Locale locale = LanguageIdentifierHelper - .identifyLanguage(message.getFromUserDisplayName().get()); - String normalizedDisplayName = NormalizerHelper.normalize( - message.getFromUserDisplayName().get(), locale, penguinVersion); + message.getFromUserDisplayName().ifPresent(id -> { + Locale locale = LanguageIdentifierHelper.identifyLanguage(id); + String normalizedDisplayName = NormalizerHelper.normalize(id, locale, penguinVersion); TokenizerResult result = TokenizerHelper - .tokenizeTweet(normalizedDisplayName, locale, penguinVersion); + .tokenizeTweet(normalizedDisplayName, locale, penguinVersion); tokenSeqStream.reset(result.tokenSequence); try { versionedTweetFeatures.setUserDisplayNameTokenStream( - streamSerializer.serialize(tokenSeqStream)); + streamSerializer.serialize(tokenSeqStream)); versionedTweetFeatures.setUserDisplayNameTokenStreamText(result.tokenSequence.toString()); } catch (IOException e) { - LOG.error("TwitterTokenStream serialization error! Could not serialize: " - + message.getFromUserDisplayName().get()); + LOG.error("TwitterTokenStream serialization error! Could not serialize: " + id); SERIALIZE_FAILURE_COUNTERS_MAP.get(penguinVersion).increment(); versionedTweetFeatures.unsetUserDisplayNameTokenStream(); versionedTweetFeatures.unsetUserDisplayNameTokenStreamText(); } - } + }); String resolvedUrlsText = Joiner.on(" ").skipNulls().join(textFeatures.getResolvedUrlTokens()); versionedTweetFeatures.setNormalizedResolvedUrlText(resolvedUrlsText);