From c2766410623db2b4bb76e58ef8b6ba26d6caf280 Mon Sep 17 00:00:00 2001 From: pedroluiznogueira Date: Sat, 1 Apr 2023 00:43:06 -0300 Subject: [PATCH 1/4] feat: improving readability of buildRetweetAndReplyFields(..) in BasicIndexingConverter - as this method already has a java doc, why not have the entire logic explained on it, instead of inside the method - since the lambda parameters of the Predicate represent an id, giving them better names than 'x' --- .../earlybird/BasicIndexingConverter.java | 120 +++++++++--------- 1 file changed, 62 insertions(+), 58 deletions(-) diff --git a/src/java/com/twitter/search/common/converter/earlybird/BasicIndexingConverter.java b/src/java/com/twitter/search/common/converter/earlybird/BasicIndexingConverter.java index ddd9e50b3..873b44f7e 100644 --- a/src/java/com/twitter/search/common/converter/earlybird/BasicIndexingConverter.java +++ b/src/java/com/twitter/search/common/converter/earlybird/BasicIndexingConverter.java @@ -500,66 +500,70 @@ public class BasicIndexingConverter { /** * Build the correct ThriftIndexingEvent's fields based on retweet and reply status. + * + *
+   *
+   * We have six combinations here. A tweet can be
+   *   1) a reply to another tweet (then it has both in-reply-to-user-id and
+   *      in-reply-to-status-id set),
+   *   2) directed-at a user (then it only has in-reply-to-user-id set),
+   *   3) not a reply at all.
+   * Additionally, it may or may not be a retweet (if it is, then it has retweet-user-id and
+   * retweet-status-id set).
+   *
+   * We want to set some fields unconditionally, and some fields (reference-author-id and
+   * shared-status-id) depending on the reply/retweet combination.
+   *
+   * 1. Normal tweet (not a reply, not a retweet). None of the fields should be set.
+   *
+   * 2. Reply to a tweet (both in-reply-to-user-id and in-reply-to-status-id set).
+   *   IN_REPLY_TO_USER_ID_FIELD    should be set to in-reply-to-user-id
+   *   SHARED_STATUS_ID_CSF         should be set to in-reply-to-status-id
+   *   IS_REPLY_FLAG                should be set
+   *
+   * 3. Directed-at a user (only in-reply-to-user-id is set).
+   *   IN_REPLY_TO_USER_ID_FIELD    should be set to in-reply-to-user-id
+   *   IS_REPLY_FLAG                should be set
+   *
+   * 4. Retweet of a normal tweet (retweet-user-id and retweet-status-id are set).
+   *   RETWEET_SOURCE_USER_ID_FIELD should be set to retweet-user-id
+   *   SHARED_STATUS_ID_CSF         should be set to retweet-status-id
+   *   IS_RETWEET_FLAG              should be set
+   *
+   * 5. Retweet of a reply (both in-reply-to-user-id and in-reply-to-status-id set,
+   * retweet-user-id and retweet-status-id are set).
+   *   RETWEET_SOURCE_USER_ID_FIELD should be set to retweet-user-id
+   *   SHARED_STATUS_ID_CSF         should be set to retweet-status-id (retweet beats reply!)
+   *   IS_RETWEET_FLAG              should be set
+   *   IN_REPLY_TO_USER_ID_FIELD    should be set to in-reply-to-user-id
+   *   IS_REPLY_FLAG                should NOT be set
+   *
+   * 6. Retweet of a directed-at tweet (only in-reply-to-user-id is set,
+   * retweet-user-id and retweet-status-id are set).
+   *   RETWEET_SOURCE_USER_ID_FIELD should be set to retweet-user-id
+   *   SHARED_STATUS_ID_CSF         should be set to retweet-status-id
+   *   IS_RETWEET_FLAG              should be set
+   *   IN_REPLY_TO_USER_ID_FIELD    should be set to in-reply-to-user-id
+   *   IS_REPLY_FLAG                should NOT be set
+   *
+   * In other words:
+   * SHARED_STATUS_ID_CSF logic: if this is a retweet SHARED_STATUS_ID_CSF should be set to
+   * retweet-status-id, otherwise if it's a reply to a tweet, it should be set to
+   * in-reply-to-status-id.
+   *
+   * 
*/ public static void buildRetweetAndReplyFields( - long retweetUserIdVal, - long sharedStatusIdVal, - long inReplyToStatusIdVal, - long inReplyToUserIdVal, - boolean strict, - EarlybirdThriftDocumentBuilder builder) { - Optional retweetUserId = Optional.of(retweetUserIdVal).filter(x -> x > 0); - Optional sharedStatusId = Optional.of(sharedStatusIdVal).filter(x -> x > 0); - Optional inReplyToUserId = Optional.of(inReplyToUserIdVal).filter(x -> x > 0); - Optional inReplyToStatusId = Optional.of(inReplyToStatusIdVal).filter(x -> x > 0); - - // We have six combinations here. A tweet can be - // 1) a reply to another tweet (then it has both in-reply-to-user-id and - // in-reply-to-status-id set), - // 2) directed-at a user (then it only has in-reply-to-user-id set), - // 3) not a reply at all. - // Additionally, it may or may not be a retweet (if it is, then it has retweet-user-id and - // retweet-status-id set). - // - // We want to set some fields unconditionally, and some fields (reference-author-id and - // shared-status-id) depending on the reply/retweet combination. - // - // 1. Normal tweet (not a reply, not a retweet). None of the fields should be set. - // - // 2. Reply to a tweet (both in-reply-to-user-id and in-reply-to-status-id set). - // IN_REPLY_TO_USER_ID_FIELD should be set to in-reply-to-user-id - // SHARED_STATUS_ID_CSF should be set to in-reply-to-status-id - // IS_REPLY_FLAG should be set - // - // 3. Directed-at a user (only in-reply-to-user-id is set). - // IN_REPLY_TO_USER_ID_FIELD should be set to in-reply-to-user-id - // IS_REPLY_FLAG should be set - // - // 4. Retweet of a normal tweet (retweet-user-id and retweet-status-id are set). - // RETWEET_SOURCE_USER_ID_FIELD should be set to retweet-user-id - // SHARED_STATUS_ID_CSF should be set to retweet-status-id - // IS_RETWEET_FLAG should be set - // - // 5. Retweet of a reply (both in-reply-to-user-id and in-reply-to-status-id set, - // retweet-user-id and retweet-status-id are set). - // RETWEET_SOURCE_USER_ID_FIELD should be set to retweet-user-id - // SHARED_STATUS_ID_CSF should be set to retweet-status-id (retweet beats reply!) - // IS_RETWEET_FLAG should be set - // IN_REPLY_TO_USER_ID_FIELD should be set to in-reply-to-user-id - // IS_REPLY_FLAG should NOT be set - // - // 6. Retweet of a directed-at tweet (only in-reply-to-user-id is set, - // retweet-user-id and retweet-status-id are set). - // RETWEET_SOURCE_USER_ID_FIELD should be set to retweet-user-id - // SHARED_STATUS_ID_CSF should be set to retweet-status-id - // IS_RETWEET_FLAG should be set - // IN_REPLY_TO_USER_ID_FIELD should be set to in-reply-to-user-id - // IS_REPLY_FLAG should NOT be set - // - // In other words: - // SHARED_STATUS_ID_CSF logic: if this is a retweet SHARED_STATUS_ID_CSF should be set to - // retweet-status-id, otherwise if it's a reply to a tweet, it should be set to - // in-reply-to-status-id. + long retweetUserIdVal, + long sharedStatusIdVal, + long inReplyToStatusIdVal, + long inReplyToUserIdVal, + boolean strict, + EarlybirdThriftDocumentBuilder builder) { + Optional retweetUserId = Optional.of(retweetUserIdVal).filter(id -> id > 0); + Optional sharedStatusId = Optional.of(sharedStatusIdVal).filter(id -> id > 0); + Optional inReplyToUserId = Optional.of(inReplyToUserIdVal).filter(id -> id > 0); + Optional inReplyToStatusId = Optional.of(inReplyToStatusIdVal).filter(id -> id > 0); Preconditions.checkState(retweetUserId.isPresent() == sharedStatusId.isPresent()); From 743241984aa36f0514d0a8e62d1a31764d4fd419 Mon Sep 17 00:00:00 2001 From: pedroluiznogueira Date: Sat, 1 Apr 2023 01:18:47 -0300 Subject: [PATCH 2/4] feat: use Optional.ifPresent instead of an if Optional.ifPresent --- .../common/converter/earlybird/BasicIndexingConverter.java | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/java/com/twitter/search/common/converter/earlybird/BasicIndexingConverter.java b/src/java/com/twitter/search/common/converter/earlybird/BasicIndexingConverter.java index 873b44f7e..2cfc3b986 100644 --- a/src/java/com/twitter/search/common/converter/earlybird/BasicIndexingConverter.java +++ b/src/java/com/twitter/search/common/converter/earlybird/BasicIndexingConverter.java @@ -570,10 +570,8 @@ public class BasicIndexingConverter { if (retweetUserId.isPresent()) { builder.withNativeRetweet(retweetUserId.get(), sharedStatusId.get()); - if (inReplyToUserId.isPresent()) { - // Set IN_REPLY_TO_USER_ID_FIELD even if this is a retweet of a reply. - builder.withInReplyToUserID(inReplyToUserId.get()); - } + // Set IN_REPLY_TO_USER_ID_FIELD even if this is a retweet of a reply. + inReplyToUserId.ifPresent(builder::withInReplyToUserID); } else { // If this is a retweet of a reply, we don't want to mark it as a reply, or override fields // set by the retweet logic. From eaeb4bc1d8c38320890429e0174a28fc9a8956e8 Mon Sep 17 00:00:00 2001 From: pedroluiznogueira Date: Sat, 1 Apr 2023 01:38:26 -0300 Subject: [PATCH 3/4] feat: reduce duplication in EncodedFeatureBuilder --- .../earlybird/EncodedFeatureBuilder.java | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/java/com/twitter/search/common/converter/earlybird/EncodedFeatureBuilder.java b/src/java/com/twitter/search/common/converter/earlybird/EncodedFeatureBuilder.java index c5d6b1c76..92823e132 100644 --- a/src/java/com/twitter/search/common/converter/earlybird/EncodedFeatureBuilder.java +++ b/src/java/com/twitter/search/common/converter/earlybird/EncodedFeatureBuilder.java @@ -5,6 +5,7 @@ import java.util.HashSet; import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.Objects; import java.util.Optional; import java.util.Set; import java.util.regex.Matcher; @@ -140,7 +141,7 @@ public class EncodedFeatureBuilder { // Extract some extra information from the message text. // Index stock symbols with $ prepended textFeatures.getStocks().stream() - .filter(stock -> stock != null) + .filter(Objects::nonNull) .forEach(stock -> versionedTweetFeatures.addToStocks(stock.toLowerCase())); // Question marks @@ -173,26 +174,23 @@ public class EncodedFeatureBuilder { } // User name features - if (message.getFromUserDisplayName().isPresent()) { - Locale locale = LanguageIdentifierHelper - .identifyLanguage(message.getFromUserDisplayName().get()); - String normalizedDisplayName = NormalizerHelper.normalize( - message.getFromUserDisplayName().get(), locale, penguinVersion); + message.getFromUserDisplayName().ifPresent(id -> { + Locale locale = LanguageIdentifierHelper.identifyLanguage(id); + String normalizedDisplayName = NormalizerHelper.normalize(id, locale, penguinVersion); TokenizerResult result = TokenizerHelper - .tokenizeTweet(normalizedDisplayName, locale, penguinVersion); + .tokenizeTweet(normalizedDisplayName, locale, penguinVersion); tokenSeqStream.reset(result.tokenSequence); try { versionedTweetFeatures.setUserDisplayNameTokenStream( - streamSerializer.serialize(tokenSeqStream)); + streamSerializer.serialize(tokenSeqStream)); versionedTweetFeatures.setUserDisplayNameTokenStreamText(result.tokenSequence.toString()); } catch (IOException e) { - LOG.error("TwitterTokenStream serialization error! Could not serialize: " - + message.getFromUserDisplayName().get()); + LOG.error("TwitterTokenStream serialization error! Could not serialize: " + id); SERIALIZE_FAILURE_COUNTERS_MAP.get(penguinVersion).increment(); versionedTweetFeatures.unsetUserDisplayNameTokenStream(); versionedTweetFeatures.unsetUserDisplayNameTokenStreamText(); } - } + }); String resolvedUrlsText = Joiner.on(" ").skipNulls().join(textFeatures.getResolvedUrlTokens()); versionedTweetFeatures.setNormalizedResolvedUrlText(resolvedUrlsText); From 97fd2dc82123e1d15770c41fa2f97a4df3a567e1 Mon Sep 17 00:00:00 2001 From: pedroluiznogueira Date: Sat, 1 Apr 2023 01:39:53 -0300 Subject: [PATCH 4/4] feat: reduce duplication in BasicIndexingConverter --- .../converter/earlybird/BasicIndexingConverter.java | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/java/com/twitter/search/common/converter/earlybird/BasicIndexingConverter.java b/src/java/com/twitter/search/common/converter/earlybird/BasicIndexingConverter.java index 2cfc3b986..5966805a5 100644 --- a/src/java/com/twitter/search/common/converter/earlybird/BasicIndexingConverter.java +++ b/src/java/com/twitter/search/common/converter/earlybird/BasicIndexingConverter.java @@ -560,10 +560,11 @@ public class BasicIndexingConverter { long inReplyToUserIdVal, boolean strict, EarlybirdThriftDocumentBuilder builder) { - Optional retweetUserId = Optional.of(retweetUserIdVal).filter(id -> id > 0); - Optional sharedStatusId = Optional.of(sharedStatusIdVal).filter(id -> id > 0); - Optional inReplyToUserId = Optional.of(inReplyToUserIdVal).filter(id -> id > 0); - Optional inReplyToStatusId = Optional.of(inReplyToStatusIdVal).filter(id -> id > 0); + Predicate isGreaterThanZero = id -> id > 0; + Optional retweetUserId = Optional.of(retweetUserIdVal).filter(isGreaterThanZero); + Optional sharedStatusId = Optional.of(sharedStatusIdVal).filter(isGreaterThanZero); + Optional inReplyToUserId = Optional.of(inReplyToUserIdVal).filter(isGreaterThanZero); + Optional inReplyToStatusId = Optional.of(inReplyToStatusIdVal).filter(isGreaterThanZero); Preconditions.checkState(retweetUserId.isPresent() == sharedStatusId.isPresent());