diff --git a/src/java/com/twitter/search/common/converter/earlybird/BasicIndexingConverter.java b/src/java/com/twitter/search/common/converter/earlybird/BasicIndexingConverter.java
index afde8a84e..94168a795 100644
--- a/src/java/com/twitter/search/common/converter/earlybird/BasicIndexingConverter.java
+++ b/src/java/com/twitter/search/common/converter/earlybird/BasicIndexingConverter.java
@@ -500,6 +500,58 @@ public class BasicIndexingConverter {
/**
* Build the correct ThriftIndexingEvent's fields based on retweet and reply status.
+ *
+ *
+ * <pre>
+ * We have six combinations here. A tweet can be
+ * 1) a reply to another tweet (then it has both in-reply-to-user-id and
+ * in-reply-to-status-id set),
+ * 2) directed-at a user (then it only has in-reply-to-user-id set),
+ * 3) not a reply at all.
+ * Additionally, it may or may not be a retweet (if it is, then it has retweet-user-id and
+ * retweet-status-id set).
+ *
+ * We want to set some fields unconditionally, and some fields (reference-author-id and
+ * shared-status-id) depending on the reply/retweet combination.
+ *
+ * 1. Normal tweet (not a reply, not a retweet). None of the fields should be set.
+ *
+ * 2. Reply to a tweet (both in-reply-to-user-id and in-reply-to-status-id set).
+ * IN_REPLY_TO_USER_ID_FIELD should be set to in-reply-to-user-id
+ * SHARED_STATUS_ID_CSF should be set to in-reply-to-status-id
+ * IS_REPLY_FLAG should be set
+ *
+ * 3. Directed-at a user (only in-reply-to-user-id is set).
+ * IN_REPLY_TO_USER_ID_FIELD should be set to in-reply-to-user-id
+ * IS_REPLY_FLAG should be set
+ *
+ * 4. Retweet of a normal tweet (retweet-user-id and retweet-status-id are set).
+ * RETWEET_SOURCE_USER_ID_FIELD should be set to retweet-user-id
+ * SHARED_STATUS_ID_CSF should be set to retweet-status-id
+ * IS_RETWEET_FLAG should be set
+ *
+ * 5. Retweet of a reply (both in-reply-to-user-id and in-reply-to-status-id set,
+ * retweet-user-id and retweet-status-id are set).
+ * RETWEET_SOURCE_USER_ID_FIELD should be set to retweet-user-id
+ * SHARED_STATUS_ID_CSF should be set to retweet-status-id (retweet beats reply!)
+ * IS_RETWEET_FLAG should be set
+ * IN_REPLY_TO_USER_ID_FIELD should be set to in-reply-to-user-id
+ * IS_REPLY_FLAG should NOT be set
+ *
+ * 6. Retweet of a directed-at tweet (only in-reply-to-user-id is set,
+ * retweet-user-id and retweet-status-id are set).
+ * RETWEET_SOURCE_USER_ID_FIELD should be set to retweet-user-id
+ * SHARED_STATUS_ID_CSF should be set to retweet-status-id
+ * IS_RETWEET_FLAG should be set
+ * IN_REPLY_TO_USER_ID_FIELD should be set to in-reply-to-user-id
+ * IS_REPLY_FLAG should NOT be set
+ *
+ * In other words:
+ * SHARED_STATUS_ID_CSF logic: if this is a retweet, SHARED_STATUS_ID_CSF should be set to
+ * retweet-status-id, otherwise if it's a reply to a tweet, it should be set to
+ * in-reply-to-status-id.
+ * </pre>
+ *
*/
public static void buildRetweetAndReplyFields(
long retweetUserIdVal,
@@ -508,68 +560,19 @@ public class BasicIndexingConverter {
long inReplyToUserIdVal,
boolean strict,
EarlybirdThriftDocumentBuilder builder) {
- Optional retweetUserId = Optional.of(retweetUserIdVal).filter(x -> x > 0);
- Optional sharedStatusId = Optional.of(sharedStatusIdVal).filter(x -> x > 0);
- Optional inReplyToUserId = Optional.of(inReplyToUserIdVal).filter(x -> x > 0);
- Optional inReplyToStatusId = Optional.of(inReplyToStatusIdVal).filter(x -> x > 0);
-
- // We have six combinations here. A Tweet can be
- // 1) a reply to another tweet (then it has both in-reply-to-user-id and
- // in-reply-to-status-id set),
- // 2) directed-at a user (then it only has in-reply-to-user-id set),
- // 3) not a reply at all.
- // Additionally, it may or may not be a Retweet (if it is, then it has retweet-user-id and
- // retweet-status-id set).
- //
- // We want to set some fields unconditionally, and some fields (reference-author-id and
- // shared-status-id) depending on the reply/retweet combination.
- //
- // 1. Normal tweet (not a reply, not a retweet). None of the fields should be set.
- //
- // 2. Reply to a tweet (both in-reply-to-user-id and in-reply-to-status-id set).
- // IN_REPLY_TO_USER_ID_FIELD should be set to in-reply-to-user-id
- // SHARED_STATUS_ID_CSF should be set to in-reply-to-status-id
- // IS_REPLY_FLAG should be set
- //
- // 3. Directed-at a user (only in-reply-to-user-id is set).
- // IN_REPLY_TO_USER_ID_FIELD should be set to in-reply-to-user-id
- // IS_REPLY_FLAG should be set
- //
- // 4. Retweet of a normal tweet (retweet-user-id and retweet-status-id are set).
- // RETWEET_SOURCE_USER_ID_FIELD should be set to retweet-user-id
- // SHARED_STATUS_ID_CSF should be set to retweet-status-id
- // IS_RETWEET_FLAG should be set
- //
- // 5. Retweet of a reply (both in-reply-to-user-id and in-reply-to-status-id set,
- // retweet-user-id and retweet-status-id are set).
- // RETWEET_SOURCE_USER_ID_FIELD should be set to retweet-user-id
- // SHARED_STATUS_ID_CSF should be set to retweet-status-id (retweet beats reply!)
- // IS_RETWEET_FLAG should be set
- // IN_REPLY_TO_USER_ID_FIELD should be set to in-reply-to-user-id
- // IS_REPLY_FLAG should NOT be set
- //
- // 6. Retweet of a directed-at tweet (only in-reply-to-user-id is set,
- // retweet-user-id and retweet-status-id are set).
- // RETWEET_SOURCE_USER_ID_FIELD should be set to retweet-user-id
- // SHARED_STATUS_ID_CSF should be set to retweet-status-id
- // IS_RETWEET_FLAG should be set
- // IN_REPLY_TO_USER_ID_FIELD should be set to in-reply-to-user-id
- // IS_REPLY_FLAG should NOT be set
- //
- // In other words:
- // SHARED_STATUS_ID_CSF logic: if this is a retweet SHARED_STATUS_ID_CSF should be set to
- // retweet-status-id, otherwise if it's a reply to a tweet, it should be set to
- // in-reply-to-status-id.
+ Predicate isGreaterThanZero = id -> id > 0;
+ Optional retweetUserId = Optional.of(retweetUserIdVal).filter(isGreaterThanZero);
+ Optional sharedStatusId = Optional.of(sharedStatusIdVal).filter(isGreaterThanZero);
+ Optional inReplyToUserId = Optional.of(inReplyToUserIdVal).filter(isGreaterThanZero);
+ Optional inReplyToStatusId = Optional.of(inReplyToStatusIdVal).filter(isGreaterThanZero);
Preconditions.checkState(retweetUserId.isPresent() == sharedStatusId.isPresent());
if (retweetUserId.isPresent()) {
builder.withNativeRetweet(retweetUserId.get(), sharedStatusId.get());
- if (inReplyToUserId.isPresent()) {
- // Set IN_REPLY_TO_USER_ID_FIELD even if this is a retweet of a reply.
- builder.withInReplyToUserID(inReplyToUserId.get());
- }
+ // Set IN_REPLY_TO_USER_ID_FIELD even if this is a retweet of a reply.
+ inReplyToUserId.ifPresent(builder::withInReplyToUserID);
} else {
// If this is a retweet of a reply, we don't want to mark it as a reply, or override fields
// set by the retweet logic.
diff --git a/src/java/com/twitter/search/common/converter/earlybird/EncodedFeatureBuilder.java b/src/java/com/twitter/search/common/converter/earlybird/EncodedFeatureBuilder.java
index c5d6b1c76..92823e132 100644
--- a/src/java/com/twitter/search/common/converter/earlybird/EncodedFeatureBuilder.java
+++ b/src/java/com/twitter/search/common/converter/earlybird/EncodedFeatureBuilder.java
@@ -5,6 +5,7 @@ import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
+import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.regex.Matcher;
@@ -140,7 +141,7 @@ public class EncodedFeatureBuilder {
// Extract some extra information from the message text.
// Index stock symbols with $ prepended
textFeatures.getStocks().stream()
- .filter(stock -> stock != null)
+ .filter(Objects::nonNull)
.forEach(stock -> versionedTweetFeatures.addToStocks(stock.toLowerCase()));
// Question marks
@@ -173,26 +174,23 @@ public class EncodedFeatureBuilder {
}
// User name features
- if (message.getFromUserDisplayName().isPresent()) {
- Locale locale = LanguageIdentifierHelper
- .identifyLanguage(message.getFromUserDisplayName().get());
- String normalizedDisplayName = NormalizerHelper.normalize(
- message.getFromUserDisplayName().get(), locale, penguinVersion);
+ message.getFromUserDisplayName().ifPresent(id -> {
+ Locale locale = LanguageIdentifierHelper.identifyLanguage(id);
+ String normalizedDisplayName = NormalizerHelper.normalize(id, locale, penguinVersion);
TokenizerResult result = TokenizerHelper
- .tokenizeTweet(normalizedDisplayName, locale, penguinVersion);
+ .tokenizeTweet(normalizedDisplayName, locale, penguinVersion);
tokenSeqStream.reset(result.tokenSequence);
try {
versionedTweetFeatures.setUserDisplayNameTokenStream(
- streamSerializer.serialize(tokenSeqStream));
+ streamSerializer.serialize(tokenSeqStream));
versionedTweetFeatures.setUserDisplayNameTokenStreamText(result.tokenSequence.toString());
} catch (IOException e) {
- LOG.error("TwitterTokenStream serialization error! Could not serialize: "
- + message.getFromUserDisplayName().get());
+ LOG.error("TwitterTokenStream serialization error! Could not serialize: " + id);
SERIALIZE_FAILURE_COUNTERS_MAP.get(penguinVersion).increment();
versionedTweetFeatures.unsetUserDisplayNameTokenStream();
versionedTweetFeatures.unsetUserDisplayNameTokenStreamText();
}
- }
+ });
String resolvedUrlsText = Joiner.on(" ").skipNulls().join(textFeatures.getResolvedUrlTokens());
versionedTweetFeatures.setNormalizedResolvedUrlText(resolvedUrlsText);