Merge 9807ebe26339bcd95d465e8fdad0b99815cd6716 into fb54d8b54984f89f7dba90a18e7c3048421464c3

This commit is contained in:
Pedro Luiz 2023-05-22 17:40:28 -05:00 committed by GitHub
commit b716094be3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 68 additions and 67 deletions

View File

@ -500,6 +500,58 @@ public class BasicIndexingConverter {
/** /**
* Build the correct ThriftIndexingEvent's fields based on retweet and reply status. * Build the correct ThriftIndexingEvent's fields based on retweet and reply status.
*
* <pre>
*
* We have six combinations here. A tweet can be
* 1) a reply to another tweet (then it has both in-reply-to-user-id and
* in-reply-to-status-id set),
* 2) directed-at a user (then it only has in-reply-to-user-id set),
* 3) not a reply at all.
* Additionally, it may or may not be a retweet (if it is, then it has retweet-user-id and
* retweet-status-id set).
*
* We want to set some fields unconditionally, and some fields (reference-author-id and
* shared-status-id) depending on the reply/retweet combination.
*
* 1. Normal tweet (not a reply, not a retweet). None of the fields should be set.
*
* 2. Reply to a tweet (both in-reply-to-user-id and in-reply-to-status-id set).
* IN_REPLY_TO_USER_ID_FIELD should be set to in-reply-to-user-id
* SHARED_STATUS_ID_CSF should be set to in-reply-to-status-id
* IS_REPLY_FLAG should be set
*
* 3. Directed-at a user (only in-reply-to-user-id is set).
* IN_REPLY_TO_USER_ID_FIELD should be set to in-reply-to-user-id
* IS_REPLY_FLAG should be set
*
* 4. Retweet of a normal tweet (retweet-user-id and retweet-status-id are set).
* RETWEET_SOURCE_USER_ID_FIELD should be set to retweet-user-id
* SHARED_STATUS_ID_CSF should be set to retweet-status-id
* IS_RETWEET_FLAG should be set
*
* 5. Retweet of a reply (both in-reply-to-user-id and in-reply-to-status-id set,
* retweet-user-id and retweet-status-id are set).
* RETWEET_SOURCE_USER_ID_FIELD should be set to retweet-user-id
* SHARED_STATUS_ID_CSF should be set to retweet-status-id (retweet beats reply!)
* IS_RETWEET_FLAG should be set
* IN_REPLY_TO_USER_ID_FIELD should be set to in-reply-to-user-id
* IS_REPLY_FLAG should NOT be set
*
* 6. Retweet of a directed-at tweet (only in-reply-to-user-id is set,
* retweet-user-id and retweet-status-id are set).
* RETWEET_SOURCE_USER_ID_FIELD should be set to retweet-user-id
* SHARED_STATUS_ID_CSF should be set to retweet-status-id
* IS_RETWEET_FLAG should be set
* IN_REPLY_TO_USER_ID_FIELD should be set to in-reply-to-user-id
* IS_REPLY_FLAG should NOT be set
*
* In other words:
* SHARED_STATUS_ID_CSF logic: if this is a retweet SHARED_STATUS_ID_CSF should be set to
* retweet-status-id, otherwise if it's a reply to a tweet, it should be set to
* in-reply-to-status-id.
*
* </pre>
*/ */
public static void buildRetweetAndReplyFields( public static void buildRetweetAndReplyFields(
long retweetUserIdVal, long retweetUserIdVal,
@ -508,68 +560,19 @@ public class BasicIndexingConverter {
long inReplyToUserIdVal, long inReplyToUserIdVal,
boolean strict, boolean strict,
EarlybirdThriftDocumentBuilder builder) { EarlybirdThriftDocumentBuilder builder) {
Optional<Long> retweetUserId = Optional.of(retweetUserIdVal).filter(x -> x > 0); Predicate<Long> isGreaterThanZero = id -> id > 0;
Optional<Long> sharedStatusId = Optional.of(sharedStatusIdVal).filter(x -> x > 0); Optional<Long> retweetUserId = Optional.of(retweetUserIdVal).filter(isGreaterThanZero);
Optional<Long> inReplyToUserId = Optional.of(inReplyToUserIdVal).filter(x -> x > 0); Optional<Long> sharedStatusId = Optional.of(sharedStatusIdVal).filter(isGreaterThanZero);
Optional<Long> inReplyToStatusId = Optional.of(inReplyToStatusIdVal).filter(x -> x > 0); Optional<Long> inReplyToUserId = Optional.of(inReplyToUserIdVal).filter(isGreaterThanZero);
Optional<Long> inReplyToStatusId = Optional.of(inReplyToStatusIdVal).filter(isGreaterThanZero);
// We have six combinations here. A Tweet can be
// 1) a reply to another tweet (then it has both in-reply-to-user-id and
// in-reply-to-status-id set),
// 2) directed-at a user (then it only has in-reply-to-user-id set),
// 3) not a reply at all.
// Additionally, it may or may not be a Retweet (if it is, then it has retweet-user-id and
// retweet-status-id set).
//
// We want to set some fields unconditionally, and some fields (reference-author-id and
// shared-status-id) depending on the reply/retweet combination.
//
// 1. Normal tweet (not a reply, not a retweet). None of the fields should be set.
//
// 2. Reply to a tweet (both in-reply-to-user-id and in-reply-to-status-id set).
// IN_REPLY_TO_USER_ID_FIELD should be set to in-reply-to-user-id
// SHARED_STATUS_ID_CSF should be set to in-reply-to-status-id
// IS_REPLY_FLAG should be set
//
// 3. Directed-at a user (only in-reply-to-user-id is set).
// IN_REPLY_TO_USER_ID_FIELD should be set to in-reply-to-user-id
// IS_REPLY_FLAG should be set
//
// 4. Retweet of a normal tweet (retweet-user-id and retweet-status-id are set).
// RETWEET_SOURCE_USER_ID_FIELD should be set to retweet-user-id
// SHARED_STATUS_ID_CSF should be set to retweet-status-id
// IS_RETWEET_FLAG should be set
//
// 5. Retweet of a reply (both in-reply-to-user-id and in-reply-to-status-id set,
// retweet-user-id and retweet-status-id are set).
// RETWEET_SOURCE_USER_ID_FIELD should be set to retweet-user-id
// SHARED_STATUS_ID_CSF should be set to retweet-status-id (retweet beats reply!)
// IS_RETWEET_FLAG should be set
// IN_REPLY_TO_USER_ID_FIELD should be set to in-reply-to-user-id
// IS_REPLY_FLAG should NOT be set
//
// 6. Retweet of a directed-at tweet (only in-reply-to-user-id is set,
// retweet-user-id and retweet-status-id are set).
// RETWEET_SOURCE_USER_ID_FIELD should be set to retweet-user-id
// SHARED_STATUS_ID_CSF should be set to retweet-status-id
// IS_RETWEET_FLAG should be set
// IN_REPLY_TO_USER_ID_FIELD should be set to in-reply-to-user-id
// IS_REPLY_FLAG should NOT be set
//
// In other words:
// SHARED_STATUS_ID_CSF logic: if this is a retweet SHARED_STATUS_ID_CSF should be set to
// retweet-status-id, otherwise if it's a reply to a tweet, it should be set to
// in-reply-to-status-id.
Preconditions.checkState(retweetUserId.isPresent() == sharedStatusId.isPresent()); Preconditions.checkState(retweetUserId.isPresent() == sharedStatusId.isPresent());
if (retweetUserId.isPresent()) { if (retweetUserId.isPresent()) {
builder.withNativeRetweet(retweetUserId.get(), sharedStatusId.get()); builder.withNativeRetweet(retweetUserId.get(), sharedStatusId.get());
if (inReplyToUserId.isPresent()) {
// Set IN_REPLY_TO_USER_ID_FIELD even if this is a retweet of a reply. // Set IN_REPLY_TO_USER_ID_FIELD even if this is a retweet of a reply.
builder.withInReplyToUserID(inReplyToUserId.get()); inReplyToUserId.ifPresent(builder::withInReplyToUserID);
}
} else { } else {
// If this is a retweet of a reply, we don't want to mark it as a reply, or override fields // If this is a retweet of a reply, we don't want to mark it as a reply, or override fields
// set by the retweet logic. // set by the retweet logic.

View File

@ -5,6 +5,7 @@ import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Locale; import java.util.Locale;
import java.util.Map; import java.util.Map;
import java.util.Objects;
import java.util.Optional; import java.util.Optional;
import java.util.Set; import java.util.Set;
import java.util.regex.Matcher; import java.util.regex.Matcher;
@ -140,7 +141,7 @@ public class EncodedFeatureBuilder {
// Extract some extra information from the message text. // Extract some extra information from the message text.
// Index stock symbols with $ prepended // Index stock symbols with $ prepended
textFeatures.getStocks().stream() textFeatures.getStocks().stream()
.filter(stock -> stock != null) .filter(Objects::nonNull)
.forEach(stock -> versionedTweetFeatures.addToStocks(stock.toLowerCase())); .forEach(stock -> versionedTweetFeatures.addToStocks(stock.toLowerCase()));
// Question marks // Question marks
@ -173,11 +174,9 @@ public class EncodedFeatureBuilder {
} }
// User name features // User name features
if (message.getFromUserDisplayName().isPresent()) { message.getFromUserDisplayName().ifPresent(id -> {
Locale locale = LanguageIdentifierHelper Locale locale = LanguageIdentifierHelper.identifyLanguage(id);
.identifyLanguage(message.getFromUserDisplayName().get()); String normalizedDisplayName = NormalizerHelper.normalize(id, locale, penguinVersion);
String normalizedDisplayName = NormalizerHelper.normalize(
message.getFromUserDisplayName().get(), locale, penguinVersion);
TokenizerResult result = TokenizerHelper TokenizerResult result = TokenizerHelper
.tokenizeTweet(normalizedDisplayName, locale, penguinVersion); .tokenizeTweet(normalizedDisplayName, locale, penguinVersion);
tokenSeqStream.reset(result.tokenSequence); tokenSeqStream.reset(result.tokenSequence);
@ -186,13 +185,12 @@ public class EncodedFeatureBuilder {
streamSerializer.serialize(tokenSeqStream)); streamSerializer.serialize(tokenSeqStream));
versionedTweetFeatures.setUserDisplayNameTokenStreamText(result.tokenSequence.toString()); versionedTweetFeatures.setUserDisplayNameTokenStreamText(result.tokenSequence.toString());
} catch (IOException e) { } catch (IOException e) {
LOG.error("TwitterTokenStream serialization error! Could not serialize: " LOG.error("TwitterTokenStream serialization error! Could not serialize: " + id);
+ message.getFromUserDisplayName().get());
SERIALIZE_FAILURE_COUNTERS_MAP.get(penguinVersion).increment(); SERIALIZE_FAILURE_COUNTERS_MAP.get(penguinVersion).increment();
versionedTweetFeatures.unsetUserDisplayNameTokenStream(); versionedTweetFeatures.unsetUserDisplayNameTokenStream();
versionedTweetFeatures.unsetUserDisplayNameTokenStreamText(); versionedTweetFeatures.unsetUserDisplayNameTokenStreamText();
} }
} });
String resolvedUrlsText = Joiner.on(" ").skipNulls().join(textFeatures.getResolvedUrlTokens()); String resolvedUrlsText = Joiner.on(" ").skipNulls().join(textFeatures.getResolvedUrlTokens());
versionedTweetFeatures.setNormalizedResolvedUrlText(resolvedUrlsText); versionedTweetFeatures.setNormalizedResolvedUrlText(resolvedUrlsText);