From eaeb4bc1d8c38320890429e0174a28fc9a8956e8 Mon Sep 17 00:00:00 2001 From: pedroluiznogueira Date: Sat, 1 Apr 2023 01:38:26 -0300 Subject: [PATCH] feat: reduce duplication in EncodedFeatureBuilder --- .../earlybird/EncodedFeatureBuilder.java | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/java/com/twitter/search/common/converter/earlybird/EncodedFeatureBuilder.java b/src/java/com/twitter/search/common/converter/earlybird/EncodedFeatureBuilder.java index c5d6b1c76..92823e132 100644 --- a/src/java/com/twitter/search/common/converter/earlybird/EncodedFeatureBuilder.java +++ b/src/java/com/twitter/search/common/converter/earlybird/EncodedFeatureBuilder.java @@ -5,6 +5,7 @@ import java.util.HashSet; import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.Objects; import java.util.Optional; import java.util.Set; import java.util.regex.Matcher; @@ -140,7 +141,7 @@ public class EncodedFeatureBuilder { // Extract some extra information from the message text. // Index stock symbols with $ prepended textFeatures.getStocks().stream() - .filter(stock -> stock != null) + .filter(Objects::nonNull) .forEach(stock -> versionedTweetFeatures.addToStocks(stock.toLowerCase())); // Question marks @@ -173,26 +174,23 @@ public class EncodedFeatureBuilder { } // User name features - if (message.getFromUserDisplayName().isPresent()) { - Locale locale = LanguageIdentifierHelper - .identifyLanguage(message.getFromUserDisplayName().get()); - String normalizedDisplayName = NormalizerHelper.normalize( - message.getFromUserDisplayName().get(), locale, penguinVersion); + message.getFromUserDisplayName().ifPresent(id -> { + Locale locale = LanguageIdentifierHelper.identifyLanguage(id); + String normalizedDisplayName = NormalizerHelper.normalize(id, locale, penguinVersion); TokenizerResult result = TokenizerHelper - .tokenizeTweet(normalizedDisplayName, locale, penguinVersion); + .tokenizeTweet(normalizedDisplayName, locale, penguinVersion); tokenSeqStream.reset(result.tokenSequence); try { versionedTweetFeatures.setUserDisplayNameTokenStream( - streamSerializer.serialize(tokenSeqStream)); + streamSerializer.serialize(tokenSeqStream)); versionedTweetFeatures.setUserDisplayNameTokenStreamText(result.tokenSequence.toString()); } catch (IOException e) { - LOG.error("TwitterTokenStream serialization error! Could not serialize: " - + message.getFromUserDisplayName().get()); + LOG.error("TwitterTokenStream serialization error! Could not serialize: " + id); SERIALIZE_FAILURE_COUNTERS_MAP.get(penguinVersion).increment(); versionedTweetFeatures.unsetUserDisplayNameTokenStream(); versionedTweetFeatures.unsetUserDisplayNameTokenStreamText(); } - } + }); String resolvedUrlsText = Joiner.on(" ").skipNulls().join(textFeatures.getResolvedUrlTokens()); versionedTweetFeatures.setNormalizedResolvedUrlText(resolvedUrlsText);