feat: reduce duplication in EncodedFeatureBuilder

This commit is contained in:
pedroluiznogueira 2023-04-01 01:38:26 -03:00
parent 743241984a
commit eaeb4bc1d8

View File

@@ -5,6 +5,7 @@ import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Locale; import java.util.Locale;
import java.util.Map; import java.util.Map;
import java.util.Objects;
import java.util.Optional; import java.util.Optional;
import java.util.Set; import java.util.Set;
import java.util.regex.Matcher; import java.util.regex.Matcher;
@@ -140,7 +141,7 @@ public class EncodedFeatureBuilder {
// Extract some extra information from the message text. // Extract some extra information from the message text.
// Index stock symbols with $ prepended // Index stock symbols with $ prepended
textFeatures.getStocks().stream() textFeatures.getStocks().stream()
.filter(stock -> stock != null) .filter(Objects::nonNull)
.forEach(stock -> versionedTweetFeatures.addToStocks(stock.toLowerCase())); .forEach(stock -> versionedTweetFeatures.addToStocks(stock.toLowerCase()));
// Question marks // Question marks
@@ -173,26 +174,23 @@ public class EncodedFeatureBuilder {
} }
// User name features // User name features
if (message.getFromUserDisplayName().isPresent()) { message.getFromUserDisplayName().ifPresent(id -> {
Locale locale = LanguageIdentifierHelper Locale locale = LanguageIdentifierHelper.identifyLanguage(id);
.identifyLanguage(message.getFromUserDisplayName().get()); String normalizedDisplayName = NormalizerHelper.normalize(id, locale, penguinVersion);
String normalizedDisplayName = NormalizerHelper.normalize(
message.getFromUserDisplayName().get(), locale, penguinVersion);
TokenizerResult result = TokenizerHelper TokenizerResult result = TokenizerHelper
.tokenizeTweet(normalizedDisplayName, locale, penguinVersion); .tokenizeTweet(normalizedDisplayName, locale, penguinVersion);
tokenSeqStream.reset(result.tokenSequence); tokenSeqStream.reset(result.tokenSequence);
try { try {
versionedTweetFeatures.setUserDisplayNameTokenStream( versionedTweetFeatures.setUserDisplayNameTokenStream(
streamSerializer.serialize(tokenSeqStream)); streamSerializer.serialize(tokenSeqStream));
versionedTweetFeatures.setUserDisplayNameTokenStreamText(result.tokenSequence.toString()); versionedTweetFeatures.setUserDisplayNameTokenStreamText(result.tokenSequence.toString());
} catch (IOException e) { } catch (IOException e) {
LOG.error("TwitterTokenStream serialization error! Could not serialize: " LOG.error("TwitterTokenStream serialization error! Could not serialize: " + id);
+ message.getFromUserDisplayName().get());
SERIALIZE_FAILURE_COUNTERS_MAP.get(penguinVersion).increment(); SERIALIZE_FAILURE_COUNTERS_MAP.get(penguinVersion).increment();
versionedTweetFeatures.unsetUserDisplayNameTokenStream(); versionedTweetFeatures.unsetUserDisplayNameTokenStream();
versionedTweetFeatures.unsetUserDisplayNameTokenStreamText(); versionedTweetFeatures.unsetUserDisplayNameTokenStreamText();
} }
} });
String resolvedUrlsText = Joiner.on(" ").skipNulls().join(textFeatures.getResolvedUrlTokens()); String resolvedUrlsText = Joiner.on(" ").skipNulls().join(textFeatures.getResolvedUrlTokens());
versionedTweetFeatures.setNormalizedResolvedUrlText(resolvedUrlsText); versionedTweetFeatures.setNormalizedResolvedUrlText(resolvedUrlsText);