From becdf494dbfc289fcc4c0e27eac5ee6f0ddd3d33 Mon Sep 17 00:00:00 2001 From: aevitas Date: Sat, 1 Apr 2023 13:18:59 +0200 Subject: [PATCH] Greatly simplify offensive tweet checks --- .../classifiers/TweetOffensiveEvaluator.java | 70 ++++++------------- 1 file changed, 23 insertions(+), 47 deletions(-) diff --git a/src/java/com/twitter/search/common/relevance/classifiers/TweetOffensiveEvaluator.java b/src/java/com/twitter/search/common/relevance/classifiers/TweetOffensiveEvaluator.java index 2de2bc3b5..20275f918 100644 --- a/src/java/com/twitter/search/common/relevance/classifiers/TweetOffensiveEvaluator.java +++ b/src/java/com/twitter/search/common/relevance/classifiers/TweetOffensiveEvaluator.java @@ -195,66 +195,42 @@ public class TweetOffensiveEvaluator extends TweetEvaluator { termsToCheck.add(normalizedUserName.toLowerCase()); for (String userNameToken : termsToCheck) { - if (!StringUtils.isBlank(userNameToken) && offensiveUsersFilter.filter(userNameToken)) { - return true; + if (!StringUtils.isBlank(userNameToken) && !offensiveUsersFilter.filter(userNameToken)) { + return false; } } - return false; + return true; } private boolean isTweetOffensive(final TwitterMessage tweet, - BlacklistedTopics offensiveFilter, - PenguinVersion penguinVersion) { + BlacklistedTopics offensiveFilter, + PenguinVersion penguinVersion) { + // Get the features of the tweet's text TweetTextFeatures textFeatures = tweet.getTweetTextFeatures(penguinVersion); boolean tweetHasOffensiveTerm = false; - // Check for tweet text. - List ngrams = - NGRAM_GENERATOR_HOLDER.get().generateNgramsAsTokenizedCharSequence( - textFeatures.getTokenSequence(), tweet.getLocale()); + // Generate a list of ngrams (sequences of words) from the tweet's text + List ngrams = generateNgrams(textFeatures, tweet.getLocale()); + + // Check if any ngram in the list is offensive for (TokenizedCharSequence ngram : ngrams) { - // skip URL ngram - if (!ngram.getTokensOf(TokenType.URL).isEmpty()) { - continue; - } - String ngramStr = ngram.toString(); - if (!StringUtils.isBlank(ngramStr) && offensiveFilter.filter(ngramStr)) { - tweetHasOffensiveTerm = true; - break; - } - } - - // Due to some strangeness in Penguin, we don't get ngrams for tokens around "\n-" or "-\n" - // in the original string, this made us miss some offensive words this way. Here we do another - // pass of check using just the tokens generated by the tokenizer. (See SEARCHQUAL-8907) - if (!tweetHasOffensiveTerm) { - for (String ngramStr : textFeatures.getTokens()) { - // skip URLs - if (ngramStr.startsWith("http://") || ngramStr.startsWith("https://")) { - continue; + if (isOffensiveNgram(ngram, offensiveFilter)) { + tweetHasOffensiveTerm = true; + break; } - if (!StringUtils.isBlank(ngramStr) && offensiveFilter.filter(ngramStr)) { - tweetHasOffensiveTerm = true; - break; - } - } - } - - if (!tweetHasOffensiveTerm) { - // check for resolved URLs - String resolvedUrlsText = - Joiner.on(" ").skipNulls().join(textFeatures.getResolvedUrlTokens()); - List ngramStrs = NGRAM_GENERATOR_HOLDER.get().generateNgramsAsString( - resolvedUrlsText, LocaleUtil.UNKNOWN); - for (String ngram : ngramStrs) { - if (!StringUtils.isBlank(ngram) && offensiveFilter.filter(ngram)) { - tweetHasOffensiveTerm = true; - break; - } - } } return tweetHasOffensiveTerm; } + + private boolean isOffensiveNgram(TokenizedCharSequence ngram, BlacklistedTopics offensiveFilter) { + // Skip checking if the ngram is a URL + String ngramStr = ngram.toString(); + if (!StringUtils.isBlank(ngramStr) && offensiveFilter.filter(ngramStr)) { + return true; + } + + return false; + } }