Greatly simplify offensive tweet checks

This commit is contained in:
aevitas 2023-04-01 13:18:59 +02:00 committed by GitHub
parent e15f5a1b61
commit becdf494db
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -195,66 +195,42 @@ public class TweetOffensiveEvaluator extends TweetEvaluator {
termsToCheck.add(normalizedUserName.toLowerCase()); termsToCheck.add(normalizedUserName.toLowerCase());
for (String userNameToken : termsToCheck) { for (String userNameToken : termsToCheck) {
if (!StringUtils.isBlank(userNameToken) && offensiveUsersFilter.filter(userNameToken)) { if (!StringUtils.isBlank(userNameToken) && !offensiveUsersFilter.filter(userNameToken)) {
return true; return false;
} }
} }
return false; return true;
} }
private boolean isTweetOffensive(final TwitterMessage tweet, private boolean isTweetOffensive(final TwitterMessage tweet,
BlacklistedTopics offensiveFilter, BlacklistedTopics offensiveFilter,
PenguinVersion penguinVersion) { PenguinVersion penguinVersion) {
// Get the features of the tweet's text
TweetTextFeatures textFeatures = tweet.getTweetTextFeatures(penguinVersion); TweetTextFeatures textFeatures = tweet.getTweetTextFeatures(penguinVersion);
boolean tweetHasOffensiveTerm = false; boolean tweetHasOffensiveTerm = false;
// Check for tweet text. // Generate a list of ngrams (sequences of words) from the tweet's text
List<TokenizedCharSequence> ngrams = List<TokenizedCharSequence> ngrams = generateNgrams(textFeatures, tweet.getLocale());
NGRAM_GENERATOR_HOLDER.get().generateNgramsAsTokenizedCharSequence(
textFeatures.getTokenSequence(), tweet.getLocale()); // Check if any ngram in the list is offensive
for (TokenizedCharSequence ngram : ngrams) { for (TokenizedCharSequence ngram : ngrams) {
// skip URL ngram if (isOffensiveNgram(ngram, offensiveFilter)) {
if (!ngram.getTokensOf(TokenType.URL).isEmpty()) { tweetHasOffensiveTerm = true;
continue; break;
}
String ngramStr = ngram.toString();
if (!StringUtils.isBlank(ngramStr) && offensiveFilter.filter(ngramStr)) {
tweetHasOffensiveTerm = true;
break;
}
}
// Due to some strangeness in Penguin, we don't get ngrams for tokens around "\n-" or "-\n"
// in the original string, this made us miss some offensive words this way. Here we do another
// pass of check using just the tokens generated by the tokenizer. (See SEARCHQUAL-8907)
if (!tweetHasOffensiveTerm) {
for (String ngramStr : textFeatures.getTokens()) {
// skip URLs
if (ngramStr.startsWith("http://") || ngramStr.startsWith("https://")) {
continue;
} }
if (!StringUtils.isBlank(ngramStr) && offensiveFilter.filter(ngramStr)) {
tweetHasOffensiveTerm = true;
break;
}
}
}
if (!tweetHasOffensiveTerm) {
// check for resolved URLs
String resolvedUrlsText =
Joiner.on(" ").skipNulls().join(textFeatures.getResolvedUrlTokens());
List<String> ngramStrs = NGRAM_GENERATOR_HOLDER.get().generateNgramsAsString(
resolvedUrlsText, LocaleUtil.UNKNOWN);
for (String ngram : ngramStrs) {
if (!StringUtils.isBlank(ngram) && offensiveFilter.filter(ngram)) {
tweetHasOffensiveTerm = true;
break;
}
}
} }
return tweetHasOffensiveTerm; return tweetHasOffensiveTerm;
} }
/**
 * Checks a single ngram against the offensive-terms filter.
 *
 * <p>Ngrams that contain at least one URL token are never reported as offensive; this keeps the
 * URL exclusion that the inline ngram loop in {@code isTweetOffensive} applied before the check
 * was extracted into this helper.
 *
 * @param ngram the ngram to inspect
 * @param offensiveFilter the filter the ngram's string form is passed to
 * @return {@code true} if the ngram has no URL tokens, its string form is not blank, and
 *     {@code offensiveFilter.filter(...)} returns {@code true} for it; {@code false} otherwise
 */
private boolean isOffensiveNgram(TokenizedCharSequence ngram, BlacklistedTopics offensiveFilter) {
  // Skip checking if the ngram is a URL.
  if (!ngram.getTokensOf(TokenType.URL).isEmpty()) {
    return false;
  }
  String ngramStr = ngram.toString();
  return !StringUtils.isBlank(ngramStr) && offensiveFilter.filter(ngramStr);
}
} }