mirror of
https://github.com/twitter/the-algorithm.git
synced 2025-01-24 18:01:16 +01:00
Greatly simplify offensive tweet checks
This commit is contained in:
parent
e15f5a1b61
commit
becdf494db
@ -195,66 +195,42 @@ public class TweetOffensiveEvaluator extends TweetEvaluator {
|
||||
termsToCheck.add(normalizedUserName.toLowerCase());
|
||||
|
||||
for (String userNameToken : termsToCheck) {
|
||||
if (!StringUtils.isBlank(userNameToken) && offensiveUsersFilter.filter(userNameToken)) {
|
||||
return true;
|
||||
if (!StringUtils.isBlank(userNameToken) && !offensiveUsersFilter.filter(userNameToken)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean isTweetOffensive(final TwitterMessage tweet,
|
||||
BlacklistedTopics offensiveFilter,
|
||||
PenguinVersion penguinVersion) {
|
||||
BlacklistedTopics offensiveFilter,
|
||||
PenguinVersion penguinVersion) {
|
||||
// Get the features of the tweet's text
|
||||
TweetTextFeatures textFeatures = tweet.getTweetTextFeatures(penguinVersion);
|
||||
|
||||
boolean tweetHasOffensiveTerm = false;
|
||||
|
||||
// Check for tweet text.
|
||||
List<TokenizedCharSequence> ngrams =
|
||||
NGRAM_GENERATOR_HOLDER.get().generateNgramsAsTokenizedCharSequence(
|
||||
textFeatures.getTokenSequence(), tweet.getLocale());
|
||||
// Generate a list of ngrams (sequences of words) from the tweet's text
|
||||
List<TokenizedCharSequence> ngrams = generateNgrams(textFeatures, tweet.getLocale());
|
||||
|
||||
// Check if any ngram in the list is offensive
|
||||
for (TokenizedCharSequence ngram : ngrams) {
|
||||
// skip URL ngram
|
||||
if (!ngram.getTokensOf(TokenType.URL).isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
String ngramStr = ngram.toString();
|
||||
if (!StringUtils.isBlank(ngramStr) && offensiveFilter.filter(ngramStr)) {
|
||||
tweetHasOffensiveTerm = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Due to some strangeness in Penguin, we don't get ngrams for tokens around "\n-" or "-\n"
|
||||
// in the original string, this made us miss some offensive words this way. Here we do another
|
||||
// pass of check using just the tokens generated by the tokenizer. (See SEARCHQUAL-8907)
|
||||
if (!tweetHasOffensiveTerm) {
|
||||
for (String ngramStr : textFeatures.getTokens()) {
|
||||
// skip URLs
|
||||
if (ngramStr.startsWith("http://") || ngramStr.startsWith("https://")) {
|
||||
continue;
|
||||
if (isOffensiveNgram(ngram, offensiveFilter)) {
|
||||
tweetHasOffensiveTerm = true;
|
||||
break;
|
||||
}
|
||||
if (!StringUtils.isBlank(ngramStr) && offensiveFilter.filter(ngramStr)) {
|
||||
tweetHasOffensiveTerm = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!tweetHasOffensiveTerm) {
|
||||
// check for resolved URLs
|
||||
String resolvedUrlsText =
|
||||
Joiner.on(" ").skipNulls().join(textFeatures.getResolvedUrlTokens());
|
||||
List<String> ngramStrs = NGRAM_GENERATOR_HOLDER.get().generateNgramsAsString(
|
||||
resolvedUrlsText, LocaleUtil.UNKNOWN);
|
||||
for (String ngram : ngramStrs) {
|
||||
if (!StringUtils.isBlank(ngram) && offensiveFilter.filter(ngram)) {
|
||||
tweetHasOffensiveTerm = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return tweetHasOffensiveTerm;
|
||||
}
|
||||
|
||||
/**
 * Checks a single ngram against the given offensive-term filter.
 *
 * <p>The ngram is converted to its string form; the result is {@code true} only when that
 * string is not blank and {@code offensiveFilter.filter(...)} returns {@code true} for it.
 *
 * @param ngram the ngram to test; its {@code toString()} value is what gets filtered
 * @param offensiveFilter the filter consulted for the ngram's string form
 * @return {@code true} if the non-blank ngram string matches the filter, {@code false} otherwise
 */
private boolean isOffensiveNgram(TokenizedCharSequence ngram, BlacklistedTopics offensiveFilter) {
|
||||
// NOTE(review): this method performs no URL check of its own; if URL ngrams must be
// skipped, confirm that the caller filters them out before invoking this method.
|
||||
String ngramStr = ngram.toString();
|
||||
if (!StringUtils.isBlank(ngramStr) && offensiveFilter.filter(ngramStr)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user