mirror of
https://github.com/twitter/the-algorithm.git
synced 2024-12-22 10:11:52 +01:00
Greatly simplify offensive tweet checks
This commit is contained in:
parent
e15f5a1b61
commit
becdf494db
@ -195,66 +195,42 @@ public class TweetOffensiveEvaluator extends TweetEvaluator {
|
|||||||
termsToCheck.add(normalizedUserName.toLowerCase());
|
termsToCheck.add(normalizedUserName.toLowerCase());
|
||||||
|
|
||||||
for (String userNameToken : termsToCheck) {
|
for (String userNameToken : termsToCheck) {
|
||||||
if (!StringUtils.isBlank(userNameToken) && offensiveUsersFilter.filter(userNameToken)) {
|
if (!StringUtils.isBlank(userNameToken) && !offensiveUsersFilter.filter(userNameToken)) {
|
||||||
return true;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return false;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean isTweetOffensive(final TwitterMessage tweet,
|
private boolean isTweetOffensive(final TwitterMessage tweet,
|
||||||
BlacklistedTopics offensiveFilter,
|
BlacklistedTopics offensiveFilter,
|
||||||
PenguinVersion penguinVersion) {
|
PenguinVersion penguinVersion) {
|
||||||
|
// Get the features of the tweet's text
|
||||||
TweetTextFeatures textFeatures = tweet.getTweetTextFeatures(penguinVersion);
|
TweetTextFeatures textFeatures = tweet.getTweetTextFeatures(penguinVersion);
|
||||||
|
|
||||||
boolean tweetHasOffensiveTerm = false;
|
boolean tweetHasOffensiveTerm = false;
|
||||||
|
|
||||||
// Check for tweet text.
|
// Generate a list of ngrams (sequences of words) from the tweet's text
|
||||||
List<TokenizedCharSequence> ngrams =
|
List<TokenizedCharSequence> ngrams = generateNgrams(textFeatures, tweet.getLocale());
|
||||||
NGRAM_GENERATOR_HOLDER.get().generateNgramsAsTokenizedCharSequence(
|
|
||||||
textFeatures.getTokenSequence(), tweet.getLocale());
|
// Check if any ngram in the list is offensive
|
||||||
for (TokenizedCharSequence ngram : ngrams) {
|
for (TokenizedCharSequence ngram : ngrams) {
|
||||||
// skip URL ngram
|
if (isOffensiveNgram(ngram, offensiveFilter)) {
|
||||||
if (!ngram.getTokensOf(TokenType.URL).isEmpty()) {
|
tweetHasOffensiveTerm = true;
|
||||||
continue;
|
break;
|
||||||
}
|
|
||||||
String ngramStr = ngram.toString();
|
|
||||||
if (!StringUtils.isBlank(ngramStr) && offensiveFilter.filter(ngramStr)) {
|
|
||||||
tweetHasOffensiveTerm = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Due to some strangeness in Penguin, we don't get ngrams for tokens around "\n-" or "-\n"
|
|
||||||
// in the original string, this made us miss some offensive words this way. Here we do another
|
|
||||||
// pass of check using just the tokens generated by the tokenizer. (See SEARCHQUAL-8907)
|
|
||||||
if (!tweetHasOffensiveTerm) {
|
|
||||||
for (String ngramStr : textFeatures.getTokens()) {
|
|
||||||
// skip URLs
|
|
||||||
if (ngramStr.startsWith("http://") || ngramStr.startsWith("https://")) {
|
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
if (!StringUtils.isBlank(ngramStr) && offensiveFilter.filter(ngramStr)) {
|
|
||||||
tweetHasOffensiveTerm = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!tweetHasOffensiveTerm) {
|
|
||||||
// check for resolved URLs
|
|
||||||
String resolvedUrlsText =
|
|
||||||
Joiner.on(" ").skipNulls().join(textFeatures.getResolvedUrlTokens());
|
|
||||||
List<String> ngramStrs = NGRAM_GENERATOR_HOLDER.get().generateNgramsAsString(
|
|
||||||
resolvedUrlsText, LocaleUtil.UNKNOWN);
|
|
||||||
for (String ngram : ngramStrs) {
|
|
||||||
if (!StringUtils.isBlank(ngram) && offensiveFilter.filter(ngram)) {
|
|
||||||
tweetHasOffensiveTerm = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return tweetHasOffensiveTerm;
|
return tweetHasOffensiveTerm;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private boolean isOffensiveNgram(TokenizedCharSequence ngram, BlacklistedTopics offensiveFilter) {
|
||||||
|
// Skip checking if the ngram is a URL
|
||||||
|
String ngramStr = ngram.toString();
|
||||||
|
if (!StringUtils.isBlank(ngramStr) && offensiveFilter.filter(ngramStr)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user