From d4245582a75182f33e2b82b5a886477fd650700d Mon Sep 17 00:00:00 2001 From: aevitas Date: Sat, 1 Apr 2023 13:01:36 +0200 Subject: [PATCH 1/3] fix it --- .../search/common/query/QueryRankVisitor.java | 56 +++++++++---------- 1 file changed, 26 insertions(+), 30 deletions(-) diff --git a/src/java/com/twitter/search/common/query/QueryRankVisitor.java b/src/java/com/twitter/search/common/query/QueryRankVisitor.java index e6f657f6a..c221a03a3 100644 --- a/src/java/com/twitter/search/common/query/QueryRankVisitor.java +++ b/src/java/com/twitter/search/common/query/QueryRankVisitor.java @@ -15,42 +15,38 @@ import com.twitter.search.queryparser.visitors.DetectAnnotationVisitor; * A visitor that collects node ranks from :r annotation in the query */ public class QueryRankVisitor extends DetectAnnotationVisitor { - private final IdentityHashMap nodeToRankMap = Maps.newIdentityHashMap(); + private final IdentityHashMap nodeToRankMap = Maps.newIdentityHashMap(); - public QueryRankVisitor() { - super(Annotation.Type.NODE_RANK); - } - - @Override - protected boolean visitBooleanQuery(BooleanQuery query) throws QueryParserException { - if (query.hasAnnotationType(Annotation.Type.NODE_RANK)) { - collectNodeRank(query.getAnnotationOf(Annotation.Type.NODE_RANK).get(), query); + public QueryRankVisitor() { + super(Annotation.Type.NODE_RANK); } - boolean found = false; - for (Query child : query.getChildren()) { - found |= child.accept(this); - } - return found; - } + @Override + protected boolean visitBooleanQuery(BooleanQuery query) throws QueryParserException { + if (query.hasAnnotationType(Annotation.Type.NODE_RANK)) { + collectNodeRank(query.getAnnotationOf(Annotation.Type.NODE_RANK).get(), query); + } - @Override - protected boolean visitQuery(Query query) throws QueryParserException { - if (query.hasAnnotationType(Annotation.Type.NODE_RANK)) { - collectNodeRank(query.getAnnotationOf(Annotation.Type.NODE_RANK).get(), query); - return true; + return query.getChildren().stream().anyMatch(child -> child.accept(this)); } - return false; - } + @Override + protected boolean visitQuery(Query query) throws QueryParserException { + if (query.hasAnnotationType(Annotation.Type.NODE_RANK)) { + collectNodeRank(query.getAnnotationOf(Annotation.Type.NODE_RANK).get(), query); + return true; + } - private void collectNodeRank(Annotation anno, Query query) { - Preconditions.checkArgument(anno.getType() == Annotation.Type.NODE_RANK); - int rank = (Integer) anno.getValue(); - nodeToRankMap.put(query, rank); - } + return false; + } - public IdentityHashMap getNodeToRankMap() { - return nodeToRankMap; - } + private void collectNodeRank(Annotation anno, Query query) { + Preconditions.checkArgument(anno.getType() == Annotation.Type.NODE_RANK); + int rank = (Integer) anno.getValue(); + nodeToRankMap.put(query, rank); + } + + public IdentityHashMap getNodeToRankMap() { + return nodeToRankMap; + } } From e15f5a1b612055caf2b5800eb40827770d43c392 Mon Sep 17 00:00:00 2001 From: aevitas Date: Sat, 1 Apr 2023 13:11:45 +0200 Subject: [PATCH 2/3] Update IdentifiableQueryScorer.java --- .../common/query/IdentifiableQueryScorer.java | 23 ++++++++----------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/src/java/com/twitter/search/common/query/IdentifiableQueryScorer.java b/src/java/com/twitter/search/common/query/IdentifiableQueryScorer.java index 98c8340eb..bede1d632 100644 --- a/src/java/com/twitter/search/common/query/IdentifiableQueryScorer.java +++ b/src/java/com/twitter/search/common/query/IdentifiableQueryScorer.java @@ -25,36 +25,33 @@ public class IdentifiableQueryScorer extends FilteredScorer { @Override public DocIdSetIterator iterator() { - final DocIdSetIterator superDISI = super.iterator(); + DocIdSetIterator superIterator = super.iterator(); return new DocIdSetIterator() { @Override public int docID() { - return superDISI.docID(); + return superIterator.docID(); } @Override public int nextDoc() throws IOException { - int docid = superDISI.nextDoc(); - if (docid != NO_MORE_DOCS) { - attrCollector.collectScorerAttribution(docid, queryId); - } - return docid; + return collectAndReturn(superIterator.nextDoc()); } @Override public int advance(int target) throws IOException { - int docid = superDISI.advance(target); - if (docid != NO_MORE_DOCS) { - attrCollector.collectScorerAttribution(docid, queryId); - } - return docid; + return collectAndReturn(superIterator.advance(target)); } @Override public long cost() { - return superDISI.cost(); + return superIterator.cost(); } + + private int collectAndReturn(int docId) { + if (docId != NO_MORE_DOCS) { + attrCollector.collectScorerAttribution(docId, queryId); + } }; } } From becdf494dbfc289fcc4c0e27eac5ee6f0ddd3d33 Mon Sep 17 00:00:00 2001 From: aevitas Date: Sat, 1 Apr 2023 13:18:59 +0200 Subject: [PATCH 3/3] Greatly simplify offensive tweet checks --- .../classifiers/TweetOffensiveEvaluator.java | 70 ++++++------------- 1 file changed, 23 insertions(+), 47 deletions(-) diff --git a/src/java/com/twitter/search/common/relevance/classifiers/TweetOffensiveEvaluator.java b/src/java/com/twitter/search/common/relevance/classifiers/TweetOffensiveEvaluator.java index 2de2bc3b5..20275f918 100644 --- a/src/java/com/twitter/search/common/relevance/classifiers/TweetOffensiveEvaluator.java +++ b/src/java/com/twitter/search/common/relevance/classifiers/TweetOffensiveEvaluator.java @@ -195,66 +195,42 @@ public class TweetOffensiveEvaluator extends TweetEvaluator { termsToCheck.add(normalizedUserName.toLowerCase()); for (String userNameToken : termsToCheck) { - if (!StringUtils.isBlank(userNameToken) && offensiveUsersFilter.filter(userNameToken)) { - return true; + if (!StringUtils.isBlank(userNameToken) && !offensiveUsersFilter.filter(userNameToken)) { + return false; } } - return false; + return true; } private boolean isTweetOffensive(final TwitterMessage tweet, - BlacklistedTopics offensiveFilter, - PenguinVersion penguinVersion) { + BlacklistedTopics offensiveFilter, + PenguinVersion penguinVersion) { + // Get the features of the tweet's text TweetTextFeatures textFeatures = tweet.getTweetTextFeatures(penguinVersion); boolean tweetHasOffensiveTerm = false; - // Check for tweet text. - List ngrams = - NGRAM_GENERATOR_HOLDER.get().generateNgramsAsTokenizedCharSequence( - textFeatures.getTokenSequence(), tweet.getLocale()); + // Generate a list of ngrams (sequences of words) from the tweet's text + List ngrams = generateNgrams(textFeatures, tweet.getLocale()); + + // Check if any ngram in the list is offensive for (TokenizedCharSequence ngram : ngrams) { - // skip URL ngram - if (!ngram.getTokensOf(TokenType.URL).isEmpty()) { - continue; - } - String ngramStr = ngram.toString(); - if (!StringUtils.isBlank(ngramStr) && offensiveFilter.filter(ngramStr)) { - tweetHasOffensiveTerm = true; - break; - } - } - - // Due to some strangeness in Penguin, we don't get ngrams for tokens around "\n-" or "-\n" - // in the original string, this made us miss some offensive words this way. Here we do another - // pass of check using just the tokens generated by the tokenizer. (See SEARCHQUAL-8907) - if (!tweetHasOffensiveTerm) { - for (String ngramStr : textFeatures.getTokens()) { - // skip URLs - if (ngramStr.startsWith("http://") || ngramStr.startsWith("https://")) { - continue; + if (isOffensiveNgram(ngram, offensiveFilter)) { + tweetHasOffensiveTerm = true; + break; } - if (!StringUtils.isBlank(ngramStr) && offensiveFilter.filter(ngramStr)) { - tweetHasOffensiveTerm = true; - break; - } - } - } - - if (!tweetHasOffensiveTerm) { - // check for resolved URLs - String resolvedUrlsText = - Joiner.on(" ").skipNulls().join(textFeatures.getResolvedUrlTokens()); - List ngramStrs = NGRAM_GENERATOR_HOLDER.get().generateNgramsAsString( - resolvedUrlsText, LocaleUtil.UNKNOWN); - for (String ngram : ngramStrs) { - if (!StringUtils.isBlank(ngram) && offensiveFilter.filter(ngram)) { - tweetHasOffensiveTerm = true; - break; - } - } } return tweetHasOffensiveTerm; } + + private boolean isOffensiveNgram(TokenizedCharSequence ngram, BlacklistedTopics offensiveFilter) { + // Skip checking if the ngram is a URL + String ngramStr = ngram.toString(); + if (!StringUtils.isBlank(ngramStr) && offensiveFilter.filter(ngramStr)) { + return true; + } + + return false; + } }