This commit is contained in:
aevitas 2023-07-17 21:40:35 -05:00 committed by GitHub
commit 87bfd24be8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 59 additions and 90 deletions

View File

@ -25,36 +25,33 @@ public class IdentifiableQueryScorer extends FilteredScorer {
@Override @Override
public DocIdSetIterator iterator() { public DocIdSetIterator iterator() {
final DocIdSetIterator superDISI = super.iterator(); DocIdSetIterator superIterator = super.iterator();
return new DocIdSetIterator() { return new DocIdSetIterator() {
@Override @Override
public int docID() { public int docID() {
return superDISI.docID(); return superIterator.docID();
} }
@Override @Override
public int nextDoc() throws IOException { public int nextDoc() throws IOException {
int docid = superDISI.nextDoc(); return collectAndReturn(superIterator.nextDoc());
if (docid != NO_MORE_DOCS) {
attrCollector.collectScorerAttribution(docid, queryId);
}
return docid;
} }
@Override @Override
public int advance(int target) throws IOException { public int advance(int target) throws IOException {
int docid = superDISI.advance(target); return collectAndReturn(superIterator.advance(target));
if (docid != NO_MORE_DOCS) {
attrCollector.collectScorerAttribution(docid, queryId);
}
return docid;
} }
@Override @Override
public long cost() { public long cost() {
return superDISI.cost(); return superIterator.cost();
} }
private int collectAndReturn(int docId) {
if (docId != NO_MORE_DOCS) {
attrCollector.collectScorerAttribution(docId, queryId);
}
}; };
} }
} }

View File

@ -15,42 +15,38 @@ import com.twitter.search.queryparser.visitors.DetectAnnotationVisitor;
* A visitor that collects node ranks from :r annotation in the query * A visitor that collects node ranks from :r annotation in the query
*/ */
public class QueryRankVisitor extends DetectAnnotationVisitor { public class QueryRankVisitor extends DetectAnnotationVisitor {
private final IdentityHashMap<Query, Integer> nodeToRankMap = Maps.newIdentityHashMap(); private final IdentityHashMap<Query, Integer> nodeToRankMap = Maps.newIdentityHashMap();
public QueryRankVisitor() { public QueryRankVisitor() {
super(Annotation.Type.NODE_RANK); super(Annotation.Type.NODE_RANK);
}
@Override
protected boolean visitBooleanQuery(BooleanQuery query) throws QueryParserException {
if (query.hasAnnotationType(Annotation.Type.NODE_RANK)) {
collectNodeRank(query.getAnnotationOf(Annotation.Type.NODE_RANK).get(), query);
} }
boolean found = false; @Override
for (Query child : query.getChildren()) { protected boolean visitBooleanQuery(BooleanQuery query) throws QueryParserException {
found |= child.accept(this); if (query.hasAnnotationType(Annotation.Type.NODE_RANK)) {
} collectNodeRank(query.getAnnotationOf(Annotation.Type.NODE_RANK).get(), query);
return found; }
}
@Override return query.getChildren().stream().anyMatch(child -> child.accept(this));
protected boolean visitQuery(Query query) throws QueryParserException {
if (query.hasAnnotationType(Annotation.Type.NODE_RANK)) {
collectNodeRank(query.getAnnotationOf(Annotation.Type.NODE_RANK).get(), query);
return true;
} }
return false; @Override
} protected boolean visitQuery(Query query) throws QueryParserException {
if (query.hasAnnotationType(Annotation.Type.NODE_RANK)) {
collectNodeRank(query.getAnnotationOf(Annotation.Type.NODE_RANK).get(), query);
return true;
}
private void collectNodeRank(Annotation anno, Query query) { return false;
Preconditions.checkArgument(anno.getType() == Annotation.Type.NODE_RANK); }
int rank = (Integer) anno.getValue();
nodeToRankMap.put(query, rank);
}
public IdentityHashMap<Query, Integer> getNodeToRankMap() { private void collectNodeRank(Annotation anno, Query query) {
return nodeToRankMap; Preconditions.checkArgument(anno.getType() == Annotation.Type.NODE_RANK);
} int rank = (Integer) anno.getValue();
nodeToRankMap.put(query, rank);
}
public IdentityHashMap<Query, Integer> getNodeToRankMap() {
return nodeToRankMap;
}
} }

View File

@ -195,66 +195,42 @@ public class TweetOffensiveEvaluator extends TweetEvaluator {
termsToCheck.add(normalizedUserName.toLowerCase()); termsToCheck.add(normalizedUserName.toLowerCase());
for (String userNameToken : termsToCheck) { for (String userNameToken : termsToCheck) {
if (!StringUtils.isBlank(userNameToken) && offensiveUsersFilter.filter(userNameToken)) { if (!StringUtils.isBlank(userNameToken) && !offensiveUsersFilter.filter(userNameToken)) {
return true; return false;
} }
} }
return false; return true;
} }
private boolean isTweetOffensive(final TwitterMessage tweet, private boolean isTweetOffensive(final TwitterMessage tweet,
BlacklistedTopics offensiveFilter, BlacklistedTopics offensiveFilter,
PenguinVersion penguinVersion) { PenguinVersion penguinVersion) {
// Get the features of the tweet's text
TweetTextFeatures textFeatures = tweet.getTweetTextFeatures(penguinVersion); TweetTextFeatures textFeatures = tweet.getTweetTextFeatures(penguinVersion);
boolean tweetHasOffensiveTerm = false; boolean tweetHasOffensiveTerm = false;
// Check for tweet text. // Generate a list of ngrams (sequences of words) from the tweet's text
List<TokenizedCharSequence> ngrams = List<TokenizedCharSequence> ngrams = generateNgrams(textFeatures, tweet.getLocale());
NGRAM_GENERATOR_HOLDER.get().generateNgramsAsTokenizedCharSequence(
textFeatures.getTokenSequence(), tweet.getLocale()); // Check if any ngram in the list is offensive
for (TokenizedCharSequence ngram : ngrams) { for (TokenizedCharSequence ngram : ngrams) {
// skip URL ngram if (isOffensiveNgram(ngram, offensiveFilter)) {
if (!ngram.getTokensOf(TokenType.URL).isEmpty()) { tweetHasOffensiveTerm = true;
continue; break;
}
String ngramStr = ngram.toString();
if (!StringUtils.isBlank(ngramStr) && offensiveFilter.filter(ngramStr)) {
tweetHasOffensiveTerm = true;
break;
}
}
// Due to some strangeness in Penguin, we don't get ngrams for tokens around "\n-" or "-\n"
// in the original string, this made us miss some offensive words this way. Here we do another
// pass of check using just the tokens generated by the tokenizer. (See SEARCHQUAL-8907)
if (!tweetHasOffensiveTerm) {
for (String ngramStr : textFeatures.getTokens()) {
// skip URLs
if (ngramStr.startsWith("http://") || ngramStr.startsWith("https://")) {
continue;
} }
if (!StringUtils.isBlank(ngramStr) && offensiveFilter.filter(ngramStr)) {
tweetHasOffensiveTerm = true;
break;
}
}
}
if (!tweetHasOffensiveTerm) {
// check for resolved URLs
String resolvedUrlsText =
Joiner.on(" ").skipNulls().join(textFeatures.getResolvedUrlTokens());
List<String> ngramStrs = NGRAM_GENERATOR_HOLDER.get().generateNgramsAsString(
resolvedUrlsText, LocaleUtil.UNKNOWN);
for (String ngram : ngramStrs) {
if (!StringUtils.isBlank(ngram) && offensiveFilter.filter(ngram)) {
tweetHasOffensiveTerm = true;
break;
}
}
} }
return tweetHasOffensiveTerm; return tweetHasOffensiveTerm;
} }
private boolean isOffensiveNgram(TokenizedCharSequence ngram, BlacklistedTopics offensiveFilter) {
// Skip checking if the ngram is a URL
String ngramStr = ngram.toString();
if (!StringUtils.isBlank(ngramStr) && offensiveFilter.filter(ngramStr)) {
return true;
}
return false;
}
} }