This commit is contained in:
aevitas 2023-07-17 21:40:35 -05:00 committed by GitHub
commit 87bfd24be8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 59 additions and 90 deletions

View File

@@ -25,35 +25,32 @@ public class IdentifiableQueryScorer extends FilteredScorer {
@Override
public DocIdSetIterator iterator() {
final DocIdSetIterator superDISI = super.iterator();
DocIdSetIterator superIterator = super.iterator();
return new DocIdSetIterator() {
@Override
public int docID() {
return superDISI.docID();
return superIterator.docID();
}
@Override
public int nextDoc() throws IOException {
int docid = superDISI.nextDoc();
if (docid != NO_MORE_DOCS) {
attrCollector.collectScorerAttribution(docid, queryId);
}
return docid;
return collectAndReturn(superIterator.nextDoc());
}
@Override
public int advance(int target) throws IOException {
int docid = superDISI.advance(target);
if (docid != NO_MORE_DOCS) {
attrCollector.collectScorerAttribution(docid, queryId);
}
return docid;
return collectAndReturn(superIterator.advance(target));
}
@Override
public long cost() {
return superDISI.cost();
return superIterator.cost();
}
private int collectAndReturn(int docId) {
if (docId != NO_MORE_DOCS) {
attrCollector.collectScorerAttribution(docId, queryId);
}
};
}

View File

@@ -27,11 +27,7 @@ public class QueryRankVisitor extends DetectAnnotationVisitor {
collectNodeRank(query.getAnnotationOf(Annotation.Type.NODE_RANK).get(), query);
}
boolean found = false;
for (Query child : query.getChildren()) {
found |= child.accept(this);
}
return found;
return query.getChildren().stream().anyMatch(child -> child.accept(this));
}
@Override

View File

@@ -195,66 +195,42 @@ public class TweetOffensiveEvaluator extends TweetEvaluator {
termsToCheck.add(normalizedUserName.toLowerCase());
for (String userNameToken : termsToCheck) {
if (!StringUtils.isBlank(userNameToken) && offensiveUsersFilter.filter(userNameToken)) {
return true;
}
}
if (!StringUtils.isBlank(userNameToken) && !offensiveUsersFilter.filter(userNameToken)) {
return false;
}
}
return true;
}
private boolean isTweetOffensive(final TwitterMessage tweet,
BlacklistedTopics offensiveFilter,
PenguinVersion penguinVersion) {
// Get the features of the tweet's text
TweetTextFeatures textFeatures = tweet.getTweetTextFeatures(penguinVersion);
boolean tweetHasOffensiveTerm = false;
// Check for tweet text.
List<TokenizedCharSequence> ngrams =
NGRAM_GENERATOR_HOLDER.get().generateNgramsAsTokenizedCharSequence(
textFeatures.getTokenSequence(), tweet.getLocale());
// Generate a list of ngrams (sequences of words) from the tweet's text
List<TokenizedCharSequence> ngrams = generateNgrams(textFeatures, tweet.getLocale());
// Check if any ngram in the list is offensive
for (TokenizedCharSequence ngram : ngrams) {
// skip URL ngram
if (!ngram.getTokensOf(TokenType.URL).isEmpty()) {
continue;
}
String ngramStr = ngram.toString();
if (!StringUtils.isBlank(ngramStr) && offensiveFilter.filter(ngramStr)) {
if (isOffensiveNgram(ngram, offensiveFilter)) {
tweetHasOffensiveTerm = true;
break;
}
}
// Due to some strangeness in Penguin, we don't get ngrams for tokens around "\n-" or "-\n"
// in the original string, this made us miss some offensive words this way. Here we do another
// pass of check using just the tokens generated by the tokenizer. (See SEARCHQUAL-8907)
if (!tweetHasOffensiveTerm) {
for (String ngramStr : textFeatures.getTokens()) {
// skip URLs
if (ngramStr.startsWith("http://") || ngramStr.startsWith("https://")) {
continue;
}
if (!StringUtils.isBlank(ngramStr) && offensiveFilter.filter(ngramStr)) {
tweetHasOffensiveTerm = true;
break;
}
}
}
if (!tweetHasOffensiveTerm) {
// check for resolved URLs
String resolvedUrlsText =
Joiner.on(" ").skipNulls().join(textFeatures.getResolvedUrlTokens());
List<String> ngramStrs = NGRAM_GENERATOR_HOLDER.get().generateNgramsAsString(
resolvedUrlsText, LocaleUtil.UNKNOWN);
for (String ngram : ngramStrs) {
if (!StringUtils.isBlank(ngram) && offensiveFilter.filter(ngram)) {
tweetHasOffensiveTerm = true;
break;
}
}
}
return tweetHasOffensiveTerm;
}
/**
 * Tests a single ngram against the filter.
 *
 * @param ngram the ngram to test
 * @param offensiveFilter the filter the ngram's string form is tested against
 * @return {@code false} for ngrams that contain a URL token or whose string form is blank;
 *     otherwise the result of {@code offensiveFilter.filter(...)} on the string form
 */
private boolean isOffensiveNgram(TokenizedCharSequence ngram, BlacklistedTopics offensiveFilter) {
  // Skip checking if the ngram is a URL
  if (!ngram.getTokensOf(TokenType.URL).isEmpty()) {
    return false;
  }
  String ngramStr = ngram.toString();
  return !StringUtils.isBlank(ngramStr) && offensiveFilter.filter(ngramStr);
}
}