This commit is contained in:
aevitas 2023-07-17 21:40:35 -05:00 committed by GitHub
commit 87bfd24be8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 59 additions and 90 deletions

View File

@ -25,36 +25,33 @@ public class IdentifiableQueryScorer extends FilteredScorer {
@Override
public DocIdSetIterator iterator() {
final DocIdSetIterator superDISI = super.iterator();
DocIdSetIterator superIterator = super.iterator();
return new DocIdSetIterator() {
@Override
public int docID() {
return superDISI.docID();
return superIterator.docID();
}
@Override
public int nextDoc() throws IOException {
int docid = superDISI.nextDoc();
if (docid != NO_MORE_DOCS) {
attrCollector.collectScorerAttribution(docid, queryId);
}
return docid;
return collectAndReturn(superIterator.nextDoc());
}
@Override
public int advance(int target) throws IOException {
int docid = superDISI.advance(target);
if (docid != NO_MORE_DOCS) {
attrCollector.collectScorerAttribution(docid, queryId);
}
return docid;
return collectAndReturn(superIterator.advance(target));
}
@Override
public long cost() {
return superDISI.cost();
return superIterator.cost();
}
private int collectAndReturn(int docId) {
if (docId != NO_MORE_DOCS) {
attrCollector.collectScorerAttribution(docId, queryId);
}
};
}
}

View File

@ -15,42 +15,38 @@ import com.twitter.search.queryparser.visitors.DetectAnnotationVisitor;
* A visitor that collects node ranks from :r annotation in the query
*/
public class QueryRankVisitor extends DetectAnnotationVisitor {
private final IdentityHashMap<Query, Integer> nodeToRankMap = Maps.newIdentityHashMap();
private final IdentityHashMap<Query, Integer> nodeToRankMap = Maps.newIdentityHashMap();
public QueryRankVisitor() {
super(Annotation.Type.NODE_RANK);
}
@Override
protected boolean visitBooleanQuery(BooleanQuery query) throws QueryParserException {
if (query.hasAnnotationType(Annotation.Type.NODE_RANK)) {
collectNodeRank(query.getAnnotationOf(Annotation.Type.NODE_RANK).get(), query);
public QueryRankVisitor() {
super(Annotation.Type.NODE_RANK);
}
boolean found = false;
for (Query child : query.getChildren()) {
found |= child.accept(this);
}
return found;
}
@Override
protected boolean visitBooleanQuery(BooleanQuery query) throws QueryParserException {
if (query.hasAnnotationType(Annotation.Type.NODE_RANK)) {
collectNodeRank(query.getAnnotationOf(Annotation.Type.NODE_RANK).get(), query);
}
@Override
protected boolean visitQuery(Query query) throws QueryParserException {
if (query.hasAnnotationType(Annotation.Type.NODE_RANK)) {
collectNodeRank(query.getAnnotationOf(Annotation.Type.NODE_RANK).get(), query);
return true;
return query.getChildren().stream().anyMatch(child -> child.accept(this));
}
return false;
}
@Override
protected boolean visitQuery(Query query) throws QueryParserException {
if (query.hasAnnotationType(Annotation.Type.NODE_RANK)) {
collectNodeRank(query.getAnnotationOf(Annotation.Type.NODE_RANK).get(), query);
return true;
}
private void collectNodeRank(Annotation anno, Query query) {
Preconditions.checkArgument(anno.getType() == Annotation.Type.NODE_RANK);
int rank = (Integer) anno.getValue();
nodeToRankMap.put(query, rank);
}
return false;
}
public IdentityHashMap<Query, Integer> getNodeToRankMap() {
return nodeToRankMap;
}
private void collectNodeRank(Annotation anno, Query query) {
Preconditions.checkArgument(anno.getType() == Annotation.Type.NODE_RANK);
int rank = (Integer) anno.getValue();
nodeToRankMap.put(query, rank);
}
public IdentityHashMap<Query, Integer> getNodeToRankMap() {
return nodeToRankMap;
}
}

View File

@ -195,66 +195,42 @@ public class TweetOffensiveEvaluator extends TweetEvaluator {
termsToCheck.add(normalizedUserName.toLowerCase());
for (String userNameToken : termsToCheck) {
if (!StringUtils.isBlank(userNameToken) && offensiveUsersFilter.filter(userNameToken)) {
return true;
if (!StringUtils.isBlank(userNameToken) && !offensiveUsersFilter.filter(userNameToken)) {
return false;
}
}
return false;
return true;
}
private boolean isTweetOffensive(final TwitterMessage tweet,
BlacklistedTopics offensiveFilter,
PenguinVersion penguinVersion) {
BlacklistedTopics offensiveFilter,
PenguinVersion penguinVersion) {
// Get the features of the tweet's text
TweetTextFeatures textFeatures = tweet.getTweetTextFeatures(penguinVersion);
boolean tweetHasOffensiveTerm = false;
// Check for tweet text.
List<TokenizedCharSequence> ngrams =
NGRAM_GENERATOR_HOLDER.get().generateNgramsAsTokenizedCharSequence(
textFeatures.getTokenSequence(), tweet.getLocale());
// Generate a list of ngrams (sequences of words) from the tweet's text
List<TokenizedCharSequence> ngrams = generateNgrams(textFeatures, tweet.getLocale());
// Check if any ngram in the list is offensive
for (TokenizedCharSequence ngram : ngrams) {
// skip URL ngram
if (!ngram.getTokensOf(TokenType.URL).isEmpty()) {
continue;
}
String ngramStr = ngram.toString();
if (!StringUtils.isBlank(ngramStr) && offensiveFilter.filter(ngramStr)) {
tweetHasOffensiveTerm = true;
break;
}
}
// Due to some strangeness in Penguin, we don't get ngrams for tokens around "\n-" or "-\n"
// in the original string, this made us miss some offensive words this way. Here we do another
// pass of check using just the tokens generated by the tokenizer. (See SEARCHQUAL-8907)
if (!tweetHasOffensiveTerm) {
for (String ngramStr : textFeatures.getTokens()) {
// skip URLs
if (ngramStr.startsWith("http://") || ngramStr.startsWith("https://")) {
continue;
if (isOffensiveNgram(ngram, offensiveFilter)) {
tweetHasOffensiveTerm = true;
break;
}
if (!StringUtils.isBlank(ngramStr) && offensiveFilter.filter(ngramStr)) {
tweetHasOffensiveTerm = true;
break;
}
}
}
if (!tweetHasOffensiveTerm) {
// check for resolved URLs
String resolvedUrlsText =
Joiner.on(" ").skipNulls().join(textFeatures.getResolvedUrlTokens());
List<String> ngramStrs = NGRAM_GENERATOR_HOLDER.get().generateNgramsAsString(
resolvedUrlsText, LocaleUtil.UNKNOWN);
for (String ngram : ngramStrs) {
if (!StringUtils.isBlank(ngram) && offensiveFilter.filter(ngram)) {
tweetHasOffensiveTerm = true;
break;
}
}
}
return tweetHasOffensiveTerm;
}
private boolean isOffensiveNgram(TokenizedCharSequence ngram, BlacklistedTopics offensiveFilter) {
// Skip checking if the ngram is a URL
String ngramStr = ngram.toString();
if (!StringUtils.isBlank(ngramStr) && offensiveFilter.filter(ngramStr)) {
return true;
}
return false;
}
}