mirror of
https://github.com/twitter/the-algorithm.git
synced 2024-12-22 18:21:51 +01:00
Merge becdf494db
into 72eda9a24f
This commit is contained in:
commit
87bfd24be8
@ -25,35 +25,32 @@ public class IdentifiableQueryScorer extends FilteredScorer {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public DocIdSetIterator iterator() {
|
public DocIdSetIterator iterator() {
|
||||||
final DocIdSetIterator superDISI = super.iterator();
|
DocIdSetIterator superIterator = super.iterator();
|
||||||
|
|
||||||
return new DocIdSetIterator() {
|
return new DocIdSetIterator() {
|
||||||
@Override
|
@Override
|
||||||
public int docID() {
|
public int docID() {
|
||||||
return superDISI.docID();
|
return superIterator.docID();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int nextDoc() throws IOException {
|
public int nextDoc() throws IOException {
|
||||||
int docid = superDISI.nextDoc();
|
return collectAndReturn(superIterator.nextDoc());
|
||||||
if (docid != NO_MORE_DOCS) {
|
|
||||||
attrCollector.collectScorerAttribution(docid, queryId);
|
|
||||||
}
|
|
||||||
return docid;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int advance(int target) throws IOException {
|
public int advance(int target) throws IOException {
|
||||||
int docid = superDISI.advance(target);
|
return collectAndReturn(superIterator.advance(target));
|
||||||
if (docid != NO_MORE_DOCS) {
|
|
||||||
attrCollector.collectScorerAttribution(docid, queryId);
|
|
||||||
}
|
|
||||||
return docid;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public long cost() {
|
public long cost() {
|
||||||
return superDISI.cost();
|
return superIterator.cost();
|
||||||
|
}
|
||||||
|
|
||||||
|
private int collectAndReturn(int docId) {
|
||||||
|
if (docId != NO_MORE_DOCS) {
|
||||||
|
attrCollector.collectScorerAttribution(docId, queryId);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
@ -27,11 +27,7 @@ public class QueryRankVisitor extends DetectAnnotationVisitor {
|
|||||||
collectNodeRank(query.getAnnotationOf(Annotation.Type.NODE_RANK).get(), query);
|
collectNodeRank(query.getAnnotationOf(Annotation.Type.NODE_RANK).get(), query);
|
||||||
}
|
}
|
||||||
|
|
||||||
boolean found = false;
|
return query.getChildren().stream().anyMatch(child -> child.accept(this));
|
||||||
for (Query child : query.getChildren()) {
|
|
||||||
found |= child.accept(this);
|
|
||||||
}
|
|
||||||
return found;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -195,66 +195,42 @@ public class TweetOffensiveEvaluator extends TweetEvaluator {
|
|||||||
termsToCheck.add(normalizedUserName.toLowerCase());
|
termsToCheck.add(normalizedUserName.toLowerCase());
|
||||||
|
|
||||||
for (String userNameToken : termsToCheck) {
|
for (String userNameToken : termsToCheck) {
|
||||||
if (!StringUtils.isBlank(userNameToken) && offensiveUsersFilter.filter(userNameToken)) {
|
if (!StringUtils.isBlank(userNameToken) && !offensiveUsersFilter.filter(userNameToken)) {
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
private boolean isTweetOffensive(final TwitterMessage tweet,
|
private boolean isTweetOffensive(final TwitterMessage tweet,
|
||||||
BlacklistedTopics offensiveFilter,
|
BlacklistedTopics offensiveFilter,
|
||||||
PenguinVersion penguinVersion) {
|
PenguinVersion penguinVersion) {
|
||||||
|
// Get the features of the tweet's text
|
||||||
TweetTextFeatures textFeatures = tweet.getTweetTextFeatures(penguinVersion);
|
TweetTextFeatures textFeatures = tweet.getTweetTextFeatures(penguinVersion);
|
||||||
|
|
||||||
boolean tweetHasOffensiveTerm = false;
|
boolean tweetHasOffensiveTerm = false;
|
||||||
|
|
||||||
// Check for tweet text.
|
// Generate a list of ngrams (sequences of words) from the tweet's text
|
||||||
List<TokenizedCharSequence> ngrams =
|
List<TokenizedCharSequence> ngrams = generateNgrams(textFeatures, tweet.getLocale());
|
||||||
NGRAM_GENERATOR_HOLDER.get().generateNgramsAsTokenizedCharSequence(
|
|
||||||
textFeatures.getTokenSequence(), tweet.getLocale());
|
// Check if any ngram in the list is offensive
|
||||||
for (TokenizedCharSequence ngram : ngrams) {
|
for (TokenizedCharSequence ngram : ngrams) {
|
||||||
// skip URL ngram
|
if (isOffensiveNgram(ngram, offensiveFilter)) {
|
||||||
if (!ngram.getTokensOf(TokenType.URL).isEmpty()) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
String ngramStr = ngram.toString();
|
|
||||||
if (!StringUtils.isBlank(ngramStr) && offensiveFilter.filter(ngramStr)) {
|
|
||||||
tweetHasOffensiveTerm = true;
|
tweetHasOffensiveTerm = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Due to some strangeness in Penguin, we don't get ngrams for tokens around "\n-" or "-\n"
|
|
||||||
// in the original string, this made us miss some offensive words this way. Here we do another
|
|
||||||
// pass of check using just the tokens generated by the tokenizer. (See SEARCHQUAL-8907)
|
|
||||||
if (!tweetHasOffensiveTerm) {
|
|
||||||
for (String ngramStr : textFeatures.getTokens()) {
|
|
||||||
// skip URLs
|
|
||||||
if (ngramStr.startsWith("http://") || ngramStr.startsWith("https://")) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (!StringUtils.isBlank(ngramStr) && offensiveFilter.filter(ngramStr)) {
|
|
||||||
tweetHasOffensiveTerm = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!tweetHasOffensiveTerm) {
|
|
||||||
// check for resolved URLs
|
|
||||||
String resolvedUrlsText =
|
|
||||||
Joiner.on(" ").skipNulls().join(textFeatures.getResolvedUrlTokens());
|
|
||||||
List<String> ngramStrs = NGRAM_GENERATOR_HOLDER.get().generateNgramsAsString(
|
|
||||||
resolvedUrlsText, LocaleUtil.UNKNOWN);
|
|
||||||
for (String ngram : ngramStrs) {
|
|
||||||
if (!StringUtils.isBlank(ngram) && offensiveFilter.filter(ngram)) {
|
|
||||||
tweetHasOffensiveTerm = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return tweetHasOffensiveTerm;
|
return tweetHasOffensiveTerm;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private boolean isOffensiveNgram(TokenizedCharSequence ngram, BlacklistedTopics offensiveFilter) {
|
||||||
|
// Skip checking if the ngram is a URL
|
||||||
|
String ngramStr = ngram.toString();
|
||||||
|
if (!StringUtils.isBlank(ngramStr) && offensiveFilter.filter(ngramStr)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user