mirror of
https://github.com/twitter/the-algorithm.git
synced 2024-12-22 10:11:52 +01:00
Merge becdf494db
into 72eda9a24f
This commit is contained in:
commit
87bfd24be8
@ -25,36 +25,33 @@ public class IdentifiableQueryScorer extends FilteredScorer {
|
||||
|
||||
@Override
|
||||
public DocIdSetIterator iterator() {
|
||||
final DocIdSetIterator superDISI = super.iterator();
|
||||
DocIdSetIterator superIterator = super.iterator();
|
||||
|
||||
return new DocIdSetIterator() {
|
||||
@Override
|
||||
public int docID() {
|
||||
return superDISI.docID();
|
||||
return superIterator.docID();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int nextDoc() throws IOException {
|
||||
int docid = superDISI.nextDoc();
|
||||
if (docid != NO_MORE_DOCS) {
|
||||
attrCollector.collectScorerAttribution(docid, queryId);
|
||||
}
|
||||
return docid;
|
||||
return collectAndReturn(superIterator.nextDoc());
|
||||
}
|
||||
|
||||
@Override
|
||||
public int advance(int target) throws IOException {
|
||||
int docid = superDISI.advance(target);
|
||||
if (docid != NO_MORE_DOCS) {
|
||||
attrCollector.collectScorerAttribution(docid, queryId);
|
||||
}
|
||||
return docid;
|
||||
return collectAndReturn(superIterator.advance(target));
|
||||
}
|
||||
|
||||
@Override
|
||||
public long cost() {
|
||||
return superDISI.cost();
|
||||
return superIterator.cost();
|
||||
}
|
||||
|
||||
private int collectAndReturn(int docId) {
|
||||
if (docId != NO_MORE_DOCS) {
|
||||
attrCollector.collectScorerAttribution(docId, queryId);
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
@ -15,42 +15,38 @@ import com.twitter.search.queryparser.visitors.DetectAnnotationVisitor;
|
||||
* A visitor that collects node ranks from :r annotation in the query
|
||||
*/
|
||||
public class QueryRankVisitor extends DetectAnnotationVisitor {
|
||||
private final IdentityHashMap<Query, Integer> nodeToRankMap = Maps.newIdentityHashMap();
|
||||
private final IdentityHashMap<Query, Integer> nodeToRankMap = Maps.newIdentityHashMap();
|
||||
|
||||
public QueryRankVisitor() {
|
||||
super(Annotation.Type.NODE_RANK);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean visitBooleanQuery(BooleanQuery query) throws QueryParserException {
|
||||
if (query.hasAnnotationType(Annotation.Type.NODE_RANK)) {
|
||||
collectNodeRank(query.getAnnotationOf(Annotation.Type.NODE_RANK).get(), query);
|
||||
public QueryRankVisitor() {
|
||||
super(Annotation.Type.NODE_RANK);
|
||||
}
|
||||
|
||||
boolean found = false;
|
||||
for (Query child : query.getChildren()) {
|
||||
found |= child.accept(this);
|
||||
}
|
||||
return found;
|
||||
}
|
||||
@Override
|
||||
protected boolean visitBooleanQuery(BooleanQuery query) throws QueryParserException {
|
||||
if (query.hasAnnotationType(Annotation.Type.NODE_RANK)) {
|
||||
collectNodeRank(query.getAnnotationOf(Annotation.Type.NODE_RANK).get(), query);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean visitQuery(Query query) throws QueryParserException {
|
||||
if (query.hasAnnotationType(Annotation.Type.NODE_RANK)) {
|
||||
collectNodeRank(query.getAnnotationOf(Annotation.Type.NODE_RANK).get(), query);
|
||||
return true;
|
||||
return query.getChildren().stream().anyMatch(child -> child.accept(this));
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
@Override
|
||||
protected boolean visitQuery(Query query) throws QueryParserException {
|
||||
if (query.hasAnnotationType(Annotation.Type.NODE_RANK)) {
|
||||
collectNodeRank(query.getAnnotationOf(Annotation.Type.NODE_RANK).get(), query);
|
||||
return true;
|
||||
}
|
||||
|
||||
private void collectNodeRank(Annotation anno, Query query) {
|
||||
Preconditions.checkArgument(anno.getType() == Annotation.Type.NODE_RANK);
|
||||
int rank = (Integer) anno.getValue();
|
||||
nodeToRankMap.put(query, rank);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public IdentityHashMap<Query, Integer> getNodeToRankMap() {
|
||||
return nodeToRankMap;
|
||||
}
|
||||
private void collectNodeRank(Annotation anno, Query query) {
|
||||
Preconditions.checkArgument(anno.getType() == Annotation.Type.NODE_RANK);
|
||||
int rank = (Integer) anno.getValue();
|
||||
nodeToRankMap.put(query, rank);
|
||||
}
|
||||
|
||||
public IdentityHashMap<Query, Integer> getNodeToRankMap() {
|
||||
return nodeToRankMap;
|
||||
}
|
||||
}
|
||||
|
@ -195,66 +195,42 @@ public class TweetOffensiveEvaluator extends TweetEvaluator {
|
||||
termsToCheck.add(normalizedUserName.toLowerCase());
|
||||
|
||||
for (String userNameToken : termsToCheck) {
|
||||
if (!StringUtils.isBlank(userNameToken) && offensiveUsersFilter.filter(userNameToken)) {
|
||||
return true;
|
||||
if (!StringUtils.isBlank(userNameToken) && !offensiveUsersFilter.filter(userNameToken)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean isTweetOffensive(final TwitterMessage tweet,
|
||||
BlacklistedTopics offensiveFilter,
|
||||
PenguinVersion penguinVersion) {
|
||||
BlacklistedTopics offensiveFilter,
|
||||
PenguinVersion penguinVersion) {
|
||||
// Get the features of the tweet's text
|
||||
TweetTextFeatures textFeatures = tweet.getTweetTextFeatures(penguinVersion);
|
||||
|
||||
boolean tweetHasOffensiveTerm = false;
|
||||
|
||||
// Check for tweet text.
|
||||
List<TokenizedCharSequence> ngrams =
|
||||
NGRAM_GENERATOR_HOLDER.get().generateNgramsAsTokenizedCharSequence(
|
||||
textFeatures.getTokenSequence(), tweet.getLocale());
|
||||
// Generate a list of ngrams (sequences of words) from the tweet's text
|
||||
List<TokenizedCharSequence> ngrams = generateNgrams(textFeatures, tweet.getLocale());
|
||||
|
||||
// Check if any ngram in the list is offensive
|
||||
for (TokenizedCharSequence ngram : ngrams) {
|
||||
// skip URL ngram
|
||||
if (!ngram.getTokensOf(TokenType.URL).isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
String ngramStr = ngram.toString();
|
||||
if (!StringUtils.isBlank(ngramStr) && offensiveFilter.filter(ngramStr)) {
|
||||
tweetHasOffensiveTerm = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Due to some strangeness in Penguin, we don't get ngrams for tokens around "\n-" or "-\n"
|
||||
// in the original string, this made us miss some offensive words this way. Here we do another
|
||||
// pass of check using just the tokens generated by the tokenizer. (See SEARCHQUAL-8907)
|
||||
if (!tweetHasOffensiveTerm) {
|
||||
for (String ngramStr : textFeatures.getTokens()) {
|
||||
// skip URLs
|
||||
if (ngramStr.startsWith("http://") || ngramStr.startsWith("https://")) {
|
||||
continue;
|
||||
if (isOffensiveNgram(ngram, offensiveFilter)) {
|
||||
tweetHasOffensiveTerm = true;
|
||||
break;
|
||||
}
|
||||
if (!StringUtils.isBlank(ngramStr) && offensiveFilter.filter(ngramStr)) {
|
||||
tweetHasOffensiveTerm = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!tweetHasOffensiveTerm) {
|
||||
// check for resolved URLs
|
||||
String resolvedUrlsText =
|
||||
Joiner.on(" ").skipNulls().join(textFeatures.getResolvedUrlTokens());
|
||||
List<String> ngramStrs = NGRAM_GENERATOR_HOLDER.get().generateNgramsAsString(
|
||||
resolvedUrlsText, LocaleUtil.UNKNOWN);
|
||||
for (String ngram : ngramStrs) {
|
||||
if (!StringUtils.isBlank(ngram) && offensiveFilter.filter(ngram)) {
|
||||
tweetHasOffensiveTerm = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return tweetHasOffensiveTerm;
|
||||
}
|
||||
|
||||
private boolean isOffensiveNgram(TokenizedCharSequence ngram, BlacklistedTopics offensiveFilter) {
|
||||
// Skip checking if the ngram is a URL
|
||||
String ngramStr = ngram.toString();
|
||||
if (!StringUtils.isBlank(ngramStr) && offensiveFilter.filter(ngramStr)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user