the-algorithm/src/java/com/twitter/search/common/relevance/scorers/TweetTextScorer.java

package com.twitter.search.common.relevance.scorers;
import java.util.Map;
import java.util.concurrent.ConcurrentMap;
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.common_internal.text.version.PenguinVersion;
import com.twitter.search.common.metrics.RelevanceStats;
import com.twitter.search.common.metrics.SearchRateCounter;
import com.twitter.search.common.relevance.config.TweetProcessingConfig;
import com.twitter.search.common.relevance.entities.TwitterMessage;
import com.twitter.search.common.relevance.features.TweetFeatures;
import com.twitter.search.common.relevance.features.TweetTextFeatures;
import com.twitter.search.common.relevance.features.TweetTextQuality;
/**
* Computes a text score for a TwitterMessage based on its offensiveness,
* shoutness, length, readability, and hashtag properties, all extracted
* from the tweet text.
* <p/>
* Formula:
* text_score = offensive_text_damping * offensive_username_damping
*              * sum(feature_score_weight * feature_score)
* <p/>
* Scored features: length, readability, shout, entropy, links.
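* <p/>
* Worked example (hypothetical feature scores, default weights, no offensive damping):
* text_score = 1.0 * 1.0 * (0.5 * 0.8 + 0.1 * 0.6 + 0.1 * 0.9 + 0.25 * 0.7 + 0.05 * 1)
*            = 0.40 + 0.06 + 0.09 + 0.175 + 0.05 = 0.775, stored as (byte) 77.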
*/
public class TweetTextScorer extends TweetScorer {
private static final Logger LOG = LoggerFactory.getLogger(TweetTextScorer.class);
private static final double DEFAULT_OFFENSIVE_TERM_DAMPING = 0.2d;
private static final double DEFAULT_OFFENSIVE_NAME_DAMPING = 0.2d;
// The feature weights below must sum to 1.0 (checked in the config-file constructor).
private static final double DEFAULT_LENGTH_WEIGHT = 0.5d;
private static final double DEFAULT_READABILITY_WEIGHT = 0.1d;
private static final double DEFAULT_SHOUT_WEIGHT = 0.1d;
private static final double DEFAULT_ENTROPY_WEIGHT = 0.25d;
private static final double DEFAULT_LINK_WEIGHT = 0.05d;
private static final double DEFAULT_NO_DAMPING = 1.0d;
// Sigmoid alpha (steepness) values used by normalize() to map raw feature values into [0, 1)
private static final double DEFAULT_READABILITY_ALPHA = 0.05d;
private static final double DEFAULT_ENTROPY_ALPHA = 0.5d;
private static final double DEFAULT_LENGTH_ALPHA = 0.03d;
private static final ConcurrentMap<String, SearchRateCounter> RATE_COUNTERS =
Maps.newConcurrentMap();
private static final ConcurrentMap<PenguinVersion, Map<Integer, SearchRateCounter>>
SCORE_HISTOGRAMS = Maps.newConcurrentMap();
private double offensiveTermDamping = DEFAULT_OFFENSIVE_TERM_DAMPING;
private double offensiveNameDamping = DEFAULT_OFFENSIVE_NAME_DAMPING;
private double lengthWeight = DEFAULT_LENGTH_WEIGHT;
private double readabilityWeight = DEFAULT_READABILITY_WEIGHT;
private double shoutWeight = DEFAULT_SHOUT_WEIGHT;
private double entropyWeight = DEFAULT_ENTROPY_WEIGHT;
private double linkWeight = DEFAULT_LINK_WEIGHT;
private double readabilityAlpha = DEFAULT_READABILITY_ALPHA;
private double entropyAlpha = DEFAULT_ENTROPY_ALPHA;
private double lengthAlpha = DEFAULT_LENGTH_ALPHA;
/** Configures this scorer from the given config file and validates the loaded values. */
public TweetTextScorer(String configFile) {
TweetProcessingConfig.init(configFile);
// get dampings
checkWeightRange(offensiveTermDamping = TweetProcessingConfig
.getDouble("offensive_term_damping", DEFAULT_OFFENSIVE_TERM_DAMPING));
checkWeightRange(offensiveNameDamping = TweetProcessingConfig
.getDouble("offensive_name_damping", DEFAULT_OFFENSIVE_NAME_DAMPING));
// get weights
checkWeightRange(lengthWeight = TweetProcessingConfig
.getDouble("length_weight", DEFAULT_LENGTH_WEIGHT));
checkWeightRange(readabilityWeight = TweetProcessingConfig
.getDouble("readability_weight", DEFAULT_READABILITY_WEIGHT));
checkWeightRange(shoutWeight = TweetProcessingConfig
.getDouble("shout_weight", DEFAULT_SHOUT_WEIGHT));
checkWeightRange(entropyWeight = TweetProcessingConfig
.getDouble("entropy_weight", DEFAULT_ENTROPY_WEIGHT));
checkWeightRange(linkWeight = TweetProcessingConfig
.getDouble("link_weight", DEFAULT_LINK_WEIGHT));
// check that the weights sum to 1.0, allowing for floating-point rounding error
Preconditions.checkArgument(
Math.abs(lengthWeight + readabilityWeight + shoutWeight + entropyWeight + linkWeight
- 1.0d) < 1e-9);
readabilityAlpha = TweetProcessingConfig
.getDouble("readability_alpha", DEFAULT_READABILITY_ALPHA);
entropyAlpha = TweetProcessingConfig.getDouble("entropy_alpha", DEFAULT_ENTROPY_ALPHA);
lengthAlpha = TweetProcessingConfig.getDouble("length_alpha", DEFAULT_LENGTH_ALPHA);
}
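// A minimal sketch of the expected config entries (the file format shown here is an
// assumption; the key names are exactly the ones read in the constructor above):
//
//   offensive_term_damping: 0.2
//   offensive_name_damping: 0.2
//   length_weight: 0.5
//   readability_weight: 0.1
//   shout_weight: 0.1
//   entropy_weight: 0.25
//   link_weight: 0.05
//   readability_alpha: 0.05
//   entropy_alpha: 0.5
//   length_alpha: 0.03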
/** Creates a new TweetTextScorer instance. */
public TweetTextScorer() {
}
/**
* Computes a text score for the given tweet, once per supported Penguin version,
* and stores it in that version's TweetTextQuality.
*/
public void scoreTweet(final TwitterMessage tweet) {
Preconditions.checkNotNull(tweet);
for (PenguinVersion penguinVersion : tweet.getSupportedPenguinVersions()) {
TweetFeatures features = Preconditions.checkNotNull(tweet.getTweetFeatures(penguinVersion));
TweetTextFeatures textFeatures = Preconditions.checkNotNull(features.getTweetTextFeatures());
TweetTextQuality textQuality = Preconditions.checkNotNull(features.getTweetTextQuality());
boolean isOffensiveText = textQuality.hasBoolQuality(
TweetTextQuality.BooleanQualityType.OFFENSIVE);
boolean isOffensiveScreenName = textQuality.hasBoolQuality(
TweetTextQuality.BooleanQualityType.OFFENSIVE_USER);
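// Normalize each raw feature into [0, 1); shout is inverted so that less shouting
// scores higher (assumes getShout() returns a fraction in [0, 1]).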
double shoutScore = DEFAULT_NO_DAMPING - textQuality.getShout();
double lengthScore = normalize(textFeatures.getLength(), lengthAlpha);
double readabilityScore = normalize(textQuality.getReadability(), readabilityAlpha);
double entropyScore = normalize(textQuality.getEntropy(), entropyAlpha);
double score = (isOffensiveText ? offensiveTermDamping : DEFAULT_NO_DAMPING)
* (isOffensiveScreenName ? offensiveNameDamping : DEFAULT_NO_DAMPING)
* (lengthWeight * lengthScore
+ readabilityWeight * readabilityScore
+ shoutWeight * shoutScore
+ entropyWeight * entropyScore
+ linkWeight * (tweet.getExpandedUrlMapSize() > 0 ? 1 : 0));
// scale the [0, 1] score to a [0, 100] byte
textQuality.setTextScore((byte) (score * 100));
updateStats(
isOffensiveText,
isOffensiveScreenName,
textFeatures,
score,
getRateCounterStat("num_offensive_text_", penguinVersion),
getRateCounterStat("num_offensive_user_", penguinVersion),
getRateCounterStat("num_no_trends_", penguinVersion),
getRateCounterStat("num_has_trends_", penguinVersion),
getRateCounterStat("num_too_many_trends_", penguinVersion),
getRateCounterStat("num_scored_tweets_", penguinVersion),
getScoreHistogram(penguinVersion));
if (LOG.isDebugEnabled()) {
LOG.debug(String.format(
"Length score [%.2f] weighted length [%.2f], readability score [%.2f] "
+ "weighted readability [%.2f], shout score [%.2f] weighted shout [%.2f], "
+ "entropy score [%.2f] weighted entropy [%.2f], "
+ "score [%.2f], text [%s], penguin version [%s]",
lengthScore,
lengthWeight * lengthScore,
readabilityScore,
readabilityWeight * readabilityScore,
shoutScore,
shoutWeight * shoutScore,
entropyScore,
entropyWeight * entropyScore,
score,
tweet.getText(),
penguinVersion));
}
}
}
private void updateStats(boolean isOffensiveText,
boolean isOffensiveScreenName,
TweetTextFeatures textFeatures,
double score,
SearchRateCounter offensiveTextCounter,
SearchRateCounter offensiveUserNameCounter,
SearchRateCounter noTrendsCounter,
SearchRateCounter hasTrendsCounter,
SearchRateCounter tooManyTrendsHashtagsCounter,
SearchRateCounter scoredTweets,
Map<Integer, SearchRateCounter> scoreHistogram) {
// update the offensiveness, trends, and scoring-volume counters
if (isOffensiveText) {
offensiveTextCounter.increment();
}
if (isOffensiveScreenName) {
offensiveUserNameCounter.increment();
}
if (textFeatures.getTrendingTermsSize() == 0) {
noTrendsCounter.increment();
} else {
hasTrendsCounter.increment();
}
if (TwitterMessage.hasMultipleHashtagsOrTrends(textFeatures)) {
tooManyTrendsHashtagsCounter.increment();
}
scoredTweets.increment();
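// Bucket the score into deciles: [0, 0.1) -> 0, [0.1, 0.2) -> 10, ..., exactly 1.0 -> 100.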
int bucket = (int) Math.floor(score * 10) * 10;
scoreHistogram.get(bucket).increment();
}
// Normalizes a non-negative value into the [0, 1) range via a scaled sigmoid;
// alpha controls the steepness of the curve.
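// e.g. with the default lengthAlpha = 0.03: normalize(50, 0.03) ~= 0.64,
// normalize(140, 0.03) ~= 0.97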
private static double normalize(double value, double alpha) {
return 2 * (1.0d / (1.0d + Math.exp(-(alpha * value))) - 0.5);
}
// Make sure weight values are within the range of [0.0, 1.0]
private void checkWeightRange(double value) {
Preconditions.checkArgument(value >= 0.0d && value <= 1.0d);
}
private Map<Integer, SearchRateCounter> getScoreHistogram(PenguinVersion penguinVersion) {
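// Build the per-version histogram lazily; all threads must end up sharing one counter map.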
Map<Integer, SearchRateCounter> scoreHistogram = SCORE_HISTOGRAMS.get(penguinVersion);
if (scoreHistogram == null) {
scoreHistogram = Maps.newHashMap();
String statsName = "num_text_score_%d_%s";
for (int i = 0; i <= 100; i += 10) {
scoreHistogram.put(i, RelevanceStats.exportRate(
String.format(statsName, i, penguinVersion.name().toLowerCase())));
}
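// putIfAbsent returns the previously registered map if another thread won the race,
// or null if ours was stored; in the null case, re-read the map we just registered.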
scoreHistogram = SCORE_HISTOGRAMS.putIfAbsent(penguinVersion, scoreHistogram);
if (scoreHistogram == null) {
scoreHistogram = SCORE_HISTOGRAMS.get(penguinVersion);
}
}
return scoreHistogram;
}
private SearchRateCounter getRateCounterStat(String statPrefix, PenguinVersion penguinVersion) {
String statName = statPrefix + penguinVersion.name().toLowerCase();
SearchRateCounter rateCounter = RATE_COUNTERS.get(statName);
if (rateCounter == null) {
// RelevanceStats.exportRate returns the same RateCounter instance for a given stat
// name, so a race with another thread is harmless: both threads end up storing the
// same instance in the map.
rateCounter = RelevanceStats.exportRate(statName);
RATE_COUNTERS.put(statName, rateCounter);
}
return rateCounter;
}
}
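// A minimal usage sketch (hypothetical caller; the config path, the Penguin version,
// and the getTextScore() accessor are assumptions, not taken from this file):
//
//   TweetTextScorer scorer = new TweetTextScorer("tweet_processing.yml");
//   TwitterMessage tweet = ...; // populated upstream with features per Penguin version
//   scorer.scoreTweet(tweet);
//   byte score = tweet.getTweetFeatures(PenguinVersion.PENGUIN_6)
//       .getTweetTextQuality()
//       .getTextScore();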