the-algorithm/src/java/com/twitter/search/common/relevance/classifiers/TweetTextEvaluator.java

package com.twitter.search.common.relevance.classifiers;

import java.util.List;
import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;

import com.twitter.common_internal.text.version.PenguinVersion;
import com.twitter.search.common.relevance.entities.TwitterMessage;
import com.twitter.search.common.relevance.features.TweetTextFeatures;
import com.twitter.search.common.relevance.features.TweetTextQuality;

/**
 * Calculates text quality features of tweet text based on its tokens: readability,
 * Shannon entropy, and a "shout" (capitalization) ratio.
 */
public class TweetTextEvaluator extends TweetEvaluator {

  @Override
  public void evaluate(final TwitterMessage tweet) {
    for (PenguinVersion penguinVersion : tweet.getSupportedPenguinVersions()) {
      TweetTextFeatures textFeatures = tweet.getTweetTextFeatures(penguinVersion);
      TweetTextQuality textQuality = tweet.getTweetTextQuality(penguinVersion);
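
      // Readability: sum of token lengths, scaled by log(numKeptWords) / numKeptWords,
      // i.e. the average token length weighted by the log of how many tokens survive stripping.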
      double readability = 0;
      int numKeptWords = textFeatures.getStrippedTokensSize();
      for (String token : textFeatures.getStrippedTokens()) {
        readability += token.length();
      }
      if (numKeptWords > 0) {
        readability = readability * Math.log(numKeptWords) / numKeptWords;
      }

      textQuality.setReadability(readability);
      textQuality.setEntropy(entropy(textFeatures.getStrippedTokens()));
      // Shout score: fraction of characters in the text that are capitalized.
      textQuality.setShout(textFeatures.getCaps() / Math.max(textFeatures.getLength(), 1.0d));
    }
  }
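
  /**
   * Shannon entropy (in bits) of the token distribution. For example, the tokens
   * ["a", "a", "b"] give -(2/3 * log2(2/3) + 1/3 * log2(1/3)), about 0.918 bits.
   */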
  private static double entropy(List<String> tokens) {
    Map<String, Long> tokenCounts =
        tokens.stream().collect(Collectors.groupingBy(Function.identity(), Collectors.counting()));
    int numItems = tokens.size();

    double entropy = 0;
    for (long count : tokenCounts.values()) {
      double prob = (double) count / numItems;
      entropy -= prob * log2(prob);
    }
    return entropy;
  }
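
  /** Base-2 logarithm, so the entropy above is measured in bits. */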
  private static double log2(double n) {
    return Math.log(n) / Math.log(2);
  }
}