the-algorithm/src/java/com/twitter/search/common/relevance/scorers/TweetTextScorer.java

package com.twitter.search.common.relevance.scorers;

import java.util.Map;
import java.util.concurrent.ConcurrentMap;

import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.common_internal.text.version.PenguinVersion;
import com.twitter.search.common.metrics.RelevanceStats;
import com.twitter.search.common.metrics.SearchRateCounter;
import com.twitter.search.common.relevance.config.TweetProcessingConfig;
import com.twitter.search.common.relevance.entities.TwitterMessage;
import com.twitter.search.common.relevance.features.TweetFeatures;
import com.twitter.search.common.relevance.features.TweetTextFeatures;
import com.twitter.search.common.relevance.features.TweetTextQuality;

/**
 * Compute a text score for TwitterMessage based on its offensiveness,
 * shoutness, length, readability and hashtag properties extracted from
 * tweet text.
 * <p/>
 * Formula:
 * text_score = offensive_text_damping * offensive_username_damping *
 * Sigma(feature_score_weight * feature_score)
 * <p/>
 * scored features are: length, readability, shout, entropy, links
 */
public class TweetTextScorer extends TweetScorer {
  private static final Logger LOG = LoggerFactory.getLogger(TweetTextScorer.class);

  private static final double DEFAULT_OFFENSIVE_TERM_DAMPING = 0.2d;
  private static final double DEFAULT_OFFENSIVE_NAME_DAMPING = 0.2d;

  // Sigma of all weights = 1.0d
  private static final double DEFAULT_LENGTH_WEIGHT = 0.5d;
  private static final double DEFAULT_READABILITY_WEIGHT = 0.1d;
  private static final double DEFAULT_SHOUT_WEIGHT = 0.1d;
  private static final double DEFAULT_ENTROPY_WEIGHT = 0.25d;
  private static final double DEFAULT_LINK_WEIGHT = 0.05d;

  private static final double DEFAULT_NO_DAMPING = 1.0d;

  // Sigmoid alpha values for normalization
  private static final double DEFAULT_READABILITY_ALPHA = 0.05d;
  private static final double DEFAULT_ENTROPY_ALPHA = 0.5d;
  private static final double DEFAULT_LENGTH_ALPHA = 0.03d;

  private static final ConcurrentMap<String, SearchRateCounter> RATE_COUNTERS =
      Maps.newConcurrentMap();
  private static final ConcurrentMap<PenguinVersion, Map<Integer, SearchRateCounter>>
      SCORE_HISTOGRAMS = Maps.newConcurrentMap();

  private double offensiveTermDamping = DEFAULT_OFFENSIVE_TERM_DAMPING;
  private double offensiveNameDamping = DEFAULT_OFFENSIVE_NAME_DAMPING;

  private double lengthWeight = DEFAULT_LENGTH_WEIGHT;
  private double readabilityWeight = DEFAULT_READABILITY_WEIGHT;
  private double shoutWeight = DEFAULT_SHOUT_WEIGHT;
  private double entropyWeight = DEFAULT_ENTROPY_WEIGHT;
  private double linkWeight = DEFAULT_LINK_WEIGHT;

  private double readabilityAlpha = DEFAULT_READABILITY_ALPHA;
  private double entropyAlpha = DEFAULT_ENTROPY_ALPHA;
  private double lengthAlpha = DEFAULT_LENGTH_ALPHA;

  /** Configure from a config file, validate the configuration. */
  public TweetTextScorer(String configFile) {
    TweetProcessingConfig.init(configFile);

    // get dampings
    checkWeightRange(offensiveTermDamping = TweetProcessingConfig
        .getDouble("offensive_term_damping", DEFAULT_OFFENSIVE_TERM_DAMPING));
    checkWeightRange(offensiveNameDamping = TweetProcessingConfig
        .getDouble("offensive_name_damping", DEFAULT_OFFENSIVE_NAME_DAMPING));

    // get weights
    checkWeightRange(lengthWeight = TweetProcessingConfig
        .getDouble("length_weight", DEFAULT_LENGTH_WEIGHT));
    checkWeightRange(readabilityWeight = TweetProcessingConfig
        .getDouble("readability_weight", DEFAULT_READABILITY_WEIGHT));
    checkWeightRange(shoutWeight = TweetProcessingConfig
        .getDouble("shout_weight", DEFAULT_SHOUT_WEIGHT));
    checkWeightRange(entropyWeight = TweetProcessingConfig
        .getDouble("entropy_weight", DEFAULT_ENTROPY_WEIGHT));
    checkWeightRange(linkWeight = TweetProcessingConfig
        .getDouble("link_weight", DEFAULT_LINK_WEIGHT));

    // check sigma of weights
    Preconditions.checkArgument(
        lengthWeight + readabilityWeight + shoutWeight + entropyWeight + linkWeight == 1.0d);

    readabilityAlpha = TweetProcessingConfig
        .getDouble("readability_alpha", DEFAULT_READABILITY_ALPHA);
    entropyAlpha = TweetProcessingConfig.getDouble("entropy_alpha", DEFAULT_ENTROPY_ALPHA);
    lengthAlpha = TweetProcessingConfig.getDouble("length_alpha", DEFAULT_LENGTH_ALPHA);
  }

  /** Creates a new TweetTextScorer instance. */
  public TweetTextScorer() {
  }

  /** Scores the given tweet. */
  public void scoreTweet(final TwitterMessage tweet) {
    Preconditions.checkNotNull(tweet);

    for (PenguinVersion penguinVersion : tweet.getSupportedPenguinVersions()) {
      TweetFeatures features = Preconditions.checkNotNull(tweet.getTweetFeatures(penguinVersion));
      TweetTextFeatures textFeatures = Preconditions.checkNotNull(features.getTweetTextFeatures());
      TweetTextQuality textQuality = Preconditions.checkNotNull(features.getTweetTextQuality());
      boolean isOffensiveText = textQuality.hasBoolQuality(
          TweetTextQuality.BooleanQualityType.OFFENSIVE);
      boolean isOffensiveScreenName = textQuality.hasBoolQuality(
          TweetTextQuality.BooleanQualityType.OFFENSIVE_USER);
      double shoutScore = DEFAULT_NO_DAMPING - textQuality.getShout();
      double lengthScore = normalize(textFeatures.getLength(), lengthAlpha);
      double readabilityScore = normalize(textQuality.getReadability(), readabilityAlpha);
      double entropyScore = normalize(textQuality.getEntropy(), entropyAlpha);

      double score = (isOffensiveText ? offensiveTermDamping : DEFAULT_NO_DAMPING)
        * (isOffensiveScreenName ? offensiveNameDamping : DEFAULT_NO_DAMPING)
        * (lengthWeight * lengthScore
           + readabilityWeight * readabilityScore
           + shoutWeight * shoutScore
           + entropyWeight * entropyScore
           + linkWeight * (tweet.getExpandedUrlMapSize() > 0 ? 1 : 0));

      // scale to [0, 100] byte
      textQuality.setTextScore((byte) (score * 100));

      updateStats(
          isOffensiveText,
          isOffensiveScreenName,
          textFeatures,
          score,
          getRateCounterStat("num_offensive_text_", penguinVersion),
          getRateCounterStat("num_offensive_user_", penguinVersion),
          getRateCounterStat("num_no_trends_", penguinVersion),
          getRateCounterStat("num_has_trends_", penguinVersion),
          getRateCounterStat("num_too_many_trends_", penguinVersion),
          getRateCounterStat("num_scored_tweets_", penguinVersion),
          getScoreHistogram(penguinVersion));

      if (LOG.isDebugEnabled()) {
        LOG.debug(String.format(
            "Tweet length [%.2f] weighted length [%.2f], readability [%.2f] "
            + "weighted readability [%.2f], shout [%.2f] weighted shout [%.2f], "
            + "entropy [%.2f], weighted entropy [%.2f], "
            + "score [%.2f], text [%s], penguin version [%s]",
            lengthScore,
            lengthWeight * lengthScore,
            readabilityScore,
            readabilityWeight * readabilityScore,
            shoutScore,
            shoutWeight * shoutScore,
            entropyScore,
            entropyWeight * entropyScore,
            score,
            tweet.getText(),
            penguinVersion));
      }
    }
  }

  private void updateStats(boolean isOffensiveText,
                           boolean isOffensiveScreenName,
                           TweetTextFeatures textFeatures,
                           double score,
                           SearchRateCounter offensiveTextCounter,
                           SearchRateCounter offensiveUserNameCounter,
                           SearchRateCounter noTrendsCounter,
                           SearchRateCounter hasTrendsCounter,
                           SearchRateCounter tooManyTrendsHashtagsCounter,
                           SearchRateCounter scoredTweets,
                           Map<Integer, SearchRateCounter> scoreHistogram) {
    // set stats
    if (isOffensiveText) {
      offensiveTextCounter.increment();
    }
    if (isOffensiveScreenName) {
      offensiveUserNameCounter.increment();
    }
    if (textFeatures.getTrendingTermsSize() == 0) {
      noTrendsCounter.increment();
    } else {
      hasTrendsCounter.increment();
    }
    if (TwitterMessage.hasMultipleHashtagsOrTrends(textFeatures)) {
      tooManyTrendsHashtagsCounter.increment();
    }
    scoredTweets.increment();

    int bucket = (int) Math.floor(score * 10) * 10;
    scoreHistogram.get(bucket).increment();
  }

  // normalize the passed in value to smoothed [0, 1.0d] range
  private static double normalize(double value, double alpha) {
    return 2 * (1.0d / (1.0d + Math.exp(-(alpha * value))) - 0.5);
  }

  // Make sure weight values are within the range of [0.0, 1.0]
  private void checkWeightRange(double value) {
    Preconditions.checkArgument(value >= 0.0d && value <= 1.0d);
  }

  private Map<Integer, SearchRateCounter> getScoreHistogram(PenguinVersion penguinVersion) {
    Map<Integer, SearchRateCounter> scoreHistogram = SCORE_HISTOGRAMS.get(penguinVersion);
    if (scoreHistogram == null) {
      scoreHistogram = Maps.newHashMap();
      String statsName = "num_text_score_%d_%s";

      for (int i = 0; i <= 100; i += 10) {
        scoreHistogram.put(i, RelevanceStats.exportRate(
                               String.format(statsName, i, penguinVersion.name().toLowerCase())));
      }

      scoreHistogram = SCORE_HISTOGRAMS.putIfAbsent(penguinVersion, scoreHistogram);
      if (scoreHistogram == null) {
        scoreHistogram = SCORE_HISTOGRAMS.get(penguinVersion);
      }
    }

    return scoreHistogram;
  }

  private SearchRateCounter getRateCounterStat(String statPrefix, PenguinVersion penguinVersion) {
    String statName = statPrefix + penguinVersion.name().toLowerCase();
    SearchRateCounter rateCounter = RATE_COUNTERS.get(statName);
    if (rateCounter == null) {
      // Only one RateCounter instance is created for each stat name. So we don't need to worry
      // that another thread might've created this instance in the meantime: we can just create/get
      // it, and store it in the map.
      rateCounter = RelevanceStats.exportRate(statName);
      RATE_COUNTERS.put(statName, rateCounter);
    }
    return rateCounter;
  }
}