the-algorithm/home-mixer/server/src/main/scala/com/twitter/home_mixer/util/LanguageUtil.scala
twitter-team ef4c5eb65e Twitter Recommendation Algorithm
Please note we have force-pushed a new initial commit in order to remove some publicly-available Twitter user information. Note that this process may be required in the future.
2023-03-31 17:36:31 -05:00

94 lines
3.3 KiB
Scala

package com.twitter.home_mixer.util
import com.twitter.search.common.constants.{thriftscala => scc}
import com.twitter.search.common.util.lang.ThriftLanguageUtil
import com.twitter.service.metastore.gen.{thriftscala => smg}
object LanguageUtil {

  private val DefaultMinProducedLanguageRatio = 0.05
  private val DefaultMinConsumedLanguageConfidence = 0.8

  /**
   * Computes a list of languages based on UserLanguages information retrieved from Metastore.
   *
   * The list is sorted in descending order of the confidence score associated with each
   * language, i.e. the language with the highest confidence is at index 0. Languages with equal
   * confidence keep the iteration order of the underlying map, which should not be relied on.
   *
   * The thresholds only zero out a score's contribution to the confidence; they never remove a
   * language from the result.
   *
   * @param userLanguages produced and consumed language scores for a user
   * @param minProducedLanguageRatio produced scores below this value are treated as 0.0
   * @param minConsumedLanguageConfidence consumed scores below this value are treated as 0.0
   * @return the recognized languages, ordered from most to least confident
   */
  def computeLanguages(
    userLanguages: smg.UserLanguages,
    minProducedLanguageRatio: Double = DefaultMinProducedLanguageRatio,
    minConsumedLanguageConfidence: Double = DefaultMinConsumedLanguageConfidence
  ): Seq[scc.ThriftLanguage] = {
    val languageConfidenceMap = computeLanguageConfidenceMap(
      userLanguages,
      minProducedLanguageRatio,
      minConsumedLanguageConfidence
    )

    languageConfidenceMap.toSeq
      .sortWith((left, right) => left._2 > right._2) // _2 = confidence score
      .map { case (language, _) => language }
  }

  /**
   * Computes confidence map based on UserLanguages information retrieved from Metastore,
   * where
   *   key   = language code
   *   value = level of confidence that the language is applicable to a user.
   *
   * Every language present in either the produced or the consumed list gets an entry. When at
   * least one raw confidence is positive, all values are rescaled relative to the maximum so
   * that the best language maps to 1.0; otherwise the raw values are returned unchanged.
   */
  private def computeLanguageConfidenceMap(
    userLanguages: smg.UserLanguages,
    minProducedLanguageRatio: Double,
    minConsumedLanguageConfidence: Double
  ): Map[scc.ThriftLanguage, Double] = {
    val producedLanguages = getLanguageMap(userLanguages.produced)
    val consumedLanguages = getLanguageMap(userLanguages.consumed)
    val languages = (producedLanguages.keys ++ consumedLanguages.keys).toSet

    val confidenceMap = languages.map { language =>
      val produceRatio = scoreOrZero(producedLanguages, language, minProducedLanguageRatio)
      val consumeConfidence =
        scoreOrZero(consumedLanguages, language, minConsumedLanguageConfidence)

      // The additive offsets (0.3 and 0.1) keep the product non-zero when one of the two signals
      // is missing or below its threshold, so such a language is ranked low rather than dropped.
      val overallConfidence = (0.3 + 4 * produceRatio) * (0.1 + consumeConfidence)
      language -> overallConfidence
    }.toMap

    // Starting the fold at 0.0 means an empty map (or all non-positive scores) skips the
    // normalization below and avoids dividing by zero.
    val maxConfidence =
      confidenceMap.values.foldLeft(0.0)((best, score) => Math.max(best, score))

    if (maxConfidence > 0) {
      confidenceMap.map {
        case (language, confidenceScore) =>
          // Rescale relative to the best score: the maximum maps to 1.0 and a raw score of 0.0
          // would map to 0.1.
          language -> ((confidenceScore / maxConfidence * 0.9) + 0.1)
      }
    } else {
      confidenceMap
    }
  }

  /**
   * Looks up the score of `language`, treating a missing entry or a score below `minScore`
   * as 0.0.
   */
  private def scoreOrZero(
    scores: Map[scc.ThriftLanguage, Double],
    language: scc.ThriftLanguage,
    minScore: Double
  ): Double =
    scores.get(language).filterNot(_ < minScore).getOrElse(0.0)

  /**
   * Converts scored language names into a map keyed by ThriftLanguage.
   *
   * Entries whose name does not resolve to a known language are dropped. If several entries
   * resolve to the same language, the weight of the last one in the sequence is kept.
   */
  private def getLanguageMap(
    scoredLanguages: Seq[smg.ScoredString]
  ): Map[scc.ThriftLanguage, Double] = {
    scoredLanguages.flatMap { scoredLanguage =>
      getThriftLanguage(scoredLanguage.item).map(_ -> scoredLanguage.weight)
    }.toMap
  }

  /**
   * Resolves a language name to its ThriftLanguage, or None when it resolves to
   * ThriftLanguage.Unknown.
   */
  private def getThriftLanguage(languageName: String): Option[scc.ThriftLanguage] = {
    // NOTE(review): this converts via `.ordinal`, which presumably relies on the ordinal of the
    // value returned by ThriftLanguageUtil matching the integer value expected by
    // scc.ThriftLanguage.apply -- confirm against the thrift definition.
    val languageOrdinal = ThriftLanguageUtil.getThriftLanguageOf(languageName).ordinal
    Some(scc.ThriftLanguage(languageOrdinal)).filterNot(_ == scc.ThriftLanguage.Unknown)
  }
}