mirror of
https://github.com/twitter/the-algorithm.git
synced 2024-06-30 15:06:07 +02:00
![twitter-team](/assets/img/avatar_default.png)
Please note we have force-pushed a new initial commit in order to remove some publicly-available Twitter user information. Note that this process may be required in the future.
252 lines
7.2 KiB
Scala
252 lines
7.2 KiB
Scala
package com.twitter.simclusters_v2.common
|
|
|
|
object CosineSimilarityUtil {

  /** Exponent used by [[expScaledNormalize]] when the caller does not supply one. */
  private val DefaultExponent: Double = 0.3

  /**
   * Sum of the squared weights of a sparse vector.
   *
   * @param v a map from key to weight
   *
   * @return the sum of `w * w` over all values of v (0.0 for an empty map)
   */
  def sumOfSquares[T](v: Map[T, Double]): Double =
    v.values.foldLeft(0.0) { (acc, weight) => acc + weight * weight }

  /**
   * Sum of the squared elements of a dense vector.
   *
   * @param v an array of weights
   *
   * @return the sum of `w * w` over all elements of v (0.0 for an empty array)
   */
  def sumOfSquaresArray(v: Array[Double]): Double =
    v.foldLeft(0.0) { (acc, weight) => acc + weight * weight }

  /**
   * L2 norm of a sparse vector: the square root of [[sumOfSquares]].
   *
   * @param v a map from key to weight
   */
  def norm[T](v: Map[T, Double]): Double =
    math.sqrt(sumOfSquares(v))

  /**
   * L2 norm of a dense vector: the square root of [[sumOfSquaresArray]].
   *
   * @param v an array of weights
   */
  def normArray(v: Array[Double]): Double =
    math.sqrt(sumOfSquaresArray(v))

  /**
   * Log norm of a sparse vector: the natural log of (sum of squares + 1).
   *
   * @param v a map from key to weight
   */
  def logNorm[T](v: Map[T, Double]): Double =
    math.log(sumOfSquares(v) + 1)

  /**
   * Log norm of a dense vector: the natural log of (sum of squares + 1).
   *
   * @param v an array of weights
   */
  def logNormArray(v: Array[Double]): Double =
    math.log(sumOfSquaresArray(v) + 1)

  /**
   * Exponentially scaled norm of a sparse vector: the sum of squares raised to `exponent`.
   *
   * @param v        a map from key to weight
   * @param exponent the power applied to the sum of squares
   */
  def expScaledNorm[T](v: Map[T, Double], exponent: Double): Double =
    math.pow(sumOfSquares(v), exponent)

  /**
   * Exponentially scaled norm of a dense vector: the sum of squares raised to `exponent`.
   *
   * @param v        an array of weights
   * @param exponent the power applied to the sum of squares
   */
  def expScaledNormArray(v: Array[Double], exponent: Double): Double =
    math.pow(sumOfSquaresArray(v), exponent)

  /**
   * L1 norm of a sparse vector: the sum of the absolute values of its weights.
   *
   * @param v a map from key to weight
   */
  def l1Norm[T](v: Map[T, Double]): Double =
    v.values.foldLeft(0.0) { (acc, weight) => acc + math.abs(weight) }

  /**
   * L1 norm of a dense vector: the sum of the absolute values of its elements.
   *
   * @param v an array of weights
   */
  def l1NormArray(v: Array[Double]): Double =
    v.foldLeft(0.0) { (acc, weight) => acc + math.abs(weight) }

  /**
   * Divide the weight vector with the applied norm.
   * Return the original object if the norm is 0.
   *
   * @param v    a map from cluster id to its weight
   * @param norm a calculated norm from the given map v
   *
   * @return a map with normalized weight
   */
  def applyNorm[T](v: Map[T, Double], norm: Double): Map[T, Double] =
    if (norm == 0.0) v
    else {
      // Build a strict map on purpose: `mapValues` yields a lazy view that re-runs the division
      // on every lookup and is not serializable on Scala 2.12 (and is a `MapView`, not a `Map`,
      // from 2.13 on).
      v.map { case (key, weight) => key -> weight / norm }
    }

  /**
   * Divide the weight vector with the applied norm.
   * Return the original object if the norm is 0.
   *
   * @param v    an array of weights
   * @param norm a calculated norm from the given array v
   *
   * @return an array with normalized weight in the same order as v
   */
  def applyNormArray(v: Array[Double], norm: Double): Array[Double] =
    if (norm == 0.0) v else v.map(weight => weight / norm)

  /**
   * Normalize the weight vector for easy cosine similarity calculation. If the norm in use
   * is 0 (e.g. the input weight vector is empty), return the original map.
   *
   * @param v         a map from cluster id to its weight
   * @param maybeNorm a precomputed norm to divide by; when None the L2 norm of v is computed
   *
   * @return a map with normalized weight (the norm of the weight vector is 1 when the
   *         L2 norm of v is used)
   */
  def normalize[T](v: Map[T, Double], maybeNorm: Option[Double] = None): Map[T, Double] =
    applyNorm(v, maybeNorm.getOrElse(norm(v)))

  /**
   * Normalize the weight vector for easy cosine similarity calculation. If the norm in use
   * is 0 (e.g. the input weight vector is empty), return the original array.
   *
   * @param v         an array of weights
   * @param maybeNorm a precomputed norm to divide by; when None the L2 norm of v is computed
   *
   * @return an array with normalized weight (the norm of the weight vector is 1 when the
   *         L2 norm of v is used), in the same order as v
   */
  def normalizeArray(
    v: Array[Double],
    maybeNorm: Option[Double] = None
  ): Array[Double] =
    applyNormArray(v, maybeNorm.getOrElse(normArray(v)))

  /**
   * Normalize the weight vector with log norm. If the norm in use is 0
   * (e.g. the input weight vector is empty), return the original map.
   *
   * @param v         a map from cluster id to its weight
   * @param maybeNorm a precomputed norm to divide by; when None the log norm of v is computed
   *
   * @return a map with log normalized weight
   */
  def logNormalize[T](v: Map[T, Double], maybeNorm: Option[Double] = None): Map[T, Double] =
    applyNorm(v, maybeNorm.getOrElse(logNorm(v)))

  /**
   * Normalize the weight vector with log norm. If the norm in use is 0
   * (e.g. the input weight vector is empty), return the original array.
   *
   * @param v         an array of weights
   * @param maybeNorm a precomputed norm to divide by; when None the log norm of v is computed
   *
   * @return an array with log normalized weight, in the same order as v
   */
  def logNormalizeArray(
    v: Array[Double],
    maybeNorm: Option[Double] = None
  ): Array[Double] =
    applyNormArray(v, maybeNorm.getOrElse(logNormArray(v)))

  /**
   * Normalize the weight vector with exponentially scaled norm. If the norm in use is 0,
   * return the original map.
   *
   * @param v         a map from cluster id to its weight
   * @param exponent  the exponent we apply to the weight vector's sum of squares;
   *                  0.3 is used when None
   * @param maybeNorm a precomputed norm to divide by; when None the exp scaled norm of v
   *                  is computed
   *
   * @return a map with exp scaled normalized weight
   */
  def expScaledNormalize[T](
    v: Map[T, Double],
    exponent: Option[Double] = None,
    maybeNorm: Option[Double] = None
  ): Map[T, Double] =
    applyNorm(v, maybeNorm.getOrElse(expScaledNorm(v, exponent.getOrElse(DefaultExponent))))

  /**
   * Normalize the weight vector with exponentially scaled norm. If the norm in use is 0,
   * return the original array.
   *
   * @param v         an array of weights
   * @param exponent  the exponent we apply to the weight vector's sum of squares
   * @param maybeNorm a precomputed norm to divide by; when None the exp scaled norm of v
   *                  is computed
   *
   * @return an array with exp scaled normalized weight, in the same order as v
   */
  def expScaledNormalizeArray(
    v: Array[Double],
    exponent: Double,
    maybeNorm: Option[Double] = None
  ): Array[Double] =
    applyNormArray(v, maybeNorm.getOrElse(expScaledNormArray(v, exponent)))

  /**
   * Given two sparse vectors, calculate their dot product.
   *
   * @param v1 the first map from cluster id to its weight
   * @param v2 the second map from cluster id to its weight
   *
   * @return the dot product of the above two sparse vectors
   */
  def dotProduct[T](v1: Map[T, Double], v2: Map[T, Double]): Double = {
    // Walk the smaller map and probe the bigger one, so the number of lookups is
    // min(v1.size, v2.size). On a tie v1 is the one that is walked.
    val (smaller, bigger) = if (v1.size > v2.size) (v2, v1) else (v1, v2)

    smaller.foldLeft(0.0) {
      case (sum, (id, value)) => sum + bigger.getOrElse(id, 0.0) * value
    }
  }

  /**
   * Given two sparse vectors, calculate their dot product.
   *
   * @param v1C an array of cluster ids. Must be sorted in ascending order
   * @param v1S an array of corresponding cluster scores, of the same length and order as v1C
   * @param v2C an array of cluster ids. Must be sorted in ascending order
   * @param v2S an array of corresponding cluster scores, of the same length and order as v2C
   *
   * @return the dot product of the above two sparse vectors
   *
   * @throws IllegalArgumentException if an id array and its score array differ in length
   */
  def dotProductForSortedClusterAndScores(
    v1C: Array[Int],
    v1S: Array[Double],
    v2C: Array[Int],
    v2S: Array[Double]
  ): Double = {
    require(
      v1C.length == v1S.length,
      s"v1C and v1S must have the same length, got ${v1C.length} and ${v1S.length}")
    require(
      v2C.length == v2S.length,
      s"v2C and v2S must have the same length, got ${v2C.length} and ${v2S.length}")

    val n1 = v1C.length
    val n2 = v2C.length
    var i1 = 0
    var i2 = 0
    var product: Double = 0.0

    // Two-pointer walk over both id arrays; only positions with equal ids contribute.
    while (i1 < n1 && i2 < n2) {
      val c1 = v1C(i1)
      val c2 = v2C(i2)
      if (c1 == c2) {
        product += v1S(i1) * v2S(i2)
        i1 += 1
        i2 += 1
      } else if (c1 > c2) {
        // v2 cluster is lower. Increment it to see if the next one matches v1's
        i2 += 1
      } else {
        // v1 cluster is lower. Increment it to see if the next one matches v2's
        i1 += 1
      }
    }
    product
  }
}
|