the-algorithm/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbedding.scala

package com.twitter.simclusters_v2.common

import com.twitter.simclusters_v2.thriftscala.SimClusterWithScore
import com.twitter.simclusters_v2.thriftscala.{SimClustersEmbedding => ThriftSimClustersEmbedding}
import scala.collection.mutable
import scala.language.implicitConversions
import scala.util.hashing.MurmurHash3.arrayHash
import scala.util.hashing.MurmurHash3.productHash
import scala.math._

/**
 * A representation of a SimClusters Embedding, designed for low memory footprint and performance.
 * For services that cache millions of embeddings, we found this to significantly reduce allocations,
 * memory footprint and overall performance.
 *
 * Embedding data is stored in pre-sorted arrays rather than structures which use a lot of pointers
 * (e.g. Map). A minimal set of lazily-constructed intermediate data is kept.
 *
 * Be wary of adding further `val` or `lazy val`s to this class; materializing and storing more data
 * on these objects could significantly affect in-memory cache performance.
 *
 * Also, if you are using this code in a place where you care about memory footprint, be careful
 * not to materialize any of the lazy vals unless you need them.
 */
sealed trait SimClustersEmbedding extends Equals {
  import SimClustersEmbedding._

  /**
   * Any compliant implementation of the SimClustersEmbedding trait must ensure that:
   * - the cluster and score arrays are ordered as described below
   * - the cluster and score arrays are treated as immutable (.hashCode is memoized)
   * - the size of all cluster and score arrays is the same
   * - all cluster scores are > 0
   * - cluster ids are unique
   */
  // In descending score order - this is useful for truncation, where we care most about the highest scoring elements
  private[simclusters_v2] val clusterIds: Array[ClusterId]
  private[simclusters_v2] val scores: Array[Double]
  // In ascending cluster order. This is useful for operations where we try to find the same cluster in another embedding, e.g. dot product
  private[simclusters_v2] val sortedClusterIds: Array[ClusterId]
  private[simclusters_v2] val sortedScores: Array[Double]

  /**
   * Build and return a Set of all clusters in this embedding
   */
  lazy val clusterIdSet: Set[ClusterId] = sortedClusterIds.toSet

  /**
   * Build and return Seq representation of this embedding
   */
  lazy val embedding: Seq[(ClusterId, Double)] =
    sortedClusterIds.zip(sortedScores).sortBy(-_._2).toSeq

  /**
   * Build and return a Map representation of this embedding
   */
  lazy val map: Map[ClusterId, Double] = sortedClusterIds.zip(sortedScores).toMap

  lazy val l1norm: Double = CosineSimilarityUtil.l1NormArray(sortedScores)

  lazy val l2norm: Double = CosineSimilarityUtil.normArray(sortedScores)

  lazy val logNorm: Double = CosineSimilarityUtil.logNormArray(sortedScores)

  lazy val expScaledNorm: Double =
    CosineSimilarityUtil.expScaledNormArray(sortedScores, DefaultExponent)

  /**
   * The L2 Normalized Embedding. Optimize for Cosine Similarity Calculation.
   */
  lazy val normalizedSortedScores: Array[Double] =
    CosineSimilarityUtil.applyNormArray(sortedScores, l2norm)

  lazy val logNormalizedSortedScores: Array[Double] =
    CosineSimilarityUtil.applyNormArray(sortedScores, logNorm)

  lazy val expScaledNormalizedSortedScores: Array[Double] =
    CosineSimilarityUtil.applyNormArray(sortedScores, expScaledNorm)

  /**
   * The Standard Deviation of an Embedding.
   */
  lazy val std: Double = {
    if (scores.isEmpty) {
      0.0
    } else {
      val sum = scores.sum
      val mean = sum / scores.length
      var variance: Double = 0.0
      for (i <- scores.indices) {
        val v = scores(i) - mean
        variance += (v * v)
      }
      math.sqrt(variance / scores.length)
    }
  }

  /**
   * Return the score of a given clusterId.
   */
  def get(clusterId: ClusterId): Option[Double] = {
    var i = 0
    while (i < sortedClusterIds.length) {
      val thisId = sortedClusterIds(i)
      if (clusterId == thisId) return Some(sortedScores(i))
      if (thisId > clusterId) return None
      i += 1
    }
    None
  }

  /**
   * Return the score of a given clusterId. If not exist, return default.
   */
  def getOrElse(clusterId: ClusterId, default: Double = 0.0): Double = {
    require(default >= 0.0)
    var i = 0
    while (i < sortedClusterIds.length) {
      val thisId = sortedClusterIds(i)
      if (clusterId == thisId) return sortedScores(i)
      if (thisId > clusterId) return default
      i += 1
    }
    default
  }

  /**
   * Return the cluster ids
   */
  def getClusterIds(): Array[ClusterId] = clusterIds

  /**
   * Return the cluster ids with the highest scores
   */
  def topClusterIds(size: Int): Seq[ClusterId] = clusterIds.take(size)

  /**
   * Return true if this embedding contains a given clusterId
   */
  def contains(clusterId: ClusterId): Boolean = clusterIdSet.contains(clusterId)

  def sum(another: SimClustersEmbedding): SimClustersEmbedding = {
    if (another.isEmpty) this
    else if (this.isEmpty) another
    else {
      var i1 = 0
      var i2 = 0
      val l = scala.collection.mutable.ArrayBuffer.empty[(Int, Double)]
      while (i1 < sortedClusterIds.length && i2 < another.sortedClusterIds.length) {
        if (sortedClusterIds(i1) == another.sortedClusterIds(i2)) {
          l += Tuple2(sortedClusterIds(i1), sortedScores(i1) + another.sortedScores(i2))
          i1 += 1
          i2 += 1
        } else if (sortedClusterIds(i1) > another.sortedClusterIds(i2)) {
          l += Tuple2(another.sortedClusterIds(i2), another.sortedScores(i2))
          // another cluster is lower. Increment it to see if the next one matches this's
          i2 += 1
        } else {
          l += Tuple2(sortedClusterIds(i1), sortedScores(i1))
          // this cluster is lower. Increment it to see if the next one matches anothers's
          i1 += 1
        }
      }
      if (i1 == sortedClusterIds.length && i2 != another.sortedClusterIds.length)
        // this was shorter. Prepend remaining elements from another
        l ++= another.sortedClusterIds.drop(i2).zip(another.sortedScores.drop(i2))
      else if (i1 != sortedClusterIds.length && i2 == another.sortedClusterIds.length)
        // another was shorter. Prepend remaining elements from this
        l ++= sortedClusterIds.drop(i1).zip(sortedScores.drop(i1))
      SimClustersEmbedding(l)
    }
  }

  def scalarMultiply(multiplier: Double): SimClustersEmbedding = {
    require(multiplier > 0.0, "SimClustersEmbedding.scalarMultiply requires multiplier > 0.0")
    DefaultSimClustersEmbedding(
      clusterIds,
      scores.map(_ * multiplier),
      sortedClusterIds,
      sortedScores.map(_ * multiplier)
    )
  }

  def scalarDivide(divisor: Double): SimClustersEmbedding = {
    require(divisor > 0.0, "SimClustersEmbedding.scalarDivide requires divisor > 0.0")
    DefaultSimClustersEmbedding(
      clusterIds,
      scores.map(_ / divisor),
      sortedClusterIds,
      sortedScores.map(_ / divisor)
    )
  }

  def dotProduct(another: SimClustersEmbedding): Double = {
    CosineSimilarityUtil.dotProductForSortedClusterAndScores(
      sortedClusterIds,
      sortedScores,
      another.sortedClusterIds,
      another.sortedScores)
  }

  def cosineSimilarity(another: SimClustersEmbedding): Double = {
    CosineSimilarityUtil.dotProductForSortedClusterAndScores(
      sortedClusterIds,
      normalizedSortedScores,
      another.sortedClusterIds,
      another.normalizedSortedScores)
  }

  def logNormCosineSimilarity(another: SimClustersEmbedding): Double = {
    CosineSimilarityUtil.dotProductForSortedClusterAndScores(
      sortedClusterIds,
      logNormalizedSortedScores,
      another.sortedClusterIds,
      another.logNormalizedSortedScores)
  }

  def expScaledCosineSimilarity(another: SimClustersEmbedding): Double = {
    CosineSimilarityUtil.dotProductForSortedClusterAndScores(
      sortedClusterIds,
      expScaledNormalizedSortedScores,
      another.sortedClusterIds,
      another.expScaledNormalizedSortedScores)
  }

  /**
   * Return true if this is an empty embedding
   */
  def isEmpty: Boolean = sortedClusterIds.isEmpty

  /**
   * Return the Jaccard Similarity Score between two embeddings.
   * Note: this implementation should be optimized if we start to use it in production
   */
  def jaccardSimilarity(another: SimClustersEmbedding): Double = {
    if (this.isEmpty || another.isEmpty) {
      0.0
    } else {
      val intersect = clusterIdSet.intersect(another.clusterIdSet).size
      val union = clusterIdSet.union(another.clusterIdSet).size
      intersect.toDouble / union
    }
  }

  /**
   * Return the Fuzzy Jaccard Similarity Score between two embeddings.
   * Treat each Simclusters embedding as fuzzy set, calculate the fuzzy set similarity
   * metrics of two embeddings
   *
   * Paper 2.2.1: https://openreview.net/pdf?id=SkxXg2C5FX
   */
  def fuzzyJaccardSimilarity(another: SimClustersEmbedding): Double = {
    if (this.isEmpty || another.isEmpty) {
      0.0
    } else {
      val v1C = sortedClusterIds
      val v1S = sortedScores
      val v2C = another.sortedClusterIds
      val v2S = another.sortedScores

      require(v1C.length == v1S.length)
      require(v2C.length == v2S.length)

      var i1 = 0
      var i2 = 0
      var numerator = 0.0
      var denominator = 0.0

      while (i1 < v1C.length && i2 < v2C.length) {
        if (v1C(i1) == v2C(i2)) {
          numerator += min(v1S(i1), v2S(i2))
          denominator += max(v1S(i1), v2S(i2))
          i1 += 1
          i2 += 1
        } else if (v1C(i1) > v2C(i2)) {
          denominator += v2S(i2)
          i2 += 1
        } else {
          denominator += v1S(i1)
          i1 += 1
        }
      }

      while (i1 < v1C.length) {
        denominator += v1S(i1)
        i1 += 1
      }
      while (i2 < v2C.length) {
        denominator += v2S(i2)
        i2 += 1
      }

      numerator / denominator
    }
  }

  /**
   * Return the Euclidean Distance Score between two embeddings.
   * Note: this implementation should be optimized if we start to use it in production
   */
  def euclideanDistance(another: SimClustersEmbedding): Double = {
    val unionClusters = clusterIdSet.union(another.clusterIdSet)
    val variance = unionClusters.foldLeft(0.0) {
      case (sum, clusterId) =>
        val distance = math.abs(this.getOrElse(clusterId) - another.getOrElse(clusterId))
        sum + distance * distance
    }
    math.sqrt(variance)
  }

  /**
   * Return the Manhattan Distance Score between two embeddings.
   * Note: this implementation should be optimized if we start to use it in production
   */
  def manhattanDistance(another: SimClustersEmbedding): Double = {
    val unionClusters = clusterIdSet.union(another.clusterIdSet)
    unionClusters.foldLeft(0.0) {
      case (sum, clusterId) =>
        sum + math.abs(this.getOrElse(clusterId) - another.getOrElse(clusterId))
    }
  }

  /**
   * Return the number of overlapping clusters between two embeddings.
   */
  def overlappingClusters(another: SimClustersEmbedding): Int = {
    var i1 = 0
    var i2 = 0
    var count = 0

    while (i1 < sortedClusterIds.length && i2 < another.sortedClusterIds.length) {
      if (sortedClusterIds(i1) == another.sortedClusterIds(i2)) {
        count += 1
        i1 += 1
        i2 += 1
      } else if (sortedClusterIds(i1) > another.sortedClusterIds(i2)) {
        // v2 cluster is lower. Increment it to see if the next one matches v1's
        i2 += 1
      } else {
        // v1 cluster is lower. Increment it to see if the next one matches v2's
        i1 += 1
      }
    }
    count
  }

  /**
   * Return the largest product cluster scores
   */
  def maxElementwiseProduct(another: SimClustersEmbedding): Double = {
    var i1 = 0
    var i2 = 0
    var maxProduct: Double = 0.0

    while (i1 < sortedClusterIds.length && i2 < another.sortedClusterIds.length) {
      if (sortedClusterIds(i1) == another.sortedClusterIds(i2)) {
        val product = sortedScores(i1) * another.sortedScores(i2)
        if (product > maxProduct) maxProduct = product
        i1 += 1
        i2 += 1
      } else if (sortedClusterIds(i1) > another.sortedClusterIds(i2)) {
        // v2 cluster is lower. Increment it to see if the next one matches v1's
        i2 += 1
      } else {
        // v1 cluster is lower. Increment it to see if the next one matches v2's
        i1 += 1
      }
    }
    maxProduct
  }

  /**
   * Return a new SimClustersEmbedding with Max Embedding Size.
   *
   * Prefer to truncate on embedding construction where possible. Doing so is cheaper.
   */
  def truncate(size: Int): SimClustersEmbedding = {
    if (clusterIds.length <= size) {
      this
    } else {
      val truncatedClusterIds = clusterIds.take(size)
      val truncatedScores = scores.take(size)
      val (sortedClusterIds, sortedScores) =
        truncatedClusterIds.zip(truncatedScores).sortBy(_._1).unzip

      DefaultSimClustersEmbedding(
        truncatedClusterIds,
        truncatedScores,
        sortedClusterIds,
        sortedScores)
    }
  }

  def toNormalized: SimClustersEmbedding = {
    // Additional safety check. Only EmptyEmbedding's l2norm is 0.0.
    if (l2norm == 0.0) {
      EmptyEmbedding
    } else {
      this.scalarDivide(l2norm)
    }
  }

  implicit def toThrift: ThriftSimClustersEmbedding = {
    ThriftSimClustersEmbedding(
      embedding.map {
        case (clusterId, score) =>
          SimClusterWithScore(clusterId, score)
      }
    )
  }

  def canEqual(a: Any): Boolean = a.isInstanceOf[SimClustersEmbedding]

  /* We define equality as having the same clusters and scores.
   * This implementation is arguably incorrect in this case:
   *   (1 -> 1.0, 2 -> 0.0) == (1 -> 1.0)  // equals returns false
   * However, compliant implementations of SimClustersEmbedding should not include zero-weight
   * clusters, so this implementation should work correctly.
   */
  override def equals(that: Any): Boolean =
    that match {
      case that: SimClustersEmbedding =>
        that.canEqual(this) &&
          this.sortedClusterIds.sameElements(that.sortedClusterIds) &&
          this.sortedScores.sameElements(that.sortedScores)
      case _ => false
    }

  /**
   * hashcode implementation based on the contents of the embedding. As a lazy val, this relies on
   * the embedding contents being immutable.
   */
  override lazy val hashCode: Int = {
    /* Arrays uses object id as hashCode, so different arrays with the same contents hash
     * differently. To provide a stable hash code, we take the same approach as how a
     * `case class(clusters: Seq[Int], scores: Seq[Double])` would be hashed. See
     * ScalaRunTime._hashCode and MurmurHash3.productHash
     * https://github.com/scala/scala/blob/2.12.x/src/library/scala/runtime/ScalaRunTime.scala#L167
     * https://github.com/scala/scala/blob/2.12.x/src/library/scala/util/hashing/MurmurHash3.scala#L64
     *
     * Note that the hashcode is arguably incorrect in this case:
     *   (1 -> 1.0, 2 -> 0.0).hashcode == (1 -> 1.0).hashcode  // returns false
     * However, compliant implementations of SimClustersEmbedding should not include zero-weight
     * clusters, so this implementation should work correctly.
     */
    productHash((arrayHash(sortedClusterIds), arrayHash(sortedScores)))
  }
}

object SimClustersEmbedding {
  val EmptyEmbedding: SimClustersEmbedding =
    DefaultSimClustersEmbedding(Array.empty, Array.empty, Array.empty, Array.empty)

  val DefaultExponent: Double = 0.3

  // Descending by score then ascending by ClusterId
  implicit val order: Ordering[(ClusterId, Double)] =
    (a: (ClusterId, Double), b: (ClusterId, Double)) => {
      b._2 compare a._2 match {
        case 0 => a._1 compare b._1
        case c => c
      }
    }

  /**
   * Constructors
   *
   * These constructors:
   * - do not make assumptions about the ordering of the cluster/scores.
   * - do assume that cluster ids are unique
   * - ignore (drop) any cluster whose score is <= 0
   */
  def apply(embedding: (ClusterId, Double)*): SimClustersEmbedding =
    buildDefaultSimClustersEmbedding(embedding)

  def apply(embedding: Iterable[(ClusterId, Double)]): SimClustersEmbedding =
    buildDefaultSimClustersEmbedding(embedding)

  def apply(embedding: Iterable[(ClusterId, Double)], size: Int): SimClustersEmbedding =
    buildDefaultSimClustersEmbedding(embedding, truncate = Some(size))

  implicit def apply(thriftEmbedding: ThriftSimClustersEmbedding): SimClustersEmbedding =
    buildDefaultSimClustersEmbedding(thriftEmbedding.embedding.map(_.toTuple))

  def apply(thriftEmbedding: ThriftSimClustersEmbedding, truncate: Int): SimClustersEmbedding =
    buildDefaultSimClustersEmbedding(
      thriftEmbedding.embedding.map(_.toTuple),
      truncate = Some(truncate))

  private def buildDefaultSimClustersEmbedding(
    embedding: Iterable[(ClusterId, Double)],
    truncate: Option[Int] = None
  ): SimClustersEmbedding = {
    val truncatedIdAndScores = {
      val idsAndScores = embedding.filter(_._2 > 0.0).toArray.sorted(order)
      truncate match {
        case Some(t) => idsAndScores.take(t)
        case _ => idsAndScores
      }
    }

    if (truncatedIdAndScores.isEmpty) {
      EmptyEmbedding
    } else {
      val (clusterIds, scores) = truncatedIdAndScores.unzip
      val (sortedClusterIds, sortedScores) = truncatedIdAndScores.sortBy(_._1).unzip
      DefaultSimClustersEmbedding(clusterIds, scores, sortedClusterIds, sortedScores)
    }
  }

  /** ***** Aggregation Methods ******/
  /**
   * A high performance version of Sum a list of SimClustersEmbeddings.
   * Suggest using in Online Services to avoid the unnecessary GC.
   * For offline or streaming. Please check [[SimClustersEmbeddingMonoid]]
   */
  def sum(simClustersEmbeddings: Iterable[SimClustersEmbedding]): SimClustersEmbedding = {
    if (simClustersEmbeddings.isEmpty) {
      EmptyEmbedding
    } else {
      val sum = simClustersEmbeddings.foldLeft(mutable.Map[ClusterId, Double]()) {
        (sum, embedding) =>
          for (i <- embedding.sortedClusterIds.indices) {
            val clusterId = embedding.sortedClusterIds(i)
            sum.put(clusterId, embedding.sortedScores(i) + sum.getOrElse(clusterId, 0.0))
          }
          sum
      }
      SimClustersEmbedding(sum)
    }
  }

  /**
   * Support a fixed size SimClustersEmbedding Sum
   */
  def sum(
    simClustersEmbeddings: Iterable[SimClustersEmbedding],
    maxSize: Int
  ): SimClustersEmbedding = {
    sum(simClustersEmbeddings).truncate(maxSize)
  }

  /**
   * A high performance version of Mean a list of SimClustersEmbeddings.
   * Suggest using in Online Services to avoid the unnecessary GC.
   */
  def mean(simClustersEmbeddings: Iterable[SimClustersEmbedding]): SimClustersEmbedding = {
    if (simClustersEmbeddings.isEmpty) {
      EmptyEmbedding
    } else {
      sum(simClustersEmbeddings).scalarDivide(simClustersEmbeddings.size)
    }
  }

  /**
   * Support a fixed size SimClustersEmbedding Mean
   */
  def mean(
    simClustersEmbeddings: Iterable[SimClustersEmbedding],
    maxSize: Int
  ): SimClustersEmbedding = {
    mean(simClustersEmbeddings).truncate(maxSize)
  }
}

case class DefaultSimClustersEmbedding(
  override val clusterIds: Array[ClusterId],
  override val scores: Array[Double],
  override val sortedClusterIds: Array[ClusterId],
  override val sortedScores: Array[Double])
    extends SimClustersEmbedding {

  override def toString: String =
    s"DefaultSimClustersEmbedding(${clusterIds.zip(scores).mkString(",")})"
}

object DefaultSimClustersEmbedding {
  // To support existing code which builds embeddings from a Seq
  def apply(embedding: Seq[(ClusterId, Double)]): SimClustersEmbedding = SimClustersEmbedding(
    embedding)
}