the-algorithm/src/scala/com/twitter/simclusters_v2/common/clustering/LargestDimensionClusteringMethod.scala
twitter-team ef4c5eb65e Twitter Recommendation Algorithm
Please note we have force-pushed a new initial commit in order to remove some publicly-available Twitter user information. Note that this process may be required in the future.
2023-03-31 17:36:31 -05:00

34 lines
1.4 KiB
Scala

package com.twitter.simclusters_v2.common.clustering
/**
* Groups entities by a single embedding dimension with the largest score.
*/
class LargestDimensionClusteringMethod extends ClusteringMethod {
/**
* @param embeddings map of entity IDs and corresponding embeddings
* @param similarityFn function that outputs discrete value (0.0 or 1.0).
* 1.0 if the dimensions of the highest score (weight) from two given embeddings match.
* 0.0 otherwise.
* e.g.
* case 1: E1=[0.0, 0.1, 0.6, 0.2], E2=[0.1, 0.3, 0.8, 0.0]. similarityFn(E1, E2)=1.0
* case 2: E1=[0.0, 0.1, 0.6, 0.2], E2=[0.1, 0.4, 0.2, 0.0]. similarityFn(E1, E2)=0.0
* @tparam T embedding type. e.g. SimClustersEmbedding
*
* @return A set of sets of entity IDs, each set representing a distinct cluster.
*/
override def cluster[T](
embeddings: Map[Long, T],
similarityFn: (T, T) => Double,
recordStatCallback: (String, Long) => Unit
): Set[Set[Long]] = {
// rely on clustering by connected component.
// similarityThreshold=0.1 because it's larger than 0.0 (similarityFn returns 0.0 if two embeddings
// don't share the largest dimension.
new ConnectedComponentsClusteringMethod(similarityThreshold = 0.1)
.cluster(embeddings, similarityFn, recordStatCallback)
}
}