mirror of
https://github.com/twitter/the-algorithm.git
synced 2024-06-01 00:38:46 +02:00
ef4c5eb65e
Please note we have force-pushed a new initial commit in order to remove some publicly-available Twitter user information. Note that this process may be required in the future.
34 lines
1.4 KiB
Scala
34 lines
1.4 KiB
Scala
package com.twitter.simclusters_v2.common.clustering
|
|
|
|
/**
 * Clustering method that groups entities by the single embedding dimension carrying the
 * largest score.
 */
class LargestDimensionClusteringMethod extends ClusteringMethod {

  /**
   * Clusters the given entities by delegating to a [[ConnectedComponentsClusteringMethod]].
   *
   * @param embeddings         map from entity ID to that entity's embedding
   * @param similarityFn       function expected to return a discrete value (0.0 or 1.0):
   *                           1.0 when the dimension with the highest score (weight) is the
   *                           same in both embeddings, 0.0 otherwise.
   *                           e.g.
   *                           case 1: E1=[0.0, 0.1, 0.6, 0.2], E2=[0.1, 0.3, 0.8, 0.0]. similarityFn(E1, E2)=1.0
   *                           case 2: E1=[0.0, 0.1, 0.6, 0.2], E2=[0.1, 0.4, 0.2, 0.0]. similarityFn(E1, E2)=0.0
   * @param recordStatCallback callback taking a name and a value; handed unchanged to the
   *                           delegate clustering method
   * @tparam T embedding type. e.g. SimClustersEmbedding
   *
   * @return A set of sets of entity IDs, each set representing a distinct cluster.
   */
  override def cluster[T](
    embeddings: Map[Long, T],
    similarityFn: (T, T) => Double,
    recordStatCallback: (String, Long) => Unit
  ): Set[Set[Long]] = {
    // The actual grouping is done by connected-component clustering.
    // The threshold is 0.1 because that is larger than 0.0, which is what similarityFn
    // returns when two embeddings don't share the largest dimension.
    val connectedComponents = new ConnectedComponentsClusteringMethod(similarityThreshold = 0.1)
    connectedComponents.cluster(embeddings, similarityFn, recordStatCallback)
  }
}
|