Mirror of https://github.com/twitter/the-algorithm.git, synced 2024-11-16 00:25:11 +01:00
[docx] split commit for file 5000
Signed-off-by: Ari Archer <ari.web.xyz@gmail.com>
parent c4b4b821a3
commit 2f5f511bb8
Binary file not shown.
@ -1,32 +0,0 @@
package com.twitter.simclusters_v2.common

import com.twitter.simclusters_v2.common.SimClustersMultiEmbeddingId._
import com.twitter.simclusters_v2.thriftscala.SimClustersMultiEmbedding.{Ids, Values}
import com.twitter.simclusters_v2.thriftscala.{
  SimClustersMultiEmbedding,
  SimClustersEmbeddingId,
  SimClustersMultiEmbeddingId
}

/**
 * Helper methods for SimClustersMultiEmbedding
 */
object SimClustersMultiEmbedding {

  // Convert a multiEmbedding to a list of (embeddingId, score)
  def toSimClustersEmbeddingIdWithScores(
    simClustersMultiEmbeddingId: SimClustersMultiEmbeddingId,
    simClustersMultiEmbedding: SimClustersMultiEmbedding
  ): Seq[(SimClustersEmbeddingId, Double)] = {
    simClustersMultiEmbedding match {
      case Values(values) =>
        values.embeddings.zipWithIndex.map {
          case (embeddingWithScore, i) =>
            (toEmbeddingId(simClustersMultiEmbeddingId, i), embeddingWithScore.score)
        }
      case Ids(ids) =>
        ids.ids.map(_.toTuple)
    }
  }

}
Binary file not shown.
@ -1,96 +0,0 @@
package com.twitter.simclusters_v2.common

import com.twitter.simclusters_v2.thriftscala.{
  EmbeddingType,
  InternalId,
  MultiEmbeddingType,
  TopicId,
  TopicSubId,
  SimClustersEmbeddingId => ThriftEmbeddingId,
  SimClustersMultiEmbeddingId => ThriftMultiEmbeddingId
}

/**
 * Helper methods for SimClustersMultiEmbeddingId
 */
object SimClustersMultiEmbeddingId {

  private val MultiEmbeddingTypeToEmbeddingType: Map[MultiEmbeddingType, EmbeddingType] =
    Map(
      MultiEmbeddingType.LogFavApeBasedMuseTopic -> EmbeddingType.LogFavApeBasedMuseTopic,
      MultiEmbeddingType.TwiceUserInterestedIn -> EmbeddingType.TwiceUserInterestedIn,
    )

  private val EmbeddingTypeToMultiEmbeddingType: Map[EmbeddingType, MultiEmbeddingType] =
    MultiEmbeddingTypeToEmbeddingType.map(_.swap)

  def toEmbeddingType(multiEmbeddingType: MultiEmbeddingType): EmbeddingType = {
    MultiEmbeddingTypeToEmbeddingType.getOrElse(
      multiEmbeddingType,
      throw new IllegalArgumentException(s"Invalid type: $multiEmbeddingType"))
  }

  def toMultiEmbeddingType(embeddingType: EmbeddingType): MultiEmbeddingType = {
    EmbeddingTypeToMultiEmbeddingType.getOrElse(
      embeddingType,
      throw new IllegalArgumentException(s"Invalid type: $embeddingType")
    )
  }

  /**
   * Convert a SimClusters Multi-Embedding Id and SubId to SimClusters Embedding Id.
   */
  def toEmbeddingId(
    simClustersMultiEmbeddingId: ThriftMultiEmbeddingId,
    subId: Int
  ): ThriftEmbeddingId = {
    val internalId = simClustersMultiEmbeddingId.internalId match {
      case InternalId.TopicId(topicId) =>
        InternalId.TopicSubId(
          TopicSubId(topicId.entityId, topicId.language, topicId.country, subId))
      case _ =>
        throw new IllegalArgumentException(
          s"Invalid simClusters InternalId ${simClustersMultiEmbeddingId.internalId}")
    }
    ThriftEmbeddingId(
      toEmbeddingType(simClustersMultiEmbeddingId.embeddingType),
      simClustersMultiEmbeddingId.modelVersion,
      internalId
    )
  }

  /**
   * Fetch a subId from a SimClusters EmbeddingId.
   */
  def toSubId(simClustersEmbeddingId: ThriftEmbeddingId): Int = {
    simClustersEmbeddingId.internalId match {
      case InternalId.TopicSubId(topicSubId) =>
        topicSubId.subId
      case _ =>
        throw new IllegalArgumentException(
          s"Invalid SimClustersEmbeddingId InternalId type, $simClustersEmbeddingId")
    }
  }

  /**
   * Convert a SimClustersEmbeddingId to SimClustersMultiEmbeddingId.
   * Only support the Multi embedding based EmbeddingTypes.
   */
  def toMultiEmbeddingId(
    simClustersEmbeddingId: ThriftEmbeddingId
  ): ThriftMultiEmbeddingId = {
    simClustersEmbeddingId.internalId match {
      case InternalId.TopicSubId(topicSubId) =>
        ThriftMultiEmbeddingId(
          toMultiEmbeddingType(simClustersEmbeddingId.embeddingType),
          simClustersEmbeddingId.modelVersion,
          InternalId.TopicId(TopicId(topicSubId.entityId, topicSubId.language, topicSubId.country))
        )

      case _ =>
        throw new IllegalArgumentException(
          s"Invalid SimClustersEmbeddingId InternalId type, $simClustersEmbeddingId")
    }
  }

}
@ -1,11 +0,0 @@
scala_library(
    compiler_option_sets = ["fatal_warnings"],
    platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        "eventdetection/common/src/main/java/com/twitter/eventdetection/common/louvain",
        "eventdetection/common/src/main/java/com/twitter/eventdetection/common/model",
        "src/java/com/twitter/sbf/graph",
        "src/scala/com/twitter/simclusters_v2/scalding/common",
    ],
)
Binary file not shown.
Binary file not shown.
@ -1,30 +0,0 @@
package com.twitter.simclusters_v2.common.clustering

import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.thriftscala.NeighborWithWeights

/**
 * Select a cluster member as cluster representative.
 */
trait ClusterRepresentativeSelectionMethod[T] {

  /**
   * The main external-facing method. Sub-classes should implement this method.
   *
   * @param cluster A set of NeighborWithWeights.
   * @param embeddings A map of producer ID -> embedding.
   *
   * @return UserId of the member chosen as representative.
   */
  def selectClusterRepresentative(
    cluster: Set[NeighborWithWeights],
    embeddings: Map[UserId, T]
  ): UserId

}

object ClusterRepresentativeSelectionStatistics {

  // Statistics, to be imported where recorded.
  val StatClusterRepresentativeSelectionTime = "cluster_representative_selection_total_time_ms"
}
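
Note (illustrative sketch, not part of the deleted file above): the trait leaves the selection strategy entirely to subclasses. A hypothetical implementation that ignores the embeddings map and simply picks the member with the smallest user id would look like this:

import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.common.clustering.ClusterRepresentativeSelectionMethod
import com.twitter.simclusters_v2.thriftscala.NeighborWithWeights

// Hypothetical example class, written only to illustrate the trait's contract.
class SmallestIdRepresentativeSelectionMethod[T] extends ClusterRepresentativeSelectionMethod[T] {
  def selectClusterRepresentative(
    cluster: Set[NeighborWithWeights],
    embeddings: Map[UserId, T]
  ): UserId =
    // embeddings is unused here; real selectors (medoid, max fav score) do consult it
    cluster.map(_.neighborId).min
}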
Binary file not shown.
@ -1,34 +0,0 @@
package com.twitter.simclusters_v2.common.clustering

/**
 * Partitions a set of entities into clusters.
 * NOTE: The selection/construction of the cluster representatives (e.g. medoid, random, average) is implemented in ClusterRepresentativeSelectionMethod.scala
 */
trait ClusteringMethod {

  /**
   * The main external-facing method. Sub-classes should implement this method.
   *
   * @param embeddings map of entity IDs and corresponding embeddings
   * @param similarityFn function that outputs similarity (>=0, the larger, more similar), given two embeddings
   * @tparam T embedding type. e.g. SimClustersEmbedding
   *
   * @return A set of sets of entity IDs, each set representing a distinct cluster.
   */
  def cluster[T](
    embeddings: Map[Long, T],
    similarityFn: (T, T) => Double,
    recordStatCallback: (String, Long) => Unit = (_, _) => ()
  ): Set[Set[Long]]

}

object ClusteringStatistics {

  // Statistics, to be imported where recorded.
  val StatSimilarityGraphTotalBuildTime = "similarity_graph_total_build_time_ms"
  val StatClusteringAlgorithmRunTime = "clustering_algorithm_total_run_time_ms"
  val StatMedoidSelectionTime = "medoid_selection_total_time_ms"
  val StatComputedSimilarityBeforeFilter = "computed_similarity_before_filter"

}
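
Note (illustrative sketch, not from this commit): any clustering strategy only has to honor the cluster[T] contract above. A trivial implementation that puts every entity into a single cluster and reports its size through recordStatCallback (the stat name is made up for this sketch) could read:

import com.twitter.simclusters_v2.common.clustering.ClusteringMethod

// Hypothetical example class: the simplest possible ClusteringMethod.
class SingleClusterMethod extends ClusteringMethod {
  override def cluster[T](
    embeddings: Map[Long, T],
    similarityFn: (T, T) => Double,
    recordStatCallback: (String, Long) => Unit
  ): Set[Set[Long]] = {
    // "single_cluster_size" is an invented stat name for illustration only
    recordStatCallback("single_cluster_size", embeddings.size.toLong)
    if (embeddings.isEmpty) Set.empty else Set(embeddings.keySet)
  }
}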
Binary file not shown.
@ -1,67 +0,0 @@
package com.twitter.simclusters_v2.common.clustering

import com.twitter.sbf.graph.ConnectedComponents
import com.twitter.sbf.graph.Graph
import com.twitter.util.Stopwatch
import it.unimi.dsi.fastutil.ints.IntSet
import scala.collection.SortedMap
import scala.jdk.CollectionConverters._

/**
 * Aggregate entities into clusters such that a cluster contains all embeddings with a similarity
 * above a configurable threshold to any other embedding.
 *
 * @param similarityThreshold: When building the edges between entities, edges with weight
 *                             less than or equal to this threshold will be filtered out.
 */
class ConnectedComponentsClusteringMethod(
  similarityThreshold: Double)
    extends ClusteringMethod {

  import ClusteringStatistics._

  def cluster[T](
    embeddings: Map[Long, T],
    similarityFn: (T, T) => Double,
    recordStatCallback: (String, Long) => Unit = (_, _) => ()
  ): Set[Set[Long]] = {

    val timeSinceGraphBuildStart = Stopwatch.start()
    // com.twitter.sbf.graph.Graph expects neighbors to be sorted in ascending order.
    val sourcesById = SortedMap(embeddings.zipWithIndex.map {
      case (source, idx) => idx -> source
    }.toSeq: _*)

    val neighbours = sourcesById.map {
      case (srcIdx, (_, src)) =>
        sourcesById
          .collect {
            case (dstIdx, (_, dst)) if srcIdx != dstIdx => // avoid self-edges
              val similarity = similarityFn(src, dst)
              recordStatCallback(
                StatComputedSimilarityBeforeFilter,
                (similarity * 100).toLong // preserve up to two decimal points
              )
              if (similarity > similarityThreshold)
                Some(dstIdx)
              else None
          }.flatten.toArray
    }.toArray

    recordStatCallback(StatSimilarityGraphTotalBuildTime, timeSinceGraphBuildStart().inMilliseconds)

    val timeSinceClusteringAlgRunStart = Stopwatch.start()
    val nEdges = neighbours.map(_.length).sum / 2 // Graph expects count of undirected edges
    val graph = new Graph(sourcesById.size, nEdges, neighbours)

    val clusters = ConnectedComponents
      .connectedComponents(graph).asScala.toSet
      .map { i: IntSet => i.asScala.map(sourcesById(_)._1).toSet }

    recordStatCallback(
      StatClusteringAlgorithmRunTime,
      timeSinceClusteringAlgRunStart().inMilliseconds)

    clusters
  }
}
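
Note (illustrative sketch, not from this commit): because cluster[T] is generic in the embedding type, the method can be exercised with plain dense vectors and a hand-written cosine similarity. With the threshold below, entities 1 and 2 land in one component while entity 3 stays alone; the data and threshold are invented for this example.

import com.twitter.simclusters_v2.common.clustering.ConnectedComponentsClusteringMethod

object ConnectedComponentsSketch {
  // plain cosine similarity over dense vectors (assumes non-zero vectors)
  def cosine(a: Array[Double], b: Array[Double]): Double = {
    val dot = a.zip(b).map { case (x, y) => x * y }.sum
    val norm = math.sqrt(a.map(x => x * x).sum) * math.sqrt(b.map(x => x * x).sum)
    if (norm == 0.0) 0.0 else dot / norm
  }

  def main(args: Array[String]): Unit = {
    val embeddings: Map[Long, Array[Double]] = Map(
      1L -> Array(1.0, 0.0),
      2L -> Array(0.9, 0.1),
      3L -> Array(0.0, 1.0)
    )
    val clusters = new ConnectedComponentsClusteringMethod(similarityThreshold = 0.8)
      .cluster(embeddings, cosine)
    println(clusters) // expected: Set(Set(1, 2), Set(3))
  }
}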
Binary file not shown.
@ -1,33 +0,0 @@
package com.twitter.simclusters_v2.common.clustering

/**
 * Groups entities by a single embedding dimension with the largest score.
 */
class LargestDimensionClusteringMethod extends ClusteringMethod {

  /**
   * @param embeddings map of entity IDs and corresponding embeddings
   * @param similarityFn function that outputs discrete value (0.0 or 1.0).
   *                     1.0 if the dimensions of the highest score (weight) from two given embeddings match.
   *                     0.0 otherwise.
   *                     e.g.
   *                     case 1: E1=[0.0, 0.1, 0.6, 0.2], E2=[0.1, 0.3, 0.8, 0.0]. similarityFn(E1, E2)=1.0
   *                     case 2: E1=[0.0, 0.1, 0.6, 0.2], E2=[0.1, 0.4, 0.2, 0.0]. similarityFn(E1, E2)=0.0
   * @tparam T embedding type. e.g. SimClustersEmbedding
   *
   * @return A set of sets of entity IDs, each set representing a distinct cluster.
   */
  override def cluster[T](
    embeddings: Map[Long, T],
    similarityFn: (T, T) => Double,
    recordStatCallback: (String, Long) => Unit
  ): Set[Set[Long]] = {

    // Rely on clustering by connected components.
    // similarityThreshold=0.1 because it's larger than 0.0 (similarityFn returns 0.0 if two
    // embeddings don't share the largest dimension).
    new ConnectedComponentsClusteringMethod(similarityThreshold = 0.1)
      .cluster(embeddings, similarityFn, recordStatCallback)
  }

}
Binary file not shown.
@ -1,236 +0,0 @@
package com.twitter.simclusters_v2.common.clustering

import com.twitter.eventdetection.common.louvain.LouvainDriver
import com.twitter.eventdetection.common.louvain.NetworkFactory
import com.twitter.eventdetection.common.model.Entity
import com.twitter.eventdetection.common.model.NetworkInput
import com.twitter.eventdetection.common.model.TextEntityValue
import com.twitter.util.Stopwatch
import scala.collection.JavaConverters._
import scala.math.max

/**
 * Groups entities by the Louvain clustering method.
 * @param similarityThreshold: When building the edges between entities, edges with weight
 *                             less than or equal to this threshold will be filtered out.
 * @param appliedResolutionFactor: If present, will be used to multiply the applied resolution
 *                                 parameter of the Louvain method by this factor.
 *                                 Note that the DEFAULT_MAX_RESOLUTION will not be applied.
 */
class LouvainClusteringMethod(
  similarityThreshold: Double,
  appliedResolutionFactor: Option[Double])
    extends ClusteringMethod {

  import ClusteringStatistics._

  def cluster[T](
    embeddings: Map[Long, T],
    similarityFn: (T, T) => Double,
    recordStatCallback: (String, Long) => Unit = (_, _) => ()
  ): Set[Set[Long]] = {

    // 1. Build the graph on which to run Louvain:
    //    - Weigh edges by the similarity between the 2 embeddings,
    //    - Filter out edges with weight <= threshold.
    val timeSinceGraphBuildStart = Stopwatch.start()
    val edges: Seq[((Long, Long), Double)] = embeddings.toSeq
      .combinations(2)
      .map { pair: Seq[(Long, T)] => // pair of 2
        val (user1, embedding1) = pair.head
        val (user2, embedding2) = pair(1)
        val similarity = similarityFn(embedding1, embedding2)

        recordStatCallback(
          StatComputedSimilarityBeforeFilter,
          (similarity * 100).toLong // preserve up to two decimal places
        )

        ((user1, user2), similarity)
      }
      .filter(_._2 > similarityThreshold)
      .toSeq

    recordStatCallback(StatSimilarityGraphTotalBuildTime, timeSinceGraphBuildStart().inMilliseconds)

    // check if some entities do not have any incoming / outgoing edge
    // these are size-1 clusters (i.e. their own)
    val individualClusters: Set[Long] = embeddings.keySet -- edges.flatMap {
      case ((user1, user2), _) => Set(user1, user2)
    }.toSet

    // 2. LouvainDriver uses "Entity" as input, so build 2 mappings
    //    - Long (entity id) -> Entity
    //    - Entity -> Long (entity id)
    val embeddingIdToEntity: Map[Long, Entity] = embeddings.map {
      case (id, _) => id -> Entity(TextEntityValue(id.toString, Some(id.toString)), None)
    }
    val entityToEmbeddingId: Map[Entity, Long] = embeddingIdToEntity.map {
      case (id, e) => e -> id
    }

    // 3. Create the list of NetworkInput on which to run LouvainDriver
    val networkInputList = edges
      .map {
        case ((fromUserId: Long, toUserId: Long), weight: Double) =>
          new NetworkInput(embeddingIdToEntity(fromUserId), embeddingIdToEntity(toUserId), weight)
      }.toList.asJava

    val timeSinceClusteringAlgRunStart = Stopwatch.start()
    val networkDictionary = NetworkFactory.buildDictionary(networkInputList)
    val network = NetworkFactory.buildNetwork(networkInputList, networkDictionary)

    if (networkInputList.size() == 0) {
      // handle case if no edge at all (only one entity or all entities are too far apart)
      embeddings.keySet.map(e => Set(e))
    } else {
      // 4. Run clustering algorithm
      val clusteredIds = appliedResolutionFactor match {
        case Some(res) =>
          LouvainDriver.clusterAppliedResolutionFactor(network, networkDictionary, res)
        case None => LouvainDriver.cluster(network, networkDictionary)
      }

      recordStatCallback(
        StatClusteringAlgorithmRunTime,
        timeSinceClusteringAlgRunStart().inMilliseconds)

      // 5. Post-processing
      val atLeast2MembersClusters: Set[Set[Long]] = clusteredIds.asScala
        .groupBy(_._2)
        .mapValues(_.map { case (e, _) => entityToEmbeddingId(e) }.toSet)
        .values.toSet

      atLeast2MembersClusters ++ individualClusters.map { e => Set(e) }

    }
  }

  def clusterWithSilhouette[T](
    embeddings: Map[Long, T],
    similarityFn: (T, T) => Double,
    similarityFnForSil: (T, T) => Double,
    recordStatCallback: (String, Long) => Unit = (_, _) => ()
  ): (Set[Set[Long]], Set[Set[(Long, Double)]]) = {

    // 1. Build the graph on which to run Louvain:
    //    - Weigh edges by the similarity between the 2 embeddings,
    //    - Filter out edges with weight <= threshold.
    val timeSinceGraphBuildStart = Stopwatch.start()
    val edgesSimilarityMap = collection.mutable.Map[(Long, Long), Double]()

    val edges: Seq[((Long, Long), Double)] = embeddings.toSeq
      .combinations(2)
      .map { pair: Seq[(Long, T)] => // pair of 2
        val (user1, embedding1) = pair.head
        val (user2, embedding2) = pair(1)
        val similarity = similarityFn(embedding1, embedding2)
        val similarityForSil = similarityFnForSil(embedding1, embedding2)
        edgesSimilarityMap.put((user1, user2), similarityForSil)
        edgesSimilarityMap.put((user2, user1), similarityForSil)

        recordStatCallback(
          StatComputedSimilarityBeforeFilter,
          (similarity * 100).toLong // preserve up to two decimal places
        )

        ((user1, user2), similarity)
      }
      .filter(_._2 > similarityThreshold)
      .toSeq

    recordStatCallback(StatSimilarityGraphTotalBuildTime, timeSinceGraphBuildStart().inMilliseconds)

    // check if some entities do not have any incoming / outgoing edge
    // these are size-1 clusters (i.e. their own)
    val individualClusters: Set[Long] = embeddings.keySet -- edges.flatMap {
      case ((user1, user2), _) => Set(user1, user2)
    }.toSet

    // 2. LouvainDriver uses "Entity" as input, so build 2 mappings
    //    - Long (entity id) -> Entity
    //    - Entity -> Long (entity id)
    val embeddingIdToEntity: Map[Long, Entity] = embeddings.map {
      case (id, _) => id -> Entity(TextEntityValue(id.toString, Some(id.toString)), None)
    }
    val entityToEmbeddingId: Map[Entity, Long] = embeddingIdToEntity.map {
      case (id, e) => e -> id
    }

    // 3. Create the list of NetworkInput on which to run LouvainDriver
    val networkInputList = edges
      .map {
        case ((fromUserId: Long, toUserId: Long), weight: Double) =>
          new NetworkInput(embeddingIdToEntity(fromUserId), embeddingIdToEntity(toUserId), weight)
      }.toList.asJava

    val timeSinceClusteringAlgRunStart = Stopwatch.start()
    val networkDictionary = NetworkFactory.buildDictionary(networkInputList)
    val network = NetworkFactory.buildNetwork(networkInputList, networkDictionary)

    val clusters = if (networkInputList.size() == 0) {
      // handle case if no edge at all (only one entity or all entities are too far apart)
      embeddings.keySet.map(e => Set(e))
    } else {
      // 4. Run clustering algorithm
      val clusteredIds = appliedResolutionFactor match {
        case Some(res) =>
          LouvainDriver.clusterAppliedResolutionFactor(network, networkDictionary, res)
        case None => LouvainDriver.cluster(network, networkDictionary)
      }

      recordStatCallback(
        StatClusteringAlgorithmRunTime,
        timeSinceClusteringAlgRunStart().inMilliseconds)

      // 5. Post-processing
      val atLeast2MembersClusters: Set[Set[Long]] = clusteredIds.asScala
        .groupBy(_._2)
        .mapValues(_.map { case (e, _) => entityToEmbeddingId(e) }.toSet)
        .values.toSet

      atLeast2MembersClusters ++ individualClusters.map { e => Set(e) }

    }

    // Calculate silhouette metrics
    val contactIdWithSilhouette = clusters.map {
      case cluster =>
        val otherClusters = clusters - cluster

        cluster.map {
          case contactId =>
            if (otherClusters.isEmpty) {
              (contactId, 0.0)
            } else {
              val otherSameClusterContacts = cluster - contactId

              if (otherSameClusterContacts.isEmpty) {
                (contactId, 0.0)
              } else {
                // calculate similarity of given userId with all other users in the same cluster
                val a_i = otherSameClusterContacts.map {
                  case sameClusterContact =>
                    edgesSimilarityMap((contactId, sameClusterContact))
                }.sum / otherSameClusterContacts.size

                // calculate similarity of given userId to all other clusters, find the best nearest cluster
                val b_i = otherClusters.map {
                  case otherCluster =>
                    otherCluster.map {
                      case otherClusterContact =>
                        edgesSimilarityMap((contactId, otherClusterContact))
                    }.sum / otherCluster.size
                }.max

                // silhouette (value) of one userId i
                val s_i = (a_i - b_i) / max(a_i, b_i)
                (contactId, s_i)
              }
            }
        }
    }

    (clusters, contactIdWithSilhouette)
  }
}
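
Note (added for reference, not part of the deleted file): clusterWithSilhouette computes the standard silhouette value, but over similarities rather than distances. For a member i of cluster C_i, matching the code above:

  a_i = average of sim(i, j) over the other members j of C_i
  b_i = max over every other cluster C of the average of sim(i, j) over j in C
  s_i = (a_i - b_i) / max(a_i, b_i)

Members of singleton clusters, and all members when no other cluster exists, are assigned s_i = 0.0, as in the code.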
Binary file not shown.
@ -1,21 +0,0 @@
package com.twitter.simclusters_v2.common.clustering

import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.thriftscala.NeighborWithWeights

class MaxFavScoreRepresentativeSelectionMethod[T] extends ClusterRepresentativeSelectionMethod[T] {

  /**
   * Identify the member with largest favScoreHalfLife100Days and return it.
   *
   * @param cluster A set of NeighborWithWeights.
   * @param embeddings A map of producer ID -> embedding.
   */
  def selectClusterRepresentative(
    cluster: Set[NeighborWithWeights],
    embeddings: Map[UserId, T],
  ): UserId = {
    val key = cluster.maxBy { x: NeighborWithWeights => x.favScoreHalfLife100Days.getOrElse(0.0) }
    key.neighborId
  }
}
Binary file not shown.
@ -1,28 +0,0 @@
package com.twitter.simclusters_v2.common.clustering

import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.thriftscala.NeighborWithWeights

class MedoidRepresentativeSelectionMethod[T](
  producerProducerSimilarityFn: (T, T) => Double)
    extends ClusterRepresentativeSelectionMethod[T] {

  /**
   * Identify the medoid of a cluster and return it.
   *
   * @param cluster A set of NeighborWithWeights.
   * @param embeddings A map of producer ID -> embedding.
   */
  def selectClusterRepresentative(
    cluster: Set[NeighborWithWeights],
    embeddings: Map[UserId, T],
  ): UserId = {
    val key = cluster.maxBy {
      id1 => // maxBy because we use similarity, which gets larger as we get closer.
        val v = embeddings(id1.neighborId)
        cluster
          .map(id2 => producerProducerSimilarityFn(v, embeddings(id2.neighborId))).sum
    }
    key.neighborId
  }
}
Binary file not shown.
@ -1,32 +0,0 @@
package com.twitter.simclusters_v2.common.clustering

import com.twitter.simclusters_v2.common.SimClustersEmbedding

/**
 * SimilarityFunctions provide commonly used similarity functions that this clustering library needs.
 */
object SimilarityFunctions {
  def simClustersCosineSimilarity: (SimClustersEmbedding, SimClustersEmbedding) => Double =
    (e1, e2) => e1.cosineSimilarity(e2)

  def simClustersMatchingLargestDimension: (
    SimClustersEmbedding,
    SimClustersEmbedding
  ) => Double = (e1, e2) => {
    val doesMatchLargestDimension: Boolean = e1
      .topClusterIds(1)
      .exists { id1 =>
        e2.topClusterIds(1).contains(id1)
      }

    if (doesMatchLargestDimension) 1.0
    else 0.0
  }

  def simClustersFuzzyJaccardSimilarity: (
    SimClustersEmbedding,
    SimClustersEmbedding
  ) => Double = (e1, e2) => {
    e1.fuzzyJaccardSimilarity(e2)
  }
}
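
Note (illustrative sketch, not from this commit): these functions are meant to be passed as the similarityFn argument of the ClusteringMethod implementations above; the thresholds below are placeholders chosen for the example.

import com.twitter.simclusters_v2.common.SimClustersEmbedding
import com.twitter.simclusters_v2.common.clustering.{
  ConnectedComponentsClusteringMethod,
  LargestDimensionClusteringMethod,
  SimilarityFunctions
}

object SimilarityWiringSketch {
  // cosine-similarity clustering; 0.7 is an arbitrary example threshold
  def clusterByCosine(embeddings: Map[Long, SimClustersEmbedding]): Set[Set[Long]] =
    new ConnectedComponentsClusteringMethod(similarityThreshold = 0.7)
      .cluster(embeddings, SimilarityFunctions.simClustersCosineSimilarity)

  // group entities that share their highest-scoring cluster dimension
  def clusterByLargestDimension(embeddings: Map[Long, SimClustersEmbedding]): Set[Set[Long]] =
    new LargestDimensionClusteringMethod()
      .cluster(
        embeddings,
        SimilarityFunctions.simClustersMatchingLargestDimension,
        recordStatCallback = (_, _) => ())
}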
@ -1,12 +0,0 @@
# This package/target is separate from other simclusters common packages because the ml/api dep is
# large (350MB+). Having it as a separate target means that we can avoid bundling it with targets
# that do not need it.
scala_library(
    platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        "src/java/com/twitter/ml/api:api-base",
        "src/scala/com/twitter/ml/api/util",
        "src/scala/com/twitter/simclusters_v2/common",
    ],
)
BIN
src/scala/com/twitter/simclusters_v2/common/ml/BUILD.docx
Normal file
Binary file not shown.
Binary file not shown.
@ -1,39 +0,0 @@
package com.twitter.simclusters_v2.common.ml

import com.twitter.ml.api.Feature.Continuous
import com.twitter.ml.api.Feature.SparseContinuous
import com.twitter.ml.api._
import com.twitter.ml.api.util.FDsl._
import com.twitter.simclusters_v2.common.SimClustersEmbedding

class SimClustersEmbeddingAdapter(embeddingFeature: SparseContinuous)
    extends IRecordOneToOneAdapter[SimClustersEmbedding] {

  override def getFeatureContext: FeatureContext = new FeatureContext(embeddingFeature)

  override def adaptToDataRecord(embedding: SimClustersEmbedding): DataRecord = {
    val embeddingMap = embedding.embedding.map {
      case (clusterId, score) =>
        (clusterId.toString, score)
    }.toMap

    new DataRecord().setFeatureValue(embeddingFeature, embeddingMap)
  }
}

class NormalizedSimClustersEmbeddingAdapter(
  embeddingFeature: SparseContinuous,
  normFeature: Continuous)
    extends IRecordOneToOneAdapter[SimClustersEmbedding] {

  override def getFeatureContext: FeatureContext = new FeatureContext(embeddingFeature, normFeature)

  override def adaptToDataRecord(embedding: SimClustersEmbedding): DataRecord = {

    val normalizedEmbedding = Map(
      embedding.sortedClusterIds.map(_.toString).zip(embedding.normalizedSortedScores): _*)

    val dataRecord = new DataRecord().setFeatureValue(embeddingFeature, normalizedEmbedding)
    dataRecord.setFeatureValue(normFeature, embedding.l2norm)
  }
}
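
Note (illustrative sketch, not from this commit): the adapters are used by defining the ml-api features first and then converting each embedding to a DataRecord. The feature names and the name-only SparseContinuous/Continuous constructors are assumptions made for this example, not taken from the deleted file.

import com.twitter.ml.api.DataRecord
import com.twitter.ml.api.Feature.{Continuous, SparseContinuous}
import com.twitter.simclusters_v2.common.SimClustersEmbedding
import com.twitter.simclusters_v2.common.ml.NormalizedSimClustersEmbeddingAdapter

object AdapterSketch {
  // hypothetical feature names; real jobs would define their own features
  val embeddingFeature: SparseContinuous = new SparseContinuous("producer.simclusters_embedding")
  val normFeature: Continuous = new Continuous("producer.simclusters_embedding_norm")

  val adapter = new NormalizedSimClustersEmbeddingAdapter(embeddingFeature, normFeature)

  def toDataRecord(embedding: SimClustersEmbedding): DataRecord =
    adapter.adaptToDataRecord(embedding)
}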
BIN
src/scala/com/twitter/simclusters_v2/common/package.docx
Normal file
Binary file not shown.
@ -1,17 +0,0 @@
package com.twitter.simclusters_v2

package object common {

  type TweetId = Long
  type UserId = Long
  type ClusterId = Int
  type SemanticCoreEntityId = Long // Use TopicId if it's a Topic related project.
  type UTTEntityId = Long
  type Timestamp = Long
  type Language = String
  type Country = String
  type LocaleEntity = (Long, Language)
  type TopicId = Long
  type GroupId = Long
  type SpaceId = String
}
Binary file not shown.
@ -1,164 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources

import com.twitter.bijection.scrooge.BinaryScalaCodec
import com.twitter.bijection.scrooge.CompactScalaCodec
import com.twitter.bijection.Bufferable
import com.twitter.bijection.Injection
import com.twitter.hermit.candidate.thriftscala.Candidates
import com.twitter.scalding.DateRange
import com.twitter.scalding.commons.source.VersionedKeyValSource
import com.twitter.scalding_internal.source.lzo_scrooge.DailySuffixMostRecentLzoScrooge
import com.twitter.scalding_internal.source.lzo_scrooge.FixedPathLzoScrooge
import com.twitter.scalding_internal.source.lzo_scrooge.HourlySuffixMostRecentLzoScrooge
import com.twitter.simclusters_v2.thriftscala._

case class EdgeWithDecayedWtsFixedPathSource(path: String)
    extends FixedPathLzoScrooge[EdgeWithDecayedWeights](path, EdgeWithDecayedWeights)

case class UserAndNeighborsFixedPathSource(path: String)
    extends FixedPathLzoScrooge[UserAndNeighbors](path, UserAndNeighbors)

case class NormsAndCountsFixedPathSource(path: String)
    extends FixedPathLzoScrooge[NormsAndCounts](path, NormsAndCounts)

case class UserToInterestedInClustersFixedPathSource(path: String)
    extends FixedPathLzoScrooge[UserToInterestedInClusters](path, UserToInterestedInClusters)

case class TimelineDataExtractorFixedPathSource(path: String)
    extends FixedPathLzoScrooge[ReferenceTweets](path, ReferenceTweets)

case class TweetClusterScoresHourlySuffixSource(path: String, override val dateRange: DateRange)
    extends HourlySuffixMostRecentLzoScrooge[TweetAndClusterScores](path, dateRange)

case class TweetTopKClustersHourlySuffixSource(path: String, override val dateRange: DateRange)
    extends HourlySuffixMostRecentLzoScrooge[TweetTopKClustersWithScores](
      path,
      dateRange
    )

case class ClusterTopKTweetsHourlySuffixSource(path: String, override val dateRange: DateRange)
    extends HourlySuffixMostRecentLzoScrooge[ClusterTopKTweetsWithScores](
      path,
      dateRange
    )

case class TweetSimilarityUnhydratedPairsSource(path: String, override val dateRange: DateRange)
    extends DailySuffixMostRecentLzoScrooge[LabelledTweetPairs](
      path,
      dateRange
    )

case class WTFCandidatesSource(path: String)
    extends FixedPathLzoScrooge[Candidates](path, Candidates)

case class EmbeddingsLiteSource(path: String)
    extends FixedPathLzoScrooge[EmbeddingsLite](path, EmbeddingsLite)

object AdhocKeyValSources {
  def interestedInSource(path: String): VersionedKeyValSource[Long, ClustersUserIsInterestedIn] = {
    implicit val keyInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian
    implicit val valInject: Injection[ClustersUserIsInterestedIn, Array[Byte]] =
      CompactScalaCodec(ClustersUserIsInterestedIn)
    VersionedKeyValSource[Long, ClustersUserIsInterestedIn](path)
  }

  def clusterDetailsSource(path: String): VersionedKeyValSource[(String, Int), ClusterDetails] = {
    implicit val keyInject: Injection[(String, Int), Array[Byte]] =
      Bufferable.injectionOf[(String, Int)]
    implicit val valInject: Injection[ClusterDetails, Array[Byte]] =
      CompactScalaCodec(ClusterDetails)
    VersionedKeyValSource[(String, Int), ClusterDetails](path)
  }

  def bipartiteQualitySource(
    path: String
  ): VersionedKeyValSource[(String, Int), BipartiteClusterQuality] = {
    implicit val keyInject: Injection[(String, Int), Array[Byte]] =
      Bufferable.injectionOf[(String, Int)]
    implicit val valInject: Injection[BipartiteClusterQuality, Array[Byte]] =
      CompactScalaCodec(BipartiteClusterQuality)
    VersionedKeyValSource[(String, Int), BipartiteClusterQuality](path)
  }

  def entityToClustersSource(
    path: String
  ): VersionedKeyValSource[SimClustersEmbeddingId, SimClustersEmbedding] = {
    implicit val keyInject: Injection[SimClustersEmbeddingId, Array[Byte]] =
      BinaryScalaCodec(SimClustersEmbeddingId)
    implicit val valInject: Injection[SimClustersEmbedding, Array[Byte]] =
      BinaryScalaCodec(SimClustersEmbedding)
    VersionedKeyValSource[SimClustersEmbeddingId, SimClustersEmbedding](path)
  }

  def clusterToEntitiesSource(
    path: String
  ): VersionedKeyValSource[SimClustersEmbeddingId, InternalIdEmbedding] = {
    implicit val keyInject: Injection[SimClustersEmbeddingId, Array[Byte]] = BinaryScalaCodec(
      SimClustersEmbeddingId)
    implicit val valInject: Injection[InternalIdEmbedding, Array[Byte]] =
      BinaryScalaCodec(InternalIdEmbedding)
    VersionedKeyValSource[SimClustersEmbeddingId, InternalIdEmbedding](path)
  }

  // For storing producer-simclusters embeddings
  def topProducerToClusterEmbeddingsSource(
    path: String
  ): VersionedKeyValSource[Long, TopSimClustersWithScore] = {
    implicit val keyInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian
    implicit val valInject: Injection[TopSimClustersWithScore, Array[Byte]] =
      CompactScalaCodec(TopSimClustersWithScore)
    VersionedKeyValSource[Long, TopSimClustersWithScore](path)
  }

  // For storing producer-simclusters embeddings
  def topClusterEmbeddingsToProducerSource(
    path: String
  ): VersionedKeyValSource[PersistedFullClusterId, TopProducersWithScore] = {
    implicit val keyInject: Injection[PersistedFullClusterId, Array[Byte]] =
      CompactScalaCodec(PersistedFullClusterId)
    implicit val valInject: Injection[TopProducersWithScore, Array[Byte]] =
      CompactScalaCodec(TopProducersWithScore)
    VersionedKeyValSource[PersistedFullClusterId, TopProducersWithScore](path)
  }

  def userToInferredEntitiesSource(
    path: String
  ): VersionedKeyValSource[Long, SimClustersInferredEntities] = {
    implicit val keyInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian
    implicit val valInject: Injection[SimClustersInferredEntities, Array[Byte]] =
      CompactScalaCodec(SimClustersInferredEntities)
    VersionedKeyValSource[Long, SimClustersInferredEntities](path)
  }

  def knownForAdhocSource(path: String): VersionedKeyValSource[Long, ClustersUserIsKnownFor] = {
    implicit val keyInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian
    implicit val valInject: Injection[ClustersUserIsKnownFor, Array[Byte]] =
      CompactScalaCodec(ClustersUserIsKnownFor)
    VersionedKeyValSource[Long, ClustersUserIsKnownFor](path)
  }

  def knownForSBFResultsDevelSource(
    path: String
  ): VersionedKeyValSource[Long, Array[(Int, Float)]] = {
    implicit val keyInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian
    implicit val valInject: Injection[Array[(Int, Float)], Array[Byte]] =
      Bufferable.injectionOf[Array[(Int, Float)]]
    VersionedKeyValSource[Long, Array[(Int, Float)]](path)
  }

  // injection to store adjlist in the mapped indices space for users
  def intermediateSBFResultsDevelSource(
    path: String
  ): VersionedKeyValSource[Int, List[(Int, Float)]] = {
    implicit val keyInject: Injection[Int, Array[Byte]] = Injection.int2BigEndian
    implicit val valInject: Injection[List[(Int, Float)], Array[Byte]] =
      Bufferable.injectionOf[List[(Int, Float)]]
    VersionedKeyValSource[Int, List[(Int, Float)]](path)
  }

  def mappedIndicesDevelSource(path: String): VersionedKeyValSource[Int, Long] = {
    implicit val keyInject: Injection[Int, Array[Byte]] = Injection.int2BigEndian
    implicit val valInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian
    VersionedKeyValSource[Int, Long](path)
  }
}
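
Note (illustrative sketch, not from this commit): these VersionedKeyValSource factories double as scalding sinks, so an adhoc job can persist a keyed pipe with TypedPipe.writeExecution; the output path below is a placeholder for an adhoc HDFS directory.

import com.twitter.scalding.Execution
import com.twitter.scalding.typed.TypedPipe
import com.twitter.simclusters_v2.hdfs_sources.AdhocKeyValSources
import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn

object AdhocWriteSketch {
  // writes (userId -> interested-in clusters) pairs to the given adhoc path
  def writeInterestedIn(
    pipe: TypedPipe[(Long, ClustersUserIsInterestedIn)],
    outputPath: String // placeholder path, not a real dataset location
  ): Execution[Unit] =
    pipe.writeExecution(AdhocKeyValSources.interestedInSource(outputPath))
}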
File diff suppressed because it is too large
BIN
src/scala/com/twitter/simclusters_v2/hdfs_sources/BUILD.docx
Normal file
Binary file not shown.
BIN
src/scala/com/twitter/simclusters_v2/hdfs_sources/DataPaths.docx
Normal file
Binary file not shown.
@ -1,49 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources

object DataPaths {

  val InterestedIn2020Path =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_interested_in_20M_145K_2020"

  val InterestedIn2020ThriftPath =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_interested_in_20M_145K_2020_thrift"

  val InterestedInLite2020Path =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_interested_in_lite_20M_145K_2020"

  val InterestedInLite2020ThriftPath =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_interested_in_lite_20M_145K_2020_thrift"

  val KnownFor2020Path =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_known_for_20M_145K_2020"

  // keep this inside /user/cassowary/manhattan_sequence_files/ to use the latest 3 retention policy
  val KnownFor2020ThriftDatasetPath =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_known_for_20M_145K_2020_thrift"

  val OfflineClusterTopMediaTweets2020DatasetPath =
    "/user/cassowary/manhattan_sequence_files/cluster_top_media_tweets_20M_145K_2020"
}

/**
 * These should only be accessed from the simclusters_v2 data pipeline for intermediate data; they
 * are not opt-out compliant and shouldn't be exposed externally.
 */
object InternalDataPaths {
  // Internal versions, not to be read or written outside of simclusters_v2

  private[simclusters_v2] val RawInterestedIn2020Path =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_raw_interested_in_20M_145K_2020"

  private[simclusters_v2] val RawInterestedInLite2020Path =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_raw_interested_in_lite_20M_145K_2020"

  private[simclusters_v2] val RawKnownForDec11Path =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_raw_known_for_20M_145K_dec11"

  private[simclusters_v2] val RawKnownForUpdatedPath =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_raw_known_for_20M_145K_updated"

  private[simclusters_v2] val RawKnownFor2020Path =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_raw_known_for_20M_145K_2020"
}
Binary file not shown.
@ -1,39 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources

import com.twitter.scalding.DateOps
import com.twitter.scalding.DateRange
import com.twitter.scalding.Days
import com.twitter.scalding.TypedPipe
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation
import com.twitter.scalding_internal.dalv2.remote_access.ProcAtla
import com.twitter.simclusters_v2.thriftscala.NormsAndCounts
import com.twitter.simclusters_v2.thriftscala.UserAndNeighbors
import java.util.TimeZone

object DataSources {

  /**
   * Reads production normalized graph data from atla-proc
   */
  def userUserNormalizedGraphSource(implicit dateRange: DateRange): TypedPipe[UserAndNeighbors] = {
    DAL
      .readMostRecentSnapshotNoOlderThan(UserUserNormalizedGraphScalaDataset, Days(14)(DateOps.UTC))
      .withRemoteReadPolicy(ExplicitLocation(ProcAtla))
      .toTypedPipe
  }

  /**
   * Reads production user norms and counts data from atla-proc
   */
  def userNormsAndCounts(
    implicit dateRange: DateRange,
    timeZone: TimeZone
  ): TypedPipe[NormsAndCounts] = {
    DAL
      .readMostRecentSnapshot(ProducerNormsAndCountsScalaDataset, dateRange.prepend(Days(14)))
      .withRemoteReadPolicy(ExplicitLocation(ProcAtla))
      .toTypedPipe
  }

}
Binary file not shown.
@ -1,222 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources

import com.twitter.dal.client.dataset.KeyValDALDataset
import com.twitter.scalding.DateRange
import com.twitter.scalding.typed.TypedPipe
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossDC
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.simclusters_v2.thriftscala._
import com.twitter.wtf.entity_real_graph.thriftscala.EntityType
import com.twitter.simclusters_v2.common.ClusterId
import com.twitter.simclusters_v2.common.ModelVersions

object EntityEmbeddingsSources {

  final val SemanticCoreSimClustersEmbeddingsDec11Dataset =
    SemanticCoreSimclustersEmbeddingsScalaDataset

  final val SemanticCoreSimClustersEmbeddingsUpdatedDataset =
    SemanticCoreSimclustersEmbeddingsUpdatedScalaDataset

  final val SemanticCoreSimClustersEmbeddings2020Dataset =
    SemanticCoreSimclustersEmbeddings2020ScalaDataset

  final val SemanticCorePerLanguageSimClustersEmbeddingsDataset =
    SemanticCorePerLanguageSimclustersEmbeddingsScalaDataset

  final val LogFavSemanticCorePerLanguageSimClustersEmbeddingsDataset =
    LogFavSemanticCorePerLanguageSimclustersEmbeddingsScalaDataset

  final val HashtagSimClustersEmbeddingsUpdatedDataset =
    HashtagSimclustersEmbeddingsUpdatedScalaDataset

  final val ReverseIndexSemanticCoreSimClustersEmbeddingsDec11Dataset =
    ReverseIndexSemanticCoreSimclustersEmbeddingsScalaDataset

  final val ReverseIndexSemanticCoreSimClustersEmbeddingsUpdatedDataset =
    ReverseIndexSemanticCoreSimclustersEmbeddingsUpdatedScalaDataset

  final val ReverseIndexSemanticCoreSimClustersEmbeddings2020Dataset =
    ReverseIndexSemanticCoreSimclustersEmbeddings2020ScalaDataset

  final val ReverseIndexSemanticCorePerLanguageSimClustersEmbeddingsDataset =
    ReverseIndexSemanticCorePerLanguageSimclustersEmbeddingsScalaDataset

  final val LogFavReverseIndexSemanticCorePerLanguageSimClustersEmbeddingsDataset =
    LogFavReverseIndexSemanticCorePerLanguageSimclustersEmbeddingsScalaDataset

  final val ReverseIndexHashtagSimClustersEmbeddingsUpdatedDataset =
    ReverseIndexHashtagSimclustersEmbeddingsUpdatedScalaDataset

  // Fav-based TFG topic embeddings built from user device languages
  // Keyed by SimClustersEmbeddingId with InternalId.TopicId ((topic, language) pair, with country = None)
  final val FavTfgTopicEmbeddingsDataset = FavTfgTopicEmbeddingsScalaDataset

  final val FavTfgTopicEmbeddingsParquetDataset = FavTfgTopicEmbeddingsParquetScalaDataset

  final val FavTfgTopicEmbeddings2020Dataset = FavTfgTopicEmbeddings2020ScalaDataset

  final val FavTfgTopicEmbeddings2020ParquetDataset = FavTfgTopicEmbeddings2020ParquetScalaDataset

  // Logfav-based TFG topic embeddings built from user device languages
  // Keyed by SimClustersEmbeddingId with InternalId.LocaleEntityId ((topic, language) pair)
  final val LogFavTfgTopicEmbeddingsDataset = LogFavTfgTopicEmbeddingsScalaDataset

  final val LogFavTfgTopicEmbeddingsParquetDataset = LogFavTfgTopicEmbeddingsParquetScalaDataset

  // Fav-based TFG topic embeddings built from inferred user consumed languages
  // Keyed by SimClustersEmbeddingId with InternalId.TopicId ((topic, country, language) tuple)
  final val FavInferredLanguageTfgTopicEmbeddingsDataset =
    FavInferredLanguageTfgTopicEmbeddingsScalaDataset

  private val validSemanticCoreEmbeddingTypes = Seq(
    EmbeddingType.FavBasedSematicCoreEntity,
    EmbeddingType.FollowBasedSematicCoreEntity
  )

  /**
   * Given a fav/follow/etc embedding type and a ModelVersion, retrieve the corresponding dataset to
   * (SemanticCore entityId -> List(clusterId)) from a certain dateRange.
   */
  def getSemanticCoreEntityEmbeddingsSource(
    embeddingType: EmbeddingType,
    modelVersion: String,
    dateRange: DateRange
  ): TypedPipe[(Long, SimClustersEmbedding)] = {
    val dataSet = modelVersion match {
      case ModelVersions.Model20M145KDec11 => SemanticCoreSimClustersEmbeddingsDec11Dataset
      case ModelVersions.Model20M145KUpdated => SemanticCoreSimClustersEmbeddingsUpdatedDataset
      case _ => throw new IllegalArgumentException(s"ModelVersion $modelVersion is not supported")
    }
    assert(validSemanticCoreEmbeddingTypes.contains(embeddingType))
    entityEmbeddingsSource(dataSet, embeddingType, dateRange)
  }

  /**
   * Given a fav/follow/etc embedding type and a ModelVersion, retrieve the corresponding dataset to
   * (clusterId -> List(SemanticCore entityId)) from a certain dateRange.
   */
  def getReverseIndexedSemanticCoreEntityEmbeddingsSource(
    embeddingType: EmbeddingType,
    modelVersion: String,
    dateRange: DateRange
  ): TypedPipe[(ClusterId, Seq[SemanticCoreEntityWithScore])] = {
    val dataSet = modelVersion match {
      case ModelVersions.Model20M145KDec11 =>
        ReverseIndexSemanticCoreSimClustersEmbeddingsDec11Dataset
      case ModelVersions.Model20M145KUpdated =>
        ReverseIndexSemanticCoreSimClustersEmbeddingsUpdatedDataset
      case ModelVersions.Model20M145K2020 =>
        ReverseIndexSemanticCoreSimClustersEmbeddings2020Dataset
      case _ => throw new IllegalArgumentException(s"ModelVersion $modelVersion is not supported")
    }

    assert(validSemanticCoreEmbeddingTypes.contains(embeddingType))
    reverseIndexedEntityEmbeddingsSource(dataSet, embeddingType, dateRange)
  }

  // Return the raw DAL dataset reference. Use this if you're writing to DAL.
  def getEntityEmbeddingsDataset(
    entityType: EntityType,
    modelVersion: String,
    isEmbeddingsPerLocale: Boolean = false
  ): KeyValDALDataset[KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]] = {
    (entityType, modelVersion) match {
      case (EntityType.SemanticCore, ModelVersions.Model20M145KDec11) =>
        SemanticCoreSimClustersEmbeddingsDec11Dataset
      case (EntityType.SemanticCore, ModelVersions.Model20M145KUpdated) =>
        if (isEmbeddingsPerLocale) {
          SemanticCorePerLanguageSimClustersEmbeddingsDataset
        } else {
          SemanticCoreSimClustersEmbeddingsUpdatedDataset
        }
      case (EntityType.SemanticCore, ModelVersions.Model20M145K2020) =>
        SemanticCoreSimClustersEmbeddings2020Dataset
      case (EntityType.Hashtag, ModelVersions.Model20M145KUpdated) =>
        HashtagSimClustersEmbeddingsUpdatedDataset
      case (entityType, modelVersion) =>
        throw new IllegalArgumentException(
          s"(Entity Type, ModelVersion) ($entityType, $modelVersion) not supported.")
    }
  }

  // Return the raw DAL dataset reference. Use this if you're writing to DAL.
  def getReverseIndexedEntityEmbeddingsDataset(
    entityType: EntityType,
    modelVersion: String,
    isEmbeddingsPerLocale: Boolean = false
  ): KeyValDALDataset[KeyVal[SimClustersEmbeddingId, InternalIdEmbedding]] = {
    (entityType, modelVersion) match {
      case (EntityType.SemanticCore, ModelVersions.Model20M145KDec11) =>
        ReverseIndexSemanticCoreSimClustersEmbeddingsDec11Dataset
      case (EntityType.SemanticCore, ModelVersions.Model20M145KUpdated) =>
        if (isEmbeddingsPerLocale) {
          ReverseIndexSemanticCorePerLanguageSimClustersEmbeddingsDataset
        } else {
          ReverseIndexSemanticCoreSimClustersEmbeddingsUpdatedDataset
        }
      case (EntityType.SemanticCore, ModelVersions.Model20M145K2020) =>
        ReverseIndexSemanticCoreSimClustersEmbeddings2020Dataset
      case (EntityType.Hashtag, ModelVersions.Model20M145KUpdated) =>
        ReverseIndexHashtagSimClustersEmbeddingsUpdatedDataset
      case (entityType, modelVersion) =>
        throw new IllegalArgumentException(
          s"(Entity Type, ModelVersion) ($entityType, $modelVersion) not supported.")
    }
  }

  private def entityEmbeddingsSource(
    dataset: KeyValDALDataset[KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]],
    embeddingType: EmbeddingType,
    dateRange: DateRange
  ): TypedPipe[(Long, SimClustersEmbedding)] = {
    val pipe = DAL
      .readMostRecentSnapshot(dataset, dateRange)
      .withRemoteReadPolicy(AllowCrossDC)
      .toTypedPipe
    filterEntityEmbeddingsByType(pipe, embeddingType)
  }

  private def reverseIndexedEntityEmbeddingsSource(
    dataset: KeyValDALDataset[KeyVal[SimClustersEmbeddingId, InternalIdEmbedding]],
    embeddingType: EmbeddingType,
    dateRange: DateRange
  ): TypedPipe[(ClusterId, Seq[SemanticCoreEntityWithScore])] = {
    val pipe = DAL
      .readMostRecentSnapshot(dataset, dateRange)
      .withRemoteReadPolicy(AllowCrossDC)
      .toTypedPipe
    filterReverseIndexedEntityEmbeddingsByType(pipe, embeddingType)
  }

  private[hdfs_sources] def filterEntityEmbeddingsByType(
    pipe: TypedPipe[KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]],
    embeddingType: EmbeddingType
  ): TypedPipe[(Long, SimClustersEmbedding)] = {
    pipe.collect {
      case KeyVal(
            SimClustersEmbeddingId(_embeddingType, _, InternalId.EntityId(entityId)),
            embedding
          ) if _embeddingType == embeddingType =>
        (entityId, embedding)
    }
  }

  private[hdfs_sources] def filterReverseIndexedEntityEmbeddingsByType(
    pipe: TypedPipe[KeyVal[SimClustersEmbeddingId, InternalIdEmbedding]],
    embeddingType: EmbeddingType
  ): TypedPipe[(ClusterId, Seq[SemanticCoreEntityWithScore])] = {
    pipe.collect {
      case KeyVal(
            SimClustersEmbeddingId(_embeddingType, _, InternalId.ClusterId(clusterId)),
            embedding
          ) if _embeddingType == embeddingType =>
        val entitiesWithScores = embedding.embedding.collect {
          case InternalIdWithScore(InternalId.EntityId(entityId), score) =>
            SemanticCoreEntityWithScore(entityId, score)
        }
        (clusterId, entitiesWithScores)
    }
  }
}
Binary file not shown.
@ -1,178 +0,0 @@
|
||||
package com.twitter.simclusters_v2.hdfs_sources
|
||||
|
||||
import com.twitter.dal.client.dataset.KeyValDALDataset
|
||||
import com.twitter.scalding.{DateOps, DateRange, Days, TypedPipe}
|
||||
import com.twitter.scalding_internal.dalv2.DAL
|
||||
import com.twitter.scalding_internal.dalv2.remote_access.{ExplicitLocation, ProcAtla}
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
|
||||
import com.twitter.simclusters_v2.common.UserId
|
||||
import com.twitter.simclusters_v2.thriftscala.ModelVersion
|
||||
import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn
|
||||
import java.util.TimeZone
|
||||
|
||||
object InterestedInSources {
|
||||
|
||||
private val ModelVersionInterestedInDatasetMap: Map[ModelVersion, KeyValDALDataset[
|
||||
KeyVal[UserId, ClustersUserIsInterestedIn]
|
||||
]] = Map(
|
||||
ModelVersion.Model20m145kDec11 -> SimclustersV2InterestedInScalaDataset,
|
||||
ModelVersion.Model20m145kUpdated -> SimclustersV2InterestedIn20M145KUpdatedScalaDataset,
|
||||
ModelVersion.Model20m145k2020 -> SimclustersV2InterestedIn20M145K2020ScalaDataset
|
||||
)
|
||||
|
||||
/**
|
||||
* Internal version, not PDP compliant, not to be used outside simclusters_v2
|
||||
* Reads 20M145KDec11 production InterestedIn data from atla-proc, with a 14-day extended window
|
||||
*/
|
||||
private[simclusters_v2] def simClustersRawInterestedInDec11Source(
|
||||
dateRange: DateRange,
|
||||
timeZone: TimeZone
|
||||
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
|
||||
|
||||
DAL
|
||||
.readMostRecentSnapshot(
|
||||
SimclustersV2RawInterestedIn20M145KDec11ScalaDataset,
|
||||
dateRange.prepend(Days(14)(timeZone))
|
||||
)
|
||||
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
|
||||
.toTypedPipe
|
||||
.map {
|
||||
case KeyVal(userId, clustersUserIsInterestedIn) =>
|
||||
(userId, clustersUserIsInterestedIn)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Internal version, not PDP compliant, not to be used outside simclusters_v2
|
||||
* Reads 20M145KUpdated InterestedIn data from atla-proc, with a 14-day extended window
|
||||
*/
|
||||
private[simclusters_v2] def simClustersRawInterestedInUpdatedSource(
|
||||
dateRange: DateRange,
|
||||
timeZone: TimeZone
|
||||
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
|
||||
DAL
|
||||
.readMostRecentSnapshot(
|
||||
SimclustersV2RawInterestedIn20M145KUpdatedScalaDataset,
|
||||
dateRange.prepend(Days(14)(timeZone))
|
||||
)
|
||||
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
|
||||
.toTypedPipe.map {
|
||||
case KeyVal(userId, clustersUserIsInterestedIn) =>
|
||||
(userId, clustersUserIsInterestedIn)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Internal version, not PDP compliant, not to be used outside simclusters_v2
|
||||
* Reads 20M145K2020 InterestedIn data from atla-proc, with a 14-day extended window
|
||||
*/
|
||||
private[simclusters_v2] def simClustersRawInterestedIn2020Source(
|
||||
dateRange: DateRange,
|
||||
timeZone: TimeZone
|
||||
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
|
||||
DAL
|
||||
.readMostRecentSnapshot(
|
||||
SimclustersV2RawInterestedIn20M145K2020ScalaDataset,
|
||||
dateRange.prepend(Days(14)(timeZone))
|
||||
)
|
||||
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
|
||||
.toTypedPipe.map {
|
||||
case KeyVal(userId, clustersUserIsInterestedIn) =>
|
||||
(userId, clustersUserIsInterestedIn)
|
||||
}
|
||||
}
|
||||
|
||||
private[simclusters_v2] def simClustersRawInterestedInLite2020Source(
|
||||
dateRange: DateRange,
|
||||
timeZone: TimeZone
|
||||
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
|
||||
DAL
|
||||
.readMostRecentSnapshot(
|
||||
SimclustersV2RawInterestedInLite20M145K2020ScalaDataset,
|
||||
dateRange.extend(Days(14)(timeZone)))
|
||||
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
|
||||
.toTypedPipe.map {
|
||||
case KeyVal(userId, clustersUserIsInterestedIn) =>
|
||||
(userId, clustersUserIsInterestedIn)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads 20M145KDec11 production InterestedIn data from atla-proc, with a 14-day extended window
|
||||
*/
|
||||
def simClustersInterestedInDec11Source(
|
||||
dateRange: DateRange,
|
||||
timeZone: TimeZone
|
||||
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
|
||||
|
||||
DAL
|
||||
.readMostRecentSnapshot(
|
||||
SimclustersV2InterestedInScalaDataset,
|
||||
dateRange.prepend(Days(14)(timeZone)))
|
||||
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
|
||||
.toTypedPipe.map {
|
||||
case KeyVal(userId, clustersUserIsInterestedIn) =>
|
||||
(userId, clustersUserIsInterestedIn)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads 20M145KUpdated InterestedIn data from atla-proc, with a 14-day extended window
|
||||
*/
|
||||
def simClustersInterestedInUpdatedSource(
|
||||
dateRange: DateRange,
|
||||
timeZone: TimeZone
|
||||
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
|
||||
DAL
|
||||
.readMostRecentSnapshot(
|
||||
SimclustersV2InterestedIn20M145KUpdatedScalaDataset,
|
||||
dateRange.prepend(Days(14)(timeZone))
|
||||
)
|
||||
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
|
||||
.toTypedPipe.map {
|
||||
case KeyVal(userId, clustersUserIsInterestedIn) =>
|
||||
(userId, clustersUserIsInterestedIn)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads 20M145K2020 InterestedIn data from atla-proc, with a 14-day extended window
|
||||
*/
|
||||
def simClustersInterestedIn2020Source(
|
||||
dateRange: DateRange,
|
||||
timeZone: TimeZone
|
||||
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
|
||||
DAL
|
||||
.readMostRecentSnapshot(
|
||||
SimclustersV2InterestedIn20M145K2020ScalaDataset,
|
||||
dateRange.prepend(Days(14)(timeZone))
|
||||
)
|
||||
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
|
||||
.toTypedPipe.map {
|
||||
case KeyVal(userId, clustersUserIsInterestedIn) =>
|
||||
(userId, clustersUserIsInterestedIn)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads InterestedIn data based on ModelVersion from atla-proc, with a 14-day extended window
|
||||
*/
|
||||
def simClustersInterestedInSource(
|
||||
modelVersion: ModelVersion,
|
||||
dateRange: DateRange,
|
||||
timeZone: TimeZone
|
||||
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
|
||||
|
||||
DAL
|
||||
.readMostRecentSnapshot(
|
||||
ModelVersionInterestedInDatasetMap(modelVersion),
|
||||
dateRange.prepend(Days(14)(timeZone))
|
||||
)
|
||||
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
|
||||
.toTypedPipe.map {
|
||||
case KeyVal(userId, clustersUserIsInterestedIn) =>
|
||||
(userId, clustersUserIsInterestedIn)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
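// Hedged usage sketch (added for illustration, not part of the original file):
// counts how many users appear in the most recent 2020 InterestedIn snapshot
// read by simClustersInterestedIn2020Source above. The enclosing object name
// `InterestedInSources` and the wrapper object are assumptions.
object InterestedInSourcesUsageExample {
  import com.twitter.algebird.Aggregator
  import com.twitter.scalding.DateRange
  import com.twitter.scalding.Execution
  import java.util.TimeZone

  def countInterestedIn2020Users(
    dateRange: DateRange,
    timeZone: TimeZone
  ): Execution[Option[Long]] =
    InterestedInSources
      .simClustersInterestedIn2020Source(dateRange, timeZone)
      // Aggregator.size yields the number of (userId, clusters) records.
      .aggregate(Aggregator.size)
      .toOptionExecution
}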
Binary file not shown.
@ -1,86 +0,0 @@
|
||||
package com.twitter.simclusters_v2.hdfs_sources
|
||||
|
||||
import com.twitter.scalding.DateRange
|
||||
import com.twitter.scalding.TypedPipe
|
||||
import com.twitter.scalding_internal.dalv2.DAL
|
||||
import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossClusterSameDC
|
||||
import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation
|
||||
import com.twitter.scalding_internal.dalv2.remote_access.Proc3Atla
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
|
||||
import com.twitter.simclusters_v2.thriftscala.EmbeddingType
|
||||
import com.twitter.simclusters_v2.thriftscala.InternalId
|
||||
import com.twitter.simclusters_v2.thriftscala.ModelVersion
|
||||
import com.twitter.simclusters_v2.thriftscala.SimClustersEmbedding
|
||||
import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId
|
||||
import com.twitter.simclusters_v2.thriftscala.TopSimClustersWithScore
|
||||
|
||||
object ProducerEmbeddingSources {
|
||||
|
||||
/**
|
||||
* Helper function to retrieve producer SimClusters embeddings with the legacy `TopSimClustersWithScore`
|
||||
* value type.
|
||||
*/
|
||||
def producerEmbeddingSourceLegacy(
|
||||
embeddingType: EmbeddingType,
|
||||
modelVersion: ModelVersion
|
||||
)(
|
||||
implicit dateRange: DateRange
|
||||
): TypedPipe[(Long, TopSimClustersWithScore)] = {
|
||||
val producerEmbeddingDataset = (embeddingType, modelVersion) match {
|
||||
case (EmbeddingType.ProducerFollowBasedSemanticCoreEntity, ModelVersion.Model20m145kDec11) =>
|
||||
ProducerTopKSimclusterEmbeddingsByFollowScoreScalaDataset
|
||||
case (EmbeddingType.ProducerFavBasedSemanticCoreEntity, ModelVersion.Model20m145kDec11) =>
|
||||
ProducerTopKSimclusterEmbeddingsByFavScoreScalaDataset
|
||||
case (
|
||||
EmbeddingType.ProducerFollowBasedSemanticCoreEntity,
|
||||
ModelVersion.Model20m145kUpdated) =>
|
||||
ProducerTopKSimclusterEmbeddingsByFollowScoreUpdatedScalaDataset
|
||||
case (EmbeddingType.ProducerFavBasedSemanticCoreEntity, ModelVersion.Model20m145kUpdated) =>
|
||||
ProducerTopKSimclusterEmbeddingsByFavScoreUpdatedScalaDataset
|
||||
case (_, _) =>
|
||||
throw new ClassNotFoundException(
|
||||
"Unsupported embedding type: " + embeddingType + " and model version: " + modelVersion)
|
||||
}
|
||||
|
||||
DAL
|
||||
.readMostRecentSnapshot(producerEmbeddingDataset).withRemoteReadPolicy(
|
||||
AllowCrossClusterSameDC)
|
||||
.toTypedPipe.map {
|
||||
case KeyVal(producerId, topSimClustersWithScore) =>
|
||||
(producerId, topSimClustersWithScore)
|
||||
}
|
||||
}
|
||||
|
||||
def producerEmbeddingSource(
|
||||
embeddingType: EmbeddingType,
|
||||
modelVersion: ModelVersion
|
||||
)(
|
||||
implicit dateRange: DateRange
|
||||
): TypedPipe[(Long, SimClustersEmbedding)] = {
|
||||
val producerEmbeddingDataset = (embeddingType, modelVersion) match {
|
||||
case (EmbeddingType.AggregatableLogFavBasedProducer, ModelVersion.Model20m145k2020) =>
|
||||
AggregatableProducerSimclustersEmbeddingsByLogFavScore2020ScalaDataset
|
||||
case (EmbeddingType.AggregatableFollowBasedProducer, ModelVersion.Model20m145k2020) =>
|
||||
AggregatableProducerSimclustersEmbeddingsByFollowScore2020ScalaDataset
|
||||
case (EmbeddingType.RelaxedAggregatableLogFavBasedProducer, ModelVersion.Model20m145k2020) =>
|
||||
AggregatableProducerSimclustersEmbeddingsByLogFavScoreRelaxedFavEngagementThreshold2020ScalaDataset
|
||||
case (_, _) =>
|
||||
throw new ClassNotFoundException(
|
||||
"Unsupported embedding type: " + embeddingType + " and model version: " + modelVersion)
|
||||
}
|
||||
|
||||
DAL
|
||||
.readMostRecentSnapshot(
|
||||
producerEmbeddingDataset
|
||||
)
|
||||
.withRemoteReadPolicy(ExplicitLocation(Proc3Atla))
|
||||
.toTypedPipe
|
||||
.map {
|
||||
case KeyVal(
|
||||
SimClustersEmbeddingId(_, _, InternalId.UserId(producerId: Long)),
|
||||
embedding: SimClustersEmbedding) =>
|
||||
(producerId, embedding)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
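// Hedged usage sketch (added for illustration, not part of the original file):
// selects the aggregatable log-fav-based producer embeddings for the 2020 model
// via producerEmbeddingSource above. The wrapper object is hypothetical; the
// implicit DateRange is assumed to come from the surrounding job.
object ProducerEmbeddingSourcesUsageExample {
  import com.twitter.scalding.DateRange
  import com.twitter.scalding.TypedPipe
  import com.twitter.simclusters_v2.thriftscala.EmbeddingType
  import com.twitter.simclusters_v2.thriftscala.ModelVersion
  import com.twitter.simclusters_v2.thriftscala.SimClustersEmbedding

  def logFavProducerEmbeddings2020(
    implicit dateRange: DateRange
  ): TypedPipe[(Long, SimClustersEmbedding)] =
    ProducerEmbeddingSources.producerEmbeddingSource(
      EmbeddingType.AggregatableLogFavBasedProducer,
      ModelVersion.Model20m145k2020
    )
}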
@ -1,13 +0,0 @@
|
||||
scala_library(
|
||||
sources = ["*.scala"],
|
||||
platform = "java8",
|
||||
tags = ["bazel-compatible"],
|
||||
dependencies = [
|
||||
"src/scala/com/twitter/scalding_internal/multiformat/format",
|
||||
"src/scala/com/twitter/simclusters_v2/common",
|
||||
"src/thrift/com/twitter/hermit/candidate:hermit-candidate-scala",
|
||||
"src/thrift/com/twitter/ml/api:embedding-scala",
|
||||
"src/thrift/com/twitter/recos/entities:entities-thrift-scala",
|
||||
"src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala",
|
||||
],
|
||||
)
|
Binary file not shown.
Binary file not shown.
@ -1,16 +0,0 @@
|
||||
package com.twitter.simclusters_v2.hdfs_sources.injections
|
||||
|
||||
import com.twitter.bijection.Bufferable
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{
|
||||
ScalaCompactThrift,
|
||||
genericInjection
|
||||
}
|
||||
import com.twitter.simclusters_v2.thriftscala.ClusterDetails
|
||||
|
||||
object ClusterDetailsInjection {
|
||||
val injection = KeyValInjection[(String, Int), ClusterDetails](
|
||||
genericInjection(Bufferable.injectionOf[(String, Int)]),
|
||||
ScalaCompactThrift(ClusterDetails)
|
||||
)
|
||||
}
|
Binary file not shown.
@ -1,13 +0,0 @@
|
||||
package com.twitter.simclusters_v2.hdfs_sources.injections
|
||||
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift
|
||||
import com.twitter.simclusters_v2.thriftscala.{TweetsWithScore, DayPartitionedClusterId}
|
||||
|
||||
object ClusterTopMediaTweetsInjection {
|
||||
|
||||
val injection = KeyValInjection[DayPartitionedClusterId, TweetsWithScore](
|
||||
ScalaCompactThrift(DayPartitionedClusterId),
|
||||
ScalaCompactThrift(TweetsWithScore)
|
||||
)
|
||||
}
|
Binary file not shown.
@ -1,14 +0,0 @@
|
||||
package com.twitter.simclusters_v2.hdfs_sources.injections
|
||||
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift
|
||||
import com.twitter.simclusters_v2.thriftscala.TopKTweetsWithScores
|
||||
import com.twitter.simclusters_v2.thriftscala.FullClusterId
|
||||
|
||||
object ClusterTopTweetsInjection {
|
||||
|
||||
val clusterIdToTopKTweetsInjection = KeyValInjection[FullClusterId, TopKTweetsWithScores](
|
||||
ScalaCompactThrift(FullClusterId),
|
||||
ScalaCompactThrift(TopKTweetsWithScores)
|
||||
)
|
||||
}
|
Binary file not shown.
@ -1,16 +0,0 @@
|
||||
package com.twitter.simclusters_v2.hdfs_sources.injections
|
||||
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaBinaryThrift
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.Long2BigEndian
|
||||
import com.twitter.simclusters_v2.common.UserId
|
||||
import com.twitter.simclusters_v2.thriftscala._
|
||||
|
||||
object ClusteringInjections {
|
||||
|
||||
final val OrderedClustersAndMembersInjection: KeyValInjection[
|
||||
UserId,
|
||||
OrderedClustersAndMembers
|
||||
] =
|
||||
KeyValInjection(Long2BigEndian, ScalaBinaryThrift(OrderedClustersAndMembers))
|
||||
}
|
Binary file not shown.
@ -1,47 +0,0 @@
|
||||
package com.twitter.simclusters_v2.hdfs_sources.injections
|
||||
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaBinaryThrift
|
||||
import com.twitter.simclusters_v2.thriftscala._
|
||||
import com.twitter.ml.api.thriftscala.Embedding
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.Long2BigEndian
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift
|
||||
|
||||
object EntityEmbeddingsInjections {
|
||||
|
||||
final val EntitySimClustersEmbeddingInjection: KeyValInjection[
|
||||
SimClustersEmbeddingId,
|
||||
SimClustersEmbedding
|
||||
] =
|
||||
KeyValInjection(
|
||||
ScalaBinaryThrift(SimClustersEmbeddingId),
|
||||
ScalaBinaryThrift(SimClustersEmbedding)
|
||||
)
|
||||
|
||||
final val InternalIdEmbeddingInjection: KeyValInjection[
|
||||
SimClustersEmbeddingId,
|
||||
InternalIdEmbedding
|
||||
] =
|
||||
KeyValInjection(
|
||||
ScalaBinaryThrift(SimClustersEmbeddingId),
|
||||
ScalaBinaryThrift(InternalIdEmbedding)
|
||||
)
|
||||
|
||||
final val EntitySimClustersMultiEmbeddingInjection: KeyValInjection[
|
||||
SimClustersMultiEmbeddingId,
|
||||
SimClustersMultiEmbedding
|
||||
] =
|
||||
KeyValInjection(
|
||||
ScalaBinaryThrift(SimClustersMultiEmbeddingId),
|
||||
ScalaBinaryThrift(SimClustersMultiEmbedding)
|
||||
)
|
||||
|
||||
final val UserMbcgEmbeddingInjection: KeyValInjection[
|
||||
Long,
|
||||
Embedding
|
||||
] =
|
||||
KeyValInjection[Long, Embedding](
|
||||
Long2BigEndian,
|
||||
ScalaCompactThrift(Embedding)
|
||||
)
|
||||
}
|
Binary file not shown.
@ -1,27 +0,0 @@
|
||||
package com.twitter.simclusters_v2.hdfs_sources.injections
|
||||
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{
|
||||
Int2BigEndian,
|
||||
Long2BigEndian,
|
||||
ScalaCompactThrift
|
||||
}
|
||||
import com.twitter.simclusters_v2.thriftscala.SimClustersInferredEntities
|
||||
|
||||
object InferredEntitiesInjections {
|
||||
|
||||
final val InferredEntityInjection: KeyValInjection[Long, SimClustersInferredEntities] =
|
||||
KeyValInjection(
|
||||
Long2BigEndian,
|
||||
ScalaCompactThrift(SimClustersInferredEntities)
|
||||
)
|
||||
|
||||
final val InferredEntityKeyedByClusterInjection: KeyValInjection[
|
||||
Int,
|
||||
SimClustersInferredEntities
|
||||
] =
|
||||
KeyValInjection(
|
||||
Int2BigEndian,
|
||||
ScalaCompactThrift(SimClustersInferredEntities)
|
||||
)
|
||||
}
|
Binary file not shown.
@ -1,13 +0,0 @@
|
||||
package com.twitter.simclusters_v2.hdfs_sources.injections
|
||||
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.StringUtf8
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.Long2BigEndian
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift
|
||||
import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn
|
||||
|
||||
object InterestedInInjection {
|
||||
val injection = KeyValInjection(Long2BigEndian, ScalaCompactThrift(ClustersUserIsInterestedIn))
|
||||
val languageInjection =
|
||||
KeyValInjection(StringUtf8, ScalaCompactThrift(ClustersUserIsInterestedIn))
|
||||
}
|
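// Hedged sketch (added for illustration, not part of the original file): the same
// KeyValInjection pattern keyed by an ISO country code instead of a user id or
// language. The per-country dataset is hypothetical; only the codec wiring is real.
object InterestedInByCountryInjectionExample {
  import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
  import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift
  import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.StringUtf8
  import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn

  // String keys are encoded as UTF-8; values as compact thrift, as above.
  val countryInjection: KeyValInjection[String, ClustersUserIsInterestedIn] =
    KeyValInjection(StringUtf8, ScalaCompactThrift(ClustersUserIsInterestedIn))
}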
Binary file not shown.
@ -1,12 +0,0 @@
|
||||
package com.twitter.simclusters_v2.hdfs_sources.injections
|
||||
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{
|
||||
Long2BigEndian,
|
||||
ScalaCompactThrift
|
||||
}
|
||||
import com.twitter.simclusters_v2.thriftscala._
|
||||
|
||||
object KnownForInjection {
|
||||
val injection = KeyValInjection(Long2BigEndian, ScalaCompactThrift(ClustersUserIsKnownFor))
|
||||
}
|
Binary file not shown.
@ -1,31 +0,0 @@
|
||||
package com.twitter.simclusters_v2.hdfs_sources.injections
|
||||
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift
|
||||
import com.twitter.simclusters_v2.thriftscala.LeftNode
|
||||
import com.twitter.simclusters_v2.thriftscala.NounWithFrequencyList
|
||||
import com.twitter.simclusters_v2.thriftscala.RightNode
|
||||
import com.twitter.simclusters_v2.thriftscala.RightNodeTypeStruct
|
||||
import com.twitter.simclusters_v2.thriftscala.RightNodeWithEdgeWeightList
|
||||
import com.twitter.simclusters_v2.thriftscala.SimilarRightNodes
|
||||
import com.twitter.simclusters_v2.thriftscala.CandidateTweetsList
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.Long2BigEndian
|
||||
|
||||
object MultiTypeGraphInjections {
|
||||
final val truncatedMultiTypeGraphInjection =
|
||||
KeyValInjection(ScalaCompactThrift(LeftNode), ScalaCompactThrift(RightNodeWithEdgeWeightList))
|
||||
final val topKRightNounListInjection =
|
||||
KeyValInjection(
|
||||
ScalaCompactThrift(RightNodeTypeStruct),
|
||||
ScalaCompactThrift(NounWithFrequencyList))
|
||||
final val similarRightNodesInjection =
|
||||
KeyValInjection[RightNode, SimilarRightNodes](
|
||||
ScalaCompactThrift(RightNode),
|
||||
ScalaCompactThrift(SimilarRightNodes)
|
||||
)
|
||||
final val tweetRecommendationsInjection =
|
||||
KeyValInjection[Long, CandidateTweetsList](
|
||||
Long2BigEndian,
|
||||
ScalaCompactThrift(CandidateTweetsList)
|
||||
)
|
||||
}
|
Binary file not shown.
@ -1,45 +0,0 @@
|
||||
package com.twitter.simclusters_v2.hdfs_sources.injections
|
||||
|
||||
import com.twitter.hermit.candidate.thriftscala.Candidates
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{
|
||||
Long2BigEndian,
|
||||
ScalaBinaryThrift,
|
||||
ScalaCompactThrift
|
||||
}
|
||||
import com.twitter.simclusters_v2.thriftscala.{
|
||||
PersistedFullClusterId,
|
||||
SimClustersEmbedding,
|
||||
SimClustersEmbeddingId,
|
||||
TopProducersWithScore,
|
||||
TopSimClustersWithScore
|
||||
}
|
||||
|
||||
object ProducerEmbeddingsInjections {
|
||||
final val ProducerTopKSimClusterEmbeddingsInjection: KeyValInjection[
|
||||
Long,
|
||||
TopSimClustersWithScore
|
||||
] =
|
||||
KeyValInjection(
|
||||
keyCodec = Long2BigEndian,
|
||||
valueCodec = ScalaCompactThrift(TopSimClustersWithScore))
|
||||
|
||||
final val SimClusterEmbeddingTopKProducersInjection: KeyValInjection[
|
||||
PersistedFullClusterId,
|
||||
TopProducersWithScore
|
||||
] =
|
||||
KeyValInjection(
|
||||
keyCodec = ScalaCompactThrift(PersistedFullClusterId),
|
||||
valueCodec = ScalaCompactThrift(TopProducersWithScore))
|
||||
|
||||
final val SimilarUsersInjection: KeyValInjection[Long, Candidates] =
|
||||
KeyValInjection(keyCodec = Long2BigEndian, valueCodec = ScalaCompactThrift(Candidates))
|
||||
|
||||
final val ProducerSimClustersEmbeddingInjection: KeyValInjection[
|
||||
SimClustersEmbeddingId,
|
||||
SimClustersEmbedding
|
||||
] =
|
||||
KeyValInjection(
|
||||
keyCodec = ScalaBinaryThrift(SimClustersEmbeddingId),
|
||||
valueCodec = ScalaBinaryThrift(SimClustersEmbedding))
|
||||
}
|
Binary file not shown.
@ -1,53 +0,0 @@
|
||||
package com.twitter.simclusters_v2.hdfs_sources.injections
|
||||
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{
|
||||
Long2BigEndian,
|
||||
ScalaCompactThrift,
|
||||
StringUtf8
|
||||
}
|
||||
import com.twitter.recos.entities.thriftscala.{
|
||||
SemanticCoreEntityScoreList,
|
||||
SemanticCoreEntityWithLocale,
|
||||
UserIdWithLocale,
|
||||
UserScoreList
|
||||
}
|
||||
|
||||
object SemanticCoreEntitiesInjections {
|
||||
|
||||
final val StringToSemanticCoreEntityScoreListInjection: KeyValInjection[
|
||||
String,
|
||||
SemanticCoreEntityScoreList
|
||||
] =
|
||||
KeyValInjection(
|
||||
StringUtf8,
|
||||
ScalaCompactThrift(SemanticCoreEntityScoreList)
|
||||
)
|
||||
|
||||
final val LongToSemanticCoreEntityScoreListInjection: KeyValInjection[
|
||||
Long,
|
||||
SemanticCoreEntityScoreList
|
||||
] =
|
||||
KeyValInjection(
|
||||
Long2BigEndian,
|
||||
ScalaCompactThrift(SemanticCoreEntityScoreList)
|
||||
)
|
||||
|
||||
final val UserWithLocaleToSemanticCoreEntityScoreListInjection: KeyValInjection[
|
||||
UserIdWithLocale,
|
||||
SemanticCoreEntityScoreList
|
||||
] =
|
||||
KeyValInjection(
|
||||
ScalaCompactThrift(UserIdWithLocale),
|
||||
ScalaCompactThrift(SemanticCoreEntityScoreList)
|
||||
)
|
||||
|
||||
final val SemanticCoreEntityWithLocaleToUsersScoreListInjection: KeyValInjection[
|
||||
SemanticCoreEntityWithLocale,
|
||||
UserScoreList
|
||||
] =
|
||||
KeyValInjection(
|
||||
ScalaCompactThrift(SemanticCoreEntityWithLocale),
|
||||
ScalaCompactThrift(UserScoreList)
|
||||
)
|
||||
}
|
Binary file not shown.
@ -1,12 +0,0 @@
|
||||
package com.twitter.simclusters_v2.hdfs_sources.injections
|
||||
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{
|
||||
Long2BigEndian,
|
||||
ScalaCompactThrift
|
||||
}
|
||||
import com.twitter.simclusters_v2.thriftscala.SingleSideUserScores
|
||||
|
||||
object SingleSideUserScoresInjection {
|
||||
val injection = KeyValInjection(Long2BigEndian, ScalaCompactThrift(SingleSideUserScores))
|
||||
}
|
@ -1,60 +0,0 @@
|
||||
scala_library(
|
||||
sources = ["*.scala"],
|
||||
platform = "java8",
|
||||
tags = ["bazel-compatible"],
|
||||
dependencies = [
|
||||
":data_sources",
|
||||
"3rdparty/src/jvm/com/twitter/scalding:core",
|
||||
"src/scala/com/twitter/scalding_internal/dalv2",
|
||||
"src/scala/com/twitter/scalding_internal/multiformat/format",
|
||||
"src/scala/com/twitter/scalding_internal/source/lzo_scrooge",
|
||||
"src/scala/com/twitter/simclusters_v2/common",
|
||||
"src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala",
|
||||
"src/thrift/com/twitter/wtf/entity_real_graph:entity_real_graph-thrift-scala",
|
||||
],
|
||||
)
|
||||
|
||||
scala_library(
|
||||
name = "data_sources",
|
||||
sources = [],
|
||||
description = "DAL datasets we wish to expose externally",
|
||||
platform = "java8",
|
||||
tags = ["bazel-compatible"],
|
||||
dependencies = [
|
||||
":reverse_index_semantic_core_per_language_simclusters_embeddings_presto-scala",
|
||||
":semantic_core_per_language_simclusters_embeddings_presto-scala",
|
||||
"src/scala/com/twitter/simclusters_v2/common",
|
||||
],
|
||||
)
|
||||
|
||||
create_datasets(
|
||||
base_name = "reverse_index_semantic_core_per_language_simclusters_embeddings_presto",
|
||||
java_schema = "com.twitter.simclusters_v2.thriftjava.InternalIdEmbeddingWithId",
|
||||
platform = "java8",
|
||||
role = "cassowary",
|
||||
scala_schema = "com.twitter.simclusters_v2.thriftscala.InternalIdEmbeddingWithId",
|
||||
segment_type = "snapshot",
|
||||
tags = ["bazel-compatible"],
|
||||
java_dependencies = [
|
||||
"src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java",
|
||||
],
|
||||
scala_dependencies = [
|
||||
"src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala",
|
||||
],
|
||||
)
|
||||
|
||||
create_datasets(
|
||||
base_name = "semantic_core_per_language_simclusters_embeddings_presto",
|
||||
java_schema = "com.twitter.simclusters_v2.thriftjava.SimClustersEmbeddingWithId",
|
||||
platform = "java8",
|
||||
role = "cassowary",
|
||||
scala_schema = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingWithId",
|
||||
segment_type = "snapshot",
|
||||
tags = ["bazel-compatible"],
|
||||
java_dependencies = [
|
||||
"src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java",
|
||||
],
|
||||
scala_dependencies = [
|
||||
"src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala",
|
||||
],
|
||||
)
|
Binary file not shown.
Binary file not shown.
@ -1,10 +0,0 @@
|
||||
package com.twitter.simclusters_v2.hdfs_sources.presto_hdfs_sources
|
||||
|
||||
object EntityEmbeddingsPrestoSources {
|
||||
|
||||
final val SemanticCorePerLanguageSimClustersEmbeddingsDataset =
|
||||
SemanticCorePerLanguageSimclustersEmbeddingsPrestoScalaDataset
|
||||
|
||||
final val ReverseIndexSemanticCorePerLanguageSimClustersEmbeddingsDataset =
|
||||
ReverseIndexSemanticCorePerLanguageSimclustersEmbeddingsPrestoScalaDataset
|
||||
}
|
Binary file not shown.
@ -1,521 +0,0 @@
|
||||
scala_library(
|
||||
sources = ["*.scala"],
|
||||
platform = "java8",
|
||||
tags = ["bazel-compatible"],
|
||||
dependencies = [
|
||||
"3rdparty/jvm/com/fasterxml/jackson:jackson-module-scala",
|
||||
"3rdparty/jvm/com/fasterxml/jackson/core:jackson-core",
|
||||
"3rdparty/jvm/com/fasterxml/jackson/core:jackson-databind",
|
||||
"3rdparty/jvm/com/fasterxml/jackson/module:jackson-module-scala",
|
||||
"3rdparty/jvm/com/googlecode/matrix-toolkits-java",
|
||||
"3rdparty/jvm/com/twitter/storehaus:algebra",
|
||||
"3rdparty/jvm/com/twitter/storehaus:core",
|
||||
"escherbird/src/scala/com/twitter/escherbird/scalding/source",
|
||||
"flockdb-tools/datasets/flock:flock-follows-edges-scala",
|
||||
"src/java/com/twitter/ml/api/constant",
|
||||
"src/java/com/twitter/sbf/core",
|
||||
"src/java/com/twitter/sbf/graph",
|
||||
"src/scala/com/twitter/frigate/user_sampler/common",
|
||||
"src/scala/com/twitter/ml/api:api-base",
|
||||
"src/scala/com/twitter/ml/api/bq",
|
||||
"src/scala/com/twitter/pluck/source/cassowary:sims",
|
||||
"src/scala/com/twitter/pluck/source/core_workflows/user_model:condensed_user_state-scala",
|
||||
"src/scala/com/twitter/scalding_internal/dalv2",
|
||||
"src/scala/com/twitter/scalding_internal/job",
|
||||
"src/scala/com/twitter/scalding_internal/job/analytics_batch",
|
||||
"src/scala/com/twitter/scalding_internal/source",
|
||||
"src/scala/com/twitter/scalding_internal/source/lzo_scrooge",
|
||||
"src/scala/com/twitter/simclusters_v2/candidate_source",
|
||||
"src/scala/com/twitter/simclusters_v2/hdfs_sources",
|
||||
"src/scala/com/twitter/simclusters_v2/scalding/common",
|
||||
"src/scala/com/twitter/simclusters_v2/summingbird/common",
|
||||
"src/scala/com/twitter/timelines/prediction/features/common",
|
||||
"src/scala/com/twitter/timelines/prediction/features/itl",
|
||||
"src/scala/com/twitter/timelines/prediction/features/recap",
|
||||
"src/scala/com/twitter/wtf/entity_real_graph/scalding/common",
|
||||
"src/thrift/com/twitter/hermit/candidate:hermit-candidate-scala",
|
||||
"src/thrift/com/twitter/wtf/scalding/sims:sims-thrift-scala",
|
||||
"twadoop_config/configuration/log_categories/group/recos-platform:content_recommender_get_content_recommendations-scala",
|
||||
"twadoop_config/configuration/log_categories/group/recos-platform:content_recommender_get_topic_tweets_recommendations-scala",
|
||||
"twadoop_config/configuration/log_categories/group/timeline:timeline_service_favorites-scala",
|
||||
"usersource/snapshot/src/main/scala/com/twitter/usersource/snapshot/flat:usersource_flat-scala",
|
||||
"usersource/snapshot/src/main/thrift/com/twitter/usersource/snapshot/flat:flat-scala",
|
||||
"util/util-core:util-core-util",
|
||||
],
|
||||
)
|
||||
|
||||
hadoop_binary(
|
||||
name = "evd_cluster_similarity",
|
||||
main = "com.twitter.simclusters_v2.scalding.EigenVectorsForClusterSimilarityAdhoc",
|
||||
platform = "java8",
|
||||
runtime_platform = "java8",
|
||||
tags = [
|
||||
"bazel-compatible",
|
||||
"bazel-compatible:migrated",
|
||||
"bazel-only",
|
||||
],
|
||||
dependencies = [
|
||||
":scalding",
|
||||
],
|
||||
)
|
||||
|
||||
hadoop_binary(
|
||||
name = "cluster_evaluation",
|
||||
main = "com.twitter.simclusters_v2.scalding.ClusterEvaluationAdhoc",
|
||||
platform = "java8",
|
||||
runtime_platform = "java8",
|
||||
tags = [
|
||||
"bazel-compatible",
|
||||
"bazel-compatible:migrated",
|
||||
"bazel-only",
|
||||
],
|
||||
dependencies = [
|
||||
":scalding",
|
||||
],
|
||||
)
|
||||
|
||||
hadoop_binary(
|
||||
name = "cluster_evaluation_20m_145k",
|
||||
main = "com.twitter.simclusters_v2.scalding.ClusterEvaluationFor20M145K",
|
||||
platform = "java8",
|
||||
runtime_platform = "java8",
|
||||
tags = [
|
||||
"bazel-compatible",
|
||||
"bazel-compatible:migrated",
|
||||
"bazel-only",
|
||||
],
|
||||
dependencies = [
|
||||
":scalding",
|
||||
],
|
||||
)
|
||||
|
||||
hadoop_binary(
|
||||
name = "cluster_evaluation_20m_145k_2020",
|
||||
main = "com.twitter.simclusters_v2.scalding.ClusterEvaluationFor20M145K2020",
|
||||
platform = "java8",
|
||||
runtime_platform = "java8",
|
||||
tags = [
|
||||
"bazel-compatible",
|
||||
"bazel-compatible:migrated",
|
||||
"bazel-only",
|
||||
],
|
||||
dependencies = [
|
||||
":scalding",
|
||||
],
|
||||
)
|
||||
|
||||
hadoop_binary(
|
||||
name = "bp_cluster_evaluation",
|
||||
main = "com.twitter.simclusters_v2.scalding.BipartiteClusterEvaluation",
|
||||
platform = "java8",
|
||||
runtime_platform = "java8",
|
||||
tags = [
|
||||
"bazel-compatible",
|
||||
"bazel-compatible:migrated",
|
||||
"bazel-only",
|
||||
],
|
||||
dependencies = [
|
||||
":scalding",
|
||||
],
|
||||
)
|
||||
|
||||
hadoop_binary(
|
||||
name = "update_knownfor",
|
||||
main = "com.twitter.simclusters_v2.scalding.UpdateKnownForAdhoc",
|
||||
platform = "java8",
|
||||
runtime_platform = "java8",
|
||||
tags = [
|
||||
"bazel-compatible",
|
||||
"bazel-compatible:migrated",
|
||||
"bazel-only",
|
||||
],
|
||||
dependencies = [
|
||||
":scalding",
|
||||
],
|
||||
)
|
||||
|
||||
hadoop_binary(
|
||||
name = "update_knownfor_prod",
|
||||
main = "com.twitter.simclusters_v2.scalding.UpdateKnownFor20M145K",
|
||||
platform = "java8",
|
||||
runtime_platform = "java8",
|
||||
tags = [
|
||||
"bazel-compatible",
|
||||
"bazel-compatible:migrated",
|
||||
"bazel-only",
|
||||
],
|
||||
dependencies = [
|
||||
":scalding",
|
||||
],
|
||||
)
|
||||
|
||||
hadoop_binary(
|
||||
name = "cluster_details",
|
||||
main = "com.twitter.simclusters_v2.scalding.ClusterDetailsBatch",
|
||||
platform = "java8",
|
||||
runtime_platform = "java8",
|
||||
tags = [
|
||||
"bazel-compatible",
|
||||
"bazel-compatible:migrated",
|
||||
"bazel-only",
|
||||
],
|
||||
dependencies = [
|
||||
":scalding",
|
||||
],
|
||||
)
|
||||
|
||||
hadoop_binary(
|
||||
name = "cluster_details_20m_145k_updated",
|
||||
main = "com.twitter.simclusters_v2.scalding.ClusterDetails20M145KUpdated",
|
||||
platform = "java8",
|
||||
runtime_platform = "java8",
|
||||
tags = [
|
||||
"bazel-compatible",
|
||||
"bazel-compatible:migrated",
|
||||
"bazel-only",
|
||||
],
|
||||
dependencies = [
|
||||
":scalding",
|
||||
],
|
||||
)
|
||||
|
||||
hadoop_binary(
|
||||
name = "cluster_details_20m_145k_2020",
|
||||
main = "com.twitter.simclusters_v2.scalding.ClusterDetails20M145K2020",
|
||||
platform = "java8",
|
||||
runtime_platform = "java8",
|
||||
tags = [
|
||||
"bazel-compatible",
|
||||
"bazel-compatible:migrated",
|
||||
"bazel-only",
|
||||
],
|
||||
dependencies = [
|
||||
":scalding",
|
||||
],
|
||||
)
|
||||
|
||||
hadoop_binary(
|
||||
name = "cluster_details-adhoc",
|
||||
main = "com.twitter.simclusters_v2.scalding.ClusterDetailsAdhoc",
|
||||
platform = "java8",
|
||||
runtime_platform = "java8",
|
||||
tags = [
|
||||
"bazel-compatible",
|
||||
"bazel-compatible:migrated",
|
||||
"bazel-only",
|
||||
],
|
||||
dependencies = [
|
||||
":scalding",
|
||||
],
|
||||
)
|
||||
|
||||
hadoop_binary(
|
||||
name = "cluster_details-dump",
|
||||
main = "com.twitter.simclusters_v2.scalding.DumpClusterDetailsAdhoc",
|
||||
platform = "java8",
|
||||
runtime_platform = "java8",
|
||||
tags = [
|
||||
"bazel-compatible",
|
||||
"bazel-compatible:migrated",
|
||||
"bazel-only",
|
||||
],
|
||||
dependencies = [
|
||||
":scalding",
|
||||
],
|
||||
)
|
||||
|
||||
hadoop_binary(
|
||||
name = "interested_in",
|
||||
main = "com.twitter.simclusters_v2.scalding.InterestedInFromKnownForBatch",
|
||||
platform = "java8",
|
||||
runtime_platform = "java8",
|
||||
tags = [
|
||||
"bazel-compatible",
|
||||
"bazel-compatible:migrated",
|
||||
"bazel-only",
|
||||
],
|
||||
dependencies = [
|
||||
":scalding",
|
||||
],
|
||||
)
|
||||
|
||||
hadoop_binary(
|
||||
name = "interested_in_from_producer_embeddings",
|
||||
main = "com.twitter.simclusters_v2.scalding.InterestedInFromProducerEmbeddingsBatchApp",
|
||||
platform = "java8",
|
||||
runtime_platform = "java8",
|
||||
tags = [
|
||||
"bazel-compatible",
|
||||
"bazel-compatible:migrated",
|
||||
"bazel-only",
|
||||
],
|
||||
dependencies = [
|
||||
":scalding",
|
||||
],
|
||||
)
|
||||
|
||||
hadoop_binary(
|
||||
name = "employee_graph_from_user_user",
|
||||
main = "com.twitter.simclusters_v2.scalding.EmployeeGraphFromUserUser",
|
||||
platform = "java8",
|
||||
runtime_platform = "java8",
|
||||
tags = [
|
||||
"bazel-compatible",
|
||||
"bazel-compatible:migrated",
|
||||
"bazel-only",
|
||||
],
|
||||
dependencies = [
|
||||
":scalding",
|
||||
],
|
||||
)
|
||||
|
||||
hadoop_binary(
|
||||
name = "interested_in_20m_145k_updated",
|
||||
main = "com.twitter.simclusters_v2.scalding.InterestedInFromKnownFor20M145KUpdated",
|
||||
platform = "java8",
|
||||
runtime_platform = "java8",
|
||||
tags = [
|
||||
"bazel-compatible",
|
||||
"bazel-compatible:migrated",
|
||||
"bazel-only",
|
||||
],
|
||||
dependencies = [
|
||||
":scalding",
|
||||
],
|
||||
)
|
||||
|
||||
hadoop_binary(
|
||||
name = "interested_in_20m_145k_2020",
|
||||
main = "com.twitter.simclusters_v2.scalding.InterestedInFromKnownFor20M145K2020",
|
||||
platform = "java8",
|
||||
runtime_platform = "java8",
|
||||
tags = [
|
||||
"bazel-compatible",
|
||||
"bazel-compatible:migrated",
|
||||
"bazel-only",
|
||||
],
|
||||
dependencies = [
|
||||
":scalding",
|
||||
],
|
||||
)
|
||||
|
||||
hadoop_binary(
|
||||
name = "interested_in_lite_20m_145k_2020",
|
||||
main = "com.twitter.simclusters_v2.scalding.InterestedInFromKnownForLite20M145K2020",
|
||||
platform = "java8",
|
||||
runtime_platform = "java8",
|
||||
tags = [
|
||||
"bazel-compatible",
|
||||
"bazel-compatible:migrated",
|
||||
"bazel-only",
|
||||
],
|
||||
dependencies = [
|
||||
":scalding",
|
||||
],
|
||||
)
|
||||
|
||||
hadoop_binary(
|
||||
name = "interested_in_lite_20m_145k_2020-adhoc",
|
||||
main = "com.twitter.simclusters_v2.scalding.InterestedInFromKnownForLite20M145K2020Adhoc",
|
||||
platform = "java8",
|
||||
runtime_platform = "java8",
|
||||
tags = [
|
||||
"bazel-compatible",
|
||||
"bazel-compatible:migrated",
|
||||
"bazel-only",
|
||||
],
|
||||
dependencies = [
|
||||
":scalding",
|
||||
],
|
||||
)
|
||||
|
||||
hadoop_binary(
|
||||
name = "interested_in_from_ape_2020-adhoc",
|
||||
main = "com.twitter.simclusters_v2.scalding.InterestedInFromAPE2020AdhocApp",
|
||||
platform = "java8",
|
||||
runtime_platform = "java8",
|
||||
tags = [
|
||||
"bazel-compatible",
|
||||
"bazel-compatible:migrated",
|
||||
"bazel-only",
|
||||
],
|
||||
dependencies = [
|
||||
":scalding",
|
||||
],
|
||||
)
|
||||
|
||||
hadoop_binary(
|
||||
name = "interested_in_from_ape_2020",
|
||||
main = "com.twitter.simclusters_v2.scalding.InterestedInFromAPE2020BatchApp",
|
||||
platform = "java8",
|
||||
runtime_platform = "java8",
|
||||
tags = [
|
||||
"bazel-compatible",
|
||||
"bazel-compatible:migrated",
|
||||
"bazel-only",
|
||||
],
|
||||
dependencies = [
|
||||
":scalding",
|
||||
],
|
||||
)
|
||||
|
||||
hadoop_binary(
|
||||
name = "known_for_to_mh",
|
||||
main = "com.twitter.simclusters_v2.scalding.KnownForToMHBatch",
|
||||
platform = "java8",
|
||||
runtime_platform = "java8",
|
||||
tags = [
|
||||
"bazel-compatible",
|
||||
"bazel-compatible:migrated",
|
||||
"bazel-only",
|
||||
],
|
||||
dependencies = [
|
||||
":scalding",
|
||||
],
|
||||
)
|
||||
|
||||
hadoop_binary(
|
||||
name = "user_user_normalized_graph",
|
||||
main = "com.twitter.simclusters_v2.scalding.UserUserNormalizedGraphBatch",
|
||||
platform = "java8",
|
||||
runtime_platform = "java8",
|
||||
tags = [
|
||||
"bazel-compatible",
|
||||
"bazel-compatible:migrated",
|
||||
"bazel-only",
|
||||
],
|
||||
dependencies = [
|
||||
":scalding",
|
||||
],
|
||||
)
|
||||
|
||||
hadoop_binary(
|
||||
name = "user_user_graph",
|
||||
main = "com.twitter.simclusters_v2.scalding.UserUserGraphBatch",
|
||||
platform = "java8",
|
||||
runtime_platform = "java8",
|
||||
tags = [
|
||||
"bazel-compatible",
|
||||
"bazel-compatible:migrated",
|
||||
"bazel-only",
|
||||
],
|
||||
dependencies = [
|
||||
":scalding",
|
||||
],
|
||||
)
|
||||
|
||||
hadoop_binary(
|
||||
name = "user_user_graph-adhoc",
|
||||
main = "com.twitter.simclusters_v2.scalding.UserUserGraphAdhoc",
|
||||
platform = "java8",
|
||||
runtime_platform = "java8",
|
||||
tags = [
|
||||
"bazel-compatible",
|
||||
"bazel-compatible:migrated",
|
||||
"bazel-only",
|
||||
],
|
||||
dependencies = [
|
||||
":scalding",
|
||||
],
|
||||
)
|
||||
|
||||
hadoop_binary(
|
||||
name = "producer_norms_and_counts",
|
||||
main = "com.twitter.simclusters_v2.scalding.ProducerNormsAndCountsBatch",
|
||||
platform = "java8",
|
||||
runtime_platform = "java8",
|
||||
tags = [
|
||||
"bazel-compatible",
|
||||
"bazel-compatible:migrated",
|
||||
"bazel-only",
|
||||
],
|
||||
dependencies = [
|
||||
":scalding",
|
||||
],
|
||||
)
|
||||
|
||||
hadoop_binary(
|
||||
name = "fav_graph",
|
||||
main = "com.twitter.simclusters_v2.scalding.UserUserFavGraphBatch",
|
||||
platform = "java8",
|
||||
runtime_platform = "java8",
|
||||
tags = [
|
||||
"bazel-compatible",
|
||||
"bazel-compatible:migrated",
|
||||
"bazel-only",
|
||||
],
|
||||
dependencies = [
|
||||
":scalding",
|
||||
],
|
||||
)
|
||||
|
||||
hadoop_binary(
|
||||
name = "top_users_similarity_graph",
|
||||
main = "com.twitter.simclusters_v2.scalding.TopUsersSimilarityGraphApp",
|
||||
platform = "java8",
|
||||
runtime_platform = "java8",
|
||||
tags = [
|
||||
"bazel-compatible",
|
||||
"bazel-compatible:migrated",
|
||||
"bazel-only",
|
||||
],
|
||||
dependencies = [
|
||||
":scalding",
|
||||
],
|
||||
)
|
||||
|
||||
hadoop_binary(
|
||||
name = "top_users_only",
|
||||
main = "com.twitter.simclusters_v2.scalding.TopUsersOnlyApp",
|
||||
platform = "java8",
|
||||
runtime_platform = "java8",
|
||||
tags = [
|
||||
"bazel-compatible",
|
||||
"bazel-compatible:migrated",
|
||||
"bazel-only",
|
||||
],
|
||||
dependencies = [
|
||||
":scalding",
|
||||
],
|
||||
)
|
||||
|
||||
hadoop_binary(
|
||||
name = "dump_fav_graph_adhoc",
|
||||
main = "com.twitter.simclusters_v2.scalding.DumpFavGraphAdhoc",
|
||||
platform = "java8",
|
||||
runtime_platform = "java8",
|
||||
tags = [
|
||||
"bazel-compatible",
|
||||
"bazel-compatible:migrated",
|
||||
"bazel-only",
|
||||
],
|
||||
dependencies = [
|
||||
":scalding",
|
||||
],
|
||||
)
|
||||
|
||||
# Generated with `capesospy-v2 create_target interested_in_for_20M_145k_2020 src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml`, config hash 8f19bf.
|
||||
scalding_job(
|
||||
name = "interested_in_for_20M_145k_2020",
|
||||
main = "com.twitter.simclusters_v2.scalding.InterestedInFromKnownFor20M145K2020",
|
||||
args = ["--socialProofThreshold 2 --maxClustersPerUser 50"],
|
||||
config = [
|
||||
("hadoop.combine-input", "true"),
|
||||
("hadoop.map.jvm.total-memory", "3072m"),
|
||||
("hadoop.reduce.jvm.total-memory", "3072m"),
|
||||
("hadoop.submitter.jvm.total-memory", "5120m"),
|
||||
("submitter.tier", "preemptible"),
|
||||
],
|
||||
cron = "14 * * * *",
|
||||
hadoop_cluster = "atla-proc",
|
||||
platform = "java8",
|
||||
role = "cassowary",
|
||||
runtime_platform = "java8",
|
||||
tags = [
|
||||
"bazel-compatible:migrated",
|
||||
"bazel-only",
|
||||
],
|
||||
dependencies = [
|
||||
":scalding",
|
||||
],
|
||||
)
|
BIN
src/scala/com/twitter/simclusters_v2/scalding/BUILD.docx
Normal file
Binary file not shown.
@ -1,513 +0,0 @@
|
||||
package com.twitter.simclusters_v2.scalding
|
||||
|
||||
import com.twitter.algebird.Aggregator
|
||||
import com.twitter.algebird.Monoid
|
||||
import com.twitter.scalding._
|
||||
import com.twitter.scalding.commons.source.VersionedKeyValSource
|
||||
import com.twitter.scalding.typed.TypedPipe
|
||||
import com.twitter.scalding_internal.dalv2.DAL
|
||||
import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation
|
||||
import com.twitter.scalding_internal.dalv2.remote_access.ProcAtla
|
||||
import com.twitter.scalding_internal.job.TwitterExecutionApp
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
|
||||
import com.twitter.simclusters_v2.hdfs_sources.AdhocKeyValSources
|
||||
import com.twitter.simclusters_v2.hdfs_sources.NormsAndCountsFixedPathSource
|
||||
import com.twitter.simclusters_v2.hdfs_sources.ProducerNormsAndCountsScalaDataset
|
||||
import com.twitter.simclusters_v2.hdfs_sources.SimclustersV2InterestedInScalaDataset
|
||||
import com.twitter.simclusters_v2.hdfs_sources.UserAndNeighborsFixedPathSource
|
||||
import com.twitter.simclusters_v2.hdfs_sources.UserUserNormalizedGraphScalaDataset
|
||||
import com.twitter.simclusters_v2.scalding.BipartiteClusterEvaluationClasses._
|
||||
import com.twitter.simclusters_v2.scalding.common.TypedRichPipe._
|
||||
import com.twitter.simclusters_v2.scalding.common.Util
|
||||
import com.twitter.simclusters_v2.thriftscala.BipartiteClusterQuality
|
||||
import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn
|
||||
import com.twitter.simclusters_v2.thriftscala.NeighborWithWeights
|
||||
import com.twitter.simclusters_v2.thriftscala.NormsAndCounts
|
||||
import com.twitter.simclusters_v2.thriftscala.UserAndNeighbors
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
object BipartiteClusterEvaluation extends TwitterExecutionApp {
|
||||
|
||||
implicit val tz: java.util.TimeZone = DateOps.UTC
|
||||
implicit val dp = DateParser.default
|
||||
|
||||
private def getClusterL2Norms(
|
||||
knownFor: TypedPipe[(Long, Array[(Int, Float)])]
|
||||
): Execution[Map[Int, Float]] = {
|
||||
knownFor
|
||||
.flatMap {
|
||||
case (_, clusterArray) =>
|
||||
clusterArray.map {
|
||||
case (clusterId, score) =>
|
||||
Map(clusterId -> score * score)
|
||||
}
|
||||
}
|
||||
.sum
|
||||
.getExecution
|
||||
.map(_.mapValues { x => math.sqrt(x).toFloat })
|
||||
}
|
||||
|
||||
def l2NormalizeKnownFor(
|
||||
knownFor: TypedPipe[(Long, Array[(Int, Float)])]
|
||||
): Execution[TypedPipe[(Long, Array[(Int, Float)])]] = {
|
||||
getClusterL2Norms(knownFor).map { clusterToNorms =>
|
||||
knownFor.mapValues { clusterScoresArray =>
|
||||
clusterScoresArray.map {
|
||||
case (clusterId, score) =>
|
||||
(clusterId, score / clusterToNorms(clusterId))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
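  // Hedged illustration (added for clarity, not in the original file): with the
  // input below, cluster 7 has L2 norm sqrt(3.0^2 + 4.0^2) = 5.0f, so
  // l2NormalizeKnownFor rescales user 1 to (7, 0.6f) and user 2 to (7, 0.8f).
  private val l2NormalizeExampleInput: TypedPipe[(Long, Array[(Int, Float)])] =
    TypedPipe.from(
      Seq(
        1L -> Array(7 -> 3.0f),
        2L -> Array(7 -> 4.0f)
      ))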
|
||||
/**
|
||||
* ./bazel bundle src/scala/com/twitter/simclusters_v2/scalding:bp_cluster_evaluation && \
|
||||
* oscar hdfs --user frigate --host hadoopnest2.atla.twitter.com --bundle bp_cluster_evaluation \
|
||||
* --tool com.twitter.simclusters_v2.scalding.BipartiteClusterEvaluation --screen --screen-detached \
|
||||
* --tee logs/newBpQuality_updateUnnormalizedScores_interestedInUsing20190329Graph_evaluatedOn20190329Graph_run2 \
|
||||
* -- --normsAndCountsDir /user/frigate/your_ldap/producerNormsAndCounts_20190330 \
|
||||
* --graphInputDir /user/frigate/your_ldap/user_user_normalized_graph_copiedFromAtlaProc_20190329 \
|
||||
* --knownForDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/knownFor \
|
||||
* --interestedInDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/interestedInUsing20190329Graph \
|
||||
* --outgoingVolumesResultsDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/bpQualityForInterestedInUsing20190329On20190329Graph_outgoingVolumes \
|
||||
* --incomingVolumesResultsDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/bpQualityForInterestedInUsing20190329On20190329Graph_incomingVolumes \
|
||||
* --outputDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/bpQualityForInterestedInUsing20190329On20190329Graph_perCluster \
|
||||
* --toEmailAddress your_ldap@twitter.com --modelVersion 20M_145K_updated
|
||||
*/
|
||||
override def job: Execution[Unit] = Execution.getConfigMode.flatMap {
|
||||
case (config, mode) =>
|
||||
Execution.withId { implicit uniqueId =>
|
||||
val args = config.getArgs
|
||||
|
||||
val interestedIn = args.optional("interestedInDir") match {
|
||||
case Some(dir) =>
|
||||
TypedPipe
|
||||
.from(AdhocKeyValSources.interestedInSource(dir))
|
||||
case None =>
|
||||
DAL
|
||||
.readMostRecentSnapshotNoOlderThan(
|
||||
SimclustersV2InterestedInScalaDataset,
|
||||
Days(20)
|
||||
)
|
||||
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
|
||||
.toTypedPipe
|
||||
.map {
|
||||
case KeyVal(key, value) => (key, value)
|
||||
}
|
||||
}
|
||||
|
||||
val inputKnownFor = args
|
||||
.optional("knownForDir")
|
||||
.map { location => KnownForSources.readKnownFor(location) }
|
||||
.getOrElse(KnownForSources.knownFor_20M_Dec11_145K)
|
||||
|
||||
val modelVersion =
|
||||
args.optional("modelVersion").getOrElse("20M_145K_dec11")
|
||||
|
||||
val useLogFavWeights = args.boolean("useLogFavWeights")
|
||||
|
||||
val shouldL2NormalizeKnownFor = args.boolean("l2NormalizeKnownFor")
|
||||
|
||||
val toEmailAddressOpt = args.optional("toEmailAddress")
|
||||
|
||||
val knownForExec = if (shouldL2NormalizeKnownFor) {
|
||||
l2NormalizeKnownFor(inputKnownFor)
|
||||
} else {
|
||||
Execution.from(inputKnownFor)
|
||||
}
|
||||
|
||||
val finalExec = knownForExec.flatMap { knownFor =>
|
||||
val graph = args.optional("graphInputDir") match {
|
||||
case Some(dir) =>
|
||||
TypedPipe.from(UserAndNeighborsFixedPathSource(dir))
|
||||
case None =>
|
||||
DAL
|
||||
.readMostRecentSnapshotNoOlderThan(UserUserNormalizedGraphScalaDataset, Days(20))
|
||||
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
|
||||
.toTypedPipe
|
||||
}
|
||||
|
||||
val producerNormsAndCounts = args.optional("normsAndCountsDir") match {
|
||||
case Some(dir) =>
|
||||
TypedPipe.from(NormsAndCountsFixedPathSource(dir))
|
||||
case None =>
|
||||
DAL
|
||||
.readMostRecentSnapshotNoOlderThan(ProducerNormsAndCountsScalaDataset, Days(20))
|
||||
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
|
||||
.toTypedPipe
|
||||
}
|
||||
|
||||
val clusterIncomingVolumesExec = loadOrMake(
|
||||
computeClusterIncomingVolumes(knownFor, producerNormsAndCounts, useLogFavWeights),
|
||||
modelVersion,
|
||||
args("incomingVolumesResultsDir")
|
||||
)
|
||||
|
||||
val resultsWithOutgoingVolumesExec = loadOrMake(
|
||||
getResultsWithOutgoingVolumes(graph, interestedIn, useLogFavWeights),
|
||||
modelVersion,
|
||||
args("outgoingVolumesResultsDir")
|
||||
)
|
||||
|
||||
val finalPerClusterResultsExec =
|
||||
finalPerClusterResults(
|
||||
knownFor,
|
||||
interestedIn,
|
||||
resultsWithOutgoingVolumesExec,
|
||||
clusterIncomingVolumesExec)
|
||||
.flatMap { pipe => loadOrMake(pipe, modelVersion, args("outputDir")) }
|
||||
|
||||
finalPerClusterResultsExec.flatMap { finalPerClusterResults =>
|
||||
val perClusterResults = finalPerClusterResults.values
|
||||
val distributionResultsExec = getClusterResultsSummary(perClusterResults).map {
|
||||
case Some(summary) =>
|
||||
"Summary of results across clusters: \n" +
|
||||
Util.prettyJsonMapper.writeValueAsString(summary)
|
||||
case _ =>
|
||||
"No summary of results! The cluster level results pipe must be empty!"
|
||||
}
|
||||
|
||||
val overallResultsExec = perClusterResults.sum.toOptionExecution.map {
|
||||
case Some(overallQuality) =>
|
||||
"Overall Quality: \n" +
|
||||
Util.prettyJsonMapper.writeValueAsString(
|
||||
printableBipartiteQuality(overallQuality)
|
||||
)
|
||||
case _ =>
|
||||
"No overall quality! The cluster level results pipe must be empty!"
|
||||
}
|
||||
|
||||
Execution.zip(distributionResultsExec, overallResultsExec).map {
|
||||
case (distResults, overallResults) =>
|
||||
toEmailAddressOpt.foreach { address =>
|
||||
Util.sendEmail(
|
||||
distResults + "\n" + overallResults,
|
||||
"Bipartite cluster quality for " + modelVersion,
|
||||
address
|
||||
)
|
||||
}
|
||||
println(distResults + "\n" + overallResults)
|
||||
}
|
||||
}
|
||||
}
|
||||
Util.printCounters(finalExec)
|
||||
}
|
||||
}
|
||||
|
||||
def getResultsWithOutgoingVolumes(
|
||||
graph: TypedPipe[UserAndNeighbors],
|
||||
interestedIn: TypedPipe[(Long, ClustersUserIsInterestedIn)],
|
||||
useLogFavWeights: Boolean
|
||||
): TypedPipe[(Int, BipartiteClusterQuality)] = {
|
||||
graph
|
||||
.map { un => (un.userId, un.neighbors) }
|
||||
// Should this be a leftJoin? For now, leaving it as an inner join. If, in the future,
|
||||
// we want to compare two approaches with very different coverage on interestedIn, this
|
||||
// could become a problem.
|
||||
.join(interestedIn)
|
||||
.withReducers(4000)
|
||||
.flatMap {
|
||||
case (userId, (neighbors, clusters)) =>
|
||||
getBIResultsFromSingleUser(userId, neighbors, clusters, useLogFavWeights)
|
||||
}
|
||||
.sumByKey
|
||||
.withReducers(600)
|
||||
.map {
|
||||
case (clusterId, bir) =>
|
||||
(
|
||||
clusterId,
|
||||
BipartiteClusterQuality(
|
||||
inClusterFollowEdges = Some(bir.inClusterWeights.isFollowEdge),
|
||||
inClusterFavEdges = Some(bir.inClusterWeights.isFavEdge),
|
||||
favWtSumOfInClusterFollowEdges = Some(bir.inClusterWeights.favWtIfFollowEdge),
|
||||
favWtSumOfInClusterFavEdges = Some(bir.inClusterWeights.favWtIfFavEdge),
|
||||
outgoingFollowEdges = Some(bir.totalOutgoingVolumes.isFollowEdge),
|
||||
outgoingFavEdges = Some(bir.totalOutgoingVolumes.isFavEdge),
|
||||
favWtSumOfOutgoingFollowEdges = Some(bir.totalOutgoingVolumes.favWtIfFollowEdge),
|
||||
favWtSumOfOutgoingFavEdges = Some(bir.totalOutgoingVolumes.favWtIfFavEdge),
|
||||
interestedInSize = Some(bir.interestedInSize),
|
||||
sampledEdges = Some(
|
||||
bir.edgeSample
|
||||
.iterator()
|
||||
.asScala
|
||||
.toSeq
|
||||
.map {
|
||||
case (edge, data) => makeThriftSampledEdge(edge, data)
|
||||
}
|
||||
)
|
||||
)
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
def getBIResultsFromSingleUser(
|
||||
userId: Long,
|
||||
neighbors: Seq[NeighborWithWeights],
|
||||
clusters: ClustersUserIsInterestedIn,
|
||||
useLogFavScores: Boolean
|
||||
): List[(Int, BipartiteIntermediateResults)] = {
|
||||
val neighborsToWeights = neighbors.map { neighborAndWeights =>
|
||||
val isFollowEdge = neighborAndWeights.isFollowed match {
|
||||
case Some(true) => 1.0
|
||||
case _ => 0.0
|
||||
}
|
||||
val favScore = if (useLogFavScores) {
|
||||
neighborAndWeights.logFavScore.getOrElse(0.0)
|
||||
} else neighborAndWeights.favScoreHalfLife100Days.getOrElse(0.0)
|
||||
val isFavEdge = math.min(1, math.ceil(favScore))
|
||||
neighborAndWeights.neighborId -> Weights(
|
||||
isFollowEdge,
|
||||
isFavEdge,
|
||||
favScore * isFollowEdge,
|
||||
favScore
|
||||
)
|
||||
}.toMap
|
||||
|
||||
val outgoingVolumes = Monoid.sum(neighborsToWeights.values)(WeightsMonoid)
|
||||
|
||||
clusters.clusterIdToScores.toList.map {
|
||||
case (clusterId, scoresStruct) =>
|
||||
val inClusterNeighbors =
|
||||
(scoresStruct.usersBeingFollowed.getOrElse(Nil) ++
|
||||
scoresStruct.usersThatWereFaved.getOrElse(Nil)).toSet
|
||||
val edgesForSampling = inClusterNeighbors.flatMap { neighborId =>
|
||||
if (neighborsToWeights.contains(neighborId)) {
|
||||
Some(
|
||||
(userId, neighborId),
|
||||
SampledEdgeData(
|
||||
neighborsToWeights(neighborId).favWtIfFollowEdge,
|
||||
neighborsToWeights(neighborId).favWtIfFavEdge,
|
||||
scoresStruct.followScore.getOrElse(0.0),
|
||||
scoresStruct.favScore.getOrElse(0.0)
|
||||
)
|
||||
)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
val inClusterWeights =
|
||||
Monoid.sum(neighborsToWeights.filterKeys(inClusterNeighbors).values)(WeightsMonoid)
|
||||
|
||||
(
|
||||
clusterId,
|
||||
BipartiteIntermediateResults(
|
||||
inClusterWeights,
|
||||
outgoingVolumes,
|
||||
1,
|
||||
samplerMonoid.build(edgesForSampling)
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
def computeClusterIncomingVolumes(
|
||||
knownFor: TypedPipe[(Long, Array[(Int, Float)])],
|
||||
producerNormsAndCounts: TypedPipe[NormsAndCounts],
|
||||
useLogFavWeights: Boolean
|
||||
): TypedPipe[(Int, BipartiteClusterQuality)] = {
|
||||
producerNormsAndCounts
|
||||
.map { x => (x.userId, x) }
|
||||
.join(knownFor)
|
||||
.withReducers(100)
|
||||
.flatMap {
|
||||
case (userId, (normsAndCounts, clusters)) =>
|
||||
clusters.map {
|
||||
case (clusterId, _) =>
|
||||
val followerCount =
|
||||
normsAndCounts.followerCount.getOrElse(0L).toDouble
|
||||
val faverCount = normsAndCounts.faverCount.getOrElse(0L).toDouble
|
||||
val favWtSumOfIncomingFollows = if (useLogFavWeights) {
|
||||
normsAndCounts.logFavWeightsOnFollowEdgesSum.getOrElse(0.0)
|
||||
} else {
|
||||
normsAndCounts.favWeightsOnFollowEdgesSum.getOrElse(0.0)
|
||||
}
|
||||
val favWtSumOfIncomingFavs = if (useLogFavWeights) {
|
||||
normsAndCounts.logFavWeightsOnFavEdgesSum.getOrElse(0.0)
|
||||
} else {
|
||||
normsAndCounts.favWeightsOnFavEdgesSum.getOrElse(0.0)
|
||||
}
|
||||
(
|
||||
clusterId,
|
||||
BipartiteClusterQuality(
|
||||
incomingFollowEdges = Some(followerCount),
|
||||
incomingFavEdges = Some(faverCount),
|
||||
favWtSumOfIncomingFollowEdges = Some(favWtSumOfIncomingFollows),
|
||||
favWtSumOfIncomingFavEdges = Some(favWtSumOfIncomingFavs)
|
||||
))
|
||||
}
|
||||
}
|
||||
.sumByKey
|
||||
.toTypedPipe
|
||||
}
|
||||
|
||||
def loadOrMake(
|
||||
pipe: TypedPipe[(Int, BipartiteClusterQuality)],
|
||||
modelVersion: String,
|
||||
path: String
|
||||
): Execution[TypedPipe[(Int, BipartiteClusterQuality)]] = {
|
||||
val mapped = pipe.map {
|
||||
case (clusterId, struct) => ((modelVersion, clusterId), struct)
|
||||
}
|
||||
makeForKeyValSource(mapped, AdhocKeyValSources.bipartiteQualitySource(path), path).map { pipe =>
|
||||
// discard model version
|
||||
pipe.map { case ((_, clusterId), struct) => (clusterId, struct) }
|
||||
}
|
||||
}
|
||||
|
||||
def makeForKeyValSource[K, V](
|
||||
pipe: TypedPipe[(K, V)],
|
||||
dest: VersionedKeyValSource[K, V],
|
||||
path: String
|
||||
): Execution[TypedPipe[(K, V)]] =
|
||||
Execution.getMode.flatMap { mode =>
|
||||
if (dest.resourceExists(mode)) {
|
||||
println(s"validated path $path")
|
||||
Execution.from(TypedPipe.from(dest))
|
||||
} else {
|
||||
println(s"Could not load from $path")
|
||||
pipe.writeThrough(dest)
|
||||
}
|
||||
}
|
||||
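  // Hedged usage sketch (added for illustration, not part of the original file):
  // caches the per-cluster incoming-volume quality under an adhoc HDFS path using
  // the loadOrMake pattern above; the path is supplied by the caller.
  def cachedIncomingVolumes(
    knownFor: TypedPipe[(Long, Array[(Int, Float)])],
    producerNormsAndCounts: TypedPipe[NormsAndCounts],
    modelVersion: String,
    path: String
  ): Execution[TypedPipe[(Int, BipartiteClusterQuality)]] =
    loadOrMake(
      computeClusterIncomingVolumes(knownFor, producerNormsAndCounts, useLogFavWeights = true),
      modelVersion,
      path
    )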
|
||||
def precisionOfWholeGraph(
|
||||
knownFor: TypedPipe[(Long, Array[(Int, Float)])],
|
||||
interestedIn: TypedPipe[(Long, ClustersUserIsInterestedIn)],
|
||||
clusterIncomingVolumesExec: Execution[TypedPipe[(Int, BipartiteClusterQuality)]]
|
||||
): Execution[Option[Double]] = {
|
||||
val knownForSizeExec = knownFor.aggregate(Aggregator.size).toOptionExecution
|
||||
val interestedInSizeExec =
|
||||
interestedIn.aggregate(Aggregator.size).toOptionExecution
|
||||
val numExec = clusterIncomingVolumesExec.flatMap { volumes =>
|
||||
volumes.values.flatMap(_.favWtSumOfIncomingFavEdges).sum.toOptionExecution
|
||||
}
|
||||
Execution.zip(numExec, interestedInSizeExec, knownForSizeExec).map {
|
||||
case (Some(num), Some(interestedInSize), Some(knownForSize)) =>
|
||||
Some(num / interestedInSize / knownForSize)
|
||||
case x @ _ =>
|
||||
println("Precision of whole graph zip: " + x)
|
||||
None
|
||||
}
|
||||
}
|
||||
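  // Note on precisionOfWholeGraph (comment added for clarity, not in the original):
  // the value computed above is roughly
  //   (sum over clusters of favWtSumOfIncomingFavEdges)
  //     / (# users with interestedIn) / (# users with knownFor),
  // i.e. the average fav-weighted edge mass per (consumer, producer) pair. It is
  // used below in getFullQuality as the whole-graph baseline for relativePrecision.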
|
||||
def finalPerClusterResults(
|
||||
knownFor: TypedPipe[(Long, Array[(Int, Float)])],
|
||||
interestedIn: TypedPipe[(Long, ClustersUserIsInterestedIn)],
|
||||
resultsWithOutgoingVolumesExec: Execution[TypedPipe[(Int, BipartiteClusterQuality)]],
|
||||
incomingVolumesExec: Execution[TypedPipe[(Int, BipartiteClusterQuality)]]
|
||||
): Execution[TypedPipe[(Int, BipartiteClusterQuality)]] = {
|
||||
val knownForTranspose = KnownForSources.transpose(knownFor)
|
||||
|
||||
val precisionOfWholeGraphExec =
|
||||
precisionOfWholeGraph(knownFor, interestedIn, incomingVolumesExec)
|
||||
|
||||
Execution
|
||||
.zip(resultsWithOutgoingVolumesExec, incomingVolumesExec, precisionOfWholeGraphExec)
|
||||
.map {
|
||||
case (resultsWithOutgoingVolumes, clusterIncomingVolumes, precisionOfWholeGraph) =>
|
||||
println("Precision of whole graph " + precisionOfWholeGraph)
|
||||
resultsWithOutgoingVolumes
|
||||
.join(knownForTranspose)
|
||||
.leftJoin(clusterIncomingVolumes)
|
||||
.withReducers(500)
|
||||
.map {
|
||||
case (clusterId, ((outgoingVolumeQuality, knownForList), incomingVolumesOpt)) =>
|
||||
val incomingVolumes =
|
||||
incomingVolumesOpt.getOrElse(BipartiteClusterQuality())
|
||||
val knownForMap = knownForList.toMap
|
||||
(
|
||||
clusterId,
|
||||
getFullQuality(
|
||||
outgoingVolumeQuality,
|
||||
incomingVolumes,
|
||||
knownForMap,
|
||||
precisionOfWholeGraph))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
def getFullQuality(
|
||||
qualityWithOutgoingVolumes: BipartiteClusterQuality,
|
||||
incomingVolumes: BipartiteClusterQuality,
|
||||
knownFor: Map[Long, Float],
|
||||
precisionOfWholeGraph: Option[Double]
|
||||
): BipartiteClusterQuality = {
|
||||
val newSampledEdges = qualityWithOutgoingVolumes.sampledEdges.map { sampledEdges =>
|
||||
sampledEdges.map { sampledEdge =>
|
||||
val knownForScore = knownFor.getOrElse(sampledEdge.followeeId, 0.0f)
|
||||
sampledEdge.copy(
|
||||
predictedFollowScore = sampledEdge.followScoreToCluster.map { x => x * knownForScore },
|
||||
predictedFavScore = sampledEdge.favScoreToCluster.map { x => x * knownForScore }
|
||||
)
|
||||
}
|
||||
}
|
||||
val correlationOfFavWtIfFollow = newSampledEdges.map { samples =>
|
||||
val pairs = samples.map { s =>
|
||||
(s.predictedFollowScore.getOrElse(0.0), s.favWtIfFollowEdge.getOrElse(0.0))
|
||||
}
|
||||
Util.computeCorrelation(pairs.iterator)
|
||||
}
|
||||
val correlationOfFavWtIfFav = newSampledEdges.map { samples =>
|
||||
val pairs = samples.map { s =>
|
||||
(s.predictedFavScore.getOrElse(0.0), s.favWtIfFavEdge.getOrElse(0.0))
|
||||
}
|
||||
Util.computeCorrelation(pairs.iterator)
|
||||
}
|
||||
val relativePrecisionNum = {
|
||||
if (qualityWithOutgoingVolumes.interestedInSize.exists(_ > 0) && knownFor.nonEmpty) {
|
||||
qualityWithOutgoingVolumes.favWtSumOfInClusterFavEdges
|
||||
.getOrElse(0.0) / qualityWithOutgoingVolumes.interestedInSize.get / knownFor.size
|
||||
} else 0.0
|
||||
}
|
||||
val relativePrecision = if (precisionOfWholeGraph.exists(_ > 0.0)) {
|
||||
Some(relativePrecisionNum / precisionOfWholeGraph.get)
|
||||
} else None
|
||||
qualityWithOutgoingVolumes.copy(
|
||||
incomingFollowEdges = incomingVolumes.incomingFollowEdges,
|
||||
incomingFavEdges = incomingVolumes.incomingFavEdges,
|
||||
favWtSumOfIncomingFollowEdges = incomingVolumes.favWtSumOfIncomingFollowEdges,
|
||||
favWtSumOfIncomingFavEdges = incomingVolumes.favWtSumOfIncomingFavEdges,
|
||||
knownForSize = Some(knownFor.size),
|
||||
correlationOfFavWtIfFollowWithPredictedFollow = correlationOfFavWtIfFollow,
|
||||
correlationOfFavWtIfFavWithPredictedFav = correlationOfFavWtIfFav,
|
||||
sampledEdges = newSampledEdges,
|
||||
relativePrecisionUsingFavWtIfFav = relativePrecision,
|
||||
averagePrecisionOfWholeGraphUsingFavWtIfFav = precisionOfWholeGraph
|
||||
)
|
||||
}
|
||||
}

object DumpBpQuality extends TwitterExecutionApp {
  def job: Execution[Unit] = Execution.getConfigMode.flatMap {
    case (config, mode) =>
      Execution.withId { implicit uniqueId =>
        val args = config.getArgs
        val inputDir = args("inputDir")

        val clusters = args.list("clusters").map(_.toInt).toSet
        val input =
          TypedPipe
            .from(AdhocKeyValSources.bipartiteQualitySource(inputDir))
            .map {
              case ((modelVersion, clusterId), quality) =>
                (
                  (modelVersion, clusterId),
                  BipartiteClusterEvaluationClasses
                    .printableBipartiteQuality(quality))
            }

        if (clusters.isEmpty) {
          input.printSummary("Bipartite quality")
        } else {
          input
            .collect {
              case rec @ ((_, clusterId), quality) if clusters(clusterId) =>
                Util.prettyJsonMapper
                  .writeValueAsString(rec)
                  .replaceAll("\n", " ")
            }
            .toIterableExecution
            .map { strings => println(strings.mkString("\n")) }
        }
      }
  }
}
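// Hedged usage sketch (not in the original file; the bundle target and paths are
// placeholders, only the argument names come from the job above):
//
//   scalding remote run --main-class com.twitter.simclusters_v2.scalding.DumpBpQuality \
//     --target src/scala/com/twitter/simclusters_v2/scalding:<bp_quality_bundle> \
//     --user recos-platform -- \
//     --inputDir /user/recos-platform/adhoc/<your_ldap>/bipartite_quality \
//     --clusters 5542 129677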
@@ -1,316 +0,0 @@
|
||||
package com.twitter.simclusters_v2.scalding

import com.twitter.algebird.{Monoid, OptionMonoid, Semigroup}
import com.twitter.algebird.mutable.PriorityQueueMonoid
import com.twitter.scalding.Execution
import com.twitter.scalding.typed.TypedPipe
import com.twitter.simclusters_v2.scalding.common.Util
import com.twitter.simclusters_v2.scalding.common.Util.Distribution
import com.twitter.simclusters_v2.thriftscala.{BipartiteClusterQuality, SampledEdge}
import java.util.PriorityQueue
import scala.collection.JavaConverters._

object BipartiteClusterEvaluationClasses {
  case class Weights(
    isFollowEdge: Double,
    isFavEdge: Double,
    favWtIfFollowEdge: Double,
    favWtIfFavEdge: Double)

  object WeightsMonoid extends Monoid[Weights] {
    override def zero = Weights(0.0, 0.0, 0.0, 0.0)

    override def plus(l: Weights, r: Weights): Weights = {
      Weights(
        l.isFollowEdge + r.isFollowEdge,
        l.isFavEdge + r.isFavEdge,
        l.favWtIfFollowEdge + r.favWtIfFollowEdge,
        l.favWtIfFavEdge + r.favWtIfFavEdge
      )
    }
  }

  implicit val wm: Monoid[Weights] = WeightsMonoid

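  // Hedged illustration (not in the original file): summing Weights with the monoid above
  // adds every field component-wise, so folding a pipe of per-edge Weights yields per-cluster
  // totals. For example:
  //   Monoid.sum(Seq(Weights(1.0, 0.0, 0.5, 0.2), Weights(0.0, 1.0, 0.1, 0.3)))
  //   // == Weights(1.0, 1.0, 0.6, 0.5)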
  case class SampledEdgeData(
    favWtIfFollowEdge: Double,
    favWtIfFavEdge: Double,
    followScoreToCluster: Double,
    favScoreToCluster: Double)

  implicit val samplerMonoid: PriorityQueueMonoid[((Long, Long), SampledEdgeData)] =
    Util.reservoirSamplerMonoidForPairs[(Long, Long), SampledEdgeData](2000)(Util.edgeOrdering)

  implicit val sampledEdgesMonoid: PriorityQueueMonoid[SampledEdge] =
    Util.reservoirSamplerMonoid(
      10000,
      { sampledEdge: SampledEdge => (sampledEdge.followerId, sampledEdge.followeeId) }
    )(Util.edgeOrdering)

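  // Hedged note (not in the original file): the two PriorityQueueMonoid instances above act
  // as bounded reservoir samplers, so summing them across partitions keeps at most
  // 2,000 (followerId, followeeId) -> SampledEdgeData pairs and 10,000 SampledEdge structs
  // per aggregation key instead of materialising every edge. A rough usage sketch under that
  // assumption:
  //   val sample = samplerMonoid.build(((followerId, followeeId), data))
  //   val merged = samplerMonoid.plus(sampleA, sampleB) // still capped at 2,000 entries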
case class BipartiteIntermediateResults(
|
||||
inClusterWeights: Weights,
|
||||
totalOutgoingVolumes: Weights,
|
||||
interestedInSize: Int,
|
||||
edgeSample: PriorityQueue[((Long, Long), SampledEdgeData)]) {
|
||||
override def toString: String = {
|
||||
"BCR(%s, %s, %d, %s)".format(
|
||||
inClusterWeights,
|
||||
totalOutgoingVolumes,
|
||||
interestedInSize,
|
||||
edgeSample.iterator().asScala.toSeq.toString()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
object BIRMonoid extends Monoid[BipartiteIntermediateResults] {
|
||||
override def zero =
|
||||
BipartiteIntermediateResults(WeightsMonoid.zero, WeightsMonoid.zero, 0, samplerMonoid.zero)
|
||||
|
||||
override def plus(
|
||||
l: BipartiteIntermediateResults,
|
||||
r: BipartiteIntermediateResults
|
||||
): BipartiteIntermediateResults = {
|
||||
BipartiteIntermediateResults(
|
||||
WeightsMonoid.plus(l.inClusterWeights, r.inClusterWeights),
|
||||
WeightsMonoid.plus(l.totalOutgoingVolumes, r.totalOutgoingVolumes),
|
||||
l.interestedInSize + r.interestedInSize,
|
||||
samplerMonoid.plus(l.edgeSample, r.edgeSample)
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
implicit val bIRMonoid: Monoid[BipartiteIntermediateResults] = BIRMonoid
|
||||
|
||||
def makeThriftSampledEdge(edge: (Long, Long), data: SampledEdgeData): SampledEdge = {
|
||||
val (followerId, followeeId) = edge
|
||||
SampledEdge(
|
||||
followerId = followerId,
|
||||
followeeId = followeeId,
|
||||
favWtIfFollowEdge = Some(data.favWtIfFollowEdge),
|
||||
favWtIfFavEdge = Some(data.favWtIfFavEdge),
|
||||
followScoreToCluster = Some(data.followScoreToCluster),
|
||||
favScoreToCluster = Some(data.favScoreToCluster)
|
||||
)
|
||||
}
|
||||
|
||||
object ClusterQualitySemigroup extends Semigroup[BipartiteClusterQuality] {
|
||||
val doubleOM: Monoid[Option[Double]] = new OptionMonoid[Double]
|
||||
val intOM: Monoid[Option[Int]] = new OptionMonoid[Int]
|
||||
val longOM: Monoid[Option[Long]] = new OptionMonoid[Long]
|
||||
|
||||
override def plus(l: BipartiteClusterQuality, r: BipartiteClusterQuality) =
|
||||
BipartiteClusterQuality(
|
||||
inClusterFollowEdges = doubleOM.plus(l.inClusterFollowEdges, r.inClusterFollowEdges),
|
||||
inClusterFavEdges = doubleOM.plus(l.inClusterFavEdges, r.inClusterFavEdges),
|
||||
favWtSumOfInClusterFollowEdges = doubleOM
|
||||
.plus(l.favWtSumOfInClusterFollowEdges, r.favWtSumOfInClusterFollowEdges),
|
||||
favWtSumOfInClusterFavEdges = doubleOM
|
||||
.plus(l.favWtSumOfInClusterFavEdges, r.favWtSumOfInClusterFavEdges),
|
||||
outgoingFollowEdges = doubleOM.plus(l.outgoingFollowEdges, r.outgoingFollowEdges),
|
||||
outgoingFavEdges = doubleOM.plus(l.outgoingFavEdges, r.outgoingFavEdges),
|
||||
favWtSumOfOutgoingFollowEdges = doubleOM
|
||||
.plus(l.favWtSumOfOutgoingFollowEdges, r.favWtSumOfOutgoingFollowEdges),
|
||||
favWtSumOfOutgoingFavEdges = doubleOM
|
||||
.plus(l.favWtSumOfOutgoingFavEdges, r.favWtSumOfOutgoingFavEdges),
|
||||
incomingFollowEdges = doubleOM.plus(l.incomingFollowEdges, r.incomingFollowEdges),
|
||||
incomingFavEdges = doubleOM.plus(l.incomingFavEdges, r.incomingFavEdges),
|
||||
favWtSumOfIncomingFollowEdges = doubleOM
|
||||
.plus(l.favWtSumOfIncomingFollowEdges, r.favWtSumOfIncomingFollowEdges),
|
||||
favWtSumOfIncomingFavEdges = doubleOM
|
||||
.plus(l.favWtSumOfIncomingFavEdges, r.favWtSumOfIncomingFavEdges),
|
||||
interestedInSize = None,
|
||||
sampledEdges = Some(
|
||||
sampledEdgesMonoid
|
||||
.plus(
|
||||
sampledEdgesMonoid.build(l.sampledEdges.getOrElse(Nil)),
|
||||
sampledEdgesMonoid.build(r.sampledEdges.getOrElse(Nil))
|
||||
)
|
||||
.iterator()
|
||||
.asScala
|
||||
.toSeq),
|
||||
knownForSize = intOM.plus(l.knownForSize, r.knownForSize),
|
||||
correlationOfFavWtIfFollowWithPredictedFollow = None,
|
||||
correlationOfFavWtIfFavWithPredictedFav = None,
|
||||
relativePrecisionUsingFavWtIfFav = None,
|
||||
averagePrecisionOfWholeGraphUsingFavWtIfFav = l.averagePrecisionOfWholeGraphUsingFavWtIfFav
|
||||
)
|
||||
}
|
||||
|
||||
implicit val bcqSemigroup: Semigroup[BipartiteClusterQuality] =
|
||||
ClusterQualitySemigroup
|
||||
|
||||
case class PrintableBipartiteQuality(
|
||||
incomingFollowUnweightedRecall: String,
|
||||
incomingFavUnweightedRecall: String,
|
||||
incomingFollowWeightedRecall: String,
|
||||
incomingFavWeightedRecall: String,
|
||||
outgoingFollowUnweightedRecall: String,
|
||||
outgoingFavUnweightedRecall: String,
|
||||
outgoingFollowWeightedRecall: String,
|
||||
outgoingFavWeightedRecall: String,
|
||||
incomingFollowEdges: String,
|
||||
incomingFavEdges: String,
|
||||
favWtSumOfIncomingFollowEdges: String,
|
||||
favWtSumOfIncomingFavEdges: String,
|
||||
outgoingFollowEdges: String,
|
||||
outgoingFavEdges: String,
|
||||
favWtSumOfOutgoingFollowEdges: String,
|
||||
favWtSumOfOutgoingFavEdges: String,
|
||||
correlationOfFavWtIfFollow: String,
|
||||
correlationOfFavWtIfFav: String,
|
||||
relativePrecisionUsingFavWt: String,
|
||||
averagePrecisionOfWholeGraphUsingFavWt: String,
|
||||
interestedInSize: String,
|
||||
knownForSize: String)
|
||||
|
||||
def printableBipartiteQuality(in: BipartiteClusterQuality): PrintableBipartiteQuality = {
|
||||
def getRatio(numOpt: Option[Double], denOpt: Option[Double]): String = {
|
||||
val r = if (denOpt.exists(_ > 0)) {
|
||||
numOpt.getOrElse(0.0) / denOpt.get
|
||||
} else 0.0
|
||||
"%.3f".format(r)
|
||||
}
|
||||
|
||||
val formatter = new java.text.DecimalFormat("###,###.#")
|
||||
|
||||
def denString(denOpt: Option[Double]): String =
|
||||
formatter.format(denOpt.getOrElse(0.0))
|
||||
|
||||
val correlationOfFavWtIfFollow =
|
||||
in.correlationOfFavWtIfFollowWithPredictedFollow match {
|
||||
case None =>
|
||||
in.sampledEdges.map { samples =>
|
||||
val pairs = samples.map { s =>
|
||||
(s.predictedFollowScore.getOrElse(0.0), s.favWtIfFollowEdge.getOrElse(0.0))
|
||||
}
|
||||
Util.computeCorrelation(pairs.iterator)
|
||||
}
|
||||
case x @ _ => x
|
||||
}
|
||||
|
||||
val correlationOfFavWtIfFav =
|
||||
in.correlationOfFavWtIfFavWithPredictedFav match {
|
||||
case None =>
|
||||
in.sampledEdges.map { samples =>
|
||||
val pairs = samples.map { s =>
|
||||
(s.predictedFavScore.getOrElse(0.0), s.favWtIfFavEdge.getOrElse(0.0))
|
||||
}
|
||||
Util.computeCorrelation(pairs.iterator)
|
||||
}
|
||||
case x @ _ => x
|
||||
}
|
||||
|
||||
PrintableBipartiteQuality(
|
||||
incomingFollowUnweightedRecall = getRatio(in.inClusterFollowEdges, in.incomingFollowEdges),
|
||||
incomingFavUnweightedRecall = getRatio(in.inClusterFavEdges, in.incomingFavEdges),
|
||||
incomingFollowWeightedRecall =
|
||||
getRatio(in.favWtSumOfInClusterFollowEdges, in.favWtSumOfIncomingFollowEdges),
|
||||
incomingFavWeightedRecall =
|
||||
getRatio(in.favWtSumOfInClusterFavEdges, in.favWtSumOfIncomingFavEdges),
|
||||
outgoingFollowUnweightedRecall = getRatio(in.inClusterFollowEdges, in.outgoingFollowEdges),
|
||||
outgoingFavUnweightedRecall = getRatio(in.inClusterFavEdges, in.outgoingFavEdges),
|
||||
outgoingFollowWeightedRecall =
|
||||
getRatio(in.favWtSumOfInClusterFollowEdges, in.favWtSumOfOutgoingFollowEdges),
|
||||
outgoingFavWeightedRecall =
|
||||
getRatio(in.favWtSumOfInClusterFavEdges, in.favWtSumOfOutgoingFavEdges),
|
||||
incomingFollowEdges = denString(in.incomingFollowEdges),
|
||||
incomingFavEdges = denString(in.incomingFavEdges),
|
||||
favWtSumOfIncomingFollowEdges = denString(in.favWtSumOfIncomingFollowEdges),
|
||||
favWtSumOfIncomingFavEdges = denString(in.favWtSumOfIncomingFavEdges),
|
||||
outgoingFollowEdges = denString(in.outgoingFollowEdges),
|
||||
outgoingFavEdges = denString(in.outgoingFavEdges),
|
||||
favWtSumOfOutgoingFollowEdges = denString(in.favWtSumOfOutgoingFollowEdges),
|
||||
favWtSumOfOutgoingFavEdges = denString(in.favWtSumOfOutgoingFavEdges),
|
||||
correlationOfFavWtIfFollow = "%.3f"
|
||||
.format(correlationOfFavWtIfFollow.getOrElse(0.0)),
|
||||
correlationOfFavWtIfFav = "%.3f"
|
||||
.format(correlationOfFavWtIfFav.getOrElse(0.0)),
|
||||
relativePrecisionUsingFavWt =
|
||||
"%.2g".format(in.relativePrecisionUsingFavWtIfFav.getOrElse(0.0)),
|
||||
averagePrecisionOfWholeGraphUsingFavWt =
|
||||
"%.2g".format(in.averagePrecisionOfWholeGraphUsingFavWtIfFav.getOrElse(0.0)),
|
||||
interestedInSize = in.interestedInSize.getOrElse(0).toString,
|
||||
knownForSize = in.knownForSize.getOrElse(0).toString
|
||||
)
|
||||
}
|
||||
|
||||
case class ClusterResultsSummary(
|
||||
numClustersWithZeroInterestedIn: Int,
|
||||
numClustersWithZeroFollowWtRecall: Int,
|
||||
numClustersWithZeroFavWtRecall: Int,
|
||||
numClustersWithZeroFollowAndFavWtRecall: Int,
|
||||
interestedInSizeDist: Distribution,
|
||||
outgoingFollowWtRecallDist: Distribution,
|
||||
outgoingFavWtRecallDist: Distribution,
|
||||
incomingFollowWtRecallDist: Distribution,
|
||||
incomingFavWtRecallDist: Distribution,
|
||||
followCorrelationDist: Distribution,
|
||||
favCorrelationDist: Distribution,
|
||||
relativePrecisionDist: Distribution)
|
||||
|
||||
def getClusterResultsSummary(
|
||||
perClusterResults: TypedPipe[BipartiteClusterQuality]
|
||||
): Execution[Option[ClusterResultsSummary]] = {
|
||||
perClusterResults
|
||||
.map { clusterQuality =>
|
||||
val printableQuality = printableBipartiteQuality(clusterQuality)
|
||||
val isFollowRecallZero =
|
||||
if (!clusterQuality.favWtSumOfInClusterFollowEdges
|
||||
.exists(_ > 0)) 1
|
||||
else 0
|
||||
val isFavRecallZero =
|
||||
if (!clusterQuality.favWtSumOfInClusterFavEdges.exists(_ > 0)) 1
|
||||
else 0
|
||||
(
|
||||
if (!clusterQuality.interestedInSize.exists(_ > 0)) 1 else 0,
|
||||
isFollowRecallZero,
|
||||
isFavRecallZero,
|
||||
isFavRecallZero * isFollowRecallZero,
|
||||
clusterQuality.interestedInSize.toList.map(_.toDouble),
|
||||
List(printableQuality.outgoingFollowWeightedRecall.toDouble),
|
||||
List(printableQuality.outgoingFavWeightedRecall.toDouble),
|
||||
List(printableQuality.incomingFollowWeightedRecall.toDouble),
|
||||
List(printableQuality.incomingFavWeightedRecall.toDouble),
|
||||
List(printableQuality.correlationOfFavWtIfFollow.toDouble),
|
||||
List(printableQuality.correlationOfFavWtIfFav.toDouble),
|
||||
List(printableQuality.relativePrecisionUsingFavWt.toDouble)
|
||||
)
|
||||
}
|
||||
.sum
|
||||
.toOptionExecution
|
||||
.map { opt =>
|
||||
opt.map {
|
||||
case (
|
||||
zeroInterestedIn,
|
||||
zeroFollowRecall,
|
||||
zeroFavRecall,
|
||||
zeroFollowAndFavRecall,
|
||||
interestedInSizeList,
|
||||
outgoingFollowWtRecallList,
|
||||
outgoingFavWtRecallList,
|
||||
incomingFollowWtRecallList,
|
||||
incomingFavWtRecallList,
|
||||
followCorrelationList,
|
||||
favCorrelationList,
|
||||
relativePrecisionList
|
||||
) =>
|
||||
ClusterResultsSummary(
|
||||
numClustersWithZeroInterestedIn = zeroInterestedIn,
|
||||
numClustersWithZeroFollowWtRecall = zeroFollowRecall,
|
||||
numClustersWithZeroFavWtRecall = zeroFavRecall,
|
||||
numClustersWithZeroFollowAndFavWtRecall = zeroFollowAndFavRecall,
|
||||
interestedInSizeDist = Util.distributionFromArray(interestedInSizeList.toArray),
|
||||
outgoingFollowWtRecallDist = Util
|
||||
.distributionFromArray(outgoingFollowWtRecallList.toArray),
|
||||
outgoingFavWtRecallDist = Util.distributionFromArray(outgoingFavWtRecallList.toArray),
|
||||
incomingFollowWtRecallDist = Util
|
||||
.distributionFromArray(incomingFollowWtRecallList.toArray),
|
||||
incomingFavWtRecallDist = Util.distributionFromArray(incomingFavWtRecallList.toArray),
|
||||
followCorrelationDist = Util.distributionFromArray(followCorrelationList.toArray),
|
||||
favCorrelationDist = Util.distributionFromArray(favCorrelationList.toArray),
|
||||
relativePrecisionDist = Util.distributionFromArray(relativePrecisionList.toArray)
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
@@ -1,794 +0,0 @@
|
||||
package com.twitter.simclusters_v2.scalding
|
||||
|
||||
import com.twitter.algebird.OptionMonoid
|
||||
import com.twitter.algebird.QTree
|
||||
import com.twitter.algebird.QTreeSemigroup
|
||||
import com.twitter.algebird.Semigroup
|
||||
import com.twitter.dal.client.dataset.KeyValDALDataset
|
||||
import com.twitter.dal.client.dataset.SnapshotDALDataset
|
||||
import com.twitter.hermit.candidate.thriftscala.Candidates
|
||||
import com.twitter.pluck.source.cassowary.FollowingsCosineSimilaritiesManhattanSource
|
||||
import com.twitter.pluck.source.cassowary.SimsCandidatesSource
|
||||
import com.twitter.scalding._
|
||||
import com.twitter.scalding_internal.dalv2.DAL
|
||||
import com.twitter.scalding_internal.dalv2.DALWrite._
|
||||
import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation
|
||||
import com.twitter.scalding_internal.dalv2.remote_access.ProcAtla
|
||||
import com.twitter.scalding_internal.job.TwitterExecutionApp
|
||||
import com.twitter.scalding_internal.job.analytics_batch._
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
|
||||
import com.twitter.simclusters_v2.common.ModelVersions
|
||||
import com.twitter.simclusters_v2.hdfs_sources._
|
||||
import com.twitter.simclusters_v2.scalding.common.Util
|
||||
import com.twitter.simclusters_v2.scalding.embedding.common.ExternalDataSources
|
||||
import com.twitter.simclusters_v2.thriftscala._
|
||||
import com.twitter.usersource.snapshot.flat.UsersourceFlatScalaDataset
|
||||
import com.twitter.usersource.snapshot.flat.thriftscala.FlatUser
|
||||
|
||||
object ClusterDetailsJob {
|
||||
case class Scores(followScore: Double, favScore: Double, logFavScore: Double)
|
||||
|
||||
case class IntermediateDetails(
|
||||
numUsersWithAnyNonZeroScore: Int,
|
||||
numUsersWithNonZeroFollowScore: Int,
|
||||
numUsersWithNonZeroFavScore: Int,
|
||||
favQTree: Option[QTree[Double]],
|
||||
followQTree: Option[QTree[Double]],
|
||||
logFavQTree: Option[QTree[Double]],
|
||||
sumOfSquares: Scores,
|
||||
sum: Scores,
|
||||
min: Scores,
|
||||
max: Scores)
|
||||
|
||||
case class InfoFromUserSource(
|
||||
fractionMarkedNSFWUser: Double,
|
||||
languageToFractionDeviceLanguage: Map[String, Double],
|
||||
countryCodeToFractionKnownForWithCountryCode: Map[String, Double],
|
||||
languageToFractionInferredLanguage: Map[String, Double])
|
||||
|
||||
  def positiveMin(a: Double, b: Double) = {
    if (math.min(a, b) == 0.0) math.max(a, b) else math.min(a, b)
  }
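  // Hedged worked example (not in the original file): positiveMin treats 0.0 as "no score"
  // rather than as a genuine minimum, so positiveMin(0.0, 3.0) == 3.0 while
  // positiveMin(2.0, 3.0) == 2.0 and positiveMin(0.0, 0.0) == 0.0. This keeps the running
  // per-cluster minimum in ClusterDetailsSemigroup from collapsing to zero when a user has
  // no score of a given kind.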
|
||||
case class ClusterDetailsSemigroup(implicit qtreeSemigroup: Semigroup[QTree[Double]])
|
||||
extends Semigroup[IntermediateDetails] {
|
||||
val optionMonoid: OptionMonoid[QTree[Double]] = new OptionMonoid[QTree[Double]]()
|
||||
override def plus(
|
||||
left: IntermediateDetails,
|
||||
right: IntermediateDetails
|
||||
): IntermediateDetails = {
|
||||
IntermediateDetails(
|
||||
left.numUsersWithAnyNonZeroScore + right.numUsersWithAnyNonZeroScore,
|
||||
left.numUsersWithNonZeroFollowScore + right.numUsersWithNonZeroFollowScore,
|
||||
left.numUsersWithNonZeroFavScore + right.numUsersWithNonZeroFavScore,
|
||||
optionMonoid.plus(left.favQTree, right.favQTree),
|
||||
optionMonoid.plus(left.followQTree, right.followQTree),
|
||||
optionMonoid.plus(left.logFavQTree, right.logFavQTree),
|
||||
Scores(
|
||||
left.sumOfSquares.followScore + right.sumOfSquares.followScore,
|
||||
left.sumOfSquares.favScore + right.sumOfSquares.favScore,
|
||||
left.sumOfSquares.logFavScore + right.sumOfSquares.logFavScore
|
||||
),
|
||||
Scores(
|
||||
left.sum.followScore + right.sum.followScore,
|
||||
left.sum.favScore + right.sum.favScore,
|
||||
left.sum.logFavScore + right.sum.logFavScore
|
||||
),
|
||||
Scores(
|
||||
positiveMin(left.min.followScore, right.min.followScore),
|
||||
positiveMin(left.min.favScore, right.min.favScore),
|
||||
positiveMin(left.min.logFavScore, right.min.logFavScore)
|
||||
),
|
||||
Scores(
|
||||
math.max(left.max.followScore, right.max.followScore),
|
||||
math.max(left.max.favScore, right.max.favScore),
|
||||
math.max(left.max.logFavScore, right.max.logFavScore)
|
||||
)
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
def intermediateDetailsPipe(
|
||||
input: TypedPipe[(Long, ClustersUserIsInterestedIn)],
|
||||
qtreeSemigroupKParameter: Int
|
||||
): TypedPipe[(Int, IntermediateDetails)] = {
|
||||
implicit val qtSg: Semigroup[QTree[Double]] =
|
||||
new QTreeSemigroup[Double](qtreeSemigroupKParameter)
|
||||
implicit val cdSg: Semigroup[IntermediateDetails] = ClusterDetailsSemigroup()
|
||||
input
|
||||
.flatMap {
|
||||
case (userId, clusterScoresStruct) =>
|
||||
val clusterScoresArray = clusterScoresStruct.clusterIdToScores.toArray
|
||||
clusterScoresArray.map {
|
||||
case (clusterId, scoresStruct) =>
|
||||
val followScore = scoresStruct.followScore.getOrElse(0.0)
|
||||
val favScore = scoresStruct.favScore.getOrElse(0.0)
|
||||
val logFavScore = scoresStruct.logFavScore.getOrElse(0.0)
|
||||
(
|
||||
clusterId,
|
||||
IntermediateDetails(
|
||||
numUsersWithAnyNonZeroScore = 1,
|
||||
numUsersWithNonZeroFollowScore = if (followScore > 0) 1 else 0,
|
||||
numUsersWithNonZeroFavScore = if (favScore > 0) 1 else 0,
|
||||
favQTree = if (favScore > 0) Some(QTree(favScore)) else None,
|
||||
followQTree = if (followScore > 0) Some(QTree(followScore)) else None,
|
||||
logFavQTree = if (logFavScore > 0) Some(QTree(logFavScore)) else None,
|
||||
sumOfSquares = Scores(
|
||||
followScore * followScore,
|
||||
favScore * favScore,
|
||||
logFavScore * logFavScore),
|
||||
sum = Scores(followScore, favScore, logFavScore),
|
||||
min = Scores(followScore, favScore, logFavScore),
|
||||
max = Scores(followScore, favScore, logFavScore)
|
||||
)
|
||||
)
|
||||
}
|
||||
}
|
||||
.sumByKey
|
||||
// Uncomment for adhoc job
|
||||
//.withReducers(100)
|
||||
.toTypedPipe
|
||||
}
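  // Hedged illustration (not in the original file): intermediateDetailsPipe above emits one
  // seed IntermediateDetails per (clusterId, user) and relies on sumByKey plus
  // ClusterDetailsSemigroup to aggregate them. For a user whose scores in a cluster are
  // followScore = 0.5, favScore = 0.0, logFavScore = 0.0, the seed record has
  // numUsersWithAnyNonZeroScore = 1, numUsersWithNonZeroFollowScore = 1,
  // numUsersWithNonZeroFavScore = 0, followQTree = Some(QTree(0.5)) and favQTree = None,
  // so the per-cluster sums count users and build score distributions in one pass.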
|
||||
|
||||
  private def safeGetDoubleOpt(x: Option[Double]): Double = {
    x.map { y => if (y.isNaN) 0 else y }.getOrElse(0)
  }
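  // Hedged example (not in the original file): safeGetDoubleOpt defends the cosine products
  // computed below against missing or NaN cluster-normalized scores, e.g.
  //   safeGetDoubleOpt(None) == 0.0
  //   safeGetDoubleOpt(Some(Double.NaN)) == 0.0
  //   safeGetDoubleOpt(Some(0.7)) == 0.7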
|
||||
private def getSimilaritiesForAllPairs(
|
||||
input: TypedPipe[(Long, ClustersUserIsInterestedIn)]
|
||||
)(
|
||||
implicit uniqueID: UniqueID
|
||||
): TypedPipe[((Int, Int), Scores)] = {
|
||||
val allClusterPairsBeforeSumByKey = Stat("all_cluster_pairs_before_sum_by_key")
|
||||
val clusterPairsWithin10Ratio = Stat("cluster_pairs_within_10_ratio")
|
||||
val clusterPairsBeforeTopK = Stat("cluster_pairs_before_thresholding")
|
||||
|
||||
input
|
||||
.flatMap {
|
||||
case (userId, clusterScoresStruct) =>
|
||||
val clusterScoresArray = clusterScoresStruct.clusterIdToScores.toArray
|
||||
(0 until clusterScoresArray.length).flatMap { i =>
|
||||
(0 until clusterScoresArray.length).map { j =>
|
||||
val (clusterI, scoresI) = clusterScoresArray(i)
|
||||
val (clusterJ, scoresJ) = clusterScoresArray(j)
|
||||
val ratioOfSizes =
|
||||
scoresI.numUsersInterestedInThisClusterUpperBound.getOrElse(1).toDouble /
|
||||
scoresJ.numUsersInterestedInThisClusterUpperBound.getOrElse(1).toDouble
|
||||
allClusterPairsBeforeSumByKey.inc()
|
||||
if (ratioOfSizes > 0.1 && ratioOfSizes < 10) {
|
||||
clusterPairsWithin10Ratio.inc()
|
||||
}
|
||||
val followI = safeGetDoubleOpt(scoresI.followScoreClusterNormalizedOnly)
|
||||
val followJ = safeGetDoubleOpt(scoresJ.followScoreClusterNormalizedOnly)
|
||||
val follow = followI * followJ
|
||||
val favI = safeGetDoubleOpt(scoresI.favScoreClusterNormalizedOnly)
|
||||
val favJ = safeGetDoubleOpt(scoresJ.favScoreClusterNormalizedOnly)
|
||||
val fav = favI * favJ
|
||||
val logFavI = safeGetDoubleOpt(scoresI.logFavScoreClusterNormalizedOnly)
|
||||
val logFavJ = safeGetDoubleOpt(scoresJ.logFavScoreClusterNormalizedOnly)
|
||||
val logFav = logFavI * logFavJ
|
||||
((clusterI, clusterJ), (follow, fav, logFav))
|
||||
}
|
||||
}
|
||||
}
|
||||
.sumByKey
|
||||
// Uncomment for adhoc job
|
||||
//.withReducers(600)
|
||||
.map {
|
||||
case (key, (follow, fav, logFav)) =>
|
||||
clusterPairsBeforeTopK.inc()
|
||||
(key, Scores(follow, fav, logFav))
|
||||
}
|
||||
}
|
||||
|
||||
private def keepTopNeighbors(
|
||||
allPairs: TypedPipe[((Int, Int), Scores)],
|
||||
cosineThreshold: Double
|
||||
)(
|
||||
implicit uniqueID: UniqueID
|
||||
): TypedPipe[(Int, List[ClusterNeighbor])] = {
|
||||
val clusterPairsMoreThanThreshold = Stat("cluster_pairs_cosine_gt_" + cosineThreshold)
|
||||
val clusterPairsAfterTopK = Stat("cluster_pairs_after_topk")
|
||||
val clustersWithFewNeighbors = Stat(s"clusters_with_fewer_than_100_neighbors")
|
||||
val clustersWithManyNeighbors = Stat(s"clusters_with_more_than_100_neighbors")
|
||||
|
||||
allPairs
|
||||
.flatMap {
|
||||
case ((cI, cJ), Scores(followScore, favScore, logFavScore)) =>
|
||||
if (followScore > cosineThreshold || logFavScore > cosineThreshold || favScore > cosineThreshold) {
|
||||
clusterPairsMoreThanThreshold.inc()
|
||||
Some((cI, ClusterNeighbor(cJ, Some(followScore), Some(favScore), Some(logFavScore))))
|
||||
} else None
|
||||
}
|
||||
.group
|
||||
.toList
|
||||
// Uncomment for adhoc job
|
||||
//.withReducers(40)
|
||||
.map {
|
||||
case (key, seq) =>
|
||||
val finalSize = seq.size
|
||||
clusterPairsAfterTopK.incBy(finalSize)
|
||||
if (finalSize < 100) {
|
||||
clustersWithFewNeighbors.inc()
|
||||
} else {
|
||||
clustersWithManyNeighbors.inc()
|
||||
}
|
||||
(
|
||||
key,
|
||||
seq.sortBy {
|
||||
case cn: ClusterNeighbor =>
|
||||
-(cn.followCosineSimilarity.getOrElse(0.0) + cn.logFavCosineSimilarity.getOrElse(
|
||||
0.0)) / 2
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
def getTopSimilarClustersWithCosine(
|
||||
input: TypedPipe[(Long, ClustersUserIsInterestedIn)],
|
||||
cosineThreshold: Double
|
||||
)(
|
||||
implicit uniqueID: UniqueID
|
||||
): TypedPipe[(Int, List[ClusterNeighbor])] = {
|
||||
keepTopNeighbors(getSimilaritiesForAllPairs(input), cosineThreshold)
|
||||
}
|
||||
|
||||
  def getDistributionDetails(
    qtree: QTree[Double],
    sum: Double,
    sumOfSquares: Double,
    min: Double,
    max: Double,
    fullSize: Int
  ): DistributionDetails = {
    val mean = sum / fullSize
    // note that the below is the naive calculation, and not the sample standard dev formula
    // that divides by n-1. I don't think it makes a difference at our scale whether we use
    // n or n-1, and I'd rather use the simpler one.
    val stdDev = math.sqrt(sumOfSquares / fullSize - mean * mean)

    def getQB(percentile: Double): QuantileBounds = {
      val (lb, ub) = qtree.quantileBounds(percentile)
      QuantileBounds(lb, ub)
    }

    DistributionDetails(
      mean = mean,
      standardDeviation = Some(stdDev),
      min = Some(min),
      p25 = Some(getQB(0.25)),
      p50 = Some(getQB(0.5)),
      p75 = Some(getQB(0.75)),
      p95 = Some(getQB(0.95)),
      max = Some(max)
    )
  }

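  // Hedged worked example (not in the original file) of the population formulas above:
  // with sum = 10.0, sumOfSquares = 30.0 and fullSize = 5, mean = 10.0 / 5 = 2.0 and
  // stdDev = sqrt(30.0 / 5 - 2.0 * 2.0) = sqrt(2.0) ~= 1.414. The percentile fields are
  // QTree quantileBounds, i.e. (lower, upper) bracket pairs rather than exact quantiles,
  // which is why they are stored as QuantileBounds(lb, ub).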
|
||||
def keepCorrectModel(
|
||||
input: TypedPipe[(Long, ClustersUserIsInterestedIn)],
|
||||
modelVersionToKeep: String
|
||||
)(
|
||||
implicit uniqId: UniqueID
|
||||
): TypedPipe[(Long, ClustersUserIsInterestedIn)] = {
|
||||
val allRecords = Stat("all_input_records")
|
||||
val withCorrectVersion = Stat("with_correct_version")
|
||||
input.filter {
|
||||
case (_, clusterScoresStruct) =>
|
||||
// allRecords.inc()
|
||||
val result = clusterScoresStruct.knownForModelVersion == modelVersionToKeep
|
||||
// if (result) withCorrectVersion.inc()
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
def getInfoFromUserSource(
|
||||
knownFor: TypedPipe[(Int, List[(Long, Float)])],
|
||||
usersource: TypedPipe[FlatUser],
|
||||
inferredLanguages: TypedPipe[(Long, Seq[(String, Double)])]
|
||||
)(
|
||||
implicit uniqId: UniqueID
|
||||
): TypedPipe[(Int, InfoFromUserSource)] = {
|
||||
val knownForUsers = knownFor.flatMap {
|
||||
case (clusterId, userScoreList) =>
|
||||
userScoreList.map {
|
||||
case (userId, _) =>
|
||||
(userId, clusterId)
|
||||
}
|
||||
}
|
||||
|
||||
usersource
|
||||
.collect {
|
||||
case fuser: FlatUser if fuser.id.isDefined =>
|
||||
(
|
||||
fuser.id.get,
|
||||
(
|
||||
fuser.accountCountryCode.getOrElse(""),
|
||||
fuser.language.getOrElse(""),
|
||||
fuser.nsfwUser.getOrElse(false)
|
||||
))
|
||||
}
|
||||
.join(knownForUsers)
|
||||
.leftJoin(inferredLanguages)
|
||||
.map {
|
||||
case (_, (((countryCode, language, nsfw), clusterId), inferredLangsOpt)) =>
|
||||
val nsfwInt = if (nsfw) 1 else 0
|
||||
(
|
||||
clusterId,
|
||||
(
|
||||
1,
|
||||
nsfwInt,
|
||||
Map(language -> 1),
|
||||
Map(countryCode -> 1),
|
||||
inferredLangsOpt.getOrElse(Seq(("", 1.0))).toMap
|
||||
)
|
||||
)
|
||||
}
|
||||
.sumByKey
|
||||
.mapValues {
|
||||
case (
|
||||
denominator,
|
||||
nsfwNumerator,
|
||||
languageNumeratorsMap,
|
||||
countryNumeratorsMap,
|
||||
inferredLangsNumeratorsMap) =>
|
||||
InfoFromUserSource(
|
||||
nsfwNumerator * 1.0 / denominator,
|
||||
languageNumeratorsMap.mapValues { x => x * 1.0 / denominator },
|
||||
countryNumeratorsMap.mapValues { x => x * 1.0 / denominator },
|
||||
inferredLangsNumeratorsMap.mapValues { x => x * 1.0 / denominator }
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Run the cluster details job and return the details for each cluster
|
||||
* @param input interestedIn data
|
||||
* @param qtreeSemigroupKParameter parameter for calculating percentiles using qtree monoid (set to a small number, usually < 7)
|
||||
* @param modelVersionToKeep which modelVersion to use from interestedIn dataset
|
||||
* @param knownFor clusterId -> users known for this cluster and their scores
|
||||
* @param knownForTranspose userId -> clusters this user is known for and their scores
|
||||
* @param usersource -> user source
|
||||
* @param simsGraph -> sims graph in the form of userId -> adjacency list
|
||||
* @param cosineThreshold -> cosine threshold to include a cluster in the list of similar clusters for a given cluster
|
||||
* @param uniqId
|
||||
* @return pipe with (modelVersion, clusterId) as the key and ClusterDetails struct as the value.
|
||||
*/
|
||||
def run(
|
||||
input: TypedPipe[(Long, ClustersUserIsInterestedIn)],
|
||||
qtreeSemigroupKParameter: Int,
|
||||
modelVersionToKeep: String,
|
||||
knownFor: TypedPipe[(Int, List[(Long, Float)])],
|
||||
knownForTranspose: TypedPipe[(Long, Array[(Int, Float)])],
|
||||
usersource: Option[TypedPipe[FlatUser]],
|
||||
inferredLanguageSource: Option[TypedPipe[(Long, Seq[(String, Double)])]],
|
||||
simsGraph: Option[TypedPipe[(Long, Map[Long, Float])]],
|
||||
cosineThreshold: Double
|
||||
)(
|
||||
implicit uniqId: UniqueID
|
||||
): Execution[TypedPipe[((String, Int), ClusterDetails)]] = {
|
||||
val topSimilarClusters = getTopSimilarClustersWithCosine(input, cosineThreshold)
|
||||
val infoFromUserSource: TypedPipe[(Int, InfoFromUserSource)] = (for {
|
||||
us <- usersource
|
||||
inferredLanguages <- inferredLanguageSource
|
||||
} yield getInfoFromUserSource(knownFor, us, inferredLanguages)).getOrElse(TypedPipe.empty)
|
||||
|
||||
val clusterEvaluationExec = simsGraph match {
|
||||
case Some(sg) =>
|
||||
ClusterEvaluation.clusterLevelEvaluation(sg, knownForTranspose, "eval")
|
||||
case None =>
|
||||
val dummyPipe: TypedPipe[(Int, (Int, ClusterQuality))] = TypedPipe.empty
|
||||
Execution.from(dummyPipe)
|
||||
}
|
||||
|
||||
clusterEvaluationExec
|
||||
.map { clusterIdToSizesAndQualities =>
|
||||
val clusterQualities: TypedPipe[(Int, ClusterQuality)] =
|
||||
clusterIdToSizesAndQualities.mapValues(_._2)
|
||||
intermediateDetailsPipe(
|
||||
keepCorrectModel(input, modelVersionToKeep),
|
||||
qtreeSemigroupKParameter)
|
||||
.leftJoin(topSimilarClusters)
|
||||
.leftJoin(infoFromUserSource)
|
||||
.leftJoin(clusterQualities)
|
||||
.join(knownFor)
|
||||
.map {
|
||||
case (
|
||||
clusterId,
|
||||
(
|
||||
(
|
||||
((intermediateDetails, topSimilarNeighborsOpt), userSourceInfoOpt),
|
||||
qualityOpt),
|
||||
knownForUsers)
|
||||
) =>
|
||||
val knownForSorted = knownForUsers.sortBy(-_._2).map {
|
||||
case (userId, score) =>
|
||||
UserWithScore(userId, score)
|
||||
}
|
||||
(modelVersionToKeep, clusterId) ->
|
||||
ClusterDetails(
|
||||
numUsersWithAnyNonZeroScore = intermediateDetails.numUsersWithAnyNonZeroScore,
|
||||
numUsersWithNonZeroFavScore = intermediateDetails.numUsersWithNonZeroFavScore,
|
||||
numUsersWithNonZeroFollowScore =
|
||||
intermediateDetails.numUsersWithNonZeroFollowScore,
|
||||
favScoreDistributionDetails = intermediateDetails.favQTree.map { qt =>
|
||||
getDistributionDetails(
|
||||
qtree = qt,
|
||||
sum = intermediateDetails.sum.favScore,
|
||||
sumOfSquares = intermediateDetails.sumOfSquares.favScore,
|
||||
min = intermediateDetails.min.favScore,
|
||||
max = intermediateDetails.max.favScore,
|
||||
fullSize = intermediateDetails.numUsersWithNonZeroFavScore
|
||||
)
|
||||
},
|
||||
followScoreDistributionDetails = intermediateDetails.followQTree.map { qt =>
|
||||
getDistributionDetails(
|
||||
qtree = qt,
|
||||
sum = intermediateDetails.sum.followScore,
|
||||
sumOfSquares = intermediateDetails.sumOfSquares.followScore,
|
||||
min = intermediateDetails.min.followScore,
|
||||
max = intermediateDetails.max.followScore,
|
||||
fullSize = intermediateDetails.numUsersWithNonZeroFollowScore
|
||||
)
|
||||
},
|
||||
logFavScoreDistributionDetails = intermediateDetails.logFavQTree.map { qt =>
|
||||
getDistributionDetails(
|
||||
qtree = qt,
|
||||
sum = intermediateDetails.sum.logFavScore,
|
||||
sumOfSquares = intermediateDetails.sumOfSquares.logFavScore,
|
||||
min = intermediateDetails.min.logFavScore,
|
||||
max = intermediateDetails.max.logFavScore,
|
||||
// note: user has non-zero fav score iff a user has non-zero log-fav score
|
||||
fullSize = intermediateDetails.numUsersWithNonZeroFavScore
|
||||
)
|
||||
},
|
||||
knownForUsersAndScores = Some(knownForSorted),
|
||||
neighborClusters = topSimilarNeighborsOpt,
|
||||
fractionKnownForMarkedNSFWUser = userSourceInfoOpt.map(_.fractionMarkedNSFWUser),
|
||||
languageToFractionDeviceLanguage =
|
||||
userSourceInfoOpt.map(_.languageToFractionDeviceLanguage),
|
||||
countryCodeToFractionKnownForWithCountryCode =
|
||||
userSourceInfoOpt.map(_.countryCodeToFractionKnownForWithCountryCode),
|
||||
qualityMeasuredOnSimsGraph = qualityOpt,
|
||||
languageToFractionInferredLanguage =
|
||||
userSourceInfoOpt.map(_.languageToFractionInferredLanguage),
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
  def getTruncatedSims(
    sims: TypedPipe[Candidates],
    maxNeighbors: Int
  ): TypedPipe[(Long, Map[Long, Float])] = {
    sims.map { cands =>
      (
        cands.userId,
        // These candidates are already sorted, but leaving it in just in case the behavior
        // changes upstream
        cands.candidates
          .map { c => (c.userId, c.score.toFloat) }.sortBy(-_._2).take(maxNeighbors).toMap
      )
    }
  }
}
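// Hedged usage sketch (not in the original file): getTruncatedSims converts a Candidates
// pipe into the adjacency-map shape the evaluation code expects, keeping only the top
// maxNeighbors entries per user by score. Shapes and values below are illustrative only:
//   val sims: TypedPipe[Candidates] = ...
//   val graph: TypedPipe[(Long, Map[Long, Float])] =
//     ClusterDetailsJob.getTruncatedSims(sims, maxNeighbors = 20)
//   // e.g. (12L, Map(34L -> 0.9f, 56L -> 0.7f, ...)) with at most 20 neighbors per user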
|
||||
/**
|
||||
scalding remote run --main-class com.twitter.simclusters_v2.scalding.ClusterDetailsAdhoc \
|
||||
--target src/scala/com/twitter/simclusters_v2/scalding:cluster_details-adhoc \
|
||||
--hadoop-properties "scalding.with.reducers.set.explicitly=true mapreduce.job.reduces=4000" \
|
||||
--user recos-platform -- \
|
||||
--date 2020-06-25 \
|
||||
--dateForUserSource 2020-06-25 \
|
||||
--includeUserSource \
|
||||
--outputDir /user/recos-platform/adhoc/your_ldap/cluster_details_inferred_lang
|
||||
*/
|
||||
object ClusterDetailsAdhoc extends TwitterExecutionApp {
|
||||
implicit val tz: java.util.TimeZone = DateOps.UTC
|
||||
implicit val dp = DateParser.default
|
||||
|
||||
def job: Execution[Unit] =
|
||||
Execution.getConfigMode.flatMap {
|
||||
case (config, mode) =>
|
||||
Execution.withId { implicit uniqueId =>
|
||||
val args = config.getArgs
|
||||
val date = DateRange.parse(args("dateForUserSource"))
|
||||
val (knownFor, knownForTranspose) =
|
||||
args
|
||||
.optional("knownForDir").map { location =>
|
||||
(
|
||||
KnownForSources.transpose(KnownForSources.readKnownFor(location)),
|
||||
KnownForSources.readKnownFor(location)
|
||||
)
|
||||
}.getOrElse(
|
||||
(
|
||||
KnownForSources.clusterToKnownFor_20M_145K_updated,
|
||||
KnownForSources.knownFor_20M_145K_updated
|
||||
)
|
||||
)
|
||||
|
||||
val interestedIn = args
|
||||
.optional("inputDir").map { interestedInInputDir =>
|
||||
TypedPipe.from(AdhocKeyValSources.interestedInSource(interestedInInputDir))
|
||||
}.getOrElse(
|
||||
DAL
|
||||
.readMostRecentSnapshotNoOlderThan(
|
||||
SimclustersV2InterestedIn20M145KUpdatedScalaDataset,
|
||||
Days(14))
|
||||
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
|
||||
.toTypedPipe
|
||||
.map {
|
||||
case KeyVal(userId, clustersUserIsInterestedIn) =>
|
||||
(userId, clustersUserIsInterestedIn)
|
||||
}
|
||||
)
|
||||
|
||||
val userSourceOpt = if (args.boolean("includeUserSource")) {
|
||||
Some(DAL.readMostRecentSnapshot(UsersourceFlatScalaDataset, date).toTypedPipe)
|
||||
} else None
|
||||
|
||||
val inferredLanguagesOpt = if (args.boolean("includeUserSource")) {
|
||||
Some(ExternalDataSources.inferredUserProducedLanguageSource)
|
||||
} else None
|
||||
|
||||
val simsGraphOpt = args.optional("simsForEvalInputDir").map { sgDir =>
|
||||
ClusterDetailsJob.getTruncatedSims(
|
||||
TypedPipe.from(WTFCandidatesSource(sgDir)),
|
||||
args.int("maxSimsNeighborsForEval", 20)
|
||||
)
|
||||
}
|
||||
|
||||
Util.printCounters(
|
||||
ClusterDetailsJob
|
||||
.run(
|
||||
interestedIn,
|
||||
args.int("qtreeSemigroupKParameter", 3),
|
||||
args.getOrElse("modelVersion", "20M_145K_updated"),
|
||||
knownFor,
|
||||
knownForTranspose,
|
||||
userSourceOpt,
|
||||
inferredLanguagesOpt,
|
||||
simsGraphOpt,
|
||||
cosineThreshold = args.double("cosineThreshold", 0.01)
|
||||
).flatMap(
|
||||
_.writeExecution(AdhocKeyValSources.clusterDetailsSource(args("outputDir"))))
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
trait ClusterDetailsBatchTrait extends TwitterScheduledExecutionApp {
|
||||
implicit val tz = DateOps.UTC
|
||||
implicit val parser = DateParser.default
|
||||
|
||||
def firstTime: String
|
||||
def batchIncrement: Duration
|
||||
def manhattanOutputPath: String
|
||||
def clusterDetailsLiteOutputPath: String
|
||||
def modelVersion: String
|
||||
def knownForDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]]
|
||||
def interestedInDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsInterestedIn]]
|
||||
def outputDataset: KeyValDALDataset[KeyVal[(String, Int), ClusterDetails]]
|
||||
def clusterDetailsLiteOutputDataset: SnapshotDALDataset[ClusterDetailsLite]
|
||||
|
||||
private lazy val execArgs = AnalyticsBatchExecutionArgs(
|
||||
batchDesc = BatchDescription(this.getClass.getName.replace("$", "")),
|
||||
firstTime = BatchFirstTime(RichDate(firstTime)),
|
||||
lastTime = None,
|
||||
batchIncrement = BatchIncrement(batchIncrement)
|
||||
)
|
||||
|
||||
override def scheduledJob: Execution[Unit] = AnalyticsBatchExecution(execArgs) {
|
||||
implicit dateRange =>
|
||||
Execution.withId { implicit uniqueId =>
|
||||
Execution.withArgs { args =>
|
||||
val qtreeSemigroupKParameter = args.int("qtreeSemigroupKParameter", 5)
|
||||
val maxSimsNeighborsForEval = args.int("maxSimsNeighborsForEval", 20)
|
||||
val knownForTranspose =
|
||||
KnownForSources.fromKeyVal(
|
||||
DAL.readMostRecentSnapshot(knownForDataset, dateRange.extend(Days(7))).toTypedPipe,
|
||||
modelVersion)
|
||||
val knownFor = KnownForSources.transpose(knownForTranspose)
|
||||
val cosineThreshold = args.double("cosineThreshold", 0.01)
|
||||
val interestedIn =
|
||||
DAL
|
||||
.readMostRecentSnapshot(interestedInDataset, dateRange.extend(Days(7)))
|
||||
.toTypedPipe
|
||||
.map {
|
||||
case KeyVal(userId, clustersUserIsInterestedIn) =>
|
||||
(userId, clustersUserIsInterestedIn)
|
||||
}
|
||||
val sims = if (modelVersion == ModelVersions.Model20M145K2020) {
|
||||
// The model version 20m_145k_2020 uses approximate_cosine_follow as the input sims graph
|
||||
// to cluster users. The same graph is used to evaluate the clusters
|
||||
TypedPipe
|
||||
.from(FollowingsCosineSimilaritiesManhattanSource())
|
||||
.map(_._2)
|
||||
} else {
|
||||
TypedPipe.from(
|
||||
SimsCandidatesSource()(
|
||||
dateRange = dateRange,
|
||||
suffixPath = "/classified_candidates_rollup"
|
||||
))
|
||||
}
|
||||
val resultExec = ClusterDetailsJob
|
||||
.run(
|
||||
interestedIn,
|
||||
qtreeSemigroupKParameter,
|
||||
modelVersion,
|
||||
knownFor,
|
||||
knownForTranspose,
|
||||
Some(DAL.readMostRecentSnapshot(UsersourceFlatScalaDataset, dateRange).toTypedPipe),
|
||||
Some(ExternalDataSources.inferredUserProducedLanguageSource),
|
||||
Some(
|
||||
ClusterDetailsJob.getTruncatedSims(sims, maxNeighbors = maxSimsNeighborsForEval)),
|
||||
cosineThreshold
|
||||
).flatMap { resultUnmapped =>
|
||||
val clusterDetailsExec = resultUnmapped
|
||||
.map {
|
||||
case (clusterKey, details) =>
|
||||
KeyVal(clusterKey, details)
|
||||
}.writeDALVersionedKeyValExecution(
|
||||
outputDataset,
|
||||
D.Suffix(manhattanOutputPath)
|
||||
)
|
||||
|
||||
val clusterDetailsLiteExec =
|
||||
resultUnmapped
|
||||
.map {
|
||||
case ((_, clusterId), details)
|
||||
if modelVersion == ModelVersions.Model20M145KDec11 =>
|
||||
ClusterDetailsLite(
|
||||
FullClusterId(ModelVersion.Model20m145kDec11, clusterId),
|
||||
details.numUsersWithAnyNonZeroScore,
|
||||
details.numUsersWithNonZeroFollowScore,
|
||||
details.numUsersWithNonZeroFavScore,
|
||||
details.knownForUsersAndScores.getOrElse(Nil)
|
||||
)
|
||||
case ((_, clusterId), details)
|
||||
if modelVersion == ModelVersions.Model20M145KUpdated =>
|
||||
ClusterDetailsLite(
|
||||
FullClusterId(ModelVersion.Model20m145kUpdated, clusterId),
|
||||
details.numUsersWithAnyNonZeroScore,
|
||||
details.numUsersWithNonZeroFollowScore,
|
||||
details.numUsersWithNonZeroFavScore,
|
||||
details.knownForUsersAndScores.getOrElse(Nil)
|
||||
)
|
||||
case ((_, clusterId), details)
|
||||
if modelVersion == ModelVersions.Model20M145K2020 =>
|
||||
ClusterDetailsLite(
|
||||
FullClusterId(ModelVersion.Model20m145k2020, clusterId),
|
||||
details.numUsersWithAnyNonZeroScore,
|
||||
details.numUsersWithNonZeroFollowScore,
|
||||
details.numUsersWithNonZeroFavScore,
|
||||
details.knownForUsersAndScores.getOrElse(Nil)
|
||||
)
|
||||
}.writeDALSnapshotExecution(
|
||||
clusterDetailsLiteOutputDataset,
|
||||
D.Daily,
|
||||
D.Suffix(clusterDetailsLiteOutputPath),
|
||||
D.EBLzo(),
|
||||
dateRange.end)
|
||||
|
||||
Execution.zip(clusterDetailsExec, clusterDetailsLiteExec)
|
||||
}
|
||||
|
||||
Util.printCounters(resultExec)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
object ClusterDetailsBatch extends ClusterDetailsBatchTrait {
|
||||
override val firstTime: String = "2018-07-28"
|
||||
override val batchIncrement: Duration = Days(7)
|
||||
|
||||
override val manhattanOutputPath: String =
|
||||
"/user/cassowary/manhattan_sequence_files/simclusters_v2_cluster_details"
|
||||
|
||||
override val clusterDetailsLiteOutputPath: String =
|
||||
"/user/cassowary/processed/simclusters_v2_cluster_details_lite"
|
||||
|
||||
override val modelVersion: String = ModelVersions.Model20M145KDec11
|
||||
override val knownForDataset = SimclustersV2KnownFor20M145KDec11ScalaDataset
|
||||
override val interestedInDataset = SimclustersV2InterestedInScalaDataset
|
||||
override val outputDataset = SimclustersV2ClusterDetailsScalaDataset
|
||||
override val clusterDetailsLiteOutputDataset =
|
||||
SimclustersV2ClusterDetailsLiteScalaDataset
|
||||
}
|
||||
|
||||
object ClusterDetails20M145KUpdated extends ClusterDetailsBatchTrait {
|
||||
override val firstTime: String = "2019-06-16"
|
||||
override val batchIncrement: Duration = Days(7)
|
||||
|
||||
override val manhattanOutputPath: String =
|
||||
"/user/cassowary/manhattan_sequence_files/simclusters_v2_cluster_details_20m_145k_updated"
|
||||
|
||||
override val clusterDetailsLiteOutputPath: String =
|
||||
"/user/cassowary/processed/simclusters_v2_cluster_details_lite_20m_145k_updated"
|
||||
|
||||
override val modelVersion: String = ModelVersions.Model20M145KUpdated
|
||||
override val knownForDataset = SimclustersV2KnownFor20M145KUpdatedScalaDataset
|
||||
override val interestedInDataset = SimclustersV2InterestedIn20M145KUpdatedScalaDataset
|
||||
override val outputDataset = SimclustersV2ClusterDetails20M145KUpdatedScalaDataset
|
||||
override val clusterDetailsLiteOutputDataset =
|
||||
SimclustersV2ClusterDetailsLite20M145KUpdatedScalaDataset
|
||||
}
|
||||
|
||||
/**
|
||||
* capesospy-v2 update --build_locally --start_cron cluster_details_20m_145k_2020 \
|
||||
* src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml
|
||||
*/
|
||||
object ClusterDetails20M145K2020 extends ClusterDetailsBatchTrait {
|
||||
override val firstTime: String = "2020-10-15"
|
||||
override val batchIncrement: Duration = Days(7)
|
||||
|
||||
override val manhattanOutputPath: String =
|
||||
"/user/cassowary/manhattan_sequence_files/simclusters_v2_cluster_details_20m_145k_2020"
|
||||
|
||||
override val clusterDetailsLiteOutputPath: String =
|
||||
"/user/cassowary/processed/simclusters_v2_cluster_details_lite_20m_145k_2020"
|
||||
|
||||
override val modelVersion: String = ModelVersions.Model20M145K2020
|
||||
override val knownForDataset = SimclustersV2KnownFor20M145K2020ScalaDataset
|
||||
override val interestedInDataset = SimclustersV2InterestedIn20M145K2020ScalaDataset
|
||||
override val outputDataset = SimclustersV2ClusterDetails20M145K2020ScalaDataset
|
||||
override val clusterDetailsLiteOutputDataset =
|
||||
SimclustersV2ClusterDetailsLite20M145K2020ScalaDataset
|
||||
}
|
||||
|
||||
/**
|
||||
scalding remote run --main-class com.twitter.simclusters_v2.scalding.DumpClusterDetailsAdhoc \
|
||||
--target src/scala/com/twitter/simclusters_v2/scalding:cluster_details-dump \
|
||||
--user recos-platform -- \
|
||||
--date 2020-06-25 \
|
||||
--clusterIds 5542 129677 48645 \
|
||||
--inputDir /user/recos-platform/adhoc/your_ldap/cluster_details_inferred_lang
|
||||
*/
|
||||
object DumpClusterDetailsAdhoc extends TwitterExecutionApp {
|
||||
def job: Execution[Unit] =
|
||||
Execution.getConfigMode.flatMap {
|
||||
case (config, mode) =>
|
||||
Execution.withId { implicit uniqueId =>
|
||||
val args = config.getArgs
|
||||
val clusters = args.list("clusterIds").map(_.toInt).toSet //(1 to 2500).toSet //
|
||||
TypedPipe
|
||||
.from(AdhocKeyValSources.clusterDetailsSource(args("inputDir")))
|
||||
.filter { case ((modelVersion, clusterId), details) => clusters.contains(clusterId) }
|
||||
.toIterableExecution
|
||||
.map { iter =>
|
||||
iter.foreach { x => println(Util.prettyJsonMapper.writeValueAsString(x)) }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* ./bazel bundle src/scala/com/twitter/simclusters_v2/scalding:cluster_details && \
|
||||
* oscar hdfs --user cassowary --host hadoopnest2.atla.twitter.com --bundle cluster_details \
|
||||
* --tool com.twitter.simclusters_v2.scalding.DumpClusterSimilaritiesAdhoc --screen --screen-detached \
|
||||
* --tee your_ldap/dumpClusterSimilarities_20200103 -- \
|
||||
* --inputDir /user/cassowary/manhattan_sequence_files/simclusters_v2_cluster_details_20m_145k_updated/ \
|
||||
* --outputDir adhoc/your_ldap
|
||||
*/
|
||||
object DumpClusterSimilaritiesAdhoc extends TwitterExecutionApp {
|
||||
def job: Execution[Unit] =
|
||||
Execution.getConfigMode.flatMap {
|
||||
case (config, mode) =>
|
||||
Execution.withId { implicit uniqueId =>
|
||||
val args = config.getArgs
|
||||
TypedPipe
|
||||
.from(AdhocKeyValSources.clusterDetailsSource(args("inputDir")))
|
||||
.flatMap {
|
||||
case ((_, clusterId), details) =>
|
||||
details.neighborClusters.getOrElse(Nil).map { neighbor =>
|
||||
val compositeScore = (neighbor.followCosineSimilarity
|
||||
.getOrElse(0.0) + neighbor.favCosineSimilarity.getOrElse(0.0)) / 2
|
||||
(
|
||||
clusterId,
|
||||
neighbor.clusterId,
|
||||
"%.4f".format(compositeScore)
|
||||
)
|
||||
}
|
||||
}.writeExecution(TypedTsv(args("outputDir")))
|
||||
}
|
||||
}
|
||||
}
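// Hedged note (not in the original file): the compositeScore written by
// DumpClusterSimilaritiesAdhoc above is the plain average of the follow and fav cosine
// similarities, with missing values treated as 0.0. For example, a neighbor with
// followCosineSimilarity = Some(0.4) and favCosineSimilarity = None is emitted with
// compositeScore "%.4f".format((0.4 + 0.0) / 2) == "0.2000".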
@@ -1,607 +0,0 @@
|
||||
package com.twitter.simclusters_v2.scalding
|
||||
|
||||
import com.twitter.algebird.Monoid
|
||||
import com.twitter.algebird.mutable.PriorityQueueMonoid
|
||||
import com.twitter.dal.client.dataset.KeyValDALDataset
|
||||
import com.twitter.pluck.source.cassowary.FollowingsCosineSimilaritiesManhattanSource
|
||||
import com.twitter.scalding._
|
||||
import com.twitter.scalding_internal.dalv2.DAL
|
||||
import com.twitter.scalding_internal.job.TwitterExecutionApp
|
||||
import com.twitter.scalding_internal.job.analytics_batch._
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
|
||||
import com.twitter.simclusters_v2.common.ModelVersions
|
||||
import com.twitter.simclusters_v2.hdfs_sources._
|
||||
import com.twitter.simclusters_v2.scalding.common.TypedRichPipe._
|
||||
import com.twitter.simclusters_v2.scalding.common.Util
|
||||
import com.twitter.simclusters_v2.scalding.common.Util.Distribution
|
||||
import com.twitter.simclusters_v2.thriftscala.ClusterQuality
|
||||
import com.twitter.simclusters_v2.thriftscala.ClustersUserIsKnownFor
|
||||
import com.twitter.usersource.snapshot.flat.UsersourceFlatScalaDataset
|
||||
import java.util.PriorityQueue
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
object ClusterEvaluation {
|
||||
|
||||
val samplerMonoid: PriorityQueueMonoid[((Long, Long), (Double, Double))] =
|
||||
Util.reservoirSamplerMonoidForPairs[(Long, Long), (Double, Double)](5000)(Util.edgeOrdering)
|
||||
|
||||
  case class ClusterResults(
    numEdgesInsideCluster: Int,
    wtOfEdgesInsideCluster: Double,
    numEdgesOutsideCluster: Int,
    wtOfEdgesOutsideCluster: Double,
    originalWtAndProductOfNodeScoresSample: PriorityQueue[((Long, Long), (Double, Double))]) {
    def clusterQuality(clusterSize: Int, averagePrecisionWholeGraph: Double): ClusterQuality = {
      val unweightedRecallDenominator = numEdgesInsideCluster + numEdgesOutsideCluster
      val unweightedRecall = if (unweightedRecallDenominator > 0) {
        numEdgesInsideCluster.toDouble / unweightedRecallDenominator.toDouble
      } else 0.0

      val weightedRecallDenominator = wtOfEdgesInsideCluster + wtOfEdgesOutsideCluster
      val weightedRecall = if (weightedRecallDenominator > 0) {
        wtOfEdgesInsideCluster / weightedRecallDenominator
      } else 0.0

      val precision = if (clusterSize > 1) {
        Some(wtOfEdgesInsideCluster / (clusterSize * (clusterSize - 1)))
      } else Some(0.0)

      val relativePrecision = if (averagePrecisionWholeGraph > 0) {
        precision.flatMap { p => Some(p / averagePrecisionWholeGraph) }
      } else Some(0.0)

      ClusterQuality(
        unweightedRecall = Some(unweightedRecall),
        weightedRecall = Some(weightedRecall),
        unweightedRecallDenominator = Some(unweightedRecallDenominator),
        weightedRecallDenominator = Some(weightedRecallDenominator),
        relativePrecisionNumerator = precision,
        relativePrecision = relativePrecision,
        weightAndProductOfNodeScoresCorrelation = Some(
          Util.computeCorrelation(
            originalWtAndProductOfNodeScoresSample.iterator.asScala.map(_._2)))
      )
    }
  }

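  // Hedged worked example (not in the original file) for clusterQuality above: a cluster of
  // size 10 with wtOfEdgesInsideCluster = 18.0 and wtOfEdgesOutsideCluster = 54.0 gets
  // weightedRecall = 18.0 / (18.0 + 54.0) = 0.25 and
  // precision = Some(18.0 / (10 * 9)) = Some(0.2); if averagePrecisionWholeGraph = 0.01,
  // relativePrecision = Some(0.2 / 0.01) = Some(20.0), i.e. edge weight is ~20x denser
  // inside the cluster than in the whole graph.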
|
||||
object ClusterResultsMonoid extends Monoid[ClusterResults] {
|
||||
override def zero = ClusterResults(0, 0, 0, 0, samplerMonoid.zero)
|
||||
override def plus(l: ClusterResults, r: ClusterResults) = ClusterResults(
|
||||
l.numEdgesInsideCluster + r.numEdgesInsideCluster,
|
||||
l.wtOfEdgesInsideCluster + r.wtOfEdgesInsideCluster,
|
||||
l.numEdgesOutsideCluster + r.numEdgesOutsideCluster,
|
||||
l.wtOfEdgesOutsideCluster + r.wtOfEdgesOutsideCluster,
|
||||
samplerMonoid
|
||||
.plus(l.originalWtAndProductOfNodeScoresSample, r.originalWtAndProductOfNodeScoresSample)
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* Evaluate the quality of a cluster.
|
||||
* @param memberScores A map with the members of the cluster as the keys and their scores
|
||||
* inside the cluster as values. The more central a member is inside the score,
|
||||
* the higher it's score is.
|
||||
* @param membersAdjLists A map that gives the weighted neighbors of each member in the cluster.
|
||||
*/
|
||||
  def evaluateCluster(
    memberScores: Map[Long, Double],
    membersAdjLists: Map[Long, Map[Long, Float]]
  ): ClusterResults = {
    val resultsIter = membersAdjLists.flatMap {
      case (fromNodeId, adjList) =>
        val fromNodeWt = memberScores.getOrElse(fromNodeId, 0.0)
        adjList.map {
          case (toNodeId, edgeWt) =>
            if (memberScores.contains(toNodeId)) {
              val productOfMembershipScores = fromNodeWt * memberScores(toNodeId)
              ClusterResults(
                1,
                edgeWt,
                0,
                0,
                samplerMonoid.build(
                  ((fromNodeId, toNodeId), (edgeWt.toDouble, productOfMembershipScores))))
            } else {
              ClusterResults(0, 0, 1, edgeWt, samplerMonoid.zero)
            }
        }
    }
    Monoid.sum(resultsIter)(ClusterResultsMonoid)
  }

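  // Hedged worked example (not in the original file) for evaluateCluster above: with
  // memberScores = Map(1L -> 0.8, 2L -> 0.5) and
  // membersAdjLists = Map(1L -> Map(2L -> 1.0f, 3L -> 2.0f)), the edge 1 -> 2 stays inside
  // the cluster and contributes (numEdgesInsideCluster = 1, wtOfEdgesInsideCluster = 1.0)
  // plus a sampled pair (1.0, 0.8 * 0.5); the edge 1 -> 3 leaves the cluster and contributes
  // (numEdgesOutsideCluster = 1, wtOfEdgesOutsideCluster = 2.0). Monoid.sum then combines
  // the per-edge ClusterResults into one struct for the cluster.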
|
||||
/**
|
||||
* Evaluate each cluster with respect to the provided graph.
|
||||
* @param graph graph represented via the adjacency lists of each node, needs to be symmetrized i.e. if u is in v's adjlist, then v needs to be in u's adjlist as well
|
||||
* @param clusters cluster memberships of each node.
|
||||
* @param statsPrefix convenience argument to act as prefix for stats counters
|
||||
* @return key-value pipe with clusterId as key and (size of the cluster, quality struct) as value
|
||||
*/
|
||||
def clusterLevelEvaluation(
|
||||
graph: TypedPipe[(Long, Map[Long, Float])],
|
||||
clusters: TypedPipe[(Long, Array[(Int, Float)])],
|
||||
statsPrefix: String = ""
|
||||
)(
|
||||
implicit uniqueId: UniqueID
|
||||
): Execution[TypedPipe[(Int, (Int, ClusterQuality))]] = {
|
||||
val numRealClusters = Stat(s"${statsPrefix}/numRealClusters")
|
||||
val numFakeClusters = Stat(s"${statsPrefix}/numFakeClusters")
|
||||
|
||||
val numNodesAndEdgesExec = graph
|
||||
.map {
|
||||
case (nId, nbrMap) =>
|
||||
(1L, nbrMap.size.toLong, nbrMap.values.sum.toDouble)
|
||||
}.sum.getExecution
|
||||
|
||||
numNodesAndEdgesExec.map {
|
||||
case (numNodes, numEdges, sumOfAllEdgeWts) =>
|
||||
println("numNodes " + numNodes)
|
||||
println("numEdges " + numEdges)
|
||||
println("sumOfAllEdgeWts " + sumOfAllEdgeWts)
|
||||
|
||||
val numFakeClustersForUnassignedNodes = numNodes / 1e4
|
||||
|
||||
val averagePrecisionWholeGraph = sumOfAllEdgeWts / (numNodes * (numNodes - 1))
|
||||
graph
|
||||
.leftJoin(clusters)
|
||||
// uncomment for adhoc job
|
||||
.withReducers(200)
|
||||
.flatMap {
|
||||
case (nodeId, (adjList, assignedClustersOpt)) =>
|
||||
val nodeDegree = adjList.size.toLong
|
||||
val nodeWeightedDegree = adjList.values.sum
|
||||
assignedClustersOpt match {
|
||||
case Some(assignedClusters) if assignedClusters.nonEmpty =>
|
||||
assignedClusters.toList.map {
|
||||
case (clusterId, scoreOfNodeInCluster) =>
|
||||
(
|
||||
clusterId,
|
||||
(
|
||||
Map(nodeId -> (scoreOfNodeInCluster.toDouble, adjList)),
|
||||
1,
|
||||
nodeDegree,
|
||||
nodeWeightedDegree))
|
||||
}
|
||||
case _ =>
|
||||
// For nodes that don't belong to any cluster, create a fake clusterId (0 or lesser)
|
||||
// and add the node's statistics to that clusterId. We don't need the adjacency lists for
|
||||
// unassigned nodes, we'll simply track how many edges are incident on those nodes and their weighted sum etc
|
||||
val fakeClusterId =
|
||||
(-1 * (math.abs(
|
||||
Util.hashToLong(nodeId)) % numFakeClustersForUnassignedNodes)).toInt
|
||||
List(
|
||||
(
|
||||
fakeClusterId,
|
||||
(
|
||||
Map.empty[Long, (Double, Map[Long, Float])],
|
||||
1,
|
||||
nodeDegree,
|
||||
nodeWeightedDegree)))
|
||||
}
|
||||
}
|
||||
.sumByKey
|
||||
// uncomment for adhoc job
|
||||
.withReducers(60)
|
||||
.map {
|
||||
case (clusterId, (membersMap, clusterSize, volumeOfCluster, weightedVolumeOfCluster)) =>
|
||||
if (clusterId > 0) {
|
||||
numRealClusters.inc()
|
||||
|
||||
val scoresMap =
|
||||
if (clusterId > 0) membersMap.mapValues(_._1) else Map.empty[Long, Double]
|
||||
val adjListsMap = membersMap.mapValues(_._2)
|
||||
|
||||
val quality = evaluateCluster(scoresMap, adjListsMap)
|
||||
.clusterQuality(clusterSize, averagePrecisionWholeGraph)
|
||||
|
||||
(clusterId, (clusterSize, quality))
|
||||
} else {
|
||||
// clusterId <= 0 means that this is a fake cluster.
|
||||
numFakeClusters.inc()
|
||||
(
|
||||
clusterId,
|
||||
(
|
||||
clusterSize,
|
||||
ClusterQuality(
|
||||
unweightedRecallDenominator = Some(volumeOfCluster),
|
||||
weightedRecallDenominator = Some(weightedVolumeOfCluster)
|
||||
)
|
||||
)
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
case class OverallResults(
|
||||
unweightedRecall: Double,
|
||||
edgesInsideClusters: Long,
|
||||
allEdges: Long,
|
||||
allNodes: Int,
|
||||
weightedRecall: Double,
|
||||
wtOnEdgesInsideClusters: Double,
|
||||
wtOnAllEdges: Double,
|
||||
weightCorrelation: Double,
|
||||
relativePrecision: Double,
|
||||
numUnassignedNodes: Int,
|
||||
numAssignedNodes: Int,
|
||||
sizeDist: Distribution,
|
||||
recallDist: Distribution,
|
||||
weightedRecallDist: Distribution,
|
||||
relativePrecisionDist: Distribution,
|
||||
weightCorrelationDist: Distribution,
|
||||
numClustersWithNegativeCorrelation: Double,
|
||||
numClustersWithZeroRecall: Double,
|
||||
numClustersWithLessThanOneRelativePrecision: Double,
|
||||
numSingletonClusters: Int)
|
||||
|
||||
def summarizePerClusterResults(
|
||||
perClusterResults: TypedPipe[(Int, (Int, ClusterQuality))]
|
||||
): Execution[Option[OverallResults]] = {
|
||||
perClusterResults
|
||||
.map {
|
||||
case (clusterId, (size, quality)) =>
|
||||
val unweightedRecallDen = quality.unweightedRecallDenominator.getOrElse(0.0)
|
||||
val unweightedRecallNum = quality.unweightedRecall.getOrElse(0.0) * unweightedRecallDen
|
||||
val weightedRecallDen = quality.weightedRecallDenominator.getOrElse(0.0)
|
||||
val weightedRecallNum = quality.weightedRecall.getOrElse(0.0) * weightedRecallDen
|
||||
|
||||
val weightCorrelationDen = size
|
||||
val weightCorrelationNum =
|
||||
weightCorrelationDen * quality.weightAndProductOfNodeScoresCorrelation
|
||||
.getOrElse(0.0)
|
||||
|
||||
val relativePrecisionDen = size
|
||||
val relativePrecisionNum = relativePrecisionDen * quality.relativePrecision.getOrElse(0.0)
|
||||
|
||||
val numClustersWithNegativeCorrelation =
|
||||
if (weightCorrelationNum < 0 && clusterId > 0) 1 else 0
|
||||
val numClustersWithLessThanOneRelativePrecision =
|
||||
if (quality.relativePrecision.getOrElse(0.0) < 1 && clusterId > 0) 1 else 0
|
||||
val numClustersWithZeroRecall = if (weightedRecallNum < 1e-5 && clusterId > 0) 1 else 0
|
||||
val numUnassignedNodes = if (clusterId < 1) size else 0
|
||||
val numAssignedNodes = if (clusterId > 0) size else 0
|
||||
val numSingletonClusters = if (clusterId > 0 && size == 1) 1 else 0
|
||||
|
||||
(
|
||||
unweightedRecallDen,
|
||||
unweightedRecallNum,
|
||||
weightedRecallDen,
|
||||
weightedRecallNum,
|
||||
weightCorrelationDen,
|
||||
weightCorrelationNum,
|
||||
relativePrecisionDen,
|
||||
relativePrecisionNum,
|
||||
numClustersWithNegativeCorrelation,
|
||||
numClustersWithLessThanOneRelativePrecision,
|
||||
numClustersWithZeroRecall,
|
||||
List(size.toDouble),
|
||||
List(quality.unweightedRecall.getOrElse(0.0)),
|
||||
List(quality.weightedRecall.getOrElse(0.0)),
|
||||
List(quality.relativePrecision.getOrElse(0.0)),
|
||||
List(quality.weightAndProductOfNodeScoresCorrelation.getOrElse(0.0)),
|
||||
numUnassignedNodes,
|
||||
numAssignedNodes,
|
||||
numSingletonClusters
|
||||
)
|
||||
}
|
||||
.sum
|
||||
.toOptionExecution
|
||||
.map { opt =>
|
||||
opt.map {
|
||||
case (
|
||||
unweightedRecallDen,
|
||||
unweightedRecallNum,
|
||||
weightedRecallDen,
|
||||
weightedRecallNum,
|
||||
weightCorrelationDen,
|
||||
weightCorrelationNum,
|
||||
relativePrecisionDen,
|
||||
relativePrecisionNum,
|
||||
numClustersWithNegativeCorrelation,
|
||||
numClustersWithLessThanOneRelativePrecision,
|
||||
numClustersWithZeroRecall,
|
||||
sizeList,
|
||||
unweightedRecallList,
|
||||
weightedRecallList,
|
||||
relativePrecisionList,
|
||||
weightCorrelationList,
|
||||
numUnassignedNodes,
|
||||
numAssignedNodes,
|
||||
numSingletonClusters) =>
|
||||
OverallResults(
|
||||
unweightedRecall = unweightedRecallNum / unweightedRecallDen,
|
||||
edgesInsideClusters = unweightedRecallNum.toLong,
|
||||
allEdges = unweightedRecallDen.toLong,
|
||||
allNodes = numAssignedNodes + numUnassignedNodes,
|
||||
weightedRecall = weightedRecallNum / weightedRecallDen,
|
||||
wtOnEdgesInsideClusters = weightedRecallNum,
|
||||
wtOnAllEdges = weightedRecallDen,
|
||||
weightCorrelation = weightCorrelationNum / weightCorrelationDen,
|
||||
relativePrecision = relativePrecisionNum / relativePrecisionDen,
|
||||
numAssignedNodes = numAssignedNodes,
|
||||
numUnassignedNodes = numUnassignedNodes,
|
||||
sizeDist = Util.distributionFromArray(sizeList.toArray),
|
||||
recallDist = Util.distributionFromArray(unweightedRecallList.toArray),
|
||||
weightedRecallDist = Util.distributionFromArray(weightedRecallList.toArray),
|
||||
weightCorrelationDist = Util.distributionFromArray(weightCorrelationList.toArray),
|
||||
relativePrecisionDist = Util.distributionFromArray(relativePrecisionList.toArray),
|
||||
numClustersWithNegativeCorrelation = numClustersWithNegativeCorrelation,
|
||||
numClustersWithLessThanOneRelativePrecision =
|
||||
numClustersWithLessThanOneRelativePrecision,
|
||||
numClustersWithZeroRecall = numClustersWithZeroRecall,
|
||||
numSingletonClusters = numSingletonClusters
|
||||
)
|
||||
}
|
||||
}
|
||||
}
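  // A minimal sketch (names assumed, not part of the original file) of the weighted
  // averaging performed above: per-cluster recalls are rolled up as
  // sum(recall_i * denominator_i) / sum(denominator_i), so large clusters dominate the overall number.
  private def weightedAverage(recallAndDenominator: Seq[(Double, Double)]): Double = {
    val numerator = recallAndDenominator.map { case (recall, den) => recall * den }.sum
    val denominator = recallAndDenominator.map { case (_, den) => den }.sum
    if (denominator > 0) numerator / denominator else 0.0
  }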
|
||||
|
||||
/**
|
||||
* @param graph Input similarity graph; it needs to be symmetrized, i.e. if u is in v's adjlist, then v needs to be in u's adjlist as well
|
||||
* @param clusters cluster assignments to be evaluated
|
||||
* @return summary of results
|
||||
*/
|
||||
def overallEvaluation(
|
||||
graph: TypedPipe[(Long, Map[Long, Float])],
|
||||
clusters: TypedPipe[(Long, Array[(Int, Float)])],
|
||||
statsPrefix: String
|
||||
)(
|
||||
implicit uniqueId: UniqueID
|
||||
): Execution[Option[OverallResults]] = {
|
||||
clusterLevelEvaluation(graph, clusters, statsPrefix).flatMap(summarizePerClusterResults)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* ./bazel bundle src/scala/com/twitter/simclusters_v2/scalding:cluster_evaluation && \
|
||||
* oscar hdfs --user frigate --host hadoopnest1.atla.twitter.com --bundle cluster_evaluation \
|
||||
* --tool com.twitter.simclusters_v2.scalding.ClusterEvaluationAdhoc --screen --screen-detached \
|
||||
* --tee logs/clusterQualityFor_updatedUnnormalizedInputScores_usingSims20190318 -- \
|
||||
* --simsInputDir /user/frigate/your_ldap/commonDirForClusterEvaluation/classifiedSims_20190314_copiedFromAtlaProc \
|
||||
* --topK 20000000 --date 2019-03-18 --minActiveFollowers 400 \
|
||||
* --topUsersDir /user/frigate/your_ldap/commonDirForClusterEvaluation/top20MUsers_minActiveFollowers400_20190215 \
|
||||
* --maxSimsNeighborsForEval 40 \
|
||||
* --preparedSimsGraph /user/frigate/your_ldap/commonDirForClusterEvaluation/symmetrized_classifiedSims20190318_top20MUsers \
|
||||
* --outputDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/knownForClusterEvaluation \
|
||||
* --knownForDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/knownFor
|
||||
*/
|
||||
object ClusterEvaluationAdhoc extends TwitterExecutionApp {
|
||||
implicit val tz: java.util.TimeZone = DateOps.UTC
|
||||
implicit val dp = DateParser.default
|
||||
|
||||
def job: Execution[Unit] =
|
||||
Execution.getConfigMode.flatMap {
|
||||
case (config, mode) =>
|
||||
Execution.withId { implicit uniqueId =>
|
||||
val args = config.getArgs
|
||||
val knownFor = args
|
||||
.optional("knownForDir").map { location =>
|
||||
KnownForSources.readKnownFor(location)
|
||||
}.getOrElse(KnownForSources.knownFor_20M_Dec11_145K)
|
||||
|
||||
val minActiveFollowers = args.int("minActiveFollowers", 400)
|
||||
val topK = args.int("topK")
|
||||
val date = DateRange.parse(args("date"))
|
||||
|
||||
val topUsersExec =
|
||||
TopUsersSimilarityGraph
|
||||
.topUsers(
|
||||
DAL.readMostRecentSnapshot(UsersourceFlatScalaDataset, date).toTypedPipe,
|
||||
minActiveFollowers,
|
||||
topK
|
||||
)
|
||||
.map(_.id)
|
||||
.count("num_top_users")
|
||||
.make(TypedTsv(args("topUsersDir")))
|
||||
|
||||
val simsGraphExec = topUsersExec.flatMap { topUsers =>
|
||||
TopUsersSimilarityGraph.makeGraph(
|
||||
TopUsersSimilarityGraph.getSubgraphFromUserGroupedInput(
|
||||
TypedPipe.from(WTFCandidatesSource(args("simsInputDir"))),
|
||||
topUsers,
|
||||
args.int("maxSimsNeighborsForEval", 40),
|
||||
degreeThresholdForStat = 5
|
||||
),
|
||||
args("preparedSimsGraph")
|
||||
)
|
||||
}
|
||||
|
||||
val fullExec = simsGraphExec.flatMap { sims =>
|
||||
ClusterEvaluation
|
||||
.clusterLevelEvaluation(sims, knownFor, "eval")
|
||||
.flatMap { clusterResultsPipe =>
|
||||
val clusterResults = clusterResultsPipe.forceToDiskExecution
|
||||
val outputExec = clusterResults.flatMap { pipe =>
|
||||
pipe
|
||||
.map {
|
||||
case (clusterId, (clusterSize, quality)) =>
|
||||
"%d\t%d\t%.2g\t%.2g\t%.1f\t%.2g\t%.2f\t%.2g\t%.2g"
|
||||
.format(
|
||||
clusterId,
|
||||
clusterSize,
|
||||
quality.unweightedRecall.getOrElse(0.0),
|
||||
quality.weightedRecall.getOrElse(0.0),
|
||||
quality.unweightedRecallDenominator.getOrElse(0.0),
|
||||
quality.weightedRecallDenominator.getOrElse(0.0),
|
||||
quality.relativePrecision.getOrElse(0.0),
|
||||
quality.relativePrecisionNumerator.getOrElse(0.0),
|
||||
quality.weightAndProductOfNodeScoresCorrelation.getOrElse(0.0)
|
||||
)
|
||||
}.writeExecution(TypedTsv(args("outputDir")))
|
||||
}
|
||||
|
||||
val printExec = clusterResults.flatMap { pipe =>
|
||||
ClusterEvaluation.summarizePerClusterResults(pipe).map {
|
||||
case Some(res) =>
|
||||
println("Overall results: " + Util.prettyJsonMapper.writeValueAsString(res))
|
||||
case None =>
|
||||
println("No overall results!!! Probably cluster results pipe is empty.")
|
||||
}
|
||||
}
|
||||
|
||||
Execution.zip(outputExec, printExec)
|
||||
}
|
||||
}
|
||||
|
||||
Util.printCounters(fullExec)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
trait ClusterEvaluationBatch extends TwitterScheduledExecutionApp {
|
||||
implicit val tz: java.util.TimeZone = DateOps.UTC
|
||||
implicit val dp = DateParser.default
|
||||
|
||||
def firstTime: String
|
||||
|
||||
def batchDescription: String
|
||||
|
||||
def batchIncrement: Duration
|
||||
|
||||
private lazy val execArgs = AnalyticsBatchExecutionArgs(
|
||||
batchDesc = BatchDescription(batchDescription),
|
||||
firstTime = BatchFirstTime(RichDate(firstTime)),
|
||||
lastTime = None,
|
||||
batchIncrement = BatchIncrement(batchIncrement)
|
||||
)
|
||||
|
||||
val emailAddress: String = "no-reply@twitter.com"
|
||||
|
||||
def knownForDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]]
|
||||
|
||||
def knownForModelVersion: String
|
||||
|
||||
def baselineKnownForDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]]
|
||||
|
||||
def baselineKnownForModelVersion: String
|
||||
|
||||
override def scheduledJob: Execution[Unit] =
|
||||
AnalyticsBatchExecution(execArgs) { implicit dateRange =>
|
||||
Execution.withId { implicit uniqueId =>
|
||||
Execution.withArgs { args =>
|
||||
val baselineKnownFor =
|
||||
KnownForSources.fromKeyVal(
|
||||
DAL
|
||||
.readMostRecentSnapshot(baselineKnownForDALDataset, dateRange.prepend(Days(7)))
|
||||
.toTypedPipe,
|
||||
baselineKnownForModelVersion
|
||||
)
|
||||
|
||||
val knownFor =
|
||||
KnownForSources.fromKeyVal(
|
||||
DAL
|
||||
.readMostRecentSnapshot(knownForDALDataset, dateRange.prepend(Days(7)))
|
||||
.toTypedPipe,
|
||||
knownForModelVersion
|
||||
)
|
||||
|
||||
val inputSimsGraph = TypedPipe
|
||||
.from(FollowingsCosineSimilaritiesManhattanSource())
|
||||
.map(_._2)
|
||||
|
||||
val minActiveFollowers = args.int("minActiveFollowers")
|
||||
val topK = args.int("topK")
|
||||
val maxSimsNeighborsForEval =
|
||||
args.int("maxSimsNeighborsForEval", 40)
|
||||
|
||||
val topUsers = TopUsersSimilarityGraph
|
||||
.topUsers(
|
||||
DAL
|
||||
.readMostRecentSnapshot(UsersourceFlatScalaDataset, dateRange)
|
||||
.toTypedPipe,
|
||||
minActiveFollowers,
|
||||
topK
|
||||
)
|
||||
.map(_.id)
|
||||
.count("num_top_users")
|
||||
|
||||
TopUsersSimilarityGraph
|
||||
.getSubgraphFromUserGroupedInput(
|
||||
fullGraph = inputSimsGraph,
|
||||
usersToInclude = topUsers,
|
||||
maxNeighborsPerNode = maxSimsNeighborsForEval,
|
||||
degreeThresholdForStat = 2
|
||||
)
|
||||
.forceToDiskExecution
|
||||
.flatMap { symmetrizedSims =>
|
||||
val baselineResultsExec = ClusterEvaluation
|
||||
.overallEvaluation(symmetrizedSims, baselineKnownFor, "baselineKnownForEval")
|
||||
val newResultsExec = ClusterEvaluation
|
||||
.overallEvaluation(symmetrizedSims, knownFor, "newKnownForEval")
|
||||
val minSizeOfBiggerClusterForComparison = 10
|
||||
val compareExec = CompareClusters.summarize(
|
||||
CompareClusters.compare(
|
||||
KnownForSources.transpose(baselineKnownFor),
|
||||
KnownForSources.transpose(knownFor),
|
||||
minSizeOfBiggerCluster = minSizeOfBiggerClusterForComparison
|
||||
))
|
||||
|
||||
Execution
|
||||
.zip(baselineResultsExec, newResultsExec, compareExec)
|
||||
.map {
|
||||
case (oldResults, newResults, compareResults) =>
|
||||
val emailText =
|
||||
s"Evaluation Results for baseline knownFor: $baselineKnownForModelVersion \n" +
|
||||
Util.prettyJsonMapper.writeValueAsString(oldResults) +
|
||||
"\n\n-------------------\n\n" +
|
||||
s"Evaluation Results for new knownFor:$knownForModelVersion\n" +
|
||||
Util.prettyJsonMapper.writeValueAsString(newResults) +
|
||||
"\n\n-------------------\n\n" +
|
||||
s"Cosine similarity distribution between $baselineKnownForModelVersion and " +
|
||||
s"$knownForModelVersion cluster membership vectors for " +
|
||||
s"clusters with at least $minSizeOfBiggerClusterForComparison members:\n" +
|
||||
Util.prettyJsonMapper
|
||||
.writeValueAsString(compareResults)
|
||||
|
||||
Util
|
||||
.sendEmail(
|
||||
emailText,
|
||||
s"Evaluation results comparing $knownForModelVersion with baseline $baselineKnownForModelVersion",
|
||||
emailAddress)
|
||||
()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* capesospy-v2 update --build_locally --start_cron cluster_evaluation_for_20M_145k \
|
||||
* src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml
|
||||
*/
|
||||
object ClusterEvaluationFor20M145K extends ClusterEvaluationBatch {
|
||||
override val firstTime: String = "2019-06-11"
|
||||
|
||||
override val batchIncrement: Duration = Days(7)
|
||||
|
||||
override val batchDescription = "com.twitter.simclusters_v2.scalding.ClusterEvaluationFor20M145K"
|
||||
|
||||
override val knownForDALDataset = SimclustersV2KnownFor20M145KUpdatedScalaDataset
|
||||
|
||||
override val knownForModelVersion = ModelVersions.Model20M145KUpdated
|
||||
|
||||
override val baselineKnownForDALDataset = SimclustersV2KnownFor20M145KDec11ScalaDataset
|
||||
|
||||
override val baselineKnownForModelVersion = ModelVersions.Model20M145KDec11
|
||||
}
|
||||
|
||||
/**
|
||||
* capesospy-v2 update --build_locally --start_cron cluster_evaluation_for_20M_145k_2020 \
|
||||
* src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml
|
||||
*/
|
||||
object ClusterEvaluationFor20M145K2020 extends ClusterEvaluationBatch {
|
||||
override val firstTime: String = "2021-01-25"
|
||||
|
||||
override val batchIncrement: Duration = Days(7)
|
||||
|
||||
override val batchDescription =
|
||||
"com.twitter.simclusters_v2.scalding.ClusterEvaluationFor20M145K2020"
|
||||
|
||||
override val knownForDALDataset = SimclustersV2KnownFor20M145K2020ScalaDataset
|
||||
|
||||
override val knownForModelVersion = ModelVersions.Model20M145K2020
|
||||
|
||||
override val baselineKnownForDALDataset = SimclustersV2KnownFor20M145KUpdatedScalaDataset
|
||||
|
||||
override val baselineKnownForModelVersion = ModelVersions.Model20M145KUpdated
|
||||
}
|
Binary file not shown.
@ -1,131 +0,0 @@
|
||||
package com.twitter.simclusters_v2.scalding
|
||||
|
||||
import com.twitter.scalding.{DateOps, DateParser, Execution, Stat, TypedPipe, TypedTsv, UniqueID}
|
||||
import com.twitter.scalding_internal.job.TwitterExecutionApp
|
||||
import com.twitter.simclusters_v2.common.{ClusterId, UserId}
|
||||
import com.twitter.simclusters_v2.scalding.common.Util
|
||||
import com.twitter.simclusters_v2.scalding.common.Util.Distribution
|
||||
|
||||
object CompareClusters {
|
||||
def norm(a: Iterable[Float]): Float = {
|
||||
math
|
||||
.sqrt(a.map { x => x * x }.sum).toFloat
|
||||
}
|
||||
|
||||
def cosine(a: Map[Long, Float], b: Map[Long, Float]): Float = {
|
||||
val intersect = a.toList.collect {
|
||||
case (id, score) if b.contains(id) =>
|
||||
score * b(id)
|
||||
}
|
||||
val dot = if (intersect.nonEmpty) intersect.sum else 0
|
||||
val aNorm = norm(a.values)
|
||||
val bNorm = norm(b.values)
|
||||
if (aNorm > 0 && bNorm > 0) {
|
||||
dot / aNorm / bNorm
|
||||
} else 0
|
||||
}
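  // A minimal usage sketch of the cosine helper above (hypothetical membership vectors
  // keyed by user id). Only the overlapping key (user 2L) contributes to the dot product,
  // and both vectors have unit L2 norm, so the result is 0.8f * 0.8f = 0.64f.
  def cosineExample(): Float =
    cosine(Map(1L -> 0.6f, 2L -> 0.8f), Map(2L -> 0.8f, 3L -> 0.6f))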
|
||||
|
||||
/**
|
||||
* Compare two known-for data sets and generate stats on changes in cluster assignment
|
||||
*/
|
||||
def compareClusterAssignments(
|
||||
newKnownFor: TypedPipe[(UserId, List[(ClusterId, Float)])],
|
||||
oldKnownFor: TypedPipe[(UserId, List[(ClusterId, Float)])]
|
||||
)(
|
||||
implicit uniqueID: UniqueID
|
||||
): Execution[String] = {
|
||||
|
||||
val emptyToSomething = Stat("no_assignment_to_some")
|
||||
val somethingToEmpty = Stat("some_assignment_to_none")
|
||||
val emptyToEmpty = Stat("empty_to_empty")
|
||||
val sameCluster = Stat("same_cluster")
|
||||
val diffCluster = Stat("diff_cluster")
|
||||
|
||||
val calculateStatExec = newKnownFor
|
||||
.outerJoin(oldKnownFor)
|
||||
.map {
|
||||
case (userId, (newKnownForListOpt, oldKnownForListOpt)) =>
|
||||
val newKnownFor = newKnownForListOpt.getOrElse(Nil)
|
||||
val oldKnownFor = oldKnownForListOpt.getOrElse(Nil)
|
||||
|
||||
if (newKnownFor.nonEmpty && oldKnownFor.isEmpty) {
|
||||
emptyToSomething.inc()
|
||||
}
|
||||
if (newKnownFor.isEmpty && oldKnownFor.nonEmpty) {
|
||||
somethingToEmpty.inc()
|
||||
}
|
||||
if (newKnownFor.isEmpty && oldKnownFor.isEmpty) {
|
||||
emptyToEmpty.inc()
|
||||
}
|
||||
|
||||
if (newKnownFor.nonEmpty && oldKnownFor.nonEmpty) {
|
||||
val newClusterId = newKnownFor.head._1
|
||||
val oldClusterId = oldKnownFor.head._1
|
||||
|
||||
if (newClusterId == oldClusterId) {
|
||||
sameCluster.inc()
|
||||
} else {
|
||||
diffCluster.inc()
|
||||
}
|
||||
}
|
||||
userId
|
||||
}
|
||||
.toIterableExecution
|
||||
|
||||
Util.getCustomCountersString(calculateStatExec)
|
||||
}
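  // A minimal sketch of the per-user classification done above (assumed helper, not in
  // the original file): a user whose top-scoring cluster is unchanged counts towards
  // "same_cluster", otherwise "diff_cluster"; users gaining or losing all assignments
  // are counted by the empty/non-empty checks.
  def topClusterUnchanged(
    newKnownFor: List[(ClusterId, Float)],
    oldKnownFor: List[(ClusterId, Float)]
  ): Boolean =
    newKnownFor.nonEmpty && oldKnownFor.nonEmpty && newKnownFor.head._1 == oldKnownFor.head._1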
|
||||
|
||||
/**
|
||||
* Compare two cluster assignments in terms of cosine similarity of corresponding clusters.
|
||||
* Excludes clusters which are too small
|
||||
* @param knownForA
|
||||
* @param knownForB
|
||||
* @param minSizeOfBiggerCluster Set to 10 or some such.
|
||||
* @return
|
||||
*/
|
||||
def compare(
|
||||
knownForA: TypedPipe[(Int, List[(Long, Float)])],
|
||||
knownForB: TypedPipe[(Int, List[(Long, Float)])],
|
||||
minSizeOfBiggerCluster: Int
|
||||
): TypedPipe[(Int, Float)] = {
|
||||
knownForA
|
||||
.outerJoin(knownForB)
|
||||
.collect {
|
||||
case (clusterId, (membersInAOpt, membersInBOpt))
|
||||
if membersInAOpt.exists(_.size >= minSizeOfBiggerCluster) || membersInBOpt
|
||||
.exists(_.size >= minSizeOfBiggerCluster) =>
|
||||
val membersInA =
|
||||
membersInAOpt.map(_.toMap).getOrElse(Map.empty[Long, Float])
|
||||
val membersInB =
|
||||
membersInBOpt.map(_.toMap).getOrElse(Map.empty[Long, Float])
|
||||
(clusterId, cosine(membersInA, membersInB))
|
||||
}
|
||||
}
|
||||
|
||||
def summarize(clusterToCosines: TypedPipe[(Int, Float)]): Execution[Option[Distribution]] = {
|
||||
clusterToCosines.values.map(x => List(x)).sum.toOptionExecution.map { listOpt =>
|
||||
listOpt.map { list => Util.distributionFromArray(list.map(_.toDouble).toArray) }
|
||||
}
|
||||
}
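  // A minimal end-to-end sketch (assumed pipe names): compare two transposed known-for
  // data sets cluster by cluster, then summarize the per-cluster cosines into a
  // Distribution. Both inputs are keyed by clusterId with (userId, score) members.
  def compareAndSummarize(
    clusterToUsersA: TypedPipe[(Int, List[(Long, Float)])],
    clusterToUsersB: TypedPipe[(Int, List[(Long, Float)])]
  ): Execution[Option[Distribution]] =
    summarize(compare(clusterToUsersA, clusterToUsersB, minSizeOfBiggerCluster = 10))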
|
||||
}
|
||||
|
||||
object CompareClustersAdhoc extends TwitterExecutionApp {
|
||||
implicit val tz: java.util.TimeZone = DateOps.UTC
|
||||
implicit val dp = DateParser.default
|
||||
|
||||
def job: Execution[Unit] =
|
||||
Execution.getConfigMode.flatMap {
|
||||
case (config, mode) =>
|
||||
Execution.withId { implicit uniqueId =>
|
||||
val args = config.getArgs
|
||||
|
||||
val knownForA = KnownForSources.transpose(KnownForSources.readKnownFor(args("knownForA")))
|
||||
val knownForB = KnownForSources.transpose(KnownForSources.readKnownFor(args("knownForB")))
|
||||
|
||||
CompareClusters
|
||||
.compare(knownForA, knownForB, minSizeOfBiggerCluster = 10)
|
||||
.map { case (cId, cos) => "%d\t%.2f".format(cId, cos) }
|
||||
.writeExecution(TypedTsv(args("outputDir")))
|
||||
}
|
||||
}
|
||||
}
|
Binary file not shown.
@ -1,330 +0,0 @@
|
||||
package com.twitter.simclusters_v2.scalding
|
||||
|
||||
import com.twitter.algebird.Monoid
|
||||
import com.twitter.logging.Logger
|
||||
import com.twitter.scalding.{Execution, TypedPipe, TypedTsv}
|
||||
import com.twitter.scalding_internal.job.TwitterExecutionApp
|
||||
import com.twitter.simclusters_v2.hdfs_sources.AdhocKeyValSources
|
||||
import java.util
|
||||
import no.uib.cipr.matrix.Matrix
|
||||
import no.uib.cipr.matrix.sparse.{ArpackSym, LinkedSparseMatrix}
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
object EigenVectorsForSparseSymmetric {
|
||||
val log: Logger = Logger()
|
||||
|
||||
/**
|
||||
* Construct a matrix from its rows, specified as a nested map. The outer map is indexed by rowId, and the inner maps are indexed by columnId.
|
||||
* Note that the input matrix is intended to be symmetric.
|
||||
*
|
||||
* @param map A map specifying the rows of the matrix. The outer map is indexed by rowId, and the inner maps are indexed by columnId. Both rows and columns are zero-indexed.
|
||||
* @param nRows number of rows in matrix
|
||||
* @param nCols number of columns in matrix
|
||||
*
|
||||
* @return the constructed matrix
|
||||
*/
|
||||
def getMatrix(map: Map[Int, Map[Int, Double]], nRows: Int, nCols: Int): Matrix = {
|
||||
val nonzeros = map.toSeq.flatMap {
|
||||
case (i, subMap) =>
|
||||
subMap.toSeq.map {
|
||||
case (j, value) =>
|
||||
(i, j, value)
|
||||
}
|
||||
}
|
||||
getMatrix(nonzeros, nRows, nCols)
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct a matrix from an iterable of its non-zero entries. Note that the input matrix is intended to be symmetric.
|
||||
*
|
||||
* @param nonzeros non-zeros in (i, j, v) format, where i is row, j is column, and v is value. Both rows and columns are zero-indexed.
|
||||
* @param nRows number of rows in matrix
|
||||
* @param nCols number of columns in matrix
|
||||
*
|
||||
* @return the constructed matrix
|
||||
*/
|
||||
def getMatrix(nonzeros: Iterable[(Int, Int, Double)], nRows: Int, nCols: Int): Matrix = {
|
||||
val matrix = new LinkedSparseMatrix(nRows, nCols)
|
||||
var numEntries = 0
|
||||
var maxRow = 0
|
||||
var maxCol = 0
|
||||
|
||||
nonzeros.foreach {
|
||||
case (i, j, v) =>
|
||||
if (i > maxRow) {
|
||||
maxRow = i
|
||||
}
|
||||
if (j > maxCol) {
|
||||
maxCol = j
|
||||
}
|
||||
numEntries += 1
|
||||
matrix.set(i, j, v)
|
||||
}
|
||||
log.info(
|
||||
"Finished building matrix with %d entries and maxRow %d and maxCol %d"
|
||||
.format(numEntries, maxRow, maxCol))
|
||||
|
||||
matrix
|
||||
}
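  // A minimal sketch (hypothetical values) building a 3x3 symmetric matrix from its
  // non-zero entries; each off-diagonal entry is listed in both directions so that the
  // matrix is already symmetric before the EVD is attempted.
  def exampleSymmetricMatrix(): Matrix =
    getMatrix(
      Seq((0, 1, 0.5), (1, 0, 0.5), (1, 2, 0.25), (2, 1, 0.25), (0, 0, 1.0)),
      nRows = 3,
      nCols = 3)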
|
||||
|
||||
/**
|
||||
* Prints out various diagnostics about how much the given matrix differs from a perfectly
|
||||
* symmetric matrix. If (i,j) and (j,i) are different, it sets both of them to be the max of the two.
|
||||
* Call this function before invoking EVD.
|
||||
*
|
||||
* @param matrix Matrix which is modified (if need be) in place.
|
||||
*/
|
||||
def ensureMatrixIsSymmetric(matrix: Matrix): Unit = {
|
||||
var numUnequalEntries = 0
|
||||
var numEntriesDifferentBy1Percent = 0
|
||||
var numEqualEntries = 0
|
||||
var numUnequalDueToZero = 0
|
||||
var maxUnequal = (0, 0, 0.0, 0.0)
|
||||
matrix.iterator().asScala.foreach { entry =>
|
||||
val curr = entry.get()
|
||||
val opp = matrix.get(entry.column(), entry.row())
|
||||
if (curr == opp) {
|
||||
numEqualEntries += 1
|
||||
} else {
|
||||
numUnequalEntries += 1
|
||||
if (opp == 0) {
|
||||
numUnequalDueToZero += 1
|
||||
}
|
||||
if (opp != 0 && (math.abs(curr - opp) / math.min(curr, opp)) > 0.01) {
|
||||
numEntriesDifferentBy1Percent += 1
|
||||
}
|
||||
if (opp != 0 && math.abs(curr - opp) > maxUnequal._4) {
|
||||
maxUnequal = (entry.row(), entry.column(), curr, math.abs(curr - opp))
|
||||
}
|
||||
val max = math.max(curr, opp)
|
||||
matrix.set(entry.column(), entry.row(), max)
|
||||
matrix.set(entry.row(), entry.column(), max)
|
||||
}
|
||||
}
|
||||
|
||||
var numUnEqualPrinted = 0
|
||||
matrix.iterator().asScala.foreach { entry =>
|
||||
val opp = matrix.get(entry.column(), entry.row())
|
||||
if (numUnEqualPrinted < 10 && entry.get() != opp) {
|
||||
numUnEqualPrinted += 1
|
||||
log.info(
|
||||
"Entries for (%d, %d) are %s and %s"
|
||||
.format(entry.row(), entry.column(), entry.get(), opp))
|
||||
}
|
||||
}
|
||||
|
||||
log.info(
|
||||
"Num unequal entries: %d, num unequal due to zero: %d, num unequal by 1percent or more: %d, num equal entries: %d, maxUnequal: %s"
|
||||
.format(
|
||||
numUnequalEntries,
|
||||
numUnequalDueToZero,
|
||||
numEntriesDifferentBy1Percent,
|
||||
numEqualEntries,
|
||||
maxUnequal))
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the top-k eigenvalues (largest magnitude) and eigenvectors for an input matrix.
|
||||
* "Top" means largest in magnitude.
|
||||
* Input matrix needs to be perfectly symmetric; if it's not, this function will fail.
|
||||
*
|
||||
* Many of the eigenvectors will have very small values along most of the dimensions. This method also
|
||||
* only retains the larger entries in each eigenvector.
|
||||
*
|
||||
* @param matrix symmetric input matrix.
|
||||
* @param k how many of the top eigenvectors to get.
|
||||
* @param ratioToLargestCutoff An entry needs to be at least 1/ratioToLargestCutoff of the biggest entry in that vector to be retained.
|
||||
*
|
||||
* @return seq of (eigenvalue, eigenvector) pairs.
|
||||
*/
|
||||
def getTruncatedEVD(
|
||||
matrix: Matrix,
|
||||
k: Int,
|
||||
ratioToLargestCutoff: Float
|
||||
): Seq[(Double, Seq[(Int, Double)])] = {
|
||||
val solver = new ArpackSym(matrix)
|
||||
val resultsMap = solver.solve(k, ArpackSym.Ritz.LM).asScala.toMap
|
||||
val results = resultsMap.toIndexedSeq.sortBy { case (eigValue, _) => -eigValue }
|
||||
results.zipWithIndex.map {
|
||||
case ((eigValue, denseVectorJava), index) =>
|
||||
val denseVector = new Array[Double](denseVectorJava.size())
|
||||
denseVector.indices.foreach { index => denseVector(index) = denseVectorJava.get(index) }
|
||||
val denseVectorMax = denseVector.maxBy { entry => math.abs(entry) }
|
||||
val cutOff = math.abs(denseVectorMax) / ratioToLargestCutoff
|
||||
val significantEntries = denseVector.zipWithIndex
|
||||
.filter { case (vectorEntry, _) => math.abs(vectorEntry) >= cutOff }
|
||||
.sortBy { case (vectorEntry, _) => -1 * math.abs(vectorEntry) }
|
||||
(eigValue.toDouble, significantEntries.toSeq.map(_.swap))
|
||||
}
|
||||
}
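  // A minimal usage sketch (hypothetical parameters): take the top 2 eigenpairs of a
  // small symmetric matrix, keeping only entries that are at least 1/100th of the
  // largest-magnitude entry in each eigenvector.
  def truncatedEvdExample(matrix: Matrix): Seq[(Double, Seq[(Int, Double)])] =
    getTruncatedEVD(matrix, k = 2, ratioToLargestCutoff = 100.0f)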
|
||||
|
||||
/**
|
||||
* Compute U*Diag*Ut - where Diag is a diagonal matrix, and U is a sparse matrix.
|
||||
* This is primarily for testing - to make sure that the computed eigenvectors can be used to
|
||||
* reconstruct the original matrix up to some reasonable approximation.
|
||||
*
|
||||
* @param diagToUColumns seq of (diagonal entries, associated column in U)
|
||||
* @param cutoff cutoff for including a value in the result.
|
||||
*
|
||||
* @return result of multiplication, returned as a map of the rows in the results.
|
||||
*/
|
||||
def uTimesDiagTimesUT(
|
||||
diagToUColumns: Seq[(Double, Seq[(Int, Double)])],
|
||||
cutoff: Double
|
||||
): Map[Int, Map[Int, Double]] = {
|
||||
val result = new util.HashMap[Int, util.HashMap[Int, Double]]()
|
||||
diagToUColumns.foreach {
|
||||
case (diag, uColumn) =>
|
||||
uColumn.foreach {
|
||||
case (i, iVal) =>
|
||||
uColumn.foreach {
|
||||
case (j, jVal) =>
|
||||
val prod = diag * iVal * jVal
|
||||
if (result.containsKey(i)) {
|
||||
val newVal = if (result.get(i).containsKey(j)) {
|
||||
result.get(i).get(j) + prod
|
||||
} else prod
|
||||
result.get(i).put(j, newVal)
|
||||
} else {
|
||||
result.put(i, new util.HashMap[Int, Double])
|
||||
result.get(i).put(j, prod)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
val unfiltered = result.asScala.toMap.mapValues(_.asScala.toMap)
|
||||
unfiltered
|
||||
.mapValues { m => m.filter { case (_, value) => math.abs(value) >= cutoff } }
|
||||
.filter { case (_, vector) => vector.nonEmpty }
|
||||
}
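  // A minimal sketch (assumed helper, not in the original file) tying getTruncatedEVD and
  // uTimesDiagTimesUT together: approximately reconstruct the original matrix from its
  // top-k eigenpairs and drop entries below a small cutoff.
  def reconstructFromTopK(matrix: Matrix, k: Int): Map[Int, Map[Int, Double]] =
    uTimesDiagTimesUT(getTruncatedEVD(matrix, k, ratioToLargestCutoff = 1000.0f), cutoff = 1e-4)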
|
||||
|
||||
/** Note: This requires a full EVD to correctly compute the inverse! :-( */
|
||||
def getInverseFromEVD(
|
||||
evd: Seq[(Double, Seq[(Int, Double)])],
|
||||
cutoff: Double
|
||||
): Map[Int, Map[Int, Double]] = {
|
||||
val evdInverse = evd.map {
|
||||
case (eigValue, eigVector) =>
|
||||
(1.0 / eigValue, eigVector)
|
||||
}
|
||||
uTimesDiagTimesUT(evdInverse, cutoff)
|
||||
}
|
||||
}
|
||||
|
||||
object PCAProjectionMatrixAdhoc extends TwitterExecutionApp {
|
||||
val log = Logger()
|
||||
|
||||
def job: Execution[Unit] =
|
||||
Execution.getConfigMode.flatMap {
|
||||
case (config, _) =>
|
||||
Execution.withId { _ =>
|
||||
val args = config.getArgs
|
||||
val k = args.int("k", 100)
|
||||
val ratioToLargestEntryInVectorCutoff = args.int("ratioToLargestEntryInVectorCutoff", 100)
|
||||
val minClusterFavers = args.int("minClusterFavers", 1000)
|
||||
val input = TypedPipe.from(AdhocKeyValSources.clusterDetailsSource(args("inputDir")))
|
||||
val outputDir = args("outputDir")
|
||||
|
||||
val filteredClustersExec =
|
||||
input
|
||||
.collect {
|
||||
case ((_, clusterId), details)
|
||||
if details.numUsersWithNonZeroFavScore > minClusterFavers =>
|
||||
clusterId
|
||||
}
|
||||
.toIterableExecution
|
||||
.map { fc =>
|
||||
val fcSet = fc.toSet
|
||||
log.info("Number of clusters with favers more than %d is %d"
|
||||
.format(minClusterFavers, fcSet.size))
|
||||
fcSet
|
||||
}
|
||||
|
||||
filteredClustersExec
|
||||
.flatMap { filteredClusters =>
|
||||
input.flatMap {
|
||||
case ((_, clusterId), details) =>
|
||||
if (filteredClusters(clusterId)) {
|
||||
details.neighborClusters.getOrElse(Nil).collect {
|
||||
case neighbor
|
||||
if filteredClusters(
|
||||
neighbor.clusterId) && neighbor.favCosineSimilarity.isDefined =>
|
||||
(clusterId, neighbor.clusterId, neighbor.favCosineSimilarity.get)
|
||||
}
|
||||
} else Nil
|
||||
}.toIterableExecution
|
||||
}
|
||||
.flatMap { edgesIter =>
|
||||
val edges = edgesIter.toSeq
|
||||
val oldIdToNewId = edges
|
||||
.flatMap { case (i, j, _) => Seq(i, j) }
|
||||
.distinct
|
||||
.zipWithIndex
|
||||
.toMap
|
||||
|
||||
val mapString = oldIdToNewId.toList
|
||||
.take(5).map {
|
||||
case (old, nw) =>
|
||||
Seq(old, nw).mkString(" ")
|
||||
}.mkString("\n")
|
||||
log.info("A few entries of OldId to NewId map is")
|
||||
log.info(mapString)
|
||||
|
||||
val newIdToOldId = oldIdToNewId.map(_.swap)
|
||||
log.info(
|
||||
"Num clusters after filtering out those with no neighbors with favers more than %d is %d"
|
||||
.format(minClusterFavers, oldIdToNewId.size))
|
||||
val newEdges = edges.map {
|
||||
case (oldI, oldJ, value) =>
|
||||
(oldIdToNewId(oldI), oldIdToNewId(oldJ), value)
|
||||
}
|
||||
log.info("Going to build matrix")
|
||||
val matrix = EigenVectorsForSparseSymmetric.getMatrix(
|
||||
newEdges,
|
||||
oldIdToNewId.size,
|
||||
oldIdToNewId.size)
|
||||
EigenVectorsForSparseSymmetric.ensureMatrixIsSymmetric(matrix)
|
||||
|
||||
log.info("Going to solve now for %d eigenvalues".format(k))
|
||||
val tic = System.currentTimeMillis()
|
||||
val results = EigenVectorsForSparseSymmetric.getTruncatedEVD(
|
||||
matrix,
|
||||
k,
|
||||
ratioToLargestEntryInVectorCutoff)
|
||||
val toc = System.currentTimeMillis()
|
||||
log.info("Finished solving in %.2f minutes".format((toc - tic) / 1000 / 60.0))
|
||||
|
||||
val eigValues = results.map(_._1).map { x => "%.3g".format(x) }.mkString(" ")
|
||||
val eigValueNorm = math.sqrt(results.map(_._1).map(x => x * x).sum)
|
||||
val matrixNorm = math.sqrt(matrix.iterator().asScala.map(_.get()).map(x => x * x).sum)
|
||||
|
||||
println(
|
||||
"matrixNorm %s, eigValueNorm %s, explained fraction %s"
|
||||
.format(matrixNorm, eigValueNorm, eigValueNorm / matrixNorm))
|
||||
|
||||
log.info("The eigenvalues are:")
|
||||
log.info(eigValues)
|
||||
|
||||
val nnzInEigenVectors = results.map(_._2.size).sum
|
||||
log.info("Average nnz per eigenvector using ratioToLargestCutoff %d is %.2g"
|
||||
.format(ratioToLargestEntryInVectorCutoff, nnzInEigenVectors * 1.0 / results.size))
|
||||
val transposedRaw = results.zipWithIndex.flatMap {
|
||||
case ((_, eigVector), eigIndex) =>
|
||||
eigVector.map {
|
||||
case (index, vectorEntry) =>
|
||||
val clusterId = newIdToOldId(index)
|
||||
Map(clusterId -> List((eigIndex, vectorEntry)))
|
||||
}
|
||||
}
|
||||
val transposed = Monoid.sum(transposedRaw).mapValues { rowForCluster =>
|
||||
rowForCluster
|
||||
.map {
|
||||
case (dimId, weight) =>
|
||||
"%d:%.2g".format(dimId, weight)
|
||||
}.mkString(" ")
|
||||
}
|
||||
TypedPipe.from(transposed.toSeq).writeExecution(TypedTsv(outputDir))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
Binary file not shown.
@ -1,332 +0,0 @@
|
||||
package com.twitter.simclusters_v2.scalding
|
||||
|
||||
import com.twitter.dal.client.dataset.KeyValDALDataset
|
||||
import com.twitter.dal.client.dataset.SnapshotDALDataset
|
||||
import com.twitter.scalding._
|
||||
import com.twitter.scalding_internal.dalv2.DAL
|
||||
import com.twitter.scalding_internal.dalv2.DALWrite.D
|
||||
import com.twitter.scalding_internal.dalv2.DALWrite.WriteExtension
|
||||
import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossClusterSameDC
|
||||
import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossDC
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
|
||||
import com.twitter.simclusters_v2.common.ClusterId
|
||||
import com.twitter.simclusters_v2.common.ModelVersions
|
||||
import com.twitter.simclusters_v2.common.UserId
|
||||
import com.twitter.simclusters_v2.hdfs_sources.AdhocKeyValSources
|
||||
import com.twitter.simclusters_v2.hdfs_sources.AggregatableProducerSimclustersEmbeddingsByLogFavScore2020ScalaDataset
|
||||
import com.twitter.simclusters_v2.hdfs_sources.SimclustersV2InterestedInFromAggregatableProducerEmbeddings20M145K2020ScalaDataset
|
||||
import com.twitter.simclusters_v2.hdfs_sources.SimclustersV2UserToInterestedInFromAggregatableProducerEmbeddings20M145K2020ScalaDataset
|
||||
import com.twitter.simclusters_v2.hdfs_sources.UserAndNeighborsFixedPathSource
|
||||
import com.twitter.simclusters_v2.hdfs_sources.UserUserNormalizedGraphScalaDataset
|
||||
import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn
|
||||
import com.twitter.simclusters_v2.thriftscala.InternalId
|
||||
import com.twitter.simclusters_v2.thriftscala.ModelVersion
|
||||
import com.twitter.simclusters_v2.thriftscala.SimClustersEmbedding
|
||||
import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId
|
||||
import com.twitter.simclusters_v2.thriftscala.UserAndNeighbors
|
||||
import com.twitter.simclusters_v2.thriftscala.UserToInterestedInClusterScores
|
||||
import com.twitter.simclusters_v2.thriftscala.UserToInterestedInClusters
|
||||
import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp
|
||||
import com.twitter.wtf.scalding.jobs.common.ScheduledExecutionApp
|
||||
import java.util.TimeZone
|
||||
|
||||
/**
|
||||
* Production job for computing interestedIn data set from the aggregatable producer embeddings for the model version 20M145K2020.
|
||||
* It writes the data set in KeyVal format to produce a MH DAL data set.
|
||||
*
|
||||
* A high level description of this job:
|
||||
* - Read the APE dataset
|
||||
* - Apply log1p to the scores from the above dataset, since the producer scores are high
|
||||
* - Normalize the scores for each producer (offline benchmarking has shown better results from this step)
|
||||
* - Truncate the number of clusters for each producer from the APE dataset to reduce noise
|
||||
* - Compute interestedIn
|
||||
*
|
||||
* To deploy the job:
|
||||
*
|
||||
* capesospy-v2 update --build_locally --start_cron interested_in_from_ape_2020 \
|
||||
* src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml
|
||||
*/
|
||||
object InterestedInFromAPE2020BatchApp extends InterestedInFromAggregatableProducerEmbeddingsBase {
|
||||
|
||||
override val firstTime: RichDate = RichDate("2021-03-03")
|
||||
|
||||
override val batchIncrement: Duration = Days(7)
|
||||
|
||||
override def modelVersion: ModelVersion = ModelVersion.Model20m145k2020
|
||||
|
||||
override def producerEmbeddingsInputKVDataset: KeyValDALDataset[
|
||||
KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]
|
||||
] = AggregatableProducerSimclustersEmbeddingsByLogFavScore2020ScalaDataset
|
||||
|
||||
override def interestedInFromAPEOutputKVDataset: KeyValDALDataset[
|
||||
KeyVal[UserId, ClustersUserIsInterestedIn]
|
||||
] = SimclustersV2InterestedInFromAggregatableProducerEmbeddings20M145K2020ScalaDataset
|
||||
|
||||
override def interestedInFromAPEOutputThriftDatset: SnapshotDALDataset[
|
||||
UserToInterestedInClusters
|
||||
] = SimclustersV2UserToInterestedInFromAggregatableProducerEmbeddings20M145K2020ScalaDataset
|
||||
}
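// A minimal sketch (hypothetical object, not part of the original file) of the score
// transformation described in the job comment above: apply log1p to dampen the large APE
// producer scores, then L2-normalize each producer's cluster vector.
object Log1pNormalizeSketch {
  def log1pAndNormalize(clusters: Seq[(Int, Float)]): Seq[(Int, Float)] = {
    val damped = clusters.map { case (clusterId, score) => (clusterId, math.log1p(score)) }
    val l2Norm = math.sqrt(damped.map { case (_, score) => score * score }.sum)
    damped.map {
      case (clusterId, score) =>
        (clusterId, if (l2Norm > 0) (score / l2Norm).toFloat else 0.0f)
    }
  }
}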
|
||||
|
||||
trait InterestedInFromAggregatableProducerEmbeddingsBase extends ScheduledExecutionApp {
|
||||
def modelVersion: ModelVersion
|
||||
|
||||
def interestedInFromAPEOutputKVDataset: KeyValDALDataset[
|
||||
KeyVal[UserId, ClustersUserIsInterestedIn]
|
||||
]
|
||||
|
||||
def producerEmbeddingsInputKVDataset: KeyValDALDataset[
|
||||
KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]
|
||||
]
|
||||
|
||||
def interestedInFromAPEOutputThriftDatset: SnapshotDALDataset[UserToInterestedInClusters]
|
||||
|
||||
override def runOnDateRange(
|
||||
args: Args
|
||||
)(
|
||||
implicit dateRange: DateRange,
|
||||
timeZone: TimeZone,
|
||||
uniqueID: UniqueID
|
||||
): Execution[Unit] = {
|
||||
//Input args for the run
|
||||
val socialProofThreshold = args.int("socialProofThreshold", 2)
|
||||
val maxClustersFromProducer = args.int("maxClustersPerProducer", 5)
|
||||
val maxClustersPerUserFinalResult = args.int("maxInterestedInClustersPerUser", 200)
|
||||
|
||||
//Path variables
|
||||
val interestedInFromProducersPath =
|
||||
s"/user/cassowary/manhattan_sequence_files/interested_in_from_ape/" + modelVersion
|
||||
|
||||
val interestedInFromProducersThriftPath =
|
||||
s"/user/cassowary/manhattan_sequence_files/interested_in_from_ape_thrift/" + modelVersion
|
||||
|
||||
val userUserGraph: TypedPipe[UserAndNeighbors] =
|
||||
DAL
|
||||
.readMostRecentSnapshotNoOlderThan(UserUserNormalizedGraphScalaDataset, Days(30))
|
||||
.withRemoteReadPolicy(AllowCrossDC)
|
||||
.toTypedPipe
|
||||
|
||||
val producerEmbeddings = DAL
|
||||
.readMostRecentSnapshotNoOlderThan(
|
||||
producerEmbeddingsInputKVDataset,
|
||||
Days(30)).withRemoteReadPolicy(AllowCrossClusterSameDC).toTypedPipe.map {
|
||||
case KeyVal(producer, embeddings) => (producer, embeddings)
|
||||
}
|
||||
|
||||
val result = InterestedInFromAggregatableProducerEmbeddingsBase.run(
|
||||
userUserGraph,
|
||||
producerEmbeddings,
|
||||
maxClustersFromProducer,
|
||||
socialProofThreshold,
|
||||
maxClustersPerUserFinalResult,
|
||||
modelVersion)
|
||||
|
||||
val keyValExec =
|
||||
result
|
||||
.map { case (userId, clusters) => KeyVal(userId, clusters) }
|
||||
.writeDALVersionedKeyValExecution(
|
||||
interestedInFromAPEOutputKVDataset,
|
||||
D.Suffix(interestedInFromProducersPath)
|
||||
)
|
||||
val thriftExec =
|
||||
result
|
||||
.map {
|
||||
case (userId, clusters) =>
|
||||
UserToInterestedInClusters(
|
||||
userId,
|
||||
ModelVersions.toKnownForModelVersion(modelVersion),
|
||||
clusters.clusterIdToScores)
|
||||
}
|
||||
.writeDALSnapshotExecution(
|
||||
interestedInFromAPEOutputThriftDatset,
|
||||
D.Daily,
|
||||
D.Suffix(interestedInFromProducersThriftPath),
|
||||
D.EBLzo(),
|
||||
dateRange.end
|
||||
)
|
||||
Execution.zip(keyValExec, thriftExec).unit
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Adhoc job to generate the interestedIn from aggregatable producer embeddings for the model version 20M145K2020
|
||||
*
|
||||
* scalding remote run \
|
||||
* --user cassowary \
|
||||
* --keytab /var/lib/tss/keys/fluffy/keytabs/client/cassowary.keytab \
|
||||
* --principal service_account@TWITTER.BIZ \
|
||||
* --cluster bluebird-qus1 \
|
||||
* --main-class com.twitter.simclusters_v2.scalding.InterestedInFromAPE2020AdhocApp \
|
||||
* --target src/scala/com/twitter/simclusters_v2/scalding:interested_in_from_ape_2020-adhoc \
|
||||
* --hadoop-properties "mapreduce.map.memory.mb=8192 mapreduce.map.java.opts='-Xmx7618M' mapreduce.reduce.memory.mb=8192 mapreduce.reduce.java.opts='-Xmx7618M'" \
|
||||
* -- --outputDir /gcs/user/cassowary/adhoc/your_ldap/interested_in_from_ape_2020_keyval --date 2021-03-05
|
||||
*/
|
||||
object InterestedInFromAPE2020AdhocApp extends AdhocExecutionApp {
|
||||
override def runOnDateRange(
|
||||
args: Args
|
||||
)(
|
||||
implicit dateRange: DateRange,
|
||||
timeZone: TimeZone,
|
||||
uniqueID: UniqueID
|
||||
): Execution[Unit] = {
|
||||
val outputDir = args("outputDir")
|
||||
val socialProofThreshold = args.int("socialProofThreshold", 2)
|
||||
val maxClustersPerUserFinalResult = args.int("maxInterestedInClustersPerUser", 200)
|
||||
val maxClustersFromProducer = args.int("maxClustersFromProducer", 5)
|
||||
val inputGraph = args.optional("graphInputDir") match {
|
||||
case Some(inputDir) => TypedPipe.from(UserAndNeighborsFixedPathSource(inputDir))
|
||||
case None =>
|
||||
DAL
|
||||
.readMostRecentSnapshotNoOlderThan(UserUserNormalizedGraphScalaDataset, Days(30))
|
||||
.withRemoteReadPolicy(AllowCrossClusterSameDC)
|
||||
.toTypedPipe
|
||||
}
|
||||
|
||||
val producerEmbeddings = DAL
|
||||
.readMostRecentSnapshotNoOlderThan(
|
||||
AggregatableProducerSimclustersEmbeddingsByLogFavScore2020ScalaDataset,
|
||||
Days(30)).withRemoteReadPolicy(AllowCrossClusterSameDC).toTypedPipe.map {
|
||||
case KeyVal(producer, embeddings) => (producer, embeddings)
|
||||
}
|
||||
|
||||
val result = InterestedInFromAggregatableProducerEmbeddingsBase.run(
|
||||
inputGraph,
|
||||
producerEmbeddings,
|
||||
maxClustersFromProducer,
|
||||
socialProofThreshold,
|
||||
maxClustersPerUserFinalResult,
|
||||
ModelVersion.Model20m145k2020)
|
||||
|
||||
result
|
||||
.writeExecution(AdhocKeyValSources.interestedInSource(outputDir))
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper functions
|
||||
*/
|
||||
object InterestedInFromAggregatableProducerEmbeddingsBase {
|
||||
|
||||
/**
|
||||
* Helper function to prune the embeddings
|
||||
* @param embeddingsWithScore embeddings
|
||||
* @param maxClusters number of clusters to keep, per userId
|
||||
* @param uniqueId for stats
|
||||
* @return
|
||||
*/
|
||||
def getPrunedEmbeddings(
|
||||
embeddingsWithScore: TypedPipe[(UserId, Seq[(ClusterId, Float)])],
|
||||
maxClusters: Int
|
||||
)(
|
||||
implicit uniqueId: UniqueID
|
||||
): TypedPipe[(UserId, Array[(ClusterId, Float)])] = {
|
||||
val numProducerMappings = Stat("num_producer_embeddings_total")
|
||||
val numProducersWithLargeClusterMappings = Stat(
|
||||
"num_producers_with_more_clusters_than_threshold")
|
||||
val numProducersWithSmallClusterMappings = Stat(
|
||||
"num_producers_with_clusters_less_than_threshold")
|
||||
val totalClustersCoverageProducerEmbeddings = Stat("num_clusters_total_producer_embeddings")
|
||||
embeddingsWithScore.map {
|
||||
case (producerId, clusterArray) =>
|
||||
numProducerMappings.inc()
|
||||
val clusterSize = clusterArray.size
|
||||
totalClustersCoverageProducerEmbeddings.incBy(clusterSize)
|
||||
val prunedList = if (clusterSize > maxClusters) {
|
||||
numProducersWithLargeClusterMappings.inc()
|
||||
clusterArray
|
||||
.sortBy {
|
||||
case (_, knownForScore) => -knownForScore
|
||||
}.take(maxClusters)
|
||||
} else {
|
||||
numProducersWithSmallClusterMappings.inc()
|
||||
clusterArray
|
||||
}
|
||||
(producerId, prunedList.toArray)
|
||||
}
|
||||
}
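  // A minimal sketch of the pruning rule above on plain collections (hypothetical data):
  // keep at most maxClusters clusters per producer, ordered by descending known-for score.
  // Here maxClusters = 2, so the result is Seq((202, 0.9f), (303, 0.5f)).
  def pruneExample(): Seq[(Int, Float)] =
    Seq(101 -> 0.1f, 202 -> 0.9f, 303 -> 0.5f)
      .sortBy { case (_, score) => -score }
      .take(2)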
|
||||
|
||||
/**
|
||||
* Helper function to remove all scores except follow and logFav
|
||||
* @param interestedInResult interestedIn clusters for a user
|
||||
* @return
|
||||
*/
|
||||
def getInterestedInDiscardScores(
|
||||
interestedInResult: TypedPipe[(UserId, List[(ClusterId, UserToInterestedInClusterScores)])]
|
||||
): TypedPipe[(UserId, List[(ClusterId, UserToInterestedInClusterScores)])] = {
|
||||
interestedInResult.map {
|
||||
case (srcId, fullClusterList) =>
|
||||
val fullClusterListWithDiscardedScores = fullClusterList.map {
|
||||
case (clusterId, clusterDetails) =>
|
||||
val clusterDetailsWithoutSocial = UserToInterestedInClusterScores(
|
||||
// We are not planning to use the other scores except for logFav and Follow.
|
||||
// Hence, setting the others to None for now; we can add them back when needed
|
||||
followScore = clusterDetails.followScore,
|
||||
logFavScore = clusterDetails.logFavScore,
|
||||
logFavScoreClusterNormalizedOnly = clusterDetails.logFavScoreClusterNormalizedOnly
|
||||
)
|
||||
(clusterId, clusterDetailsWithoutSocial)
|
||||
}
|
||||
(srcId, fullClusterListWithDiscardedScores)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper function to normalize the embeddings
|
||||
* @param embeddings cluster embeddings
|
||||
* @return
|
||||
*/
|
||||
def getNormalizedEmbeddings(
|
||||
embeddings: TypedPipe[(UserId, Seq[(ClusterId, Float)])]
|
||||
): TypedPipe[(UserId, Seq[(ClusterId, Float)])] = {
|
||||
embeddings.map {
|
||||
case (userId, clustersWithScores) =>
|
||||
val l2norm = math.sqrt(clustersWithScores.map(_._2).map(score => score * score).sum)
|
||||
(
|
||||
userId,
|
||||
clustersWithScores.map {
|
||||
case (clusterId, score) => (clusterId, (score / l2norm).toFloat)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
def run(
|
||||
userUserGraph: TypedPipe[UserAndNeighbors],
|
||||
producerEmbeddings: TypedPipe[(SimClustersEmbeddingId, SimClustersEmbedding)],
|
||||
maxClustersFromProducer: Int,
|
||||
socialProofThreshold: Int,
|
||||
maxClustersPerUserFinalResult: Int,
|
||||
modelVersion: ModelVersion
|
||||
)(
|
||||
implicit uniqueId: UniqueID
|
||||
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
|
||||
import InterestedInFromKnownFor._
|
||||
|
||||
val producerEmbeddingsWithScore: TypedPipe[(UserId, Seq[(ClusterId, Float)])] =
|
||||
producerEmbeddings.map {
|
||||
case (
|
||||
SimClustersEmbeddingId(embeddingType, modelVersion, InternalId.UserId(producerId)),
|
||||
simclusterEmbedding) =>
|
||||
(
|
||||
producerId,
|
||||
simclusterEmbedding.embedding.map { simclusterWithScore =>
|
||||
// The APE dataset has very high producer scores, hence applying log to smooth them out before
|
||||
// computing interestedIn
|
||||
(simclusterWithScore.clusterId, math.log(1.0 + simclusterWithScore.score).toFloat)
|
||||
})
|
||||
}
|
||||
|
||||
val result = keepOnlyTopClusters(
|
||||
getInterestedInDiscardScores(
|
||||
attachNormalizedScores(
|
||||
userClusterPairsWithoutNormalization(
|
||||
userUserGraph,
|
||||
getPrunedEmbeddings(
|
||||
getNormalizedEmbeddings(producerEmbeddingsWithScore),
|
||||
maxClustersFromProducer),
|
||||
socialProofThreshold,
|
||||
))),
|
||||
maxClustersPerUserFinalResult,
|
||||
ModelVersions.toKnownForModelVersion(modelVersion)
|
||||
)
|
||||
result
|
||||
}
|
||||
}
|
Binary file not shown.
@ -1,666 +0,0 @@
|
||||
package com.twitter.simclusters_v2.scalding
|
||||
|
||||
import com.twitter.algebird.Semigroup
|
||||
import com.twitter.bijection.Injection
|
||||
import com.twitter.dal.client.dataset.KeyValDALDataset
|
||||
import com.twitter.scalding.TypedPipe
|
||||
import com.twitter.scalding._
|
||||
import com.twitter.scalding_internal.dalv2.DAL
|
||||
import com.twitter.scalding_internal.dalv2.DALWrite._
|
||||
import com.twitter.scalding_internal.job.TwitterExecutionApp
|
||||
import com.twitter.scalding_internal.job.analytics_batch.AnalyticsBatchExecution
|
||||
import com.twitter.scalding_internal.job.analytics_batch.AnalyticsBatchExecutionArgs
|
||||
import com.twitter.scalding_internal.job.analytics_batch.BatchDescription
|
||||
import com.twitter.scalding_internal.job.analytics_batch.BatchFirstTime
|
||||
import com.twitter.scalding_internal.job.analytics_batch.BatchIncrement
|
||||
import com.twitter.scalding_internal.job.analytics_batch.TwitterScheduledExecutionApp
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
|
||||
import com.twitter.simclusters_v2.common.ClusterId
|
||||
import com.twitter.simclusters_v2.common.ModelVersions
|
||||
import com.twitter.simclusters_v2.common.UserId
|
||||
import com.twitter.simclusters_v2.hdfs_sources._
|
||||
import com.twitter.simclusters_v2.scalding.common.Util
|
||||
import com.twitter.simclusters_v2.thriftscala._
|
||||
|
||||
/**
|
||||
* This file implements the job for computing users' interestedIn vector from KnownFor data set.
|
||||
*
|
||||
* It reads the UserUserNormalizedGraphScalaDataset to get user-user follow + fav graph, and then
|
||||
* based on the known-for clusters of each followed/faved user, we calculate how much a user is
|
||||
* interested in a cluster.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Production job for computing interestedIn data set for the model version 20M145K2020.
|
||||
*
|
||||
* To deploy the job:
|
||||
*
|
||||
* capesospy-v2 update --build_locally --start_cron interested_in_for_20M_145k_2020 \
|
||||
src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml
|
||||
*/
|
||||
object InterestedInFromKnownFor20M145K2020 extends InterestedInFromKnownForBatchBase {
|
||||
override val firstTime: String = "2020-10-06"
|
||||
override val outputKVDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsInterestedIn]] =
|
||||
SimclustersV2RawInterestedIn20M145K2020ScalaDataset
|
||||
override val outputPath: String = InternalDataPaths.RawInterestedIn2020Path
|
||||
override val knownForModelVersion: String = ModelVersions.Model20M145K2020
|
||||
override val knownForDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]] =
|
||||
SimclustersV2KnownFor20M145K2020ScalaDataset
|
||||
}
|
||||
|
||||
/**
|
||||
* Base class for the main logic of computing interestedIn from the KnownFor data set.
|
||||
*/
|
||||
trait InterestedInFromKnownForBatchBase extends TwitterScheduledExecutionApp {
|
||||
implicit val tz = DateOps.UTC
|
||||
implicit val parser = DateParser.default
|
||||
|
||||
def firstTime: String
|
||||
val batchIncrement: Duration = Days(7)
|
||||
val lookBackDays: Duration = Days(30)
|
||||
|
||||
def outputKVDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsInterestedIn]]
|
||||
def outputPath: String
|
||||
def knownForModelVersion: String
|
||||
def knownForDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]]
|
||||
|
||||
private lazy val execArgs = AnalyticsBatchExecutionArgs(
|
||||
batchDesc = BatchDescription(this.getClass.getName.replace("$", "")),
|
||||
firstTime = BatchFirstTime(RichDate(firstTime)),
|
||||
lastTime = None,
|
||||
batchIncrement = BatchIncrement(batchIncrement)
|
||||
)
|
||||
|
||||
override def scheduledJob: Execution[Unit] = AnalyticsBatchExecution(execArgs) {
|
||||
implicit dateRange =>
|
||||
Execution.withId { implicit uniqueId =>
|
||||
Execution.withArgs { args =>
|
||||
val normalizedGraph =
|
||||
DAL.readMostRecentSnapshot(UserUserNormalizedGraphScalaDataset).toTypedPipe
|
||||
val knownFor = KnownForSources.fromKeyVal(
|
||||
DAL.readMostRecentSnapshot(knownForDALDataset, dateRange.extend(Days(30))).toTypedPipe,
|
||||
knownForModelVersion
|
||||
)
|
||||
|
||||
val socialProofThreshold = args.int("socialProofThreshold", 2)
|
||||
val maxClustersPerUser = args.int("maxClustersPerUser", 50)
|
||||
|
||||
val result = InterestedInFromKnownFor
|
||||
.run(
|
||||
normalizedGraph,
|
||||
knownFor,
|
||||
socialProofThreshold,
|
||||
maxClustersPerUser,
|
||||
knownForModelVersion
|
||||
)
|
||||
|
||||
val writeKeyValResultExec = result
|
||||
.map { case (userId, clusters) => KeyVal(userId, clusters) }
|
||||
.writeDALVersionedKeyValExecution(
|
||||
outputKVDataset,
|
||||
D.Suffix(outputPath)
|
||||
)
|
||||
|
||||
// read previous data set for validation purpose
|
||||
val previousDataset = if (RichDate(firstTime).timestamp != dateRange.start.timestamp) {
|
||||
DAL
|
||||
.readMostRecentSnapshot(outputKVDataset, dateRange.prepend(lookBackDays)).toTypedPipe
|
||||
.map {
|
||||
case KeyVal(user, interestedIn) =>
|
||||
(user, interestedIn)
|
||||
}
|
||||
} else {
|
||||
TypedPipe.empty
|
||||
}
|
||||
|
||||
Util.printCounters(
|
||||
Execution
|
||||
.zip(
|
||||
writeKeyValResultExec,
|
||||
InterestedInFromKnownFor.dataSetStats(result, "NewResult"),
|
||||
InterestedInFromKnownFor.dataSetStats(previousDataset, "OldResult")
|
||||
).unit
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Adhoc job to compute user interestedIn.
|
||||
*
|
||||
* scalding remote run --target src/scala/com/twitter/simclusters_v2/scalding:interested_in_adhoc \
|
||||
* --user recos-platform \
|
||||
* --submitter hadoopnest2.atla.twitter.com \
|
||||
* --main-class com.twitter.simclusters_v2.scalding.InterestedInFromKnownForAdhoc -- \
|
||||
* --date 2019-08-26 --outputDir /user/recos-platform/adhoc/simclusters_interested_in_log_fav
|
||||
*/
|
||||
object InterestedInFromKnownForAdhoc extends TwitterExecutionApp {
|
||||
def job: Execution[Unit] =
|
||||
Execution.getConfigMode.flatMap {
|
||||
case (config, mode) =>
|
||||
Execution.withId { implicit uniqueId =>
|
||||
val args = config.getArgs
|
||||
val normalizedGraph = TypedPipe.from(
|
||||
UserAndNeighborsFixedPathSource(args("graphInputDir"))
|
||||
)
|
||||
val socialProofThreshold = args.int("socialProofThreshold", 2)
|
||||
val maxClustersPerUser = args.int("maxClustersPerUser", 20)
|
||||
val knownForModelVersion = args("knownForModelVersion")
|
||||
val knownFor = KnownForSources.readKnownFor(args("knownForInputDir"))
|
||||
|
||||
val outputSink = AdhocKeyValSources.interestedInSource(args("outputDir"))
|
||||
Util.printCounters(
|
||||
InterestedInFromKnownFor
|
||||
.run(
|
||||
normalizedGraph,
|
||||
knownFor,
|
||||
socialProofThreshold,
|
||||
maxClustersPerUser,
|
||||
knownForModelVersion
|
||||
).writeExecution(outputSink)
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Adhoc job to check the output of an adhoc interestedInSource.
|
||||
*/
|
||||
object DumpInterestedInAdhoc extends TwitterExecutionApp {
|
||||
def job: Execution[Unit] =
|
||||
Execution.getConfigMode.flatMap {
|
||||
case (config, mode) =>
|
||||
Execution.withId { implicit uniqueId =>
|
||||
val args = config.getArgs
|
||||
val users = args.list("users").map(_.toLong).toSet
|
||||
val input = TypedPipe.from(AdhocKeyValSources.interestedInSource(args("inputDir")))
|
||||
input.filter { case (userId, rec) => users.contains(userId) }.toIterableExecution.map {
|
||||
s => println(s.map(Util.prettyJsonMapper.writeValueAsString).mkString("\n"))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper functions
|
||||
*/
|
||||
object InterestedInFromKnownFor {
|
||||
private def ifNanMake0(x: Double): Double = if (x.isNaN) 0.0 else x
|
||||
|
||||
case class SrcClusterIntermediateInfo(
|
||||
followScore: Double,
|
||||
followScoreProducerNormalized: Double,
|
||||
favScore: Double,
|
||||
favScoreProducerNormalized: Double,
|
||||
logFavScore: Double,
|
||||
logFavScoreProducerNormalized: Double,
|
||||
followSocialProof: List[Long],
|
||||
favSocialProof: List[Long]) {
|
||||
// overriding for the sake of unit tests
|
||||
override def equals(obj: scala.Any): Boolean = {
|
||||
obj match {
|
||||
case that: SrcClusterIntermediateInfo =>
|
||||
math.abs(followScore - that.followScore) < 1e-5 &&
|
||||
math.abs(followScoreProducerNormalized - that.followScoreProducerNormalized) < 1e-5 &&
|
||||
math.abs(favScore - that.favScore) < 1e-5 &&
|
||||
math.abs(favScoreProducerNormalized - that.favScoreProducerNormalized) < 1e-5 &&
|
||||
math.abs(logFavScore - that.logFavScore) < 1e-5 &&
|
||||
math.abs(logFavScoreProducerNormalized - that.logFavScoreProducerNormalized) < 1e-5 &&
|
||||
followSocialProof.toSet == that.followSocialProof.toSet &&
|
||||
favSocialProof.toSet == that.favSocialProof.toSet
|
||||
case _ => false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
implicit object SrcClusterIntermediateInfoSemigroup
|
||||
extends Semigroup[SrcClusterIntermediateInfo] {
|
||||
override def plus(
|
||||
left: SrcClusterIntermediateInfo,
|
||||
right: SrcClusterIntermediateInfo
|
||||
): SrcClusterIntermediateInfo = {
|
||||
SrcClusterIntermediateInfo(
|
||||
followScore = left.followScore + right.followScore,
|
||||
followScoreProducerNormalized =
|
||||
left.followScoreProducerNormalized + right.followScoreProducerNormalized,
|
||||
favScore = left.favScore + right.favScore,
|
||||
favScoreProducerNormalized =
|
||||
left.favScoreProducerNormalized + right.favScoreProducerNormalized,
|
||||
logFavScore = left.logFavScore + right.logFavScore,
|
||||
logFavScoreProducerNormalized =
|
||||
left.logFavScoreProducerNormalized + right.logFavScoreProducerNormalized,
|
||||
followSocialProof =
|
||||
Semigroup.plus(left.followSocialProof, right.followSocialProof).distinct,
|
||||
favSocialProof = Semigroup.plus(left.favSocialProof, right.favSocialProof).distinct
|
||||
)
|
||||
}
|
||||
}
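  // A minimal sketch (hypothetical values) of how two contributions to the same
  // (user, cluster) pair combine under the semigroup above: the numeric scores add up,
  // and the social-proof lists are concatenated and de-duplicated, giving
  // followScore = 2.0, favScore = 3.0, followSocialProof = List(10L, 11L), favSocialProof = List(10L).
  def combineExample(): SrcClusterIntermediateInfo =
    SrcClusterIntermediateInfoSemigroup.plus(
      SrcClusterIntermediateInfo(1.0, 0.5, 2.0, 1.0, 0.7, 0.3, List(10L), List(10L)),
      SrcClusterIntermediateInfo(1.0, 0.5, 1.0, 0.5, 0.3, 0.2, List(11L), List(10L))
    )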
|
||||
|
||||
/**
|
||||
* @param adjacencyLists User-User follow/fav graph
|
||||
* @param knownFor KnownFor data set. Each user can be known for several clusters with certain
|
||||
* knownFor weights.
|
||||
* @param socialProofThreshold A user will only be interested in a cluster if they follow/fav at
|
||||
* least a certain number of users known for this cluster.
|
||||
* @param uniqueId required for the Stat counters
|
||||
* @return
|
||||
*/
|
||||
def userClusterPairsWithoutNormalization(
|
||||
adjacencyLists: TypedPipe[UserAndNeighbors],
|
||||
knownFor: TypedPipe[(Long, Array[(Int, Float)])],
|
||||
socialProofThreshold: Int
|
||||
)(
|
||||
implicit uniqueId: UniqueID
|
||||
): TypedPipe[((Long, Int), SrcClusterIntermediateInfo)] = {
|
||||
val edgesToUsersWithKnownFor = Stat("num_edges_to_users_with_known_for")
|
||||
val srcDestClusterTriples = Stat("num_src_dest_cluster_triples")
|
||||
val srcClusterPairsBeforeSocialProofThresholding =
|
||||
Stat("num_src_cluster_pairs_before_social_proof_thresholding")
|
||||
val srcClusterPairsAfterSocialProofThresholding =
|
||||
Stat("num_src_cluster_pairs_after_social_proof_thresholding")
|
||||
|
||||
val edges = adjacencyLists.flatMap {
|
||||
case UserAndNeighbors(srcId, neighborsWithWeights) =>
|
||||
neighborsWithWeights.map { neighborWithWeights =>
|
||||
(
|
||||
neighborWithWeights.neighborId,
|
||||
neighborWithWeights.copy(neighborId = srcId)
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
implicit val l2b: Long => Array[Byte] = Injection.long2BigEndian
|
||||
|
||||
edges
|
||||
.sketch(4000)
|
||||
.join(knownFor)
|
||||
.flatMap {
|
||||
case (destId, (srcWithWeights, clusterArray)) =>
|
||||
edgesToUsersWithKnownFor.inc()
|
||||
clusterArray.toList.map {
|
||||
case (clusterId, knownForScoreF) =>
|
||||
val knownForScore = math.max(0.0, knownForScoreF.toDouble)
|
||||
|
||||
srcDestClusterTriples.inc()
|
||||
val followScore =
|
||||
if (srcWithWeights.isFollowed.contains(true)) knownForScore else 0.0
|
||||
val followScoreProducerNormalizedOnly =
|
||||
srcWithWeights.followScoreNormalizedByNeighborFollowersL2.getOrElse(
|
||||
0.0) * knownForScore
|
||||
val favScore =
|
||||
srcWithWeights.favScoreHalfLife100Days.getOrElse(0.0) * knownForScore
|
||||
|
||||
val favScoreProducerNormalizedOnly =
|
||||
srcWithWeights.favScoreHalfLife100DaysNormalizedByNeighborFaversL2.getOrElse(
|
||||
0.0) * knownForScore
|
||||
|
||||
val logFavScore = srcWithWeights.logFavScore.getOrElse(0.0) * knownForScore
|
||||
|
||||
val logFavScoreProducerNormalizedOnly = srcWithWeights.logFavScoreL2Normalized
|
||||
.getOrElse(0.0) * knownForScore
|
||||
|
||||
val followSocialProof = if (srcWithWeights.isFollowed.contains(true)) {
|
||||
List(destId)
|
||||
} else Nil
|
||||
val favSocialProof = if (srcWithWeights.favScoreHalfLife100Days.exists(_ > 0)) {
|
||||
List(destId)
|
||||
} else Nil
|
||||
|
||||
(
|
||||
(srcWithWeights.neighborId, clusterId),
|
||||
SrcClusterIntermediateInfo(
|
||||
followScore,
|
||||
followScoreProducerNormalizedOnly,
|
||||
favScore,
|
||||
favScoreProducerNormalizedOnly,
|
||||
logFavScore,
|
||||
logFavScoreProducerNormalizedOnly,
|
||||
followSocialProof,
|
||||
favSocialProof
|
||||
)
|
||||
)
|
||||
}
|
||||
}
|
||||
.sumByKey
|
||||
.withReducers(10000)
|
||||
.filter {
|
||||
case ((_, _), SrcClusterIntermediateInfo(_, _, _, _, _, _, followProof, favProof)) =>
|
||||
srcClusterPairsBeforeSocialProofThresholding.inc()
|
||||
val distinctSocialProof = (followProof ++ favProof).toSet
|
||||
val result = distinctSocialProof.size >= socialProofThreshold
|
||||
if (result) {
|
||||
srcClusterPairsAfterSocialProofThresholding.inc()
|
||||
}
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
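  /*
   * Illustrative note (added; not in the original source): the filter above keeps a
   * (user, cluster) pair only when the user has engaged with enough distinct producers known
   * for that cluster, because the follow and fav proofs are unioned before counting. With the
   * default socialProofThreshold of 2 used by the jobs in this package, a single producer who is
   * both followed and faved is not enough:
   *
   *   val followProof = List(10L)                                // user follows producer 10
   *   val favProof = List(10L)                                   // and also faved producer 10
   *   val distinctSocialProof = (followProof ++ favProof).toSet  // Set(10L), size 1
   *   // 1 < socialProofThreshold (2)  =>  the (user, cluster) pair is dropped
   */
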
  /**
   * Add the cluster-level l2 norm scores, and use them to normalize follow/fav scores.
   */
  def attachNormalizedScores(
    intermediate: TypedPipe[((Long, Int), SrcClusterIntermediateInfo)]
  )(
    implicit uniqueId: UniqueID
  ): TypedPipe[(Long, List[(Int, UserToInterestedInClusterScores)])] = {

    def square(x: Double): Double = x * x

    val clusterCountsAndNorms =
      intermediate
        .map {
          case (
                (_, clusterId),
                SrcClusterIntermediateInfo(
                  followScore,
                  followScoreProducerNormalizedOnly,
                  favScore,
                  favScoreProducerNormalizedOnly,
                  logFavScore,
                  logFavScoreProducerNormalizedOnly,
                  _,
                  _
                )
              ) =>
            (
              clusterId,
              (
                1,
                square(followScore),
                square(followScoreProducerNormalizedOnly),
                square(favScore),
                square(favScoreProducerNormalizedOnly),
                square(logFavScore),
                square(logFavScoreProducerNormalizedOnly)
              )
            )
        }
        .sumByKey
        // .withReducers(100)
        .map {
          case (
                clusterId,
                (
                  cnt,
                  squareFollowScore,
                  squareFollowScoreProducerNormalizedOnly,
                  squareFavScore,
                  squareFavScoreProducerNormalizedOnly,
                  squareLogFavScore,
                  squareLogFavScoreProducerNormalizedOnly
                )) =>
            (
              clusterId,
              (
                cnt,
                math.sqrt(squareFollowScore),
                math.sqrt(squareFollowScoreProducerNormalizedOnly),
                math.sqrt(squareFavScore),
                math.sqrt(squareFavScoreProducerNormalizedOnly),
                math.sqrt(squareLogFavScore),
                math.sqrt(squareLogFavScoreProducerNormalizedOnly)
              ))
        }

    implicit val i2b: Int => Array[Byte] = Injection.int2BigEndian

    intermediate
      .map {
        case ((srcId, clusterId), clusterScoresTuple) =>
          (clusterId, (srcId, clusterScoresTuple))
      }
      .sketch(reducers = 900)
      .join(clusterCountsAndNorms)
      .map {
        case (
              clusterId,
              (
                (
                  srcId,
                  SrcClusterIntermediateInfo(
                    followScore,
                    followScoreProducerNormalizedOnly,
                    favScore,
                    favScoreProducerNormalizedOnly,
                    logFavScore,
                    logFavScoreProducerNormalizedOnly, // not used for now
                    followProof,
                    favProof
                  )
                ),
                (
                  cnt,
                  followNorm,
                  followProducerNormalizedNorm,
                  favNorm,
                  favProducerNormalizedNorm,
                  logFavNorm,
                  logFavProducerNormalizedNorm // not used for now
                )
              )
            ) =>
          (
            srcId,
            List(
              (
                clusterId,
                UserToInterestedInClusterScores(
                  followScore = Some(ifNanMake0(followScore)),
                  followScoreClusterNormalizedOnly = Some(ifNanMake0(followScore / followNorm)),
                  followScoreProducerNormalizedOnly =
                    Some(ifNanMake0(followScoreProducerNormalizedOnly)),
                  followScoreClusterAndProducerNormalized = Some(
                    ifNanMake0(followScoreProducerNormalizedOnly / followProducerNormalizedNorm)),
                  favScore = Some(ifNanMake0(favScore)),
                  favScoreClusterNormalizedOnly = Some(ifNanMake0(favScore / favNorm)),
                  favScoreProducerNormalizedOnly = Some(ifNanMake0(favScoreProducerNormalizedOnly)),
                  favScoreClusterAndProducerNormalized =
                    Some(ifNanMake0(favScoreProducerNormalizedOnly / favProducerNormalizedNorm)),
                  usersBeingFollowed = Some(followProof),
                  usersThatWereFaved = Some(favProof),
                  numUsersInterestedInThisClusterUpperBound = Some(cnt),
                  logFavScore = Some(ifNanMake0(logFavScore)),
                  logFavScoreClusterNormalizedOnly = Some(ifNanMake0(logFavScore / logFavNorm))
                ))
            )
          )
      }
      .sumByKey
      // .withReducers(1000)
      .toTypedPipe
  }

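  /*
   * Worked example of the cluster-level normalization above (added; the numbers are invented).
   * If a cluster has exactly two interested users whose raw favScores are 3.0 and 4.0, the
   * cluster's favNorm is sqrt(3.0^2 + 4.0^2) = 5.0, so their favScoreClusterNormalizedOnly values
   * become 0.6 and 0.8. When a norm is 0.0 the division yields NaN, and ifNanMake0 maps it back
   * to 0.0.
   *
   *   val favNorm = math.sqrt(3.0 * 3.0 + 4.0 * 4.0)   // 5.0
   *   val normalized = Seq(3.0, 4.0).map(_ / favNorm)  // Seq(0.6, 0.8)
   *   val safe = ifNanMake0(0.0 / 0.0)                 // 0.0
   */
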
  /**
   * Aggregate cluster scores for each user. This is used instead of attachNormalizedScores
   * when we do not want to compute cluster-level l2 norm scores.
   */
  def groupClusterScores(
    intermediate: TypedPipe[((Long, Int), SrcClusterIntermediateInfo)]
  )(
    implicit uniqueId: UniqueID
  ): TypedPipe[(Long, List[(Int, UserToInterestedInClusterScores)])] = {

    intermediate
      .map {
        case (
              (srcId, clusterId),
              SrcClusterIntermediateInfo(
                followScore,
                followScoreProducerNormalizedOnly,
                favScore,
                favScoreProducerNormalizedOnly,
                logFavScore,
                logFavScoreProducerNormalizedOnly,
                followProof,
                favProof
              )
            ) =>
          (
            srcId,
            List(
              (
                clusterId,
                UserToInterestedInClusterScores(
                  followScore = Some(ifNanMake0(followScore)),
                  followScoreProducerNormalizedOnly =
                    Some(ifNanMake0(followScoreProducerNormalizedOnly)),
                  favScore = Some(ifNanMake0(favScore)),
                  favScoreProducerNormalizedOnly = Some(ifNanMake0(favScoreProducerNormalizedOnly)),
                  usersBeingFollowed = Some(followProof),
                  usersThatWereFaved = Some(favProof),
                  logFavScore = Some(ifNanMake0(logFavScore))
                ))
            )
          )
      }
      .sumByKey
      .withReducers(1000)
      .toTypedPipe
  }

  /**
   * For each user, only keep up to a certain number of clusters.
   * @param allInterests User with a list of interestedIn clusters.
   * @param maxClustersPerUser Number of clusters to keep for each user.
   * @param knownForModelVersion KnownFor model version.
   * @param uniqueId required for the Stat counters
   * @return
   */
  def keepOnlyTopClusters(
    allInterests: TypedPipe[(Long, List[(Int, UserToInterestedInClusterScores)])],
    maxClustersPerUser: Int,
    knownForModelVersion: String
  )(
    implicit uniqueId: UniqueID
  ): TypedPipe[(Long, ClustersUserIsInterestedIn)] = {
    val userClusterPairsBeforeUserTruncation =
      Stat("num_user_cluster_pairs_before_user_truncation")
    val userClusterPairsAfterUserTruncation =
      Stat("num_user_cluster_pairs_after_user_truncation")
    val usersWithALotOfClusters =
      Stat(s"num_users_with_more_than_${maxClustersPerUser}_clusters")

    allInterests
      .map {
        case (srcId, fullClusterList) =>
          userClusterPairsBeforeUserTruncation.incBy(fullClusterList.size)
          val truncatedClusters = if (fullClusterList.size > maxClustersPerUser) {
            usersWithALotOfClusters.inc()
            fullClusterList
              .sortBy {
                case (_, clusterScores) =>
                  (
                    -clusterScores.favScore.getOrElse(0.0),
                    -clusterScores.logFavScore.getOrElse(0.0),
                    -clusterScores.followScore.getOrElse(0.0),
                    -clusterScores.logFavScoreClusterNormalizedOnly.getOrElse(0.0),
                    -clusterScores.followScoreProducerNormalizedOnly.getOrElse(0.0)
                  )
              }
              .take(maxClustersPerUser)
          } else {
            fullClusterList
          }
          userClusterPairsAfterUserTruncation.incBy(truncatedClusters.size)
          (srcId, ClustersUserIsInterestedIn(knownForModelVersion, truncatedClusters.toMap))
      }
  }

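  /*
   * Illustrative note on the truncation above (added; example values are invented): clusters are
   * ranked primarily by favScore, with logFavScore, followScore and the normalized scores only
   * breaking ties, and the top maxClustersPerUser entries are kept.
   *
   *   // Hypothetical (clusterId, favScore) pairs with maxClustersPerUser = 2:
   *   val scored = List((101, 5.0), (102, 7.0), (103, 6.0))
   *   val kept = scored.sortBy { case (_, favScore) => -favScore }.take(2)
   *   // kept == List((102, 7.0), (103, 6.0)); cluster 101 is dropped
   */
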
  def run(
    adjacencyLists: TypedPipe[UserAndNeighbors],
    knownFor: TypedPipe[(UserId, Array[(ClusterId, Float)])],
    socialProofThreshold: Int,
    maxClustersPerUser: Int,
    knownForModelVersion: String
  )(
    implicit uniqueId: UniqueID
  ): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
    keepOnlyTopClusters(
      attachNormalizedScores(
        userClusterPairsWithoutNormalization(
          adjacencyLists,
          knownFor,
          socialProofThreshold
        )
      ),
      maxClustersPerUser,
      knownForModelVersion
    )
  }

  /**
   * Run the interestedIn job; cluster-normalized scores are not attached to the users' clusters.
   */
  def runWithoutClusterNormalizedScores(
    adjacencyLists: TypedPipe[UserAndNeighbors],
    knownFor: TypedPipe[(UserId, Array[(ClusterId, Float)])],
    socialProofThreshold: Int,
    maxClustersPerUser: Int,
    knownForModelVersion: String
  )(
    implicit uniqueId: UniqueID
  ): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
    keepOnlyTopClusters(
      groupClusterScores(
        userClusterPairsWithoutNormalization(
          adjacencyLists,
          knownFor,
          socialProofThreshold
        )
      ),
      maxClustersPerUser,
      knownForModelVersion
    )
  }

  /**
   * Print out some basic stats of the data set to make sure things are not broken.
   */
  def dataSetStats(
    interestedInData: TypedPipe[(UserId, ClustersUserIsInterestedIn)],
    dataSetName: String = ""
  ): Execution[Unit] = {

    Execution
      .zip(
        Util.printSummaryOfNumericColumn(
          interestedInData.map {
            case (user, interestedIn) =>
              interestedIn.clusterIdToScores.size
          },
          Some(s"$dataSetName UserInterestedIn Size")
        ),
        Util.printSummaryOfNumericColumn(
          interestedInData.flatMap {
            case (user, interestedIn) =>
              interestedIn.clusterIdToScores.map {
                case (_, scores) =>
                  scores.favScore.getOrElse(0.0)
              }
          },
          Some(s"$dataSetName UserInterestedIn favScore")
        ),
        Util.printSummaryOfNumericColumn(
          interestedInData.flatMap {
            case (user, interestedIn) =>
              interestedIn.clusterIdToScores.map {
                case (_, scores) =>
                  scores.favScoreClusterNormalizedOnly.getOrElse(0.0)
              }
          },
          Some(s"$dataSetName UserInterestedIn favScoreClusterNormalizedOnly")
        ),
        Util.printSummaryOfNumericColumn(
          interestedInData.flatMap {
            case (user, interestedIn) =>
              interestedIn.clusterIdToScores.map {
                case (_, scores) =>
                  scores.logFavScoreClusterNormalizedOnly.getOrElse(0.0)
              }
          },
          Some(s"$dataSetName UserInterestedIn logFavScoreClusterNormalizedOnly")
        )
      ).unit
  }
}
Binary file not shown.
@ -1,354 +0,0 @@
package com.twitter.simclusters_v2.scalding

import com.twitter.algebird.Semigroup
import com.twitter.bijection.Injection
import com.twitter.dal.client.dataset.KeyValDALDataset
import com.twitter.scalding._
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.DALWrite.{D, WriteExtension}
import com.twitter.scalding_internal.job.TwitterExecutionApp
import com.twitter.scalding_internal.job.analytics_batch.{
  AnalyticsBatchExecution,
  AnalyticsBatchExecutionArgs,
  BatchDescription,
  BatchFirstTime,
  BatchIncrement,
  TwitterScheduledExecutionApp
}
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.simclusters_v2.common.{ClusterId, ModelVersions, UserId}
import com.twitter.simclusters_v2.hdfs_sources.{
  AdhocKeyValSources,
  InternalDataPaths,
  SimclustersV2KnownFor20M145K2020ScalaDataset,
  SimclustersV2RawInterestedInLite20M145K2020ScalaDataset,
  SimclustersV2RawInterestedIn20M145KUpdatedScalaDataset,
  UserAndNeighborsFixedPathSource,
  UserUserGraphScalaDataset
}
import com.twitter.simclusters_v2.scalding.common.Util
import com.twitter.simclusters_v2.thriftscala.{
  ClustersUserIsInterestedIn,
  ClustersUserIsKnownFor,
  UserAndNeighbors,
  UserToInterestedInClusterScores
}
import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp
import java.util.TimeZone

/**
 * This file implements the job for computing users' interestedIn vector from the KnownFor data set.
 *
 * It reads the UserUserGraphScalaDataset to get the user-user follow + fav graph, and then,
 * based on the known-for clusters of each followed/faved user, we calculate how much a user is
 * interestedIn a cluster.
 *
 * The main differences of InterestedInFromKnownForLite compared to InterestedInFromKnownFor are
 * the following:
 * - We read the UserUserGraph dataset, which does not contain the producer normalized scores
 * - We do not compute the cluster normalized scores for the clusters per user
 * - For social proof thresholding, we do not keep track of the entire list of follow and
 *   fav social proofs but rather make use of numFollowSocial and numFavSocial (this introduces
 *   some noise if the follow and fav social proofs contain the same users)
 * - Store 200 clusters per user, compared to 50 in IIKF
 * - Runs more frequently, compared to weekly in IIKF
 */
/**
 * Production job for computing the interestedIn data set for the model version 20M145K2020.
 *
 * To deploy the job:
 *
 * capesospy-v2 update --build_locally --start_cron interested_in_lite_for_20M_145k_2020 \
 *   src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml
 */
object InterestedInFromKnownForLite20M145K2020 extends InterestedInFromKnownForLite {
  override val firstTime: String = "2021-04-24"
  override val outputKVDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsInterestedIn]] =
    SimclustersV2RawInterestedInLite20M145K2020ScalaDataset
  override val outputPath: String = InternalDataPaths.RawInterestedInLite2020Path
  override val knownForModelVersion: String = ModelVersions.Model20M145K2020
  override val knownForDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]] =
    SimclustersV2KnownFor20M145K2020ScalaDataset
}

trait InterestedInFromKnownForLite extends TwitterScheduledExecutionApp {
  implicit val tz = DateOps.UTC
  implicit val parser = DateParser.default

  def firstTime: String
  val batchIncrement: Duration = Days(2)
  val lookBackDays: Duration = Days(30)

  def outputKVDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsInterestedIn]]
  def outputPath: String
  def knownForModelVersion: String
  def knownForDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]]

  private lazy val execArgs = AnalyticsBatchExecutionArgs(
    batchDesc = BatchDescription(this.getClass.getName.replace("$", "")),
    firstTime = BatchFirstTime(RichDate(firstTime)),
    lastTime = None,
    batchIncrement = BatchIncrement(batchIncrement)
  )

  override def scheduledJob: Execution[Unit] = AnalyticsBatchExecution(execArgs) {
    implicit dateRange =>
      Execution.withId { implicit uniqueId =>
        Execution.withArgs { args =>
          val userUserGraph =
            DAL.readMostRecentSnapshot(UserUserGraphScalaDataset).toTypedPipe
          val knownFor = KnownForSources.fromKeyVal(
            DAL.readMostRecentSnapshot(knownForDALDataset, dateRange.extend(Days(30))).toTypedPipe,
            knownForModelVersion
          )

          val socialProofThreshold = args.int("socialProofThreshold", 2)
          val maxClustersPerUser = args.int("maxClustersPerUser", 200)

          val result = InterestedInFromKnownForLite
            .run(
              userUserGraph,
              knownFor,
              socialProofThreshold,
              maxClustersPerUser,
              knownForModelVersion
            )

          val writeKeyValResultExec = result
            .map {
              case (userId, clusters) => KeyVal(userId, clusters)
            }.writeDALVersionedKeyValExecution(
              outputKVDataset,
              D.Suffix(outputPath)
            )
          Util.printCounters(writeKeyValResultExec)
        }
      }
  }
}

/**
 * Adhoc job to compute user interestedIn.
 *
 * scalding remote run \
 *   --target src/scala/com/twitter/simclusters_v2/scalding:interested_in_lite_20m_145k_2020-adhoc \
 *   --main-class com.twitter.simclusters_v2.scalding.InterestedInFromKnownForLite20M145K2020Adhoc \
 *   --user cassowary --cluster bluebird-qus1 \
 *   --keytab /var/lib/tss/keys/fluffy/keytabs/client/cassowary.keytab \
 *   --principal service_account@TWITTER.BIZ \
 *   -- \
 *   --outputDir /gcs/user/cassowary/adhoc/interested_in_from_knownfor_lite/ \
 *   --date 2020-08-25
 */
object InterestedInFromKnownForLite20M145K2020Adhoc extends AdhocExecutionApp {
  override def runOnDateRange(
    args: Args
  )(
    implicit dateRange: DateRange,
    timeZone: TimeZone,
    uniqueID: UniqueID
  ): Execution[Unit] = {
    val userUserGraph = DAL.readMostRecentSnapshot(UserUserGraphScalaDataset).toTypedPipe
    val socialProofThreshold = args.int("socialProofThreshold", 2)
    val maxClustersPerUser = args.int("maxClustersPerUser", 200)
    val knownForModelVersion = ModelVersions.Model20M145K2020
    val knownFor = KnownForSources.fromKeyVal(
      DAL
        .readMostRecentSnapshotNoOlderThan(
          SimclustersV2KnownFor20M145K2020ScalaDataset,
          Days(30)).toTypedPipe,
      knownForModelVersion
    )

    val outputSink = AdhocKeyValSources.interestedInSource(args("outputDir"))
    Util.printCounters(
      InterestedInFromKnownForLite
        .run(
          userUserGraph,
          knownFor,
          socialProofThreshold,
          maxClustersPerUser,
          knownForModelVersion
        ).writeExecution(outputSink)
    )
  }

}

object InterestedInFromKnownForLite {
  private def ifNanMake0(x: Double): Double = if (x.isNaN) 0.0 else x

  case class SrcClusterIntermediateInfo(
    followScore: Double,
    favScore: Double,
    logFavScore: Double,
    numFollowed: Int,
    numFaved: Int) {

    // Helper override used by the test cases: the double-valued scores are compared with a small
    // tolerance instead of exact equality.
    override def equals(obj: scala.Any): Boolean = {
      obj match {
        case that: SrcClusterIntermediateInfo =>
          math.abs(followScore - that.followScore) < 1e-5 &&
            math.abs(favScore - that.favScore) < 1e-5 &&
            math.abs(logFavScore - that.logFavScore) < 1e-5 &&
            numFollowed == that.numFollowed &&
            numFaved == that.numFaved
        case _ => false
      }
    }
  }

  implicit object SrcClusterIntermediateInfoSemigroup
      extends Semigroup[SrcClusterIntermediateInfo] {
    override def plus(
      left: SrcClusterIntermediateInfo,
      right: SrcClusterIntermediateInfo
    ): SrcClusterIntermediateInfo = {
      SrcClusterIntermediateInfo(
        followScore = left.followScore + right.followScore,
        favScore = left.favScore + right.favScore,
        logFavScore = left.logFavScore + right.logFavScore,
        numFollowed = left.numFollowed + right.numFollowed,
        numFaved = left.numFaved + right.numFaved
      )
    }
  }

  def run(
    adjacencyLists: TypedPipe[UserAndNeighbors],
    knownFor: TypedPipe[(UserId, Array[(ClusterId, Float)])],
    socialProofThreshold: Int,
    maxClustersPerUser: Int,
    knownForModelVersion: String
  )(
    implicit uniqueId: UniqueID
  ): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
    InterestedInFromKnownFor.keepOnlyTopClusters(
      groupClusterScores(
        userClusterPairs(
          adjacencyLists,
          knownFor,
          socialProofThreshold
        )
      ),
      maxClustersPerUser,
      knownForModelVersion
    )
  }

  def userClusterPairs(
    adjacencyLists: TypedPipe[UserAndNeighbors],
    knownFor: TypedPipe[(Long, Array[(Int, Float)])],
    socialProofThreshold: Int
  )(
    implicit uniqueId: UniqueID
  ): TypedPipe[((Long, Int), SrcClusterIntermediateInfo)] = {
    val edgesToUsersWithKnownFor = Stat("num_edges_to_users_with_known_for")
    val srcDestClusterTriples = Stat("num_src_dest_cluster_triples")
    val srcClusterPairsBeforeSocialProofThresholding =
      Stat("num_src_cluster_pairs_before_social_proof_thresholding")
    val srcClusterPairsAfterSocialProofThresholding =
      Stat("num_src_cluster_pairs_after_social_proof_thresholding")

    val edges = adjacencyLists.flatMap {
      case UserAndNeighbors(srcId, neighborsWithWeights) =>
        neighborsWithWeights.map { neighborWithWeights =>
          (
            neighborWithWeights.neighborId,
            neighborWithWeights.copy(neighborId = srcId)
          )
        }
    }

    implicit val l2b: Long => Array[Byte] = Injection.long2BigEndian

    edges
      .sketch(4000)
      .join(knownFor)
      .flatMap {
        case (destId, (srcWithWeights, clusterArray)) =>
          edgesToUsersWithKnownFor.inc()
          clusterArray.toList.map {
            case (clusterId, knownForScoreF) =>
              val knownForScore = math.max(0.0, knownForScoreF.toDouble)

              srcDestClusterTriples.inc()
              val followScore =
                if (srcWithWeights.isFollowed.contains(true)) knownForScore else 0.0
              val favScore =
                srcWithWeights.favScoreHalfLife100Days.getOrElse(0.0) * knownForScore
              val logFavScore = srcWithWeights.logFavScore.getOrElse(0.0) * knownForScore
              val numFollowed = if (srcWithWeights.isFollowed.contains(true)) {
                1
              } else 0

              val numFaved = if (srcWithWeights.favScoreHalfLife100Days.exists(_ > 0)) {
                1
              } else 0

              (
                (srcWithWeights.neighborId, clusterId),
                SrcClusterIntermediateInfo(
                  followScore,
                  favScore,
                  logFavScore,
                  numFollowed,
                  numFaved
                )
              )
          }
      }
      .sumByKey
      .withReducers(10000)
      .filter {
        case ((_, _), SrcClusterIntermediateInfo(_, _, _, numFollowed, numFaved)) =>
          srcClusterPairsBeforeSocialProofThresholding.inc()
          // We do not remove duplicates here, so a producer who is both followed and faved
          // counts twice towards the social proof.
          val socialProofSize = numFollowed + numFaved
          val result = socialProofSize >= socialProofThreshold
          if (result) {
            srcClusterPairsAfterSocialProofThresholding.inc()
          }
          result
      }
  }

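  /*
   * Illustrative comparison (added; not in the original source): the Lite thresholding above
   * simply adds the counts, so a single producer who is both followed and faved already
   * contributes a social proof of 2, whereas InterestedInFromKnownFor unions the follow and fav
   * proof lists and would count that same producer only once.
   *
   *   val liteSocialProof = 1 + 1                                    // numFollowed + numFaved = 2
   *   val nonLiteSocialProof = (List(10L) ++ List(10L)).toSet.size   // distinct producers = 1
   *   // With socialProofThreshold = 2, Lite keeps the pair while the non-Lite job drops it.
   */
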
  def groupClusterScores(
    intermediate: TypedPipe[((Long, Int), SrcClusterIntermediateInfo)]
  )(
    implicit uniqueId: UniqueID
  ): TypedPipe[(Long, List[(Int, UserToInterestedInClusterScores)])] = {

    implicit val i2b: Int => Array[Byte] = Injection.int2BigEndian

    intermediate
      .map {
        case (
              (srcId, clusterId),
              SrcClusterIntermediateInfo(
                followScore,
                favScore,
                logFavScore,
                numFollowed,
                numFaved
              )) =>
          (
            srcId,
            List(
              (
                clusterId,
                UserToInterestedInClusterScores(
                  followScore = Some(ifNanMake0(followScore)),
                  favScore = Some(ifNanMake0(favScore)),
                  logFavScore = Some(ifNanMake0(logFavScore)),
                  numUsersBeingFollowed = Some(numFollowed),
                  numUsersThatWereFaved = Some(numFaved)
                ))
            )
          )
      }
      .sumByKey
      // .withReducers(1000)
      .toTypedPipe
  }
}
Binary file not shown.
@ -1,290 +0,0 @@
package com.twitter.simclusters_v2.scalding

import com.twitter.dal.client.dataset.KeyValDALDataset
import com.twitter.scalding.Execution
import com.twitter.scalding.TypedTsv
import com.twitter.scalding._
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.DALWrite._
import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation
import com.twitter.scalding_internal.dalv2.remote_access.ProcAtla
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.simclusters_v2.common.ModelVersions
import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.hdfs_sources.ProducerEmbeddingSources
import com.twitter.simclusters_v2.hdfs_sources.AdhocKeyValSources
import com.twitter.simclusters_v2.hdfs_sources.DataSources
import com.twitter.simclusters_v2.hdfs_sources.SimclustersV2InterestedInFromProducerEmbeddings20M145KUpdatedScalaDataset
import com.twitter.simclusters_v2.hdfs_sources.UserAndNeighborsFixedPathSource
import com.twitter.simclusters_v2.hdfs_sources.UserUserNormalizedGraphScalaDataset
import com.twitter.simclusters_v2.scalding.common.Util
import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn
import com.twitter.simclusters_v2.thriftscala.EmbeddingType
import com.twitter.simclusters_v2.thriftscala.SimClusterWithScore
import com.twitter.simclusters_v2.thriftscala.TopSimClustersWithScore
import com.twitter.simclusters_v2.thriftscala.UserToInterestedInClusterScores
import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp
import com.twitter.wtf.scalding.jobs.common.ScheduledExecutionApp
import java.util.TimeZone
import scala.util.Random

/**
 * This file implements the job for computing users' interestedIn vector from the producerEmbeddings data set.
 *
 * It reads the UserUserNormalizedGraphScalaDataset to get the user-user follow + fav graph, and then,
 * based on the producerEmbedding clusters of each followed/faved user, we calculate how much a user is
 * interestedIn a cluster. To compute the engagement and determine the clusters for the user, we reuse
 * the functions defined in InterestedInFromKnownFor.
 *
 * Using producerEmbeddings instead of knownFor to obtain interestedIn increases the coverage (especially
 * for medium and light users) and also the density of the cluster embeddings for the user.
 */
/**
 * Adhoc job to generate the interestedIn from producer embeddings for the model version 20M145KUpdated.
 *
 * scalding remote run \
 *   --target src/scala/com/twitter/simclusters_v2/scalding:interested_in_from_producer_embeddings \
 *   --main-class com.twitter.simclusters_v2.scalding.InterestedInFromProducerEmbeddingsAdhocApp \
 *   --user cassowary --cluster bluebird-qus1 \
 *   --keytab /var/lib/tss/keys/fluffy/keytabs/client/cassowary.keytab \
 *   --principal service_account@TWITTER.BIZ \
 *   -- \
 *   --outputDir /gcs/user/cassowary/adhoc/interested_in_from_prod_embeddings/ \
 *   --date 2020-08-25 --typedTsv true
 */
object InterestedInFromProducerEmbeddingsAdhocApp extends AdhocExecutionApp {
  override def runOnDateRange(
    args: Args
  )(
    implicit dateRange: DateRange,
    timeZone: TimeZone,
    uniqueID: UniqueID
  ): Execution[Unit] = {

    val outputDir = args("outputDir")
    val inputGraph = args.optional("graphInputDir") match {
      case Some(inputDir) => TypedPipe.from(UserAndNeighborsFixedPathSource(inputDir))
      case None =>
        DAL
          .readMostRecentSnapshotNoOlderThan(UserUserNormalizedGraphScalaDataset, Days(30))
          .toTypedPipe
    }
    val socialProofThreshold = args.int("socialProofThreshold", 2)
    val maxClustersPerUserFinalResult = args.int("maxInterestedInClustersPerUser", 50)
    val maxClustersFromProducer = args.int("maxClustersPerProducer", 25)
    val typedTsvTag = args.boolean("typedTsv")

    val embeddingType =
      EmbeddingType.ProducerFavBasedSemanticCoreEntity
    val modelVersion = ModelVersions.Model20M145KUpdated
    val producerEmbeddings = ProducerEmbeddingSources
      .producerEmbeddingSourceLegacy(embeddingType, ModelVersions.toModelVersion(modelVersion))(
        dateRange.embiggen(Days(7)))

    import InterestedInFromProducerEmbeddingsBatchApp._

    val numProducerMappings = Stat("num_producer_embeddings_total")
    val numProducersWithLargeClusterMappings = Stat(
      "num_producers_with_more_clusters_than_threshold")
    val numProducersWithSmallClusterMappings = Stat(
      "num_producers_with_clusters_less_than_threshold")
    val totalClustersCoverageProducerEmbeddings = Stat("num_clusters_total_producer_embeddings")

    val producerEmbeddingsWithScore = producerEmbeddings.map {
      case (userId: Long, topSimClusters: TopSimClustersWithScore) =>
        (
          userId,
          topSimClusters.topClusters.toArray
            .map {
              case (simCluster: SimClusterWithScore) =>
                (simCluster.clusterId, simCluster.score.toFloat)
            }
        )
    }
    val producerEmbeddingsPruned = producerEmbeddingsWithScore.map {
      case (producerId, clusterArray) =>
        numProducerMappings.inc()
        val clusterSize = clusterArray.size
        totalClustersCoverageProducerEmbeddings.incBy(clusterSize)
        val prunedList = if (clusterSize > maxClustersFromProducer) {
          numProducersWithLargeClusterMappings.inc()
          clusterArray
            .sortBy {
              case (_, knownForScore) => -knownForScore
            }.take(maxClustersFromProducer)
        } else {
          numProducersWithSmallClusterMappings.inc()
          clusterArray
        }
        (producerId, prunedList)
    }

    val result = InterestedInFromKnownFor
      .run(
        inputGraph,
        producerEmbeddingsPruned,
        socialProofThreshold,
        maxClustersPerUserFinalResult,
        modelVersion
      )

    val resultWithoutSocial = getInterestedInDiscardSocial(result)

    if (typedTsvTag) {
      Util.printCounters(
        resultWithoutSocial
          .map {
            case (userId: Long, clusters: ClustersUserIsInterestedIn) =>
              (
                userId,
                clusters.clusterIdToScores.keys.toString()
              )
          }
          .writeExecution(
            TypedTsv(outputDir)
          )
      )
    } else {
      Util.printCounters(
        resultWithoutSocial
          .writeExecution(
            AdhocKeyValSources.interestedInSource(outputDir)
          )
      )
    }
  }
}

/**
 * Production job for computing the interestedIn data set from the producer embeddings for the
 * model version 20M145KUpdated. It writes the data set in KeyVal format to produce a MH DAL data set.
 *
 * To deploy the job:
 *
 * capesospy-v2 update --build_locally \
 *   --start_cron interested_in_from_producer_embeddings \
 *   src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml
 */
object InterestedInFromProducerEmbeddingsBatchApp extends ScheduledExecutionApp {
  override val firstTime: RichDate = RichDate("2019-11-01")

  override val batchIncrement: Duration = Days(7)

  def getPrunedEmbeddings(
    producerEmbeddings: TypedPipe[(Long, TopSimClustersWithScore)],
    maxClustersFromProducer: Int
  ): TypedPipe[(Long, TopSimClustersWithScore)] = {
    producerEmbeddings.map {
      case (producerId, producerClusters) =>
        val prunedProducerClusters =
          producerClusters.topClusters
            .sortBy {
              case simCluster => -simCluster.score.toFloat
            }.take(maxClustersFromProducer)
        (producerId, TopSimClustersWithScore(prunedProducerClusters, producerClusters.modelVersion))
    }
  }

  def getInterestedInDiscardSocial(
    interestedInFromProducersResult: TypedPipe[(UserId, ClustersUserIsInterestedIn)]
  ): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
    interestedInFromProducersResult.map {
      case (srcId, fullClusterList) =>
        val fullClusterListWithoutSocial = fullClusterList.clusterIdToScores.map {
          case (clusterId, clusterDetails) =>
            val clusterDetailsWithoutSocial = UserToInterestedInClusterScores(
              followScore = clusterDetails.followScore,
              followScoreClusterNormalizedOnly = clusterDetails.followScoreClusterNormalizedOnly,
              followScoreProducerNormalizedOnly = clusterDetails.followScoreProducerNormalizedOnly,
              followScoreClusterAndProducerNormalized =
                clusterDetails.followScoreClusterAndProducerNormalized,
              favScore = clusterDetails.favScore,
              favScoreClusterNormalizedOnly = clusterDetails.favScoreClusterNormalizedOnly,
              favScoreProducerNormalizedOnly = clusterDetails.favScoreProducerNormalizedOnly,
              favScoreClusterAndProducerNormalized =
                clusterDetails.favScoreClusterAndProducerNormalized,
              // Social proof is currently not used anywhere else, so it is discarded here to
              // reduce the space taken by this dataset.
              usersBeingFollowed = None,
              usersThatWereFaved = None,
              numUsersInterestedInThisClusterUpperBound =
                clusterDetails.numUsersInterestedInThisClusterUpperBound,
              logFavScore = clusterDetails.logFavScore,
              logFavScoreClusterNormalizedOnly = clusterDetails.logFavScoreClusterNormalizedOnly,
              // Counts of the social proof are maintained.
              numUsersBeingFollowed = Some(clusterDetails.usersBeingFollowed.getOrElse(Nil).size),
              numUsersThatWereFaved = Some(clusterDetails.usersThatWereFaved.getOrElse(Nil).size)
            )
            (clusterId, clusterDetailsWithoutSocial)
        }
        (
          srcId,
          ClustersUserIsInterestedIn(
            fullClusterList.knownForModelVersion,
            fullClusterListWithoutSocial))
    }
  }

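  /*
   * Illustrative note (added; not in the original source): the transformation above keeps every
   * score field unchanged and only replaces the social-proof ID lists with their sizes, so
   * downstream consumers still know how many producers contributed without storing who they were.
   *
   *   // Hypothetical input entry:  usersBeingFollowed = Some(List(1L, 2L, 3L)),
   *   //                            usersThatWereFaved = Some(List(2L))
   *   // After getInterestedInDiscardSocial:
   *   //   usersBeingFollowed = None, numUsersBeingFollowed = Some(3)
   *   //   usersThatWereFaved = None, numUsersThatWereFaved = Some(1)
   */
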
  override def runOnDateRange(
    args: Args
  )(
    implicit dateRange: DateRange,
    timeZone: TimeZone,
    uniqueID: UniqueID
  ): Execution[Unit] = {

    // Input args for the run
    val socialProofThreshold = args.int("socialProofThreshold", 2)
    val maxClustersFromProducer = args.int("maxClustersPerProducer", 25)
    val maxClustersPerUserFinalResult = args.int("maxInterestedInClustersPerUser", 50)

    // Path variables
    val modelVersionUpdated = ModelVersions.toModelVersion(ModelVersions.Model20M145KUpdated)
    val rootPath: String = s"/user/cassowary/manhattan_sequence_files"
    val interestedInFromProducersPath =
      rootPath + "/interested_in_from_producer_embeddings/" + modelVersionUpdated

    // Input adjacency list and producer embeddings
    val userUserNormalGraph =
      DataSources.userUserNormalizedGraphSource(dateRange.prepend(Days(7))).forceToDisk
    val outputKVDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsInterestedIn]] =
      SimclustersV2InterestedInFromProducerEmbeddings20M145KUpdatedScalaDataset
    val producerEmbeddings = ProducerEmbeddingSources
      .producerEmbeddingSourceLegacy(
        EmbeddingType.ProducerFavBasedSemanticCoreEntity,
        modelVersionUpdated)(dateRange.embiggen(Days(7)))

    val producerEmbeddingsPruned = getPrunedEmbeddings(producerEmbeddings, maxClustersFromProducer)
    val producerEmbeddingsWithScore = producerEmbeddingsPruned.map {
      case (userId: Long, topSimClusters: TopSimClustersWithScore) =>
        (
          userId,
          topSimClusters.topClusters.toArray
            .map {
              case (simCluster: SimClusterWithScore) =>
                (simCluster.clusterId, simCluster.score.toFloat)
            }
        )
    }

    val interestedInFromProducersResult =
      InterestedInFromKnownFor.run(
        userUserNormalGraph,
        producerEmbeddingsWithScore,
        socialProofThreshold,
        maxClustersPerUserFinalResult,
        modelVersionUpdated.toString
      )

    val interestedInFromProducersWithoutSocial =
      getInterestedInDiscardSocial(interestedInFromProducersResult)

    val writeKeyValResultExec = interestedInFromProducersWithoutSocial
      .map { case (userId, clusters) => KeyVal(userId, clusters) }
      .writeDALVersionedKeyValExecution(
        outputKVDataset,
        D.Suffix(interestedInFromProducersPath)
      )
    writeKeyValResultExec
  }

}
Some files were not shown because too many files have changed in this diff.