Mirror of https://github.com/twitter/the-algorithm.git (synced 2024-11-16 08:29:21 +01:00)

[docx] split commit for file 5000
Signed-off-by: Ari Archer <ari.web.xyz@gmail.com>
Parent: c4b4b821a3
Commit: 2f5f511bb8
Binary file not shown.
@@ -1,32 +0,0 @@
package com.twitter.simclusters_v2.common

import com.twitter.simclusters_v2.common.SimClustersMultiEmbeddingId._
import com.twitter.simclusters_v2.thriftscala.SimClustersMultiEmbedding.{Ids, Values}
import com.twitter.simclusters_v2.thriftscala.{
  SimClustersMultiEmbedding,
  SimClustersEmbeddingId,
  SimClustersMultiEmbeddingId
}

/**
 * Helper methods for SimClustersMultiEmbedding
 */
object SimClustersMultiEmbedding {

  // Convert a multiEmbedding to a list of (embeddingId, score)
  def toSimClustersEmbeddingIdWithScores(
    simClustersMultiEmbeddingId: SimClustersMultiEmbeddingId,
    simClustersMultiEmbedding: SimClustersMultiEmbedding
  ): Seq[(SimClustersEmbeddingId, Double)] = {
    simClustersMultiEmbedding match {
      case Values(values) =>
        values.embeddings.zipWithIndex.map {
          case (embeddingWithScore, i) =>
            (toEmbeddingId(simClustersMultiEmbeddingId, i), embeddingWithScore.score)
        }
      case Ids(ids) =>
        ids.ids.map(_.toTuple)
    }
  }

}
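Editor's note: for readers skimming the diff, here is a small, self-contained sketch of what the Values branch above does — it pairs each sub-embedding with its positional sub-id and keeps its score. The case classes below are simplified stand-ins for the Thrift types, not the real ones.

// Simplified stand-ins for the Thrift structures (hypothetical, illustration only).
case class EmbeddingWithScore(score: Double)
case class MultiEmbeddingValues(embeddings: Seq[EmbeddingWithScore])

// Mirrors the Values branch: the sub-embedding at position i becomes (subId = i, score).
def toSubIdWithScores(values: MultiEmbeddingValues): Seq[(Int, Double)] =
  values.embeddings.zipWithIndex.map { case (e, i) => (i, e.score) }

// toSubIdWithScores(MultiEmbeddingValues(Seq(EmbeddingWithScore(0.9), EmbeddingWithScore(0.4))))
// => Seq((0, 0.9), (1, 0.4))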
Binary file not shown.
@@ -1,96 +0,0 @@
package com.twitter.simclusters_v2.common

import com.twitter.simclusters_v2.thriftscala.{
  EmbeddingType,
  InternalId,
  MultiEmbeddingType,
  TopicId,
  TopicSubId,
  SimClustersEmbeddingId => ThriftEmbeddingId,
  SimClustersMultiEmbeddingId => ThriftMultiEmbeddingId
}

/**
 * Helper methods for SimClustersMultiEmbeddingId
 */
object SimClustersMultiEmbeddingId {

  private val MultiEmbeddingTypeToEmbeddingType: Map[MultiEmbeddingType, EmbeddingType] =
    Map(
      MultiEmbeddingType.LogFavApeBasedMuseTopic -> EmbeddingType.LogFavApeBasedMuseTopic,
      MultiEmbeddingType.TwiceUserInterestedIn -> EmbeddingType.TwiceUserInterestedIn,
    )

  private val EmbeddingTypeToMultiEmbeddingType: Map[EmbeddingType, MultiEmbeddingType] =
    MultiEmbeddingTypeToEmbeddingType.map(_.swap)

  def toEmbeddingType(multiEmbeddingType: MultiEmbeddingType): EmbeddingType = {
    MultiEmbeddingTypeToEmbeddingType.getOrElse(
      multiEmbeddingType,
      throw new IllegalArgumentException(s"Invalid type: $multiEmbeddingType"))
  }

  def toMultiEmbeddingType(embeddingType: EmbeddingType): MultiEmbeddingType = {
    EmbeddingTypeToMultiEmbeddingType.getOrElse(
      embeddingType,
      throw new IllegalArgumentException(s"Invalid type: $embeddingType")
    )
  }

  /**
   * Convert a SimClusters Multi-Embedding Id and SubId to SimClusters Embedding Id.
   */
  def toEmbeddingId(
    simClustersMultiEmbeddingId: ThriftMultiEmbeddingId,
    subId: Int
  ): ThriftEmbeddingId = {
    val internalId = simClustersMultiEmbeddingId.internalId match {
      case InternalId.TopicId(topicId) =>
        InternalId.TopicSubId(
          TopicSubId(topicId.entityId, topicId.language, topicId.country, subId))
      case _ =>
        throw new IllegalArgumentException(
          s"Invalid simClusters InternalId ${simClustersMultiEmbeddingId.internalId}")
    }
    ThriftEmbeddingId(
      toEmbeddingType(simClustersMultiEmbeddingId.embeddingType),
      simClustersMultiEmbeddingId.modelVersion,
      internalId
    )
  }

  /**
   * Fetch a subId from a SimClusters EmbeddingId.
   */
  def toSubId(simClustersEmbeddingId: ThriftEmbeddingId): Int = {
    simClustersEmbeddingId.internalId match {
      case InternalId.TopicSubId(topicSubId) =>
        topicSubId.subId
      case _ =>
        throw new IllegalArgumentException(
          s"Invalid SimClustersEmbeddingId InternalId type, $simClustersEmbeddingId")
    }
  }

  /**
   * Convert a SimClustersEmbeddingId to SimClustersMultiEmbeddingId.
   * Only support the Multi embedding based EmbeddingTypes.
   */
  def toMultiEmbeddingId(
    simClustersEmbeddingId: ThriftEmbeddingId
  ): ThriftMultiEmbeddingId = {
    simClustersEmbeddingId.internalId match {
      case InternalId.TopicSubId(topicSubId) =>
        ThriftMultiEmbeddingId(
          toMultiEmbeddingType(simClustersEmbeddingId.embeddingType),
          simClustersEmbeddingId.modelVersion,
          InternalId.TopicId(TopicId(topicSubId.entityId, topicSubId.language, topicSubId.country))
        )

      case _ =>
        throw new IllegalArgumentException(
          s"Invalid SimClustersEmbeddingId InternalId type, $simClustersEmbeddingId")
    }
  }

}
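Editor's note: the forward/reverse map pair above is a common way to keep a two-way mapping in sync from a single source of truth. A minimal, self-contained sketch of the same pattern (the keys and values below are hypothetical, not from the file):

// Forward map is the single source of truth; the reverse map is derived with .swap,
// so the two can never drift apart.
val codeToName: Map[Int, String] = Map(11 -> "Dec11", 20 -> "Model2020")
val nameToCode: Map[String, Int] = codeToName.map(_.swap)

// getOrElse with a thrown exception mirrors the fail-fast lookups in toEmbeddingType above.
def toName(code: Int): String =
  codeToName.getOrElse(code, throw new IllegalArgumentException(s"Invalid type: $code"))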
@@ -1,11 +0,0 @@
scala_library(
    compiler_option_sets = ["fatal_warnings"],
    platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        "eventdetection/common/src/main/java/com/twitter/eventdetection/common/louvain",
        "eventdetection/common/src/main/java/com/twitter/eventdetection/common/model",
        "src/java/com/twitter/sbf/graph",
        "src/scala/com/twitter/simclusters_v2/scalding/common",
    ],
)
Binary file not shown.
Binary file not shown.
@@ -1,30 +0,0 @@
package com.twitter.simclusters_v2.common.clustering

import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.thriftscala.NeighborWithWeights

/**
 * Select a cluster member as cluster representative.
 */
trait ClusterRepresentativeSelectionMethod[T] {

  /**
   * The main external-facing method. Sub-classes should implement this method.
   *
   * @param cluster    A set of NeighborWithWeights.
   * @param embeddings A map of producer ID -> embedding.
   *
   * @return UserId of the member chosen as representative.
   */
  def selectClusterRepresentative(
    cluster: Set[NeighborWithWeights],
    embeddings: Map[UserId, T]
  ): UserId

}

object ClusterRepresentativeSelectionStatistics {

  // Statistics, to be imported where recorded.
  val StatClusterRepresentativeSelectionTime = "cluster_representative_selection_total_time_ms"
}
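Editor's note: as a quick orientation to the trait, a deliberately trivial implementation sketch (not part of the original code) that ignores the embeddings and picks the member with the smallest user id; the real implementations further down use fav scores or medoid similarity instead.

// Hypothetical, illustration-only implementation of the trait above.
class SmallestIdRepresentativeSelectionMethod[T] extends ClusterRepresentativeSelectionMethod[T] {
  def selectClusterRepresentative(
    cluster: Set[NeighborWithWeights],
    embeddings: Map[UserId, T]
  ): UserId =
    cluster.map(_.neighborId).min // NeighborWithWeights.neighborId is the member's UserId
}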
Binary file not shown.
@@ -1,34 +0,0 @@
package com.twitter.simclusters_v2.common.clustering

/**
 * Partitions a set of entities into clusters.
 * NOTE: The selection/construction of the cluster representatives (e.g. medoid, random, average) is implemented in ClusterRepresentativeSelectionMethod.scala
 */
trait ClusteringMethod {

  /**
   * The main external-facing method. Sub-classes should implement this method.
   *
   * @param embeddings   map of entity IDs and corresponding embeddings
   * @param similarityFn function that outputs similarity (>=0, the larger, more similar), given two embeddings
   * @tparam T embedding type. e.g. SimClustersEmbedding
   *
   * @return A set of sets of entity IDs, each set representing a distinct cluster.
   */
  def cluster[T](
    embeddings: Map[Long, T],
    similarityFn: (T, T) => Double,
    recordStatCallback: (String, Long) => Unit = (_, _) => ()
  ): Set[Set[Long]]

}

object ClusteringStatistics {

  // Statistics, to be imported where recorded.
  val StatSimilarityGraphTotalBuildTime = "similarity_graph_total_build_time_ms"
  val StatClusteringAlgorithmRunTime = "clustering_algorithm_total_run_time_ms"
  val StatMedoidSelectionTime = "medoid_selection_total_time_ms"
  val StatComputedSimilarityBeforeFilter = "computed_similarity_before_filter"

}
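Editor's note: to make the contract concrete, a minimal sketch of an implementation (illustration only, not from the original code) that places every entity in its own singleton cluster:

// Hypothetical baseline implementation of the trait above: one singleton cluster per entity.
class SingletonClusteringMethod extends ClusteringMethod {
  override def cluster[T](
    embeddings: Map[Long, T],
    similarityFn: (T, T) => Double,
    recordStatCallback: (String, Long) => Unit = (_, _) => ()
  ): Set[Set[Long]] =
    embeddings.keySet.map(Set(_))
}

// Usage sketch with toy data (similarity grows as the values get closer):
// new SingletonClusteringMethod()
//   .cluster(Map(1L -> 0.0, 2L -> 1.0), (a: Double, b: Double) => 1.0 / (1.0 + math.abs(a - b)))
// => Set(Set(1L), Set(2L))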
Binary file not shown.
@@ -1,67 +0,0 @@
package com.twitter.simclusters_v2.common.clustering

import com.twitter.sbf.graph.ConnectedComponents
import com.twitter.sbf.graph.Graph
import com.twitter.util.Stopwatch
import it.unimi.dsi.fastutil.ints.IntSet
import scala.collection.SortedMap
import scala.jdk.CollectionConverters._

/**
 * Aggregate entities into clusters such that a cluster contains all embeddings with a similarity
 * above a configurable threshold to any other embedding.
 *
 * @param similarityThreshold: When building the edges between entities, edges with weight
 *                             less than or equal to this threshold will be filtered out.
 */
class ConnectedComponentsClusteringMethod(
  similarityThreshold: Double)
    extends ClusteringMethod {

  import ClusteringStatistics._

  def cluster[T](
    embeddings: Map[Long, T],
    similarityFn: (T, T) => Double,
    recordStatCallback: (String, Long) => Unit = (_, _) => ()
  ): Set[Set[Long]] = {

    val timeSinceGraphBuildStart = Stopwatch.start()
    // com.twitter.sbf.graph.Graph expects neighbors to be sorted in ascending order.
    val sourcesById = SortedMap(embeddings.zipWithIndex.map {
      case (source, idx) => idx -> source
    }.toSeq: _*)

    val neighbours = sourcesById.map {
      case (srcIdx, (_, src)) =>
        sourcesById
          .collect {
            case (dstIdx, (_, dst)) if srcIdx != dstIdx => // avoid self-edges
              val similarity = similarityFn(src, dst)
              recordStatCallback(
                StatComputedSimilarityBeforeFilter,
                (similarity * 100).toLong // preserve up to two decimal points
              )
              if (similarity > similarityThreshold)
                Some(dstIdx)
              else None
          }.flatten.toArray
    }.toArray

    recordStatCallback(StatSimilarityGraphTotalBuildTime, timeSinceGraphBuildStart().inMilliseconds)

    val timeSinceClusteringAlgRunStart = Stopwatch.start()
    val nEdges = neighbours.map(_.length).sum / 2 // Graph expects count of undirected edges
    val graph = new Graph(sourcesById.size, nEdges, neighbours)

    val clusters = ConnectedComponents
      .connectedComponents(graph).asScala.toSet
      .map { i: IntSet => i.asScala.map(sourcesById(_)._1).toSet }

    recordStatCallback(
      StatClusteringAlgorithmRunTime,
      timeSinceClusteringAlgRunStart().inMilliseconds)

    clusters
  }
}
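Editor's note: a small usage sketch for the class above (toy ids and vectors, cosine helper defined inline; not from the original code):

// Cosine similarity for plain dense vectors, used only for this illustration.
def cosine(a: Array[Double], b: Array[Double]): Double = {
  val dot = a.zip(b).map { case (x, y) => x * y }.sum
  val norm = math.sqrt(a.map(x => x * x).sum) * math.sqrt(b.map(x => x * x).sum)
  if (norm == 0.0) 0.0 else dot / norm
}

val toyEmbeddings: Map[Long, Array[Double]] = Map(
  1L -> Array(1.0, 0.0),
  2L -> Array(0.9, 0.1), // close to entity 1
  3L -> Array(0.0, 1.0)  // far from both
)

// Entities 1 and 2 end up in the same connected component; 3 stays a singleton.
val clusters: Set[Set[Long]] =
  new ConnectedComponentsClusteringMethod(similarityThreshold = 0.8)
    .cluster(toyEmbeddings, cosine)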
Binary file not shown.
@@ -1,33 +0,0 @@
package com.twitter.simclusters_v2.common.clustering

/**
 * Groups entities by a single embedding dimension with the largest score.
 */
class LargestDimensionClusteringMethod extends ClusteringMethod {

  /**
   * @param embeddings   map of entity IDs and corresponding embeddings
   * @param similarityFn function that outputs discrete value (0.0 or 1.0).
   *                     1.0 if the dimensions of the highest score (weight) from two given embeddings match.
   *                     0.0 otherwise.
   *                     e.g.
   *                     case 1: E1=[0.0, 0.1, 0.6, 0.2], E2=[0.1, 0.3, 0.8, 0.0]. similarityFn(E1, E2)=1.0
   *                     case 2: E1=[0.0, 0.1, 0.6, 0.2], E2=[0.1, 0.4, 0.2, 0.0]. similarityFn(E1, E2)=0.0
   * @tparam T embedding type. e.g. SimClustersEmbedding
   *
   * @return A set of sets of entity IDs, each set representing a distinct cluster.
   */
  override def cluster[T](
    embeddings: Map[Long, T],
    similarityFn: (T, T) => Double,
    recordStatCallback: (String, Long) => Unit
  ): Set[Set[Long]] = {

    // Rely on clustering by connected components.
    // similarityThreshold=0.1 because it's larger than 0.0 (similarityFn returns 0.0 if two
    // embeddings don't share the largest dimension).
    new ConnectedComponentsClusteringMethod(similarityThreshold = 0.1)
      .cluster(embeddings, similarityFn, recordStatCallback)
  }

}
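Editor's note: the doc comment's case 1 / case 2 behaviour can be reproduced with a small argmax-based similarity function over plain vectors (illustration only; the production code uses SimilarityFunctions.simClustersMatchingLargestDimension over SimClustersEmbedding):

// 1.0 when both vectors put their largest weight on the same dimension, else 0.0.
def matchingLargestDimension(a: Array[Double], b: Array[Double]): Double =
  if (a.indexOf(a.max) == b.indexOf(b.max)) 1.0 else 0.0

matchingLargestDimension(Array(0.0, 0.1, 0.6, 0.2), Array(0.1, 0.3, 0.8, 0.0)) // 1.0 (case 1)
matchingLargestDimension(Array(0.0, 0.1, 0.6, 0.2), Array(0.1, 0.4, 0.2, 0.0)) // 0.0 (case 2)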
Binary file not shown.
@@ -1,236 +0,0 @@
package com.twitter.simclusters_v2.common.clustering

import com.twitter.eventdetection.common.louvain.LouvainDriver
import com.twitter.eventdetection.common.louvain.NetworkFactory
import com.twitter.eventdetection.common.model.Entity
import com.twitter.eventdetection.common.model.NetworkInput
import com.twitter.eventdetection.common.model.TextEntityValue
import com.twitter.util.Stopwatch
import scala.collection.JavaConverters._
import scala.math.max

/**
 * Groups entities by the Louvain clustering method.
 * @param similarityThreshold: When building the edges between entities, edges with weight
 *                             less than or equal to this threshold will be filtered out.
 * @param appliedResolutionFactor: If present, will be used to multiply the applied resolution
 *                                 parameter of the Louvain method by this factor.
 *                                 Note that the DEFAULT_MAX_RESOLUTION will not be applied.
 */
class LouvainClusteringMethod(
  similarityThreshold: Double,
  appliedResolutionFactor: Option[Double])
    extends ClusteringMethod {

  import ClusteringStatistics._

  def cluster[T](
    embeddings: Map[Long, T],
    similarityFn: (T, T) => Double,
    recordStatCallback: (String, Long) => Unit = (_, _) => ()
  ): Set[Set[Long]] = {

    // 1. Build the graph on which to run Louvain:
    //   - Weigh edges by the similarity between the 2 embeddings,
    //   - Filter out edges with weight <= threshold.
    val timeSinceGraphBuildStart = Stopwatch.start()
    val edges: Seq[((Long, Long), Double)] = embeddings.toSeq
      .combinations(2)
      .map { pair: Seq[(Long, T)] => // pair of 2
        val (user1, embedding1) = pair.head
        val (user2, embedding2) = pair(1)
        val similarity = similarityFn(embedding1, embedding2)

        recordStatCallback(
          StatComputedSimilarityBeforeFilter,
          (similarity * 100).toLong // preserve up to two decimal places
        )

        ((user1, user2), similarity)
      }
      .filter(_._2 > similarityThreshold)
      .toSeq

    recordStatCallback(StatSimilarityGraphTotalBuildTime, timeSinceGraphBuildStart().inMilliseconds)

    // check if some entities do not have any incoming / outgoing edge
    // these are size-1 clusters (i.e. their own)
    val individualClusters: Set[Long] = embeddings.keySet -- edges.flatMap {
      case ((user1, user2), _) => Set(user1, user2)
    }.toSet

    // 2. LouvainDriver uses "Entity" as input, so build 2 mappings
    //   - Long (entity id) -> Entity
    //   - Entity -> Long (entity id)
    val embeddingIdToEntity: Map[Long, Entity] = embeddings.map {
      case (id, _) => id -> Entity(TextEntityValue(id.toString, Some(id.toString)), None)
    }
    val entityToEmbeddingId: Map[Entity, Long] = embeddingIdToEntity.map {
      case (id, e) => e -> id
    }

    // 3. Create the list of NetworkInput on which to run LouvainDriver
    val networkInputList = edges
      .map {
        case ((fromUserId: Long, toUserId: Long), weight: Double) =>
          new NetworkInput(embeddingIdToEntity(fromUserId), embeddingIdToEntity(toUserId), weight)
      }.toList.asJava

    val timeSinceClusteringAlgRunStart = Stopwatch.start()
    val networkDictionary = NetworkFactory.buildDictionary(networkInputList)
    val network = NetworkFactory.buildNetwork(networkInputList, networkDictionary)

    if (networkInputList.size() == 0) {
      // handle case if no edge at all (only one entity or all entities are too far apart)
      embeddings.keySet.map(e => Set(e))
    } else {
      // 4. Run clustering algorithm
      val clusteredIds = appliedResolutionFactor match {
        case Some(res) =>
          LouvainDriver.clusterAppliedResolutionFactor(network, networkDictionary, res)
        case None => LouvainDriver.cluster(network, networkDictionary)
      }

      recordStatCallback(
        StatClusteringAlgorithmRunTime,
        timeSinceClusteringAlgRunStart().inMilliseconds)

      // 5. Post-processing
      val atLeast2MembersClusters: Set[Set[Long]] = clusteredIds.asScala
        .groupBy(_._2)
        .mapValues(_.map { case (e, _) => entityToEmbeddingId(e) }.toSet)
        .values.toSet

      atLeast2MembersClusters ++ individualClusters.map { e => Set(e) }

    }
  }

  def clusterWithSilhouette[T](
    embeddings: Map[Long, T],
    similarityFn: (T, T) => Double,
    similarityFnForSil: (T, T) => Double,
    recordStatCallback: (String, Long) => Unit = (_, _) => ()
  ): (Set[Set[Long]], Set[Set[(Long, Double)]]) = {

    // 1. Build the graph on which to run Louvain:
    //   - Weigh edges by the similarity between the 2 embeddings,
    //   - Filter out edges with weight <= threshold.
    val timeSinceGraphBuildStart = Stopwatch.start()
    val edgesSimilarityMap = collection.mutable.Map[(Long, Long), Double]()

    val edges: Seq[((Long, Long), Double)] = embeddings.toSeq
      .combinations(2)
      .map { pair: Seq[(Long, T)] => // pair of 2
        val (user1, embedding1) = pair.head
        val (user2, embedding2) = pair(1)
        val similarity = similarityFn(embedding1, embedding2)
        val similarityForSil = similarityFnForSil(embedding1, embedding2)
        edgesSimilarityMap.put((user1, user2), similarityForSil)
        edgesSimilarityMap.put((user2, user1), similarityForSil)

        recordStatCallback(
          StatComputedSimilarityBeforeFilter,
          (similarity * 100).toLong // preserve up to two decimal places
        )

        ((user1, user2), similarity)
      }
      .filter(_._2 > similarityThreshold)
      .toSeq

    recordStatCallback(StatSimilarityGraphTotalBuildTime, timeSinceGraphBuildStart().inMilliseconds)

    // check if some entities do not have any incoming / outgoing edge
    // these are size-1 clusters (i.e. their own)
    val individualClusters: Set[Long] = embeddings.keySet -- edges.flatMap {
      case ((user1, user2), _) => Set(user1, user2)
    }.toSet

    // 2. LouvainDriver uses "Entity" as input, so build 2 mappings
    //   - Long (entity id) -> Entity
    //   - Entity -> Long (entity id)
    val embeddingIdToEntity: Map[Long, Entity] = embeddings.map {
      case (id, _) => id -> Entity(TextEntityValue(id.toString, Some(id.toString)), None)
    }
    val entityToEmbeddingId: Map[Entity, Long] = embeddingIdToEntity.map {
      case (id, e) => e -> id
    }

    // 3. Create the list of NetworkInput on which to run LouvainDriver
    val networkInputList = edges
      .map {
        case ((fromUserId: Long, toUserId: Long), weight: Double) =>
          new NetworkInput(embeddingIdToEntity(fromUserId), embeddingIdToEntity(toUserId), weight)
      }.toList.asJava

    val timeSinceClusteringAlgRunStart = Stopwatch.start()
    val networkDictionary = NetworkFactory.buildDictionary(networkInputList)
    val network = NetworkFactory.buildNetwork(networkInputList, networkDictionary)

    val clusters = if (networkInputList.size() == 0) {
      // handle case if no edge at all (only one entity or all entities are too far apart)
      embeddings.keySet.map(e => Set(e))
    } else {
      // 4. Run clustering algorithm
      val clusteredIds = appliedResolutionFactor match {
        case Some(res) =>
          LouvainDriver.clusterAppliedResolutionFactor(network, networkDictionary, res)
        case None => LouvainDriver.cluster(network, networkDictionary)
      }

      recordStatCallback(
        StatClusteringAlgorithmRunTime,
        timeSinceClusteringAlgRunStart().inMilliseconds)

      // 5. Post-processing
      val atLeast2MembersClusters: Set[Set[Long]] = clusteredIds.asScala
        .groupBy(_._2)
        .mapValues(_.map { case (e, _) => entityToEmbeddingId(e) }.toSet)
        .values.toSet

      atLeast2MembersClusters ++ individualClusters.map { e => Set(e) }

    }

    // Calculate silhouette metrics
    val contactIdWithSilhouette = clusters.map {
      case cluster =>
        val otherClusters = clusters - cluster

        cluster.map {
          case contactId =>
            if (otherClusters.isEmpty) {
              (contactId, 0.0)
            } else {
              val otherSameClusterContacts = cluster - contactId

              if (otherSameClusterContacts.isEmpty) {
                (contactId, 0.0)
              } else {
                // calculate similarity of given userId with all other users in the same cluster
                val a_i = otherSameClusterContacts.map {
                  case sameClusterContact =>
                    edgesSimilarityMap((contactId, sameClusterContact))
                }.sum / otherSameClusterContacts.size

                // calculate similarity of given userId to all other clusters, find the best nearest cluster
                val b_i = otherClusters.map {
                  case otherCluster =>
                    otherCluster.map {
                      case otherClusterContact =>
                        edgesSimilarityMap((contactId, otherClusterContact))
                    }.sum / otherCluster.size
                }.max

                // silhouette (value) of one userId i
                val s_i = (a_i - b_i) / max(a_i, b_i)
                (contactId, s_i)
              }
            }
        }
    }

    (clusters, contactIdWithSilhouette)
  }
}
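Editor's note: the silhouette value computed above follows the usual definition, except that a_i and b_i are mean similarities (larger is better) rather than mean distances, so a well-placed member still scores close to 1. A tiny worked example with hypothetical numbers:

import scala.math.max

// Member i: mean similarity to its own cluster a_i = 0.8,
// best mean similarity to any other cluster b_i = 0.2.
val a_i = 0.8
val b_i = 0.2
val s_i = (a_i - b_i) / max(a_i, b_i) // = 0.6 / 0.8 = 0.75, i.e. comfortably inside its cluster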
Binary file not shown.
@@ -1,21 +0,0 @@
package com.twitter.simclusters_v2.common.clustering

import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.thriftscala.NeighborWithWeights

class MaxFavScoreRepresentativeSelectionMethod[T] extends ClusterRepresentativeSelectionMethod[T] {

  /**
   * Identify the member with largest favScoreHalfLife100Days and return it.
   *
   * @param cluster    A set of NeighborWithWeights.
   * @param embeddings A map of producer ID -> embedding.
   */
  def selectClusterRepresentative(
    cluster: Set[NeighborWithWeights],
    embeddings: Map[UserId, T],
  ): UserId = {
    val key = cluster.maxBy { x: NeighborWithWeights => x.favScoreHalfLife100Days.getOrElse(0.0) }
    key.neighborId
  }
}
Binary file not shown.
@@ -1,28 +0,0 @@
package com.twitter.simclusters_v2.common.clustering

import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.thriftscala.NeighborWithWeights

class MedoidRepresentativeSelectionMethod[T](
  producerProducerSimilarityFn: (T, T) => Double)
    extends ClusterRepresentativeSelectionMethod[T] {

  /**
   * Identify the medoid of a cluster and return it.
   *
   * @param cluster    A set of NeighborWithWeights.
   * @param embeddings A map of producer ID -> embedding.
   */
  def selectClusterRepresentative(
    cluster: Set[NeighborWithWeights],
    embeddings: Map[UserId, T],
  ): UserId = {
    val key = cluster.maxBy {
      id1 => // maxBy because we use similarity, which gets larger as we get closer.
        val v = embeddings(id1.neighborId)
        cluster
          .map(id2 => producerProducerSimilarityFn(v, embeddings(id2.neighborId))).sum
    }
    key.neighborId
  }
}
Binary file not shown.
@@ -1,32 +0,0 @@
package com.twitter.simclusters_v2.common.clustering

import com.twitter.simclusters_v2.common.SimClustersEmbedding

/**
 * SimilarityFunctions provide commonly used similarity functions that this clustering library needs.
 */
object SimilarityFunctions {
  def simClustersCosineSimilarity: (SimClustersEmbedding, SimClustersEmbedding) => Double =
    (e1, e2) => e1.cosineSimilarity(e2)

  def simClustersMatchingLargestDimension: (
    SimClustersEmbedding,
    SimClustersEmbedding
  ) => Double = (e1, e2) => {
    val doesMatchLargestDimension: Boolean = e1
      .topClusterIds(1)
      .exists { id1 =>
        e2.topClusterIds(1).contains(id1)
      }

    if (doesMatchLargestDimension) 1.0
    else 0.0
  }

  def simClustersFuzzyJaccardSimilarity: (
    SimClustersEmbedding,
    SimClustersEmbedding
  ) => Double = (e1, e2) => {
    e1.fuzzyJaccardSimilarity(e2)
  }
}
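Editor's note: these functions are shaped to plug straight into the ClusteringMethod implementations above. A short wiring sketch (assumes a producerEmbeddings map is already loaded; the threshold value is arbitrary):

// Hypothetical wiring of the pieces defined in this package.
def clusterProducers(
  producerEmbeddings: Map[Long, SimClustersEmbedding]
): Set[Set[Long]] =
  new ConnectedComponentsClusteringMethod(similarityThreshold = 0.7)
    .cluster(producerEmbeddings, SimilarityFunctions.simClustersCosineSimilarity)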
@@ -1,12 +0,0 @@
# This package/target is separate from other simclusters common packages because the ml/api dep is
# large (350MB+). Having it as a separate target means that we can avoid bundling it with targets
# that do not need it.
scala_library(
    platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        "src/java/com/twitter/ml/api:api-base",
        "src/scala/com/twitter/ml/api/util",
        "src/scala/com/twitter/simclusters_v2/common",
    ],
)
BIN  src/scala/com/twitter/simclusters_v2/common/ml/BUILD.docx  Normal file
Binary file not shown.
Binary file not shown.
@@ -1,39 +0,0 @@
package com.twitter.simclusters_v2.common.ml

import com.twitter.ml.api.Feature.Continuous
import com.twitter.ml.api.Feature.SparseContinuous
import com.twitter.ml.api._
import com.twitter.ml.api.util.FDsl._
import com.twitter.simclusters_v2.common.SimClustersEmbedding

class SimClustersEmbeddingAdapter(embeddingFeature: SparseContinuous)
    extends IRecordOneToOneAdapter[SimClustersEmbedding] {

  override def getFeatureContext: FeatureContext = new FeatureContext(embeddingFeature)

  override def adaptToDataRecord(embedding: SimClustersEmbedding): DataRecord = {
    val embeddingMap = embedding.embedding.map {
      case (clusterId, score) =>
        (clusterId.toString, score)
    }.toMap

    new DataRecord().setFeatureValue(embeddingFeature, embeddingMap)
  }
}

class NormalizedSimClustersEmbeddingAdapter(
  embeddingFeature: SparseContinuous,
  normFeature: Continuous)
    extends IRecordOneToOneAdapter[SimClustersEmbedding] {

  override def getFeatureContext: FeatureContext = new FeatureContext(embeddingFeature, normFeature)

  override def adaptToDataRecord(embedding: SimClustersEmbedding): DataRecord = {

    val normalizedEmbedding = Map(
      embedding.sortedClusterIds.map(_.toString).zip(embedding.normalizedSortedScores): _*)

    val dataRecord = new DataRecord().setFeatureValue(embeddingFeature, normalizedEmbedding)
    dataRecord.setFeatureValue(normFeature, embedding.l2norm)
  }
}
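Editor's note: a minimal sketch of how an adapter like the one above is typically called, assuming a SparseContinuous feature definition already exists in the feature catalogue (the parameter name is hypothetical):

// Hypothetical call site: convert one embedding into a DataRecord for model scoring.
def toDataRecord(
  simClustersFeature: SparseContinuous, // assumed to be defined elsewhere in the feature config
  embedding: SimClustersEmbedding
): DataRecord =
  new SimClustersEmbeddingAdapter(simClustersFeature).adaptToDataRecord(embedding)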
BIN  src/scala/com/twitter/simclusters_v2/common/package.docx  Normal file
Binary file not shown.
@@ -1,17 +0,0 @@
package com.twitter.simclusters_v2

package object common {

  type TweetId = Long
  type UserId = Long
  type ClusterId = Int
  type SemanticCoreEntityId = Long // Use TopicId if it's a Topic related project.
  type UTTEntityId = Long
  type Timestamp = Long
  type Language = String
  type Country = String
  type LocaleEntity = (Long, Language)
  type TopicId = Long
  type GroupId = Long
  type SpaceId = String
}
Binary file not shown.
@@ -1,164 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources

import com.twitter.bijection.scrooge.BinaryScalaCodec
import com.twitter.bijection.scrooge.CompactScalaCodec
import com.twitter.bijection.Bufferable
import com.twitter.bijection.Injection
import com.twitter.hermit.candidate.thriftscala.Candidates
import com.twitter.scalding.DateRange
import com.twitter.scalding.commons.source.VersionedKeyValSource
import com.twitter.scalding_internal.source.lzo_scrooge.DailySuffixMostRecentLzoScrooge
import com.twitter.scalding_internal.source.lzo_scrooge.FixedPathLzoScrooge
import com.twitter.scalding_internal.source.lzo_scrooge.HourlySuffixMostRecentLzoScrooge
import com.twitter.simclusters_v2.thriftscala._

case class EdgeWithDecayedWtsFixedPathSource(path: String)
    extends FixedPathLzoScrooge[EdgeWithDecayedWeights](path, EdgeWithDecayedWeights)

case class UserAndNeighborsFixedPathSource(path: String)
    extends FixedPathLzoScrooge[UserAndNeighbors](path, UserAndNeighbors)

case class NormsAndCountsFixedPathSource(path: String)
    extends FixedPathLzoScrooge[NormsAndCounts](path, NormsAndCounts)

case class UserToInterestedInClustersFixedPathSource(path: String)
    extends FixedPathLzoScrooge[UserToInterestedInClusters](path, UserToInterestedInClusters)

case class TimelineDataExtractorFixedPathSource(path: String)
    extends FixedPathLzoScrooge[ReferenceTweets](path, ReferenceTweets)

case class TweetClusterScoresHourlySuffixSource(path: String, override val dateRange: DateRange)
    extends HourlySuffixMostRecentLzoScrooge[TweetAndClusterScores](path, dateRange)

case class TweetTopKClustersHourlySuffixSource(path: String, override val dateRange: DateRange)
    extends HourlySuffixMostRecentLzoScrooge[TweetTopKClustersWithScores](
      path,
      dateRange
    )

case class ClusterTopKTweetsHourlySuffixSource(path: String, override val dateRange: DateRange)
    extends HourlySuffixMostRecentLzoScrooge[ClusterTopKTweetsWithScores](
      path,
      dateRange
    )

case class TweetSimilarityUnhydratedPairsSource(path: String, override val dateRange: DateRange)
    extends DailySuffixMostRecentLzoScrooge[LabelledTweetPairs](
      path,
      dateRange
    )

case class WTFCandidatesSource(path: String)
    extends FixedPathLzoScrooge[Candidates](path, Candidates)

case class EmbeddingsLiteSource(path: String)
    extends FixedPathLzoScrooge[EmbeddingsLite](path, EmbeddingsLite)

object AdhocKeyValSources {
  def interestedInSource(path: String): VersionedKeyValSource[Long, ClustersUserIsInterestedIn] = {
    implicit val keyInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian
    implicit val valInject: Injection[ClustersUserIsInterestedIn, Array[Byte]] =
      CompactScalaCodec(ClustersUserIsInterestedIn)
    VersionedKeyValSource[Long, ClustersUserIsInterestedIn](path)
  }

  def clusterDetailsSource(path: String): VersionedKeyValSource[(String, Int), ClusterDetails] = {
    implicit val keyInject: Injection[(String, Int), Array[Byte]] =
      Bufferable.injectionOf[(String, Int)]
    implicit val valInject: Injection[ClusterDetails, Array[Byte]] =
      CompactScalaCodec(ClusterDetails)
    VersionedKeyValSource[(String, Int), ClusterDetails](path)
  }

  def bipartiteQualitySource(
    path: String
  ): VersionedKeyValSource[(String, Int), BipartiteClusterQuality] = {
    implicit val keyInject: Injection[(String, Int), Array[Byte]] =
      Bufferable.injectionOf[(String, Int)]
    implicit val valInject: Injection[BipartiteClusterQuality, Array[Byte]] =
      CompactScalaCodec(BipartiteClusterQuality)
    VersionedKeyValSource[(String, Int), BipartiteClusterQuality](path)
  }

  def entityToClustersSource(
    path: String
  ): VersionedKeyValSource[SimClustersEmbeddingId, SimClustersEmbedding] = {
    implicit val keyInject: Injection[SimClustersEmbeddingId, Array[Byte]] =
      BinaryScalaCodec(SimClustersEmbeddingId)
    implicit val valInject: Injection[SimClustersEmbedding, Array[Byte]] =
      BinaryScalaCodec(SimClustersEmbedding)
    VersionedKeyValSource[SimClustersEmbeddingId, SimClustersEmbedding](path)
  }

  def clusterToEntitiesSource(
    path: String
  ): VersionedKeyValSource[SimClustersEmbeddingId, InternalIdEmbedding] = {
    implicit val keyInject: Injection[SimClustersEmbeddingId, Array[Byte]] = BinaryScalaCodec(
      SimClustersEmbeddingId)
    implicit val valInject: Injection[InternalIdEmbedding, Array[Byte]] =
      BinaryScalaCodec(InternalIdEmbedding)
    VersionedKeyValSource[SimClustersEmbeddingId, InternalIdEmbedding](path)
  }

  // For storing producer-simclusters embeddings
  def topProducerToClusterEmbeddingsSource(
    path: String
  ): VersionedKeyValSource[Long, TopSimClustersWithScore] = {
    implicit val keyInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian
    implicit val valInject: Injection[TopSimClustersWithScore, Array[Byte]] =
      CompactScalaCodec(TopSimClustersWithScore)
    VersionedKeyValSource[Long, TopSimClustersWithScore](path)
  }

  // For storing producer-simclusters embeddings
  def topClusterEmbeddingsToProducerSource(
    path: String
  ): VersionedKeyValSource[PersistedFullClusterId, TopProducersWithScore] = {
    implicit val keyInject: Injection[PersistedFullClusterId, Array[Byte]] =
      CompactScalaCodec(PersistedFullClusterId)
    implicit val valInject: Injection[TopProducersWithScore, Array[Byte]] =
      CompactScalaCodec(TopProducersWithScore)
    VersionedKeyValSource[PersistedFullClusterId, TopProducersWithScore](path)
  }

  def userToInferredEntitiesSource(
    path: String
  ): VersionedKeyValSource[Long, SimClustersInferredEntities] = {
    implicit val keyInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian
    implicit val valInject: Injection[SimClustersInferredEntities, Array[Byte]] =
      CompactScalaCodec(SimClustersInferredEntities)
    VersionedKeyValSource[Long, SimClustersInferredEntities](path)
  }

  def knownForAdhocSource(path: String): VersionedKeyValSource[Long, ClustersUserIsKnownFor] = {
    implicit val keyInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian
    implicit val valInject: Injection[ClustersUserIsKnownFor, Array[Byte]] =
      CompactScalaCodec(ClustersUserIsKnownFor)
    VersionedKeyValSource[Long, ClustersUserIsKnownFor](path)
  }

  def knownForSBFResultsDevelSource(
    path: String
  ): VersionedKeyValSource[Long, Array[(Int, Float)]] = {
    implicit val keyInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian
    implicit val valInject: Injection[Array[(Int, Float)], Array[Byte]] =
      Bufferable.injectionOf[Array[(Int, Float)]]
    VersionedKeyValSource[Long, Array[(Int, Float)]](path)
  }

  // injection to store adjlist in the mapped indices space for users
  def intermediateSBFResultsDevelSource(
    path: String
  ): VersionedKeyValSource[Int, List[(Int, Float)]] = {
    implicit val keyInject: Injection[Int, Array[Byte]] = Injection.int2BigEndian
    implicit val valInject: Injection[List[(Int, Float)], Array[Byte]] =
      Bufferable.injectionOf[List[(Int, Float)]]
    VersionedKeyValSource[Int, List[(Int, Float)]](path)
  }

  def mappedIndicesDevelSource(path: String): VersionedKeyValSource[Int, Long] = {
    implicit val keyInject: Injection[Int, Array[Byte]] = Injection.int2BigEndian
    implicit val valInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian
    VersionedKeyValSource[Int, Long](path)
  }
}
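Editor's note: every helper in AdhocKeyValSources follows the same recipe — pick key and value Injections, put them in implicit scope, then build a VersionedKeyValSource. A hedged sketch of an extra helper in that style (not part of the original file):

// Hypothetical addition following the existing pattern: an adhoc key-val source keyed by
// user id with UserToInterestedInClusters values.
def userToInterestedInClustersAdhocSource(
  path: String
): VersionedKeyValSource[Long, UserToInterestedInClusters] = {
  implicit val keyInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian
  implicit val valInject: Injection[UserToInterestedInClusters, Array[Byte]] =
    CompactScalaCodec(UserToInterestedInClusters)
  VersionedKeyValSource[Long, UserToInterestedInClusters](path)
}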
File diff suppressed because it is too large.
BIN  src/scala/com/twitter/simclusters_v2/hdfs_sources/BUILD.docx  Normal file
Binary file not shown.
BIN  src/scala/com/twitter/simclusters_v2/hdfs_sources/DataPaths.docx  Normal file
Binary file not shown.
@@ -1,49 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources

object DataPaths {

  val InterestedIn2020Path =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_interested_in_20M_145K_2020"

  val InterestedIn2020ThriftPath =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_interested_in_20M_145K_2020_thrift"

  val InterestedInLite2020Path =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_interested_in_lite_20M_145K_2020"

  val InterestedInLite2020ThriftPath =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_interested_in_lite_20M_145K_2020_thrift"

  val KnownFor2020Path =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_known_for_20M_145K_2020"

  // keep this inside /user/cassowary/manhattan_sequence_files/ to use the latest-3 retention policy
  val KnownFor2020ThriftDatasetPath =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_known_for_20M_145K_2020_thrift"

  val OfflineClusterTopMediaTweets2020DatasetPath =
    "/user/cassowary/manhattan_sequence_files/cluster_top_media_tweets_20M_145K_2020"
}

/**
 * These should only be accessed from the simclusters_v2 data pipeline for intermediate data; they
 * are not opt-out compliant and shouldn't be exposed externally.
 */
object InternalDataPaths {
  // Internal versions, not to be read or written outside of simclusters_v2

  private[simclusters_v2] val RawInterestedIn2020Path =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_raw_interested_in_20M_145K_2020"

  private[simclusters_v2] val RawInterestedInLite2020Path =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_raw_interested_in_lite_20M_145K_2020"

  private[simclusters_v2] val RawKnownForDec11Path =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_raw_known_for_20M_145K_dec11"

  private[simclusters_v2] val RawKnownForUpdatedPath =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_raw_known_for_20M_145K_updated"

  private[simclusters_v2] val RawKnownFor2020Path =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_raw_known_for_20M_145K_2020"
}
Binary file not shown.
@@ -1,39 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources

import com.twitter.scalding.DateOps
import com.twitter.scalding.DateRange
import com.twitter.scalding.Days
import com.twitter.scalding.TypedPipe
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation
import com.twitter.scalding_internal.dalv2.remote_access.ProcAtla
import com.twitter.simclusters_v2.thriftscala.NormsAndCounts
import com.twitter.simclusters_v2.thriftscala.UserAndNeighbors
import java.util.TimeZone

object DataSources {

  /**
   * Reads production normalized graph data from atla-proc
   */
  def userUserNormalizedGraphSource(implicit dateRange: DateRange): TypedPipe[UserAndNeighbors] = {
    DAL
      .readMostRecentSnapshotNoOlderThan(UserUserNormalizedGraphScalaDataset, Days(14)(DateOps.UTC))
      .withRemoteReadPolicy(ExplicitLocation(ProcAtla))
      .toTypedPipe
  }

  /**
   * Reads production user norms and counts data from atla-proc
   */
  def userNormsAndCounts(
    implicit dateRange: DateRange,
    timeZone: TimeZone
  ): TypedPipe[NormsAndCounts] = {
    DAL
      .readMostRecentSnapshot(ProducerNormsAndCountsScalaDataset, dateRange.prepend(Days(14)))
      .withRemoteReadPolicy(ExplicitLocation(ProcAtla))
      .toTypedPipe
  }

}
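Editor's note: a hedged sketch of how such a source is typically consumed from a Scalding job (the surrounding job and the UserAndNeighbors field names, userId and neighbors, are assumptions based on the Thrift schema, not confirmed by this diff):

// Hypothetical call site inside a Scalding job where an implicit DateRange is in scope.
def neighborCounts(implicit dateRange: DateRange): TypedPipe[(Long, Int)] =
  DataSources.userUserNormalizedGraphSource
    .map(userAndNeighbors => (userAndNeighbors.userId, userAndNeighbors.neighbors.size))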
Binary file not shown.
@@ -1,222 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources

import com.twitter.dal.client.dataset.KeyValDALDataset
import com.twitter.scalding.DateRange
import com.twitter.scalding.typed.TypedPipe
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossDC
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.simclusters_v2.thriftscala._
import com.twitter.wtf.entity_real_graph.thriftscala.EntityType
import com.twitter.simclusters_v2.common.ClusterId
import com.twitter.simclusters_v2.common.ModelVersions

object EntityEmbeddingsSources {

  final val SemanticCoreSimClustersEmbeddingsDec11Dataset =
    SemanticCoreSimclustersEmbeddingsScalaDataset

  final val SemanticCoreSimClustersEmbeddingsUpdatedDataset =
    SemanticCoreSimclustersEmbeddingsUpdatedScalaDataset

  final val SemanticCoreSimClustersEmbeddings2020Dataset =
    SemanticCoreSimclustersEmbeddings2020ScalaDataset

  final val SemanticCorePerLanguageSimClustersEmbeddingsDataset =
    SemanticCorePerLanguageSimclustersEmbeddingsScalaDataset

  final val LogFavSemanticCorePerLanguageSimClustersEmbeddingsDataset =
    LogFavSemanticCorePerLanguageSimclustersEmbeddingsScalaDataset

  final val HashtagSimClustersEmbeddingsUpdatedDataset =
    HashtagSimclustersEmbeddingsUpdatedScalaDataset

  final val ReverseIndexSemanticCoreSimClustersEmbeddingsDec11Dataset =
    ReverseIndexSemanticCoreSimclustersEmbeddingsScalaDataset

  final val ReverseIndexSemanticCoreSimClustersEmbeddingsUpdatedDataset =
    ReverseIndexSemanticCoreSimclustersEmbeddingsUpdatedScalaDataset

  final val ReverseIndexSemanticCoreSimClustersEmbeddings2020Dataset =
    ReverseIndexSemanticCoreSimclustersEmbeddings2020ScalaDataset

  final val ReverseIndexSemanticCorePerLanguageSimClustersEmbeddingsDataset =
    ReverseIndexSemanticCorePerLanguageSimclustersEmbeddingsScalaDataset

  final val LogFavReverseIndexSemanticCorePerLanguageSimClustersEmbeddingsDataset =
    LogFavReverseIndexSemanticCorePerLanguageSimclustersEmbeddingsScalaDataset

  final val ReverseIndexHashtagSimClustersEmbeddingsUpdatedDataset =
    ReverseIndexHashtagSimclustersEmbeddingsUpdatedScalaDataset

  // Fav-based TFG topic embeddings built from user device languages
  // Keyed by SimClustersEmbeddingId with InternalId.TopicId ((topic, language) pair, with country = None)
  final val FavTfgTopicEmbeddingsDataset = FavTfgTopicEmbeddingsScalaDataset

  final val FavTfgTopicEmbeddingsParquetDataset = FavTfgTopicEmbeddingsParquetScalaDataset

  final val FavTfgTopicEmbeddings2020Dataset = FavTfgTopicEmbeddings2020ScalaDataset

  final val FavTfgTopicEmbeddings2020ParquetDataset = FavTfgTopicEmbeddings2020ParquetScalaDataset

  // Logfav-based TFG topic embeddings built from user device languages
  // Keyed by SimClustersEmbeddingId with InternalId.LocaleEntityId ((topic, language) pair)
  final val LogFavTfgTopicEmbeddingsDataset = LogFavTfgTopicEmbeddingsScalaDataset

  final val LogFavTfgTopicEmbeddingsParquetDataset = LogFavTfgTopicEmbeddingsParquetScalaDataset

  // Fav-based TFG topic embeddings built from inferred user consumed languages
  // Keyed by SimClustersEmbeddingId with InternalId.TopicId ((topic, country, language) tuple)
  final val FavInferredLanguageTfgTopicEmbeddingsDataset =
    FavInferredLanguageTfgTopicEmbeddingsScalaDataset

  private val validSemanticCoreEmbeddingTypes = Seq(
    EmbeddingType.FavBasedSematicCoreEntity,
    EmbeddingType.FollowBasedSematicCoreEntity
  )

  /**
   * Given a fav/follow/etc embedding type and a ModelVersion, retrieve the corresponding dataset to
   * (SemanticCore entityId -> List(clusterId)) from a certain dateRange.
   */
  def getSemanticCoreEntityEmbeddingsSource(
    embeddingType: EmbeddingType,
    modelVersion: String,
    dateRange: DateRange
  ): TypedPipe[(Long, SimClustersEmbedding)] = {
    val dataSet = modelVersion match {
      case ModelVersions.Model20M145KDec11 => SemanticCoreSimClustersEmbeddingsDec11Dataset
      case ModelVersions.Model20M145KUpdated => SemanticCoreSimClustersEmbeddingsUpdatedDataset
      case _ => throw new IllegalArgumentException(s"ModelVersion $modelVersion is not supported")
    }
    assert(validSemanticCoreEmbeddingTypes.contains(embeddingType))
    entityEmbeddingsSource(dataSet, embeddingType, dateRange)
  }

  /**
   * Given a fav/follow/etc embedding type and a ModelVersion, retrieve the corresponding dataset to
   * (clusterId -> List(SemanticCore entityId)) from a certain dateRange.
   */
  def getReverseIndexedSemanticCoreEntityEmbeddingsSource(
    embeddingType: EmbeddingType,
    modelVersion: String,
    dateRange: DateRange
  ): TypedPipe[(ClusterId, Seq[SemanticCoreEntityWithScore])] = {
    val dataSet = modelVersion match {
      case ModelVersions.Model20M145KDec11 =>
        ReverseIndexSemanticCoreSimClustersEmbeddingsDec11Dataset
      case ModelVersions.Model20M145KUpdated =>
        ReverseIndexSemanticCoreSimClustersEmbeddingsUpdatedDataset
      case ModelVersions.Model20M145K2020 =>
        ReverseIndexSemanticCoreSimClustersEmbeddings2020Dataset
      case _ => throw new IllegalArgumentException(s"ModelVersion $modelVersion is not supported")
    }

    assert(validSemanticCoreEmbeddingTypes.contains(embeddingType))
    reverseIndexedEntityEmbeddingsSource(dataSet, embeddingType, dateRange)
  }

  // Return the raw DAL dataset reference. Use this if you're writing to DAL.
  def getEntityEmbeddingsDataset(
    entityType: EntityType,
    modelVersion: String,
    isEmbeddingsPerLocale: Boolean = false
  ): KeyValDALDataset[KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]] = {
    (entityType, modelVersion) match {
      case (EntityType.SemanticCore, ModelVersions.Model20M145KDec11) =>
        SemanticCoreSimClustersEmbeddingsDec11Dataset
      case (EntityType.SemanticCore, ModelVersions.Model20M145KUpdated) =>
        if (isEmbeddingsPerLocale) {
          SemanticCorePerLanguageSimClustersEmbeddingsDataset
        } else {
          SemanticCoreSimClustersEmbeddingsUpdatedDataset
        }
      case (EntityType.SemanticCore, ModelVersions.Model20M145K2020) =>
        SemanticCoreSimClustersEmbeddings2020Dataset
      case (EntityType.Hashtag, ModelVersions.Model20M145KUpdated) =>
        HashtagSimClustersEmbeddingsUpdatedDataset
      case (entityType, modelVersion) =>
        throw new IllegalArgumentException(
          s"(Entity Type, ModelVersion) ($entityType, $modelVersion) not supported.")
    }
  }

  // Return the raw DAL dataset reference. Use this if you're writing to DAL.
  def getReverseIndexedEntityEmbeddingsDataset(
    entityType: EntityType,
    modelVersion: String,
    isEmbeddingsPerLocale: Boolean = false
  ): KeyValDALDataset[KeyVal[SimClustersEmbeddingId, InternalIdEmbedding]] = {
    (entityType, modelVersion) match {
      case (EntityType.SemanticCore, ModelVersions.Model20M145KDec11) =>
        ReverseIndexSemanticCoreSimClustersEmbeddingsDec11Dataset
      case (EntityType.SemanticCore, ModelVersions.Model20M145KUpdated) =>
        if (isEmbeddingsPerLocale) {
          ReverseIndexSemanticCorePerLanguageSimClustersEmbeddingsDataset
        } else {
          ReverseIndexSemanticCoreSimClustersEmbeddingsUpdatedDataset
        }
      case (EntityType.SemanticCore, ModelVersions.Model20M145K2020) =>
        ReverseIndexSemanticCoreSimClustersEmbeddings2020Dataset
      case (EntityType.Hashtag, ModelVersions.Model20M145KUpdated) =>
        ReverseIndexHashtagSimClustersEmbeddingsUpdatedDataset
      case (entityType, modelVersion) =>
        throw new IllegalArgumentException(
          s"(Entity Type, ModelVersion) ($entityType, $modelVersion) not supported.")
    }
  }

  private def entityEmbeddingsSource(
    dataset: KeyValDALDataset[KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]],
    embeddingType: EmbeddingType,
    dateRange: DateRange
  ): TypedPipe[(Long, SimClustersEmbedding)] = {
    val pipe = DAL
      .readMostRecentSnapshot(dataset, dateRange)
      .withRemoteReadPolicy(AllowCrossDC)
      .toTypedPipe
    filterEntityEmbeddingsByType(pipe, embeddingType)
  }

  private def reverseIndexedEntityEmbeddingsSource(
    dataset: KeyValDALDataset[KeyVal[SimClustersEmbeddingId, InternalIdEmbedding]],
    embeddingType: EmbeddingType,
    dateRange: DateRange
  ): TypedPipe[(ClusterId, Seq[SemanticCoreEntityWithScore])] = {
    val pipe = DAL
      .readMostRecentSnapshot(dataset, dateRange)
      .withRemoteReadPolicy(AllowCrossDC)
      .toTypedPipe
    filterReverseIndexedEntityEmbeddingsByType(pipe, embeddingType)
  }

  private[hdfs_sources] def filterEntityEmbeddingsByType(
    pipe: TypedPipe[KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]],
    embeddingType: EmbeddingType
  ): TypedPipe[(Long, SimClustersEmbedding)] = {
    pipe.collect {
      case KeyVal(
            SimClustersEmbeddingId(_embeddingType, _, InternalId.EntityId(entityId)),
            embedding
          ) if _embeddingType == embeddingType =>
        (entityId, embedding)
    }
  }

  private[hdfs_sources] def filterReverseIndexedEntityEmbeddingsByType(
    pipe: TypedPipe[KeyVal[SimClustersEmbeddingId, InternalIdEmbedding]],
    embeddingType: EmbeddingType
  ): TypedPipe[(ClusterId, Seq[SemanticCoreEntityWithScore])] = {
    pipe.collect {
      case KeyVal(
            SimClustersEmbeddingId(_embeddingType, _, InternalId.ClusterId(clusterId)),
            embedding
          ) if _embeddingType == embeddingType =>
        val entitiesWithScores = embedding.embedding.collect {
          case InternalIdWithScore(InternalId.EntityId(entityId), score) =>
            SemanticCoreEntityWithScore(entityId, score)
        }
        (clusterId, entitiesWithScores)
    }
  }
}
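Editor's note: a hedged usage sketch for the read path above (the embedding type and model version chosen here are arbitrary examples; the date range is assumed to come from the calling Scalding job):

// Hypothetical call site: fav-based SemanticCore entity embeddings for the updated model version.
def favEntityEmbeddings(dateRange: DateRange): TypedPipe[(Long, SimClustersEmbedding)] =
  EntityEmbeddingsSources.getSemanticCoreEntityEmbeddingsSource(
    embeddingType = EmbeddingType.FavBasedSematicCoreEntity,
    modelVersion = ModelVersions.Model20M145KUpdated,
    dateRange = dateRange
  )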
Binary file not shown.
@ -1,178 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources

import com.twitter.dal.client.dataset.KeyValDALDataset
import com.twitter.scalding.{DateOps, DateRange, Days, TypedPipe}
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.remote_access.{ExplicitLocation, ProcAtla}
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.thriftscala.ModelVersion
import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn
import java.util.TimeZone

object InterestedInSources {

  private val ModelVersionInterestedInDatasetMap: Map[ModelVersion, KeyValDALDataset[
    KeyVal[UserId, ClustersUserIsInterestedIn]
  ]] = Map(
    ModelVersion.Model20m145kDec11 -> SimclustersV2InterestedInScalaDataset,
    ModelVersion.Model20m145kUpdated -> SimclustersV2InterestedIn20M145KUpdatedScalaDataset,
    ModelVersion.Model20m145k2020 -> SimclustersV2InterestedIn20M145K2020ScalaDataset
  )

  /**
   * Internal version, not PDP compliant, not to be used outside simclusters_v2
   * Reads 20M145KDec11 production InterestedIn data from atla-proc, with a 14-day extended window
   */
  private[simclusters_v2] def simClustersRawInterestedInDec11Source(
    dateRange: DateRange,
    timeZone: TimeZone
  ): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
    DAL
      .readMostRecentSnapshot(
        SimclustersV2RawInterestedIn20M145KDec11ScalaDataset,
        dateRange.prepend(Days(14)(timeZone))
      )
      .withRemoteReadPolicy(ExplicitLocation(ProcAtla))
      .toTypedPipe
      .map {
        case KeyVal(userId, clustersUserIsInterestedIn) =>
          (userId, clustersUserIsInterestedIn)
      }
  }

  /**
   * Internal version, not PDP compliant, not to be used outside simclusters_v2
   * Reads 20M145KUpdated InterestedIn data from atla-proc, with a 14-day extended window
   */
  private[simclusters_v2] def simClustersRawInterestedInUpdatedSource(
    dateRange: DateRange,
    timeZone: TimeZone
  ): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
    DAL
      .readMostRecentSnapshot(
        SimclustersV2RawInterestedIn20M145KUpdatedScalaDataset,
        dateRange.prepend(Days(14)(timeZone))
      )
      .withRemoteReadPolicy(ExplicitLocation(ProcAtla))
      .toTypedPipe.map {
        case KeyVal(userId, clustersUserIsInterestedIn) =>
          (userId, clustersUserIsInterestedIn)
      }
  }

  /**
   * Internal version, not PDP compliant, not to be used outside simclusters_v2
   * Reads 20M145K2020 InterestedIn data from atla-proc, with a 14-day extended window
   */
  private[simclusters_v2] def simClustersRawInterestedIn2020Source(
    dateRange: DateRange,
    timeZone: TimeZone
  ): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
    DAL
      .readMostRecentSnapshot(
        SimclustersV2RawInterestedIn20M145K2020ScalaDataset,
        dateRange.prepend(Days(14)(timeZone))
      )
      .withRemoteReadPolicy(ExplicitLocation(ProcAtla))
      .toTypedPipe.map {
        case KeyVal(userId, clustersUserIsInterestedIn) =>
          (userId, clustersUserIsInterestedIn)
      }
  }

  private[simclusters_v2] def simClustersRawInterestedInLite2020Source(
    dateRange: DateRange,
    timeZone: TimeZone
  ): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
    DAL
      .readMostRecentSnapshot(
        SimclustersV2RawInterestedInLite20M145K2020ScalaDataset,
        dateRange.extend(Days(14)(timeZone)))
      .withRemoteReadPolicy(ExplicitLocation(ProcAtla))
      .toTypedPipe.map {
        case KeyVal(userId, clustersUserIsInterestedIn) =>
          (userId, clustersUserIsInterestedIn)
      }
  }

  /**
   * Reads 20M145KDec11 production InterestedIn data from atla-proc, with a 14-day extended window
   */
  def simClustersInterestedInDec11Source(
    dateRange: DateRange,
    timeZone: TimeZone
  ): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
    DAL
      .readMostRecentSnapshot(
        SimclustersV2InterestedInScalaDataset,
        dateRange.prepend(Days(14)(timeZone)))
      .withRemoteReadPolicy(ExplicitLocation(ProcAtla))
      .toTypedPipe.map {
        case KeyVal(userId, clustersUserIsInterestedIn) =>
          (userId, clustersUserIsInterestedIn)
      }
  }

  /**
   * Reads 20M145KUpdated InterestedIn data from atla-proc, with a 14-day extended window
   */
  def simClustersInterestedInUpdatedSource(
    dateRange: DateRange,
    timeZone: TimeZone
  ): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
    DAL
      .readMostRecentSnapshot(
        SimclustersV2InterestedIn20M145KUpdatedScalaDataset,
        dateRange.prepend(Days(14)(timeZone))
      )
      .withRemoteReadPolicy(ExplicitLocation(ProcAtla))
      .toTypedPipe.map {
        case KeyVal(userId, clustersUserIsInterestedIn) =>
          (userId, clustersUserIsInterestedIn)
      }
  }

  /**
   * Reads 20M145K2020 InterestedIn data from atla-proc, with a 14-day extended window
   */
  def simClustersInterestedIn2020Source(
    dateRange: DateRange,
    timeZone: TimeZone
  ): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
    DAL
      .readMostRecentSnapshot(
        SimclustersV2InterestedIn20M145K2020ScalaDataset,
        dateRange.prepend(Days(14)(timeZone))
      )
      .withRemoteReadPolicy(ExplicitLocation(ProcAtla))
      .toTypedPipe.map {
        case KeyVal(userId, clustersUserIsInterestedIn) =>
          (userId, clustersUserIsInterestedIn)
      }
  }

  /**
   * Reads InterestedIn data based on ModelVersion from atla-proc, with a 14-day extended window
   */
  def simClustersInterestedInSource(
    modelVersion: ModelVersion,
    dateRange: DateRange,
    timeZone: TimeZone
  ): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
    DAL
      .readMostRecentSnapshot(
        ModelVersionInterestedInDatasetMap(modelVersion),
        dateRange.prepend(Days(14)(timeZone))
      )
      .withRemoteReadPolicy(ExplicitLocation(ProcAtla))
      .toTypedPipe.map {
        case KeyVal(userId, clustersUserIsInterestedIn) =>
          (userId, clustersUserIsInterestedIn)
      }
  }

}
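All of the accessors above share one recipe: look up the dataset (directly or via ModelVersionInterestedInDatasetMap), read the most recent snapshot with a 14-day lookback, and unwrap each KeyVal into a (UserId, ClustersUserIsInterestedIn) tuple. As a rough sketch of how a downstream Scalding job might consume one of them, the adhoc app below counts distinct users with an InterestedIn embedding for the 2020 model; the app name, argument names, and output path are hypothetical, and only the call to simClustersInterestedInSource follows the file above.

package com.twitter.simclusters_v2.scalding.examples // hypothetical package

import com.twitter.scalding.{DateParser, DateRange, Execution, TypedTsv}
import com.twitter.scalding_internal.job.TwitterExecutionApp
import com.twitter.simclusters_v2.hdfs_sources.InterestedInSources
import com.twitter.simclusters_v2.thriftscala.ModelVersion
import java.util.TimeZone

object CountInterestedInUsersAdhoc extends TwitterExecutionApp {
  implicit val tz: TimeZone = TimeZone.getTimeZone("UTC")
  implicit val parser: DateParser = DateParser.default

  override def job: Execution[Unit] = Execution.getConfigMode.flatMap {
    case (config, _) =>
      val args = config.getArgs
      // Expects --date <start> <end> style arguments, as other adhoc jobs in this package do.
      implicit val dateRange: DateRange = DateRange.parse(args.list("date"))
      InterestedInSources
        .simClustersInterestedInSource(ModelVersion.Model20m145k2020, dateRange, tz)
        .keys // keep only the UserId, drop the ClustersUserIsInterestedIn payload
        .distinct
        .writeExecution(TypedTsv[Long](args("outputDir"))) // hypothetical output location
  }
}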
Binary file not shown.
@ -1,86 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources

import com.twitter.scalding.DateRange
import com.twitter.scalding.TypedPipe
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossClusterSameDC
import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation
import com.twitter.scalding_internal.dalv2.remote_access.Proc3Atla
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.simclusters_v2.thriftscala.EmbeddingType
import com.twitter.simclusters_v2.thriftscala.InternalId
import com.twitter.simclusters_v2.thriftscala.ModelVersion
import com.twitter.simclusters_v2.thriftscala.SimClustersEmbedding
import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId
import com.twitter.simclusters_v2.thriftscala.TopSimClustersWithScore

object ProducerEmbeddingSources {

  /**
   * Helper function to retrieve producer SimClusters embeddings with the legacy `TopSimClustersWithScore`
   * value type.
   */
  def producerEmbeddingSourceLegacy(
    embeddingType: EmbeddingType,
    modelVersion: ModelVersion
  )(
    implicit dateRange: DateRange
  ): TypedPipe[(Long, TopSimClustersWithScore)] = {
    val producerEmbeddingDataset = (embeddingType, modelVersion) match {
      case (EmbeddingType.ProducerFollowBasedSemanticCoreEntity, ModelVersion.Model20m145kDec11) =>
        ProducerTopKSimclusterEmbeddingsByFollowScoreScalaDataset
      case (EmbeddingType.ProducerFavBasedSemanticCoreEntity, ModelVersion.Model20m145kDec11) =>
        ProducerTopKSimclusterEmbeddingsByFavScoreScalaDataset
      case (
            EmbeddingType.ProducerFollowBasedSemanticCoreEntity,
            ModelVersion.Model20m145kUpdated) =>
        ProducerTopKSimclusterEmbeddingsByFollowScoreUpdatedScalaDataset
      case (EmbeddingType.ProducerFavBasedSemanticCoreEntity, ModelVersion.Model20m145kUpdated) =>
        ProducerTopKSimclusterEmbeddingsByFavScoreUpdatedScalaDataset
      case (_, _) =>
        throw new ClassNotFoundException(
          "Unsupported embedding type: " + embeddingType + " and model version: " + modelVersion)
    }

    DAL
      .readMostRecentSnapshot(producerEmbeddingDataset).withRemoteReadPolicy(
        AllowCrossClusterSameDC)
      .toTypedPipe.map {
        case KeyVal(producerId, topSimClustersWithScore) =>
          (producerId, topSimClustersWithScore)
      }
  }

  def producerEmbeddingSource(
    embeddingType: EmbeddingType,
    modelVersion: ModelVersion
  )(
    implicit dateRange: DateRange
  ): TypedPipe[(Long, SimClustersEmbedding)] = {
    val producerEmbeddingDataset = (embeddingType, modelVersion) match {
      case (EmbeddingType.AggregatableLogFavBasedProducer, ModelVersion.Model20m145k2020) =>
        AggregatableProducerSimclustersEmbeddingsByLogFavScore2020ScalaDataset
      case (EmbeddingType.AggregatableFollowBasedProducer, ModelVersion.Model20m145k2020) =>
        AggregatableProducerSimclustersEmbeddingsByFollowScore2020ScalaDataset
      case (EmbeddingType.RelaxedAggregatableLogFavBasedProducer, ModelVersion.Model20m145k2020) =>
        AggregatableProducerSimclustersEmbeddingsByLogFavScoreRelaxedFavEngagementThreshold2020ScalaDataset
      case (_, _) =>
        throw new ClassNotFoundException(
          "Unsupported embedding type: " + embeddingType + " and model version: " + modelVersion)
    }

    DAL
      .readMostRecentSnapshot(
        producerEmbeddingDataset
      )
      .withRemoteReadPolicy(ExplicitLocation(Proc3Atla))
      .toTypedPipe
      .map {
        case KeyVal(
              SimClustersEmbeddingId(_, _, InternalId.UserId(producerId: Long)),
              embedding: SimClustersEmbedding) =>
          (producerId, embedding)
      }
  }

}
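Both helpers take the embedding type and model version explicitly and expect the DateRange as an implicit. A minimal sketch of a caller follows; the wrapper object and the dates are illustrative only, and just the producerEmbeddingSource call itself mirrors the code above.

import com.twitter.scalding.{DateOps, DateParser, DateRange, RichDate, TypedPipe}
import com.twitter.simclusters_v2.hdfs_sources.ProducerEmbeddingSources
import com.twitter.simclusters_v2.thriftscala.{EmbeddingType, ModelVersion, SimClustersEmbedding}
import java.util.TimeZone

object ProducerEmbeddingSourceExample { // hypothetical caller
  implicit val tz: TimeZone = DateOps.UTC
  implicit val parser: DateParser = DateParser.default
  // Illustrative one-week window; a real job would derive this from its arguments.
  implicit val dateRange: DateRange = DateRange(RichDate("2023-01-01"), RichDate("2023-01-07"))

  val logFavBasedProducerEmbeddings: TypedPipe[(Long, SimClustersEmbedding)] =
    ProducerEmbeddingSources.producerEmbeddingSource(
      EmbeddingType.AggregatableLogFavBasedProducer,
      ModelVersion.Model20m145k2020)
}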
@ -1,13 +0,0 @@
scala_library(
    sources = ["*.scala"],
    platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        "src/scala/com/twitter/scalding_internal/multiformat/format",
        "src/scala/com/twitter/simclusters_v2/common",
        "src/thrift/com/twitter/hermit/candidate:hermit-candidate-scala",
        "src/thrift/com/twitter/ml/api:embedding-scala",
        "src/thrift/com/twitter/recos/entities:entities-thrift-scala",
        "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala",
    ],
)
Binary file not shown.
Binary file not shown.
@ -1,16 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections

import com.twitter.bijection.Bufferable
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{
  ScalaCompactThrift,
  genericInjection
}
import com.twitter.simclusters_v2.thriftscala.ClusterDetails

object ClusterDetailsInjection {
  val injection = KeyValInjection[(String, Int), ClusterDetails](
    genericInjection(Bufferable.injectionOf[(String, Int)]),
    ScalaCompactThrift(ClusterDetails)
  )
}
Binary file not shown.
@ -1,13 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections

import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift
import com.twitter.simclusters_v2.thriftscala.{TweetsWithScore, DayPartitionedClusterId}

object ClusterTopMediaTweetsInjection {

  val injection = KeyValInjection[DayPartitionedClusterId, TweetsWithScore](
    ScalaCompactThrift(DayPartitionedClusterId),
    ScalaCompactThrift(TweetsWithScore)
  )
}
Binary file not shown.
@ -1,14 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections

import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift
import com.twitter.simclusters_v2.thriftscala.TopKTweetsWithScores
import com.twitter.simclusters_v2.thriftscala.FullClusterId

object ClusterTopTweetsInjection {

  val clusterIdToTopKTweetsInjection = KeyValInjection[FullClusterId, TopKTweetsWithScores](
    ScalaCompactThrift(FullClusterId),
    ScalaCompactThrift(TopKTweetsWithScores)
  )
}
Binary file not shown.
@ -1,16 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections

import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaBinaryThrift
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.Long2BigEndian
import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.thriftscala._

object ClusteringInjections {

  final val OrderedClustersAndMembersInjection: KeyValInjection[
    UserId,
    OrderedClustersAndMembers
  ] =
    KeyValInjection(Long2BigEndian, ScalaBinaryThrift(OrderedClustersAndMembers))
}
Binary file not shown.
@ -1,47 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections

import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaBinaryThrift
import com.twitter.simclusters_v2.thriftscala._
import com.twitter.ml.api.thriftscala.Embedding
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.Long2BigEndian
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift

object EntityEmbeddingsInjections {

  final val EntitySimClustersEmbeddingInjection: KeyValInjection[
    SimClustersEmbeddingId,
    SimClustersEmbedding
  ] =
    KeyValInjection(
      ScalaBinaryThrift(SimClustersEmbeddingId),
      ScalaBinaryThrift(SimClustersEmbedding)
    )

  final val InternalIdEmbeddingInjection: KeyValInjection[
    SimClustersEmbeddingId,
    InternalIdEmbedding
  ] =
    KeyValInjection(
      ScalaBinaryThrift(SimClustersEmbeddingId),
      ScalaBinaryThrift(InternalIdEmbedding)
    )

  final val EntitySimClustersMultiEmbeddingInjection: KeyValInjection[
    SimClustersMultiEmbeddingId,
    SimClustersMultiEmbedding
  ] =
    KeyValInjection(
      ScalaBinaryThrift(SimClustersMultiEmbeddingId),
      ScalaBinaryThrift(SimClustersMultiEmbedding)
    )

  final val UserMbcgEmbeddingInjection: KeyValInjection[
    Long,
    Embedding
  ] =
    KeyValInjection[Long, Embedding](
      Long2BigEndian,
      ScalaCompactThrift(Embedding)
    )
}
Binary file not shown.
@ -1,27 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections

import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{
  Int2BigEndian,
  Long2BigEndian,
  ScalaCompactThrift
}
import com.twitter.simclusters_v2.thriftscala.SimClustersInferredEntities

object InferredEntitiesInjections {

  final val InferredEntityInjection: KeyValInjection[Long, SimClustersInferredEntities] =
    KeyValInjection(
      Long2BigEndian,
      ScalaCompactThrift(SimClustersInferredEntities)
    )

  final val InferredEntityKeyedByClusterInjection: KeyValInjection[
    Int,
    SimClustersInferredEntities
  ] =
    KeyValInjection(
      Int2BigEndian,
      ScalaCompactThrift(SimClustersInferredEntities)
    )
}
Binary file not shown.
@ -1,13 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections

import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.StringUtf8
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.Long2BigEndian
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift
import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn

object InterestedInInjection {
  val injection = KeyValInjection(Long2BigEndian, ScalaCompactThrift(ClustersUserIsInterestedIn))
  val languageInjection =
    KeyValInjection(StringUtf8, ScalaCompactThrift(ClustersUserIsInterestedIn))
}
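Each injection in this directory pairs a key codec with a Thrift value codec for one key-val dataset. The same pattern composes for tuple keys via Bufferable, as ClusterDetailsInjection above shows; below is a sketch for a hypothetical dataset keyed by (userId, language) pairs, reusing only codecs already used in these files.

import com.twitter.bijection.Bufferable
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{
  genericInjection,
  ScalaCompactThrift
}
import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn

// Hypothetical: an InterestedIn dataset keyed by (userId, language) rather than userId alone.
object UserLanguageInterestedInInjection {
  val injection = KeyValInjection[(Long, String), ClustersUserIsInterestedIn](
    genericInjection(Bufferable.injectionOf[(Long, String)]),
    ScalaCompactThrift(ClustersUserIsInterestedIn)
  )
}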
Binary file not shown.
@ -1,12 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections

import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{
  Long2BigEndian,
  ScalaCompactThrift
}
import com.twitter.simclusters_v2.thriftscala._

object KnownForInjection {
  val injection = KeyValInjection(Long2BigEndian, ScalaCompactThrift(ClustersUserIsKnownFor))
}
Binary file not shown.
@ -1,31 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections

import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift
import com.twitter.simclusters_v2.thriftscala.LeftNode
import com.twitter.simclusters_v2.thriftscala.NounWithFrequencyList
import com.twitter.simclusters_v2.thriftscala.RightNode
import com.twitter.simclusters_v2.thriftscala.RightNodeTypeStruct
import com.twitter.simclusters_v2.thriftscala.RightNodeWithEdgeWeightList
import com.twitter.simclusters_v2.thriftscala.SimilarRightNodes
import com.twitter.simclusters_v2.thriftscala.CandidateTweetsList
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.Long2BigEndian

object MultiTypeGraphInjections {
  final val truncatedMultiTypeGraphInjection =
    KeyValInjection(ScalaCompactThrift(LeftNode), ScalaCompactThrift(RightNodeWithEdgeWeightList))
  final val topKRightNounListInjection =
    KeyValInjection(
      ScalaCompactThrift(RightNodeTypeStruct),
      ScalaCompactThrift(NounWithFrequencyList))
  final val similarRightNodesInjection =
    KeyValInjection[RightNode, SimilarRightNodes](
      ScalaCompactThrift(RightNode),
      ScalaCompactThrift(SimilarRightNodes)
    )
  final val tweetRecommendationsInjection =
    KeyValInjection[Long, CandidateTweetsList](
      Long2BigEndian,
      ScalaCompactThrift(CandidateTweetsList)
    )
}
Binary file not shown.
@ -1,45 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections

import com.twitter.hermit.candidate.thriftscala.Candidates
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{
  Long2BigEndian,
  ScalaBinaryThrift,
  ScalaCompactThrift
}
import com.twitter.simclusters_v2.thriftscala.{
  PersistedFullClusterId,
  SimClustersEmbedding,
  SimClustersEmbeddingId,
  TopProducersWithScore,
  TopSimClustersWithScore
}

object ProducerEmbeddingsInjections {
  final val ProducerTopKSimClusterEmbeddingsInjection: KeyValInjection[
    Long,
    TopSimClustersWithScore
  ] =
    KeyValInjection(
      keyCodec = Long2BigEndian,
      valueCodec = ScalaCompactThrift(TopSimClustersWithScore))

  final val SimClusterEmbeddingTopKProducersInjection: KeyValInjection[
    PersistedFullClusterId,
    TopProducersWithScore
  ] =
    KeyValInjection(
      keyCodec = ScalaCompactThrift(PersistedFullClusterId),
      valueCodec = ScalaCompactThrift(TopProducersWithScore))

  final val SimilarUsersInjection: KeyValInjection[Long, Candidates] =
    KeyValInjection(keyCodec = Long2BigEndian, valueCodec = ScalaCompactThrift(Candidates))

  final val ProducerSimClustersEmbeddingInjection: KeyValInjection[
    SimClustersEmbeddingId,
    SimClustersEmbedding
  ] =
    KeyValInjection(
      keyCodec = ScalaBinaryThrift(SimClustersEmbeddingId),
      valueCodec = ScalaBinaryThrift(SimClustersEmbedding))
}
Binary file not shown.
@ -1,53 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections

import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{
  Long2BigEndian,
  ScalaCompactThrift,
  StringUtf8
}
import com.twitter.recos.entities.thriftscala.{
  SemanticCoreEntityScoreList,
  SemanticCoreEntityWithLocale,
  UserIdWithLocale,
  UserScoreList
}

object SemanticCoreEntitiesInjections {

  final val StringToSemanticCoreEntityScoreListInjection: KeyValInjection[
    String,
    SemanticCoreEntityScoreList
  ] =
    KeyValInjection(
      StringUtf8,
      ScalaCompactThrift(SemanticCoreEntityScoreList)
    )

  final val LongToSemanticCoreEntityScoreListInjection: KeyValInjection[
    Long,
    SemanticCoreEntityScoreList
  ] =
    KeyValInjection(
      Long2BigEndian,
      ScalaCompactThrift(SemanticCoreEntityScoreList)
    )

  final val UserWithLocaleToSemanticCoreEntityScoreListInjection: KeyValInjection[
    UserIdWithLocale,
    SemanticCoreEntityScoreList
  ] =
    KeyValInjection(
      ScalaCompactThrift(UserIdWithLocale),
      ScalaCompactThrift(SemanticCoreEntityScoreList)
    )

  final val SemanticCoreEntityWithLocaleToUsersScoreListInjection: KeyValInjection[
    SemanticCoreEntityWithLocale,
    UserScoreList
  ] =
    KeyValInjection(
      ScalaCompactThrift(SemanticCoreEntityWithLocale),
      ScalaCompactThrift(UserScoreList)
    )
}
Binary file not shown.
@ -1,12 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections

import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{
  Long2BigEndian,
  ScalaCompactThrift
}
import com.twitter.simclusters_v2.thriftscala.SingleSideUserScores

object SingleSideUserScoresInjection {
  val injection = KeyValInjection(Long2BigEndian, ScalaCompactThrift(SingleSideUserScores))
}
@ -1,60 +0,0 @@
scala_library(
    sources = ["*.scala"],
    platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        ":data_sources",
        "3rdparty/src/jvm/com/twitter/scalding:core",
        "src/scala/com/twitter/scalding_internal/dalv2",
        "src/scala/com/twitter/scalding_internal/multiformat/format",
        "src/scala/com/twitter/scalding_internal/source/lzo_scrooge",
        "src/scala/com/twitter/simclusters_v2/common",
        "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala",
        "src/thrift/com/twitter/wtf/entity_real_graph:entity_real_graph-thrift-scala",
    ],
)

scala_library(
    name = "data_sources",
    sources = [],
    description = "DAL datasets we wish to expose externally",
    platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        ":reverse_index_semantic_core_per_language_simclusters_embeddings_presto-scala",
        ":semantic_core_per_language_simclusters_embeddings_presto-scala",
        "src/scala/com/twitter/simclusters_v2/common",
    ],
)

create_datasets(
    base_name = "reverse_index_semantic_core_per_language_simclusters_embeddings_presto",
    java_schema = "com.twitter.simclusters_v2.thriftjava.InternalIdEmbeddingWithId",
    platform = "java8",
    role = "cassowary",
    scala_schema = "com.twitter.simclusters_v2.thriftscala.InternalIdEmbeddingWithId",
    segment_type = "snapshot",
    tags = ["bazel-compatible"],
    java_dependencies = [
        "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java",
    ],
    scala_dependencies = [
        "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala",
    ],
)

create_datasets(
    base_name = "semantic_core_per_language_simclusters_embeddings_presto",
    java_schema = "com.twitter.simclusters_v2.thriftjava.SimClustersEmbeddingWithId",
    platform = "java8",
    role = "cassowary",
    scala_schema = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingWithId",
    segment_type = "snapshot",
    tags = ["bazel-compatible"],
    java_dependencies = [
        "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java",
    ],
    scala_dependencies = [
        "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala",
    ],
)
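The two create_datasets targets above differ only in base_name and in the Java/Scala schema classes; registering another presto export of the same shape would follow the same recipe. As a sketch, with a hypothetical base name and the schema classes reused from above:

create_datasets(
    base_name = "fav_based_per_language_simclusters_embeddings_presto",  # hypothetical name
    java_schema = "com.twitter.simclusters_v2.thriftjava.SimClustersEmbeddingWithId",
    platform = "java8",
    role = "cassowary",
    scala_schema = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingWithId",
    segment_type = "snapshot",
    tags = ["bazel-compatible"],
    java_dependencies = [
        "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java",
    ],
    scala_dependencies = [
        "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala",
    ],
)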
Binary file not shown.
Binary file not shown.
@ -1,10 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.presto_hdfs_sources

object EntityEmbeddingsPrestoSources {

  final val SemanticCorePerLanguageSimClustersEmbeddingsDataset =
    SemanticCorePerLanguageSimclustersEmbeddingsPrestoScalaDataset

  final val ReverseIndexSemanticCorePerLanguageSimClustersEmbeddingsDataset =
    ReverseIndexSemanticCorePerLanguageSimclustersEmbeddingsPrestoScalaDataset
}
Binary file not shown.
Before Width: | Height: | Size: 61 KiB |
Binary file not shown.
Before Width: | Height: | Size: 66 KiB |
Binary file not shown.
Before Width: | Height: | Size: 26 KiB |
Binary file not shown.
Before Width: | Height: | Size: 71 KiB |
Binary file not shown.
Before Width: | Height: | Size: 233 KiB |
Binary file not shown.
Before Width: | Height: | Size: 70 KiB |
@ -1,521 +0,0 @@
scala_library(
    sources = ["*.scala"],
    platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        "3rdparty/jvm/com/fasterxml/jackson:jackson-module-scala",
        "3rdparty/jvm/com/fasterxml/jackson/core:jackson-core",
        "3rdparty/jvm/com/fasterxml/jackson/core:jackson-databind",
        "3rdparty/jvm/com/fasterxml/jackson/module:jackson-module-scala",
        "3rdparty/jvm/com/googlecode/matrix-toolkits-java",
        "3rdparty/jvm/com/twitter/storehaus:algebra",
        "3rdparty/jvm/com/twitter/storehaus:core",
        "escherbird/src/scala/com/twitter/escherbird/scalding/source",
        "flockdb-tools/datasets/flock:flock-follows-edges-scala",
        "src/java/com/twitter/ml/api/constant",
        "src/java/com/twitter/sbf/core",
        "src/java/com/twitter/sbf/graph",
        "src/scala/com/twitter/frigate/user_sampler/common",
        "src/scala/com/twitter/ml/api:api-base",
        "src/scala/com/twitter/ml/api/bq",
        "src/scala/com/twitter/pluck/source/cassowary:sims",
        "src/scala/com/twitter/pluck/source/core_workflows/user_model:condensed_user_state-scala",
        "src/scala/com/twitter/scalding_internal/dalv2",
        "src/scala/com/twitter/scalding_internal/job",
        "src/scala/com/twitter/scalding_internal/job/analytics_batch",
        "src/scala/com/twitter/scalding_internal/source",
        "src/scala/com/twitter/scalding_internal/source/lzo_scrooge",
        "src/scala/com/twitter/simclusters_v2/candidate_source",
        "src/scala/com/twitter/simclusters_v2/hdfs_sources",
        "src/scala/com/twitter/simclusters_v2/scalding/common",
        "src/scala/com/twitter/simclusters_v2/summingbird/common",
        "src/scala/com/twitter/timelines/prediction/features/common",
        "src/scala/com/twitter/timelines/prediction/features/itl",
        "src/scala/com/twitter/timelines/prediction/features/recap",
        "src/scala/com/twitter/wtf/entity_real_graph/scalding/common",
        "src/thrift/com/twitter/hermit/candidate:hermit-candidate-scala",
        "src/thrift/com/twitter/wtf/scalding/sims:sims-thrift-scala",
        "twadoop_config/configuration/log_categories/group/recos-platform:content_recommender_get_content_recommendations-scala",
        "twadoop_config/configuration/log_categories/group/recos-platform:content_recommender_get_topic_tweets_recommendations-scala",
        "twadoop_config/configuration/log_categories/group/timeline:timeline_service_favorites-scala",
        "usersource/snapshot/src/main/scala/com/twitter/usersource/snapshot/flat:usersource_flat-scala",
        "usersource/snapshot/src/main/thrift/com/twitter/usersource/snapshot/flat:flat-scala",
        "util/util-core:util-core-util",
    ],
)

hadoop_binary(
    name = "evd_cluster_similarity",
    main = "com.twitter.simclusters_v2.scalding.EigenVectorsForClusterSimilarityAdhoc",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "cluster_evaluation",
    main = "com.twitter.simclusters_v2.scalding.ClusterEvaluationAdhoc",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "cluster_evaluation_20m_145k",
    main = "com.twitter.simclusters_v2.scalding.ClusterEvaluationFor20M145K",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "cluster_evaluation_20m_145k_2020",
    main = "com.twitter.simclusters_v2.scalding.ClusterEvaluationFor20M145K2020",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "bp_cluster_evaluation",
    main = "com.twitter.simclusters_v2.scalding.BipartiteClusterEvaluation",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "update_knownfor",
    main = "com.twitter.simclusters_v2.scalding.UpdateKnownForAdhoc",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "update_knownfor_prod",
    main = "com.twitter.simclusters_v2.scalding.UpdateKnownFor20M145K",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "cluster_details",
    main = "com.twitter.simclusters_v2.scalding.ClusterDetailsBatch",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "cluster_details_20m_145k_updated",
    main = "com.twitter.simclusters_v2.scalding.ClusterDetails20M145KUpdated",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "cluster_details_20m_145k_2020",
    main = "com.twitter.simclusters_v2.scalding.ClusterDetails20M145K2020",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "cluster_details-adhoc",
    main = "com.twitter.simclusters_v2.scalding.ClusterDetailsAdhoc",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "cluster_details-dump",
    main = "com.twitter.simclusters_v2.scalding.DumpClusterDetailsAdhoc",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "interested_in",
    main = "com.twitter.simclusters_v2.scalding.InterestedInFromKnownForBatch",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "interested_in_from_producer_embeddings",
    main = "com.twitter.simclusters_v2.scalding.InterestedInFromProducerEmbeddingsBatchApp",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "employee_graph_from_user_user",
    main = "com.twitter.simclusters_v2.scalding.EmployeeGraphFromUserUser",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "interested_in_20m_145k_updated",
    main = "com.twitter.simclusters_v2.scalding.InterestedInFromKnownFor20M145KUpdated",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "interested_in_20m_145k_2020",
    main = "com.twitter.simclusters_v2.scalding.InterestedInFromKnownFor20M145K2020",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "interested_in_lite_20m_145k_2020",
    main = "com.twitter.simclusters_v2.scalding.InterestedInFromKnownForLite20M145K2020",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "interested_in_lite_20m_145k_2020-adhoc",
    main = "com.twitter.simclusters_v2.scalding.InterestedInFromKnownForLite20M145K2020Adhoc",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "interested_in_from_ape_2020-adhoc",
    main = "com.twitter.simclusters_v2.scalding.InterestedInFromAPE2020AdhocApp",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "interested_in_from_ape_2020",
    main = "com.twitter.simclusters_v2.scalding.InterestedInFromAPE2020BatchApp",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "known_for_to_mh",
    main = "com.twitter.simclusters_v2.scalding.KnownForToMHBatch",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "user_user_normalized_graph",
    main = "com.twitter.simclusters_v2.scalding.UserUserNormalizedGraphBatch",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "user_user_graph",
    main = "com.twitter.simclusters_v2.scalding.UserUserGraphBatch",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "user_user_graph-adhoc",
    main = "com.twitter.simclusters_v2.scalding.UserUserGraphAdhoc",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "producer_norms_and_counts",
    main = "com.twitter.simclusters_v2.scalding.ProducerNormsAndCountsBatch",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "fav_graph",
    main = "com.twitter.simclusters_v2.scalding.UserUserFavGraphBatch",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "top_users_similarity_graph",
    main = "com.twitter.simclusters_v2.scalding.TopUsersSimilarityGraphApp",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "top_users_only",
    main = "com.twitter.simclusters_v2.scalding.TopUsersOnlyApp",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "dump_fav_graph_adhoc",
    main = "com.twitter.simclusters_v2.scalding.DumpFavGraphAdhoc",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

# Generated with `capesospy-v2 create_target interested_in_for_20M_145k_2020 src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml`, config hash 8f19bf.
scalding_job(
    name = "interested_in_for_20M_145k_2020",
    main = "com.twitter.simclusters_v2.scalding.InterestedInFromKnownFor20M145K2020",
    args = ["--socialProofThreshold 2 --maxClustersPerUser 50"],
    config = [
        ("hadoop.combine-input", "true"),
        ("hadoop.map.jvm.total-memory", "3072m"),
        ("hadoop.reduce.jvm.total-memory", "3072m"),
        ("hadoop.submitter.jvm.total-memory", "5120m"),
        ("submitter.tier", "preemptible"),
    ],
    cron = "14 * * * *",
    hadoop_cluster = "atla-proc",
    platform = "java8",
    role = "cassowary",
    runtime_platform = "java8",
    tags = ["bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)
BIN
src/scala/com/twitter/simclusters_v2/scalding/BUILD.docx
Normal file
Binary file not shown.
Binary file not shown.
@ -1,513 +0,0 @@
package com.twitter.simclusters_v2.scalding

import com.twitter.algebird.Aggregator
import com.twitter.algebird.Monoid
import com.twitter.scalding._
import com.twitter.scalding.commons.source.VersionedKeyValSource
import com.twitter.scalding.typed.TypedPipe
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation
import com.twitter.scalding_internal.dalv2.remote_access.ProcAtla
import com.twitter.scalding_internal.job.TwitterExecutionApp
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.simclusters_v2.hdfs_sources.AdhocKeyValSources
import com.twitter.simclusters_v2.hdfs_sources.NormsAndCountsFixedPathSource
import com.twitter.simclusters_v2.hdfs_sources.ProducerNormsAndCountsScalaDataset
import com.twitter.simclusters_v2.hdfs_sources.SimclustersV2InterestedInScalaDataset
import com.twitter.simclusters_v2.hdfs_sources.UserAndNeighborsFixedPathSource
import com.twitter.simclusters_v2.hdfs_sources.UserUserNormalizedGraphScalaDataset
import com.twitter.simclusters_v2.scalding.BipartiteClusterEvaluationClasses._
import com.twitter.simclusters_v2.scalding.common.TypedRichPipe._
import com.twitter.simclusters_v2.scalding.common.Util
import com.twitter.simclusters_v2.thriftscala.BipartiteClusterQuality
import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn
import com.twitter.simclusters_v2.thriftscala.NeighborWithWeights
import com.twitter.simclusters_v2.thriftscala.NormsAndCounts
import com.twitter.simclusters_v2.thriftscala.UserAndNeighbors
import scala.collection.JavaConverters._

object BipartiteClusterEvaluation extends TwitterExecutionApp {

  implicit val tz: java.util.TimeZone = DateOps.UTC
  implicit val dp = DateParser.default

  private def getClusterL2Norms(
    knownFor: TypedPipe[(Long, Array[(Int, Float)])]
  ): Execution[Map[Int, Float]] = {
    knownFor
      .flatMap {
        case (_, clusterArray) =>
          clusterArray.map {
            case (clusterId, score) =>
              Map(clusterId -> score * score)
          }
      }
      .sum
      .getExecution
      .map(_.mapValues { x => math.sqrt(x).toFloat })
  }

  def l2NormalizeKnownFor(
    knownFor: TypedPipe[(Long, Array[(Int, Float)])]
  ): Execution[TypedPipe[(Long, Array[(Int, Float)])]] = {
    getClusterL2Norms(knownFor).map { clusterToNorms =>
      knownFor.mapValues { clusterScoresArray =>
        clusterScoresArray.map {
          case (clusterId, score) =>
            (clusterId, score / clusterToNorms(clusterId))
        }
      }
    }
  }

  /**
   * ./bazel bundle src/scala/com/twitter/simclusters_v2/scalding:bp_cluster_evaluation && \
   * oscar hdfs --user frigate --host hadoopnest2.atla.twitter.com --bundle bp_cluster_evaluation \
   * --tool com.twitter.simclusters_v2.scalding.BipartiteClusterEvaluation --screen --screen-detached \
   * --tee logs/newBpQuality_updateUnnormalizedScores_interestedInUsing20190329Graph_evaluatedOn20190329Graph_run2 \
   * -- --normsAndCountsDir /user/frigate/your_ldap/producerNormsAndCounts_20190330 \
   * --graphInputDir /user/frigate/your_ldap/user_user_normalized_graph_copiedFromAtlaProc_20190329 \
   * --knownForDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/knownFor \
   * --interestedInDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/interestedInUsing20190329Graph \
   * --outgoingVolumesResultsDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/bpQualityForInterestedInUsing20190329On20190329Graph_outgoingVolumes \
   * --incomingVolumesResultsDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/bpQualityForInterestedInUsing20190329On20190329Graph_incomingVolumes \
   * --outputDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/bpQualityForInterestedInUsing20190329On20190329Graph_perCluster \
   * --toEmailAddress your_ldap@twitter.com --modelVersion 20M_145K_updated
   */
  override def job: Execution[Unit] = Execution.getConfigMode.flatMap {
    case (config, mode) =>
      Execution.withId { implicit uniqueId =>
        val args = config.getArgs

        val interestedIn = args.optional("interestedInDir") match {
          case Some(dir) =>
            TypedPipe
              .from(AdhocKeyValSources.interestedInSource(args("interestedInDir")))
          case None =>
            DAL
              .readMostRecentSnapshotNoOlderThan(
                SimclustersV2InterestedInScalaDataset,
                Days(20)
              )
              .withRemoteReadPolicy(ExplicitLocation(ProcAtla))
              .toTypedPipe
              .map {
                case KeyVal(key, value) => (key, value)
              }
        }

        val inputKnownFor = args
          .optional("knownForDir")
          .map { location => KnownForSources.readKnownFor(location) }
          .getOrElse(KnownForSources.knownFor_20M_Dec11_145K)

        val modelVersion =
          args.optional("modelVersion").getOrElse("20M_145K_dec11")

        val useLogFavWeights = args.boolean("useLogFavWeights")

        val shouldL2NormalizeKnownFor = args.boolean("l2NormalizeKnownFor")

        val toEmailAddressOpt = args.optional("toEmailAddress")

        val knownForExec = if (shouldL2NormalizeKnownFor) {
          l2NormalizeKnownFor(inputKnownFor)
        } else {
          Execution.from(inputKnownFor)
        }

        val finalExec = knownForExec.flatMap { knownFor =>
          val graph = args.optional("graphInputDir") match {
            case Some(dir) =>
              TypedPipe.from(UserAndNeighborsFixedPathSource(dir))
            case None =>
              DAL
                .readMostRecentSnapshotNoOlderThan(UserUserNormalizedGraphScalaDataset, Days(20))
                .withRemoteReadPolicy(ExplicitLocation(ProcAtla))
                .toTypedPipe
          }

          val producerNormsAndCounts = args.optional("normsAndCountsDir") match {
            case Some(dir) =>
              TypedPipe.from(NormsAndCountsFixedPathSource(args(dir)))
            case None =>
              DAL
                .readMostRecentSnapshotNoOlderThan(ProducerNormsAndCountsScalaDataset, Days(20))
                .withRemoteReadPolicy(ExplicitLocation(ProcAtla))
                .toTypedPipe
          }

          val clusterIncomingVolumesExec = loadOrMake(
            computeClusterIncomingVolumes(knownFor, producerNormsAndCounts, useLogFavWeights),
            modelVersion,
            args("incomingVolumesResultsDir")
          )

          val resultsWithOutgoingVolumesExec = loadOrMake(
            getResultsWithOutgoingVolumes(graph, interestedIn, useLogFavWeights),
            modelVersion,
            args("outgoingVolumesResultsDir")
          )

          val finalPerClusterResultsExec =
            finalPerClusterResults(
              knownFor,
              interestedIn,
              resultsWithOutgoingVolumesExec,
              clusterIncomingVolumesExec)
              .flatMap { pipe => loadOrMake(pipe, modelVersion, args("outputDir")) }

          finalPerClusterResultsExec.flatMap { finalPerClusterResults =>
            val perClusterResults = finalPerClusterResults.values
            val distributionResultsExec = getClusterResultsSummary(perClusterResults).map {
              case Some(summary) =>
                "Summary of results across clusters: \n" +
                  Util.prettyJsonMapper.writeValueAsString(summary)
              case _ =>
                "No summary of results! The cluster level results pipe must be empty!"
            }

            val overallResultsExec = perClusterResults.sum.toOptionExecution.map {
              case Some(overallQuality) =>
                "Overall Quality: \n" +
                  Util.prettyJsonMapper.writeValueAsString(
                    printableBipartiteQuality(overallQuality)
                  )
              case _ =>
                "No overall quality! The cluster level results pipe must be empty!"
            }

            Execution.zip(distributionResultsExec, overallResultsExec).map {
              case (distResults, overallResults) =>
                toEmailAddressOpt.foreach { address =>
                  Util.sendEmail(
                    distResults + "\n" + overallResults,
                    "Bipartite cluster quality for " + modelVersion,
                    address
                  )
                }
                println(distResults + "\n" + overallResults)
            }
          }
        }
        Util.printCounters(finalExec)
      }
  }

  def getResultsWithOutgoingVolumes(
    graph: TypedPipe[UserAndNeighbors],
    interestedIn: TypedPipe[(Long, ClustersUserIsInterestedIn)],
    useLogFavWeights: Boolean
  ): TypedPipe[(Int, BipartiteClusterQuality)] = {
    graph
      .map { un => (un.userId, un.neighbors) }
      // should this be a leftJoin? For now, leaving it as an inner join. If in the future,
      // we want to compare two approaches with very different coverages on interestedIn, this
      // could become a problem.
      .join(interestedIn)
      .withReducers(4000)
      .flatMap {
        case (userId, (neighbors, clusters)) =>
          getBIResultsFromSingleUser(userId, neighbors, clusters, useLogFavWeights)
      }
      .sumByKey
      .withReducers(600)
      .map {
        case (clusterId, bir) =>
          (
            clusterId,
            BipartiteClusterQuality(
              inClusterFollowEdges = Some(bir.inClusterWeights.isFollowEdge),
              inClusterFavEdges = Some(bir.inClusterWeights.isFavEdge),
              favWtSumOfInClusterFollowEdges = Some(bir.inClusterWeights.favWtIfFollowEdge),
              favWtSumOfInClusterFavEdges = Some(bir.inClusterWeights.favWtIfFavEdge),
              outgoingFollowEdges = Some(bir.totalOutgoingVolumes.isFollowEdge),
              outgoingFavEdges = Some(bir.totalOutgoingVolumes.isFavEdge),
              favWtSumOfOutgoingFollowEdges = Some(bir.totalOutgoingVolumes.favWtIfFollowEdge),
              favWtSumOfOutgoingFavEdges = Some(bir.totalOutgoingVolumes.favWtIfFavEdge),
              interestedInSize = Some(bir.interestedInSize),
              sampledEdges = Some(
                bir.edgeSample
                  .iterator()
                  .asScala
                  .toSeq
                  .map {
                    case (edge, data) => makeThriftSampledEdge(edge, data)
                  }
              )
            )
          )
      }
  }

  def getBIResultsFromSingleUser(
    userId: Long,
    neighbors: Seq[NeighborWithWeights],
    clusters: ClustersUserIsInterestedIn,
    useLogFavScores: Boolean
  ): List[(Int, BipartiteIntermediateResults)] = {
    val neighborsToWeights = neighbors.map { neighborAndWeights =>
      val isFollowEdge = neighborAndWeights.isFollowed match {
        case Some(true) => 1.0
        case _ => 0.0
      }
      val favScore = if (useLogFavScores) {
        neighborAndWeights.logFavScore.getOrElse(0.0)
      } else neighborAndWeights.favScoreHalfLife100Days.getOrElse(0.0)
      val isFavEdge = math.min(1, math.ceil(favScore))
      neighborAndWeights.neighborId -> Weights(
        isFollowEdge,
        isFavEdge,
        favScore * isFollowEdge,
        favScore
      )
    }.toMap

    val outgoingVolumes = Monoid.sum(neighborsToWeights.values)(WeightsMonoid)

    clusters.clusterIdToScores.toList.map {
      case (clusterId, scoresStruct) =>
        val inClusterNeighbors =
          (scoresStruct.usersBeingFollowed.getOrElse(Nil) ++
            scoresStruct.usersThatWereFaved.getOrElse(Nil)).toSet
        val edgesForSampling = inClusterNeighbors.flatMap { neighborId =>
          if (neighborsToWeights.contains(neighborId)) {
            Some(
              (userId, neighborId),
              SampledEdgeData(
                neighborsToWeights(neighborId).favWtIfFollowEdge,
                neighborsToWeights(neighborId).favWtIfFavEdge,
                scoresStruct.followScore.getOrElse(0.0),
                scoresStruct.favScore.getOrElse(0.0)
              )
            )
          } else {
            None
          }
        }

        val inClusterWeights =
          Monoid.sum(neighborsToWeights.filterKeys(inClusterNeighbors).values)(WeightsMonoid)

        (
          clusterId,
          BipartiteIntermediateResults(
            inClusterWeights,
            outgoingVolumes,
            1,
            samplerMonoid.build(edgesForSampling)
          ))
    }
  }

  def computeClusterIncomingVolumes(
    knownFor: TypedPipe[(Long, Array[(Int, Float)])],
    producerNormsAndCounts: TypedPipe[NormsAndCounts],
    useLogFavWeights: Boolean
  ): TypedPipe[(Int, BipartiteClusterQuality)] = {
    producerNormsAndCounts
      .map { x => (x.userId, x) }
      .join(knownFor)
      .withReducers(100)
      .flatMap {
        case (userId, (normsAndCounts, clusters)) =>
          clusters.map {
            case (clusterId, _) =>
              val followerCount =
                normsAndCounts.followerCount.getOrElse(0L).toDouble
              val faverCount = normsAndCounts.faverCount.getOrElse(0L).toDouble
              val favWtSumOfIncomingFollows = if (useLogFavWeights) {
                normsAndCounts.logFavWeightsOnFollowEdgesSum.getOrElse(0.0)
|
|
||||||
} else {
|
|
||||||
normsAndCounts.favWeightsOnFollowEdgesSum.getOrElse(0.0)
|
|
||||||
}
|
|
||||||
val favWtSumOfIncomingFavs = if (useLogFavWeights) {
|
|
||||||
normsAndCounts.logFavWeightsOnFavEdgesSum.getOrElse(0.0)
|
|
||||||
} else {
|
|
||||||
normsAndCounts.favWeightsOnFavEdgesSum.getOrElse(0.0)
|
|
||||||
}
|
|
||||||
(
|
|
||||||
clusterId,
|
|
||||||
BipartiteClusterQuality(
|
|
||||||
incomingFollowEdges = Some(followerCount),
|
|
||||||
incomingFavEdges = Some(faverCount),
|
|
||||||
favWtSumOfIncomingFollowEdges = Some(favWtSumOfIncomingFollows),
|
|
||||||
favWtSumOfIncomingFavEdges = Some(favWtSumOfIncomingFavs)
|
|
||||||
))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
.sumByKey
|
|
||||||
.toTypedPipe
|
|
||||||
}
|
|
||||||
|
|
||||||
def loadOrMake(
|
|
||||||
pipe: TypedPipe[(Int, BipartiteClusterQuality)],
|
|
||||||
modelVersion: String,
|
|
||||||
path: String
|
|
||||||
): Execution[TypedPipe[(Int, BipartiteClusterQuality)]] = {
|
|
||||||
val mapped = pipe.map {
|
|
||||||
case (clusterId, struct) => ((modelVersion, clusterId), struct)
|
|
||||||
}
|
|
||||||
makeForKeyValSource(mapped, AdhocKeyValSources.bipartiteQualitySource(path), path).map { pipe =>
|
|
||||||
// discard model version
|
|
||||||
pipe.map { case ((_, clusterId), struct) => (clusterId, struct) }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
def makeForKeyValSource[K, V](
|
|
||||||
pipe: TypedPipe[(K, V)],
|
|
||||||
dest: VersionedKeyValSource[K, V],
|
|
||||||
path: String
|
|
||||||
): Execution[TypedPipe[(K, V)]] =
|
|
||||||
Execution.getMode.flatMap { mode =>
|
|
||||||
if (dest.resourceExists(mode)) {
|
|
||||||
println(s"validated path $path")
|
|
||||||
Execution.from(TypedPipe.from(dest))
|
|
||||||
} else {
|
|
||||||
println(s"Could not load from $path")
|
|
||||||
pipe.writeThrough(dest)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
def precisionOfWholeGraph(
|
|
||||||
knownFor: TypedPipe[(Long, Array[(Int, Float)])],
|
|
||||||
interestedIn: TypedPipe[(Long, ClustersUserIsInterestedIn)],
|
|
||||||
clusterIncomingVolumesExec: Execution[TypedPipe[(Int, BipartiteClusterQuality)]]
|
|
||||||
): Execution[Option[Double]] = {
|
|
||||||
val knownForSizeExec = knownFor.aggregate(Aggregator.size).toOptionExecution
|
|
||||||
val interestedInSizeExec =
|
|
||||||
interestedIn.aggregate(Aggregator.size).toOptionExecution
|
|
||||||
val numExec = clusterIncomingVolumesExec.flatMap { volumes =>
|
|
||||||
volumes.values.flatMap(_.favWtSumOfIncomingFavEdges).sum.toOptionExecution
|
|
||||||
}
|
|
||||||
Execution.zip(numExec, interestedInSizeExec, knownForSizeExec).map {
|
|
||||||
case (Some(num), Some(interestedInSize), Some(knownForSize)) =>
|
|
||||||
Some(num / interestedInSize / knownForSize)
|
|
||||||
case x @ _ =>
|
|
||||||
println("Precision of whole graph zip: " + x)
|
|
||||||
None
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
def finalPerClusterResults(
|
|
||||||
knownFor: TypedPipe[(Long, Array[(Int, Float)])],
|
|
||||||
interestedIn: TypedPipe[(Long, ClustersUserIsInterestedIn)],
|
|
||||||
resultsWithOutgoingVolumesExec: Execution[TypedPipe[(Int, BipartiteClusterQuality)]],
|
|
||||||
incomingVolumesExec: Execution[TypedPipe[(Int, BipartiteClusterQuality)]]
|
|
||||||
): Execution[TypedPipe[(Int, BipartiteClusterQuality)]] = {
|
|
||||||
val knownForTranspose = KnownForSources.transpose(knownFor)
|
|
||||||
|
|
||||||
val precisionOfWholeGraphExec =
|
|
||||||
precisionOfWholeGraph(knownFor, interestedIn, incomingVolumesExec)
|
|
||||||
|
|
||||||
Execution
|
|
||||||
.zip(resultsWithOutgoingVolumesExec, incomingVolumesExec, precisionOfWholeGraphExec)
|
|
||||||
.map {
|
|
||||||
case (resultsWithOutgoingVolumes, clusterIncomingVolumes, precisionOfWholeGraph) =>
|
|
||||||
println("Precision of whole graph " + precisionOfWholeGraph)
|
|
||||||
resultsWithOutgoingVolumes
|
|
||||||
.join(knownForTranspose)
|
|
||||||
.leftJoin(clusterIncomingVolumes)
|
|
||||||
.withReducers(500)
|
|
||||||
.map {
|
|
||||||
case (clusterId, ((outgoingVolumeQuality, knownForList), incomingVolumesOpt)) =>
|
|
||||||
val incomingVolumes =
|
|
||||||
incomingVolumesOpt.getOrElse(BipartiteClusterQuality())
|
|
||||||
val knownForMap = knownForList.toMap
|
|
||||||
(
|
|
||||||
clusterId,
|
|
||||||
getFullQuality(
|
|
||||||
outgoingVolumeQuality,
|
|
||||||
incomingVolumes,
|
|
||||||
knownForMap,
|
|
||||||
precisionOfWholeGraph))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
def getFullQuality(
|
|
||||||
qualityWithOutgoingVolumes: BipartiteClusterQuality,
|
|
||||||
incomingVolumes: BipartiteClusterQuality,
|
|
||||||
knownFor: Map[Long, Float],
|
|
||||||
precisionOfWholeGraph: Option[Double]
|
|
||||||
): BipartiteClusterQuality = {
|
|
||||||
val newSampledEdges = qualityWithOutgoingVolumes.sampledEdges.map { sampledEdges =>
|
|
||||||
sampledEdges.map { sampledEdge =>
|
|
||||||
val knownForScore = knownFor.getOrElse(sampledEdge.followeeId, 0.0f)
|
|
||||||
sampledEdge.copy(
|
|
||||||
predictedFollowScore = sampledEdge.followScoreToCluster.map { x => x * knownForScore },
|
|
||||||
predictedFavScore = sampledEdge.favScoreToCluster.map { x => x * knownForScore }
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
val correlationOfFavWtIfFollow = newSampledEdges.map { samples =>
|
|
||||||
val pairs = samples.map { s =>
|
|
||||||
(s.predictedFollowScore.getOrElse(0.0), s.favWtIfFollowEdge.getOrElse(0.0))
|
|
||||||
}
|
|
||||||
Util.computeCorrelation(pairs.iterator)
|
|
||||||
}
|
|
||||||
val correlationOfFavWtIfFav = newSampledEdges.map { samples =>
|
|
||||||
val pairs = samples.map { s =>
|
|
||||||
(s.predictedFavScore.getOrElse(0.0), s.favWtIfFavEdge.getOrElse(0.0))
|
|
||||||
}
|
|
||||||
Util.computeCorrelation(pairs.iterator)
|
|
||||||
}
|
|
||||||
val relativePrecisionNum = {
|
|
||||||
if (qualityWithOutgoingVolumes.interestedInSize.exists(_ > 0) && knownFor.nonEmpty) {
|
|
||||||
qualityWithOutgoingVolumes.favWtSumOfInClusterFavEdges
|
|
||||||
.getOrElse(0.0) / qualityWithOutgoingVolumes.interestedInSize.get / knownFor.size
|
|
||||||
} else 0.0
|
|
||||||
}
|
|
||||||
val relativePrecision = if (precisionOfWholeGraph.exists(_ > 0.0)) {
|
|
||||||
Some(relativePrecisionNum / precisionOfWholeGraph.get)
|
|
||||||
} else None
|
|
||||||
qualityWithOutgoingVolumes.copy(
|
|
||||||
incomingFollowEdges = incomingVolumes.incomingFollowEdges,
|
|
||||||
incomingFavEdges = incomingVolumes.incomingFavEdges,
|
|
||||||
favWtSumOfIncomingFollowEdges = incomingVolumes.favWtSumOfIncomingFollowEdges,
|
|
||||||
favWtSumOfIncomingFavEdges = incomingVolumes.favWtSumOfIncomingFavEdges,
|
|
||||||
knownForSize = Some(knownFor.size),
|
|
||||||
correlationOfFavWtIfFollowWithPredictedFollow = correlationOfFavWtIfFollow,
|
|
||||||
correlationOfFavWtIfFavWithPredictedFav = correlationOfFavWtIfFav,
|
|
||||||
sampledEdges = newSampledEdges,
|
|
||||||
relativePrecisionUsingFavWtIfFav = relativePrecision,
|
|
||||||
averagePrecisionOfWholeGraphUsingFavWtIfFav = precisionOfWholeGraph
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
object DumpBpQuality extends TwitterExecutionApp {
|
|
||||||
def job: Execution[Unit] = Execution.getConfigMode.flatMap {
|
|
||||||
case (config, mode) =>
|
|
||||||
Execution.withId { implicit uniqueId =>
|
|
||||||
val args = config.getArgs
|
|
||||||
val inputDir = args("inputDir")
|
|
||||||
|
|
||||||
val clusters = args.list("clusters").map(_.toInt).toSet
|
|
||||||
val input =
|
|
||||||
TypedPipe
|
|
||||||
.from(AdhocKeyValSources.bipartiteQualitySource(inputDir))
|
|
||||||
.map {
|
|
||||||
case ((modelVersion, clusterId), quality) =>
|
|
||||||
(
|
|
||||||
(modelVersion, clusterId),
|
|
||||||
BipartiteClusterEvaluationClasses
|
|
||||||
.printableBipartiteQuality(quality))
|
|
||||||
}
|
|
||||||
|
|
||||||
if (clusters.isEmpty) {
|
|
||||||
input.printSummary("Bipartite quality")
|
|
||||||
} else {
|
|
||||||
input
|
|
||||||
.collect {
|
|
||||||
case rec @ ((_, clusterId), quality) if clusters(clusterId) =>
|
|
||||||
Util.prettyJsonMapper
|
|
||||||
.writeValueAsString(rec)
|
|
||||||
.replaceAll("\n", " ")
|
|
||||||
}
|
|
||||||
.toIterableExecution
|
|
||||||
.map { strings => println(strings.mkString("\n")) }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
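For reference, the relative-precision figure computed in getFullQuality above boils down to a small calculation. The standalone sketch below restates it outside the Scalding pipeline; the names RelativePrecisionSketch, inClusterFavWt, interestedInSize, knownForSize and wholeGraphPrecision are illustrative stand-ins, not identifiers from the job.

// Minimal sketch (not part of the job) of the arithmetic behind relativePrecisionUsingFavWtIfFav:
// numerator = favWtSumOfInClusterFavEdges / interestedInSize / knownForSize,
// then normalized by the whole-graph precision when that value is positive.
object RelativePrecisionSketch {
  def relativePrecision(
    inClusterFavWt: Double,             // favWtSumOfInClusterFavEdges for one cluster
    interestedInSize: Int,              // users interested in the cluster
    knownForSize: Int,                  // users known for the cluster
    wholeGraphPrecision: Option[Double] // precisionOfWholeGraph, if computed
  ): Option[Double] = {
    val num =
      if (interestedInSize > 0 && knownForSize > 0) inClusterFavWt / interestedInSize / knownForSize
      else 0.0
    wholeGraphPrecision.filter(_ > 0.0).map(num / _)
  }
}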
Binary file not shown.
@ -1,316 +0,0 @@
package com.twitter.simclusters_v2.scalding

import com.twitter.algebird.{Monoid, OptionMonoid, Semigroup}
import com.twitter.algebird.mutable.PriorityQueueMonoid
import com.twitter.scalding.Execution
import com.twitter.scalding.typed.TypedPipe
import com.twitter.simclusters_v2.scalding.common.Util
import com.twitter.simclusters_v2.scalding.common.Util.Distribution
import com.twitter.simclusters_v2.thriftscala.{BipartiteClusterQuality, SampledEdge}
import java.util.PriorityQueue
import scala.collection.JavaConverters._

object BipartiteClusterEvaluationClasses {
  case class Weights(
    isFollowEdge: Double,
    isFavEdge: Double,
    favWtIfFollowEdge: Double,
    favWtIfFavEdge: Double)

  object WeightsMonoid extends Monoid[Weights] {
    override def zero = Weights(0.0, 0.0, 0.0, 0.0)

    override def plus(l: Weights, r: Weights): Weights = {
      Weights(
        l.isFollowEdge + r.isFollowEdge,
        l.isFavEdge + r.isFavEdge,
        l.favWtIfFollowEdge + r.favWtIfFollowEdge,
        l.favWtIfFavEdge + r.favWtIfFavEdge
      )
    }
  }

  implicit val wm: Monoid[Weights] = WeightsMonoid

  case class SampledEdgeData(
    favWtIfFollowEdge: Double,
    favWtIfFavEdge: Double,
    followScoreToCluster: Double,
    favScoreToCluster: Double)

  implicit val samplerMonoid: PriorityQueueMonoid[((Long, Long), SampledEdgeData)] =
    Util.reservoirSamplerMonoidForPairs[(Long, Long), SampledEdgeData](2000)(Util.edgeOrdering)

  implicit val sampledEdgesMonoid: PriorityQueueMonoid[SampledEdge] =
    Util.reservoirSamplerMonoid(
      10000,
      { sampledEdge: SampledEdge => (sampledEdge.followerId, sampledEdge.followeeId) }
    )(Util.edgeOrdering)

  case class BipartiteIntermediateResults(
    inClusterWeights: Weights,
    totalOutgoingVolumes: Weights,
    interestedInSize: Int,
    edgeSample: PriorityQueue[((Long, Long), SampledEdgeData)]) {
    override def toString: String = {
      "BCR(%s, %s, %d, %s)".format(
        inClusterWeights,
        totalOutgoingVolumes,
        interestedInSize,
        edgeSample.iterator().asScala.toSeq.toString()
      )
    }
  }

  object BIRMonoid extends Monoid[BipartiteIntermediateResults] {
    override def zero =
      BipartiteIntermediateResults(WeightsMonoid.zero, WeightsMonoid.zero, 0, samplerMonoid.zero)

    override def plus(
      l: BipartiteIntermediateResults,
      r: BipartiteIntermediateResults
    ): BipartiteIntermediateResults = {
      BipartiteIntermediateResults(
        WeightsMonoid.plus(l.inClusterWeights, r.inClusterWeights),
        WeightsMonoid.plus(l.totalOutgoingVolumes, r.totalOutgoingVolumes),
        l.interestedInSize + r.interestedInSize,
        samplerMonoid.plus(l.edgeSample, r.edgeSample)
      )
    }
  }

  implicit val bIRMonoid: Monoid[BipartiteIntermediateResults] = BIRMonoid

  def makeThriftSampledEdge(edge: (Long, Long), data: SampledEdgeData): SampledEdge = {
    val (followerId, followeeId) = edge
    SampledEdge(
      followerId = followerId,
      followeeId = followeeId,
      favWtIfFollowEdge = Some(data.favWtIfFollowEdge),
      favWtIfFavEdge = Some(data.favWtIfFavEdge),
      followScoreToCluster = Some(data.followScoreToCluster),
      favScoreToCluster = Some(data.favScoreToCluster)
    )
  }

  object ClusterQualitySemigroup extends Semigroup[BipartiteClusterQuality] {
    val doubleOM: Monoid[Option[Double]] = new OptionMonoid[Double]
    val intOM: Monoid[Option[Int]] = new OptionMonoid[Int]
    val longOM: Monoid[Option[Long]] = new OptionMonoid[Long]

    override def plus(l: BipartiteClusterQuality, r: BipartiteClusterQuality) =
      BipartiteClusterQuality(
        inClusterFollowEdges = doubleOM.plus(l.inClusterFollowEdges, r.inClusterFollowEdges),
        inClusterFavEdges = doubleOM.plus(l.inClusterFavEdges, r.inClusterFavEdges),
        favWtSumOfInClusterFollowEdges = doubleOM
          .plus(l.favWtSumOfInClusterFollowEdges, r.favWtSumOfInClusterFollowEdges),
        favWtSumOfInClusterFavEdges = doubleOM
          .plus(l.favWtSumOfInClusterFavEdges, r.favWtSumOfInClusterFavEdges),
        outgoingFollowEdges = doubleOM.plus(l.outgoingFollowEdges, r.outgoingFollowEdges),
        outgoingFavEdges = doubleOM.plus(l.outgoingFavEdges, r.outgoingFavEdges),
        favWtSumOfOutgoingFollowEdges = doubleOM
          .plus(l.favWtSumOfOutgoingFollowEdges, r.favWtSumOfOutgoingFollowEdges),
        favWtSumOfOutgoingFavEdges = doubleOM
          .plus(l.favWtSumOfOutgoingFavEdges, r.favWtSumOfOutgoingFavEdges),
        incomingFollowEdges = doubleOM.plus(l.incomingFollowEdges, r.incomingFollowEdges),
        incomingFavEdges = doubleOM.plus(l.incomingFavEdges, r.incomingFavEdges),
        favWtSumOfIncomingFollowEdges = doubleOM
          .plus(l.favWtSumOfIncomingFollowEdges, r.favWtSumOfIncomingFollowEdges),
        favWtSumOfIncomingFavEdges = doubleOM
          .plus(l.favWtSumOfIncomingFavEdges, r.favWtSumOfIncomingFavEdges),
        interestedInSize = None,
        sampledEdges = Some(
          sampledEdgesMonoid
            .plus(
              sampledEdgesMonoid.build(l.sampledEdges.getOrElse(Nil)),
              sampledEdgesMonoid.build(r.sampledEdges.getOrElse(Nil))
            )
            .iterator()
            .asScala
            .toSeq),
        knownForSize = intOM.plus(l.knownForSize, r.knownForSize),
        correlationOfFavWtIfFollowWithPredictedFollow = None,
        correlationOfFavWtIfFavWithPredictedFav = None,
        relativePrecisionUsingFavWtIfFav = None,
        averagePrecisionOfWholeGraphUsingFavWtIfFav = l.averagePrecisionOfWholeGraphUsingFavWtIfFav
      )
  }

  implicit val bcqSemigroup: Semigroup[BipartiteClusterQuality] =
    ClusterQualitySemigroup

  case class PrintableBipartiteQuality(
    incomingFollowUnweightedRecall: String,
    incomingFavUnweightedRecall: String,
    incomingFollowWeightedRecall: String,
    incomingFavWeightedRecall: String,
    outgoingFollowUnweightedRecall: String,
    outgoingFavUnweightedRecall: String,
    outgoingFollowWeightedRecall: String,
    outgoingFavWeightedRecall: String,
    incomingFollowEdges: String,
    incomingFavEdges: String,
    favWtSumOfIncomingFollowEdges: String,
    favWtSumOfIncomingFavEdges: String,
    outgoingFollowEdges: String,
    outgoingFavEdges: String,
    favWtSumOfOutgoingFollowEdges: String,
    favWtSumOfOutgoingFavEdges: String,
    correlationOfFavWtIfFollow: String,
    correlationOfFavWtIfFav: String,
    relativePrecisionUsingFavWt: String,
    averagePrecisionOfWholeGraphUsingFavWt: String,
    interestedInSize: String,
    knownForSize: String)

  def printableBipartiteQuality(in: BipartiteClusterQuality): PrintableBipartiteQuality = {
    def getRatio(numOpt: Option[Double], denOpt: Option[Double]): String = {
      val r = if (denOpt.exists(_ > 0)) {
        numOpt.getOrElse(0.0) / denOpt.get
      } else 0.0
      "%.3f".format(r)
    }

    val formatter = new java.text.DecimalFormat("###,###.#")

    def denString(denOpt: Option[Double]): String =
      formatter.format(denOpt.getOrElse(0.0))

    val correlationOfFavWtIfFollow =
      in.correlationOfFavWtIfFollowWithPredictedFollow match {
        case None =>
          in.sampledEdges.map { samples =>
            val pairs = samples.map { s =>
              (s.predictedFollowScore.getOrElse(0.0), s.favWtIfFollowEdge.getOrElse(0.0))
            }
            Util.computeCorrelation(pairs.iterator)
          }
        case x @ _ => x
      }

    val correlationOfFavWtIfFav =
      in.correlationOfFavWtIfFavWithPredictedFav match {
        case None =>
          in.sampledEdges.map { samples =>
            val pairs = samples.map { s =>
              (s.predictedFavScore.getOrElse(0.0), s.favWtIfFavEdge.getOrElse(0.0))
            }
            Util.computeCorrelation(pairs.iterator)
          }
        case x @ _ => x
      }

    PrintableBipartiteQuality(
      incomingFollowUnweightedRecall = getRatio(in.inClusterFollowEdges, in.incomingFollowEdges),
      incomingFavUnweightedRecall = getRatio(in.inClusterFavEdges, in.incomingFavEdges),
      incomingFollowWeightedRecall =
        getRatio(in.favWtSumOfInClusterFollowEdges, in.favWtSumOfIncomingFollowEdges),
      incomingFavWeightedRecall =
        getRatio(in.favWtSumOfInClusterFavEdges, in.favWtSumOfIncomingFavEdges),
      outgoingFollowUnweightedRecall = getRatio(in.inClusterFollowEdges, in.outgoingFollowEdges),
      outgoingFavUnweightedRecall = getRatio(in.inClusterFavEdges, in.outgoingFavEdges),
      outgoingFollowWeightedRecall =
        getRatio(in.favWtSumOfInClusterFollowEdges, in.favWtSumOfOutgoingFollowEdges),
      outgoingFavWeightedRecall =
        getRatio(in.favWtSumOfInClusterFavEdges, in.favWtSumOfOutgoingFavEdges),
      incomingFollowEdges = denString(in.incomingFollowEdges),
      incomingFavEdges = denString(in.incomingFavEdges),
      favWtSumOfIncomingFollowEdges = denString(in.favWtSumOfIncomingFollowEdges),
      favWtSumOfIncomingFavEdges = denString(in.favWtSumOfIncomingFavEdges),
      outgoingFollowEdges = denString(in.outgoingFollowEdges),
      outgoingFavEdges = denString(in.outgoingFavEdges),
      favWtSumOfOutgoingFollowEdges = denString(in.favWtSumOfOutgoingFollowEdges),
      favWtSumOfOutgoingFavEdges = denString(in.favWtSumOfOutgoingFavEdges),
      correlationOfFavWtIfFollow = "%.3f"
        .format(correlationOfFavWtIfFollow.getOrElse(0.0)),
      correlationOfFavWtIfFav = "%.3f"
        .format(correlationOfFavWtIfFav.getOrElse(0.0)),
      relativePrecisionUsingFavWt =
        "%.2g".format(in.relativePrecisionUsingFavWtIfFav.getOrElse(0.0)),
      averagePrecisionOfWholeGraphUsingFavWt =
        "%.2g".format(in.averagePrecisionOfWholeGraphUsingFavWtIfFav.getOrElse(0.0)),
      interestedInSize = in.interestedInSize.getOrElse(0).toString,
      knownForSize = in.knownForSize.getOrElse(0).toString
    )
  }

  case class ClusterResultsSummary(
    numClustersWithZeroInterestedIn: Int,
    numClustersWithZeroFollowWtRecall: Int,
    numClustersWithZeroFavWtRecall: Int,
    numClustersWithZeroFollowAndFavWtRecall: Int,
    interestedInSizeDist: Distribution,
    outgoingFollowWtRecallDist: Distribution,
    outgoingFavWtRecallDist: Distribution,
    incomingFollowWtRecallDist: Distribution,
    incomingFavWtRecallDist: Distribution,
    followCorrelationDist: Distribution,
    favCorrelationDist: Distribution,
    relativePrecisionDist: Distribution)

  def getClusterResultsSummary(
    perClusterResults: TypedPipe[BipartiteClusterQuality]
  ): Execution[Option[ClusterResultsSummary]] = {
    perClusterResults
      .map { clusterQuality =>
        val printableQuality = printableBipartiteQuality(clusterQuality)
        val isFollowRecallZero =
          if (!clusterQuality.favWtSumOfInClusterFollowEdges
              .exists(_ > 0)) 1
          else 0
        val isFavRecallZero =
          if (!clusterQuality.favWtSumOfInClusterFavEdges.exists(_ > 0)) 1
          else 0
        (
          if (!clusterQuality.interestedInSize.exists(_ > 0)) 1 else 0,
          isFollowRecallZero,
          isFavRecallZero,
          isFavRecallZero * isFollowRecallZero,
          clusterQuality.interestedInSize.toList.map(_.toDouble),
          List(printableQuality.outgoingFollowWeightedRecall.toDouble),
          List(printableQuality.outgoingFavWeightedRecall.toDouble),
          List(printableQuality.incomingFollowWeightedRecall.toDouble),
          List(printableQuality.incomingFavWeightedRecall.toDouble),
          List(printableQuality.correlationOfFavWtIfFollow.toDouble),
          List(printableQuality.correlationOfFavWtIfFav.toDouble),
          List(printableQuality.relativePrecisionUsingFavWt.toDouble)
        )
      }
      .sum
      .toOptionExecution
      .map { opt =>
        opt.map {
          case (
                zeroInterestedIn,
                zeroFollowRecall,
                zeroFavRecall,
                zeroFollowAndFavRecall,
                interestedInSizeList,
                outgoingFollowWtRecallList,
                outgoingFavWtRecallList,
                incomingFollowWtRecallList,
                incomingFavWtRecallList,
                followCorrelationList,
                favCorrelationList,
                relativePrecisionList
              ) =>
            ClusterResultsSummary(
              numClustersWithZeroInterestedIn = zeroInterestedIn,
              numClustersWithZeroFollowWtRecall = zeroFollowRecall,
              numClustersWithZeroFavWtRecall = zeroFavRecall,
              numClustersWithZeroFollowAndFavWtRecall = zeroFollowAndFavRecall,
              interestedInSizeDist = Util.distributionFromArray(interestedInSizeList.toArray),
              outgoingFollowWtRecallDist = Util
                .distributionFromArray(outgoingFollowWtRecallList.toArray),
              outgoingFavWtRecallDist = Util.distributionFromArray(outgoingFavWtRecallList.toArray),
              incomingFollowWtRecallDist = Util
                .distributionFromArray(incomingFollowWtRecallList.toArray),
              incomingFavWtRecallDist = Util.distributionFromArray(incomingFavWtRecallList.toArray),
              followCorrelationDist = Util.distributionFromArray(followCorrelationList.toArray),
              favCorrelationDist = Util.distributionFromArray(favCorrelationList.toArray),
              relativePrecisionDist = Util.distributionFromArray(relativePrecisionList.toArray)
            )
        }
      }
  }
}
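As a quick illustration of how the Weights monoid above combines per-edge indicators into per-cluster sums, a minimal sketch follows; the two example edges and their values are invented purely for illustration.

// Minimal sketch: WeightsMonoid.plus adds each field, so summing per-edge Weights yields
// edge counts and fav-weight sums for a cluster in one aggregation pass.
// The example edge values below are made up for illustration.
object WeightsMonoidSketch {
  import BipartiteClusterEvaluationClasses.{Weights, WeightsMonoid}

  def main(args: Array[String]): Unit = {
    val followOnlyEdge = Weights(isFollowEdge = 1.0, isFavEdge = 0.0, favWtIfFollowEdge = 0.0, favWtIfFavEdge = 0.0)
    val favEdge = Weights(isFollowEdge = 0.0, isFavEdge = 1.0, favWtIfFollowEdge = 0.0, favWtIfFavEdge = 2.5)
    val combined = WeightsMonoid.plus(followOnlyEdge, favEdge)
    println(combined) // Weights(1.0, 1.0, 0.0, 2.5)
  }
}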
Binary file not shown.
@ -1,794 +0,0 @@
package com.twitter.simclusters_v2.scalding

import com.twitter.algebird.OptionMonoid
import com.twitter.algebird.QTree
import com.twitter.algebird.QTreeSemigroup
import com.twitter.algebird.Semigroup
import com.twitter.dal.client.dataset.KeyValDALDataset
import com.twitter.dal.client.dataset.SnapshotDALDataset
import com.twitter.hermit.candidate.thriftscala.Candidates
import com.twitter.pluck.source.cassowary.FollowingsCosineSimilaritiesManhattanSource
import com.twitter.pluck.source.cassowary.SimsCandidatesSource
import com.twitter.scalding._
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.DALWrite._
import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation
import com.twitter.scalding_internal.dalv2.remote_access.ProcAtla
import com.twitter.scalding_internal.job.TwitterExecutionApp
import com.twitter.scalding_internal.job.analytics_batch._
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.simclusters_v2.common.ModelVersions
import com.twitter.simclusters_v2.hdfs_sources._
import com.twitter.simclusters_v2.scalding.common.Util
import com.twitter.simclusters_v2.scalding.embedding.common.ExternalDataSources
import com.twitter.simclusters_v2.thriftscala._
import com.twitter.usersource.snapshot.flat.UsersourceFlatScalaDataset
import com.twitter.usersource.snapshot.flat.thriftscala.FlatUser

object ClusterDetailsJob {
  case class Scores(followScore: Double, favScore: Double, logFavScore: Double)

  case class IntermediateDetails(
    numUsersWithAnyNonZeroScore: Int,
    numUsersWithNonZeroFollowScore: Int,
    numUsersWithNonZeroFavScore: Int,
    favQTree: Option[QTree[Double]],
    followQTree: Option[QTree[Double]],
    logFavQTree: Option[QTree[Double]],
    sumOfSquares: Scores,
    sum: Scores,
    min: Scores,
    max: Scores)

  case class InfoFromUserSource(
    fractionMarkedNSFWUser: Double,
    languageToFractionDeviceLanguage: Map[String, Double],
    countryCodeToFractionKnownForWithCountryCode: Map[String, Double],
    languageToFractionInferredLanguage: Map[String, Double])

  def positiveMin(a: Double, b: Double) = {
    if (math.min(a, b) == 0.0) math.max(a, b) else math.min(a, b)
  }

  case class ClusterDetailsSemigroup(implicit qtreeSemigroup: Semigroup[QTree[Double]])
      extends Semigroup[IntermediateDetails] {
    val optionMonoid: OptionMonoid[QTree[Double]] = new OptionMonoid[QTree[Double]]()
    override def plus(
      left: IntermediateDetails,
      right: IntermediateDetails
    ): IntermediateDetails = {
      IntermediateDetails(
        left.numUsersWithAnyNonZeroScore + right.numUsersWithAnyNonZeroScore,
        left.numUsersWithNonZeroFollowScore + right.numUsersWithNonZeroFollowScore,
        left.numUsersWithNonZeroFavScore + right.numUsersWithNonZeroFavScore,
        optionMonoid.plus(left.favQTree, right.favQTree),
        optionMonoid.plus(left.followQTree, right.followQTree),
        optionMonoid.plus(left.logFavQTree, right.logFavQTree),
        Scores(
          left.sumOfSquares.followScore + right.sumOfSquares.followScore,
          left.sumOfSquares.favScore + right.sumOfSquares.favScore,
          left.sumOfSquares.logFavScore + right.sumOfSquares.logFavScore
        ),
        Scores(
          left.sum.followScore + right.sum.followScore,
          left.sum.favScore + right.sum.favScore,
          left.sum.logFavScore + right.sum.logFavScore
        ),
        Scores(
          positiveMin(left.min.followScore, right.min.followScore),
          positiveMin(left.min.favScore, right.min.favScore),
          positiveMin(left.min.logFavScore, right.min.logFavScore)
        ),
        Scores(
          math.max(left.max.followScore, right.max.followScore),
          math.max(left.max.favScore, right.max.favScore),
          math.max(left.max.logFavScore, right.max.logFavScore)
        )
      )
    }
  }

  def intermediateDetailsPipe(
    input: TypedPipe[(Long, ClustersUserIsInterestedIn)],
    qtreeSemigroupKParameter: Int
  ): TypedPipe[(Int, IntermediateDetails)] = {
    implicit val qtSg: Semigroup[QTree[Double]] =
      new QTreeSemigroup[Double](qtreeSemigroupKParameter)
    implicit val cdSg: Semigroup[IntermediateDetails] = ClusterDetailsSemigroup()
    input
      .flatMap {
        case (userId, clusterScoresStruct) =>
          val clusterScoresArray = clusterScoresStruct.clusterIdToScores.toArray
          clusterScoresArray.map {
            case (clusterId, scoresStruct) =>
              val followScore = scoresStruct.followScore.getOrElse(0.0)
              val favScore = scoresStruct.favScore.getOrElse(0.0)
              val logFavScore = scoresStruct.logFavScore.getOrElse(0.0)
              (
                clusterId,
                IntermediateDetails(
                  numUsersWithAnyNonZeroScore = 1,
                  numUsersWithNonZeroFollowScore = if (followScore > 0) 1 else 0,
                  numUsersWithNonZeroFavScore = if (favScore > 0) 1 else 0,
                  favQTree = if (favScore > 0) Some(QTree(favScore)) else None,
                  followQTree = if (followScore > 0) Some(QTree(followScore)) else None,
                  logFavQTree = if (logFavScore > 0) Some(QTree(logFavScore)) else None,
                  sumOfSquares = Scores(
                    followScore * followScore,
                    favScore * favScore,
                    logFavScore * logFavScore),
                  sum = Scores(followScore, favScore, logFavScore),
                  min = Scores(followScore, favScore, logFavScore),
                  max = Scores(followScore, favScore, logFavScore)
                )
              )
          }
      }
      .sumByKey
      // Uncomment for adhoc job
      //.withReducers(100)
      .toTypedPipe
  }

  private def safeGetDoubleOpt(x: Option[Double]): Double = {
    x.map { y => if (y.isNaN) 0 else y }.getOrElse(0)
  }

  private def getSimilaritiesForAllPairs(
    input: TypedPipe[(Long, ClustersUserIsInterestedIn)]
  )(
    implicit uniqueID: UniqueID
  ): TypedPipe[((Int, Int), Scores)] = {
    val allClusterPairsBeforeSumByKey = Stat("all_cluster_pairs_before_sum_by_key")
    val clusterPairsWithin10Ratio = Stat("cluster_pairs_within_10_ratio")
    val clusterPairsBeforeTopK = Stat("cluster_pairs_before_thresholding")

    input
      .flatMap {
        case (userId, clusterScoresStruct) =>
          val clusterScoresArray = clusterScoresStruct.clusterIdToScores.toArray
          (0 until clusterScoresArray.length).flatMap { i =>
            (0 until clusterScoresArray.length).map { j =>
              val (clusterI, scoresI) = clusterScoresArray(i)
              val (clusterJ, scoresJ) = clusterScoresArray(j)
              val ratioOfSizes =
                scoresI.numUsersInterestedInThisClusterUpperBound.getOrElse(1).toDouble /
                  scoresJ.numUsersInterestedInThisClusterUpperBound.getOrElse(1).toDouble
              allClusterPairsBeforeSumByKey.inc()
              if (ratioOfSizes > 0.1 && ratioOfSizes < 10) {
                clusterPairsWithin10Ratio.inc()
              }
              val followI = safeGetDoubleOpt(scoresI.followScoreClusterNormalizedOnly)
              val followJ = safeGetDoubleOpt(scoresJ.followScoreClusterNormalizedOnly)
              val follow = followI * followJ
              val favI = safeGetDoubleOpt(scoresI.favScoreClusterNormalizedOnly)
              val favJ = safeGetDoubleOpt(scoresJ.favScoreClusterNormalizedOnly)
              val fav = favI * favJ
              val logFavI = safeGetDoubleOpt(scoresI.logFavScoreClusterNormalizedOnly)
              val logFavJ = safeGetDoubleOpt(scoresJ.logFavScoreClusterNormalizedOnly)
              val logFav = logFavI * logFavJ
              ((clusterI, clusterJ), (follow, fav, logFav))
            }
          }
      }
      .sumByKey
      // Uncomment for adhoc job
      //.withReducers(600)
      .map {
        case (key, (follow, fav, logFav)) =>
          clusterPairsBeforeTopK.inc()
          (key, Scores(follow, fav, logFav))
      }
  }

  private def keepTopNeighbors(
    allPairs: TypedPipe[((Int, Int), Scores)],
    cosineThreshold: Double
  )(
    implicit uniqueID: UniqueID
  ): TypedPipe[(Int, List[ClusterNeighbor])] = {
    val clusterPairsMoreThanThreshold = Stat("cluster_pairs_cosine_gt_" + cosineThreshold)
    val clusterPairsAfterTopK = Stat("cluster_pairs_after_topk")
    val clustersWithFewNeighbors = Stat(s"clusters_with_fewer_than_100_neighbors")
    val clustersWithManyNeighbors = Stat(s"clusters_with_more_than_100_neighbors")

    allPairs
      .flatMap {
        case ((cI, cJ), Scores(followScore, favScore, logFavScore)) =>
          if (followScore > cosineThreshold || logFavScore > cosineThreshold || favScore > cosineThreshold) {
            clusterPairsMoreThanThreshold.inc()
            Some((cI, ClusterNeighbor(cJ, Some(followScore), Some(favScore), Some(logFavScore))))
          } else None
      }
      .group
      .toList
      // Uncomment for adhoc job
      //.withReducers(40)
      .map {
        case (key, seq) =>
          val finalSize = seq.size
          clusterPairsAfterTopK.incBy(finalSize)
          if (finalSize < 100) {
            clustersWithFewNeighbors.inc()
          } else {
            clustersWithManyNeighbors.inc()
          }
          (
            key,
            seq.sortBy {
              case cn: ClusterNeighbor =>
                -(cn.followCosineSimilarity.getOrElse(0.0) + cn.logFavCosineSimilarity.getOrElse(
                  0.0)) / 2
            })
      }
  }

  def getTopSimilarClustersWithCosine(
    input: TypedPipe[(Long, ClustersUserIsInterestedIn)],
    cosineThreshold: Double
  )(
    implicit uniqueID: UniqueID
  ): TypedPipe[(Int, List[ClusterNeighbor])] = {
    keepTopNeighbors(getSimilaritiesForAllPairs(input), cosineThreshold)
  }

  def getDistributionDetails(
    qtree: QTree[Double],
    sum: Double,
    sumOfSquares: Double,
    min: Double,
    max: Double,
    fullSize: Int
  ): DistributionDetails = {
    val mean = sum / fullSize
    // note that the below is the naive calculation, and not the sample standard dev formula
    // that divides by n-1. I don't think it makes a difference at our scale whether we use n or n-1
    // and I'd rather use the simpler one.
    val stdDev = math.sqrt(sumOfSquares / fullSize - mean * mean)

    def getQB(percentile: Double): QuantileBounds = {
      val (lb, ub) = qtree.quantileBounds(percentile)
      QuantileBounds(lb, ub)
    }

    DistributionDetails(
      mean = mean,
      standardDeviation = Some(stdDev),
      min = Some(min),
      p25 = Some(getQB(0.25)),
      p50 = Some(getQB(0.5)),
      p75 = Some(getQB(0.75)),
      p95 = Some(getQB(0.95)),
      max = Some(max)
    )
  }

  def keepCorrectModel(
    input: TypedPipe[(Long, ClustersUserIsInterestedIn)],
    modelVersionToKeep: String
  )(
    implicit uniqId: UniqueID
  ): TypedPipe[(Long, ClustersUserIsInterestedIn)] = {
    val allRecords = Stat("all_input_records")
    val withCorrectVersion = Stat("with_correct_version")
    input.filter {
      case (_, clusterScoresStruct) =>
        // allRecords.inc()
        val result = clusterScoresStruct.knownForModelVersion == modelVersionToKeep
        // if (result) withCorrectVersion.inc()
        result
    }
  }

  def getInfoFromUserSource(
    knownFor: TypedPipe[(Int, List[(Long, Float)])],
    usersource: TypedPipe[FlatUser],
    inferredLanguages: TypedPipe[(Long, Seq[(String, Double)])]
  )(
    implicit uniqId: UniqueID
  ): TypedPipe[(Int, InfoFromUserSource)] = {
    val knownForUsers = knownFor.flatMap {
      case (clusterId, userScoreList) =>
        userScoreList.map {
          case (userId, _) =>
            (userId, clusterId)
        }
    }

    usersource
      .collect {
        case fuser: FlatUser if fuser.id.isDefined =>
          (
            fuser.id.get,
            (
              fuser.accountCountryCode.getOrElse(""),
              fuser.language.getOrElse(""),
              fuser.nsfwUser.getOrElse(false)
            ))
      }
      .join(knownForUsers)
      .leftJoin(inferredLanguages)
      .map {
        case (_, (((countryCode, language, nsfw), clusterId), inferredLangsOpt)) =>
          val nsfwInt = if (nsfw) 1 else 0
          (
            clusterId,
            (
              1,
              nsfwInt,
              Map(language -> 1),
              Map(countryCode -> 1),
              inferredLangsOpt.getOrElse(Seq(("", 1.0))).toMap
            )
          )
      }
      .sumByKey
      .mapValues {
        case (
              denominator,
              nsfwNumerator,
              languageNumeratorsMap,
              countryNumeratorsMap,
              inferredLangsNumeratorsMap) =>
          InfoFromUserSource(
            nsfwNumerator * 1.0 / denominator,
            languageNumeratorsMap.mapValues { x => x * 1.0 / denominator },
            countryNumeratorsMap.mapValues { x => x * 1.0 / denominator },
            inferredLangsNumeratorsMap.mapValues { x => x * 1.0 / denominator }
          )
      }
  }

  /**
   * Run the cluster details job and return the details for each cluster
   * @param input interestedIn data
   * @param qtreeSemigroupKParameter parameter for calculating percentiles using qtree monoid (set to a small number, usually < 7)
   * @param modelVersionToKeep which modelVersion to use from interestedIn dataset
   * @param knownFor clusterId -> users known for this cluster and their scores
   * @param knownForTranspose userId -> clusters this user is known for and their scores
   * @param usersource -> user source
   * @param simsGraph -> sims graph in the form of userId -> adjacency list
   * @param cosineThreshold -> cosine threshold to include a cluster in the list of similar clusters for a given cluster
   * @param uniqId
   * @return pipe with (modelVersion, clusterId) as the key and ClusterDetails struct as the value.
   */
  def run(
    input: TypedPipe[(Long, ClustersUserIsInterestedIn)],
    qtreeSemigroupKParameter: Int,
    modelVersionToKeep: String,
    knownFor: TypedPipe[(Int, List[(Long, Float)])],
    knownForTranspose: TypedPipe[(Long, Array[(Int, Float)])],
    usersource: Option[TypedPipe[FlatUser]],
    inferredLanguageSource: Option[TypedPipe[(Long, Seq[(String, Double)])]],
    simsGraph: Option[TypedPipe[(Long, Map[Long, Float])]],
    cosineThreshold: Double
  )(
    implicit uniqId: UniqueID
  ): Execution[TypedPipe[((String, Int), ClusterDetails)]] = {
    val topSimilarClusters = getTopSimilarClustersWithCosine(input, cosineThreshold)
    val infoFromUserSource: TypedPipe[(Int, InfoFromUserSource)] = (for {
      us <- usersource
      inferredLanguages <- inferredLanguageSource
    } yield getInfoFromUserSource(knownFor, us, inferredLanguages)).getOrElse(TypedPipe.empty)

    val clusterEvaluationExec = simsGraph match {
      case Some(sg) =>
        ClusterEvaluation.clusterLevelEvaluation(sg, knownForTranspose, "eval")
      case None =>
        val dummyPipe: TypedPipe[(Int, (Int, ClusterQuality))] = TypedPipe.empty
        Execution.from(dummyPipe)
    }

    clusterEvaluationExec
      .map { clusterIdToSizesAndQualities =>
        val clusterQualities: TypedPipe[(Int, ClusterQuality)] =
          clusterIdToSizesAndQualities.mapValues(_._2)
        intermediateDetailsPipe(
          keepCorrectModel(input, modelVersionToKeep),
          qtreeSemigroupKParameter)
          .leftJoin(topSimilarClusters)
          .leftJoin(infoFromUserSource)
          .leftJoin(clusterQualities)
          .join(knownFor)
          .map {
            case (
                  clusterId,
                  (
                    (
                      ((intermediateDetails, topSimilarNeighborsOpt), userSourceInfoOpt),
                      qualityOpt),
                    knownForUsers)
                ) =>
              val knownForSorted = knownForUsers.sortBy(-_._2).map {
                case (userId, score) =>
                  UserWithScore(userId, score)
              }
              (modelVersionToKeep, clusterId) ->
                ClusterDetails(
                  numUsersWithAnyNonZeroScore = intermediateDetails.numUsersWithAnyNonZeroScore,
                  numUsersWithNonZeroFavScore = intermediateDetails.numUsersWithNonZeroFavScore,
                  numUsersWithNonZeroFollowScore =
                    intermediateDetails.numUsersWithNonZeroFollowScore,
                  favScoreDistributionDetails = intermediateDetails.favQTree.map { qt =>
                    getDistributionDetails(
                      qtree = qt,
                      sum = intermediateDetails.sum.favScore,
                      sumOfSquares = intermediateDetails.sumOfSquares.favScore,
                      min = intermediateDetails.min.favScore,
                      max = intermediateDetails.max.favScore,
                      fullSize = intermediateDetails.numUsersWithNonZeroFavScore
                    )
                  },
                  followScoreDistributionDetails = intermediateDetails.followQTree.map { qt =>
                    getDistributionDetails(
                      qtree = qt,
                      sum = intermediateDetails.sum.followScore,
                      sumOfSquares = intermediateDetails.sumOfSquares.followScore,
                      min = intermediateDetails.min.followScore,
                      max = intermediateDetails.max.followScore,
                      fullSize = intermediateDetails.numUsersWithNonZeroFollowScore
                    )
                  },
                  logFavScoreDistributionDetails = intermediateDetails.logFavQTree.map { qt =>
                    getDistributionDetails(
                      qtree = qt,
                      sum = intermediateDetails.sum.logFavScore,
                      sumOfSquares = intermediateDetails.sumOfSquares.logFavScore,
                      min = intermediateDetails.min.logFavScore,
                      max = intermediateDetails.max.logFavScore,
                      // note: user has non-zero fav score iff a user has non-zero log-fav score
                      fullSize = intermediateDetails.numUsersWithNonZeroFavScore
                    )
                  },
                  knownForUsersAndScores = Some(knownForSorted),
                  neighborClusters = topSimilarNeighborsOpt,
                  fractionKnownForMarkedNSFWUser = userSourceInfoOpt.map(_.fractionMarkedNSFWUser),
                  languageToFractionDeviceLanguage =
                    userSourceInfoOpt.map(_.languageToFractionDeviceLanguage),
                  countryCodeToFractionKnownForWithCountryCode =
                    userSourceInfoOpt.map(_.countryCodeToFractionKnownForWithCountryCode),
                  qualityMeasuredOnSimsGraph = qualityOpt,
                  languageToFractionInferredLanguage =
                    userSourceInfoOpt.map(_.languageToFractionInferredLanguage),
                )
          }
      }
  }

  def getTruncatedSims(
    sims: TypedPipe[Candidates],
    maxNeighbors: Int
  ): TypedPipe[(Long, Map[Long, Float])] = {
    sims.map { cands =>
      (
        cands.userId,
        // These candidates are already sorted, but leaving it in just in case the behavior changes upstream
        cands.candidates
          .map { c => (c.userId, c.score.toFloat) }.sortBy(-_._2).take(maxNeighbors).toMap
      )
    }
  }
}

/**
scalding remote run --main-class com.twitter.simclusters_v2.scalding.ClusterDetailsAdhoc \
 --target src/scala/com/twitter/simclusters_v2/scalding:cluster_details-adhoc \
 --hadoop-properties "scalding.with.reducers.set.explicitly=true mapreduce.job.reduces=4000" \
 --user recos-platform -- \
 --date 2020-06-25 \
 --dateForUserSource 2020-06-25 \
 --includeUserSource \
 --outputDir /user/recos-platform/adhoc/your_ldap/cluster_details_inferred_lang
 */
object ClusterDetailsAdhoc extends TwitterExecutionApp {
  implicit val tz: java.util.TimeZone = DateOps.UTC
  implicit val dp = DateParser.default

  def job: Execution[Unit] =
    Execution.getConfigMode.flatMap {
      case (config, mode) =>
        Execution.withId { implicit uniqueId =>
          val args = config.getArgs
          val date = DateRange.parse(args("dateForUserSource"))
          val (knownFor, knownForTranspose) =
            args
              .optional("knownForDir").map { location =>
                (
                  KnownForSources.transpose(KnownForSources.readKnownFor(location)),
                  KnownForSources.readKnownFor(location)
                )
              }.getOrElse(
                (
                  KnownForSources.clusterToKnownFor_20M_145K_updated,
                  KnownForSources.knownFor_20M_145K_updated
                )
              )

          val interestedIn = args
            .optional("inputDir").map { interestedInInputDir =>
              TypedPipe.from(AdhocKeyValSources.interestedInSource(interestedInInputDir))
            }.getOrElse(
              DAL
                .readMostRecentSnapshotNoOlderThan(
                  SimclustersV2InterestedIn20M145KUpdatedScalaDataset,
                  Days(14))
                .withRemoteReadPolicy(ExplicitLocation(ProcAtla))
                .toTypedPipe
                .map {
                  case KeyVal(userId, clustersUserIsInterestedIn) =>
                    (userId, clustersUserIsInterestedIn)
                }
            )

          val userSourceOpt = if (args.boolean("includeUserSource")) {
            Some(DAL.readMostRecentSnapshot(UsersourceFlatScalaDataset, date).toTypedPipe)
          } else None

          val inferredLanguagesOpt = if (args.boolean("includeUserSource")) {
            Some(ExternalDataSources.inferredUserProducedLanguageSource)
          } else None

          val simsGraphOpt = args.optional("simsForEvalInputDir").map { sgDir =>
            ClusterDetailsJob.getTruncatedSims(
              TypedPipe.from(WTFCandidatesSource(sgDir)),
              args.int("maxSimsNeighborsForEval", 20)
            )
          }

          Util.printCounters(
            ClusterDetailsJob
              .run(
                interestedIn,
                args.int("qtreeSemigroupKParameter", 3),
                args.getOrElse("modelVersion", "20M_145K_updated"),
                knownFor,
                knownForTranspose,
                userSourceOpt,
                inferredLanguagesOpt,
                simsGraphOpt,
                cosineThreshold = args.double("cosineThreshold", 0.01)
              ).flatMap(
                _.writeExecution(AdhocKeyValSources.clusterDetailsSource(args("outputDir"))))
          )
        }
    }
}

trait ClusterDetailsBatchTrait extends TwitterScheduledExecutionApp {
  implicit val tz = DateOps.UTC
  implicit val parser = DateParser.default

  def firstTime: String
  def batchIncrement: Duration
  def manhattanOutputPath: String
  def clusterDetailsLiteOutputPath: String
  def modelVersion: String
  def knownForDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]]
  def interestedInDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsInterestedIn]]
  def outputDataset: KeyValDALDataset[KeyVal[(String, Int), ClusterDetails]]
  def clusterDetailsLiteOutputDataset: SnapshotDALDataset[ClusterDetailsLite]

  private lazy val execArgs = AnalyticsBatchExecutionArgs(
    batchDesc = BatchDescription(this.getClass.getName.replace("$", "")),
    firstTime = BatchFirstTime(RichDate(firstTime)),
    lastTime = None,
    batchIncrement = BatchIncrement(batchIncrement)
  )

  override def scheduledJob: Execution[Unit] = AnalyticsBatchExecution(execArgs) {
    implicit dateRange =>
      Execution.withId { implicit uniqueId =>
        Execution.withArgs { args =>
          val qtreeSemigroupKParameter = args.int("qtreeSemigroupKParameter", 5)
          val maxSimsNeighborsForEval = args.int("maxSimsNeighborsForEval", 20)
          val knownForTranspose =
            KnownForSources.fromKeyVal(
              DAL.readMostRecentSnapshot(knownForDataset, dateRange.extend(Days(7))).toTypedPipe,
              modelVersion)
          val knownFor = KnownForSources.transpose(knownForTranspose)
          val cosineThreshold = args.double("cosineThreshold", 0.01)
          val interestedIn =
            DAL
              .readMostRecentSnapshot(interestedInDataset, dateRange.extend(Days(7)))
              .toTypedPipe
              .map {
                case KeyVal(userId, clustersUserIsInterestedIn) =>
                  (userId, clustersUserIsInterestedIn)
              }
          val sims = if (modelVersion == ModelVersions.Model20M145K2020) {
            // The model version 20m_145k_2020 uses approximate_cosine_follow as the input sims graph
            // to cluster users. The same graph is used to evaluate the clusters
            TypedPipe
              .from(FollowingsCosineSimilaritiesManhattanSource())
              .map(_._2)
          } else {
            TypedPipe.from(
              SimsCandidatesSource()(
                dateRange = dateRange,
                suffixPath = "/classified_candidates_rollup"
              ))
          }
          val resultExec = ClusterDetailsJob
            .run(
              interestedIn,
              qtreeSemigroupKParameter,
              modelVersion,
              knownFor,
              knownForTranspose,
              Some(DAL.readMostRecentSnapshot(UsersourceFlatScalaDataset, dateRange).toTypedPipe),
              Some(ExternalDataSources.inferredUserProducedLanguageSource),
              Some(
                ClusterDetailsJob.getTruncatedSims(sims, maxNeighbors = maxSimsNeighborsForEval)),
              cosineThreshold
            ).flatMap { resultUnmapped =>
              val clusterDetailsExec = resultUnmapped
                .map {
                  case (clusterKey, details) =>
                    KeyVal(clusterKey, details)
                }.writeDALVersionedKeyValExecution(
                  outputDataset,
                  D.Suffix(manhattanOutputPath)
                )

              val clusterDetailsLiteExec =
                resultUnmapped
                  .map {
                    case ((_, clusterId), details)
                        if modelVersion == ModelVersions.Model20M145KDec11 =>
                      ClusterDetailsLite(
                        FullClusterId(ModelVersion.Model20m145kDec11, clusterId),
                        details.numUsersWithAnyNonZeroScore,
                        details.numUsersWithNonZeroFollowScore,
                        details.numUsersWithNonZeroFavScore,
                        details.knownForUsersAndScores.getOrElse(Nil)
                      )
                    case ((_, clusterId), details)
                        if modelVersion == ModelVersions.Model20M145KUpdated =>
                      ClusterDetailsLite(
                        FullClusterId(ModelVersion.Model20m145kUpdated, clusterId),
                        details.numUsersWithAnyNonZeroScore,
                        details.numUsersWithNonZeroFollowScore,
                        details.numUsersWithNonZeroFavScore,
                        details.knownForUsersAndScores.getOrElse(Nil)
                      )
                    case ((_, clusterId), details)
                        if modelVersion == ModelVersions.Model20M145K2020 =>
                      ClusterDetailsLite(
                        FullClusterId(ModelVersion.Model20m145k2020, clusterId),
                        details.numUsersWithAnyNonZeroScore,
                        details.numUsersWithNonZeroFollowScore,
                        details.numUsersWithNonZeroFavScore,
                        details.knownForUsersAndScores.getOrElse(Nil)
                      )
                  }.writeDALSnapshotExecution(
                    clusterDetailsLiteOutputDataset,
                    D.Daily,
                    D.Suffix(clusterDetailsLiteOutputPath),
                    D.EBLzo(),
                    dateRange.end)

              Execution.zip(clusterDetailsExec, clusterDetailsLiteExec)
            }

          Util.printCounters(resultExec)
        }
      }
  }

}

object ClusterDetailsBatch extends ClusterDetailsBatchTrait {
  override val firstTime: String = "2018-07-28"
  override val batchIncrement: Duration = Days(7)

  override val manhattanOutputPath: String =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_cluster_details"

  override val clusterDetailsLiteOutputPath: String =
    "/user/cassowary/processed/simclusters_v2_cluster_details_lite"

  override val modelVersion: String = ModelVersions.Model20M145KDec11
  override val knownForDataset = SimclustersV2KnownFor20M145KDec11ScalaDataset
  override val interestedInDataset = SimclustersV2InterestedInScalaDataset
  override val outputDataset = SimclustersV2ClusterDetailsScalaDataset
  override val clusterDetailsLiteOutputDataset =
    SimclustersV2ClusterDetailsLiteScalaDataset
}

object ClusterDetails20M145KUpdated extends ClusterDetailsBatchTrait {
  override val firstTime: String = "2019-06-16"
  override val batchIncrement: Duration = Days(7)

  override val manhattanOutputPath: String =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_cluster_details_20m_145k_updated"

  override val clusterDetailsLiteOutputPath: String =
    "/user/cassowary/processed/simclusters_v2_cluster_details_lite_20m_145k_updated"

  override val modelVersion: String = ModelVersions.Model20M145KUpdated
  override val knownForDataset = SimclustersV2KnownFor20M145KUpdatedScalaDataset
  override val interestedInDataset = SimclustersV2InterestedIn20M145KUpdatedScalaDataset
  override val outputDataset = SimclustersV2ClusterDetails20M145KUpdatedScalaDataset
  override val clusterDetailsLiteOutputDataset =
    SimclustersV2ClusterDetailsLite20M145KUpdatedScalaDataset
}

/**
 * capesospy-v2 update --build_locally --start_cron cluster_details_20m_145k_2020 \
 * src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml
 */
object ClusterDetails20M145K2020 extends ClusterDetailsBatchTrait {
  override val firstTime: String = "2020-10-15"
  override val batchIncrement: Duration = Days(7)

  override val manhattanOutputPath: String =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_cluster_details_20m_145k_2020"

  override val clusterDetailsLiteOutputPath: String =
    "/user/cassowary/processed/simclusters_v2_cluster_details_lite_20m_145k_2020"

  override val modelVersion: String = ModelVersions.Model20M145K2020
  override val knownForDataset = SimclustersV2KnownFor20M145K2020ScalaDataset
  override val interestedInDataset = SimclustersV2InterestedIn20M145K2020ScalaDataset
  override val outputDataset = SimclustersV2ClusterDetails20M145K2020ScalaDataset
  override val clusterDetailsLiteOutputDataset =
    SimclustersV2ClusterDetailsLite20M145K2020ScalaDataset
}

/**
scalding remote run --main-class com.twitter.simclusters_v2.scalding.DumpClusterDetailsAdhoc \
 --target src/scala/com/twitter/simclusters_v2/scalding:cluster_details-dump \
 --user recos-platform -- \
 --date 2020-06-25 \
 --clusterIds 5542 129677 48645 \
 --inputDir /user/recos-platform/adhoc/your_ldap/cluster_details_inferred_lang
 */
object DumpClusterDetailsAdhoc extends TwitterExecutionApp {
  def job: Execution[Unit] =
    Execution.getConfigMode.flatMap {
      case (config, mode) =>
        Execution.withId { implicit uniqueId =>
          val args = config.getArgs
          val clusters = args.list("clusterIds").map(_.toInt).toSet //(1 to 2500).toSet //
          TypedPipe
            .from(AdhocKeyValSources.clusterDetailsSource(args("inputDir")))
            .filter { case ((modelVersion, clusterId), details) => clusters.contains(clusterId) }
|
|
||||||
.toIterableExecution
|
|
||||||
.map { iter =>
|
|
||||||
iter.foreach { x => println(Util.prettyJsonMapper.writeValueAsString(x)) }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* ./bazel bundle src/scala/com/twitter/simclusters_v2/scalding:cluster_details && \
|
|
||||||
* oscar hdfs --user cassowary --host hadoopnest2.atla.twitter.com --bundle cluster_details \
|
|
||||||
* --tool com.twitter.simclusters_v2.scalding.DumpClusterSimilaritiesAdhoc --screen --screen-detached \
|
|
||||||
* --tee your_ldap/dumpClusterSimilarities_20200103 -- \
|
|
||||||
* --inputDir /user/cassowary/manhattan_sequence_files/simclusters_v2_cluster_details_20m_145k_updated/ \
|
|
||||||
* --outputDir adhoc/your_ldap
|
|
||||||
*/
|
|
||||||
object DumpClusterSimilaritiesAdhoc extends TwitterExecutionApp {
|
|
||||||
def job: Execution[Unit] =
|
|
||||||
Execution.getConfigMode.flatMap {
|
|
||||||
case (config, mode) =>
|
|
||||||
Execution.withId { implicit uniqueId =>
|
|
||||||
val args = config.getArgs
|
|
||||||
TypedPipe
|
|
||||||
.from(AdhocKeyValSources.clusterDetailsSource(args("inputDir")))
|
|
||||||
.flatMap {
|
|
||||||
case ((_, clusterId), details) =>
|
|
||||||
details.neighborClusters.getOrElse(Nil).map { neighbor =>
|
|
||||||
val compositeScore = (neighbor.followCosineSimilarity
|
|
||||||
.getOrElse(0.0) + neighbor.favCosineSimilarity.getOrElse(0.0)) / 2
|
|
||||||
(
|
|
||||||
clusterId,
|
|
||||||
neighbor.clusterId,
|
|
||||||
"%.4f".format(compositeScore)
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}.writeExecution(TypedTsv(args("outputDir")))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
Binary file not shown.
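A minimal sketch of the composite neighbor score used by DumpClusterSimilaritiesAdhoc above: it averages the follow-based and fav-based cosine similarities, treating a missing score as 0.0. The NeighborCluster shape below is an assumed stand-in for the generated thrift struct, included only for illustration.

// Illustrative only; not part of the deleted file.
case class NeighborCluster(
  clusterId: Int,
  followCosineSimilarity: Option[Double],
  favCosineSimilarity: Option[Double])

def compositeScore(n: NeighborCluster): Double =
  (n.followCosineSimilarity.getOrElse(0.0) + n.favCosineSimilarity.getOrElse(0.0)) / 2

// e.g. compositeScore(NeighborCluster(42, Some(0.8), None)) == 0.4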
@ -1,607 +0,0 @@
package com.twitter.simclusters_v2.scalding

import com.twitter.algebird.Monoid
import com.twitter.algebird.mutable.PriorityQueueMonoid
import com.twitter.dal.client.dataset.KeyValDALDataset
import com.twitter.pluck.source.cassowary.FollowingsCosineSimilaritiesManhattanSource
import com.twitter.scalding._
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.job.TwitterExecutionApp
import com.twitter.scalding_internal.job.analytics_batch._
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.simclusters_v2.common.ModelVersions
import com.twitter.simclusters_v2.hdfs_sources._
import com.twitter.simclusters_v2.scalding.common.TypedRichPipe._
import com.twitter.simclusters_v2.scalding.common.Util
import com.twitter.simclusters_v2.scalding.common.Util.Distribution
import com.twitter.simclusters_v2.thriftscala.ClusterQuality
import com.twitter.simclusters_v2.thriftscala.ClustersUserIsKnownFor
import com.twitter.usersource.snapshot.flat.UsersourceFlatScalaDataset
import java.util.PriorityQueue
import scala.collection.JavaConverters._

object ClusterEvaluation {

  val samplerMonoid: PriorityQueueMonoid[((Long, Long), (Double, Double))] =
    Util.reservoirSamplerMonoidForPairs[(Long, Long), (Double, Double)](5000)(Util.edgeOrdering)

  case class ClusterResults(
    numEdgesInsideCluster: Int,
    wtOfEdgesInsideCluster: Double,
    numEdgesOutsideCluster: Int,
    wtOfEdgesOutsideCluster: Double,
    originalWtAndProductOfNodeScoresSample: PriorityQueue[((Long, Long), (Double, Double))]) {
    def clusterQuality(clusterSize: Int, averagePrecisionWholeGraph: Double): ClusterQuality = {
      val unweightedRecallDenominator = numEdgesInsideCluster + numEdgesOutsideCluster
      val unweightedRecall = if (unweightedRecallDenominator > 0) {
        numEdgesInsideCluster.toDouble / unweightedRecallDenominator.toDouble
      } else 0.0

      val weightedRecallDenominator = wtOfEdgesInsideCluster + wtOfEdgesOutsideCluster
      val weightedRecall = if (weightedRecallDenominator > 0) {
        wtOfEdgesInsideCluster / weightedRecallDenominator
      } else 0.0

      val precision = if (clusterSize > 1) {
        Some(wtOfEdgesInsideCluster / (clusterSize * (clusterSize - 1)))
      } else Some(0.0)

      val relativePrecision = if (averagePrecisionWholeGraph > 0) {
        precision.flatMap { p => Some(p / averagePrecisionWholeGraph) }
      } else Some(0.0)

      ClusterQuality(
        unweightedRecall = Some(unweightedRecall),
        weightedRecall = Some(weightedRecall),
        unweightedRecallDenominator = Some(unweightedRecallDenominator),
        weightedRecallDenominator = Some(weightedRecallDenominator),
        relativePrecisionNumerator = precision,
        relativePrecision = relativePrecision,
        weightAndProductOfNodeScoresCorrelation = Some(
          Util.computeCorrelation(
            originalWtAndProductOfNodeScoresSample.iterator.asScala.map(_._2)))
      )
    }
  }

  object ClusterResultsMonoid extends Monoid[ClusterResults] {
    override def zero = ClusterResults(0, 0, 0, 0, samplerMonoid.zero)
    override def plus(l: ClusterResults, r: ClusterResults) = ClusterResults(
      l.numEdgesInsideCluster + r.numEdgesInsideCluster,
      l.wtOfEdgesInsideCluster + r.wtOfEdgesInsideCluster,
      l.numEdgesOutsideCluster + r.numEdgesOutsideCluster,
      l.wtOfEdgesOutsideCluster + r.wtOfEdgesOutsideCluster,
      samplerMonoid
        .plus(l.originalWtAndProductOfNodeScoresSample, r.originalWtAndProductOfNodeScoresSample)
    )
  }

  /**
   * Evaluate the quality of a cluster.
   * @param memberScores A map with the members of the cluster as the keys and their scores
   *                     inside the cluster as values. The more central a member is inside the cluster,
   *                     the higher its score is.
   * @param membersAdjLists A map that gives the weighted neighbors of each member in the cluster.
   */
  def evaluateCluster(
    memberScores: Map[Long, Double],
    membersAdjLists: Map[Long, Map[Long, Float]]
  ): ClusterResults = {
    val resultsIter = membersAdjLists.flatMap {
      case (fromNodeId, adjList) =>
        val fromNodeWt = memberScores.getOrElse(fromNodeId, 0.0)
        adjList.map {
          case (toNodeId, edgeWt) =>
            if (memberScores.contains(toNodeId)) {
              val productOfMembershipScores = fromNodeWt * memberScores(toNodeId)
              ClusterResults(
                1,
                edgeWt,
                0,
                0,
                samplerMonoid.build(
                  ((fromNodeId, toNodeId), (edgeWt.toDouble, productOfMembershipScores))))
            } else {
              ClusterResults(0, 0, 1, edgeWt, samplerMonoid.zero)
            }
        }
    }
    Monoid.sum(resultsIter)(ClusterResultsMonoid)
  }

  /**
   * Evaluate each cluster with respect to the provided graph.
   * @param graph graph represented via the adjacency lists of each node, needs to be symmetrized i.e. if u is in v's adjlist, then v needs to be in u's adjlist as well
   * @param clusters cluster memberships of each node.
   * @param statsPrefix convenience argument to act as prefix for stats counters
   * @return key-value pipe with clusterId as key and (size of the cluster, quality struct) as value
   */
  def clusterLevelEvaluation(
    graph: TypedPipe[(Long, Map[Long, Float])],
    clusters: TypedPipe[(Long, Array[(Int, Float)])],
    statsPrefix: String = ""
  )(
    implicit uniqueId: UniqueID
  ): Execution[TypedPipe[(Int, (Int, ClusterQuality))]] = {
    val numRealClusters = Stat(s"${statsPrefix}/numRealClusters")
    val numFakeClusters = Stat(s"${statsPrefix}/numFakeClusters")

    val numNodesAndEdgesExec = graph
      .map {
        case (nId, nbrMap) =>
          (1L, nbrMap.size.toLong, nbrMap.values.sum.toDouble)
      }.sum.getExecution

    numNodesAndEdgesExec.map {
      case (numNodes, numEdges, sumOfAllEdgeWts) =>
        println("numNodes " + numNodes)
        println("numEdges " + numEdges)
        println("sumOfAllEdgeWts " + sumOfAllEdgeWts)

        val numFakeClustersForUnassignedNodes = numNodes / 1e4

        val averagePrecisionWholeGraph = sumOfAllEdgeWts / (numNodes * (numNodes - 1))
        graph
          .leftJoin(clusters)
          // uncomment for adhoc job
          .withReducers(200)
          .flatMap {
            case (nodeId, (adjList, assignedClustersOpt)) =>
              val nodeDegree = adjList.size.toLong
              val nodeWeightedDegree = adjList.values.sum
              assignedClustersOpt match {
                case Some(assignedClusters) if assignedClusters.nonEmpty =>
                  assignedClusters.toList.map {
                    case (clusterId, scoreOfNodeInCluster) =>
                      (
                        clusterId,
                        (
                          Map(nodeId -> (scoreOfNodeInCluster.toDouble, adjList)),
                          1,
                          nodeDegree,
                          nodeWeightedDegree))
                  }
                case _ =>
                  // For nodes that don't belong to any cluster, create a fake clusterId (0 or lesser)
                  // and add the node's statistics to that clusterId. We don't need the adjacency lists for
                  // unassigned nodes, we'll simply track how many edges are incident on those nodes and their weighted sum etc
                  val fakeClusterId =
                    (-1 * (math.abs(
                      Util.hashToLong(nodeId)) % numFakeClustersForUnassignedNodes)).toInt
                  List(
                    (
                      fakeClusterId,
                      (
                        Map.empty[Long, (Double, Map[Long, Float])],
                        1,
                        nodeDegree,
                        nodeWeightedDegree)))
              }
          }
          .sumByKey
          // uncomment for adhoc job
          .withReducers(60)
          .map {
            case (clusterId, (membersMap, clusterSize, volumeOfCluster, weightedVolumeOfCluster)) =>
              if (clusterId > 0) {
                numRealClusters.inc()

                val scoresMap =
                  if (clusterId > 0) membersMap.mapValues(_._1) else Map.empty[Long, Double]
                val adjListsMap = membersMap.mapValues(_._2)

                val quality = evaluateCluster(scoresMap, adjListsMap)
                  .clusterQuality(clusterSize, averagePrecisionWholeGraph)

                (clusterId, (clusterSize, quality))
              } else {
                // clusterId <= 0 means that this is a fake cluster.
                numFakeClusters.inc()
                (
                  clusterId,
                  (
                    clusterSize,
                    ClusterQuality(
                      unweightedRecallDenominator = Some(volumeOfCluster),
                      weightedRecallDenominator = Some(weightedVolumeOfCluster)
                    )
                  )
                )
              }
          }
    }
  }

  case class OverallResults(
    unweightedRecall: Double,
    edgesInsideClusters: Long,
    allEdges: Long,
    allNodes: Int,
    weightedRecall: Double,
    wtOnEdgesInsideClusters: Double,
    wtOnAllEdges: Double,
    weightCorrelation: Double,
    relativePrecision: Double,
    numUnassignedNodes: Int,
    numAssignedNodes: Int,
    sizeDist: Distribution,
    recallDist: Distribution,
    weightedRecallDist: Distribution,
    relativePrecisionDist: Distribution,
    weightCorrelationDist: Distribution,
    numClustersWithNegativeCorrelation: Double,
    numClustersWithZeroRecall: Double,
    numClustersWithLessThanOneRelativePrecision: Double,
    numSingletonClusters: Int)

  def summarizePerClusterResults(
    perClusterResults: TypedPipe[(Int, (Int, ClusterQuality))]
  ): Execution[Option[OverallResults]] = {
    perClusterResults
      .map {
        case (clusterId, (size, quality)) =>
          val unweightedRecallDen = quality.unweightedRecallDenominator.getOrElse(0.0)
          val unweightedRecallNum = quality.unweightedRecall.getOrElse(0.0) * unweightedRecallDen
          val weightedRecallDen = quality.weightedRecallDenominator.getOrElse(0.0)
          val weightedRecallNum = quality.weightedRecall.getOrElse(0.0) * weightedRecallDen

          val weightCorrelationDen = size
          val weightCorrelationNum =
            weightCorrelationDen * quality.weightAndProductOfNodeScoresCorrelation
              .getOrElse(0.0)

          val relativePrecisionDen = size
          val relativePrecisionNum = relativePrecisionDen * quality.relativePrecision.getOrElse(0.0)

          val numClustersWithNegativeCorrelation =
            if (weightCorrelationNum < 0 && clusterId > 0) 1 else 0
          val numClustersWithLessThanOneRelativePrecision =
            if (quality.relativePrecision.getOrElse(0.0) < 1 && clusterId > 0) 1 else 0
          val numClustersWithZeroRecall = if (weightedRecallNum < 1e-5 && clusterId > 0) 1 else 0
          val numUnassignedNodes = if (clusterId < 1) size else 0
          val numAssignedNodes = if (clusterId > 0) size else 0
          val numSingletonClusters = if (clusterId > 0 && size == 1) 1 else 0

          (
            unweightedRecallDen,
            unweightedRecallNum,
            weightedRecallDen,
            weightedRecallNum,
            weightCorrelationDen,
            weightCorrelationNum,
            relativePrecisionDen,
            relativePrecisionNum,
            numClustersWithNegativeCorrelation,
            numClustersWithLessThanOneRelativePrecision,
            numClustersWithZeroRecall,
            List(size.toDouble),
            List(quality.unweightedRecall.getOrElse(0.0)),
            List(quality.weightedRecall.getOrElse(0.0)),
            List(quality.relativePrecision.getOrElse(0.0)),
            List(quality.weightAndProductOfNodeScoresCorrelation.getOrElse(0.0)),
            numUnassignedNodes,
            numAssignedNodes,
            numSingletonClusters
          )
      }
      .sum
      .toOptionExecution
      .map { opt =>
        opt.map {
          case (
                unweightedRecallDen,
                unweightedRecallNum,
                weightedRecallDen,
                weightedRecallNum,
                weightCorrelationDen,
                weightCorrelationNum,
                relativePrecisionDen,
                relativePrecisionNum,
                numClustersWithNegativeCorrelation,
                numClustersWithLessThanOneRelativePrecision,
                numClustersWithZeroRecall,
                sizeList,
                unweightedRecallList,
                weightedRecallList,
                relativePrecisionList,
                weightCorrelationList,
                numUnassignedNodes,
                numAssignedNodes,
                numSingletonClusters) =>
            OverallResults(
              unweightedRecall = unweightedRecallNum / unweightedRecallDen,
              edgesInsideClusters = unweightedRecallNum.toLong,
              allEdges = unweightedRecallDen.toLong,
              allNodes = numAssignedNodes + numUnassignedNodes,
              weightedRecall = weightedRecallNum / weightedRecallDen,
              wtOnEdgesInsideClusters = weightedRecallNum,
              wtOnAllEdges = weightedRecallDen,
              weightCorrelation = weightCorrelationNum / weightCorrelationDen,
              relativePrecision = relativePrecisionNum / relativePrecisionDen,
              numAssignedNodes = numAssignedNodes,
              numUnassignedNodes = numUnassignedNodes,
              sizeDist = Util.distributionFromArray(sizeList.toArray),
              recallDist = Util.distributionFromArray(unweightedRecallList.toArray),
              weightedRecallDist = Util.distributionFromArray(weightedRecallList.toArray),
              weightCorrelationDist = Util.distributionFromArray(weightCorrelationList.toArray),
              relativePrecisionDist = Util.distributionFromArray(relativePrecisionList.toArray),
              numClustersWithNegativeCorrelation = numClustersWithNegativeCorrelation,
              numClustersWithLessThanOneRelativePrecision =
                numClustersWithLessThanOneRelativePrecision,
              numClustersWithZeroRecall = numClustersWithZeroRecall,
              numSingletonClusters = numSingletonClusters
            )
        }
      }
  }

  /**
   * @param graph Input similarity graph, needs to be symmetrized i.e. if u is in v's adjlist, then v needs to be in u's adjlist as well
   * @param clusters cluster assignments to be evaluated
   * @return summary of results
   */
  def overallEvaluation(
    graph: TypedPipe[(Long, Map[Long, Float])],
    clusters: TypedPipe[(Long, Array[(Int, Float)])],
    statsPrefix: String
  )(
    implicit uniqueId: UniqueID
  ): Execution[Option[OverallResults]] = {
    clusterLevelEvaluation(graph, clusters, statsPrefix).flatMap(summarizePerClusterResults)
  }
}

/**
 * ./bazel bundle src/scala/com/twitter/simclusters_v2/scalding:cluster_evaluation && \
 * oscar hdfs --user frigate --host hadoopnest1.atla.twitter.com --bundle cluster_evaluation \
 * --tool com.twitter.simclusters_v2.scalding.ClusterEvaluationAdhoc --screen --screen-detached \
 * --tee logs/clusterQualityFor_updatedUnnormalizedInputScores_usingSims20190318 -- \
 * --simsInputDir /user/frigate/your_ldap/commonDirForClusterEvaluation/classifiedSims_20190314_copiedFromAtlaProc \
 * --topK 20000000 --date 2019-03-18 --minActiveFollowers 400 \
 * --topUsersDir /user/frigate/your_ldap/commonDirForClusterEvaluation/top20MUsers_minActiveFollowers400_20190215 \
 * --maxSimsNeighborsForEval 40 \
 * --preparedSimsGraph /user/frigate/your_ldap/commonDirForClusterEvaluation/symmetrized_classifiedSims20190318_top20MUsers \
 * --outputDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/knownForClusterEvaluation \
 * --knownForDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/knownFor
 */
object ClusterEvaluationAdhoc extends TwitterExecutionApp {
  implicit val tz: java.util.TimeZone = DateOps.UTC
  implicit val dp = DateParser.default

  def job: Execution[Unit] =
    Execution.getConfigMode.flatMap {
      case (config, mode) =>
        Execution.withId { implicit uniqueId =>
          val args = config.getArgs
          val knownFor = args
            .optional("knownForDir").map { location =>
              KnownForSources.readKnownFor(location)
            }.getOrElse(KnownForSources.knownFor_20M_Dec11_145K)

          val minActiveFollowers = args.int("minActiveFollowers", 400)
          val topK = args.int("topK")
          val date = DateRange.parse(args("date"))

          val topUsersExec =
            TopUsersSimilarityGraph
              .topUsers(
                DAL.readMostRecentSnapshot(UsersourceFlatScalaDataset, date).toTypedPipe,
                minActiveFollowers,
                topK
              )
              .map(_.id)
              .count("num_top_users")
              .make(TypedTsv(args("topUsersDir")))

          val simsGraphExec = topUsersExec.flatMap { topUsers =>
            TopUsersSimilarityGraph.makeGraph(
              TopUsersSimilarityGraph.getSubgraphFromUserGroupedInput(
                TypedPipe.from(WTFCandidatesSource(args("simsInputDir"))),
                topUsers,
                args.int("maxSimsNeighborsForEval", 40),
                degreeThresholdForStat = 5
              ),
              args("preparedSimsGraph")
            )
          }

          val fullExec = simsGraphExec.flatMap { sims =>
            ClusterEvaluation
              .clusterLevelEvaluation(sims, knownFor, "eval")
              .flatMap { clusterResultsPipe =>
                val clusterResults = clusterResultsPipe.forceToDiskExecution
                val outputExec = clusterResults.flatMap { pipe =>
                  pipe
                    .map {
                      case (clusterId, (clusterSize, quality)) =>
                        "%d\t%d\t%.2g\t%.2g\t%.1f\t%.2g\t%.2f\t%.2g\t%.2g"
                          .format(
                            clusterId,
                            clusterSize,
                            quality.unweightedRecall.getOrElse(0.0),
                            quality.weightedRecall.getOrElse(0.0),
                            quality.unweightedRecallDenominator.getOrElse(0.0),
                            quality.weightedRecallDenominator.getOrElse(0.0),
                            quality.relativePrecision.getOrElse(0.0),
                            quality.relativePrecisionNumerator.getOrElse(0.0),
                            quality.weightAndProductOfNodeScoresCorrelation.getOrElse(0.0)
                          )
                    }.writeExecution(TypedTsv(args("outputDir")))
                }

                val printExec = clusterResults.flatMap { pipe =>
                  ClusterEvaluation.summarizePerClusterResults(pipe).map {
                    case Some(res) =>
                      println("Overall results: " + Util.prettyJsonMapper.writeValueAsString(res))
                    case None =>
                      println("No overall results!!! Probably cluster results pipe is empty.")
                  }
                }

                Execution.zip(outputExec, printExec)
              }
          }

          Util.printCounters(fullExec)
        }
    }
}

trait ClusterEvaluationBatch extends TwitterScheduledExecutionApp {
  implicit val tz: java.util.TimeZone = DateOps.UTC
  implicit val dp = DateParser.default

  def firstTime: String

  def batchDescription: String

  def batchIncrement: Duration

  private lazy val execArgs = AnalyticsBatchExecutionArgs(
    batchDesc = BatchDescription(batchDescription),
    firstTime = BatchFirstTime(RichDate(firstTime)),
    lastTime = None,
    batchIncrement = BatchIncrement(batchIncrement)
  )

  val emailAddress: String = "no-reply@twitter.com"

  def knownForDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]]

  def knownForModelVersion: String

  def baselineKnownForDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]]

  def baselineKnownForModelVersion: String

  override def scheduledJob: Execution[Unit] =
    AnalyticsBatchExecution(execArgs) { implicit dateRange =>
      Execution.withId { implicit uniqueId =>
        Execution.withArgs { args =>
          val baselineKnownFor =
            KnownForSources.fromKeyVal(
              DAL
                .readMostRecentSnapshot(baselineKnownForDALDataset, dateRange.prepend(Days(7)))
                .toTypedPipe,
              baselineKnownForModelVersion
            )

          val knownFor =
            KnownForSources.fromKeyVal(
              DAL
                .readMostRecentSnapshot(knownForDALDataset, dateRange.prepend(Days(7)))
                .toTypedPipe,
              knownForModelVersion
            )

          val inputSimsGraph = TypedPipe
            .from(FollowingsCosineSimilaritiesManhattanSource())
            .map(_._2)

          val minActiveFollowers = args.int("minActiveFollowers")
          val topK = args.int("topK")
          val maxSimsNeighborsForEval =
            args.int("maxSimsNeighborsForEval", 40)

          val topUsers = TopUsersSimilarityGraph
            .topUsers(
              DAL
                .readMostRecentSnapshot(UsersourceFlatScalaDataset, dateRange)
                .toTypedPipe,
              minActiveFollowers,
              topK
            )
            .map(_.id)
            .count("num_top_users")

          TopUsersSimilarityGraph
            .getSubgraphFromUserGroupedInput(
              fullGraph = inputSimsGraph,
              usersToInclude = topUsers,
              maxNeighborsPerNode = maxSimsNeighborsForEval,
              degreeThresholdForStat = 2
            )
            .forceToDiskExecution
            .flatMap { symmetrizedSims =>
              val baselineResultsExec = ClusterEvaluation
                .overallEvaluation(symmetrizedSims, baselineKnownFor, "baselineKnownForEval")
              val newResultsExec = ClusterEvaluation
                .overallEvaluation(symmetrizedSims, knownFor, "newKnownForEval")
              val minSizeOfBiggerClusterForComparison = 10
              val compareExec = CompareClusters.summarize(
                CompareClusters.compare(
                  KnownForSources.transpose(baselineKnownFor),
                  KnownForSources.transpose(knownFor),
                  minSizeOfBiggerCluster = minSizeOfBiggerClusterForComparison
                ))

              Execution
                .zip(baselineResultsExec, newResultsExec, compareExec)
                .map {
                  case (oldResults, newResults, compareResults) =>
                    val emailText =
                      s"Evaluation Results for baseline knownFor: $baselineKnownForModelVersion \n" +
                        Util.prettyJsonMapper.writeValueAsString(oldResults) +
                        "\n\n-------------------\n\n" +
                        s"Evaluation Results for new knownFor:$knownForModelVersion\n" +
                        Util.prettyJsonMapper.writeValueAsString(newResults) +
                        "\n\n-------------------\n\n" +
                        s"Cosine similarity distribution between $baselineKnownForModelVersion and " +
                        s"$knownForModelVersion cluster membership vectors for " +
                        s"clusters with at least $minSizeOfBiggerClusterForComparison members:\n" +
                        Util.prettyJsonMapper
                          .writeValueAsString(compareResults)

                    Util
                      .sendEmail(
                        emailText,
                        s"Evaluation results comparing $knownForModelVersion with baseline $baselineKnownForModelVersion",
                        emailAddress)
                    ()
                }
            }
        }
      }
    }
}

/**
 * capesospy-v2 update --build_locally --start_cron cluster_evaluation_for_20M_145k \
 * src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml
 */
object ClusterEvaluationFor20M145K extends ClusterEvaluationBatch {
  override val firstTime: String = "2019-06-11"

  override val batchIncrement: Duration = Days(7)

  override val batchDescription = "com.twitter.simclusters_v2.scalding.ClusterEvaluationFor20M145K"

  override val knownForDALDataset = SimclustersV2KnownFor20M145KUpdatedScalaDataset

  override val knownForModelVersion = ModelVersions.Model20M145KUpdated

  override val baselineKnownForDALDataset = SimclustersV2KnownFor20M145KDec11ScalaDataset

  override val baselineKnownForModelVersion = ModelVersions.Model20M145KDec11
}

/**
 * capesospy-v2 update --build_locally --start_cron cluster_evaluation_for_20M_145k_2020 \
 * src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml
 */
object ClusterEvaluationFor20M145K2020 extends ClusterEvaluationBatch {
  override val firstTime: String = "2021-01-25"

  override val batchIncrement: Duration = Days(7)

  override val batchDescription =
    "com.twitter.simclusters_v2.scalding.ClusterEvaluationFor20M145K2020"

  override val knownForDALDataset = SimclustersV2KnownFor20M145K2020ScalaDataset

  override val knownForModelVersion = ModelVersions.Model20M145K2020

  override val baselineKnownForDALDataset = SimclustersV2KnownFor20M145KUpdatedScalaDataset

  override val baselineKnownForModelVersion = ModelVersions.Model20M145KUpdated
}
Binary file not shown.
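As a rough illustration of the per-cluster quality numbers computed in ClusterEvaluation above (unweighted and weighted recall over edges incident on the cluster, plus precision relative to the whole-graph average), here is a small self-contained sketch; the standalone function and its names are assumptions for illustration, not the job's API.

// Sketch: given edge counts/weights split into inside-cluster vs outside-cluster,
// compute the same style of recall and relative-precision numbers as clusterQuality.
def clusterQualitySketch(
  numEdgesInside: Int,
  wtEdgesInside: Double,
  numEdgesOutside: Int,
  wtEdgesOutside: Double,
  clusterSize: Int,
  avgPrecisionWholeGraph: Double
): (Double, Double, Double) = {
  val unweightedRecall =
    if (numEdgesInside + numEdgesOutside > 0)
      numEdgesInside.toDouble / (numEdgesInside + numEdgesOutside)
    else 0.0
  val weightedRecall =
    if (wtEdgesInside + wtEdgesOutside > 0) wtEdgesInside / (wtEdgesInside + wtEdgesOutside)
    else 0.0
  // precision = total inside weight over the number of ordered member pairs,
  // then normalized by the whole-graph average precision.
  val precision =
    if (clusterSize > 1) wtEdgesInside / (clusterSize * (clusterSize - 1)) else 0.0
  val relativePrecision =
    if (avgPrecisionWholeGraph > 0) precision / avgPrecisionWholeGraph else 0.0
  (unweightedRecall, weightedRecall, relativePrecision)
}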
@ -1,131 +0,0 @@
package com.twitter.simclusters_v2.scalding

import com.twitter.scalding.{DateOps, DateParser, Execution, Stat, TypedPipe, TypedTsv, UniqueID}
import com.twitter.scalding_internal.job.TwitterExecutionApp
import com.twitter.simclusters_v2.common.{ClusterId, UserId}
import com.twitter.simclusters_v2.scalding.common.Util
import com.twitter.simclusters_v2.scalding.common.Util.Distribution

object CompareClusters {
  def norm(a: Iterable[Float]): Float = {
    math
      .sqrt(a.map { x => x * x }.sum).toFloat
  }

  def cosine(a: Map[Long, Float], b: Map[Long, Float]): Float = {
    val intersect = a.toList.collect {
      case (id, score) if b.contains(id) =>
        score * b(id)
    }
    val dot = if (intersect.nonEmpty) intersect.sum else 0
    val aNorm = norm(a.values)
    val bNorm = norm(b.values)
    if (aNorm > 0 && bNorm > 0) {
      dot / aNorm / bNorm
    } else 0
  }

  /**
   * Compare two known-for data sets, and generate stats on changes in cluster assignment
   */
  def compareClusterAssignments(
    newKnownFor: TypedPipe[(UserId, List[(ClusterId, Float)])],
    oldKnownFor: TypedPipe[(UserId, List[(ClusterId, Float)])]
  )(
    implicit uniqueID: UniqueID
  ): Execution[String] = {

    val emptyToSomething = Stat("no_assignment_to_some")
    val somethingToEmpty = Stat("some_assignment_to_none")
    val emptyToEmpty = Stat("empty_to_empty")
    val sameCluster = Stat("same_cluster")
    val diffCluster = Stat("diff_cluster")

    val calculateStatExec = newKnownFor
      .outerJoin(oldKnownFor)
      .map {
        case (userId, (newKnownForListOpt, oldKnownForListOpt)) =>
          val newKnownFor = newKnownForListOpt.getOrElse(Nil)
          val oldKnownFor = oldKnownForListOpt.getOrElse(Nil)

          if (newKnownFor.nonEmpty && oldKnownFor.isEmpty) {
            emptyToSomething.inc()
          }
          if (newKnownFor.isEmpty && oldKnownFor.nonEmpty) {
            somethingToEmpty.inc()
          }
          if (newKnownFor.isEmpty && oldKnownFor.isEmpty) {
            emptyToEmpty.inc()
          }

          if (newKnownFor.nonEmpty && oldKnownFor.nonEmpty) {
            val newClusterId = newKnownFor.head._1
            val oldClusterId = oldKnownFor.head._1

            if (newClusterId == oldClusterId) {
              sameCluster.inc()
            } else {
              diffCluster.inc()
            }
          }
          userId
      }
      .toIterableExecution

    Util.getCustomCountersString(calculateStatExec)
  }

  /**
   * Compare two cluster assignments in terms of cosine similarity of corresponding clusters.
   * Excludes clusters which are too small
   * @param knownForA
   * @param knownForB
   * @param minSizeOfBiggerCluster Set to 10 or some such.
   * @return
   */
  def compare(
    knownForA: TypedPipe[(Int, List[(Long, Float)])],
    knownForB: TypedPipe[(Int, List[(Long, Float)])],
    minSizeOfBiggerCluster: Int
  ): TypedPipe[(Int, Float)] = {
    knownForA
      .outerJoin(knownForB)
      .collect {
        case (clusterId, (membersInAOpt, membersInBOpt))
            if membersInAOpt.exists(_.size >= minSizeOfBiggerCluster) || membersInBOpt
              .exists(_.size >= minSizeOfBiggerCluster) =>
          val membersInA =
            membersInAOpt.map(_.toMap).getOrElse(Map.empty[Long, Float])
          val membersInB =
            membersInBOpt.map(_.toMap).getOrElse(Map.empty[Long, Float])
          (clusterId, cosine(membersInA, membersInB))
      }
  }

  def summarize(clusterToCosines: TypedPipe[(Int, Float)]): Execution[Option[Distribution]] = {
    clusterToCosines.values.map(x => List(x)).sum.toOptionExecution.map { listOpt =>
      listOpt.map { list => Util.distributionFromArray(list.map(_.toDouble).toArray) }
    }
  }
}

object CompareClustersAdhoc extends TwitterExecutionApp {
  implicit val tz: java.util.TimeZone = DateOps.UTC
  implicit val dp = DateParser.default

  def job: Execution[Unit] =
    Execution.getConfigMode.flatMap {
      case (config, mode) =>
        Execution.withId { implicit uniqueId =>
          val args = config.getArgs

          val knownForA = KnownForSources.transpose(KnownForSources.readKnownFor(args("knownForA")))
          val knownForB = KnownForSources.transpose(KnownForSources.readKnownFor(args("knownForB")))

          CompareClusters
            .compare(knownForA, knownForB, minSizeOfBiggerCluster = 10)
            .map { case (cId, cos) => "%d\t%.2f".format(cId, cos) }
            .writeExecution(TypedTsv(args("outputDir")))
        }
    }
}
Binary file not shown.
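The cosine comparison in CompareClusters above treats each cluster as a sparse member-to-weight map; a hedged, dependency-free sketch of the same calculation (on Double maps rather than the job's Float maps, with illustrative names):

// Sketch of cosine similarity between two sparse vectors keyed by member id.
def cosineSketch(a: Map[Long, Double], b: Map[Long, Double]): Double = {
  val dot = a.iterator.collect { case (id, v) if b.contains(id) => v * b(id) }.sum
  def norm(m: Map[Long, Double]): Double = math.sqrt(m.values.map(v => v * v).sum)
  val denom = norm(a) * norm(b)
  if (denom > 0) dot / denom else 0.0
}

// e.g. cosineSketch(Map(1L -> 1.0, 2L -> 1.0), Map(2L -> 1.0)) is roughly 0.707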
@ -1,330 +0,0 @@
package com.twitter.simclusters_v2.scalding

import com.twitter.algebird.Monoid
import com.twitter.logging.Logger
import com.twitter.scalding.{Execution, TypedPipe, TypedTsv}
import com.twitter.scalding_internal.job.TwitterExecutionApp
import com.twitter.simclusters_v2.hdfs_sources.AdhocKeyValSources
import java.util
import no.uib.cipr.matrix.Matrix
import no.uib.cipr.matrix.sparse.{ArpackSym, LinkedSparseMatrix}
import scala.collection.JavaConverters._

object EigenVectorsForSparseSymmetric {
  val log: Logger = Logger()

  /**
   * Construct matrix from the rows of the matrix, specified as a map. The outer map is indexed by rowId, and the inner maps are indexed by columnId.
   * Note that the input matrix is intended to be symmetric.
   *
   * @param map A map specifying the rows of the matrix. The outer map is indexed by rowId, and the inner maps are indexed by columnId. Both rows and columns are zero-indexed.
   * @param nRows number of rows in matrix
   * @param nCols number of columns in matrix
   *
   * @return the constructed matrix
   */
  def getMatrix(map: Map[Int, Map[Int, Double]], nRows: Int, nCols: Int): Matrix = {
    val nonzeros = map.toSeq.flatMap {
      case (i, subMap) =>
        subMap.toSeq.map {
          case (j, value) =>
            (i, j, value)
        }
    }
    getMatrix(nonzeros, nRows, nCols)
  }

  /**
   * Construct matrix from iterable of the non-zero entries. Note that the input matrix is intended to be symmetric.
   *
   * @param nonzeros non-zeros in (i, j, v) format, where i is row, j is column, and v is value. Both rows and columns are zero-indexed.
   * @param nRows number of rows in matrix
   * @param nCols number of columns in matrix
   *
   * @return the constructed matrix
   */
  def getMatrix(nonzeros: Iterable[(Int, Int, Double)], nRows: Int, nCols: Int): Matrix = {
    val matrix = new LinkedSparseMatrix(nRows, nCols)
    var numEntries = 0
    var maxRow = 0
    var maxCol = 0

    nonzeros.foreach {
      case (i, j, v) =>
        if (i > maxRow) {
          maxRow = i
        }
        if (j > maxCol) {
          maxCol = j
        }
        numEntries += 1
        matrix.set(i, j, v)
    }
    log.info(
      "Finished building matrix with %d entries and maxRow %d and maxCol %d"
        .format(numEntries, maxRow, maxCol))

    matrix
  }

  /**
   * Prints out various diagnostics about how much the given matrix differs from a perfect
   * symmetric matrix. If (i,j) and (j,i) are different, it sets both of them to be the max of the two.
   * Call this function before invoking EVD.
   *
   * @param matrix Matrix which is modified (if need be) in place.
   */
  def ensureMatrixIsSymmetric(matrix: Matrix): Unit = {
    var numUnequalEntries = 0
    var numEntriesDifferentBy1Percent = 0
    var numEqualEntries = 0
    var numUnequalDueToZero = 0
    var maxUnequal = (0, 0, 0.0, 0.0)
    matrix.iterator().asScala.foreach { entry =>
      val curr = entry.get()
      val opp = matrix.get(entry.column(), entry.row())
      if (curr == opp) {
        numEqualEntries += 1
      } else {
        numUnequalEntries += 1
        if (opp == 0) {
          numUnequalDueToZero += 1
        }
        if (opp != 0 && (math.abs(curr - opp) / math.min(curr, opp)) > 0.01) {
          numEntriesDifferentBy1Percent += 1
        }
        if (opp != 0 && math.abs(curr - opp) > maxUnequal._4) {
          maxUnequal = (entry.row(), entry.column(), curr, math.abs(curr - opp))
        }
        val max = math.max(curr, opp)
        matrix.set(entry.column(), entry.row(), max)
        matrix.set(entry.row(), entry.column(), max)
      }
    }

    var numUnEqualPrinted = 0
    matrix.iterator().asScala.foreach { entry =>
      val opp = matrix.get(entry.column(), entry.row())
      if (numUnEqualPrinted < 10 && entry.get() != opp) {
        numUnEqualPrinted += 1
        log.info(
          "Entries for (%d, %d) are %s and %s"
            .format(entry.row(), entry.column(), entry.get(), opp))
      }
    }

    log.info(
      "Num unequal entries: %d, num unequal due to zero: %d, num unequal by 1percent or more: %d, num equal entries: %d, maxUnequal: %s"
        .format(
          numUnequalEntries,
          numUnequalDueToZero,
          numEntriesDifferentBy1Percent,
          numEqualEntries,
          maxUnequal))
  }

  /**
   * Get the top-k eigenvalues (largest magnitude) and eigenvectors for an input matrix.
   * Top eigenvalues means they're the largest in magnitude.
   * Input matrix needs to be perfectly symmetric; if it's not, this function will fail.
   *
   * Many of the eigenvectors will have very small values along most of the dimensions. This method also
   * only retains the bigger entries in an eigenvector.
   *
   * @param matrix symmetric input matrix.
   * @param k how many of the top eigenvectors to get.
   * @param ratioToLargestCutoff An entry needs to be at least 1/ratioToLargestCutoff of the biggest entry in that vector to be retained.
   *
   * @return seq of (eigenvalue, eigenvector) pairs.
   */
  def getTruncatedEVD(
    matrix: Matrix,
    k: Int,
    ratioToLargestCutoff: Float
  ): Seq[(Double, Seq[(Int, Double)])] = {
    val solver = new ArpackSym(matrix)
    val resultsMap = solver.solve(k, ArpackSym.Ritz.LM).asScala.toMap
    val results = resultsMap.toIndexedSeq.sortBy { case (eigValue, _) => -eigValue }
    results.zipWithIndex.map {
      case ((eigValue, denseVectorJava), index) =>
        val denseVector = new Array[Double](denseVectorJava.size())
        denseVector.indices.foreach { index => denseVector(index) = denseVectorJava.get(index) }
        val denseVectorMax = denseVector.maxBy { entry => math.abs(entry) }
        val cutOff = math.abs(denseVectorMax) / ratioToLargestCutoff
        val significantEntries = denseVector.zipWithIndex
          .filter { case (vectorEntry, _) => math.abs(vectorEntry) >= cutOff }
          .sortBy { case (vectorEntry, _) => -1 * math.abs(vectorEntry) }
        (eigValue.toDouble, significantEntries.toSeq.map(_.swap))
    }
  }

  /**
   * Compute U*Diag*Ut - where Diag is a diagonal matrix, and U is a sparse matrix.
   * This is primarily for testing - to make sure that the computed eigenvectors can be used to
   * reconstruct the original matrix up to some reasonable approximation.
   *
   * @param diagToUColumns seq of (diagonal entries, associated column in U)
   * @param cutoff cutoff for including a value in the result.
   *
   * @return result of multiplication, returned as a map of the rows in the results.
   */
  def uTimesDiagTimesUT(
    diagToUColumns: Seq[(Double, Seq[(Int, Double)])],
    cutoff: Double
  ): Map[Int, Map[Int, Double]] = {
    val result = new util.HashMap[Int, util.HashMap[Int, Double]]()
    diagToUColumns.foreach {
      case (diag, uColumn) =>
        uColumn.foreach {
          case (i, iVal) =>
            uColumn.foreach {
              case (j, jVal) =>
                val prod = diag * iVal * jVal
                if (result.containsKey(i)) {
                  val newVal = if (result.get(i).containsKey(j)) {
                    result.get(i).get(j) + prod
                  } else prod
                  result.get(i).put(j, newVal)
                } else {
                  result.put(i, new util.HashMap[Int, Double])
                  result.get(i).put(j, prod)
                }
            }
        }
    }
    val unfiltered = result.asScala.toMap.mapValues(_.asScala.toMap)
    unfiltered
      .mapValues { m => m.filter { case (_, value) => math.abs(value) >= cutoff } }
      .filter { case (_, vector) => vector.nonEmpty }
  }

  /** Note: This requires a full EVD to correctly compute the inverse! :-( */
  def getInverseFromEVD(
    evd: Seq[(Double, Seq[(Int, Double)])],
    cutoff: Double
  ): Map[Int, Map[Int, Double]] = {
    val evdInverse = evd.map {
      case (eigValue, eigVector) =>
        (1.0 / eigValue, eigVector)
    }
    uTimesDiagTimesUT(evdInverse, cutoff)
  }
}

object PCAProjectionMatrixAdhoc extends TwitterExecutionApp {
  val log = Logger()

  def job: Execution[Unit] =
    Execution.getConfigMode.flatMap {
      case (config, _) =>
        Execution.withId { _ =>
          val args = config.getArgs
          val k = args.int("k", 100)
          val ratioToLargestEntryInVectorCutoff = args.int("ratioToLargestEntryInVectorCutoff", 100)
          val minClusterFavers = args.int("minClusterFavers", 1000)
          val input = TypedPipe.from(AdhocKeyValSources.clusterDetailsSource(args("inputDir")))
          val outputDir = args("outputDir")

          val filteredClustersExec =
            input
              .collect {
                case ((_, clusterId), details)
                    if details.numUsersWithNonZeroFavScore > minClusterFavers =>
                  clusterId
              }
              .toIterableExecution
              .map { fc =>
                val fcSet = fc.toSet
                log.info("Number of clusters with favers more than %d is %d"
                  .format(minClusterFavers, fcSet.size))
                fcSet
              }

          filteredClustersExec
            .flatMap { filteredClusters =>
              input.flatMap {
                case ((_, clusterId), details) =>
                  if (filteredClusters(clusterId)) {
                    details.neighborClusters.getOrElse(Nil).collect {
                      case neighbor
                          if filteredClusters(
                            neighbor.clusterId) && neighbor.favCosineSimilarity.isDefined =>
                        (clusterId, neighbor.clusterId, neighbor.favCosineSimilarity.get)
                    }
                  } else Nil
              }.toIterableExecution
            }
            .flatMap { edgesIter =>
              val edges = edgesIter.toSeq
              val oldIdToNewId = edges
                .flatMap { case (i, j, _) => Seq(i, j) }
                .distinct
                .zipWithIndex
                .toMap

              val mapString = oldIdToNewId.toList
                .take(5).map {
                  case (old, nw) =>
                    Seq(old, nw).mkString(" ")
                }.mkString("\n")
              log.info("A few entries of OldId to NewId map is")
              log.info(mapString)

              val newIdToOldId = oldIdToNewId.map(_.swap)
              log.info(
                "Num clusters after filtering out those with no neighbors with favers more than %d is %d"
                  .format(minClusterFavers, oldIdToNewId.size))
              val newEdges = edges.map {
                case (oldI, oldJ, value) =>
                  (oldIdToNewId(oldI), oldIdToNewId(oldJ), value)
              }
              log.info("Going to build matrix")
              val matrix = EigenVectorsForSparseSymmetric.getMatrix(
                newEdges,
                oldIdToNewId.size,
                oldIdToNewId.size)
              EigenVectorsForSparseSymmetric.ensureMatrixIsSymmetric(matrix)

              log.info("Going to solve now for %d eigenvalues".format(k))
              val tic = System.currentTimeMillis()
              val results = EigenVectorsForSparseSymmetric.getTruncatedEVD(
                matrix,
                k,
                ratioToLargestEntryInVectorCutoff)
              val toc = System.currentTimeMillis()
              log.info("Finished solving in %.2f minutes".format((toc - tic) / 1000 / 60.0))

              val eigValues = results.map(_._1).map { x => "%.3g".format(x) }.mkString(" ")
              val eigValueNorm = math.sqrt(results.map(_._1).map(x => x * x).sum)
              val matrixNorm = math.sqrt(matrix.iterator().asScala.map(_.get()).map(x => x * x).sum)

              println(
                "matrixNorm %s, eigValueNorm %s, explained fraction %s"
                  .format(matrixNorm, eigValueNorm, eigValueNorm / matrixNorm))

              log.info("The eigenvalues are:")
              log.info(eigValues)

              val nnzInEigenVectors = results.map(_._2.size).sum
              log.info("Average nnz per eigenvector using ratioToLargestCutoff %d is %.2g"
                .format(ratioToLargestEntryInVectorCutoff, nnzInEigenVectors * 1.0 / results.size))
              val transposedRaw = results.zipWithIndex.flatMap {
                case ((_, eigVector), eigIndex) =>
                  eigVector.map {
                    case (index, vectorEntry) =>
                      val clusterId = newIdToOldId(index)
                      Map(clusterId -> List((eigIndex, vectorEntry)))
                  }
              }
              val transposed = Monoid.sum(transposedRaw).mapValues { rowForCluster =>
                rowForCluster
                  .map {
                    case (dimId, weight) =>
                      "%d:%.2g".format(dimId, weight)
                  }.mkString(" ")
              }
              TypedPipe.from(transposed.toSeq).writeExecution(TypedTsv(outputDir))
            }
        }
    }
}
Binary file not shown.
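The uTimesDiagTimesUT helper above reconstructs an approximation of the original matrix from the truncated eigendecomposition (the sum over eigenpairs of lambda * v * v^T). Below is a small sketch of the same idea using plain Scala maps; all names are illustrative assumptions, not the library's API.

// Sketch: reconstruct A ~= sum_k lambda_k * v_k * v_k^T from sparse eigenvectors,
// accumulating entries keyed by (row, column).
def reconstructSketch(
  eigenPairs: Seq[(Double, Seq[(Int, Double)])]
): Map[(Int, Int), Double] = {
  eigenPairs.foldLeft(Map.empty[(Int, Int), Double]) {
    case (acc, (lambda, vector)) =>
      vector.foldLeft(acc) {
        case (accInner, (i, vi)) =>
          vector.foldLeft(accInner) {
            case (accInnermost, (j, vj)) =>
              val key = (i, j)
              accInnermost.updated(key, accInnermost.getOrElse(key, 0.0) + lambda * vi * vj)
          }
      }
  }
}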
@ -1,332 +0,0 @@
package com.twitter.simclusters_v2.scalding

import com.twitter.dal.client.dataset.KeyValDALDataset
import com.twitter.dal.client.dataset.SnapshotDALDataset
import com.twitter.scalding._
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.DALWrite.D
import com.twitter.scalding_internal.dalv2.DALWrite.WriteExtension
import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossClusterSameDC
import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossDC
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.simclusters_v2.common.ClusterId
import com.twitter.simclusters_v2.common.ModelVersions
import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.hdfs_sources.AdhocKeyValSources
import com.twitter.simclusters_v2.hdfs_sources.AggregatableProducerSimclustersEmbeddingsByLogFavScore2020ScalaDataset
import com.twitter.simclusters_v2.hdfs_sources.SimclustersV2InterestedInFromAggregatableProducerEmbeddings20M145K2020ScalaDataset
import com.twitter.simclusters_v2.hdfs_sources.SimclustersV2UserToInterestedInFromAggregatableProducerEmbeddings20M145K2020ScalaDataset
import com.twitter.simclusters_v2.hdfs_sources.UserAndNeighborsFixedPathSource
import com.twitter.simclusters_v2.hdfs_sources.UserUserNormalizedGraphScalaDataset
import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn
import com.twitter.simclusters_v2.thriftscala.InternalId
import com.twitter.simclusters_v2.thriftscala.ModelVersion
import com.twitter.simclusters_v2.thriftscala.SimClustersEmbedding
import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId
import com.twitter.simclusters_v2.thriftscala.UserAndNeighbors
import com.twitter.simclusters_v2.thriftscala.UserToInterestedInClusterScores
import com.twitter.simclusters_v2.thriftscala.UserToInterestedInClusters
import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp
import com.twitter.wtf.scalding.jobs.common.ScheduledExecutionApp
import java.util.TimeZone

/**
 * Production job for computing interestedIn data set from the aggregatable producer embeddings for the model version 20M145K2020.
 * It writes the data set in KeyVal format to produce a MH DAL data set.
 *
 * A high level description of this job:
 * - Read the APE dataset
 * - Apply log1p to the scores from the above dataset as the scores for producers is high
 * - Normalize the scores for each producer (offline benchmarking has shown better results from this step.)
 * - Truncate the number of clusters for each producer from the APE dataset to reduce noise
 * - Compute interestedIn
 *
 * To deploy the job:
 *
 * capesospy-v2 update --build_locally --start_cron interested_in_from_ape_2020 \
 * src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml
 */
object InterestedInFromAPE2020BatchApp extends InterestedInFromAggregatableProducerEmbeddingsBase {

  override val firstTime: RichDate = RichDate("2021-03-03")

  override val batchIncrement: Duration = Days(7)

  override def modelVersion: ModelVersion = ModelVersion.Model20m145k2020

  override def producerEmbeddingsInputKVDataset: KeyValDALDataset[
    KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]
  ] = AggregatableProducerSimclustersEmbeddingsByLogFavScore2020ScalaDataset

  override def interestedInFromAPEOutputKVDataset: KeyValDALDataset[
    KeyVal[UserId, ClustersUserIsInterestedIn]
  ] = SimclustersV2InterestedInFromAggregatableProducerEmbeddings20M145K2020ScalaDataset

  override def interestedInFromAPEOutputThriftDatset: SnapshotDALDataset[
    UserToInterestedInClusters
  ] = SimclustersV2UserToInterestedInFromAggregatableProducerEmbeddings20M145K2020ScalaDataset
}

trait InterestedInFromAggregatableProducerEmbeddingsBase extends ScheduledExecutionApp {
  def modelVersion: ModelVersion

  def interestedInFromAPEOutputKVDataset: KeyValDALDataset[
    KeyVal[UserId, ClustersUserIsInterestedIn]
  ]

  def producerEmbeddingsInputKVDataset: KeyValDALDataset[
    KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]
  ]

  def interestedInFromAPEOutputThriftDatset: SnapshotDALDataset[UserToInterestedInClusters]

  override def runOnDateRange(
    args: Args
  )(
    implicit dateRange: DateRange,
    timeZone: TimeZone,
    uniqueID: UniqueID
  ): Execution[Unit] = {
    // Input args for the run
    val socialProofThreshold = args.int("socialProofThreshold", 2)
    val maxClustersFromProducer = args.int("maxClustersPerProducer", 5)
    val maxClustersPerUserFinalResult = args.int("maxInterestedInClustersPerUser", 200)

    // Path variables
    val interestedInFromProducersPath =
      s"/user/cassowary/manhattan_sequence_files/interested_in_from_ape/" + modelVersion

    val interestedInFromProducersThriftPath =
      s"/user/cassowary/manhattan_sequence_files/interested_in_from_ape_thrift/" + modelVersion

    val userUserGraph: TypedPipe[UserAndNeighbors] =
      DAL
        .readMostRecentSnapshotNoOlderThan(UserUserNormalizedGraphScalaDataset, Days(30))
        .withRemoteReadPolicy(AllowCrossDC)
        .toTypedPipe

    val producerEmbeddings = DAL
      .readMostRecentSnapshotNoOlderThan(
        producerEmbeddingsInputKVDataset,
        Days(30)).withRemoteReadPolicy(AllowCrossClusterSameDC).toTypedPipe.map {
        case KeyVal(producer, embeddings) => (producer, embeddings)
      }

    val result = InterestedInFromAggregatableProducerEmbeddingsBase.run(
      userUserGraph,
      producerEmbeddings,
      maxClustersFromProducer,
      socialProofThreshold,
      maxClustersPerUserFinalResult,
      modelVersion)

    val keyValExec =
      result
        .map { case (userId, clusters) => KeyVal(userId, clusters) }
        .writeDALVersionedKeyValExecution(
          interestedInFromAPEOutputKVDataset,
          D.Suffix(interestedInFromProducersPath)
        )
    val thriftExec =
      result
        .map {
          case (userId, clusters) =>
            UserToInterestedInClusters(
              userId,
              ModelVersions.toKnownForModelVersion(modelVersion),
              clusters.clusterIdToScores)
        }
        .writeDALSnapshotExecution(
          interestedInFromAPEOutputThriftDatset,
          D.Daily,
          D.Suffix(interestedInFromProducersThriftPath),
          D.EBLzo(),
          dateRange.end
        )
    Execution.zip(keyValExec, thriftExec).unit
  }
}

/**
 * Adhoc job to generate the interestedIn from aggregatable producer embeddings for the model version 20M145K2020
 *
 * scalding remote run \
 * --user cassowary \
 * --keytab /var/lib/tss/keys/fluffy/keytabs/client/cassowary.keytab \
 * --principal service_acoount@TWITTER.BIZ \
 * --cluster bluebird-qus1 \
 * --main-class com.twitter.simclusters_v2.scalding.InterestedInFromAPE2020AdhocApp \
 * --target src/scala/com/twitter/simclusters_v2/scalding:interested_in_from_ape_2020-adhoc \
 * --hadoop-properties "mapreduce.map.memory.mb=8192 mapreduce.map.java.opts='-Xmx7618M' mapreduce.reduce.memory.mb=8192 mapreduce.reduce.java.opts='-Xmx7618M'" \
 * -- --outputDir /gcs/user/cassowary/adhoc/your_ldap/interested_in_from_ape_2020_keyval --date 2021-03-05
 */
object InterestedInFromAPE2020AdhocApp extends AdhocExecutionApp {
  override def runOnDateRange(
    args: Args
  )(
    implicit dateRange: DateRange,
    timeZone: TimeZone,
    uniqueID: UniqueID
  ): Execution[Unit] = {
    val outputDir = args("outputDir")
    val socialProofThreshold = args.int("socialProofThreshold", 2)
    val maxClustersPerUserFinalResult = args.int("maxInterestedInClustersPerUser", 200)
    val maxClustersFromProducer = args.int("maxClustersFromProducer", 5)
|
||||||
val inputGraph = args.optional("graphInputDir") match {
|
|
||||||
case Some(inputDir) => TypedPipe.from(UserAndNeighborsFixedPathSource(inputDir))
|
|
||||||
case None =>
|
|
||||||
DAL
|
|
||||||
.readMostRecentSnapshotNoOlderThan(UserUserNormalizedGraphScalaDataset, Days(30))
|
|
||||||
.withRemoteReadPolicy(AllowCrossClusterSameDC)
|
|
||||||
.toTypedPipe
|
|
||||||
}
|
|
||||||
|
|
||||||
val producerEmbeddings = DAL
|
|
||||||
.readMostRecentSnapshotNoOlderThan(
|
|
||||||
AggregatableProducerSimclustersEmbeddingsByLogFavScore2020ScalaDataset,
|
|
||||||
Days(30)).withRemoteReadPolicy(AllowCrossClusterSameDC).toTypedPipe.map {
|
|
||||||
case KeyVal(producer, embeddings) => (producer, embeddings)
|
|
||||||
}
|
|
||||||
|
|
||||||
val result = InterestedInFromAggregatableProducerEmbeddingsBase.run(
|
|
||||||
inputGraph,
|
|
||||||
producerEmbeddings,
|
|
||||||
maxClustersFromProducer,
|
|
||||||
socialProofThreshold,
|
|
||||||
maxClustersPerUserFinalResult,
|
|
||||||
ModelVersion.Model20m145k2020)
|
|
||||||
|
|
||||||
result
|
|
||||||
.writeExecution(AdhocKeyValSources.interestedInSource(outputDir))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Helper functions
|
|
||||||
*/
|
|
||||||
object InterestedInFromAggregatableProducerEmbeddingsBase {
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Helper function to prune the embeddings
|
|
||||||
* @param embeddingsWithScore embeddings
|
|
||||||
* @param maxClusters number of clusters to keep, per userId
|
|
||||||
* @param uniqueId for stats
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
def getPrunedEmbeddings(
|
|
||||||
embeddingsWithScore: TypedPipe[(UserId, Seq[(ClusterId, Float)])],
|
|
||||||
maxClusters: Int
|
|
||||||
)(
|
|
||||||
implicit uniqueId: UniqueID
|
|
||||||
): TypedPipe[(UserId, Array[(ClusterId, Float)])] = {
|
|
||||||
val numProducerMappings = Stat("num_producer_embeddings_total")
|
|
||||||
val numProducersWithLargeClusterMappings = Stat(
|
|
||||||
"num_producers_with_more_clusters_than_threshold")
|
|
||||||
val numProducersWithSmallClusterMappings = Stat(
|
|
||||||
"num_producers_with_clusters_less_than_threshold")
|
|
||||||
val totalClustersCoverageProducerEmbeddings = Stat("num_clusters_total_producer_embeddings")
|
|
||||||
embeddingsWithScore.map {
|
|
||||||
case (producerId, clusterArray) =>
|
|
||||||
numProducerMappings.inc()
|
|
||||||
val clusterSize = clusterArray.size
|
|
||||||
totalClustersCoverageProducerEmbeddings.incBy(clusterSize)
|
|
||||||
val prunedList = if (clusterSize > maxClusters) {
|
|
||||||
numProducersWithLargeClusterMappings.inc()
|
|
||||||
clusterArray
|
|
||||||
.sortBy {
|
|
||||||
case (_, knownForScore) => -knownForScore
|
|
||||||
}.take(maxClusters)
|
|
||||||
} else {
|
|
||||||
numProducersWithSmallClusterMappings.inc()
|
|
||||||
clusterArray
|
|
||||||
}
|
|
||||||
(producerId, prunedList.toArray)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* helper function to remove all scores except follow and logFav
|
|
||||||
* @param interestedInResult interestedIn clusters for a user
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
def getInterestedInDiscardScores(
|
|
||||||
interestedInResult: TypedPipe[(UserId, List[(ClusterId, UserToInterestedInClusterScores)])]
|
|
||||||
): TypedPipe[(UserId, List[(ClusterId, UserToInterestedInClusterScores)])] = {
|
|
||||||
interestedInResult.map {
|
|
||||||
case (srcId, fullClusterList) =>
|
|
||||||
val fullClusterListWithDiscardedScores = fullClusterList.map {
|
|
||||||
case (clusterId, clusterDetails) =>
|
|
||||||
val clusterDetailsWithoutSocial = UserToInterestedInClusterScores(
|
|
||||||
// We are not planning to use the other scores except for logFav and Follow.
|
|
||||||
// Hence, setting others as None for now, we can add them back when needed
|
|
||||||
followScore = clusterDetails.followScore,
|
|
||||||
logFavScore = clusterDetails.logFavScore,
|
|
||||||
logFavScoreClusterNormalizedOnly = clusterDetails.logFavScoreClusterNormalizedOnly
|
|
||||||
)
|
|
||||||
(clusterId, clusterDetailsWithoutSocial)
|
|
||||||
}
|
|
||||||
(srcId, fullClusterListWithDiscardedScores)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Helper function to normalize the embeddings
|
|
||||||
* @param embeddings cluster embeddings
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
def getNormalizedEmbeddings(
|
|
||||||
embeddings: TypedPipe[(UserId, Seq[(ClusterId, Float)])]
|
|
||||||
): TypedPipe[(UserId, Seq[(ClusterId, Float)])] = {
|
|
||||||
embeddings.map {
|
|
||||||
case (userId, clustersWithScores) =>
|
|
||||||
val l2norm = math.sqrt(clustersWithScores.map(_._2).map(score => score * score).sum)
|
|
||||||
(
|
|
||||||
userId,
|
|
||||||
clustersWithScores.map {
|
|
||||||
case (clusterId, score) => (clusterId, (score / l2norm).toFloat)
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
def run(
|
|
||||||
userUserGraph: TypedPipe[UserAndNeighbors],
|
|
||||||
producerEmbeddings: TypedPipe[(SimClustersEmbeddingId, SimClustersEmbedding)],
|
|
||||||
maxClustersFromProducer: Int,
|
|
||||||
socialProofThreshold: Int,
|
|
||||||
maxClustersPerUserFinalResult: Int,
|
|
||||||
modelVersion: ModelVersion
|
|
||||||
)(
|
|
||||||
implicit uniqueId: UniqueID
|
|
||||||
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
|
|
||||||
import InterestedInFromKnownFor._
|
|
||||||
|
|
||||||
val producerEmbeddingsWithScore: TypedPipe[(UserId, Seq[(ClusterId, Float)])] =
|
|
||||||
producerEmbeddings.map {
|
|
||||||
case (
|
|
||||||
SimClustersEmbeddingId(embeddingType, modelVersion, InternalId.UserId(producerId)),
|
|
||||||
simclusterEmbedding) =>
|
|
||||||
(
|
|
||||||
producerId,
|
|
||||||
simclusterEmbedding.embedding.map { simclusterWithScore =>
|
|
||||||
// APE dataset has very high producer scores, hence applying log to smoothen them out before
|
|
||||||
// computing interestedIn
|
|
||||||
(simclusterWithScore.clusterId, math.log(1.0 + simclusterWithScore.score).toFloat)
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
val result = keepOnlyTopClusters(
|
|
||||||
getInterestedInDiscardScores(
|
|
||||||
attachNormalizedScores(
|
|
||||||
userClusterPairsWithoutNormalization(
|
|
||||||
userUserGraph,
|
|
||||||
getPrunedEmbeddings(
|
|
||||||
getNormalizedEmbeddings(producerEmbeddingsWithScore),
|
|
||||||
maxClustersFromProducer),
|
|
||||||
socialProofThreshold,
|
|
||||||
))),
|
|
||||||
maxClustersPerUserFinalResult,
|
|
||||||
ModelVersions.toKnownForModelVersion(modelVersion)
|
|
||||||
)
|
|
||||||
result
|
|
||||||
}
|
|
||||||
}
|
|
Binary file not shown.
@ -1,666 +0,0 @@
|
|||||||
package com.twitter.simclusters_v2.scalding
|
|
||||||
|
|
||||||
import com.twitter.algebird.Semigroup
|
|
||||||
import com.twitter.bijection.Injection
|
|
||||||
import com.twitter.dal.client.dataset.KeyValDALDataset
|
|
||||||
import com.twitter.scalding.TypedPipe
|
|
||||||
import com.twitter.scalding._
|
|
||||||
import com.twitter.scalding_internal.dalv2.DAL
|
|
||||||
import com.twitter.scalding_internal.dalv2.DALWrite._
|
|
||||||
import com.twitter.scalding_internal.job.TwitterExecutionApp
|
|
||||||
import com.twitter.scalding_internal.job.analytics_batch.AnalyticsBatchExecution
|
|
||||||
import com.twitter.scalding_internal.job.analytics_batch.AnalyticsBatchExecutionArgs
|
|
||||||
import com.twitter.scalding_internal.job.analytics_batch.BatchDescription
|
|
||||||
import com.twitter.scalding_internal.job.analytics_batch.BatchFirstTime
|
|
||||||
import com.twitter.scalding_internal.job.analytics_batch.BatchIncrement
|
|
||||||
import com.twitter.scalding_internal.job.analytics_batch.TwitterScheduledExecutionApp
|
|
||||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
|
|
||||||
import com.twitter.simclusters_v2.common.ClusterId
|
|
||||||
import com.twitter.simclusters_v2.common.ModelVersions
|
|
||||||
import com.twitter.simclusters_v2.common.UserId
|
|
||||||
import com.twitter.simclusters_v2.hdfs_sources._
|
|
||||||
import com.twitter.simclusters_v2.scalding.common.Util
|
|
||||||
import com.twitter.simclusters_v2.thriftscala._
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This file implements the job for computing users' interestedIn vector from KnownFor data set.
|
|
||||||
*
|
|
||||||
* It reads the UserUserNormalizedGraphScalaDataset to get user-user follow + fav graph, and then
|
|
||||||
* based on the known-for clusters of each followed/faved user, we calculate how much a user is
|
|
||||||
* interestedIn a cluster.
|
|
||||||
*/
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Production job for computing interestedIn data set for the model version 20M145K2020.
|
|
||||||
*
|
|
||||||
* To deploy the job:
|
|
||||||
*
|
|
||||||
* capesospy-v2 update --build_locally --start_cron interested_in_for_20M_145k_2020 \
|
|
||||||
src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml
|
|
||||||
*/
|
|
||||||
object InterestedInFromKnownFor20M145K2020 extends InterestedInFromKnownForBatchBase {
|
|
||||||
override val firstTime: String = "2020-10-06"
|
|
||||||
override val outputKVDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsInterestedIn]] =
|
|
||||||
SimclustersV2RawInterestedIn20M145K2020ScalaDataset
|
|
||||||
override val outputPath: String = InternalDataPaths.RawInterestedIn2020Path
|
|
||||||
override val knownForModelVersion: String = ModelVersions.Model20M145K2020
|
|
||||||
override val knownForDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]] =
|
|
||||||
SimclustersV2KnownFor20M145K2020ScalaDataset
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* base class for the main logic of computing interestedIn from KnownFor data set.
|
|
||||||
*/
|
|
||||||
trait InterestedInFromKnownForBatchBase extends TwitterScheduledExecutionApp {
|
|
||||||
implicit val tz = DateOps.UTC
|
|
||||||
implicit val parser = DateParser.default
|
|
||||||
|
|
||||||
def firstTime: String
|
|
||||||
val batchIncrement: Duration = Days(7)
|
|
||||||
val lookBackDays: Duration = Days(30)
|
|
||||||
|
|
||||||
def outputKVDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsInterestedIn]]
|
|
||||||
def outputPath: String
|
|
||||||
def knownForModelVersion: String
|
|
||||||
def knownForDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]]
|
|
||||||
|
|
||||||
private lazy val execArgs = AnalyticsBatchExecutionArgs(
|
|
||||||
batchDesc = BatchDescription(this.getClass.getName.replace("$", "")),
|
|
||||||
firstTime = BatchFirstTime(RichDate(firstTime)),
|
|
||||||
lastTime = None,
|
|
||||||
batchIncrement = BatchIncrement(batchIncrement)
|
|
||||||
)
|
|
||||||
|
|
||||||
override def scheduledJob: Execution[Unit] = AnalyticsBatchExecution(execArgs) {
|
|
||||||
implicit dateRange =>
|
|
||||||
Execution.withId { implicit uniqueId =>
|
|
||||||
Execution.withArgs { args =>
|
|
||||||
val normalizedGraph =
|
|
||||||
DAL.readMostRecentSnapshot(UserUserNormalizedGraphScalaDataset).toTypedPipe
|
|
||||||
val knownFor = KnownForSources.fromKeyVal(
|
|
||||||
DAL.readMostRecentSnapshot(knownForDALDataset, dateRange.extend(Days(30))).toTypedPipe,
|
|
||||||
knownForModelVersion
|
|
||||||
)
|
|
||||||
|
|
||||||
val socialProofThreshold = args.int("socialProofThreshold", 2)
|
|
||||||
val maxClustersPerUser = args.int("maxClustersPerUser", 50)
|
|
||||||
|
|
||||||
val result = InterestedInFromKnownFor
|
|
||||||
.run(
|
|
||||||
normalizedGraph,
|
|
||||||
knownFor,
|
|
||||||
socialProofThreshold,
|
|
||||||
maxClustersPerUser,
|
|
||||||
knownForModelVersion
|
|
||||||
)
|
|
||||||
|
|
||||||
val writeKeyValResultExec = result
|
|
||||||
.map { case (userId, clusters) => KeyVal(userId, clusters) }
|
|
||||||
.writeDALVersionedKeyValExecution(
|
|
||||||
outputKVDataset,
|
|
||||||
D.Suffix(outputPath)
|
|
||||||
)
|
|
||||||
|
|
||||||
// read previous data set for validation purpose
|
|
||||||
val previousDataset = if (RichDate(firstTime).timestamp != dateRange.start.timestamp) {
|
|
||||||
DAL
|
|
||||||
.readMostRecentSnapshot(outputKVDataset, dateRange.prepend(lookBackDays)).toTypedPipe
|
|
||||||
.map {
|
|
||||||
case KeyVal(user, interestedIn) =>
|
|
||||||
(user, interestedIn)
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
TypedPipe.empty
|
|
||||||
}
|
|
||||||
|
|
||||||
Util.printCounters(
|
|
||||||
Execution
|
|
||||||
.zip(
|
|
||||||
writeKeyValResultExec,
|
|
||||||
InterestedInFromKnownFor.dataSetStats(result, "NewResult"),
|
|
||||||
InterestedInFromKnownFor.dataSetStats(previousDataset, "OldResult")
|
|
||||||
).unit
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Adhoc job to compute user interestedIn.
|
|
||||||
*
|
|
||||||
* scalding remote run --target src/scala/com/twitter/simclusters_v2/scalding:interested_in_adhoc \
|
|
||||||
* --user recos-platform \
|
|
||||||
* --submitter hadoopnest2.atla.twitter.com \
|
|
||||||
* --main-class com.twitter.simclusters_v2.scalding.InterestedInFromKnownForAdhoc -- \
|
|
||||||
* --date 2019-08-26 --outputDir /user/recos-platform/adhoc/simclusters_interested_in_log_fav
|
|
||||||
*/
|
|
||||||
object InterestedInFromKnownForAdhoc extends TwitterExecutionApp {
|
|
||||||
def job: Execution[Unit] =
|
|
||||||
Execution.getConfigMode.flatMap {
|
|
||||||
case (config, mode) =>
|
|
||||||
Execution.withId { implicit uniqueId =>
|
|
||||||
val args = config.getArgs
|
|
||||||
val normalizedGraph = TypedPipe.from(
|
|
||||||
UserAndNeighborsFixedPathSource(args("graphInputDir"))
|
|
||||||
)
|
|
||||||
val socialProofThreshold = args.int("socialProofThreshold", 2)
|
|
||||||
val maxClustersPerUser = args.int("maxClustersPerUser", 20)
|
|
||||||
val knownForModelVersion = args("knownForModelVersion")
|
|
||||||
val knownFor = KnownForSources.readKnownFor(args("knownForInputDir"))
|
|
||||||
|
|
||||||
val outputSink = AdhocKeyValSources.interestedInSource(args("outputDir"))
|
|
||||||
Util.printCounters(
|
|
||||||
InterestedInFromKnownFor
|
|
||||||
.run(
|
|
||||||
normalizedGraph,
|
|
||||||
knownFor,
|
|
||||||
socialProofThreshold,
|
|
||||||
maxClustersPerUser,
|
|
||||||
knownForModelVersion
|
|
||||||
).writeExecution(outputSink)
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Adhoc job to check the output of an adhoc interestedInSource.
|
|
||||||
*/
|
|
||||||
object DumpInterestedInAdhoc extends TwitterExecutionApp {
|
|
||||||
def job: Execution[Unit] =
|
|
||||||
Execution.getConfigMode.flatMap {
|
|
||||||
case (config, mode) =>
|
|
||||||
Execution.withId { implicit uniqueId =>
|
|
||||||
val args = config.getArgs
|
|
||||||
val users = args.list("users").map(_.toLong).toSet
|
|
||||||
val input = TypedPipe.from(AdhocKeyValSources.interestedInSource(args("inputDir")))
|
|
||||||
input.filter { case (userId, rec) => users.contains(userId) }.toIterableExecution.map {
|
|
||||||
s => println(s.map(Util.prettyJsonMapper.writeValueAsString).mkString("\n"))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Helper functions
|
|
||||||
*/
|
|
||||||
object InterestedInFromKnownFor {
|
|
||||||
private def ifNanMake0(x: Double): Double = if (x.isNaN) 0.0 else x
|
|
||||||
|
|
||||||
case class SrcClusterIntermediateInfo(
|
|
||||||
followScore: Double,
|
|
||||||
followScoreProducerNormalized: Double,
|
|
||||||
favScore: Double,
|
|
||||||
favScoreProducerNormalized: Double,
|
|
||||||
logFavScore: Double,
|
|
||||||
logFavScoreProducerNormalized: Double,
|
|
||||||
followSocialProof: List[Long],
|
|
||||||
favSocialProof: List[Long]) {
|
|
||||||
// overriding for the sake of unit tests
|
|
||||||
override def equals(obj: scala.Any): Boolean = {
|
|
||||||
obj match {
|
|
||||||
case that: SrcClusterIntermediateInfo =>
|
|
||||||
math.abs(followScore - that.followScore) < 1e-5 &&
|
|
||||||
math.abs(followScoreProducerNormalized - that.followScoreProducerNormalized) < 1e-5 &&
|
|
||||||
math.abs(favScore - that.favScore) < 1e-5 &&
|
|
||||||
math.abs(favScoreProducerNormalized - that.favScoreProducerNormalized) < 1e-5 &&
|
|
||||||
math.abs(logFavScore - that.logFavScore) < 1e-5 &&
|
|
||||||
math.abs(logFavScoreProducerNormalized - that.logFavScoreProducerNormalized) < 1e-5 &&
|
|
||||||
followSocialProof.toSet == that.followSocialProof.toSet &&
|
|
||||||
favSocialProof.toSet == that.favSocialProof.toSet
|
|
||||||
case _ => false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
implicit object SrcClusterIntermediateInfoSemigroup
|
|
||||||
extends Semigroup[SrcClusterIntermediateInfo] {
|
|
||||||
override def plus(
|
|
||||||
left: SrcClusterIntermediateInfo,
|
|
||||||
right: SrcClusterIntermediateInfo
|
|
||||||
): SrcClusterIntermediateInfo = {
|
|
||||||
SrcClusterIntermediateInfo(
|
|
||||||
followScore = left.followScore + right.followScore,
|
|
||||||
followScoreProducerNormalized =
|
|
||||||
left.followScoreProducerNormalized + right.followScoreProducerNormalized,
|
|
||||||
favScore = left.favScore + right.favScore,
|
|
||||||
favScoreProducerNormalized =
|
|
||||||
left.favScoreProducerNormalized + right.favScoreProducerNormalized,
|
|
||||||
logFavScore = left.logFavScore + right.logFavScore,
|
|
||||||
logFavScoreProducerNormalized =
|
|
||||||
left.logFavScoreProducerNormalized + right.logFavScoreProducerNormalized,
|
|
||||||
followSocialProof =
|
|
||||||
Semigroup.plus(left.followSocialProof, right.followSocialProof).distinct,
|
|
||||||
favSocialProof = Semigroup.plus(left.favSocialProof, right.favSocialProof).distinct
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @param adjacencyLists User-User follow/fav graph
|
|
||||||
* @param knownFor KnownFor data set. Each user can be known for several clusters with certain
|
|
||||||
* knownFor weights.
|
|
||||||
* @param socialProofThreshold A user will only be interested in a cluster if they follow/fav at
|
|
||||||
* least certain number of users known for this cluster.
|
|
||||||
* @param uniqueId required for these Stat
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
def userClusterPairsWithoutNormalization(
|
|
||||||
adjacencyLists: TypedPipe[UserAndNeighbors],
|
|
||||||
knownFor: TypedPipe[(Long, Array[(Int, Float)])],
|
|
||||||
socialProofThreshold: Int
|
|
||||||
)(
|
|
||||||
implicit uniqueId: UniqueID
|
|
||||||
): TypedPipe[((Long, Int), SrcClusterIntermediateInfo)] = {
|
|
||||||
val edgesToUsersWithKnownFor = Stat("num_edges_to_users_with_known_for")
|
|
||||||
val srcDestClusterTriples = Stat("num_src_dest_cluster_triples")
|
|
||||||
val srcClusterPairsBeforeSocialProofThresholding =
|
|
||||||
Stat("num_src_cluster_pairs_before_social_proof_thresholding")
|
|
||||||
val srcClusterPairsAfterSocialProofThresholding =
|
|
||||||
Stat("num_src_cluster_pairs_after_social_proof_thresholding")
|
|
||||||
|
|
||||||
val edges = adjacencyLists.flatMap {
|
|
||||||
case UserAndNeighbors(srcId, neighborsWithWeights) =>
|
|
||||||
neighborsWithWeights.map { neighborWithWeights =>
|
|
||||||
(
|
|
||||||
neighborWithWeights.neighborId,
|
|
||||||
neighborWithWeights.copy(neighborId = srcId)
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
implicit val l2b: Long => Array[Byte] = Injection.long2BigEndian
|
|
||||||
|
|
||||||
edges
|
|
||||||
.sketch(4000)
|
|
||||||
.join(knownFor)
|
|
||||||
.flatMap {
|
|
||||||
case (destId, (srcWithWeights, clusterArray)) =>
|
|
||||||
edgesToUsersWithKnownFor.inc()
|
|
||||||
clusterArray.toList.map {
|
|
||||||
case (clusterId, knownForScoreF) =>
|
|
||||||
val knownForScore = math.max(0.0, knownForScoreF.toDouble)
|
|
||||||
|
|
||||||
srcDestClusterTriples.inc()
|
|
||||||
val followScore =
|
|
||||||
if (srcWithWeights.isFollowed.contains(true)) knownForScore else 0.0
|
|
||||||
val followScoreProducerNormalizedOnly =
|
|
||||||
srcWithWeights.followScoreNormalizedByNeighborFollowersL2.getOrElse(
|
|
||||||
0.0) * knownForScore
|
|
||||||
val favScore =
|
|
||||||
srcWithWeights.favScoreHalfLife100Days.getOrElse(0.0) * knownForScore
|
|
||||||
|
|
||||||
val favScoreProducerNormalizedOnly =
|
|
||||||
srcWithWeights.favScoreHalfLife100DaysNormalizedByNeighborFaversL2.getOrElse(
|
|
||||||
0.0) * knownForScore
|
|
||||||
|
|
||||||
val logFavScore = srcWithWeights.logFavScore.getOrElse(0.0) * knownForScore
|
|
||||||
|
|
||||||
val logFavScoreProducerNormalizedOnly = srcWithWeights.logFavScoreL2Normalized
|
|
||||||
.getOrElse(0.0) * knownForScore
|
|
||||||
|
|
||||||
val followSocialProof = if (srcWithWeights.isFollowed.contains(true)) {
|
|
||||||
List(destId)
|
|
||||||
} else Nil
|
|
||||||
val favSocialProof = if (srcWithWeights.favScoreHalfLife100Days.exists(_ > 0)) {
|
|
||||||
List(destId)
|
|
||||||
} else Nil
|
|
||||||
|
|
||||||
(
|
|
||||||
(srcWithWeights.neighborId, clusterId),
|
|
||||||
SrcClusterIntermediateInfo(
|
|
||||||
followScore,
|
|
||||||
followScoreProducerNormalizedOnly,
|
|
||||||
favScore,
|
|
||||||
favScoreProducerNormalizedOnly,
|
|
||||||
logFavScore,
|
|
||||||
logFavScoreProducerNormalizedOnly,
|
|
||||||
followSocialProof,
|
|
||||||
favSocialProof
|
|
||||||
)
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
.sumByKey
|
|
||||||
.withReducers(10000)
|
|
||||||
.filter {
|
|
||||||
case ((_, _), SrcClusterIntermediateInfo(_, _, _, _, _, _, followProof, favProof)) =>
|
|
||||||
srcClusterPairsBeforeSocialProofThresholding.inc()
|
|
||||||
val distinctSocialProof = (followProof ++ favProof).toSet
|
|
||||||
val result = distinctSocialProof.size >= socialProofThreshold
|
|
||||||
if (result) {
|
|
||||||
srcClusterPairsAfterSocialProofThresholding.inc()
|
|
||||||
}
|
|
||||||
result
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Add the cluster-level l2 norm scores, and use them to normalize follow/fav scores.
|
|
||||||
*/
|
|
||||||
def attachNormalizedScores(
|
|
||||||
intermediate: TypedPipe[((Long, Int), SrcClusterIntermediateInfo)]
|
|
||||||
)(
|
|
||||||
implicit uniqueId: UniqueID
|
|
||||||
): TypedPipe[(Long, List[(Int, UserToInterestedInClusterScores)])] = {
|
|
||||||
|
|
||||||
def square(x: Double): Double = x * x
|
|
||||||
|
|
||||||
val clusterCountsAndNorms =
|
|
||||||
intermediate
|
|
||||||
.map {
|
|
||||||
case (
|
|
||||||
(_, clusterId),
|
|
||||||
SrcClusterIntermediateInfo(
|
|
||||||
followScore,
|
|
||||||
followScoreProducerNormalizedOnly,
|
|
||||||
favScore,
|
|
||||||
favScoreProducerNormalizedOnly,
|
|
||||||
logFavScore,
|
|
||||||
logFavScoreProducerNormalizedOnly,
|
|
||||||
_,
|
|
||||||
_
|
|
||||||
)
|
|
||||||
) =>
|
|
||||||
(
|
|
||||||
clusterId,
|
|
||||||
(
|
|
||||||
1,
|
|
||||||
square(followScore),
|
|
||||||
square(followScoreProducerNormalizedOnly),
|
|
||||||
square(favScore),
|
|
||||||
square(favScoreProducerNormalizedOnly),
|
|
||||||
square(logFavScore),
|
|
||||||
square(logFavScoreProducerNormalizedOnly)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
}
|
|
||||||
.sumByKey
|
|
||||||
// .withReducers(100)
|
|
||||||
.map {
|
|
||||||
case (
|
|
||||||
clusterId,
|
|
||||||
(
|
|
||||||
cnt,
|
|
||||||
squareFollowScore,
|
|
||||||
squareFollowScoreProducerNormalizedOnly,
|
|
||||||
squareFavScore,
|
|
||||||
squareFavScoreProducerNormalizedOnly,
|
|
||||||
squareLogFavScore,
|
|
||||||
squareLogFavScoreProducerNormalizedOnly
|
|
||||||
)) =>
|
|
||||||
(
|
|
||||||
clusterId,
|
|
||||||
(
|
|
||||||
cnt,
|
|
||||||
math.sqrt(squareFollowScore),
|
|
||||||
math.sqrt(squareFollowScoreProducerNormalizedOnly),
|
|
||||||
math.sqrt(squareFavScore),
|
|
||||||
math.sqrt(squareFavScoreProducerNormalizedOnly),
|
|
||||||
math.sqrt(squareLogFavScore),
|
|
||||||
math.sqrt(squareLogFavScoreProducerNormalizedOnly)
|
|
||||||
))
|
|
||||||
}
|
|
||||||
|
|
||||||
implicit val i2b: Int => Array[Byte] = Injection.int2BigEndian
|
|
||||||
|
|
||||||
intermediate
|
|
||||||
.map {
|
|
||||||
case ((srcId, clusterId), clusterScoresTuple) =>
|
|
||||||
(clusterId, (srcId, clusterScoresTuple))
|
|
||||||
}
|
|
||||||
.sketch(reducers = 900)
|
|
||||||
.join(clusterCountsAndNorms)
|
|
||||||
.map {
|
|
||||||
case (
|
|
||||||
clusterId,
|
|
||||||
(
|
|
||||||
(
|
|
||||||
srcId,
|
|
||||||
SrcClusterIntermediateInfo(
|
|
||||||
followScore,
|
|
||||||
followScoreProducerNormalizedOnly,
|
|
||||||
favScore,
|
|
||||||
favScoreProducerNormalizedOnly,
|
|
||||||
logFavScore,
|
|
||||||
logFavScoreProducerNormalizedOnly, // not used for now
|
|
||||||
followProof,
|
|
||||||
favProof
|
|
||||||
)
|
|
||||||
),
|
|
||||||
(
|
|
||||||
cnt,
|
|
||||||
followNorm,
|
|
||||||
followProducerNormalizedNorm,
|
|
||||||
favNorm,
|
|
||||||
favProducerNormalizedNorm,
|
|
||||||
logFavNorm,
|
|
||||||
logFavProducerNormalizedNorm // not used for now
|
|
||||||
)
|
|
||||||
)
|
|
||||||
) =>
|
|
||||||
(
|
|
||||||
srcId,
|
|
||||||
List(
|
|
||||||
(
|
|
||||||
clusterId,
|
|
||||||
UserToInterestedInClusterScores(
|
|
||||||
followScore = Some(ifNanMake0(followScore)),
|
|
||||||
followScoreClusterNormalizedOnly = Some(ifNanMake0(followScore / followNorm)),
|
|
||||||
followScoreProducerNormalizedOnly =
|
|
||||||
Some(ifNanMake0(followScoreProducerNormalizedOnly)),
|
|
||||||
followScoreClusterAndProducerNormalized = Some(
|
|
||||||
ifNanMake0(followScoreProducerNormalizedOnly / followProducerNormalizedNorm)),
|
|
||||||
favScore = Some(ifNanMake0(favScore)),
|
|
||||||
favScoreClusterNormalizedOnly = Some(ifNanMake0(favScore / favNorm)),
|
|
||||||
favScoreProducerNormalizedOnly = Some(ifNanMake0(favScoreProducerNormalizedOnly)),
|
|
||||||
favScoreClusterAndProducerNormalized =
|
|
||||||
Some(ifNanMake0(favScoreProducerNormalizedOnly / favProducerNormalizedNorm)),
|
|
||||||
usersBeingFollowed = Some(followProof),
|
|
||||||
usersThatWereFaved = Some(favProof),
|
|
||||||
numUsersInterestedInThisClusterUpperBound = Some(cnt),
|
|
||||||
logFavScore = Some(ifNanMake0(logFavScore)),
|
|
||||||
logFavScoreClusterNormalizedOnly = Some(ifNanMake0(logFavScore / logFavNorm))
|
|
||||||
))
|
|
||||||
)
|
|
||||||
)
|
|
||||||
}
|
|
||||||
.sumByKey
|
|
||||||
// .withReducers(1000)
|
|
||||||
.toTypedPipe
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* aggregate cluster scores for each user, to be used instead of attachNormalizedScores
|
|
||||||
* when we donot want to compute cluster-level l2 norm scores
|
|
||||||
*/
|
|
||||||
def groupClusterScores(
|
|
||||||
intermediate: TypedPipe[((Long, Int), SrcClusterIntermediateInfo)]
|
|
||||||
)(
|
|
||||||
implicit uniqueId: UniqueID
|
|
||||||
): TypedPipe[(Long, List[(Int, UserToInterestedInClusterScores)])] = {
|
|
||||||
|
|
||||||
intermediate
|
|
||||||
.map {
|
|
||||||
case (
|
|
||||||
(srcId, clusterId),
|
|
||||||
SrcClusterIntermediateInfo(
|
|
||||||
followScore,
|
|
||||||
followScoreProducerNormalizedOnly,
|
|
||||||
favScore,
|
|
||||||
favScoreProducerNormalizedOnly,
|
|
||||||
logFavScore,
|
|
||||||
logFavScoreProducerNormalizedOnly,
|
|
||||||
followProof,
|
|
||||||
favProof
|
|
||||||
)
|
|
||||||
) =>
|
|
||||||
(
|
|
||||||
srcId,
|
|
||||||
List(
|
|
||||||
(
|
|
||||||
clusterId,
|
|
||||||
UserToInterestedInClusterScores(
|
|
||||||
followScore = Some(ifNanMake0(followScore)),
|
|
||||||
followScoreProducerNormalizedOnly =
|
|
||||||
Some(ifNanMake0(followScoreProducerNormalizedOnly)),
|
|
||||||
favScore = Some(ifNanMake0(favScore)),
|
|
||||||
favScoreProducerNormalizedOnly = Some(ifNanMake0(favScoreProducerNormalizedOnly)),
|
|
||||||
usersBeingFollowed = Some(followProof),
|
|
||||||
usersThatWereFaved = Some(favProof),
|
|
||||||
logFavScore = Some(ifNanMake0(logFavScore)),
|
|
||||||
))
|
|
||||||
)
|
|
||||||
)
|
|
||||||
}
|
|
||||||
.sumByKey
|
|
||||||
.withReducers(1000)
|
|
||||||
.toTypedPipe
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* For each user, only keep up to a certain number of clusters.
|
|
||||||
* @param allInterests user with a list of interestedIn clusters.
|
|
||||||
* @param maxClustersPerUser number of clusters to keep for each user
|
|
||||||
* @param knownForModelVersion known for model version
|
|
||||||
* @param uniqueId required for these Stat
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
def keepOnlyTopClusters(
|
|
||||||
allInterests: TypedPipe[(Long, List[(Int, UserToInterestedInClusterScores)])],
|
|
||||||
maxClustersPerUser: Int,
|
|
||||||
knownForModelVersion: String
|
|
||||||
)(
|
|
||||||
implicit uniqueId: UniqueID
|
|
||||||
): TypedPipe[(Long, ClustersUserIsInterestedIn)] = {
|
|
||||||
val userClusterPairsBeforeUserTruncation =
|
|
||||||
Stat("num_user_cluster_pairs_before_user_truncation")
|
|
||||||
val userClusterPairsAfterUserTruncation =
|
|
||||||
Stat("num_user_cluster_pairs_after_user_truncation")
|
|
||||||
val usersWithALotOfClusters =
|
|
||||||
Stat(s"num_users_with_more_than_${maxClustersPerUser}_clusters")
|
|
||||||
|
|
||||||
allInterests
|
|
||||||
.map {
|
|
||||||
case (srcId, fullClusterList) =>
|
|
||||||
userClusterPairsBeforeUserTruncation.incBy(fullClusterList.size)
|
|
||||||
val truncatedClusters = if (fullClusterList.size > maxClustersPerUser) {
|
|
||||||
usersWithALotOfClusters.inc()
|
|
||||||
fullClusterList
|
|
||||||
.sortBy {
|
|
||||||
case (_, clusterScores) =>
|
|
||||||
(
|
|
||||||
-clusterScores.favScore.getOrElse(0.0),
|
|
||||||
-clusterScores.logFavScore.getOrElse(0.0),
|
|
||||||
-clusterScores.followScore.getOrElse(0.0),
|
|
||||||
-clusterScores.logFavScoreClusterNormalizedOnly.getOrElse(0.0),
|
|
||||||
-clusterScores.followScoreProducerNormalizedOnly.getOrElse(0.0)
|
|
||||||
)
|
|
||||||
}
|
|
||||||
.take(maxClustersPerUser)
|
|
||||||
} else {
|
|
||||||
fullClusterList
|
|
||||||
}
|
|
||||||
userClusterPairsAfterUserTruncation.incBy(truncatedClusters.size)
|
|
||||||
(srcId, ClustersUserIsInterestedIn(knownForModelVersion, truncatedClusters.toMap))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
def run(
|
|
||||||
adjacencyLists: TypedPipe[UserAndNeighbors],
|
|
||||||
knownFor: TypedPipe[(UserId, Array[(ClusterId, Float)])],
|
|
||||||
socialProofThreshold: Int,
|
|
||||||
maxClustersPerUser: Int,
|
|
||||||
knownForModelVersion: String
|
|
||||||
)(
|
|
||||||
implicit uniqueId: UniqueID
|
|
||||||
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
|
|
||||||
keepOnlyTopClusters(
|
|
||||||
attachNormalizedScores(
|
|
||||||
userClusterPairsWithoutNormalization(
|
|
||||||
adjacencyLists,
|
|
||||||
knownFor,
|
|
||||||
socialProofThreshold
|
|
||||||
)
|
|
||||||
),
|
|
||||||
maxClustersPerUser,
|
|
||||||
knownForModelVersion
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* run the interestedIn job, cluster normalized scores are not attached to user's clusters.
|
|
||||||
*/
|
|
||||||
def runWithoutClusterNormalizedScores(
|
|
||||||
adjacencyLists: TypedPipe[UserAndNeighbors],
|
|
||||||
knownFor: TypedPipe[(UserId, Array[(ClusterId, Float)])],
|
|
||||||
socialProofThreshold: Int,
|
|
||||||
maxClustersPerUser: Int,
|
|
||||||
knownForModelVersion: String
|
|
||||||
)(
|
|
||||||
implicit uniqueId: UniqueID
|
|
||||||
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
|
|
||||||
keepOnlyTopClusters(
|
|
||||||
groupClusterScores(
|
|
||||||
userClusterPairsWithoutNormalization(
|
|
||||||
adjacencyLists,
|
|
||||||
knownFor,
|
|
||||||
socialProofThreshold
|
|
||||||
)
|
|
||||||
),
|
|
||||||
maxClustersPerUser,
|
|
||||||
knownForModelVersion
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* print out some basic stats of the data set to make sure things are not broken
|
|
||||||
*/
|
|
||||||
def dataSetStats(
|
|
||||||
interestedInData: TypedPipe[(UserId, ClustersUserIsInterestedIn)],
|
|
||||||
dataSetName: String = ""
|
|
||||||
): Execution[Unit] = {
|
|
||||||
|
|
||||||
Execution
|
|
||||||
.zip(
|
|
||||||
Util.printSummaryOfNumericColumn(
|
|
||||||
interestedInData.map {
|
|
||||||
case (user, interestedIn) =>
|
|
||||||
interestedIn.clusterIdToScores.size
|
|
||||||
},
|
|
||||||
Some(s"$dataSetName UserInterestedIn Size")
|
|
||||||
),
|
|
||||||
Util.printSummaryOfNumericColumn(
|
|
||||||
interestedInData.flatMap {
|
|
||||||
case (user, interestedIn) =>
|
|
||||||
interestedIn.clusterIdToScores.map {
|
|
||||||
case (_, scores) =>
|
|
||||||
scores.favScore.getOrElse(0.0)
|
|
||||||
}
|
|
||||||
},
|
|
||||||
Some(s"$dataSetName UserInterestedIn favScore")
|
|
||||||
),
|
|
||||||
Util.printSummaryOfNumericColumn(
|
|
||||||
interestedInData.flatMap {
|
|
||||||
case (user, interestedIn) =>
|
|
||||||
interestedIn.clusterIdToScores.map {
|
|
||||||
case (_, scores) =>
|
|
||||||
scores.favScoreClusterNormalizedOnly.getOrElse(0.0)
|
|
||||||
}
|
|
||||||
},
|
|
||||||
Some(s"$dataSetName UserInterestedIn favScoreClusterNormalizedOnly")
|
|
||||||
),
|
|
||||||
Util.printSummaryOfNumericColumn(
|
|
||||||
interestedInData.flatMap {
|
|
||||||
case (user, interestedIn) =>
|
|
||||||
interestedIn.clusterIdToScores.map {
|
|
||||||
case (_, scores) =>
|
|
||||||
scores.logFavScoreClusterNormalizedOnly.getOrElse(0.0)
|
|
||||||
}
|
|
||||||
},
|
|
||||||
Some(s"$dataSetName UserInterestedIn logFavScoreClusterNormalizedOnly")
|
|
||||||
)
|
|
||||||
).unit
|
|
||||||
}
|
|
||||||
}
|
|
Binary file not shown.
@ -1,354 +0,0 @@
|
|||||||
package com.twitter.simclusters_v2.scalding
|
|
||||||
|
|
||||||
import com.twitter.algebird.Semigroup
|
|
||||||
import com.twitter.bijection.Injection
|
|
||||||
import com.twitter.dal.client.dataset.KeyValDALDataset
|
|
||||||
import com.twitter.scalding._
|
|
||||||
import com.twitter.scalding_internal.dalv2.DAL
|
|
||||||
import com.twitter.scalding_internal.dalv2.DALWrite.{D, WriteExtension}
|
|
||||||
import com.twitter.scalding_internal.job.TwitterExecutionApp
|
|
||||||
import com.twitter.scalding_internal.job.analytics_batch.{
|
|
||||||
AnalyticsBatchExecution,
|
|
||||||
AnalyticsBatchExecutionArgs,
|
|
||||||
BatchDescription,
|
|
||||||
BatchFirstTime,
|
|
||||||
BatchIncrement,
|
|
||||||
TwitterScheduledExecutionApp
|
|
||||||
}
|
|
||||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
|
|
||||||
import com.twitter.simclusters_v2.common.{ClusterId, ModelVersions, UserId}
|
|
||||||
import com.twitter.simclusters_v2.hdfs_sources.{
|
|
||||||
AdhocKeyValSources,
|
|
||||||
InternalDataPaths,
|
|
||||||
SimclustersV2KnownFor20M145K2020ScalaDataset,
|
|
||||||
SimclustersV2RawInterestedInLite20M145K2020ScalaDataset,
|
|
||||||
SimclustersV2RawInterestedIn20M145KUpdatedScalaDataset,
|
|
||||||
UserAndNeighborsFixedPathSource,
|
|
||||||
UserUserGraphScalaDataset
|
|
||||||
}
|
|
||||||
import com.twitter.simclusters_v2.scalding.common.Util
|
|
||||||
import com.twitter.simclusters_v2.thriftscala.{
|
|
||||||
ClustersUserIsInterestedIn,
|
|
||||||
ClustersUserIsKnownFor,
|
|
||||||
UserAndNeighbors,
|
|
||||||
UserToInterestedInClusterScores
|
|
||||||
}
|
|
||||||
import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp
|
|
||||||
import java.util.TimeZone
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This file implements the job for computing users' interestedIn vector from KnownFor data set.
|
|
||||||
*
|
|
||||||
* It reads the UserUserGraphScalaDataset to get user-user follow + fav graph, and then
|
|
||||||
* based on the known-for clusters of each followed/faved user, we calculate how much a user is
|
|
||||||
* interestedIn a cluster.
|
|
||||||
*
|
|
||||||
* The main differences of the InterestedInFromKnownForLite compared to InterestedInFromKnownFor are
|
|
||||||
* the following:
|
|
||||||
* - We read the UserUserGraph dataset that doesnot contain the producer normalized scores
|
|
||||||
* - We donot compute the cluster normalized scores for the clusters per user
|
|
||||||
* - For social proof thresholding, we donot keep track of the entire list of follow and
|
|
||||||
* fav social proofs but rather make use of numFollowSocial and numFavSocial (this introduces
|
|
||||||
* some noise if follow and fav social proof contain the same users)
|
|
||||||
* - Store 200 clusters per user compared to 50 in IIKF
|
|
||||||
* - Runs more frequently compared to weekly in IIKF
|
|
||||||
*/
|
|
||||||
/**
|
|
||||||
* Production job for computing interestedIn data set for the model version 20M145K2020.
|
|
||||||
*
|
|
||||||
* To deploy the job:
|
|
||||||
*
|
|
||||||
* capesospy-v2 update --build_locally --start_cron interested_in_lite_for_20M_145k_2020 \
|
|
||||||
src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml
|
|
||||||
*/
|
|
||||||
object InterestedInFromKnownForLite20M145K2020 extends InterestedInFromKnownForLite {
|
|
||||||
override val firstTime: String = "2021-04-24"
|
|
||||||
override val outputKVDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsInterestedIn]] =
|
|
||||||
SimclustersV2RawInterestedInLite20M145K2020ScalaDataset
|
|
||||||
override val outputPath: String = InternalDataPaths.RawInterestedInLite2020Path
|
|
||||||
override val knownForModelVersion: String = ModelVersions.Model20M145K2020
|
|
||||||
override val knownForDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]] =
|
|
||||||
SimclustersV2KnownFor20M145K2020ScalaDataset
|
|
||||||
}
|
|
||||||
trait InterestedInFromKnownForLite extends TwitterScheduledExecutionApp {
|
|
||||||
implicit val tz = DateOps.UTC
|
|
||||||
implicit val parser = DateParser.default
|
|
||||||
|
|
||||||
def firstTime: String
|
|
||||||
val batchIncrement: Duration = Days(2)
|
|
||||||
val lookBackDays: Duration = Days(30)
|
|
||||||
|
|
||||||
def outputKVDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsInterestedIn]]
|
|
||||||
def outputPath: String
|
|
||||||
def knownForModelVersion: String
|
|
||||||
def knownForDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]]
|
|
||||||
|
|
||||||
private lazy val execArgs = AnalyticsBatchExecutionArgs(
|
|
||||||
batchDesc = BatchDescription(this.getClass.getName.replace("$", "")),
|
|
||||||
firstTime = BatchFirstTime(RichDate(firstTime)),
|
|
||||||
lastTime = None,
|
|
||||||
batchIncrement = BatchIncrement(batchIncrement)
|
|
||||||
)
|
|
||||||
|
|
||||||
override def scheduledJob: Execution[Unit] = AnalyticsBatchExecution(execArgs) {
|
|
||||||
implicit dateRange =>
|
|
||||||
Execution.withId { implicit uniqueId =>
|
|
||||||
Execution.withArgs { args =>
|
|
||||||
val userUserGraph =
|
|
||||||
DAL.readMostRecentSnapshot(UserUserGraphScalaDataset).toTypedPipe
|
|
||||||
val knownFor = KnownForSources.fromKeyVal(
|
|
||||||
DAL.readMostRecentSnapshot(knownForDALDataset, dateRange.extend(Days(30))).toTypedPipe,
|
|
||||||
knownForModelVersion
|
|
||||||
)
|
|
||||||
|
|
||||||
val socialProofThreshold = args.int("socialProofThreshold", 2)
|
|
||||||
val maxClustersPerUser = args.int("maxClustersPerUser", 200)
|
|
||||||
|
|
||||||
val result = InterestedInFromKnownForLite
|
|
||||||
.run(
|
|
||||||
userUserGraph,
|
|
||||||
knownFor,
|
|
||||||
socialProofThreshold,
|
|
||||||
maxClustersPerUser,
|
|
||||||
knownForModelVersion
|
|
||||||
)
|
|
||||||
|
|
||||||
val writeKeyValResultExec = result
|
|
||||||
.map {
|
|
||||||
case (userId, clusters) => KeyVal(userId, clusters)
|
|
||||||
}.writeDALVersionedKeyValExecution(
|
|
||||||
outputKVDataset,
|
|
||||||
D.Suffix(outputPath)
|
|
||||||
)
|
|
||||||
Util.printCounters(writeKeyValResultExec)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Adhoc job to compute user interestedIn.
|
|
||||||
*
|
|
||||||
* scalding remote run \
|
|
||||||
* --target src/scala/com/twitter/simclusters_v2/scalding:interested_in_lite_20m_145k_2020-adhoc \
|
|
||||||
* --main-class com.twitter.simclusters_v2.scalding.InterestedInFromKnownForLite20M145K2020Adhoc \
|
|
||||||
* --user cassowary --cluster bluebird-qus1 \
|
|
||||||
* --keytab /var/lib/tss/keys/fluffy/keytabs/client/cassowary.keytab \
|
|
||||||
* --principal service_acoount@TWITTER.BIZ \
|
|
||||||
* -- \
|
|
||||||
* --outputDir /gcs/user/cassowary/adhoc/interested_in_from_knownfor_lite/ \
|
|
||||||
* --date 2020-08-25
|
|
||||||
*/
|
|
||||||
object InterestedInFromKnownForLite20M145K2020Adhoc extends AdhocExecutionApp {
|
|
||||||
override def runOnDateRange(
|
|
||||||
args: Args
|
|
||||||
)(
|
|
||||||
implicit dateRange: DateRange,
|
|
||||||
timeZone: TimeZone,
|
|
||||||
uniqueID: UniqueID
|
|
||||||
): Execution[Unit] = {
|
|
||||||
val userUserGraph = DAL.readMostRecentSnapshot(UserUserGraphScalaDataset).toTypedPipe
|
|
||||||
val socialProofThreshold = args.int("socialProofThreshold", 2)
|
|
||||||
val maxClustersPerUser = args.int("maxClustersPerUser", 200)
|
|
||||||
val knownForModelVersion = ModelVersions.Model20M145K2020
|
|
||||||
val knownFor = KnownForSources.fromKeyVal(
|
|
||||||
DAL
|
|
||||||
.readMostRecentSnapshotNoOlderThan(
|
|
||||||
SimclustersV2KnownFor20M145K2020ScalaDataset,
|
|
||||||
Days(30)).toTypedPipe,
|
|
||||||
knownForModelVersion
|
|
||||||
)
|
|
||||||
|
|
||||||
val outputSink = AdhocKeyValSources.interestedInSource(args("outputDir"))
|
|
||||||
Util.printCounters(
|
|
||||||
InterestedInFromKnownForLite
|
|
||||||
.run(
|
|
||||||
userUserGraph,
|
|
||||||
knownFor,
|
|
||||||
socialProofThreshold,
|
|
||||||
maxClustersPerUser,
|
|
||||||
knownForModelVersion
|
|
||||||
).writeExecution(outputSink)
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
object InterestedInFromKnownForLite {
|
|
||||||
private def ifNanMake0(x: Double): Double = if (x.isNaN) 0.0 else x
|
|
||||||
|
|
||||||
case class SrcClusterIntermediateInfo(
|
|
||||||
followScore: Double,
|
|
||||||
favScore: Double,
|
|
||||||
logFavScore: Double,
|
|
||||||
numFollowed: Int,
|
|
||||||
numFaved: Int) {
|
|
||||||
|
|
||||||
// helper function used for test cases
|
|
||||||
override def equals(obj: scala.Any): Boolean = {
|
|
||||||
obj match {
|
|
||||||
case that: SrcClusterIntermediateInfo =>
|
|
||||||
math.abs(followScore - that.followScore) < 1e-5 &&
|
|
||||||
math.abs(favScore - that.favScore) < 1e-5 &&
|
|
||||||
math.abs(logFavScore - that.logFavScore) < 1e-5 &&
|
|
||||||
numFollowed == that.numFollowed &&
|
|
||||||
numFaved == that.numFaved
|
|
||||||
case _ => false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
implicit object SrcClusterIntermediateInfoSemigroup
|
|
||||||
extends Semigroup[SrcClusterIntermediateInfo] {
|
|
||||||
override def plus(
|
|
||||||
left: SrcClusterIntermediateInfo,
|
|
||||||
right: SrcClusterIntermediateInfo
|
|
||||||
): SrcClusterIntermediateInfo = {
|
|
||||||
SrcClusterIntermediateInfo(
|
|
||||||
followScore = left.followScore + right.followScore,
|
|
||||||
favScore = left.favScore + right.favScore,
|
|
||||||
logFavScore = left.logFavScore + right.logFavScore,
|
|
||||||
numFollowed = left.numFollowed + right.numFollowed,
|
|
||||||
numFaved = left.numFaved + right.numFaved
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
def run(
|
|
||||||
adjacencyLists: TypedPipe[UserAndNeighbors],
|
|
||||||
knownFor: TypedPipe[(UserId, Array[(ClusterId, Float)])],
|
|
||||||
socialProofThreshold: Int,
|
|
||||||
maxClustersPerUser: Int,
|
|
||||||
knownForModelVersion: String
|
|
||||||
)(
|
|
||||||
implicit uniqueId: UniqueID
|
|
||||||
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
|
|
||||||
InterestedInFromKnownFor.keepOnlyTopClusters(
|
|
||||||
groupClusterScores(
|
|
||||||
userClusterPairs(
|
|
||||||
adjacencyLists,
|
|
||||||
knownFor,
|
|
||||||
socialProofThreshold
|
|
||||||
)
|
|
||||||
),
|
|
||||||
maxClustersPerUser,
|
|
||||||
knownForModelVersion
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
def userClusterPairs(
|
|
||||||
adjacencyLists: TypedPipe[UserAndNeighbors],
|
|
||||||
knownFor: TypedPipe[(Long, Array[(Int, Float)])],
|
|
||||||
socialProofThreshold: Int
|
|
||||||
)(
|
|
||||||
implicit uniqueId: UniqueID
|
|
||||||
): TypedPipe[((Long, Int), SrcClusterIntermediateInfo)] = {
|
|
||||||
val edgesToUsersWithKnownFor = Stat("num_edges_to_users_with_known_for")
|
|
||||||
val srcDestClusterTriples = Stat("num_src_dest_cluster_triples")
|
|
||||||
val srcClusterPairsBeforeSocialProofThresholding =
|
|
||||||
Stat("num_src_cluster_pairs_before_social_proof_thresholding")
|
|
||||||
val srcClusterPairsAfterSocialProofThresholding =
|
|
||||||
Stat("num_src_cluster_pairs_after_social_proof_thresholding")
|
|
||||||
|
|
||||||
val edges = adjacencyLists.flatMap {
|
|
||||||
case UserAndNeighbors(srcId, neighborsWithWeights) =>
|
|
||||||
neighborsWithWeights.map { neighborWithWeights =>
|
|
||||||
(
|
|
||||||
neighborWithWeights.neighborId,
|
|
||||||
neighborWithWeights.copy(neighborId = srcId)
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
implicit val l2b: Long => Array[Byte] = Injection.long2BigEndian
|
|
||||||
|
|
||||||
edges
|
|
||||||
.sketch(4000)
|
|
||||||
.join(knownFor)
|
|
||||||
.flatMap {
|
|
||||||
case (destId, (srcWithWeights, clusterArray)) =>
|
|
||||||
edgesToUsersWithKnownFor.inc()
|
|
||||||
clusterArray.toList.map {
|
|
||||||
case (clusterId, knownForScoreF) =>
|
|
||||||
val knownForScore = math.max(0.0, knownForScoreF.toDouble)
|
|
||||||
|
|
||||||
srcDestClusterTriples.inc()
|
|
||||||
val followScore =
|
|
||||||
if (srcWithWeights.isFollowed.contains(true)) knownForScore else 0.0
|
|
||||||
val favScore =
|
|
||||||
srcWithWeights.favScoreHalfLife100Days.getOrElse(0.0) * knownForScore
|
|
||||||
val logFavScore = srcWithWeights.logFavScore.getOrElse(0.0) * knownForScore
|
|
||||||
val numFollowed = if (srcWithWeights.isFollowed.contains(true)) {
|
|
||||||
1
|
|
||||||
} else 0
|
|
||||||
|
|
||||||
val numFaved = if (srcWithWeights.favScoreHalfLife100Days.exists(_ > 0)) {
|
|
||||||
1
|
|
||||||
} else 0
|
|
||||||
|
|
||||||
(
|
|
||||||
(srcWithWeights.neighborId, clusterId),
|
|
||||||
SrcClusterIntermediateInfo(
|
|
||||||
followScore,
|
|
||||||
favScore,
|
|
||||||
logFavScore,
|
|
||||||
numFollowed,
|
|
||||||
numFaved
|
|
||||||
)
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
.sumByKey
|
|
||||||
.withReducers(10000)
|
|
||||||
.filter {
|
|
||||||
case ((_, _), SrcClusterIntermediateInfo(_, _, _, numFollowed, numFaved)) =>
|
|
||||||
srcClusterPairsBeforeSocialProofThresholding.inc()
|
|
||||||
// we donot remove duplicates
|
|
||||||
val socialProofSize = numFollowed + numFaved
|
|
||||||
val result = socialProofSize >= socialProofThreshold
|
|
||||||
if (result) {
|
|
||||||
srcClusterPairsAfterSocialProofThresholding.inc()
|
|
||||||
}
|
|
||||||
result
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
def groupClusterScores(
|
|
||||||
intermediate: TypedPipe[((Long, Int), SrcClusterIntermediateInfo)]
|
|
||||||
)(
|
|
||||||
implicit uniqueId: UniqueID
|
|
||||||
): TypedPipe[(Long, List[(Int, UserToInterestedInClusterScores)])] = {
|
|
||||||
|
|
||||||
implicit val i2b: Int => Array[Byte] = Injection.int2BigEndian
|
|
||||||
|
|
||||||
intermediate
|
|
||||||
.map {
|
|
||||||
case (
|
|
||||||
(srcId, clusterId),
|
|
||||||
SrcClusterIntermediateInfo(
|
|
||||||
followScore,
|
|
||||||
favScore,
|
|
||||||
logFavScore,
|
|
||||||
numFollowed,
|
|
||||||
numFaved
|
|
||||||
)) =>
|
|
||||||
(
|
|
||||||
srcId,
|
|
||||||
List(
|
|
||||||
(
|
|
||||||
clusterId,
|
|
||||||
UserToInterestedInClusterScores(
|
|
||||||
followScore = Some(ifNanMake0(followScore)),
|
|
||||||
favScore = Some(ifNanMake0(favScore)),
|
|
||||||
logFavScore = Some(ifNanMake0(logFavScore)),
|
|
||||||
numUsersBeingFollowed = Some(numFollowed),
|
|
||||||
numUsersThatWereFaved = Some(numFaved)
|
|
||||||
))
|
|
||||||
)
|
|
||||||
)
|
|
||||||
}
|
|
||||||
.sumByKey
|
|
||||||
// .withReducers(1000)
|
|
||||||
.toTypedPipe
|
|
||||||
}
|
|
||||||
}
|
|
Binary file not shown.
@ -1,290 +0,0 @@
|
|||||||
package com.twitter.simclusters_v2.scalding
|
|
||||||
|
|
||||||
import com.twitter.dal.client.dataset.KeyValDALDataset
|
|
||||||
import com.twitter.scalding.Execution
|
|
||||||
import com.twitter.scalding.TypedTsv
|
|
||||||
import com.twitter.scalding._
|
|
||||||
import com.twitter.scalding_internal.dalv2.DAL
|
|
||||||
import com.twitter.scalding_internal.dalv2.DALWrite._
|
|
||||||
import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation
|
|
||||||
import com.twitter.scalding_internal.dalv2.remote_access.ProcAtla
|
|
||||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
|
|
||||||
import com.twitter.simclusters_v2.common.ModelVersions
|
|
||||||
import com.twitter.simclusters_v2.common.UserId
|
|
||||||
import com.twitter.simclusters_v2.hdfs_sources.ProducerEmbeddingSources
|
|
||||||
import com.twitter.simclusters_v2.hdfs_sources.AdhocKeyValSources
|
|
||||||
import com.twitter.simclusters_v2.hdfs_sources.DataSources
|
|
||||||
import com.twitter.simclusters_v2.hdfs_sources.SimclustersV2InterestedInFromProducerEmbeddings20M145KUpdatedScalaDataset
|
|
||||||
import com.twitter.simclusters_v2.hdfs_sources.UserAndNeighborsFixedPathSource
|
|
||||||
import com.twitter.simclusters_v2.hdfs_sources.UserUserNormalizedGraphScalaDataset
|
|
||||||
import com.twitter.simclusters_v2.scalding.common.Util
|
|
||||||
import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn
|
|
||||||
import com.twitter.simclusters_v2.thriftscala.EmbeddingType
|
|
||||||
import com.twitter.simclusters_v2.thriftscala.SimClusterWithScore
|
|
||||||
import com.twitter.simclusters_v2.thriftscala.TopSimClustersWithScore
|
|
||||||
import com.twitter.simclusters_v2.thriftscala.UserToInterestedInClusterScores
|
|
||||||
import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp
|
|
||||||
import com.twitter.wtf.scalding.jobs.common.ScheduledExecutionApp
|
|
||||||
import java.util.TimeZone
|
|
||||||
import scala.util.Random
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This file implements the job for computing users' interestedIn vector from the producerEmbeddings data set.
|
|
||||||
*
|
|
||||||
* It reads the UserUserNormalizedGraphScalaDataset to get user-user follow + fav graph, and then
|
|
||||||
* based on the producerEmbedding clusters of each followed/faved user, we calculate how much a user is
|
|
||||||
* interestedIn a cluster. To compute the engagement and determine the clusters for the user, we reuse
|
|
||||||
* the functions defined in InterestedInKnownFor.
|
|
||||||
*
|
|
||||||
* Using producerEmbeddings instead of knownFor to obtain interestedIn increases the coverage (especially
|
|
||||||
* for medium and light users) and also the density of the cluster embeddings for the user.
|
|
||||||
*/
|
|
||||||
/**
|
|
||||||
* Adhoc job to generate the interestedIn from producer embeddings for the model version 20M145KUpdated
|
|
||||||
*
|
|
||||||
scalding remote run \
|
|
||||||
--target src/scala/com/twitter/simclusters_v2/scalding:interested_in_from_producer_embeddings \
|
|
||||||
--main-class com.twitter.simclusters_v2.scalding.InterestedInFromProducerEmbeddingsAdhocApp \
|
|
||||||
--user cassowary --cluster bluebird-qus1 \
|
|
||||||
--keytab /var/lib/tss/keys/fluffy/keytabs/client/cassowary.keytab \
|
|
||||||
--principal service_acoount@TWITTER.BIZ \
|
|
||||||
-- \
|
|
||||||
--outputDir /gcs/user/cassowary/adhoc/interested_in_from_prod_embeddings/ \
|
|
||||||
--date 2020-08-25 --typedTsv true
|
|
||||||
*/
|
|
||||||
object InterestedInFromProducerEmbeddingsAdhocApp extends AdhocExecutionApp {
  override def runOnDateRange(
    args: Args
  )(
    implicit dateRange: DateRange,
    timeZone: TimeZone,
    uniqueID: UniqueID
  ): Execution[Unit] = {

    val outputDir = args("outputDir")
    val inputGraph = args.optional("graphInputDir") match {
      case Some(inputDir) => TypedPipe.from(UserAndNeighborsFixedPathSource(inputDir))
      case None =>
        DAL
          .readMostRecentSnapshotNoOlderThan(UserUserNormalizedGraphScalaDataset, Days(30))
          .toTypedPipe
    }
    val socialProofThreshold = args.int("socialProofThreshold", 2)
    val maxClustersPerUserFinalResult = args.int("maxInterestedInClustersPerUser", 50)
    val maxClustersFromProducer = args.int("maxClustersPerProducer", 25)
    val typedTsvTag = args.boolean("typedTsv")

    val embeddingType =
      EmbeddingType.ProducerFavBasedSemanticCoreEntity
    val modelVersion = ModelVersions.Model20M145KUpdated
    val producerEmbeddings = ProducerEmbeddingSources
      .producerEmbeddingSourceLegacy(embeddingType, ModelVersions.toModelVersion(modelVersion))(
        dateRange.embiggen(Days(7)))

    import InterestedInFromProducerEmbeddingsBatchApp._

    val numProducerMappings = Stat("num_producer_embeddings_total")
    val numProducersWithLargeClusterMappings = Stat(
      "num_producers_with_more_clusters_than_threshold")
    val numProducersWithSmallClusterMappings = Stat(
      "num_producers_with_clusters_less_than_threshold")
    val totalClustersCoverageProducerEmbeddings = Stat("num_clusters_total_producer_embeddings")

    // Flatten each producer's top SimClusters into (clusterId, score) pairs.
    val producerEmbeddingsWithScore = producerEmbeddings.map {
      case (userId: Long, topSimClusters: TopSimClustersWithScore) =>
        (
          userId,
          topSimClusters.topClusters.toArray
            .map {
              case (simCluster: SimClusterWithScore) =>
                (simCluster.clusterId, simCluster.score.toFloat)
            }
        )
    }
    // Keep only the top maxClustersFromProducer clusters per producer (by score) to bound the fan-out.
    val producerEmbeddingsPruned = producerEmbeddingsWithScore.map {
      case (producerId, clusterArray) =>
        numProducerMappings.inc()
        val clusterSize = clusterArray.size
        totalClustersCoverageProducerEmbeddings.incBy(clusterSize)
        val prunedList = if (clusterSize > maxClustersFromProducer) {
          numProducersWithLargeClusterMappings.inc()
          clusterArray
            .sortBy {
              case (_, knownForScore) => -knownForScore
            }.take(maxClustersFromProducer)
        } else {
          numProducersWithSmallClusterMappings.inc()
          clusterArray
        }
        (producerId, prunedList)
    }

    val result = InterestedInFromKnownFor
      .run(
        inputGraph,
        producerEmbeddingsPruned,
        socialProofThreshold,
        maxClustersPerUserFinalResult,
        modelVersion
      )

    val resultWithoutSocial = getInterestedInDiscardSocial(result)

    if (typedTsvTag) {
      Util.printCounters(
        resultWithoutSocial
          .map {
            case (userId: Long, clusters: ClustersUserIsInterestedIn) =>
              (
                userId,
                clusters.clusterIdToScores.keys.toString()
              )
          }
          .writeExecution(
            TypedTsv(outputDir)
          )
      )
    } else {
      Util.printCounters(
        resultWithoutSocial
          .writeExecution(
            AdhocKeyValSources.interestedInSource(outputDir)
          )
      )
    }
  }
}

/**
 * Production job for computing the interestedIn data set from the producer embeddings for the
 * model version 20M145KUpdated.
 * It writes the data set in KeyVal format to produce a MH DAL data set.
 *
 * To deploy the job:
 *
 * capesospy-v2 update --build_locally --start_cron
 *   interested_in_from_producer_embeddings
 *   src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml
 */
object InterestedInFromProducerEmbeddingsBatchApp extends ScheduledExecutionApp {
  override val firstTime: RichDate = RichDate("2019-11-01")

  override val batchIncrement: Duration = Days(7)

  def getPrunedEmbeddings(
    producerEmbeddings: TypedPipe[(Long, TopSimClustersWithScore)],
    maxClustersFromProducer: Int
  ): TypedPipe[(Long, TopSimClustersWithScore)] = {
    producerEmbeddings.map {
      case (producerId, producerClusters) =>
        val prunedProducerClusters =
          producerClusters.topClusters
            .sortBy {
              case simCluster => -simCluster.score.toFloat
            }.take(maxClustersFromProducer)
        (producerId, TopSimClustersWithScore(prunedProducerClusters, producerClusters.modelVersion))
    }
  }

  def getInterestedInDiscardSocial(
    interestedInFromProducersResult: TypedPipe[(UserId, ClustersUserIsInterestedIn)]
  ): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
    interestedInFromProducersResult.map {
      case (srcId, fullClusterList) =>
        val fullClusterListWithoutSocial = fullClusterList.clusterIdToScores.map {
          case (clusterId, clusterDetails) =>
            val clusterDetailsWithoutSocial = UserToInterestedInClusterScores(
              followScore = clusterDetails.followScore,
              followScoreClusterNormalizedOnly = clusterDetails.followScoreClusterNormalizedOnly,
              followScoreProducerNormalizedOnly = clusterDetails.followScoreProducerNormalizedOnly,
              followScoreClusterAndProducerNormalized =
                clusterDetails.followScoreClusterAndProducerNormalized,
              favScore = clusterDetails.favScore,
              favScoreClusterNormalizedOnly = clusterDetails.favScoreClusterNormalizedOnly,
              favScoreProducerNormalizedOnly = clusterDetails.favScoreProducerNormalizedOnly,
              favScoreClusterAndProducerNormalized =
                clusterDetails.favScoreClusterAndProducerNormalized,
              // Social proof is not currently used anywhere else, so the user lists are discarded
              // to reduce the size of this dataset (see DiscardSocialProofSketch at the end of
              // this file for a simplified illustration).
              usersBeingFollowed = None,
              usersThatWereFaved = None,
              numUsersInterestedInThisClusterUpperBound =
                clusterDetails.numUsersInterestedInThisClusterUpperBound,
              logFavScore = clusterDetails.logFavScore,
              logFavScoreClusterNormalizedOnly = clusterDetails.logFavScoreClusterNormalizedOnly,
              // Only the counts of the social proof are retained
              numUsersBeingFollowed = Some(clusterDetails.usersBeingFollowed.getOrElse(Nil).size),
              numUsersThatWereFaved = Some(clusterDetails.usersThatWereFaved.getOrElse(Nil).size)
            )
            (clusterId, clusterDetailsWithoutSocial)
        }
        (
          srcId,
          ClustersUserIsInterestedIn(
            fullClusterList.knownForModelVersion,
            fullClusterListWithoutSocial))
    }
  }

  override def runOnDateRange(
    args: Args
  )(
    implicit dateRange: DateRange,
    timeZone: TimeZone,
    uniqueID: UniqueID
  ): Execution[Unit] = {

    // Input args for the run
    val socialProofThreshold = args.int("socialProofThreshold", 2)
    val maxClustersFromProducer = args.int("maxClustersPerProducer", 25)
    val maxClustersPerUserFinalResult = args.int("maxInterestedInClustersPerUser", 50)

    // Path variables
    val modelVersionUpdated = ModelVersions.toModelVersion(ModelVersions.Model20M145KUpdated)
    val rootPath: String = s"/user/cassowary/manhattan_sequence_files"
    val interestedInFromProducersPath =
      rootPath + "/interested_in_from_producer_embeddings/" + modelVersionUpdated

    // Input adjacency list and producer embeddings
    val userUserNormalGraph =
      DataSources.userUserNormalizedGraphSource(dateRange.prepend(Days(7))).forceToDisk
    val outputKVDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsInterestedIn]] =
      SimclustersV2InterestedInFromProducerEmbeddings20M145KUpdatedScalaDataset
    val producerEmbeddings = ProducerEmbeddingSources
      .producerEmbeddingSourceLegacy(
        EmbeddingType.ProducerFavBasedSemanticCoreEntity,
        modelVersionUpdated)(dateRange.embiggen(Days(7)))

    val producerEmbeddingsPruned = getPrunedEmbeddings(producerEmbeddings, maxClustersFromProducer)
    val producerEmbeddingsWithScore = producerEmbeddingsPruned.map {
      case (userId: Long, topSimClusters: TopSimClustersWithScore) =>
        (
          userId,
          topSimClusters.topClusters.toArray
            .map {
              case (simCluster: SimClusterWithScore) =>
                (simCluster.clusterId, simCluster.score.toFloat)
            }
        )
    }

    val interestedInFromProducersResult =
      InterestedInFromKnownFor.run(
        userUserNormalGraph,
        producerEmbeddingsWithScore,
        socialProofThreshold,
        maxClustersPerUserFinalResult,
        modelVersionUpdated.toString
      )

    val interestedInFromProducersWithoutSocial =
      getInterestedInDiscardSocial(interestedInFromProducersResult)

    val writeKeyValResultExec = interestedInFromProducersWithoutSocial
      .map { case (userId, clusters) => KeyVal(userId, clusters) }
      .writeDALVersionedKeyValExecution(
        outputKVDataset,
        D.Suffix(interestedInFromProducersPath)
      )
    writeKeyValResultExec
  }

}
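
// Illustrative sketch only; this object is not part of the production job. SimplifiedClusterScores
// is a hypothetical stand-in for the thrift UserToInterestedInClusterScores: it shows how
// getInterestedInDiscardSocial keeps every score but replaces the social-proof user lists with
// just their sizes, which is what shrinks the stored dataset.
object DiscardSocialProofSketch {
  final case class SimplifiedClusterScores(
    favScore: Option[Double],
    usersThatWereFaved: Option[Seq[Long]], // social-proof list: dropped in the output
    numUsersThatWereFaved: Option[Int] // only its size is retained
  )

  def discardSocialProof(scores: SimplifiedClusterScores): SimplifiedClusterScores =
    scores.copy(
      usersThatWereFaved = None,
      numUsersThatWereFaved = Some(scores.usersThatWereFaved.getOrElse(Nil).size)
    )
}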