[docx] split commit for file 5000

Signed-off-by: Ari Archer <ari.web.xyz@gmail.com>
Ari Archer 2024-01-23 19:17:38 +02:00
parent c4b4b821a3
commit 2f5f511bb8
No known key found for this signature in database
GPG Key ID: A50D5B4B599AF8A2
394 changed files with 0 additions and 37240 deletions

View File

@ -1,32 +0,0 @@
package com.twitter.simclusters_v2.common
import com.twitter.simclusters_v2.common.SimClustersMultiEmbeddingId._
import com.twitter.simclusters_v2.thriftscala.SimClustersMultiEmbedding.{Ids, Values}
import com.twitter.simclusters_v2.thriftscala.{
SimClustersMultiEmbedding,
SimClustersEmbeddingId,
SimClustersMultiEmbeddingId
}
/**
* Helper methods for SimClustersMultiEmbedding
*/
object SimClustersMultiEmbedding {
// Convert a multiEmbedding to a list of (embeddingId, score)
def toSimClustersEmbeddingIdWithScores(
simClustersMultiEmbeddingId: SimClustersMultiEmbeddingId,
simClustersMultiEmbedding: SimClustersMultiEmbedding
): Seq[(SimClustersEmbeddingId, Double)] = {
simClustersMultiEmbedding match {
case Values(values) =>
values.embeddings.zipWithIndex.map {
case (embeddingWithScore, i) =>
(toEmbeddingId(simClustersMultiEmbeddingId, i), embeddingWithScore.score)
}
case Ids(ids) =>
ids.ids.map(_.toTuple)
}
}
}

View File

@ -1,96 +0,0 @@
package com.twitter.simclusters_v2.common
import com.twitter.simclusters_v2.thriftscala.{
EmbeddingType,
InternalId,
MultiEmbeddingType,
TopicId,
TopicSubId,
SimClustersEmbeddingId => ThriftEmbeddingId,
SimClustersMultiEmbeddingId => ThriftMultiEmbeddingId
}
/**
* Helper methods for SimClustersMultiEmbeddingId
*/
object SimClustersMultiEmbeddingId {
private val MultiEmbeddingTypeToEmbeddingType: Map[MultiEmbeddingType, EmbeddingType] =
Map(
MultiEmbeddingType.LogFavApeBasedMuseTopic -> EmbeddingType.LogFavApeBasedMuseTopic,
MultiEmbeddingType.TwiceUserInterestedIn -> EmbeddingType.TwiceUserInterestedIn,
)
private val EmbeddingTypeToMultiEmbeddingType: Map[EmbeddingType, MultiEmbeddingType] =
MultiEmbeddingTypeToEmbeddingType.map(_.swap)
def toEmbeddingType(multiEmbeddingType: MultiEmbeddingType): EmbeddingType = {
MultiEmbeddingTypeToEmbeddingType.getOrElse(
multiEmbeddingType,
throw new IllegalArgumentException(s"Invalid type: $multiEmbeddingType"))
}
def toMultiEmbeddingType(embeddingType: EmbeddingType): MultiEmbeddingType = {
EmbeddingTypeToMultiEmbeddingType.getOrElse(
embeddingType,
throw new IllegalArgumentException(s"Invalid type: $embeddingType")
)
}
/**
* Convert a SimClusters Multi-Embedding Id and SubId to SimClusters Embedding Id.
*/
def toEmbeddingId(
simClustersMultiEmbeddingId: ThriftMultiEmbeddingId,
subId: Int
): ThriftEmbeddingId = {
val internalId = simClustersMultiEmbeddingId.internalId match {
case InternalId.TopicId(topicId) =>
InternalId.TopicSubId(
TopicSubId(topicId.entityId, topicId.language, topicId.country, subId))
case _ =>
throw new IllegalArgumentException(
s"Invalid simClusters InternalId ${simClustersMultiEmbeddingId.internalId}")
}
ThriftEmbeddingId(
toEmbeddingType(simClustersMultiEmbeddingId.embeddingType),
simClustersMultiEmbeddingId.modelVersion,
internalId
)
}
/**
* Fetch a subId from a SimClusters EmbeddingId.
*/
def toSubId(simClustersEmbeddingId: ThriftEmbeddingId): Int = {
simClustersEmbeddingId.internalId match {
case InternalId.TopicSubId(topicSubId) =>
topicSubId.subId
case _ =>
throw new IllegalArgumentException(
s"Invalid SimClustersEmbeddingId InternalId type, $simClustersEmbeddingId")
}
}
/**
* Convert a SimClustersEmbeddingId to SimClustersMultiEmbeddingId.
* Only supports the multi-embedding-based EmbeddingTypes.
*/
def toMultiEmbeddingId(
simClustersEmbeddingId: ThriftEmbeddingId
): ThriftMultiEmbeddingId = {
simClustersEmbeddingId.internalId match {
case InternalId.TopicSubId(topicSubId) =>
ThriftMultiEmbeddingId(
toMultiEmbeddingType(simClustersEmbeddingId.embeddingType),
simClustersEmbeddingId.modelVersion,
InternalId.TopicId(TopicId(topicSubId.entityId, topicSubId.language, topicSubId.country))
)
case _ =>
throw new IllegalArgumentException(
s"Invalid SimClustersEmbeddingId InternalId type, $simClustersEmbeddingId")
}
}
}
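
A minimal round-trip sketch (not part of the original diff; the topic id values and the optional language/country field shapes are assumptions) showing how the helpers above move between a topic multi-embedding id and its per-sub-embedding ids:

import com.twitter.simclusters_v2.common.SimClustersMultiEmbeddingId._
import com.twitter.simclusters_v2.thriftscala._

// A topic multi-embedding id with hypothetical values.
val multiId = SimClustersMultiEmbeddingId(
  MultiEmbeddingType.LogFavApeBasedMuseTopic,
  ModelVersion.Model20m145k2020,
  InternalId.TopicId(TopicId(entityId = 123L, language = Some("en"), country = None))
)
// Sub-embedding #2 gets its own SimClustersEmbeddingId with a TopicSubId internal id.
val embeddingId = toEmbeddingId(multiId, subId = 2)
assert(toSubId(embeddingId) == 2)
// Converting back recovers the original multi-embedding id.
assert(toMultiEmbeddingId(embeddingId) == multiId)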

View File

@ -1,11 +0,0 @@
scala_library(
compiler_option_sets = ["fatal_warnings"],
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
"eventdetection/common/src/main/java/com/twitter/eventdetection/common/louvain",
"eventdetection/common/src/main/java/com/twitter/eventdetection/common/model",
"src/java/com/twitter/sbf/graph",
"src/scala/com/twitter/simclusters_v2/scalding/common",
],
)

View File

@ -1,30 +0,0 @@
package com.twitter.simclusters_v2.common.clustering
import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.thriftscala.NeighborWithWeights
/**
* Select a cluster member as cluster representative.
*/
trait ClusterRepresentativeSelectionMethod[T] {
/**
* The main external-facing method. Sub-classes should implement this method.
*
* @param cluster A set of NeighborWithWeights.
* @param embeddings A map of producer ID -> embedding.
*
* @return UserId of the member chosen as representative.
*/
def selectClusterRepresentative(
cluster: Set[NeighborWithWeights],
embeddings: Map[UserId, T]
): UserId
}
object ClusterRepresentativeSelectionStatistics {
// Statistics, to be imported where recorded.
val StatClusterRepresentativeSelectionTime = "cluster_representative_selection_total_time_ms"
}
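
An illustrative-only implementation of this trait (not in the original commit): a selector that ignores the embeddings and simply returns the smallest member id, just to show the contract. The real selectors appear in later files of this commit.

import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.thriftscala.NeighborWithWeights

class SmallestIdRepresentativeSelectionMethod[T] extends ClusterRepresentativeSelectionMethod[T] {
  def selectClusterRepresentative(
    cluster: Set[NeighborWithWeights],
    embeddings: Map[UserId, T]
  ): UserId =
    cluster.map(_.neighborId).min // neighborId is the member's user id; assumes a non-empty cluster
}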

View File

@ -1,34 +0,0 @@
package com.twitter.simclusters_v2.common.clustering
/**
* Partitions a set of entities into clusters.
* NOTE: The selection/construction of the cluster representatives (e.g. medoid, random, average) is implemented in ClusterRepresentativeSelectionMethod.scala
*/
trait ClusteringMethod {
/**
* The main external-facing method. Sub-classes should implement this method.
*
* @param embeddings map of entity IDs and corresponding embeddings
* @param similarityFn function that outputs a similarity score (>= 0; the larger, the more similar), given two embeddings
* @tparam T embedding type. e.g. SimClustersEmbedding
*
* @return A set of sets of entity IDs, each set representing a distinct cluster.
*/
def cluster[T](
embeddings: Map[Long, T],
similarityFn: (T, T) => Double,
recordStatCallback: (String, Long) => Unit = (_, _) => ()
): Set[Set[Long]]
}
object ClusteringStatistics {
// Statistics, to be imported where recorded.
val StatSimilarityGraphTotalBuildTime = "similarity_graph_total_build_time_ms"
val StatClusteringAlgorithmRunTime = "clustering_algorithm_total_run_time_ms"
val StatMedoidSelectionTime = "medoid_selection_total_time_ms"
val StatComputedSimilarityBeforeFilter = "computed_similarity_before_filter"
}
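
An illustrative-only implementation of the contract (not part of the original commit): every entity lands in a single cluster and the similarity function is ignored. The production methods follow in the next files.

class SingleClusterClusteringMethod extends ClusteringMethod {
  def cluster[T](
    embeddings: Map[Long, T],
    similarityFn: (T, T) => Double,
    recordStatCallback: (String, Long) => Unit = (_, _) => ()
  ): Set[Set[Long]] =
    if (embeddings.isEmpty) Set.empty else Set(embeddings.keySet)
}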

View File

@ -1,67 +0,0 @@
package com.twitter.simclusters_v2.common.clustering
import com.twitter.sbf.graph.ConnectedComponents
import com.twitter.sbf.graph.Graph
import com.twitter.util.Stopwatch
import it.unimi.dsi.fastutil.ints.IntSet
import scala.collection.SortedMap
import scala.jdk.CollectionConverters._
/**
* Aggregate entities into clusters such that a cluster contains all embeddings with a similarity
* above a configurable threshold to any other embedding.
*
* @param similarityThreshold: When building the edges between entities, edges with weight
* less than or equal to this threshold will be filtered out.
*/
class ConnectedComponentsClusteringMethod(
similarityThreshold: Double)
extends ClusteringMethod {
import ClusteringStatistics._
def cluster[T](
embeddings: Map[Long, T],
similarityFn: (T, T) => Double,
recordStatCallback: (String, Long) => Unit = (_, _) => ()
): Set[Set[Long]] = {
val timeSinceGraphBuildStart = Stopwatch.start()
// com.twitter.sbf.graph.Graph expects neighbors to be sorted in ascending order.
val sourcesById = SortedMap(embeddings.zipWithIndex.map {
case (source, idx) => idx -> source
}.toSeq: _*)
val neighbours = sourcesById.map {
case (srcIdx, (_, src)) =>
sourcesById
.collect {
case (dstIdx, (_, dst)) if srcIdx != dstIdx => // avoid self-edges
val similarity = similarityFn(src, dst)
recordStatCallback(
StatComputedSimilarityBeforeFilter,
(similarity * 100).toLong // preserve up to two decimal points
)
if (similarity > similarityThreshold)
Some(dstIdx)
else None
}.flatten.toArray
}.toArray
recordStatCallback(StatSimilarityGraphTotalBuildTime, timeSinceGraphBuildStart().inMilliseconds)
val timeSinceClusteringAlgRunStart = Stopwatch.start()
val nEdges = neighbours.map(_.length).sum / 2 // Graph expects count of undirected edges
val graph = new Graph(sourcesById.size, nEdges, neighbours)
val clusters = ConnectedComponents
.connectedComponents(graph).asScala.toSet
.map { i: IntSet => i.asScala.map(sourcesById(_)._1).toSet }
recordStatCallback(
StatClusteringAlgorithmRunTime,
timeSinceClusteringAlgRunStart().inMilliseconds)
clusters
}
}
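
A small usage sketch (not from the original diff; the toy vectors, the cosine helper and the threshold are illustrative): the two nearby vectors end up in one connected component, the third stays alone.

val cosine: (Seq[Double], Seq[Double]) => Double = (a, b) => {
  val dot = a.zip(b).map { case (x, y) => x * y }.sum
  val norm = math.sqrt(a.map(x => x * x).sum) * math.sqrt(b.map(x => x * x).sum)
  if (norm == 0.0) 0.0 else dot / norm
}
val embeddings: Map[Long, Seq[Double]] = Map(
  1L -> Seq(1.0, 0.0),
  2L -> Seq(0.9, 0.1),
  3L -> Seq(0.0, 1.0)
)
val method: ClusteringMethod = new ConnectedComponentsClusteringMethod(similarityThreshold = 0.5)
val clusters = method.cluster(embeddings, cosine)
// clusters == Set(Set(1L, 2L), Set(3L)): cosine(1, 2) is about 0.99 (> 0.5),
// while both edges to entity 3 fall below the threshold.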

View File

@ -1,33 +0,0 @@
package com.twitter.simclusters_v2.common.clustering
/**
* Groups entities by a single embedding dimension with the largest score.
*/
class LargestDimensionClusteringMethod extends ClusteringMethod {
/**
* @param embeddings map of entity IDs and corresponding embeddings
* @param similarityFn function that outputs discrete value (0.0 or 1.0).
* 1.0 if the dimensions of the highest score (weight) from two given embeddings match.
* 0.0 otherwise.
* e.g.
* case 1: E1=[0.0, 0.1, 0.6, 0.2], E2=[0.1, 0.3, 0.8, 0.0]. similarityFn(E1, E2)=1.0
* case 2: E1=[0.0, 0.1, 0.6, 0.2], E2=[0.1, 0.4, 0.2, 0.0]. similarityFn(E1, E2)=0.0
* @tparam T embedding type. e.g. SimClustersEmbedding
*
* @return A set of sets of entity IDs, each set representing a distinct cluster.
*/
override def cluster[T](
embeddings: Map[Long, T],
similarityFn: (T, T) => Double,
recordStatCallback: (String, Long) => Unit
): Set[Set[Long]] = {
// rely on clustering by connected component.
// similarityThreshold=0.1 because it's larger than 0.0 (similarityFn returns 0.0 if two embeddings
// don't share the largest dimension).
new ConnectedComponentsClusteringMethod(similarityThreshold = 0.1)
.cluster(embeddings, similarityFn, recordStatCallback)
}
}
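
A pairing sketch (assumed usage, not spelled out in this commit): this method is intended for a 0/1 similarity such as SimilarityFunctions.simClustersMatchingLargestDimension from a later file in this commit, so members cluster together exactly when their top SimClusters dimension matches.

import com.twitter.simclusters_v2.common.SimClustersEmbedding

def clusterByTopDimension(
  producerEmbeddings: Map[Long, SimClustersEmbedding]
): Set[Set[Long]] = {
  // Call through the trait so the default (no-op) recordStatCallback declared there applies.
  val method: ClusteringMethod = new LargestDimensionClusteringMethod
  method.cluster(producerEmbeddings, SimilarityFunctions.simClustersMatchingLargestDimension)
}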

View File

@ -1,236 +0,0 @@
package com.twitter.simclusters_v2.common.clustering
import com.twitter.eventdetection.common.louvain.LouvainDriver
import com.twitter.eventdetection.common.louvain.NetworkFactory
import com.twitter.eventdetection.common.model.Entity
import com.twitter.eventdetection.common.model.NetworkInput
import com.twitter.eventdetection.common.model.TextEntityValue
import com.twitter.util.Stopwatch
import scala.collection.JavaConverters._
import scala.math.max
/**
* Groups entities by the Louvain clustering method.
* @param similarityThreshold: When building the edges between entities, edges with weight
* less than or equal to this threshold will be filtered out.
* @param appliedResolutionFactor: If present, will be used to multiply the applied resolution
* parameter of the Louvain method by this factor.
* Note that the DEFAULT_MAX_RESOLUTION will not be applied.
*/
class LouvainClusteringMethod(
similarityThreshold: Double,
appliedResolutionFactor: Option[Double])
extends ClusteringMethod {
import ClusteringStatistics._
def cluster[T](
embeddings: Map[Long, T],
similarityFn: (T, T) => Double,
recordStatCallback: (String, Long) => Unit = (_, _) => ()
): Set[Set[Long]] = {
// 1. Build the graph on which to run Louvain:
// - Weigh edges by the similarity between the 2 embeddings,
// - Filter out edges with weight <= threshold.
val timeSinceGraphBuildStart = Stopwatch.start()
val edges: Seq[((Long, Long), Double)] = embeddings.toSeq
.combinations(2)
.map { pair: Seq[(Long, T)] => // pair of 2
val (user1, embedding1) = pair.head
val (user2, embedding2) = pair(1)
val similarity = similarityFn(embedding1, embedding2)
recordStatCallback(
StatComputedSimilarityBeforeFilter,
(similarity * 100).toLong // preserve up to two decimal places
)
((user1, user2), similarity)
}
.filter(_._2 > similarityThreshold)
.toSeq
recordStatCallback(StatSimilarityGraphTotalBuildTime, timeSinceGraphBuildStart().inMilliseconds)
// Check whether some entities have no incoming / outgoing edge at all;
// each of these forms its own size-1 cluster.
val individualClusters: Set[Long] = embeddings.keySet -- edges.flatMap {
case ((user1, user2), _) => Set(user1, user2)
}.toSet
// 2. LouvainDriver uses "Entity" as input, so build 2 mappings
// - Long (entity id) -> Entity
// - Entity -> Long (entity id)
val embeddingIdToEntity: Map[Long, Entity] = embeddings.map {
case (id, _) => id -> Entity(TextEntityValue(id.toString, Some(id.toString)), None)
}
val entityToEmbeddingId: Map[Entity, Long] = embeddingIdToEntity.map {
case (id, e) => e -> id
}
// 3. Create the list of NetworkInput on which to run LouvainDriver
val networkInputList = edges
.map {
case ((fromUserId: Long, toUserId: Long), weight: Double) =>
new NetworkInput(embeddingIdToEntity(fromUserId), embeddingIdToEntity(toUserId), weight)
}.toList.asJava
val timeSinceClusteringAlgRunStart = Stopwatch.start()
val networkDictionary = NetworkFactory.buildDictionary(networkInputList)
val network = NetworkFactory.buildNetwork(networkInputList, networkDictionary)
if (networkInputList.size() == 0) {
// handle case if no edge at all (only one entity or all entities are too far apart)
embeddings.keySet.map(e => Set(e))
} else {
// 4. Run clustering algorithm
val clusteredIds = appliedResolutionFactor match {
case Some(res) =>
LouvainDriver.clusterAppliedResolutionFactor(network, networkDictionary, res)
case None => LouvainDriver.cluster(network, networkDictionary)
}
recordStatCallback(
StatClusteringAlgorithmRunTime,
timeSinceClusteringAlgRunStart().inMilliseconds)
// 5. Post-processing
val atLeast2MembersClusters: Set[Set[Long]] = clusteredIds.asScala
.groupBy(_._2)
.mapValues(_.map { case (e, _) => entityToEmbeddingId(e) }.toSet)
.values.toSet
atLeast2MembersClusters ++ individualClusters.map { e => Set(e) }
}
}
def clusterWithSilhouette[T](
embeddings: Map[Long, T],
similarityFn: (T, T) => Double,
similarityFnForSil: (T, T) => Double,
recordStatCallback: (String, Long) => Unit = (_, _) => ()
): (Set[Set[Long]], Set[Set[(Long, Double)]]) = {
// 1. Build the graph on which to run Louvain:
// - Weigh edges by the similarity between the 2 embeddings,
// - Filter out edges with weight <= threshold.
val timeSinceGraphBuildStart = Stopwatch.start()
val edgesSimilarityMap = collection.mutable.Map[(Long, Long), Double]()
val edges: Seq[((Long, Long), Double)] = embeddings.toSeq
.combinations(2)
.map { pair: Seq[(Long, T)] => // pair of 2
val (user1, embedding1) = pair.head
val (user2, embedding2) = pair(1)
val similarity = similarityFn(embedding1, embedding2)
val similarityForSil = similarityFnForSil(embedding1, embedding2)
edgesSimilarityMap.put((user1, user2), similarityForSil)
edgesSimilarityMap.put((user2, user1), similarityForSil)
recordStatCallback(
StatComputedSimilarityBeforeFilter,
(similarity * 100).toLong // preserve up to two decimal places
)
((user1, user2), similarity)
}
.filter(_._2 > similarityThreshold)
.toSeq
recordStatCallback(StatSimilarityGraphTotalBuildTime, timeSinceGraphBuildStart().inMilliseconds)
// Check whether some entities have no incoming / outgoing edge at all;
// each of these forms its own size-1 cluster.
val individualClusters: Set[Long] = embeddings.keySet -- edges.flatMap {
case ((user1, user2), _) => Set(user1, user2)
}.toSet
// 2. LouvainDriver uses "Entity" as input, so build 2 mappings
// - Long (entity id) -> Entity
// - Entity -> Long (entity id)
val embeddingIdToEntity: Map[Long, Entity] = embeddings.map {
case (id, _) => id -> Entity(TextEntityValue(id.toString, Some(id.toString)), None)
}
val entityToEmbeddingId: Map[Entity, Long] = embeddingIdToEntity.map {
case (id, e) => e -> id
}
// 3. Create the list of NetworkInput on which to run LouvainDriver
val networkInputList = edges
.map {
case ((fromUserId: Long, toUserId: Long), weight: Double) =>
new NetworkInput(embeddingIdToEntity(fromUserId), embeddingIdToEntity(toUserId), weight)
}.toList.asJava
val timeSinceClusteringAlgRunStart = Stopwatch.start()
val networkDictionary = NetworkFactory.buildDictionary(networkInputList)
val network = NetworkFactory.buildNetwork(networkInputList, networkDictionary)
val clusters = if (networkInputList.size() == 0) {
// handle case if no edge at all (only one entity or all entities are too far apart)
embeddings.keySet.map(e => Set(e))
} else {
// 4. Run clustering algorithm
val clusteredIds = appliedResolutionFactor match {
case Some(res) =>
LouvainDriver.clusterAppliedResolutionFactor(network, networkDictionary, res)
case None => LouvainDriver.cluster(network, networkDictionary)
}
recordStatCallback(
StatClusteringAlgorithmRunTime,
timeSinceClusteringAlgRunStart().inMilliseconds)
// 5. Post-processing
val atLeast2MembersClusters: Set[Set[Long]] = clusteredIds.asScala
.groupBy(_._2)
.mapValues(_.map { case (e, _) => entityToEmbeddingId(e) }.toSet)
.values.toSet
atLeast2MembersClusters ++ individualClusters.map { e => Set(e) }
}
// Calculate silhouette metrics
val contactIdWithSilhouette = clusters.map {
case cluster =>
val otherClusters = clusters - cluster
cluster.map {
case contactId =>
if (otherClusters.isEmpty) {
(contactId, 0.0)
} else {
val otherSameClusterContacts = cluster - contactId
if (otherSameClusterContacts.isEmpty) {
(contactId, 0.0)
} else {
// calculate similarity of given userId with all other users in the same cluster
val a_i = otherSameClusterContacts.map {
case sameClusterContact =>
edgesSimilarityMap((contactId, sameClusterContact))
}.sum / otherSameClusterContacts.size
// calculate similarity of given userId to all other clusters, find the best nearest cluster
val b_i = otherClusters.map {
case otherCluster =>
otherCluster.map {
case otherClusterContact =>
edgesSimilarityMap((contactId, otherClusterContact))
}.sum / otherCluster.size
}.max
// silhouette (value) of one userId i
val s_i = (a_i - b_i) / max(a_i, b_i)
(contactId, s_i)
}
}
}
}
(clusters, contactIdWithSilhouette)
}
}
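
A usage sketch (the threshold and the choice to reuse cosine similarity for the silhouette metric are assumptions, not from this commit):

import com.twitter.simclusters_v2.common.SimClustersEmbedding

def louvainWithSilhouette(
  embeddings: Map[Long, SimClustersEmbedding]
): (Set[Set[Long]], Set[Set[(Long, Double)]]) = {
  val method = new LouvainClusteringMethod(
    similarityThreshold = 0.3,
    appliedResolutionFactor = None
  )
  // Silhouette values land in [-1, 1]; close to 1 means a member sits much nearer its own
  // cluster than the best other cluster (s_i = (a_i - b_i) / max(a_i, b_i), as computed above).
  method.clusterWithSilhouette(
    embeddings,
    SimilarityFunctions.simClustersCosineSimilarity,
    SimilarityFunctions.simClustersCosineSimilarity
  )
}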

View File

@ -1,21 +0,0 @@
package com.twitter.simclusters_v2.common.clustering
import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.thriftscala.NeighborWithWeights
class MaxFavScoreRepresentativeSelectionMethod[T] extends ClusterRepresentativeSelectionMethod[T] {
/**
* Identify the member with largest favScoreHalfLife100Days and return it.
*
* @param cluster A set of NeighborWithWeights.
* @param embeddings A map of producer ID -> embedding.
*/
def selectClusterRepresentative(
cluster: Set[NeighborWithWeights],
embeddings: Map[UserId, T],
): UserId = {
val key = cluster.maxBy { x: NeighborWithWeights => x.favScoreHalfLife100Days.getOrElse(0.0) }
key.neighborId
}
}

View File

@ -1,28 +0,0 @@
package com.twitter.simclusters_v2.common.clustering
import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.thriftscala.NeighborWithWeights
class MedoidRepresentativeSelectionMethod[T](
producerProducerSimilarityFn: (T, T) => Double)
extends ClusterRepresentativeSelectionMethod[T] {
/**
* Identify the medoid of a cluster and return it.
*
* @param cluster A set of NeighborWithWeights.
* @param embeddings A map of producer ID -> embedding.
*/
def selectClusterRepresentative(
cluster: Set[NeighborWithWeights],
embeddings: Map[UserId, T],
): UserId = {
val key = cluster.maxBy {
id1 => // maxBy because we use similarity, which gets larger as we get closer.
val v = embeddings(id1.neighborId)
cluster
.map(id2 => producerProducerSimilarityFn(v, embeddings(id2.neighborId))).sum
}
key.neighborId
}
}
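
A pairing sketch (assumed, not part of the original diff): select each cluster's medoid under cosine similarity between producer SimClusters embeddings.

import com.twitter.simclusters_v2.common.SimClustersEmbedding
import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.thriftscala.NeighborWithWeights

def medoidOf(
  cluster: Set[NeighborWithWeights],
  producerEmbeddings: Map[UserId, SimClustersEmbedding]
): UserId =
  new MedoidRepresentativeSelectionMethod[SimClustersEmbedding](
    SimilarityFunctions.simClustersCosineSimilarity)
    .selectClusterRepresentative(cluster, producerEmbeddings)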

View File

@ -1,32 +0,0 @@
package com.twitter.simclusters_v2.common.clustering
import com.twitter.simclusters_v2.common.SimClustersEmbedding
/**
* SimilarityFunctions provides commonly used similarity functions that this clustering library needs.
*/
object SimilarityFunctions {
def simClustersCosineSimilarity: (SimClustersEmbedding, SimClustersEmbedding) => Double =
(e1, e2) => e1.cosineSimilarity(e2)
def simClustersMatchingLargestDimension: (
SimClustersEmbedding,
SimClustersEmbedding
) => Double = (e1, e2) => {
val doesMatchLargestDimension: Boolean = e1
.topClusterIds(1)
.exists { id1 =>
e2.topClusterIds(1).contains(id1)
}
if (doesMatchLargestDimension) 1.0
else 0.0
}
def simClustersFuzzyJaccardSimilarity: (
SimClustersEmbedding,
SimClustersEmbedding
) => Double = (e1, e2) => {
e1.fuzzyJaccardSimilarity(e2)
}
}

View File

@ -1,12 +0,0 @@
# This package/target is separate from other simclusters common packages because the ml/api dep is
# large (350MB+). Having it as a separate target means that we can avoid bundling it with targets
# that do not need it.
scala_library(
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
"src/java/com/twitter/ml/api:api-base",
"src/scala/com/twitter/ml/api/util",
"src/scala/com/twitter/simclusters_v2/common",
],
)

View File

@ -1,39 +0,0 @@
package com.twitter.simclusters_v2.common.ml
import com.twitter.ml.api.Feature.Continuous
import com.twitter.ml.api.Feature.SparseContinuous
import com.twitter.ml.api._
import com.twitter.ml.api.util.FDsl._
import com.twitter.simclusters_v2.common.SimClustersEmbedding
class SimClustersEmbeddingAdapter(embeddingFeature: SparseContinuous)
extends IRecordOneToOneAdapter[SimClustersEmbedding] {
override def getFeatureContext: FeatureContext = new FeatureContext(embeddingFeature)
override def adaptToDataRecord(embedding: SimClustersEmbedding): DataRecord = {
val embeddingMap = embedding.embedding.map {
case (clusterId, score) =>
(clusterId.toString, score)
}.toMap
new DataRecord().setFeatureValue(embeddingFeature, embeddingMap)
}
}
class NormalizedSimClustersEmbeddingAdapter(
embeddingFeature: SparseContinuous,
normFeature: Continuous)
extends IRecordOneToOneAdapter[SimClustersEmbedding] {
override def getFeatureContext: FeatureContext = new FeatureContext(embeddingFeature, normFeature)
override def adaptToDataRecord(embedding: SimClustersEmbedding): DataRecord = {
val normalizedEmbedding = Map(
embedding.sortedClusterIds.map(_.toString).zip(embedding.normalizedSortedScores): _*)
val dataRecord = new DataRecord().setFeatureValue(embeddingFeature, normalizedEmbedding)
dataRecord.setFeatureValue(normFeature, embedding.l2norm)
}
}
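
A minimal sketch of adapting an embedding into a DataRecord (the feature name and the single-argument SparseContinuous constructor are assumptions; check the ml.api Feature definitions before relying on them):

import com.twitter.ml.api.DataRecord
import com.twitter.ml.api.Feature.SparseContinuous
import com.twitter.simclusters_v2.common.SimClustersEmbedding

val embeddingFeature = new SparseContinuous("simclusters.embedding") // hypothetical feature name
val adapter = new SimClustersEmbeddingAdapter(embeddingFeature)

def toDataRecord(embedding: SimClustersEmbedding): DataRecord =
  adapter.adaptToDataRecord(embedding) // one sparse feature: clusterId (as a string) -> score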

View File

@ -1,17 +0,0 @@
package com.twitter.simclusters_v2
package object common {
type TweetId = Long
type UserId = Long
type ClusterId = Int
type SemanticCoreEntityId = Long // Use TopicId if it's a Topic related project.
type UTTEntityId = Long
type Timestamp = Long
type Language = String
type Country = String
type LocaleEntity = (Long, Language)
type TopicId = Long
type GroupId = Long
type SpaceId = String
}

View File

@ -1,164 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources
import com.twitter.bijection.scrooge.BinaryScalaCodec
import com.twitter.bijection.scrooge.CompactScalaCodec
import com.twitter.bijection.Bufferable
import com.twitter.bijection.Injection
import com.twitter.hermit.candidate.thriftscala.Candidates
import com.twitter.scalding.DateRange
import com.twitter.scalding.commons.source.VersionedKeyValSource
import com.twitter.scalding_internal.source.lzo_scrooge.DailySuffixMostRecentLzoScrooge
import com.twitter.scalding_internal.source.lzo_scrooge.FixedPathLzoScrooge
import com.twitter.scalding_internal.source.lzo_scrooge.HourlySuffixMostRecentLzoScrooge
import com.twitter.simclusters_v2.thriftscala._
case class EdgeWithDecayedWtsFixedPathSource(path: String)
extends FixedPathLzoScrooge[EdgeWithDecayedWeights](path, EdgeWithDecayedWeights)
case class UserAndNeighborsFixedPathSource(path: String)
extends FixedPathLzoScrooge[UserAndNeighbors](path, UserAndNeighbors)
case class NormsAndCountsFixedPathSource(path: String)
extends FixedPathLzoScrooge[NormsAndCounts](path, NormsAndCounts)
case class UserToInterestedInClustersFixedPathSource(path: String)
extends FixedPathLzoScrooge[UserToInterestedInClusters](path, UserToInterestedInClusters)
case class TimelineDataExtractorFixedPathSource(path: String)
extends FixedPathLzoScrooge[ReferenceTweets](path, ReferenceTweets)
case class TweetClusterScoresHourlySuffixSource(path: String, override val dateRange: DateRange)
extends HourlySuffixMostRecentLzoScrooge[TweetAndClusterScores](path, dateRange)
case class TweetTopKClustersHourlySuffixSource(path: String, override val dateRange: DateRange)
extends HourlySuffixMostRecentLzoScrooge[TweetTopKClustersWithScores](
path,
dateRange
)
case class ClusterTopKTweetsHourlySuffixSource(path: String, override val dateRange: DateRange)
extends HourlySuffixMostRecentLzoScrooge[ClusterTopKTweetsWithScores](
path,
dateRange
)
case class TweetSimilarityUnhydratedPairsSource(path: String, override val dateRange: DateRange)
extends DailySuffixMostRecentLzoScrooge[LabelledTweetPairs](
path,
dateRange
)
case class WTFCandidatesSource(path: String)
extends FixedPathLzoScrooge[Candidates](path, Candidates)
case class EmbeddingsLiteSource(path: String)
extends FixedPathLzoScrooge[EmbeddingsLite](path, EmbeddingsLite)
object AdhocKeyValSources {
def interestedInSource(path: String): VersionedKeyValSource[Long, ClustersUserIsInterestedIn] = {
implicit val keyInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian
implicit val valInject: Injection[ClustersUserIsInterestedIn, Array[Byte]] =
CompactScalaCodec(ClustersUserIsInterestedIn)
VersionedKeyValSource[Long, ClustersUserIsInterestedIn](path)
}
def clusterDetailsSource(path: String): VersionedKeyValSource[(String, Int), ClusterDetails] = {
implicit val keyInject: Injection[(String, Int), Array[Byte]] =
Bufferable.injectionOf[(String, Int)]
implicit val valInject: Injection[ClusterDetails, Array[Byte]] =
CompactScalaCodec(ClusterDetails)
VersionedKeyValSource[(String, Int), ClusterDetails](path)
}
def bipartiteQualitySource(
path: String
): VersionedKeyValSource[(String, Int), BipartiteClusterQuality] = {
implicit val keyInject: Injection[(String, Int), Array[Byte]] =
Bufferable.injectionOf[(String, Int)]
implicit val valInject: Injection[BipartiteClusterQuality, Array[Byte]] =
CompactScalaCodec(BipartiteClusterQuality)
VersionedKeyValSource[(String, Int), BipartiteClusterQuality](path)
}
def entityToClustersSource(
path: String
): VersionedKeyValSource[SimClustersEmbeddingId, SimClustersEmbedding] = {
implicit val keyInject: Injection[SimClustersEmbeddingId, Array[Byte]] =
BinaryScalaCodec(SimClustersEmbeddingId)
implicit val valInject: Injection[SimClustersEmbedding, Array[Byte]] =
BinaryScalaCodec(SimClustersEmbedding)
VersionedKeyValSource[SimClustersEmbeddingId, SimClustersEmbedding](path)
}
def clusterToEntitiesSource(
path: String
): VersionedKeyValSource[SimClustersEmbeddingId, InternalIdEmbedding] = {
implicit val keyInject: Injection[SimClustersEmbeddingId, Array[Byte]] = BinaryScalaCodec(
SimClustersEmbeddingId)
implicit val valInject: Injection[InternalIdEmbedding, Array[Byte]] =
BinaryScalaCodec(InternalIdEmbedding)
VersionedKeyValSource[SimClustersEmbeddingId, InternalIdEmbedding](path)
}
// For storing producer-simclusters embeddings
def topProducerToClusterEmbeddingsSource(
path: String
): VersionedKeyValSource[Long, TopSimClustersWithScore] = {
implicit val keyInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian
implicit val valInject: Injection[TopSimClustersWithScore, Array[Byte]] =
CompactScalaCodec(TopSimClustersWithScore)
VersionedKeyValSource[Long, TopSimClustersWithScore](path)
}
// For storing producer-simclusters embeddings
def topClusterEmbeddingsToProducerSource(
path: String
): VersionedKeyValSource[PersistedFullClusterId, TopProducersWithScore] = {
implicit val keyInject: Injection[PersistedFullClusterId, Array[Byte]] =
CompactScalaCodec(PersistedFullClusterId)
implicit val valInject: Injection[TopProducersWithScore, Array[Byte]] =
CompactScalaCodec(TopProducersWithScore)
VersionedKeyValSource[PersistedFullClusterId, TopProducersWithScore](path)
}
def userToInferredEntitiesSource(
path: String
): VersionedKeyValSource[Long, SimClustersInferredEntities] = {
implicit val keyInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian
implicit val valInject: Injection[SimClustersInferredEntities, Array[Byte]] =
CompactScalaCodec(SimClustersInferredEntities)
VersionedKeyValSource[Long, SimClustersInferredEntities](path)
}
def knownForAdhocSource(path: String): VersionedKeyValSource[Long, ClustersUserIsKnownFor] = {
implicit val keyInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian
implicit val valInject: Injection[ClustersUserIsKnownFor, Array[Byte]] =
CompactScalaCodec(ClustersUserIsKnownFor)
VersionedKeyValSource[Long, ClustersUserIsKnownFor](path)
}
def knownForSBFResultsDevelSource(
path: String
): VersionedKeyValSource[Long, Array[(Int, Float)]] = {
implicit val keyInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian
implicit val valInject: Injection[Array[(Int, Float)], Array[Byte]] =
Bufferable.injectionOf[Array[(Int, Float)]]
VersionedKeyValSource[Long, Array[(Int, Float)]](path)
}
// injection to store the adjacency list in the mapped-indices space for users
def intermediateSBFResultsDevelSource(
path: String
): VersionedKeyValSource[Int, List[(Int, Float)]] = {
implicit val keyInject: Injection[Int, Array[Byte]] = Injection.int2BigEndian
implicit val valInject: Injection[List[(Int, Float)], Array[Byte]] =
Bufferable.injectionOf[List[(Int, Float)]]
VersionedKeyValSource[Int, List[(Int, Float)]](path)
}
def mappedIndicesDevelSource(path: String): VersionedKeyValSource[Int, Long] = {
implicit val keyInject: Injection[Int, Array[Byte]] = Injection.int2BigEndian
implicit val valInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian
VersionedKeyValSource[Int, Long](path)
}
}
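
A hypothetical adhoc write (the path is a placeholder; writeExecution is scalding's TypedPipe sink method): persist an interested-in pipe to a versioned key-val store for manual inspection.

import com.twitter.scalding.Execution
import com.twitter.scalding.typed.TypedPipe
import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn

def writeInterestedInAdhoc(
  interestedIn: TypedPipe[(Long, ClustersUserIsInterestedIn)]
): Execution[Unit] =
  interestedIn.writeExecution(
    AdhocKeyValSources.interestedInSource("/user/cassowary/adhoc/<ldap>/interested_in_debug"))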

File diff suppressed because it is too large

View File

@ -1,49 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources
object DataPaths {
val InterestedIn2020Path =
"/user/cassowary/manhattan_sequence_files/simclusters_v2_interested_in_20M_145K_2020"
val InterestedIn2020ThriftPath =
"/user/cassowary/manhattan_sequence_files/simclusters_v2_interested_in_20M_145K_2020_thrift"
val InterestedInLite2020Path =
"/user/cassowary/manhattan_sequence_files/simclusters_v2_interested_in_lite_20M_145K_2020"
val InterestedInLite2020ThriftPath =
"/user/cassowary/manhattan_sequence_files/simclusters_v2_interested_in_lite_20M_145K_2020_thrift"
val KnownFor2020Path =
"/user/cassowary/manhattan_sequence_files/simclusters_v2_known_for_20M_145K_2020"
// keep this inside /user/cassowary/manhattan_sequence_files/ to use the latest-3 retention policy
val KnownFor2020ThriftDatasetPath =
"/user/cassowary/manhattan_sequence_files/simclusters_v2_known_for_20M_145K_2020_thrift"
val OfflineClusterTopMediaTweets2020DatasetPath =
"/user/cassowary/manhattan_sequence_files/cluster_top_media_tweets_20M_145K_2020"
}
/**
* These should only be accessed from the simclusters_v2 data pipeline for intermediate data;
* they are not opt-out compliant and shouldn't be exposed externally.
*/
object InternalDataPaths {
// Internal versions, not to be read or written outside of simcluster_v2
private[simclusters_v2] val RawInterestedIn2020Path =
"/user/cassowary/manhattan_sequence_files/simclusters_v2_raw_interested_in_20M_145K_2020"
private[simclusters_v2] val RawInterestedInLite2020Path =
"/user/cassowary/manhattan_sequence_files/simclusters_v2_raw_interested_in_lite_20M_145K_2020"
private[simclusters_v2] val RawKnownForDec11Path =
"/user/cassowary/manhattan_sequence_files/simclusters_v2_raw_known_for_20M_145K_dec11"
private[simclusters_v2] val RawKnownForUpdatedPath =
"/user/cassowary/manhattan_sequence_files/simclusters_v2_raw_known_for_20M_145K_updated"
private[simclusters_v2] val RawKnownFor2020Path =
"/user/cassowary/manhattan_sequence_files/simclusters_v2_raw_known_for_20M_145K_2020"
}

View File

@ -1,39 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources
import com.twitter.scalding.DateOps
import com.twitter.scalding.DateRange
import com.twitter.scalding.Days
import com.twitter.scalding.TypedPipe
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation
import com.twitter.scalding_internal.dalv2.remote_access.ProcAtla
import com.twitter.simclusters_v2.thriftscala.NormsAndCounts
import com.twitter.simclusters_v2.thriftscala.UserAndNeighbors
import java.util.TimeZone
object DataSources {
/**
* Reads production normalized graph data from atla-proc
*/
def userUserNormalizedGraphSource(implicit dateRange: DateRange): TypedPipe[UserAndNeighbors] = {
DAL
.readMostRecentSnapshotNoOlderThan(UserUserNormalizedGraphScalaDataset, Days(14)(DateOps.UTC))
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
.toTypedPipe
}
/**
* Reads production user norms and counts data from atla-proc
*/
def userNormsAndCounts(
implicit dateRange: DateRange,
timeZone: TimeZone
): TypedPipe[NormsAndCounts] = {
DAL
.readMostRecentSnapshot(ProducerNormsAndCountsScalaDataset, dateRange.prepend(Days(14)))
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
.toTypedPipe
}
}

View File

@ -1,222 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources
import com.twitter.dal.client.dataset.KeyValDALDataset
import com.twitter.scalding.DateRange
import com.twitter.scalding.typed.TypedPipe
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossDC
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.simclusters_v2.thriftscala._
import com.twitter.wtf.entity_real_graph.thriftscala.EntityType
import com.twitter.simclusters_v2.common.ClusterId
import com.twitter.simclusters_v2.common.ModelVersions
object EntityEmbeddingsSources {
final val SemanticCoreSimClustersEmbeddingsDec11Dataset =
SemanticCoreSimclustersEmbeddingsScalaDataset
final val SemanticCoreSimClustersEmbeddingsUpdatedDataset =
SemanticCoreSimclustersEmbeddingsUpdatedScalaDataset
final val SemanticCoreSimClustersEmbeddings2020Dataset =
SemanticCoreSimclustersEmbeddings2020ScalaDataset
final val SemanticCorePerLanguageSimClustersEmbeddingsDataset =
SemanticCorePerLanguageSimclustersEmbeddingsScalaDataset
final val LogFavSemanticCorePerLanguageSimClustersEmbeddingsDataset =
LogFavSemanticCorePerLanguageSimclustersEmbeddingsScalaDataset
final val HashtagSimClustersEmbeddingsUpdatedDataset =
HashtagSimclustersEmbeddingsUpdatedScalaDataset
final val ReverseIndexSemanticCoreSimClustersEmbeddingsDec11Dataset =
ReverseIndexSemanticCoreSimclustersEmbeddingsScalaDataset
final val ReverseIndexSemanticCoreSimClustersEmbeddingsUpdatedDataset =
ReverseIndexSemanticCoreSimclustersEmbeddingsUpdatedScalaDataset
final val ReverseIndexSemanticCoreSimClustersEmbeddings2020Dataset =
ReverseIndexSemanticCoreSimclustersEmbeddings2020ScalaDataset
final val ReverseIndexSemanticCorePerLanguageSimClustersEmbeddingsDataset =
ReverseIndexSemanticCorePerLanguageSimclustersEmbeddingsScalaDataset
final val LogFavReverseIndexSemanticCorePerLanguageSimClustersEmbeddingsDataset =
LogFavReverseIndexSemanticCorePerLanguageSimclustersEmbeddingsScalaDataset
final val ReverseIndexHashtagSimClustersEmbeddingsUpdatedDataset =
ReverseIndexHashtagSimclustersEmbeddingsUpdatedScalaDataset
// Fav-based TFG topic embeddings built from user device languages
// Keyed by SimClustersEmbeddingId with InternalId.TopicId ((topic, language) pair, with country = None)
final val FavTfgTopicEmbeddingsDataset = FavTfgTopicEmbeddingsScalaDataset
final val FavTfgTopicEmbeddingsParquetDataset = FavTfgTopicEmbeddingsParquetScalaDataset
final val FavTfgTopicEmbeddings2020Dataset = FavTfgTopicEmbeddings2020ScalaDataset
final val FavTfgTopicEmbeddings2020ParquetDataset = FavTfgTopicEmbeddings2020ParquetScalaDataset
// Logfav-based TFG topic embeddings built from user device languages
// Keyed by SimClustersEmbeddingId with InternalId.LocaleEntityId ((topic, language) pair)
final val LogFavTfgTopicEmbeddingsDataset = LogFavTfgTopicEmbeddingsScalaDataset
final val LogFavTfgTopicEmbeddingsParquetDataset = LogFavTfgTopicEmbeddingsParquetScalaDataset
// Fav-based TFG topic embeddings built from inferred user consumed languages
// Keyed by SimClustersEmbeddingId with InternalId.TopicId ((topic, country, language) tuple)
final val FavInferredLanguageTfgTopicEmbeddingsDataset =
FavInferredLanguageTfgTopicEmbeddingsScalaDataset
private val validSemanticCoreEmbeddingTypes = Seq(
EmbeddingType.FavBasedSematicCoreEntity,
EmbeddingType.FollowBasedSematicCoreEntity
)
/**
* Given a fav/follow/etc embedding type and a ModelVersion, retrieve the corresponding dataset to
* (SemanticCore entityId -> List(clusterId)) from a certain dateRange.
*/
def getSemanticCoreEntityEmbeddingsSource(
embeddingType: EmbeddingType,
modelVersion: String,
dateRange: DateRange
): TypedPipe[(Long, SimClustersEmbedding)] = {
val dataSet = modelVersion match {
case ModelVersions.Model20M145KDec11 => SemanticCoreSimClustersEmbeddingsDec11Dataset
case ModelVersions.Model20M145KUpdated => SemanticCoreSimClustersEmbeddingsUpdatedDataset
case _ => throw new IllegalArgumentException(s"ModelVersion $modelVersion is not supported")
}
assert(validSemanticCoreEmbeddingTypes.contains(embeddingType))
entityEmbeddingsSource(dataSet, embeddingType, dateRange)
}
/**
* Given a fav/follow/etc embedding type and a ModelVersion, retrieve the corresponding dataset to
* (clusterId -> List(SemanticCore entityId)) from a certain dateRange.
*/
def getReverseIndexedSemanticCoreEntityEmbeddingsSource(
embeddingType: EmbeddingType,
modelVersion: String,
dateRange: DateRange
): TypedPipe[(ClusterId, Seq[SemanticCoreEntityWithScore])] = {
val dataSet = modelVersion match {
case ModelVersions.Model20M145KDec11 =>
ReverseIndexSemanticCoreSimClustersEmbeddingsDec11Dataset
case ModelVersions.Model20M145KUpdated =>
ReverseIndexSemanticCoreSimClustersEmbeddingsUpdatedDataset
case ModelVersions.Model20M145K2020 =>
ReverseIndexSemanticCoreSimClustersEmbeddings2020Dataset
case _ => throw new IllegalArgumentException(s"ModelVersion $modelVersion is not supported")
}
assert(validSemanticCoreEmbeddingTypes.contains(embeddingType))
reverseIndexedEntityEmbeddingsSource(dataSet, embeddingType, dateRange)
}
// Return the raw DAL dataset reference. Use this if you're writing to DAL.
def getEntityEmbeddingsDataset(
entityType: EntityType,
modelVersion: String,
isEmbeddingsPerLocale: Boolean = false
): KeyValDALDataset[KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]] = {
(entityType, modelVersion) match {
case (EntityType.SemanticCore, ModelVersions.Model20M145KDec11) =>
SemanticCoreSimClustersEmbeddingsDec11Dataset
case (EntityType.SemanticCore, ModelVersions.Model20M145KUpdated) =>
if (isEmbeddingsPerLocale) {
SemanticCorePerLanguageSimClustersEmbeddingsDataset
} else {
SemanticCoreSimClustersEmbeddingsUpdatedDataset
}
case (EntityType.SemanticCore, ModelVersions.Model20M145K2020) =>
SemanticCoreSimClustersEmbeddings2020Dataset
case (EntityType.Hashtag, ModelVersions.Model20M145KUpdated) =>
HashtagSimClustersEmbeddingsUpdatedDataset
case (entityType, modelVersion) =>
throw new IllegalArgumentException(
s"(Entity Type, ModelVersion) ($entityType, $modelVersion) not supported.")
}
}
// Return the raw DAL dataset reference. Use this if you're writing to DAL.
def getReverseIndexedEntityEmbeddingsDataset(
entityType: EntityType,
modelVersion: String,
isEmbeddingsPerLocale: Boolean = false
): KeyValDALDataset[KeyVal[SimClustersEmbeddingId, InternalIdEmbedding]] = {
(entityType, modelVersion) match {
case (EntityType.SemanticCore, ModelVersions.Model20M145KDec11) =>
ReverseIndexSemanticCoreSimClustersEmbeddingsDec11Dataset
case (EntityType.SemanticCore, ModelVersions.Model20M145KUpdated) =>
if (isEmbeddingsPerLocale) {
ReverseIndexSemanticCorePerLanguageSimClustersEmbeddingsDataset
} else {
ReverseIndexSemanticCoreSimClustersEmbeddingsUpdatedDataset
}
case (EntityType.SemanticCore, ModelVersions.Model20M145K2020) =>
ReverseIndexSemanticCoreSimClustersEmbeddings2020Dataset
case (EntityType.Hashtag, ModelVersions.Model20M145KUpdated) =>
ReverseIndexHashtagSimClustersEmbeddingsUpdatedDataset
case (entityType, modelVersion) =>
throw new IllegalArgumentException(
s"(Entity Type, ModelVersion) ($entityType, $modelVersion) not supported.")
}
}
private def entityEmbeddingsSource(
dataset: KeyValDALDataset[KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]],
embeddingType: EmbeddingType,
dateRange: DateRange
): TypedPipe[(Long, SimClustersEmbedding)] = {
val pipe = DAL
.readMostRecentSnapshot(dataset, dateRange)
.withRemoteReadPolicy(AllowCrossDC)
.toTypedPipe
filterEntityEmbeddingsByType(pipe, embeddingType)
}
private def reverseIndexedEntityEmbeddingsSource(
dataset: KeyValDALDataset[KeyVal[SimClustersEmbeddingId, InternalIdEmbedding]],
embeddingType: EmbeddingType,
dateRange: DateRange
): TypedPipe[(ClusterId, Seq[SemanticCoreEntityWithScore])] = {
val pipe = DAL
.readMostRecentSnapshot(dataset, dateRange)
.withRemoteReadPolicy(AllowCrossDC)
.toTypedPipe
filterReverseIndexedEntityEmbeddingsByType(pipe, embeddingType)
}
private[hdfs_sources] def filterEntityEmbeddingsByType(
pipe: TypedPipe[KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]],
embeddingType: EmbeddingType
): TypedPipe[(Long, SimClustersEmbedding)] = {
pipe.collect {
case KeyVal(
SimClustersEmbeddingId(_embeddingType, _, InternalId.EntityId(entityId)),
embedding
) if _embeddingType == embeddingType =>
(entityId, embedding)
}
}
private[hdfs_sources] def filterReverseIndexedEntityEmbeddingsByType(
pipe: TypedPipe[KeyVal[SimClustersEmbeddingId, InternalIdEmbedding]],
embeddingType: EmbeddingType
): TypedPipe[(ClusterId, Seq[SemanticCoreEntityWithScore])] = {
pipe.collect {
case KeyVal(
SimClustersEmbeddingId(_embeddingType, _, InternalId.ClusterId(clusterId)),
embedding
) if _embeddingType == embeddingType =>
val entitiesWithScores = embedding.embedding.collect {
case InternalIdWithScore(InternalId.EntityId(entityId), score) =>
SemanticCoreEntityWithScore(entityId, score)
}
(clusterId, entitiesWithScores)
}
}
}
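
A read sketch (the embedding type / model version pair is just one valid combination per the helpers above; the method name and arguments come straight from this file):

import com.twitter.scalding.DateRange
import com.twitter.scalding.typed.TypedPipe
import com.twitter.simclusters_v2.common.ModelVersions
import com.twitter.simclusters_v2.thriftscala.{EmbeddingType, SimClustersEmbedding}

def favBasedEntityEmbeddings(
  dateRange: DateRange
): TypedPipe[(Long, SimClustersEmbedding)] =
  EntityEmbeddingsSources.getSemanticCoreEntityEmbeddingsSource(
    EmbeddingType.FavBasedSematicCoreEntity,
    ModelVersions.Model20M145KUpdated,
    dateRange
  )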

View File

@ -1,178 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources
import com.twitter.dal.client.dataset.KeyValDALDataset
import com.twitter.scalding.{DateOps, DateRange, Days, TypedPipe}
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.remote_access.{ExplicitLocation, ProcAtla}
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.thriftscala.ModelVersion
import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn
import java.util.TimeZone
object InterestedInSources {
private val ModelVersionInterestedInDatasetMap: Map[ModelVersion, KeyValDALDataset[
KeyVal[UserId, ClustersUserIsInterestedIn]
]] = Map(
ModelVersion.Model20m145kDec11 -> SimclustersV2InterestedInScalaDataset,
ModelVersion.Model20m145kUpdated -> SimclustersV2InterestedIn20M145KUpdatedScalaDataset,
ModelVersion.Model20m145k2020 -> SimclustersV2InterestedIn20M145K2020ScalaDataset
)
/**
* Internal version, not PDP compliant, not to be used outside simclusters_v2
* Reads 20M145KDec11 production InterestedIn data from atla-proc, with a 14-day extended window
*/
private[simclusters_v2] def simClustersRawInterestedInDec11Source(
dateRange: DateRange,
timeZone: TimeZone
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
DAL
.readMostRecentSnapshot(
SimclustersV2RawInterestedIn20M145KDec11ScalaDataset,
dateRange.prepend(Days(14)(timeZone))
)
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
.toTypedPipe
.map {
case KeyVal(userId, clustersUserIsInterestedIn) =>
(userId, clustersUserIsInterestedIn)
}
}
/**
* Internal version, not PDP compliant, not to be used outside simclusters_v2
* Reads 20M145KUpdated InterestedIn data from atla-proc, with a 14-day extended window
*/
private[simclusters_v2] def simClustersRawInterestedInUpdatedSource(
dateRange: DateRange,
timeZone: TimeZone
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
DAL
.readMostRecentSnapshot(
SimclustersV2RawInterestedIn20M145KUpdatedScalaDataset,
dateRange.prepend(Days(14)(timeZone))
)
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
.toTypedPipe.map {
case KeyVal(userId, clustersUserIsInterestedIn) =>
(userId, clustersUserIsInterestedIn)
}
}
/**
* Internal version, not PDP compliant, not to be used outside simclusters_v2
* Reads 20M145K2020 InterestedIn data from atla-proc, with a 14-day extended window
*/
private[simclusters_v2] def simClustersRawInterestedIn2020Source(
dateRange: DateRange,
timeZone: TimeZone
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
DAL
.readMostRecentSnapshot(
SimclustersV2RawInterestedIn20M145K2020ScalaDataset,
dateRange.prepend(Days(14)(timeZone))
)
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
.toTypedPipe.map {
case KeyVal(userId, clustersUserIsInterestedIn) =>
(userId, clustersUserIsInterestedIn)
}
}
private[simclusters_v2] def simClustersRawInterestedInLite2020Source(
dateRange: DateRange,
timeZone: TimeZone
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
DAL
.readMostRecentSnapshot(
SimclustersV2RawInterestedInLite20M145K2020ScalaDataset,
dateRange.extend(Days(14)(timeZone)))
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
.toTypedPipe.map {
case KeyVal(userId, clustersUserIsInterestedIn) =>
(userId, clustersUserIsInterestedIn)
}
}
/**
* Reads 20M145KDec11 production InterestedIn data from atla-proc, with a 14-day extended window
*/
def simClustersInterestedInDec11Source(
dateRange: DateRange,
timeZone: TimeZone
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
DAL
.readMostRecentSnapshot(
SimclustersV2InterestedInScalaDataset,
dateRange.prepend(Days(14)(timeZone)))
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
.toTypedPipe.map {
case KeyVal(userId, clustersUserIsInterestedIn) =>
(userId, clustersUserIsInterestedIn)
}
}
/**
* Reads 20M145KUpdated InterestedIn data from atla-proc, with a 14-day extended window
*/
def simClustersInterestedInUpdatedSource(
dateRange: DateRange,
timeZone: TimeZone
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
DAL
.readMostRecentSnapshot(
SimclustersV2InterestedIn20M145KUpdatedScalaDataset,
dateRange.prepend(Days(14)(timeZone))
)
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
.toTypedPipe.map {
case KeyVal(userId, clustersUserIsInterestedIn) =>
(userId, clustersUserIsInterestedIn)
}
}
/**
* Reads 20M145K2020 InterestedIn data from atla-proc, with a 14-day extended window
*/
def simClustersInterestedIn2020Source(
dateRange: DateRange,
timeZone: TimeZone
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
DAL
.readMostRecentSnapshot(
SimclustersV2InterestedIn20M145K2020ScalaDataset,
dateRange.prepend(Days(14)(timeZone))
)
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
.toTypedPipe.map {
case KeyVal(userId, clustersUserIsInterestedIn) =>
(userId, clustersUserIsInterestedIn)
}
}
/**
* Reads InterestedIn data based on ModelVersion from atla-proc, with a 14-day extended window
*/
def simClustersInterestedInSource(
modelVersion: ModelVersion,
dateRange: DateRange,
timeZone: TimeZone
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
DAL
.readMostRecentSnapshot(
ModelVersionInterestedInDatasetMap(modelVersion),
dateRange.prepend(Days(14)(timeZone))
)
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
.toTypedPipe.map {
case KeyVal(userId, clustersUserIsInterestedIn) =>
(userId, clustersUserIsInterestedIn)
}
}
}
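
A read sketch (UTC and the 2020 model are illustrative choices): the model-version-keyed helper picks the right dataset and prepends the 14-day window internally.

import com.twitter.scalding.DateRange
import com.twitter.scalding.typed.TypedPipe
import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.thriftscala.{ClustersUserIsInterestedIn, ModelVersion}
import java.util.TimeZone

def interestedIn2020(
  dateRange: DateRange
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] =
  InterestedInSources.simClustersInterestedInSource(
    ModelVersion.Model20m145k2020,
    dateRange,
    TimeZone.getTimeZone("UTC")
  )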

View File

@ -1,86 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources
import com.twitter.scalding.DateRange
import com.twitter.scalding.TypedPipe
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossClusterSameDC
import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation
import com.twitter.scalding_internal.dalv2.remote_access.Proc3Atla
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.simclusters_v2.thriftscala.EmbeddingType
import com.twitter.simclusters_v2.thriftscala.InternalId
import com.twitter.simclusters_v2.thriftscala.ModelVersion
import com.twitter.simclusters_v2.thriftscala.SimClustersEmbedding
import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId
import com.twitter.simclusters_v2.thriftscala.TopSimClustersWithScore
object ProducerEmbeddingSources {
/**
* Helper function to retrieve producer SimClusters embeddings with the legacy `TopSimClustersWithScore`
* value type.
*/
def producerEmbeddingSourceLegacy(
embeddingType: EmbeddingType,
modelVersion: ModelVersion
)(
implicit dateRange: DateRange
): TypedPipe[(Long, TopSimClustersWithScore)] = {
val producerEmbeddingDataset = (embeddingType, modelVersion) match {
case (EmbeddingType.ProducerFollowBasedSemanticCoreEntity, ModelVersion.Model20m145kDec11) =>
ProducerTopKSimclusterEmbeddingsByFollowScoreScalaDataset
case (EmbeddingType.ProducerFavBasedSemanticCoreEntity, ModelVersion.Model20m145kDec11) =>
ProducerTopKSimclusterEmbeddingsByFavScoreScalaDataset
case (
EmbeddingType.ProducerFollowBasedSemanticCoreEntity,
ModelVersion.Model20m145kUpdated) =>
ProducerTopKSimclusterEmbeddingsByFollowScoreUpdatedScalaDataset
case (EmbeddingType.ProducerFavBasedSemanticCoreEntity, ModelVersion.Model20m145kUpdated) =>
ProducerTopKSimclusterEmbeddingsByFavScoreUpdatedScalaDataset
case (_, _) =>
throw new ClassNotFoundException(
"Unsupported embedding type: " + embeddingType + " and model version: " + modelVersion)
}
DAL
.readMostRecentSnapshot(producerEmbeddingDataset).withRemoteReadPolicy(
AllowCrossClusterSameDC)
.toTypedPipe.map {
case KeyVal(producerId, topSimClustersWithScore) =>
(producerId, topSimClustersWithScore)
}
}
def producerEmbeddingSource(
embeddingType: EmbeddingType,
modelVersion: ModelVersion
)(
implicit dateRange: DateRange
): TypedPipe[(Long, SimClustersEmbedding)] = {
val producerEmbeddingDataset = (embeddingType, modelVersion) match {
case (EmbeddingType.AggregatableLogFavBasedProducer, ModelVersion.Model20m145k2020) =>
AggregatableProducerSimclustersEmbeddingsByLogFavScore2020ScalaDataset
case (EmbeddingType.AggregatableFollowBasedProducer, ModelVersion.Model20m145k2020) =>
AggregatableProducerSimclustersEmbeddingsByFollowScore2020ScalaDataset
case (EmbeddingType.RelaxedAggregatableLogFavBasedProducer, ModelVersion.Model20m145k2020) =>
AggregatableProducerSimclustersEmbeddingsByLogFavScoreRelaxedFavEngagementThreshold2020ScalaDataset
case (_, _) =>
throw new ClassNotFoundException(
"Unsupported embedding type: " + embeddingType + " and model version: " + modelVersion)
}
DAL
.readMostRecentSnapshot(
producerEmbeddingDataset
)
.withRemoteReadPolicy(ExplicitLocation(Proc3Atla))
.toTypedPipe
.map {
case KeyVal(
SimClustersEmbeddingId(_, _, InternalId.UserId(producerId: Long)),
embedding: SimClustersEmbedding) =>
(producerId, embedding)
}
}
}
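
A read sketch (the embedding type / model version pair is one of the combinations handled above):

import com.twitter.scalding.DateRange
import com.twitter.scalding.typed.TypedPipe
import com.twitter.simclusters_v2.thriftscala.{EmbeddingType, ModelVersion, SimClustersEmbedding}

def logFavProducerEmbeddings(
  implicit dateRange: DateRange
): TypedPipe[(Long, SimClustersEmbedding)] =
  ProducerEmbeddingSources.producerEmbeddingSource(
    EmbeddingType.AggregatableLogFavBasedProducer,
    ModelVersion.Model20m145k2020
  )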

View File

@ -1,13 +0,0 @@
scala_library(
sources = ["*.scala"],
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
"src/scala/com/twitter/scalding_internal/multiformat/format",
"src/scala/com/twitter/simclusters_v2/common",
"src/thrift/com/twitter/hermit/candidate:hermit-candidate-scala",
"src/thrift/com/twitter/ml/api:embedding-scala",
"src/thrift/com/twitter/recos/entities:entities-thrift-scala",
"src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala",
],
)

View File

@ -1,16 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections
import com.twitter.bijection.Bufferable
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{
ScalaCompactThrift,
genericInjection
}
import com.twitter.simclusters_v2.thriftscala.ClusterDetails
object ClusterDetailsInjection {
val injection = KeyValInjection[(String, Int), ClusterDetails](
genericInjection(Bufferable.injectionOf[(String, Int)]),
ScalaCompactThrift(ClusterDetails)
)
}

View File

@ -1,13 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift
import com.twitter.simclusters_v2.thriftscala.{TweetsWithScore, DayPartitionedClusterId}
object ClusterTopMediaTweetsInjection {
val injection = KeyValInjection[DayPartitionedClusterId, TweetsWithScore](
ScalaCompactThrift(DayPartitionedClusterId),
ScalaCompactThrift(TweetsWithScore)
)
}

View File

@ -1,14 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift
import com.twitter.simclusters_v2.thriftscala.TopKTweetsWithScores
import com.twitter.simclusters_v2.thriftscala.FullClusterId
object ClusterTopTweetsInjection {
val clusterIdToTopKTweetsInjection = KeyValInjection[FullClusterId, TopKTweetsWithScores](
ScalaCompactThrift(FullClusterId),
ScalaCompactThrift(TopKTweetsWithScores)
)
}

View File

@ -1,16 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaBinaryThrift
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.Long2BigEndian
import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.thriftscala._
object ClusteringInjections {
final val OrderedClustersAndMembersInjection: KeyValInjection[
UserId,
OrderedClustersAndMembers
] =
KeyValInjection(Long2BigEndian, ScalaBinaryThrift(OrderedClustersAndMembers))
}

View File

@ -1,47 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaBinaryThrift
import com.twitter.simclusters_v2.thriftscala._
import com.twitter.ml.api.thriftscala.Embedding
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.Long2BigEndian
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift
object EntityEmbeddingsInjections {
final val EntitySimClustersEmbeddingInjection: KeyValInjection[
SimClustersEmbeddingId,
SimClustersEmbedding
] =
KeyValInjection(
ScalaBinaryThrift(SimClustersEmbeddingId),
ScalaBinaryThrift(SimClustersEmbedding)
)
final val InternalIdEmbeddingInjection: KeyValInjection[
SimClustersEmbeddingId,
InternalIdEmbedding
] =
KeyValInjection(
ScalaBinaryThrift(SimClustersEmbeddingId),
ScalaBinaryThrift(InternalIdEmbedding)
)
final val EntitySimClustersMultiEmbeddingInjection: KeyValInjection[
SimClustersMultiEmbeddingId,
SimClustersMultiEmbedding
] =
KeyValInjection(
ScalaBinaryThrift(SimClustersMultiEmbeddingId),
ScalaBinaryThrift(SimClustersMultiEmbedding)
)
final val UserMbcgEmbeddingInjection: KeyValInjection[
Long,
Embedding
] =
KeyValInjection[Long, Embedding](
Long2BigEndian,
ScalaCompactThrift(Embedding)
)
}

View File

@ -1,27 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{
Int2BigEndian,
Long2BigEndian,
ScalaCompactThrift
}
import com.twitter.simclusters_v2.thriftscala.SimClustersInferredEntities
object InferredEntitiesInjections {
final val InferredEntityInjection: KeyValInjection[Long, SimClustersInferredEntities] =
KeyValInjection(
Long2BigEndian,
ScalaCompactThrift(SimClustersInferredEntities)
)
final val InferredEntityKeyedByClusterInjection: KeyValInjection[
Int,
SimClustersInferredEntities
] =
KeyValInjection(
Int2BigEndian,
ScalaCompactThrift(SimClustersInferredEntities)
)
}

View File

@ -1,13 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.StringUtf8
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.Long2BigEndian
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift
import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn
object InterestedInInjection {
val injection = KeyValInjection(Long2BigEndian, ScalaCompactThrift(ClustersUserIsInterestedIn))
val languageInjection =
KeyValInjection(StringUtf8, ScalaCompactThrift(ClustersUserIsInterestedIn))
}

View File

@ -1,12 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{
Long2BigEndian,
ScalaCompactThrift
}
import com.twitter.simclusters_v2.thriftscala._
object KnownForInjection {
val injection = KeyValInjection(Long2BigEndian, ScalaCompactThrift(ClustersUserIsKnownFor))
}

View File

@ -1,31 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift
import com.twitter.simclusters_v2.thriftscala.LeftNode
import com.twitter.simclusters_v2.thriftscala.NounWithFrequencyList
import com.twitter.simclusters_v2.thriftscala.RightNode
import com.twitter.simclusters_v2.thriftscala.RightNodeTypeStruct
import com.twitter.simclusters_v2.thriftscala.RightNodeWithEdgeWeightList
import com.twitter.simclusters_v2.thriftscala.SimilarRightNodes
import com.twitter.simclusters_v2.thriftscala.CandidateTweetsList
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.Long2BigEndian
object MultiTypeGraphInjections {
final val truncatedMultiTypeGraphInjection =
KeyValInjection(ScalaCompactThrift(LeftNode), ScalaCompactThrift(RightNodeWithEdgeWeightList))
final val topKRightNounListInjection =
KeyValInjection(
ScalaCompactThrift(RightNodeTypeStruct),
ScalaCompactThrift(NounWithFrequencyList))
final val similarRightNodesInjection =
KeyValInjection[RightNode, SimilarRightNodes](
ScalaCompactThrift(RightNode),
ScalaCompactThrift(SimilarRightNodes)
)
final val tweetRecommendationsInjection =
KeyValInjection[Long, CandidateTweetsList](
Long2BigEndian,
ScalaCompactThrift(CandidateTweetsList)
)
}

View File

@ -1,45 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections
import com.twitter.hermit.candidate.thriftscala.Candidates
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{
Long2BigEndian,
ScalaBinaryThrift,
ScalaCompactThrift
}
import com.twitter.simclusters_v2.thriftscala.{
PersistedFullClusterId,
SimClustersEmbedding,
SimClustersEmbeddingId,
TopProducersWithScore,
TopSimClustersWithScore
}
object ProducerEmbeddingsInjections {
final val ProducerTopKSimClusterEmbeddingsInjection: KeyValInjection[
Long,
TopSimClustersWithScore
] =
KeyValInjection(
keyCodec = Long2BigEndian,
valueCodec = ScalaCompactThrift(TopSimClustersWithScore))
final val SimClusterEmbeddingTopKProducersInjection: KeyValInjection[
PersistedFullClusterId,
TopProducersWithScore
] =
KeyValInjection(
keyCodec = ScalaCompactThrift(PersistedFullClusterId),
valueCodec = ScalaCompactThrift(TopProducersWithScore))
final val SimilarUsersInjection: KeyValInjection[Long, Candidates] =
KeyValInjection(keyCodec = Long2BigEndian, valueCodec = ScalaCompactThrift(Candidates))
final val ProducerSimClustersEmbeddingInjection: KeyValInjection[
SimClustersEmbeddingId,
SimClustersEmbedding
] =
KeyValInjection(
keyCodec = ScalaBinaryThrift(SimClustersEmbeddingId),
valueCodec = ScalaBinaryThrift(SimClustersEmbedding))
}

View File

@ -1,53 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{
Long2BigEndian,
ScalaCompactThrift,
StringUtf8
}
import com.twitter.recos.entities.thriftscala.{
SemanticCoreEntityScoreList,
SemanticCoreEntityWithLocale,
UserIdWithLocale,
UserScoreList
}
object SemanticCoreEntitiesInjections {
final val StringToSemanticCoreEntityScoreListInjection: KeyValInjection[
String,
SemanticCoreEntityScoreList
] =
KeyValInjection(
StringUtf8,
ScalaCompactThrift(SemanticCoreEntityScoreList)
)
final val LongToSemanticCoreEntityScoreListInjection: KeyValInjection[
Long,
SemanticCoreEntityScoreList
] =
KeyValInjection(
Long2BigEndian,
ScalaCompactThrift(SemanticCoreEntityScoreList)
)
final val UserWithLocaleToSemanticCoreEntityScoreListInjection: KeyValInjection[
UserIdWithLocale,
SemanticCoreEntityScoreList
] =
KeyValInjection(
ScalaCompactThrift(UserIdWithLocale),
ScalaCompactThrift(SemanticCoreEntityScoreList)
)
final val SemanticCoreEntityWithLocaleToUsersScoreListInjection: KeyValInjection[
SemanticCoreEntityWithLocale,
UserScoreList
] =
KeyValInjection(
ScalaCompactThrift(SemanticCoreEntityWithLocale),
ScalaCompactThrift(UserScoreList)
)
}

View File

@ -1,12 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{
Long2BigEndian,
ScalaCompactThrift
}
import com.twitter.simclusters_v2.thriftscala.SingleSideUserScores
object SingleSideUserScoresInjection {
val injection = KeyValInjection(Long2BigEndian, ScalaCompactThrift(SingleSideUserScores))
}

View File

@ -1,60 +0,0 @@
scala_library(
sources = ["*.scala"],
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
":data_sources",
"3rdparty/src/jvm/com/twitter/scalding:core",
"src/scala/com/twitter/scalding_internal/dalv2",
"src/scala/com/twitter/scalding_internal/multiformat/format",
"src/scala/com/twitter/scalding_internal/source/lzo_scrooge",
"src/scala/com/twitter/simclusters_v2/common",
"src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala",
"src/thrift/com/twitter/wtf/entity_real_graph:entity_real_graph-thrift-scala",
],
)
scala_library(
name = "data_sources",
sources = [],
description = "DAL datasets we wish to expose externally",
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
":reverse_index_semantic_core_per_language_simclusters_embeddings_presto-scala",
":semantic_core_per_language_simclusters_embeddings_presto-scala",
"src/scala/com/twitter/simclusters_v2/common",
],
)
create_datasets(
base_name = "reverse_index_semantic_core_per_language_simclusters_embeddings_presto",
java_schema = "com.twitter.simclusters_v2.thriftjava.InternalIdEmbeddingWithId",
platform = "java8",
role = "cassowary",
scala_schema = "com.twitter.simclusters_v2.thriftscala.InternalIdEmbeddingWithId",
segment_type = "snapshot",
tags = ["bazel-compatible"],
java_dependencies = [
"src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java",
],
scala_dependencies = [
"src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala",
],
)
create_datasets(
base_name = "semantic_core_per_language_simclusters_embeddings_presto",
java_schema = "com.twitter.simclusters_v2.thriftjava.SimClustersEmbeddingWithId",
platform = "java8",
role = "cassowary",
scala_schema = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingWithId",
segment_type = "snapshot",
tags = ["bazel-compatible"],
java_dependencies = [
"src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java",
],
scala_dependencies = [
"src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala",
],
)

View File

@ -1,10 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.presto_hdfs_sources
object EntityEmbeddingsPrestoSources {
final val SemanticCorePerLanguageSimClustersEmbeddingsDataset =
SemanticCorePerLanguageSimclustersEmbeddingsPrestoScalaDataset
final val ReverseIndexSemanticCorePerLanguageSimClustersEmbeddingsDataset =
ReverseIndexSemanticCorePerLanguageSimclustersEmbeddingsPrestoScalaDataset
}

Binary file not shown. (Before: 61 KiB)
Binary file not shown. (Before: 66 KiB)
Binary file not shown. (Before: 26 KiB)
Binary file not shown. (Before: 71 KiB)
Binary file not shown. (Before: 233 KiB)
Binary file not shown. (Before: 70 KiB)

View File

@ -1,521 +0,0 @@
scala_library(
sources = ["*.scala"],
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
"3rdparty/jvm/com/fasterxml/jackson:jackson-module-scala",
"3rdparty/jvm/com/fasterxml/jackson/core:jackson-core",
"3rdparty/jvm/com/fasterxml/jackson/core:jackson-databind",
"3rdparty/jvm/com/fasterxml/jackson/module:jackson-module-scala",
"3rdparty/jvm/com/googlecode/matrix-toolkits-java",
"3rdparty/jvm/com/twitter/storehaus:algebra",
"3rdparty/jvm/com/twitter/storehaus:core",
"escherbird/src/scala/com/twitter/escherbird/scalding/source",
"flockdb-tools/datasets/flock:flock-follows-edges-scala",
"src/java/com/twitter/ml/api/constant",
"src/java/com/twitter/sbf/core",
"src/java/com/twitter/sbf/graph",
"src/scala/com/twitter/frigate/user_sampler/common",
"src/scala/com/twitter/ml/api:api-base",
"src/scala/com/twitter/ml/api/bq",
"src/scala/com/twitter/pluck/source/cassowary:sims",
"src/scala/com/twitter/pluck/source/core_workflows/user_model:condensed_user_state-scala",
"src/scala/com/twitter/scalding_internal/dalv2",
"src/scala/com/twitter/scalding_internal/job",
"src/scala/com/twitter/scalding_internal/job/analytics_batch",
"src/scala/com/twitter/scalding_internal/source",
"src/scala/com/twitter/scalding_internal/source/lzo_scrooge",
"src/scala/com/twitter/simclusters_v2/candidate_source",
"src/scala/com/twitter/simclusters_v2/hdfs_sources",
"src/scala/com/twitter/simclusters_v2/scalding/common",
"src/scala/com/twitter/simclusters_v2/summingbird/common",
"src/scala/com/twitter/timelines/prediction/features/common",
"src/scala/com/twitter/timelines/prediction/features/itl",
"src/scala/com/twitter/timelines/prediction/features/recap",
"src/scala/com/twitter/wtf/entity_real_graph/scalding/common",
"src/thrift/com/twitter/hermit/candidate:hermit-candidate-scala",
"src/thrift/com/twitter/wtf/scalding/sims:sims-thrift-scala",
"twadoop_config/configuration/log_categories/group/recos-platform:content_recommender_get_content_recommendations-scala",
"twadoop_config/configuration/log_categories/group/recos-platform:content_recommender_get_topic_tweets_recommendations-scala",
"twadoop_config/configuration/log_categories/group/timeline:timeline_service_favorites-scala",
"usersource/snapshot/src/main/scala/com/twitter/usersource/snapshot/flat:usersource_flat-scala",
"usersource/snapshot/src/main/thrift/com/twitter/usersource/snapshot/flat:flat-scala",
"util/util-core:util-core-util",
],
)
hadoop_binary(
name = "evd_cluster_similarity",
main = "com.twitter.simclusters_v2.scalding.EigenVectorsForClusterSimilarityAdhoc",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "cluster_evaluation",
main = "com.twitter.simclusters_v2.scalding.ClusterEvaluationAdhoc",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "cluster_evaluation_20m_145k",
main = "com.twitter.simclusters_v2.scalding.ClusterEvaluationFor20M145K",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "cluster_evaluation_20m_145k_2020",
main = "com.twitter.simclusters_v2.scalding.ClusterEvaluationFor20M145K2020",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "bp_cluster_evaluation",
main = "com.twitter.simclusters_v2.scalding.BipartiteClusterEvaluation",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "update_knownfor",
main = "com.twitter.simclusters_v2.scalding.UpdateKnownForAdhoc",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "update_knownfor_prod",
main = "com.twitter.simclusters_v2.scalding.UpdateKnownFor20M145K",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "cluster_details",
main = "com.twitter.simclusters_v2.scalding.ClusterDetailsBatch",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "cluster_details_20m_145k_updated",
main = "com.twitter.simclusters_v2.scalding.ClusterDetails20M145KUpdated",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "cluster_details_20m_145k_2020",
main = "com.twitter.simclusters_v2.scalding.ClusterDetails20M145K2020",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "cluster_details-adhoc",
main = "com.twitter.simclusters_v2.scalding.ClusterDetailsAdhoc",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "cluster_details-dump",
main = "com.twitter.simclusters_v2.scalding.DumpClusterDetailsAdhoc",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "interested_in",
main = "com.twitter.simclusters_v2.scalding.InterestedInFromKnownForBatch",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "interested_in_from_producer_embeddings",
main = "com.twitter.simclusters_v2.scalding.InterestedInFromProducerEmbeddingsBatchApp",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "employee_graph_from_user_user",
main = "com.twitter.simclusters_v2.scalding.EmployeeGraphFromUserUser",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "interested_in_20m_145k_updated",
main = "com.twitter.simclusters_v2.scalding.InterestedInFromKnownFor20M145KUpdated",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "interested_in_20m_145k_2020",
main = "com.twitter.simclusters_v2.scalding.InterestedInFromKnownFor20M145K2020",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "interested_in_lite_20m_145k_2020",
main = "com.twitter.simclusters_v2.scalding.InterestedInFromKnownForLite20M145K2020",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "interested_in_lite_20m_145k_2020-adhoc",
main = "com.twitter.simclusters_v2.scalding.InterestedInFromKnownForLite20M145K2020Adhoc",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "interested_in_from_ape_2020-adhoc",
main = "com.twitter.simclusters_v2.scalding.InterestedInFromAPE2020AdhocApp",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "interested_in_from_ape_2020",
main = "com.twitter.simclusters_v2.scalding.InterestedInFromAPE2020BatchApp",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "known_for_to_mh",
main = "com.twitter.simclusters_v2.scalding.KnownForToMHBatch",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "user_user_normalized_graph",
main = "com.twitter.simclusters_v2.scalding.UserUserNormalizedGraphBatch",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "user_user_graph",
main = "com.twitter.simclusters_v2.scalding.UserUserGraphBatch",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "user_user_graph-adhoc",
main = "com.twitter.simclusters_v2.scalding.UserUserGraphAdhoc",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "producer_norms_and_counts",
main = "com.twitter.simclusters_v2.scalding.ProducerNormsAndCountsBatch",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "fav_graph",
main = "com.twitter.simclusters_v2.scalding.UserUserFavGraphBatch",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "top_users_similarity_graph",
main = "com.twitter.simclusters_v2.scalding.TopUsersSimilarityGraphApp",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "top_users_only",
main = "com.twitter.simclusters_v2.scalding.TopUsersOnlyApp",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "dump_fav_graph_adhoc",
main = "com.twitter.simclusters_v2.scalding.DumpFavGraphAdhoc",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
# Generated with `capesospy-v2 create_target interested_in_for_20M_145k_2020 src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml`, config hash 8f19bf.
scalding_job(
name = "interested_in_for_20M_145k_2020",
main = "com.twitter.simclusters_v2.scalding.InterestedInFromKnownFor20M145K2020",
args = ["--socialProofThreshold 2 --maxClustersPerUser 50"],
config = [
("hadoop.combine-input", "true"),
("hadoop.map.jvm.total-memory", "3072m"),
("hadoop.reduce.jvm.total-memory", "3072m"),
("hadoop.submitter.jvm.total-memory", "5120m"),
("submitter.tier", "preemptible"),
],
cron = "14 * * * *",
hadoop_cluster = "atla-proc",
platform = "java8",
role = "cassowary",
runtime_platform = "java8",
tags = [
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)

View File

@ -1,513 +0,0 @@
package com.twitter.simclusters_v2.scalding
import com.twitter.algebird.Aggregator
import com.twitter.algebird.Monoid
import com.twitter.scalding._
import com.twitter.scalding.commons.source.VersionedKeyValSource
import com.twitter.scalding.typed.TypedPipe
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation
import com.twitter.scalding_internal.dalv2.remote_access.ProcAtla
import com.twitter.scalding_internal.job.TwitterExecutionApp
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.simclusters_v2.hdfs_sources.AdhocKeyValSources
import com.twitter.simclusters_v2.hdfs_sources.NormsAndCountsFixedPathSource
import com.twitter.simclusters_v2.hdfs_sources.ProducerNormsAndCountsScalaDataset
import com.twitter.simclusters_v2.hdfs_sources.SimclustersV2InterestedInScalaDataset
import com.twitter.simclusters_v2.hdfs_sources.UserAndNeighborsFixedPathSource
import com.twitter.simclusters_v2.hdfs_sources.UserUserNormalizedGraphScalaDataset
import com.twitter.simclusters_v2.scalding.BipartiteClusterEvaluationClasses._
import com.twitter.simclusters_v2.scalding.common.TypedRichPipe._
import com.twitter.simclusters_v2.scalding.common.Util
import com.twitter.simclusters_v2.thriftscala.BipartiteClusterQuality
import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn
import com.twitter.simclusters_v2.thriftscala.NeighborWithWeights
import com.twitter.simclusters_v2.thriftscala.NormsAndCounts
import com.twitter.simclusters_v2.thriftscala.UserAndNeighbors
import scala.collection.JavaConverters._
object BipartiteClusterEvaluation extends TwitterExecutionApp {
implicit val tz: java.util.TimeZone = DateOps.UTC
implicit val dp = DateParser.default
private def getClusterL2Norms(
knownFor: TypedPipe[(Long, Array[(Int, Float)])]
): Execution[Map[Int, Float]] = {
knownFor
.flatMap {
case (_, clusterArray) =>
clusterArray.map {
case (clusterId, score) =>
Map(clusterId -> score * score)
}
}
.sum
.getExecution
.map(_.mapValues { x => math.sqrt(x).toFloat })
}
def l2NormalizeKnownFor(
knownFor: TypedPipe[(Long, Array[(Int, Float)])]
): Execution[TypedPipe[(Long, Array[(Int, Float)])]] = {
getClusterL2Norms(knownFor).map { clusterToNorms =>
knownFor.mapValues { clusterScoresArray =>
clusterScoresArray.map {
case (clusterId, score) =>
(clusterId, score / clusterToNorms(clusterId))
}
}
}
}
/**
* ./bazel bundle src/scala/com/twitter/simclusters_v2/scalding:bp_cluster_evaluation && \
* oscar hdfs --user frigate --host hadoopnest2.atla.twitter.com --bundle bp_cluster_evaluation \
* --tool com.twitter.simclusters_v2.scalding.BipartiteClusterEvaluation --screen --screen-detached \
* --tee logs/newBpQuality_updateUnnormalizedScores_interestedInUsing20190329Graph_evaluatedOn20190329Graph_run2 \
* -- --normsAndCountsDir /user/frigate/your_ldap/producerNormsAndCounts_20190330 \
* --graphInputDir /user/frigate/your_ldap/user_user_normalized_graph_copiedFromAtlaProc_20190329 \
* --knownForDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/knownFor \
* --interestedInDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/interestedInUsing20190329Graph \
* --outgoingVolumesResultsDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/bpQualityForInterestedInUsing20190329On20190329Graph_outgoingVolumes \
* --incomingVolumesResultsDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/bpQualityForInterestedInUsing20190329On20190329Graph_incomingVolumes \
* --outputDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/bpQualityForInterestedInUsing20190329On20190329Graph_perCluster \
* --toEmailAddress your_ldap@twitter.com --modelVersion 20M_145K_updated
*/
override def job: Execution[Unit] = Execution.getConfigMode.flatMap {
case (config, mode) =>
Execution.withId { implicit uniqueId =>
val args = config.getArgs
val interestedIn = args.optional("interestedInDir") match {
case Some(dir) =>
TypedPipe
.from(AdhocKeyValSources.interestedInSource(args("interestedInDir")))
case None =>
DAL
.readMostRecentSnapshotNoOlderThan(
SimclustersV2InterestedInScalaDataset,
Days(20)
)
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
.toTypedPipe
.map {
case KeyVal(key, value) => (key, value)
}
}
val inputKnownFor = args
.optional("knownForDir")
.map { location => KnownForSources.readKnownFor(location) }
.getOrElse(KnownForSources.knownFor_20M_Dec11_145K)
val modelVersion =
args.optional("modelVersion").getOrElse("20M_145K_dec11")
val useLogFavWeights = args.boolean("useLogFavWeights")
val shouldL2NormalizeKnownFor = args.boolean("l2NormalizeKnownFor")
val toEmailAddressOpt = args.optional("toEmailAddress")
val knownForExec = if (shouldL2NormalizeKnownFor) {
l2NormalizeKnownFor(inputKnownFor)
} else {
Execution.from(inputKnownFor)
}
val finalExec = knownForExec.flatMap { knownFor =>
val graph = args.optional("graphInputDir") match {
case Some(dir) =>
TypedPipe.from(UserAndNeighborsFixedPathSource(dir))
case None =>
DAL
.readMostRecentSnapshotNoOlderThan(UserUserNormalizedGraphScalaDataset, Days(20))
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
.toTypedPipe
}
val producerNormsAndCounts = args.optional("normsAndCountsDir") match {
case Some(dir) =>
              TypedPipe.from(NormsAndCountsFixedPathSource(dir))
case None =>
DAL
.readMostRecentSnapshotNoOlderThan(ProducerNormsAndCountsScalaDataset, Days(20))
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
.toTypedPipe
}
val clusterIncomingVolumesExec = loadOrMake(
computeClusterIncomingVolumes(knownFor, producerNormsAndCounts, useLogFavWeights),
modelVersion,
args("incomingVolumesResultsDir")
)
val resultsWithOutgoingVolumesExec = loadOrMake(
getResultsWithOutgoingVolumes(graph, interestedIn, useLogFavWeights),
modelVersion,
args("outgoingVolumesResultsDir")
)
val finalPerClusterResultsExec =
finalPerClusterResults(
knownFor,
interestedIn,
resultsWithOutgoingVolumesExec,
clusterIncomingVolumesExec)
.flatMap { pipe => loadOrMake(pipe, modelVersion, args("outputDir")) }
finalPerClusterResultsExec.flatMap { finalPerClusterResults =>
val perClusterResults = finalPerClusterResults.values
val distributionResultsExec = getClusterResultsSummary(perClusterResults).map {
case Some(summary) =>
"Summary of results across clusters: \n" +
Util.prettyJsonMapper.writeValueAsString(summary)
case _ =>
"No summary of results! The cluster level results pipe must be empty!"
}
val overallResultsExec = perClusterResults.sum.toOptionExecution.map {
case Some(overallQuality) =>
"Overall Quality: \n" +
Util.prettyJsonMapper.writeValueAsString(
printableBipartiteQuality(overallQuality)
)
case _ =>
"No overall quality! The cluster level results pipe must be empty!"
}
Execution.zip(distributionResultsExec, overallResultsExec).map {
case (distResults, overallResults) =>
toEmailAddressOpt.foreach { address =>
Util.sendEmail(
distResults + "\n" + overallResults,
"Bipartite cluster quality for " + modelVersion,
address
)
}
println(distResults + "\n" + overallResults)
}
}
}
Util.printCounters(finalExec)
}
}
def getResultsWithOutgoingVolumes(
graph: TypedPipe[UserAndNeighbors],
interestedIn: TypedPipe[(Long, ClustersUserIsInterestedIn)],
useLogFavWeights: Boolean
): TypedPipe[(Int, BipartiteClusterQuality)] = {
graph
.map { un => (un.userId, un.neighbors) }
// should this be a leftJoin? For now, leaving it as an inner join. If in the future,
// we want to compare two approaches with very different coverages on interestedIn, this
// could become a problem.
.join(interestedIn)
.withReducers(4000)
.flatMap {
case (userId, (neighbors, clusters)) =>
getBIResultsFromSingleUser(userId, neighbors, clusters, useLogFavWeights)
}
.sumByKey
.withReducers(600)
.map {
case (clusterId, bir) =>
(
clusterId,
BipartiteClusterQuality(
inClusterFollowEdges = Some(bir.inClusterWeights.isFollowEdge),
inClusterFavEdges = Some(bir.inClusterWeights.isFavEdge),
favWtSumOfInClusterFollowEdges = Some(bir.inClusterWeights.favWtIfFollowEdge),
favWtSumOfInClusterFavEdges = Some(bir.inClusterWeights.favWtIfFavEdge),
outgoingFollowEdges = Some(bir.totalOutgoingVolumes.isFollowEdge),
outgoingFavEdges = Some(bir.totalOutgoingVolumes.isFavEdge),
favWtSumOfOutgoingFollowEdges = Some(bir.totalOutgoingVolumes.favWtIfFollowEdge),
favWtSumOfOutgoingFavEdges = Some(bir.totalOutgoingVolumes.favWtIfFavEdge),
interestedInSize = Some(bir.interestedInSize),
sampledEdges = Some(
bir.edgeSample
.iterator()
.asScala
.toSeq
.map {
case (edge, data) => makeThriftSampledEdge(edge, data)
}
)
)
)
}
}
def getBIResultsFromSingleUser(
userId: Long,
neighbors: Seq[NeighborWithWeights],
clusters: ClustersUserIsInterestedIn,
useLogFavScores: Boolean
): List[(Int, BipartiteIntermediateResults)] = {
val neighborsToWeights = neighbors.map { neighborAndWeights =>
val isFollowEdge = neighborAndWeights.isFollowed match {
case Some(true) => 1.0
case _ => 0.0
}
val favScore = if (useLogFavScores) {
neighborAndWeights.logFavScore.getOrElse(0.0)
} else neighborAndWeights.favScoreHalfLife100Days.getOrElse(0.0)
val isFavEdge = math.min(1, math.ceil(favScore))
neighborAndWeights.neighborId -> Weights(
isFollowEdge,
isFavEdge,
favScore * isFollowEdge,
favScore
)
}.toMap
val outgoingVolumes = Monoid.sum(neighborsToWeights.values)(WeightsMonoid)
clusters.clusterIdToScores.toList.map {
case (clusterId, scoresStruct) =>
val inClusterNeighbors =
(scoresStruct.usersBeingFollowed.getOrElse(Nil) ++
scoresStruct.usersThatWereFaved.getOrElse(Nil)).toSet
val edgesForSampling = inClusterNeighbors.flatMap { neighborId =>
if (neighborsToWeights.contains(neighborId)) {
            Some(
              (
                (userId, neighborId),
                SampledEdgeData(
                  neighborsToWeights(neighborId).favWtIfFollowEdge,
                  neighborsToWeights(neighborId).favWtIfFavEdge,
                  scoresStruct.followScore.getOrElse(0.0),
                  scoresStruct.favScore.getOrElse(0.0)
                )
              )
            )
} else {
None
}
}
val inClusterWeights =
Monoid.sum(neighborsToWeights.filterKeys(inClusterNeighbors).values)(WeightsMonoid)
(
clusterId,
BipartiteIntermediateResults(
inClusterWeights,
outgoingVolumes,
1,
samplerMonoid.build(edgesForSampling)
))
}
}
def computeClusterIncomingVolumes(
knownFor: TypedPipe[(Long, Array[(Int, Float)])],
producerNormsAndCounts: TypedPipe[NormsAndCounts],
useLogFavWeights: Boolean
): TypedPipe[(Int, BipartiteClusterQuality)] = {
producerNormsAndCounts
.map { x => (x.userId, x) }
.join(knownFor)
.withReducers(100)
.flatMap {
case (userId, (normsAndCounts, clusters)) =>
clusters.map {
case (clusterId, _) =>
val followerCount =
normsAndCounts.followerCount.getOrElse(0L).toDouble
val faverCount = normsAndCounts.faverCount.getOrElse(0L).toDouble
val favWtSumOfIncomingFollows = if (useLogFavWeights) {
normsAndCounts.logFavWeightsOnFollowEdgesSum.getOrElse(0.0)
} else {
normsAndCounts.favWeightsOnFollowEdgesSum.getOrElse(0.0)
}
val favWtSumOfIncomingFavs = if (useLogFavWeights) {
normsAndCounts.logFavWeightsOnFavEdgesSum.getOrElse(0.0)
} else {
normsAndCounts.favWeightsOnFavEdgesSum.getOrElse(0.0)
}
(
clusterId,
BipartiteClusterQuality(
incomingFollowEdges = Some(followerCount),
incomingFavEdges = Some(faverCount),
favWtSumOfIncomingFollowEdges = Some(favWtSumOfIncomingFollows),
favWtSumOfIncomingFavEdges = Some(favWtSumOfIncomingFavs)
))
}
}
.sumByKey
.toTypedPipe
}
def loadOrMake(
pipe: TypedPipe[(Int, BipartiteClusterQuality)],
modelVersion: String,
path: String
): Execution[TypedPipe[(Int, BipartiteClusterQuality)]] = {
val mapped = pipe.map {
case (clusterId, struct) => ((modelVersion, clusterId), struct)
}
makeForKeyValSource(mapped, AdhocKeyValSources.bipartiteQualitySource(path), path).map { pipe =>
// discard model version
pipe.map { case ((_, clusterId), struct) => (clusterId, struct) }
}
}
def makeForKeyValSource[K, V](
pipe: TypedPipe[(K, V)],
dest: VersionedKeyValSource[K, V],
path: String
): Execution[TypedPipe[(K, V)]] =
Execution.getMode.flatMap { mode =>
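      // Checkpoint: if the versioned output at `path` already exists, read it back rather than
      // recompute; otherwise run `pipe` and write it through to `dest` first.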
if (dest.resourceExists(mode)) {
println(s"validated path $path")
Execution.from(TypedPipe.from(dest))
} else {
println(s"Could not load from $path")
pipe.writeThrough(dest)
}
}
def precisionOfWholeGraph(
knownFor: TypedPipe[(Long, Array[(Int, Float)])],
interestedIn: TypedPipe[(Long, ClustersUserIsInterestedIn)],
clusterIncomingVolumesExec: Execution[TypedPipe[(Int, BipartiteClusterQuality)]]
): Execution[Option[Double]] = {
val knownForSizeExec = knownFor.aggregate(Aggregator.size).toOptionExecution
val interestedInSizeExec =
interestedIn.aggregate(Aggregator.size).toOptionExecution
val numExec = clusterIncomingVolumesExec.flatMap { volumes =>
volumes.values.flatMap(_.favWtSumOfIncomingFavEdges).sum.toOptionExecution
}
Execution.zip(numExec, interestedInSizeExec, knownForSizeExec).map {
case (Some(num), Some(interestedInSize), Some(knownForSize)) =>
Some(num / interestedInSize / knownForSize)
case x @ _ =>
println("Precision of whole graph zip: " + x)
None
}
}
def finalPerClusterResults(
knownFor: TypedPipe[(Long, Array[(Int, Float)])],
interestedIn: TypedPipe[(Long, ClustersUserIsInterestedIn)],
resultsWithOutgoingVolumesExec: Execution[TypedPipe[(Int, BipartiteClusterQuality)]],
incomingVolumesExec: Execution[TypedPipe[(Int, BipartiteClusterQuality)]]
): Execution[TypedPipe[(Int, BipartiteClusterQuality)]] = {
val knownForTranspose = KnownForSources.transpose(knownFor)
val precisionOfWholeGraphExec =
precisionOfWholeGraph(knownFor, interestedIn, incomingVolumesExec)
Execution
.zip(resultsWithOutgoingVolumesExec, incomingVolumesExec, precisionOfWholeGraphExec)
.map {
case (resultsWithOutgoingVolumes, clusterIncomingVolumes, precisionOfWholeGraph) =>
println("Precision of whole graph " + precisionOfWholeGraph)
resultsWithOutgoingVolumes
.join(knownForTranspose)
.leftJoin(clusterIncomingVolumes)
.withReducers(500)
.map {
case (clusterId, ((outgoingVolumeQuality, knownForList), incomingVolumesOpt)) =>
val incomingVolumes =
incomingVolumesOpt.getOrElse(BipartiteClusterQuality())
val knownForMap = knownForList.toMap
(
clusterId,
getFullQuality(
outgoingVolumeQuality,
incomingVolumes,
knownForMap,
precisionOfWholeGraph))
}
}
}
def getFullQuality(
qualityWithOutgoingVolumes: BipartiteClusterQuality,
incomingVolumes: BipartiteClusterQuality,
knownFor: Map[Long, Float],
precisionOfWholeGraph: Option[Double]
): BipartiteClusterQuality = {
val newSampledEdges = qualityWithOutgoingVolumes.sampledEdges.map { sampledEdges =>
sampledEdges.map { sampledEdge =>
val knownForScore = knownFor.getOrElse(sampledEdge.followeeId, 0.0f)
sampledEdge.copy(
predictedFollowScore = sampledEdge.followScoreToCluster.map { x => x * knownForScore },
predictedFavScore = sampledEdge.favScoreToCluster.map { x => x * knownForScore }
)
}
}
val correlationOfFavWtIfFollow = newSampledEdges.map { samples =>
val pairs = samples.map { s =>
(s.predictedFollowScore.getOrElse(0.0), s.favWtIfFollowEdge.getOrElse(0.0))
}
Util.computeCorrelation(pairs.iterator)
}
val correlationOfFavWtIfFav = newSampledEdges.map { samples =>
val pairs = samples.map { s =>
(s.predictedFavScore.getOrElse(0.0), s.favWtIfFavEdge.getOrElse(0.0))
}
Util.computeCorrelation(pairs.iterator)
}
val relativePrecisionNum = {
if (qualityWithOutgoingVolumes.interestedInSize.exists(_ > 0) && knownFor.nonEmpty) {
qualityWithOutgoingVolumes.favWtSumOfInClusterFavEdges
.getOrElse(0.0) / qualityWithOutgoingVolumes.interestedInSize.get / knownFor.size
} else 0.0
}
val relativePrecision = if (precisionOfWholeGraph.exists(_ > 0.0)) {
Some(relativePrecisionNum / precisionOfWholeGraph.get)
} else None
qualityWithOutgoingVolumes.copy(
incomingFollowEdges = incomingVolumes.incomingFollowEdges,
incomingFavEdges = incomingVolumes.incomingFavEdges,
favWtSumOfIncomingFollowEdges = incomingVolumes.favWtSumOfIncomingFollowEdges,
favWtSumOfIncomingFavEdges = incomingVolumes.favWtSumOfIncomingFavEdges,
knownForSize = Some(knownFor.size),
correlationOfFavWtIfFollowWithPredictedFollow = correlationOfFavWtIfFollow,
correlationOfFavWtIfFavWithPredictedFav = correlationOfFavWtIfFav,
sampledEdges = newSampledEdges,
relativePrecisionUsingFavWtIfFav = relativePrecision,
averagePrecisionOfWholeGraphUsingFavWtIfFav = precisionOfWholeGraph
)
}
}
object DumpBpQuality extends TwitterExecutionApp {
def job: Execution[Unit] = Execution.getConfigMode.flatMap {
case (config, mode) =>
Execution.withId { implicit uniqueId =>
val args = config.getArgs
val inputDir = args("inputDir")
val clusters = args.list("clusters").map(_.toInt).toSet
val input =
TypedPipe
.from(AdhocKeyValSources.bipartiteQualitySource(inputDir))
.map {
case ((modelVersion, clusterId), quality) =>
(
(modelVersion, clusterId),
BipartiteClusterEvaluationClasses
.printableBipartiteQuality(quality))
}
if (clusters.isEmpty) {
input.printSummary("Bipartite quality")
} else {
input
.collect {
case rec @ ((_, clusterId), quality) if clusters(clusterId) =>
Util.prettyJsonMapper
.writeValueAsString(rec)
.replaceAll("\n", " ")
}
.toIterableExecution
.map { strings => println(strings.mkString("\n")) }
}
}
}
}
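
To make the relative precision reported by getFullQuality concrete, here is a small worked example with purely hypothetical numbers; it only restates the arithmetic already performed above (in-cluster fav weight divided by interestedIn size and knownFor size, then normalized by the whole-graph average from precisionOfWholeGraph):

// Hypothetical values, chosen only to illustrate the ratio computed in getFullQuality.
val favWtSumOfInClusterFavEdges = 500.0
val interestedInSize = 1000
val knownForSize = 25
val clusterPrecision = favWtSumOfInClusterFavEdges / interestedInSize / knownForSize // 0.02
val wholeGraphPrecision = 0.005 // would come from precisionOfWholeGraph
val relativePrecision = clusterPrecision / wholeGraphPrecision // 4.0: fav engagement lands in this cluster 4x more often than the graph-wide average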

View File

@ -1,316 +0,0 @@
package com.twitter.simclusters_v2.scalding
import com.twitter.algebird.{Monoid, OptionMonoid, Semigroup}
import com.twitter.algebird.mutable.PriorityQueueMonoid
import com.twitter.scalding.Execution
import com.twitter.scalding.typed.TypedPipe
import com.twitter.simclusters_v2.scalding.common.Util
import com.twitter.simclusters_v2.scalding.common.Util.Distribution
import com.twitter.simclusters_v2.thriftscala.{BipartiteClusterQuality, SampledEdge}
import java.util.PriorityQueue
import scala.collection.JavaConverters._
object BipartiteClusterEvaluationClasses {
case class Weights(
isFollowEdge: Double,
isFavEdge: Double,
favWtIfFollowEdge: Double,
favWtIfFavEdge: Double)
object WeightsMonoid extends Monoid[Weights] {
override def zero = Weights(0.0, 0.0, 0.0, 0.0)
override def plus(l: Weights, r: Weights): Weights = {
Weights(
l.isFollowEdge + r.isFollowEdge,
l.isFavEdge + r.isFavEdge,
l.favWtIfFollowEdge + r.favWtIfFollowEdge,
l.favWtIfFavEdge + r.favWtIfFavEdge
)
}
}
implicit val wm: Monoid[Weights] = WeightsMonoid
case class SampledEdgeData(
favWtIfFollowEdge: Double,
favWtIfFavEdge: Double,
followScoreToCluster: Double,
favScoreToCluster: Double)
implicit val samplerMonoid: PriorityQueueMonoid[((Long, Long), SampledEdgeData)] =
Util.reservoirSamplerMonoidForPairs[(Long, Long), SampledEdgeData](2000)(Util.edgeOrdering)
implicit val sampledEdgesMonoid: PriorityQueueMonoid[SampledEdge] =
Util.reservoirSamplerMonoid(
10000,
{ sampledEdge: SampledEdge => (sampledEdge.followerId, sampledEdge.followeeId) }
)(Util.edgeOrdering)
case class BipartiteIntermediateResults(
inClusterWeights: Weights,
totalOutgoingVolumes: Weights,
interestedInSize: Int,
edgeSample: PriorityQueue[((Long, Long), SampledEdgeData)]) {
override def toString: String = {
"BCR(%s, %s, %d, %s)".format(
inClusterWeights,
totalOutgoingVolumes,
interestedInSize,
edgeSample.iterator().asScala.toSeq.toString()
)
}
}
object BIRMonoid extends Monoid[BipartiteIntermediateResults] {
override def zero =
BipartiteIntermediateResults(WeightsMonoid.zero, WeightsMonoid.zero, 0, samplerMonoid.zero)
override def plus(
l: BipartiteIntermediateResults,
r: BipartiteIntermediateResults
): BipartiteIntermediateResults = {
BipartiteIntermediateResults(
WeightsMonoid.plus(l.inClusterWeights, r.inClusterWeights),
WeightsMonoid.plus(l.totalOutgoingVolumes, r.totalOutgoingVolumes),
l.interestedInSize + r.interestedInSize,
samplerMonoid.plus(l.edgeSample, r.edgeSample)
)
}
}
implicit val bIRMonoid: Monoid[BipartiteIntermediateResults] = BIRMonoid
def makeThriftSampledEdge(edge: (Long, Long), data: SampledEdgeData): SampledEdge = {
val (followerId, followeeId) = edge
SampledEdge(
followerId = followerId,
followeeId = followeeId,
favWtIfFollowEdge = Some(data.favWtIfFollowEdge),
favWtIfFavEdge = Some(data.favWtIfFavEdge),
followScoreToCluster = Some(data.followScoreToCluster),
favScoreToCluster = Some(data.favScoreToCluster)
)
}
object ClusterQualitySemigroup extends Semigroup[BipartiteClusterQuality] {
val doubleOM: Monoid[Option[Double]] = new OptionMonoid[Double]
val intOM: Monoid[Option[Int]] = new OptionMonoid[Int]
val longOM: Monoid[Option[Long]] = new OptionMonoid[Long]
override def plus(l: BipartiteClusterQuality, r: BipartiteClusterQuality) =
BipartiteClusterQuality(
inClusterFollowEdges = doubleOM.plus(l.inClusterFollowEdges, r.inClusterFollowEdges),
inClusterFavEdges = doubleOM.plus(l.inClusterFavEdges, r.inClusterFavEdges),
favWtSumOfInClusterFollowEdges = doubleOM
.plus(l.favWtSumOfInClusterFollowEdges, r.favWtSumOfInClusterFollowEdges),
favWtSumOfInClusterFavEdges = doubleOM
.plus(l.favWtSumOfInClusterFavEdges, r.favWtSumOfInClusterFavEdges),
outgoingFollowEdges = doubleOM.plus(l.outgoingFollowEdges, r.outgoingFollowEdges),
outgoingFavEdges = doubleOM.plus(l.outgoingFavEdges, r.outgoingFavEdges),
favWtSumOfOutgoingFollowEdges = doubleOM
.plus(l.favWtSumOfOutgoingFollowEdges, r.favWtSumOfOutgoingFollowEdges),
favWtSumOfOutgoingFavEdges = doubleOM
.plus(l.favWtSumOfOutgoingFavEdges, r.favWtSumOfOutgoingFavEdges),
incomingFollowEdges = doubleOM.plus(l.incomingFollowEdges, r.incomingFollowEdges),
incomingFavEdges = doubleOM.plus(l.incomingFavEdges, r.incomingFavEdges),
favWtSumOfIncomingFollowEdges = doubleOM
.plus(l.favWtSumOfIncomingFollowEdges, r.favWtSumOfIncomingFollowEdges),
favWtSumOfIncomingFavEdges = doubleOM
.plus(l.favWtSumOfIncomingFavEdges, r.favWtSumOfIncomingFavEdges),
interestedInSize = None,
sampledEdges = Some(
sampledEdgesMonoid
.plus(
sampledEdgesMonoid.build(l.sampledEdges.getOrElse(Nil)),
sampledEdgesMonoid.build(r.sampledEdges.getOrElse(Nil))
)
.iterator()
.asScala
.toSeq),
knownForSize = intOM.plus(l.knownForSize, r.knownForSize),
correlationOfFavWtIfFollowWithPredictedFollow = None,
correlationOfFavWtIfFavWithPredictedFav = None,
relativePrecisionUsingFavWtIfFav = None,
averagePrecisionOfWholeGraphUsingFavWtIfFav = l.averagePrecisionOfWholeGraphUsingFavWtIfFav
)
}
implicit val bcqSemigroup: Semigroup[BipartiteClusterQuality] =
ClusterQualitySemigroup
case class PrintableBipartiteQuality(
incomingFollowUnweightedRecall: String,
incomingFavUnweightedRecall: String,
incomingFollowWeightedRecall: String,
incomingFavWeightedRecall: String,
outgoingFollowUnweightedRecall: String,
outgoingFavUnweightedRecall: String,
outgoingFollowWeightedRecall: String,
outgoingFavWeightedRecall: String,
incomingFollowEdges: String,
incomingFavEdges: String,
favWtSumOfIncomingFollowEdges: String,
favWtSumOfIncomingFavEdges: String,
outgoingFollowEdges: String,
outgoingFavEdges: String,
favWtSumOfOutgoingFollowEdges: String,
favWtSumOfOutgoingFavEdges: String,
correlationOfFavWtIfFollow: String,
correlationOfFavWtIfFav: String,
relativePrecisionUsingFavWt: String,
averagePrecisionOfWholeGraphUsingFavWt: String,
interestedInSize: String,
knownForSize: String)
def printableBipartiteQuality(in: BipartiteClusterQuality): PrintableBipartiteQuality = {
def getRatio(numOpt: Option[Double], denOpt: Option[Double]): String = {
val r = if (denOpt.exists(_ > 0)) {
numOpt.getOrElse(0.0) / denOpt.get
} else 0.0
"%.3f".format(r)
}
val formatter = new java.text.DecimalFormat("###,###.#")
def denString(denOpt: Option[Double]): String =
formatter.format(denOpt.getOrElse(0.0))
val correlationOfFavWtIfFollow =
in.correlationOfFavWtIfFollowWithPredictedFollow match {
case None =>
in.sampledEdges.map { samples =>
val pairs = samples.map { s =>
(s.predictedFollowScore.getOrElse(0.0), s.favWtIfFollowEdge.getOrElse(0.0))
}
Util.computeCorrelation(pairs.iterator)
}
case x @ _ => x
}
val correlationOfFavWtIfFav =
in.correlationOfFavWtIfFavWithPredictedFav match {
case None =>
in.sampledEdges.map { samples =>
val pairs = samples.map { s =>
(s.predictedFavScore.getOrElse(0.0), s.favWtIfFavEdge.getOrElse(0.0))
}
Util.computeCorrelation(pairs.iterator)
}
case x @ _ => x
}
PrintableBipartiteQuality(
incomingFollowUnweightedRecall = getRatio(in.inClusterFollowEdges, in.incomingFollowEdges),
incomingFavUnweightedRecall = getRatio(in.inClusterFavEdges, in.incomingFavEdges),
incomingFollowWeightedRecall =
getRatio(in.favWtSumOfInClusterFollowEdges, in.favWtSumOfIncomingFollowEdges),
incomingFavWeightedRecall =
getRatio(in.favWtSumOfInClusterFavEdges, in.favWtSumOfIncomingFavEdges),
outgoingFollowUnweightedRecall = getRatio(in.inClusterFollowEdges, in.outgoingFollowEdges),
outgoingFavUnweightedRecall = getRatio(in.inClusterFavEdges, in.outgoingFavEdges),
outgoingFollowWeightedRecall =
getRatio(in.favWtSumOfInClusterFollowEdges, in.favWtSumOfOutgoingFollowEdges),
outgoingFavWeightedRecall =
getRatio(in.favWtSumOfInClusterFavEdges, in.favWtSumOfOutgoingFavEdges),
incomingFollowEdges = denString(in.incomingFollowEdges),
incomingFavEdges = denString(in.incomingFavEdges),
favWtSumOfIncomingFollowEdges = denString(in.favWtSumOfIncomingFollowEdges),
favWtSumOfIncomingFavEdges = denString(in.favWtSumOfIncomingFavEdges),
outgoingFollowEdges = denString(in.outgoingFollowEdges),
outgoingFavEdges = denString(in.outgoingFavEdges),
favWtSumOfOutgoingFollowEdges = denString(in.favWtSumOfOutgoingFollowEdges),
favWtSumOfOutgoingFavEdges = denString(in.favWtSumOfOutgoingFavEdges),
correlationOfFavWtIfFollow = "%.3f"
.format(correlationOfFavWtIfFollow.getOrElse(0.0)),
correlationOfFavWtIfFav = "%.3f"
.format(correlationOfFavWtIfFav.getOrElse(0.0)),
relativePrecisionUsingFavWt =
"%.2g".format(in.relativePrecisionUsingFavWtIfFav.getOrElse(0.0)),
averagePrecisionOfWholeGraphUsingFavWt =
"%.2g".format(in.averagePrecisionOfWholeGraphUsingFavWtIfFav.getOrElse(0.0)),
interestedInSize = in.interestedInSize.getOrElse(0).toString,
knownForSize = in.knownForSize.getOrElse(0).toString
)
}
case class ClusterResultsSummary(
numClustersWithZeroInterestedIn: Int,
numClustersWithZeroFollowWtRecall: Int,
numClustersWithZeroFavWtRecall: Int,
numClustersWithZeroFollowAndFavWtRecall: Int,
interestedInSizeDist: Distribution,
outgoingFollowWtRecallDist: Distribution,
outgoingFavWtRecallDist: Distribution,
incomingFollowWtRecallDist: Distribution,
incomingFavWtRecallDist: Distribution,
followCorrelationDist: Distribution,
favCorrelationDist: Distribution,
relativePrecisionDist: Distribution)
def getClusterResultsSummary(
perClusterResults: TypedPipe[BipartiteClusterQuality]
): Execution[Option[ClusterResultsSummary]] = {
perClusterResults
.map { clusterQuality =>
val printableQuality = printableBipartiteQuality(clusterQuality)
val isFollowRecallZero =
if (!clusterQuality.favWtSumOfInClusterFollowEdges
.exists(_ > 0)) 1
else 0
val isFavRecallZero =
if (!clusterQuality.favWtSumOfInClusterFavEdges.exists(_ > 0)) 1
else 0
(
if (!clusterQuality.interestedInSize.exists(_ > 0)) 1 else 0,
isFollowRecallZero,
isFavRecallZero,
isFavRecallZero * isFollowRecallZero,
clusterQuality.interestedInSize.toList.map(_.toDouble),
List(printableQuality.outgoingFollowWeightedRecall.toDouble),
List(printableQuality.outgoingFavWeightedRecall.toDouble),
List(printableQuality.incomingFollowWeightedRecall.toDouble),
List(printableQuality.incomingFavWeightedRecall.toDouble),
List(printableQuality.correlationOfFavWtIfFollow.toDouble),
List(printableQuality.correlationOfFavWtIfFav.toDouble),
List(printableQuality.relativePrecisionUsingFavWt.toDouble)
)
}
.sum
.toOptionExecution
.map { opt =>
opt.map {
case (
zeroInterestedIn,
zeroFollowRecall,
zeroFavRecall,
zeroFollowAndFavRecall,
interestedInSizeList,
outgoingFollowWtRecallList,
outgoingFavWtRecallList,
incomingFollowWtRecallList,
incomingFavWtRecallList,
followCorrelationList,
favCorrelationList,
relativePrecisionList
) =>
ClusterResultsSummary(
numClustersWithZeroInterestedIn = zeroInterestedIn,
numClustersWithZeroFollowWtRecall = zeroFollowRecall,
numClustersWithZeroFavWtRecall = zeroFavRecall,
numClustersWithZeroFollowAndFavWtRecall = zeroFollowAndFavRecall,
interestedInSizeDist = Util.distributionFromArray(interestedInSizeList.toArray),
outgoingFollowWtRecallDist = Util
.distributionFromArray(outgoingFollowWtRecallList.toArray),
outgoingFavWtRecallDist = Util.distributionFromArray(outgoingFavWtRecallList.toArray),
incomingFollowWtRecallDist = Util
.distributionFromArray(incomingFollowWtRecallList.toArray),
incomingFavWtRecallDist = Util.distributionFromArray(incomingFavWtRecallList.toArray),
followCorrelationDist = Util.distributionFromArray(followCorrelationList.toArray),
favCorrelationDist = Util.distributionFromArray(favCorrelationList.toArray),
relativePrecisionDist = Util.distributionFromArray(relativePrecisionList.toArray)
)
}
}
}
}
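
The edge sampling above is built from Algebird's PriorityQueueMonoid via the internal Util helpers. A minimal self-contained sketch of the same idea, assuming (as Util.edgeOrdering presumably does) that ordering elements by a hash of the edge makes "keep the k smallest" behave like an approximately uniform sample:

import com.twitter.algebird.mutable.PriorityQueueMonoid
import java.util.PriorityQueue
import scala.collection.JavaConverters._

object ReservoirSketch {
  // Order edges by a hash so that the k "smallest" elements form a roughly random subset.
  val sampleOrdering: Ordering[(Long, Long)] = Ordering.by { case (a, b) => (a * 31L + b).hashCode }
  // Monoid that keeps at most 3 elements per merge; merges can happen across map/reduce tasks.
  val sampler = new PriorityQueueMonoid[(Long, Long)](3)(sampleOrdering)

  def main(args: Array[String]): Unit = {
    val q1: PriorityQueue[(Long, Long)] = sampler.build(Seq((1L, 2L), (3L, 4L)))
    val q2: PriorityQueue[(Long, Long)] = sampler.build(Seq((5L, 6L), (7L, 8L), (9L, 10L)))
    val merged = sampler.plus(q1, q2) // still at most 3 sampled edges after combining
    println(merged.iterator().asScala.toList)
  }
}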

View File

@ -1,794 +0,0 @@
package com.twitter.simclusters_v2.scalding
import com.twitter.algebird.OptionMonoid
import com.twitter.algebird.QTree
import com.twitter.algebird.QTreeSemigroup
import com.twitter.algebird.Semigroup
import com.twitter.dal.client.dataset.KeyValDALDataset
import com.twitter.dal.client.dataset.SnapshotDALDataset
import com.twitter.hermit.candidate.thriftscala.Candidates
import com.twitter.pluck.source.cassowary.FollowingsCosineSimilaritiesManhattanSource
import com.twitter.pluck.source.cassowary.SimsCandidatesSource
import com.twitter.scalding._
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.DALWrite._
import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation
import com.twitter.scalding_internal.dalv2.remote_access.ProcAtla
import com.twitter.scalding_internal.job.TwitterExecutionApp
import com.twitter.scalding_internal.job.analytics_batch._
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.simclusters_v2.common.ModelVersions
import com.twitter.simclusters_v2.hdfs_sources._
import com.twitter.simclusters_v2.scalding.common.Util
import com.twitter.simclusters_v2.scalding.embedding.common.ExternalDataSources
import com.twitter.simclusters_v2.thriftscala._
import com.twitter.usersource.snapshot.flat.UsersourceFlatScalaDataset
import com.twitter.usersource.snapshot.flat.thriftscala.FlatUser
object ClusterDetailsJob {
case class Scores(followScore: Double, favScore: Double, logFavScore: Double)
case class IntermediateDetails(
numUsersWithAnyNonZeroScore: Int,
numUsersWithNonZeroFollowScore: Int,
numUsersWithNonZeroFavScore: Int,
favQTree: Option[QTree[Double]],
followQTree: Option[QTree[Double]],
logFavQTree: Option[QTree[Double]],
sumOfSquares: Scores,
sum: Scores,
min: Scores,
max: Scores)
case class InfoFromUserSource(
fractionMarkedNSFWUser: Double,
languageToFractionDeviceLanguage: Map[String, Double],
countryCodeToFractionKnownForWithCountryCode: Map[String, Double],
languageToFractionInferredLanguage: Map[String, Double])
def positiveMin(a: Double, b: Double) = {
if (math.min(a, b) == 0.0) math.max(a, b) else math.min(a, b)
}
case class ClusterDetailsSemigroup(implicit qtreeSemigroup: Semigroup[QTree[Double]])
extends Semigroup[IntermediateDetails] {
val optionMonoid: OptionMonoid[QTree[Double]] = new OptionMonoid[QTree[Double]]()
override def plus(
left: IntermediateDetails,
right: IntermediateDetails
): IntermediateDetails = {
IntermediateDetails(
left.numUsersWithAnyNonZeroScore + right.numUsersWithAnyNonZeroScore,
left.numUsersWithNonZeroFollowScore + right.numUsersWithNonZeroFollowScore,
left.numUsersWithNonZeroFavScore + right.numUsersWithNonZeroFavScore,
optionMonoid.plus(left.favQTree, right.favQTree),
optionMonoid.plus(left.followQTree, right.followQTree),
optionMonoid.plus(left.logFavQTree, right.logFavQTree),
Scores(
left.sumOfSquares.followScore + right.sumOfSquares.followScore,
left.sumOfSquares.favScore + right.sumOfSquares.favScore,
left.sumOfSquares.logFavScore + right.sumOfSquares.logFavScore
),
Scores(
left.sum.followScore + right.sum.followScore,
left.sum.favScore + right.sum.favScore,
left.sum.logFavScore + right.sum.logFavScore
),
Scores(
positiveMin(left.min.followScore, right.min.followScore),
positiveMin(left.min.favScore, right.min.favScore),
positiveMin(left.min.logFavScore, right.min.logFavScore)
),
Scores(
math.max(left.max.followScore, right.max.followScore),
math.max(left.max.favScore, right.max.favScore),
math.max(left.max.logFavScore, right.max.logFavScore)
)
)
}
}
def intermediateDetailsPipe(
input: TypedPipe[(Long, ClustersUserIsInterestedIn)],
qtreeSemigroupKParameter: Int
): TypedPipe[(Int, IntermediateDetails)] = {
implicit val qtSg: Semigroup[QTree[Double]] =
new QTreeSemigroup[Double](qtreeSemigroupKParameter)
implicit val cdSg: Semigroup[IntermediateDetails] = ClusterDetailsSemigroup()
input
.flatMap {
case (userId, clusterScoresStruct) =>
val clusterScoresArray = clusterScoresStruct.clusterIdToScores.toArray
clusterScoresArray.map {
case (clusterId, scoresStruct) =>
val followScore = scoresStruct.followScore.getOrElse(0.0)
val favScore = scoresStruct.favScore.getOrElse(0.0)
val logFavScore = scoresStruct.logFavScore.getOrElse(0.0)
(
clusterId,
IntermediateDetails(
numUsersWithAnyNonZeroScore = 1,
numUsersWithNonZeroFollowScore = if (followScore > 0) 1 else 0,
numUsersWithNonZeroFavScore = if (favScore > 0) 1 else 0,
favQTree = if (favScore > 0) Some(QTree(favScore)) else None,
followQTree = if (followScore > 0) Some(QTree(followScore)) else None,
logFavQTree = if (logFavScore > 0) Some(QTree(logFavScore)) else None,
sumOfSquares = Scores(
followScore * followScore,
favScore * favScore,
logFavScore * logFavScore),
sum = Scores(followScore, favScore, logFavScore),
min = Scores(followScore, favScore, logFavScore),
max = Scores(followScore, favScore, logFavScore)
)
)
}
}
.sumByKey
// Uncomment for adhoc job
//.withReducers(100)
.toTypedPipe
}
private def safeGetDoubleOpt(x: Option[Double]): Double = {
x.map { y => if (y.isNaN) 0 else y }.getOrElse(0)
}
private def getSimilaritiesForAllPairs(
input: TypedPipe[(Long, ClustersUserIsInterestedIn)]
)(
implicit uniqueID: UniqueID
): TypedPipe[((Int, Int), Scores)] = {
val allClusterPairsBeforeSumByKey = Stat("all_cluster_pairs_before_sum_by_key")
val clusterPairsWithin10Ratio = Stat("cluster_pairs_within_10_ratio")
val clusterPairsBeforeTopK = Stat("cluster_pairs_before_thresholding")
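    // Per user, emit every ordered pair of their interestedIn clusters weighted by the product of
    // the two cluster-normalized scores; summing these products over all users gives the cosine
    // similarity between the two clusters' interestedIn score vectors.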
input
.flatMap {
case (userId, clusterScoresStruct) =>
val clusterScoresArray = clusterScoresStruct.clusterIdToScores.toArray
(0 until clusterScoresArray.length).flatMap { i =>
(0 until clusterScoresArray.length).map { j =>
val (clusterI, scoresI) = clusterScoresArray(i)
val (clusterJ, scoresJ) = clusterScoresArray(j)
val ratioOfSizes =
scoresI.numUsersInterestedInThisClusterUpperBound.getOrElse(1).toDouble /
scoresJ.numUsersInterestedInThisClusterUpperBound.getOrElse(1).toDouble
allClusterPairsBeforeSumByKey.inc()
if (ratioOfSizes > 0.1 && ratioOfSizes < 10) {
clusterPairsWithin10Ratio.inc()
}
val followI = safeGetDoubleOpt(scoresI.followScoreClusterNormalizedOnly)
val followJ = safeGetDoubleOpt(scoresJ.followScoreClusterNormalizedOnly)
val follow = followI * followJ
val favI = safeGetDoubleOpt(scoresI.favScoreClusterNormalizedOnly)
val favJ = safeGetDoubleOpt(scoresJ.favScoreClusterNormalizedOnly)
val fav = favI * favJ
val logFavI = safeGetDoubleOpt(scoresI.logFavScoreClusterNormalizedOnly)
val logFavJ = safeGetDoubleOpt(scoresJ.logFavScoreClusterNormalizedOnly)
val logFav = logFavI * logFavJ
((clusterI, clusterJ), (follow, fav, logFav))
}
}
}
.sumByKey
// Uncomment for adhoc job
//.withReducers(600)
.map {
case (key, (follow, fav, logFav)) =>
clusterPairsBeforeTopK.inc()
(key, Scores(follow, fav, logFav))
}
}
private def keepTopNeighbors(
allPairs: TypedPipe[((Int, Int), Scores)],
cosineThreshold: Double
)(
implicit uniqueID: UniqueID
): TypedPipe[(Int, List[ClusterNeighbor])] = {
val clusterPairsMoreThanThreshold = Stat("cluster_pairs_cosine_gt_" + cosineThreshold)
val clusterPairsAfterTopK = Stat("cluster_pairs_after_topk")
val clustersWithFewNeighbors = Stat(s"clusters_with_fewer_than_100_neighbors")
val clustersWithManyNeighbors = Stat(s"clusters_with_more_than_100_neighbors")
allPairs
.flatMap {
case ((cI, cJ), Scores(followScore, favScore, logFavScore)) =>
if (followScore > cosineThreshold || logFavScore > cosineThreshold || favScore > cosineThreshold) {
clusterPairsMoreThanThreshold.inc()
Some((cI, ClusterNeighbor(cJ, Some(followScore), Some(favScore), Some(logFavScore))))
} else None
}
.group
.toList
// Uncomment for adhoc job
//.withReducers(40)
.map {
case (key, seq) =>
val finalSize = seq.size
clusterPairsAfterTopK.incBy(finalSize)
if (finalSize < 100) {
clustersWithFewNeighbors.inc()
} else {
clustersWithManyNeighbors.inc()
}
(
key,
seq.sortBy {
case cn: ClusterNeighbor =>
-(cn.followCosineSimilarity.getOrElse(0.0) + cn.logFavCosineSimilarity.getOrElse(
0.0)) / 2
})
}
}
def getTopSimilarClustersWithCosine(
input: TypedPipe[(Long, ClustersUserIsInterestedIn)],
cosineThreshold: Double
)(
implicit uniqueID: UniqueID
): TypedPipe[(Int, List[ClusterNeighbor])] = {
keepTopNeighbors(getSimilaritiesForAllPairs(input), cosineThreshold)
}
def getDistributionDetails(
qtree: QTree[Double],
sum: Double,
sumOfSquares: Double,
min: Double,
max: Double,
fullSize: Int
): DistributionDetails = {
val mean = sum / fullSize
// note that the below is the naive calculation, and not the sample standard dev formula
// that divides by n-1. I don't think it makes a difference at our scale whether we use n or n-1
// and I'd rather use the simpler one.
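    // (that is, stdDev = sqrt(sum(x^2)/n - (sum(x)/n)^2) with n = fullSize)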
val stdDev = math.sqrt(sumOfSquares / fullSize - mean * mean)
def getQB(percentile: Double): QuantileBounds = {
val (lb, ub) = qtree.quantileBounds(percentile)
QuantileBounds(lb, ub)
}
DistributionDetails(
mean = mean,
standardDeviation = Some(stdDev),
min = Some(min),
p25 = Some(getQB(0.25)),
p50 = Some(getQB(0.5)),
p75 = Some(getQB(0.75)),
p95 = Some(getQB(0.95)),
max = Some(max)
)
}
def keepCorrectModel(
input: TypedPipe[(Long, ClustersUserIsInterestedIn)],
modelVersionToKeep: String
)(
implicit uniqId: UniqueID
): TypedPipe[(Long, ClustersUserIsInterestedIn)] = {
val allRecords = Stat("all_input_records")
val withCorrectVersion = Stat("with_correct_version")
input.filter {
case (_, clusterScoresStruct) =>
// allRecords.inc()
val result = clusterScoresStruct.knownForModelVersion == modelVersionToKeep
// if (result) withCorrectVersion.inc()
result
}
}
def getInfoFromUserSource(
knownFor: TypedPipe[(Int, List[(Long, Float)])],
usersource: TypedPipe[FlatUser],
inferredLanguages: TypedPipe[(Long, Seq[(String, Double)])]
)(
implicit uniqId: UniqueID
): TypedPipe[(Int, InfoFromUserSource)] = {
val knownForUsers = knownFor.flatMap {
case (clusterId, userScoreList) =>
userScoreList.map {
case (userId, _) =>
(userId, clusterId)
}
}
usersource
.collect {
case fuser: FlatUser if fuser.id.isDefined =>
(
fuser.id.get,
(
fuser.accountCountryCode.getOrElse(""),
fuser.language.getOrElse(""),
fuser.nsfwUser.getOrElse(false)
))
}
.join(knownForUsers)
.leftJoin(inferredLanguages)
.map {
case (_, (((countryCode, language, nsfw), clusterId), inferredLangsOpt)) =>
val nsfwInt = if (nsfw) 1 else 0
(
clusterId,
(
1,
nsfwInt,
Map(language -> 1),
Map(countryCode -> 1),
inferredLangsOpt.getOrElse(Seq(("", 1.0))).toMap
)
)
}
.sumByKey
.mapValues {
case (
denominator,
nsfwNumerator,
languageNumeratorsMap,
countryNumeratorsMap,
inferredLangsNumeratorsMap) =>
InfoFromUserSource(
nsfwNumerator * 1.0 / denominator,
languageNumeratorsMap.mapValues { x => x * 1.0 / denominator },
countryNumeratorsMap.mapValues { x => x * 1.0 / denominator },
inferredLangsNumeratorsMap.mapValues { x => x * 1.0 / denominator }
)
}
}
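// Worked example (hypothetical counts): if a cluster's known-for users sum to denominator = 4,
// nsfwNumerator = 1 and languageNumeratorsMap = Map("en" -> 3, "ja" -> 1), the mapValues step
// above yields fractionMarkedNSFWUser = 0.25 and
// languageToFractionDeviceLanguage = Map("en" -> 0.75, "ja" -> 0.25).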
/**
* Run the cluster details job and return the details for each cluster
* @param input interestedIn data
* @param qtreeSemigroupKParameter parameter for calculating percentiles using qtree monoid (set to a small number, usually < 7)
* @param modelVersionToKeep which modelVersion to use from interestedIn dataset
* @param knownFor clusterId -> users known for this cluster and their scores
* @param knownForTranspose userId -> clusters this user is known for and their scores
* @param usersource user source
* @param inferredLanguageSource userId -> inferred languages and their scores
* @param simsGraph sims graph in the form of userId -> adjacency list
* @param cosineThreshold cosine threshold to include a cluster in the list of similar clusters for a given cluster
* @param uniqId for stats
* @return pipe with (modelVersion, clusterId) as the key and ClusterDetails struct as the value.
*/
def run(
input: TypedPipe[(Long, ClustersUserIsInterestedIn)],
qtreeSemigroupKParameter: Int,
modelVersionToKeep: String,
knownFor: TypedPipe[(Int, List[(Long, Float)])],
knownForTranspose: TypedPipe[(Long, Array[(Int, Float)])],
usersource: Option[TypedPipe[FlatUser]],
inferredLanguageSource: Option[TypedPipe[(Long, Seq[(String, Double)])]],
simsGraph: Option[TypedPipe[(Long, Map[Long, Float])]],
cosineThreshold: Double
)(
implicit uniqId: UniqueID
): Execution[TypedPipe[((String, Int), ClusterDetails)]] = {
val topSimilarClusters = getTopSimilarClustersWithCosine(input, cosineThreshold)
val infoFromUserSource: TypedPipe[(Int, InfoFromUserSource)] = (for {
us <- usersource
inferredLanguages <- inferredLanguageSource
} yield getInfoFromUserSource(knownFor, us, inferredLanguages)).getOrElse(TypedPipe.empty)
val clusterEvaluationExec = simsGraph match {
case Some(sg) =>
ClusterEvaluation.clusterLevelEvaluation(sg, knownForTranspose, "eval")
case None =>
val dummyPipe: TypedPipe[(Int, (Int, ClusterQuality))] = TypedPipe.empty
Execution.from(dummyPipe)
}
clusterEvaluationExec
.map { clusterIdToSizesAndQualities =>
val clusterQualities: TypedPipe[(Int, ClusterQuality)] =
clusterIdToSizesAndQualities.mapValues(_._2)
intermediateDetailsPipe(
keepCorrectModel(input, modelVersionToKeep),
qtreeSemigroupKParameter)
.leftJoin(topSimilarClusters)
.leftJoin(infoFromUserSource)
.leftJoin(clusterQualities)
.join(knownFor)
.map {
case (
clusterId,
(
(
((intermediateDetails, topSimilarNeighborsOpt), userSourceInfoOpt),
qualityOpt),
knownForUsers)
) =>
val knownForSorted = knownForUsers.sortBy(-_._2).map {
case (userId, score) =>
UserWithScore(userId, score)
}
(modelVersionToKeep, clusterId) ->
ClusterDetails(
numUsersWithAnyNonZeroScore = intermediateDetails.numUsersWithAnyNonZeroScore,
numUsersWithNonZeroFavScore = intermediateDetails.numUsersWithNonZeroFavScore,
numUsersWithNonZeroFollowScore =
intermediateDetails.numUsersWithNonZeroFollowScore,
favScoreDistributionDetails = intermediateDetails.favQTree.map { qt =>
getDistributionDetails(
qtree = qt,
sum = intermediateDetails.sum.favScore,
sumOfSquares = intermediateDetails.sumOfSquares.favScore,
min = intermediateDetails.min.favScore,
max = intermediateDetails.max.favScore,
fullSize = intermediateDetails.numUsersWithNonZeroFavScore
)
},
followScoreDistributionDetails = intermediateDetails.followQTree.map { qt =>
getDistributionDetails(
qtree = qt,
sum = intermediateDetails.sum.followScore,
sumOfSquares = intermediateDetails.sumOfSquares.followScore,
min = intermediateDetails.min.followScore,
max = intermediateDetails.max.followScore,
fullSize = intermediateDetails.numUsersWithNonZeroFollowScore
)
},
logFavScoreDistributionDetails = intermediateDetails.logFavQTree.map { qt =>
getDistributionDetails(
qtree = qt,
sum = intermediateDetails.sum.logFavScore,
sumOfSquares = intermediateDetails.sumOfSquares.logFavScore,
min = intermediateDetails.min.logFavScore,
max = intermediateDetails.max.logFavScore,
// note: user has non-zero fav score iff a user has non-zero log-fav score
fullSize = intermediateDetails.numUsersWithNonZeroFavScore
)
},
knownForUsersAndScores = Some(knownForSorted),
neighborClusters = topSimilarNeighborsOpt,
fractionKnownForMarkedNSFWUser = userSourceInfoOpt.map(_.fractionMarkedNSFWUser),
languageToFractionDeviceLanguage =
userSourceInfoOpt.map(_.languageToFractionDeviceLanguage),
countryCodeToFractionKnownForWithCountryCode =
userSourceInfoOpt.map(_.countryCodeToFractionKnownForWithCountryCode),
qualityMeasuredOnSimsGraph = qualityOpt,
languageToFractionInferredLanguage =
userSourceInfoOpt.map(_.languageToFractionInferredLanguage),
)
}
}
}
def getTruncatedSims(
sims: TypedPipe[Candidates],
maxNeighbors: Int
): TypedPipe[(Long, Map[Long, Float])] = {
sims.map { cands =>
(
cands.userId,
// These candidates are already sorted, but leaving it in just in case the behavior changes upstream
cands.candidates
.map { c => (c.userId, c.score.toFloat) }.sortBy(-_._2).take(maxNeighbors).toMap
)
}
}
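// Usage sketch (the path and parameter below are illustrative only):
// {{{
//   val sims = TypedPipe.from(WTFCandidatesSource("/some/sims/dir"))
//   val adjLists = getTruncatedSims(sims, maxNeighbors = 20) // userId -> top-20 neighbors by score
// }}}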
}
/**
scalding remote run --main-class com.twitter.simclusters_v2.scalding.ClusterDetailsAdhoc \
--target src/scala/com/twitter/simclusters_v2/scalding:cluster_details-adhoc \
--hadoop-properties "scalding.with.reducers.set.explicitly=true mapreduce.job.reduces=4000" \
--user recos-platform -- \
--date 2020-06-25 \
--dateForUserSource 2020-06-25 \
--includeUserSource \
--outputDir /user/recos-platform/adhoc/your_ldap/cluster_details_inferred_lang
*/
object ClusterDetailsAdhoc extends TwitterExecutionApp {
implicit val tz: java.util.TimeZone = DateOps.UTC
implicit val dp = DateParser.default
def job: Execution[Unit] =
Execution.getConfigMode.flatMap {
case (config, mode) =>
Execution.withId { implicit uniqueId =>
val args = config.getArgs
val date = DateRange.parse(args("dateForUserSource"))
val (knownFor, knownForTranspose) =
args
.optional("knownForDir").map { location =>
(
KnownForSources.transpose(KnownForSources.readKnownFor(location)),
KnownForSources.readKnownFor(location)
)
}.getOrElse(
(
KnownForSources.clusterToKnownFor_20M_145K_updated,
KnownForSources.knownFor_20M_145K_updated
)
)
val interestedIn = args
.optional("inputDir").map { interestedInInputDir =>
TypedPipe.from(AdhocKeyValSources.interestedInSource(interestedInInputDir))
}.getOrElse(
DAL
.readMostRecentSnapshotNoOlderThan(
SimclustersV2InterestedIn20M145KUpdatedScalaDataset,
Days(14))
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
.toTypedPipe
.map {
case KeyVal(userId, clustersUserIsInterestedIn) =>
(userId, clustersUserIsInterestedIn)
}
)
val userSourceOpt = if (args.boolean("includeUserSource")) {
Some(DAL.readMostRecentSnapshot(UsersourceFlatScalaDataset, date).toTypedPipe)
} else None
val inferredLanguagesOpt = if (args.boolean("includeUserSource")) {
Some(ExternalDataSources.inferredUserProducedLanguageSource)
} else None
val simsGraphOpt = args.optional("simsForEvalInputDir").map { sgDir =>
ClusterDetailsJob.getTruncatedSims(
TypedPipe.from(WTFCandidatesSource(sgDir)),
args.int("maxSimsNeighborsForEval", 20)
)
}
Util.printCounters(
ClusterDetailsJob
.run(
interestedIn,
args.int("qtreeSemigroupKParameter", 3),
args.getOrElse("modelVersion", "20M_145K_updated"),
knownFor,
knownForTranspose,
userSourceOpt,
inferredLanguagesOpt,
simsGraphOpt,
cosineThreshold = args.double("cosineThreshold", 0.01)
).flatMap(
_.writeExecution(AdhocKeyValSources.clusterDetailsSource(args("outputDir"))))
)
}
}
}
trait ClusterDetailsBatchTrait extends TwitterScheduledExecutionApp {
implicit val tz = DateOps.UTC
implicit val parser = DateParser.default
def firstTime: String
def batchIncrement: Duration
def manhattanOutputPath: String
def clusterDetailsLiteOutputPath: String
def modelVersion: String
def knownForDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]]
def interestedInDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsInterestedIn]]
def outputDataset: KeyValDALDataset[KeyVal[(String, Int), ClusterDetails]]
def clusterDetailsLiteOutputDataset: SnapshotDALDataset[ClusterDetailsLite]
private lazy val execArgs = AnalyticsBatchExecutionArgs(
batchDesc = BatchDescription(this.getClass.getName.replace("$", "")),
firstTime = BatchFirstTime(RichDate(firstTime)),
lastTime = None,
batchIncrement = BatchIncrement(batchIncrement)
)
override def scheduledJob: Execution[Unit] = AnalyticsBatchExecution(execArgs) {
implicit dateRange =>
Execution.withId { implicit uniqueId =>
Execution.withArgs { args =>
val qtreeSemigroupKParameter = args.int("qtreeSemigroupKParameter", 5)
val maxSimsNeighborsForEval = args.int("maxSimsNeighborsForEval", 20)
val knownForTranspose =
KnownForSources.fromKeyVal(
DAL.readMostRecentSnapshot(knownForDataset, dateRange.extend(Days(7))).toTypedPipe,
modelVersion)
val knownFor = KnownForSources.transpose(knownForTranspose)
val cosineThreshold = args.double("cosineThreshold", 0.01)
val interestedIn =
DAL
.readMostRecentSnapshot(interestedInDataset, dateRange.extend(Days(7)))
.toTypedPipe
.map {
case KeyVal(userId, clustersUserIsInterestedIn) =>
(userId, clustersUserIsInterestedIn)
}
val sims = if (modelVersion == ModelVersions.Model20M145K2020) {
// The model version 20m_145k_2020 uses approximate_cosine_follow as the input sims graph
// to cluster users. The same graph is used to evaluate the clusters
TypedPipe
.from(FollowingsCosineSimilaritiesManhattanSource())
.map(_._2)
} else {
TypedPipe.from(
SimsCandidatesSource()(
dateRange = dateRange,
suffixPath = "/classified_candidates_rollup"
))
}
val resultExec = ClusterDetailsJob
.run(
interestedIn,
qtreeSemigroupKParameter,
modelVersion,
knownFor,
knownForTranspose,
Some(DAL.readMostRecentSnapshot(UsersourceFlatScalaDataset, dateRange).toTypedPipe),
Some(ExternalDataSources.inferredUserProducedLanguageSource),
Some(
ClusterDetailsJob.getTruncatedSims(sims, maxNeighbors = maxSimsNeighborsForEval)),
cosineThreshold
).flatMap { resultUnmapped =>
val clusterDetailsExec = resultUnmapped
.map {
case (clusterKey, details) =>
KeyVal(clusterKey, details)
}.writeDALVersionedKeyValExecution(
outputDataset,
D.Suffix(manhattanOutputPath)
)
val clusterDetailsLiteExec =
resultUnmapped
.map {
case ((_, clusterId), details)
if modelVersion == ModelVersions.Model20M145KDec11 =>
ClusterDetailsLite(
FullClusterId(ModelVersion.Model20m145kDec11, clusterId),
details.numUsersWithAnyNonZeroScore,
details.numUsersWithNonZeroFollowScore,
details.numUsersWithNonZeroFavScore,
details.knownForUsersAndScores.getOrElse(Nil)
)
case ((_, clusterId), details)
if modelVersion == ModelVersions.Model20M145KUpdated =>
ClusterDetailsLite(
FullClusterId(ModelVersion.Model20m145kUpdated, clusterId),
details.numUsersWithAnyNonZeroScore,
details.numUsersWithNonZeroFollowScore,
details.numUsersWithNonZeroFavScore,
details.knownForUsersAndScores.getOrElse(Nil)
)
case ((_, clusterId), details)
if modelVersion == ModelVersions.Model20M145K2020 =>
ClusterDetailsLite(
FullClusterId(ModelVersion.Model20m145k2020, clusterId),
details.numUsersWithAnyNonZeroScore,
details.numUsersWithNonZeroFollowScore,
details.numUsersWithNonZeroFavScore,
details.knownForUsersAndScores.getOrElse(Nil)
)
}.writeDALSnapshotExecution(
clusterDetailsLiteOutputDataset,
D.Daily,
D.Suffix(clusterDetailsLiteOutputPath),
D.EBLzo(),
dateRange.end)
Execution.zip(clusterDetailsExec, clusterDetailsLiteExec)
}
Util.printCounters(resultExec)
}
}
}
}
object ClusterDetailsBatch extends ClusterDetailsBatchTrait {
override val firstTime: String = "2018-07-28"
override val batchIncrement: Duration = Days(7)
override val manhattanOutputPath: String =
"/user/cassowary/manhattan_sequence_files/simclusters_v2_cluster_details"
override val clusterDetailsLiteOutputPath: String =
"/user/cassowary/processed/simclusters_v2_cluster_details_lite"
override val modelVersion: String = ModelVersions.Model20M145KDec11
override val knownForDataset = SimclustersV2KnownFor20M145KDec11ScalaDataset
override val interestedInDataset = SimclustersV2InterestedInScalaDataset
override val outputDataset = SimclustersV2ClusterDetailsScalaDataset
override val clusterDetailsLiteOutputDataset =
SimclustersV2ClusterDetailsLiteScalaDataset
}
object ClusterDetails20M145KUpdated extends ClusterDetailsBatchTrait {
override val firstTime: String = "2019-06-16"
override val batchIncrement: Duration = Days(7)
override val manhattanOutputPath: String =
"/user/cassowary/manhattan_sequence_files/simclusters_v2_cluster_details_20m_145k_updated"
override val clusterDetailsLiteOutputPath: String =
"/user/cassowary/processed/simclusters_v2_cluster_details_lite_20m_145k_updated"
override val modelVersion: String = ModelVersions.Model20M145KUpdated
override val knownForDataset = SimclustersV2KnownFor20M145KUpdatedScalaDataset
override val interestedInDataset = SimclustersV2InterestedIn20M145KUpdatedScalaDataset
override val outputDataset = SimclustersV2ClusterDetails20M145KUpdatedScalaDataset
override val clusterDetailsLiteOutputDataset =
SimclustersV2ClusterDetailsLite20M145KUpdatedScalaDataset
}
/**
* capesospy-v2 update --build_locally --start_cron cluster_details_20m_145k_2020 \
* src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml
*/
object ClusterDetails20M145K2020 extends ClusterDetailsBatchTrait {
override val firstTime: String = "2020-10-15"
override val batchIncrement: Duration = Days(7)
override val manhattanOutputPath: String =
"/user/cassowary/manhattan_sequence_files/simclusters_v2_cluster_details_20m_145k_2020"
override val clusterDetailsLiteOutputPath: String =
"/user/cassowary/processed/simclusters_v2_cluster_details_lite_20m_145k_2020"
override val modelVersion: String = ModelVersions.Model20M145K2020
override val knownForDataset = SimclustersV2KnownFor20M145K2020ScalaDataset
override val interestedInDataset = SimclustersV2InterestedIn20M145K2020ScalaDataset
override val outputDataset = SimclustersV2ClusterDetails20M145K2020ScalaDataset
override val clusterDetailsLiteOutputDataset =
SimclustersV2ClusterDetailsLite20M145K2020ScalaDataset
}
/**
scalding remote run --main-class com.twitter.simclusters_v2.scalding.DumpClusterDetailsAdhoc \
--target src/scala/com/twitter/simclusters_v2/scalding:cluster_details-dump \
--user recos-platform -- \
--date 2020-06-25 \
--clusterIds 5542 129677 48645 \
--inputDir /user/recos-platform/adhoc/your_ldap/cluster_details_inferred_lang
*/
object DumpClusterDetailsAdhoc extends TwitterExecutionApp {
def job: Execution[Unit] =
Execution.getConfigMode.flatMap {
case (config, mode) =>
Execution.withId { implicit uniqueId =>
val args = config.getArgs
val clusters = args.list("clusterIds").map(_.toInt).toSet // or e.g. (1 to 2500).toSet to dump a range
TypedPipe
.from(AdhocKeyValSources.clusterDetailsSource(args("inputDir")))
.filter { case ((modelVersion, clusterId), details) => clusters.contains(clusterId) }
.toIterableExecution
.map { iter =>
iter.foreach { x => println(Util.prettyJsonMapper.writeValueAsString(x)) }
}
}
}
}
/**
* ./bazel bundle src/scala/com/twitter/simclusters_v2/scalding:cluster_details && \
* oscar hdfs --user cassowary --host hadoopnest2.atla.twitter.com --bundle cluster_details \
* --tool com.twitter.simclusters_v2.scalding.DumpClusterSimilaritiesAdhoc --screen --screen-detached \
* --tee your_ldap/dumpClusterSimilarities_20200103 -- \
* --inputDir /user/cassowary/manhattan_sequence_files/simclusters_v2_cluster_details_20m_145k_updated/ \
* --outputDir adhoc/your_ldap
*/
object DumpClusterSimilaritiesAdhoc extends TwitterExecutionApp {
def job: Execution[Unit] =
Execution.getConfigMode.flatMap {
case (config, mode) =>
Execution.withId { implicit uniqueId =>
val args = config.getArgs
TypedPipe
.from(AdhocKeyValSources.clusterDetailsSource(args("inputDir")))
.flatMap {
case ((_, clusterId), details) =>
details.neighborClusters.getOrElse(Nil).map { neighbor =>
val compositeScore = (neighbor.followCosineSimilarity
.getOrElse(0.0) + neighbor.favCosineSimilarity.getOrElse(0.0)) / 2
(
clusterId,
neighbor.clusterId,
"%.4f".format(compositeScore)
)
}
}.writeExecution(TypedTsv(args("outputDir")))
}
}
}

View File

@ -1,607 +0,0 @@
package com.twitter.simclusters_v2.scalding
import com.twitter.algebird.Monoid
import com.twitter.algebird.mutable.PriorityQueueMonoid
import com.twitter.dal.client.dataset.KeyValDALDataset
import com.twitter.pluck.source.cassowary.FollowingsCosineSimilaritiesManhattanSource
import com.twitter.scalding._
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.job.TwitterExecutionApp
import com.twitter.scalding_internal.job.analytics_batch._
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.simclusters_v2.common.ModelVersions
import com.twitter.simclusters_v2.hdfs_sources._
import com.twitter.simclusters_v2.scalding.common.TypedRichPipe._
import com.twitter.simclusters_v2.scalding.common.Util
import com.twitter.simclusters_v2.scalding.common.Util.Distribution
import com.twitter.simclusters_v2.thriftscala.ClusterQuality
import com.twitter.simclusters_v2.thriftscala.ClustersUserIsKnownFor
import com.twitter.usersource.snapshot.flat.UsersourceFlatScalaDataset
import java.util.PriorityQueue
import scala.collection.JavaConverters._
object ClusterEvaluation {
val samplerMonoid: PriorityQueueMonoid[((Long, Long), (Double, Double))] =
Util.reservoirSamplerMonoidForPairs[(Long, Long), (Double, Double)](5000)(Util.edgeOrdering)
case class ClusterResults(
numEdgesInsideCluster: Int,
wtOfEdgesInsideCluster: Double,
numEdgesOutsideCluster: Int,
wtOfEdgesOutsideCluster: Double,
originalWtAndProductOfNodeScoresSample: PriorityQueue[((Long, Long), (Double, Double))]) {
def clusterQuality(clusterSize: Int, averagePrecisionWholeGraph: Double): ClusterQuality = {
val unweightedRecallDenominator = numEdgesInsideCluster + numEdgesOutsideCluster
val unweightedRecall = if (unweightedRecallDenominator > 0) {
numEdgesInsideCluster.toDouble / unweightedRecallDenominator.toDouble
} else 0.0
val weightedRecallDenominator = wtOfEdgesInsideCluster + wtOfEdgesOutsideCluster
val weightedRecall = if (weightedRecallDenominator > 0) {
wtOfEdgesInsideCluster / weightedRecallDenominator
} else 0.0
val precision = if (clusterSize > 1) {
Some(wtOfEdgesInsideCluster / (clusterSize * (clusterSize - 1)))
} else Some(0.0)
val relativePrecision = if (averagePrecisionWholeGraph > 0) {
precision.flatMap { p => Some(p / averagePrecisionWholeGraph) }
} else Some(0.0)
ClusterQuality(
unweightedRecall = Some(unweightedRecall),
weightedRecall = Some(weightedRecall),
unweightedRecallDenominator = Some(unweightedRecallDenominator),
weightedRecallDenominator = Some(weightedRecallDenominator),
relativePrecisionNumerator = precision,
relativePrecision = relativePrecision,
weightAndProductOfNodeScoresCorrelation = Some(
Util.computeCorrelation(
originalWtAndProductOfNodeScoresSample.iterator.asScala.map(_._2)))
)
}
}
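// Worked example (hypothetical counts): a cluster of size 10 whose members have 30 edges of total
// weight 6.0 staying inside the cluster and 70 edges of weight 14.0 leaving it gets
// unweightedRecall = 30 / 100 = 0.3, weightedRecall = 6.0 / 20.0 = 0.3 and
// precision = 6.0 / (10 * 9) ~= 0.067; relativePrecision divides that precision by
// averagePrecisionWholeGraph.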
object ClusterResultsMonoid extends Monoid[ClusterResults] {
override def zero = ClusterResults(0, 0, 0, 0, samplerMonoid.zero)
override def plus(l: ClusterResults, r: ClusterResults) = ClusterResults(
l.numEdgesInsideCluster + r.numEdgesInsideCluster,
l.wtOfEdgesInsideCluster + r.wtOfEdgesInsideCluster,
l.numEdgesOutsideCluster + r.numEdgesOutsideCluster,
l.wtOfEdgesOutsideCluster + r.wtOfEdgesOutsideCluster,
samplerMonoid
.plus(l.originalWtAndProductOfNodeScoresSample, r.originalWtAndProductOfNodeScoresSample)
)
}
/**
* Evaluate the quality of a cluster.
* @param memberScores A map with the members of the cluster as the keys and their scores
* inside the cluster as values. The more central a member is inside the cluster,
* the higher its score is.
* @param membersAdjLists A map that gives the weighted neighbors of each member in the cluster.
*/
def evaluateCluster(
memberScores: Map[Long, Double],
membersAdjLists: Map[Long, Map[Long, Float]]
): ClusterResults = {
val resultsIter = membersAdjLists.flatMap {
case (fromNodeId, adjList) =>
val fromNodeWt = memberScores.getOrElse(fromNodeId, 0.0)
adjList.map {
case (toNodeId, edgeWt) =>
if (memberScores.contains(toNodeId)) {
val productOfMembershipScores = fromNodeWt * memberScores(toNodeId)
ClusterResults(
1,
edgeWt,
0,
0,
samplerMonoid.build(
((fromNodeId, toNodeId), (edgeWt.toDouble, productOfMembershipScores))))
} else {
ClusterResults(0, 0, 1, edgeWt, samplerMonoid.zero)
}
}
}
Monoid.sum(resultsIter)(ClusterResultsMonoid)
}
/**
* Evaluate each cluster with respect to the provided graph.
* @param graph graph represented via the adjacency lists of each node, needs to be symmetrized i.e. if u is in v's adjlist, then v needs to be in u's adjlist as well
* @param clusters cluster memberships of each node.
* @param statsPrefix convenience argument to act as prefix for stats counters
* @return key-value pipe with clusterId as key and (size of the cluster, quality struct) as value
*/
def clusterLevelEvaluation(
graph: TypedPipe[(Long, Map[Long, Float])],
clusters: TypedPipe[(Long, Array[(Int, Float)])],
statsPrefix: String = ""
)(
implicit uniqueId: UniqueID
): Execution[TypedPipe[(Int, (Int, ClusterQuality))]] = {
val numRealClusters = Stat(s"${statsPrefix}/numRealClusters")
val numFakeClusters = Stat(s"${statsPrefix}/numFakeClusters")
val numNodesAndEdgesExec = graph
.map {
case (nId, nbrMap) =>
(1L, nbrMap.size.toLong, nbrMap.values.sum.toDouble)
}.sum.getExecution
numNodesAndEdgesExec.map {
case (numNodes, numEdges, sumOfAllEdgeWts) =>
println("numNodes " + numNodes)
println("numEdges " + numEdges)
println("sumOfAllEdgeWts " + sumOfAllEdgeWts)
val numFakeClustersForUnassignedNodes = numNodes / 1e4
val averagePrecisionWholeGraph = sumOfAllEdgeWts / (numNodes * (numNodes - 1))
graph
.leftJoin(clusters)
// adjust the reducer count below as needed (e.g. for adhoc runs)
.withReducers(200)
.flatMap {
case (nodeId, (adjList, assignedClustersOpt)) =>
val nodeDegree = adjList.size.toLong
val nodeWeightedDegree = adjList.values.sum
assignedClustersOpt match {
case Some(assignedClusters) if assignedClusters.nonEmpty =>
assignedClusters.toList.map {
case (clusterId, scoreOfNodeInCluster) =>
(
clusterId,
(
Map(nodeId -> (scoreOfNodeInCluster.toDouble, adjList)),
1,
nodeDegree,
nodeWeightedDegree))
}
case _ =>
// For nodes that don't belong to any cluster, create a fake clusterId (0 or less)
// and add the node's statistics to that clusterId. We don't need the adjacency lists for
// unassigned nodes; we simply track how many edges are incident on those nodes, their weighted sum, etc.
val fakeClusterId =
(-1 * (math.abs(
Util.hashToLong(nodeId)) % numFakeClustersForUnassignedNodes)).toInt
List(
(
fakeClusterId,
(
Map.empty[Long, (Double, Map[Long, Float])],
1,
nodeDegree,
nodeWeightedDegree)))
}
}
.sumByKey
// adjust the reducer count below as needed (e.g. for adhoc runs)
.withReducers(60)
.map {
case (clusterId, (membersMap, clusterSize, volumeOfCluster, weightedVolumeOfCluster)) =>
if (clusterId > 0) {
numRealClusters.inc()
val scoresMap = membersMap.mapValues(_._1)
val adjListsMap = membersMap.mapValues(_._2)
val quality = evaluateCluster(scoresMap, adjListsMap)
.clusterQuality(clusterSize, averagePrecisionWholeGraph)
(clusterId, (clusterSize, quality))
} else {
// clusterId <= 0 means that this is a fake cluster.
numFakeClusters.inc()
(
clusterId,
(
clusterSize,
ClusterQuality(
unweightedRecallDenominator = Some(volumeOfCluster),
weightedRecallDenominator = Some(weightedVolumeOfCluster)
)
)
)
}
}
}
}
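// Example (hypothetical scale): with numNodes = 20,000,000 there are numNodes / 1e4 = 2,000 fake
// clusters, so each unassigned node hashes to a fakeClusterId in [-1999, 0]; those ids carry only
// degree statistics and end up with recall denominators but no recall numerators.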
case class OverallResults(
unweightedRecall: Double,
edgesInsideClusters: Long,
allEdges: Long,
allNodes: Int,
weightedRecall: Double,
wtOnEdgesInsideClusters: Double,
wtOnAllEdges: Double,
weightCorrelation: Double,
relativePrecision: Double,
numUnassignedNodes: Int,
numAssignedNodes: Int,
sizeDist: Distribution,
recallDist: Distribution,
weightedRecallDist: Distribution,
relativePrecisionDist: Distribution,
weightCorrelationDist: Distribution,
numClustersWithNegativeCorrelation: Double,
numClustersWithZeroRecall: Double,
numClustersWithLessThanOneRelativePrecision: Double,
numSingletonClusters: Int)
def summarizePerClusterResults(
perClusterResults: TypedPipe[(Int, (Int, ClusterQuality))]
): Execution[Option[OverallResults]] = {
perClusterResults
.map {
case (clusterId, (size, quality)) =>
val unweightedRecallDen = quality.unweightedRecallDenominator.getOrElse(0.0)
val unweightedRecallNum = quality.unweightedRecall.getOrElse(0.0) * unweightedRecallDen
val weightedRecallDen = quality.weightedRecallDenominator.getOrElse(0.0)
val weightedRecallNum = quality.weightedRecall.getOrElse(0.0) * weightedRecallDen
val weightCorrelationDen = size
val weightCorrelationNum =
weightCorrelationDen * quality.weightAndProductOfNodeScoresCorrelation
.getOrElse(0.0)
val relativePrecisionDen = size
val relativePrecisionNum = relativePrecisionDen * quality.relativePrecision.getOrElse(0.0)
val numClustersWithNegativeCorrelation =
if (weightCorrelationNum < 0 && clusterId > 0) 1 else 0
val numClustersWithLessThanOneRelativePrecision =
if (quality.relativePrecision.getOrElse(0.0) < 1 && clusterId > 0) 1 else 0
val numClustersWithZeroRecall = if (weightedRecallNum < 1e-5 && clusterId > 0) 1 else 0
val numUnassignedNodes = if (clusterId < 1) size else 0
val numAssignedNodes = if (clusterId > 0) size else 0
val numSingletonClusters = if (clusterId > 0 && size == 1) 1 else 0
(
unweightedRecallDen,
unweightedRecallNum,
weightedRecallDen,
weightedRecallNum,
weightCorrelationDen,
weightCorrelationNum,
relativePrecisionDen,
relativePrecisionNum,
numClustersWithNegativeCorrelation,
numClustersWithLessThanOneRelativePrecision,
numClustersWithZeroRecall,
List(size.toDouble),
List(quality.unweightedRecall.getOrElse(0.0)),
List(quality.weightedRecall.getOrElse(0.0)),
List(quality.relativePrecision.getOrElse(0.0)),
List(quality.weightAndProductOfNodeScoresCorrelation.getOrElse(0.0)),
numUnassignedNodes,
numAssignedNodes,
numSingletonClusters
)
}
.sum
.toOptionExecution
.map { opt =>
opt.map {
case (
unweightedRecallDen,
unweightedRecallNum,
weightedRecallDen,
weightedRecallNum,
weightCorrelationDen,
weightCorrelationNum,
relativePrecisionDen,
relativePrecisionNum,
numClustersWithNegativeCorrelation,
numClustersWithLessThanOneRelativePrecision,
numClustersWithZeroRecall,
sizeList,
unweightedRecallList,
weightedRecallList,
relativePrecisionList,
weightCorrelationList,
numUnassignedNodes,
numAssignedNodes,
numSingletonClusters) =>
OverallResults(
unweightedRecall = unweightedRecallNum / unweightedRecallDen,
edgesInsideClusters = unweightedRecallNum.toLong,
allEdges = unweightedRecallDen.toLong,
allNodes = numAssignedNodes + numUnassignedNodes,
weightedRecall = weightedRecallNum / weightedRecallDen,
wtOnEdgesInsideClusters = weightedRecallNum,
wtOnAllEdges = weightedRecallDen,
weightCorrelation = weightCorrelationNum / weightCorrelationDen,
relativePrecision = relativePrecisionNum / relativePrecisionDen,
numAssignedNodes = numAssignedNodes,
numUnassignedNodes = numUnassignedNodes,
sizeDist = Util.distributionFromArray(sizeList.toArray),
recallDist = Util.distributionFromArray(unweightedRecallList.toArray),
weightedRecallDist = Util.distributionFromArray(weightedRecallList.toArray),
weightCorrelationDist = Util.distributionFromArray(weightCorrelationList.toArray),
relativePrecisionDist = Util.distributionFromArray(relativePrecisionList.toArray),
numClustersWithNegativeCorrelation = numClustersWithNegativeCorrelation,
numClustersWithLessThanOneRelativePrecision =
numClustersWithLessThanOneRelativePrecision,
numClustersWithZeroRecall = numClustersWithZeroRecall,
numSingletonClusters = numSingletonClusters
)
}
}
}
/**
* @param graph Input similarity graph, needs to be symmetrized i.e. if u is in v's adjlist, then v needs to be in u's adjlist as well
* @param clusters cluster assignments to be evaluated
* @return summary of results
*/
def overallEvaluation(
graph: TypedPipe[(Long, Map[Long, Float])],
clusters: TypedPipe[(Long, Array[(Int, Float)])],
statsPrefix: String
)(
implicit uniqueId: UniqueID
): Execution[Option[OverallResults]] = {
clusterLevelEvaluation(graph, clusters, statsPrefix).flatMap(summarizePerClusterResults)
}
}
/**
* ./bazel bundle src/scala/com/twitter/simclusters_v2/scalding:cluster_evaluation && \
* oscar hdfs --user frigate --host hadoopnest1.atla.twitter.com --bundle cluster_evaluation \
* --tool com.twitter.simclusters_v2.scalding.ClusterEvaluationAdhoc --screen --screen-detached \
* --tee logs/clusterQualityFor_updatedUnnormalizedInputScores_usingSims20190318 -- \
* --simsInputDir /user/frigate/your_ldap/commonDirForClusterEvaluation/classifiedSims_20190314_copiedFromAtlaProc \
* --topK 20000000 --date 2019-03-18 --minActiveFollowers 400 \
* --topUsersDir /user/frigate/your_ldap/commonDirForClusterEvaluation/top20MUsers_minActiveFollowers400_20190215 \
* --maxSimsNeighborsForEval 40 \
* --preparedSimsGraph /user/frigate/your_ldap/commonDirForClusterEvaluation/symmetrized_classifiedSims20190318_top20MUsers \
* --outputDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/knownForClusterEvaluation \
* --knownForDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/knownFor
*/
object ClusterEvaluationAdhoc extends TwitterExecutionApp {
implicit val tz: java.util.TimeZone = DateOps.UTC
implicit val dp = DateParser.default
def job: Execution[Unit] =
Execution.getConfigMode.flatMap {
case (config, mode) =>
Execution.withId { implicit uniqueId =>
val args = config.getArgs
val knownFor = args
.optional("knownForDir").map { location =>
KnownForSources.readKnownFor(location)
}.getOrElse(KnownForSources.knownFor_20M_Dec11_145K)
val minActiveFollowers = args.int("minActiveFollowers", 400)
val topK = args.int("topK")
val date = DateRange.parse(args("date"))
val topUsersExec =
TopUsersSimilarityGraph
.topUsers(
DAL.readMostRecentSnapshot(UsersourceFlatScalaDataset, date).toTypedPipe,
minActiveFollowers,
topK
)
.map(_.id)
.count("num_top_users")
.make(TypedTsv(args("topUsersDir")))
val simsGraphExec = topUsersExec.flatMap { topUsers =>
TopUsersSimilarityGraph.makeGraph(
TopUsersSimilarityGraph.getSubgraphFromUserGroupedInput(
TypedPipe.from(WTFCandidatesSource(args("simsInputDir"))),
topUsers,
args.int("maxSimsNeighborsForEval", 40),
degreeThresholdForStat = 5
),
args("preparedSimsGraph")
)
}
val fullExec = simsGraphExec.flatMap { sims =>
ClusterEvaluation
.clusterLevelEvaluation(sims, knownFor, "eval")
.flatMap { clusterResultsPipe =>
val clusterResults = clusterResultsPipe.forceToDiskExecution
val outputExec = clusterResults.flatMap { pipe =>
pipe
.map {
case (clusterId, (clusterSize, quality)) =>
"%d\t%d\t%.2g\t%.2g\t%.1f\t%.2g\t%.2f\t%.2g\t%.2g"
.format(
clusterId,
clusterSize,
quality.unweightedRecall.getOrElse(0.0),
quality.weightedRecall.getOrElse(0.0),
quality.unweightedRecallDenominator.getOrElse(0.0),
quality.weightedRecallDenominator.getOrElse(0.0),
quality.relativePrecision.getOrElse(0.0),
quality.relativePrecisionNumerator.getOrElse(0.0),
quality.weightAndProductOfNodeScoresCorrelation.getOrElse(0.0)
)
}.writeExecution(TypedTsv(args("outputDir")))
}
val printExec = clusterResults.flatMap { pipe =>
ClusterEvaluation.summarizePerClusterResults(pipe).map {
case Some(res) =>
println("Overall results: " + Util.prettyJsonMapper.writeValueAsString(res))
case None =>
println("No overall results!!! Probably cluster results pipe is empty.")
}
}
Execution.zip(outputExec, printExec)
}
}
Util.printCounters(fullExec)
}
}
}
trait ClusterEvaluationBatch extends TwitterScheduledExecutionApp {
implicit val tz: java.util.TimeZone = DateOps.UTC
implicit val dp = DateParser.default
def firstTime: String
def batchDescription: String
def batchIncrement: Duration
private lazy val execArgs = AnalyticsBatchExecutionArgs(
batchDesc = BatchDescription(batchDescription),
firstTime = BatchFirstTime(RichDate(firstTime)),
lastTime = None,
batchIncrement = BatchIncrement(batchIncrement)
)
val emailAddress: String = "no-reply@twitter.com"
def knownForDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]]
def knownForModelVersion: String
def baselineKnownForDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]]
def baselineKnownForModelVersion: String
override def scheduledJob: Execution[Unit] =
AnalyticsBatchExecution(execArgs) { implicit dateRange =>
Execution.withId { implicit uniqueId =>
Execution.withArgs { args =>
val baselineKnownFor =
KnownForSources.fromKeyVal(
DAL
.readMostRecentSnapshot(baselineKnownForDALDataset, dateRange.prepend(Days(7)))
.toTypedPipe,
baselineKnownForModelVersion
)
val knownFor =
KnownForSources.fromKeyVal(
DAL
.readMostRecentSnapshot(knownForDALDataset, dateRange.prepend(Days(7)))
.toTypedPipe,
knownForModelVersion
)
val inputSimsGraph = TypedPipe
.from(FollowingsCosineSimilaritiesManhattanSource())
.map(_._2)
val minActiveFollowers = args.int("minActiveFollowers")
val topK = args.int("topK")
val maxSimsNeighborsForEval =
args.int("maxSimsNeighborsForEval", 40)
val topUsers = TopUsersSimilarityGraph
.topUsers(
DAL
.readMostRecentSnapshot(UsersourceFlatScalaDataset, dateRange)
.toTypedPipe,
minActiveFollowers,
topK
)
.map(_.id)
.count("num_top_users")
TopUsersSimilarityGraph
.getSubgraphFromUserGroupedInput(
fullGraph = inputSimsGraph,
usersToInclude = topUsers,
maxNeighborsPerNode = maxSimsNeighborsForEval,
degreeThresholdForStat = 2
)
.forceToDiskExecution
.flatMap { symmetrizedSims =>
val baselineResultsExec = ClusterEvaluation
.overallEvaluation(symmetrizedSims, baselineKnownFor, "baselineKnownForEval")
val newResultsExec = ClusterEvaluation
.overallEvaluation(symmetrizedSims, knownFor, "newKnownForEval")
val minSizeOfBiggerClusterForComparison = 10
val compareExec = CompareClusters.summarize(
CompareClusters.compare(
KnownForSources.transpose(baselineKnownFor),
KnownForSources.transpose(knownFor),
minSizeOfBiggerCluster = minSizeOfBiggerClusterForComparison
))
Execution
.zip(baselineResultsExec, newResultsExec, compareExec)
.map {
case (oldResults, newResults, compareResults) =>
val emailText =
s"Evaluation Results for baseline knownFor: $baselineKnownForModelVersion \n" +
Util.prettyJsonMapper.writeValueAsString(oldResults) +
"\n\n-------------------\n\n" +
s"Evaluation Results for new knownFor:$knownForModelVersion\n" +
Util.prettyJsonMapper.writeValueAsString(newResults) +
"\n\n-------------------\n\n" +
s"Cosine similarity distribution between $baselineKnownForModelVersion and " +
s"$knownForModelVersion cluster membership vectors for " +
s"clusters with at least $minSizeOfBiggerClusterForComparison members:\n" +
Util.prettyJsonMapper
.writeValueAsString(compareResults)
Util
.sendEmail(
emailText,
s"Evaluation results comparing $knownForModelVersion with baseline $baselineKnownForModelVersion",
emailAddress)
()
}
}
}
}
}
}
/**
* capesospy-v2 update --build_locally --start_cron cluster_evaluation_for_20M_145k \
* src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml
*/
object ClusterEvaluationFor20M145K extends ClusterEvaluationBatch {
override val firstTime: String = "2019-06-11"
override val batchIncrement: Duration = Days(7)
override val batchDescription = "com.twitter.simclusters_v2.scalding.ClusterEvaluationFor20M145K"
override val knownForDALDataset = SimclustersV2KnownFor20M145KUpdatedScalaDataset
override val knownForModelVersion = ModelVersions.Model20M145KUpdated
override val baselineKnownForDALDataset = SimclustersV2KnownFor20M145KDec11ScalaDataset
override val baselineKnownForModelVersion = ModelVersions.Model20M145KDec11
}
/**
* capesospy-v2 update --build_locally --start_cron cluster_evaluation_for_20M_145k_2020 \
* src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml
*/
object ClusterEvaluationFor20M145K2020 extends ClusterEvaluationBatch {
override val firstTime: String = "2021-01-25"
override val batchIncrement: Duration = Days(7)
override val batchDescription =
"com.twitter.simclusters_v2.scalding.ClusterEvaluationFor20M145K2020"
override val knownForDALDataset = SimclustersV2KnownFor20M145K2020ScalaDataset
override val knownForModelVersion = ModelVersions.Model20M145K2020
override val baselineKnownForDALDataset = SimclustersV2KnownFor20M145KUpdatedScalaDataset
override val baselineKnownForModelVersion = ModelVersions.Model20M145KUpdated
}

View File

@ -1,131 +0,0 @@
package com.twitter.simclusters_v2.scalding
import com.twitter.scalding.{DateOps, DateParser, Execution, Stat, TypedPipe, TypedTsv, UniqueID}
import com.twitter.scalding_internal.job.TwitterExecutionApp
import com.twitter.simclusters_v2.common.{ClusterId, UserId}
import com.twitter.simclusters_v2.scalding.common.Util
import com.twitter.simclusters_v2.scalding.common.Util.Distribution
object CompareClusters {
def norm(a: Iterable[Float]): Float = {
math
.sqrt(a.map { x => x * x }.sum).toFloat
}
def cosine(a: Map[Long, Float], b: Map[Long, Float]): Float = {
val intersect = a.toList.collect {
case (id, score) if b.contains(id) =>
score * b(id)
}
val dot = if (intersect.nonEmpty) intersect.sum else 0
val aNorm = norm(a.values)
val bNorm = norm(b.values)
if (aNorm > 0 && bNorm > 0) {
dot / aNorm / bNorm
} else 0
}
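// Minimal example (hypothetical membership vectors):
// {{{
//   val a = Map(1L -> 1.0f, 2L -> 1.0f)
//   val b = Map(2L -> 1.0f, 3L -> 1.0f)
//   cosine(a, b) // = 1.0 / (sqrt(2) * sqrt(2)) = 0.5
// }}}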
/**
* Compare two known-for data sets and generate stats on changes in cluster assignment
*/
def compareClusterAssignments(
newKnownFor: TypedPipe[(UserId, List[(ClusterId, Float)])],
oldKnownFor: TypedPipe[(UserId, List[(ClusterId, Float)])]
)(
implicit uniqueID: UniqueID
): Execution[String] = {
val emptyToSomething = Stat("no_assignment_to_some")
val somethingToEmpty = Stat("some_assignment_to_none")
val emptyToEmpty = Stat("empty_to_empty")
val sameCluster = Stat("same_cluster")
val diffCluster = Stat("diff_cluster")
val calculateStatExec = newKnownFor
.outerJoin(oldKnownFor)
.map {
case (userId, (newKnownForListOpt, oldKnownForListOpt)) =>
val newKnownFor = newKnownForListOpt.getOrElse(Nil)
val oldKnownFor = oldKnownForListOpt.getOrElse(Nil)
if (newKnownFor.nonEmpty && oldKnownFor.isEmpty) {
emptyToSomething.inc()
}
if (newKnownFor.isEmpty && oldKnownFor.nonEmpty) {
somethingToEmpty.inc()
}
if (newKnownFor.isEmpty && oldKnownFor.isEmpty) {
emptyToEmpty.inc()
}
if (newKnownFor.nonEmpty && oldKnownFor.nonEmpty) {
val newClusterId = newKnownFor.head._1
val oldClusterId = oldKnownFor.head._1
if (newClusterId == oldClusterId) {
sameCluster.inc()
} else {
diffCluster.inc()
}
}
userId
}
.toIterableExecution
Util.getCustomCountersString(calculateStatExec)
}
/**
* Compare two cluster assignments in terms of cosine similarity of corresponding clusters.
* Excludes clusters that are too small.
* @param knownForA first assignment: clusterId -> members and their scores
* @param knownForB second assignment: clusterId -> members and their scores
* @param minSizeOfBiggerCluster minimum size of the bigger of the two corresponding clusters for the pair to be compared (e.g. 10)
* @return pipe of (clusterId, cosine similarity of the two clusters' membership vectors)
*/
def compare(
knownForA: TypedPipe[(Int, List[(Long, Float)])],
knownForB: TypedPipe[(Int, List[(Long, Float)])],
minSizeOfBiggerCluster: Int
): TypedPipe[(Int, Float)] = {
knownForA
.outerJoin(knownForB)
.collect {
case (clusterId, (membersInAOpt, membersInBOpt))
if membersInAOpt.exists(_.size >= minSizeOfBiggerCluster) || membersInBOpt
.exists(_.size >= minSizeOfBiggerCluster) =>
val membersInA =
membersInAOpt.map(_.toMap).getOrElse(Map.empty[Long, Float])
val membersInB =
membersInBOpt.map(_.toMap).getOrElse(Map.empty[Long, Float])
(clusterId, cosine(membersInA, membersInB))
}
}
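// Usage sketch (tiny, illustrative inputs):
// {{{
//   val a = TypedPipe.from(Seq(1 -> List(10L -> 1.0f, 11L -> 1.0f)))
//   val b = TypedPipe.from(Seq(1 -> List(10L -> 1.0f, 12L -> 1.0f)))
//   compare(a, b, minSizeOfBiggerCluster = 2) // emits (1, 0.5f): the clusters share one of two members
// }}}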
def summarize(clusterToCosines: TypedPipe[(Int, Float)]): Execution[Option[Distribution]] = {
clusterToCosines.values.map(x => List(x)).sum.toOptionExecution.map { listOpt =>
listOpt.map { list => Util.distributionFromArray(list.map(_.toDouble).toArray) }
}
}
}
object CompareClustersAdhoc extends TwitterExecutionApp {
implicit val tz: java.util.TimeZone = DateOps.UTC
implicit val dp = DateParser.default
def job: Execution[Unit] =
Execution.getConfigMode.flatMap {
case (config, mode) =>
Execution.withId { implicit uniqueId =>
val args = config.getArgs
val knownForA = KnownForSources.transpose(KnownForSources.readKnownFor(args("knownForA")))
val knownForB = KnownForSources.transpose(KnownForSources.readKnownFor(args("knownForB")))
CompareClusters
.compare(knownForA, knownForB, minSizeOfBiggerCluster = 10)
.map { case (cId, cos) => "%d\t%.2f".format(cId, cos) }
.writeExecution(TypedTsv(args("outputDir")))
}
}
}

View File

@ -1,330 +0,0 @@
package com.twitter.simclusters_v2.scalding
import com.twitter.algebird.Monoid
import com.twitter.logging.Logger
import com.twitter.scalding.{Execution, TypedPipe, TypedTsv}
import com.twitter.scalding_internal.job.TwitterExecutionApp
import com.twitter.simclusters_v2.hdfs_sources.AdhocKeyValSources
import java.util
import no.uib.cipr.matrix.Matrix
import no.uib.cipr.matrix.sparse.{ArpackSym, LinkedSparseMatrix}
import scala.collection.JavaConverters._
object EigenVectorsForSparseSymmetric {
val log: Logger = Logger()
/**
* Construct a matrix from its rows, specified as a map. The outer map is indexed by rowId, and the inner maps are indexed by columnId.
* Note that the input matrix is intended to be symmetric.
*
* @param map A map specifying the rows of the matrix. The outer map is indexed by rowId, and the inner maps are indexed by columnId. Both rows and columns are zero-indexed.
* @param nRows number of rows in matrix
* @param nCols number of columns in matrix
*
* @return the constructed matrix
*/
def getMatrix(map: Map[Int, Map[Int, Double]], nRows: Int, nCols: Int): Matrix = {
val nonzeros = map.toSeq.flatMap {
case (i, subMap) =>
subMap.toSeq.map {
case (j, value) =>
(i, j, value)
}
}
getMatrix(nonzeros, nRows, nCols)
}
/**
* Construct matrix from iterable of the non-zero entries. Note that the input matrix is intended to be symmetric.
*
* @param nonzeros non-zeros in (i, j, v) format, where i is row, j is column, and v is value. Both rows and columns are zero-indexed.
* @param nRows number of rows in matrix
* @param nCols number of columns in matrix
*
* @return the constructed matrix
*/
def getMatrix(nonzeros: Iterable[(Int, Int, Double)], nRows: Int, nCols: Int): Matrix = {
val matrix = new LinkedSparseMatrix(nRows, nCols)
var numEntries = 0
var maxRow = 0
var maxCol = 0
nonzeros.foreach {
case (i, j, v) =>
if (i > maxRow) {
maxRow = i
}
if (j > maxCol) {
maxCol = j
}
numEntries += 1
matrix.set(i, j, v)
}
log.info(
"Finished building matrix with %d entries and maxRow %d and maxCol %d"
.format(numEntries, maxRow, maxCol))
matrix
}
/**
* Prints out various diagnostics about how much the given matrix differs from a perfectly
* symmetric matrix. If (i,j) and (j,i) are different, it sets both of them to be the max of the two.
* Call this function before invoking EVD.
*
* @param matrix Matrix which is modified (if need be) in place.
*/
def ensureMatrixIsSymmetric(matrix: Matrix): Unit = {
var numUnequalEntries = 0
var numEntriesDifferentBy1Percent = 0
var numEqualEntries = 0
var numUnequalDueToZero = 0
var maxUnequal = (0, 0, 0.0, 0.0)
matrix.iterator().asScala.foreach { entry =>
val curr = entry.get()
val opp = matrix.get(entry.column(), entry.row())
if (curr == opp) {
numEqualEntries += 1
} else {
numUnequalEntries += 1
if (opp == 0) {
numUnequalDueToZero += 1
}
if (opp != 0 && (math.abs(curr - opp) / math.min(curr, opp)) > 0.01) {
numEntriesDifferentBy1Percent += 1
}
if (opp != 0 && math.abs(curr - opp) > maxUnequal._4) {
maxUnequal = (entry.row(), entry.column(), curr, math.abs(curr - opp))
}
val max = math.max(curr, opp)
matrix.set(entry.column(), entry.row(), max)
matrix.set(entry.row(), entry.column(), max)
}
}
var numUnEqualPrinted = 0
matrix.iterator().asScala.foreach { entry =>
val opp = matrix.get(entry.column(), entry.row())
if (numUnEqualPrinted < 10 && entry.get() != opp) {
numUnEqualPrinted += 1
log.info(
"Entries for (%d, %d) are %s and %s"
.format(entry.row(), entry.column(), entry.get(), opp))
}
}
log.info(
"Num unequal entries: %d, num unequal due to zero: %d, num unequal by 1percent or more: %d, num equal entries: %d, maxUnequal: %s"
.format(
numUnequalEntries,
numUnequalDueToZero,
numEntriesDifferentBy1Percent,
numEqualEntries,
maxUnequal))
}
/**
* Get the top-k eigenvalues and eigenvectors of an input matrix, where "top" means largest in magnitude.
* Input matrix needs to be perfectly symmetric; if it's not, this function will fail.
*
* Many of the eigenvectors will have very small values along most of the dimensions, so this method
* retains only the larger entries in each eigenvector.
*
* @param matrix symmetric input matrix.
* @param k how many of the top eigenvectors to get.
* @param ratioToLargestCutoff An entry needs to be at least 1/ratioToLargestCutoff of the biggest entry in that vector to be retained.
*
* @return seq of (eigenvalue, eigenvector) pairs.
*/
def getTruncatedEVD(
matrix: Matrix,
k: Int,
ratioToLargestCutoff: Float
): Seq[(Double, Seq[(Int, Double)])] = {
val solver = new ArpackSym(matrix)
val resultsMap = solver.solve(k, ArpackSym.Ritz.LM).asScala.toMap
val results = resultsMap.toIndexedSeq.sortBy { case (eigValue, _) => -eigValue }
results.zipWithIndex.map {
case ((eigValue, denseVectorJava), index) =>
val denseVector = new Array[Double](denseVectorJava.size())
denseVector.indices.foreach { index => denseVector(index) = denseVectorJava.get(index) }
val denseVectorMax = denseVector.maxBy { entry => math.abs(entry) }
val cutOff = math.abs(denseVectorMax) / ratioToLargestCutoff
val significantEntries = denseVector.zipWithIndex
.filter { case (vectorEntry, _) => math.abs(vectorEntry) >= cutOff }
.sortBy { case (vectorEntry, _) => -1 * math.abs(vectorEntry) }
(eigValue.toDouble, significantEntries.toSeq.map(_.swap))
}
}
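// Usage sketch (toy symmetric matrix; sizes and values are illustrative):
// {{{
//   val m = getMatrix(Seq((0, 1, 0.5), (1, 0, 0.5), (1, 2, 0.3), (2, 1, 0.3)), nRows = 3, nCols = 3)
//   ensureMatrixIsSymmetric(m)
//   val evd = getTruncatedEVD(m, k = 2, ratioToLargestCutoff = 100f)
//   // evd: Seq of (eigenvalue, sparse eigenvector as (index, entry) pairs), largest |eigenvalue| first
// }}}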
/**
* Compute U*Diag*Ut - where Diag is a diagonal matrix, and U is a sparse matrix.
* This is primarily for testing - to make sure that the computed eigenvectors can be used to
* reconstruct the original matrix up to some reasonable approximation.
*
* @param diagToUColumns seq of (diagonal entries, associated column in U)
* @param cutoff cutoff for including a value in the result.
*
* @return result of multiplication, returned as a map of the rows in the results.
*/
def uTimesDiagTimesUT(
diagToUColumns: Seq[(Double, Seq[(Int, Double)])],
cutoff: Double
): Map[Int, Map[Int, Double]] = {
val result = new util.HashMap[Int, util.HashMap[Int, Double]]()
diagToUColumns.foreach {
case (diag, uColumn) =>
uColumn.foreach {
case (i, iVal) =>
uColumn.foreach {
case (j, jVal) =>
val prod = diag * iVal * jVal
if (result.containsKey(i)) {
val newVal = if (result.get(i).containsKey(j)) {
result.get(i).get(j) + prod
} else prod
result.get(i).put(j, newVal)
} else {
result.put(i, new util.HashMap[Int, Double])
result.get(i).put(j, prod)
}
}
}
}
val unfiltered = result.asScala.toMap.mapValues(_.asScala.toMap)
unfiltered
.mapValues { m => m.filter { case (_, value) => math.abs(value) >= cutoff } }
.filter { case (_, vector) => vector.nonEmpty }
}
/** Note: This requires a full EVD to correctly compute the inverse! :-( */
def getInverseFromEVD(
evd: Seq[(Double, Seq[(Int, Double)])],
cutoff: Double
): Map[Int, Map[Int, Double]] = {
val evdInverse = evd.map {
case (eigValue, eigVector) =>
(1.0 / eigValue, eigVector)
}
uTimesDiagTimesUT(evdInverse, cutoff)
}
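// Worked example (2x2 diagonal, exact): for the matrix [[2, 0], [0, 4]] the full EVD is
// Seq((2.0, Seq(0 -> 1.0)), (4.0, Seq(1 -> 1.0))), and getInverseFromEVD(evd, 0.0) returns
// Map(0 -> Map(0 -> 0.5), 1 -> Map(1 -> 0.25)), i.e. the inverse of the diagonal.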
}
object PCAProjectionMatrixAdhoc extends TwitterExecutionApp {
val log = Logger()
def job: Execution[Unit] =
Execution.getConfigMode.flatMap {
case (config, _) =>
Execution.withId { _ =>
val args = config.getArgs
val k = args.int("k", 100)
val ratioToLargestEntryInVectorCutoff = args.int("ratioToLargestEntryInVectorCutoff", 100)
val minClusterFavers = args.int("minClusterFavers", 1000)
val input = TypedPipe.from(AdhocKeyValSources.clusterDetailsSource(args("inputDir")))
val outputDir = args("outputDir")
val filteredClustersExec =
input
.collect {
case ((_, clusterId), details)
if details.numUsersWithNonZeroFavScore > minClusterFavers =>
clusterId
}
.toIterableExecution
.map { fc =>
val fcSet = fc.toSet
log.info("Number of clusters with favers more than %d is %d"
.format(minClusterFavers, fcSet.size))
fcSet
}
filteredClustersExec
.flatMap { filteredClusters =>
input.flatMap {
case ((_, clusterId), details) =>
if (filteredClusters(clusterId)) {
details.neighborClusters.getOrElse(Nil).collect {
case neighbor
if filteredClusters(
neighbor.clusterId) && neighbor.favCosineSimilarity.isDefined =>
(clusterId, neighbor.clusterId, neighbor.favCosineSimilarity.get)
}
} else Nil
}.toIterableExecution
}
.flatMap { edgesIter =>
val edges = edgesIter.toSeq
val oldIdToNewId = edges
.flatMap { case (i, j, _) => Seq(i, j) }
.distinct
.zipWithIndex
.toMap
val mapString = oldIdToNewId.toList
.take(5).map {
case (old, nw) =>
Seq(old, nw).mkString(" ")
}.mkString("\n")
log.info("A few entries of OldId to NewId map is")
log.info(mapString)
val newIdToOldId = oldIdToNewId.map(_.swap)
log.info(
"Num clusters after filtering out those with no neighbors with favers more than %d is %d"
.format(minClusterFavers, oldIdToNewId.size))
val newEdges = edges.map {
case (oldI, oldJ, value) =>
(oldIdToNewId(oldI), oldIdToNewId(oldJ), value)
}
log.info("Going to build matrix")
val matrix = EigenVectorsForSparseSymmetric.getMatrix(
newEdges,
oldIdToNewId.size,
oldIdToNewId.size)
EigenVectorsForSparseSymmetric.ensureMatrixIsSymmetric(matrix)
log.info("Going to solve now for %d eigenvalues".format(k))
val tic = System.currentTimeMillis()
val results = EigenVectorsForSparseSymmetric.getTruncatedEVD(
matrix,
k,
ratioToLargestEntryInVectorCutoff)
val toc = System.currentTimeMillis()
log.info("Finished solving in %.2f minutes".format((toc - tic) / 1000 / 60.0))
val eigValues = results.map(_._1).map { x => "%.3g".format(x) }.mkString(" ")
val eigValueNorm = math.sqrt(results.map(_._1).map(x => x * x).sum)
val matrixNorm = math.sqrt(matrix.iterator().asScala.map(_.get()).map(x => x * x).sum)
println(
"matrixNorm %s, eigValueNorm %s, explained fraction %s"
.format(matrixNorm, eigValueNorm, eigValueNorm / matrixNorm))
log.info("The eigenvalues are:")
log.info(eigValues)
val nnzInEigenVectors = results.map(_._2.size).sum
log.info("Average nnz per eigenvector using ratioToLargestCutoff %d is %.2g"
.format(ratioToLargestEntryInVectorCutoff, nnzInEigenVectors * 1.0 / results.size))
val transposedRaw = results.zipWithIndex.flatMap {
case ((_, eigVector), eigIndex) =>
eigVector.map {
case (index, vectorEntry) =>
val clusterId = newIdToOldId(index)
Map(clusterId -> List((eigIndex, vectorEntry)))
}
}
val transposed = Monoid.sum(transposedRaw).mapValues { rowForCluster =>
rowForCluster
.map {
case (dimId, weight) =>
"%d:%.2g".format(dimId, weight)
}.mkString(" ")
}
TypedPipe.from(transposed.toSeq).writeExecution(TypedTsv(outputDir))
}
}
}
}

View File

@ -1,332 +0,0 @@
package com.twitter.simclusters_v2.scalding
import com.twitter.dal.client.dataset.KeyValDALDataset
import com.twitter.dal.client.dataset.SnapshotDALDataset
import com.twitter.scalding._
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.DALWrite.D
import com.twitter.scalding_internal.dalv2.DALWrite.WriteExtension
import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossClusterSameDC
import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossDC
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.simclusters_v2.common.ClusterId
import com.twitter.simclusters_v2.common.ModelVersions
import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.hdfs_sources.AdhocKeyValSources
import com.twitter.simclusters_v2.hdfs_sources.AggregatableProducerSimclustersEmbeddingsByLogFavScore2020ScalaDataset
import com.twitter.simclusters_v2.hdfs_sources.SimclustersV2InterestedInFromAggregatableProducerEmbeddings20M145K2020ScalaDataset
import com.twitter.simclusters_v2.hdfs_sources.SimclustersV2UserToInterestedInFromAggregatableProducerEmbeddings20M145K2020ScalaDataset
import com.twitter.simclusters_v2.hdfs_sources.UserAndNeighborsFixedPathSource
import com.twitter.simclusters_v2.hdfs_sources.UserUserNormalizedGraphScalaDataset
import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn
import com.twitter.simclusters_v2.thriftscala.InternalId
import com.twitter.simclusters_v2.thriftscala.ModelVersion
import com.twitter.simclusters_v2.thriftscala.SimClustersEmbedding
import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId
import com.twitter.simclusters_v2.thriftscala.UserAndNeighbors
import com.twitter.simclusters_v2.thriftscala.UserToInterestedInClusterScores
import com.twitter.simclusters_v2.thriftscala.UserToInterestedInClusters
import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp
import com.twitter.wtf.scalding.jobs.common.ScheduledExecutionApp
import java.util.TimeZone
/**
* Production job for computing interestedIn data set from the aggregatable producer embeddings for the model version 20M145K2020.
* It writes the data set in KeyVal format to produce a MH DAL data set.
*
* A high level description of this job:
* - Read the APE dataset
* - Apply log1p to the scores from the above dataset, since the raw producer scores are large
* - Normalize the scores for each producer (offline benchmarking has shown better results from this step; a minimal sketch follows this comment)
* - Truncate the number of clusters for each producer from the APE dataset to reduce noise
* - Compute interestedIn
*
* To deploy the job:
*
* capesospy-v2 update --build_locally --start_cron interested_in_from_ape_2020 \
* src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml
*/
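// A minimal sketch (not the production code, which lives in
// InterestedInFromAggregatableProducerEmbeddingsBase.run) of the log1p + normalize step described
// above, for a single producer's cluster scores. The L2 normalization here is an assumption for
// illustration; the exact normalization is defined in the run helper.
// {{{
//   def log1pAndNormalize(scores: Seq[(Int, Double)]): Seq[(Int, Double)] = {
//     val damped = scores.map { case (clusterId, score) => (clusterId, math.log1p(score)) }
//     val norm = math.sqrt(damped.map { case (_, s) => s * s }.sum)
//     if (norm > 0) damped.map { case (clusterId, s) => (clusterId, s / norm) } else damped
//   }
//   // log1pAndNormalize(Seq(1 -> 100.0, 2 -> 10.0)) ~= Seq(1 -> 0.887, 2 -> 0.461)
// }}}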
object InterestedInFromAPE2020BatchApp extends InterestedInFromAggregatableProducerEmbeddingsBase {
override val firstTime: RichDate = RichDate("2021-03-03")
override val batchIncrement: Duration = Days(7)
override def modelVersion: ModelVersion = ModelVersion.Model20m145k2020
override def producerEmbeddingsInputKVDataset: KeyValDALDataset[
KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]
] = AggregatableProducerSimclustersEmbeddingsByLogFavScore2020ScalaDataset
override def interestedInFromAPEOutputKVDataset: KeyValDALDataset[
KeyVal[UserId, ClustersUserIsInterestedIn]
] = SimclustersV2InterestedInFromAggregatableProducerEmbeddings20M145K2020ScalaDataset
override def interestedInFromAPEOutputThriftDatset: SnapshotDALDataset[
UserToInterestedInClusters
] = SimclustersV2UserToInterestedInFromAggregatableProducerEmbeddings20M145K2020ScalaDataset
}
trait InterestedInFromAggregatableProducerEmbeddingsBase extends ScheduledExecutionApp {
def modelVersion: ModelVersion
def interestedInFromAPEOutputKVDataset: KeyValDALDataset[
KeyVal[UserId, ClustersUserIsInterestedIn]
]
def producerEmbeddingsInputKVDataset: KeyValDALDataset[
KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]
]
def interestedInFromAPEOutputThriftDatset: SnapshotDALDataset[UserToInterestedInClusters]
override def runOnDateRange(
args: Args
)(
implicit dateRange: DateRange,
timeZone: TimeZone,
uniqueID: UniqueID
): Execution[Unit] = {
//Input args for the run
val socialProofThreshold = args.int("socialProofThreshold", 2)
val maxClustersFromProducer = args.int("maxClustersPerProducer", 5)
val maxClustersPerUserFinalResult = args.int("maxInterestedInClustersPerUser", 200)
//Path variables
val interestedInFromProducersPath =
s"/user/cassowary/manhattan_sequence_files/interested_in_from_ape/" + modelVersion
val interestedInFromProducersThriftPath =
s"/user/cassowary/manhattan_sequence_files/interested_in_from_ape_thrift/" + modelVersion
val userUserGraph: TypedPipe[UserAndNeighbors] =
DAL
.readMostRecentSnapshotNoOlderThan(UserUserNormalizedGraphScalaDataset, Days(30))
.withRemoteReadPolicy(AllowCrossDC)
.toTypedPipe
val producerEmbeddings = DAL
.readMostRecentSnapshotNoOlderThan(
producerEmbeddingsInputKVDataset,
Days(30)).withRemoteReadPolicy(AllowCrossClusterSameDC).toTypedPipe.map {
case KeyVal(producer, embeddings) => (producer, embeddings)
}
val result = InterestedInFromAggregatableProducerEmbeddingsBase.run(
userUserGraph,
producerEmbeddings,
maxClustersFromProducer,
socialProofThreshold,
maxClustersPerUserFinalResult,
modelVersion)
val keyValExec =
result
.map { case (userId, clusters) => KeyVal(userId, clusters) }
.writeDALVersionedKeyValExecution(
interestedInFromAPEOutputKVDataset,
D.Suffix(interestedInFromProducersPath)
)
val thriftExec =
result
.map {
case (userId, clusters) =>
UserToInterestedInClusters(
userId,
ModelVersions.toKnownForModelVersion(modelVersion),
clusters.clusterIdToScores)
}
.writeDALSnapshotExecution(
interestedInFromAPEOutputThriftDataset,
D.Daily,
D.Suffix(interestedInFromProducersThriftPath),
D.EBLzo(),
dateRange.end
)
Execution.zip(keyValExec, thriftExec).unit
}
}
/**
* Adhoc job to generate the interestedIn from aggregatable producer embeddings for the model version 20M145K2020
*
* scalding remote run \
* --user cassowary \
* --keytab /var/lib/tss/keys/fluffy/keytabs/client/cassowary.keytab \
 * --principal service_account@TWITTER.BIZ \
* --cluster bluebird-qus1 \
* --main-class com.twitter.simclusters_v2.scalding.InterestedInFromAPE2020AdhocApp \
* --target src/scala/com/twitter/simclusters_v2/scalding:interested_in_from_ape_2020-adhoc \
* --hadoop-properties "mapreduce.map.memory.mb=8192 mapreduce.map.java.opts='-Xmx7618M' mapreduce.reduce.memory.mb=8192 mapreduce.reduce.java.opts='-Xmx7618M'" \
* -- --outputDir /gcs/user/cassowary/adhoc/your_ldap/interested_in_from_ape_2020_keyval --date 2021-03-05
*/
object InterestedInFromAPE2020AdhocApp extends AdhocExecutionApp {
override def runOnDateRange(
args: Args
)(
implicit dateRange: DateRange,
timeZone: TimeZone,
uniqueID: UniqueID
): Execution[Unit] = {
val outputDir = args("outputDir")
val socialProofThreshold = args.int("socialProofThreshold", 2)
val maxClustersPerUserFinalResult = args.int("maxInterestedInClustersPerUser", 200)
val maxClustersFromProducer = args.int("maxClustersFromProducer", 5)
val inputGraph = args.optional("graphInputDir") match {
case Some(inputDir) => TypedPipe.from(UserAndNeighborsFixedPathSource(inputDir))
case None =>
DAL
.readMostRecentSnapshotNoOlderThan(UserUserNormalizedGraphScalaDataset, Days(30))
.withRemoteReadPolicy(AllowCrossClusterSameDC)
.toTypedPipe
}
val producerEmbeddings = DAL
.readMostRecentSnapshotNoOlderThan(
AggregatableProducerSimclustersEmbeddingsByLogFavScore2020ScalaDataset,
Days(30)).withRemoteReadPolicy(AllowCrossClusterSameDC).toTypedPipe.map {
case KeyVal(producer, embeddings) => (producer, embeddings)
}
val result = InterestedInFromAggregatableProducerEmbeddingsBase.run(
inputGraph,
producerEmbeddings,
maxClustersFromProducer,
socialProofThreshold,
maxClustersPerUserFinalResult,
ModelVersion.Model20m145k2020)
result
.writeExecution(AdhocKeyValSources.interestedInSource(outputDir))
}
}
/**
* Helper functions
*/
object InterestedInFromAggregatableProducerEmbeddingsBase {
/**
* Helper function to prune the embeddings
* @param embeddingsWithScore embeddings
* @param maxClusters number of clusters to keep, per userId
* @param uniqueId for stats
* @return
*/
def getPrunedEmbeddings(
embeddingsWithScore: TypedPipe[(UserId, Seq[(ClusterId, Float)])],
maxClusters: Int
)(
implicit uniqueId: UniqueID
): TypedPipe[(UserId, Array[(ClusterId, Float)])] = {
val numProducerMappings = Stat("num_producer_embeddings_total")
val numProducersWithLargeClusterMappings = Stat(
"num_producers_with_more_clusters_than_threshold")
val numProducersWithSmallClusterMappings = Stat(
"num_producers_with_clusters_less_than_threshold")
val totalClustersCoverageProducerEmbeddings = Stat("num_clusters_total_producer_embeddings")
embeddingsWithScore.map {
case (producerId, clusterArray) =>
numProducerMappings.inc()
val clusterSize = clusterArray.size
totalClustersCoverageProducerEmbeddings.incBy(clusterSize)
val prunedList = if (clusterSize > maxClusters) {
numProducersWithLargeClusterMappings.inc()
clusterArray
.sortBy {
case (_, knownForScore) => -knownForScore
}.take(maxClusters)
} else {
numProducersWithSmallClusterMappings.inc()
clusterArray
}
(producerId, prunedList.toArray)
}
}
/**
 * Helper function to remove all scores except follow and logFav
* @param interestedInResult interestedIn clusters for a user
* @return
*/
def getInterestedInDiscardScores(
interestedInResult: TypedPipe[(UserId, List[(ClusterId, UserToInterestedInClusterScores)])]
): TypedPipe[(UserId, List[(ClusterId, UserToInterestedInClusterScores)])] = {
interestedInResult.map {
case (srcId, fullClusterList) =>
val fullClusterListWithDiscardedScores = fullClusterList.map {
case (clusterId, clusterDetails) =>
val clusterDetailsWithoutSocial = UserToInterestedInClusterScores(
// We are not planning to use the other scores except for logFav and follow.
// Hence, we set the others to None for now; we can add them back when needed.
followScore = clusterDetails.followScore,
logFavScore = clusterDetails.logFavScore,
logFavScoreClusterNormalizedOnly = clusterDetails.logFavScoreClusterNormalizedOnly
)
(clusterId, clusterDetailsWithoutSocial)
}
(srcId, fullClusterListWithDiscardedScores)
}
}
/**
* Helper function to normalize the embeddings
* @param embeddings cluster embeddings
* @return
*/
def getNormalizedEmbeddings(
embeddings: TypedPipe[(UserId, Seq[(ClusterId, Float)])]
): TypedPipe[(UserId, Seq[(ClusterId, Float)])] = {
embeddings.map {
case (userId, clustersWithScores) =>
val l2norm = math.sqrt(clustersWithScores.map(_._2).map(score => score * score).sum)
(
userId,
clustersWithScores.map {
case (clusterId, score) => (clusterId, (score / l2norm).toFloat)
})
}
}
def run(
userUserGraph: TypedPipe[UserAndNeighbors],
producerEmbeddings: TypedPipe[(SimClustersEmbeddingId, SimClustersEmbedding)],
maxClustersFromProducer: Int,
socialProofThreshold: Int,
maxClustersPerUserFinalResult: Int,
modelVersion: ModelVersion
)(
implicit uniqueId: UniqueID
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
import InterestedInFromKnownFor._
val producerEmbeddingsWithScore: TypedPipe[(UserId, Seq[(ClusterId, Float)])] =
producerEmbeddings.map {
case (
SimClustersEmbeddingId(embeddingType, modelVersion, InternalId.UserId(producerId)),
simclusterEmbedding) =>
(
producerId,
simclusterEmbedding.embedding.map { simclusterWithScore =>
// The APE dataset has very high producer scores, hence we apply log to smooth them out
// before computing interestedIn
(simclusterWithScore.clusterId, math.log(1.0 + simclusterWithScore.score).toFloat)
})
}
val result = keepOnlyTopClusters(
getInterestedInDiscardScores(
attachNormalizedScores(
userClusterPairsWithoutNormalization(
userUserGraph,
getPrunedEmbeddings(
getNormalizedEmbeddings(producerEmbeddingsWithScore),
maxClustersFromProducer),
socialProofThreshold,
))),
maxClustersPerUserFinalResult,
ModelVersions.toKnownForModelVersion(modelVersion)
)
result
}
}
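/**
 * A minimal, framework-free sketch of the score preparation described above, assuming a toy
 * producer whose cluster ids and raw APE scores are made up: apply log(1 + score) to smooth the
 * high producer scores, L2-normalize per producer, then keep only the top clusters. It mirrors
 * what run() does with getNormalizedEmbeddings and getPrunedEmbeddings, but on plain collections
 * rather than TypedPipes.
 */
object ApeScorePreparationSketch {
  def prepareScores(clusters: Seq[(Int, Double)], maxClusters: Int): Seq[(Int, Float)] = {
    // log1p smoothing, since the raw APE producer scores are high
    val smoothed = clusters.map { case (clusterId, score) => (clusterId, math.log(1.0 + score)) }
    // L2-normalize the smoothed scores for this producer
    val l2norm = math.sqrt(smoothed.map { case (_, score) => score * score }.sum)
    val normalized = smoothed.map { case (clusterId, score) => (clusterId, (score / l2norm).toFloat) }
    // keep only the highest-scoring clusters to reduce noise
    normalized.sortBy { case (_, score) => -score }.take(maxClusters)
  }
  def main(args: Array[String]): Unit = {
    // a toy producer known for clusters 1, 2 and 3 with raw scores 100, 10 and 1
    println(prepareScores(Seq((1, 100.0), (2, 10.0), (3, 1.0)), maxClusters = 2))
  }
}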

View File

@@ -1,666 +0,0 @@
package com.twitter.simclusters_v2.scalding
import com.twitter.algebird.Semigroup
import com.twitter.bijection.Injection
import com.twitter.dal.client.dataset.KeyValDALDataset
import com.twitter.scalding.TypedPipe
import com.twitter.scalding._
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.DALWrite._
import com.twitter.scalding_internal.job.TwitterExecutionApp
import com.twitter.scalding_internal.job.analytics_batch.AnalyticsBatchExecution
import com.twitter.scalding_internal.job.analytics_batch.AnalyticsBatchExecutionArgs
import com.twitter.scalding_internal.job.analytics_batch.BatchDescription
import com.twitter.scalding_internal.job.analytics_batch.BatchFirstTime
import com.twitter.scalding_internal.job.analytics_batch.BatchIncrement
import com.twitter.scalding_internal.job.analytics_batch.TwitterScheduledExecutionApp
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.simclusters_v2.common.ClusterId
import com.twitter.simclusters_v2.common.ModelVersions
import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.hdfs_sources._
import com.twitter.simclusters_v2.scalding.common.Util
import com.twitter.simclusters_v2.thriftscala._
/**
* This file implements the job for computing users' interestedIn vector from KnownFor data set.
*
* It reads the UserUserNormalizedGraphScalaDataset to get user-user follow + fav graph, and then
* based on the known-for clusters of each followed/faved user, we calculate how much a user is
* interestedIn a cluster.
*/
/**
* Production job for computing interestedIn data set for the model version 20M145K2020.
*
* To deploy the job:
*
* capesospy-v2 update --build_locally --start_cron interested_in_for_20M_145k_2020 \
 *   src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml
*/
object InterestedInFromKnownFor20M145K2020 extends InterestedInFromKnownForBatchBase {
override val firstTime: String = "2020-10-06"
override val outputKVDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsInterestedIn]] =
SimclustersV2RawInterestedIn20M145K2020ScalaDataset
override val outputPath: String = InternalDataPaths.RawInterestedIn2020Path
override val knownForModelVersion: String = ModelVersions.Model20M145K2020
override val knownForDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]] =
SimclustersV2KnownFor20M145K2020ScalaDataset
}
/**
 * Base class for the main logic of computing interestedIn from the KnownFor data set.
*/
trait InterestedInFromKnownForBatchBase extends TwitterScheduledExecutionApp {
implicit val tz = DateOps.UTC
implicit val parser = DateParser.default
def firstTime: String
val batchIncrement: Duration = Days(7)
val lookBackDays: Duration = Days(30)
def outputKVDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsInterestedIn]]
def outputPath: String
def knownForModelVersion: String
def knownForDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]]
private lazy val execArgs = AnalyticsBatchExecutionArgs(
batchDesc = BatchDescription(this.getClass.getName.replace("$", "")),
firstTime = BatchFirstTime(RichDate(firstTime)),
lastTime = None,
batchIncrement = BatchIncrement(batchIncrement)
)
override def scheduledJob: Execution[Unit] = AnalyticsBatchExecution(execArgs) {
implicit dateRange =>
Execution.withId { implicit uniqueId =>
Execution.withArgs { args =>
val normalizedGraph =
DAL.readMostRecentSnapshot(UserUserNormalizedGraphScalaDataset).toTypedPipe
val knownFor = KnownForSources.fromKeyVal(
DAL.readMostRecentSnapshot(knownForDALDataset, dateRange.extend(Days(30))).toTypedPipe,
knownForModelVersion
)
val socialProofThreshold = args.int("socialProofThreshold", 2)
val maxClustersPerUser = args.int("maxClustersPerUser", 50)
val result = InterestedInFromKnownFor
.run(
normalizedGraph,
knownFor,
socialProofThreshold,
maxClustersPerUser,
knownForModelVersion
)
val writeKeyValResultExec = result
.map { case (userId, clusters) => KeyVal(userId, clusters) }
.writeDALVersionedKeyValExecution(
outputKVDataset,
D.Suffix(outputPath)
)
// read previous data set for validation purpose
val previousDataset = if (RichDate(firstTime).timestamp != dateRange.start.timestamp) {
DAL
.readMostRecentSnapshot(outputKVDataset, dateRange.prepend(lookBackDays)).toTypedPipe
.map {
case KeyVal(user, interestedIn) =>
(user, interestedIn)
}
} else {
TypedPipe.empty
}
Util.printCounters(
Execution
.zip(
writeKeyValResultExec,
InterestedInFromKnownFor.dataSetStats(result, "NewResult"),
InterestedInFromKnownFor.dataSetStats(previousDataset, "OldResult")
).unit
)
}
}
}
}
/**
* Adhoc job to compute user interestedIn.
*
* scalding remote run --target src/scala/com/twitter/simclusters_v2/scalding:interested_in_adhoc \
* --user recos-platform \
* --submitter hadoopnest2.atla.twitter.com \
* --main-class com.twitter.simclusters_v2.scalding.InterestedInFromKnownForAdhoc -- \
* --date 2019-08-26 --outputDir /user/recos-platform/adhoc/simclusters_interested_in_log_fav
*/
object InterestedInFromKnownForAdhoc extends TwitterExecutionApp {
def job: Execution[Unit] =
Execution.getConfigMode.flatMap {
case (config, mode) =>
Execution.withId { implicit uniqueId =>
val args = config.getArgs
val normalizedGraph = TypedPipe.from(
UserAndNeighborsFixedPathSource(args("graphInputDir"))
)
val socialProofThreshold = args.int("socialProofThreshold", 2)
val maxClustersPerUser = args.int("maxClustersPerUser", 20)
val knownForModelVersion = args("knownForModelVersion")
val knownFor = KnownForSources.readKnownFor(args("knownForInputDir"))
val outputSink = AdhocKeyValSources.interestedInSource(args("outputDir"))
Util.printCounters(
InterestedInFromKnownFor
.run(
normalizedGraph,
knownFor,
socialProofThreshold,
maxClustersPerUser,
knownForModelVersion
).writeExecution(outputSink)
)
}
}
}
/**
* Adhoc job to check the output of an adhoc interestedInSource.
*/
object DumpInterestedInAdhoc extends TwitterExecutionApp {
def job: Execution[Unit] =
Execution.getConfigMode.flatMap {
case (config, mode) =>
Execution.withId { implicit uniqueId =>
val args = config.getArgs
val users = args.list("users").map(_.toLong).toSet
val input = TypedPipe.from(AdhocKeyValSources.interestedInSource(args("inputDir")))
input.filter { case (userId, rec) => users.contains(userId) }.toIterableExecution.map {
s => println(s.map(Util.prettyJsonMapper.writeValueAsString).mkString("\n"))
}
}
}
}
/**
* Helper functions
*/
object InterestedInFromKnownFor {
private def ifNanMake0(x: Double): Double = if (x.isNaN) 0.0 else x
case class SrcClusterIntermediateInfo(
followScore: Double,
followScoreProducerNormalized: Double,
favScore: Double,
favScoreProducerNormalized: Double,
logFavScore: Double,
logFavScoreProducerNormalized: Double,
followSocialProof: List[Long],
favSocialProof: List[Long]) {
// overriding for the sake of unit tests
override def equals(obj: scala.Any): Boolean = {
obj match {
case that: SrcClusterIntermediateInfo =>
math.abs(followScore - that.followScore) < 1e-5 &&
math.abs(followScoreProducerNormalized - that.followScoreProducerNormalized) < 1e-5 &&
math.abs(favScore - that.favScore) < 1e-5 &&
math.abs(favScoreProducerNormalized - that.favScoreProducerNormalized) < 1e-5 &&
math.abs(logFavScore - that.logFavScore) < 1e-5 &&
math.abs(logFavScoreProducerNormalized - that.logFavScoreProducerNormalized) < 1e-5 &&
followSocialProof.toSet == that.followSocialProof.toSet &&
favSocialProof.toSet == that.favSocialProof.toSet
case _ => false
}
}
}
implicit object SrcClusterIntermediateInfoSemigroup
extends Semigroup[SrcClusterIntermediateInfo] {
override def plus(
left: SrcClusterIntermediateInfo,
right: SrcClusterIntermediateInfo
): SrcClusterIntermediateInfo = {
SrcClusterIntermediateInfo(
followScore = left.followScore + right.followScore,
followScoreProducerNormalized =
left.followScoreProducerNormalized + right.followScoreProducerNormalized,
favScore = left.favScore + right.favScore,
favScoreProducerNormalized =
left.favScoreProducerNormalized + right.favScoreProducerNormalized,
logFavScore = left.logFavScore + right.logFavScore,
logFavScoreProducerNormalized =
left.logFavScoreProducerNormalized + right.logFavScoreProducerNormalized,
followSocialProof =
Semigroup.plus(left.followSocialProof, right.followSocialProof).distinct,
favSocialProof = Semigroup.plus(left.favSocialProof, right.favSocialProof).distinct
)
}
}
/**
* @param adjacencyLists User-User follow/fav graph
* @param knownFor KnownFor data set. Each user can be known for several clusters with certain
* knownFor weights.
 * @param socialProofThreshold A user will only be interested in a cluster if they follow/fav at
 * least a certain number of users known for this cluster.
 * @param uniqueId required for the Stat counters
* @return
*/
def userClusterPairsWithoutNormalization(
adjacencyLists: TypedPipe[UserAndNeighbors],
knownFor: TypedPipe[(Long, Array[(Int, Float)])],
socialProofThreshold: Int
)(
implicit uniqueId: UniqueID
): TypedPipe[((Long, Int), SrcClusterIntermediateInfo)] = {
val edgesToUsersWithKnownFor = Stat("num_edges_to_users_with_known_for")
val srcDestClusterTriples = Stat("num_src_dest_cluster_triples")
val srcClusterPairsBeforeSocialProofThresholding =
Stat("num_src_cluster_pairs_before_social_proof_thresholding")
val srcClusterPairsAfterSocialProofThresholding =
Stat("num_src_cluster_pairs_after_social_proof_thresholding")
val edges = adjacencyLists.flatMap {
case UserAndNeighbors(srcId, neighborsWithWeights) =>
neighborsWithWeights.map { neighborWithWeights =>
(
neighborWithWeights.neighborId,
neighborWithWeights.copy(neighborId = srcId)
)
}
}
implicit val l2b: Long => Array[Byte] = Injection.long2BigEndian
edges
.sketch(4000)
.join(knownFor)
.flatMap {
case (destId, (srcWithWeights, clusterArray)) =>
edgesToUsersWithKnownFor.inc()
clusterArray.toList.map {
case (clusterId, knownForScoreF) =>
val knownForScore = math.max(0.0, knownForScoreF.toDouble)
srcDestClusterTriples.inc()
val followScore =
if (srcWithWeights.isFollowed.contains(true)) knownForScore else 0.0
val followScoreProducerNormalizedOnly =
srcWithWeights.followScoreNormalizedByNeighborFollowersL2.getOrElse(
0.0) * knownForScore
val favScore =
srcWithWeights.favScoreHalfLife100Days.getOrElse(0.0) * knownForScore
val favScoreProducerNormalizedOnly =
srcWithWeights.favScoreHalfLife100DaysNormalizedByNeighborFaversL2.getOrElse(
0.0) * knownForScore
val logFavScore = srcWithWeights.logFavScore.getOrElse(0.0) * knownForScore
val logFavScoreProducerNormalizedOnly = srcWithWeights.logFavScoreL2Normalized
.getOrElse(0.0) * knownForScore
val followSocialProof = if (srcWithWeights.isFollowed.contains(true)) {
List(destId)
} else Nil
val favSocialProof = if (srcWithWeights.favScoreHalfLife100Days.exists(_ > 0)) {
List(destId)
} else Nil
(
(srcWithWeights.neighborId, clusterId),
SrcClusterIntermediateInfo(
followScore,
followScoreProducerNormalizedOnly,
favScore,
favScoreProducerNormalizedOnly,
logFavScore,
logFavScoreProducerNormalizedOnly,
followSocialProof,
favSocialProof
)
)
}
}
.sumByKey
.withReducers(10000)
.filter {
case ((_, _), SrcClusterIntermediateInfo(_, _, _, _, _, _, followProof, favProof)) =>
srcClusterPairsBeforeSocialProofThresholding.inc()
val distinctSocialProof = (followProof ++ favProof).toSet
val result = distinctSocialProof.size >= socialProofThreshold
if (result) {
srcClusterPairsAfterSocialProofThresholding.inc()
}
result
}
}
/**
* Add the cluster-level l2 norm scores, and use them to normalize follow/fav scores.
*/
def attachNormalizedScores(
intermediate: TypedPipe[((Long, Int), SrcClusterIntermediateInfo)]
)(
implicit uniqueId: UniqueID
): TypedPipe[(Long, List[(Int, UserToInterestedInClusterScores)])] = {
def square(x: Double): Double = x * x
val clusterCountsAndNorms =
intermediate
.map {
case (
(_, clusterId),
SrcClusterIntermediateInfo(
followScore,
followScoreProducerNormalizedOnly,
favScore,
favScoreProducerNormalizedOnly,
logFavScore,
logFavScoreProducerNormalizedOnly,
_,
_
)
) =>
(
clusterId,
(
1,
square(followScore),
square(followScoreProducerNormalizedOnly),
square(favScore),
square(favScoreProducerNormalizedOnly),
square(logFavScore),
square(logFavScoreProducerNormalizedOnly)
)
)
}
.sumByKey
// .withReducers(100)
.map {
case (
clusterId,
(
cnt,
squareFollowScore,
squareFollowScoreProducerNormalizedOnly,
squareFavScore,
squareFavScoreProducerNormalizedOnly,
squareLogFavScore,
squareLogFavScoreProducerNormalizedOnly
)) =>
(
clusterId,
(
cnt,
math.sqrt(squareFollowScore),
math.sqrt(squareFollowScoreProducerNormalizedOnly),
math.sqrt(squareFavScore),
math.sqrt(squareFavScoreProducerNormalizedOnly),
math.sqrt(squareLogFavScore),
math.sqrt(squareLogFavScoreProducerNormalizedOnly)
))
}
implicit val i2b: Int => Array[Byte] = Injection.int2BigEndian
intermediate
.map {
case ((srcId, clusterId), clusterScoresTuple) =>
(clusterId, (srcId, clusterScoresTuple))
}
.sketch(reducers = 900)
.join(clusterCountsAndNorms)
.map {
case (
clusterId,
(
(
srcId,
SrcClusterIntermediateInfo(
followScore,
followScoreProducerNormalizedOnly,
favScore,
favScoreProducerNormalizedOnly,
logFavScore,
logFavScoreProducerNormalizedOnly, // not used for now
followProof,
favProof
)
),
(
cnt,
followNorm,
followProducerNormalizedNorm,
favNorm,
favProducerNormalizedNorm,
logFavNorm,
logFavProducerNormalizedNorm // not used for now
)
)
) =>
(
srcId,
List(
(
clusterId,
UserToInterestedInClusterScores(
followScore = Some(ifNanMake0(followScore)),
followScoreClusterNormalizedOnly = Some(ifNanMake0(followScore / followNorm)),
followScoreProducerNormalizedOnly =
Some(ifNanMake0(followScoreProducerNormalizedOnly)),
followScoreClusterAndProducerNormalized = Some(
ifNanMake0(followScoreProducerNormalizedOnly / followProducerNormalizedNorm)),
favScore = Some(ifNanMake0(favScore)),
favScoreClusterNormalizedOnly = Some(ifNanMake0(favScore / favNorm)),
favScoreProducerNormalizedOnly = Some(ifNanMake0(favScoreProducerNormalizedOnly)),
favScoreClusterAndProducerNormalized =
Some(ifNanMake0(favScoreProducerNormalizedOnly / favProducerNormalizedNorm)),
usersBeingFollowed = Some(followProof),
usersThatWereFaved = Some(favProof),
numUsersInterestedInThisClusterUpperBound = Some(cnt),
logFavScore = Some(ifNanMake0(logFavScore)),
logFavScoreClusterNormalizedOnly = Some(ifNanMake0(logFavScore / logFavNorm))
))
)
)
}
.sumByKey
// .withReducers(1000)
.toTypedPipe
}
/**
 * Aggregate cluster scores for each user; to be used instead of attachNormalizedScores
 * when we do not want to compute cluster-level L2 norm scores
*/
def groupClusterScores(
intermediate: TypedPipe[((Long, Int), SrcClusterIntermediateInfo)]
)(
implicit uniqueId: UniqueID
): TypedPipe[(Long, List[(Int, UserToInterestedInClusterScores)])] = {
intermediate
.map {
case (
(srcId, clusterId),
SrcClusterIntermediateInfo(
followScore,
followScoreProducerNormalizedOnly,
favScore,
favScoreProducerNormalizedOnly,
logFavScore,
logFavScoreProducerNormalizedOnly,
followProof,
favProof
)
) =>
(
srcId,
List(
(
clusterId,
UserToInterestedInClusterScores(
followScore = Some(ifNanMake0(followScore)),
followScoreProducerNormalizedOnly =
Some(ifNanMake0(followScoreProducerNormalizedOnly)),
favScore = Some(ifNanMake0(favScore)),
favScoreProducerNormalizedOnly = Some(ifNanMake0(favScoreProducerNormalizedOnly)),
usersBeingFollowed = Some(followProof),
usersThatWereFaved = Some(favProof),
logFavScore = Some(ifNanMake0(logFavScore)),
))
)
)
}
.sumByKey
.withReducers(1000)
.toTypedPipe
}
/**
* For each user, only keep up to a certain number of clusters.
* @param allInterests user with a list of interestedIn clusters.
* @param maxClustersPerUser number of clusters to keep for each user
* @param knownForModelVersion known for model version
 * @param uniqueId required for the Stat counters
* @return
*/
def keepOnlyTopClusters(
allInterests: TypedPipe[(Long, List[(Int, UserToInterestedInClusterScores)])],
maxClustersPerUser: Int,
knownForModelVersion: String
)(
implicit uniqueId: UniqueID
): TypedPipe[(Long, ClustersUserIsInterestedIn)] = {
val userClusterPairsBeforeUserTruncation =
Stat("num_user_cluster_pairs_before_user_truncation")
val userClusterPairsAfterUserTruncation =
Stat("num_user_cluster_pairs_after_user_truncation")
val usersWithALotOfClusters =
Stat(s"num_users_with_more_than_${maxClustersPerUser}_clusters")
allInterests
.map {
case (srcId, fullClusterList) =>
userClusterPairsBeforeUserTruncation.incBy(fullClusterList.size)
val truncatedClusters = if (fullClusterList.size > maxClustersPerUser) {
usersWithALotOfClusters.inc()
fullClusterList
.sortBy {
case (_, clusterScores) =>
(
-clusterScores.favScore.getOrElse(0.0),
-clusterScores.logFavScore.getOrElse(0.0),
-clusterScores.followScore.getOrElse(0.0),
-clusterScores.logFavScoreClusterNormalizedOnly.getOrElse(0.0),
-clusterScores.followScoreProducerNormalizedOnly.getOrElse(0.0)
)
}
.take(maxClustersPerUser)
} else {
fullClusterList
}
userClusterPairsAfterUserTruncation.incBy(truncatedClusters.size)
(srcId, ClustersUserIsInterestedIn(knownForModelVersion, truncatedClusters.toMap))
}
}
def run(
adjacencyLists: TypedPipe[UserAndNeighbors],
knownFor: TypedPipe[(UserId, Array[(ClusterId, Float)])],
socialProofThreshold: Int,
maxClustersPerUser: Int,
knownForModelVersion: String
)(
implicit uniqueId: UniqueID
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
keepOnlyTopClusters(
attachNormalizedScores(
userClusterPairsWithoutNormalization(
adjacencyLists,
knownFor,
socialProofThreshold
)
),
maxClustersPerUser,
knownForModelVersion
)
}
/**
 * Run the interestedIn job; cluster-normalized scores are not attached to the user's clusters.
*/
def runWithoutClusterNormalizedScores(
adjacencyLists: TypedPipe[UserAndNeighbors],
knownFor: TypedPipe[(UserId, Array[(ClusterId, Float)])],
socialProofThreshold: Int,
maxClustersPerUser: Int,
knownForModelVersion: String
)(
implicit uniqueId: UniqueID
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
keepOnlyTopClusters(
groupClusterScores(
userClusterPairsWithoutNormalization(
adjacencyLists,
knownFor,
socialProofThreshold
)
),
maxClustersPerUser,
knownForModelVersion
)
}
/**
 * Print out some basic stats of the data set to make sure things are not broken.
*/
def dataSetStats(
interestedInData: TypedPipe[(UserId, ClustersUserIsInterestedIn)],
dataSetName: String = ""
): Execution[Unit] = {
Execution
.zip(
Util.printSummaryOfNumericColumn(
interestedInData.map {
case (user, interestedIn) =>
interestedIn.clusterIdToScores.size
},
Some(s"$dataSetName UserInterestedIn Size")
),
Util.printSummaryOfNumericColumn(
interestedInData.flatMap {
case (user, interestedIn) =>
interestedIn.clusterIdToScores.map {
case (_, scores) =>
scores.favScore.getOrElse(0.0)
}
},
Some(s"$dataSetName UserInterestedIn favScore")
),
Util.printSummaryOfNumericColumn(
interestedInData.flatMap {
case (user, interestedIn) =>
interestedIn.clusterIdToScores.map {
case (_, scores) =>
scores.favScoreClusterNormalizedOnly.getOrElse(0.0)
}
},
Some(s"$dataSetName UserInterestedIn favScoreClusterNormalizedOnly")
),
Util.printSummaryOfNumericColumn(
interestedInData.flatMap {
case (user, interestedIn) =>
interestedIn.clusterIdToScores.map {
case (_, scores) =>
scores.logFavScoreClusterNormalizedOnly.getOrElse(0.0)
}
},
Some(s"$dataSetName UserInterestedIn logFavScoreClusterNormalizedOnly")
)
).unit
}
}
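/**
 * A self-contained sketch, using made-up scores, of how the implicit
 * SrcClusterIntermediateInfoSemigroup above combines two per-edge contributions for the same
 * (user, cluster) key during sumByKey: the scores are summed and the follow/fav social-proof
 * lists are concatenated with duplicates removed.
 */
object SrcClusterIntermediateInfoSemigroupSketch {
  import InterestedInFromKnownFor._
  def main(args: Array[String]): Unit = {
    // contribution from a producer the user follows
    val fromFollowedProducer = SrcClusterIntermediateInfo(
      followScore = 0.8,
      followScoreProducerNormalized = 0.4,
      favScore = 0.0,
      favScoreProducerNormalized = 0.0,
      logFavScore = 0.0,
      logFavScoreProducerNormalized = 0.0,
      followSocialProof = List(101L),
      favSocialProof = Nil
    )
    // contribution from a producer the user faved
    val fromFavedProducer = SrcClusterIntermediateInfo(
      followScore = 0.0,
      followScoreProducerNormalized = 0.0,
      favScore = 0.5,
      favScoreProducerNormalized = 0.25,
      logFavScore = 0.3,
      logFavScoreProducerNormalized = 0.15,
      followSocialProof = Nil,
      favSocialProof = List(102L)
    )
    // scores add up; the social-proof lists are merged and de-duplicated
    println(Semigroup.plus(fromFollowedProducer, fromFavedProducer))
  }
}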

View File

@@ -1,354 +0,0 @@
package com.twitter.simclusters_v2.scalding
import com.twitter.algebird.Semigroup
import com.twitter.bijection.Injection
import com.twitter.dal.client.dataset.KeyValDALDataset
import com.twitter.scalding._
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.DALWrite.{D, WriteExtension}
import com.twitter.scalding_internal.job.TwitterExecutionApp
import com.twitter.scalding_internal.job.analytics_batch.{
AnalyticsBatchExecution,
AnalyticsBatchExecutionArgs,
BatchDescription,
BatchFirstTime,
BatchIncrement,
TwitterScheduledExecutionApp
}
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.simclusters_v2.common.{ClusterId, ModelVersions, UserId}
import com.twitter.simclusters_v2.hdfs_sources.{
AdhocKeyValSources,
InternalDataPaths,
SimclustersV2KnownFor20M145K2020ScalaDataset,
SimclustersV2RawInterestedInLite20M145K2020ScalaDataset,
SimclustersV2RawInterestedIn20M145KUpdatedScalaDataset,
UserAndNeighborsFixedPathSource,
UserUserGraphScalaDataset
}
import com.twitter.simclusters_v2.scalding.common.Util
import com.twitter.simclusters_v2.thriftscala.{
ClustersUserIsInterestedIn,
ClustersUserIsKnownFor,
UserAndNeighbors,
UserToInterestedInClusterScores
}
import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp
import java.util.TimeZone
/**
* This file implements the job for computing users' interestedIn vector from KnownFor data set.
*
* It reads the UserUserGraphScalaDataset to get user-user follow + fav graph, and then
* based on the known-for clusters of each followed/faved user, we calculate how much a user is
* interestedIn a cluster.
*
 * The main differences between InterestedInFromKnownForLite and InterestedInFromKnownFor are
 * the following:
 * - We read the UserUserGraph dataset that does not contain the producer normalized scores
 * - We do not compute the cluster normalized scores for the clusters per user
 * - For social proof thresholding, we do not keep track of the entire list of follow and
 * fav social proofs, but rather make use of numFollowed and numFaved (this introduces
* some noise if follow and fav social proof contain the same users)
* - Store 200 clusters per user compared to 50 in IIKF
 * - Runs more frequently (every 2 days) compared to the weekly IIKF job
*/
/**
* Production job for computing interestedIn data set for the model version 20M145K2020.
*
* To deploy the job:
*
* capesospy-v2 update --build_locally --start_cron interested_in_lite_for_20M_145k_2020 \
 *   src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml
*/
object InterestedInFromKnownForLite20M145K2020 extends InterestedInFromKnownForLite {
override val firstTime: String = "2021-04-24"
override val outputKVDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsInterestedIn]] =
SimclustersV2RawInterestedInLite20M145K2020ScalaDataset
override val outputPath: String = InternalDataPaths.RawInterestedInLite2020Path
override val knownForModelVersion: String = ModelVersions.Model20M145K2020
override val knownForDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]] =
SimclustersV2KnownFor20M145K2020ScalaDataset
}
trait InterestedInFromKnownForLite extends TwitterScheduledExecutionApp {
implicit val tz = DateOps.UTC
implicit val parser = DateParser.default
def firstTime: String
val batchIncrement: Duration = Days(2)
val lookBackDays: Duration = Days(30)
def outputKVDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsInterestedIn]]
def outputPath: String
def knownForModelVersion: String
def knownForDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]]
private lazy val execArgs = AnalyticsBatchExecutionArgs(
batchDesc = BatchDescription(this.getClass.getName.replace("$", "")),
firstTime = BatchFirstTime(RichDate(firstTime)),
lastTime = None,
batchIncrement = BatchIncrement(batchIncrement)
)
override def scheduledJob: Execution[Unit] = AnalyticsBatchExecution(execArgs) {
implicit dateRange =>
Execution.withId { implicit uniqueId =>
Execution.withArgs { args =>
val userUserGraph =
DAL.readMostRecentSnapshot(UserUserGraphScalaDataset).toTypedPipe
val knownFor = KnownForSources.fromKeyVal(
DAL.readMostRecentSnapshot(knownForDALDataset, dateRange.extend(Days(30))).toTypedPipe,
knownForModelVersion
)
val socialProofThreshold = args.int("socialProofThreshold", 2)
val maxClustersPerUser = args.int("maxClustersPerUser", 200)
val result = InterestedInFromKnownForLite
.run(
userUserGraph,
knownFor,
socialProofThreshold,
maxClustersPerUser,
knownForModelVersion
)
val writeKeyValResultExec = result
.map {
case (userId, clusters) => KeyVal(userId, clusters)
}.writeDALVersionedKeyValExecution(
outputKVDataset,
D.Suffix(outputPath)
)
Util.printCounters(writeKeyValResultExec)
}
}
}
}
/**
* Adhoc job to compute user interestedIn.
*
* scalding remote run \
* --target src/scala/com/twitter/simclusters_v2/scalding:interested_in_lite_20m_145k_2020-adhoc \
* --main-class com.twitter.simclusters_v2.scalding.InterestedInFromKnownForLite20M145K2020Adhoc \
* --user cassowary --cluster bluebird-qus1 \
* --keytab /var/lib/tss/keys/fluffy/keytabs/client/cassowary.keytab \
 * --principal service_account@TWITTER.BIZ \
* -- \
* --outputDir /gcs/user/cassowary/adhoc/interested_in_from_knownfor_lite/ \
* --date 2020-08-25
*/
object InterestedInFromKnownForLite20M145K2020Adhoc extends AdhocExecutionApp {
override def runOnDateRange(
args: Args
)(
implicit dateRange: DateRange,
timeZone: TimeZone,
uniqueID: UniqueID
): Execution[Unit] = {
val userUserGraph = DAL.readMostRecentSnapshot(UserUserGraphScalaDataset).toTypedPipe
val socialProofThreshold = args.int("socialProofThreshold", 2)
val maxClustersPerUser = args.int("maxClustersPerUser", 200)
val knownForModelVersion = ModelVersions.Model20M145K2020
val knownFor = KnownForSources.fromKeyVal(
DAL
.readMostRecentSnapshotNoOlderThan(
SimclustersV2KnownFor20M145K2020ScalaDataset,
Days(30)).toTypedPipe,
knownForModelVersion
)
val outputSink = AdhocKeyValSources.interestedInSource(args("outputDir"))
Util.printCounters(
InterestedInFromKnownForLite
.run(
userUserGraph,
knownFor,
socialProofThreshold,
maxClustersPerUser,
knownForModelVersion
).writeExecution(outputSink)
)
}
}
object InterestedInFromKnownForLite {
private def ifNanMake0(x: Double): Double = if (x.isNaN) 0.0 else x
case class SrcClusterIntermediateInfo(
followScore: Double,
favScore: Double,
logFavScore: Double,
numFollowed: Int,
numFaved: Int) {
// helper function used for test cases
override def equals(obj: scala.Any): Boolean = {
obj match {
case that: SrcClusterIntermediateInfo =>
math.abs(followScore - that.followScore) < 1e-5 &&
math.abs(favScore - that.favScore) < 1e-5 &&
math.abs(logFavScore - that.logFavScore) < 1e-5 &&
numFollowed == that.numFollowed &&
numFaved == that.numFaved
case _ => false
}
}
}
implicit object SrcClusterIntermediateInfoSemigroup
extends Semigroup[SrcClusterIntermediateInfo] {
override def plus(
left: SrcClusterIntermediateInfo,
right: SrcClusterIntermediateInfo
): SrcClusterIntermediateInfo = {
SrcClusterIntermediateInfo(
followScore = left.followScore + right.followScore,
favScore = left.favScore + right.favScore,
logFavScore = left.logFavScore + right.logFavScore,
numFollowed = left.numFollowed + right.numFollowed,
numFaved = left.numFaved + right.numFaved
)
}
}
def run(
adjacencyLists: TypedPipe[UserAndNeighbors],
knownFor: TypedPipe[(UserId, Array[(ClusterId, Float)])],
socialProofThreshold: Int,
maxClustersPerUser: Int,
knownForModelVersion: String
)(
implicit uniqueId: UniqueID
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
InterestedInFromKnownFor.keepOnlyTopClusters(
groupClusterScores(
userClusterPairs(
adjacencyLists,
knownFor,
socialProofThreshold
)
),
maxClustersPerUser,
knownForModelVersion
)
}
def userClusterPairs(
adjacencyLists: TypedPipe[UserAndNeighbors],
knownFor: TypedPipe[(Long, Array[(Int, Float)])],
socialProofThreshold: Int
)(
implicit uniqueId: UniqueID
): TypedPipe[((Long, Int), SrcClusterIntermediateInfo)] = {
val edgesToUsersWithKnownFor = Stat("num_edges_to_users_with_known_for")
val srcDestClusterTriples = Stat("num_src_dest_cluster_triples")
val srcClusterPairsBeforeSocialProofThresholding =
Stat("num_src_cluster_pairs_before_social_proof_thresholding")
val srcClusterPairsAfterSocialProofThresholding =
Stat("num_src_cluster_pairs_after_social_proof_thresholding")
val edges = adjacencyLists.flatMap {
case UserAndNeighbors(srcId, neighborsWithWeights) =>
neighborsWithWeights.map { neighborWithWeights =>
(
neighborWithWeights.neighborId,
neighborWithWeights.copy(neighborId = srcId)
)
}
}
implicit val l2b: Long => Array[Byte] = Injection.long2BigEndian
edges
.sketch(4000)
.join(knownFor)
.flatMap {
case (destId, (srcWithWeights, clusterArray)) =>
edgesToUsersWithKnownFor.inc()
clusterArray.toList.map {
case (clusterId, knownForScoreF) =>
val knownForScore = math.max(0.0, knownForScoreF.toDouble)
srcDestClusterTriples.inc()
val followScore =
if (srcWithWeights.isFollowed.contains(true)) knownForScore else 0.0
val favScore =
srcWithWeights.favScoreHalfLife100Days.getOrElse(0.0) * knownForScore
val logFavScore = srcWithWeights.logFavScore.getOrElse(0.0) * knownForScore
val numFollowed = if (srcWithWeights.isFollowed.contains(true)) {
1
} else 0
val numFaved = if (srcWithWeights.favScoreHalfLife100Days.exists(_ > 0)) {
1
} else 0
(
(srcWithWeights.neighborId, clusterId),
SrcClusterIntermediateInfo(
followScore,
favScore,
logFavScore,
numFollowed,
numFaved
)
)
}
}
.sumByKey
.withReducers(10000)
.filter {
case ((_, _), SrcClusterIntermediateInfo(_, _, _, numFollowed, numFaved)) =>
srcClusterPairsBeforeSocialProofThresholding.inc()
// we do not remove duplicates
val socialProofSize = numFollowed + numFaved
val result = socialProofSize >= socialProofThreshold
if (result) {
srcClusterPairsAfterSocialProofThresholding.inc()
}
result
}
}
def groupClusterScores(
intermediate: TypedPipe[((Long, Int), SrcClusterIntermediateInfo)]
)(
implicit uniqueId: UniqueID
): TypedPipe[(Long, List[(Int, UserToInterestedInClusterScores)])] = {
implicit val i2b: Int => Array[Byte] = Injection.int2BigEndian
intermediate
.map {
case (
(srcId, clusterId),
SrcClusterIntermediateInfo(
followScore,
favScore,
logFavScore,
numFollowed,
numFaved
)) =>
(
srcId,
List(
(
clusterId,
UserToInterestedInClusterScores(
followScore = Some(ifNanMake0(followScore)),
favScore = Some(ifNanMake0(favScore)),
logFavScore = Some(ifNanMake0(logFavScore)),
numUsersBeingFollowed = Some(numFollowed),
numUsersThatWereFaved = Some(numFaved)
))
)
)
}
.sumByKey
// .withReducers(1000)
.toTypedPipe
}
}
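/**
 * A toy sketch, with made-up user ids, of the social-proof difference called out in the file
 * comment above: the full InterestedInFromKnownFor job thresholds on the number of distinct
 * users across the follow and fav social proofs, whereas the Lite variant thresholds on
 * numFollowed + numFaved, so one user who both follows and faves can pass a threshold of 2 in
 * Lite but not in the full job.
 */
object SocialProofThresholdingSketch {
  def main(args: Array[String]): Unit = {
    val socialProofThreshold = 2
    val followProof = List(101L) // user 101 follows a producer known for the cluster
    val favProof = List(101L) // the same user also faved that producer
    val keptByFullJob = (followProof ++ favProof).toSet.size >= socialProofThreshold // false
    val keptByLiteJob = (followProof.size + favProof.size) >= socialProofThreshold // true
    println(s"full job keeps the pair: $keptByFullJob, lite job keeps the pair: $keptByLiteJob")
  }
}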

View File

@@ -1,290 +0,0 @@
package com.twitter.simclusters_v2.scalding
import com.twitter.dal.client.dataset.KeyValDALDataset
import com.twitter.scalding.Execution
import com.twitter.scalding.TypedTsv
import com.twitter.scalding._
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.DALWrite._
import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation
import com.twitter.scalding_internal.dalv2.remote_access.ProcAtla
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.simclusters_v2.common.ModelVersions
import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.hdfs_sources.ProducerEmbeddingSources
import com.twitter.simclusters_v2.hdfs_sources.AdhocKeyValSources
import com.twitter.simclusters_v2.hdfs_sources.DataSources
import com.twitter.simclusters_v2.hdfs_sources.SimclustersV2InterestedInFromProducerEmbeddings20M145KUpdatedScalaDataset
import com.twitter.simclusters_v2.hdfs_sources.UserAndNeighborsFixedPathSource
import com.twitter.simclusters_v2.hdfs_sources.UserUserNormalizedGraphScalaDataset
import com.twitter.simclusters_v2.scalding.common.Util
import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn
import com.twitter.simclusters_v2.thriftscala.EmbeddingType
import com.twitter.simclusters_v2.thriftscala.SimClusterWithScore
import com.twitter.simclusters_v2.thriftscala.TopSimClustersWithScore
import com.twitter.simclusters_v2.thriftscala.UserToInterestedInClusterScores
import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp
import com.twitter.wtf.scalding.jobs.common.ScheduledExecutionApp
import java.util.TimeZone
import scala.util.Random
/**
* This file implements the job for computing users' interestedIn vector from the producerEmbeddings data set.
*
* It reads the UserUserNormalizedGraphScalaDataset to get user-user follow + fav graph, and then
* based on the producerEmbedding clusters of each followed/faved user, we calculate how much a user is
* interestedIn a cluster. To compute the engagement and determine the clusters for the user, we reuse
 * the functions defined in InterestedInFromKnownFor.
*
* Using producerEmbeddings instead of knownFor to obtain interestedIn increases the coverage (especially
* for medium and light users) and also the density of the cluster embeddings for the user.
*/
/**
* Adhoc job to generate the interestedIn from producer embeddings for the model version 20M145KUpdated
*
scalding remote run \
--target src/scala/com/twitter/simclusters_v2/scalding:interested_in_from_producer_embeddings \
--main-class com.twitter.simclusters_v2.scalding.InterestedInFromProducerEmbeddingsAdhocApp \
--user cassowary --cluster bluebird-qus1 \
--keytab /var/lib/tss/keys/fluffy/keytabs/client/cassowary.keytab \
--principal service_account@TWITTER.BIZ \
-- \
--outputDir /gcs/user/cassowary/adhoc/interested_in_from_prod_embeddings/ \
--date 2020-08-25 --typedTsv true
*/
object InterestedInFromProducerEmbeddingsAdhocApp extends AdhocExecutionApp {
override def runOnDateRange(
args: Args
)(
implicit dateRange: DateRange,
timeZone: TimeZone,
uniqueID: UniqueID
): Execution[Unit] = {
val outputDir = args("outputDir")
val inputGraph = args.optional("graphInputDir") match {
case Some(inputDir) => TypedPipe.from(UserAndNeighborsFixedPathSource(inputDir))
case None =>
DAL
.readMostRecentSnapshotNoOlderThan(UserUserNormalizedGraphScalaDataset, Days(30))
.toTypedPipe
}
val socialProofThreshold = args.int("socialProofThreshold", 2)
val maxClustersPerUserFinalResult = args.int("maxInterestedInClustersPerUser", 50)
val maxClustersFromProducer = args.int("maxClustersPerProducer", 25)
val typedTsvTag = args.boolean("typedTsv")
val embeddingType =
EmbeddingType.ProducerFavBasedSemanticCoreEntity
val modelVersion = ModelVersions.Model20M145KUpdated
val producerEmbeddings = ProducerEmbeddingSources
.producerEmbeddingSourceLegacy(embeddingType, ModelVersions.toModelVersion(modelVersion))(
dateRange.embiggen(Days(7)))
import InterestedInFromProducerEmbeddingsBatchApp._
val numProducerMappings = Stat("num_producer_embeddings_total")
val numProducersWithLargeClusterMappings = Stat(
"num_producers_with_more_clusters_than_threshold")
val numProducersWithSmallClusterMappings = Stat(
"num_producers_with_clusters_less_than_threshold")
val totalClustersCoverageProducerEmbeddings = Stat("num_clusters_total_producer_embeddings")
val producerEmbeddingsWithScore = producerEmbeddings.map {
case (userId: Long, topSimClusters: TopSimClustersWithScore) =>
(
userId,
topSimClusters.topClusters.toArray
.map {
case (simCluster: SimClusterWithScore) =>
(simCluster.clusterId, simCluster.score.toFloat)
}
)
}
val producerEmbeddingsPruned = producerEmbeddingsWithScore.map {
case (producerId, clusterArray) =>
numProducerMappings.inc()
val clusterSize = clusterArray.size
totalClustersCoverageProducerEmbeddings.incBy(clusterSize)
val prunedList = if (clusterSize > maxClustersFromProducer) {
numProducersWithLargeClusterMappings.inc()
clusterArray
.sortBy {
case (_, knownForScore) => -knownForScore
}.take(maxClustersFromProducer)
} else {
numProducersWithSmallClusterMappings.inc()
clusterArray
}
(producerId, prunedList)
}
val result = InterestedInFromKnownFor
.run(
inputGraph,
producerEmbeddingsPruned,
socialProofThreshold,
maxClustersPerUserFinalResult,
modelVersion
)
val resultWithoutSocial = getInterestedInDiscardSocial(result)
if (typedTsvTag) {
Util.printCounters(
resultWithoutSocial
.map {
case (userId: Long, clusters: ClustersUserIsInterestedIn) =>
(
userId,
clusters.clusterIdToScores.keys.toString()
)
}
.writeExecution(
TypedTsv(outputDir)
)
)
} else {
Util.printCounters(
resultWithoutSocial
.writeExecution(
AdhocKeyValSources.interestedInSource(outputDir)
)
)
}
}
}
/**
* Production job for computing interestedIn data set from the producer embeddings for the model version 20M145KUpdated.
 * It writes the data set in KeyVal format to produce an MH DAL data set.
*
* To deploy the job:
*
 * capesospy-v2 update --build_locally --start_cron interested_in_from_producer_embeddings \
 *   src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml
*/
object InterestedInFromProducerEmbeddingsBatchApp extends ScheduledExecutionApp {
override val firstTime: RichDate = RichDate("2019-11-01")
override val batchIncrement: Duration = Days(7)
def getPrunedEmbeddings(
producerEmbeddings: TypedPipe[(Long, TopSimClustersWithScore)],
maxClustersFromProducer: Int
): TypedPipe[(Long, TopSimClustersWithScore)] = {
producerEmbeddings.map {
case (producerId, producerClusters) =>
val prunedProducerClusters =
producerClusters.topClusters
.sortBy {
case simCluster => -simCluster.score.toFloat
}.take(maxClustersFromProducer)
(producerId, TopSimClustersWithScore(prunedProducerClusters, producerClusters.modelVersion))
}
}
def getInterestedInDiscardSocial(
interestedInFromProducersResult: TypedPipe[(UserId, ClustersUserIsInterestedIn)]
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
interestedInFromProducersResult.map {
case (srcId, fullClusterList) =>
val fullClusterListWithoutSocial = fullClusterList.clusterIdToScores.map {
case (clusterId, clusterDetails) =>
val clusterDetailsWithoutSocial = UserToInterestedInClusterScores(
followScore = clusterDetails.followScore,
followScoreClusterNormalizedOnly = clusterDetails.followScoreClusterNormalizedOnly,
followScoreProducerNormalizedOnly = clusterDetails.followScoreProducerNormalizedOnly,
followScoreClusterAndProducerNormalized =
clusterDetails.followScoreClusterAndProducerNormalized,
favScore = clusterDetails.favScore,
favScoreClusterNormalizedOnly = clusterDetails.favScoreClusterNormalizedOnly,
favScoreProducerNormalizedOnly = clusterDetails.favScoreProducerNormalizedOnly,
favScoreClusterAndProducerNormalized =
clusterDetails.favScoreClusterAndProducerNormalized,
// Social proof is currently not being used anywhere else, hence being discarded to reduce space for this dataset
usersBeingFollowed = None,
usersThatWereFaved = None,
numUsersInterestedInThisClusterUpperBound =
clusterDetails.numUsersInterestedInThisClusterUpperBound,
logFavScore = clusterDetails.logFavScore,
logFavScoreClusterNormalizedOnly = clusterDetails.logFavScoreClusterNormalizedOnly,
// Counts of the social proof are maintained
numUsersBeingFollowed = Some(clusterDetails.usersBeingFollowed.getOrElse(Nil).size),
numUsersThatWereFaved = Some(clusterDetails.usersThatWereFaved.getOrElse(Nil).size)
)
(clusterId, clusterDetailsWithoutSocial)
}
(
srcId,
ClustersUserIsInterestedIn(
fullClusterList.knownForModelVersion,
fullClusterListWithoutSocial))
}
}
override def runOnDateRange(
args: Args
)(
implicit dateRange: DateRange,
timeZone: TimeZone,
uniqueID: UniqueID
): Execution[Unit] = {
// Input args for the run
val socialProofThreshold = args.int("socialProofThreshold", 2)
val maxClustersFromProducer = args.int("maxClustersPerProducer", 25)
val maxClustersPerUserFinalResult = args.int("maxInterestedInClustersPerUser", 50)
// Path variables
val modelVersionUpdated = ModelVersions.toModelVersion(ModelVersions.Model20M145KUpdated)
val rootPath: String = s"/user/cassowary/manhattan_sequence_files"
val interestedInFromProducersPath =
rootPath + "/interested_in_from_producer_embeddings/" + modelVersionUpdated
// Input adjacency list and producer embeddings
val userUserNormalGraph =
DataSources.userUserNormalizedGraphSource(dateRange.prepend(Days(7))).forceToDisk
val outputKVDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsInterestedIn]] =
SimclustersV2InterestedInFromProducerEmbeddings20M145KUpdatedScalaDataset
val producerEmbeddings = ProducerEmbeddingSources
.producerEmbeddingSourceLegacy(
EmbeddingType.ProducerFavBasedSemanticCoreEntity,
modelVersionUpdated)(dateRange.embiggen(Days(7)))
val producerEmbeddingsPruned = getPrunedEmbeddings(producerEmbeddings, maxClustersFromProducer)
val producerEmbeddingsWithScore = producerEmbeddingsPruned.map {
case (userId: Long, topSimClusters: TopSimClustersWithScore) =>
(
userId,
topSimClusters.topClusters.toArray
.map {
case (simCluster: SimClusterWithScore) =>
(simCluster.clusterId, simCluster.score.toFloat)
}
)
}
val interestedInFromProducersResult =
InterestedInFromKnownFor.run(
userUserNormalGraph,
producerEmbeddingsWithScore,
socialProofThreshold,
maxClustersPerUserFinalResult,
modelVersionUpdated.toString
)
val interestedInFromProducersWithoutSocial =
getInterestedInDiscardSocial(interestedInFromProducersResult)
val writeKeyValResultExec = interestedInFromProducersWithoutSocial
.map { case (userId, clusters) => KeyVal(userId, clusters) }
.writeDALVersionedKeyValExecution(
outputKVDataset,
D.Suffix(interestedInFromProducersPath)
)
writeKeyValResultExec
}
}
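/**
 * A minimal sketch, with made-up scores, of what getInterestedInDiscardSocial above does to a
 * single cluster entry: the social-proof id lists are dropped and only their sizes are kept as
 * numUsersBeingFollowed / numUsersThatWereFaved, which shrinks the stored dataset while
 * preserving the counts.
 */
object DiscardSocialProofSketch {
  def main(args: Array[String]): Unit = {
    val withSocialProof = UserToInterestedInClusterScores(
      favScore = Some(0.42),
      usersBeingFollowed = Some(List(101L, 102L)),
      usersThatWereFaved = Some(List(103L))
    )
    val withoutSocialProof = UserToInterestedInClusterScores(
      favScore = withSocialProof.favScore,
      usersBeingFollowed = None,
      usersThatWereFaved = None,
      numUsersBeingFollowed = Some(withSocialProof.usersBeingFollowed.getOrElse(Nil).size),
      numUsersThatWereFaved = Some(withSocialProof.usersThatWereFaved.getOrElse(Nil).size)
    )
    println(withoutSocialProof)
  }
}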

Some files were not shown because too many files have changed in this diff