diff --git a/src/scala/com/twitter/simclusters_v2/common/SimClustersMultiEmbedding.docx b/src/scala/com/twitter/simclusters_v2/common/SimClustersMultiEmbedding.docx new file mode 100644 index 000000000..f762f0abd Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/common/SimClustersMultiEmbedding.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/common/SimClustersMultiEmbedding.scala b/src/scala/com/twitter/simclusters_v2/common/SimClustersMultiEmbedding.scala deleted file mode 100644 index c9b86be4f..000000000 --- a/src/scala/com/twitter/simclusters_v2/common/SimClustersMultiEmbedding.scala +++ /dev/null @@ -1,32 +0,0 @@ -package com.twitter.simclusters_v2.common - -import com.twitter.simclusters_v2.common.SimClustersMultiEmbeddingId._ -import com.twitter.simclusters_v2.thriftscala.SimClustersMultiEmbedding.{Ids, Values} -import com.twitter.simclusters_v2.thriftscala.{ - SimClustersMultiEmbedding, - SimClustersEmbeddingId, - SimClustersMultiEmbeddingId -} - -/** - * Helper methods for SimClustersMultiEmbedding - */ -object SimClustersMultiEmbedding { - - // Convert a multiEmbedding to a list of (embeddingId, score) - def toSimClustersEmbeddingIdWithScores( - simClustersMultiEmbeddingId: SimClustersMultiEmbeddingId, - simClustersMultiEmbedding: SimClustersMultiEmbedding - ): Seq[(SimClustersEmbeddingId, Double)] = { - simClustersMultiEmbedding match { - case Values(values) => - values.embeddings.zipWithIndex.map { - case (embeddingWithScore, i) => - (toEmbeddingId(simClustersMultiEmbeddingId, i), embeddingWithScore.score) - } - case Ids(ids) => - ids.ids.map(_.toTuple) - } - } - -} diff --git a/src/scala/com/twitter/simclusters_v2/common/SimClustersMultiEmbeddingId.docx b/src/scala/com/twitter/simclusters_v2/common/SimClustersMultiEmbeddingId.docx new file mode 100644 index 000000000..dbfa53347 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/common/SimClustersMultiEmbeddingId.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/common/SimClustersMultiEmbeddingId.scala b/src/scala/com/twitter/simclusters_v2/common/SimClustersMultiEmbeddingId.scala deleted file mode 100644 index 17d0eb0d6..000000000 --- a/src/scala/com/twitter/simclusters_v2/common/SimClustersMultiEmbeddingId.scala +++ /dev/null @@ -1,96 +0,0 @@ -package com.twitter.simclusters_v2.common - -import com.twitter.simclusters_v2.thriftscala.{ - EmbeddingType, - InternalId, - MultiEmbeddingType, - TopicId, - TopicSubId, - SimClustersEmbeddingId => ThriftEmbeddingId, - SimClustersMultiEmbeddingId => ThriftMultiEmbeddingId -} - -/** - * Helper methods for SimClustersMultiEmbeddingId - */ -object SimClustersMultiEmbeddingId { - - private val MultiEmbeddingTypeToEmbeddingType: Map[MultiEmbeddingType, EmbeddingType] = - Map( - MultiEmbeddingType.LogFavApeBasedMuseTopic -> EmbeddingType.LogFavApeBasedMuseTopic, - MultiEmbeddingType.TwiceUserInterestedIn -> EmbeddingType.TwiceUserInterestedIn, - ) - - private val EmbeddingTypeToMultiEmbeddingType: Map[EmbeddingType, MultiEmbeddingType] = - MultiEmbeddingTypeToEmbeddingType.map(_.swap) - - def toEmbeddingType(multiEmbeddingType: MultiEmbeddingType): EmbeddingType = { - MultiEmbeddingTypeToEmbeddingType.getOrElse( - multiEmbeddingType, - throw new IllegalArgumentException(s"Invalid type: $multiEmbeddingType")) - } - - def toMultiEmbeddingType(embeddingType: EmbeddingType): MultiEmbeddingType = { - EmbeddingTypeToMultiEmbeddingType.getOrElse( - embeddingType, - throw new IllegalArgumentException(s"Invalid type: $embeddingType") - ) - } - 
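  // The two maps above are constructed from one another via `.swap`, so the two conversions
  // are mutually inverse on every supported type. A minimal round-trip sketch (using only the
  // enum values listed in the map above):
  //
  //   val embeddingType = toEmbeddingType(MultiEmbeddingType.TwiceUserInterestedIn)
  //   // embeddingType == EmbeddingType.TwiceUserInterestedIn
  //   toMultiEmbeddingType(embeddingType) == MultiEmbeddingType.TwiceUserInterestedIn // true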
- /** - * Convert a SimClusters Multi-Embedding Id and SubId to SimClusters Embedding Id. - */ - def toEmbeddingId( - simClustersMultiEmbeddingId: ThriftMultiEmbeddingId, - subId: Int - ): ThriftEmbeddingId = { - val internalId = simClustersMultiEmbeddingId.internalId match { - case InternalId.TopicId(topicId) => - InternalId.TopicSubId( - TopicSubId(topicId.entityId, topicId.language, topicId.country, subId)) - case _ => - throw new IllegalArgumentException( - s"Invalid simClusters InternalId ${simClustersMultiEmbeddingId.internalId}") - } - ThriftEmbeddingId( - toEmbeddingType(simClustersMultiEmbeddingId.embeddingType), - simClustersMultiEmbeddingId.modelVersion, - internalId - ) - } - - /** - * Fetch a subId from a SimClusters EmbeddingId. - */ - def toSubId(simClustersEmbeddingId: ThriftEmbeddingId): Int = { - simClustersEmbeddingId.internalId match { - case InternalId.TopicSubId(topicSubId) => - topicSubId.subId - case _ => - throw new IllegalArgumentException( - s"Invalid SimClustersEmbeddingId InternalId type, $simClustersEmbeddingId") - } - } - - /** - * Convert a SimClustersEmbeddingId to SimClustersMultiEmbeddingId. - * Only support the Multi embedding based EmbeddingTypes. - */ - def toMultiEmbeddingId( - simClustersEmbeddingId: ThriftEmbeddingId - ): ThriftMultiEmbeddingId = { - simClustersEmbeddingId.internalId match { - case InternalId.TopicSubId(topicSubId) => - ThriftMultiEmbeddingId( - toMultiEmbeddingType(simClustersEmbeddingId.embeddingType), - simClustersEmbeddingId.modelVersion, - InternalId.TopicId(TopicId(topicSubId.entityId, topicSubId.language, topicSubId.country)) - ) - - case _ => - throw new IllegalArgumentException( - s"Invalid SimClustersEmbeddingId InternalId type, $simClustersEmbeddingId") - } - } - -} diff --git a/src/scala/com/twitter/simclusters_v2/common/clustering/BUILD b/src/scala/com/twitter/simclusters_v2/common/clustering/BUILD deleted file mode 100644 index f394e109a..000000000 --- a/src/scala/com/twitter/simclusters_v2/common/clustering/BUILD +++ /dev/null @@ -1,11 +0,0 @@ -scala_library( - compiler_option_sets = ["fatal_warnings"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "eventdetection/common/src/main/java/com/twitter/eventdetection/common/louvain", - "eventdetection/common/src/main/java/com/twitter/eventdetection/common/model", - "src/java/com/twitter/sbf/graph", - "src/scala/com/twitter/simclusters_v2/scalding/common", - ], -) diff --git a/src/scala/com/twitter/simclusters_v2/common/clustering/BUILD.docx b/src/scala/com/twitter/simclusters_v2/common/clustering/BUILD.docx new file mode 100644 index 000000000..a179ef591 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/common/clustering/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/common/clustering/ClusterRepresentativeSelectionMethod.docx b/src/scala/com/twitter/simclusters_v2/common/clustering/ClusterRepresentativeSelectionMethod.docx new file mode 100644 index 000000000..f4816c615 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/common/clustering/ClusterRepresentativeSelectionMethod.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/common/clustering/ClusterRepresentativeSelectionMethod.scala b/src/scala/com/twitter/simclusters_v2/common/clustering/ClusterRepresentativeSelectionMethod.scala deleted file mode 100644 index 42b585abc..000000000 --- a/src/scala/com/twitter/simclusters_v2/common/clustering/ClusterRepresentativeSelectionMethod.scala +++ /dev/null @@ -1,30 +0,0 @@ -package 
com.twitter.simclusters_v2.common.clustering - -import com.twitter.simclusters_v2.common.UserId -import com.twitter.simclusters_v2.thriftscala.NeighborWithWeights - -/** - * Select a cluster member as cluster representative. - */ -trait ClusterRepresentativeSelectionMethod[T] { - - /** - * The main external-facing method. Sub-classes should implement this method. - * - * @param cluster A set of NeighborWithWeights. - * @param embeddings A map of producer ID -> embedding. - * - * @return UserId of the member chosen as representative. - */ - def selectClusterRepresentative( - cluster: Set[NeighborWithWeights], - embeddings: Map[UserId, T] - ): UserId - -} - -object ClusterRepresentativeSelectionStatistics { - - // Statistics, to be imported where recorded. - val StatClusterRepresentativeSelectionTime = "cluster_representative_selection_total_time_ms" -} diff --git a/src/scala/com/twitter/simclusters_v2/common/clustering/ClusteringMethod.docx b/src/scala/com/twitter/simclusters_v2/common/clustering/ClusteringMethod.docx new file mode 100644 index 000000000..bfec38a5d Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/common/clustering/ClusteringMethod.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/common/clustering/ClusteringMethod.scala b/src/scala/com/twitter/simclusters_v2/common/clustering/ClusteringMethod.scala deleted file mode 100644 index e379e7051..000000000 --- a/src/scala/com/twitter/simclusters_v2/common/clustering/ClusteringMethod.scala +++ /dev/null @@ -1,34 +0,0 @@ -package com.twitter.simclusters_v2.common.clustering - -/** - * Partitions a set of entities into clusters. - * NOTE: The selection/construction of the cluster representatives (e.g. medoid, random, average) is implemented in ClusterRepresentativeSelectionMethod.scala - */ -trait ClusteringMethod { - - /** - * The main external-facing method. Sub-classes should implement this method. - * - * @param embeddings map of entity IDs and corresponding embeddings - * @param similarityFn function that outputs similarity (>=0, the larger, more similar), given two embeddings - * @tparam T embedding type. e.g. SimClustersEmbedding - * - * @return A set of sets of entity IDs, each set representing a distinct cluster. - */ - def cluster[T]( - embeddings: Map[Long, T], - similarityFn: (T, T) => Double, - recordStatCallback: (String, Long) => Unit = (_, _) => () - ): Set[Set[Long]] - -} - -object ClusteringStatistics { - - // Statistics, to be imported where recorded. 
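  // The clustering implementations below report these keys through their recordStatCallback
  // hook ((String, Long) => Unit). Similarity values are recorded as (similarity * 100).toLong,
  // i.e. in hundredths, so that two decimal places survive in an integer counter.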
- val StatSimilarityGraphTotalBuildTime = "similarity_graph_total_build_time_ms" - val StatClusteringAlgorithmRunTime = "clustering_algorithm_total_run_time_ms" - val StatMedoidSelectionTime = "medoid_selection_total_time_ms" - val StatComputedSimilarityBeforeFilter = "computed_similarity_before_filter" - -} diff --git a/src/scala/com/twitter/simclusters_v2/common/clustering/ConnectedComponentsClusteringMethod.docx b/src/scala/com/twitter/simclusters_v2/common/clustering/ConnectedComponentsClusteringMethod.docx new file mode 100644 index 000000000..a43ee39f3 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/common/clustering/ConnectedComponentsClusteringMethod.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/common/clustering/ConnectedComponentsClusteringMethod.scala b/src/scala/com/twitter/simclusters_v2/common/clustering/ConnectedComponentsClusteringMethod.scala deleted file mode 100644 index 07f785f24..000000000 --- a/src/scala/com/twitter/simclusters_v2/common/clustering/ConnectedComponentsClusteringMethod.scala +++ /dev/null @@ -1,67 +0,0 @@ -package com.twitter.simclusters_v2.common.clustering - -import com.twitter.sbf.graph.ConnectedComponents -import com.twitter.sbf.graph.Graph -import com.twitter.util.Stopwatch -import it.unimi.dsi.fastutil.ints.IntSet -import scala.collection.SortedMap -import scala.jdk.CollectionConverters._ - -/** - * Aggregate entities into clusters such that a cluster contains all embeddings with a similarity - * above a configurable threshold to any other embedding. - * - * @param similarityThreshold: When building the edges between entities, edges with weight - * less than or equal to this threshold will be filtered out. - */ -class ConnectedComponentsClusteringMethod( - similarityThreshold: Double) - extends ClusteringMethod { - - import ClusteringStatistics._ - - def cluster[T]( - embeddings: Map[Long, T], - similarityFn: (T, T) => Double, - recordStatCallback: (String, Long) => Unit = (_, _) => () - ): Set[Set[Long]] = { - - val timeSinceGraphBuildStart = Stopwatch.start() - // com.twitter.sbf.graph.Graph expects neighbors to be sorted in ascending order. 
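    // The construction below re-keys the entities by a dense index (via zipWithIndex) so the
    // indices can serve as graph node ids; keying a SortedMap by that index makes the iteration
    // order, and therefore each neighbor array, ascending as required. Every ordered pair is
    // compared, so building the graph costs O(n^2) similarityFn evaluations for n embeddings.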
- val sourcesById = SortedMap(embeddings.zipWithIndex.map { - case (source, idx) => idx -> source - }.toSeq: _*) - - val neighbours = sourcesById.map { - case (srcIdx, (_, src)) => - sourcesById - .collect { - case (dstIdx, (_, dst)) if srcIdx != dstIdx => // avoid self-edges - val similarity = similarityFn(src, dst) - recordStatCallback( - StatComputedSimilarityBeforeFilter, - (similarity * 100).toLong // preserve up to two decimal points - ) - if (similarity > similarityThreshold) - Some(dstIdx) - else None - }.flatten.toArray - }.toArray - - recordStatCallback(StatSimilarityGraphTotalBuildTime, timeSinceGraphBuildStart().inMilliseconds) - - val timeSinceClusteringAlgRunStart = Stopwatch.start() - val nEdges = neighbours.map(_.length).sum / 2 // Graph expects count of undirected edges - val graph = new Graph(sourcesById.size, nEdges, neighbours) - - val clusters = ConnectedComponents - .connectedComponents(graph).asScala.toSet - .map { i: IntSet => i.asScala.map(sourcesById(_)._1).toSet } - - recordStatCallback( - StatClusteringAlgorithmRunTime, - timeSinceClusteringAlgRunStart().inMilliseconds) - - clusters - } -} diff --git a/src/scala/com/twitter/simclusters_v2/common/clustering/LargestDimensionClusteringMethod.docx b/src/scala/com/twitter/simclusters_v2/common/clustering/LargestDimensionClusteringMethod.docx new file mode 100644 index 000000000..ff8f7931a Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/common/clustering/LargestDimensionClusteringMethod.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/common/clustering/LargestDimensionClusteringMethod.scala b/src/scala/com/twitter/simclusters_v2/common/clustering/LargestDimensionClusteringMethod.scala deleted file mode 100644 index 826cc7e08..000000000 --- a/src/scala/com/twitter/simclusters_v2/common/clustering/LargestDimensionClusteringMethod.scala +++ /dev/null @@ -1,33 +0,0 @@ -package com.twitter.simclusters_v2.common.clustering - -/** - * Groups entities by a single embedding dimension with the largest score. - */ -class LargestDimensionClusteringMethod extends ClusteringMethod { - - /** - * @param embeddings map of entity IDs and corresponding embeddings - * @param similarityFn function that outputs discrete value (0.0 or 1.0). - * 1.0 if the dimensions of the highest score (weight) from two given embeddings match. - * 0.0 otherwise. - * e.g. - * case 1: E1=[0.0, 0.1, 0.6, 0.2], E2=[0.1, 0.3, 0.8, 0.0]. similarityFn(E1, E2)=1.0 - * case 2: E1=[0.0, 0.1, 0.6, 0.2], E2=[0.1, 0.4, 0.2, 0.0]. similarityFn(E1, E2)=0.0 - * @tparam T embedding type. e.g. SimClustersEmbedding - * - * @return A set of sets of entity IDs, each set representing a distinct cluster. - */ - override def cluster[T]( - embeddings: Map[Long, T], - similarityFn: (T, T) => Double, - recordStatCallback: (String, Long) => Unit - ): Set[Set[Long]] = { - - // rely on clustering by connected component. - // similarityThreshold=0.1 because it's larger than 0.0 (similarityFn returns 0.0 if two embeddings - // don't share the largest dimension. 
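    // Since this similarityFn is {0.0, 1.0}-valued, any threshold strictly between 0 and 1
    // yields the same partition: entities are grouped exactly by their top-scoring dimension
    // (sharing a unique largest dimension is an equivalence relation, so the connected
    // components coincide with those groups).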
- new ConnectedComponentsClusteringMethod(similarityThreshold = 0.1) - .cluster(embeddings, similarityFn, recordStatCallback) - } - -} diff --git a/src/scala/com/twitter/simclusters_v2/common/clustering/LouvainClusteringMethod.docx b/src/scala/com/twitter/simclusters_v2/common/clustering/LouvainClusteringMethod.docx new file mode 100644 index 000000000..e54c7ad67 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/common/clustering/LouvainClusteringMethod.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/common/clustering/LouvainClusteringMethod.scala b/src/scala/com/twitter/simclusters_v2/common/clustering/LouvainClusteringMethod.scala deleted file mode 100644 index c3337119b..000000000 --- a/src/scala/com/twitter/simclusters_v2/common/clustering/LouvainClusteringMethod.scala +++ /dev/null @@ -1,236 +0,0 @@ -package com.twitter.simclusters_v2.common.clustering - -import com.twitter.eventdetection.common.louvain.LouvainDriver -import com.twitter.eventdetection.common.louvain.NetworkFactory -import com.twitter.eventdetection.common.model.Entity -import com.twitter.eventdetection.common.model.NetworkInput -import com.twitter.eventdetection.common.model.TextEntityValue -import com.twitter.util.Stopwatch -import scala.collection.JavaConverters._ -import scala.math.max - -/** - * Groups entities by the Louvain clustering method. - * @param similarityThreshold: When building the edges between entities, edges with weight - * less than or equal to this threshold will be filtered out. - * @param appliedResolutionFactor: If present, will be used to multiply the applied resolution - * parameter of the Louvain method by this factor. - * Note that the DEFAULT_MAX_RESOLUTION will not be applied. - */ -class LouvainClusteringMethod( - similarityThreshold: Double, - appliedResolutionFactor: Option[Double]) - extends ClusteringMethod { - - import ClusteringStatistics._ - - def cluster[T]( - embeddings: Map[Long, T], - similarityFn: (T, T) => Double, - recordStatCallback: (String, Long) => Unit = (_, _) => () - ): Set[Set[Long]] = { - - // 1. Build the graph on which to run Louvain: - // - Weigh edges by the similarity between the 2 embeddings, - // - Filter out edges with weight <= threshold. - val timeSinceGraphBuildStart = Stopwatch.start() - val edges: Seq[((Long, Long), Double)] = embeddings.toSeq - .combinations(2) - .map { pair: Seq[(Long, T)] => // pair of 2 - val (user1, embedding1) = pair.head - val (user2, embedding2) = pair(1) - val similarity = similarityFn(embedding1, embedding2) - - recordStatCallback( - StatComputedSimilarityBeforeFilter, - (similarity * 100).toLong // preserve up to two decimal places - ) - - ((user1, user2), similarity) - } - .filter(_._2 > similarityThreshold) - .toSeq - - recordStatCallback(StatSimilarityGraphTotalBuildTime, timeSinceGraphBuildStart().inMilliseconds) - - // check if some entities do not have any incoming / outgoing edge - // these are size-1 clusters (i.e. their own) - val individualClusters: Set[Long] = embeddings.keySet -- edges.flatMap { - case ((user1, user2), _) => Set(user1, user2) - }.toSet - - // 2. LouvainDriver uses "Entity" as input, so build 2 mappings - // - Long (entity id) -> Entity - // - Entity -> Long (entity id) - val embeddingIdToEntity: Map[Long, Entity] = embeddings.map { - case (id, _) => id -> Entity(TextEntityValue(id.toString, Some(id.toString)), None) - } - val entityToEmbeddingId: Map[Entity, Long] = embeddingIdToEntity.map { - case (id, e) => e -> id - } - - // 3. 
Create the list of NetworkInput on which to run LouvainDriver - val networkInputList = edges - .map { - case ((fromUserId: Long, toUserId: Long), weight: Double) => - new NetworkInput(embeddingIdToEntity(fromUserId), embeddingIdToEntity(toUserId), weight) - }.toList.asJava - - val timeSinceClusteringAlgRunStart = Stopwatch.start() - val networkDictionary = NetworkFactory.buildDictionary(networkInputList) - val network = NetworkFactory.buildNetwork(networkInputList, networkDictionary) - - if (networkInputList.size() == 0) { - // handle case if no edge at all (only one entity or all entities are too far apart) - embeddings.keySet.map(e => Set(e)) - } else { - // 4. Run clustering algorithm - val clusteredIds = appliedResolutionFactor match { - case Some(res) => - LouvainDriver.clusterAppliedResolutionFactor(network, networkDictionary, res) - case None => LouvainDriver.cluster(network, networkDictionary) - } - - recordStatCallback( - StatClusteringAlgorithmRunTime, - timeSinceClusteringAlgRunStart().inMilliseconds) - - // 5. Post-processing - val atLeast2MembersClusters: Set[Set[Long]] = clusteredIds.asScala - .groupBy(_._2) - .mapValues(_.map { case (e, _) => entityToEmbeddingId(e) }.toSet) - .values.toSet - - atLeast2MembersClusters ++ individualClusters.map { e => Set(e) } - - } - } - - def clusterWithSilhouette[T]( - embeddings: Map[Long, T], - similarityFn: (T, T) => Double, - similarityFnForSil: (T, T) => Double, - recordStatCallback: (String, Long) => Unit = (_, _) => () - ): (Set[Set[Long]], Set[Set[(Long, Double)]]) = { - - // 1. Build the graph on which to run Louvain: - // - Weigh edges by the similarity between the 2 embeddings, - // - Filter out edges with weight <= threshold. - val timeSinceGraphBuildStart = Stopwatch.start() - val edgesSimilarityMap = collection.mutable.Map[(Long, Long), Double]() - - val edges: Seq[((Long, Long), Double)] = embeddings.toSeq - .combinations(2) - .map { pair: Seq[(Long, T)] => // pair of 2 - val (user1, embedding1) = pair.head - val (user2, embedding2) = pair(1) - val similarity = similarityFn(embedding1, embedding2) - val similarityForSil = similarityFnForSil(embedding1, embedding2) - edgesSimilarityMap.put((user1, user2), similarityForSil) - edgesSimilarityMap.put((user2, user1), similarityForSil) - - recordStatCallback( - StatComputedSimilarityBeforeFilter, - (similarity * 100).toLong // preserve up to two decimal places - ) - - ((user1, user2), similarity) - } - .filter(_._2 > similarityThreshold) - .toSeq - - recordStatCallback(StatSimilarityGraphTotalBuildTime, timeSinceGraphBuildStart().inMilliseconds) - - // check if some entities do not have any incoming / outgoing edge - // these are size-1 clusters (i.e. their own) - val individualClusters: Set[Long] = embeddings.keySet -- edges.flatMap { - case ((user1, user2), _) => Set(user1, user2) - }.toSet - - // 2. LouvainDriver uses "Entity" as input, so build 2 mappings - // - Long (entity id) -> Entity - // - Entity -> Long (entity id) - val embeddingIdToEntity: Map[Long, Entity] = embeddings.map { - case (id, _) => id -> Entity(TextEntityValue(id.toString, Some(id.toString)), None) - } - val entityToEmbeddingId: Map[Entity, Long] = embeddingIdToEntity.map { - case (id, e) => e -> id - } - - // 3. 
Create the list of NetworkInput on which to run LouvainDriver - val networkInputList = edges - .map { - case ((fromUserId: Long, toUserId: Long), weight: Double) => - new NetworkInput(embeddingIdToEntity(fromUserId), embeddingIdToEntity(toUserId), weight) - }.toList.asJava - - val timeSinceClusteringAlgRunStart = Stopwatch.start() - val networkDictionary = NetworkFactory.buildDictionary(networkInputList) - val network = NetworkFactory.buildNetwork(networkInputList, networkDictionary) - - val clusters = if (networkInputList.size() == 0) { - // handle case if no edge at all (only one entity or all entities are too far apart) - embeddings.keySet.map(e => Set(e)) - } else { - // 4. Run clustering algorithm - val clusteredIds = appliedResolutionFactor match { - case Some(res) => - LouvainDriver.clusterAppliedResolutionFactor(network, networkDictionary, res) - case None => LouvainDriver.cluster(network, networkDictionary) - } - - recordStatCallback( - StatClusteringAlgorithmRunTime, - timeSinceClusteringAlgRunStart().inMilliseconds) - - // 5. Post-processing - val atLeast2MembersClusters: Set[Set[Long]] = clusteredIds.asScala - .groupBy(_._2) - .mapValues(_.map { case (e, _) => entityToEmbeddingId(e) }.toSet) - .values.toSet - - atLeast2MembersClusters ++ individualClusters.map { e => Set(e) } - - } - - // Calculate silhouette metrics - val contactIdWithSilhouette = clusters.map { - case cluster => - val otherClusters = clusters - cluster - - cluster.map { - case contactId => - if (otherClusters.isEmpty) { - (contactId, 0.0) - } else { - val otherSameClusterContacts = cluster - contactId - - if (otherSameClusterContacts.isEmpty) { - (contactId, 0.0) - } else { - // calculate similarity of given userId with all other users in the same cluster - val a_i = otherSameClusterContacts.map { - case sameClusterContact => - edgesSimilarityMap((contactId, sameClusterContact)) - }.sum / otherSameClusterContacts.size - - // calculate similarity of given userId to all other clusters, find the best nearest cluster - val b_i = otherClusters.map { - case otherCluster => - otherCluster.map { - case otherClusterContact => - edgesSimilarityMap((contactId, otherClusterContact)) - }.sum / otherCluster.size - }.max - - // silhouette (value) of one userId i - val s_i = (a_i - b_i) / max(a_i, b_i) - (contactId, s_i) - } - } - } - } - - (clusters, contactIdWithSilhouette) - } -} diff --git a/src/scala/com/twitter/simclusters_v2/common/clustering/MaxFavScoreRepresentativeSelectionMethod.docx b/src/scala/com/twitter/simclusters_v2/common/clustering/MaxFavScoreRepresentativeSelectionMethod.docx new file mode 100644 index 000000000..159763aff Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/common/clustering/MaxFavScoreRepresentativeSelectionMethod.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/common/clustering/MaxFavScoreRepresentativeSelectionMethod.scala b/src/scala/com/twitter/simclusters_v2/common/clustering/MaxFavScoreRepresentativeSelectionMethod.scala deleted file mode 100644 index fec180d4f..000000000 --- a/src/scala/com/twitter/simclusters_v2/common/clustering/MaxFavScoreRepresentativeSelectionMethod.scala +++ /dev/null @@ -1,21 +0,0 @@ -package com.twitter.simclusters_v2.common.clustering - -import com.twitter.simclusters_v2.common.UserId -import com.twitter.simclusters_v2.thriftscala.NeighborWithWeights - -class MaxFavScoreRepresentativeSelectionMethod[T] extends ClusterRepresentativeSelectionMethod[T] { - - /** - * Identify the member with largest favScoreHalfLife100Days and 
return it. - * - * @param cluster A set of NeighborWithWeights. - * @param embeddings A map of producer ID -> embedding. - */ - def selectClusterRepresentative( - cluster: Set[NeighborWithWeights], - embeddings: Map[UserId, T], - ): UserId = { - val key = cluster.maxBy { x: NeighborWithWeights => x.favScoreHalfLife100Days.getOrElse(0.0) } - key.neighborId - } -} diff --git a/src/scala/com/twitter/simclusters_v2/common/clustering/MedoidRepresentativeSelectionMethod.docx b/src/scala/com/twitter/simclusters_v2/common/clustering/MedoidRepresentativeSelectionMethod.docx new file mode 100644 index 000000000..de703f16b Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/common/clustering/MedoidRepresentativeSelectionMethod.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/common/clustering/MedoidRepresentativeSelectionMethod.scala b/src/scala/com/twitter/simclusters_v2/common/clustering/MedoidRepresentativeSelectionMethod.scala deleted file mode 100644 index 1b466250f..000000000 --- a/src/scala/com/twitter/simclusters_v2/common/clustering/MedoidRepresentativeSelectionMethod.scala +++ /dev/null @@ -1,28 +0,0 @@ -package com.twitter.simclusters_v2.common.clustering - -import com.twitter.simclusters_v2.common.UserId -import com.twitter.simclusters_v2.thriftscala.NeighborWithWeights - -class MedoidRepresentativeSelectionMethod[T]( - producerProducerSimilarityFn: (T, T) => Double) - extends ClusterRepresentativeSelectionMethod[T] { - - /** - * Identify the medoid of a cluster and return it. - * - * @param cluster A set of NeighborWithWeights. - * @param embeddings A map of producer ID -> embedding. - */ - def selectClusterRepresentative( - cluster: Set[NeighborWithWeights], - embeddings: Map[UserId, T], - ): UserId = { - val key = cluster.maxBy { - id1 => // maxBy because we use similarity, which gets larger as we get closer. - val v = embeddings(id1.neighborId) - cluster - .map(id2 => producerProducerSimilarityFn(v, embeddings(id2.neighborId))).sum - } - key.neighborId - } -} diff --git a/src/scala/com/twitter/simclusters_v2/common/clustering/SimilarityFunctions.docx b/src/scala/com/twitter/simclusters_v2/common/clustering/SimilarityFunctions.docx new file mode 100644 index 000000000..3f0d64cf5 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/common/clustering/SimilarityFunctions.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/common/clustering/SimilarityFunctions.scala b/src/scala/com/twitter/simclusters_v2/common/clustering/SimilarityFunctions.scala deleted file mode 100644 index 45e449850..000000000 --- a/src/scala/com/twitter/simclusters_v2/common/clustering/SimilarityFunctions.scala +++ /dev/null @@ -1,32 +0,0 @@ -package com.twitter.simclusters_v2.common.clustering - -import com.twitter.simclusters_v2.common.SimClustersEmbedding - -/** - * SimilarityFunctions provide commonly used similarity functions that this clustering library needs. 
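 * The three functions below (cosine similarity, a {0.0, 1.0}-valued largest-dimension match,
 * and fuzzy Jaccard similarity) all have the (SimClustersEmbedding, SimClustersEmbedding) => Double
 * shape that ClusteringMethod.cluster expects for its similarityFn argument.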
- */ -object SimilarityFunctions { - def simClustersCosineSimilarity: (SimClustersEmbedding, SimClustersEmbedding) => Double = - (e1, e2) => e1.cosineSimilarity(e2) - - def simClustersMatchingLargestDimension: ( - SimClustersEmbedding, - SimClustersEmbedding - ) => Double = (e1, e2) => { - val doesMatchLargestDimension: Boolean = e1 - .topClusterIds(1) - .exists { id1 => - e2.topClusterIds(1).contains(id1) - } - - if (doesMatchLargestDimension) 1.0 - else 0.0 - } - - def simClustersFuzzyJaccardSimilarity: ( - SimClustersEmbedding, - SimClustersEmbedding - ) => Double = (e1, e2) => { - e1.fuzzyJaccardSimilarity(e2) - } -} diff --git a/src/scala/com/twitter/simclusters_v2/common/ml/BUILD b/src/scala/com/twitter/simclusters_v2/common/ml/BUILD deleted file mode 100644 index e71aa0c59..000000000 --- a/src/scala/com/twitter/simclusters_v2/common/ml/BUILD +++ /dev/null @@ -1,12 +0,0 @@ -# This package/target is separate from other simclusters common packages because the ml/api dep is -# large (350MB+). Having it as a separate target means that we can avoid bundling it with targets -# that do not need it. -scala_library( - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "src/java/com/twitter/ml/api:api-base", - "src/scala/com/twitter/ml/api/util", - "src/scala/com/twitter/simclusters_v2/common", - ], -) diff --git a/src/scala/com/twitter/simclusters_v2/common/ml/BUILD.docx b/src/scala/com/twitter/simclusters_v2/common/ml/BUILD.docx new file mode 100644 index 000000000..2f2821500 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/common/ml/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/common/ml/SimClustersEmbeddingAdapter.docx b/src/scala/com/twitter/simclusters_v2/common/ml/SimClustersEmbeddingAdapter.docx new file mode 100644 index 000000000..26e7d536f Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/common/ml/SimClustersEmbeddingAdapter.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/common/ml/SimClustersEmbeddingAdapter.scala b/src/scala/com/twitter/simclusters_v2/common/ml/SimClustersEmbeddingAdapter.scala deleted file mode 100644 index 8ee8291cf..000000000 --- a/src/scala/com/twitter/simclusters_v2/common/ml/SimClustersEmbeddingAdapter.scala +++ /dev/null @@ -1,39 +0,0 @@ -package com.twitter.simclusters_v2.common.ml - -import com.twitter.ml.api.Feature.Continuous -import com.twitter.ml.api.Feature.SparseContinuous -import com.twitter.ml.api._ -import com.twitter.ml.api.util.FDsl._ -import com.twitter.simclusters_v2.common.SimClustersEmbedding - -class SimClustersEmbeddingAdapter(embeddingFeature: SparseContinuous) - extends IRecordOneToOneAdapter[SimClustersEmbedding] { - - override def getFeatureContext: FeatureContext = new FeatureContext(embeddingFeature) - - override def adaptToDataRecord(embedding: SimClustersEmbedding): DataRecord = { - val embeddingMap = embedding.embedding.map { - case (clusterId, score) => - (clusterId.toString, score) - }.toMap - - new DataRecord().setFeatureValue(embeddingFeature, embeddingMap) - } -} - -class NormalizedSimClustersEmbeddingAdapter( - embeddingFeature: SparseContinuous, - normFeature: Continuous) - extends IRecordOneToOneAdapter[SimClustersEmbedding] { - - override def getFeatureContext: FeatureContext = new FeatureContext(embeddingFeature, normFeature) - - override def adaptToDataRecord(embedding: SimClustersEmbedding): DataRecord = { - - val normalizedEmbedding = Map( - embedding.sortedClusterIds.map(_.toString).zip(embedding.normalizedSortedScores): _*) - 
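    // Assumption, not stated in this file: normalizedSortedScores presumably holds the sorted
    // scores scaled to unit L2 norm, so persisting embedding.l2norm in normFeature below would
    // let a reader reconstruct each raw score as normalizedScore * l2norm.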
- val dataRecord = new DataRecord().setFeatureValue(embeddingFeature, normalizedEmbedding) - dataRecord.setFeatureValue(normFeature, embedding.l2norm) - } -} diff --git a/src/scala/com/twitter/simclusters_v2/common/package.docx b/src/scala/com/twitter/simclusters_v2/common/package.docx new file mode 100644 index 000000000..f143a5289 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/common/package.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/common/package.scala b/src/scala/com/twitter/simclusters_v2/common/package.scala deleted file mode 100644 index 8be5ad089..000000000 --- a/src/scala/com/twitter/simclusters_v2/common/package.scala +++ /dev/null @@ -1,17 +0,0 @@ -package com.twitter.simclusters_v2 - -package object common { - - type TweetId = Long - type UserId = Long - type ClusterId = Int - type SemanticCoreEntityId = Long // Use TopicId if it's a Topic related project. - type UTTEntityId = Long - type Timestamp = Long - type Language = String - type Country = String - type LocaleEntity = (Long, Language) - type TopicId = Long - type GroupId = Long - type SpaceId = String -} diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/AdhocSources.docx b/src/scala/com/twitter/simclusters_v2/hdfs_sources/AdhocSources.docx new file mode 100644 index 000000000..cddafde98 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/hdfs_sources/AdhocSources.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/AdhocSources.scala b/src/scala/com/twitter/simclusters_v2/hdfs_sources/AdhocSources.scala deleted file mode 100644 index 63098e137..000000000 --- a/src/scala/com/twitter/simclusters_v2/hdfs_sources/AdhocSources.scala +++ /dev/null @@ -1,164 +0,0 @@ -package com.twitter.simclusters_v2.hdfs_sources - -import com.twitter.bijection.scrooge.BinaryScalaCodec -import com.twitter.bijection.scrooge.CompactScalaCodec -import com.twitter.bijection.Bufferable -import com.twitter.bijection.Injection -import com.twitter.hermit.candidate.thriftscala.Candidates -import com.twitter.scalding.DateRange -import com.twitter.scalding.commons.source.VersionedKeyValSource -import com.twitter.scalding_internal.source.lzo_scrooge.DailySuffixMostRecentLzoScrooge -import com.twitter.scalding_internal.source.lzo_scrooge.FixedPathLzoScrooge -import com.twitter.scalding_internal.source.lzo_scrooge.HourlySuffixMostRecentLzoScrooge -import com.twitter.simclusters_v2.thriftscala._ - -case class EdgeWithDecayedWtsFixedPathSource(path: String) - extends FixedPathLzoScrooge[EdgeWithDecayedWeights](path, EdgeWithDecayedWeights) - -case class UserAndNeighborsFixedPathSource(path: String) - extends FixedPathLzoScrooge[UserAndNeighbors](path, UserAndNeighbors) - -case class NormsAndCountsFixedPathSource(path: String) - extends FixedPathLzoScrooge[NormsAndCounts](path, NormsAndCounts) - -case class UserToInterestedInClustersFixedPathSource(path: String) - extends FixedPathLzoScrooge[UserToInterestedInClusters](path, UserToInterestedInClusters) - -case class TimelineDataExtractorFixedPathSource(path: String) - extends FixedPathLzoScrooge[ReferenceTweets](path, ReferenceTweets) - -case class TweetClusterScoresHourlySuffixSource(path: String, override val dateRange: DateRange) - extends HourlySuffixMostRecentLzoScrooge[TweetAndClusterScores](path, dateRange) - -case class TweetTopKClustersHourlySuffixSource(path: String, override val dateRange: DateRange) - extends HourlySuffixMostRecentLzoScrooge[TweetTopKClustersWithScores]( - path, - dateRange - ) - -case class 
ClusterTopKTweetsHourlySuffixSource(path: String, override val dateRange: DateRange) - extends HourlySuffixMostRecentLzoScrooge[ClusterTopKTweetsWithScores]( - path, - dateRange - ) - -case class TweetSimilarityUnhydratedPairsSource(path: String, override val dateRange: DateRange) - extends DailySuffixMostRecentLzoScrooge[LabelledTweetPairs]( - path, - dateRange - ) - -case class WTFCandidatesSource(path: String) - extends FixedPathLzoScrooge[Candidates](path, Candidates) - -case class EmbeddingsLiteSource(path: String) - extends FixedPathLzoScrooge[EmbeddingsLite](path, EmbeddingsLite) - -object AdhocKeyValSources { - def interestedInSource(path: String): VersionedKeyValSource[Long, ClustersUserIsInterestedIn] = { - implicit val keyInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian - implicit val valInject: Injection[ClustersUserIsInterestedIn, Array[Byte]] = - CompactScalaCodec(ClustersUserIsInterestedIn) - VersionedKeyValSource[Long, ClustersUserIsInterestedIn](path) - } - - def clusterDetailsSource(path: String): VersionedKeyValSource[(String, Int), ClusterDetails] = { - implicit val keyInject: Injection[(String, Int), Array[Byte]] = - Bufferable.injectionOf[(String, Int)] - implicit val valInject: Injection[ClusterDetails, Array[Byte]] = - CompactScalaCodec(ClusterDetails) - VersionedKeyValSource[(String, Int), ClusterDetails](path) - } - - def bipartiteQualitySource( - path: String - ): VersionedKeyValSource[(String, Int), BipartiteClusterQuality] = { - implicit val keyInject: Injection[(String, Int), Array[Byte]] = - Bufferable.injectionOf[(String, Int)] - implicit val valInject: Injection[BipartiteClusterQuality, Array[Byte]] = - CompactScalaCodec(BipartiteClusterQuality) - VersionedKeyValSource[(String, Int), BipartiteClusterQuality](path) - } - - def entityToClustersSource( - path: String - ): VersionedKeyValSource[SimClustersEmbeddingId, SimClustersEmbedding] = { - implicit val keyInject: Injection[SimClustersEmbeddingId, Array[Byte]] = - BinaryScalaCodec(SimClustersEmbeddingId) - implicit val valInject: Injection[SimClustersEmbedding, Array[Byte]] = - BinaryScalaCodec(SimClustersEmbedding) - VersionedKeyValSource[SimClustersEmbeddingId, SimClustersEmbedding](path) - } - - def clusterToEntitiesSource( - path: String - ): VersionedKeyValSource[SimClustersEmbeddingId, InternalIdEmbedding] = { - implicit val keyInject: Injection[SimClustersEmbeddingId, Array[Byte]] = BinaryScalaCodec( - SimClustersEmbeddingId) - implicit val valInject: Injection[InternalIdEmbedding, Array[Byte]] = - BinaryScalaCodec(InternalIdEmbedding) - VersionedKeyValSource[SimClustersEmbeddingId, InternalIdEmbedding](path) - } - - // For storing producer-simclusters embeddings - def topProducerToClusterEmbeddingsSource( - path: String - ): VersionedKeyValSource[Long, TopSimClustersWithScore] = { - implicit val keyInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian - implicit val valInject: Injection[TopSimClustersWithScore, Array[Byte]] = - CompactScalaCodec(TopSimClustersWithScore) - VersionedKeyValSource[Long, TopSimClustersWithScore](path) - } - - // For storing producer-simclusters embeddings - def topClusterEmbeddingsToProducerSource( - path: String - ): VersionedKeyValSource[PersistedFullClusterId, TopProducersWithScore] = { - implicit val keyInject: Injection[PersistedFullClusterId, Array[Byte]] = - CompactScalaCodec(PersistedFullClusterId) - implicit val valInject: Injection[TopProducersWithScore, Array[Byte]] = - CompactScalaCodec(TopProducersWithScore) - 
VersionedKeyValSource[PersistedFullClusterId, TopProducersWithScore](path) - } - - def userToInferredEntitiesSource( - path: String - ): VersionedKeyValSource[Long, SimClustersInferredEntities] = { - implicit val keyInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian - implicit val valInject: Injection[SimClustersInferredEntities, Array[Byte]] = - CompactScalaCodec(SimClustersInferredEntities) - VersionedKeyValSource[Long, SimClustersInferredEntities](path) - } - - def knownForAdhocSource(path: String): VersionedKeyValSource[Long, ClustersUserIsKnownFor] = { - implicit val keyInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian - implicit val valInject: Injection[ClustersUserIsKnownFor, Array[Byte]] = - CompactScalaCodec(ClustersUserIsKnownFor) - VersionedKeyValSource[Long, ClustersUserIsKnownFor](path) - } - - def knownForSBFResultsDevelSource( - path: String - ): VersionedKeyValSource[Long, Array[(Int, Float)]] = { - implicit val keyInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian - implicit val valInject: Injection[Array[(Int, Float)], Array[Byte]] = - Bufferable.injectionOf[Array[(Int, Float)]] - VersionedKeyValSource[Long, Array[(Int, Float)]](path) - } - - // injection to store adjlist in the mapped indices space for users - def intermediateSBFResultsDevelSource( - path: String - ): VersionedKeyValSource[Int, List[(Int, Float)]] = { - implicit val keyInject: Injection[Int, Array[Byte]] = Injection.int2BigEndian - implicit val valInject: Injection[List[(Int, Float)], Array[Byte]] = - Bufferable.injectionOf[List[(Int, Float)]] - VersionedKeyValSource[Int, List[(Int, Float)]](path) - } - - def mappedIndicesDevelSource(path: String): VersionedKeyValSource[Int, Long] = { - implicit val keyInject: Injection[Int, Array[Byte]] = Injection.int2BigEndian - implicit val valInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian - VersionedKeyValSource[Int, Long](path) - } -} diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/BUILD b/src/scala/com/twitter/simclusters_v2/hdfs_sources/BUILD deleted file mode 100644 index 4cddde193..000000000 --- a/src/scala/com/twitter/simclusters_v2/hdfs_sources/BUILD +++ /dev/null @@ -1,2216 +0,0 @@ -scala_library( - sources = ["*.scala"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":data_sources", - "3rdparty/src/jvm/com/twitter/scalding:core", - "src/scala/com/twitter/scalding_internal/dalv2", - "src/scala/com/twitter/scalding_internal/multiformat/format", - "src/scala/com/twitter/scalding_internal/source/lzo_scrooge", - "src/scala/com/twitter/simclusters_v2/common", - "src/thrift/com/twitter/hermit/candidate:hermit-candidate-scala", - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - "src/thrift/com/twitter/wtf/entity_real_graph:entity_real_graph-thrift-scala", - ], -) - -scala_library( - name = "data_sources", - sources = [], - description = "DAL datasets we wish to expose externally", - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":ads_fav_based_simclusters_cluster_to_tweet_index-scala", - ":ads_fav_click_based_simclusters_cluster_to_tweet_index-scala", - ":aggregatable_producer_simclusters_embeddings_by_fav_score-scala", - ":aggregatable_producer_simclusters_embeddings_by_fav_score_2020-scala", - ":aggregatable_producer_simclusters_embeddings_by_fav_score_2020_thrift-scala", - ":aggregatable_producer_simclusters_embeddings_by_fav_score_thrift-scala", - 
":aggregatable_producer_simclusters_embeddings_by_follow_score_2020-scala", - ":aggregatable_producer_simclusters_embeddings_by_follow_score_2020_thrift-scala", - ":aggregatable_producer_simclusters_embeddings_by_log_fav_score-scala", - ":aggregatable_producer_simclusters_embeddings_by_log_fav_score_2020-scala", - ":aggregatable_producer_simclusters_embeddings_by_log_fav_score_2020_thrift-scala", - ":aggregatable_producer_simclusters_embeddings_by_log_fav_score_relaxed_fav_engagement_threshold_2020-scala", - ":aggregatable_producer_simclusters_embeddings_by_log_fav_score_relaxed_fav_engagement_threshold_2020_thrift-scala", - ":aggregatable_producer_simclusters_embeddings_by_log_fav_score_thrift-scala", - ":clusters_members_connected_components_ape_similarity-scala", - ":clusters_members_largest_dim_ape_similarity-scala", - ":clusters_members_largest_dim_ape_similarity_2_day_update-scala", - ":clusters_members_louvain_ape_similarity-scala", - ":co_engagement_top_k_similar_tweets-scala", - ":explore_mbcg_user_embeddings_kv-scala", - ":fav_based_evergreen_content_simclusters_cluster_to_tweet_index-scala", - ":fav_based_simclusters_cluster_to_tweet_index-scala", - ":fav_based_video_simclusters_cluster_to_tweet_index-scala", - ":fav_inferred_language_tfg_topic_embeddings-scala", - ":fav_tfg_topic_embeddings-scala", - ":fav_tfg_topic_embeddings_2020-scala", - ":fav_tfg_topic_embeddings_2020_parquet-scala", - ":fav_tfg_topic_embeddings_parquet-scala", - ":full_multi_type_graph-scala", - ":geopopular_top_tweet_impressed_topics-scala", - ":hashtag_simclusters_embeddings_updated-scala", - ":interested_in_twice_by_largest_dim-scala", - ":interested_in_twice_by_largest_dim_2_day_update-scala", - ":interested_in_twice_by_largest_dim_fav_score-scala", - ":interested_in_twice_connected_components-scala", - ":interested_in_twice_louvain-scala", - ":log_fav_reverse_index_semantic_core_per_language_simclusters_embeddings-scala", - ":log_fav_semantic_core_per_language_simclusters_embeddings-scala", - ":log_fav_tfg_topic_embeddings-scala", - ":log_fav_tfg_topic_embeddings_parquet-scala", - ":multi_type_graph_for_top_k_right_nodes_thrift_50_m_scio-scala", - ":multi_type_graph_for_top_k_right_nodes_thrift_scio-scala", - ":multi_type_simclusters_right_node_to_clusters_thrift_50_m-scala", - ":multi_type_simclusters_right_node_to_clusters_thrift_fav_90_p_20_m-scala", - ":offline_cluster_top_media_tweets_20M_145K_2020-scala", - ":offline_tweet_recommendations_from_interested_in_20M_145K_2020-scala", - ":offline_tweet_recommendations_from_interested_in_20M_145K_2020_HL_0_EL_15-scala", - ":offline_tweet_recommendations_from_interested_in_20M_145K_2020_HL_2_EL_15-scala", - ":offline_tweet_recommendations_from_interested_in_20M_145K_2020_HL_2_EL_50-scala", - ":offline_tweet_recommendations_from_interested_in_20M_145K_2020_HL_8_EL_50-scala", - ":offline_tweet_recommendations_from_mts_consumer_embeddings-scala", - ":producer_norms_and_counts-scala", - ":producer_top_k_simcluster_embeddings_by_fav_score-scala", - ":producer_top_k_simcluster_embeddings_by_fav_score_2020-scala", - ":producer_top_k_simcluster_embeddings_by_fav_score_updated-scala", - ":producer_top_k_simcluster_embeddings_by_follow_score-scala", - ":producer_top_k_simcluster_embeddings_by_follow_score_2020-scala", - ":producer_top_k_simcluster_embeddings_by_follow_score_updated-scala", - ":push_open_based_simclusters_cluster_to_tweet_index-scala", - ":reply_based_simclusters_cluster_to_tweet_index-scala", - 
":retweet_based_simclusters_cluster_to_tweet_index-scala", - ":reverse_index_hashtag_simclusters_embeddings_updated-scala", - ":reverse_index_semantic_core_per_language_simclusters_embeddings-scala", - ":reverse_index_semantic_core_simclusters_embeddings-scala", - ":reverse_index_semantic_core_simclusters_embeddings_2020-scala", - ":reverse_index_semantic_core_simclusters_embeddings_updated-scala", - ":right_node_cosine_similarity_scio-scala", - ":right_node_sim_hash_scio-scala", - ":rux_faved_top_k_tweets-scala", - ":semantic_core_embeddings_from_producer-scala", - ":semantic_core_per_language_simclusters_embeddings-scala", - ":semantic_core_simclusters_embeddings-scala", - ":semantic_core_simclusters_embeddings_2020-scala", - ":semantic_core_simclusters_embeddings_updated-scala", - ":simcluster_embedding_top_k_producers_by_fav_score-scala", - ":simcluster_embedding_top_k_producers_by_fav_score_2020-scala", - ":simcluster_embedding_top_k_producers_by_fav_score_updated-scala", - ":simcluster_embedding_top_k_producers_by_follow_score-scala", - ":simcluster_embedding_top_k_producers_by_follow_score_2020-scala", - ":simcluster_embedding_top_k_producers_by_follow_score_updated-scala", - ":simclusters_inferred_entities_from_interested_in-scala", - ":simclusters_inferred_entities_from_interested_in_keyed_by_cluster-scala", - ":simclusters_inferred_entities_from_known_for-scala", - ":simclusters_offline_cluster_top_k_tweets-scala", - ":simclusters_offline_tweet_cluster_scores-scala", - ":simclusters_offline_tweet_top_k_clusters-scala", - ":simclusters_v2_cluster_details-scala", - ":simclusters_v2_cluster_details_20m_145k_2020-scala", - ":simclusters_v2_cluster_details_20m_145k_updated-scala", - ":simclusters_v2_cluster_details_lite-scala", - ":simclusters_v2_cluster_details_lite_20m_145k_2020-scala", - ":simclusters_v2_cluster_details_lite_20m_145k_updated-scala", - ":simclusters_v2_embeddings_lite-scala", - ":simclusters_v2_global_language_embedding-scala", - ":simclusters_v2_global_language_embedding_thrift-scala", - ":simclusters_v2_interested_in-scala", - ":simclusters_v2_interested_in_20M_145K_2020-scala", - ":simclusters_v2_interested_in_20M_145K_updated-scala", - ":simclusters_v2_interested_in_from_aggregatable_producer_embeddings_20M_145K_2020-scala", - ":simclusters_v2_interested_in_from_producer_embeddings_20M_145K_updated-scala", - ":simclusters_v2_interested_in_lite_20M_145K_2020-scala", - ":simclusters_v2_known_for_20M_145K_2020-scala", - ":simclusters_v2_known_for_20M_145K_2020_thrift-scala", - ":simclusters_v2_known_for_20M_145K_dec11-scala", - ":simclusters_v2_known_for_20M_145K_updated-scala", - ":simclusters_v2_known_for_20M_145K_updated_thrift-scala", - ":simclusters_v2_raw_interested_in_20M_145K_2020-scala", - ":simclusters_v2_raw_interested_in_20M_145K_dec11-scala", - ":simclusters_v2_raw_interested_in_20M_145K_updated-scala", - ":simclusters_v2_raw_interested_in_lite_20M_145K_2020-scala", - ":simclusters_v2_raw_known_for_20M_145K_2020-scala", - ":simclusters_v2_raw_known_for_20M_145K_dec11-scala", - ":simclusters_v2_raw_known_for_20M_145K_updated-scala", - ":simclusters_v2_user_to_interested_in_20M_145K_2020-scala", - ":simclusters_v2_user_to_interested_in_20M_145K_dec11-scala", - ":simclusters_v2_user_to_interested_in_20M_145K_updated-scala", - ":simclusters_v2_user_to_interested_in_from_aggregatable_producer_embeddings_20M_145K_2020-scala", - ":simclusters_v2_user_to_interested_in_lite_20M_145K_2020-scala", - ":similar_topics_from_topic_follow_graph-scala", - 
":similar_users_by_fav_based_producer_embedding-scala", - ":similar_users_by_follow_based_producer_embedding-scala", - ":top_k_right_nouns-scala", - ":top_k_right_nouns_scio-scala", - ":top_locale_topics_for_producer_from_em-scala", - ":top_producers_for_locale_topics_from_topic_follow_graph-scala", - ":topic_top_producers_em-scala", - ":truncated_multi_type_graph-scala", - ":truncated_multi_type_graph_scio-scala", - ":tweet_evaluation_timelines_reference_set-scala", - ":user_topic_weighted_embedding-scala", - ":user_topic_weighted_embedding_parquet-scala", - ":user_user_fav_graph-scala", - ":user_user_graph-scala", - ":user_user_normalized_graph-scala", - ":video_view_based_simclusters_cluster_to_tweet_index-scala", - "src/scala/com/twitter/simclusters_v2/common", - ], -) - -create_datasets( - base_name = "user_user_fav_graph", - java_schema = "com.twitter.simclusters_v2.thriftjava.EdgeWithDecayedWeights", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.EdgeWithDecayedWeights", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "producer_norms_and_counts", - java_schema = "com.twitter.simclusters_v2.thriftjava.NormsAndCounts", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.NormsAndCounts", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "user_user_normalized_graph", - java_schema = "com.twitter.simclusters_v2.thriftjava.UserAndNeighbors", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.UserAndNeighbors", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "multi_type_simclusters_right_node_to_clusters_thrift_fav_90_p_20_m", - java_schema = "com.twitter.simclusters_v2.thriftjava.RightNodeWithClusters", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.RightNodeWithClusters", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "multi_type_simclusters_right_node_to_clusters_thrift_50_m", - java_schema = "com.twitter.simclusters_v2.thriftjava.RightNodeWithClusters", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.RightNodeWithClusters", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "user_user_graph", - java_schema = 
"com.twitter.simclusters_v2.thriftjava.UserAndNeighbors", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.UserAndNeighbors", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -# InterestedIn -create_datasets( - base_name = "simclusters_v2_raw_interested_in_20M_145K_dec11", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.InterestedInInjection.injection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "simclusters_v2_raw_interested_in_20M_145K_updated", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.InterestedInInjection.injection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "simclusters_v2_raw_interested_in_20M_145K_2020", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.InterestedInInjection.injection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "simclusters_v2_raw_interested_in_lite_20M_145K_2020", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.InterestedInInjection.injection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "multi_type_graph_for_top_k_right_nodes_thrift_fav_90_p_20_m_scio", - java_schema = "com.twitter.simclusters_v2.thriftjava.MultiTypeGraphEdge", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.MultiTypeGraphEdge", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "multi_type_graph_for_top_k_right_nodes_thrift_50_m_scio", - java_schema = "com.twitter.simclusters_v2.thriftjava.MultiTypeGraphEdge", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.MultiTypeGraphEdge", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - 
base_name = "simclusters_v2_interested_in", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.InterestedInInjection.injection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "simclusters_v2_interested_in_20M_145K_updated", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.InterestedInInjection.injection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "simclusters_v2_interested_in_20M_145K_2020", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.InterestedInInjection.injection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "simclusters_v2_interested_in_lite_20M_145K_2020", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.InterestedInInjection.injection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "simclusters_v2_user_to_interested_in_20M_145K_dec11", - java_schema = "com.twitter.simclusters_v2.thriftjava.UserToInterestedInClusters", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.UserToInterestedInClusters", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "simclusters_v2_user_to_interested_in_20M_145K_updated", - java_schema = "com.twitter.simclusters_v2.thriftjava.UserToInterestedInClusters", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.UserToInterestedInClusters", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "simclusters_v2_user_to_interested_in_20M_145K_2020", - java_schema = "com.twitter.simclusters_v2.thriftjava.UserToInterestedInClusters", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.UserToInterestedInClusters", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - 
"src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "simclusters_v2_user_to_interested_in_lite_20M_145K_2020", - java_schema = "com.twitter.simclusters_v2.thriftjava.UserToInterestedInClusters", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.UserToInterestedInClusters", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "simclusters_v2_user_to_interested_in_from_aggregatable_producer_embeddings_20M_145K_2020", - java_schema = "com.twitter.simclusters_v2.thriftjava.UserToInterestedInClusters", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.UserToInterestedInClusters", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) -# end of InterestedIn - -# KnownFor -create_datasets( - base_name = "simclusters_v2_raw_known_for_20M_145K_dec11", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.KnownForInjection.injection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.ClustersUserIsKnownFor", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "simclusters_v2_raw_known_for_20M_145K_updated", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.KnownForInjection.injection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.ClustersUserIsKnownFor", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "simclusters_v2_raw_known_for_20M_145K_2020", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.KnownForInjection.injection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.ClustersUserIsKnownFor", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "simclusters_v2_known_for_20M_145K_dec11", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.KnownForInjection.injection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.ClustersUserIsKnownFor", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "simclusters_v2_known_for_20M_145K_updated", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.KnownForInjection.injection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = 
"com.twitter.simclusters_v2.thriftscala.ClustersUserIsKnownFor", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "simclusters_v2_known_for_20M_145K_updated_thrift", - java_schema = "com.twitter.simclusters_v2.thriftjava.UserToKnownForClusters", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.UserToKnownForClusters", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "simclusters_v2_known_for_20M_145K_2020", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.KnownForInjection.injection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.ClustersUserIsKnownFor", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "simclusters_v2_known_for_20M_145K_2020_thrift", - java_schema = "com.twitter.simclusters_v2.thriftjava.UserToKnownForClusters", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.UserToKnownForClusters", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -# end of KnownFor - -create_datasets( - base_name = "simclusters_v2_cluster_details", - key_type = "scala.Tuple2[String, Int]", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ClusterDetailsInjection.injection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.ClusterDetails", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "simclusters_v2_cluster_details_lite", - java_schema = "com.twitter.simclusters_v2.thriftjava.ClusterDetailsLite", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.ClusterDetailsLite", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "simclusters_v2_embeddings_lite", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.EmbeddingsLite", - segment_type = "snapshot", - tags = ["bazel-compatible"], - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "simclusters_v2_cluster_details_20m_145k_updated", - key_type = "scala.Tuple2[String, Int]", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ClusterDetailsInjection.injection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.ClusterDetails", - 
scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "simclusters_v2_cluster_details_lite_20m_145k_updated", - java_schema = "com.twitter.simclusters_v2.thriftjava.ClusterDetailsLite", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.ClusterDetailsLite", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "simclusters_v2_cluster_details_20m_145k_2020", - key_type = "scala.Tuple2[String, Int]", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ClusterDetailsInjection.injection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.ClusterDetails", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "simclusters_v2_cluster_details_lite_20m_145k_2020", - java_schema = "com.twitter.simclusters_v2.thriftjava.ClusterDetailsLite", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.ClusterDetailsLite", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "tweet_evaluation_timelines_reference_set", - description = "A Tweet dataset that contains impressed tweets with engagement labels, parsed from Timelines", - java_schema = "com.twitter.simclusters_v2.thriftjava.ReferenceTweets", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.ReferenceTweets", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "semantic_core_simclusters_embeddings", - key_type = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.EntityEmbeddingsInjections.EntitySimClustersEmbeddingInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbedding", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "semantic_core_simclusters_embeddings_updated", - key_type = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.EntityEmbeddingsInjections.EntitySimClustersEmbeddingInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbedding", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = 
"semantic_core_simclusters_embeddings_2020", - key_type = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.EntityEmbeddingsInjections.EntitySimClustersEmbeddingInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbedding", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "semantic_core_per_language_simclusters_embeddings", - key_type = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.EntityEmbeddingsInjections.EntitySimClustersEmbeddingInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbedding", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "log_fav_semantic_core_per_language_simclusters_embeddings", - key_type = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.EntityEmbeddingsInjections.EntitySimClustersEmbeddingInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbedding", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "reverse_index_semantic_core_simclusters_embeddings", - key_type = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.EntityEmbeddingsInjections.InternalIdEmbeddingInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.InternalIdEmbedding", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "reverse_index_semantic_core_simclusters_embeddings_updated", - key_type = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.EntityEmbeddingsInjections.InternalIdEmbeddingInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.InternalIdEmbedding", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "reverse_index_semantic_core_simclusters_embeddings_2020", - key_type = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.EntityEmbeddingsInjections.InternalIdEmbeddingInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.InternalIdEmbedding", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "reverse_index_semantic_core_per_language_simclusters_embeddings", - key_type = 
"com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.EntityEmbeddingsInjections.InternalIdEmbeddingInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.InternalIdEmbedding", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "log_fav_reverse_index_semantic_core_per_language_simclusters_embeddings", - key_type = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.EntityEmbeddingsInjections.InternalIdEmbeddingInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.InternalIdEmbedding", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "hashtag_simclusters_embeddings_updated", - key_type = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.EntityEmbeddingsInjections.EntitySimClustersEmbeddingInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbedding", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "fav_tfg_topic_embeddings", - key_type = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.EntityEmbeddingsInjections.EntitySimClustersEmbeddingInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbedding", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "fav_tfg_topic_embeddings_2020", - key_type = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.EntityEmbeddingsInjections.EntitySimClustersEmbeddingInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbedding", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "fav_tfg_topic_embeddings_parquet", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.TfgTopicEmbeddings", - segment_type = "snapshot", - tags = ["bazel-compatible"], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "fav_tfg_topic_embeddings_2020_parquet", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.TfgTopicEmbeddings", - segment_type = "snapshot", - tags = ["bazel-compatible"], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "fav_inferred_language_tfg_topic_embeddings", - key_type = 
"com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.EntityEmbeddingsInjections.EntitySimClustersEmbeddingInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbedding", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "log_fav_tfg_topic_embeddings", - key_type = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.EntityEmbeddingsInjections.EntitySimClustersEmbeddingInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbedding", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "log_fav_tfg_topic_embeddings_parquet", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.TfgTopicEmbeddings", - segment_type = "snapshot", - tags = ["bazel-compatible"], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "reverse_index_hashtag_simclusters_embeddings_updated", - key_type = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.EntityEmbeddingsInjections.InternalIdEmbeddingInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.InternalIdEmbedding", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "simcluster_embedding_top_k_producers_by_fav_score", - key_type = "com.twitter.simclusters_v2.thriftscala.PersistedFullClusterId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ProducerEmbeddingsInjections.SimClusterEmbeddingTopKProducersInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.TopProducersWithScore", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "simcluster_embedding_top_k_producers_by_fav_score_updated", - key_type = "com.twitter.simclusters_v2.thriftscala.PersistedFullClusterId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ProducerEmbeddingsInjections.SimClusterEmbeddingTopKProducersInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.TopProducersWithScore", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "simcluster_embedding_top_k_producers_by_fav_score_2020", - key_type = "com.twitter.simclusters_v2.thriftscala.PersistedFullClusterId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ProducerEmbeddingsInjections.SimClusterEmbeddingTopKProducersInjection", - segment_type = "snapshot", - tags = 
["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.TopProducersWithScore", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "producer_top_k_simcluster_embeddings_by_fav_score", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ProducerEmbeddingsInjections.ProducerTopKSimClusterEmbeddingsInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.TopSimClustersWithScore", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "producer_top_k_simcluster_embeddings_by_fav_score_updated", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ProducerEmbeddingsInjections.ProducerTopKSimClusterEmbeddingsInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.TopSimClustersWithScore", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "producer_top_k_simcluster_embeddings_by_fav_score_2020", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ProducerEmbeddingsInjections.ProducerTopKSimClusterEmbeddingsInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.TopSimClustersWithScore", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "simcluster_embedding_top_k_producers_by_follow_score", - key_type = "com.twitter.simclusters_v2.thriftscala.PersistedFullClusterId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ProducerEmbeddingsInjections.SimClusterEmbeddingTopKProducersInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.TopProducersWithScore", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "simcluster_embedding_top_k_producers_by_follow_score_updated", - key_type = "com.twitter.simclusters_v2.thriftscala.PersistedFullClusterId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ProducerEmbeddingsInjections.SimClusterEmbeddingTopKProducersInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.TopProducersWithScore", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "simcluster_embedding_top_k_producers_by_follow_score_2020", - key_type = "com.twitter.simclusters_v2.thriftscala.PersistedFullClusterId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ProducerEmbeddingsInjections.SimClusterEmbeddingTopKProducersInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.TopProducersWithScore", - scala_dependencies = [ - 
"src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "producer_top_k_simcluster_embeddings_by_follow_score", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ProducerEmbeddingsInjections.ProducerTopKSimClusterEmbeddingsInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.TopSimClustersWithScore", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "producer_top_k_simcluster_embeddings_by_follow_score_updated", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ProducerEmbeddingsInjections.ProducerTopKSimClusterEmbeddingsInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.TopSimClustersWithScore", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "producer_top_k_simcluster_embeddings_by_follow_score_2020", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ProducerEmbeddingsInjections.ProducerTopKSimClusterEmbeddingsInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.TopSimClustersWithScore", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "similar_users_by_fav_based_producer_embedding", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ProducerEmbeddingsInjections.SimilarUsersInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.hermit.candidate.thriftscala.Candidates", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "similar_users_by_follow_based_producer_embedding", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ProducerEmbeddingsInjections.SimilarUsersInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.hermit.candidate.thriftscala.Candidates", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "aggregatable_producer_simclusters_embeddings_by_log_fav_score", - key_type = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ProducerEmbeddingsInjections.ProducerSimClustersEmbeddingInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbedding", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "aggregatable_producer_simclusters_embeddings_by_log_fav_score_2020", - key_type = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId", - platform = "java8", - role = "cassowary", - scala_schema = 
"com.twitter.simclusters_v2.hdfs_sources.injections.ProducerEmbeddingsInjections.ProducerSimClustersEmbeddingInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbedding", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "aggregatable_producer_simclusters_embeddings_by_follow_score_2020", - key_type = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ProducerEmbeddingsInjections.ProducerSimClustersEmbeddingInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbedding", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "aggregatable_producer_simclusters_embeddings_by_log_fav_score_relaxed_fav_engagement_threshold_2020", - key_type = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ProducerEmbeddingsInjections.ProducerSimClustersEmbeddingInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbedding", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "aggregatable_producer_simclusters_embeddings_by_fav_score", - key_type = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ProducerEmbeddingsInjections.ProducerSimClustersEmbeddingInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbedding", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "aggregatable_producer_simclusters_embeddings_by_fav_score_2020", - key_type = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ProducerEmbeddingsInjections.ProducerSimClustersEmbeddingInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbedding", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "aggregatable_producer_simclusters_embeddings_by_log_fav_score_thrift", - java_schema = "com.twitter.simclusters_v2.thriftjava.SimClustersEmbeddingWithId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingWithId", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "aggregatable_producer_simclusters_embeddings_by_log_fav_score_2020_thrift", - java_schema = "com.twitter.simclusters_v2.thriftjava.SimClustersEmbeddingWithId", - 
platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingWithId", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "aggregatable_producer_simclusters_embeddings_by_follow_score_2020_thrift", - java_schema = "com.twitter.simclusters_v2.thriftjava.SimClustersEmbeddingWithId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingWithId", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "aggregatable_producer_simclusters_embeddings_by_fav_score_thrift", - java_schema = "com.twitter.simclusters_v2.thriftjava.SimClustersEmbeddingWithId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingWithId", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "aggregatable_producer_simclusters_embeddings_by_fav_score_2020_thrift", - java_schema = "com.twitter.simclusters_v2.thriftjava.SimClustersEmbeddingWithId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingWithId", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "aggregatable_producer_simclusters_embeddings_by_log_fav_score_relaxed_fav_engagement_threshold_2020_thrift", - java_schema = "com.twitter.simclusters_v2.thriftjava.SimClustersEmbeddingWithId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingWithId", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -# TWICE & Clustering datasets -create_datasets( - base_name = "interested_in_twice_by_largest_dim", - key_type = "com.twitter.simclusters_v2.thriftscala.SimClustersMultiEmbeddingId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.EntityEmbeddingsInjections.EntitySimClustersMultiEmbeddingInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.SimClustersMultiEmbedding", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "interested_in_twice_by_largest_dim_fav_score", - key_type = "com.twitter.simclusters_v2.thriftscala.SimClustersMultiEmbeddingId", - platform = 
"java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.EntityEmbeddingsInjections.EntitySimClustersMultiEmbeddingInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.SimClustersMultiEmbedding", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "interested_in_twice_by_largest_dim_2_day_update", - key_type = "com.twitter.simclusters_v2.thriftscala.SimClustersMultiEmbeddingId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.EntityEmbeddingsInjections.EntitySimClustersMultiEmbeddingInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.SimClustersMultiEmbedding", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "interested_in_twice_louvain", - key_type = "com.twitter.simclusters_v2.thriftscala.SimClustersMultiEmbeddingId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.EntityEmbeddingsInjections.EntitySimClustersMultiEmbeddingInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.SimClustersMultiEmbedding", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "interested_in_twice_connected_components", - key_type = "com.twitter.simclusters_v2.thriftscala.SimClustersMultiEmbeddingId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.EntityEmbeddingsInjections.EntitySimClustersMultiEmbeddingInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.SimClustersMultiEmbedding", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "clusters_members_largest_dim_ape_similarity", - key_type = "com.twitter.simclusters_v2.common.UserId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ClusteringInjections.OrderedClustersAndMembersInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.OrderedClustersAndMembers", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "clusters_members_largest_dim_ape_similarity_2_day_update", - key_type = "com.twitter.simclusters_v2.common.UserId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ClusteringInjections.OrderedClustersAndMembersInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.OrderedClustersAndMembers", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "clusters_members_louvain_ape_similarity", - key_type = "com.twitter.simclusters_v2.common.UserId", - platform = "java8", - role = "cassowary", - scala_schema = 
"com.twitter.simclusters_v2.hdfs_sources.injections.ClusteringInjections.OrderedClustersAndMembersInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.OrderedClustersAndMembers", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "clusters_members_connected_components_ape_similarity", - key_type = "com.twitter.simclusters_v2.common.UserId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ClusteringInjections.OrderedClustersAndMembersInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.OrderedClustersAndMembers", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -# End of TWICE & Clustering datasets - -create_datasets( - base_name = "simclusters_offline_tweet_cluster_scores", - description = "A dataset that contains the scores for tweet and cluster pairs", - java_schema = "com.twitter.simclusters_v2.thriftjava.TweetAndClusterScores", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.TweetAndClusterScores", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "simclusters_offline_tweet_top_k_clusters", - description = "A dataset that contains the top clusters for each tweet", - java_schema = "com.twitter.simclusters_v2.thriftjava.TweetTopKClustersWithScores", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.TweetTopKClustersWithScores", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "simclusters_offline_cluster_top_k_tweets", - description = "A dataset that contains the top tweets for each cluster", - java_schema = "com.twitter.simclusters_v2.thriftjava.ClusterTopKTweetsWithScores", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.ClusterTopKTweetsWithScores", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "simclusters_inferred_entities_from_known_for", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.InferredEntitiesInjections.InferredEntityInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.SimClustersInferredEntities", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "simclusters_inferred_entities_from_interested_in", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = 
"com.twitter.simclusters_v2.hdfs_sources.injections.InferredEntitiesInjections.InferredEntityInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.SimClustersInferredEntities", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "simclusters_inferred_entities_from_interested_in_keyed_by_cluster", - key_type = "Int", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.InferredEntitiesInjections.InferredEntityKeyedByClusterInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.SimClustersInferredEntities", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "co_engagement_top_k_similar_tweets", - description = "A dataset that contains the top similar tweets based on co-engagement", - java_schema = "com.twitter.simclusters_v2.thriftjava.TweetTopKTweetsWithScore", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.TweetTopKTweetsWithScore", - segment_type = "partitioned", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "rux_faved_top_k_tweets", - description = "A dataset that contains the top similar tweets based on rux fav-to-impression ratio", - java_schema = "com.twitter.simclusters_v2.thriftjava.TweetTopKTweetsWithScore", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.TweetTopKTweetsWithScore", - segment_type = "partitioned", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "semantic_core_embeddings_from_producer", - key_type = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.EntityEmbeddingsInjections.EntitySimClustersEmbeddingInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbedding", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "simclusters_v2_interested_in_from_producer_embeddings_20M_145K_updated", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.InterestedInInjection.injection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "simclusters_v2_interested_in_from_aggregatable_producer_embeddings_20M_145K_2020", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.InterestedInInjection.injection", - segment_type = 
"snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "geopopular_top_tweet_impressed_topics", - key_type = "String", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.SemanticCoreEntitiesInjections.StringToSemanticCoreEntityScoreListInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.recos.entities.thriftscala.SemanticCoreEntityScoreList", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "similar_topics_from_topic_follow_graph", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.SemanticCoreEntitiesInjections.LongToSemanticCoreEntityScoreListInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.recos.entities.thriftscala.SemanticCoreEntityScoreList", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "top_locale_topics_for_producer_from_em", - key_type = "com.twitter.recos.entities.thriftscala.UserIdWithLocale", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.SemanticCoreEntitiesInjections.UserWithLocaleToSemanticCoreEntityScoreListInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.recos.entities.thriftscala.SemanticCoreEntityScoreList", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "top_producers_for_locale_topics_from_topic_follow_graph", - key_type = "com.twitter.recos.entities.thriftscala.SemanticCoreEntityWithLocale", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.SemanticCoreEntitiesInjections.SemanticCoreEntityWithLocaleToUsersScoreListInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.recos.entities.thriftscala.UserScoreList", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "topic_top_producers_em", - key_type = "com.twitter.recos.entities.thriftscala.SemanticCoreEntityWithLocale", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.SemanticCoreEntitiesInjections.SemanticCoreEntityWithLocaleToUsersScoreListInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.recos.entities.thriftscala.UserScoreList", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "adhoc_abuse_simcluster_features", - java_schema = "com.twitter.simclusters_v2.thriftjava.AdhocSingleSideClusterScores", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.AdhocSingleSideClusterScores", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - 
"src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "search_abuse_simcluster_features_manhattan", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.SingleSideUserScoresInjection.injection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.SingleSideUserScores", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "adhoc_cross_simcluster_block_interaction_features", - java_schema = "com.twitter.simclusters_v2.thriftjava.AdhocCrossSimClusterInteractionScores", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.AdhocCrossSimClusterInteractionScores", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "adhoc_cross_simcluster_fav_interaction_features", - java_schema = "com.twitter.simclusters_v2.thriftjava.AdhocCrossSimClusterInteractionScores", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.AdhocCrossSimClusterInteractionScores", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "top_k_right_nouns", - key_type = "com.twitter.simclusters_v2.thriftscala.RightNodeTypeStruct", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.MultiTypeGraphInjections.topKRightNounListInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.NounWithFrequencyList", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "top_k_right_nouns_scio", - key_type = "com.twitter.simclusters_v2.thriftscala.RightNodeTypeStruct", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.MultiTypeGraphInjections.topKRightNounListInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.NounWithFrequencyList", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "offline_cluster_top_media_tweets_20M_145K_2020", - key_type = "com.twitter.simclusters_v2.thriftscala.DayPartitionedClusterId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ClusterTopMediaTweetsInjection.injection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.TweetsWithScore", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "truncated_multi_type_graph", - key_type = "com.twitter.simclusters_v2.thriftscala.LeftNode", - platform = "java8", - role = "cassowary", - scala_schema = 
"com.twitter.simclusters_v2.hdfs_sources.injections.MultiTypeGraphInjections.truncatedMultiTypeGraphInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.RightNodeWithEdgeWeightList", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "truncated_multi_type_graph_scio", - key_type = "com.twitter.simclusters_v2.thriftscala.LeftNode", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.MultiTypeGraphInjections.truncatedMultiTypeGraphInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.RightNodeWithEdgeWeightList", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "multi_type_graph_for_top_k_right_nodes_thrift_scio", - java_schema = "com.twitter.simclusters_v2.thriftjava.MultiTypeGraphEdge", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.MultiTypeGraphEdge", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "full_multi_type_graph", - java_schema = "com.twitter.simclusters_v2.thriftjava.MultiTypeGraphEdge", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.MultiTypeGraphEdge", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "right_node_sim_hash_scio", - java_schema = "com.twitter.simclusters_v2.thriftjava.RightNodeSimHashSketch", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.RightNodeSimHashSketch", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "right_node_cosine_similarity_scio", - key_type = "com.twitter.simclusters_v2.thriftscala.RightNode", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.MultiTypeGraphInjections.similarRightNodesInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.SimilarRightNodes", - scala_dependencies = [ - "src/scala/com/twitter/scalding_internal/multiformat/format", - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "user_topic_weighted_embedding", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.InterestedInInjection.injection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn", - scala_dependencies = [ - 
"src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "user_topic_weighted_embedding_parquet", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.UserTopicWeightedEmbedding", - segment_type = "snapshot", - tags = ["bazel-compatible"], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "explore_mbcg_user_embeddings_kv", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.EntityEmbeddingsInjections.UserMbcgEmbeddingInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.ml.api.thriftscala.Embedding", - scala_dependencies = [ - "src/scala/com/twitter/scalding_internal/multiformat/format", - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "offline_tweet_recommendations_from_interested_in_20M_145K_2020", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.MultiTypeGraphInjections.tweetRecommendationsInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.CandidateTweetsList", - scala_dependencies = [ - "src/scala/com/twitter/scalding_internal/multiformat/format", - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "offline_tweet_recommendations_from_interested_in_20M_145K_2020_HL_0_EL_15", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.MultiTypeGraphInjections.tweetRecommendationsInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.CandidateTweetsList", - scala_dependencies = [ - "src/scala/com/twitter/scalding_internal/multiformat/format", - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "offline_tweet_recommendations_from_interested_in_20M_145K_2020_HL_2_EL_15", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.MultiTypeGraphInjections.tweetRecommendationsInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.CandidateTweetsList", - scala_dependencies = [ - "src/scala/com/twitter/scalding_internal/multiformat/format", - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "offline_tweet_recommendations_from_interested_in_20M_145K_2020_HL_2_EL_50", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.MultiTypeGraphInjections.tweetRecommendationsInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.CandidateTweetsList", - scala_dependencies = [ - "src/scala/com/twitter/scalding_internal/multiformat/format", - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "offline_tweet_recommendations_from_interested_in_20M_145K_2020_HL_8_EL_50", - key_type = "Long", - platform = "java8", - role = "cassowary", - 
scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.MultiTypeGraphInjections.tweetRecommendationsInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.CandidateTweetsList", - scala_dependencies = [ - "src/scala/com/twitter/scalding_internal/multiformat/format", - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "offline_tweet_recommendations_from_mts_consumer_embeddings", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.MultiTypeGraphInjections.tweetRecommendationsInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.CandidateTweetsList", - scala_dependencies = [ - "src/scala/com/twitter/scalding_internal/multiformat/format", - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "fav_based_simclusters_cluster_to_tweet_index", - key_type = "com.twitter.simclusters_v2.thriftscala.FullClusterId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ClusterTopTweetsInjection.clusterIdToTopKTweetsInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.TopKTweetsWithScores", - scala_dependencies = [ - "src/scala/com/twitter/scalding_internal/multiformat/format", - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "video_view_based_simclusters_cluster_to_tweet_index", - key_type = "com.twitter.simclusters_v2.thriftscala.FullClusterId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ClusterTopTweetsInjection.clusterIdToTopKTweetsInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.TopKTweetsWithScores", - scala_dependencies = [ - "src/scala/com/twitter/scalding_internal/multiformat/format", - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "retweet_based_simclusters_cluster_to_tweet_index", - key_type = "com.twitter.simclusters_v2.thriftscala.FullClusterId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ClusterTopTweetsInjection.clusterIdToTopKTweetsInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.TopKTweetsWithScores", - scala_dependencies = [ - "src/scala/com/twitter/scalding_internal/multiformat/format", - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "reply_based_simclusters_cluster_to_tweet_index", - key_type = "com.twitter.simclusters_v2.thriftscala.FullClusterId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ClusterTopTweetsInjection.clusterIdToTopKTweetsInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.TopKTweetsWithScores", - scala_dependencies = [ - "src/scala/com/twitter/scalding_internal/multiformat/format", - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = 
"push_open_based_simclusters_cluster_to_tweet_index", - key_type = "com.twitter.simclusters_v2.thriftscala.FullClusterId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ClusterTopTweetsInjection.clusterIdToTopKTweetsInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.TopKTweetsWithScores", - scala_dependencies = [ - "src/scala/com/twitter/scalding_internal/multiformat/format", - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "ads_fav_based_simclusters_cluster_to_tweet_index", - key_type = "com.twitter.simclusters_v2.thriftscala.FullClusterId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ClusterTopTweetsInjection.clusterIdToTopKTweetsInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.TopKTweetsWithScores", - scala_dependencies = [ - "src/scala/com/twitter/scalding_internal/multiformat/format", - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "ads_fav_click_based_simclusters_cluster_to_tweet_index", - key_type = "com.twitter.simclusters_v2.thriftscala.FullClusterId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ClusterTopTweetsInjection.clusterIdToTopKTweetsInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.TopKTweetsWithScores", - scala_dependencies = [ - "src/scala/com/twitter/scalding_internal/multiformat/format", - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "fav_based_evergreen_content_simclusters_cluster_to_tweet_index", - key_type = "com.twitter.simclusters_v2.thriftscala.FullClusterId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ClusterTopTweetsInjection.clusterIdToTopKTweetsInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.TopKTweetsWithScores", - scala_dependencies = [ - "src/scala/com/twitter/scalding_internal/multiformat/format", - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "fav_based_video_simclusters_cluster_to_tweet_index", - key_type = "com.twitter.simclusters_v2.thriftscala.FullClusterId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ClusterTopTweetsInjection.clusterIdToTopKTweetsInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.TopKTweetsWithScores", - scala_dependencies = [ - "src/scala/com/twitter/scalding_internal/multiformat/format", - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "simclusters_v2_global_language_embedding", - key_type = "String", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.InterestedInInjection.languageInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn", - scala_dependencies = [ - 
"src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "simclusters_v2_global_language_embedding_thrift", - java_schema = "com.twitter.simclusters_v2.thriftjava.LanguageToClusters", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.LanguageToClusters", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/BUILD.docx b/src/scala/com/twitter/simclusters_v2/hdfs_sources/BUILD.docx new file mode 100644 index 000000000..945abb819 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/hdfs_sources/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/DataPaths.docx b/src/scala/com/twitter/simclusters_v2/hdfs_sources/DataPaths.docx new file mode 100644 index 000000000..746841c7d Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/hdfs_sources/DataPaths.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/DataPaths.scala b/src/scala/com/twitter/simclusters_v2/hdfs_sources/DataPaths.scala deleted file mode 100644 index 486a21f60..000000000 --- a/src/scala/com/twitter/simclusters_v2/hdfs_sources/DataPaths.scala +++ /dev/null @@ -1,49 +0,0 @@ -package com.twitter.simclusters_v2.hdfs_sources - -object DataPaths { - - val InterestedIn2020Path = - "/user/cassowary/manhattan_sequence_files/simclusters_v2_interested_in_20M_145K_2020" - - val InterestedIn2020ThriftPath = - "/user/cassowary/manhattan_sequence_files/simclusters_v2_interested_in_20M_145K_2020_thrift" - - val InterestedInLite2020Path = - "/user/cassowary/manhattan_sequence_files/simclusters_v2_interested_in_lite_20M_145K_2020" - - val InterestedInLite2020ThriftPath = - "/user/cassowary/manhattan_sequence_files/simclusters_v2_interested_in_lite_20M_145K_2020_thrift" - - val KnownFor2020Path = - "/user/cassowary/manhattan_sequence_files/simclusters_v2_known_for_20M_145K_2020" - - // keep this inside /user/cassowary/manhattan_sequence_files/ to use the latest 3 retention policy - val KnownFor2020ThriftDatasetPath = - "/user/cassowary/manhattan_sequence_files/simclusters_v2_known_for_20M_145K_2020_thrift" - - val OfflineClusterTopMediaTweets2020DatasetPath = - "/user/cassowary/manhattan_sequence_files/cluster_top_media_tweets_20M_145K_2020" -} - -/** - * These should only be accessed from simclusters_v2 data pipeline for intermediate data, these - * are not opt-out compliant and shouldn't be exposed externally. 
- */ -object InternalDataPaths { - // Internal versions, not to be read or written outside of simclusters_v2 - - private[simclusters_v2] val RawInterestedIn2020Path = - "/user/cassowary/manhattan_sequence_files/simclusters_v2_raw_interested_in_20M_145K_2020" - - private[simclusters_v2] val RawInterestedInLite2020Path = - "/user/cassowary/manhattan_sequence_files/simclusters_v2_raw_interested_in_lite_20M_145K_2020" - - private[simclusters_v2] val RawKnownForDec11Path = - "/user/cassowary/manhattan_sequence_files/simclusters_v2_raw_known_for_20M_145K_dec11" - - private[simclusters_v2] val RawKnownForUpdatedPath = - "/user/cassowary/manhattan_sequence_files/simclusters_v2_raw_known_for_20M_145K_updated" - - private[simclusters_v2] val RawKnownFor2020Path = - "/user/cassowary/manhattan_sequence_files/simclusters_v2_raw_known_for_20M_145K_2020" -} diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/DataSources.docx b/src/scala/com/twitter/simclusters_v2/hdfs_sources/DataSources.docx new file mode 100644 index 000000000..8790c34a5 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/hdfs_sources/DataSources.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/DataSources.scala b/src/scala/com/twitter/simclusters_v2/hdfs_sources/DataSources.scala deleted file mode 100644 index c72b25d3f..000000000 --- a/src/scala/com/twitter/simclusters_v2/hdfs_sources/DataSources.scala +++ /dev/null @@ -1,39 +0,0 @@ -package com.twitter.simclusters_v2.hdfs_sources - -import com.twitter.scalding.DateOps -import com.twitter.scalding.DateRange -import com.twitter.scalding.Days -import com.twitter.scalding.TypedPipe -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation -import com.twitter.scalding_internal.dalv2.remote_access.ProcAtla -import com.twitter.simclusters_v2.thriftscala.NormsAndCounts -import com.twitter.simclusters_v2.thriftscala.UserAndNeighbors -import java.util.TimeZone - -object DataSources { - - /** - * Reads production normalized graph data from atla-proc - */ - def userUserNormalizedGraphSource(implicit dateRange: DateRange): TypedPipe[UserAndNeighbors] = { - DAL - .readMostRecentSnapshotNoOlderThan(UserUserNormalizedGraphScalaDataset, Days(14)(DateOps.UTC)) - .withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .toTypedPipe - } - - /** - * Reads production user norms and counts data from atla-proc - */ - def userNormsAndCounts( - implicit dateRange: DateRange, - timeZone: TimeZone - ): TypedPipe[NormsAndCounts] = { - DAL - .readMostRecentSnapshot(ProducerNormsAndCountsScalaDataset, dateRange.prepend(Days(14))) - .withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .toTypedPipe - } - -} diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/EntityEmbeddingsSources.docx b/src/scala/com/twitter/simclusters_v2/hdfs_sources/EntityEmbeddingsSources.docx new file mode 100644 index 000000000..04f845f1a Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/hdfs_sources/EntityEmbeddingsSources.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/EntityEmbeddingsSources.scala b/src/scala/com/twitter/simclusters_v2/hdfs_sources/EntityEmbeddingsSources.scala deleted file mode 100644 index a8ad1a69b..000000000 --- a/src/scala/com/twitter/simclusters_v2/hdfs_sources/EntityEmbeddingsSources.scala +++ /dev/null @@ -1,222 +0,0 @@ -package com.twitter.simclusters_v2.hdfs_sources - -import com.twitter.dal.client.dataset.KeyValDALDataset -import 
com.twitter.scalding.DateRange -import com.twitter.scalding.typed.TypedPipe -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossDC -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.thriftscala._ -import com.twitter.wtf.entity_real_graph.thriftscala.EntityType -import com.twitter.simclusters_v2.common.ClusterId -import com.twitter.simclusters_v2.common.ModelVersions - -object EntityEmbeddingsSources { - - final val SemanticCoreSimClustersEmbeddingsDec11Dataset = - SemanticCoreSimclustersEmbeddingsScalaDataset - - final val SemanticCoreSimClustersEmbeddingsUpdatedDataset = - SemanticCoreSimclustersEmbeddingsUpdatedScalaDataset - - final val SemanticCoreSimClustersEmbeddings2020Dataset = - SemanticCoreSimclustersEmbeddings2020ScalaDataset - - final val SemanticCorePerLanguageSimClustersEmbeddingsDataset = - SemanticCorePerLanguageSimclustersEmbeddingsScalaDataset - - final val LogFavSemanticCorePerLanguageSimClustersEmbeddingsDataset = - LogFavSemanticCorePerLanguageSimclustersEmbeddingsScalaDataset - - final val HashtagSimClustersEmbeddingsUpdatedDataset = - HashtagSimclustersEmbeddingsUpdatedScalaDataset - - final val ReverseIndexSemanticCoreSimClustersEmbeddingsDec11Dataset = - ReverseIndexSemanticCoreSimclustersEmbeddingsScalaDataset - - final val ReverseIndexSemanticCoreSimClustersEmbeddingsUpdatedDataset = - ReverseIndexSemanticCoreSimclustersEmbeddingsUpdatedScalaDataset - - final val ReverseIndexSemanticCoreSimClustersEmbeddings2020Dataset = - ReverseIndexSemanticCoreSimclustersEmbeddings2020ScalaDataset - - final val ReverseIndexSemanticCorePerLanguageSimClustersEmbeddingsDataset = - ReverseIndexSemanticCorePerLanguageSimclustersEmbeddingsScalaDataset - - final val LogFavReverseIndexSemanticCorePerLanguageSimClustersEmbeddingsDataset = - LogFavReverseIndexSemanticCorePerLanguageSimclustersEmbeddingsScalaDataset - - final val ReverseIndexHashtagSimClustersEmbeddingsUpdatedDataset = - ReverseIndexHashtagSimclustersEmbeddingsUpdatedScalaDataset - - // Fav-based TFG topic embeddings built from user device languages - // Keyed by SimClustersEmbeddingId with InternalId.TopicId ((topic, language) pair, with country = None) - final val FavTfgTopicEmbeddingsDataset = FavTfgTopicEmbeddingsScalaDataset - - final val FavTfgTopicEmbeddingsParquetDataset = FavTfgTopicEmbeddingsParquetScalaDataset - - final val FavTfgTopicEmbeddings2020Dataset = FavTfgTopicEmbeddings2020ScalaDataset - - final val FavTfgTopicEmbeddings2020ParquetDataset = FavTfgTopicEmbeddings2020ParquetScalaDataset - - // Logfav-based TFG topic embeddings built from user device languages - // Keyed by SimClustersEmbeddingId with InternalId.LocaleEntityId ((topic, language) pair) - final val LogFavTfgTopicEmbeddingsDataset = LogFavTfgTopicEmbeddingsScalaDataset - - final val LogFavTfgTopicEmbeddingsParquetDataset = LogFavTfgTopicEmbeddingsParquetScalaDataset - - // Fav-based TFG topic embeddings built from inferred user consumed languages - // Keyed by SimClustersEmbeddingId with InternalId.TopicId ((topic, country, language) tuple) - final val FavInferredLanguageTfgTopicEmbeddingsDataset = - FavInferredLanguageTfgTopicEmbeddingsScalaDataset - - private val validSemanticCoreEmbeddingTypes = Seq( - EmbeddingType.FavBasedSematicCoreEntity, - EmbeddingType.FollowBasedSematicCoreEntity - ) - - /** - * Given a fav/follow/etc embedding type and a ModelVersion, retrieve the corresponding dataset of - * (SemanticCore entityId -> List(clusterId)) pairs for the given dateRange. - */ - def getSemanticCoreEntityEmbeddingsSource( - embeddingType: EmbeddingType, - modelVersion: String, - dateRange: DateRange - ): TypedPipe[(Long, SimClustersEmbedding)] = { - val dataSet = modelVersion match { - case ModelVersions.Model20M145KDec11 => SemanticCoreSimClustersEmbeddingsDec11Dataset - case ModelVersions.Model20M145KUpdated => SemanticCoreSimClustersEmbeddingsUpdatedDataset - case _ => throw new IllegalArgumentException(s"ModelVersion $modelVersion is not supported") - } - assert(validSemanticCoreEmbeddingTypes.contains(embeddingType)) - entityEmbeddingsSource(dataSet, embeddingType, dateRange) - } - - /** - * Given a fav/follow/etc embedding type and a ModelVersion, retrieve the corresponding dataset of - * (clusterId -> List(SemanticCore entityId)) pairs for the given dateRange. - */ - def getReverseIndexedSemanticCoreEntityEmbeddingsSource( - embeddingType: EmbeddingType, - modelVersion: String, - dateRange: DateRange - ): TypedPipe[(ClusterId, Seq[SemanticCoreEntityWithScore])] = { - val dataSet = modelVersion match { - case ModelVersions.Model20M145KDec11 => - ReverseIndexSemanticCoreSimClustersEmbeddingsDec11Dataset - case ModelVersions.Model20M145KUpdated => - ReverseIndexSemanticCoreSimClustersEmbeddingsUpdatedDataset - case ModelVersions.Model20M145K2020 => - ReverseIndexSemanticCoreSimClustersEmbeddings2020Dataset - case _ => throw new IllegalArgumentException(s"ModelVersion $modelVersion is not supported") - } - - assert(validSemanticCoreEmbeddingTypes.contains(embeddingType)) - reverseIndexedEntityEmbeddingsSource(dataSet, embeddingType, dateRange) - } - - // Return the raw DAL dataset reference. Use this if you're writing to DAL. - def getEntityEmbeddingsDataset( - entityType: EntityType, - modelVersion: String, - isEmbeddingsPerLocale: Boolean = false - ): KeyValDALDataset[KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]] = { - (entityType, modelVersion) match { - case (EntityType.SemanticCore, ModelVersions.Model20M145KDec11) => - SemanticCoreSimClustersEmbeddingsDec11Dataset - case (EntityType.SemanticCore, ModelVersions.Model20M145KUpdated) => - if (isEmbeddingsPerLocale) { - SemanticCorePerLanguageSimClustersEmbeddingsDataset - } else { - SemanticCoreSimClustersEmbeddingsUpdatedDataset - } - case (EntityType.SemanticCore, ModelVersions.Model20M145K2020) => - SemanticCoreSimClustersEmbeddings2020Dataset - case (EntityType.Hashtag, ModelVersions.Model20M145KUpdated) => - HashtagSimClustersEmbeddingsUpdatedDataset - case (entityType, modelVersion) => - throw new IllegalArgumentException( - s"(Entity Type, ModelVersion) ($entityType, $modelVersion) not supported.") - } - } - - // Return the raw DAL dataset reference. Use this if you're writing to DAL. 
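// For illustration, a hedged sketch of the write path against the dataset references returned
// by these two accessors (the DALWrite helper and the outputPath value are assumptions based on
// common scalding_internal usage, not part of this file):
//
//   import com.twitter.scalding_internal.dalv2.DALWrite._
//
//   val dataset = EntityEmbeddingsSources.getEntityEmbeddingsDataset(
//     EntityType.SemanticCore,
//     ModelVersions.Model20M145K2020
//   )
//   embeddingsPipe // TypedPipe[KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]]
//     .writeDALVersionedKeyValExecution(dataset, D.Suffix(outputPath))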
- def getReverseIndexedEntityEmbeddingsDataset( - entityType: EntityType, - modelVersion: String, - isEmbeddingsPerLocale: Boolean = false - ): KeyValDALDataset[KeyVal[SimClustersEmbeddingId, InternalIdEmbedding]] = { - (entityType, modelVersion) match { - case (EntityType.SemanticCore, ModelVersions.Model20M145KDec11) => - ReverseIndexSemanticCoreSimClustersEmbeddingsDec11Dataset - case (EntityType.SemanticCore, ModelVersions.Model20M145KUpdated) => - if (isEmbeddingsPerLocale) { - ReverseIndexSemanticCorePerLanguageSimClustersEmbeddingsDataset - } else { - ReverseIndexSemanticCoreSimClustersEmbeddingsUpdatedDataset - } - case (EntityType.SemanticCore, ModelVersions.Model20M145K2020) => - ReverseIndexSemanticCoreSimClustersEmbeddings2020Dataset - case (EntityType.Hashtag, ModelVersions.Model20M145KUpdated) => - ReverseIndexHashtagSimClustersEmbeddingsUpdatedDataset - case (entityType, modelVersion) => - throw new IllegalArgumentException( - s"(Entity Type, ModelVersion) ($entityType, $modelVersion) not supported.") - } - } - - private def entityEmbeddingsSource( - dataset: KeyValDALDataset[KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]], - embeddingType: EmbeddingType, - dateRange: DateRange - ): TypedPipe[(Long, SimClustersEmbedding)] = { - val pipe = DAL - .readMostRecentSnapshot(dataset, dateRange) - .withRemoteReadPolicy(AllowCrossDC) - .toTypedPipe - filterEntityEmbeddingsByType(pipe, embeddingType) - } - - private def reverseIndexedEntityEmbeddingsSource( - dataset: KeyValDALDataset[KeyVal[SimClustersEmbeddingId, InternalIdEmbedding]], - embeddingType: EmbeddingType, - dateRange: DateRange - ): TypedPipe[(ClusterId, Seq[SemanticCoreEntityWithScore])] = { - val pipe = DAL - .readMostRecentSnapshot(dataset, dateRange) - .withRemoteReadPolicy(AllowCrossDC) - .toTypedPipe - filterReverseIndexedEntityEmbeddingsByType(pipe, embeddingType) - } - - private[hdfs_sources] def filterEntityEmbeddingsByType( - pipe: TypedPipe[KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]], - embeddingType: EmbeddingType - ): TypedPipe[(Long, SimClustersEmbedding)] = { - pipe.collect { - case KeyVal( - SimClustersEmbeddingId(_embeddingType, _, InternalId.EntityId(entityId)), - embedding - ) if _embeddingType == embeddingType => - (entityId, embedding) - } - } - - private[hdfs_sources] def filterReverseIndexedEntityEmbeddingsByType( - pipe: TypedPipe[KeyVal[SimClustersEmbeddingId, InternalIdEmbedding]], - embeddingType: EmbeddingType - ): TypedPipe[(ClusterId, Seq[SemanticCoreEntityWithScore])] = { - pipe.collect { - case KeyVal( - SimClustersEmbeddingId(_embeddingType, _, InternalId.ClusterId(clusterId)), - embedding - ) if _embeddingType == embeddingType => - val entitiesWithScores = embedding.embedding.collect { - case InternalIdWithScore(InternalId.EntityId(entityId), score) => - SemanticCoreEntityWithScore(entityId, score) - } - (clusterId, entitiesWithScores) - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/InterestedInSources.docx b/src/scala/com/twitter/simclusters_v2/hdfs_sources/InterestedInSources.docx new file mode 100644 index 000000000..f13edcbd2 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/hdfs_sources/InterestedInSources.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/InterestedInSources.scala b/src/scala/com/twitter/simclusters_v2/hdfs_sources/InterestedInSources.scala deleted file mode 100644 index 518b0be9f..000000000 --- a/src/scala/com/twitter/simclusters_v2/hdfs_sources/InterestedInSources.scala +++ 
/dev/null @@ -1,178 +0,0 @@ -package com.twitter.simclusters_v2.hdfs_sources - -import com.twitter.dal.client.dataset.KeyValDALDataset -import com.twitter.scalding.{DateOps, DateRange, Days, TypedPipe} -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.scalding_internal.dalv2.remote_access.{ExplicitLocation, ProcAtla} -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.common.UserId -import com.twitter.simclusters_v2.thriftscala.ModelVersion -import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn -import java.util.TimeZone - -object InterestedInSources { - - private val ModelVersionInterestedInDatasetMap: Map[ModelVersion, KeyValDALDataset[ - KeyVal[UserId, ClustersUserIsInterestedIn] - ]] = Map( - ModelVersion.Model20m145kDec11 -> SimclustersV2InterestedInScalaDataset, - ModelVersion.Model20m145kUpdated -> SimclustersV2InterestedIn20M145KUpdatedScalaDataset, - ModelVersion.Model20m145k2020 -> SimclustersV2InterestedIn20M145K2020ScalaDataset - ) - - /** - * Internal version, not PDP compliant, not to be used outside simclusters_v2 - * Reads 20M145KDec11 production InterestedIn data from atla-proc, with a 14-day extended window - */ - private[simclusters_v2] def simClustersRawInterestedInDec11Source( - dateRange: DateRange, - timeZone: TimeZone - ): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = { - - DAL - .readMostRecentSnapshot( - SimclustersV2RawInterestedIn20M145KDec11ScalaDataset, - dateRange.prepend(Days(14)(timeZone)) - ) - .withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .toTypedPipe - .map { - case KeyVal(userId, clustersUserIsInterestedIn) => - (userId, clustersUserIsInterestedIn) - } - } - - /** - * Internal version, not PDP compliant, not to be used outside simclusters_v2 - * Reads 20M145KUpdated InterestedIn data from atla-proc, with a 14-day extended window - */ - private[simclusters_v2] def simClustersRawInterestedInUpdatedSource( - dateRange: DateRange, - timeZone: TimeZone - ): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = { - DAL - .readMostRecentSnapshot( - SimclustersV2RawInterestedIn20M145KUpdatedScalaDataset, - dateRange.prepend(Days(14)(timeZone)) - ) - .withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .toTypedPipe.map { - case KeyVal(userId, clustersUserIsInterestedIn) => - (userId, clustersUserIsInterestedIn) - } - } - - /** - * Internal version, not PDP compliant, not to be used outside simclusters_v2 - * Reads 20M145K2020 InterestedIn data from atla-proc, with a 14-day extended window - */ - private[simclusters_v2] def simClustersRawInterestedIn2020Source( - dateRange: DateRange, - timeZone: TimeZone - ): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = { - DAL - .readMostRecentSnapshot( - SimclustersV2RawInterestedIn20M145K2020ScalaDataset, - dateRange.prepend(Days(14)(timeZone)) - ) - .withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .toTypedPipe.map { - case KeyVal(userId, clustersUserIsInterestedIn) => - (userId, clustersUserIsInterestedIn) - } - } - - /** - * Internal version, not PDP compliant, not to be used outside simclusters_v2 - * Reads 20M145K2020 InterestedInLite data from atla-proc, with a 14-day extended window - */ - private[simclusters_v2] def simClustersRawInterestedInLite2020Source( - dateRange: DateRange, - timeZone: TimeZone - ): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = { - DAL - .readMostRecentSnapshot( - SimclustersV2RawInterestedInLite20M145K2020ScalaDataset, - dateRange.prepend(Days(14)(timeZone))) - .withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .toTypedPipe.map { - case KeyVal(userId, clustersUserIsInterestedIn) => - (userId, clustersUserIsInterestedIn) - } - } - - /** - * Reads 20M145KDec11 
production InterestedIn data from atla-proc, with a 14-day extended window - */ - def simClustersInterestedInDec11Source( - dateRange: DateRange, - timeZone: TimeZone - ): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = { - - DAL - .readMostRecentSnapshot( - SimclustersV2InterestedInScalaDataset, - dateRange.prepend(Days(14)(timeZone))) - .withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .toTypedPipe.map { - case KeyVal(userId, clustersUserIsInterestedIn) => - (userId, clustersUserIsInterestedIn) - } - } - - /** - * Reads 20M145KUpdated InterestedIn data from atla-proc, with a 14-day extended window - */ - def simClustersInterestedInUpdatedSource( - dateRange: DateRange, - timeZone: TimeZone - ): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = { - DAL - .readMostRecentSnapshot( - SimclustersV2InterestedIn20M145KUpdatedScalaDataset, - dateRange.prepend(Days(14)(timeZone)) - ) - .withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .toTypedPipe.map { - case KeyVal(userId, clustersUserIsInterestedIn) => - (userId, clustersUserIsInterestedIn) - } - } - - /** - * Reads 20M145K2020 InterestedIn data from atla-proc, with a 14-day extended window - */ - def simClustersInterestedIn2020Source( - dateRange: DateRange, - timeZone: TimeZone - ): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = { - DAL - .readMostRecentSnapshot( - SimclustersV2InterestedIn20M145K2020ScalaDataset, - dateRange.prepend(Days(14)(timeZone)) - ) - .withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .toTypedPipe.map { - case KeyVal(userId, clustersUserIsInterestedIn) => - (userId, clustersUserIsInterestedIn) - } - } - - /** - * Reads InterestedIn data based on ModelVersion from atla-proc, with a 14-day extended window - */ - def simClustersInterestedInSource( - modelVersion: ModelVersion, - dateRange: DateRange, - timeZone: TimeZone - ): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = { - - DAL - .readMostRecentSnapshot( - ModelVersionInterestedInDatasetMap(modelVersion), - dateRange.prepend(Days(14)(timeZone)) - ) - .withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .toTypedPipe.map { - case KeyVal(userId, clustersUserIsInterestedIn) => - (userId, clustersUserIsInterestedIn) - } - } - -} diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/ProducerEmbeddingSources.docx b/src/scala/com/twitter/simclusters_v2/hdfs_sources/ProducerEmbeddingSources.docx new file mode 100644 index 000000000..a6cb14b8f Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/hdfs_sources/ProducerEmbeddingSources.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/ProducerEmbeddingSources.scala b/src/scala/com/twitter/simclusters_v2/hdfs_sources/ProducerEmbeddingSources.scala deleted file mode 100644 index 01d391f11..000000000 --- a/src/scala/com/twitter/simclusters_v2/hdfs_sources/ProducerEmbeddingSources.scala +++ /dev/null @@ -1,86 +0,0 @@ -package com.twitter.simclusters_v2.hdfs_sources - -import com.twitter.scalding.DateRange -import com.twitter.scalding.TypedPipe -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossClusterSameDC -import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation -import com.twitter.scalding_internal.dalv2.remote_access.Proc3Atla -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.thriftscala.EmbeddingType -import com.twitter.simclusters_v2.thriftscala.InternalId -import com.twitter.simclusters_v2.thriftscala.ModelVersion -import 
com.twitter.simclusters_v2.thriftscala.SimClustersEmbedding -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId -import com.twitter.simclusters_v2.thriftscala.TopSimClustersWithScore - -object ProducerEmbeddingSources { - - /** - * Helper function to retrieve producer SimClusters embeddings with the legacy `TopSimClustersWithScore` - * value type. - */ - def producerEmbeddingSourceLegacy( - embeddingType: EmbeddingType, - modelVersion: ModelVersion - )( - implicit dateRange: DateRange - ): TypedPipe[(Long, TopSimClustersWithScore)] = { - val producerEmbeddingDataset = (embeddingType, modelVersion) match { - case (EmbeddingType.ProducerFollowBasedSemanticCoreEntity, ModelVersion.Model20m145kDec11) => - ProducerTopKSimclusterEmbeddingsByFollowScoreScalaDataset - case (EmbeddingType.ProducerFavBasedSemanticCoreEntity, ModelVersion.Model20m145kDec11) => - ProducerTopKSimclusterEmbeddingsByFavScoreScalaDataset - case ( - EmbeddingType.ProducerFollowBasedSemanticCoreEntity, - ModelVersion.Model20m145kUpdated) => - ProducerTopKSimclusterEmbeddingsByFollowScoreUpdatedScalaDataset - case (EmbeddingType.ProducerFavBasedSemanticCoreEntity, ModelVersion.Model20m145kUpdated) => - ProducerTopKSimclusterEmbeddingsByFavScoreUpdatedScalaDataset - case (_, _) => - throw new IllegalArgumentException( - "Unsupported embedding type: " + embeddingType + " and model version: " + modelVersion) - } - - DAL - .readMostRecentSnapshot(producerEmbeddingDataset).withRemoteReadPolicy( - AllowCrossClusterSameDC) - .toTypedPipe.map { - case KeyVal(producerId, topSimClustersWithScore) => - (producerId, topSimClustersWithScore) - } - } - - def producerEmbeddingSource( - embeddingType: EmbeddingType, - modelVersion: ModelVersion - )( - implicit dateRange: DateRange - ): TypedPipe[(Long, SimClustersEmbedding)] = { - val producerEmbeddingDataset = (embeddingType, modelVersion) match { - case (EmbeddingType.AggregatableLogFavBasedProducer, ModelVersion.Model20m145k2020) => - AggregatableProducerSimclustersEmbeddingsByLogFavScore2020ScalaDataset - case (EmbeddingType.AggregatableFollowBasedProducer, ModelVersion.Model20m145k2020) => - AggregatableProducerSimclustersEmbeddingsByFollowScore2020ScalaDataset - case (EmbeddingType.RelaxedAggregatableLogFavBasedProducer, ModelVersion.Model20m145k2020) => - AggregatableProducerSimclustersEmbeddingsByLogFavScoreRelaxedFavEngagementThreshold2020ScalaDataset - case (_, _) => - throw new IllegalArgumentException( - "Unsupported embedding type: " + embeddingType + " and model version: " + modelVersion) - } - - DAL - .readMostRecentSnapshot( - producerEmbeddingDataset - ) - .withRemoteReadPolicy(ExplicitLocation(Proc3Atla)) - .toTypedPipe - .map { - case KeyVal( - SimClustersEmbeddingId(_, _, InternalId.UserId(producerId: Long)), - embedding: SimClustersEmbedding) => - (producerId, embedding) - } - } - -} diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/BUILD b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/BUILD deleted file mode 100644 index 7926b5dac..000000000 --- a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/BUILD +++ /dev/null @@ -1,13 +0,0 @@ -scala_library( - sources = ["*.scala"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "src/scala/com/twitter/scalding_internal/multiformat/format", - "src/scala/com/twitter/simclusters_v2/common", - "src/thrift/com/twitter/hermit/candidate:hermit-candidate-scala", - "src/thrift/com/twitter/ml/api:embedding-scala", - 
"src/thrift/com/twitter/recos/entities:entities-thrift-scala", - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/BUILD.docx b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/BUILD.docx new file mode 100644 index 000000000..33a2b30d0 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/ClusterDetailsInjection.docx b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/ClusterDetailsInjection.docx new file mode 100644 index 000000000..1b6a480dc Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/ClusterDetailsInjection.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/ClusterDetailsInjection.scala b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/ClusterDetailsInjection.scala deleted file mode 100644 index 9f17cbad0..000000000 --- a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/ClusterDetailsInjection.scala +++ /dev/null @@ -1,16 +0,0 @@ -package com.twitter.simclusters_v2.hdfs_sources.injections - -import com.twitter.bijection.Bufferable -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{ - ScalaCompactThrift, - genericInjection -} -import com.twitter.simclusters_v2.thriftscala.ClusterDetails - -object ClusterDetailsInjection { - val injection = KeyValInjection[(String, Int), ClusterDetails]( - genericInjection(Bufferable.injectionOf[(String, Int)]), - ScalaCompactThrift(ClusterDetails) - ) -} diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/ClusterTopMediaTweetsInjection.docx b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/ClusterTopMediaTweetsInjection.docx new file mode 100644 index 000000000..a952b2893 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/ClusterTopMediaTweetsInjection.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/ClusterTopMediaTweetsInjection.scala b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/ClusterTopMediaTweetsInjection.scala deleted file mode 100644 index f542e0cbf..000000000 --- a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/ClusterTopMediaTweetsInjection.scala +++ /dev/null @@ -1,13 +0,0 @@ -package com.twitter.simclusters_v2.hdfs_sources.injections - -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift -import com.twitter.simclusters_v2.thriftscala.{TweetsWithScore, DayPartitionedClusterId} - -object ClusterTopMediaTweetsInjection { - - val injection = KeyValInjection[DayPartitionedClusterId, TweetsWithScore]( - ScalaCompactThrift(DayPartitionedClusterId), - ScalaCompactThrift(TweetsWithScore) - ) -} diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/ClusterTopTweetsInjection.docx b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/ClusterTopTweetsInjection.docx new file mode 100644 index 000000000..e3f552287 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/ClusterTopTweetsInjection.docx differ diff --git 
a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/ClusterTopTweetsInjection.scala b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/ClusterTopTweetsInjection.scala deleted file mode 100644 index e09176813..000000000 --- a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/ClusterTopTweetsInjection.scala +++ /dev/null @@ -1,14 +0,0 @@ -package com.twitter.simclusters_v2.hdfs_sources.injections - -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift -import com.twitter.simclusters_v2.thriftscala.TopKTweetsWithScores -import com.twitter.simclusters_v2.thriftscala.FullClusterId - -object ClusterTopTweetsInjection { - - val clusterIdToTopKTweetsInjection = KeyValInjection[FullClusterId, TopKTweetsWithScores]( - ScalaCompactThrift(FullClusterId), - ScalaCompactThrift(TopKTweetsWithScores) - ) -} diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/ClusteringInjections.docx b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/ClusteringInjections.docx new file mode 100644 index 000000000..bcca63879 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/ClusteringInjections.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/ClusteringInjections.scala b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/ClusteringInjections.scala deleted file mode 100644 index 22ba173ca..000000000 --- a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/ClusteringInjections.scala +++ /dev/null @@ -1,16 +0,0 @@ -package com.twitter.simclusters_v2.hdfs_sources.injections - -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaBinaryThrift -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.Long2BigEndian -import com.twitter.simclusters_v2.common.UserId -import com.twitter.simclusters_v2.thriftscala._ - -object ClusteringInjections { - - final val OrderedClustersAndMembersInjection: KeyValInjection[ - UserId, - OrderedClustersAndMembers - ] = - KeyValInjection(Long2BigEndian, ScalaBinaryThrift(OrderedClustersAndMembers)) -} diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/EntityEmbeddingsInjections.docx b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/EntityEmbeddingsInjections.docx new file mode 100644 index 000000000..8148f20d4 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/EntityEmbeddingsInjections.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/EntityEmbeddingsInjections.scala b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/EntityEmbeddingsInjections.scala deleted file mode 100644 index eb20bf3eb..000000000 --- a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/EntityEmbeddingsInjections.scala +++ /dev/null @@ -1,47 +0,0 @@ -package com.twitter.simclusters_v2.hdfs_sources.injections - -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaBinaryThrift -import com.twitter.simclusters_v2.thriftscala._ -import com.twitter.ml.api.thriftscala.Embedding -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.Long2BigEndian -import 
com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift - -object EntityEmbeddingsInjections { - - final val EntitySimClustersEmbeddingInjection: KeyValInjection[ - SimClustersEmbeddingId, - SimClustersEmbedding - ] = - KeyValInjection( - ScalaBinaryThrift(SimClustersEmbeddingId), - ScalaBinaryThrift(SimClustersEmbedding) - ) - - final val InternalIdEmbeddingInjection: KeyValInjection[ - SimClustersEmbeddingId, - InternalIdEmbedding - ] = - KeyValInjection( - ScalaBinaryThrift(SimClustersEmbeddingId), - ScalaBinaryThrift(InternalIdEmbedding) - ) - - final val EntitySimClustersMultiEmbeddingInjection: KeyValInjection[ - SimClustersMultiEmbeddingId, - SimClustersMultiEmbedding - ] = - KeyValInjection( - ScalaBinaryThrift(SimClustersMultiEmbeddingId), - ScalaBinaryThrift(SimClustersMultiEmbedding) - ) - - final val UserMbcgEmbeddingInjection: KeyValInjection[ - Long, - Embedding - ] = - KeyValInjection[Long, Embedding]( - Long2BigEndian, - ScalaCompactThrift(Embedding) - ) -} diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/InferredEntitiesInjections.docx b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/InferredEntitiesInjections.docx new file mode 100644 index 000000000..9bce36fa5 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/InferredEntitiesInjections.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/InferredEntitiesInjections.scala b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/InferredEntitiesInjections.scala deleted file mode 100644 index fcb637a9d..000000000 --- a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/InferredEntitiesInjections.scala +++ /dev/null @@ -1,27 +0,0 @@ -package com.twitter.simclusters_v2.hdfs_sources.injections - -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{ - Int2BigEndian, - Long2BigEndian, - ScalaCompactThrift -} -import com.twitter.simclusters_v2.thriftscala.SimClustersInferredEntities - -object InferredEntitiesInjections { - - final val InferredEntityInjection: KeyValInjection[Long, SimClustersInferredEntities] = - KeyValInjection( - Long2BigEndian, - ScalaCompactThrift(SimClustersInferredEntities) - ) - - final val InferredEntityKeyedByClusterInjection: KeyValInjection[ - Int, - SimClustersInferredEntities - ] = - KeyValInjection( - Int2BigEndian, - ScalaCompactThrift(SimClustersInferredEntities) - ) -} diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/InterestedInInjection.docx b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/InterestedInInjection.docx new file mode 100644 index 000000000..bb11d0be7 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/InterestedInInjection.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/InterestedInInjection.scala b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/InterestedInInjection.scala deleted file mode 100644 index c9642ee94..000000000 --- a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/InterestedInInjection.scala +++ /dev/null @@ -1,13 +0,0 @@ -package com.twitter.simclusters_v2.hdfs_sources.injections - -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.StringUtf8 -import 
com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.Long2BigEndian -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift -import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn - -object InterestedInInjection { - val injection = KeyValInjection(Long2BigEndian, ScalaCompactThrift(ClustersUserIsInterestedIn)) - val languageInjection = - KeyValInjection(StringUtf8, ScalaCompactThrift(ClustersUserIsInterestedIn)) -} diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/KnownForInjection.docx b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/KnownForInjection.docx new file mode 100644 index 000000000..6093443c5 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/KnownForInjection.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/KnownForInjection.scala b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/KnownForInjection.scala deleted file mode 100644 index 9aca921ee..000000000 --- a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/KnownForInjection.scala +++ /dev/null @@ -1,12 +0,0 @@ -package com.twitter.simclusters_v2.hdfs_sources.injections - -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{ - Long2BigEndian, - ScalaCompactThrift -} -import com.twitter.simclusters_v2.thriftscala._ - -object KnownForInjection { - val injection = KeyValInjection(Long2BigEndian, ScalaCompactThrift(ClustersUserIsKnownFor)) -} diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/MultiTypeGraphInjections.docx b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/MultiTypeGraphInjections.docx new file mode 100644 index 000000000..d4e2f97e5 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/MultiTypeGraphInjections.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/MultiTypeGraphInjections.scala b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/MultiTypeGraphInjections.scala deleted file mode 100644 index f674324c6..000000000 --- a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/MultiTypeGraphInjections.scala +++ /dev/null @@ -1,31 +0,0 @@ -package com.twitter.simclusters_v2.hdfs_sources.injections - -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift -import com.twitter.simclusters_v2.thriftscala.LeftNode -import com.twitter.simclusters_v2.thriftscala.NounWithFrequencyList -import com.twitter.simclusters_v2.thriftscala.RightNode -import com.twitter.simclusters_v2.thriftscala.RightNodeTypeStruct -import com.twitter.simclusters_v2.thriftscala.RightNodeWithEdgeWeightList -import com.twitter.simclusters_v2.thriftscala.SimilarRightNodes -import com.twitter.simclusters_v2.thriftscala.CandidateTweetsList -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.Long2BigEndian - -object MultiTypeGraphInjections { - final val truncatedMultiTypeGraphInjection = - KeyValInjection(ScalaCompactThrift(LeftNode), ScalaCompactThrift(RightNodeWithEdgeWeightList)) - final val topKRightNounListInjection = - KeyValInjection( - ScalaCompactThrift(RightNodeTypeStruct), - ScalaCompactThrift(NounWithFrequencyList)) - final val 
similarRightNodesInjection = - KeyValInjection[RightNode, SimilarRightNodes]( - ScalaCompactThrift(RightNode), - ScalaCompactThrift(SimilarRightNodes) - ) - final val tweetRecommendationsInjection = - KeyValInjection[Long, CandidateTweetsList]( - Long2BigEndian, - ScalaCompactThrift(CandidateTweetsList) - ) -} diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/ProducerEmbeddingsInjections.docx b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/ProducerEmbeddingsInjections.docx new file mode 100644 index 000000000..a423d7f0f Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/ProducerEmbeddingsInjections.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/ProducerEmbeddingsInjections.scala b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/ProducerEmbeddingsInjections.scala deleted file mode 100644 index 087b6acc5..000000000 --- a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/ProducerEmbeddingsInjections.scala +++ /dev/null @@ -1,45 +0,0 @@ -package com.twitter.simclusters_v2.hdfs_sources.injections - -import com.twitter.hermit.candidate.thriftscala.Candidates -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{ - Long2BigEndian, - ScalaBinaryThrift, - ScalaCompactThrift -} -import com.twitter.simclusters_v2.thriftscala.{ - PersistedFullClusterId, - SimClustersEmbedding, - SimClustersEmbeddingId, - TopProducersWithScore, - TopSimClustersWithScore -} - -object ProducerEmbeddingsInjections { - final val ProducerTopKSimClusterEmbeddingsInjection: KeyValInjection[ - Long, - TopSimClustersWithScore - ] = - KeyValInjection( - keyCodec = Long2BigEndian, - valueCodec = ScalaCompactThrift(TopSimClustersWithScore)) - - final val SimClusterEmbeddingTopKProducersInjection: KeyValInjection[ - PersistedFullClusterId, - TopProducersWithScore - ] = - KeyValInjection( - keyCodec = ScalaCompactThrift(PersistedFullClusterId), - valueCodec = ScalaCompactThrift(TopProducersWithScore)) - - final val SimilarUsersInjection: KeyValInjection[Long, Candidates] = - KeyValInjection(keyCodec = Long2BigEndian, valueCodec = ScalaCompactThrift(Candidates)) - - final val ProducerSimClustersEmbeddingInjection: KeyValInjection[ - SimClustersEmbeddingId, - SimClustersEmbedding - ] = - KeyValInjection( - keyCodec = ScalaBinaryThrift(SimClustersEmbeddingId), - valueCodec = ScalaBinaryThrift(SimClustersEmbedding)) -} diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/SemanticCoreEntitiesInjections.docx b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/SemanticCoreEntitiesInjections.docx new file mode 100644 index 000000000..3662d9ea3 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/SemanticCoreEntitiesInjections.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/SemanticCoreEntitiesInjections.scala b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/SemanticCoreEntitiesInjections.scala deleted file mode 100644 index 10f9d208f..000000000 --- a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/SemanticCoreEntitiesInjections.scala +++ /dev/null @@ -1,53 +0,0 @@ -package com.twitter.simclusters_v2.hdfs_sources.injections - -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection -import 
com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{ - Long2BigEndian, - ScalaCompactThrift, - StringUtf8 -} -import com.twitter.recos.entities.thriftscala.{ - SemanticCoreEntityScoreList, - SemanticCoreEntityWithLocale, - UserIdWithLocale, - UserScoreList -} - -object SemanticCoreEntitiesInjections { - - final val StringToSemanticCoreEntityScoreListInjection: KeyValInjection[ - String, - SemanticCoreEntityScoreList - ] = - KeyValInjection( - StringUtf8, - ScalaCompactThrift(SemanticCoreEntityScoreList) - ) - - final val LongToSemanticCoreEntityScoreListInjection: KeyValInjection[ - Long, - SemanticCoreEntityScoreList - ] = - KeyValInjection( - Long2BigEndian, - ScalaCompactThrift(SemanticCoreEntityScoreList) - ) - - final val UserWithLocaleToSemanticCoreEntityScoreListInjection: KeyValInjection[ - UserIdWithLocale, - SemanticCoreEntityScoreList - ] = - KeyValInjection( - ScalaCompactThrift(UserIdWithLocale), - ScalaCompactThrift(SemanticCoreEntityScoreList) - ) - - final val SemanticCoreEntityWithLocaleToUsersScoreListInjection: KeyValInjection[ - SemanticCoreEntityWithLocale, - UserScoreList - ] = - KeyValInjection( - ScalaCompactThrift(SemanticCoreEntityWithLocale), - ScalaCompactThrift(UserScoreList) - ) -} diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/SingleSideUserScoresInjection.docx b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/SingleSideUserScoresInjection.docx new file mode 100644 index 000000000..ea37bab42 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/SingleSideUserScoresInjection.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/SingleSideUserScoresInjection.scala b/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/SingleSideUserScoresInjection.scala deleted file mode 100644 index d3fb79901..000000000 --- a/src/scala/com/twitter/simclusters_v2/hdfs_sources/injections/SingleSideUserScoresInjection.scala +++ /dev/null @@ -1,12 +0,0 @@ -package com.twitter.simclusters_v2.hdfs_sources.injections - -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{ - Long2BigEndian, - ScalaCompactThrift -} -import com.twitter.simclusters_v2.thriftscala.SingleSideUserScores - -object SingleSideUserScoresInjection { - val injection = KeyValInjection(Long2BigEndian, ScalaCompactThrift(SingleSideUserScores)) -} diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/presto_hdfs_sources/BUILD b/src/scala/com/twitter/simclusters_v2/hdfs_sources/presto_hdfs_sources/BUILD deleted file mode 100644 index 0b02e4ce9..000000000 --- a/src/scala/com/twitter/simclusters_v2/hdfs_sources/presto_hdfs_sources/BUILD +++ /dev/null @@ -1,60 +0,0 @@ -scala_library( - sources = ["*.scala"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":data_sources", - "3rdparty/src/jvm/com/twitter/scalding:core", - "src/scala/com/twitter/scalding_internal/dalv2", - "src/scala/com/twitter/scalding_internal/multiformat/format", - "src/scala/com/twitter/scalding_internal/source/lzo_scrooge", - "src/scala/com/twitter/simclusters_v2/common", - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - "src/thrift/com/twitter/wtf/entity_real_graph:entity_real_graph-thrift-scala", - ], -) - -scala_library( - name = "data_sources", - sources = [], - description = "DAL datasets we wish to expose externally", - platform = "java8", - tags 
= ["bazel-compatible"], - dependencies = [ - ":reverse_index_semantic_core_per_language_simclusters_embeddings_presto-scala", - ":semantic_core_per_language_simclusters_embeddings_presto-scala", - "src/scala/com/twitter/simclusters_v2/common", - ], -) - -create_datasets( - base_name = "reverse_index_semantic_core_per_language_simclusters_embeddings_presto", - java_schema = "com.twitter.simclusters_v2.thriftjava.InternalIdEmbeddingWithId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.InternalIdEmbeddingWithId", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "semantic_core_per_language_simclusters_embeddings_presto", - java_schema = "com.twitter.simclusters_v2.thriftjava.SimClustersEmbeddingWithId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingWithId", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/presto_hdfs_sources/BUILD.docx b/src/scala/com/twitter/simclusters_v2/hdfs_sources/presto_hdfs_sources/BUILD.docx new file mode 100644 index 000000000..5193e1755 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/hdfs_sources/presto_hdfs_sources/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/presto_hdfs_sources/EntityEmbeddingsPrestoSources.docx b/src/scala/com/twitter/simclusters_v2/hdfs_sources/presto_hdfs_sources/EntityEmbeddingsPrestoSources.docx new file mode 100644 index 000000000..167e7eaa0 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/hdfs_sources/presto_hdfs_sources/EntityEmbeddingsPrestoSources.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/hdfs_sources/presto_hdfs_sources/EntityEmbeddingsPrestoSources.scala b/src/scala/com/twitter/simclusters_v2/hdfs_sources/presto_hdfs_sources/EntityEmbeddingsPrestoSources.scala deleted file mode 100644 index 740d0fadd..000000000 --- a/src/scala/com/twitter/simclusters_v2/hdfs_sources/presto_hdfs_sources/EntityEmbeddingsPrestoSources.scala +++ /dev/null @@ -1,10 +0,0 @@ -package com.twitter.simclusters_v2.hdfs_sources.presto_hdfs_sources - -object EntityEmbeddingsPrestoSources { - - final val SemanticCorePerLanguageSimClustersEmbeddingsDataset = - SemanticCorePerLanguageSimclustersEmbeddingsPrestoScalaDataset - - final val ReverseIndexSemanticCorePerLanguageSimClustersEmbeddingsDataset = - ReverseIndexSemanticCorePerLanguageSimclustersEmbeddingsPrestoScalaDataset -} diff --git a/src/scala/com/twitter/simclusters_v2/images/bipartite_graph.png b/src/scala/com/twitter/simclusters_v2/images/bipartite_graph.png deleted file mode 100644 index 15baf9b82..000000000 Binary files a/src/scala/com/twitter/simclusters_v2/images/bipartite_graph.png and /dev/null differ diff --git a/src/scala/com/twitter/simclusters_v2/images/interestedin.png b/src/scala/com/twitter/simclusters_v2/images/interestedin.png deleted file mode 100644 index 28142e633..000000000 Binary files a/src/scala/com/twitter/simclusters_v2/images/interestedin.png and 
/dev/null differ diff --git a/src/scala/com/twitter/simclusters_v2/images/knownfor.png b/src/scala/com/twitter/simclusters_v2/images/knownfor.png deleted file mode 100644 index 7625caf3a..000000000 Binary files a/src/scala/com/twitter/simclusters_v2/images/knownfor.png and /dev/null differ diff --git a/src/scala/com/twitter/simclusters_v2/images/producer_embeddings.png b/src/scala/com/twitter/simclusters_v2/images/producer_embeddings.png deleted file mode 100644 index 054e12242..000000000 Binary files a/src/scala/com/twitter/simclusters_v2/images/producer_embeddings.png and /dev/null differ diff --git a/src/scala/com/twitter/simclusters_v2/images/producer_producer_similarity.png b/src/scala/com/twitter/simclusters_v2/images/producer_producer_similarity.png deleted file mode 100644 index 616ca56c0..000000000 Binary files a/src/scala/com/twitter/simclusters_v2/images/producer_producer_similarity.png and /dev/null differ diff --git a/src/scala/com/twitter/simclusters_v2/images/topic_embeddings.png b/src/scala/com/twitter/simclusters_v2/images/topic_embeddings.png deleted file mode 100644 index 758ad1acf..000000000 Binary files a/src/scala/com/twitter/simclusters_v2/images/topic_embeddings.png and /dev/null differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/BUILD b/src/scala/com/twitter/simclusters_v2/scalding/BUILD deleted file mode 100644 index eb0a31038..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/BUILD +++ /dev/null @@ -1,521 +0,0 @@ -scala_library( - sources = ["*.scala"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/fasterxml/jackson:jackson-module-scala", - "3rdparty/jvm/com/fasterxml/jackson/core:jackson-core", - "3rdparty/jvm/com/fasterxml/jackson/core:jackson-databind", - "3rdparty/jvm/com/fasterxml/jackson/module:jackson-module-scala", - "3rdparty/jvm/com/googlecode/matrix-toolkits-java", - "3rdparty/jvm/com/twitter/storehaus:algebra", - "3rdparty/jvm/com/twitter/storehaus:core", - "escherbird/src/scala/com/twitter/escherbird/scalding/source", - "flockdb-tools/datasets/flock:flock-follows-edges-scala", - "src/java/com/twitter/ml/api/constant", - "src/java/com/twitter/sbf/core", - "src/java/com/twitter/sbf/graph", - "src/scala/com/twitter/frigate/user_sampler/common", - "src/scala/com/twitter/ml/api:api-base", - "src/scala/com/twitter/ml/api/bq", - "src/scala/com/twitter/pluck/source/cassowary:sims", - "src/scala/com/twitter/pluck/source/core_workflows/user_model:condensed_user_state-scala", - "src/scala/com/twitter/scalding_internal/dalv2", - "src/scala/com/twitter/scalding_internal/job", - "src/scala/com/twitter/scalding_internal/job/analytics_batch", - "src/scala/com/twitter/scalding_internal/source", - "src/scala/com/twitter/scalding_internal/source/lzo_scrooge", - "src/scala/com/twitter/simclusters_v2/candidate_source", - "src/scala/com/twitter/simclusters_v2/hdfs_sources", - "src/scala/com/twitter/simclusters_v2/scalding/common", - "src/scala/com/twitter/simclusters_v2/summingbird/common", - "src/scala/com/twitter/timelines/prediction/features/common", - "src/scala/com/twitter/timelines/prediction/features/itl", - "src/scala/com/twitter/timelines/prediction/features/recap", - "src/scala/com/twitter/wtf/entity_real_graph/scalding/common", - "src/thrift/com/twitter/hermit/candidate:hermit-candidate-scala", - "src/thrift/com/twitter/wtf/scalding/sims:sims-thrift-scala", - "twadoop_config/configuration/log_categories/group/recos-platform:content_recommender_get_content_recommendations-scala", - 
"twadoop_config/configuration/log_categories/group/recos-platform:content_recommender_get_topic_tweets_recommendations-scala", - "twadoop_config/configuration/log_categories/group/timeline:timeline_service_favorites-scala", - "usersource/snapshot/src/main/scala/com/twitter/usersource/snapshot/flat:usersource_flat-scala", - "usersource/snapshot/src/main/thrift/com/twitter/usersource/snapshot/flat:flat-scala", - "util/util-core:util-core-util", - ], -) - -hadoop_binary( - name = "evd_cluster_similarity", - main = "com.twitter.simclusters_v2.scalding.EigenVectorsForClusterSimilarityAdhoc", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":scalding", - ], -) - -hadoop_binary( - name = "cluster_evaluation", - main = "com.twitter.simclusters_v2.scalding.ClusterEvaluationAdhoc", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":scalding", - ], -) - -hadoop_binary( - name = "cluster_evaluation_20m_145k", - main = "com.twitter.simclusters_v2.scalding.ClusterEvaluationFor20M145K", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":scalding", - ], -) - -hadoop_binary( - name = "cluster_evaluation_20m_145k_2020", - main = "com.twitter.simclusters_v2.scalding.ClusterEvaluationFor20M145K2020", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":scalding", - ], -) - -hadoop_binary( - name = "bp_cluster_evaluation", - main = "com.twitter.simclusters_v2.scalding.BipartiteClusterEvaluation", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":scalding", - ], -) - -hadoop_binary( - name = "update_knownfor", - main = "com.twitter.simclusters_v2.scalding.UpdateKnownForAdhoc", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":scalding", - ], -) - -hadoop_binary( - name = "update_knownfor_prod", - main = "com.twitter.simclusters_v2.scalding.UpdateKnownFor20M145K", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":scalding", - ], -) - -hadoop_binary( - name = "cluster_details", - main = "com.twitter.simclusters_v2.scalding.ClusterDetailsBatch", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":scalding", - ], -) - -hadoop_binary( - name = "cluster_details_20m_145k_updated", - main = "com.twitter.simclusters_v2.scalding.ClusterDetails20M145KUpdated", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":scalding", - ], -) - -hadoop_binary( - name = "cluster_details_20m_145k_2020", - main = "com.twitter.simclusters_v2.scalding.ClusterDetails20M145K2020", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":scalding", - ], -) - -hadoop_binary( - 
name = "cluster_details-adhoc", - main = "com.twitter.simclusters_v2.scalding.ClusterDetailsAdhoc", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":scalding", - ], -) - -hadoop_binary( - name = "cluster_details-dump", - main = "com.twitter.simclusters_v2.scalding.DumpClusterDetailsAdhoc", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":scalding", - ], -) - -hadoop_binary( - name = "interested_in", - main = "com.twitter.simclusters_v2.scalding.InterestedInFromKnownForBatch", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":scalding", - ], -) - -hadoop_binary( - name = "interested_in_from_producer_embeddings", - main = "com.twitter.simclusters_v2.scalding.InterestedInFromProducerEmbeddingsBatchApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":scalding", - ], -) - -hadoop_binary( - name = "employee_graph_from_user_user", - main = "com.twitter.simclusters_v2.scalding.EmployeeGraphFromUserUser", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":scalding", - ], -) - -hadoop_binary( - name = "interested_in_20m_145k_updated", - main = "com.twitter.simclusters_v2.scalding.InterestedInFromKnownFor20M145KUpdated", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":scalding", - ], -) - -hadoop_binary( - name = "interested_in_20m_145k_2020", - main = "com.twitter.simclusters_v2.scalding.InterestedInFromKnownFor20M145K2020", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":scalding", - ], -) - -hadoop_binary( - name = "interested_in_lite_20m_145k_2020", - main = "com.twitter.simclusters_v2.scalding.InterestedInFromKnownForLite20M145K2020", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":scalding", - ], -) - -hadoop_binary( - name = "interested_in_lite_20m_145k_2020-adhoc", - main = "com.twitter.simclusters_v2.scalding.InterestedInFromKnownForLite20M145K2020Adhoc", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":scalding", - ], -) - -hadoop_binary( - name = "interested_in_from_ape_2020-adhoc", - main = "com.twitter.simclusters_v2.scalding.InterestedInFromAPE2020AdhocApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":scalding", - ], -) - -hadoop_binary( - name = "interested_in_from_ape_2020", - main = "com.twitter.simclusters_v2.scalding.InterestedInFromAPE2020BatchApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":scalding", - ], -) - -hadoop_binary( - name = "known_for_to_mh", - main = 
"com.twitter.simclusters_v2.scalding.KnownForToMHBatch", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":scalding", - ], -) - -hadoop_binary( - name = "user_user_normalized_graph", - main = "com.twitter.simclusters_v2.scalding.UserUserNormalizedGraphBatch", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":scalding", - ], -) - -hadoop_binary( - name = "user_user_graph", - main = "com.twitter.simclusters_v2.scalding.UserUserGraphBatch", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":scalding", - ], -) - -hadoop_binary( - name = "user_user_graph-adhoc", - main = "com.twitter.simclusters_v2.scalding.UserUserGraphAdhoc", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":scalding", - ], -) - -hadoop_binary( - name = "producer_norms_and_counts", - main = "com.twitter.simclusters_v2.scalding.ProducerNormsAndCountsBatch", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":scalding", - ], -) - -hadoop_binary( - name = "fav_graph", - main = "com.twitter.simclusters_v2.scalding.UserUserFavGraphBatch", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":scalding", - ], -) - -hadoop_binary( - name = "top_users_similarity_graph", - main = "com.twitter.simclusters_v2.scalding.TopUsersSimilarityGraphApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":scalding", - ], -) - -hadoop_binary( - name = "top_users_only", - main = "com.twitter.simclusters_v2.scalding.TopUsersOnlyApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":scalding", - ], -) - -hadoop_binary( - name = "dump_fav_graph_adhoc", - main = "com.twitter.simclusters_v2.scalding.DumpFavGraphAdhoc", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":scalding", - ], -) - -# Generated with `capesospy-v2 create_target interested_in_for_20M_145k_2020 src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml`, config hash 8f19bf. 
-scalding_job( - name = "interested_in_for_20M_145k_2020", - main = "com.twitter.simclusters_v2.scalding.InterestedInFromKnownFor20M145K2020", - args = ["--socialProofThreshold 2 --maxClustersPerUser 50"], - config = [ - ("hadoop.combine-input", "true"), - ("hadoop.map.jvm.total-memory", "3072m"), - ("hadoop.reduce.jvm.total-memory", "3072m"), - ("hadoop.submitter.jvm.total-memory", "5120m"), - ("submitter.tier", "preemptible"), - ], - cron = "14 * * * *", - hadoop_cluster = "atla-proc", - platform = "java8", - role = "cassowary", - runtime_platform = "java8", - tags = [ - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":scalding", - ], -) diff --git a/src/scala/com/twitter/simclusters_v2/scalding/BUILD.docx b/src/scala/com/twitter/simclusters_v2/scalding/BUILD.docx new file mode 100644 index 000000000..cb3defb72 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/BipartiteClusterEvaluation.docx b/src/scala/com/twitter/simclusters_v2/scalding/BipartiteClusterEvaluation.docx new file mode 100644 index 000000000..fa635b76e Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/BipartiteClusterEvaluation.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/BipartiteClusterEvaluation.scala b/src/scala/com/twitter/simclusters_v2/scalding/BipartiteClusterEvaluation.scala deleted file mode 100644 index 0382b1472..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/BipartiteClusterEvaluation.scala +++ /dev/null @@ -1,513 +0,0 @@ -package com.twitter.simclusters_v2.scalding - -import com.twitter.algebird.Aggregator -import com.twitter.algebird.Monoid -import com.twitter.scalding._ -import com.twitter.scalding.commons.source.VersionedKeyValSource -import com.twitter.scalding.typed.TypedPipe -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation -import com.twitter.scalding_internal.dalv2.remote_access.ProcAtla -import com.twitter.scalding_internal.job.TwitterExecutionApp -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.hdfs_sources.AdhocKeyValSources -import com.twitter.simclusters_v2.hdfs_sources.NormsAndCountsFixedPathSource -import com.twitter.simclusters_v2.hdfs_sources.ProducerNormsAndCountsScalaDataset -import com.twitter.simclusters_v2.hdfs_sources.SimclustersV2InterestedInScalaDataset -import com.twitter.simclusters_v2.hdfs_sources.UserAndNeighborsFixedPathSource -import com.twitter.simclusters_v2.hdfs_sources.UserUserNormalizedGraphScalaDataset -import com.twitter.simclusters_v2.scalding.BipartiteClusterEvaluationClasses._ -import com.twitter.simclusters_v2.scalding.common.TypedRichPipe._ -import com.twitter.simclusters_v2.scalding.common.Util -import com.twitter.simclusters_v2.thriftscala.BipartiteClusterQuality -import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn -import com.twitter.simclusters_v2.thriftscala.NeighborWithWeights -import com.twitter.simclusters_v2.thriftscala.NormsAndCounts -import com.twitter.simclusters_v2.thriftscala.UserAndNeighbors -import scala.collection.JavaConverters._ - -object BipartiteClusterEvaluation extends TwitterExecutionApp { - - implicit val tz: java.util.TimeZone = DateOps.UTC - implicit val dp = DateParser.default - - private def getClusterL2Norms( - knownFor: TypedPipe[(Long, Array[(Int, Float)])] - ): Execution[Map[Int, Float]] = { - 
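- // A worked sketch of the Map-monoid reduction below: each (clusterId, score)
- // pair becomes a singleton Map(clusterId -> score * score); `.sum` merges the
- // maps by adding values per key, and the final `.map` square-roots each entry.
- // E.g. scores 3.0 and 4.0 for cluster 7 reduce to Map(7 -> 25.0), giving norm 5.0.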
knownFor - .flatMap { - case (_, clusterArray) => - clusterArray.map { - case (clusterId, score) => - Map(clusterId -> score * score) - } - } - .sum - .getExecution - .map(_.mapValues { x => math.sqrt(x).toFloat }) - } - - def l2NormalizeKnownFor( - knownFor: TypedPipe[(Long, Array[(Int, Float)])] - ): Execution[TypedPipe[(Long, Array[(Int, Float)])]] = { - getClusterL2Norms(knownFor).map { clusterToNorms => - knownFor.mapValues { clusterScoresArray => - clusterScoresArray.map { - case (clusterId, score) => - (clusterId, score / clusterToNorms(clusterId)) - } - } - } - } - - /** - * ./bazel bundle src/scala/com/twitter/simclusters_v2/scalding:bp_cluster_evaluation && \ - * oscar hdfs --user frigate --host hadoopnest2.atla.twitter.com --bundle bp_cluster_evaluation \ - * --tool com.twitter.simclusters_v2.scalding.BipartiteClusterEvaluation --screen --screen-detached \ - * --tee logs/newBpQuality_updateUnnormalizedScores_interestedInUsing20190329Graph_evaluatedOn20190329Graph_run2 \ - * -- --normsAndCountsDir /user/frigate/your_ldap/producerNormsAndCounts_20190330 \ - * --graphInputDir /user/frigate/your_ldap/user_user_normalized_graph_copiedFromAtlaProc_20190329 \ - * --knownForDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/knownFor \ - * --interestedInDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/interestedInUsing20190329Graph \ - * --outgoingVolumesResultsDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/bpQualityForInterestedInUsing20190329On20190329Graph_outgoingVolumes \ - * --incomingVolumesResultsDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/bpQualityForInterestedInUsing20190329On20190329Graph_incomingVolumes \ - * --outputDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/bpQualityForInterestedInUsing20190329On20190329Graph_perCluster \ - * --toEmailAddress your_ldap@twitter.com --modelVersion 20M_145K_updated - */ - override def job: Execution[Unit] = Execution.getConfigMode.flatMap { - case (config, mode) => - Execution.withId { implicit uniqueId => - val args = config.getArgs - - val interestedIn = args.optional("interestedInDir") match { - case Some(dir) => - TypedPipe - .from(AdhocKeyValSources.interestedInSource(dir)) - case None => - DAL - .readMostRecentSnapshotNoOlderThan( - SimclustersV2InterestedInScalaDataset, - Days(20) - ) - .withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .toTypedPipe - .map { - case KeyVal(key, value) => (key, value) - } - } - - val inputKnownFor = args - .optional("knownForDir") - .map { location => KnownForSources.readKnownFor(location) } - .getOrElse(KnownForSources.knownFor_20M_Dec11_145K) - - val modelVersion = - args.optional("modelVersion").getOrElse("20M_145K_dec11") - - val useLogFavWeights = args.boolean("useLogFavWeights") - - val shouldL2NormalizeKnownFor = args.boolean("l2NormalizeKnownFor") - - val toEmailAddressOpt = args.optional("toEmailAddress") - - val knownForExec = if (shouldL2NormalizeKnownFor) { - l2NormalizeKnownFor(inputKnownFor) - } else { - Execution.from(inputKnownFor) - } - - val finalExec = knownForExec.flatMap { knownFor => - val graph = args.optional("graphInputDir") match { - case Some(dir) => - TypedPipe.from(UserAndNeighborsFixedPathSource(dir)) - case None => - DAL -
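- // No --graphInputDir supplied: fall back to the most recent DAL snapshot of the
- // normalized user-user graph (at most 20 days old), read remotely from atla-proc.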
.readMostRecentSnapshotNoOlderThan(UserUserNormalizedGraphScalaDataset, Days(20)) - .withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .toTypedPipe - } - - val producerNormsAndCounts = args.optional("normsAndCountsDir") match { - case Some(dir) => - TypedPipe.from(NormsAndCountsFixedPathSource(dir)) - case None => - DAL - .readMostRecentSnapshotNoOlderThan(ProducerNormsAndCountsScalaDataset, Days(20)) - .withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .toTypedPipe - } - - val clusterIncomingVolumesExec = loadOrMake( - computeClusterIncomingVolumes(knownFor, producerNormsAndCounts, useLogFavWeights), - modelVersion, - args("incomingVolumesResultsDir") - ) - - val resultsWithOutgoingVolumesExec = loadOrMake( - getResultsWithOutgoingVolumes(graph, interestedIn, useLogFavWeights), - modelVersion, - args("outgoingVolumesResultsDir") - ) - - val finalPerClusterResultsExec = - finalPerClusterResults( - knownFor, - interestedIn, - resultsWithOutgoingVolumesExec, - clusterIncomingVolumesExec) - .flatMap { pipe => loadOrMake(pipe, modelVersion, args("outputDir")) } - - finalPerClusterResultsExec.flatMap { finalPerClusterResults => - val perClusterResults = finalPerClusterResults.values - val distributionResultsExec = getClusterResultsSummary(perClusterResults).map { - case Some(summary) => - "Summary of results across clusters: \n" + - Util.prettyJsonMapper.writeValueAsString(summary) - case _ => - "No summary of results! The cluster level results pipe must be empty!" - } - - val overallResultsExec = perClusterResults.sum.toOptionExecution.map { - case Some(overallQuality) => - "Overall Quality: \n" + - Util.prettyJsonMapper.writeValueAsString( - printableBipartiteQuality(overallQuality) - ) - case _ => - "No overall quality! The cluster level results pipe must be empty!" - } - - Execution.zip(distributionResultsExec, overallResultsExec).map { - case (distResults, overallResults) => - toEmailAddressOpt.foreach { address => - Util.sendEmail( - distResults + "\n" + overallResults, - "Bipartite cluster quality for " + modelVersion, - address - ) - } - println(distResults + "\n" + overallResults) - } - } - } - Util.printCounters(finalExec) - } - } - - def getResultsWithOutgoingVolumes( - graph: TypedPipe[UserAndNeighbors], - interestedIn: TypedPipe[(Long, ClustersUserIsInterestedIn)], - useLogFavWeights: Boolean - ): TypedPipe[(Int, BipartiteClusterQuality)] = { - graph - .map { un => (un.userId, un.neighbors) } - // should this be a leftJoin? For now, leaving it as an inner join. If in the future - // we want to compare two approaches with very different coverages on interestedIn, this - // could become a problem.
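- // Concretely: with the inner join, users absent from interestedIn contribute
- // nothing to the outgoing-volume denominators, so recall is measured only over
- // covered users; a leftJoin would instead charge uncovered users against recall.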
- .join(interestedIn) - .withReducers(4000) - .flatMap { - case (userId, (neighbors, clusters)) => - getBIResultsFromSingleUser(userId, neighbors, clusters, useLogFavWeights) - } - .sumByKey - .withReducers(600) - .map { - case (clusterId, bir) => - ( - clusterId, - BipartiteClusterQuality( - inClusterFollowEdges = Some(bir.inClusterWeights.isFollowEdge), - inClusterFavEdges = Some(bir.inClusterWeights.isFavEdge), - favWtSumOfInClusterFollowEdges = Some(bir.inClusterWeights.favWtIfFollowEdge), - favWtSumOfInClusterFavEdges = Some(bir.inClusterWeights.favWtIfFavEdge), - outgoingFollowEdges = Some(bir.totalOutgoingVolumes.isFollowEdge), - outgoingFavEdges = Some(bir.totalOutgoingVolumes.isFavEdge), - favWtSumOfOutgoingFollowEdges = Some(bir.totalOutgoingVolumes.favWtIfFollowEdge), - favWtSumOfOutgoingFavEdges = Some(bir.totalOutgoingVolumes.favWtIfFavEdge), - interestedInSize = Some(bir.interestedInSize), - sampledEdges = Some( - bir.edgeSample - .iterator() - .asScala - .toSeq - .map { - case (edge, data) => makeThriftSampledEdge(edge, data) - } - ) - ) - ) - } - } - - def getBIResultsFromSingleUser( - userId: Long, - neighbors: Seq[NeighborWithWeights], - clusters: ClustersUserIsInterestedIn, - useLogFavScores: Boolean - ): List[(Int, BipartiteIntermediateResults)] = { - val neighborsToWeights = neighbors.map { neighborAndWeights => - val isFollowEdge = neighborAndWeights.isFollowed match { - case Some(true) => 1.0 - case _ => 0.0 - } - val favScore = if (useLogFavScores) { - neighborAndWeights.logFavScore.getOrElse(0.0) - } else neighborAndWeights.favScoreHalfLife100Days.getOrElse(0.0) - val isFavEdge = math.min(1, math.ceil(favScore)) - neighborAndWeights.neighborId -> Weights( - isFollowEdge, - isFavEdge, - favScore * isFollowEdge, - favScore - ) - }.toMap - - val outgoingVolumes = Monoid.sum(neighborsToWeights.values)(WeightsMonoid) - - clusters.clusterIdToScores.toList.map { - case (clusterId, scoresStruct) => - val inClusterNeighbors = - (scoresStruct.usersBeingFollowed.getOrElse(Nil) ++ - scoresStruct.usersThatWereFaved.getOrElse(Nil)).toSet - val edgesForSampling = inClusterNeighbors.flatMap { neighborId => - if (neighborsToWeights.contains(neighborId)) { - Some( - (userId, neighborId), - SampledEdgeData( - neighborsToWeights(neighborId).favWtIfFollowEdge, - neighborsToWeights(neighborId).favWtIfFavEdge, - scoresStruct.followScore.getOrElse(0.0), - scoresStruct.favScore.getOrElse(0.0) - ) - ) - } else { - None - } - } - - val inClusterWeights = - Monoid.sum(neighborsToWeights.filterKeys(inClusterNeighbors).values)(WeightsMonoid) - - ( - clusterId, - BipartiteIntermediateResults( - inClusterWeights, - outgoingVolumes, - 1, - samplerMonoid.build(edgesForSampling) - )) - } - } - - def computeClusterIncomingVolumes( - knownFor: TypedPipe[(Long, Array[(Int, Float)])], - producerNormsAndCounts: TypedPipe[NormsAndCounts], - useLogFavWeights: Boolean - ): TypedPipe[(Int, BipartiteClusterQuality)] = { - producerNormsAndCounts - .map { x => (x.userId, x) } - .join(knownFor) - .withReducers(100) - .flatMap { - case (userId, (normsAndCounts, clusters)) => - clusters.map { - case (clusterId, _) => - val followerCount = - normsAndCounts.followerCount.getOrElse(0L).toDouble - val faverCount = normsAndCounts.faverCount.getOrElse(0L).toDouble - val favWtSumOfIncomingFollows = if (useLogFavWeights) { - normsAndCounts.logFavWeightsOnFollowEdgesSum.getOrElse(0.0) - } else { - normsAndCounts.favWeightsOnFollowEdgesSum.getOrElse(0.0) - } - val favWtSumOfIncomingFavs = if (useLogFavWeights) { - 
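- // As with the follow-edge sums above, prefer the precomputed log(fav) totals
- // when the evaluation weights edges by log-fav rather than raw fav score.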
normsAndCounts.logFavWeightsOnFavEdgesSum.getOrElse(0.0) - } else { - normsAndCounts.favWeightsOnFavEdgesSum.getOrElse(0.0) - } - ( - clusterId, - BipartiteClusterQuality( - incomingFollowEdges = Some(followerCount), - incomingFavEdges = Some(faverCount), - favWtSumOfIncomingFollowEdges = Some(favWtSumOfIncomingFollows), - favWtSumOfIncomingFavEdges = Some(favWtSumOfIncomingFavs) - )) - } - } - .sumByKey - .toTypedPipe - } - - def loadOrMake( - pipe: TypedPipe[(Int, BipartiteClusterQuality)], - modelVersion: String, - path: String - ): Execution[TypedPipe[(Int, BipartiteClusterQuality)]] = { - val mapped = pipe.map { - case (clusterId, struct) => ((modelVersion, clusterId), struct) - } - makeForKeyValSource(mapped, AdhocKeyValSources.bipartiteQualitySource(path), path).map { pipe => - // discard model version - pipe.map { case ((_, clusterId), struct) => (clusterId, struct) } - } - } - - def makeForKeyValSource[K, V]( - pipe: TypedPipe[(K, V)], - dest: VersionedKeyValSource[K, V], - path: String - ): Execution[TypedPipe[(K, V)]] = - Execution.getMode.flatMap { mode => - if (dest.resourceExists(mode)) { - println(s"validated path $path") - Execution.from(TypedPipe.from(dest)) - } else { - println(s"Could not load from $path") - pipe.writeThrough(dest) - } - } - - def precisionOfWholeGraph( - knownFor: TypedPipe[(Long, Array[(Int, Float)])], - interestedIn: TypedPipe[(Long, ClustersUserIsInterestedIn)], - clusterIncomingVolumesExec: Execution[TypedPipe[(Int, BipartiteClusterQuality)]] - ): Execution[Option[Double]] = { - val knownForSizeExec = knownFor.aggregate(Aggregator.size).toOptionExecution - val interestedInSizeExec = - interestedIn.aggregate(Aggregator.size).toOptionExecution - val numExec = clusterIncomingVolumesExec.flatMap { volumes => - volumes.values.flatMap(_.favWtSumOfIncomingFavEdges).sum.toOptionExecution - } - Execution.zip(numExec, interestedInSizeExec, knownForSizeExec).map { - case (Some(num), Some(interestedInSize), Some(knownForSize)) => - Some(num / interestedInSize / knownForSize) - case x @ _ => - println("Precision of whole graph zip: " + x) - None - } - } - - def finalPerClusterResults( - knownFor: TypedPipe[(Long, Array[(Int, Float)])], - interestedIn: TypedPipe[(Long, ClustersUserIsInterestedIn)], - resultsWithOutgoingVolumesExec: Execution[TypedPipe[(Int, BipartiteClusterQuality)]], - incomingVolumesExec: Execution[TypedPipe[(Int, BipartiteClusterQuality)]] - ): Execution[TypedPipe[(Int, BipartiteClusterQuality)]] = { - val knownForTranspose = KnownForSources.transpose(knownFor) - - val precisionOfWholeGraphExec = - precisionOfWholeGraph(knownFor, interestedIn, incomingVolumesExec) - - Execution - .zip(resultsWithOutgoingVolumesExec, incomingVolumesExec, precisionOfWholeGraphExec) - .map { - case (resultsWithOutgoingVolumes, clusterIncomingVolumes, precisionOfWholeGraph) => - println("Precision of whole graph " + precisionOfWholeGraph) - resultsWithOutgoingVolumes - .join(knownForTranspose) - .leftJoin(clusterIncomingVolumes) - .withReducers(500) - .map { - case (clusterId, ((outgoingVolumeQuality, knownForList), incomingVolumesOpt)) => - val incomingVolumes = - incomingVolumesOpt.getOrElse(BipartiteClusterQuality()) - val knownForMap = knownForList.toMap - ( - clusterId, - getFullQuality( - outgoingVolumeQuality, - incomingVolumes, - knownForMap, - precisionOfWholeGraph)) - } - } - } - - def getFullQuality( - qualityWithOutgoingVolumes: BipartiteClusterQuality, - incomingVolumes: BipartiteClusterQuality, - knownFor: Map[Long, Float], - precisionOfWholeGraph: 
Option[Double] - ): BipartiteClusterQuality = { - val newSampledEdges = qualityWithOutgoingVolumes.sampledEdges.map { sampledEdges => - sampledEdges.map { sampledEdge => - val knownForScore = knownFor.getOrElse(sampledEdge.followeeId, 0.0f) - sampledEdge.copy( - predictedFollowScore = sampledEdge.followScoreToCluster.map { x => x * knownForScore }, - predictedFavScore = sampledEdge.favScoreToCluster.map { x => x * knownForScore } - ) - } - } - val correlationOfFavWtIfFollow = newSampledEdges.map { samples => - val pairs = samples.map { s => - (s.predictedFollowScore.getOrElse(0.0), s.favWtIfFollowEdge.getOrElse(0.0)) - } - Util.computeCorrelation(pairs.iterator) - } - val correlationOfFavWtIfFav = newSampledEdges.map { samples => - val pairs = samples.map { s => - (s.predictedFavScore.getOrElse(0.0), s.favWtIfFavEdge.getOrElse(0.0)) - } - Util.computeCorrelation(pairs.iterator) - } - val relativePrecisionNum = { - if (qualityWithOutgoingVolumes.interestedInSize.exists(_ > 0) && knownFor.nonEmpty) { - qualityWithOutgoingVolumes.favWtSumOfInClusterFavEdges - .getOrElse(0.0) / qualityWithOutgoingVolumes.interestedInSize.get / knownFor.size - } else 0.0 - } - val relativePrecision = if (precisionOfWholeGraph.exists(_ > 0.0)) { - Some(relativePrecisionNum / precisionOfWholeGraph.get) - } else None - qualityWithOutgoingVolumes.copy( - incomingFollowEdges = incomingVolumes.incomingFollowEdges, - incomingFavEdges = incomingVolumes.incomingFavEdges, - favWtSumOfIncomingFollowEdges = incomingVolumes.favWtSumOfIncomingFollowEdges, - favWtSumOfIncomingFavEdges = incomingVolumes.favWtSumOfIncomingFavEdges, - knownForSize = Some(knownFor.size), - correlationOfFavWtIfFollowWithPredictedFollow = correlationOfFavWtIfFollow, - correlationOfFavWtIfFavWithPredictedFav = correlationOfFavWtIfFav, - sampledEdges = newSampledEdges, - relativePrecisionUsingFavWtIfFav = relativePrecision, - averagePrecisionOfWholeGraphUsingFavWtIfFav = precisionOfWholeGraph - ) - } -} - -object DumpBpQuality extends TwitterExecutionApp { - def job: Execution[Unit] = Execution.getConfigMode.flatMap { - case (config, mode) => - Execution.withId { implicit uniqueId => - val args = config.getArgs - val inputDir = args("inputDir") - - val clusters = args.list("clusters").map(_.toInt).toSet - val input = - TypedPipe - .from(AdhocKeyValSources.bipartiteQualitySource(inputDir)) - .map { - case ((modelVersion, clusterId), quality) => - ( - (modelVersion, clusterId), - BipartiteClusterEvaluationClasses - .printableBipartiteQuality(quality)) - } - - if (clusters.isEmpty) { - input.printSummary("Bipartite quality") - } else { - input - .collect { - case rec @ ((_, clusterId), quality) if clusters(clusterId) => - Util.prettyJsonMapper - .writeValueAsString(rec) - .replaceAll("\n", " ") - } - .toIterableExecution - .map { strings => println(strings.mkString("\n")) } - } - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/BipartiteClusterEvaluationClasses.docx b/src/scala/com/twitter/simclusters_v2/scalding/BipartiteClusterEvaluationClasses.docx new file mode 100644 index 000000000..1f7438358 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/BipartiteClusterEvaluationClasses.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/BipartiteClusterEvaluationClasses.scala b/src/scala/com/twitter/simclusters_v2/scalding/BipartiteClusterEvaluationClasses.scala deleted file mode 100644 index f5acc5365..000000000 --- 
a/src/scala/com/twitter/simclusters_v2/scalding/BipartiteClusterEvaluationClasses.scala +++ /dev/null @@ -1,316 +0,0 @@ -package com.twitter.simclusters_v2.scalding - -import com.twitter.algebird.{Monoid, OptionMonoid, Semigroup} -import com.twitter.algebird.mutable.PriorityQueueMonoid -import com.twitter.scalding.Execution -import com.twitter.scalding.typed.TypedPipe -import com.twitter.simclusters_v2.scalding.common.Util -import com.twitter.simclusters_v2.scalding.common.Util.Distribution -import com.twitter.simclusters_v2.thriftscala.{BipartiteClusterQuality, SampledEdge} -import java.util.PriorityQueue -import scala.collection.JavaConverters._ - -object BipartiteClusterEvaluationClasses { - case class Weights( - isFollowEdge: Double, - isFavEdge: Double, - favWtIfFollowEdge: Double, - favWtIfFavEdge: Double) - - object WeightsMonoid extends Monoid[Weights] { - override def zero = Weights(0.0, 0.0, 0.0, 0.0) - - override def plus(l: Weights, r: Weights): Weights = { - Weights( - l.isFollowEdge + r.isFollowEdge, - l.isFavEdge + r.isFavEdge, - l.favWtIfFollowEdge + r.favWtIfFollowEdge, - l.favWtIfFavEdge + r.favWtIfFavEdge - ) - } - } - - implicit val wm: Monoid[Weights] = WeightsMonoid - - case class SampledEdgeData( - favWtIfFollowEdge: Double, - favWtIfFavEdge: Double, - followScoreToCluster: Double, - favScoreToCluster: Double) - - implicit val samplerMonoid: PriorityQueueMonoid[((Long, Long), SampledEdgeData)] = - Util.reservoirSamplerMonoidForPairs[(Long, Long), SampledEdgeData](2000)(Util.edgeOrdering) - - implicit val sampledEdgesMonoid: PriorityQueueMonoid[SampledEdge] = - Util.reservoirSamplerMonoid( - 10000, - { sampledEdge: SampledEdge => (sampledEdge.followerId, sampledEdge.followeeId) } - )(Util.edgeOrdering) - - case class BipartiteIntermediateResults( - inClusterWeights: Weights, - totalOutgoingVolumes: Weights, - interestedInSize: Int, - edgeSample: PriorityQueue[((Long, Long), SampledEdgeData)]) { - override def toString: String = { - "BCR(%s, %s, %d, %s)".format( - inClusterWeights, - totalOutgoingVolumes, - interestedInSize, - edgeSample.iterator().asScala.toSeq.toString() - ) - } - } - - object BIRMonoid extends Monoid[BipartiteIntermediateResults] { - override def zero = - BipartiteIntermediateResults(WeightsMonoid.zero, WeightsMonoid.zero, 0, samplerMonoid.zero) - - override def plus( - l: BipartiteIntermediateResults, - r: BipartiteIntermediateResults - ): BipartiteIntermediateResults = { - BipartiteIntermediateResults( - WeightsMonoid.plus(l.inClusterWeights, r.inClusterWeights), - WeightsMonoid.plus(l.totalOutgoingVolumes, r.totalOutgoingVolumes), - l.interestedInSize + r.interestedInSize, - samplerMonoid.plus(l.edgeSample, r.edgeSample) - ) - } - } - - implicit val bIRMonoid: Monoid[BipartiteIntermediateResults] = BIRMonoid - - def makeThriftSampledEdge(edge: (Long, Long), data: SampledEdgeData): SampledEdge = { - val (followerId, followeeId) = edge - SampledEdge( - followerId = followerId, - followeeId = followeeId, - favWtIfFollowEdge = Some(data.favWtIfFollowEdge), - favWtIfFavEdge = Some(data.favWtIfFavEdge), - followScoreToCluster = Some(data.followScoreToCluster), - favScoreToCluster = Some(data.favScoreToCluster) - ) - } - - object ClusterQualitySemigroup extends Semigroup[BipartiteClusterQuality] { - val doubleOM: Monoid[Option[Double]] = new OptionMonoid[Double] - val intOM: Monoid[Option[Int]] = new OptionMonoid[Int] - val longOM: Monoid[Option[Long]] = new OptionMonoid[Long] - - override def plus(l: BipartiteClusterQuality, r: BipartiteClusterQuality) 
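- // Merges two per-cluster quality structs field-wise: Option-valued volume
- // counters add under OptionMonoid, the two edge samples are re-sampled into a
- // single 10K reservoir, and non-additive fields (correlations, relative
- // precision, interestedInSize) are reset to None to be recomputed downstream.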
= - BipartiteClusterQuality( - inClusterFollowEdges = doubleOM.plus(l.inClusterFollowEdges, r.inClusterFollowEdges), - inClusterFavEdges = doubleOM.plus(l.inClusterFavEdges, r.inClusterFavEdges), - favWtSumOfInClusterFollowEdges = doubleOM - .plus(l.favWtSumOfInClusterFollowEdges, r.favWtSumOfInClusterFollowEdges), - favWtSumOfInClusterFavEdges = doubleOM - .plus(l.favWtSumOfInClusterFavEdges, r.favWtSumOfInClusterFavEdges), - outgoingFollowEdges = doubleOM.plus(l.outgoingFollowEdges, r.outgoingFollowEdges), - outgoingFavEdges = doubleOM.plus(l.outgoingFavEdges, r.outgoingFavEdges), - favWtSumOfOutgoingFollowEdges = doubleOM - .plus(l.favWtSumOfOutgoingFollowEdges, r.favWtSumOfOutgoingFollowEdges), - favWtSumOfOutgoingFavEdges = doubleOM - .plus(l.favWtSumOfOutgoingFavEdges, r.favWtSumOfOutgoingFavEdges), - incomingFollowEdges = doubleOM.plus(l.incomingFollowEdges, r.incomingFollowEdges), - incomingFavEdges = doubleOM.plus(l.incomingFavEdges, r.incomingFavEdges), - favWtSumOfIncomingFollowEdges = doubleOM - .plus(l.favWtSumOfIncomingFollowEdges, r.favWtSumOfIncomingFollowEdges), - favWtSumOfIncomingFavEdges = doubleOM - .plus(l.favWtSumOfIncomingFavEdges, r.favWtSumOfIncomingFavEdges), - interestedInSize = None, - sampledEdges = Some( - sampledEdgesMonoid - .plus( - sampledEdgesMonoid.build(l.sampledEdges.getOrElse(Nil)), - sampledEdgesMonoid.build(r.sampledEdges.getOrElse(Nil)) - ) - .iterator() - .asScala - .toSeq), - knownForSize = intOM.plus(l.knownForSize, r.knownForSize), - correlationOfFavWtIfFollowWithPredictedFollow = None, - correlationOfFavWtIfFavWithPredictedFav = None, - relativePrecisionUsingFavWtIfFav = None, - averagePrecisionOfWholeGraphUsingFavWtIfFav = l.averagePrecisionOfWholeGraphUsingFavWtIfFav - ) - } - - implicit val bcqSemigroup: Semigroup[BipartiteClusterQuality] = - ClusterQualitySemigroup - - case class PrintableBipartiteQuality( - incomingFollowUnweightedRecall: String, - incomingFavUnweightedRecall: String, - incomingFollowWeightedRecall: String, - incomingFavWeightedRecall: String, - outgoingFollowUnweightedRecall: String, - outgoingFavUnweightedRecall: String, - outgoingFollowWeightedRecall: String, - outgoingFavWeightedRecall: String, - incomingFollowEdges: String, - incomingFavEdges: String, - favWtSumOfIncomingFollowEdges: String, - favWtSumOfIncomingFavEdges: String, - outgoingFollowEdges: String, - outgoingFavEdges: String, - favWtSumOfOutgoingFollowEdges: String, - favWtSumOfOutgoingFavEdges: String, - correlationOfFavWtIfFollow: String, - correlationOfFavWtIfFav: String, - relativePrecisionUsingFavWt: String, - averagePrecisionOfWholeGraphUsingFavWt: String, - interestedInSize: String, - knownForSize: String) - - def printableBipartiteQuality(in: BipartiteClusterQuality): PrintableBipartiteQuality = { - def getRatio(numOpt: Option[Double], denOpt: Option[Double]): String = { - val r = if (denOpt.exists(_ > 0)) { - numOpt.getOrElse(0.0) / denOpt.get - } else 0.0 - "%.3f".format(r) - } - - val formatter = new java.text.DecimalFormat("###,###.#") - - def denString(denOpt: Option[Double]): String = - formatter.format(denOpt.getOrElse(0.0)) - - val correlationOfFavWtIfFollow = - in.correlationOfFavWtIfFollowWithPredictedFollow match { - case None => - in.sampledEdges.map { samples => - val pairs = samples.map { s => - (s.predictedFollowScore.getOrElse(0.0), s.favWtIfFollowEdge.getOrElse(0.0)) - } - Util.computeCorrelation(pairs.iterator) - } - case x @ _ => x - } - - val correlationOfFavWtIfFav = - in.correlationOfFavWtIfFavWithPredictedFav match { - case 
None => - in.sampledEdges.map { samples => - val pairs = samples.map { s => - (s.predictedFavScore.getOrElse(0.0), s.favWtIfFavEdge.getOrElse(0.0)) - } - Util.computeCorrelation(pairs.iterator) - } - case x @ _ => x - } - - PrintableBipartiteQuality( - incomingFollowUnweightedRecall = getRatio(in.inClusterFollowEdges, in.incomingFollowEdges), - incomingFavUnweightedRecall = getRatio(in.inClusterFavEdges, in.incomingFavEdges), - incomingFollowWeightedRecall = - getRatio(in.favWtSumOfInClusterFollowEdges, in.favWtSumOfIncomingFollowEdges), - incomingFavWeightedRecall = - getRatio(in.favWtSumOfInClusterFavEdges, in.favWtSumOfIncomingFavEdges), - outgoingFollowUnweightedRecall = getRatio(in.inClusterFollowEdges, in.outgoingFollowEdges), - outgoingFavUnweightedRecall = getRatio(in.inClusterFavEdges, in.outgoingFavEdges), - outgoingFollowWeightedRecall = - getRatio(in.favWtSumOfInClusterFollowEdges, in.favWtSumOfOutgoingFollowEdges), - outgoingFavWeightedRecall = - getRatio(in.favWtSumOfInClusterFavEdges, in.favWtSumOfOutgoingFavEdges), - incomingFollowEdges = denString(in.incomingFollowEdges), - incomingFavEdges = denString(in.incomingFavEdges), - favWtSumOfIncomingFollowEdges = denString(in.favWtSumOfIncomingFollowEdges), - favWtSumOfIncomingFavEdges = denString(in.favWtSumOfIncomingFavEdges), - outgoingFollowEdges = denString(in.outgoingFollowEdges), - outgoingFavEdges = denString(in.outgoingFavEdges), - favWtSumOfOutgoingFollowEdges = denString(in.favWtSumOfOutgoingFollowEdges), - favWtSumOfOutgoingFavEdges = denString(in.favWtSumOfOutgoingFavEdges), - correlationOfFavWtIfFollow = "%.3f" - .format(correlationOfFavWtIfFollow.getOrElse(0.0)), - correlationOfFavWtIfFav = "%.3f" - .format(correlationOfFavWtIfFav.getOrElse(0.0)), - relativePrecisionUsingFavWt = - "%.2g".format(in.relativePrecisionUsingFavWtIfFav.getOrElse(0.0)), - averagePrecisionOfWholeGraphUsingFavWt = - "%.2g".format(in.averagePrecisionOfWholeGraphUsingFavWtIfFav.getOrElse(0.0)), - interestedInSize = in.interestedInSize.getOrElse(0).toString, - knownForSize = in.knownForSize.getOrElse(0).toString - ) - } - - case class ClusterResultsSummary( - numClustersWithZeroInterestedIn: Int, - numClustersWithZeroFollowWtRecall: Int, - numClustersWithZeroFavWtRecall: Int, - numClustersWithZeroFollowAndFavWtRecall: Int, - interestedInSizeDist: Distribution, - outgoingFollowWtRecallDist: Distribution, - outgoingFavWtRecallDist: Distribution, - incomingFollowWtRecallDist: Distribution, - incomingFavWtRecallDist: Distribution, - followCorrelationDist: Distribution, - favCorrelationDist: Distribution, - relativePrecisionDist: Distribution) - - def getClusterResultsSummary( - perClusterResults: TypedPipe[BipartiteClusterQuality] - ): Execution[Option[ClusterResultsSummary]] = { - perClusterResults - .map { clusterQuality => - val printableQuality = printableBipartiteQuality(clusterQuality) - val isFollowRecallZero = - if (!clusterQuality.favWtSumOfInClusterFollowEdges - .exists(_ > 0)) 1 - else 0 - val isFavRecallZero = - if (!clusterQuality.favWtSumOfInClusterFavEdges.exists(_ > 0)) 1 - else 0 - ( - if (!clusterQuality.interestedInSize.exists(_ > 0)) 1 else 0, - isFollowRecallZero, - isFavRecallZero, - isFavRecallZero * isFollowRecallZero, - clusterQuality.interestedInSize.toList.map(_.toDouble), - List(printableQuality.outgoingFollowWeightedRecall.toDouble), - List(printableQuality.outgoingFavWeightedRecall.toDouble), - List(printableQuality.incomingFollowWeightedRecall.toDouble), - List(printableQuality.incomingFavWeightedRecall.toDouble), - 
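- // The tuple being built here sums element-wise under the tuple monoid: the Int
- // flags count clusters, and each single-element List concatenates across
- // clusters (one value per cluster) before becoming a Distribution below.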
List(printableQuality.correlationOfFavWtIfFollow.toDouble), - List(printableQuality.correlationOfFavWtIfFav.toDouble), - List(printableQuality.relativePrecisionUsingFavWt.toDouble) - ) - } - .sum - .toOptionExecution - .map { opt => - opt.map { - case ( - zeroInterestedIn, - zeroFollowRecall, - zeroFavRecall, - zeroFollowAndFavRecall, - interestedInSizeList, - outgoingFollowWtRecallList, - outgoingFavWtRecallList, - incomingFollowWtRecallList, - incomingFavWtRecallList, - followCorrelationList, - favCorrelationList, - relativePrecisionList - ) => - ClusterResultsSummary( - numClustersWithZeroInterestedIn = zeroInterestedIn, - numClustersWithZeroFollowWtRecall = zeroFollowRecall, - numClustersWithZeroFavWtRecall = zeroFavRecall, - numClustersWithZeroFollowAndFavWtRecall = zeroFollowAndFavRecall, - interestedInSizeDist = Util.distributionFromArray(interestedInSizeList.toArray), - outgoingFollowWtRecallDist = Util - .distributionFromArray(outgoingFollowWtRecallList.toArray), - outgoingFavWtRecallDist = Util.distributionFromArray(outgoingFavWtRecallList.toArray), - incomingFollowWtRecallDist = Util - .distributionFromArray(incomingFollowWtRecallList.toArray), - incomingFavWtRecallDist = Util.distributionFromArray(incomingFavWtRecallList.toArray), - followCorrelationDist = Util.distributionFromArray(followCorrelationList.toArray), - favCorrelationDist = Util.distributionFromArray(favCorrelationList.toArray), - relativePrecisionDist = Util.distributionFromArray(relativePrecisionList.toArray) - ) - } - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/ClusterDetailsJob.docx b/src/scala/com/twitter/simclusters_v2/scalding/ClusterDetailsJob.docx new file mode 100644 index 000000000..ffa9c3d9d Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/ClusterDetailsJob.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/ClusterDetailsJob.scala b/src/scala/com/twitter/simclusters_v2/scalding/ClusterDetailsJob.scala deleted file mode 100644 index f7aa381c4..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/ClusterDetailsJob.scala +++ /dev/null @@ -1,794 +0,0 @@ -package com.twitter.simclusters_v2.scalding - -import com.twitter.algebird.OptionMonoid -import com.twitter.algebird.QTree -import com.twitter.algebird.QTreeSemigroup -import com.twitter.algebird.Semigroup -import com.twitter.dal.client.dataset.KeyValDALDataset -import com.twitter.dal.client.dataset.SnapshotDALDataset -import com.twitter.hermit.candidate.thriftscala.Candidates -import com.twitter.pluck.source.cassowary.FollowingsCosineSimilaritiesManhattanSource -import com.twitter.pluck.source.cassowary.SimsCandidatesSource -import com.twitter.scalding._ -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.scalding_internal.dalv2.DALWrite._ -import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation -import com.twitter.scalding_internal.dalv2.remote_access.ProcAtla -import com.twitter.scalding_internal.job.TwitterExecutionApp -import com.twitter.scalding_internal.job.analytics_batch._ -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.common.ModelVersions -import com.twitter.simclusters_v2.hdfs_sources._ -import com.twitter.simclusters_v2.scalding.common.Util -import com.twitter.simclusters_v2.scalding.embedding.common.ExternalDataSources -import com.twitter.simclusters_v2.thriftscala._ -import com.twitter.usersource.snapshot.flat.UsersourceFlatScalaDataset -import 
com.twitter.usersource.snapshot.flat.thriftscala.FlatUser - -object ClusterDetailsJob { - case class Scores(followScore: Double, favScore: Double, logFavScore: Double) - - case class IntermediateDetails( - numUsersWithAnyNonZeroScore: Int, - numUsersWithNonZeroFollowScore: Int, - numUsersWithNonZeroFavScore: Int, - favQTree: Option[QTree[Double]], - followQTree: Option[QTree[Double]], - logFavQTree: Option[QTree[Double]], - sumOfSquares: Scores, - sum: Scores, - min: Scores, - max: Scores) - - case class InfoFromUserSource( - fractionMarkedNSFWUser: Double, - languageToFractionDeviceLanguage: Map[String, Double], - countryCodeToFractionKnownForWithCountryCode: Map[String, Double], - languageToFractionInferredLanguage: Map[String, Double]) - - def positiveMin(a: Double, b: Double) = { - if (math.min(a, b) == 0.0) math.max(a, b) else math.min(a, b) - } - - case class ClusterDetailsSemigroup(implicit qtreeSemigroup: Semigroup[QTree[Double]]) - extends Semigroup[IntermediateDetails] { - val optionMonoid: OptionMonoid[QTree[Double]] = new OptionMonoid[QTree[Double]]() - override def plus( - left: IntermediateDetails, - right: IntermediateDetails - ): IntermediateDetails = { - IntermediateDetails( - left.numUsersWithAnyNonZeroScore + right.numUsersWithAnyNonZeroScore, - left.numUsersWithNonZeroFollowScore + right.numUsersWithNonZeroFollowScore, - left.numUsersWithNonZeroFavScore + right.numUsersWithNonZeroFavScore, - optionMonoid.plus(left.favQTree, right.favQTree), - optionMonoid.plus(left.followQTree, right.followQTree), - optionMonoid.plus(left.logFavQTree, right.logFavQTree), - Scores( - left.sumOfSquares.followScore + right.sumOfSquares.followScore, - left.sumOfSquares.favScore + right.sumOfSquares.favScore, - left.sumOfSquares.logFavScore + right.sumOfSquares.logFavScore - ), - Scores( - left.sum.followScore + right.sum.followScore, - left.sum.favScore + right.sum.favScore, - left.sum.logFavScore + right.sum.logFavScore - ), - Scores( - positiveMin(left.min.followScore, right.min.followScore), - positiveMin(left.min.favScore, right.min.favScore), - positiveMin(left.min.logFavScore, right.min.logFavScore) - ), - Scores( - math.max(left.max.followScore, right.max.followScore), - math.max(left.max.favScore, right.max.favScore), - math.max(left.max.logFavScore, right.max.logFavScore) - ) - ) - } - } - - def intermediateDetailsPipe( - input: TypedPipe[(Long, ClustersUserIsInterestedIn)], - qtreeSemigroupKParameter: Int - ): TypedPipe[(Int, IntermediateDetails)] = { - implicit val qtSg: Semigroup[QTree[Double]] = - new QTreeSemigroup[Double](qtreeSemigroupKParameter) - implicit val cdSg: Semigroup[IntermediateDetails] = ClusterDetailsSemigroup() - input - .flatMap { - case (userId, clusterScoresStruct) => - val clusterScoresArray = clusterScoresStruct.clusterIdToScores.toArray - clusterScoresArray.map { - case (clusterId, scoresStruct) => - val followScore = scoresStruct.followScore.getOrElse(0.0) - val favScore = scoresStruct.favScore.getOrElse(0.0) - val logFavScore = scoresStruct.logFavScore.getOrElse(0.0) - ( - clusterId, - IntermediateDetails( - numUsersWithAnyNonZeroScore = 1, - numUsersWithNonZeroFollowScore = if (followScore > 0) 1 else 0, - numUsersWithNonZeroFavScore = if (favScore > 0) 1 else 0, - favQTree = if (favScore > 0) Some(QTree(favScore)) else None, - followQTree = if (followScore > 0) Some(QTree(followScore)) else None, - logFavQTree = if (logFavScore > 0) Some(QTree(logFavScore)) else None, - sumOfSquares = Scores( - followScore * followScore, - favScore * favScore, - 
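- // Sum and sum-of-squares are accumulated per cluster so getDistributionDetails
- // can later derive the stddev as sqrt(sumOfSquares / n - mean * mean).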
logFavScore * logFavScore), - sum = Scores(followScore, favScore, logFavScore), - min = Scores(followScore, favScore, logFavScore), - max = Scores(followScore, favScore, logFavScore) - ) - ) - } - } - .sumByKey - // Uncomment for adhoc job - //.withReducers(100) - .toTypedPipe - } - - private def safeGetDoubleOpt(x: Option[Double]): Double = { - x.map { y => if (y.isNaN) 0 else y }.getOrElse(0) - } - - private def getSimilaritiesForAllPairs( - input: TypedPipe[(Long, ClustersUserIsInterestedIn)] - )( - implicit uniqueID: UniqueID - ): TypedPipe[((Int, Int), Scores)] = { - val allClusterPairsBeforeSumByKey = Stat("all_cluster_pairs_before_sum_by_key") - val clusterPairsWithin10Ratio = Stat("cluster_pairs_within_10_ratio") - val clusterPairsBeforeTopK = Stat("cluster_pairs_before_thresholding") - - input - .flatMap { - case (userId, clusterScoresStruct) => - val clusterScoresArray = clusterScoresStruct.clusterIdToScores.toArray - (0 until clusterScoresArray.length).flatMap { i => - (0 until clusterScoresArray.length).map { j => - val (clusterI, scoresI) = clusterScoresArray(i) - val (clusterJ, scoresJ) = clusterScoresArray(j) - val ratioOfSizes = - scoresI.numUsersInterestedInThisClusterUpperBound.getOrElse(1).toDouble / - scoresJ.numUsersInterestedInThisClusterUpperBound.getOrElse(1).toDouble - allClusterPairsBeforeSumByKey.inc() - if (ratioOfSizes > 0.1 && ratioOfSizes < 10) { - clusterPairsWithin10Ratio.inc() - } - val followI = safeGetDoubleOpt(scoresI.followScoreClusterNormalizedOnly) - val followJ = safeGetDoubleOpt(scoresJ.followScoreClusterNormalizedOnly) - val follow = followI * followJ - val favI = safeGetDoubleOpt(scoresI.favScoreClusterNormalizedOnly) - val favJ = safeGetDoubleOpt(scoresJ.favScoreClusterNormalizedOnly) - val fav = favI * favJ - val logFavI = safeGetDoubleOpt(scoresI.logFavScoreClusterNormalizedOnly) - val logFavJ = safeGetDoubleOpt(scoresJ.logFavScoreClusterNormalizedOnly) - val logFav = logFavI * logFavJ - ((clusterI, clusterJ), (follow, fav, logFav)) - } - } - } - .sumByKey - // Uncomment for adhoc job - //.withReducers(600) - .map { - case (key, (follow, fav, logFav)) => - clusterPairsBeforeTopK.inc() - (key, Scores(follow, fav, logFav)) - } - } - - private def keepTopNeighbors( - allPairs: TypedPipe[((Int, Int), Scores)], - cosineThreshold: Double - )( - implicit uniqueID: UniqueID - ): TypedPipe[(Int, List[ClusterNeighbor])] = { - val clusterPairsMoreThanThreshold = Stat("cluster_pairs_cosine_gt_" + cosineThreshold) - val clusterPairsAfterTopK = Stat("cluster_pairs_after_topk") - val clustersWithFewNeighbors = Stat(s"clusters_with_fewer_than_100_neighbors") - val clustersWithManyNeighbors = Stat(s"clusters_with_more_than_100_neighbors") - - allPairs - .flatMap { - case ((cI, cJ), Scores(followScore, favScore, logFavScore)) => - if (followScore > cosineThreshold || logFavScore > cosineThreshold || favScore > cosineThreshold) { - clusterPairsMoreThanThreshold.inc() - Some((cI, ClusterNeighbor(cJ, Some(followScore), Some(favScore), Some(logFavScore)))) - } else None - } - .group - .toList - // Uncomment for adhoc job - //.withReducers(40) - .map { - case (key, seq) => - val finalSize = seq.size - clusterPairsAfterTopK.incBy(finalSize) - if (finalSize < 100) { - clustersWithFewNeighbors.inc() - } else { - clustersWithManyNeighbors.inc() - } - ( - key, - seq.sortBy { - case cn: ClusterNeighbor => - -(cn.followCosineSimilarity.getOrElse(0.0) + cn.logFavCosineSimilarity.getOrElse( - 0.0)) / 2 - }) - } - } - - def getTopSimilarClustersWithCosine( - input: 
TypedPipe[(Long, ClustersUserIsInterestedIn)], - cosineThreshold: Double - )( - implicit uniqueID: UniqueID - ): TypedPipe[(Int, List[ClusterNeighbor])] = { - keepTopNeighbors(getSimilaritiesForAllPairs(input), cosineThreshold) - } - - def getDistributionDetails( - qtree: QTree[Double], - sum: Double, - sumOfSquares: Double, - min: Double, - max: Double, - fullSize: Int - ): DistributionDetails = { - val mean = sum / fullSize - // note that the below is the naive calculation, and not the sample standard dev formula - // that divides by n-1. I don't think it makes a difference at our scale whether we use n or n-1 - // and I'd rather use the simpler one. - val stdDev = math.sqrt(sumOfSquares / fullSize - mean * mean) - - def getQB(percentile: Double): QuantileBounds = { - val (lb, ub) = qtree.quantileBounds(percentile) - QuantileBounds(lb, ub) - } - - DistributionDetails( - mean = mean, - standardDeviation = Some(stdDev), - min = Some(min), - p25 = Some(getQB(0.25)), - p50 = Some(getQB(0.5)), - p75 = Some(getQB(0.75)), - p95 = Some(getQB(0.95)), - max = Some(max) - ) - } - - def keepCorrectModel( - input: TypedPipe[(Long, ClustersUserIsInterestedIn)], - modelVersionToKeep: String - )( - implicit uniqId: UniqueID - ): TypedPipe[(Long, ClustersUserIsInterestedIn)] = { - val allRecords = Stat("all_input_records") - val withCorrectVersion = Stat("with_correct_version") - input.filter { - case (_, clusterScoresStruct) => - // allRecords.inc() - val result = clusterScoresStruct.knownForModelVersion == modelVersionToKeep - // if (result) withCorrectVersion.inc() - result - } - } - - def getInfoFromUserSource( - knownFor: TypedPipe[(Int, List[(Long, Float)])], - usersource: TypedPipe[FlatUser], - inferredLanguages: TypedPipe[(Long, Seq[(String, Double)])] - )( - implicit uniqId: UniqueID - ): TypedPipe[(Int, InfoFromUserSource)] = { - val knownForUsers = knownFor.flatMap { - case (clusterId, userScoreList) => - userScoreList.map { - case (userId, _) => - (userId, clusterId) - } - } - - usersource - .collect { - case fuser: FlatUser if fuser.id.isDefined => - ( - fuser.id.get, - ( - fuser.accountCountryCode.getOrElse(""), - fuser.language.getOrElse(""), - fuser.nsfwUser.getOrElse(false) - )) - } - .join(knownForUsers) - .leftJoin(inferredLanguages) - .map { - case (_, (((countryCode, language, nsfw), clusterId), inferredLangsOpt)) => - val nsfwInt = if (nsfw) 1 else 0 - ( - clusterId, - ( - 1, - nsfwInt, - Map(language -> 1), - Map(countryCode -> 1), - inferredLangsOpt.getOrElse(Seq(("", 1.0))).toMap - ) - ) - } - .sumByKey - .mapValues { - case ( - denominator, - nsfwNumerator, - languageNumeratorsMap, - countryNumeratorsMap, - inferredLangsNumeratorsMap) => - InfoFromUserSource( - nsfwNumerator * 1.0 / denominator, - languageNumeratorsMap.mapValues { x => x * 1.0 / denominator }, - countryNumeratorsMap.mapValues { x => x * 1.0 / denominator }, - inferredLangsNumeratorsMap.mapValues { x => x * 1.0 / denominator } - ) - } - } - - /** - * Run the cluster details job and return the details for each cluster - * @param input interestedIn data - * @param qtreeSemigroupKParameter parameter for calculating percentiles using qtree monoid (set to a small number, usually < 7) - * @param modelVersionToKeep which modelVersion to use from interestedIn dataset - * @param knownFor clusterId -> users known for this cluster and their scores - * @param knownForTranspose userId -> clusters this user is known for and their scores - * @param usersource -> user source - * @param simsGraph -> sims graph in the form of 
userId -> adjacency list - * @param cosineThreshold -> cosine threshold to include a cluster in the list of similar clusters for a given cluster - * @param uniqId - * @return pipe with (modelVersion, clusterId) as the key and ClusterDetails struct as the value. - */ - def run( - input: TypedPipe[(Long, ClustersUserIsInterestedIn)], - qtreeSemigroupKParameter: Int, - modelVersionToKeep: String, - knownFor: TypedPipe[(Int, List[(Long, Float)])], - knownForTranspose: TypedPipe[(Long, Array[(Int, Float)])], - usersource: Option[TypedPipe[FlatUser]], - inferredLanguageSource: Option[TypedPipe[(Long, Seq[(String, Double)])]], - simsGraph: Option[TypedPipe[(Long, Map[Long, Float])]], - cosineThreshold: Double - )( - implicit uniqId: UniqueID - ): Execution[TypedPipe[((String, Int), ClusterDetails)]] = { - val topSimilarClusters = getTopSimilarClustersWithCosine(input, cosineThreshold) - val infoFromUserSource: TypedPipe[(Int, InfoFromUserSource)] = (for { - us <- usersource - inferredLanguages <- inferredLanguageSource - } yield getInfoFromUserSource(knownFor, us, inferredLanguages)).getOrElse(TypedPipe.empty) - - val clusterEvaluationExec = simsGraph match { - case Some(sg) => - ClusterEvaluation.clusterLevelEvaluation(sg, knownForTranspose, "eval") - case None => - val dummyPipe: TypedPipe[(Int, (Int, ClusterQuality))] = TypedPipe.empty - Execution.from(dummyPipe) - } - - clusterEvaluationExec - .map { clusterIdToSizesAndQualities => - val clusterQualities: TypedPipe[(Int, ClusterQuality)] = - clusterIdToSizesAndQualities.mapValues(_._2) - intermediateDetailsPipe( - keepCorrectModel(input, modelVersionToKeep), - qtreeSemigroupKParameter) - .leftJoin(topSimilarClusters) - .leftJoin(infoFromUserSource) - .leftJoin(clusterQualities) - .join(knownFor) - .map { - case ( - clusterId, - ( - ( - ((intermediateDetails, topSimilarNeighborsOpt), userSourceInfoOpt), - qualityOpt), - knownForUsers) - ) => - val knownForSorted = knownForUsers.sortBy(-_._2).map { - case (userId, score) => - UserWithScore(userId, score) - } - (modelVersionToKeep, clusterId) -> - ClusterDetails( - numUsersWithAnyNonZeroScore = intermediateDetails.numUsersWithAnyNonZeroScore, - numUsersWithNonZeroFavScore = intermediateDetails.numUsersWithNonZeroFavScore, - numUsersWithNonZeroFollowScore = - intermediateDetails.numUsersWithNonZeroFollowScore, - favScoreDistributionDetails = intermediateDetails.favQTree.map { qt => - getDistributionDetails( - qtree = qt, - sum = intermediateDetails.sum.favScore, - sumOfSquares = intermediateDetails.sumOfSquares.favScore, - min = intermediateDetails.min.favScore, - max = intermediateDetails.max.favScore, - fullSize = intermediateDetails.numUsersWithNonZeroFavScore - ) - }, - followScoreDistributionDetails = intermediateDetails.followQTree.map { qt => - getDistributionDetails( - qtree = qt, - sum = intermediateDetails.sum.followScore, - sumOfSquares = intermediateDetails.sumOfSquares.followScore, - min = intermediateDetails.min.followScore, - max = intermediateDetails.max.followScore, - fullSize = intermediateDetails.numUsersWithNonZeroFollowScore - ) - }, - logFavScoreDistributionDetails = intermediateDetails.logFavQTree.map { qt => - getDistributionDetails( - qtree = qt, - sum = intermediateDetails.sum.logFavScore, - sumOfSquares = intermediateDetails.sumOfSquares.logFavScore, - min = intermediateDetails.min.logFavScore, - max = intermediateDetails.max.logFavScore, - // note: user has non-zero fav score iff a user has non-zero log-fav score - fullSize = 
intermediateDetails.numUsersWithNonZeroFavScore - ) - }, - knownForUsersAndScores = Some(knownForSorted), - neighborClusters = topSimilarNeighborsOpt, - fractionKnownForMarkedNSFWUser = userSourceInfoOpt.map(_.fractionMarkedNSFWUser), - languageToFractionDeviceLanguage = - userSourceInfoOpt.map(_.languageToFractionDeviceLanguage), - countryCodeToFractionKnownForWithCountryCode = - userSourceInfoOpt.map(_.countryCodeToFractionKnownForWithCountryCode), - qualityMeasuredOnSimsGraph = qualityOpt, - languageToFractionInferredLanguage = - userSourceInfoOpt.map(_.languageToFractionInferredLanguage), - ) - } - } - } - - def getTruncatedSims( - sims: TypedPipe[Candidates], - maxNeighbors: Int - ): TypedPipe[(Long, Map[Long, Float])] = { - sims.map { cands => - ( - cands.userId, - // These candidates are already sorted, but leaving it in just in case the behavior changes upstream - cands.candidates - .map { c => (c.userId, c.score.toFloat) }.sortBy(-_._2).take(maxNeighbors).toMap - ) - } - } -} - -/** - scalding remote run --main-class com.twitter.simclusters_v2.scalding.ClusterDetailsAdhoc \ - --target src/scala/com/twitter/simclusters_v2/scalding:cluster_details-adhoc \ - --hadoop-properties "scalding.with.reducers.set.explicitly=true mapreduce.job.reduces=4000" \ - --user recos-platform -- \ - --date 2020-06-25 \ - --dateForUserSource 2020-06-25 \ - --includeUserSource \ - --outputDir /user/recos-platform/adhoc/your_ldap/cluster_details_inferred_lang - */ -object ClusterDetailsAdhoc extends TwitterExecutionApp { - implicit val tz: java.util.TimeZone = DateOps.UTC - implicit val dp = DateParser.default - - def job: Execution[Unit] = - Execution.getConfigMode.flatMap { - case (config, mode) => - Execution.withId { implicit uniqueId => - val args = config.getArgs - val date = DateRange.parse(args("dateForUserSource")) - val (knownFor, knownForTranspose) = - args - .optional("knownForDir").map { location => - ( - KnownForSources.transpose(KnownForSources.readKnownFor(location)), - KnownForSources.readKnownFor(location) - ) - }.getOrElse( - ( - KnownForSources.clusterToKnownFor_20M_145K_updated, - KnownForSources.knownFor_20M_145K_updated - ) - ) - - val interestedIn = args - .optional("inputDir").map { interestedInInputDir => - TypedPipe.from(AdhocKeyValSources.interestedInSource(interestedInInputDir)) - }.getOrElse( - DAL - .readMostRecentSnapshotNoOlderThan( - SimclustersV2InterestedIn20M145KUpdatedScalaDataset, - Days(14)) - .withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .toTypedPipe - .map { - case KeyVal(userId, clustersUserIsInterestedIn) => - (userId, clustersUserIsInterestedIn) - } - ) - - val userSourceOpt = if (args.boolean("includeUserSource")) { - Some(DAL.readMostRecentSnapshot(UsersourceFlatScalaDataset, date).toTypedPipe) - } else None - - val inferredLanguagesOpt = if (args.boolean("includeUserSource")) { - Some(ExternalDataSources.inferredUserProducedLanguageSource) - } else None - - val simsGraphOpt = args.optional("simsForEvalInputDir").map { sgDir => - ClusterDetailsJob.getTruncatedSims( - TypedPipe.from(WTFCandidatesSource(sgDir)), - args.int("maxSimsNeighborsForEval", 20) - ) - } - - Util.printCounters( - ClusterDetailsJob - .run( - interestedIn, - args.int("qtreeSemigroupKParameter", 3), - args.getOrElse("modelVersion", "20M_145K_updated"), - knownFor, - knownForTranspose, - userSourceOpt, - inferredLanguagesOpt, - simsGraphOpt, - cosineThreshold = args.double("cosineThreshold", 0.01) - ).flatMap( - 
_.writeExecution(AdhocKeyValSources.clusterDetailsSource(args("outputDir")))) - ) - } - } -} - -trait ClusterDetailsBatchTrait extends TwitterScheduledExecutionApp { - implicit val tz = DateOps.UTC - implicit val parser = DateParser.default - - def firstTime: String - def batchIncrement: Duration - def manhattanOutputPath: String - def clusterDetailsLiteOutputPath: String - def modelVersion: String - def knownForDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]] - def interestedInDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsInterestedIn]] - def outputDataset: KeyValDALDataset[KeyVal[(String, Int), ClusterDetails]] - def clusterDetailsLiteOutputDataset: SnapshotDALDataset[ClusterDetailsLite] - - private lazy val execArgs = AnalyticsBatchExecutionArgs( - batchDesc = BatchDescription(this.getClass.getName.replace("$", "")), - firstTime = BatchFirstTime(RichDate(firstTime)), - lastTime = None, - batchIncrement = BatchIncrement(batchIncrement) - ) - - override def scheduledJob: Execution[Unit] = AnalyticsBatchExecution(execArgs) { - implicit dateRange => - Execution.withId { implicit uniqueId => - Execution.withArgs { args => - val qtreeSemigroupKParameter = args.int("qtreeSemigroupKParameter", 5) - val maxSimsNeighborsForEval = args.int("maxSimsNeighborsForEval", 20) - val knownForTranspose = - KnownForSources.fromKeyVal( - DAL.readMostRecentSnapshot(knownForDataset, dateRange.extend(Days(7))).toTypedPipe, - modelVersion) - val knownFor = KnownForSources.transpose(knownForTranspose) - val cosineThreshold = args.double("cosineThreshold", 0.01) - val interestedIn = - DAL - .readMostRecentSnapshot(interestedInDataset, dateRange.extend(Days(7))) - .toTypedPipe - .map { - case KeyVal(userId, clustersUserIsInterestedIn) => - (userId, clustersUserIsInterestedIn) - } - val sims = if (modelVersion == ModelVersions.Model20M145K2020) { - // The model version 20m_145k_2020 uses approximate_cosine_follow as the input sims graph - // to cluster users. 
The same graph is used to evaluate the clusters - TypedPipe - .from(FollowingsCosineSimilaritiesManhattanSource()) - .map(_._2) - } else { - TypedPipe.from( - SimsCandidatesSource()( - dateRange = dateRange, - suffixPath = "/classified_candidates_rollup" - )) - } - val resultExec = ClusterDetailsJob - .run( - interestedIn, - qtreeSemigroupKParameter, - modelVersion, - knownFor, - knownForTranspose, - Some(DAL.readMostRecentSnapshot(UsersourceFlatScalaDataset, dateRange).toTypedPipe), - Some(ExternalDataSources.inferredUserProducedLanguageSource), - Some( - ClusterDetailsJob.getTruncatedSims(sims, maxNeighbors = maxSimsNeighborsForEval)), - cosineThreshold - ).flatMap { resultUnmapped => - val clusterDetailsExec = resultUnmapped - .map { - case (clusterKey, details) => - KeyVal(clusterKey, details) - }.writeDALVersionedKeyValExecution( - outputDataset, - D.Suffix(manhattanOutputPath) - ) - - val clusterDetailsLiteExec = - resultUnmapped - .map { - case ((_, clusterId), details) - if modelVersion == ModelVersions.Model20M145KDec11 => - ClusterDetailsLite( - FullClusterId(ModelVersion.Model20m145kDec11, clusterId), - details.numUsersWithAnyNonZeroScore, - details.numUsersWithNonZeroFollowScore, - details.numUsersWithNonZeroFavScore, - details.knownForUsersAndScores.getOrElse(Nil) - ) - case ((_, clusterId), details) - if modelVersion == ModelVersions.Model20M145KUpdated => - ClusterDetailsLite( - FullClusterId(ModelVersion.Model20m145kUpdated, clusterId), - details.numUsersWithAnyNonZeroScore, - details.numUsersWithNonZeroFollowScore, - details.numUsersWithNonZeroFavScore, - details.knownForUsersAndScores.getOrElse(Nil) - ) - case ((_, clusterId), details) - if modelVersion == ModelVersions.Model20M145K2020 => - ClusterDetailsLite( - FullClusterId(ModelVersion.Model20m145k2020, clusterId), - details.numUsersWithAnyNonZeroScore, - details.numUsersWithNonZeroFollowScore, - details.numUsersWithNonZeroFavScore, - details.knownForUsersAndScores.getOrElse(Nil) - ) - }.writeDALSnapshotExecution( - clusterDetailsLiteOutputDataset, - D.Daily, - D.Suffix(clusterDetailsLiteOutputPath), - D.EBLzo(), - dateRange.end) - - Execution.zip(clusterDetailsExec, clusterDetailsLiteExec) - } - - Util.printCounters(resultExec) - } - } - } - -} - -object ClusterDetailsBatch extends ClusterDetailsBatchTrait { - override val firstTime: String = "2018-07-28" - override val batchIncrement: Duration = Days(7) - - override val manhattanOutputPath: String = - "/user/cassowary/manhattan_sequence_files/simclusters_v2_cluster_details" - - override val clusterDetailsLiteOutputPath: String = - "/user/cassowary/processed/simclusters_v2_cluster_details_lite" - - override val modelVersion: String = ModelVersions.Model20M145KDec11 - override val knownForDataset = SimclustersV2KnownFor20M145KDec11ScalaDataset - override val interestedInDataset = SimclustersV2InterestedInScalaDataset - override val outputDataset = SimclustersV2ClusterDetailsScalaDataset - override val clusterDetailsLiteOutputDataset = - SimclustersV2ClusterDetailsLiteScalaDataset -} - -object ClusterDetails20M145KUpdated extends ClusterDetailsBatchTrait { - override val firstTime: String = "2019-06-16" - override val batchIncrement: Duration = Days(7) - - override val manhattanOutputPath: String = - "/user/cassowary/manhattan_sequence_files/simclusters_v2_cluster_details_20m_145k_updated" - - override val clusterDetailsLiteOutputPath: String = - "/user/cassowary/processed/simclusters_v2_cluster_details_lite_20m_145k_updated" - - override val modelVersion: String = 
ModelVersions.Model20M145KUpdated - override val knownForDataset = SimclustersV2KnownFor20M145KUpdatedScalaDataset - override val interestedInDataset = SimclustersV2InterestedIn20M145KUpdatedScalaDataset - override val outputDataset = SimclustersV2ClusterDetails20M145KUpdatedScalaDataset - override val clusterDetailsLiteOutputDataset = - SimclustersV2ClusterDetailsLite20M145KUpdatedScalaDataset -} - -/** - * capesospy-v2 update --build_locally --start_cron cluster_details_20m_145k_2020 \ - * src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml - */ -object ClusterDetails20M145K2020 extends ClusterDetailsBatchTrait { - override val firstTime: String = "2020-10-15" - override val batchIncrement: Duration = Days(7) - - override val manhattanOutputPath: String = - "/user/cassowary/manhattan_sequence_files/simclusters_v2_cluster_details_20m_145k_2020" - - override val clusterDetailsLiteOutputPath: String = - "/user/cassowary/processed/simclusters_v2_cluster_details_lite_20m_145k_2020" - - override val modelVersion: String = ModelVersions.Model20M145K2020 - override val knownForDataset = SimclustersV2KnownFor20M145K2020ScalaDataset - override val interestedInDataset = SimclustersV2InterestedIn20M145K2020ScalaDataset - override val outputDataset = SimclustersV2ClusterDetails20M145K2020ScalaDataset - override val clusterDetailsLiteOutputDataset = - SimclustersV2ClusterDetailsLite20M145K2020ScalaDataset -} - -/** -scalding remote run --main-class com.twitter.simclusters_v2.scalding.DumpClusterDetailsAdhoc \ - --target src/scala/com/twitter/simclusters_v2/scalding:cluster_details-dump \ - --user recos-platform -- \ - --date 2020-06-25 \ - --clusterIds 5542 129677 48645 \ - --inputDir /user/recos-platform/adhoc/your_ldap/cluster_details_inferred_lang - */ -object DumpClusterDetailsAdhoc extends TwitterExecutionApp { - def job: Execution[Unit] = - Execution.getConfigMode.flatMap { - case (config, mode) => - Execution.withId { implicit uniqueId => - val args = config.getArgs - val clusters = args.list("clusterIds").map(_.toInt).toSet //(1 to 2500).toSet // - TypedPipe - .from(AdhocKeyValSources.clusterDetailsSource(args("inputDir"))) - .filter { case ((modelVersion, clusterId), details) => clusters.contains(clusterId) } - .toIterableExecution - .map { iter => - iter.foreach { x => println(Util.prettyJsonMapper.writeValueAsString(x)) } - } - } - } -} - -/** - * ./bazel bundle src/scala/com/twitter/simclusters_v2/scalding:cluster_details && \ - * oscar hdfs --user cassowary --host hadoopnest2.atla.twitter.com --bundle cluster_details \ - * --tool com.twitter.simclusters_v2.scalding.DumpClusterSimilaritiesAdhoc --screen --screen-detached \ - * --tee your_ldap/dumpClusterSimilarities_20200103 -- \ - * --inputDir /user/cassowary/manhattan_sequence_files/simclusters_v2_cluster_details_20m_145k_updated/ \ - * --outputDir adhoc/your_ldap - */ -object DumpClusterSimilaritiesAdhoc extends TwitterExecutionApp { - def job: Execution[Unit] = - Execution.getConfigMode.flatMap { - case (config, mode) => - Execution.withId { implicit uniqueId => - val args = config.getArgs - TypedPipe - .from(AdhocKeyValSources.clusterDetailsSource(args("inputDir"))) - .flatMap { - case ((_, clusterId), details) => - details.neighborClusters.getOrElse(Nil).map { neighbor => - val compositeScore = (neighbor.followCosineSimilarity - .getOrElse(0.0) + neighbor.favCosineSimilarity.getOrElse(0.0)) / 2 - ( - clusterId, - neighbor.clusterId, - "%.4f".format(compositeScore) - ) - } - }.writeExecution(TypedTsv(args("outputDir"))) 
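- // For example (hypothetical values): clusters 5542 and 129677 with follow cosine 0.8 and - // fav cosine 0.6 produce the tab-separated row "5542\t129677\t0.7000".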
- } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/ClusterEvaluation.docx b/src/scala/com/twitter/simclusters_v2/scalding/ClusterEvaluation.docx new file mode 100644 index 000000000..f4c3d7739 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/ClusterEvaluation.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/ClusterEvaluation.scala b/src/scala/com/twitter/simclusters_v2/scalding/ClusterEvaluation.scala deleted file mode 100644 index 7133382eb..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/ClusterEvaluation.scala +++ /dev/null @@ -1,607 +0,0 @@ -package com.twitter.simclusters_v2.scalding - -import com.twitter.algebird.Monoid -import com.twitter.algebird.mutable.PriorityQueueMonoid -import com.twitter.dal.client.dataset.KeyValDALDataset -import com.twitter.pluck.source.cassowary.FollowingsCosineSimilaritiesManhattanSource -import com.twitter.scalding._ -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.scalding_internal.job.TwitterExecutionApp -import com.twitter.scalding_internal.job.analytics_batch._ -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.common.ModelVersions -import com.twitter.simclusters_v2.hdfs_sources._ -import com.twitter.simclusters_v2.scalding.common.TypedRichPipe._ -import com.twitter.simclusters_v2.scalding.common.Util -import com.twitter.simclusters_v2.scalding.common.Util.Distribution -import com.twitter.simclusters_v2.thriftscala.ClusterQuality -import com.twitter.simclusters_v2.thriftscala.ClustersUserIsKnownFor -import com.twitter.usersource.snapshot.flat.UsersourceFlatScalaDataset -import java.util.PriorityQueue -import scala.collection.JavaConverters._ - -object ClusterEvaluation { - - val samplerMonoid: PriorityQueueMonoid[((Long, Long), (Double, Double))] = - Util.reservoirSamplerMonoidForPairs[(Long, Long), (Double, Double)](5000)(Util.edgeOrdering) - - case class ClusterResults( - numEdgesInsideCluster: Int, - wtOfEdgesInsideCluster: Double, - numEdgesOutsideCluster: Int, - wtOfEdgesOutsideCluster: Double, - originalWtAndProductOfNodeScoresSample: PriorityQueue[((Long, Long), (Double, Double))]) { - def clusterQuality(clusterSize: Int, averagePrecisionWholeGraph: Double): ClusterQuality = { - val unweightedRecallDenominator = numEdgesInsideCluster + numEdgesOutsideCluster - val unweightedRecall = if (unweightedRecallDenominator > 0) { - numEdgesInsideCluster.toDouble / unweightedRecallDenominator.toDouble - } else 0.0 - - val weightedRecallDenominator = wtOfEdgesInsideCluster + wtOfEdgesOutsideCluster - val weightedRecall = if (weightedRecallDenominator > 0) { - wtOfEdgesInsideCluster / weightedRecallDenominator - } else 0.0 - - val precision = if (clusterSize > 1) { - Some(wtOfEdgesInsideCluster / (clusterSize * (clusterSize - 1))) - } else Some(0.0) - - val relativePrecision = if (averagePrecisionWholeGraph > 0) { - precision.flatMap { p => Some(p / averagePrecisionWholeGraph) } - } else Some(0.0) - - ClusterQuality( - unweightedRecall = Some(unweightedRecall), - weightedRecall = Some(weightedRecall), - unweightedRecallDenominator = Some(unweightedRecallDenominator), - weightedRecallDenominator = Some(weightedRecallDenominator), - relativePrecisionNumerator = precision, - relativePrecision = relativePrecision, - weightAndProductOfNodeScoresCorrelation = Some( - Util.computeCorrelation( - originalWtAndProductOfNodeScoresSample.iterator.asScala.map(_._2))) - ) - } - } - - object ClusterResultsMonoid 
extends Monoid[ClusterResults] { - override def zero = ClusterResults(0, 0, 0, 0, samplerMonoid.zero) - override def plus(l: ClusterResults, r: ClusterResults) = ClusterResults( - l.numEdgesInsideCluster + r.numEdgesInsideCluster, - l.wtOfEdgesInsideCluster + r.wtOfEdgesInsideCluster, - l.numEdgesOutsideCluster + r.numEdgesOutsideCluster, - l.wtOfEdgesOutsideCluster + r.wtOfEdgesOutsideCluster, - samplerMonoid - .plus(l.originalWtAndProductOfNodeScoresSample, r.originalWtAndProductOfNodeScoresSample) - ) - } - - /** - * Evaluate the quality of a cluster. - * @param memberScores A map with the members of the cluster as the keys and their scores - * inside the cluster as values. The more central a member is inside the cluster, - * the higher its score is. - * @param membersAdjLists A map that gives the weighted neighbors of each member in the cluster. - */ - def evaluateCluster( - memberScores: Map[Long, Double], - membersAdjLists: Map[Long, Map[Long, Float]] - ): ClusterResults = { - val resultsIter = membersAdjLists.flatMap { - case (fromNodeId, adjList) => - val fromNodeWt = memberScores.getOrElse(fromNodeId, 0.0) - adjList.map { - case (toNodeId, edgeWt) => - if (memberScores.contains(toNodeId)) { - val productOfMembershipScores = fromNodeWt * memberScores(toNodeId) - ClusterResults( - 1, - edgeWt, - 0, - 0, - samplerMonoid.build( - ((fromNodeId, toNodeId), (edgeWt.toDouble, productOfMembershipScores)))) - } else { - ClusterResults(0, 0, 1, edgeWt, samplerMonoid.zero) - } - } - } - Monoid.sum(resultsIter)(ClusterResultsMonoid) - } - - /** - * Evaluate each cluster with respect to the provided graph. - * @param graph graph represented via the adjacency lists of each node, needs to be symmetrized, i.e. if u is in v's adjlist, then v needs to be in u's adjlist as well - * @param clusters cluster memberships of each node. 
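- *                 e.g. a hypothetical row (42L, Array((7, 0.9f), (13, 0.4f))) means user 42 is in clusters 7 and 13 with those membership scores.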
- * @param statsPrefix convenience argument to act as prefix for stats counters - * @return key-value pipe with clusterId as key and (size of the cluster, quality struct) as value - */ - def clusterLevelEvaluation( - graph: TypedPipe[(Long, Map[Long, Float])], - clusters: TypedPipe[(Long, Array[(Int, Float)])], - statsPrefix: String = "" - )( - implicit uniqueId: UniqueID - ): Execution[TypedPipe[(Int, (Int, ClusterQuality))]] = { - val numRealClusters = Stat(s"${statsPrefix}/numRealClusters") - val numFakeClusters = Stat(s"${statsPrefix}/numFakeClusters") - - val numNodesAndEdgesExec = graph - .map { - case (nId, nbrMap) => - (1L, nbrMap.size.toLong, nbrMap.values.sum.toDouble) - }.sum.getExecution - - numNodesAndEdgesExec.map { - case (numNodes, numEdges, sumOfAllEdgeWts) => - println("numNodes " + numNodes) - println("numEdges " + numEdges) - println("sumOfAllEdgeWts " + sumOfAllEdgeWts) - - val numFakeClustersForUnassignedNodes = numNodes / 1e4 - - val averagePrecisionWholeGraph = sumOfAllEdgeWts / (numNodes * (numNodes - 1)) - graph - .leftJoin(clusters) - // uncomment for adhoc job - .withReducers(200) - .flatMap { - case (nodeId, (adjList, assignedClustersOpt)) => - val nodeDegree = adjList.size.toLong - val nodeWeightedDegree = adjList.values.sum - assignedClustersOpt match { - case Some(assignedClusters) if assignedClusters.nonEmpty => - assignedClusters.toList.map { - case (clusterId, scoreOfNodeInCluster) => - ( - clusterId, - ( - Map(nodeId -> (scoreOfNodeInCluster.toDouble, adjList)), - 1, - nodeDegree, - nodeWeightedDegree)) - } - case _ => - // For nodes that don't belong to any cluster, create a fake clusterId (0 or lesser) - // and add the node's statistics to that clusterId. We don't need the adjacency lists for - // unassigned nodes, we'll simply track how many edges are incident on those nodes and their weighted sum etc - val fakeClusterId = - (-1 * (math.abs( - Util.hashToLong(nodeId)) % numFakeClustersForUnassignedNodes)).toInt - List( - ( - fakeClusterId, - ( - Map.empty[Long, (Double, Map[Long, Float])], - 1, - nodeDegree, - nodeWeightedDegree))) - } - } - .sumByKey - // uncomment for adhoc job - .withReducers(60) - .map { - case (clusterId, (membersMap, clusterSize, volumeOfCluster, weightedVolumeOfCluster)) => - if (clusterId > 0) { - numRealClusters.inc() - - val scoresMap = - if (clusterId > 0) membersMap.mapValues(_._1) else Map.empty[Long, Double] - val adjListsMap = membersMap.mapValues(_._2) - - val quality = evaluateCluster(scoresMap, adjListsMap) - .clusterQuality(clusterSize, averagePrecisionWholeGraph) - - (clusterId, (clusterSize, quality)) - } else { - // clusterId <= 0 means that this is a fake cluster. 
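- // Worked example, assuming ~2e7 input nodes: numFakeClustersForUnassignedNodes is then - // 2e7 / 1e4 = 2000, so unassigned nodes hash into about 2000 buckets with ids in (-2000, 0].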
- numFakeClusters.inc() - ( - clusterId, - ( - clusterSize, - ClusterQuality( - unweightedRecallDenominator = Some(volumeOfCluster), - weightedRecallDenominator = Some(weightedVolumeOfCluster) - ) - ) - ) - } - } - } - } - - case class OverallResults( - unweightedRecall: Double, - edgesInsideClusters: Long, - allEdges: Long, - allNodes: Int, - weightedRecall: Double, - wtOnEdgesInsideClusters: Double, - wtOnAllEdges: Double, - weightCorrelation: Double, - relativePrecision: Double, - numUnassignedNodes: Int, - numAssignedNodes: Int, - sizeDist: Distribution, - recallDist: Distribution, - weightedRecallDist: Distribution, - relativePrecisionDist: Distribution, - weightCorrelationDist: Distribution, - numClustersWithNegativeCorrelation: Double, - numClustersWithZeroRecall: Double, - numClustersWithLessThanOneRelativePrecision: Double, - numSingletonClusters: Int) - - def summarizePerClusterResults( - perClusterResults: TypedPipe[(Int, (Int, ClusterQuality))] - ): Execution[Option[OverallResults]] = { - perClusterResults - .map { - case (clusterId, (size, quality)) => - val unweightedRecallDen = quality.unweightedRecallDenominator.getOrElse(0.0) - val unweightedRecallNum = quality.unweightedRecall.getOrElse(0.0) * unweightedRecallDen - val weightedRecallDen = quality.weightedRecallDenominator.getOrElse(0.0) - val weightedRecallNum = quality.weightedRecall.getOrElse(0.0) * weightedRecallDen - - val weightCorrelationDen = size - val weightCorrelationNum = - weightCorrelationDen * quality.weightAndProductOfNodeScoresCorrelation - .getOrElse(0.0) - - val relativePrecisionDen = size - val relativePrecisionNum = relativePrecisionDen * quality.relativePrecision.getOrElse(0.0) - - val numClustersWithNegativeCorrelation = - if (weightCorrelationNum < 0 && clusterId > 0) 1 else 0 - val numClustersWithLessThanOneRelativePrecision = - if (quality.relativePrecision.getOrElse(0.0) < 1 && clusterId > 0) 1 else 0 - val numClustersWithZeroRecall = if (weightedRecallNum < 1e-5 && clusterId > 0) 1 else 0 - val numUnassignedNodes = if (clusterId < 1) size else 0 - val numAssignedNodes = if (clusterId > 0) size else 0 - val numSingletonClusters = if (clusterId > 0 && size == 1) 1 else 0 - - ( - unweightedRecallDen, - unweightedRecallNum, - weightedRecallDen, - weightedRecallNum, - weightCorrelationDen, - weightCorrelationNum, - relativePrecisionDen, - relativePrecisionNum, - numClustersWithNegativeCorrelation, - numClustersWithLessThanOneRelativePrecision, - numClustersWithZeroRecall, - List(size.toDouble), - List(quality.unweightedRecall.getOrElse(0.0)), - List(quality.weightedRecall.getOrElse(0.0)), - List(quality.relativePrecision.getOrElse(0.0)), - List(quality.weightAndProductOfNodeScoresCorrelation.getOrElse(0.0)), - numUnassignedNodes, - numAssignedNodes, - numSingletonClusters - ) - } - .sum - .toOptionExecution - .map { opt => - opt.map { - case ( - unweightedRecallDen, - unweightedRecallNum, - weightedRecallDen, - weightedRecallNum, - weightCorrelationDen, - weightCorrelationNum, - relativePrecisionDen, - relativePrecisionNum, - numClustersWithNegativeCorrelation, - numClustersWithLessThanOneRelativePrecision, - numClustersWithZeroRecall, - sizeList, - unweightedRecallList, - weightedRecallList, - relativePrecisionList, - weightCorrelationList, - numUnassignedNodes, - numAssignedNodes, - numSingletonClusters) => - OverallResults( - unweightedRecall = unweightedRecallNum / unweightedRecallDen, - edgesInsideClusters = unweightedRecallNum.toLong, - allEdges = unweightedRecallDen.toLong, - allNodes = 
numAssignedNodes + numUnassignedNodes, - weightedRecall = weightedRecallNum / weightedRecallDen, - wtOnEdgesInsideClusters = weightedRecallNum, - wtOnAllEdges = weightedRecallDen, - weightCorrelation = weightCorrelationNum / weightCorrelationDen, - relativePrecision = relativePrecisionNum / relativePrecisionDen, - numAssignedNodes = numAssignedNodes, - numUnassignedNodes = numUnassignedNodes, - sizeDist = Util.distributionFromArray(sizeList.toArray), - recallDist = Util.distributionFromArray(unweightedRecallList.toArray), - weightedRecallDist = Util.distributionFromArray(weightedRecallList.toArray), - weightCorrelationDist = Util.distributionFromArray(weightCorrelationList.toArray), - relativePrecisionDist = Util.distributionFromArray(relativePrecisionList.toArray), - numClustersWithNegativeCorrelation = numClustersWithNegativeCorrelation, - numClustersWithLessThanOneRelativePrecision = - numClustersWithLessThanOneRelativePrecision, - numClustersWithZeroRecall = numClustersWithZeroRecall, - numSingletonClusters = numSingletonClusters - ) - } - } - } - - /** - * @param graph Input similarity graph, needs to be symmetrized i.e. if u is in v's adjlist, then v needs to be in u's adjlist as well - * @param clusters cluster assignments to be evaluated - * @return summary of results - */ - def overallEvaluation( - graph: TypedPipe[(Long, Map[Long, Float])], - clusters: TypedPipe[(Long, Array[(Int, Float)])], - statsPrefix: String - )( - implicit uniqueId: UniqueID - ): Execution[Option[OverallResults]] = { - clusterLevelEvaluation(graph, clusters, statsPrefix).flatMap(summarizePerClusterResults) - } -} - -/** - * ./bazel bundle src/scala/com/twitter/simclusters_v2/scalding:cluster_evaluation && \ - * oscar hdfs --user frigate --host hadoopnest1.atla.twitter.com --bundle cluster_evaluation \ - * --tool com.twitter.simclusters_v2.scalding.ClusterEvaluationAdhoc --screen --screen-detached \ - * --tee logs/clusterQualityFor_updatedUnnormalizedInputScores_usingSims20190318 -- \ - * --simsInputDir /user/frigate/your_ldap/commonDirForClusterEvaluation/classifiedSims_20190314_copiedFromAtlaProc \ - * --topK 20000000 --date 2019-03-18 --minActiveFollowers 400 \ - * --topUsersDir /user/frigate/your_ldap/commonDirForClusterEvaluation/top20MUsers_minActiveFollowers400_20190215 \ - * --maxSimsNeighborsForEval 40 \ - * --preparedSimsGraph /user/frigate/your_ldap/commonDirForClusterEvaluation/symmetrized_classifiedSims20190318_top20MUsers \ - * --outputDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/knownForClusterEvaluation \ - * --knownForDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/knownFor - */ -object ClusterEvaluationAdhoc extends TwitterExecutionApp { - implicit val tz: java.util.TimeZone = DateOps.UTC - implicit val dp = DateParser.default - - def job: Execution[Unit] = - Execution.getConfigMode.flatMap { - case (config, mode) => - Execution.withId { implicit uniqueId => - val args = config.getArgs - val knownFor = args - .optional("knownForDir").map { location => - KnownForSources.readKnownFor(location) - }.getOrElse(KnownForSources.knownFor_20M_Dec11_145K) - - val minActiveFollowers = args.int("minActiveFollowers", 400) - val topK = args.int("topK") - val date = DateRange.parse(args("date")) - - val topUsersExec = - TopUsersSimilarityGraph - .topUsers( - DAL.readMostRecentSnapshot(UsersourceFlatScalaDataset, date).toTypedPipe, - minActiveFollowers, - topK - ) - .map(_.id) - 
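- // keep only the ids of the topK users by active-follower count; the eval subgraph is restricted to them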
.count("num_top_users") - .make(TypedTsv(args("topUsersDir"))) - - val simsGraphExec = topUsersExec.flatMap { topUsers => - TopUsersSimilarityGraph.makeGraph( - TopUsersSimilarityGraph.getSubgraphFromUserGroupedInput( - TypedPipe.from(WTFCandidatesSource(args("simsInputDir"))), - topUsers, - args.int("maxSimsNeighborsForEval", 40), - degreeThresholdForStat = 5 - ), - args("preparedSimsGraph") - ) - } - - val fullExec = simsGraphExec.flatMap { sims => - ClusterEvaluation - .clusterLevelEvaluation(sims, knownFor, "eval") - .flatMap { clusterResultsPipe => - val clusterResults = clusterResultsPipe.forceToDiskExecution - val outputExec = clusterResults.flatMap { pipe => - pipe - .map { - case (clusterId, (clusterSize, quality)) => - "%d\t%d\t%.2g\t%.2g\t%.1f\t%.2g\t%.2f\t%.2g\t%.2g" - .format( - clusterId, - clusterSize, - quality.unweightedRecall.getOrElse(0.0), - quality.weightedRecall.getOrElse(0.0), - quality.unweightedRecallDenominator.getOrElse(0.0), - quality.weightedRecallDenominator.getOrElse(0.0), - quality.relativePrecision.getOrElse(0.0), - quality.relativePrecisionNumerator.getOrElse(0.0), - quality.weightAndProductOfNodeScoresCorrelation.getOrElse(0.0) - ) - }.writeExecution(TypedTsv(args("outputDir"))) - } - - val printExec = clusterResults.flatMap { pipe => - ClusterEvaluation.summarizePerClusterResults(pipe).map { - case Some(res) => - println("Overall results: " + Util.prettyJsonMapper.writeValueAsString(res)) - case None => - println("No overall results!!! Probably cluster results pipe is empty.") - } - } - - Execution.zip(outputExec, printExec) - } - } - - Util.printCounters(fullExec) - } - } -} - -trait ClusterEvaluationBatch extends TwitterScheduledExecutionApp { - implicit val tz: java.util.TimeZone = DateOps.UTC - implicit val dp = DateParser.default - - def firstTime: String - - def batchDescription: String - - def batchIncrement: Duration - - private lazy val execArgs = AnalyticsBatchExecutionArgs( - batchDesc = BatchDescription(batchDescription), - firstTime = BatchFirstTime(RichDate(firstTime)), - lastTime = None, - batchIncrement = BatchIncrement(batchIncrement) - ) - - val emailAddress: String = "no-reply@twitter.com" - - def knownForDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]] - - def knownForModelVersion: String - - def baselineKnownForDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]] - - def baselineKnownForModelVersion: String - - override def scheduledJob: Execution[Unit] = - AnalyticsBatchExecution(execArgs) { implicit dateRange => - Execution.withId { implicit uniqueId => - Execution.withArgs { args => - val baselineKnownFor = - KnownForSources.fromKeyVal( - DAL - .readMostRecentSnapshot(baselineKnownForDALDataset, dateRange.prepend(Days(7))) - .toTypedPipe, - baselineKnownForModelVersion - ) - - val knownFor = - KnownForSources.fromKeyVal( - DAL - .readMostRecentSnapshot(knownForDALDataset, dateRange.prepend(Days(7))) - .toTypedPipe, - knownForModelVersion - ) - - val inputSimsGraph = TypedPipe - .from(FollowingsCosineSimilaritiesManhattanSource()) - .map(_._2) - - val minActiveFollowers = args.int("minActiveFollowers") - val topK = args.int("topK") - val maxSimsNeighborsForEval = - args.int("maxSimsNeighborsForEval", 40) - - val topUsers = TopUsersSimilarityGraph - .topUsers( - DAL - .readMostRecentSnapshot(UsersourceFlatScalaDataset, dateRange) - .toTypedPipe, - minActiveFollowers, - topK - ) - .map(_.id) - .count("num_top_users") - - TopUsersSimilarityGraph - .getSubgraphFromUserGroupedInput( - fullGraph = 
inputSimsGraph, - usersToInclude = topUsers, - maxNeighborsPerNode = maxSimsNeighborsForEval, - degreeThresholdForStat = 2 - ) - .forceToDiskExecution - .flatMap { symmetrizedSims => - val baselineResultsExec = ClusterEvaluation - .overallEvaluation(symmetrizedSims, baselineKnownFor, "baselineKnownForEval") - val newResultsExec = ClusterEvaluation - .overallEvaluation(symmetrizedSims, knownFor, "newKnownForEval") - val minSizeOfBiggerClusterForComparison = 10 - val compareExec = CompareClusters.summarize( - CompareClusters.compare( - KnownForSources.transpose(baselineKnownFor), - KnownForSources.transpose(knownFor), - minSizeOfBiggerCluster = minSizeOfBiggerClusterForComparison - )) - - Execution - .zip(baselineResultsExec, newResultsExec, compareExec) - .map { - case (oldResults, newResults, compareResults) => - val emailText = - s"Evaluation Results for baseline knownFor: $baselineKnownForModelVersion \n" + - Util.prettyJsonMapper.writeValueAsString(oldResults) + - "\n\n-------------------\n\n" + - s"Evaluation Results for new knownFor:$knownForModelVersion\n" + - Util.prettyJsonMapper.writeValueAsString(newResults) + - "\n\n-------------------\n\n" + - s"Cosine similarity distribution between $baselineKnownForModelVersion and " + - s"$knownForModelVersion cluster membership vectors for " + - s"clusters with at least $minSizeOfBiggerClusterForComparison members:\n" + - Util.prettyJsonMapper - .writeValueAsString(compareResults) - - Util - .sendEmail( - emailText, - s"Evaluation results comparing $knownForModelVersion with baseline $baselineKnownForModelVersion", - emailAddress) - () - } - } - } - } - } -} - -/** - * capesospy-v2 update --build_locally --start_cron cluster_evaluation_for_20M_145k \ - * src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml - */ -object ClusterEvaluationFor20M145K extends ClusterEvaluationBatch { - override val firstTime: String = "2019-06-11" - - override val batchIncrement: Duration = Days(7) - - override val batchDescription = "com.twitter.simclusters_v2.scalding.ClusterEvaluationFor20M145K" - - override val knownForDALDataset = SimclustersV2KnownFor20M145KUpdatedScalaDataset - - override val knownForModelVersion = ModelVersions.Model20M145KUpdated - - override val baselineKnownForDALDataset = SimclustersV2KnownFor20M145KDec11ScalaDataset - - override val baselineKnownForModelVersion = ModelVersions.Model20M145KDec11 -} - -/** - * capesospy-v2 update --build_locally --start_cron cluster_evaluation_for_20M_145k_2020 \ - * src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml - */ -object ClusterEvaluationFor20M145K2020 extends ClusterEvaluationBatch { - override val firstTime: String = "2021-01-25" - - override val batchIncrement: Duration = Days(7) - - override val batchDescription = - "com.twitter.simclusters_v2.scalding.ClusterEvaluationFor20M145K2020" - - override val knownForDALDataset = SimclustersV2KnownFor20M145K2020ScalaDataset - - override val knownForModelVersion = ModelVersions.Model20M145K2020 - - override val baselineKnownForDALDataset = SimclustersV2KnownFor20M145KUpdatedScalaDataset - - override val baselineKnownForModelVersion = ModelVersions.Model20M145KUpdated -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/CompareClusters.docx b/src/scala/com/twitter/simclusters_v2/scalding/CompareClusters.docx new file mode 100644 index 000000000..58bfd2be7 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/CompareClusters.docx differ diff --git 
a/src/scala/com/twitter/simclusters_v2/scalding/CompareClusters.scala b/src/scala/com/twitter/simclusters_v2/scalding/CompareClusters.scala deleted file mode 100644 index 55d538d4a..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/CompareClusters.scala +++ /dev/null @@ -1,131 +0,0 @@ -package com.twitter.simclusters_v2.scalding - -import com.twitter.scalding.{DateOps, DateParser, Execution, Stat, TypedPipe, TypedTsv, UniqueID} -import com.twitter.scalding_internal.job.TwitterExecutionApp -import com.twitter.simclusters_v2.common.{ClusterId, UserId} -import com.twitter.simclusters_v2.scalding.common.Util -import com.twitter.simclusters_v2.scalding.common.Util.Distribution - -object CompareClusters { - def norm(a: Iterable[Float]): Float = { - math - .sqrt(a.map { x => x * x }.sum).toFloat - } - - def cosine(a: Map[Long, Float], b: Map[Long, Float]): Float = { - val intersect = a.toList.collect { - case (id, score) if b.contains(id) => - score * b(id) - } - val dot = if (intersect.nonEmpty) intersect.sum else 0 - val aNorm = norm(a.values) - val bNorm = norm(b.values) - if (aNorm > 0 && bNorm > 0) { - dot / aNorm / bNorm - } else 0 - } - - /** - * Compare two known-for data sets and generate stats on changes in cluster assignment - */ - def compareClusterAssignments( - newKnownFor: TypedPipe[(UserId, List[(ClusterId, Float)])], - oldKnownFor: TypedPipe[(UserId, List[(ClusterId, Float)])] - )( - implicit uniqueID: UniqueID - ): Execution[String] = { - - val emptyToSomething = Stat("no_assignment_to_some") - val somethingToEmpty = Stat("some_assignment_to_none") - val emptyToEmpty = Stat("empty_to_empty") - val sameCluster = Stat("same_cluster") - val diffCluster = Stat("diff_cluster") - - val calculateStatExec = newKnownFor - .outerJoin(oldKnownFor) - .map { - case (userId, (newKnownForListOpt, oldKnownForListOpt)) => - val newKnownFor = newKnownForListOpt.getOrElse(Nil) - val oldKnownFor = oldKnownForListOpt.getOrElse(Nil) - - if (newKnownFor.nonEmpty && oldKnownFor.isEmpty) { - emptyToSomething.inc() - } - if (newKnownFor.isEmpty && oldKnownFor.nonEmpty) { - somethingToEmpty.inc() - } - if (newKnownFor.isEmpty && oldKnownFor.isEmpty) { - emptyToEmpty.inc() - } - - if (newKnownFor.nonEmpty && oldKnownFor.nonEmpty) { - val newClusterId = newKnownFor.head._1 - val oldClusterId = oldKnownFor.head._1 - - if (newClusterId == oldClusterId) { - sameCluster.inc() - } else { - diffCluster.inc() - } - } - userId - } - .toIterableExecution - - Util.getCustomCountersString(calculateStatExec) - } - - /** - * Compare two cluster assignments in terms of cosine similarity of corresponding clusters. - * Excludes clusters which are too small - * @param knownForA first cluster assignment, as clusterId -> members with scores - * @param knownForB second cluster assignment to compare against - * @param minSizeOfBiggerCluster Set to 10 or some such. 
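- *                               e.g. with 10, a cluster pair is compared only when at least one of the two clusters has 10 or more members; all other pairs are skipped.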
- * @return - */ - def compare( - knownForA: TypedPipe[(Int, List[(Long, Float)])], - knownForB: TypedPipe[(Int, List[(Long, Float)])], - minSizeOfBiggerCluster: Int - ): TypedPipe[(Int, Float)] = { - knownForA - .outerJoin(knownForB) - .collect { - case (clusterId, (membersInAOpt, membersInBOpt)) - if membersInAOpt.exists(_.size >= minSizeOfBiggerCluster) || membersInBOpt - .exists(_.size >= minSizeOfBiggerCluster) => - val membersInA = - membersInAOpt.map(_.toMap).getOrElse(Map.empty[Long, Float]) - val membersInB = - membersInBOpt.map(_.toMap).getOrElse(Map.empty[Long, Float]) - (clusterId, cosine(membersInA, membersInB)) - } - } - - def summarize(clusterToCosines: TypedPipe[(Int, Float)]): Execution[Option[Distribution]] = { - clusterToCosines.values.map(x => List(x)).sum.toOptionExecution.map { listOpt => - listOpt.map { list => Util.distributionFromArray(list.map(_.toDouble).toArray) } - } - } -} - -object CompareClustersAdhoc extends TwitterExecutionApp { - implicit val tz: java.util.TimeZone = DateOps.UTC - implicit val dp = DateParser.default - - def job: Execution[Unit] = - Execution.getConfigMode.flatMap { - case (config, mode) => - Execution.withId { implicit uniqueId => - val args = config.getArgs - - val knownForA = KnownForSources.transpose(KnownForSources.readKnownFor(args("knownForA"))) - val knownForB = KnownForSources.transpose(KnownForSources.readKnownFor(args("knownForB"))) - - CompareClusters - .compare(knownForA, knownForB, minSizeOfBiggerCluster = 10) - .map { case (cId, cos) => "%d\t%.2f".format(cId, cos) } - .writeExecution(TypedTsv(args("outputDir"))) - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/EigenVectorsForSparseSymmetric.docx b/src/scala/com/twitter/simclusters_v2/scalding/EigenVectorsForSparseSymmetric.docx new file mode 100644 index 000000000..78d911ad7 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/EigenVectorsForSparseSymmetric.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/EigenVectorsForSparseSymmetric.scala b/src/scala/com/twitter/simclusters_v2/scalding/EigenVectorsForSparseSymmetric.scala deleted file mode 100644 index 7171e0e7a..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/EigenVectorsForSparseSymmetric.scala +++ /dev/null @@ -1,330 +0,0 @@ -package com.twitter.simclusters_v2.scalding - -import com.twitter.algebird.Monoid -import com.twitter.logging.Logger -import com.twitter.scalding.{Execution, TypedPipe, TypedTsv} -import com.twitter.scalding_internal.job.TwitterExecutionApp -import com.twitter.simclusters_v2.hdfs_sources.AdhocKeyValSources -import java.util -import no.uib.cipr.matrix.Matrix -import no.uib.cipr.matrix.sparse.{ArpackSym, LinkedSparseMatrix} -import scala.collection.JavaConverters._ - -object EigenVectorsForSparseSymmetric { - val log: Logger = Logger() - - /** - * Construct matrix from the rows of the matrix, specified as a map. The outer map is indexed by rowId, and the inner maps are indexed by columnId. - * Note that the input matrix is intended to be symmetric. - * - * @param map A map specifying the rows of the matrix. The outer map is indexed by rowId, and the inner maps are indexed by columnId. Both rows and columns are zero-indexed. 
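- *            e.g. the hypothetical input Map(0 -> Map(1 -> 0.5), 1 -> Map(0 -> 0.5)) with nRows = nCols = 2 yields a 2x2 matrix whose two off-diagonal entries are 0.5.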
- * @param nRows number of rows in matrix - * @param nCols number of columns in matrix - * - * @return the constructed matrix - */ - def getMatrix(map: Map[Int, Map[Int, Double]], nRows: Int, nCols: Int): Matrix = { - val nonzeros = map.toSeq.flatMap { - case (i, subMap) => - subMap.toSeq.map { - case (j, value) => - (i, j, value) - } - } - getMatrix(nonzeros, nRows, nCols) - } - - /** - * Construct matrix from an iterable of the non-zero entries. Note that the input matrix is intended to be symmetric. - * - * @param nonzeros non-zeros in (i, j, v) format, where i is row, j is column, and v is value. Both rows and columns are zero-indexed. - * @param nRows number of rows in matrix - * @param nCols number of columns in matrix - * - * @return the constructed matrix - */ - def getMatrix(nonzeros: Iterable[(Int, Int, Double)], nRows: Int, nCols: Int): Matrix = { - val matrix = new LinkedSparseMatrix(nRows, nCols) - var numEntries = 0 - var maxRow = 0 - var maxCol = 0 - - nonzeros.foreach { - case (i, j, v) => - if (i > maxRow) { - maxRow = i - } - if (j > maxCol) { - maxCol = j - } - numEntries += 1 - matrix.set(i, j, v) - } - log.info( - "Finished building matrix with %d entries and maxRow %d and maxCol %d" - .format(numEntries, maxRow, maxCol)) - - matrix - } - - /** - * Prints out various diagnostics about how much the given matrix differs from a perfectly - * symmetric matrix. If (i,j) and (j,i) are different, it sets both of them to be the max of the two. - * Call this function before invoking EVD. - * - * @param matrix Matrix which is modified (if need be) in place. - */ - def ensureMatrixIsSymmetric(matrix: Matrix): Unit = { - var numUnequalEntries = 0 - var numEntriesDifferentBy1Percent = 0 - var numEqualEntries = 0 - var numUnequalDueToZero = 0 - var maxUnequal = (0, 0, 0.0, 0.0) - matrix.iterator().asScala.foreach { entry => - val curr = entry.get() - val opp = matrix.get(entry.column(), entry.row()) - if (curr == opp) { - numEqualEntries += 1 - } else { - numUnequalEntries += 1 - if (opp == 0) { - numUnequalDueToZero += 1 - } - if (opp != 0 && (math.abs(curr - opp) / math.min(curr, opp)) > 0.01) { - numEntriesDifferentBy1Percent += 1 - } - if (opp != 0 && math.abs(curr - opp) > maxUnequal._4) { - maxUnequal = (entry.row(), entry.column(), curr, math.abs(curr - opp)) - } - val max = math.max(curr, opp) - matrix.set(entry.column(), entry.row(), max) - matrix.set(entry.row(), entry.column(), max) - } - } - - var numUnEqualPrinted = 0 - matrix.iterator().asScala.foreach { entry => - val opp = matrix.get(entry.column(), entry.row()) - if (numUnEqualPrinted < 10 && entry.get() != opp) { - numUnEqualPrinted += 1 - log.info( - "Entries for (%d, %d) are %s and %s" - .format(entry.row(), entry.column(), entry.get(), opp)) - } - } - - log.info( - "Num unequal entries: %d, num unequal due to zero: %d, num unequal by 1 percent or more: %d, num equal entries: %d, maxUnequal: %s" - .format( - numUnequalEntries, - numUnequalDueToZero, - numEntriesDifferentBy1Percent, - numEqualEntries, - maxUnequal)) - } - - /** - * Get the top-k eigenvalues and eigenvectors for an input matrix; "top" means largest in magnitude. - * The input matrix needs to be perfectly symmetric; if it's not, this function will fail. - * - * Many of the eigenvectors will have very small values along most of the dimensions. This method therefore - * retains only the bigger entries in an eigenvector. - * - * @param matrix symmetric input matrix. - * @param k how many of the top eigenvectors to get. 
- * @param ratioToLargestCutoff An entry needs to be at least 1/ratioToLargestCutoff of the biggest entry in that vector to be retained. - * - * @return seq of (eigenvalue, eigenvector) pairs. - */ - def getTruncatedEVD( - matrix: Matrix, - k: Int, - ratioToLargestCutoff: Float - ): Seq[(Double, Seq[(Int, Double)])] = { - val solver = new ArpackSym(matrix) - val resultsMap = solver.solve(k, ArpackSym.Ritz.LM).asScala.toMap - val results = resultsMap.toIndexedSeq.sortBy { case (eigValue, _) => -eigValue } - results.zipWithIndex.map { - case ((eigValue, denseVectorJava), index) => - val denseVector = new Array[Double](denseVectorJava.size()) - denseVector.indices.foreach { index => denseVector(index) = denseVectorJava.get(index) } - val denseVectorMax = denseVector.maxBy { entry => math.abs(entry) } - val cutOff = math.abs(denseVectorMax) / ratioToLargestCutoff - val significantEntries = denseVector.zipWithIndex - .filter { case (vectorEntry, _) => math.abs(vectorEntry) >= cutOff } - .sortBy { case (vectorEntry, _) => -1 * math.abs(vectorEntry) } - (eigValue.toDouble, significantEntries.toSeq.map(_.swap)) - } - } - - /** - * Compute U*Diag*Ut - where Diag is a diagonal matrix, and U is a sparse matrix. - * This is primarily for testing - to make sure that the computed eigenvectors can be used to - * reconstruct the original matrix up to some reasonable approximation. - * - * @param diagToUColumns seq of (diagonal entries, associated column in U) - * @param cutoff cutoff for including a value in the result. - * - * @return result of multiplication, returned as a map of the rows in the results. - */ - def uTimesDiagTimesUT( - diagToUColumns: Seq[(Double, Seq[(Int, Double)])], - cutoff: Double - ): Map[Int, Map[Int, Double]] = { - val result = new util.HashMap[Int, util.HashMap[Int, Double]]() - diagToUColumns.foreach { - case (diag, uColumn) => - uColumn.foreach { - case (i, iVal) => - uColumn.foreach { - case (j, jVal) => - val prod = diag * iVal * jVal - if (result.containsKey(i)) { - val newVal = if (result.get(i).containsKey(j)) { - result.get(i).get(j) + prod - } else prod - result.get(i).put(j, newVal) - } else { - result.put(i, new util.HashMap[Int, Double]) - result.get(i).put(j, prod) - } - } - } - } - val unfiltered = result.asScala.toMap.mapValues(_.asScala.toMap) - unfiltered - .mapValues { m => m.filter { case (_, value) => math.abs(value) >= cutoff } } - .filter { case (_, vector) => vector.nonEmpty } - } - - /** Note: This requires a full EVD to correctly compute the inverse! 
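- * A truncated EVD misses the smallest-magnitude eigenvalues, and it is exactly their reciprocals 1.0 / eigValue that dominate the inverse; hence the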
:-( */ - def getInverseFromEVD( - evd: Seq[(Double, Seq[(Int, Double)])], - cutoff: Double - ): Map[Int, Map[Int, Double]] = { - val evdInverse = evd.map { - case (eigValue, eigVector) => - (1.0 / eigValue, eigVector) - } - uTimesDiagTimesUT(evdInverse, cutoff) - } -} - -object PCAProjectionMatrixAdhoc extends TwitterExecutionApp { - val log = Logger() - - def job: Execution[Unit] = - Execution.getConfigMode.flatMap { - case (config, _) => - Execution.withId { _ => - val args = config.getArgs - val k = args.int("k", 100) - val ratioToLargestEntryInVectorCutoff = args.int("ratioToLargestEntryInVectorCutoff", 100) - val minClusterFavers = args.int("minClusterFavers", 1000) - val input = TypedPipe.from(AdhocKeyValSources.clusterDetailsSource(args("inputDir"))) - val outputDir = args("outputDir") - - val filteredClustersExec = - input - .collect { - case ((_, clusterId), details) - if details.numUsersWithNonZeroFavScore > minClusterFavers => - clusterId - } - .toIterableExecution - .map { fc => - val fcSet = fc.toSet - log.info("Number of clusters with favers more than %d is %d" - .format(minClusterFavers, fcSet.size)) - fcSet - } - - filteredClustersExec - .flatMap { filteredClusters => - input.flatMap { - case ((_, clusterId), details) => - if (filteredClusters(clusterId)) { - details.neighborClusters.getOrElse(Nil).collect { - case neighbor - if filteredClusters( - neighbor.clusterId) && neighbor.favCosineSimilarity.isDefined => - (clusterId, neighbor.clusterId, neighbor.favCosineSimilarity.get) - } - } else Nil - }.toIterableExecution - } - .flatMap { edgesIter => - val edges = edgesIter.toSeq - val oldIdToNewId = edges - .flatMap { case (i, j, _) => Seq(i, j) } - .distinct - .zipWithIndex - .toMap - - val mapString = oldIdToNewId.toList - .take(5).map { - case (old, nw) => - Seq(old, nw).mkString(" ") - }.mkString("\n") - log.info("A few entries of OldId to NewId map is") - log.info(mapString) - - val newIdToOldId = oldIdToNewId.map(_.swap) - log.info( - "Num clusters after filtering out those with no neighbors with favers more than %d is %d" - .format(minClusterFavers, oldIdToNewId.size)) - val newEdges = edges.map { - case (oldI, oldJ, value) => - (oldIdToNewId(oldI), oldIdToNewId(oldJ), value) - } - log.info("Going to build matrix") - val matrix = EigenVectorsForSparseSymmetric.getMatrix( - newEdges, - oldIdToNewId.size, - oldIdToNewId.size) - EigenVectorsForSparseSymmetric.ensureMatrixIsSymmetric(matrix) - - log.info("Going to solve now for %d eigenvalues".format(k)) - val tic = System.currentTimeMillis() - val results = EigenVectorsForSparseSymmetric.getTruncatedEVD( - matrix, - k, - ratioToLargestEntryInVectorCutoff) - val toc = System.currentTimeMillis() - log.info("Finished solving in %.2f minutes".format((toc - tic) / 1000 / 60.0)) - - val eigValues = results.map(_._1).map { x => "%.3g".format(x) }.mkString(" ") - val eigValueNorm = math.sqrt(results.map(_._1).map(x => x * x).sum) - val matrixNorm = math.sqrt(matrix.iterator().asScala.map(_.get()).map(x => x * x).sum) - - println( - "matrixNorm %s, eigValueNorm %s, explained fraction %s" - .format(matrixNorm, eigValueNorm, eigValueNorm / matrixNorm)) - - log.info("The eigenvalues are:") - log.info(eigValues) - - val nnzInEigenVectors = results.map(_._2.size).sum - log.info("Average nnz per eigenvector using ratioToLargestCutoff %d is %.2g" - .format(ratioToLargestEntryInVectorCutoff, nnzInEigenVectors * 1.0 / results.size)) - val transposedRaw = results.zipWithIndex.flatMap { - case ((_, eigVector), eigIndex) => - eigVector.map { 
- case (index, vectorEntry) => - val clusterId = newIdToOldId(index) - Map(clusterId -> List((eigIndex, vectorEntry))) - } - } - val transposed = Monoid.sum(transposedRaw).mapValues { rowForCluster => - rowForCluster - .map { - case (dimId, weight) => - "%d:%.2g".format(dimId, weight) - }.mkString(" ") - } - TypedPipe.from(transposed.toSeq).writeExecution(TypedTsv(outputDir)) - } - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/InterestedInFromAggregatableProducerEmbeddings.docx b/src/scala/com/twitter/simclusters_v2/scalding/InterestedInFromAggregatableProducerEmbeddings.docx new file mode 100644 index 000000000..8eb43d211 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/InterestedInFromAggregatableProducerEmbeddings.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/InterestedInFromAggregatableProducerEmbeddings.scala b/src/scala/com/twitter/simclusters_v2/scalding/InterestedInFromAggregatableProducerEmbeddings.scala deleted file mode 100644 index a65f2a44f..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/InterestedInFromAggregatableProducerEmbeddings.scala +++ /dev/null @@ -1,332 +0,0 @@ -package com.twitter.simclusters_v2.scalding - -import com.twitter.dal.client.dataset.KeyValDALDataset -import com.twitter.dal.client.dataset.SnapshotDALDataset -import com.twitter.scalding._ -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.scalding_internal.dalv2.DALWrite.D -import com.twitter.scalding_internal.dalv2.DALWrite.WriteExtension -import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossClusterSameDC -import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossDC -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.common.ClusterId -import com.twitter.simclusters_v2.common.ModelVersions -import com.twitter.simclusters_v2.common.UserId -import com.twitter.simclusters_v2.hdfs_sources.AdhocKeyValSources -import com.twitter.simclusters_v2.hdfs_sources.AggregatableProducerSimclustersEmbeddingsByLogFavScore2020ScalaDataset -import com.twitter.simclusters_v2.hdfs_sources.SimclustersV2InterestedInFromAggregatableProducerEmbeddings20M145K2020ScalaDataset -import com.twitter.simclusters_v2.hdfs_sources.SimclustersV2UserToInterestedInFromAggregatableProducerEmbeddings20M145K2020ScalaDataset -import com.twitter.simclusters_v2.hdfs_sources.UserAndNeighborsFixedPathSource -import com.twitter.simclusters_v2.hdfs_sources.UserUserNormalizedGraphScalaDataset -import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn -import com.twitter.simclusters_v2.thriftscala.InternalId -import com.twitter.simclusters_v2.thriftscala.ModelVersion -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbedding -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId -import com.twitter.simclusters_v2.thriftscala.UserAndNeighbors -import com.twitter.simclusters_v2.thriftscala.UserToInterestedInClusterScores -import com.twitter.simclusters_v2.thriftscala.UserToInterestedInClusters -import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp -import com.twitter.wtf.scalding.jobs.common.ScheduledExecutionApp -import java.util.TimeZone - -/** - * Production job for computing interestedIn data set from the aggregatable producer embeddings for the model version 20M145K2020. - * It writes the data set in KeyVal format to produce a MH DAL data set. 
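- * For intuition, with hypothetical scores and assuming the normalization is L2: a producer row Map(7 -> 80.0, 13 -> 20.0) becomes roughly Map(7 -> 0.82, 13 -> 0.57) after the log1p and normalization steps listed below.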
- * - * A high-level description of this job: - * - Read the APE dataset - * - Apply log1p to the scores from the above dataset, as the scores for producers are high - * - Normalize the scores for each producer (offline benchmarking has shown better results from this step). - * - Truncate the number of clusters for each producer from the APE dataset to reduce noise - * - Compute interestedIn - * - * To deploy the job: - * - * capesospy-v2 update --build_locally --start_cron interested_in_from_ape_2020 \ - * src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml - */ -object InterestedInFromAPE2020BatchApp extends InterestedInFromAggregatableProducerEmbeddingsBase { - - override val firstTime: RichDate = RichDate("2021-03-03") - - override val batchIncrement: Duration = Days(7) - - override def modelVersion: ModelVersion = ModelVersion.Model20m145k2020 - - override def producerEmbeddingsInputKVDataset: KeyValDALDataset[ - KeyVal[SimClustersEmbeddingId, SimClustersEmbedding] - ] = AggregatableProducerSimclustersEmbeddingsByLogFavScore2020ScalaDataset - - override def interestedInFromAPEOutputKVDataset: KeyValDALDataset[ - KeyVal[UserId, ClustersUserIsInterestedIn] - ] = SimclustersV2InterestedInFromAggregatableProducerEmbeddings20M145K2020ScalaDataset - - override def interestedInFromAPEOutputThriftDatset: SnapshotDALDataset[ - UserToInterestedInClusters - ] = SimclustersV2UserToInterestedInFromAggregatableProducerEmbeddings20M145K2020ScalaDataset -} - -trait InterestedInFromAggregatableProducerEmbeddingsBase extends ScheduledExecutionApp { - def modelVersion: ModelVersion - - def interestedInFromAPEOutputKVDataset: KeyValDALDataset[ - KeyVal[UserId, ClustersUserIsInterestedIn] - ] - - def producerEmbeddingsInputKVDataset: KeyValDALDataset[ - KeyVal[SimClustersEmbeddingId, SimClustersEmbedding] - ] - - def interestedInFromAPEOutputThriftDatset: SnapshotDALDataset[UserToInterestedInClusters] - - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - // Input args for the run - val socialProofThreshold = args.int("socialProofThreshold", 2) - val maxClustersFromProducer = args.int("maxClustersPerProducer", 5) - val maxClustersPerUserFinalResult = args.int("maxInterestedInClustersPerUser", 200) - - // Path variables - val interestedInFromProducersPath = - s"/user/cassowary/manhattan_sequence_files/interested_in_from_ape/" + modelVersion - - val interestedInFromProducersThriftPath = - s"/user/cassowary/manhattan_sequence_files/interested_in_from_ape_thrift/" + modelVersion - - val userUserGraph: TypedPipe[UserAndNeighbors] = - DAL - .readMostRecentSnapshotNoOlderThan(UserUserNormalizedGraphScalaDataset, Days(30)) - .withRemoteReadPolicy(AllowCrossDC) - .toTypedPipe - - val producerEmbeddings = DAL - .readMostRecentSnapshotNoOlderThan( - producerEmbeddingsInputKVDataset, - Days(30)).withRemoteReadPolicy(AllowCrossClusterSameDC).toTypedPipe.map { - case KeyVal(producer, embeddings) => (producer, embeddings) - } - - val result = InterestedInFromAggregatableProducerEmbeddingsBase.run( - userUserGraph, - producerEmbeddings, - maxClustersFromProducer, - socialProofThreshold, - maxClustersPerUserFinalResult, - modelVersion) - - val keyValExec = - result - .map { case (userId, clusters) => KeyVal(userId, clusters) } - .writeDALVersionedKeyValExecution( - interestedInFromAPEOutputKVDataset, - D.Suffix(interestedInFromProducersPath) - ) - val thriftExec = - result - .map { - case (userId, clusters) => -
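
The three preprocessing steps the docstring lists compose as log1p, then L2 normalization, then truncation. A toy sketch of those steps on plain collections (the producer's cluster ids and raw scores are invented for illustration):

object ApePreprocessingSketch extends App {
  val raw: Seq[(Int, Double)] = Seq(101 -> 900.0, 102 -> 300.0, 103 -> 30.0)
  val maxClusters = 2

  // Step 1: log1p dampens the very large raw producer scores.
  val damped = raw.map { case (c, s) => (c, math.log1p(s)) }

  // Step 2: L2-normalize the producer's vector.
  val norm = math.sqrt(damped.map { case (_, s) => s * s }.sum)
  val normalized = damped.map { case (c, s) => (c, s / norm) }

  // Step 3: keep only the top-scoring clusters to reduce noise.
  val truncated = normalized.sortBy { case (_, s) => -s }.take(maxClusters)

  println(truncated) // the two strongest clusters, now on comparable scales
}
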
UserToInterestedInClusters( - userId, - ModelVersions.toKnownForModelVersion(modelVersion), - clusters.clusterIdToScores) - } - .writeDALSnapshotExecution( - interestedInFromAPEOutputThriftDatset, - D.Daily, - D.Suffix(interestedInFromProducersThriftPath), - D.EBLzo(), - dateRange.end - ) - Execution.zip(keyValExec, thriftExec).unit - } -} - -/** - * Adhoc job to generate the interestedIn from aggregatable producer embeddings for the model version 20M145K2020 - * - * scalding remote run \ - * --user cassowary \ - * --keytab /var/lib/tss/keys/fluffy/keytabs/client/cassowary.keytab \ - * --principal service_account@TWITTER.BIZ \ - * --cluster bluebird-qus1 \ - * --main-class com.twitter.simclusters_v2.scalding.InterestedInFromAPE2020AdhocApp \ - * --target src/scala/com/twitter/simclusters_v2/scalding:interested_in_from_ape_2020-adhoc \ - * --hadoop-properties "mapreduce.map.memory.mb=8192 mapreduce.map.java.opts='-Xmx7618M' mapreduce.reduce.memory.mb=8192 mapreduce.reduce.java.opts='-Xmx7618M'" \ - * -- --outputDir /gcs/user/cassowary/adhoc/your_ldap/interested_in_from_ape_2020_keyval --date 2021-03-05 - */ -object InterestedInFromAPE2020AdhocApp extends AdhocExecutionApp { - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - val outputDir = args("outputDir") - val socialProofThreshold = args.int("socialProofThreshold", 2) - val maxClustersPerUserFinalResult = args.int("maxInterestedInClustersPerUser", 200) - val maxClustersFromProducer = args.int("maxClustersFromProducer", 5) - val inputGraph = args.optional("graphInputDir") match { - case Some(inputDir) => TypedPipe.from(UserAndNeighborsFixedPathSource(inputDir)) - case None => - DAL - .readMostRecentSnapshotNoOlderThan(UserUserNormalizedGraphScalaDataset, Days(30)) - .withRemoteReadPolicy(AllowCrossClusterSameDC) - .toTypedPipe - } - - val producerEmbeddings = DAL - .readMostRecentSnapshotNoOlderThan( - AggregatableProducerSimclustersEmbeddingsByLogFavScore2020ScalaDataset, - Days(30)).withRemoteReadPolicy(AllowCrossClusterSameDC).toTypedPipe.map { - case KeyVal(producer, embeddings) => (producer, embeddings) - } - - val result = InterestedInFromAggregatableProducerEmbeddingsBase.run( - inputGraph, - producerEmbeddings, - maxClustersFromProducer, - socialProofThreshold, - maxClustersPerUserFinalResult, - ModelVersion.Model20m145k2020) - - result - .writeExecution(AdhocKeyValSources.interestedInSource(outputDir)) - } -} - -/** - * Helper functions - */ -object InterestedInFromAggregatableProducerEmbeddingsBase { - - /** - * Helper function to prune the embeddings - * @param embeddingsWithScore embeddings - * @param maxClusters number of clusters to keep, per userId - * @param uniqueId for stats - * @return - */ - def getPrunedEmbeddings( - embeddingsWithScore: TypedPipe[(UserId, Seq[(ClusterId, Float)])], - maxClusters: Int - )( - implicit uniqueId: UniqueID - ): TypedPipe[(UserId, Array[(ClusterId, Float)])] = { - val numProducerMappings = Stat("num_producer_embeddings_total") - val numProducersWithLargeClusterMappings = Stat( - "num_producers_with_more_clusters_than_threshold") - val numProducersWithSmallClusterMappings = Stat( - "num_producers_with_clusters_less_than_threshold") - val totalClustersCoverageProducerEmbeddings = Stat("num_clusters_total_producer_embeddings") - embeddingsWithScore.map { - case (producerId, clusterArray) => - numProducerMappings.inc() - val clusterSize = clusterArray.size -
totalClustersCoverageProducerEmbeddings.incBy(clusterSize) - val prunedList = if (clusterSize > maxClusters) { - numProducersWithLargeClusterMappings.inc() - clusterArray - .sortBy { - case (_, knownForScore) => -knownForScore - }.take(maxClusters) - } else { - numProducersWithSmallClusterMappings.inc() - clusterArray - } - (producerId, prunedList.toArray) - } - } - - /** - * helper function to remove all scores except follow and logFav - * @param interestedInResult interestedIn clusters for a user - * @return - */ - def getInterestedInDiscardScores( - interestedInResult: TypedPipe[(UserId, List[(ClusterId, UserToInterestedInClusterScores)])] - ): TypedPipe[(UserId, List[(ClusterId, UserToInterestedInClusterScores)])] = { - interestedInResult.map { - case (srcId, fullClusterList) => - val fullClusterListWithDiscardedScores = fullClusterList.map { - case (clusterId, clusterDetails) => - val clusterDetailsWithoutSocial = UserToInterestedInClusterScores( - // We are not planning to use the other scores except for logFav and Follow. - // Hence, setting others as None for now, we can add them back when needed - followScore = clusterDetails.followScore, - logFavScore = clusterDetails.logFavScore, - logFavScoreClusterNormalizedOnly = clusterDetails.logFavScoreClusterNormalizedOnly - ) - (clusterId, clusterDetailsWithoutSocial) - } - (srcId, fullClusterListWithDiscardedScores) - } - } - - /** - * Helper function to normalize the embeddings - * @param embeddings cluster embeddings - * @return - */ - def getNormalizedEmbeddings( - embeddings: TypedPipe[(UserId, Seq[(ClusterId, Float)])] - ): TypedPipe[(UserId, Seq[(ClusterId, Float)])] = { - embeddings.map { - case (userId, clustersWithScores) => - val l2norm = math.sqrt(clustersWithScores.map(_._2).map(score => score * score).sum) - ( - userId, - clustersWithScores.map { - case (clusterId, score) => (clusterId, (score / l2norm).toFloat) - }) - } - } - - def run( - userUserGraph: TypedPipe[UserAndNeighbors], - producerEmbeddings: TypedPipe[(SimClustersEmbeddingId, SimClustersEmbedding)], - maxClustersFromProducer: Int, - socialProofThreshold: Int, - maxClustersPerUserFinalResult: Int, - modelVersion: ModelVersion - )( - implicit uniqueId: UniqueID - ): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = { - import InterestedInFromKnownFor._ - - val producerEmbeddingsWithScore: TypedPipe[(UserId, Seq[(ClusterId, Float)])] = - producerEmbeddings.map { - case ( - SimClustersEmbeddingId(embeddingType, modelVersion, InternalId.UserId(producerId)), - simclusterEmbedding) => - ( - producerId, - simclusterEmbedding.embedding.map { simclusterWithScore => - // APE dataset has very high producer scores, hence applying log to smoothen them out before - // computing interestedIn - (simclusterWithScore.clusterId, math.log(1.0 + simclusterWithScore.score).toFloat) - }) - } - - val result = keepOnlyTopClusters( - getInterestedInDiscardScores( - attachNormalizedScores( - userClusterPairsWithoutNormalization( - userUserGraph, - getPrunedEmbeddings( - getNormalizedEmbeddings(producerEmbeddingsWithScore), - maxClustersFromProducer), - socialProofThreshold, - ))), - maxClustersPerUserFinalResult, - ModelVersions.toKnownForModelVersion(modelVersion) - ) - result - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/InterestedInFromKnownFor.docx b/src/scala/com/twitter/simclusters_v2/scalding/InterestedInFromKnownFor.docx new file mode 100644 index 000000000..29884e880 Binary files /dev/null and 
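
getInterestedInDiscardScores above follows a common space-saving pattern: rebuild the thrift struct with only the fields downstream consumers read, leaving everything else at its None default. A reduced sketch of that pattern with a plain case class standing in for the thrift type (field set simplified):

object DiscardScoresSketch extends App {
  // Illustrative stand-in for UserToInterestedInClusterScores.
  case class Scores(
    followScore: Option[Double] = None,
    favScore: Option[Double] = None,
    logFavScore: Option[Double] = None)

  // Keep only the retained fields; the rest default to None, shrinking the output.
  def discard(full: Scores): Scores =
    Scores(followScore = full.followScore, logFavScore = full.logFavScore)

  println(discard(Scores(Some(0.3), Some(0.7), Some(0.5)))) // favScore becomes None
}
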
b/src/scala/com/twitter/simclusters_v2/scalding/InterestedInFromKnownFor.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/InterestedInFromKnownFor.scala b/src/scala/com/twitter/simclusters_v2/scalding/InterestedInFromKnownFor.scala deleted file mode 100644 index ab2cbde2d..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/InterestedInFromKnownFor.scala +++ /dev/null @@ -1,666 +0,0 @@ -package com.twitter.simclusters_v2.scalding - -import com.twitter.algebird.Semigroup -import com.twitter.bijection.Injection -import com.twitter.dal.client.dataset.KeyValDALDataset -import com.twitter.scalding.TypedPipe -import com.twitter.scalding._ -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.scalding_internal.dalv2.DALWrite._ -import com.twitter.scalding_internal.job.TwitterExecutionApp -import com.twitter.scalding_internal.job.analytics_batch.AnalyticsBatchExecution -import com.twitter.scalding_internal.job.analytics_batch.AnalyticsBatchExecutionArgs -import com.twitter.scalding_internal.job.analytics_batch.BatchDescription -import com.twitter.scalding_internal.job.analytics_batch.BatchFirstTime -import com.twitter.scalding_internal.job.analytics_batch.BatchIncrement -import com.twitter.scalding_internal.job.analytics_batch.TwitterScheduledExecutionApp -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.common.ClusterId -import com.twitter.simclusters_v2.common.ModelVersions -import com.twitter.simclusters_v2.common.UserId -import com.twitter.simclusters_v2.hdfs_sources._ -import com.twitter.simclusters_v2.scalding.common.Util -import com.twitter.simclusters_v2.thriftscala._ - -/** - * This file implements the job for computing users' interestedIn vector from KnownFor data set. - * - * It reads the UserUserNormalizedGraphScalaDataset to get user-user follow + fav graph, and then - * based on the known-for clusters of each followed/faved user, we calculate how much a user is - * interestedIn a cluster. - */ - -/** - * Production job for computing interestedIn data set for the model version 20M145K2020. - * - * To deploy the job: - * - * capesospy-v2 update --build_locally --start_cron interested_in_for_20M_145k_2020 \ - src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml - */ -object InterestedInFromKnownFor20M145K2020 extends InterestedInFromKnownForBatchBase { - override val firstTime: String = "2020-10-06" - override val outputKVDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsInterestedIn]] = - SimclustersV2RawInterestedIn20M145K2020ScalaDataset - override val outputPath: String = InternalDataPaths.RawInterestedIn2020Path - override val knownForModelVersion: String = ModelVersions.Model20M145K2020 - override val knownForDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]] = - SimclustersV2KnownFor20M145K2020ScalaDataset -} - -/** - * base class for the main logic of computing interestedIn from KnownFor data set. 
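
Concretely, the "how much a user is interestedIn a cluster" computation this file describes is a weighted sum over the user's neighbors: each followed/faved neighbor contributes edgeWeight * knownForScore to every cluster that neighbor is known for. A toy sketch of that aggregation on plain collections (ids and weights invented):

object InterestedInSketch extends App {
  // (user, (neighbor, fav edge weight)) pairs from the follow/fav graph.
  val edges: Seq[(Long, (Long, Double))] =
    Seq(1L -> (10L, 0.8), 1L -> (11L, 0.3))

  // neighbor -> clusters the neighbor is known for, with knownFor scores.
  val knownFor: Map[Long, Seq[(Int, Float)]] =
    Map(10L -> Seq(7 -> 0.9f), 11L -> Seq(7 -> 0.5f, 8 -> 0.4f))

  val interestedIn: Map[(Long, Int), Double] = edges
    .flatMap { case (user, (neighbor, w)) =>
      knownFor.getOrElse(neighbor, Nil).map { case (c, k) => ((user, c), w * k) }
    }
    .groupBy(_._1)
    .map { case (key, xs) => (key, xs.map(_._2).sum) }

  println(interestedIn) // (1,7) -> ~0.87 (0.72 + 0.15); (1,8) -> ~0.12
}
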
- */ -trait InterestedInFromKnownForBatchBase extends TwitterScheduledExecutionApp { - implicit val tz = DateOps.UTC - implicit val parser = DateParser.default - - def firstTime: String - val batchIncrement: Duration = Days(7) - val lookBackDays: Duration = Days(30) - - def outputKVDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsInterestedIn]] - def outputPath: String - def knownForModelVersion: String - def knownForDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]] - - private lazy val execArgs = AnalyticsBatchExecutionArgs( - batchDesc = BatchDescription(this.getClass.getName.replace("$", "")), - firstTime = BatchFirstTime(RichDate(firstTime)), - lastTime = None, - batchIncrement = BatchIncrement(batchIncrement) - ) - - override def scheduledJob: Execution[Unit] = AnalyticsBatchExecution(execArgs) { - implicit dateRange => - Execution.withId { implicit uniqueId => - Execution.withArgs { args => - val normalizedGraph = - DAL.readMostRecentSnapshot(UserUserNormalizedGraphScalaDataset).toTypedPipe - val knownFor = KnownForSources.fromKeyVal( - DAL.readMostRecentSnapshot(knownForDALDataset, dateRange.extend(Days(30))).toTypedPipe, - knownForModelVersion - ) - - val socialProofThreshold = args.int("socialProofThreshold", 2) - val maxClustersPerUser = args.int("maxClustersPerUser", 50) - - val result = InterestedInFromKnownFor - .run( - normalizedGraph, - knownFor, - socialProofThreshold, - maxClustersPerUser, - knownForModelVersion - ) - - val writeKeyValResultExec = result - .map { case (userId, clusters) => KeyVal(userId, clusters) } - .writeDALVersionedKeyValExecution( - outputKVDataset, - D.Suffix(outputPath) - ) - - // read previous data set for validation purpose - val previousDataset = if (RichDate(firstTime).timestamp != dateRange.start.timestamp) { - DAL - .readMostRecentSnapshot(outputKVDataset, dateRange.prepend(lookBackDays)).toTypedPipe - .map { - case KeyVal(user, interestedIn) => - (user, interestedIn) - } - } else { - TypedPipe.empty - } - - Util.printCounters( - Execution - .zip( - writeKeyValResultExec, - InterestedInFromKnownFor.dataSetStats(result, "NewResult"), - InterestedInFromKnownFor.dataSetStats(previousDataset, "OldResult") - ).unit - ) - } - } - } -} - -/** - * Adhoc job to compute user interestedIn. 
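
One detail worth noting in the scheduled job above: the previous snapshot is read for stats comparison only when the current batch is not the very first one, since no prior output exists on the first run. A stripped-down sketch of that guard (dates and payload types are illustrative):

import java.time.LocalDate

object FirstBatchGuardSketch extends App {
  val firstTime = LocalDate.parse("2020-10-06")

  // Previous results for validation, or empty on the very first batch.
  def previousForValidation(batchStart: LocalDate): Seq[(Long, String)] =
    if (batchStart != firstTime) Seq(42L -> "interestedIn from last snapshot")
    else Seq.empty

  println(previousForValidation(firstTime))             // List(): first run
  println(previousForValidation(firstTime.plusDays(7))) // prior snapshot compared
}
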
- * - * scalding remote run --target src/scala/com/twitter/simclusters_v2/scalding:interested_in_adhoc \ - * --user recos-platform \ - * --submitter hadoopnest2.atla.twitter.com \ - * --main-class com.twitter.simclusters_v2.scalding.InterestedInFromKnownForAdhoc -- \ - * --date 2019-08-26 --outputDir /user/recos-platform/adhoc/simclusters_interested_in_log_fav - */ -object InterestedInFromKnownForAdhoc extends TwitterExecutionApp { - def job: Execution[Unit] = - Execution.getConfigMode.flatMap { - case (config, mode) => - Execution.withId { implicit uniqueId => - val args = config.getArgs - val normalizedGraph = TypedPipe.from( - UserAndNeighborsFixedPathSource(args("graphInputDir")) - ) - val socialProofThreshold = args.int("socialProofThreshold", 2) - val maxClustersPerUser = args.int("maxClustersPerUser", 20) - val knownForModelVersion = args("knownForModelVersion") - val knownFor = KnownForSources.readKnownFor(args("knownForInputDir")) - - val outputSink = AdhocKeyValSources.interestedInSource(args("outputDir")) - Util.printCounters( - InterestedInFromKnownFor - .run( - normalizedGraph, - knownFor, - socialProofThreshold, - maxClustersPerUser, - knownForModelVersion - ).writeExecution(outputSink) - ) - } - } -} - -/** - * Adhoc job to check the output of an adhoc interestedInSource. - */ -object DumpInterestedInAdhoc extends TwitterExecutionApp { - def job: Execution[Unit] = - Execution.getConfigMode.flatMap { - case (config, mode) => - Execution.withId { implicit uniqueId => - val args = config.getArgs - val users = args.list("users").map(_.toLong).toSet - val input = TypedPipe.from(AdhocKeyValSources.interestedInSource(args("inputDir"))) - input.filter { case (userId, rec) => users.contains(userId) }.toIterableExecution.map { - s => println(s.map(Util.prettyJsonMapper.writeValueAsString).mkString("\n")) - } - } - } -} - -/** - * Helper functions - */ -object InterestedInFromKnownFor { - private def ifNanMake0(x: Double): Double = if (x.isNaN) 0.0 else x - - case class SrcClusterIntermediateInfo( - followScore: Double, - followScoreProducerNormalized: Double, - favScore: Double, - favScoreProducerNormalized: Double, - logFavScore: Double, - logFavScoreProducerNormalized: Double, - followSocialProof: List[Long], - favSocialProof: List[Long]) { - // overriding for the sake of unit tests - override def equals(obj: scala.Any): Boolean = { - obj match { - case that: SrcClusterIntermediateInfo => - math.abs(followScore - that.followScore) < 1e-5 && - math.abs(followScoreProducerNormalized - that.followScoreProducerNormalized) < 1e-5 && - math.abs(favScore - that.favScore) < 1e-5 && - math.abs(favScoreProducerNormalized - that.favScoreProducerNormalized) < 1e-5 && - math.abs(logFavScore - that.logFavScore) < 1e-5 && - math.abs(logFavScoreProducerNormalized - that.logFavScoreProducerNormalized) < 1e-5 && - followSocialProof.toSet == that.followSocialProof.toSet && - favSocialProof.toSet == that.favSocialProof.toSet - case _ => false - } - } - } - - implicit object SrcClusterIntermediateInfoSemigroup - extends Semigroup[SrcClusterIntermediateInfo] { - override def plus( - left: SrcClusterIntermediateInfo, - right: SrcClusterIntermediateInfo - ): SrcClusterIntermediateInfo = { - SrcClusterIntermediateInfo( - followScore = left.followScore + right.followScore, - followScoreProducerNormalized = - left.followScoreProducerNormalized + right.followScoreProducerNormalized, - favScore = left.favScore + right.favScore, - favScoreProducerNormalized = - left.favScoreProducerNormalized + 
right.favScoreProducerNormalized, - logFavScore = left.logFavScore + right.logFavScore, - logFavScoreProducerNormalized = - left.logFavScoreProducerNormalized + right.logFavScoreProducerNormalized, - followSocialProof = - Semigroup.plus(left.followSocialProof, right.followSocialProof).distinct, - favSocialProof = Semigroup.plus(left.favSocialProof, right.favSocialProof).distinct - ) - } - } - - /** - * @param adjacencyLists User-User follow/fav graph - * @param knownFor KnownFor data set. Each user can be known for several clusters with certain - * knownFor weights. - * @param socialProofThreshold A user will only be interested in a cluster if they follow/fav at - * least a certain number of users known for this cluster. - * @param uniqueId required for these Stats - * @return - */ - def userClusterPairsWithoutNormalization( - adjacencyLists: TypedPipe[UserAndNeighbors], - knownFor: TypedPipe[(Long, Array[(Int, Float)])], - socialProofThreshold: Int - )( - implicit uniqueId: UniqueID - ): TypedPipe[((Long, Int), SrcClusterIntermediateInfo)] = { - val edgesToUsersWithKnownFor = Stat("num_edges_to_users_with_known_for") - val srcDestClusterTriples = Stat("num_src_dest_cluster_triples") - val srcClusterPairsBeforeSocialProofThresholding = - Stat("num_src_cluster_pairs_before_social_proof_thresholding") - val srcClusterPairsAfterSocialProofThresholding = - Stat("num_src_cluster_pairs_after_social_proof_thresholding") - - val edges = adjacencyLists.flatMap { - case UserAndNeighbors(srcId, neighborsWithWeights) => - neighborsWithWeights.map { neighborWithWeights => - ( - neighborWithWeights.neighborId, - neighborWithWeights.copy(neighborId = srcId) - ) - } - } - - implicit val l2b: Long => Array[Byte] = Injection.long2BigEndian - - edges - .sketch(4000) - .join(knownFor) - .flatMap { - case (destId, (srcWithWeights, clusterArray)) => - edgesToUsersWithKnownFor.inc() - clusterArray.toList.map { - case (clusterId, knownForScoreF) => - val knownForScore = math.max(0.0, knownForScoreF.toDouble) - - srcDestClusterTriples.inc() - val followScore = - if (srcWithWeights.isFollowed.contains(true)) knownForScore else 0.0 - val followScoreProducerNormalizedOnly = - srcWithWeights.followScoreNormalizedByNeighborFollowersL2.getOrElse( - 0.0) * knownForScore - val favScore = - srcWithWeights.favScoreHalfLife100Days.getOrElse(0.0) * knownForScore - - val favScoreProducerNormalizedOnly = - srcWithWeights.favScoreHalfLife100DaysNormalizedByNeighborFaversL2.getOrElse( - 0.0) * knownForScore - - val logFavScore = srcWithWeights.logFavScore.getOrElse(0.0) * knownForScore - - val logFavScoreProducerNormalizedOnly = srcWithWeights.logFavScoreL2Normalized - .getOrElse(0.0) * knownForScore - - val followSocialProof = if (srcWithWeights.isFollowed.contains(true)) { - List(destId) - } else Nil - val favSocialProof = if (srcWithWeights.favScoreHalfLife100Days.exists(_ > 0)) { - List(destId) - } else Nil - - ( - (srcWithWeights.neighborId, clusterId), - SrcClusterIntermediateInfo( - followScore, - followScoreProducerNormalizedOnly, - favScore, - favScoreProducerNormalizedOnly, - logFavScore, - logFavScoreProducerNormalizedOnly, - followSocialProof, - favSocialProof - ) - ) - } - } - .sumByKey - .withReducers(10000) - .filter { - case ((_, _), SrcClusterIntermediateInfo(_, _, _, _, _, _, followProof, favProof)) => - srcClusterPairsBeforeSocialProofThresholding.inc() - val distinctSocialProof = (followProof ++ favProof).toSet - val result = distinctSocialProof.size >= socialProofThreshold - if (result) { -
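
The Semigroup above is what sumByKey uses to merge partial per-(user, cluster) contributions arriving from different neighbors: numeric scores add, while social-proof lists concatenate and de-duplicate. A reduced sketch of those merge semantics (two fields instead of eight, values invented):

object SemigroupMergeSketch extends App {
  case class Partial(favScore: Double, favSocialProof: List[Long])

  // Mirrors SrcClusterIntermediateInfoSemigroup.plus on the two kept fields.
  def plus(a: Partial, b: Partial): Partial =
    Partial(
      a.favScore + b.favScore,
      (a.favSocialProof ++ b.favSocialProof).distinct)

  val merged = plus(Partial(0.4, List(10L)), Partial(0.6, List(10L, 11L)))
  println(merged) // Partial(1.0, List(10, 11)): scores add, proof stays distinct
}
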
srcClusterPairsAfterSocialProofThresholding.inc() - } - result - } - } - - /** - * Add the cluster-level l2 norm scores, and use them to normalize follow/fav scores. - */ - def attachNormalizedScores( - intermediate: TypedPipe[((Long, Int), SrcClusterIntermediateInfo)] - )( - implicit uniqueId: UniqueID - ): TypedPipe[(Long, List[(Int, UserToInterestedInClusterScores)])] = { - - def square(x: Double): Double = x * x - - val clusterCountsAndNorms = - intermediate - .map { - case ( - (_, clusterId), - SrcClusterIntermediateInfo( - followScore, - followScoreProducerNormalizedOnly, - favScore, - favScoreProducerNormalizedOnly, - logFavScore, - logFavScoreProducerNormalizedOnly, - _, - _ - ) - ) => - ( - clusterId, - ( - 1, - square(followScore), - square(followScoreProducerNormalizedOnly), - square(favScore), - square(favScoreProducerNormalizedOnly), - square(logFavScore), - square(logFavScoreProducerNormalizedOnly) - ) - ) - } - .sumByKey - // .withReducers(100) - .map { - case ( - clusterId, - ( - cnt, - squareFollowScore, - squareFollowScoreProducerNormalizedOnly, - squareFavScore, - squareFavScoreProducerNormalizedOnly, - squareLogFavScore, - squareLogFavScoreProducerNormalizedOnly - )) => - ( - clusterId, - ( - cnt, - math.sqrt(squareFollowScore), - math.sqrt(squareFollowScoreProducerNormalizedOnly), - math.sqrt(squareFavScore), - math.sqrt(squareFavScoreProducerNormalizedOnly), - math.sqrt(squareLogFavScore), - math.sqrt(squareLogFavScoreProducerNormalizedOnly) - )) - } - - implicit val i2b: Int => Array[Byte] = Injection.int2BigEndian - - intermediate - .map { - case ((srcId, clusterId), clusterScoresTuple) => - (clusterId, (srcId, clusterScoresTuple)) - } - .sketch(reducers = 900) - .join(clusterCountsAndNorms) - .map { - case ( - clusterId, - ( - ( - srcId, - SrcClusterIntermediateInfo( - followScore, - followScoreProducerNormalizedOnly, - favScore, - favScoreProducerNormalizedOnly, - logFavScore, - logFavScoreProducerNormalizedOnly, // not used for now - followProof, - favProof - ) - ), - ( - cnt, - followNorm, - followProducerNormalizedNorm, - favNorm, - favProducerNormalizedNorm, - logFavNorm, - logFavProducerNormalizedNorm // not used for now - ) - ) - ) => - ( - srcId, - List( - ( - clusterId, - UserToInterestedInClusterScores( - followScore = Some(ifNanMake0(followScore)), - followScoreClusterNormalizedOnly = Some(ifNanMake0(followScore / followNorm)), - followScoreProducerNormalizedOnly = - Some(ifNanMake0(followScoreProducerNormalizedOnly)), - followScoreClusterAndProducerNormalized = Some( - ifNanMake0(followScoreProducerNormalizedOnly / followProducerNormalizedNorm)), - favScore = Some(ifNanMake0(favScore)), - favScoreClusterNormalizedOnly = Some(ifNanMake0(favScore / favNorm)), - favScoreProducerNormalizedOnly = Some(ifNanMake0(favScoreProducerNormalizedOnly)), - favScoreClusterAndProducerNormalized = - Some(ifNanMake0(favScoreProducerNormalizedOnly / favProducerNormalizedNorm)), - usersBeingFollowed = Some(followProof), - usersThatWereFaved = Some(favProof), - numUsersInterestedInThisClusterUpperBound = Some(cnt), - logFavScore = Some(ifNanMake0(logFavScore)), - logFavScoreClusterNormalizedOnly = Some(ifNanMake0(logFavScore / logFavNorm)) - )) - ) - ) - } - .sumByKey - // .withReducers(1000) - .toTypedPipe - } - - /** - * Aggregate cluster scores for each user, to be used instead of attachNormalizedScores - * when we do not want to compute cluster-level l2 norm scores - */ - def groupClusterScores( - intermediate: TypedPipe[((Long, Int), SrcClusterIntermediateInfo)] - )( -
implicit uniqueId: UniqueID - ): TypedPipe[(Long, List[(Int, UserToInterestedInClusterScores)])] = { - - intermediate - .map { - case ( - (srcId, clusterId), - SrcClusterIntermediateInfo( - followScore, - followScoreProducerNormalizedOnly, - favScore, - favScoreProducerNormalizedOnly, - logFavScore, - logFavScoreProducerNormalizedOnly, - followProof, - favProof - ) - ) => - ( - srcId, - List( - ( - clusterId, - UserToInterestedInClusterScores( - followScore = Some(ifNanMake0(followScore)), - followScoreProducerNormalizedOnly = - Some(ifNanMake0(followScoreProducerNormalizedOnly)), - favScore = Some(ifNanMake0(favScore)), - favScoreProducerNormalizedOnly = Some(ifNanMake0(favScoreProducerNormalizedOnly)), - usersBeingFollowed = Some(followProof), - usersThatWereFaved = Some(favProof), - logFavScore = Some(ifNanMake0(logFavScore)), - )) - ) - ) - } - .sumByKey - .withReducers(1000) - .toTypedPipe - } - - /** - * For each user, only keep up to a certain number of clusters. - * @param allInterests user with a list of interestedIn clusters. - * @param maxClustersPerUser number of clusters to keep for each user - * @param knownForModelVersion known for model version - * @param uniqueId required for these Stat - * @return - */ - def keepOnlyTopClusters( - allInterests: TypedPipe[(Long, List[(Int, UserToInterestedInClusterScores)])], - maxClustersPerUser: Int, - knownForModelVersion: String - )( - implicit uniqueId: UniqueID - ): TypedPipe[(Long, ClustersUserIsInterestedIn)] = { - val userClusterPairsBeforeUserTruncation = - Stat("num_user_cluster_pairs_before_user_truncation") - val userClusterPairsAfterUserTruncation = - Stat("num_user_cluster_pairs_after_user_truncation") - val usersWithALotOfClusters = - Stat(s"num_users_with_more_than_${maxClustersPerUser}_clusters") - - allInterests - .map { - case (srcId, fullClusterList) => - userClusterPairsBeforeUserTruncation.incBy(fullClusterList.size) - val truncatedClusters = if (fullClusterList.size > maxClustersPerUser) { - usersWithALotOfClusters.inc() - fullClusterList - .sortBy { - case (_, clusterScores) => - ( - -clusterScores.favScore.getOrElse(0.0), - -clusterScores.logFavScore.getOrElse(0.0), - -clusterScores.followScore.getOrElse(0.0), - -clusterScores.logFavScoreClusterNormalizedOnly.getOrElse(0.0), - -clusterScores.followScoreProducerNormalizedOnly.getOrElse(0.0) - ) - } - .take(maxClustersPerUser) - } else { - fullClusterList - } - userClusterPairsAfterUserTruncation.incBy(truncatedClusters.size) - (srcId, ClustersUserIsInterestedIn(knownForModelVersion, truncatedClusters.toMap)) - } - } - - def run( - adjacencyLists: TypedPipe[UserAndNeighbors], - knownFor: TypedPipe[(UserId, Array[(ClusterId, Float)])], - socialProofThreshold: Int, - maxClustersPerUser: Int, - knownForModelVersion: String - )( - implicit uniqueId: UniqueID - ): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = { - keepOnlyTopClusters( - attachNormalizedScores( - userClusterPairsWithoutNormalization( - adjacencyLists, - knownFor, - socialProofThreshold - ) - ), - maxClustersPerUser, - knownForModelVersion - ) - } - - /** - * run the interestedIn job, cluster normalized scores are not attached to user's clusters. 
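
For attachNormalizedScores above, the cluster-normalized variant of each score is score / sqrt(sum of score^2 over all users in that cluster), with NaNs mapped to 0 when a norm is zero. A numeric sketch of that step (raw scores invented):

object ClusterNormSketch extends App {
  // Raw follow scores of three users within one cluster.
  val rawByUser: Map[Long, Double] = Map(1L -> 3.0, 2L -> 4.0, 3L -> 0.0)

  def ifNanMake0(x: Double): Double = if (x.isNaN) 0.0 else x

  val clusterNorm = math.sqrt(rawByUser.values.map(x => x * x).sum) // sqrt(9 + 16) = 5

  val normalized = rawByUser.map { case (u, s) => (u, ifNanMake0(s / clusterNorm)) }
  println(normalized) // Map(1 -> 0.6, 2 -> 0.8, 3 -> 0.0)
}
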
- */ - def runWithoutClusterNormalizedScores( - adjacencyLists: TypedPipe[UserAndNeighbors], - knownFor: TypedPipe[(UserId, Array[(ClusterId, Float)])], - socialProofThreshold: Int, - maxClustersPerUser: Int, - knownForModelVersion: String - )( - implicit uniqueId: UniqueID - ): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = { - keepOnlyTopClusters( - groupClusterScores( - userClusterPairsWithoutNormalization( - adjacencyLists, - knownFor, - socialProofThreshold - ) - ), - maxClustersPerUser, - knownForModelVersion - ) - } - - /** - * print out some basic stats of the data set to make sure things are not broken - */ - def dataSetStats( - interestedInData: TypedPipe[(UserId, ClustersUserIsInterestedIn)], - dataSetName: String = "" - ): Execution[Unit] = { - - Execution - .zip( - Util.printSummaryOfNumericColumn( - interestedInData.map { - case (user, interestedIn) => - interestedIn.clusterIdToScores.size - }, - Some(s"$dataSetName UserInterestedIn Size") - ), - Util.printSummaryOfNumericColumn( - interestedInData.flatMap { - case (user, interestedIn) => - interestedIn.clusterIdToScores.map { - case (_, scores) => - scores.favScore.getOrElse(0.0) - } - }, - Some(s"$dataSetName UserInterestedIn favScore") - ), - Util.printSummaryOfNumericColumn( - interestedInData.flatMap { - case (user, interestedIn) => - interestedIn.clusterIdToScores.map { - case (_, scores) => - scores.favScoreClusterNormalizedOnly.getOrElse(0.0) - } - }, - Some(s"$dataSetName UserInterestedIn favScoreClusterNormalizedOnly") - ), - Util.printSummaryOfNumericColumn( - interestedInData.flatMap { - case (user, interestedIn) => - interestedIn.clusterIdToScores.map { - case (_, scores) => - scores.logFavScoreClusterNormalizedOnly.getOrElse(0.0) - } - }, - Some(s"$dataSetName UserInterestedIn logFavScoreClusterNormalizedOnly") - ) - ).unit - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/InterestedInFromKnownForLite.docx b/src/scala/com/twitter/simclusters_v2/scalding/InterestedInFromKnownForLite.docx new file mode 100644 index 000000000..9d10c60ce Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/InterestedInFromKnownForLite.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/InterestedInFromKnownForLite.scala b/src/scala/com/twitter/simclusters_v2/scalding/InterestedInFromKnownForLite.scala deleted file mode 100644 index e4b23ae52..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/InterestedInFromKnownForLite.scala +++ /dev/null @@ -1,354 +0,0 @@ -package com.twitter.simclusters_v2.scalding - -import com.twitter.algebird.Semigroup -import com.twitter.bijection.Injection -import com.twitter.dal.client.dataset.KeyValDALDataset -import com.twitter.scalding._ -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.scalding_internal.dalv2.DALWrite.{D, WriteExtension} -import com.twitter.scalding_internal.job.TwitterExecutionApp -import com.twitter.scalding_internal.job.analytics_batch.{ - AnalyticsBatchExecution, - AnalyticsBatchExecutionArgs, - BatchDescription, - BatchFirstTime, - BatchIncrement, - TwitterScheduledExecutionApp -} -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.common.{ClusterId, ModelVersions, UserId} -import com.twitter.simclusters_v2.hdfs_sources.{ - AdhocKeyValSources, - InternalDataPaths, - SimclustersV2KnownFor20M145K2020ScalaDataset, - SimclustersV2RawInterestedInLite20M145K2020ScalaDataset, - SimclustersV2RawInterestedIn20M145KUpdatedScalaDataset, - 
UserAndNeighborsFixedPathSource, - UserUserGraphScalaDataset -} -import com.twitter.simclusters_v2.scalding.common.Util -import com.twitter.simclusters_v2.thriftscala.{ - ClustersUserIsInterestedIn, - ClustersUserIsKnownFor, - UserAndNeighbors, - UserToInterestedInClusterScores -} -import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp -import java.util.TimeZone - -/** - * This file implements the job for computing users' interestedIn vector from KnownFor data set. - * - * It reads the UserUserGraphScalaDataset to get user-user follow + fav graph, and then - * based on the known-for clusters of each followed/faved user, we calculate how much a user is - * interestedIn a cluster. - * - * The main differences of InterestedInFromKnownForLite compared to InterestedInFromKnownFor are - * the following: - * - We read the UserUserGraph dataset that does not contain the producer normalized scores - * - We do not compute the cluster normalized scores for the clusters per user - * - For social proof thresholding, we do not keep track of the entire list of follow and - * fav social proofs but rather make use of numFollowSocial and numFavSocial (this introduces - * some noise if follow and fav social proof contain the same users) - * - Store 200 clusters per user compared to 50 in IIKF - * - Runs more frequently compared to weekly in IIKF - */ -/** - * Production job for computing interestedIn data set for the model version 20M145K2020. - * - * To deploy the job: - * - * capesospy-v2 update --build_locally --start_cron interested_in_lite_for_20M_145k_2020 \ - src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml - */ -object InterestedInFromKnownForLite20M145K2020 extends InterestedInFromKnownForLite { - override val firstTime: String = "2021-04-24" - override val outputKVDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsInterestedIn]] = - SimclustersV2RawInterestedInLite20M145K2020ScalaDataset - override val outputPath: String = InternalDataPaths.RawInterestedInLite2020Path - override val knownForModelVersion: String = ModelVersions.Model20M145K2020 - override val knownForDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]] = - SimclustersV2KnownFor20M145K2020ScalaDataset -} -trait InterestedInFromKnownForLite extends TwitterScheduledExecutionApp { - implicit val tz = DateOps.UTC - implicit val parser = DateParser.default - - def firstTime: String - val batchIncrement: Duration = Days(2) - val lookBackDays: Duration = Days(30) - - def outputKVDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsInterestedIn]] - def outputPath: String - def knownForModelVersion: String - def knownForDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]] - - private lazy val execArgs = AnalyticsBatchExecutionArgs( - batchDesc = BatchDescription(this.getClass.getName.replace("$", "")), - firstTime = BatchFirstTime(RichDate(firstTime)), - lastTime = None, - batchIncrement = BatchIncrement(batchIncrement) - ) - - override def scheduledJob: Execution[Unit] = AnalyticsBatchExecution(execArgs) { - implicit dateRange => - Execution.withId { implicit uniqueId => - Execution.withArgs { args => - val userUserGraph = - DAL.readMostRecentSnapshot(UserUserGraphScalaDataset).toTypedPipe - val knownFor = KnownForSources.fromKeyVal( - DAL.readMostRecentSnapshot(knownForDALDataset, dateRange.extend(Days(30))).toTypedPipe, - knownForModelVersion - ) - - val socialProofThreshold = args.int("socialProofThreshold", 2) - val maxClustersPerUser = args.int("maxClustersPerUser", 200) - - val
result = InterestedInFromKnownForLite - .run( - userUserGraph, - knownFor, - socialProofThreshold, - maxClustersPerUser, - knownForModelVersion - ) - - val writeKeyValResultExec = result - .map { - case (userId, clusters) => KeyVal(userId, clusters) - }.writeDALVersionedKeyValExecution( - outputKVDataset, - D.Suffix(outputPath) - ) - Util.printCounters(writeKeyValResultExec) - } - } - } -} - -/** - * Adhoc job to compute user interestedIn. - * - * scalding remote run \ - * --target src/scala/com/twitter/simclusters_v2/scalding:interested_in_lite_20m_145k_2020-adhoc \ - * --main-class com.twitter.simclusters_v2.scalding.InterestedInFromKnownForLite20M145K2020Adhoc \ - * --user cassowary --cluster bluebird-qus1 \ - * --keytab /var/lib/tss/keys/fluffy/keytabs/client/cassowary.keytab \ - * --principal service_account@TWITTER.BIZ \ - * -- \ - * --outputDir /gcs/user/cassowary/adhoc/interested_in_from_knownfor_lite/ \ - * --date 2020-08-25 - */ -object InterestedInFromKnownForLite20M145K2020Adhoc extends AdhocExecutionApp { - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - val userUserGraph = DAL.readMostRecentSnapshot(UserUserGraphScalaDataset).toTypedPipe - val socialProofThreshold = args.int("socialProofThreshold", 2) - val maxClustersPerUser = args.int("maxClustersPerUser", 200) - val knownForModelVersion = ModelVersions.Model20M145K2020 - val knownFor = KnownForSources.fromKeyVal( - DAL - .readMostRecentSnapshotNoOlderThan( - SimclustersV2KnownFor20M145K2020ScalaDataset, - Days(30)).toTypedPipe, - knownForModelVersion - ) - - val outputSink = AdhocKeyValSources.interestedInSource(args("outputDir")) - Util.printCounters( - InterestedInFromKnownForLite - .run( - userUserGraph, - knownFor, - socialProofThreshold, - maxClustersPerUser, - knownForModelVersion - ).writeExecution(outputSink) - ) - } - -} - -object InterestedInFromKnownForLite { - private def ifNanMake0(x: Double): Double = if (x.isNaN) 0.0 else x - - case class SrcClusterIntermediateInfo( - followScore: Double, - favScore: Double, - logFavScore: Double, - numFollowed: Int, - numFaved: Int) { - - // helper function used for test cases - override def equals(obj: scala.Any): Boolean = { - obj match { - case that: SrcClusterIntermediateInfo => - math.abs(followScore - that.followScore) < 1e-5 && - math.abs(favScore - that.favScore) < 1e-5 && - math.abs(logFavScore - that.logFavScore) < 1e-5 && - numFollowed == that.numFollowed && - numFaved == that.numFaved - case _ => false - } - } - } - - implicit object SrcClusterIntermediateInfoSemigroup - extends Semigroup[SrcClusterIntermediateInfo] { - override def plus( - left: SrcClusterIntermediateInfo, - right: SrcClusterIntermediateInfo - ): SrcClusterIntermediateInfo = { - SrcClusterIntermediateInfo( - followScore = left.followScore + right.followScore, - favScore = left.favScore + right.favScore, - logFavScore = left.logFavScore + right.logFavScore, - numFollowed = left.numFollowed + right.numFollowed, - numFaved = left.numFaved + right.numFaved - ) - } - } - - def run( - adjacencyLists: TypedPipe[UserAndNeighbors], - knownFor: TypedPipe[(UserId, Array[(ClusterId, Float)])], - socialProofThreshold: Int, - maxClustersPerUser: Int, - knownForModelVersion: String - )( - implicit uniqueId: UniqueID - ): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = { - InterestedInFromKnownFor.keepOnlyTopClusters( - groupClusterScores( - userClusterPairs( - adjacencyLists, - knownFor, -
socialProofThreshold - ) - ), - maxClustersPerUser, - knownForModelVersion - ) - } - - def userClusterPairs( - adjacencyLists: TypedPipe[UserAndNeighbors], - knownFor: TypedPipe[(Long, Array[(Int, Float)])], - socialProofThreshold: Int - )( - implicit uniqueId: UniqueID - ): TypedPipe[((Long, Int), SrcClusterIntermediateInfo)] = { - val edgesToUsersWithKnownFor = Stat("num_edges_to_users_with_known_for") - val srcDestClusterTriples = Stat("num_src_dest_cluster_triples") - val srcClusterPairsBeforeSocialProofThresholding = - Stat("num_src_cluster_pairs_before_social_proof_thresholding") - val srcClusterPairsAfterSocialProofThresholding = - Stat("num_src_cluster_pairs_after_social_proof_thresholding") - - val edges = adjacencyLists.flatMap { - case UserAndNeighbors(srcId, neighborsWithWeights) => - neighborsWithWeights.map { neighborWithWeights => - ( - neighborWithWeights.neighborId, - neighborWithWeights.copy(neighborId = srcId) - ) - } - } - - implicit val l2b: Long => Array[Byte] = Injection.long2BigEndian - - edges - .sketch(4000) - .join(knownFor) - .flatMap { - case (destId, (srcWithWeights, clusterArray)) => - edgesToUsersWithKnownFor.inc() - clusterArray.toList.map { - case (clusterId, knownForScoreF) => - val knownForScore = math.max(0.0, knownForScoreF.toDouble) - - srcDestClusterTriples.inc() - val followScore = - if (srcWithWeights.isFollowed.contains(true)) knownForScore else 0.0 - val favScore = - srcWithWeights.favScoreHalfLife100Days.getOrElse(0.0) * knownForScore - val logFavScore = srcWithWeights.logFavScore.getOrElse(0.0) * knownForScore - val numFollowed = if (srcWithWeights.isFollowed.contains(true)) { - 1 - } else 0 - - val numFaved = if (srcWithWeights.favScoreHalfLife100Days.exists(_ > 0)) { - 1 - } else 0 - - ( - (srcWithWeights.neighborId, clusterId), - SrcClusterIntermediateInfo( - followScore, - favScore, - logFavScore, - numFollowed, - numFaved - ) - ) - } - } - .sumByKey - .withReducers(10000) - .filter { - case ((_, _), SrcClusterIntermediateInfo(_, _, _, numFollowed, numFaved)) => - srcClusterPairsBeforeSocialProofThresholding.inc() - // we do not remove duplicates - val socialProofSize = numFollowed + numFaved - val result = socialProofSize >= socialProofThreshold - if (result) { - srcClusterPairsAfterSocialProofThresholding.inc() - } - result - } - } - - def groupClusterScores( - intermediate: TypedPipe[((Long, Int), SrcClusterIntermediateInfo)] - )( - implicit uniqueId: UniqueID - ): TypedPipe[(Long, List[(Int, UserToInterestedInClusterScores)])] = { - - implicit val i2b: Int => Array[Byte] = Injection.int2BigEndian - - intermediate - .map { - case ( - (srcId, clusterId), - SrcClusterIntermediateInfo( - followScore, - favScore, - logFavScore, - numFollowed, - numFaved - )) => - ( - srcId, - List( - ( - clusterId, - UserToInterestedInClusterScores( - followScore = Some(ifNanMake0(followScore)), - favScore = Some(ifNanMake0(favScore)), - logFavScore = Some(ifNanMake0(logFavScore)), - numUsersBeingFollowed = Some(numFollowed), - numUsersThatWereFaved = Some(numFaved) - )) - ) - ) - } - .sumByKey - // .withReducers(1000) - .toTypedPipe - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/InterestedInFromProducerEmbeddingsAdhocApp.docx b/src/scala/com/twitter/simclusters_v2/scalding/InterestedInFromProducerEmbeddingsAdhocApp.docx new file mode 100644 index 000000000..d770e0895 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/InterestedInFromProducerEmbeddingsAdhocApp.docx differ diff --git
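
The noise the Lite docstring warns about is visible in the thresholding filter above: numFollowed + numFaved double-counts a neighbor who was both followed and faved, whereas the non-Lite job unions the two proof lists before counting. A small sketch contrasting the two counts (ids invented):

object SocialProofCountSketch extends App {
  val followProof = List(10L, 11L)
  val favProof = List(11L, 12L) // user 11 was both followed and faved

  val liteCount = followProof.size + favProof.size      // 4, counts user 11 twice
  val exactCount = (followProof ++ favProof).toSet.size // 3, distinct users only

  // With socialProofThreshold = 4, this (user, cluster) pair would pass in Lite
  // but be dropped by the exact, de-duplicated variant.
  println((liteCount, exactCount))
}
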
a/src/scala/com/twitter/simclusters_v2/scalding/InterestedInFromProducerEmbeddingsAdhocApp.scala b/src/scala/com/twitter/simclusters_v2/scalding/InterestedInFromProducerEmbeddingsAdhocApp.scala deleted file mode 100644 index d924dd693..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/InterestedInFromProducerEmbeddingsAdhocApp.scala +++ /dev/null @@ -1,290 +0,0 @@ -package com.twitter.simclusters_v2.scalding - -import com.twitter.dal.client.dataset.KeyValDALDataset -import com.twitter.scalding.Execution -import com.twitter.scalding.TypedTsv -import com.twitter.scalding._ -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.scalding_internal.dalv2.DALWrite._ -import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation -import com.twitter.scalding_internal.dalv2.remote_access.ProcAtla -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.common.ModelVersions -import com.twitter.simclusters_v2.common.UserId -import com.twitter.simclusters_v2.hdfs_sources.ProducerEmbeddingSources -import com.twitter.simclusters_v2.hdfs_sources.AdhocKeyValSources -import com.twitter.simclusters_v2.hdfs_sources.DataSources -import com.twitter.simclusters_v2.hdfs_sources.SimclustersV2InterestedInFromProducerEmbeddings20M145KUpdatedScalaDataset -import com.twitter.simclusters_v2.hdfs_sources.UserAndNeighborsFixedPathSource -import com.twitter.simclusters_v2.hdfs_sources.UserUserNormalizedGraphScalaDataset -import com.twitter.simclusters_v2.scalding.common.Util -import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn -import com.twitter.simclusters_v2.thriftscala.EmbeddingType -import com.twitter.simclusters_v2.thriftscala.SimClusterWithScore -import com.twitter.simclusters_v2.thriftscala.TopSimClustersWithScore -import com.twitter.simclusters_v2.thriftscala.UserToInterestedInClusterScores -import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp -import com.twitter.wtf.scalding.jobs.common.ScheduledExecutionApp -import java.util.TimeZone -import scala.util.Random - -/** - * This file implements the job for computing users' interestedIn vector from the producerEmbeddings data set. - * - * It reads the UserUserNormalizedGraphScalaDataset to get user-user follow + fav graph, and then - * based on the producerEmbedding clusters of each followed/faved user, we calculate how much a user is - * interestedIn a cluster. To compute the engagement and determine the clusters for the user, we reuse - * the functions defined in InterestedInFromKnownFor. - * - * Using producerEmbeddings instead of knownFor to obtain interestedIn increases the coverage (especially - * for medium and light users) and also the density of the cluster embeddings for the user.
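
The reuse described above works because producer embeddings can be reshaped into exactly the (producerId, Array[(clusterId, score)]) form the knownFor-based pipeline consumes. A sketch of that adapter with the thrift structs stubbed out as plain case classes (values invented):

object ProducerEmbeddingAdapterSketch extends App {
  // Illustrative stand-ins for the thrift structs.
  case class SimClusterWithScore(clusterId: Int, score: Double)
  case class TopSimClustersWithScore(topClusters: Seq[SimClusterWithScore])

  val producerEmbedding = 5L -> TopSimClustersWithScore(
    Seq(SimClusterWithScore(7, 2.5), SimClusterWithScore(8, 1.0)))

  // Same shape the knownFor-based run() expects.
  val asKnownForInput: (Long, Array[(Int, Float)]) = producerEmbedding match {
    case (producerId, top) =>
      (producerId, top.topClusters.map(c => (c.clusterId, c.score.toFloat)).toArray)
  }

  println(asKnownForInput._2.toList) // List((7,2.5), (8,1.0))
}
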
- */ -/** - * Adhoc job to generate the interestedIn from producer embeddings for the model version 20M145KUpdated - * - scalding remote run \ - --target src/scala/com/twitter/simclusters_v2/scalding:interested_in_from_producer_embeddings \ - --main-class com.twitter.simclusters_v2.scalding.InterestedInFromProducerEmbeddingsAdhocApp \ - --user cassowary --cluster bluebird-qus1 \ - --keytab /var/lib/tss/keys/fluffy/keytabs/client/cassowary.keytab \ - --principal service_account@TWITTER.BIZ \ - -- \ - --outputDir /gcs/user/cassowary/adhoc/interested_in_from_prod_embeddings/ \ - --date 2020-08-25 --typedTsv true - */ -object InterestedInFromProducerEmbeddingsAdhocApp extends AdhocExecutionApp { - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - - val outputDir = args("outputDir") - val inputGraph = args.optional("graphInputDir") match { - case Some(inputDir) => TypedPipe.from(UserAndNeighborsFixedPathSource(inputDir)) - case None => - DAL - .readMostRecentSnapshotNoOlderThan(UserUserNormalizedGraphScalaDataset, Days(30)) - .toTypedPipe - } - val socialProofThreshold = args.int("socialProofThreshold", 2) - val maxClustersPerUserFinalResult = args.int("maxInterestedInClustersPerUser", 50) - val maxClustersFromProducer = args.int("maxClustersPerProducer", 25) - val typedTsvTag = args.boolean("typedTsv") - - val embeddingType = - EmbeddingType.ProducerFavBasedSemanticCoreEntity - val modelVersion = ModelVersions.Model20M145KUpdated - val producerEmbeddings = ProducerEmbeddingSources - .producerEmbeddingSourceLegacy(embeddingType, ModelVersions.toModelVersion(modelVersion))( - dateRange.embiggen(Days(7))) - - import InterestedInFromProducerEmbeddingsBatchApp._ - - val numProducerMappings = Stat("num_producer_embeddings_total") - val numProducersWithLargeClusterMappings = Stat( - "num_producers_with_more_clusters_than_threshold") - val numProducersWithSmallClusterMappings = Stat( - "num_producers_with_clusters_less_than_threshold") - val totalClustersCoverageProducerEmbeddings = Stat("num_clusters_total_producer_embeddings") - - val producerEmbeddingsWithScore = producerEmbeddings.map { - case (userId: Long, topSimClusters: TopSimClustersWithScore) => - ( - userId, - topSimClusters.topClusters.toArray - .map { - case (simCluster: SimClusterWithScore) => - (simCluster.clusterId, simCluster.score.toFloat) - } - ) - } - val producerEmbeddingsPruned = producerEmbeddingsWithScore.map { - case (producerId, clusterArray) => - numProducerMappings.inc() - val clusterSize = clusterArray.size - totalClustersCoverageProducerEmbeddings.incBy(clusterSize) - val prunedList = if (clusterSize > maxClustersFromProducer) { - numProducersWithLargeClusterMappings.inc() - clusterArray - .sortBy { - case (_, knownForScore) => -knownForScore - }.take(maxClustersFromProducer) - } else { - numProducersWithSmallClusterMappings.inc() - clusterArray - } - (producerId, prunedList) - } - - val result = InterestedInFromKnownFor - .run( - inputGraph, - producerEmbeddingsPruned, - socialProofThreshold, - maxClustersPerUserFinalResult, - modelVersion - ) - - val resultWithoutSocial = getInterestedInDiscardSocial(result) - - if (typedTsvTag) { - Util.printCounters( - resultWithoutSocial - .map { - case (userId: Long, clusters: ClustersUserIsInterestedIn) => - ( - userId, - clusters.clusterIdToScores.keys.toString() - ) - } - .writeExecution( - TypedTsv(outputDir) - ) - ) - } else { - Util.printCounters( - resultWithoutSocial -
.writeExecution( - AdhocKeyValSources.interestedInSource(outputDir) - ) - ) - } - } -} - -/** - * Production job for computing interestedIn data set from the producer embeddings for the model version 20M145KUpdated. - * It writes the data set in KeyVal format to produce a MH DAL data set. - * - * To deploy the job: - * - * capesospy-v2 update --build_locally \ - * --start_cron interested_in_from_producer_embeddings - * src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml - */ -object InterestedInFromProducerEmbeddingsBatchApp extends ScheduledExecutionApp { - override val firstTime: RichDate = RichDate("2019-11-01") - - override val batchIncrement: Duration = Days(7) - - def getPrunedEmbeddings( - producerEmbeddings: TypedPipe[(Long, TopSimClustersWithScore)], - maxClustersFromProducer: Int - ): TypedPipe[(Long, TopSimClustersWithScore)] = { - producerEmbeddings.map { - case (producerId, producerClusters) => - val prunedProducerClusters = - producerClusters.topClusters - .sortBy { - case simCluster => -simCluster.score.toFloat - }.take(maxClustersFromProducer) - (producerId, TopSimClustersWithScore(prunedProducerClusters, producerClusters.modelVersion)) - } - } - - def getInterestedInDiscardSocial( - interestedInFromProducersResult: TypedPipe[(UserId, ClustersUserIsInterestedIn)] - ): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = { - interestedInFromProducersResult.map { - case (srcId, fullClusterList) => - val fullClusterListWithoutSocial = fullClusterList.clusterIdToScores.map { - case (clusterId, clusterDetails) => - val clusterDetailsWithoutSocial = UserToInterestedInClusterScores( - followScore = clusterDetails.followScore, - followScoreClusterNormalizedOnly = clusterDetails.followScoreClusterNormalizedOnly, - followScoreProducerNormalizedOnly = clusterDetails.followScoreProducerNormalizedOnly, - followScoreClusterAndProducerNormalized = - clusterDetails.followScoreClusterAndProducerNormalized, - favScore = clusterDetails.favScore, - favScoreClusterNormalizedOnly = clusterDetails.favScoreClusterNormalizedOnly, - favScoreProducerNormalizedOnly = clusterDetails.favScoreProducerNormalizedOnly, - favScoreClusterAndProducerNormalized = - clusterDetails.favScoreClusterAndProducerNormalized, - // Social proof is currently not being used anywhere else, hence being discarded to reduce space for this dataset - usersBeingFollowed = None, - usersThatWereFaved = None, - numUsersInterestedInThisClusterUpperBound = - clusterDetails.numUsersInterestedInThisClusterUpperBound, - logFavScore = clusterDetails.logFavScore, - logFavScoreClusterNormalizedOnly = clusterDetails.logFavScoreClusterNormalizedOnly, - // Counts of the social proof are maintained - numUsersBeingFollowed = Some(clusterDetails.usersBeingFollowed.getOrElse(Nil).size), - numUsersThatWereFaved = Some(clusterDetails.usersThatWereFaved.getOrElse(Nil).size) - ) - (clusterId, clusterDetailsWithoutSocial) - } - ( - srcId, - ClustersUserIsInterestedIn( - fullClusterList.knownForModelVersion, - fullClusterListWithoutSocial)) - } - } - - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - - // Input args for the run - val socialProofThreshold = args.int("socialProofThreshold", 2) - val maxClustersFromProducer = args.int("maxClustersPerProducer", 25) - val maxClustersPerUserFinalResult = args.int("maxInterestedInClustersPerUser", 50) - - // Path variables - val modelVersionUpdated =
ModelVersions.toModelVersion(ModelVersions.Model20M145KUpdated) - val rootPath: String = s"/user/cassowary/manhattan_sequence_files" - val interestedInFromProducersPath = - rootPath + "/interested_in_from_producer_embeddings/" + modelVersionUpdated - - //Input adjacency list and producer embeddings - val userUserNormalGraph = - DataSources.userUserNormalizedGraphSource(dateRange.prepend(Days(7))).forceToDisk - val outputKVDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsInterestedIn]] = - SimclustersV2InterestedInFromProducerEmbeddings20M145KUpdatedScalaDataset - val producerEmbeddings = ProducerEmbeddingSources - .producerEmbeddingSourceLegacy( - EmbeddingType.ProducerFavBasedSemanticCoreEntity, - modelVersionUpdated)(dateRange.embiggen(Days(7))) - - val producerEmbeddingsPruned = getPrunedEmbeddings(producerEmbeddings, maxClustersFromProducer) - val producerEmbeddingsWithScore = producerEmbeddingsPruned.map { - case (userId: Long, topSimClusters: TopSimClustersWithScore) => - ( - userId, - topSimClusters.topClusters.toArray - .map { - case (simCluster: SimClusterWithScore) => - (simCluster.clusterId, simCluster.score.toFloat) - } - ) - } - - val interestedInFromProducersResult = - InterestedInFromKnownFor.run( - userUserNormalGraph, - producerEmbeddingsWithScore, - socialProofThreshold, - maxClustersPerUserFinalResult, - modelVersionUpdated.toString - ) - - val interestedInFromProducersWithoutSocial = - getInterestedInDiscardSocial(interestedInFromProducersResult) - - val writeKeyValResultExec = interestedInFromProducersWithoutSocial - .map { case (userId, clusters) => KeyVal(userId, clusters) } - .writeDALVersionedKeyValExecution( - outputKVDataset, - D.Suffix(interestedInFromProducersPath) - ) - writeKeyValResultExec - } - -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/KnownForSources.docx b/src/scala/com/twitter/simclusters_v2/scalding/KnownForSources.docx new file mode 100644 index 000000000..d4ea878a8 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/KnownForSources.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/KnownForSources.scala b/src/scala/com/twitter/simclusters_v2/scalding/KnownForSources.scala deleted file mode 100644 index 217f521ac..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/KnownForSources.scala +++ /dev/null @@ -1,275 +0,0 @@ -package com.twitter.simclusters_v2.scalding - -import com.twitter.dal.client.dataset.KeyValDALDataset -import com.twitter.logging.Logger -import com.twitter.scalding._ -import com.twitter.scalding.typed.TypedPipe -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.scalding_internal.dalv2.DALWrite._ -import com.twitter.scalding_internal.dalv2.remote_access.{ExplicitLocation, ProcAtla} -import com.twitter.scalding_internal.job.analytics_batch.{ - AnalyticsBatchExecution, - AnalyticsBatchExecutionArgs, - BatchDescription, - BatchFirstTime, - BatchIncrement, - TwitterScheduledExecutionApp -} -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.common.ModelVersions -import com.twitter.simclusters_v2.hdfs_sources._ -import com.twitter.simclusters_v2.scalding.common.Util -import com.twitter.simclusters_v2.thriftscala.{ClustersUserIsKnownFor, UserToKnownForClusterScores} -import com.twitter.usersource.snapshot.flat.UsersourceFlatScalaDataset -import com.twitter.usersource.snapshot.flat.thriftscala.FlatUser -import java.util.TimeZone - -object KnownForSources { - implicit val tz: TimeZone = DateOps.UTC 
- implicit val parser: DateParser = DateParser.default - - def readDALDataset( - d: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]], - noOlderThan: Duration, - modelVersionToKeep: String - ): TypedPipe[(Long, Array[(Int, Float)])] = { - fromKeyVal( - DAL - .readMostRecentSnapshotNoOlderThan(d, noOlderThan) - .withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .toTypedPipe, - modelVersionToKeep - ) - } - - def fromKeyVal( - in: TypedPipe[KeyVal[Long, ClustersUserIsKnownFor]], - modelVersionToKeep: String - ): TypedPipe[(Long, Array[(Int, Float)])] = { - in.collect { - case KeyVal(userId, knownForClusters) - if knownForClusters.knownForModelVersion == modelVersionToKeep => - ( - userId, - knownForClusters.clusterIdToScores.toArray - .map { - case (clusterId, scores) => - (clusterId, scores.knownForScore.getOrElse(0.0).toFloat) - } - .sortBy(-_._2)) - } - } - - def toKeyVal( - in: TypedPipe[(Long, Array[(Int, Float)])], - modelVersion: String - ): TypedPipe[KeyVal[Long, ClustersUserIsKnownFor]] = { - in.map { - case (userId, clustersArray) => - val mappedClusters = clustersArray.map { - case (clusterId, score) => - (clusterId, UserToKnownForClusterScores(Some(score))) - }.toMap - KeyVal(userId, ClustersUserIsKnownFor(modelVersion, mappedClusters)) - } - } - - val knownFor_20M_Dec11_145K: TypedPipe[(Long, Array[(Int, Float)])] = readDALDataset( - SimclustersV2KnownFor20M145KDec11ScalaDataset, - Days(30), - ModelVersions.Model20M145KDec11 - ) - - val knownFor_20M_145K_updated: TypedPipe[(Long, Array[(Int, Float)])] = readDALDataset( - SimclustersV2KnownFor20M145KUpdatedScalaDataset, - Days(30), - ModelVersions.Model20M145KUpdated - ) - - val clusterToKnownFor_20M_Dec11_145K: TypedPipe[(Int, List[(Long, Float)])] = - transpose( - knownFor_20M_Dec11_145K - ) - - val clusterToKnownFor_20M_145K_updated: TypedPipe[(Int, List[(Long, Float)])] = - transpose( - knownFor_20M_145K_updated - ) - - private val log = Logger() - - def readKnownFor(textFile: String): TypedPipe[(Long, Array[(Int, Float)])] = { - TypedPipe - .from(TextLine(textFile)) - .flatMap { str => - if (!str.startsWith("#")) { - try { - val tokens = str.trim.split("\\s+") - val res = Array.newBuilder[(Int, Float)] - val userId = tokens(0).toLong - for (i <- 1 until tokens.length) { - val Array(cIdStr, scoreStr) = tokens(i).split(":") - val clusterId = cIdStr.toInt - val score = scoreStr.toFloat - val newEntry = (clusterId, score) - res += newEntry - } - // Call result() exactly once: calling it twice on a Builder is undefined. - val result = res.result() - if (result.nonEmpty) { - Some((userId, result)) - } else None - } catch { - case ex: Throwable => - log.warning( - s"Error while loading knownFor from $textFile for line <$str>: " + - ex.getMessage - ) - None - } - } else None - } - } - - def stringifyKnownFor( - input: TypedPipe[(Long, Array[(Int, Float)])] - ): TypedPipe[(Long, String)] = { - input.mapValues { arr => - arr.map { case (clusterId, score) => "%d:%.2g".format(clusterId, score) }.mkString("\t") - } - } - - def writeKnownForTypedTsv( - input: TypedPipe[(Long, Array[(Int, Float)])], - outputDir: String - ): Execution[Unit] = { - stringifyKnownFor(input).writeExecution(TypedTsv(outputDir)) - } - - def makeKnownForTypedTsv( - input: TypedPipe[(Long, Array[(Int, Float)])], - outputDir: String - ): Execution[TypedPipe[(Long, Array[(Int, Float)])]] = { - Execution.getMode.flatMap { mode => - try { - val dest = TextLine(outputDir) - dest.validateTaps(mode) - Execution.from(KnownForSources.readKnownFor(outputDir)) - } catch { - case ivs: InvalidSourceException => - writeKnownForTypedTsv(input, 
outputDir).map { _ => input } - } - } - - } - - def transpose( - userToCluster: TypedPipe[(Long, Array[(Int, Float)])] - ): TypedPipe[(Int, List[(Long, Float)])] = { - userToCluster - .flatMap { - case (userId, clusterWeightPairs) => - clusterWeightPairs.map { - case (clusterId, weight) => - (clusterId, List(userId -> weight)) - } - } - .sumByKey - .toTypedPipe - } -} - -/** -capesospy-v2 update --build_locally --start_cron known_for_to_mh \ - src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml - */ -object KnownForToMHBatch extends TwitterScheduledExecutionApp { - - import KnownForSources._ - - /** - * A simple update function which updates the source by removing deactivated and suspended users. - * This will be eventually replaced by a regular cluster updating method. - */ - def updateKnownForSource( - knownForSource: TypedPipe[(Long, ClustersUserIsKnownFor)], - userSource: TypedPipe[FlatUser] - )( - implicit uniqueID: UniqueID - ): TypedPipe[(Long, ClustersUserIsKnownFor)] = { - val numValidUsers = Stat("num_valid_users") - val numInvalidUsers = Stat("num_invalid_users") - val numKnownForUsersLeft = Stat("num_known_for_users_left") - val numRemovedKnownForUsers = Stat("num_removed_known_for_users") - - val validUsers = - userSource.flatMap { - case flatUser - if !flatUser.deactivated.contains(true) && !flatUser.suspended - .contains(true) - && flatUser.id.nonEmpty => - numValidUsers.inc() - flatUser.id - case _ => - numInvalidUsers.inc() - None - } - - knownForSource.leftJoin(validUsers.asKeys).flatMap { - case (userId, (clustersWithScore, Some(_))) => - numKnownForUsersLeft.inc() - Some((userId, clustersWithScore)) - case _ => - numRemovedKnownForUsers.inc() - None - } - } - - // this should happen before InterestedInFromKnownForBatch - private val firstTime: String = "2019-03-22" - - private val batchIncrement: Duration = Days(7) - - private val outputPath: String = InternalDataPaths.RawKnownForDec11Path - - private val execArgs = AnalyticsBatchExecutionArgs( - batchDesc = BatchDescription(this.getClass.getName.replace("$", "")), - firstTime = BatchFirstTime(RichDate(firstTime)), - lastTime = None, - batchIncrement = BatchIncrement(batchIncrement) - ) - - override def scheduledJob: Execution[Unit] = - AnalyticsBatchExecution(execArgs) { implicit dateRange => - Execution.withId { implicit uniqueId => - val numKnownForUsers = Stat("num_known_for_users") - - val userSource = - DAL - .readMostRecentSnapshotNoOlderThan(UsersourceFlatScalaDataset, Days(7)) - .toTypedPipe - - val knownForData = DAL - .readMostRecentSnapshotNoOlderThan( - SimclustersV2RawKnownFor20M145KDec11ScalaDataset, - Days(30)) - .toTypedPipe - .map { - case KeyVal(userId, knownForClusters) => - numKnownForUsers.inc() - (userId, knownForClusters) - } - - val result = updateKnownForSource(knownForData, userSource).map { - case (userId, knownForClusters) => - KeyVal(userId, knownForClusters) - } - - Util.printCounters( - result.writeDALVersionedKeyValExecution( - dataset = SimclustersV2RawKnownFor20M145KDec11ScalaDataset, - pathLayout = D.Suffix(outputPath) - ) - ) - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/ProducerNormsAndCounts.docx b/src/scala/com/twitter/simclusters_v2/scalding/ProducerNormsAndCounts.docx new file mode 100644 index 000000000..27054b877 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/ProducerNormsAndCounts.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/ProducerNormsAndCounts.scala 
b/src/scala/com/twitter/simclusters_v2/scalding/ProducerNormsAndCounts.scala deleted file mode 100644 index abaef09e8..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/ProducerNormsAndCounts.scala +++ /dev/null @@ -1,195 +0,0 @@ -package com.twitter.simclusters_v2.scalding - -import com.twitter.logging.Logger -import com.twitter.scalding._ -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.scalding_internal.dalv2.DALWrite._ -import com.twitter.scalding_internal.dalv2.remote_access.{ExplicitLocation, ProcAtla} -import com.twitter.scalding_internal.job.TwitterExecutionApp -import com.twitter.scalding_internal.job.analytics_batch._ -import com.twitter.simclusters_v2.hdfs_sources.{ - NormsAndCountsFixedPathSource, - ProducerNormsAndCountsScalaDataset -} -import com.twitter.simclusters_v2.scalding.common.TypedRichPipe._ -import com.twitter.simclusters_v2.scalding.common.Util -import com.twitter.simclusters_v2.thriftscala.NormsAndCounts - -object ProducerNormsAndCounts { - - def getNormsAndCounts( - input: TypedPipe[Edge] - )( - implicit uniqueID: UniqueID - ): TypedPipe[NormsAndCounts] = { - val numRecordsInNormsAndCounts = Stat("num_records_in_norms_and_counts") - input - .map { - case Edge(srcId, destId, isFollowEdge, favWt) => - val followOrNot = if (isFollowEdge) 1 else 0 - ((srcId, destId), (followOrNot, favWt)) - } - .sumByKey - // Uncomment for adhoc job - //.withReducers(2500) - .map { - case ((srcId, destId), (followOrNot, favWt)) => - val favOrNot = if (favWt > 0) 1 else 0 - val logFavScore = if (favWt > 0) UserUserNormalizedGraph.logTransformation(favWt) else 0.0 - ( - destId, - ( - followOrNot, - favWt * favWt, - favOrNot, - favWt, - favWt * followOrNot.toDouble, - logFavScore * logFavScore, - logFavScore, - logFavScore * followOrNot.toDouble)) - } - .sumByKey - // Uncomment for adhoc job - //.withReducers(500) - .map { - case ( - id, - ( - followCount, - favSumSquare, - favCount, - favSumOnFavEdges, - favSumOnFollowEdges, - logFavSumSquare, - logFavSumOnFavEdges, - logFavSumOnFollowEdges)) => - val followerNorm = math.sqrt(followCount) - val faverNorm = math.sqrt(favSumSquare) - numRecordsInNormsAndCounts.inc() - NormsAndCounts( - userId = id, - followerL2Norm = Some(followerNorm), - faverL2Norm = Some(faverNorm), - followerCount = Some(followCount), - faverCount = Some(favCount), - favWeightsOnFavEdgesSum = Some(favSumOnFavEdges), - favWeightsOnFollowEdgesSum = Some(favSumOnFollowEdges), - logFavL2Norm = Some(math.sqrt(logFavSumSquare)), - logFavWeightsOnFavEdgesSum = Some(logFavSumOnFavEdges), - logFavWeightsOnFollowEdgesSum = Some(logFavSumOnFollowEdges) - ) - } - } - - def run( - halfLifeInDaysForFavScore: Int - )( - implicit uniqueID: UniqueID, - date: DateRange - ): TypedPipe[NormsAndCounts] = { - val input = - UserUserNormalizedGraph.getFollowEdges.map { - case (src, dest) => - Edge(src, dest, isFollowEdge = true, 0.0) - } ++ UserUserNormalizedGraph.getFavEdges(halfLifeInDaysForFavScore).map { - case (src, dest, wt) => - Edge(src, dest, isFollowEdge = false, wt) - } - getNormsAndCounts(input) - } -} - -object ProducerNormsAndCountsBatch extends TwitterScheduledExecutionApp { - private val firstTime: String = "2018-06-16" - implicit val tz = DateOps.UTC - implicit val parser = DateParser.default - private val batchIncrement: Duration = Days(7) - private val firstStartDate = DateRange.parse(firstTime).start - private val halfLifeInDaysForFavScore = 100 - - private val outputPath: String = "/user/cassowary/processed/producer_norms_and_counts" - 
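For intuition, the per-producer aggregation in getNormsAndCounts above can be sketched on plain Scala collections. This is illustrative only: SimpleEdge and logTransform are hypothetical stand-ins for the job's Edge type and UserUserNormalizedGraph.logTransformation, and the per-(src, dest) pre-aggregation step and the counters are omitted.

case class SimpleEdge(srcId: Long, destId: Long, isFollow: Boolean, favWt: Double)

// Assumed shape of the log transformation; the real one lives in UserUserNormalizedGraph.
def logTransform(favWt: Double): Double = math.log(1.0 + favWt)

// Per-producer L2 norms: follower norm, fav-weight norm, and log-fav norm.
def norms(edges: Seq[SimpleEdge]): Map[Long, (Double, Double, Double)] =
  edges.groupBy(_.destId).map {
    case (destId, es) =>
      val followerNorm = math.sqrt(es.count(_.isFollow).toDouble)
      val faverNorm = math.sqrt(es.map(e => e.favWt * e.favWt).sum)
      val logFavNorm = math.sqrt(es.collect {
        case e if e.favWt > 0 =>
          val l = logTransform(e.favWt)
          l * l
      }.sum)
      (destId, (followerNorm, faverNorm, logFavNorm))
  }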
private val log = Logger() - - private val execArgs = AnalyticsBatchExecutionArgs( - batchDesc = BatchDescription(this.getClass.getName.replace("$", "")), - firstTime = BatchFirstTime(RichDate(firstTime)), - lastTime = None, - batchIncrement = BatchIncrement(batchIncrement) - ) - - override def scheduledJob: Execution[Unit] = AnalyticsBatchExecution(execArgs) { - implicit dateRange => - Execution.withId { implicit uniqueId => - Execution.withArgs { args => - Util.printCounters( - ProducerNormsAndCounts - .run(halfLifeInDaysForFavScore) - .writeDALSnapshotExecution( - ProducerNormsAndCountsScalaDataset, - D.Daily, - D.Suffix(outputPath), - D.EBLzo(), - dateRange.end) - ) - } - } - } -} - -object ProducerNormsAndCountsAdhoc extends TwitterExecutionApp { - implicit val tz: java.util.TimeZone = DateOps.UTC - implicit val dp = DateParser.default - - def job: Execution[Unit] = - Execution.getConfigMode.flatMap { - case (config, mode) => - Execution.withId { implicit uniqueId => - val args = config.getArgs - implicit val date = DateRange.parse(args.list("date")) - - Util.printCounters( - ProducerNormsAndCounts - .run(halfLifeInDaysForFavScore = 100) - .forceToDiskExecution.flatMap { result => - Execution.zip( - result.writeExecution(NormsAndCountsFixedPathSource(args("outputDir"))), - result.printSummary("Producer norms and counts") - ) - } - ) - } - } -} - -object DumpNormsAndCountsAdhoc extends TwitterExecutionApp { - implicit val tz: java.util.TimeZone = DateOps.UTC - def job: Execution[Unit] = - Execution.getConfigMode.flatMap { - case (config, mode) => - Execution.withId { implicit uniqueId => - val args = config.getArgs - - val users = args.list("users").map(_.toLong).toSet - val input = args.optional("inputDir") match { - case Some(inputDir) => TypedPipe.from(NormsAndCountsFixedPathSource(inputDir)) - case None => - DAL - .readMostRecentSnapshotNoOlderThan(ProducerNormsAndCountsScalaDataset, Days(30)) - .withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .toTypedPipe - } - - if (users.isEmpty) { - input.printSummary("Producer norms and counts") - } else { - input - .collect { - case rec if users.contains(rec.userId) => - Util.prettyJsonMapper.writeValueAsString(rec).replaceAll("\n", " ") - } - .toIterableExecution - .map { strings => println(strings.mkString("\n")) } - } - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/TopUsersSimilarityGraph.docx b/src/scala/com/twitter/simclusters_v2/scalding/TopUsersSimilarityGraph.docx new file mode 100644 index 000000000..0e0ceda93 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/TopUsersSimilarityGraph.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/TopUsersSimilarityGraph.scala b/src/scala/com/twitter/simclusters_v2/scalding/TopUsersSimilarityGraph.scala deleted file mode 100644 index d93bd73ee..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/TopUsersSimilarityGraph.scala +++ /dev/null @@ -1,996 +0,0 @@ -package com.twitter.simclusters_v2.scalding - -import com.twitter.algebird.Max -import com.twitter.algebird.Monoid -import com.twitter.bijection.scrooge.BinaryScalaCodec -import com.twitter.hermit.candidate.thriftscala.Candidate -import com.twitter.hermit.candidate.thriftscala.Candidates -import com.twitter.logging.Logger -import com.twitter.pluck.source.cassowary.FollowingsCosineSimilaritiesManhattanSource -import com.twitter.sbf.core.AlgorithmConfig -import com.twitter.sbf.core.MHAlgorithm -import com.twitter.sbf.core.PredictionStat -import 
com.twitter.sbf.core.SparseBinaryMatrix -import com.twitter.sbf.core.SparseRealMatrix -import com.twitter.sbf.graph.Graph -import com.twitter.scalding._ -import com.twitter.scalding.commons.source.VersionedKeyValSource -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.scalding_internal.job.TwitterExecutionApp -import com.twitter.scalding_internal.source.lzo_scrooge.FixedPathLzoScrooge -import com.twitter.simclusters_v2.scalding.common.TypedRichPipe._ -import com.twitter.usersource.snapshot.flat.UsersourceFlatScalaDataset -import com.twitter.usersource.snapshot.flat.thriftscala.FlatUser -import com.twitter.wtf.scalding.sims.thriftscala.SimilarUserPair -import java.io.PrintWriter -import java.text.DecimalFormat -import java.util -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.FileSystem -import org.apache.hadoop.fs.Path -import scala.collection.JavaConverters._ - -case class TopUser(id: Long, activeFollowerCount: Int, screenName: String) - -case class TopUserWithMappedId(topUser: TopUser, mappedId: Int) - -case class AdjList(sourceId: Long, neighbors: List[(Long, Float)]) - -object TopUsersSimilarityGraph { - val log = Logger() - - def topUsers( - userSourcePipe: TypedPipe[FlatUser], - minActiveFollowers: Int, - topK: Int - ): TypedPipe[TopUser] = { - userSourcePipe - .collect { - case f: FlatUser - if f.activeFollowers.exists(_ >= minActiveFollowers) - && f.followers.isDefined && f.id.isDefined && f.screenName.isDefined - && !f.deactivated.contains(true) && !f.suspended.contains(true) => - TopUser(f.id.get, f.activeFollowers.get.toInt, f.screenName.get) - } - .groupAll - .sortedReverseTake(topK)(Ordering.by(_.activeFollowerCount)) - .values - .flatten - } - - /** - * This function returns the most-followed userIds, truncated to topK. - * It offers the same functionality as TopUsersSimilarityGraph.topUsers but is more efficient, - * as we do not store screen names while grouping and sorting the users. - */ - def topUserIds( - userSourcePipe: TypedPipe[FlatUser], - minActiveFollowers: Int, - topK: Int - ): TypedPipe[Long] = { - userSourcePipe - .collect { - case f: FlatUser - if f.activeFollowers.exists(_ >= minActiveFollowers) - && f.followers.isDefined && f.id.isDefined && f.screenName.isDefined - && !f.deactivated.contains(true) && !f.suspended.contains(true) => - (f.id.get, f.activeFollowers.get) - } - .groupAll - .sortedReverseTake(topK)(Ordering.by(_._2)) - .values - .flatten - .keys - } - - def topUsersWithMappedIds( - userSourcePipe: TypedPipe[FlatUser], - minActiveFollowers: Int - ): TypedPipe[TopUserWithMappedId] = { - userSourcePipe - .collect { - case f: FlatUser - if f.activeFollowers.exists(_ >= minActiveFollowers) - && f.followers.isDefined && f.id.isDefined && f.screenName.isDefined - && !f.deactivated.contains(true) && !f.suspended.contains(true) => - TopUser(f.id.get, f.activeFollowers.get.toInt, f.screenName.get) - } - .groupAll - .mapGroup { - case (_, topUserIter) => - topUserIter.zipWithIndex.map { - case (topUser, id) => - TopUserWithMappedId(topUser, id) - } - } - .values - } - - def topUsersWithMappedIdsTopK( - userSourcePipe: TypedPipe[FlatUser], - minActiveFollowers: Int, - topK: Int - ): TypedPipe[TopUserWithMappedId] = { - userSourcePipe - .collect { - case f: FlatUser - if f.activeFollowers.exists(_ >= minActiveFollowers) - && f.followers.isDefined && f.id.isDefined && f.screenName.isDefined - && !f.deactivated.contains(true) && !f.suspended.contains(true) => - TopUser(f.id.get, f.activeFollowers.get.toInt, f.screenName.get) - 
} - .groupAll - .sortedReverseTake(topK)(Ordering.by(_.activeFollowerCount)) - .map { - case (_, topUserIter) => - topUserIter.zipWithIndex.map { - case (topUser, id) => - TopUserWithMappedId(topUser, id) - } - } - .flatten - } - - /** - * This function returns the top most followed and verified userIds truncated to topK - */ - def vits( - userSourcePipe: TypedPipe[FlatUser], - minActiveFollowers: Int, - topK: Int - ): TypedPipe[Long] = { - userSourcePipe - .collect { - case f: FlatUser - if f.verified.contains(true) && f.id.isDefined && - f.screenName.isDefined && !f.deactivated.contains(true) && !f.suspended.contains( - true) && - f.activeFollowers.exists(_ >= minActiveFollowers) => - (f.id.get, f.activeFollowers.get) - } - .groupAll - .sortedReverseTake(topK)(Ordering.by(_._2)) - .values - .flatten - .keys - } - - def topUsersInMemory( - userSourcePipe: TypedPipe[FlatUser], - minActiveFollowers: Int, - topK: Int - ): Execution[List[TopUserWithMappedId]] = { - log.info(s"Will fetch top $topK users with at least $minActiveFollowers many active followers") - topUsers(userSourcePipe, minActiveFollowers, topK).toIterableExecution - .map { idFollowersList => - idFollowersList.toList.sortBy(_.id).zipWithIndex.map { - case (topuser, index) => - TopUserWithMappedId(topuser, index) - } - } - } - - def addSelfLoop( - input: TypedPipe[(Long, Map[Long, Float])], - maxToSelfLoopWeight: Float => Float - ): TypedPipe[(Long, Map[Long, Float])] = { - input - .map { - case (nodeId, neighborMap) if neighborMap.nonEmpty => - val maxEntry = neighborMap.values.max - val selfLoopWeight = maxToSelfLoopWeight(maxEntry) - (nodeId, neighborMap ++ Map(nodeId -> selfLoopWeight)) - case (nodeId, emptyMap) => - (nodeId, emptyMap) - } - } - - def makeGraph( - backfillPipe: TypedPipe[(Long, Map[Long, Float])], - dirToReadFromOrSaveTo: String - ): Execution[TypedPipe[(Long, Map[Long, Float])]] = { - backfillPipe - .map { - case (nodeId, nbrMap) => - val cands = nbrMap.toList.map { case (nId, wt) => Candidate(nId, wt) } - Candidates(nodeId, candidates = cands) - } - .make(new FixedPathLzoScrooge(dirToReadFromOrSaveTo, Candidates)) - .map { tp => - tp.map { - case Candidates(nodeId, cands) => - (nodeId, cands.map { case Candidate(nId, wt, _) => (nId, wt.toFloat) }.toMap) - } - } - } - - def getSubgraphFromUserGroupedInput( - fullGraph: TypedPipe[Candidates], - usersToInclude: TypedPipe[Long], - maxNeighborsPerNode: Int, - degreeThresholdForStat: Int - )( - implicit uniqId: UniqueID - ): TypedPipe[(Long, Map[Long, Float])] = { - val numUsersWithZeroEdges = Stat("num_users_with_zero_edges") - val numUsersWithSmallDegree = Stat("num_users_with_degree_lt_" + degreeThresholdForStat) - val numUsersWithEnoughDegree = Stat("num_users_with_degree_gte_" + degreeThresholdForStat) - - fullGraph - .map { cands => - ( - cands.userId, - // These candidates are already sorted, but leaving it in just in case the behavior changes upstream - cands.candidates - .map { c => (c.userId, c.score) }.sortBy(-_._2).take(maxNeighborsPerNode).toMap - ) - } - .rightJoin(usersToInclude.asKeys) - // uncomment for adhoc job - //.withReducers(110) - .mapValues(_._1) // discard the Unit - .toTypedPipe - .count("num_sims_records_from_top_users") - .flatMap { - case (nodeId, Some(neighborMap)) => - neighborMap.flatMap { - case (neighborId, edgeWt) => - List( - (nodeId, Map(neighborId -> Max(edgeWt.toFloat))), - (neighborId, Map(nodeId -> Max(edgeWt.toFloat))) - ) - } - case (nodeId, None) => List((nodeId, Map.empty[Long, Max[Float]])) - } - .sumByKey - // 
uncomment for adhoc job - //.withReducers(150) - .toTypedPipe - .mapValues(_.mapValues(_.get)) // get the max for each value in each map - .count("num_sims_records_after_symmetrization_before_keeping_only_top_users") - .join(usersToInclude.asKeys) // only keep records for top users - // uncomment for adhoc job - //.withReducers(100) - .mapValues(_._1) - .toTypedPipe - .map { - case (nodeId, neighborsMap) => - if (neighborsMap.nonEmpty) { - if (neighborsMap.size < degreeThresholdForStat) { - numUsersWithSmallDegree.inc() - } else { - numUsersWithEnoughDegree.inc() - } - } else { - numUsersWithZeroEdges.inc() - } - (nodeId, neighborsMap) - } - .count("num_sims_records_after_symmetrization_only_top_users") - } - - def getSubgraphFromUserGroupedInput( - fullGraph: TypedPipe[Candidates], - usersToInclude: Set[Long], - maxNeighborsPerNode: Int - )( - implicit uniqId: UniqueID - ): TypedPipe[(Long, Map[Long, Float])] = { - val numUsersWithZeroEdges = Stat("num_users_with_zero_edges") - val numUsersWithDegreeLessThan10 = Stat("num_users_with_degree_less_than_10") - - val (intIdsToIncludeSorted: Array[Int], longIdsToIncludeSorted: Array[Long]) = - setToSortedArrays(usersToInclude) - log.info("Size of intArray " + intIdsToIncludeSorted.length) - log.info("Size of longArray " + longIdsToIncludeSorted.length) - - fullGraph - .collect { - case candidates - if isIdInIntOrLongArray( - candidates.userId, - intIdsToIncludeSorted, - longIdsToIncludeSorted) => - val sourceId = candidates.userId - val toKeep = candidates.candidates.collect { - case neighbor - if isIdInIntOrLongArray( - neighbor.userId, - intIdsToIncludeSorted, - longIdsToIncludeSorted) => - (neighbor.userId, neighbor.score.toFloat) - }.toList - - val toKeepLength = toKeep.size - if (toKeep.isEmpty) { - numUsersWithZeroEdges.inc() - } else if (toKeepLength < 10) { - numUsersWithDegreeLessThan10.inc() - } - - val knn = if (toKeepLength > maxNeighborsPerNode) { - toKeep.sortBy(_._2).takeRight(maxNeighborsPerNode) - } else toKeep - - knn.flatMap { - case (nbrId, wt) => - List( - (sourceId, Map(nbrId -> Max(wt))), - (nbrId, Map(sourceId -> Max(wt))) - ) - } - } - .flatten - .sumByKey - .toTypedPipe - .mapValues(_.mapValues(_.get)) // get the max for each value in each map - } - - def getInMemorySubgraphFromUserGroupedInput( - fullGraph: TypedPipe[Candidates], - usersToInclude: Set[Long], - maxNeighborsPerNode: Int - )( - implicit uniqId: UniqueID - ): Execution[Iterable[AdjList]] = { - getSubgraphFromUserGroupedInput(fullGraph, usersToInclude, maxNeighborsPerNode).map { - case (sourceId, weightedNeighbors) => - AdjList( - sourceId, - weightedNeighbors.toList.sortBy(_._1) - ) - }.toIterableExecution - } - - def isIdInIntOrLongArray( - id: Long, - intArraySorted: Array[Int], - longArraySorted: Array[Long] - ): Boolean = { - if (id < Integer.MAX_VALUE) { - util.Arrays.binarySearch(intArraySorted, id.toInt) >= 0 - } else { - util.Arrays.binarySearch(longArraySorted, id.toLong) >= 0 - } - } - - /** - * Creates two sorted arrays out of a set, one with ints and one with longs. - * Sorted arrays are only slightly more expensive to search in, but empirically I've found - * that the MapReduce job runs more reliably using them than using Set directly. 
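As an illustration of the membership test this int/long split enables, a minimal sketch with toy values; contains is a hypothetical helper mirroring isIdInIntOrLongArray below:

import java.util.Arrays

val ids = Set(7L, 42L, 5000000000L)
val (small, large) = ids.toArray.sorted.partition(_ < Integer.MAX_VALUE)
val intSorted: Array[Int] = small.map(_.toInt) // Array(7, 42), 4 bytes per id
val longSorted: Array[Long] = large            // Array(5000000000L)

def contains(id: Long): Boolean =
  if (id < Integer.MAX_VALUE) Arrays.binarySearch(intSorted, id.toInt) >= 0
  else Arrays.binarySearch(longSorted, id) >= 0

assert(contains(42L) && !contains(43L))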
- * - * @param inSet - * - * @return - */ - def setToSortedArrays(inSet: Set[Long]): (Array[Int], Array[Long]) = { - val (intArrayUnconvertedSorted, longArraySorted) = - inSet.toArray.sorted.partition { l => l < Integer.MAX_VALUE } - (intArrayUnconvertedSorted.map(_.toInt), longArraySorted) - } - - def getInMemorySubgraph( - fullGraph: TypedPipe[SimilarUserPair], - usersToInclude: Set[Long], - maxNeighborsPerNode: Int - )( - implicit uniqId: UniqueID - ): Execution[Iterable[AdjList]] = { - val numValidEdges = Stat("num_valid_edges") - val numInvalidEdges = Stat("num_invalid_edges") - - val (intIdsToIncludeSorted: Array[Int], longIdsToIncludeSorted: Array[Long]) = - setToSortedArrays(usersToInclude) - log.info("Size of intArray " + intIdsToIncludeSorted.length) - log.info("Size of longArray " + longIdsToIncludeSorted.length) - - fullGraph - .filter { edge => - val res = - isIdInIntOrLongArray(edge.sourceId, intIdsToIncludeSorted, longIdsToIncludeSorted) && - isIdInIntOrLongArray(edge.destinationId, intIdsToIncludeSorted, longIdsToIncludeSorted) - if (res) { - numValidEdges.inc() - } else { - numInvalidEdges.inc() - } - res - } - .map { edge => (edge.sourceId, (edge.destinationId, edge.cosineScore.toFloat)) } - .group - .sortedReverseTake(maxNeighborsPerNode)(Ordering.by(_._2)) - .toTypedPipe - .flatMap { - case (sourceId, weightedNeighbors) => - weightedNeighbors.flatMap { - case (destId, wt) => - /* - By default, a k-nearest neighbor graph need not be symmetric, since if u is in v's - k nearest neighbors, that doesn't guarantee that v is in u's. - This step adds edges in both directions, but having a Map ensures that each neighbor - only appears once and not twice. Using Max() operator from Algebird, we take the max - weight of (u, v) and (v, u) - it is expected that the two will be pretty much the same. 
- - Example illustrating how Map and Max work together: - Map(1 -> Max(2)) + Map(1 -> Max(3)) = Map(1 -> Max(3)) - */ - List( - (sourceId, Map(destId -> Max(wt))), - (destId, Map(sourceId -> Max(wt))) - ) - } - } - .sumByKey - .map { - case (sourceId, weightedNeighbors) => - AdjList( - sourceId, - weightedNeighbors.toList.map { case (id, maxWt) => (id, maxWt.get) }.sortBy(_._1) - ) - } - .toIterableExecution - } - - def convertIterableToGraph( - adjList: Iterable[AdjList], - verticesMapping: Map[Long, Int], - wtExponent: Float - ): Graph = { - val n = verticesMapping.size - val neighbors: Array[Array[Int]] = new Array[Array[Int]](n) - val wts: Array[Array[Float]] = new Array[Array[Float]](n) - - var numEdges = 0L - var numVertices = 0 - - val iter = adjList.iterator - val verticesWithAtleastOneEdgeBuilder = Set.newBuilder[Long] - - while (iter.hasNext) { - val AdjList(originalId, wtedNeighbors) = iter.next() - val wtedNeighborsSize = wtedNeighbors.size - val newId = verticesMapping(originalId) // throw exception if originalId not in map - if (newId < 0 || newId >= n) { - throw new IllegalStateException( - s"$originalId has been mapped to $newId, which is outside " + - s"the expected range [0, " + (n - 1) + "]") - } - verticesWithAtleastOneEdgeBuilder += originalId - neighbors(newId) = new Array[Int](wtedNeighborsSize) - wts(newId) = new Array[Float](wtedNeighborsSize) - wtedNeighbors.zipWithIndex.foreach { - case ((nbrId, wt), index) => - neighbors(newId)(index) = verticesMapping(nbrId) - wts(newId)(index) = wt - numEdges += 1 - } - - if (math.abs(wtExponent - 1.0) > 1e-5) { - var maxWt = Float.MinValue - for (index <- wts(newId).indices) { - wts(newId)(index) = math.pow(wts(newId)(index), wtExponent).toFloat - if (wts(newId)(index) > maxWt) { - maxWt = wts(newId)(index) - } - } - } - numVertices += 1 - if (numVertices % 100000 == 0) { - log.info(s"Done with $numVertices many vertices.") - } - } - - val verticesWithAtleastOneEdge = verticesWithAtleastOneEdgeBuilder.result() - val verticesWithZeroEdges = verticesMapping.keySet.diff(verticesWithAtleastOneEdge) - - verticesWithZeroEdges.foreach { originalId => - neighbors(verticesMapping(originalId)) = new Array[Int](0) - wts(verticesMapping(originalId)) = new Array[Float](0) - } - - log.info("Number of vertices with zero edges " + verticesWithZeroEdges.size) - log.info("Number of edges " + numEdges) - if (verticesWithZeroEdges.nonEmpty) { - log.info("The vertices with zero edges: " + verticesWithZeroEdges.mkString(",")) - } - - new Graph(n, numEdges / 2, neighbors, wts) - } - - def run( - userSourcePipe: TypedPipe[FlatUser], - minActiveFollowers: Int, - topK: Int, - getSubgraphFn: Set[Long] => Execution[Iterable[AdjList]], - wtExponent: Float - )( - implicit id: UniqueID - ): Execution[(List[TopUserWithMappedId], Graph)] = { - topUsersInMemory( - userSourcePipe, - minActiveFollowers, - topK - ).flatMap { topUsers => - val idMap = topUsers.map { topUser => (topUser.topUser.id, topUser.mappedId) }.toMap - - log.info("Got idMap with " + idMap.size + " entries.") - getSubgraphFn(idMap.keySet) - .map { iterableAdjLists => - log.info("Going to convert iterable to graph") - val tic = System.currentTimeMillis() - val graph = convertIterableToGraph( - iterableAdjLists, - idMap, - wtExponent - ) - val toc = System.currentTimeMillis() - val seconds = (toc - tic) / 1000.0 // currentTimeMillis is in ms, so divide by 1e3 - log.info("Took %.2f seconds to convert iterable to graph".format(seconds)) - (topUsers, graph) - } - } - } - - def runUsingJoin( - mappedUsers: TypedPipe[(Long, Int)], - allEdges: 
TypedPipe[Candidates], - maxNeighborsPerNode: Int - )( - implicit uniqueID: UniqueID - ): TypedPipe[(Int, String)] = { - val numEdgesAfterFirstJoin = Stat("num_edges_after_first_join") - val numEdgesAfterSecondJoin = Stat("num_edges_after_second_join") - val numEdgesLostTopKTruncated = Stat("num_edges_lost_topk_truncated") - val finalNumEdges = Stat("final_num_edges") - - allEdges - .map { cs => (cs.userId, cs.candidates) } - .join(mappedUsers) - .withReducers(6000) - .flatMap { - case (id, (neighbors, mappedId)) => - val before = neighbors.size - val topKNeighbors = neighbors.sortBy(-_.score).take(maxNeighborsPerNode) - val after = topKNeighbors.size - numEdgesLostTopKTruncated.incBy(before - after) - topKNeighbors.map { candidate => - numEdgesAfterFirstJoin.inc() - (candidate.userId, (mappedId, candidate.score.toFloat)) - } - } - .join(mappedUsers) - .withReducers(9000) - .flatMap { - case (id, ((mappedNeighborId, score), mappedId)) => - numEdgesAfterSecondJoin.inc() - List( - (mappedId, Map(mappedNeighborId -> Max(score))), - (mappedNeighborId, Map(mappedId -> Max(score))) - ) - } - .sumByKey - .withReducers(9100) - .map { - case (id, nbrMap) => - val sorted = nbrMap.mapValues(_.get).toList.sortBy(-_._2) - finalNumEdges.incBy(sorted.size) - val str = sorted.map { case (nbrId, wt) => "%d %.2f".format(nbrId, wt) }.mkString(" ") - (id, str) - } - - } - - def writeToHDFSFile(lines: Iterator[String], conf: Configuration, outputFile: String): Unit = { - val fs = FileSystem.newInstance(conf) - val outputStream = fs.create(new Path(outputFile)) - log.info("Will write to " + outputFile) - var numLines = 0 - val tic = System.currentTimeMillis() - try { - val writer = new PrintWriter(outputStream) - while (lines.hasNext) { - writer.println(lines.next()) - numLines += 1 - if (numLines % 1000000 == 0) { - log.info(s"Done writing $numLines lines") - } - } - writer.flush() - writer.close() - } finally { - outputStream.close() - } - val toc = System.currentTimeMillis() - val seconds = (toc - tic) / 1000.0 // currentTimeMillis is in ms, so divide by 1e3 - log.info( - "Finished writing %d lines to %s. Took %.2f seconds".format(numLines, outputFile, seconds)) - } - - def writeToHDFSIfHDFS(lines: Iterator[String], mode: Mode, outputFile: String): Unit = { - mode match { - case Hdfs(_, conf) => - writeToHDFSFile(lines, conf, outputFile) - case _ => () - } - } - - def writeTopUsers(topUsers: List[TopUserWithMappedId], mode: Mode, outputFile: String): Unit = { - val topUsersLines = - topUsers.map { topUser => - // Add 1 to mappedId so as to get 1-indexed ids, which are friendlier to humans. 
- List( - topUser.topUser.id, - topUser.mappedId + 1, - topUser.topUser.screenName, - topUser.topUser.activeFollowerCount - ).mkString("\t") - }.iterator - writeToHDFSIfHDFS(topUsersLines, mode, outputFile) - } - - def readSimsInput(isKeyValSource: Boolean, inputDir: String): TypedPipe[Candidates] = { - if (isKeyValSource) { - log.info("Will treat " + inputDir + " as SequenceFiles input") - val rawInput = FollowingsCosineSimilaritiesManhattanSource(path = inputDir) - TypedPipe.from(rawInput).map(_._2) - } else { - log.info("Will treat " + inputDir + " as LzoScrooge input") - TypedPipe.from(new FixedPathLzoScrooge(inputDir, Candidates)) - } - } -} - -/** - * ./bazel bundle src/scala/com/twitter/simclusters_v2/scalding:top_users_only && \ - * oscar hdfs --hadoop-client-memory 120000 --user cassowary --host atla-aor-08-sr1 \ - * --bundle top_users_only --tool com.twitter.simclusters_v2.scalding.ClusterHdfsGraphApp \ - * --screen --screen-detached --tee ldap_logs/SBFOnSubGraphOf100MTopusersWithMappedIds_120GB_RAM \ - * -- --inputDir adhoc/ldap_subgraphOf100MTopUsersWithMappedIds --numNodesPerCommunity 200 \ - * --outputDir adhoc/ldap_SBFOnSubGraphOf100MTopusersWithMappedIds_k500K_120GB_RAM --assumedNumberOfNodes 100200000 - */ -object ClusterHdfsGraphApp extends TwitterExecutionApp { - def job: Execution[Unit] = - Execution.getConfigMode.flatMap { - case (config, mode) => - Execution.withId { implicit uniqueId => - val args = config.getArgs - val inputDir = args("inputDir") - val numNodesPerCommunity = args.int("numNodesPerCommunity", 200) - val outputDir = args("outputDir") - val assumedNumberOfNodes = args.int("assumedNumberOfNodes") - //val useEdgeWeights = args.boolean("useEdgeWeights") - - val input = TypedPipe.from(TypedTsv[(Int, String)](inputDir)).map { - case (id, nbrStr) => - val nbrsWithWeights = nbrStr.split(" ") - val nbrsArray = nbrsWithWeights.zipWithIndex - .collect { - case (str, index) if index % 2 == 0 => - str.toInt - } - (id, nbrsArray.sorted) - } - - println("Gonna assume total number of nodes is " + assumedNumberOfNodes) - - input.toIterableExecution.flatMap { adjListsIter => - val nbrs: Array[Array[Int]] = new Array[Array[Int]](assumedNumberOfNodes) - var numEdges = 0L - var numVertices = 0 - var maxVertexId = 0 - - val tic = System.currentTimeMillis - adjListsIter.foreach { - case (id, nbrArray) => - if (id >= assumedNumberOfNodes) { - throw new IllegalStateException( - s"Yikes! Entry with id $id, >= assumedNumberOfNodes") - } - nbrs(id) = nbrArray - if (id > maxVertexId) { - maxVertexId = id - } - numEdges += nbrArray.length - numVertices += 1 - if (numVertices % 100000 == 0) { - println(s"Done loading $numVertices many vertices. 
Edges so far: $numEdges") - } - } - (0 until assumedNumberOfNodes).foreach { i => - if (nbrs(i) == null) { - nbrs(i) = Array[Int]() - } - } - val toc = System.currentTimeMillis() - println( - "maxVertexId is " + maxVertexId + ", assumedNumberOfNodes is " + assumedNumberOfNodes) - println( - s"Done loading graph with $assumedNumberOfNodes nodes and $numEdges edges (counting each edge twice)") - println("Number of nodes with at least neighbor is " + numVertices) - println("Time to load the graph " + (toc - tic) / 1000.0 / 60.0 + " minutes") - - val graph = new Graph(assumedNumberOfNodes, numEdges / 2, nbrs, null) - val k = assumedNumberOfNodes / numNodesPerCommunity - println("Will set number of communities to " + k) - val algoConfig = new AlgorithmConfig() - .withCpu(16).withK(k) - .withWtCoeff(10.0).withMaxEpoch(5) - var z = new SparseBinaryMatrix(assumedNumberOfNodes, k) - val err = new PrintWriter(System.err) - - println("Going to initalize from random neighborhoods") - z.initFromBestNeighborhoods( - graph, - (gr: Graph, i: Integer) => algoConfig.rng.nextDouble, - false, - err) - println("Done initializing from random neighborhoods") - - val prec0 = MHAlgorithm.clusterPrecision(graph, z, 0, 1000, algoConfig.rng) - println("Precision of cluster 0:" + prec0.precision) - val prec1 = MHAlgorithm.clusterPrecision(graph, z, 1, 1000, algoConfig.rng) - println("Precision of cluster 1:" + prec1.precision) - println( - "Fraction of empty rows after initializing from random neighborhoods: " + z.emptyRowProportion) - - val tic2 = System.currentTimeMillis - val algo = new MHAlgorithm(algoConfig, graph, z, err) - val optimizedZ = algo.optimize - val toc2 = System.currentTimeMillis - println("Time to optimize: %.2f seconds\n".format((toc2 - tic2) / 1000.0)) - println("Time to initialize & optimize: %.2f seconds\n".format((toc2 - toc) / 1000.0)) - - val srm = MHAlgorithm.heuristicallyScoreClusterAssignments(graph, optimizedZ) - val outputIter = (0 to srm.getNumRows).map { rowId => - val rowWithIndices = srm.getColIdsForRow(rowId) - val rowWithScores = srm.getValuesForRow(rowId) - val str = rowWithIndices - .zip(rowWithScores).map { - case (colId, score) => - "%d:%.2g".format(colId + 1, score) - }.mkString(" ") - "%d %s".format(rowId, str) - } - - TypedPipe.from(outputIter).writeExecution(TypedTsv(outputDir)) - } - } - } -} - -/** - * ./bazel bundle src/scala/com/twitter/simclusters_v2/scalding:top_users_only && \ - * oscar hdfs --hadoop-client-memory 60000 --user cassowary --host atla-aor-08-sr1 \ - * --bundle top_users_only --tool com.twitter.simclusters_v2.scalding.ScalableTopUsersSimilarityGraphApp \ - * --screen --screen-detached --tee ldap_logs/SubGraphOf100MTopusersWithMappedIds \ - * -- --mappedUsersDir adhoc/ldap_top100M_mappedUsers \ - * --inputDir adhoc/ldap_approximate_cosine_similarity_follow \ - * --outputDir adhoc/ldap_subgraphOf100MTopUsersWithMappedIds_correct_topK - */ -object ScalableTopUsersSimilarityGraphApp extends TwitterExecutionApp { - implicit val tz: java.util.TimeZone = DateOps.UTC - implicit val dp = DateParser.default - val log = Logger() - - def job: Execution[Unit] = - Execution.getConfigMode.flatMap { - case (config, mode) => - Execution.withId { implicit uniqueId => - val args = config.getArgs - val inputDir = args("inputDir") - val mappedUsersDir = args("mappedUsersDir") - val maxNeighbors = args.int("maxNeighbors", 100) - val outputDir = args("outputDir") - - val mappedUsers = TypedPipe - .from(TypedTsv[(Long, Int, String, Int)](mappedUsersDir)) - .map { - case (id, _, 
_, mappedId) => - (id, mappedId) - } - .shard(200) - - val sims = TypedPipe - .from(FollowingsCosineSimilaritiesManhattanSource(path = inputDir)) - .map(_._2) - - TopUsersSimilarityGraph - .runUsingJoin( - mappedUsers, - sims, - maxNeighbors - ).writeExecution(TypedTsv(args("outputDir"))) - } - } -} - -/** - * Scalding app using Executions that does the following: - * - * 1. Get the top N most followed users on Twitter - * (also maps them to ids 1 -> N in int space for easier processing) - * 2. For each user from the step above, get the top K most similar users for this user from the - * list of N users from the step above. - * 3. Construct an undirected graph by setting an edge between (u, v) if - * either v is in u's top-K similar users list, or u is in v's top-K similar users list. - * 4. The weight for the (u, v) edge is set to be the cosine similarity between u and v's - * follower lists, raised to some exponent > 1. - * This last step is a heuristic reweighting procedure to give more importance to edges involving - * more similar users. - * 5. Write the above graph to HDFS in Metis format, - * i.e. one line per node, with the line for each node specifying the list of neighbors along - * with their weights. The first line specifies the number of nodes and the number of edges. - * - * I've tested this Scalding job for values of topK up to 20M. - * - * Example invocation: - * $ ./bazel bundle src/scala/com/twitter/simclusters_v2/scalding:top_users_similarity_graph && \ - * oscar hdfs --hadoop-client-memory 60000 --host atla-amw-03-sr1 --bundle top_users_similarity_graph \ - * --tool com.twitter.simclusters_v2.scalding.TopUsersSimilarityGraphApp \ - * --hadoop-properties "elephantbird.use.combine.input.format=true;elephantbird.combine.split.size=468435456;mapred.min.split.size=468435456;mapreduce.reduce.memory.mb=5096;mapreduce.reduce.java.opts=-Xmx4400m" \ - * --screen --screen-detached --tee logs/20MSubGraphExecution -- --date 2017-10-24 \ - * --minActiveFollowers 300 --topK 20000000 \ - * --inputUserGroupedDir /user/cassowary/manhattan_sequence_files/approximate_cosine_similarity_follow/ \ - * --groupedInputInSequenceFiles \ - * --maxNeighborsPerNode 100 --wtExponent 2 \ - * --outputTopUsersDir /user/your_ldap/simclusters_graph_prep_q42017/top20MUsers \ - * --outputGraphDir /user/your_ldap/simclusters_graph_prep_q42017/top20Musers_exp2_100neighbors_metis_graph - * - */ -object TopUsersSimilarityGraphApp extends TwitterExecutionApp { - implicit val tz: java.util.TimeZone = DateOps.UTC - implicit val dp = DateParser.default - val log = Logger() - - def job: Execution[Unit] = - Execution.getConfigMode.flatMap { - case (config, mode) => - Execution.withId { implicit uniqueId => - val args = config.getArgs - val minActiveFollowers = args.int("minActiveFollowers", 100000) - val topK = args.int("topK") - val date = DateRange.parse(args("date")) - val inputSimilarPairsDir = args.optional("inputSimilarPairsDir") - val inputUserGroupedDir = args.optional("inputUserGroupedDir") - val isGroupedInputSequenceFiles = args.boolean("groupedInputInSequenceFiles") - val outputTopUsersDir = args("outputTopUsersDir") - val maxNeighborsPerNode = args.int("maxNeighborsPerNode", 300) - val wtExponent = args.float("wtExponent", 3.5f) - val outputGraphDir = args("outputGraphDir") - - val userSource = DAL.readMostRecentSnapshot(UsersourceFlatScalaDataset, date).toTypedPipe - val exception = new IllegalStateException( - "Please specify only one of inputSimilarPairsDir or inputUserGroupedDir" - ) - - 
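Step 5 of the outline above writes the graph in Metis-style format. A minimal sketch of that serialization, assuming 1-indexed node ids and "neighborId weight" pairs on each line (the job itself streams lines via writeToHDFSIfHDFS rather than using a helper like this):

// Header line "<numNodes> <numEdges>", then one line per node.
def toMetisLines(adj: IndexedSeq[Seq[(Int, Float)]], numEdges: Long): Iterator[String] =
  Iterator(s"${adj.size} $numEdges") ++
    adj.iterator.map { nbrs =>
      nbrs.map { case (nbr, wt) => "%d %.3f".format(nbr + 1, wt) }.mkString(" ")
    }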
(inputSimilarPairsDir, inputUserGroupedDir) match { - case (Some(_), Some(_)) => throw exception - case (None, None) => throw exception - case _ => // no-op - } - - def getSubgraphFn(usersToInclude: Set[Long]) = { - (inputSimilarPairsDir, inputUserGroupedDir) match { - case (Some(similarPairs), None) => - val similarUserPairs: TypedPipe[SimilarUserPair] = - TypedPipe.from( - new FixedPathLzoScrooge( - inputSimilarPairsDir.get, - SimilarUserPair - )) - TopUsersSimilarityGraph.getInMemorySubgraph( - similarUserPairs, - usersToInclude, - maxNeighborsPerNode) - case (None, Some(groupedInput)) => - val candidatesPipe = - TopUsersSimilarityGraph.readSimsInput(isGroupedInputSequenceFiles, groupedInput) - TopUsersSimilarityGraph.getInMemorySubgraphFromUserGroupedInput( - candidatesPipe, - usersToInclude, - maxNeighborsPerNode - ) - case _ => Execution.from(Nil) // we should never get here - } - } - - TopUsersSimilarityGraph - .run( - userSource, - minActiveFollowers, - topK, - getSubgraphFn, - wtExponent - ).flatMap { - case (topUsersList, graph) => - // We're writing to HDFS ourselves, from the submitter node. - // When we use TypedPipe.write, it's failing for large topK, e.g. 10M. - // We can make the submitter node have a lot of memory, but it's - // difficult and suboptimal to give this much memory to all mappers. - val topUsersExec = Execution.from( - TopUsersSimilarityGraph - .writeTopUsers(topUsersList, mode, outputTopUsersDir + "/all") - ) - - // We want to make sure the write of the topUsers succeeds, and - // only then write out the graph. A graph without the topUsers is useless. - topUsersExec.map { _ => - // We're writing to HDFS ourselves, from the submitter node. - // When we use TypedPipe.write, it fails due to OOM on the mappers. - // We can make the submitter node have a lot of memory, but it's difficult - // and suboptimal to give this much memory to all mappers. - TopUsersSimilarityGraph.writeToHDFSIfHDFS( - graph - .iterableStringRepresentation(new DecimalFormat("#.###")).iterator().asScala, - mode, - outputGraphDir + "/all" - ) - } - } - } - } - -} - -/** - * App that only outputs the topK users on Twitter by active follower count. Example invocation: - * $ ./bazel bundle src/scala/com/twitter/simclusters_v2/scalding:top_users_only && \ - * oscar hdfs --hadoop-client-memory 60000 --host atla-aor-08-sr1 --bundle top_users_only \ - * --tool com.twitter.simclusters_v2.scalding.TopUsersOnlyApp \ - * #are these hadoop-properties needed for this job? 
- * #--hadoop-properties "scalding.with.reducers.set.explicitly=true;elephantbird.use.combine.input.format=true;elephantbird.combine.split.size=468435456;mapred.min.split.size=468435456" \ - * --screen --screen-detached --tee logs/10MTopusersOnlyExecution -- --date 2017-10-20 \ - * --minActiveFollowers 500 --topK 10000000 \ - * --outputTopUsersDir /user/your_ldap/simclusters_graph_prep_q42017/top10MUsers - * - * ./bazel bundle src/scala/com/twitter/simclusters_v2/scalding:top_users_only && \ - * oscar hdfs --hadoop-client-memory 60000 --user cassowary --host atla-aor-08-sr1 \ - * --bundle top_users_only --tool com.twitter.simclusters_v2.scalding.TopUsersOnlyApp \ - * --screen --screen-detached --tee ldap_logs/100MTopusersWithMappedIds \ - * -- --date 2019-10-11 --minActiveFollowers 67 --outputTopUsersDir adhoc/ldap_top100M_mappedUsers \ - * --includeMappedIds - */ -object TopUsersOnlyApp extends TwitterExecutionApp { - implicit val tz: java.util.TimeZone = DateOps.UTC - implicit val dp = DateParser.default - val log = Logger() - - def job: Execution[Unit] = - Execution.getConfigMode.flatMap { - case (config, mode) => - Execution.withId { implicit uniqueId => - val args = config.getArgs - val minActiveFollowers = args.int("minActiveFollowers", 100000) - val topK = args.int("topK", 20000000) - val date = DateRange.parse(args("date")) - val outputTopUsersDir = args("outputTopUsersDir") - val includeMappedIds = args.boolean("includeMappedIds") - - if (includeMappedIds) { - println("Going to include mappedIds in output") - TopUsersSimilarityGraph - .topUsersWithMappedIds( - DAL.readMostRecentSnapshot(UsersourceFlatScalaDataset, date).toTypedPipe, - minActiveFollowers - ) - .map { - case TopUserWithMappedId(TopUser(id, activeFollowerCount, screenName), mappedId) => - (id, activeFollowerCount, screenName, mappedId) - } - .writeExecution(TypedTsv(outputTopUsersDir)) - } else { - TopUsersSimilarityGraph - .topUsersInMemory( - DAL.readMostRecentSnapshot(UsersourceFlatScalaDataset, date).toTypedPipe, - minActiveFollowers, - topK - ).map { topUsersList => - TopUsersSimilarityGraph.writeTopUsers( - topUsersList, - mode, - outputTopUsersDir + "/all") - } - } - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/UpdateKnownFor.docx b/src/scala/com/twitter/simclusters_v2/scalding/UpdateKnownFor.docx new file mode 100644 index 000000000..5e77b447d Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/UpdateKnownFor.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/UpdateKnownFor.scala b/src/scala/com/twitter/simclusters_v2/scalding/UpdateKnownFor.scala deleted file mode 100644 index f6a3e7612..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/UpdateKnownFor.scala +++ /dev/null @@ -1,311 +0,0 @@ -package com.twitter.simclusters_v2.scalding - -import com.twitter.algebird.{Monoid, Semigroup} -import com.twitter.scalding._ - -object UpdateKnownFor { - - /** - * Convenience datastructure that can summarize key stats about a node's set of - * immediate neighbors. - * - * @param nodeCount number of nodes - * @param sumOfEdgeWeights sum of weights on edges in the neighborhood. - * @param sumOfMembershipWeightedEdgeWeights sum of { edge weight * membership weight } for each node - * in the neighborhood. Membership weight to what is not - * specified in this case class and is instead part of the - * context. - * @param sumOfMembershipWeights sum of membership weight for each node in the - * neighborhood. 
Membership weight to what is not - * specified in this case class and is instead part of - * the context. - */ - case class NeighborhoodInformation( - nodeCount: Int, - sumOfEdgeWeights: Float, - sumOfMembershipWeightedEdgeWeights: Float, - sumOfMembershipWeights: Float) - - object NeighborhoodInformationMonoid extends Monoid[NeighborhoodInformation] { - override val zero: NeighborhoodInformation = NeighborhoodInformation(0, 0f, 0f, 0f) - override def plus(l: NeighborhoodInformation, r: NeighborhoodInformation) = - NeighborhoodInformation( - l.nodeCount + r.nodeCount, - l.sumOfEdgeWeights + r.sumOfEdgeWeights, - l.sumOfMembershipWeightedEdgeWeights + r.sumOfMembershipWeightedEdgeWeights, - l.sumOfMembershipWeights + r.sumOfMembershipWeights - ) - } - - case class NodeInformation( - originalClusters: List[Int], - overallStats: NeighborhoodInformation, - statsOfClustersInNeighborhood: Map[Int, NeighborhoodInformation]) - - object NodeInformationSemigroup extends Semigroup[NodeInformation] { - implicit val ctsMonoid: Monoid[NeighborhoodInformation] = NeighborhoodInformationMonoid - - override def plus(l: NodeInformation, r: NodeInformation) = - NodeInformation( - l.originalClusters ++ r.originalClusters, - ctsMonoid.plus(l.overallStats, r.overallStats), - Monoid - .mapMonoid[Int, NeighborhoodInformation].plus( - l.statsOfClustersInNeighborhood, - r.statsOfClustersInNeighborhood) - ) - } - - case class ClusterScoresForNode( - sumScoreIgnoringMembershipScores: Double, - ratioScoreIgnoringMembershipScores: Double, - ratioScoreUsingMembershipScores: Double) - - /** - * Given a user and a cluster: - * True positive weight = sum of edge weights to neighbors who belong to that cluster. - * False negative weight = sum of edge weights to neighbors who don’t belong to that cluster. - * False positive weight = (number of users in the cluster who are not neighbors of the node) * globalAvgEdgeWeight - * Membership-weighted true positive weight = for neighbors who are also in the cluster, sum of edge weight times user membership score in the cluster. - * Membership-weighted false negative weight = for neighbors who are not in the cluster, sum of edge weight times avg membership score across the whole knownFor input. - * Membership-weighted false positive weight = for users in the cluster who are not neighbors of the node, avg global edge weight times user membership score for the cluster. 
- * - * Ignoring membership scores, sum formula: - * truePositiveWtFactor*(True positive weight) - false negative weight - false positive weight - * Ignoring membership scores, ratio formula: - * True positive weight / (true positive weight + false negative weight + false positive weight) - * Using membership scores - * Membership-weighted true positive weight / (Membership-weighted true positive weight + Membership-weighted false negative weight + Membership-weighted false positive weight) - * - * @param overallNeighborhoodStats - * @param statsForCluster - * @param clusterSize - * @param sumOfClusterMembershipScores - * @param globalAvgEdgeWeight - * @param truePositiveWtFactor - * - * @return - */ - def getScoresForCluster( - overallNeighborhoodStats: NeighborhoodInformation, - statsForCluster: NeighborhoodInformation, - clusterSize: Int, - sumOfClusterMembershipScores: Double, - globalAvgEdgeWeight: Double, - truePositiveWtFactor: Double - ): ClusterScoresForNode = { - val truePositiveWt = statsForCluster.sumOfEdgeWeights - val falseNegativeWt = overallNeighborhoodStats.sumOfEdgeWeights - truePositiveWt - val falsePositiveWt = (clusterSize - statsForCluster.nodeCount) * globalAvgEdgeWeight - val membershipWeightedTruePositiveWt = statsForCluster.sumOfMembershipWeightedEdgeWeights - val membershipWeightedFalseNegativeWt = - overallNeighborhoodStats.sumOfMembershipWeightedEdgeWeights - membershipWeightedTruePositiveWt - val membershipWeightedFalsePositiveWt = - (sumOfClusterMembershipScores - statsForCluster.sumOfMembershipWeights) * globalAvgEdgeWeight - val sumScore = - truePositiveWtFactor * statsForCluster.sumOfEdgeWeights - falseNegativeWt - falsePositiveWt - val ratioScore = truePositiveWt / (truePositiveWt + falseNegativeWt + falsePositiveWt) - val ratioUsingMemberships = - membershipWeightedTruePositiveWt / (membershipWeightedTruePositiveWt + - membershipWeightedFalsePositiveWt + membershipWeightedFalseNegativeWt) - ClusterScoresForNode(sumScore, ratioScore, ratioUsingMemberships) - } - - def pickBestCluster( - overallNeighborhoodStats: NeighborhoodInformation, - statsOfClustersInNeighborhood: Map[Int, NeighborhoodInformation], - clusterOverallStatsMap: Map[Int, NeighborhoodInformation], - globalAvgEdgeWeight: Double, - truePositiveWtFactor: Double, - clusterScoresToFinalScore: ClusterScoresForNode => Double, - minNeighborsInCluster: Int - ): Option[(Int, Double)] = { - val clusterToScores = statsOfClustersInNeighborhood.toList.flatMap { - case (clusterId, statsInNeighborhood) => - val clusterOverallStats = clusterOverallStatsMap(clusterId) - if (statsInNeighborhood.nodeCount >= minNeighborsInCluster) { - Some( - ( - clusterId, - clusterScoresToFinalScore( - getScoresForCluster( - overallNeighborhoodStats, - statsInNeighborhood, - clusterOverallStats.nodeCount, - clusterOverallStats.sumOfMembershipWeights, - globalAvgEdgeWeight, - truePositiveWtFactor - ) - ) - ) - ) - } else { - None - } - } - if (clusterToScores.nonEmpty) { - Some(clusterToScores.maxBy(_._2)) - } else None - } - - def updateGeneric( - graph: TypedPipe[(Long, Map[Long, Float])], - inputUserToClusters: TypedPipe[(Long, Array[(Int, Float)])], - clusterOverallStatsMap: Map[Int, NeighborhoodInformation], - minNeighborsInCluster: Int, - globalAvgWeight: Double, - avgMembershipScore: Double, - truePositiveWtFactor: Double, - clusterScoresToFinalScore: ClusterScoresForNode => Double - )( - implicit uniqId: UniqueID - ): TypedPipe[(Long, Array[(Int, Float)])] = { - val emptyToSomething = Stat("no_assignment_to_some") - 
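To make the scoring formulas documented above concrete, here is a small worked example with made-up numbers: a node with 10 neighbors and total edge weight 8.0, of which 6 neighbors with total weight 5.0 belong to cluster c; cluster c has 100 members overall, the global average edge weight is 0.1, and truePositiveWtFactor is 2.0.

val truePositiveWt = 5.0                   // weight to neighbors inside cluster c
val falseNegativeWt = 8.0 - truePositiveWt // 3.0: weight to neighbors outside c
val falsePositiveWt = (100 - 6) * 0.1      // 9.4: non-neighbor members of c
val sumScore = 2.0 * truePositiveWt - falseNegativeWt - falsePositiveWt
// sumScore = -2.4
val ratioScore = truePositiveWt / (truePositiveWt + falseNegativeWt + falsePositiveWt)
// ratioScore = 5.0 / 17.4, roughly 0.29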
val somethingToEmpty = Stat("some_assignment_to_none") - val emptyToEmpty = Stat("empty_to_empty") - val sameCluster = Stat("same_cluster") - val diffCluster = Stat("diff_cluster") - val nodesWithSmallDegree = Stat("nodes_with_degree_lt_" + minNeighborsInCluster) - - collectInformationPerNode(graph, inputUserToClusters, avgMembershipScore) - .mapValues { - case NodeInformation(originalClusters, overallStats, statsOfClustersInNeighborhood) => - val newClusterWithScoreOpt = if (overallStats.nodeCount < minNeighborsInCluster) { - nodesWithSmallDegree.inc() - None - } else { - pickBestCluster( - overallStats, - statsOfClustersInNeighborhood, - clusterOverallStatsMap, - globalAvgWeight, - truePositiveWtFactor, - clusterScoresToFinalScore, - minNeighborsInCluster - ) - } - newClusterWithScoreOpt match { - case Some((newClusterId, score)) => - if (originalClusters.isEmpty) { - emptyToSomething.inc() - } else if (originalClusters.contains(newClusterId)) { - sameCluster.inc() - } else { - diffCluster.inc() - } - Array((newClusterId, score.toFloat)) - case None => - if (originalClusters.isEmpty) { - emptyToEmpty.inc() - } else { - somethingToEmpty.inc() - } - Array.empty[(Int, Float)] - } - } - } - - /** - * Assembles the information we need at a node in order to decide what its new cluster should be: - * the overall neighborhood statistics together with the per-cluster statistics. - * - * This function is where all the crucial steps take place. First get the cluster that each - * node belongs to, and then broadcast information about this node and cluster membership to each - * of its neighbors. Now bring together all records with the same nodeId as the key and create - * the NodeInformation dataset. - * @param graph symmetric graph i.e. if u is in v's adj list, then v is in u's adj list. - * @param userToClusters current knownFor. - * @param avgMembershipScore avg. membership score of a node in the knownFor we're updating. - * Useful to deal with nodes which don't belong to any knownFor. 
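The broadcast-then-sum pattern described here can be sketched on plain collections; the types below are simplified stand-ins, and the real job does this on TypedPipes with sumByKey over NodeInformation values:

// Each node tells every neighbor which clusters it is known for, with the
// edge weight and the membership-weighted edge weight; grouping messages by
// recipient yields the raw ingredients of NodeInformation.
val graph = Map(1L -> Map(2L -> 0.5f, 3L -> 0.2f), 2L -> Map(1L -> 0.5f), 3L -> Map(1L -> 0.2f))
val knownFor = Map(1L -> List((7, 0.9f))) // node 1 is known for cluster 7, score 0.9
val messages: Seq[(Long, (Int, Float, Float))] = for {
  (node, nbrs) <- graph.toSeq
  (nbr, wt) <- nbrs.toSeq
  (cluster, membership) <- knownFor.getOrElse(node, Nil)
} yield (nbr, (cluster, wt, membership * wt))
// messages is roughly Seq((2L, (7, 0.5f, 0.45f)), (3L, (7, 0.2f, 0.18f)))
val perRecipient = messages.groupBy(_._1) // analogous to the job's sumByKey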
- * @return pipe with node information for each node - */ - def collectInformationPerNode( - graph: TypedPipe[(Long, Map[Long, Float])], - userToClusters: TypedPipe[(Long, Array[(Int, Float)])], - avgMembershipScore: Double - ): TypedPipe[(Long, NodeInformation)] = { - implicit val nisg: Semigroup[NodeInformation] = NodeInformationSemigroup - graph - .leftJoin(userToClusters) - // uncomment for adhoc job - //.withReducers(200) - .flatMap { - case (nodeId, (adjList, assignedClustersOpt)) => - val assignedClusters = - assignedClustersOpt.map(_.toList).getOrElse(Nil) - val res = adjList.toList.flatMap { - case (neighborId, neighborWeight) => - if (assignedClusters.nonEmpty) { - assignedClusters.map { - case (clusterId, membershipScore) => - val neighborhoodInformationForCluster = NeighborhoodInformation( - 1, - neighborWeight, - membershipScore * neighborWeight, - membershipScore) - val originalClusters = - if (neighborId == nodeId) List(clusterId) - else List.empty[Int] - ( - neighborId, - NodeInformation( - originalClusters, - neighborhoodInformationForCluster, - Map(clusterId -> neighborhoodInformationForCluster))) - } - } else { - List( - ( - neighborId, - NodeInformation( - Nil, - NeighborhoodInformation( - 1, - neighborWeight, - (avgMembershipScore * neighborWeight).toFloat, - avgMembershipScore.toFloat), - Map.empty[Int, NeighborhoodInformation] - ))) - } - } - res - } - .sumByKey - // uncomment for adhoc job - //.withReducers(100) - } - - /** - * Replace incoming knownFor scores with ratioScoreIgnoringMembershipScores - * @param knownFor - * @param simsGraphWithoutSelfLoops - * @param globalAvgWeight - * @param clusterStats - * @param avgMembershipScore - * @return - */ - def newKnownForScores( - knownFor: TypedPipe[(Long, Array[(Int, Float)])], - simsGraphWithoutSelfLoops: TypedPipe[(Long, Map[Long, Float])], - globalAvgWeight: Double, - clusterStats: Map[Int, NeighborhoodInformation], - avgMembershipScore: Double - ): TypedPipe[(Long, Array[(Int, Float)])] = { - collectInformationPerNode(simsGraphWithoutSelfLoops, knownFor, avgMembershipScore) - .mapValues { - case NodeInformation(originalClusters, overallStats, statsOfClustersInNeighborhood) => - originalClusters.map { clusterId => - ( - clusterId, - getScoresForCluster( - overallStats, - statsOfClustersInNeighborhood(clusterId), - clusterStats(clusterId).nodeCount, - clusterStats(clusterId).sumOfMembershipWeights, - globalAvgWeight, - 0 - ).ratioScoreIgnoringMembershipScores.toFloat) - }.toArray - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/UpdateKnownForApps.docx b/src/scala/com/twitter/simclusters_v2/scalding/UpdateKnownForApps.docx new file mode 100644 index 000000000..868a83f86 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/UpdateKnownForApps.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/UpdateKnownForApps.scala b/src/scala/com/twitter/simclusters_v2/scalding/UpdateKnownForApps.scala deleted file mode 100644 index 3cffe47b8..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/UpdateKnownForApps.scala +++ /dev/null @@ -1,443 +0,0 @@ -package com.twitter.simclusters_v2.scalding - -import com.twitter.dal.client.dataset.KeyValDALDataset -import com.twitter.hermit.candidate.thriftscala.Candidates -import com.twitter.pluck.source.cassowary.FollowingsCosineSimilaritiesManhattanSource -import com.twitter.pluck.source.cassowary.SimsCandidatesSource -import com.twitter.scalding._ -import com.twitter.scalding_internal.dalv2.DAL -import 
com.twitter.scalding_internal.dalv2.DALWrite._ -import com.twitter.scalding_internal.job.TwitterExecutionApp -import com.twitter.scalding_internal.job.analytics_batch.AnalyticsBatchExecution -import com.twitter.scalding_internal.job.analytics_batch.AnalyticsBatchExecutionArgs -import com.twitter.scalding_internal.job.analytics_batch.BatchDescription -import com.twitter.scalding_internal.job.analytics_batch.BatchFirstTime -import com.twitter.scalding_internal.job.analytics_batch.BatchIncrement -import com.twitter.scalding_internal.job.analytics_batch.TwitterScheduledExecutionApp -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.common.ModelVersions -import com.twitter.simclusters_v2.hdfs_sources._ -import com.twitter.simclusters_v2.scalding.UpdateKnownFor.ClusterScoresForNode -import com.twitter.simclusters_v2.scalding.UpdateKnownFor.NeighborhoodInformation -import com.twitter.simclusters_v2.scalding.common.TypedRichPipe._ -import com.twitter.simclusters_v2.scalding.common.Util -import com.twitter.simclusters_v2.thriftscala.ClustersUserIsKnownFor -import com.twitter.usersource.snapshot.flat.UsersourceFlatScalaDataset -import scala.util.Success - -object UpdateKnownForApps { - - /** - * Average edge weight of an input graph - * @param graph a TypedPipe with nodeId as key and adjacency list as value. We don't care about - * the keys in this method. - * @return avg edge weight wrapped in an option in an execution - */ - def getGlobalAvgWeight(graph: TypedPipe[(Long, Map[Long, Float])]): Execution[Option[Double]] = { - graph.values - .flatMap(_.values) - .map { x => (x.toDouble, 1L) } - .sum - .toOptionExecution - .map { - case Some((sum, cnt)) => - val res = sum / cnt - println("globalAvgWeight is " + res) - Some(res) - case _ => - println("Input graph to globalAvgWeight seems to be empty") - None - } - } - - /** - * Average membership score for a particular knownFor assignment - * @param knownFor TypedPipe from nodeId to the clusters it's been assigned to along with - * membership scores. We don't care about the keys in this method. - * @return average membership score - */ - def getAvgMembershipScore(knownFor: TypedPipe[(Long, Array[(Int, Float)])]): Execution[Double] = { - knownFor.values - .flatMap(_.map(_._2)) - .map { x => (x, 1L) } - .sum - .map { case (num, den) => num / den.toDouble } - .getExecution - .onComplete { - case Success(x) => println("Avg. membership score is " + x) - case _ => println("Failed to calculate avg. membership score") - } - } - - /** - * For each cluster, get two statistics about it: the number of nodes assigned to it, and the - * sum of the membership scores - * - * @param knownFor TypedPipe from nodeId to the clusters it's been assigned to along with - * membership scores. - * @return Map giving the NeighborhoodInformation for each cluster. The nodeCount and - * sumOfMembershipWeights fields in NeighborhoodInformation are populated, others are 0. 
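-   *         For illustration (hypothetical numbers): if users 1 and 2 are both knownFor
-   *         cluster 7 with scores 0.4f and 0.6f, the returned map contains
-   *         7 -> NeighborhoodInformation(2, 0, 0, 1.0f).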
-   */
-  def getClusterStats(
-    knownFor: TypedPipe[(Long, Array[(Int, Float)])]
-  ): Execution[Map[Int, NeighborhoodInformation]] = {
-    knownFor
-      .flatMap {
-        case (_, clusterArray) =>
-          clusterArray.map {
-            case (clusterId, score) =>
-              Map(clusterId -> (1, score))
-          }
-      }
-      .sum
-      .getExecution
-      .map { map =>
-        map.mapValues {
-          case (count, sum) =>
-            NeighborhoodInformation(count, 0, 0, sum)
-        }
-      }
-  }
-
-  /**
-   * Adds self-loops and also potentially raises all edge weights to an exponent
-   * (typically exponent > 1, which has the effect of increasing inequality in edge weights to
-   * "clarify" structure in the graph - currently we just set the exponent to 1).
-   * @param symmetrizedSims input symmetrized similarity graph
-   * @param exponentForEdgeWeight exponent to raise all edge weights to.
-   *                              Set to 1.0 to make this a no-op.
-   * @param maxWtToSelfLoopWtMultFactor what to multiply the max wt among non-self-loop edges by
-   *                                    to derive the weight on the self-loop edge.
-   * @return new graph
-   */
-  def simsGraphForUpdateFromSymmetrizedSims(
-    symmetrizedSims: TypedPipe[(Long, Map[Long, Float])],
-    exponentForEdgeWeight: Float,
-    maxWtToSelfLoopWtMultFactor: Float
-  ): TypedPipe[(Long, Map[Long, Float])] = {
-    val expWeighted = symmetrizedSims.mapValues { y =>
-      y.mapValues { x => math.pow(x, exponentForEdgeWeight).toFloat }
-    }
-
-    TopUsersSimilarityGraph.addSelfLoop(
-      input = expWeighted,
-      maxToSelfLoopWeight = { x: Float => x * maxWtToSelfLoopWtMultFactor }
-    )
-  }
-
-  /**
-   * Runs the job.
-   * @param args args which specify many parameters
-   * @param inputKnownFor current knownFor assignments to be updated
-   * @param inputSimsGraph input Sims graph, given as Candidates
-   * @param defaultEmailAddress by default, the email address to send an email with a bunch of
-   *                            evaluation metrics to
-   * @param writeKnownForFunction function that takes a knownFor and writes to some
-   *                              persistent location
-   * @param readKnownForFunction function that reads the knownFor which was written to using the
-   *                             writeKnownForFunction
-   * @param dateRange dateRange, used for reading UserSource
-   * @param uniqueID needed for creating stats
-   * @return Execution[Unit] encapsulating the whole job
-   */
-  def runUpdateKnownForGeneric(
-    args: Args,
-    inputKnownFor: TypedPipe[(Long, Array[(Int, Float)])],
-    inputSimsGraph: TypedPipe[Candidates],
-    defaultEmailAddress: String,
-    writeKnownForFunction: TypedPipe[(Long, Array[(Int, Float)])] => Execution[Unit],
-    readKnownForFunction: => TypedPipe[(Long, Array[(Int, Float)])],
-    includeEvaluationResultsInEmail: Boolean
-  )(
-    implicit dateRange: DateRange,
-    uniqueID: UniqueID
-  ): Execution[Unit] = {
-    val minActiveFollowers = args.int("minActiveFollowers", 400)
-    val topK = args.int("topK")
-    val maxSimsNeighborsForUpdate =
-      args.int("maxSimsNeighborsForUpdate", 40)
-    val minNeighborsInCluster = args.int("minNeighborsInCluster", 2)
-    val maxWtToSelfLoopWtMultFactor =
-      args.float("maxWtToSelfLoopWtMultFactor", 2)
-    val exponentForEdgeWeight = args.float("exponentForEdgeWeights", 1.0f)
-    val updateMethod: ClusterScoresForNode => Double = args("updateMethod") match {
-      case "sumScoreIgnoringMembershipScores" => { x: ClusterScoresForNode =>
-        x.sumScoreIgnoringMembershipScores
-      }
-      case "ratioScoreIgnoringMembershipScores" => { x: ClusterScoresForNode =>
-        x.ratioScoreIgnoringMembershipScores
-      }
-      case "ratioScoreUsingMembershipScores" => { x: ClusterScoresForNode =>
-        x.ratioScoreUsingMembershipScores
-      }
-      case x @ _ =>
-        throw new Exception(s"value for --updateMethod $x is unknown. 
It must be one of " + - s"[sumScoreIgnoringMembershipScores, ratioScoreIgnoringMembershipScores, ratioScoreUsingMembershipScores]") - } - val truePositiveWtFactor = args.float("truePositiveWtFactor", 10) - val modelVersion = args("outputModelVersion") - val emailAddress = - args.optional("emailAddress").getOrElse(defaultEmailAddress) - - val topUsers = TopUsersSimilarityGraph - .topUserIds( - DAL - .readMostRecentSnapshot(UsersourceFlatScalaDataset, dateRange) - .toTypedPipe, - minActiveFollowers, - topK).count("num_top_users") - - TopUsersSimilarityGraph - .getSubgraphFromUserGroupedInput( - fullGraph = inputSimsGraph, - usersToInclude = topUsers, - maxNeighborsPerNode = maxSimsNeighborsForUpdate, - degreeThresholdForStat = minNeighborsInCluster - ) - .forceToDiskExecution - .flatMap { symmetrizedSims => - val modifiedSims = - UpdateKnownForApps.simsGraphForUpdateFromSymmetrizedSims( - symmetrizedSims = symmetrizedSims, - exponentForEdgeWeight = exponentForEdgeWeight, - maxWtToSelfLoopWtMultFactor = maxWtToSelfLoopWtMultFactor - ) - - val previouslyFamousUsersExec = inputKnownFor - .leftJoin(topUsers.asKeys) - .collect { case (userId, (clusters, None)) => userId } - .getSummaryString( - "Users previously in known for but not in topUsers anymore", - numRecords = 20) - - val clusterStatsExec = UpdateKnownForApps.getClusterStats(inputKnownFor) - - val globalAvgWeightExec = - UpdateKnownForApps.getGlobalAvgWeight(modifiedSims) - - val globalAvgMembershipScoreExec = UpdateKnownForApps.getAvgMembershipScore(inputKnownFor) - - Execution.zip(globalAvgWeightExec, clusterStatsExec, globalAvgMembershipScoreExec).flatMap { - case (Some(globalAvgWeight), clusterStats, globalAvgMembershipScore) => - println("Size of clusterStats: " + clusterStats.size) - println("First few entries from clusterStats: " + clusterStats.take(5)) - println("globalAvgWeight: " + globalAvgWeight) - println("globalAvgMembershipScore: " + globalAvgMembershipScore) - - val knownForWithUnnormalizedScores = UpdateKnownFor - .newKnownForScores( - inputKnownFor, - modifiedSims, - globalAvgWeight, - clusterStats, - globalAvgMembershipScore - ) - val writeNewKnownForExec = writeKnownForFunction( - UpdateKnownFor.updateGeneric( - modifiedSims, - knownForWithUnnormalizedScores, - clusterStats, - minNeighborsInCluster, - globalAvgWeight, - globalAvgMembershipScore, - truePositiveWtFactor, - updateMethod - ) - ) - - writeNewKnownForExec.flatMap { _ => - Util.getCustomCountersString(writeNewKnownForExec).flatMap { customCountersString => - if (includeEvaluationResultsInEmail) { - // It's unfortunate that we're not using the newKnownFor directly, but are instead - // first writing it out and then reading it back in. The reason for doing it in this - // convoluted way is that when we directly use the newKnownFor, the clusterEvaluation - // metrics are being incorrectly computed. 
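-                  // Concretely, the sequencing below is (sketch only, no additional behavior):
-                  //   write(newKnownFor) -> read it back from storage -> evaluate old vs. new.
-                  // The flatMap on writeNewKnownForExec guarantees the write has completed
-                  // before readKnownForFunction and the evaluations run.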
- - val newKnownFor = readKnownForFunction - - val newResultsExec = - ClusterEvaluation - .overallEvaluation(symmetrizedSims, newKnownFor, "newKnownForEval") - val oldResultsExec = - ClusterEvaluation - .overallEvaluation(symmetrizedSims, inputKnownFor, "oldKnownForEval") - val minSizeOfBiggerClusterForComparison = 10 - val compareExec = CompareClusters.summarize( - CompareClusters.compare( - KnownForSources.transpose(inputKnownFor), - KnownForSources.transpose(newKnownFor), - minSizeOfBiggerCluster = minSizeOfBiggerClusterForComparison - )) - - Execution - .zip(oldResultsExec, newResultsExec, compareExec, previouslyFamousUsersExec) - .map { - case (oldResults, newResults, compareResults, previouslyFamousUsersString) => - val emailText = "Evaluation Results for existing knownFor:\n" + - Util.prettyJsonMapper.writeValueAsString(oldResults) + - "\n\n-------------------\n\n" + - "Evaluation Results for new knownFor:\n" + - Util.prettyJsonMapper.writeValueAsString(newResults) + - "\n\n-------------------\n\n" + - s"Cosine similarity distribution between cluster membership vectors for " + - s"clusters with at least $minSizeOfBiggerClusterForComparison members\n" + - Util.prettyJsonMapper - .writeValueAsString(compareResults) + - "\n\n-------------------\n\n" + - "Custom counters:\n" + customCountersString + - "\n\n-------------------\n\n" + - previouslyFamousUsersString - - Util - .sendEmail( - emailText, - s"Evaluation results of new knownFor $modelVersion", - emailAddress) - } - } else { - Util - .sendEmail( - customCountersString, - s"Change in cluster assignments for update of knownFor $modelVersion", - emailAddress - ) - Execution.unit - } - - } - } - } - } - } -} - -trait UpdateKnownForBatch extends TwitterScheduledExecutionApp { - implicit val tz: java.util.TimeZone = DateOps.UTC - implicit val dp = DateParser.default - - def firstTime: String - - val batchIncrement: Duration = Days(30) - - def batchDescription: String - - private lazy val execArgs = AnalyticsBatchExecutionArgs( - batchDesc = BatchDescription(batchDescription), - firstTime = BatchFirstTime(RichDate(firstTime)), - lastTime = None, - batchIncrement = BatchIncrement(batchIncrement) - ) - - val emailAddress: String = "no-reply@twitter.com" - - def inputDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]] - - def inputModelVersion: String - - def outputModelVersion: String - - def outputPath: String - - def outputDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]] - - override def scheduledJob: Execution[Unit] = - AnalyticsBatchExecution(execArgs) { implicit dateRange => - Execution.withId { implicit uniqueId => - Execution.withArgs { args => - val inputKnownFor = - KnownForSources.readDALDataset(inputDALDataset, Days(30), inputModelVersion) - - val inputSimsGraph = TypedPipe - .from(FollowingsCosineSimilaritiesManhattanSource()) - .map(_._2) - - def writeKnownFor(knownFor: TypedPipe[(Long, Array[(Int, Float)])]): Execution[Unit] = { - KnownForSources - .toKeyVal(knownFor, outputModelVersion) - .writeDALVersionedKeyValExecution( - outputDALDataset, - D.Suffix(outputPath) - ) - } - - def readKnownFor = - KnownForSources.readDALDataset(outputDALDataset, Days(1), outputModelVersion) - - UpdateKnownForApps.runUpdateKnownForGeneric( - args, - inputKnownFor, - inputSimsGraph, - emailAddress, - writeKnownFor, - readKnownFor, - includeEvaluationResultsInEmail = false - ) - } - } - } -} - -/** -capesospy-v2 update --build_locally --start_cron update_known_for_20M_145k \ - 
src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml - */ -object UpdateKnownFor20M145K extends UpdateKnownForBatch { - override val firstTime: String = "2019-06-06" - - override val batchIncrement: Duration = Days(7) - - override val batchDescription: String = - "com.twitter.simclusters_v2.scalding.UpdateKnownFor20M145K" - - override val inputModelVersion: String = ModelVersions.Model20M145KUpdated - - override val inputDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]] = - SimclustersV2RawKnownFor20M145KUpdatedScalaDataset - - override val outputModelVersion: String = ModelVersions.Model20M145KUpdated - - override val outputDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]] = - SimclustersV2RawKnownFor20M145KUpdatedScalaDataset - - override val outputPath: String = InternalDataPaths.RawKnownForUpdatedPath -} - -/** This one's end-to-end, doesn't save any intermediate data etc. **/ -object UpdateKnownForAdhoc extends TwitterExecutionApp { - implicit val tz: java.util.TimeZone = DateOps.UTC - implicit val dp = DateParser.default - - def job: Execution[Unit] = - Execution.getConfigMode.flatMap { - case (config, mode) => - Execution.withId { implicit uniqueId => - val args = config.getArgs - implicit val date: DateRange = DateRange.parse(args("date")) - val defaultEmailAddress = "your_ldap@twitter.com" - - val inputKnownFor = args.optional("inputKnownForDir") match { - case Some(inputKnownForDir) => KnownForSources.readKnownFor(inputKnownForDir) - case None => KnownForSources.knownFor_20M_Dec11_145K - } - - val inputSimsGraph = TopUsersSimilarityGraph.readSimsInput( - args.boolean("simsInputIsKeyValSource"), - args("simsInputDir") - ) - - def readKnownFor() = KnownForSources.readKnownFor(args("outputDir")) - - UpdateKnownForApps.runUpdateKnownForGeneric( - args, - inputKnownFor, - inputSimsGraph, - defaultEmailAddress, - { input: TypedPipe[(Long, Array[(Int, Float)])] => - KnownForSources.writeKnownForTypedTsv(input, args("outputDir")) - }, - readKnownFor, - includeEvaluationResultsInEmail = true - ) - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/UserUserFavGraph.docx b/src/scala/com/twitter/simclusters_v2/scalding/UserUserFavGraph.docx new file mode 100644 index 000000000..ac2715633 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/UserUserFavGraph.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/UserUserFavGraph.scala b/src/scala/com/twitter/simclusters_v2/scalding/UserUserFavGraph.scala deleted file mode 100644 index 60fb0339d..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/UserUserFavGraph.scala +++ /dev/null @@ -1,445 +0,0 @@ -package com.twitter.simclusters_v2.scalding - -import com.twitter.algebird.DecayedValue -import com.twitter.algebird.DecayedValueMonoid -import com.twitter.algebird.Monoid -import com.twitter.algebird.Semigroup -import com.twitter.conversions.DurationOps._ -import com.twitter.logging.Logger -import com.twitter.scalding._ -import com.twitter.scalding.typed.UnsortedGrouped -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.scalding_internal.dalv2.DALWrite._ -import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation -import com.twitter.scalding_internal.dalv2.remote_access.ProcAtla -import com.twitter.scalding_internal.job.TwitterExecutionApp -import com.twitter.scalding_internal.job.analytics_batch._ -import com.twitter.simclusters_v2.common.TweetId -import com.twitter.simclusters_v2.common.UserId -import 
com.twitter.simclusters_v2.hdfs_sources._ -import com.twitter.simclusters_v2.scalding.common.Util -import com.twitter.simclusters_v2.thriftscala.DecayedSums -import com.twitter.simclusters_v2.thriftscala.EdgeWithDecayedWeights -import com.twitter.timelineservice.thriftscala.ContextualizedFavoriteEvent -import com.twitter.timelineservice.thriftscala.FavoriteEventUnion -import com.twitter.usersource.snapshot.flat.UsersourceFlatScalaDataset -import com.twitter.usersource.snapshot.flat.thriftscala.FlatUser -import com.twitter.util.Time -import twadoop_config.configuration.log_categories.group.timeline.TimelineServiceFavoritesScalaDataset - -sealed trait FavState - -object Fav extends FavState - -object UnFavWithoutPriorFav extends FavState - -object UnFavWithPriorFav extends FavState - -case class TimestampedFavState(favOrUnfav: FavState, timestampMillis: Long) - -object TimestampedFavStateSemigroup extends Semigroup[TimestampedFavState] { - override def plus(left: TimestampedFavState, right: TimestampedFavState): TimestampedFavState = { - - /** - * Assigning to first, second ensures commutative property - */ - val (first, second) = if (left.timestampMillis < right.timestampMillis) { - (left, right) - } else { - (right, left) - } - (first.favOrUnfav, second.favOrUnfav) match { - case (_, UnFavWithPriorFav) => second - case (UnFavWithPriorFav, UnFavWithoutPriorFav) => - TimestampedFavState(UnFavWithPriorFav, second.timestampMillis) - case (Fav, UnFavWithoutPriorFav) => - TimestampedFavState(UnFavWithPriorFav, second.timestampMillis) - case (UnFavWithoutPriorFav, UnFavWithoutPriorFav) => second - case (_, Fav) => second - } - } -} - -object UserUserFavGraph { - implicit val tz: java.util.TimeZone = DateOps.UTC - // setting the prune threshold in the monoid below to 0.0, since we want to do our own pruning - // outside the monoid, primarily to be able to count how many scores are pruned. 
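-  // For reference, the DecayedValue semantics relied on below (illustrative numbers, not from
-  // this file): a fav of value v observed at time t, decayed to time `now` with half-life H,
-  // contributes v * math.pow(0.5, (now - t) / H) to an edge's weight. E.g. a single fav from
-  // 100 days ago with halfLifeInDays = 100 still contributes ~0.5.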
- implicit val dvMonoid: Monoid[DecayedValue] = DecayedValueMonoid(0.0) - implicit val lfvSemigroup: Semigroup[TimestampedFavState] = TimestampedFavStateSemigroup - - def getSummedFavGraph( - previousGraphOpt: Option[TypedPipe[EdgeWithDecayedWeights]], - newFavsDateRange: DateRange, - halfLivesInDays: List[Int], - minScoreToKeep: Double - )( - implicit uniqueID: UniqueID - ): TypedPipe[EdgeWithDecayedWeights] = { - val newFavs = DAL.read(TimelineServiceFavoritesScalaDataset, newFavsDateRange).toTypedPipe - val endTime = Time.fromMilliseconds(newFavsDateRange.end.timestamp) - val userSource = - DAL.readMostRecentSnapshotNoOlderThan(UsersourceFlatScalaDataset, Days(7)).toTypedPipe - getSummedFavGraphWithValidUsers( - previousGraphOpt, - newFavs, - halfLivesInDays, - endTime, - minScoreToKeep, - userSource - ) - } - - def getSummedFavGraphWithValidUsers( - previousGraphOpt: Option[TypedPipe[EdgeWithDecayedWeights]], - newFavs: TypedPipe[ContextualizedFavoriteEvent], - halfLivesInDays: List[Int], - endTime: Time, - minScoreToKeep: Double, - userSource: TypedPipe[FlatUser] - )( - implicit uniqueID: UniqueID - ): TypedPipe[EdgeWithDecayedWeights] = { - val fullGraph = getSummedFavGraph( - previousGraphOpt, - newFavs, - halfLivesInDays, - endTime, - minScoreToKeep - ) - removeDeactivedOrSuspendedUsers(fullGraph, userSource) - } - - def processRawFavEvents( - favsOrUnfavs: TypedPipe[ContextualizedFavoriteEvent] - )( - implicit uniqueID: UniqueID - ): TypedPipe[((UserId, TweetId, UserId), TimestampedFavState)] = { - val numFavsBeforeUniq = Stat("num_favs_before_uniq") - val numUnFavsBeforeUniq = Stat("num_unfavs_before_uniq") - val numFinalFavs = Stat("num_final_favs") - val numUnFavsWithPriorFavs = Stat("num_unfavs_with_prior_favs") - val numUnFavsWithoutPriorFavs = Stat("num_unfavs_without_prior_favs") - - favsOrUnfavs - .flatMap { cfe: ContextualizedFavoriteEvent => - cfe.event match { - case FavoriteEventUnion.Favorite(fav) => - numFavsBeforeUniq.inc() - Some( - ( - (fav.userId, fav.tweetId, fav.tweetUserId), - TimestampedFavState(Fav, fav.eventTimeMs))) - case FavoriteEventUnion.Unfavorite(unfav) => - numUnFavsBeforeUniq.inc() - Some( - ( - (unfav.userId, unfav.tweetId, unfav.tweetUserId), - TimestampedFavState(UnFavWithoutPriorFav, unfav.eventTimeMs))) - case _ => None - } - } - .sumByKey - .toTypedPipe - .flatMap { - case fav @ (_, TimestampedFavState(Fav, _)) => - numFinalFavs.inc() - Some(fav) - case unfav @ (_, TimestampedFavState(UnFavWithoutPriorFav, _)) => - numUnFavsWithoutPriorFavs.inc() - Some(unfav) - case (_, TimestampedFavState(UnFavWithPriorFav, _)) => - numUnFavsWithPriorFavs.inc() - None - } - } - - private def getGraphFromNewFavsOnly( - newFavs: TypedPipe[ContextualizedFavoriteEvent], - halfLivesInDays: List[Int], - endTime: Time - )( - implicit uniqueID: UniqueID - ): UnsortedGrouped[(UserId, UserId), Map[Int, DecayedValue]] = { - - val numEventsNewerThanEndTime = Stat("num_events_newer_than_endtime") - - processRawFavEvents(newFavs).map { - case ((userId, _, authorId), TimestampedFavState(favOrUnfav, timestampMillis)) => - val halfLifeInDaysToScores = halfLivesInDays.map { halfLifeInDays => - val givenTime = Time.fromMilliseconds(timestampMillis) - if (givenTime > endTime) { - // technically this should never happen, and even if it did happen, - // we shouldn't have to care, but I'm noticing that the weights aren't being computed - // correctly for events that spilled over the edge - numEventsNewerThanEndTime.inc() - } - val timeInSeconds = math.min(givenTime.inSeconds, 
endTime.inSeconds) - val value = favOrUnfav match { - case Fav => 1.0 - case UnFavWithoutPriorFav => -1.0 - case UnFavWithPriorFav => 0.0 - } - val decayedValue = DecayedValue.build(value, timeInSeconds, halfLifeInDays.days.inSeconds) - halfLifeInDays -> decayedValue - } - ((userId, authorId), halfLifeInDaysToScores.toMap) - }.sumByKey - } - - def getSummedFavGraph( - previousGraphOpt: Option[TypedPipe[EdgeWithDecayedWeights]], - newFavs: TypedPipe[ContextualizedFavoriteEvent], - halfLivesInDays: List[Int], - endTime: Time, - minScoreToKeep: Double - )( - implicit uniqueID: UniqueID - ): TypedPipe[EdgeWithDecayedWeights] = { - val prunedScoresCounter = Stat("num_pruned_scores") - val negativeScoresCounter = Stat("num_negative_scores") - val prunedEdgesCounter = Stat("num_pruned_edges") - val keptEdgesCounter = Stat("num_kept_edges") - val keptScoresCounter = Stat("num_kept_scores") - val numCommonEdges = Stat("num_common_edges") - val numNewEdges = Stat("num_new_edges") - val numOldEdges = Stat("num_old_edges") - - val unprunedOuterJoinedGraph = previousGraphOpt match { - case Some(previousGraph) => - previousGraph - .map { - case EdgeWithDecayedWeights(srcId, destId, decayedSums) => - val ts = decayedSums.lastUpdatedTimestamp.toDouble / 1000 - val map = decayedSums.halfLifeInDaysToDecayedSums.map { - case (halfLifeInDays, value) => - halfLifeInDays -> DecayedValue.build(value, ts, halfLifeInDays.days.inSeconds) - }.toMap - ((srcId, destId), map) - } - .outerJoin(getGraphFromNewFavsOnly(newFavs, halfLivesInDays, endTime)) - .toTypedPipe - case None => - getGraphFromNewFavsOnly(newFavs, halfLivesInDays, endTime).toTypedPipe - .map { - case ((srcId, destId), scoreMap) => - ((srcId, destId), (None, Some(scoreMap))) - } - } - - unprunedOuterJoinedGraph - .flatMap { - case ((srcId, destId), (previousScoreMapOpt, newScoreMapOpt)) => - val latestTimeDecayedValues = halfLivesInDays.map { hlInDays => - hlInDays -> DecayedValue.build(0, endTime.inSeconds, hlInDays.days.inSeconds) - }.toMap - - val updatedDecayedValues = - Monoid.sum( - List(previousScoreMapOpt, newScoreMapOpt, Some(latestTimeDecayedValues)).flatten) - - (previousScoreMapOpt, newScoreMapOpt) match { - case (Some(pm), None) => numOldEdges.inc() - case (None, Some(nm)) => numNewEdges.inc() - case (Some(pm), Some(nm)) => numCommonEdges.inc() - } - - val prunedMap = updatedDecayedValues.flatMap { - case (hlInDays, decayedValue) => - if (decayedValue.value < minScoreToKeep) { - if (decayedValue.value < 0) { - negativeScoresCounter.inc() - } - prunedScoresCounter.inc() - None - } else { - keptScoresCounter.inc() - Some((hlInDays, decayedValue.value)) - } - } - - if (prunedMap.nonEmpty) { - keptEdgesCounter.inc() - Some(EdgeWithDecayedWeights(srcId, destId, DecayedSums(endTime.inMillis, prunedMap))) - } else { - prunedEdgesCounter.inc() - None - } - } - } - - def removeDeactivedOrSuspendedUsers( - full: TypedPipe[EdgeWithDecayedWeights], - userSource: TypedPipe[FlatUser] - )( - implicit uniqueID: UniqueID - ): TypedPipe[EdgeWithDecayedWeights] = { - val numValidUsers = Stat("num_valid_users") - val numInvalidUsers = Stat("num_invalid_users") - val numEdgesBeforeUsersourceJoin = Stat("num_edges_before_join_with_usersource") - val numEdgesWithValidSource = Stat("num_edges_with_valid_source") - val numEdgesWithValidSourceAndDest = Stat("num_edges_with_valid_source_and_dest") - - val validUsers = userSource.flatMap { - case flatUser - if !flatUser.deactivated.contains(true) && !flatUser.suspended.contains(true) - && flatUser.id.nonEmpty => - 
numValidUsers.inc() - flatUser.id - case _ => - numInvalidUsers.inc() - None - }.forceToDisk // avoid reading in the whole of userSource for both of the joins below - - val toJoin = full.map { edge => - numEdgesBeforeUsersourceJoin.inc() - (edge.sourceId, edge) - } - - toJoin - .join(validUsers.asKeys) - .map { - case (_, (edge, _)) => - numEdgesWithValidSource.inc() - (edge.destinationId, edge) - } - .join(validUsers.asKeys) - .map { - case (_, (edge, _)) => - numEdgesWithValidSourceAndDest.inc() - edge - } - } -} - -/** - * ./bazel bundle src/scala/com/twitter/simclusters_v2/scalding:fav_graph_adhoc && \ - * oscar hdfs --user frigate --host hadoopnest1.atla.twitter.com --bundle fav_graph_adhoc \ - * --tool com.twitter.simclusters_v2.scalding.UserUserFavGraphAdhoc --screen --screen-detached \ - * --tee logs/userUserFavGraphAdhoc_20170101 -- --date 2017-01-01 --halfLivesInDays 14 50 100 \ - * --outputDir /user/frigate/your_ldap/userUserFavGraphAdhoc_20170101_hl14_50_100 - * - * ./bazel bundle src/scala/com/twitter/simclusters_v2/scalding:fav_graph_adhoc && \ - * oscar hdfs --user frigate --host hadoopnest1.atla.twitter.com --bundle fav_graph_adhoc \ - * --tool com.twitter.simclusters_v2.scalding.UserUserFavGraphAdhoc --screen --screen-detached \ - * --tee logs/userUserFavGraphAdhoc_20170102_addPrevious20170101 -- --date 2017-01-02 \ - * --previousGraphDir /user/frigate/your_ldap/userUserFavGraphAdhoc_20170101_hl14_50_100 \ - * --halfLivesInDays 14 50 100 \ - * --outputDir /user/frigate/your_ldap/userUserFavGraphAdhoc_20170102_addPrevious20170101_hl14_50_100 - */ -object UserUserFavGraphAdhoc extends TwitterExecutionApp { - implicit val tz: java.util.TimeZone = DateOps.UTC - implicit val dp = DateParser.default - val log = Logger() - - def job: Execution[Unit] = - Execution.getConfigMode.flatMap { - case (config, mode) => - Execution.withId { implicit uniqueId => - val args = config.getArgs - val previousGraphOpt = args.optional("previousGraphDir").map { dir => - TypedPipe.from(EdgeWithDecayedWtsFixedPathSource(dir)) - } - val favsDateRange = DateRange.parse(args.list("date")) - val halfLives = args.list("halfLivesInDays").map(_.toInt) - val minScoreToKeep = args.double("minScoreToKeep", 1e-5) - val outputDir = args("outputDir") - Util.printCounters( - UserUserFavGraph - .getSummedFavGraph(previousGraphOpt, favsDateRange, halfLives, minScoreToKeep) - .writeExecution(EdgeWithDecayedWtsFixedPathSource(outputDir)) - ) - } - } -} - -/** - * $ capesospy-v2 update --start_cron fav_graph src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml - */ -object UserUserFavGraphBatch extends TwitterScheduledExecutionApp { - private val firstTime: String = "2017-01-01" - implicit val tz = DateOps.UTC - implicit val parser = DateParser.default - private val batchIncrement: Duration = Days(2) - private val firstStartDate = DateRange.parse(firstTime).start - - val outputPath: String = "/user/cassowary/processed/user_user_fav_graph" - val log = Logger() - - private val execArgs = AnalyticsBatchExecutionArgs( - batchDesc = BatchDescription(this.getClass.getName), - firstTime = BatchFirstTime(RichDate(firstTime)), - lastTime = None, - batchIncrement = BatchIncrement(batchIncrement) - ) - - override def scheduledJob: Execution[Unit] = AnalyticsBatchExecution(execArgs) { dateRange => - Execution.withId { implicit uniqueId => - Execution.withArgs { args => - val previousGraph = if (dateRange.start.timestamp == firstStartDate.timestamp) { - log.info("Looks like this is the first time, setting 
previousGraph to None")
-          None
-        } else {
-          Some(
-            DAL
-              .readMostRecentSnapshot(UserUserFavGraphScalaDataset, dateRange - batchIncrement)
-              .toTypedPipe
-          )
-        }
-        val halfLives = args.list("halfLivesInDays").map(_.toInt)
-        val minScoreToKeep = args.double("minScoreToKeep", 1e-5)
-        Util.printCounters(
-          UserUserFavGraph
-            .getSummedFavGraph(previousGraph, dateRange, halfLives, minScoreToKeep)
-            .writeDALSnapshotExecution(
-              UserUserFavGraphScalaDataset,
-              D.Daily,
-              D.Suffix(outputPath),
-              D.EBLzo(),
-              dateRange.end)
-        )
-      }
-    }
-  }
-}
-
-object DumpFavGraphAdhoc extends TwitterExecutionApp {
-  implicit val tz: java.util.TimeZone = DateOps.UTC
-
-  def job: Execution[Unit] =
-    Execution.getConfigMode.flatMap {
-      case (config, mode) =>
-        Execution.withId { implicit uniqueId =>
-          val favGraph = DAL
-            .readMostRecentSnapshotNoOlderThan(UserUserFavGraphScalaDataset, Days(10))
-            .withRemoteReadPolicy(ExplicitLocation(ProcAtla))
-            .toTypedPipe
-            .collect {
-              case edge if edge.weights.halfLifeInDaysToDecayedSums.contains(100) =>
-                (edge.sourceId, edge.destinationId, edge.weights.halfLifeInDaysToDecayedSums(100))
-            }
-
-          Execution
-            .sequence(
-              Seq(
-                Util.printSummaryOfNumericColumn(
-                  favGraph.map(_._3),
-                  Some("Weight")
-                ),
-                Util.printSummaryOfNumericColumn(
-                  favGraph.map(c => math.log10(10.0 + c._3)),
-                  Some("Weight_Log_P10")
-                ),
-                Util.printSummaryOfNumericColumn(
-                  favGraph.map(c => math.log10(1.0 + c._3)),
-                  Some("Weight_Log_P1")
-                ),
-                Util.printSummaryOfCategoricalColumn(favGraph.map(_._1), Some("SourceId")),
-                Util.printSummaryOfCategoricalColumn(favGraph.map(_._2), Some("DestId"))
-              )
-            ).flatMap { summarySeq =>
-              println(summarySeq.mkString("\n"))
-              Execution.unit
-            }
-        }
-    }
-}
diff --git a/src/scala/com/twitter/simclusters_v2/scalding/UserUserGraph.docx b/src/scala/com/twitter/simclusters_v2/scalding/UserUserGraph.docx
new file mode 100644
index 000000000..ce4463085
Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/UserUserGraph.docx differ
diff --git a/src/scala/com/twitter/simclusters_v2/scalding/UserUserGraph.scala b/src/scala/com/twitter/simclusters_v2/scalding/UserUserGraph.scala
deleted file mode 100644
index bdb1004c7..000000000
--- a/src/scala/com/twitter/simclusters_v2/scalding/UserUserGraph.scala
+++ /dev/null
@@ -1,180 +0,0 @@
-package com.twitter.simclusters_v2.scalding
-
-import com.twitter.scalding._
-import com.twitter.scalding_internal.dalv2.DALWrite.{D, WriteExtension}
-import com.twitter.scalding_internal.job.analytics_batch.{
-  AnalyticsBatchExecution,
-  AnalyticsBatchExecutionArgs,
-  BatchDescription,
-  BatchFirstTime,
-  BatchIncrement,
-  TwitterScheduledExecutionApp
-}
-import com.twitter.simclusters_v2.scalding.common.Util
-import com.twitter.simclusters_v2.hdfs_sources.{
-  UserAndNeighborsFixedPathSource,
-  UserUserGraphScalaDataset
-}
-import com.twitter.simclusters_v2.thriftscala.{NeighborWithWeights, UserAndNeighbors}
-import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp
-import java.util.TimeZone
-
-/**
- * This is a scheduled version of the user_user_normalized_graph dataset generation job.
- *
- * The key difference in this implementation is that we do not read the ProducerNormsAndCounts dataset.
- * So we no longer store the following producer normalized scores for the edges in the NeighborWithWeights thrift:
- * followScoreNormalizedByNeighborFollowersL2, favScoreHalfLife100DaysNormalizedByNeighborFaversL2 and logFavScoreL2Normalized.
- *
- */
-object UserUserGraph {
-
-  def getNeighborWithWeights(
-    inputEdge: Edge
-  ): NeighborWithWeights = {
-    val logFavScore = UserUserNormalizedGraph.logTransformation(inputEdge.favWeight)
-    NeighborWithWeights(
-      neighborId = inputEdge.destId,
-      isFollowed = Some(inputEdge.isFollowEdge),
-      favScoreHalfLife100Days = Some(inputEdge.favWeight),
-      logFavScore = Some(logFavScore),
-    )
-  }
-
-  def addWeightsAndAdjListify(
-    input: TypedPipe[Edge],
-    maxNeighborsPerUser: Int
-  )(
-    implicit uniqueId: UniqueID
-  ): TypedPipe[UserAndNeighbors] = {
-    val numUsersNeedingNeighborTruncation = Stat("num_users_needing_neighbor_truncation")
-    val numEdgesAfterTruncation = Stat("num_edges_after_truncation")
-    val numEdgesBeforeTruncation = Stat("num_edges_before_truncation")
-    val numFollowEdgesBeforeTruncation = Stat("num_follow_edges_before_truncation")
-    val numFavEdgesBeforeTruncation = Stat("num_fav_edges_before_truncation")
-    val numFollowEdgesAfterTruncation = Stat("num_follow_edges_after_truncation")
-    val numFavEdgesAfterTruncation = Stat("num_fav_edges_after_truncation")
-    val numRecordsInOutputGraph = Stat("num_records_in_output_graph")
-
-    input
-      .map { edge =>
-        numEdgesBeforeTruncation.inc()
-        if (edge.isFollowEdge) numFollowEdgesBeforeTruncation.inc()
-        if (edge.favWeight > 0) numFavEdgesBeforeTruncation.inc()
-        (edge.srcId, getNeighborWithWeights(edge))
-      }
-      .group
-      // .withReducers(10000)
-      .sortedReverseTake(maxNeighborsPerUser)(Ordering.by { x: NeighborWithWeights =>
-        x.favScoreHalfLife100Days.getOrElse(0.0)
-      })
-      .map {
-        case (srcId, neighborList) =>
-          if (neighborList.size >= maxNeighborsPerUser) numUsersNeedingNeighborTruncation.inc()
-          neighborList.foreach { neighbor =>
-            numEdgesAfterTruncation.inc()
-            if (neighbor.favScoreHalfLife100Days.exists(_ > 0)) numFavEdgesAfterTruncation.inc()
-            if (neighbor.isFollowed.contains(true)) numFollowEdgesAfterTruncation.inc()
-          }
-          numRecordsInOutputGraph.inc()
-          UserAndNeighbors(srcId, neighborList)
-      }
-  }
-
-  def run(
-    followEdges: TypedPipe[(Long, Long)],
-    favEdges: TypedPipe[(Long, Long, Double)],
-    maxNeighborsPerUser: Int
-  )(
-    implicit uniqueID: UniqueID
-  ): TypedPipe[UserAndNeighbors] = {
-    val combined = UserUserNormalizedGraph.combineFollowAndFav(followEdges, favEdges)
-    addWeightsAndAdjListify(
-      combined,
-      maxNeighborsPerUser
-    )
-  }
-}
-
-/**
- *
- * capesospy-v2 update --build_locally --start_cron user_user_follow_fav_graph \
- * src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml
- */
-
-object UserUserGraphBatch extends TwitterScheduledExecutionApp {
-  private val firstTime: String = "2021-04-24"
-  implicit val tz = DateOps.UTC
-  implicit val parser = DateParser.default
-  private val batchIncrement: Duration = Days(2)
-  private val halfLifeInDaysForFavScore = 100
-
-  private val outputPath: String = "/user/cassowary/processed/user_user_graph"
-
-  private val execArgs = AnalyticsBatchExecutionArgs(
-    batchDesc = BatchDescription(this.getClass.getName.replace("$", "")),
-    firstTime = BatchFirstTime(RichDate(firstTime)),
-    lastTime = None,
-    batchIncrement = BatchIncrement(batchIncrement)
-  )
-
-  override def scheduledJob: Execution[Unit] = AnalyticsBatchExecution(execArgs) {
-    implicit dateRange =>
-      Execution.withId { implicit uniqueId =>
-        Execution.withArgs { args =>
-          val maxNeighborsPerUser = args.int("maxNeighborsPerUser", 2000)
-
-          Util.printCounters(
-            UserUserGraph
-              .run(
-                UserUserNormalizedGraph.getFollowEdges,
-                UserUserNormalizedGraph.getFavEdges(halfLifeInDaysForFavScore),
-                maxNeighborsPerUser
-              )
-              .writeDALSnapshotExecution(
-                UserUserGraphScalaDataset,
-                D.Daily,
-                D.Suffix(outputPath),
-                D.EBLzo(),
-                dateRange.end)
-          )
-        }
-      }
-    }
-}
-
-/**
-./bazel bundle src/scala/com/twitter/simclusters_v2/scalding:user_user_graph-adhoc
-scalding remote run \
---user cassowary \
---keytab /var/lib/tss/keys/fluffy/keytabs/client/cassowary.keytab \
---principal service_account@TWITTER.BIZ \
---cluster bluebird-qus1 \
---main-class com.twitter.simclusters_v2.scalding.UserUserGraphAdhoc \
---target src/scala/com/twitter/simclusters_v2/scalding:user_user_graph-adhoc \
--- --date 2021-04-24 --outputDir "/user/cassowary/adhoc/user_user_graph_adhoc"
- */
-object UserUserGraphAdhoc extends AdhocExecutionApp {
-  override def runOnDateRange(
-    args: Args
-  )(
-    implicit dateRange: DateRange,
-    timeZone: TimeZone,
-    uniqueID: UniqueID
-  ): Execution[Unit] = {
-    val maxNeighborsPerUser = args.int("maxNeighborsPerUser", 2000)
-    val halfLifeInDaysForFavScore = 100
-    val outputDir = args("outputDir")
-    val userAndNeighbors =
-      UserUserGraph
-        .run(
-          UserUserNormalizedGraph.getFollowEdges,
-          UserUserNormalizedGraph.getFavEdges(halfLifeInDaysForFavScore),
-          maxNeighborsPerUser)
-
-    Execution
-      .zip(
-        userAndNeighbors.writeExecution(UserAndNeighborsFixedPathSource(outputDir)),
-        userAndNeighbors.writeExecution(TypedTsv(outputDir + "_tsv"))).unit
-  }
-}
diff --git a/src/scala/com/twitter/simclusters_v2/scalding/UserUserNormalizedGraph.docx b/src/scala/com/twitter/simclusters_v2/scalding/UserUserNormalizedGraph.docx
new file mode 100644
index 000000000..58087b8f8
Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/UserUserNormalizedGraph.docx differ
diff --git a/src/scala/com/twitter/simclusters_v2/scalding/UserUserNormalizedGraph.scala b/src/scala/com/twitter/simclusters_v2/scalding/UserUserNormalizedGraph.scala
deleted file mode 100644
index 62d878fc6..000000000
--- a/src/scala/com/twitter/simclusters_v2/scalding/UserUserNormalizedGraph.scala
+++ /dev/null
@@ -1,453 +0,0 @@
-package com.twitter.simclusters_v2.scalding
-
-import com.twitter.bijection.Injection
-import com.twitter.frigate.user_sampler.common.EmployeeIds
-import com.twitter.hashing.KeyHasher
-import com.twitter.logging.Logger
-import com.twitter.scalding._
-import com.twitter.scalding_internal.dalv2.DAL
-import com.twitter.scalding_internal.dalv2.DALWrite._
-import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation
-import com.twitter.scalding_internal.dalv2.remote_access.ProcAtla
-import com.twitter.scalding_internal.job.TwitterExecutionApp
-import com.twitter.scalding_internal.job.analytics_batch.AnalyticsBatchExecution
-import com.twitter.scalding_internal.job.analytics_batch.AnalyticsBatchExecutionArgs
-import com.twitter.scalding_internal.job.analytics_batch.BatchDescription
-import com.twitter.scalding_internal.job.analytics_batch.BatchFirstTime
-import com.twitter.scalding_internal.job.analytics_batch.BatchIncrement
-import com.twitter.scalding_internal.job.analytics_batch.TwitterScheduledExecutionApp
-import com.twitter.simclusters_v2.hdfs_sources._
-import com.twitter.simclusters_v2.scalding.common.TypedRichPipe._
-import com.twitter.simclusters_v2.scalding.common.Util
-import com.twitter.simclusters_v2.thriftscala.EdgeWithDecayedWeights
-import com.twitter.simclusters_v2.thriftscala.NeighborWithWeights
-import com.twitter.simclusters_v2.thriftscala.NormsAndCounts
-import com.twitter.simclusters_v2.thriftscala.UserAndNeighbors
-import com.twitter.usersource.snapshot.flat.UsersourceFlatScalaDataset
-import flockdb_tools.datasets.flock.FlockFollowsEdgesScalaDataset
-
-case class Edge(srcId: Long, destId: Long, isFollowEdge: Boolean, favWeight: Double)
-
-object UserUserNormalizedGraph {
-
-  // The common function for applying logarithmic transformation
-  def logTransformation(weight: Double): Double = {
-    math.max(math.log10(1.0 + weight), 0.0)
-  }
-
-  def getFollowEdges(implicit dateRange: DateRange, uniqueID: UniqueID): TypedPipe[(Long, Long)] = {
-    val numInputFollowEdges = Stat("num_input_follow_edges")
-    DAL
-      .readMostRecentSnapshot(FlockFollowsEdgesScalaDataset)
-      .toTypedPipe
-      .collect {
-        case edge if edge.state == 0 =>
-          numInputFollowEdges.inc()
-          (edge.sourceId, edge.destinationId)
-      }
-  }
-
-  def transformFavEdges(
-    input: TypedPipe[EdgeWithDecayedWeights],
-    halfLifeInDaysForFavScore: Int
-  )(
-    implicit uniqueID: UniqueID
-  ): TypedPipe[(Long, Long, Double)] = {
-    val numEdgesWithSpecifiedHalfLife = Stat(
-      s"num_edges_with_specified_half_life_${halfLifeInDaysForFavScore}_days")
-    val numEdgesWithoutSpecifiedHalfLife = Stat(
-      s"num_edges_without_specified_half_life_${halfLifeInDaysForFavScore}_days")
-    input
-      .flatMap { edge =>
-        if (edge.weights.halfLifeInDaysToDecayedSums.contains(halfLifeInDaysForFavScore)) {
-          numEdgesWithSpecifiedHalfLife.inc()
-          // Emit the decayed sum for the requested half-life.
-          Some(
-            (
-              edge.sourceId,
-              edge.destinationId,
-              edge.weights.halfLifeInDaysToDecayedSums(halfLifeInDaysForFavScore)))
-        } else {
-          numEdgesWithoutSpecifiedHalfLife.inc()
-          None
-        }
-      }
-  }
-
-  def getFavEdges(
-    halfLifeInDaysForFavScore: Int
-  )(
-    implicit dateRange: DateRange,
-    uniqueID: UniqueID
-  ): TypedPipe[(Long, Long, Double)] = {
-    implicit val tz: java.util.TimeZone = DateOps.UTC
-    transformFavEdges(
-      DAL
-        .readMostRecentSnapshot(UserUserFavGraphScalaDataset)
-        .withRemoteReadPolicy(ExplicitLocation(ProcAtla))
-        .toTypedPipe,
-      halfLifeInDaysForFavScore
-    )
-  }
-
-  def getNeighborWithWeights(
-    inputEdge: Edge,
-    followerL2NormOfDest: Double,
-    faverL2NormOfDest: Double,
-    logFavL2Norm: Double
-  ): NeighborWithWeights = {
-    val normalizedFollowScore = {
-      val numerator = if (inputEdge.isFollowEdge) 1.0 else 0.0
-      if (followerL2NormOfDest > 0) numerator / followerL2NormOfDest else 0.0
-    }
-    val normalizedFavScore =
-      if (faverL2NormOfDest > 0) inputEdge.favWeight / faverL2NormOfDest else 0.0
-    val logFavScore = if (inputEdge.favWeight > 0) logTransformation(inputEdge.favWeight) else 0.0
-    val logFavScoreL2Normalized = if (logFavL2Norm > 0) logFavScore / logFavL2Norm else 0.0
-    NeighborWithWeights(
-      inputEdge.destId,
-      Some(inputEdge.isFollowEdge),
-      Some(normalizedFollowScore),
-      Some(inputEdge.favWeight),
-      Some(normalizedFavScore),
-      logFavScore = Some(logFavScore),
-      logFavScoreL2Normalized = Some(logFavScoreL2Normalized)
-    )
-  }
-
-  def addNormalizedWeightsAndAdjListify(
-    input: TypedPipe[Edge],
-    maxNeighborsPerUser: Int,
-    normsAndCountsFull: TypedPipe[NormsAndCounts]
-  )(
-    implicit uniqueId: UniqueID
-  ): TypedPipe[UserAndNeighbors] = {
-    val numUsersNeedingNeighborTruncation = Stat("num_users_needing_neighbor_truncation")
-    val numEdgesAfterTruncation = Stat("num_edges_after_truncation")
-    val numEdgesBeforeTruncation = Stat("num_edges_before_truncation")
-    val numFollowEdgesBeforeTruncation = Stat("num_follow_edges_before_truncation")
-    val numFavEdgesBeforeTruncation = 
Stat("num_fav_edges_before_truncation") - val numFollowEdgesAfterTruncation = Stat("num_follow_edges_after_truncation") - val numFavEdgesAfterTruncation = Stat("num_fav_edges_after_truncation") - val numRecordsInOutputGraph = Stat("num_records_in_output_graph") - - val norms = normsAndCountsFull.map { record => - ( - record.userId, - ( - record.followerL2Norm.getOrElse(0.0), - record.faverL2Norm.getOrElse(0.0), - record.logFavL2Norm.getOrElse(0.0))) - } - - implicit val l2b: Long => Array[Byte] = Injection.long2BigEndian - input - .map { edge => (edge.destId, edge) } - .sketch(reducers = 2000) - .join(norms) - .map { - case (destId, (edge, (followNorm, favNorm, logFavNorm))) => - numEdgesBeforeTruncation.inc() - if (edge.isFollowEdge) numFollowEdgesBeforeTruncation.inc() - if (edge.favWeight > 0) numFavEdgesBeforeTruncation.inc() - (edge.srcId, getNeighborWithWeights(edge, followNorm, favNorm, logFavNorm)) - } - .group - //.withReducers(1000) - .sortedReverseTake(maxNeighborsPerUser)(Ordering.by { x: NeighborWithWeights => - ( - x.favScoreHalfLife100Days.getOrElse(0.0), - x.followScoreNormalizedByNeighborFollowersL2.getOrElse(0.0) - ) - }) - .map { - case (srcId, neighborList) => - if (neighborList.size >= maxNeighborsPerUser) numUsersNeedingNeighborTruncation.inc() - neighborList.foreach { neighbor => - numEdgesAfterTruncation.inc() - if (neighbor.favScoreHalfLife100Days.exists(_ > 0)) numFavEdgesAfterTruncation.inc() - if (neighbor.isFollowed.contains(true)) numFollowEdgesAfterTruncation.inc() - } - numRecordsInOutputGraph.inc() - UserAndNeighbors(srcId, neighborList) - } - } - - def combineFollowAndFav( - followEdges: TypedPipe[(Long, Long)], - favEdges: TypedPipe[(Long, Long, Double)] - ): TypedPipe[Edge] = { - ( - followEdges.map { case (src, dest) => ((src, dest), (1, 0.0)) } ++ - favEdges.map { case (src, dest, wt) => ((src, dest), (0, wt)) } - ).sumByKey - //.withReducers(2500) - .map { - case ((src, dest), (follow, favWt)) => - Edge(src, dest, isFollowEdge = follow > 0, favWt) - } - } - - def run( - followEdges: TypedPipe[(Long, Long)], - favEdges: TypedPipe[(Long, Long, Double)], - normsAndCounts: TypedPipe[NormsAndCounts], - maxNeighborsPerUser: Int - )( - implicit uniqueID: UniqueID - ): TypedPipe[UserAndNeighbors] = { - val combined = combineFollowAndFav(followEdges, favEdges) - addNormalizedWeightsAndAdjListify( - combined, - maxNeighborsPerUser, - normsAndCounts - ) - } -} - -object UserUserNormalizedGraphBatch extends TwitterScheduledExecutionApp { - private val firstTime: String = "2018-06-16" - implicit val tz = DateOps.UTC - implicit val parser = DateParser.default - private val batchIncrement: Duration = Days(7) - private val halfLifeInDaysForFavScore = 100 - - private val outputPath: String = "/user/cassowary/processed/user_user_normalized_graph" - - private val execArgs = AnalyticsBatchExecutionArgs( - batchDesc = BatchDescription(this.getClass.getName.replace("$", "")), - firstTime = BatchFirstTime(RichDate(firstTime)), - lastTime = None, - batchIncrement = BatchIncrement(batchIncrement) - ) - - override def scheduledJob: Execution[Unit] = AnalyticsBatchExecution(execArgs) { - implicit dateRange => - Execution.withId { implicit uniqueId => - Execution.withArgs { args => - val maxNeighborsPerUser = args.int("maxNeighborsPerUser", 2000) - - val producerNormsAndCounts = - DAL.readMostRecentSnapshot(ProducerNormsAndCountsScalaDataset).toTypedPipe - - Util.printCounters( - UserUserNormalizedGraph - .run( - UserUserNormalizedGraph.getFollowEdges, - 
UserUserNormalizedGraph.getFavEdges(halfLifeInDaysForFavScore), - producerNormsAndCounts, - maxNeighborsPerUser - ) - .writeDALSnapshotExecution( - UserUserNormalizedGraphScalaDataset, - D.Daily, - D.Suffix(outputPath), - D.EBLzo(), - dateRange.end) - ) - } - } - } -} - -object UserUserNormalizedGraphAdhoc extends TwitterExecutionApp { - implicit val tz: java.util.TimeZone = DateOps.UTC - implicit val dp = DateParser.default - val log = Logger() - - def hashToLong(input: Long): Long = { - val bb = java.nio.ByteBuffer.allocate(8) - bb.putLong(input) - Math.abs(KeyHasher.KETAMA.hashKey(bb.array())) - } - - def job: Execution[Unit] = - Execution.getConfigMode.flatMap { - case (config, mode) => - Execution.withId { implicit uniqueId => - val args = config.getArgs - implicit val dateRange: DateRange = DateRange.parse(args.list("date")) - val halfLifeInDaysForFavScore = 100 - val maxNeighborsPerUser = args.int("maxNeighborsPerUser", 2000) - val producerNormsAndCounts = TypedPipe.from( - NormsAndCountsFixedPathSource(args("normsInputDir")) - ) - val favEdges = args.optional("favGraphInputDir") match { - case Some(favGraphInputDir) => - UserUserNormalizedGraph.transformFavEdges( - TypedPipe.from( - EdgeWithDecayedWtsFixedPathSource(favGraphInputDir) - ), - halfLifeInDaysForFavScore - ) - case None => - UserUserNormalizedGraph.getFavEdges(halfLifeInDaysForFavScore) - } - - val followEdges = UserUserNormalizedGraph.getFollowEdges - - Util.printCounters( - UserUserNormalizedGraph - .run( - followEdges, - favEdges, - producerNormsAndCounts, - maxNeighborsPerUser - ).writeExecution(UserAndNeighborsFixedPathSource(args("outputDir"))) - ) - } - } -} - -object DumpUserUserGraphAdhoc extends TwitterExecutionApp { - implicit val tz: java.util.TimeZone = DateOps.UTC - def job: Execution[Unit] = - Execution.getConfigMode.flatMap { - case (config, mode) => - Execution.withId { implicit uniqueId => - val args = config.getArgs - val input = args.optional("inputDir") match { - case Some(inputDir) => TypedPipe.from(UserAndNeighborsFixedPathSource(inputDir)) - case None => - DAL - .readMostRecentSnapshotNoOlderThan(UserUserNormalizedGraphScalaDataset, Days(30)) - .withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .toTypedPipe - } - val users = args.list("users").map(_.toLong).toSet - if (users.isEmpty) { - input.printSummary("Producer norms and counts") - } else { - input - .collect { - case rec if users.contains(rec.userId) => - (Seq(rec.userId.toString) ++ rec.neighbors.map { n => - Util.prettyJsonMapper.writeValueAsString(n).replaceAll("\n", " ") - }).mkString("\n") - } - .toIterableExecution - .map { strings => println(strings.mkString("\n")) } - } - } - } -} - -/* - * ./bazel bundle src/scala/com/twitter/simclusters_v2/scalding:user_user_normalized_graph && \ - * oscar hdfs --host hadoopnest2.atla.twitter.com --bundle user_user_normalized_graph \ - * --tool com.twitter.simclusters_v2.scalding.EmployeeGraph --screen --screen-detached \ - * --tee your_ldap/employeeGraph20190809 -- --outputDir adhoc/employeeGraph20190809 - */ -object EmployeeGraph extends TwitterExecutionApp { - implicit val tz: java.util.TimeZone = DateOps.UTC - def job: Execution[Unit] = - Execution.getConfigMode.flatMap { - case (config, mode) => - Execution.withId { implicit uniqueId => - val args = config.getArgs - val input = args.optional("inputDir") match { - case Some(inputDir) => TypedPipe.from(UserAndNeighborsFixedPathSource(inputDir)) - case None => - DAL - .readMostRecentSnapshotNoOlderThan(UserUserNormalizedGraphScalaDataset, 
Days(30)) - .withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .toTypedPipe - } - val employeeIds = EmployeeIds.buildMerlinClientAndGetEmployees("frigate-scalding.dev") - input - .collect { - case rec if employeeIds.contains(rec.userId) => - rec.neighbors.collect { - case n if employeeIds.contains(n.neighborId) => - ( - rec.userId, - n.neighborId, - n.favScoreHalfLife100Days.getOrElse(0), - n.isFollowed.getOrElse(false)) - } - } - .flatten - .writeExecution(TypedTsv(args("outputDir"))) - - } - } -} -/* - * scalding remote run --target src/scala/com/twitter/simclusters_v2/scalding:employee_graph_from_user_user - * --main-class com.twitter.simclusters_v2.scalding.EmployeeGraphFromUserUser - * --submitter hadoopnest2.atla.twitter.com --user recos-platform -- --graphOutputDir "/user/recos-platform/adhoc/employee_graph_from_user_user/" - */ - -object EmployeeGraphFromUserUser extends TwitterExecutionApp { - implicit val tz: java.util.TimeZone = DateOps.UTC - def job: Execution[Unit] = - Execution.getConfigMode.flatMap { - case (config, mode) => - Execution.withId { implicit uniqueId => - val args = config.getArgs - val graphOutputDir = args("graphOutputDir") - val input = args.optional("inputDir") match { - case Some(inputDir) => TypedPipe.from(UserAndNeighborsFixedPathSource(inputDir)) - case None => - DAL - .readMostRecentSnapshotNoOlderThan(UserUserNormalizedGraphScalaDataset, Days(30)) - .withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .toTypedPipe - } - val employeeIds = EmployeeIds.buildMerlinClientAndGetEmployees("frigate-scalding.dev") - input - .collect { - case rec if employeeIds.contains(rec.userId) => - rec - } - .writeExecution(UserAndNeighborsFixedPathSource(graphOutputDir)) - - } - } -} - -/* - * ./bazel bundle src/scala/com/twitter/simclusters_v2/scalding:user_user_normalized_graph && \ - * oscar hdfs --host hadoopnest2.atla.twitter.com --bundle user_user_normalized_graph \ - * --tool com.twitter.simclusters_v2.scalding.VitGraph --screen --screen-detached \ - * --tee your_ldap/vitGraph20190809 -- --outputDir adhoc/vitGraph20190809 - */ -object VitGraph extends TwitterExecutionApp { - implicit val tz: java.util.TimeZone = DateOps.UTC - def job: Execution[Unit] = - Execution.getConfigMode.flatMap { - case (config, mode) => - Execution.withId { implicit uniqueId => - val args = config.getArgs - val minActiveFollowers = args.int("minActiveFollowers") - val topK = args.int("topK") - val input = args.optional("inputDir") match { - case Some(inputDir) => TypedPipe.from(UserAndNeighborsFixedPathSource(inputDir)) - case None => - DAL - .readMostRecentSnapshotNoOlderThan(UserUserNormalizedGraphScalaDataset, Days(30)) - .withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .toTypedPipe - } - val userSource = - DAL.readMostRecentSnapshotNoOlderThan(UsersourceFlatScalaDataset, Days(30)).toTypedPipe - - TopUsersSimilarityGraph - .vits(userSource, minActiveFollowers, topK).toIterableExecution.flatMap { vitsIter => - val vits = vitsIter.toSet - println(s"Found ${vits.size} many vits. 
First few: " + vits.take(5).mkString(",")) - input - .collect { - case rec if vits.contains(rec.userId) => - rec.neighbors.collect { - case n if vits.contains(n.neighborId) => - ( - rec.userId, - n.neighborId, - n.favScoreHalfLife100Days.getOrElse(0), - n.isFollowed.getOrElse(false)) - } - } - .flatten - .writeExecution(TypedTsv(args("outputDir"))) - } - - } - } - -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/common/BUILD b/src/scala/com/twitter/simclusters_v2/scalding/common/BUILD deleted file mode 100644 index cbb6e14c0..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/common/BUILD +++ /dev/null @@ -1,14 +0,0 @@ -scala_library( - sources = ["*.scala"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/fasterxml/jackson:jackson-module-scala", - "3rdparty/jvm/com/fasterxml/jackson/core:jackson-core", - "3rdparty/jvm/com/fasterxml/jackson/core:jackson-databind", - "3rdparty/jvm/com/fasterxml/jackson/module:jackson-module-scala", - "src/scala/com/twitter/scalding_internal/job/analytics_batch", - "src/scala/com/twitter/simclusters_v2/common", - "strato/src/main/scala/com/twitter/strato/scalding", - ], -) diff --git a/src/scala/com/twitter/simclusters_v2/scalding/common/BUILD.docx b/src/scala/com/twitter/simclusters_v2/scalding/common/BUILD.docx new file mode 100644 index 000000000..43171b703 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/common/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/common/PersistentTweetEmbeddingSource.docx b/src/scala/com/twitter/simclusters_v2/scalding/common/PersistentTweetEmbeddingSource.docx new file mode 100644 index 000000000..e00d9f5cc Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/common/PersistentTweetEmbeddingSource.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/common/PersistentTweetEmbeddingSource.scala b/src/scala/com/twitter/simclusters_v2/scalding/common/PersistentTweetEmbeddingSource.scala deleted file mode 100644 index 355144aa4..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/common/PersistentTweetEmbeddingSource.scala +++ /dev/null @@ -1,60 +0,0 @@ -package com.twitter.simclusters_v2.scalding.common - -import com.twitter.finagle.mtls.authentication.ServiceIdentifier -import com.twitter.scalding.DateRange -import com.twitter.simclusters_v2.common.Timestamp -import com.twitter.simclusters_v2.common.TweetId -import com.twitter.simclusters_v2.thriftscala.PersistentSimClustersEmbedding -import com.twitter.strato.scalding.StratoManhattanExportSource -import com.twitter.strato.thrift.ScroogeConvImplicits._ - -object PersistentTweetEmbeddingSource { - // hdfs paths - val FavBasedUpdatedHdfsPath: String = - "/atla/proc/user/cassowary/manhattan-exporter/fav_based_tweet_20m_145k_updated_embeddings" - - val LogFavBasedUpdatedHdfsPath: String = - "/atla/proc/user/cassowary/manhattan-exporter/log_fav_based_tweet_20m_145k_updated_embeddings" - - val LogFavBased2020HdfsPath: String = - "/atla/proc/user/cassowary/manhattan-exporter/log_fav_based_tweet_20m_145k_2020_embeddings" - - // Strato columns - val FavBasedUpdatedStratoColumn: String = - "recommendations/simclusters_v2/embeddings/favBasedTweet20M145KUpdated" - - val LogFavBasedUpdatedStratoColumn: String = - "recommendations/simclusters_v2/embeddings/logFavBasedTweet20M145KUpdatedPersistent" - - val LogFavBased2020StratoColumn: String = - 
"recommendations/simclusters_v2/embeddings/logFavBasedTweet20M145K2020Persistent" - -} - -/** - * The source that read the Manhattan export persistent embeddings - */ -// Defaults to Updated version. -class FavBasedPersistentTweetEmbeddingMhExportSource( - hdfsPath: String = PersistentTweetEmbeddingSource.FavBasedUpdatedHdfsPath, - stratoColumnPath: String = PersistentTweetEmbeddingSource.FavBasedUpdatedStratoColumn, - range: DateRange, - serviceIdentifier: ServiceIdentifier = ServiceIdentifier.empty) - extends StratoManhattanExportSource[(TweetId, Timestamp), PersistentSimClustersEmbedding]( - hdfsPath, - range, - stratoColumnPath, - serviceIdentifier = serviceIdentifier - ) -// Defaults to 2020 version. -class LogFavBasedPersistentTweetEmbeddingMhExportSource( - hdfsPath: String = PersistentTweetEmbeddingSource.LogFavBased2020HdfsPath, - stratoColumnPath: String = PersistentTweetEmbeddingSource.LogFavBased2020StratoColumn, - range: DateRange, - serviceIdentifier: ServiceIdentifier = ServiceIdentifier.empty) - extends StratoManhattanExportSource[(TweetId, Timestamp), PersistentSimClustersEmbedding]( - hdfsPath, - range, - stratoColumnPath, - serviceIdentifier = serviceIdentifier - ) diff --git a/src/scala/com/twitter/simclusters_v2/scalding/common/QTreeMultiAggregator.docx b/src/scala/com/twitter/simclusters_v2/scalding/common/QTreeMultiAggregator.docx new file mode 100644 index 000000000..49abd92fc Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/common/QTreeMultiAggregator.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/common/QTreeMultiAggregator.scala b/src/scala/com/twitter/simclusters_v2/scalding/common/QTreeMultiAggregator.scala deleted file mode 100644 index 970eb3c8e..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/common/QTreeMultiAggregator.scala +++ /dev/null @@ -1,30 +0,0 @@ -package com.twitter.simclusters_v2.scalding.common - -import com.twitter.algebird._ - -/** - * The reason of creating this class is that we need multiple percentiles and current - * implementations need one QTree per percentile which is unnecessary. This class gets multiple - * percentiles from the same QTree. 
- */
-case class QTreeMultiAggregator[T](percentiles: Seq[Double])(implicit val num: Numeric[T])
-    extends Aggregator[T, QTree[Unit], Map[String, Double]]
-    with QTreeAggregatorLike[T] {
-
-  require(
-    percentiles.forall(p => p >= 0.0 && p <= 1.0),
-    "Each given percentile must be in the range 0.0 <= p <= 1.0"
-  )
-
-  override def percentile: Double = 0.0 // Unused, but needed to satisfy the base class
-
-  override def k: Int = QTreeAggregator.DefaultK
-
-  private def getPercentile(qt: QTree[Unit], p: Double): Double = {
-    val (lower, upper) = qt.quantileBounds(p)
-    (lower + upper) / 2
-  }
-
-  def present(qt: QTree[Unit]): Map[String, Double] =
-    percentiles.map { p => p.toString -> getPercentile(qt, p) }.toMap
-}
diff --git a/src/scala/com/twitter/simclusters_v2/scalding/common/TypedRichPipe.docx b/src/scala/com/twitter/simclusters_v2/scalding/common/TypedRichPipe.docx
new file mode 100644
index 000000000..b15668125
Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/common/TypedRichPipe.docx differ
diff --git a/src/scala/com/twitter/simclusters_v2/scalding/common/TypedRichPipe.scala b/src/scala/com/twitter/simclusters_v2/scalding/common/TypedRichPipe.scala
deleted file mode 100644
index 6e40ecf80..000000000
--- a/src/scala/com/twitter/simclusters_v2/scalding/common/TypedRichPipe.scala
+++ /dev/null
@@ -1,72 +0,0 @@
-package com.twitter.simclusters_v2.scalding.common
-
-import com.twitter.algebird._
-import com.twitter.scalding.typed.TypedPipe
-import com.twitter.scalding.{Execution, Stat, UniqueID}
-
-/**
- * A richer version of TypedPipe.
- */
-class TypedRichPipe[V](pipe: TypedPipe[V]) {
-
-  def count(counterName: String)(implicit uniqueID: UniqueID): TypedPipe[V] = {
-    val stat = Stat(counterName)
-    pipe.map { v =>
-      stat.inc()
-      v
-    }
-  }
-
-  /**
-   * Print a summary of the TypedPipe with total size and some randomly selected records
-   */
-  def getSummary(numRecords: Int = 100): Execution[Option[(Long, String)]] = {
-    val randomSample = Aggregator.reservoirSample[V](numRecords)
-
-    // more aggregators can be added here
-    pipe
-      .aggregate(randomSample.join(Aggregator.size))
-      .map {
-        case (randomSamples, size) =>
-          val samplesStr = randomSamples
-            .map { sample =>
-              Util.prettyJsonMapper
-                .writeValueAsString(sample)
-                .replaceAll("\n", " ")
-            }
-            .mkString("\n\t")
-
-          (size, samplesStr)
-      }
-      .toOptionExecution
-  }
-
-  def getSummaryString(name: String, numRecords: Int = 100): Execution[String] = {
-    getSummary(numRecords)
-      .map {
-        case Some((size, string)) =>
-          s"TypedPipeName: $name \nTotal size: $size. 
\nSample records: \n$string" - case None => s"TypedPipeName: $name is empty" - } - - } - - /** - * Print a summary of the TypedPipe with total size and some randomly selected records - */ - def printSummary(name: String, numRecords: Int = 100): Execution[Unit] = { - getSummaryString(name, numRecords).map { s => println(s) } - } -} - -object TypedRichPipe extends java.io.Serializable { - import scala.language.implicitConversions - - implicit def typedPipeToRichPipe[V]( - pipe: TypedPipe[V] - )( - implicit uniqueID: UniqueID - ): TypedRichPipe[V] = { - new TypedRichPipe(pipe) - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/common/Util.docx b/src/scala/com/twitter/simclusters_v2/scalding/common/Util.docx new file mode 100644 index 000000000..13c40d3e8 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/common/Util.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/common/Util.scala b/src/scala/com/twitter/simclusters_v2/scalding/common/Util.scala deleted file mode 100644 index 0ed3812a0..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/common/Util.scala +++ /dev/null @@ -1,305 +0,0 @@ -package com.twitter.simclusters_v2.scalding.common - -import com.fasterxml.jackson.core.JsonGenerator -import com.fasterxml.jackson.databind.ObjectMapper -import com.fasterxml.jackson.databind.ObjectWriter -import com.fasterxml.jackson.module.scala.DefaultScalaModule -import com.fasterxml.jackson.module.scala.ScalaObjectMapper -import com.twitter.algebird.Aggregator -import com.twitter.algebird.Moments -import com.twitter.algebird.MultiAggregator -import com.twitter.algebird.SetSizeAggregator -import com.twitter.algebird.SketchMap -import com.twitter.algebird.SketchMapParams -import com.twitter.algebird.mutable.PriorityQueueMonoid -import com.twitter.bijection.Injection -import com.twitter.hashing.KeyHasher -import com.twitter.scalding.Execution -import com.twitter.scalding.Stat -import com.twitter.scalding.TypedPipe -import com.twitter.scalding.UniqueID -import java.io.File -import java.io.PrintWriter -import scala.sys.process._ - -object Util { - private val formatter = java.text.NumberFormat.getNumberInstance - - private val jsonMapper = { - val mapper = new ObjectMapper() with ScalaObjectMapper - mapper.registerModule(DefaultScalaModule) - mapper.configure(JsonGenerator.Feature.WRITE_NUMBERS_AS_STRINGS, true) - mapper - } - - val prettyJsonMapper: ObjectWriter = jsonMapper.writerWithDefaultPrettyPrinter() - - def getCustomCounters[T](exec: Execution[T]): Execution[Map[String, Long]] = { - exec.getCounters.map { - case (_, counters) => - counters.toMap.collect { - case (key, value) if key.group == "Scalding Custom" => - key.counter -> value - } - } - } - - def getCustomCountersString[T](exec: Execution[T]): Execution[String] = { - getCustomCounters(exec).map { map => - val customCounterStrings = map.toList.map { - case (key, value) => - s"$key:${formatter.format(value)}" - } - if (customCounterStrings.nonEmpty) { - "Printing all custom counters:\n" + customCounterStrings.mkString("\n") - } else { - "No custom counters to print" - } - } - } - - // Note ideally this should not allow T that is itself Execution[U] i.e. don't accept - // nested executions - def printCounters[T](exec: Execution[T]): Execution[Unit] = { - getCustomCountersString(exec).map { s => println(s) } - } - - /** - * Print some basic stats of a numeric column. 
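- * Covers size, selected percentiles, max/min/sum, moments, and a small reservoir sample.
- *
- * Usage sketch (hypothetical; `scores` stands in for any TypedPipe[Double]):
- * {{{
- * val summary: Execution[String] = Util.printSummaryOfNumericColumn(scores, Some("score"))
- * }}}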
- */ - def printSummaryOfNumericColumn[V]( - input: TypedPipe[V], - columnName: Option[String] = None - )( - implicit num: Numeric[V] - ): Execution[String] = { - lazy val randomSampler = Aggregator.reservoirSample[V](100) - - lazy val percentiles = QTreeMultiAggregator(Seq(0.05, 0.25, 0.50, 0.75, 0.95)) - - lazy val moments = Moments.numericAggregator - - val multiAggregator = MultiAggregator( - Aggregator.size, - percentiles, - Aggregator.max, - Aggregator.min, - Aggregator.numericSum, - moments, - randomSampler - ).andThenPresent { - case (size_, percentiles_, max_, min_, sum_, moments_, samples_) => - percentiles_.mapValues(_.toString) ++ Map( - "size" -> size_.toString, - "max" -> max_.toString, - "min" -> min_.toString, - "sum" -> sum_.toString, - "avg" -> moments_.mean.toString, - "stddev" -> moments_.stddev.toString, - "skewness" -> moments_.skewness.toString, - "samples" -> samples_.mkString(",") - ) - } - - input - .aggregate(multiAggregator) - .toIterableExecution - .map { m => - val summary = - s"Column Name: $columnName\nSummary:\n${Util.prettyJsonMapper.writeValueAsString(m)}" - println(summary) - summary - } - } - - /** - * Output some basic stats of a categorical column. - * - * Note that HeavyHitters only work when the distribution is skewed. - */ - def printSummaryOfCategoricalColumn[V]( - input: TypedPipe[V], - columnName: Option[String] = None - )( - implicit injection: Injection[V, Array[Byte]] - ): Execution[String] = { - - lazy val randomSampler = Aggregator.reservoirSample[V](100) - - lazy val uniqueCounter = new SetSizeAggregator[V](hllBits = 13, maxSetSize = 1000)(injection) - - lazy val sketchMapParams = - SketchMapParams[V](seed = 1618, eps = 0.001, delta = 0.05, heavyHittersCount = 20)(injection) - - lazy val heavyHitter = - SketchMap.aggregator[V, Long](sketchMapParams).composePrepare[V](v => v -> 1L) - - val multiAggregator = MultiAggregator( - Aggregator.size, - uniqueCounter, - heavyHitter, - randomSampler - ).andThenPresent { - case (size_, uniqueSize_, heavyHitter_, sampler_) => - Map( - "size" -> size_.toString, - "unique" -> uniqueSize_.toString, - "samples" -> sampler_.mkString(","), - "heavyHitter" -> heavyHitter_.heavyHitterKeys - .map { key => - val freq = sketchMapParams.frequency(key, heavyHitter_.valuesTable) - key -> freq - } - .sortBy(-_._2).mkString(",") - ) - } - - input - .aggregate(multiAggregator) - .toIterableExecution - .map { m => - val summary = - s"Column Name: $columnName\nSummary:\n${Util.prettyJsonMapper.writeValueAsString(m)}" - println(summary) - summary - } - } - - val edgeOrdering: Ordering[(Long, Long)] = Ordering.by { - case (fromNodeId, toNodeId) => hashToLong(fromNodeId, toNodeId) - } - - def reservoirSamplerMonoidForPairs[K, V]( - sampleSize: Int - )( - implicit ord: Ordering[K] - ): PriorityQueueMonoid[(K, V)] = { - implicit val fullOrdering: Ordering[(K, V)] = Ordering.by(_._1) - new PriorityQueueMonoid[(K, V)](sampleSize) - } - - def reservoirSamplerMonoid[T, U]( - sampleSize: Int, - convert: T => U - )( - implicit ord: Ordering[U] - ): PriorityQueueMonoid[T] = { - new PriorityQueueMonoid[T](sampleSize)(Ordering.by(convert)) - } - - def hashToLong(a: Long, b: Long): Long = { - val bb = java.nio.ByteBuffer.allocate(16) - bb.putLong(a) - bb.putLong(b) - KeyHasher.KETAMA.hashKey(bb.array()) - } - - def hashToLong(a: Long): Long = { - val bb = java.nio.ByteBuffer.allocate(8) - bb.putLong(a) - KeyHasher.KETAMA.hashKey(bb.array()) - } - - // https://en.wikipedia.org/wiki/Pearson_correlation_coefficient - def 
computeCorrelation(pairedIter: Iterator[(Double, Double)]): Double = { - val (len, xSum, ySum, x2Sum, y2Sum, xySum) = - pairedIter.foldLeft((0.0, 0.0, 0.0, 0.0, 0.0, 0.0)) { - case ((l, xs, ys, x2s, y2s, xys), (x, y)) => - (l + 1, xs + x, ys + y, x2s + x * x, y2s + y * y, xys + x * y) - } - val den = math.sqrt(len * x2Sum - xSum * xSum) * math.sqrt(len * y2Sum - ySum * ySum) - if (den > 0) { - (len * xySum - xSum * ySum) / den - } else 0.0 - } - - // https://en.wikipedia.org/wiki/Cosine_similarity - def cosineSimilarity(pairedIter: Iterator[(Double, Double)]): Double = { - val (xySum, x2Sum, y2Sum) = pairedIter.foldLeft(0.0, 0.0, 0.0) { - case ((xy, x2, y2), (x, y)) => - (xy + x * y, x2 + x * x, y2 + y * y) - } - val den = math.sqrt(x2Sum) * math.sqrt(y2Sum) - if (den > 0) { - xySum / den - } else 0.0 - } - - case class Distribution( - avg: Double, - stdDev: Double, - p1: Double, - p10: Double, - p50: Double, - p90: Double, - p99: Double) - - val emptyDist: Distribution = Distribution(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) - - def distributionFromArray(l: Array[Double]): Distribution = { - val s = l.sorted - val len = l.length - - if (len < 1) { - emptyDist - } else { - def pctToIndex(p: Double): Int = { - val idx = math.round(l.length * p).toInt - if (idx < 0) { - 0 - } else if (idx >= len) { - len - 1 - } else { - idx - } - } - - val (sum, sumSquared) = l.foldLeft((0.0, 0.0)) { - case ((curSum, curSumSquared), x) => - (curSum + x, curSumSquared + x * x) - } - - val avg = sum / len - val stdDev = math.sqrt(sumSquared / len - avg * avg) - Distribution( - avg, - stdDev, - p1 = s(pctToIndex(0.01)), - p10 = s(pctToIndex(0.1)), - p50 = s(pctToIndex(0.5)), - p90 = s(pctToIndex(0.9)), - p99 = s(pctToIndex(0.99))) - } - } - - // Calculate cumulative frequency using Scalding Custom Counters. - // Increment all buckets by 1 where value <= bucket_threshold. - case class CumulativeStat( - key: String, - buckets: Seq[Double] - )( - implicit uniqueID: UniqueID) { - - val counters = buckets.map { bucket => - bucket -> Stat(key + "_<=" + bucket.toString) - } - - def incForValue(value: Double): Unit = { - counters.foreach { - case (bucket, stat) => - if (value <= bucket) stat.inc() - } - } - } - - def sendEmail(text: String, subject: String, toAddress: String): String = { - val file = File.createTempFile("somePrefix_", "_someSuffix") - println(s"Email body is at ${file.getPath}") - val writer = new PrintWriter(file) - writer.write(text) - writer.close() - - val mailCmd = s"cat ${file.getPath}" #| Seq("mail", "-s", subject, toAddress) - mailCmd.!! 
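-    // NB: this shells out to the local `mail` CLI, so it only works where that binary is
-    // available; `!!` returns the command's stdout (typically empty when the mail is accepted).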
- } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/common/matrix/BUILD b/src/scala/com/twitter/simclusters_v2/scalding/common/matrix/BUILD deleted file mode 100644 index 962a53de0..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/common/matrix/BUILD +++ /dev/null @@ -1,8 +0,0 @@ -scala_library( - sources = ["*.scala"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "src/scala/com/twitter/scalding_internal/job/analytics_batch", - ], -) diff --git a/src/scala/com/twitter/simclusters_v2/scalding/common/matrix/BUILD.docx b/src/scala/com/twitter/simclusters_v2/scalding/common/matrix/BUILD.docx new file mode 100644 index 000000000..f0c6ea478 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/common/matrix/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/common/matrix/DenseRowMatrix.docx b/src/scala/com/twitter/simclusters_v2/scalding/common/matrix/DenseRowMatrix.docx new file mode 100644 index 000000000..3420bdbc0 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/common/matrix/DenseRowMatrix.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/common/matrix/DenseRowMatrix.scala b/src/scala/com/twitter/simclusters_v2/scalding/common/matrix/DenseRowMatrix.scala deleted file mode 100644 index eb72e8708..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/common/matrix/DenseRowMatrix.scala +++ /dev/null @@ -1,73 +0,0 @@ -package com.twitter.simclusters_v2.scalding.common.matrix - -import com.twitter.algebird.{ArrayMonoid, BloomFilterMonoid, Monoid, Semigroup} -import com.twitter.algebird.Semigroup._ -import com.twitter.bijection.Injection -import com.twitter.scalding.{TypedPipe, ValuePipe} - -/** - * A class that represents a row-indexed dense matrix, backed by a TypedPipe[(R, Array[Double])]. - * For each row of the TypedPipe, we save an array of values. - * Only use this class when the number of columns is small (say, <100K). - * - * @param pipe underlying pipe - * @param rowOrd ordering function for row type - * @param rowInj injection function for the row type - * @tparam R Type for rows - */ -case class DenseRowMatrix[R]( - pipe: TypedPipe[(R, Array[Double])], -)( - implicit val rowOrd: Ordering[R], - val rowInj: Injection[R, Array[Byte]]) { - - lazy val semigroupArrayV: Semigroup[Array[Double]] = new ArrayMonoid[Double]() - - // convert to a SparseMatrix - lazy val toSparseMatrix: SparseMatrix[R, Int, Double] = { - this.toSparseRowMatrix.toSparseMatrix - } - - // convert to a SparseRowMatrix - lazy val toSparseRowMatrix: SparseRowMatrix[R, Int, Double] = { - SparseRowMatrix( - this.pipe.map { - case (i, values) => - (i, values.zipWithIndex.collect { case (value, j) if value != 0.0 => (j, value) }.toMap) - }, - isSkinnyMatrix = true) - } - - // convert to a TypedPipe - lazy val toTypedPipe: TypedPipe[(R, Array[Double])] = { - this.pipe - } - - // filter the matrix based on a subset of rows - def filterRows(rows: TypedPipe[R]): DenseRowMatrix[R] = { - DenseRowMatrix(this.pipe.join(rows.asKeys).mapValues(_._1)) - } - - // get the l2 norms for all rows. this does not trigger a shuffle. 
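-  // e.g. a row (r, Array(3.0, 4.0)) yields (r, 5.0)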
-  lazy val rowL2Norms: TypedPipe[(R, Double)] = {
-    this.pipe.map {
-      case (row, values) =>
-        row -> math.sqrt(values.map(a => a * a).sum)
-    }
-  }
-
-  // normalize the matrix to make sure each row has unit norm
-  lazy val rowL2Normalize: DenseRowMatrix[R] = {
-
-    DenseRowMatrix(this.pipe.map {
-      case (row, values) =>
-        val norm = math.sqrt(values.map(v => v * v).sum)
-        if (norm == 0.0) {
-          row -> values
-        } else {
-          row -> values.map(v => v / norm)
-        }
-    })
-  }
-
-}
diff --git a/src/scala/com/twitter/simclusters_v2/scalding/common/matrix/SparseMatrix.docx b/src/scala/com/twitter/simclusters_v2/scalding/common/matrix/SparseMatrix.docx
new file mode 100644
index 000000000..3cf5f584d
Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/common/matrix/SparseMatrix.docx differ
diff --git a/src/scala/com/twitter/simclusters_v2/scalding/common/matrix/SparseMatrix.scala b/src/scala/com/twitter/simclusters_v2/scalding/common/matrix/SparseMatrix.scala
deleted file mode 100644
index 55514c350..000000000
--- a/src/scala/com/twitter/simclusters_v2/scalding/common/matrix/SparseMatrix.scala
+++ /dev/null
@@ -1,423 +0,0 @@
-package com.twitter.simclusters_v2.scalding.common.matrix
-
-import com.twitter.algebird.Semigroup
-import com.twitter.bijection.Injection
-import com.twitter.scalding.{TypedPipe, ValuePipe}
-
-/**
- * A case class that represents a sparse matrix backed by a TypedPipe[(R, C, V)].
- *
- * We assume the input does not have more than one value per (row, col), and all the input values
- * are non-zero.
- *
- * We do not expect the input pipe to be indexed from 0 to numRows or numCols.
- * The input can be any type (for example, userId/TweetId/Hashtag).
- * We do not convert them to indices, but just use the input as a key to represent the rowId/colId.
- *
- * Example:
- *
- * val a = SparseMatrix(TypedPipe.from(Seq((1,1,1.0), (2,2,2.0), (3,3,3.0))))
- *
- * val b = a.rowL2Normalize // get a new matrix in which each row has unit norm
- *
- * val c = a.multiplySparseMatrix(b) // multiply another matrix
- *
- * val d = a.transpose // transpose the matrix
- *
- * @param pipe underlying pipe. We assume the input does not have more than one value per (row, col),
- *             and all the values are non-zero.
- * @param rowOrd ordering function for row type
- * @param colOrd ordering function for col type
- * @param numericV numeric operations for value type
- * @param semigroupV semigroup for the value type
- * @param rowInj injection function for the row type
- * @param colInj injection function for the col type
- * @tparam R Type for rows
- * @tparam C Type for columns
- * @tparam V Type for elements of the matrix
- */
-case class SparseMatrix[R, C, V](
-  pipe: TypedPipe[(R, C, V)]
-)(
-  implicit override val rowOrd: Ordering[R],
-  override val colOrd: Ordering[C],
-  override val numericV: Numeric[V],
-  override val semigroupV: Semigroup[V],
-  override val rowInj: Injection[R, Array[Byte]],
-  override val colInj: Injection[C, Array[Byte]])
-    extends TypedPipeMatrix[R, C, V] {
-
-  // number of non-zero values in the matrix
-  override lazy val nnz: ValuePipe[Long] = {
-    this.filter((_, _, v) => v != numericV.zero).pipe.map(_ => 1L).sum
-  }
-
-  // number of non-zero values in each row
-  lazy val rowNnz: TypedPipe[(R, Long)] = {
-    this.pipe.collect {
-      case (row, _, v) if v != numericV.zero =>
-        row -> 1L
-    }.sumByKey
-  }
-
-  // get the number of non-zero values for each col.
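-  // (implemented as transpose + rowNnz)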
-  lazy val colNnz: TypedPipe[(C, Long)] = {
-    this.transpose.rowNnz
-  }
-
-  override lazy val uniqueRowIds: TypedPipe[R] = {
-    this.pipe.map(t => t._1).distinct
-  }
-
-  override lazy val uniqueColIds: TypedPipe[C] = {
-    this.pipe.map(t => t._2).distinct
-  }
-
-  override def getRow(rowId: R): TypedPipe[(C, V)] = {
-    this.pipe.collect {
-      case (i, j, value) if i == rowId =>
-        j -> value
-    }
-  }
-
-  override def getCol(colId: C): TypedPipe[(R, V)] = {
-    this.pipe.collect {
-      case (i, j, value) if j == colId =>
-        i -> value
-    }
-  }
-
-  override def get(rowId: R, colId: C): ValuePipe[V] = {
-    this.pipe.collect {
-      case (i, j, value) if i == rowId && j == colId =>
-        value
-    }.sum // this assumes the matrix does not have any duplicates
-  }
-
-  // filter the matrix based on (row, col, value)
-  def filter(fn: (R, C, V) => Boolean): SparseMatrix[R, C, V] = {
-    SparseMatrix(this.pipe.filter {
-      case (row, col, value) => fn(row, col, value)
-    })
-  }
-
-  // filter the matrix based on a subset of rows
-  def filterRows(rows: TypedPipe[R]): SparseMatrix[R, C, V] = {
-    SparseMatrix(this.rowAsKeys.join(rows.asKeys).map {
-      case (row, ((col, value), _)) => (row, col, value)
-    })
-  }
-
-  // filter the matrix based on a subset of cols
-  def filterCols(cols: TypedPipe[C]): SparseMatrix[R, C, V] = {
-    this.transpose.filterRows(cols).transpose
-  }
-
-  // convert the triplet (row, col, value) to a new (row1, col1, value1)
-  def tripleApply[R1, C1, V1](
-    fn: (R, C, V) => (R1, C1, V1)
-  )(
-    implicit rowOrd1: Ordering[R1],
-    colOrd1: Ordering[C1],
-    numericV1: Numeric[V1],
-    semigroupV1: Semigroup[V1],
-    rowInj: Injection[R1, Array[Byte]],
-    colInj: Injection[C1, Array[Byte]]
-  ): SparseMatrix[R1, C1, V1] = {
-    SparseMatrix(this.pipe.map {
-      case (row, col, value) => fn(row, col, value)
-    })
-  }
-
-  // get the l1 norms for all rows
-  lazy val rowL1Norms: TypedPipe[(R, Double)] = {
-    this.pipe.map {
-      case (row, _, value) =>
-        row -> numericV.toDouble(value).abs
-    }.sumByKey
-  }
-
-  // get the l2 norms for all rows
-  lazy val rowL2Norms: TypedPipe[(R, Double)] = {
-    this.pipe
-      .map {
-        case (row, _, value) =>
-          row -> numericV.toDouble(value) * numericV.toDouble(value)
-      }
-      .sumByKey
-      .mapValues(math.sqrt)
-  }
-
-  // normalize the matrix to make sure each row has unit norm
-  lazy val rowL2Normalize: SparseMatrix[R, C, Double] = {
-    val result = this.rowAsKeys
-      .join(this.rowL2Norms)
-      .collect {
-        case (row, ((col, value), l2norm)) if l2norm > 0.0 =>
-          (row, col, numericV.toDouble(value) / l2norm)
-      }
-
-    SparseMatrix(result)
-  }
-
-  // get the l2 norms for all cols
-  lazy val colL2Norms: TypedPipe[(C, Double)] = {
-    this.transpose.rowL2Norms
-  }
-
-  // normalize the matrix to make sure each column has unit norm
-  lazy val colL2Normalize: SparseMatrix[R, C, Double] = {
-    this.transpose.rowL2Normalize.transpose
-  }
-
-  /**
-   * Take topK non-zero elements from each row. Cols are ordered by the `ordering` function
-   */
-  def sortWithTakePerRow(k: Int)(ordering: Ordering[(C, V)]): TypedPipe[(R, Seq[(C, V)])] = {
-    this.rowAsKeys.group.sortedTake(k)(ordering)
-  }
-
-  /**
-   * Take topK non-zero elements from each column. Rows are ordered by the `ordering` function.
-   */
-  def sortWithTakePerCol(k: Int)(ordering: Ordering[(R, V)]): TypedPipe[(C, Seq[(R, V)])] = {
-    this.transpose.sortWithTakePerRow(k)(ordering)
-  }
-
-  /**
-   * Multiply another SparseMatrix. The only requirement is that the col type of the current matrix
-   * should be the same as the row type of the other matrix.
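- *
- * Usage sketch (hypothetical types; computes column-column cosine similarities of a
- * SparseMatrix[UserId, TweetId, Double] named `a`):
- * {{{
- * val normalized = a.colL2Normalize
- * val tweetSims = normalized.transpose.multiplySparseMatrix(normalized)
- * }}}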
- *
- * @param sparseMatrix another matrix to multiply
- * @param numReducersOpt optional parameter to set the number of reducers. It uses 1000 by default;
- *                       you can change it based on your application.
- * @param ordering2 ordering function for the column type of another matrix
- * @param injection2 injection function for the column type of another matrix
- * @tparam C2 col type of another matrix
- *
- * @return
- */
-  def multiplySparseMatrix[C2](
-    sparseMatrix: SparseMatrix[C, C2, V],
-    numReducersOpt: Option[Int] = None
-  )(
-    implicit ordering2: Ordering[C2],
-    injection2: Injection[C2, Array[Byte]]
-  ): SparseMatrix[R, C2, V] = {
-    implicit val colInjectionFunction: C => Array[Byte] = colInj.toFunction
-
-    val result =
-      // 1000 is the reducer number used for sketchJoin; it is a number that works well empirically.
-      // feel free to change this or make it a param if you find it does not work for your case.
-      this.transpose.rowAsKeys
-        .sketch(numReducersOpt.getOrElse(1000))
-        .join(sparseMatrix.rowAsKeys)
-        .map {
-          case (_, ((row1, value1), (col2, value2))) =>
-            (row1, col2) -> numericV.times(value1, value2)
-        }
-        .sumByKey
-        .map {
-          case ((row, col), value) =>
-            (row, col, value)
-        }
-
-    SparseMatrix(result)
-  }
-
-  /**
-   * Multiply a SparseRowMatrix. The implementation of this function assumes the input SparseRowMatrix
-   * is a skinny matrix, i.e., one with a small number of unique columns. Based on our experience,
-   * 100K or fewer unique columns can be considered small here.
-   *
-   * @param skinnyMatrix another matrix to multiply
-   * @param numReducersOpt optional parameter to set the number of reducers. It uses 1000 by default;
-   *                       you can change it based on your application.
-   * @param ordering2 ordering function for the column type of another matrix
-   * @param injection2 injection function for the column type of another matrix
-   * @tparam C2 col type of another matrix
-   *
-   * @return
-   */
-  def multiplySkinnySparseRowMatrix[C2](
-    skinnyMatrix: SparseRowMatrix[C, C2, V],
-    numReducersOpt: Option[Int] = None
-  )(
-    implicit ordering2: Ordering[C2],
-    injection2: Injection[C2, Array[Byte]]
-  ): SparseRowMatrix[R, C2, V] = {
-
-    assert(
-      skinnyMatrix.isSkinnyMatrix,
-      "this function only works for a skinny sparse row matrix; otherwise you will run into out-of-memory problems")
-
-    implicit val colInjectionFunction: C => Array[Byte] = colInj.toFunction
-
-    val result =
-      // 1000 is the reducer number used for sketchJoin; it is a number that works well empirically.
-      // feel free to change this or make it a param if you find it does not work for your case.
-      this.transpose.rowAsKeys
-        .sketch(numReducersOpt.getOrElse(1000))
-        .join(skinnyMatrix.pipe)
-        .map {
-          case (_, ((row1, value1), colMap)) =>
-            row1 -> colMap.mapValues(v => numericV.times(value1, v))
-        }
-        .sumByKey
-
-    SparseRowMatrix(result, skinnyMatrix.isSkinnyMatrix)
-  }
-
-  /**
-   * Multiply a DenseRowMatrix. The result will also be a DenseRowMatrix.
-   *
-   * @param denseRowMatrix matrix to multiply
-   * @param numReducersOpt optional parameter to set the number of reducers. It uses 1000 by default;
- *                       you can change it based on your application.
- * @return
- */
-  def multiplyDenseRowMatrix(
-    denseRowMatrix: DenseRowMatrix[C],
-    numReducersOpt: Option[Int] = None
-  ): DenseRowMatrix[R] = {
-
-    implicit val colInjectionFunction: C => Array[Byte] = colInj.toFunction
-    implicit val arrayVSemiGroup: Semigroup[Array[Double]] = denseRowMatrix.semigroupArrayV
-
-    val result =
-      // 1000 is the reducer number used for sketchJoin; it is a number that works well empirically.
-      // feel free to change this or make it a param if you find it does not work for your case.
-      this.transpose.rowAsKeys
-        .sketch(numReducersOpt.getOrElse(1000))
-        .join(denseRowMatrix.pipe)
-        .map {
-          case (_, ((row1, value1), array)) =>
-            row1 -> array.map(v => numericV.toDouble(value1) * v)
-        }
-        .sumByKey
-
-    DenseRowMatrix(result)
-  }
-
-  // Transpose the matrix.
-  lazy val transpose: SparseMatrix[C, R, V] = {
-    SparseMatrix(
-      this.pipe
-        .map {
-          case (row, col, value) =>
-            (col, row, value)
-        })
-  }
-
-  // Create a Key-Val TypedPipe for .join() and other use cases.
-  lazy val rowAsKeys: TypedPipe[(R, (C, V))] = {
-    this.pipe
-      .map {
-        case (row, col, value) =>
-          (row, (col, value))
-      }
-  }
-
-  // convert to a TypedPipe
-  lazy val toTypedPipe: TypedPipe[(R, C, V)] = {
-    this.pipe
-  }
-
-  lazy val forceToDisk: SparseMatrix[R, C, V] = {
-    SparseMatrix(this.pipe.forceToDisk)
-  }
-
-  /**
-   * Convert the matrix to a SparseRowMatrix. Do this only when the max number of non-zero values per row is
-   * small (say, not more than 200K).
-   *
-   * @param isSkinnyMatrix whether the resulting matrix is skinny, i.e., the number of unique colIds is small (<200K).
-   *                       Note the difference between `number of unique colIds` and `max number of non-zero values per row`.
-   * @return
-   */
-  def toSparseRowMatrix(isSkinnyMatrix: Boolean = false): SparseRowMatrix[R, C, V] = {
-    SparseRowMatrix(
-      this.pipe.map {
-        case (i, j, v) =>
-          i -> Map(j -> v)
-      }.sumByKey,
-      isSkinnyMatrix)
-  }
-
-  /**
-   * Convert the matrix to a DenseRowMatrix
-   *
-   * @param numCols the number of columns in the DenseRowMatrix.
-   * @param colToIndexFunction the function to convert colId to the column index in the dense matrix
-   * @return
-   */
-  def toDenseRowMatrix(numCols: Int, colToIndexFunction: C => Int): DenseRowMatrix[R] = {
-    this.toSparseRowMatrix(isSkinnyMatrix = true).toDenseRowMatrix(numCols, colToIndexFunction)
-  }
-
-  /**
-   * Determines whether we should return a given Iterator, given a threshold for the sum of values
-   * across a row and whether we are looking to stay under or above that value.
-   * Note that Iterators are mutable/destructive: even calling .size on one will 'use it up',
-   * i.e. it no longer hasNext and we no longer have any reference to the head of the collection.
-   *
-   * @param columnValueIterator Iterator over column-value pairs.
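- *                            (consumed exactly once, per the destructiveness note above)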
- * @param threshold The threshold for the sum of values
- * @param ifMin True if we want to stay at least above that given value
- * @return the column-value pairs to keep, or an empty Iterator if the whole row should be dropped
- */
-  private[this] def filterIter(
-    columnValueIterator: Iterator[(C, V)],
-    threshold: V,
-    ifMin: Boolean
-  ): Iterator[(C, V)] = {
-    var sum: V = numericV.zero
-    var it: Iterator[(C, V)] = Iterator.empty
-    var exceeded = false
-    while (columnValueIterator.hasNext && !exceeded) {
-      val (c, v) = columnValueIterator.next
-      val nextSum = semigroupV.plus(sum, v)
-      val cmp = numericV.compare(nextSum, threshold)
-      if ((ifMin && cmp < 0) || (!ifMin && cmp <= 0)) {
-        it = it ++ Iterator((c, v))
-        sum = nextSum
-      } else {
-        it = it ++ Iterator((c, v))
-        exceeded = true
-      }
-    }
-    (ifMin, exceeded) match {
-      case (true, true) => it ++ columnValueIterator
-      case (true, false) => Iterator.empty
-      case (false, true) => Iterator.empty
-      case (false, false) => it ++ columnValueIterator
-    }
-  }
-
-  /**
-   * Removes rows whose sum of values does not meet the minimum sum (minSum).
-   * @param minSum minimum sum to enforce for each row
-   */
-  def filterRowsByMinSum(minSum: V): SparseMatrix[R, C, V] = {
-    val filteredPipe = this.rowAsKeys.group
-      .mapValueStream(filterIter(_, threshold = minSum, ifMin = true)).map {
-        case (r, (c, v)) =>
-          (r, c, v)
-      }
-    SparseMatrix(filteredPipe)
-  }
-
-  /**
-   * Removes rows whose sum of values exceeds the maximum sum (maxSum).
-   * @param maxSum maximum sum to enforce for each row
-   */
-  def filterRowsByMaxSum(maxSum: V): SparseMatrix[R, C, V] = {
-    val filteredPipe = this.rowAsKeys.group
-      .mapValueStream(filterIter(_, threshold = maxSum, ifMin = false)).map {
-        case (r, (c, v)) =>
-          (r, c, v)
-      }
-    SparseMatrix(filteredPipe)
-  }
-}
diff --git a/src/scala/com/twitter/simclusters_v2/scalding/common/matrix/SparseRowMatrix.docx b/src/scala/com/twitter/simclusters_v2/scalding/common/matrix/SparseRowMatrix.docx
new file mode 100644
index 000000000..448d920e4
Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/common/matrix/SparseRowMatrix.docx differ
diff --git a/src/scala/com/twitter/simclusters_v2/scalding/common/matrix/SparseRowMatrix.scala b/src/scala/com/twitter/simclusters_v2/scalding/common/matrix/SparseRowMatrix.scala
deleted file mode 100644
index 767c8f588..000000000
--- a/src/scala/com/twitter/simclusters_v2/scalding/common/matrix/SparseRowMatrix.scala
+++ /dev/null
@@ -1,366 +0,0 @@
-package com.twitter.simclusters_v2.scalding.common.matrix
-
-import com.twitter.algebird.Semigroup
-import com.twitter.bijection.Injection
-import com.twitter.scalding.TypedPipe
-import com.twitter.scalding.ValuePipe
-import scala.util.Random
-
-/**
- * A class that represents a row-indexed matrix, backed by a TypedPipe[(R, Map[C, V])].
- * For each row of the TypedPipe, we save the rowId and a map consisting of colIds and their values.
- * Only use this class when the max number of non-zero values per row is small (say, <100K).
- *
- * Compared to SparseMatrix, this class has some optimizations to efficiently perform some row-wise
- * operations.
- *
- * Also, if the matrix is skinny (i.e., the number of unique colIds is small), we have optimized solutions
- * for col-wise normalization as well as matrix multiplication (see SparseMatrix.multiplySkinnySparseRowMatrix).
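- *
- * Construction sketch (hypothetical values; Long rowIds and Int colIds):
- * {{{
- * val m = SparseRowMatrix(
- *   TypedPipe.from(Seq(1L -> Map(0 -> 1.0, 2 -> 3.0), 5L -> Map(1 -> 2.0))),
- *   isSkinnyMatrix = true)
- * val normalized = m.rowL2Normalize
- * }}}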
- * - * @param pipe underlying pipe - * @param isSkinnyMatrix if the matrix is skinny (i.e., number of unique colIds is small) - * Note the difference between `number of unique colIds` and `max number of non-zero values per row`. - * @param rowOrd ordering function for row type - * @param colOrd ordering function for col type - * @param numericV numeric operations for value type - * @param semigroupV semigroup for the value type - * @param rowInj injection function for the row type - * @param colInj injection function for the col type - * @tparam R Type for rows - * @tparam C Type for columns - * @tparam V Type for elements of the matrix - */ -case class SparseRowMatrix[R, C, V]( - pipe: TypedPipe[(R, Map[C, V])], - isSkinnyMatrix: Boolean -)( - implicit override val rowOrd: Ordering[R], - override val colOrd: Ordering[C], - override val numericV: Numeric[V], - override val semigroupV: Semigroup[V], - override val rowInj: Injection[R, Array[Byte]], - override val colInj: Injection[C, Array[Byte]]) - extends TypedPipeMatrix[R, C, V] { - - // number of non-zero values in the matrix - override lazy val nnz: ValuePipe[Long] = { - this - .filter((_, _, v) => v != numericV.zero) - .pipe - .values - .map(_.size.toLong) - .sum - } - - override def get(rowId: R, colId: C): ValuePipe[V] = { - this.pipe - .collect { - case (i, values) if i == rowId => - values.collect { - case (j, value) if j == colId => value - } - } - .flatten - .sum - } - - override def getRow(rowId: R): TypedPipe[(C, V)] = { - this.pipe.flatMap { - case (i, values) if i == rowId => - values.toSeq - case _ => - Nil - } - } - - override def getCol(colId: C): TypedPipe[(R, V)] = { - this.pipe.flatMap { - case (i, values) => - values.collect { - case (j, value) if j == colId => - i -> value - } - } - } - - override lazy val uniqueRowIds: TypedPipe[R] = { - this.pipe.map(_._1).distinct - } - - override lazy val uniqueColIds: TypedPipe[C] = { - this.pipe.flatMapValues(_.keys).values.distinct - } - - // convert to a SparseMatrix - lazy val toSparseMatrix: SparseMatrix[R, C, V] = { - SparseMatrix(this.pipe.flatMap { - case (i, values) => - values.map { case (j, value) => (i, j, value) } - }) - } - - // convert to a TypedPipe - lazy val toTypedPipe: TypedPipe[(R, Map[C, V])] = { - this.pipe - } - - def filter(fn: (R, C, V) => Boolean): SparseRowMatrix[R, C, V] = { - SparseRowMatrix( - this.pipe - .map { - case (i, values) => - i -> values.filter { case (j, v) => fn(i, j, v) } - } - .filter(_._2.nonEmpty), - isSkinnyMatrix = this.isSkinnyMatrix - ) - } - - // sample the rows in the matrix as defined by samplingRatio - def sampleRows(samplingRatio: Double): SparseRowMatrix[R, C, V] = { - SparseRowMatrix(this.pipe.filter(_ => Random.nextDouble < samplingRatio), this.isSkinnyMatrix) - } - - // filter the matrix based on a subset of rows - def filterRows(rows: TypedPipe[R]): SparseRowMatrix[R, C, V] = { - SparseRowMatrix(this.pipe.join(rows.asKeys).mapValues(_._1), this.isSkinnyMatrix) - } - - // filter the matrix based on a subset of cols - def filterCols(cols: TypedPipe[C]): SparseRowMatrix[R, C, V] = { - this.toSparseMatrix.filterCols(cols).toSparseRowMatrix(this.isSkinnyMatrix) - } - - // convert the triplet (row, col, value) to a new (row1, col1, value1) - def tripleApply[R1, C1, V1]( - fn: (R, C, V) => (R1, C1, V1) - )( - implicit rowOrd1: Ordering[R1], - colOrd1: Ordering[C1], - numericV1: Numeric[V1], - semigroupV1: Semigroup[V1], - rowInj: Injection[R1, Array[Byte]], - colInj: Injection[C1, Array[Byte]] - ): SparseRowMatrix[R1, C1, 
V1] = { - SparseRowMatrix( - this.pipe.flatMap { - case (i, values) => - values - .map { - case (j, v) => fn(i, j, v) - } - .groupBy(_._1) - .mapValues { _.map { case (_, j1, v1) => (j1, v1) }.toMap } - }, - isSkinnyMatrix = this.isSkinnyMatrix - ) - } - - // get the l2 norms for all rows. this does not trigger a shuffle. - lazy val rowL2Norms: TypedPipe[(R, Double)] = { - this.pipe.map { - case (row, values) => - row -> math.sqrt( - values.values - .map(a => numericV.toDouble(a) * numericV.toDouble(a)) - .sum) - } - } - - // normalize the matrix to make sure each row has unit norm - lazy val rowL2Normalize: SparseRowMatrix[R, C, Double] = { - val result = this.pipe.flatMap { - case (row, values) => - val norm = - math.sqrt( - values.values - .map(v => numericV.toDouble(v) * numericV.toDouble(v)) - .sum) - if (norm == 0.0) { - None - } else { - Some(row -> values.mapValues(v => numericV.toDouble(v) / norm)) - } - } - - SparseRowMatrix(result, isSkinnyMatrix = this.isSkinnyMatrix) - } - - // get the l2 norms for all cols - lazy val colL2Norms: TypedPipe[(C, Double)] = { - this.pipe - .flatMap { - case (_, values) => - values.map { - case (col, v) => - col -> numericV.toDouble(v) * numericV.toDouble(v) - } - } - .sumByKey - .mapValues(math.sqrt) - } - - // normalize the matrix to make sure each column has unit norm - lazy val colL2Normalize: SparseRowMatrix[R, C, Double] = { - val result = if (this.isSkinnyMatrix) { - // if this is a skinny matrix, we first put the norm of all columns into a Map, and then use - // this Map inside the mappers without shuffling the whole matrix (which is expensive, see the - // `else` part of this function). - val colL2NormsValuePipe = this.colL2Norms.map { - case (col, norm) => Map(col -> norm) - }.sum - - this.pipe.flatMapWithValue(colL2NormsValuePipe) { - case ((row, values), Some(colNorms)) => - Some(row -> values.flatMap { - case (col, value) => - val colNorm = colNorms.getOrElse(col, 0.0) - if (colNorm == 0.0) { - None - } else { - Some(col -> numericV.toDouble(value) / colNorm) - } - }) - case _ => - None - } - } else { - this.toSparseMatrix.transpose.rowAsKeys - .join(this.colL2Norms) - .collect { - case (col, ((row, value), colNorm)) if colNorm > 0.0 => - row -> Map(col -> numericV.toDouble(value) / colNorm) - } - .sumByKey - .toTypedPipe - } - - SparseRowMatrix(result, isSkinnyMatrix = this.isSkinnyMatrix) - } - - /** - * Take topK non-zero elements from each row. Cols are ordered by the `ordering` function - */ - def sortWithTakePerRow( - k: Int - )( - ordering: Ordering[(C, V)] - ): TypedPipe[(R, Seq[(C, V)])] = { - this.pipe.map { - case (row, values) => - row -> values.toSeq.sorted(ordering).take(k) - } - } - - /** - * Take topK non-zero elements from each column. Rows are ordered by the `ordering` function. - */ - def sortWithTakePerCol( - k: Int - )( - ordering: Ordering[(R, V)] - ): TypedPipe[(C, Seq[(R, V)])] = { - this.toSparseMatrix.sortWithTakePerCol(k)(ordering) - } - - /** - * Similar to .forceToDisk function in TypedPipe, but with an option to specify how many partitions - * to save, which is useful if you want to consolidate the data set or want to tune the number - * of mappers for the next step. - * - * @param numShardsOpt number of shards to save the data. 
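- *                     If None, this behaves like a plain .forceToDisk and keeps the existing
- *                     partitioning.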
- *
- * @return
- */
-  def forceToDisk(
-    numShardsOpt: Option[Int] = None
-  ): SparseRowMatrix[R, C, V] = {
-    numShardsOpt
-      .map { numShards =>
-        SparseRowMatrix(this.pipe.shard(numShards), this.isSkinnyMatrix)
-      }
-      .getOrElse {
-        SparseRowMatrix(this.pipe.forceToDisk, this.isSkinnyMatrix)
-      }
-  }
-
-  /**
-   * Transpose the current matrix and multiply another skinny SparseRowMatrix.
-   * The difference between this and .transpose.multiplySkinnySparseRowMatrix(anotherSparseRowMatrix)
-   * is that we do not need to flatten and group again.
-   *
-   * One use case is computing the column-wise covariance matrix:
-   * a.transposeAndMultiplySkinnySparseRowMatrix(a) gives it directly.
-   *
-   * @param anotherSparseRowMatrix it must be a skinny SparseRowMatrix
-   * @param numReducersOpt number of reducers.
-   */
-  def transposeAndMultiplySkinnySparseRowMatrix[C2](
-    anotherSparseRowMatrix: SparseRowMatrix[R, C2, V],
-    numReducersOpt: Option[Int] = None
-  )(
-    implicit ordering2: Ordering[C2],
-    injection2: Injection[C2, Array[Byte]]
-  ): SparseRowMatrix[C, C2, V] = {
-
-    // it needs to be a skinny SparseRowMatrix, otherwise we will have out-of-memory issues
-    require(anotherSparseRowMatrix.isSkinnyMatrix)
-
-    SparseRowMatrix(
-      numReducersOpt
-        .map { numReducers =>
-          this.pipe
-            .join(anotherSparseRowMatrix.pipe).withReducers(numReducers)
-        }.getOrElse(this.pipe
-          .join(anotherSparseRowMatrix.pipe))
-        .flatMap {
-          case (_, (row1, row2)) =>
-            row1.map {
-              case (col1, val1) =>
-                col1 -> row2.mapValues(val2 => numericV.times(val1, val2))
-            }
-        }
-        .sumByKey,
-      isSkinnyMatrix = true
-    )
-
-  }
-
-  /**
-   * Multiply a DenseRowMatrix. The result will also be a DenseRowMatrix.
-   *
-   * @param denseRowMatrix matrix to multiply
-   * @param numReducersOpt optional parameter to set the number of reducers. It uses 1000 by default;
-   *                       you can change it based on your application.
-   * @return
-   */
-  def multiplyDenseRowMatrix(
-    denseRowMatrix: DenseRowMatrix[C],
-    numReducersOpt: Option[Int] = None
-  ): DenseRowMatrix[R] = {
-    this.toSparseMatrix.multiplyDenseRowMatrix(denseRowMatrix, numReducersOpt)
-  }
-
-  /**
-   * Convert the matrix to a DenseRowMatrix
-   *
-   * @param numCols the number of columns in the DenseRowMatrix.
- * @param colToIndexFunction the function to convert colId to the column index in the dense matrix
- * @return
- */
-  def toDenseRowMatrix(numCols: Int, colToIndexFunction: C => Int): DenseRowMatrix[R] = {
-    DenseRowMatrix(this.pipe.map {
-      case (row, colMap) =>
-        val array = new Array[Double](numCols)
-        colMap.foreach {
-          case (col, value) =>
-            val index = colToIndexFunction(col)
-            assert(index < numCols && index >= 0, "The converted index is out of range!")
-            array(index) = numericV.toDouble(value)
-        }
-        row -> array
-    })
-  }
-
-}
diff --git a/src/scala/com/twitter/simclusters_v2/scalding/common/matrix/TypedPipeMatrix.docx b/src/scala/com/twitter/simclusters_v2/scalding/common/matrix/TypedPipeMatrix.docx
new file mode 100644
index 000000000..4fd2b2480
Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/common/matrix/TypedPipeMatrix.docx differ
diff --git a/src/scala/com/twitter/simclusters_v2/scalding/common/matrix/TypedPipeMatrix.scala b/src/scala/com/twitter/simclusters_v2/scalding/common/matrix/TypedPipeMatrix.scala
deleted file mode 100644
index 24e3fb3ad..000000000
--- a/src/scala/com/twitter/simclusters_v2/scalding/common/matrix/TypedPipeMatrix.scala
+++ /dev/null
@@ -1,49 +0,0 @@
-package com.twitter.simclusters_v2.scalding.common.matrix
-
-import com.twitter.algebird.{Aggregator, Semigroup}
-import com.twitter.bijection.Injection
-import com.twitter.scalding.{TypedPipe, ValuePipe}
-
-/**
- * A matrix trait for representing a matrix backed by TypedPipe
- *
- * @tparam R Type for rows
- * @tparam C Type for columns
- * @tparam V Type for elements of the matrix
- */
-abstract class TypedPipeMatrix[R, C, @specialized(Double, Int, Float, Long, Short) V] {
-  implicit val semigroupV: Semigroup[V]
-  implicit val numericV: Numeric[V]
-  implicit val rowOrd: Ordering[R]
-  implicit val colOrd: Ordering[C]
-  implicit val rowInj: Injection[R, Array[Byte]]
-  implicit val colInj: Injection[C, Array[Byte]]
-
-  // num of non-zero elements in the matrix
-  val nnz: ValuePipe[Long]
-
-  // list of unique rowIds in the matrix
-  val uniqueRowIds: TypedPipe[R]
-
-  // list of unique colIds in the matrix
-  val uniqueColIds: TypedPipe[C]
-
-  // get a specific row of the matrix
-  def getRow(rowId: R): TypedPipe[(C, V)]
-
-  // get a specific column of the matrix
-  def getCol(colId: C): TypedPipe[(R, V)]
-
-  // get the value of an element
-  def get(rowId: R, colId: C): ValuePipe[V]
-
-  // number of unique rowIds
-  lazy val numUniqueRows: ValuePipe[Long] = {
-    this.uniqueRowIds.aggregate(Aggregator.size)
-  }
-
-  // number of unique colIds
-  lazy val numUniqueCols: ValuePipe[Long] = {
-    this.uniqueColIds.aggregate(Aggregator.size)
-  }
-}
diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/BUILD b/src/scala/com/twitter/simclusters_v2/scalding/embedding/BUILD
deleted file mode 100644
index 399d64417..000000000
--- a/src/scala/com/twitter/simclusters_v2/scalding/embedding/BUILD
+++ /dev/null
@@ -1,311 +0,0 @@
-scala_library(
-    sources = [
-        "*.scala",
-        "common/*.scala",
-    ],
-    platform = "java8",
-    tags = ["bazel-compatible"],
-    dependencies = [
-        "escherbird/src/scala/com/twitter/escherbird/scalding/source",
-        "flockdb-tools/datasets/flock:flock-blocks-edges-scala",
-        "flockdb-tools/datasets/flock:flock-follows-edges-scala",
-        "flockdb-tools/datasets/flock:flock-report-as-abuse-edges-scala",
-        "flockdb-tools/datasets/flock:flock-report-as-spam-edges-scala",
-        "iesource/processing/events/src/main/scala/com/twitter/iesource/processing/events/batch:server_engagements-scala",
- "interests-ds/src/main/scala/com/twitter/interests_ds/jobs/interests_service", - "interests-ds/src/main/scala/com/twitter/interests_ds/jobs/interests_service:user_topic_relation_snapshot-scala", - "src/java/com/twitter/common/text/language:locale-util", - "src/scala/com/twitter/frigate/data_pipeline/scalding/magicrecs/magicrecs_notification_lite:magicrecs_notification_lite_1day_lag-scala", - "src/scala/com/twitter/onboarding/relevance/source:utt_account_recommendations-scala", - "src/scala/com/twitter/penguin/scalding/datasets:penguin_user_languages-scala", - "src/scala/com/twitter/simclusters_v2/hdfs_sources", - "src/scala/com/twitter/simclusters_v2/hdfs_sources:simclusters_v2_embeddings_lite-scala", - "src/scala/com/twitter/simclusters_v2/hdfs_sources/presto_hdfs_sources", - "src/scala/com/twitter/simclusters_v2/scalding/common", - "src/scala/com/twitter/simclusters_v2/scalding/common/matrix", - "src/scala/com/twitter/wtf/entity_real_graph/common", - "src/scala/com/twitter/wtf/entity_real_graph/scalding/common", - "src/scala/com/twitter/wtf/scalding/jobs/common:execution_app", - "src/scala/com/twitter/wtf/scalding/jobs/common:sources", - "src/scala/com/twitter/wtf/scalding/jobs/common:stats_util", - "src/thrift/com/twitter/hermit/candidate:hermit-candidate-scala", - "src/thrift/com/twitter/onboarding/relevance/candidates:candidates-scala", - "src/thrift/com/twitter/recos/entities:entities-thrift-scala", - "src/thrift/com/twitter/search/adaptive/scribing:adaptive-scribing-scala", - "src/thrift/com/twitter/wtf/entity_real_graph:entity_real_graph-thrift-scala", - "twadoop_config/configuration/log_categories/group/search:adaptive_search-scala", - "usersource/snapshot/src/main/scala/com/twitter/usersource/snapshot/flat:usersource_flat-scala", - "usersource/snapshot/src/main/thrift/com/twitter/usersource/snapshot/flat:flat-scala", - ], -) - -hadoop_binary( - name = "entity_embeddings_job-adhoc", - main = "com.twitter.simclusters_v2.scalding.embedding.EntityToSimClustersEmbeddingAdhocApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":embedding"], -) - -hadoop_binary( - name = "entity_per_language_embeddings_job-adhoc", - main = "com.twitter.simclusters_v2.scalding.embedding.LocaleEntitySimClustersEmbeddingAdhocApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":embedding"], -) - -hadoop_binary( - name = "semantic_core_entity_embeddings_dec11_model_job", - main = "com.twitter.simclusters_v2.scalding.embedding.SemanticCoreEntityEmbeddingsDec11ModelApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":embedding"], -) - -hadoop_binary( - name = "semantic_core_entity_embeddings_2020_job", - main = "com.twitter.simclusters_v2.scalding.embedding.SemanticCoreEntityEmbeddings2020App", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":embedding"], -) - -hadoop_binary( - name = "semantic_core_entity_embeddings_per_language_job", - main = "com.twitter.simclusters_v2.scalding.embedding.LocaleEntitySimClustersEmbeddingScheduledApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - 
dependencies = [":embedding"], -) - -hadoop_binary( - name = "locale_entity_simclusters_embedding_v2", - main = "com.twitter.simclusters_v2.scalding.embedding.LocaleEntitySimClustersEmbeddingV2ScheduledApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":embedding"], -) - -hadoop_binary( - name = "locale_entity_simclusters_embedding_v2-adhoc", - main = "com.twitter.simclusters_v2.scalding.embedding.LocaleEntitySimClustersEmbeddingV2AdhocApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":embedding"], -) - -hadoop_binary( - name = "producer_embeddings_from_interested_in-adhoc", - main = "com.twitter.simclusters_v2.scalding.embedding.ProducerEmbeddingsFromInterestedInAdhocApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":embedding"], -) - -hadoop_binary( - name = "producer_embeddings_from_interested_in_by_fav_score", - main = "com.twitter.simclusters_v2.scalding.embedding.ProducerEmbeddingsFromInterestedInByFavScoreBatchApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":embedding"], -) - -hadoop_binary( - name = "producer_embeddings_from_interested_in_by_fav_score_2020", - main = "com.twitter.simclusters_v2.scalding.embedding.ProducerEmbeddingsFromInterestedInByFavScore2020BatchApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":embedding"], -) - -hadoop_binary( - name = "producer_embeddings_from_interested_in_by_fav_score_dec11", - main = "com.twitter.simclusters_v2.scalding.embedding.ProducerEmbeddingsFromInterestedInByFavScoreDec11BatchApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":embedding"], -) - -hadoop_binary( - name = "producer_embeddings_from_interested_in_by_follow_score", - main = "com.twitter.simclusters_v2.scalding.embedding.ProducerEmbeddingsFromInterestedInByFollowScoreBatchApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":embedding"], -) - -hadoop_binary( - name = "producer_embeddings_from_interested_in_by_follow_score_2020", - main = "com.twitter.simclusters_v2.scalding.embedding.ProducerEmbeddingsFromInterestedInByFollowScore2020BatchApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":embedding"], -) - -hadoop_binary( - name = "producer_embeddings_from_interested_in_by_follow_score_dec11", - main = "com.twitter.simclusters_v2.scalding.embedding.ProducerEmbeddingsFromInterestedInByFollowScoreDec11BatchApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":embedding"], -) - -hadoop_binary( - name = "similar_users_by_simclusters_embeddings-adhoc", - main = "com.twitter.simclusters_v2.scalding.embedding.SimilarUsersBySimClustersEmbeddingAdhocApp", - platform = "java8", - runtime_platform = 
"java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":embedding"], -) - -hadoop_binary( - name = "similar_users_by_simclusters_embeddings", - main = "com.twitter.simclusters_v2.scalding.embedding.SimilarUsersBySimClustersEmbeddingBatchApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":embedding"], -) - -hadoop_binary( - name = "entity_embedding_from_producer_embedding-adhoc", - main = "com.twitter.simclusters_v2.scalding.embedding.EntityEmbeddingFromProducerEmbeddingAdhocJob", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":embedding"], -) - -hadoop_binary( - name = "entity_embedding_from_producer_embedding_job", - main = "com.twitter.simclusters_v2.scalding.embedding.EntityEmbeddingFromProducerEmbeddingScheduledJob", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":embedding"], -) - -# Generated with `capesospy-v2 create_target similar_users_by_simclusters_embeddings_job src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml`, config hash b8cf4d. -scalding_job( - name = "similar_users_by_simclusters_embeddings_job", - main = "com.twitter.simclusters_v2.scalding.embedding.SimiliarUsersBySimClustersEmbeddingBatchApp", - config = [ - ("hadoop.combine-input", "true"), - ("hadoop.map.jvm.total-memory", "3072m"), - ("hadoop.queue", "cassowary.default"), - ("hadoop.reduce.jvm.total-memory", "3072m"), - ("hadoop.submitter.jvm.total-memory", "5120m"), - ("submitter.tier", "preemptible"), - ], - cron = "15 * * * *", - hadoop_cluster = "atla-proc3", - platform = "java8", - role = "cassowary", - runtime_platform = "java8", - tags = [ - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":embedding"], -) - -hadoop_binary( - name = "global_simclusters_language_embedding_job", - main = "com.twitter.simclusters_v2.scalding.embedding.GlobalSimClustersLanguageEmbeddingBatchApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":embedding"], -) diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/BUILD.docx b/src/scala/com/twitter/simclusters_v2/scalding/embedding/BUILD.docx new file mode 100644 index 000000000..e45e9f88d Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/embedding/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/EntityEmbeddingFromProducerEmbeddingJob.docx b/src/scala/com/twitter/simclusters_v2/scalding/embedding/EntityEmbeddingFromProducerEmbeddingJob.docx new file mode 100644 index 000000000..1581e7bd8 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/embedding/EntityEmbeddingFromProducerEmbeddingJob.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/EntityEmbeddingFromProducerEmbeddingJob.scala b/src/scala/com/twitter/simclusters_v2/scalding/embedding/EntityEmbeddingFromProducerEmbeddingJob.scala deleted file mode 100644 index 4d2e3c205..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/embedding/EntityEmbeddingFromProducerEmbeddingJob.scala +++ /dev/null @@ -1,239 +0,0 @@ -package com.twitter.simclusters_v2.scalding.embedding - 
-import com.twitter.onboarding.relevance.candidates.thriftscala.InterestBasedUserRecommendations -import com.twitter.onboarding.relevance.candidates.thriftscala.UTTInterest -import com.twitter.onboarding.relevance.source.UttAccountRecommendationsScalaDataset -import com.twitter.scalding.Args -import com.twitter.scalding.DateRange -import com.twitter.scalding.Days -import com.twitter.scalding.Duration -import com.twitter.scalding.Execution -import com.twitter.scalding.RichDate -import com.twitter.scalding.UniqueID -import com.twitter.scalding.typed.TypedPipe -import com.twitter.scalding.typed.UnsortedGrouped -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.scalding_internal.dalv2.DALWrite._ -import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation -import com.twitter.scalding_internal.dalv2.remote_access.ProcAtla -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.common.ModelVersions -import com.twitter.simclusters_v2.common.SimClustersEmbedding -import com.twitter.simclusters_v2.hdfs_sources.AdhocKeyValSources -import com.twitter.simclusters_v2.hdfs_sources.ProducerEmbeddingSources -import com.twitter.simclusters_v2.hdfs_sources.SemanticCoreEmbeddingsFromProducerScalaDataset -import com.twitter.simclusters_v2.scalding.embedding.common.EmbeddingUtil._ -import com.twitter.simclusters_v2.thriftscala -import com.twitter.simclusters_v2.thriftscala.EmbeddingType -import com.twitter.simclusters_v2.thriftscala.InternalId -import com.twitter.simclusters_v2.thriftscala.ModelVersion -import com.twitter.simclusters_v2.thriftscala.SimClusterWithScore -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId -import com.twitter.simclusters_v2.thriftscala.TopSimClustersWithScore -import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp -import com.twitter.wtf.scalding.jobs.common.ScheduledExecutionApp -import com.twitter.wtf.scalding.jobs.common.StatsUtil._ -import java.util.TimeZone - -/* - $ ./bazel bundle src/scala/com/twitter/simclusters_v2/scalding/embedding:entity_embedding_from_producer_embedding-adhoc - - $ scalding remote run \ - --main-class com.twitter.simclusters_v2.scalding.embedding.EntityEmbeddingFromProducerEmbeddingAdhocJob \ - --target src/scala/com/twitter/simclusters_v2/scalding/embedding:entity_embedding_from_producer_embedding-adhoc \ - --user recos-platform \ - -- --date 2019-10-23 --model_version 20M_145K_updated - */ -object EntityEmbeddingFromProducerEmbeddingAdhocJob extends AdhocExecutionApp { - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - // step 1: read in (entity, producer) pairs and remove duplicates - val topK = args.getOrElse("top_k", "100").toInt - - val modelVersion = ModelVersions.toModelVersion( - args.getOrElse("model_version", ModelVersions.Model20M145KUpdated)) - - val entityKnownForProducers = - EntityEmbeddingFromProducerEmbeddingJob - .getNormalizedEntityProducerMatrix(dateRange.embiggen(Days(7))) - .count("num unique entity producer pairs").map { - case (entityId, producerId, score) => (producerId, (entityId, score)) - } - - // step 2: read in producer to simclusters embeddings - - val producersEmbeddingsFollowBased = - ProducerEmbeddingSources.producerEmbeddingSourceLegacy( - EmbeddingType.ProducerFollowBasedSemanticCoreEntity, - modelVersion)(dateRange.embiggen(Days(7))) - - val producersEmbeddingsFavBased = - 
ProducerEmbeddingSources.producerEmbeddingSourceLegacy( - EmbeddingType.ProducerFavBasedSemanticCoreEntity, - modelVersion)(dateRange.embiggen(Days(7))) - - // step 3: join producer embedding with entity, producer pairs and reformat result into format [SimClustersEmbeddingId, SimClustersEmbedding] - val producerBasedEntityEmbeddingsFollowBased = - EntityEmbeddingFromProducerEmbeddingJob - .computeEmbedding( - producersEmbeddingsFollowBased, - entityKnownForProducers, - topK, - modelVersion, - EmbeddingType.ProducerFollowBasedSemanticCoreEntity).toTypedPipe.count( - "follow_based_entity_count") - - val producerBasedEntityEmbeddingsFavBased = - EntityEmbeddingFromProducerEmbeddingJob - .computeEmbedding( - producersEmbeddingsFavBased, - entityKnownForProducers, - topK, - modelVersion, - EmbeddingType.ProducerFavBasedSemanticCoreEntity).toTypedPipe.count( - "fav_based_entity_count") - - val producerBasedEntityEmbeddings = - producerBasedEntityEmbeddingsFollowBased ++ producerBasedEntityEmbeddingsFavBased - - // step 4: write results to file - producerBasedEntityEmbeddings - .count("total_count").writeExecution( - AdhocKeyValSources.entityToClustersSource( - getHdfsPath(isAdhoc = true, isManhattanKeyVal = true, modelVersion, "producer"))) - } - -} - -/* - $ ./bazel bundle src/scala/com/twitter/simclusters_v2/scalding/embedding:entity_embedding_from_producer_embedding_job - $ capesospy-v2 update \ - --build_locally \ - --start_cron entity_embedding_from_producer_embedding_job src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml - */ -object EntityEmbeddingFromProducerEmbeddingScheduledJob extends ScheduledExecutionApp { - override def firstTime: RichDate = RichDate("2019-10-16") - - override def batchIncrement: Duration = Days(7) - - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - // parse args: modelVersion, topK - val topK = args.getOrElse("top_k", "100").toInt - // defaults to the updated (20M_145K_updated) model version for producer embeddings - val modelVersion = - ModelVersions.toModelVersion( - args.getOrElse("model_version", ModelVersions.Model20M145KUpdated)) - - val entityKnownForProducers = - EntityEmbeddingFromProducerEmbeddingJob - .getNormalizedEntityProducerMatrix(dateRange.embiggen(Days(7))) - .count("num unique entity producer pairs").map { - case (entityId, producerId, score) => (producerId, (entityId, score)) - } - - val favBasedEmbeddings = EntityEmbeddingFromProducerEmbeddingJob - .computeEmbedding( - ProducerEmbeddingSources.producerEmbeddingSourceLegacy( - EmbeddingType.ProducerFavBasedSemanticCoreEntity, - modelVersion)(dateRange.embiggen(Days(7))), - entityKnownForProducers, - topK, - modelVersion, - EmbeddingType.ProducerFavBasedSemanticCoreEntity - ).toTypedPipe.count("fav_based_entity_count") - - val followBasedEmbeddings = EntityEmbeddingFromProducerEmbeddingJob - .computeEmbedding( - ProducerEmbeddingSources.producerEmbeddingSourceLegacy( - EmbeddingType.ProducerFollowBasedSemanticCoreEntity, - modelVersion)(dateRange.embiggen(Days(7))), - entityKnownForProducers, - topK, - modelVersion, - EmbeddingType.ProducerFollowBasedSemanticCoreEntity - ).toTypedPipe.count("follow_based_entity_count") - - val embedding = favBasedEmbeddings ++ followBasedEmbeddings - - embedding - .count("total_count") - .map { - case (embeddingId, embedding) => KeyVal(embeddingId, embedding) - }.writeDALVersionedKeyValExecution( 
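// Editor's note: this emits a fresh version of the key-val dataset rooted at the
// D.Suffix path below, so downstream key-val consumers can pick up the newest
// complete version.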
SemanticCoreEmbeddingsFromProducerScalaDataset, - D.Suffix(getHdfsPath(isAdhoc = false, isManhattanKeyVal = true, modelVersion, "producer")) - ) - - } - -} - -private object EntityEmbeddingFromProducerEmbeddingJob { - def computeEmbedding( - producersEmbeddings: TypedPipe[(Long, TopSimClustersWithScore)], - entityKnownForProducers: TypedPipe[(Long, (Long, Double))], - topK: Int, - modelVersion: ModelVersion, - embeddingType: EmbeddingType - ): UnsortedGrouped[SimClustersEmbeddingId, thriftscala.SimClustersEmbedding] = { - producersEmbeddings - .hashJoin(entityKnownForProducers).flatMap { - case (_, (topSimClustersWithScore, (entityId, producerScore))) => { - val entityEmbedding = topSimClustersWithScore.topClusters - entityEmbedding.map { - case SimClusterWithScore(clusterId, score) => - ( - ( - SimClustersEmbeddingId( - embeddingType, - modelVersion, - InternalId.EntityId(entityId)), - clusterId), - score * producerScore) - } - } - }.sumByKey.map { - case ((embeddingId, clusterId), clusterScore) => - (embeddingId, (clusterId, clusterScore)) - }.group.sortedReverseTake(topK)(Ordering.by(_._2)).mapValues(SimClustersEmbedding - .apply(_).toThrift) - } - - def getNormalizedEntityProducerMatrix( - implicit dateRange: DateRange - ): TypedPipe[(Long, Long, Double)] = { - val uttRecs: TypedPipe[(UTTInterest, InterestBasedUserRecommendations)] = - DAL - .readMostRecentSnapshot(UttAccountRecommendationsScalaDataset).withRemoteReadPolicy( - ExplicitLocation(ProcAtla)).toTypedPipe.map { - case KeyVal(interest, candidates) => (interest, candidates) - } - - uttRecs - .flatMap { - case (interest, candidates) => { - // current populated features - val top20Producers = candidates.recommendations.sortBy(-_.score.getOrElse(0.0d)).take(20) - val producerScorePairs = top20Producers.map { producer => - (producer.candidateUserID, producer.score.getOrElse(0.0)) - } - val scoreSum = producerScorePairs.map(_._2).sum - producerScorePairs.map { - case (producerId, score) => (interest.uttID, producerId, score / scoreSum) - } - } - } - } - -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/EntityToSimClustersEmbeddingsJob.docx b/src/scala/com/twitter/simclusters_v2/scalding/embedding/EntityToSimClustersEmbeddingsJob.docx new file mode 100644 index 000000000..9eaa0edf6 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/embedding/EntityToSimClustersEmbeddingsJob.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/EntityToSimClustersEmbeddingsJob.scala b/src/scala/com/twitter/simclusters_v2/scalding/embedding/EntityToSimClustersEmbeddingsJob.scala deleted file mode 100644 index 21d68ee22..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/embedding/EntityToSimClustersEmbeddingsJob.scala +++ /dev/null @@ -1,354 +0,0 @@ -package com.twitter.simclusters_v2.scalding.embedding - -import com.twitter.dal.client.dataset.KeyValDALDataset -import com.twitter.recos.entities.thriftscala.Entity -import com.twitter.recos.entities.thriftscala.Hashtag -import com.twitter.recos.entities.thriftscala.SemanticCoreEntity -import com.twitter.scalding._ -import com.twitter.scalding_internal.dalv2.DALWrite._ -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.common.ModelVersions -import com.twitter.simclusters_v2.common.SimClustersEmbedding -import com.twitter.simclusters_v2.hdfs_sources._ -import com.twitter.simclusters_v2.scalding.embedding.common.EmbeddingUtil -import 
com.twitter.simclusters_v2.scalding.embedding.common.EmbeddingUtil._ -import com.twitter.simclusters_v2.scalding.embedding.common.EntityEmbeddingUtil -import com.twitter.simclusters_v2.scalding.embedding.common.SimClustersEmbeddingJob -import com.twitter.simclusters_v2.thriftscala.{ - SimClustersEmbedding => ThriftSimClustersEmbedding, - _ -} -import com.twitter.wtf.entity_real_graph.common.EntityUtil -import com.twitter.wtf.entity_real_graph.thriftscala.EntityType -import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp -import com.twitter.wtf.scalding.jobs.common.DataSources -import com.twitter.wtf.scalding.jobs.common.ScheduledExecutionApp -import java.util.TimeZone - -/** - * $ ./bazel bundle src/scala/com/twitter/simclusters_v2/scalding/embedding:entity_embeddings_job-adhoc - * - * ---------------------- Deploy to atla ---------------------- - * $ scalding remote run \ - --main-class com.twitter.simclusters_v2.scalding.embedding.EntityToSimClustersEmbeddingAdhocApp \ - --target src/scala/com/twitter/simclusters_v2/scalding/embedding:entity_embeddings_job-adhoc \ - --user recos-platform \ - -- --date 2019-09-09 --model-version 20M_145K_updated --entity-type SemanticCore - */ -object EntityToSimClustersEmbeddingAdhocApp extends AdhocExecutionApp { - - import EmbeddingUtil._ - import EntityEmbeddingUtil._ - import EntityToSimClustersEmbeddingsJob._ - import EntityUtil._ - import SimClustersEmbeddingJob._ - - def writeOutput( - embeddings: TypedPipe[(SimClustersEmbeddingId, (ClusterId, EmbeddingScore))], - topKEmbeddings: TypedPipe[(SimClustersEmbeddingId, Seq[(ClusterId, EmbeddingScore)])], - jobConfig: EntityEmbeddingsJobConfig - ): Execution[Unit] = { - - val toSimClusterEmbeddingExec = topKEmbeddings - .mapValues(SimClustersEmbedding.apply(_).toThrift) - .writeExecution( - AdhocKeyValSources.entityToClustersSource( - EntityToSimClustersEmbeddingsJob.getHdfsPath( - isAdhoc = true, - isManhattanKeyVal = true, - isReverseIndex = false, - jobConfig.modelVersion, - jobConfig.entityType))) - - val fromSimClusterEmbeddingExec = - toReverseIndexSimClusterEmbedding(embeddings, jobConfig.topK) - .writeExecution( - AdhocKeyValSources.clusterToEntitiesSource( - EntityToSimClustersEmbeddingsJob.getHdfsPath( - isAdhoc = true, - isManhattanKeyVal = true, - isReverseIndex = true, - jobConfig.modelVersion, - jobConfig.entityType))) - - Execution.zip(toSimClusterEmbeddingExec, fromSimClusterEmbeddingExec).unit - } - - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - - val jobConfig = EntityEmbeddingsJobConfig(args, isAdhoc = true) - - val numReducers = args.getOrElse("m", "1000").toInt - - /* - Using the ERG daily dataset in the adhoc job for quick prototyping, note that there may be - issues with scaling the job when productionizing on ERG aggregated dataset. - */ - val entityRealGraphSource = DataSources.entityRealGraphDailyDataSetSource - - val entityUserMatrix: TypedPipe[(Entity, (UserId, Double))] = - (jobConfig.entityType match { - case EntityType.SemanticCore => - getEntityUserMatrix(entityRealGraphSource, jobConfig.halfLife, EntityType.SemanticCore) - case EntityType.Hashtag => - getEntityUserMatrix(entityRealGraphSource, jobConfig.halfLife, EntityType.Hashtag) - case _ => - throw new IllegalArgumentException( - s"Argument [--entity-type] must be provided. 
Supported options [${EntityType.SemanticCore.name}, ${EntityType.Hashtag.name}]") - }).forceToDisk - - val normalizedUserEntityMatrix = - getNormalizedTransposeInputMatrix(entityUserMatrix, numReducers = Some(numReducers)) - - //determine which data source to use based on model version - val simClustersSource = jobConfig.modelVersion match { - case ModelVersion.Model20m145kUpdated => - InterestedInSources.simClustersInterestedInUpdatedSource(dateRange, timeZone) - case _ => - InterestedInSources.simClustersInterestedInDec11Source(dateRange, timeZone) - } - - val embeddings = computeEmbeddings( - simClustersSource, - normalizedUserEntityMatrix, - scoreExtractors, - ModelVersion.Model20m145kUpdated, - toSimClustersEmbeddingId(jobConfig.modelVersion), - numReducers = Some(numReducers * 2) - ) - - val topKEmbeddings = - embeddings.group - .sortedReverseTake(jobConfig.topK)(Ordering.by(_._2)) - .withReducers(numReducers) - - writeOutput(embeddings, topKEmbeddings, jobConfig) - } -} - -/** - * $ ./bazel bundle src/scala/com/twitter/simclusters_v2/scalding/embedding:semantic_core_entity_embeddings_2020_job - * $ capesospy-v2 update \ - --build_locally \ - --start_cron semantic_core_entity_embeddings_2020_job src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml - */ -object SemanticCoreEntityEmbeddings2020App extends EntityToSimClustersEmbeddingApp - -trait EntityToSimClustersEmbeddingApp extends ScheduledExecutionApp { - - import EmbeddingUtil._ - import EntityEmbeddingUtil._ - import EntityToSimClustersEmbeddingsJob._ - import EntityUtil._ - import SimClustersEmbeddingJob._ - - override val firstTime: RichDate = RichDate("2023-01-01") - - override val batchIncrement: Duration = Days(7) - - private def writeOutput( - embeddings: TypedPipe[(SimClustersEmbeddingId, (ClusterId, EmbeddingScore))], - topKEmbeddings: TypedPipe[(SimClustersEmbeddingId, Seq[(ClusterId, EmbeddingScore)])], - jobConfig: EntityEmbeddingsJobConfig, - clusterEmbeddingsDataset: KeyValDALDataset[ - KeyVal[SimClustersEmbeddingId, ThriftSimClustersEmbedding] - ], - entityEmbeddingsDataset: KeyValDALDataset[KeyVal[SimClustersEmbeddingId, InternalIdEmbedding]] - ): Execution[Unit] = { - - val toSimClustersEmbeddings = - topKEmbeddings - .mapValues(SimClustersEmbedding.apply(_).toThrift) - .map { - case (entityId, topSimClusters) => KeyVal(entityId, topSimClusters) - } - .writeDALVersionedKeyValExecution( - clusterEmbeddingsDataset, - D.Suffix( - EntityToSimClustersEmbeddingsJob.getHdfsPath( - isAdhoc = false, - isManhattanKeyVal = true, - isReverseIndex = false, - jobConfig.modelVersion, - jobConfig.entityType)) - ) - - val fromSimClustersEmbeddings = - toReverseIndexSimClusterEmbedding(embeddings, jobConfig.topK) - .map { - case (embeddingId, internalIdsWithScore) => - KeyVal(embeddingId, internalIdsWithScore) - } - .writeDALVersionedKeyValExecution( - entityEmbeddingsDataset, - D.Suffix( - EntityToSimClustersEmbeddingsJob.getHdfsPath( - isAdhoc = false, - isManhattanKeyVal = true, - isReverseIndex = true, - jobConfig.modelVersion, - jobConfig.entityType)) - ) - - Execution.zip(toSimClustersEmbeddings, fromSimClustersEmbeddings).unit - } - - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - - val jobConfig = EntityEmbeddingsJobConfig(args, isAdhoc = false) - - val embeddingsDataset = EntityEmbeddingsSources.getEntityEmbeddingsDataset( - jobConfig.entityType, - 
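// Editor's note: toKnownForModelVersion maps the thrift ModelVersion enum back to
// its string form (e.g. "20M_145K_updated") so the dataset lookup matches the
// published dataset names.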
ModelVersions.toKnownForModelVersion(jobConfig.modelVersion) - ) - - val reverseIndexEmbeddingsDataset = - EntityEmbeddingsSources.getReverseIndexedEntityEmbeddingsDataset( - jobConfig.entityType, - ModelVersions.toKnownForModelVersion(jobConfig.modelVersion) - ) - - val entityRealGraphSource = - DataSources.entityRealGraphAggregationDataSetSource(dateRange.embiggen(Days(7))) - - val entityUserMatrix: TypedPipe[(Entity, (UserId, Double))] = - getEntityUserMatrix( - entityRealGraphSource, - jobConfig.halfLife, - jobConfig.entityType).forceToDisk - - val normalizedUserEntityMatrix = getNormalizedTransposeInputMatrix(entityUserMatrix) - - val simClustersEmbedding = jobConfig.modelVersion match { - case ModelVersion.Model20m145k2020 => - val simClustersSource2020 = - InterestedInSources.simClustersInterestedIn2020Source(dateRange, timeZone) - computeEmbeddings( - simClustersSource2020, - normalizedUserEntityMatrix, - scoreExtractors, - ModelVersion.Model20m145k2020, - toSimClustersEmbeddingId(ModelVersion.Model20m145k2020) - ) - case modelVersion => - throw new IllegalArgumentException(s"Model Version ${modelVersion.name} not supported") - } - - val topKEmbeddings = - simClustersEmbedding.group.sortedReverseTake(jobConfig.topK)(Ordering.by(_._2)) - - val simClustersEmbeddingsExec = - writeOutput( - simClustersEmbedding, - topKEmbeddings, - jobConfig, - embeddingsDataset, - reverseIndexEmbeddingsDataset) - - // We don't support embeddingsLite for the 2020 model version. - val embeddingsLiteExec = if (jobConfig.modelVersion == ModelVersion.Model20m145kUpdated) { - topKEmbeddings - .collect { - case ( - SimClustersEmbeddingId( - EmbeddingType.FavBasedSematicCoreEntity, - ModelVersion.Model20m145kUpdated, - InternalId.EntityId(entityId)), - clustersWithScores) => - entityId -> clustersWithScores - } - .flatMap { - case (entityId, clustersWithScores) => - clustersWithScores.map { - case (clusterId, score) => EmbeddingsLite(entityId, clusterId, score) - } - case _ => Nil - }.writeDALSnapshotExecution( - SimclustersV2EmbeddingsLiteScalaDataset, - D.Daily, - D.Suffix(embeddingsLitePath(ModelVersion.Model20m145kUpdated, "fav_based")), - D.EBLzo(), - dateRange.end) - } else { - Execution.unit - } - - Execution - .zip(simClustersEmbeddingsExec, embeddingsLiteExec).unit - } -} - -object EntityToSimClustersEmbeddingsJob { - - def toSimClustersEmbeddingId( - modelVersion: ModelVersion - ): (Entity, ScoreType.ScoreType) => SimClustersEmbeddingId = { - case (Entity.SemanticCore(SemanticCoreEntity(entityId, _)), ScoreType.FavScore) => - SimClustersEmbeddingId( - EmbeddingType.FavBasedSematicCoreEntity, - modelVersion, - InternalId.EntityId(entityId)) - case (Entity.SemanticCore(SemanticCoreEntity(entityId, _)), ScoreType.FollowScore) => - SimClustersEmbeddingId( - EmbeddingType.FollowBasedSematicCoreEntity, - modelVersion, - InternalId.EntityId(entityId)) - case (Entity.Hashtag(Hashtag(hashtag)), ScoreType.FavScore) => - SimClustersEmbeddingId( - EmbeddingType.FavBasedHashtagEntity, - modelVersion, - InternalId.Hashtag(hashtag)) - case (Entity.Hashtag(Hashtag(hashtag)), ScoreType.FollowScore) => - SimClustersEmbeddingId( - EmbeddingType.FollowBasedHashtagEntity, - modelVersion, - InternalId.Hashtag(hashtag)) - case (scoreType, entity) => - throw new IllegalArgumentException( - s"(ScoreType, Entity) ($scoreType, ${entity.toString}) not supported") - } - - /** - * Generates the output path for the Entity Embeddings Job. 
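 * (Editor's note: per the examples below, adhoc runs write under the submitting
 * user's .../processed/adhoc tree, while scheduled runs write under the
 * production cassowary user's tree.)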
- * - * Example Adhoc: /user/recos-platform/processed/adhoc/simclusters_embeddings/hashtag/model_20m_145k_updated - * Example Prod: /atla/proc/user/cassowary/processed/simclusters_embeddings/semantic_core/model_20m_145k_dec11 - * - */ - def getHdfsPath( - isAdhoc: Boolean, - isManhattanKeyVal: Boolean, - isReverseIndex: Boolean, - modelVersion: ModelVersion, - entityType: EntityType - ): String = { - - val reverseIndex = if (isReverseIndex) "reverse_index/" else "" - - val entityTypeSuffix = entityType match { - case EntityType.SemanticCore => "semantic_core" - case EntityType.Hashtag => "hashtag" - case _ => "unknown" - } - - val pathSuffix = s"$reverseIndex$entityTypeSuffix" - - EmbeddingUtil.getHdfsPath(isAdhoc, isManhattanKeyVal, modelVersion, pathSuffix) - } - - def embeddingsLitePath(modelVersion: ModelVersion, pathSuffix: String): String = { - s"/user/cassowary/processed/entity_real_graph/simclusters_embedding/lite/$modelVersion/$pathSuffix/" - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/GlobalSimClustersLanguageEmbedding.docx b/src/scala/com/twitter/simclusters_v2/scalding/embedding/GlobalSimClustersLanguageEmbedding.docx new file mode 100644 index 000000000..03346f40b Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/embedding/GlobalSimClustersLanguageEmbedding.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/GlobalSimClustersLanguageEmbedding.scala b/src/scala/com/twitter/simclusters_v2/scalding/embedding/GlobalSimClustersLanguageEmbedding.scala deleted file mode 100644 index 2a66a8a8e..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/embedding/GlobalSimClustersLanguageEmbedding.scala +++ /dev/null @@ -1,197 +0,0 @@ -package com.twitter.simclusters_v2.scalding.embedding - -import com.twitter.dal.client.dataset.KeyValDALDataset -import com.twitter.dal.client.dataset.SnapshotDALDataset -import com.twitter.scalding.DateRange -import com.twitter.scalding.Days -import com.twitter.scalding.UniqueID -import com.twitter.scalding._ -import com.twitter.scalding.typed.TypedPipe -import com.twitter.scalding_internal.dalv2.DALWrite.D -import com.twitter.scalding_internal.dalv2.DALWrite.ExplicitEndTime -import com.twitter.scalding_internal.dalv2.DALWrite.WriteExtension -import com.twitter.scalding_internal.job.RequiredBinaryComparators.ordSer -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.common.Country -import com.twitter.simclusters_v2.common.Language -import com.twitter.simclusters_v2.common.Timestamp -import com.twitter.simclusters_v2.common.TweetId -import com.twitter.simclusters_v2.common.UserId -import com.twitter.simclusters_v2.hdfs_sources.InterestedInSources -import com.twitter.simclusters_v2.scalding.embedding.common.ExternalDataSources -import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn -import com.twitter.simclusters_v2.thriftscala.InternalId.ClusterId -import com.twitter.simclusters_v2.thriftscala.ModelVersion -import com.twitter.simclusters_v2.thriftscala.UserToInterestedInClusterScores -import com.twitter.wtf.scalding.jobs.common.ScheduledExecutionApp -import com.twitter.simclusters_v2.hdfs_sources.SimclustersV2GlobalLanguageEmbeddingScalaDataset -import com.twitter.simclusters_v2.hdfs_sources.SimclustersV2GlobalLanguageEmbeddingThriftScalaDataset -import com.twitter.simclusters_v2.thriftscala.LanguageToClusters -import java.util.TimeZone - -/** -capesospy-v2 update --build_locally --start_cron \ - 
--start_cron global_simclusters_language_embedding_job \ - src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml - */ -object GlobalSimClustersLanguageEmbeddingBatchApp extends ScheduledExecutionApp { - - override val firstTime: RichDate = RichDate("2023-03-07") - - override val batchIncrement: Duration = Days(1) - - val outputHdfsDirectory = - "/user/cassowary/manhattan_sequence_files/global_simclusters_language_embeddings" - - val outputThriftHdfsDirectory = - "/user/cassowary/processed/global_simclusters_language_embeddings" - - val globalLanguageEmbeddingsKeyValDataset: KeyValDALDataset[ - KeyVal[String, ClustersUserIsInterestedIn] - ] = SimclustersV2GlobalLanguageEmbeddingScalaDataset - - val globalLanguageEmbeddingsThriftDataset: SnapshotDALDataset[LanguageToClusters] = - SimclustersV2GlobalLanguageEmbeddingThriftScalaDataset - - val numOfClustersPerLanguage: Int = 400 - - def getInterestedInFn: ( - DateRange, - TimeZone - ) => TypedPipe[(UserId, ClustersUserIsInterestedIn)] = - InterestedInSources.simClustersInterestedIn2020Source - - def flattenAndFilterUserInterestedIn( - interestedIn: TypedPipe[(UserId, ClustersUserIsInterestedIn)] - ): TypedPipe[(UserId, (Int, Double))] = { - interestedIn - // Get (userId, Seq[(clusterId, scores)] - .map { - case (user, clusterUserIsInterestedIn) => { - (user, clusterUserIsInterestedIn.clusterIdToScores) - } - } - // Flatten it into (UserId, ClusterId, LogFavScore) - .flatMap { - case (userId, clusterUserIsInterestedIn) => { - clusterUserIsInterestedIn.toSeq.map { - case (clusterId, scores) => { - (userId, (clusterId, scores.logFavScore.getOrElse(0.0))) - } - } - } - }.filter(_._2._2 > 0.0) // Filter out zero scores - } - - def getGlobalSimClustersEmbeddingPerLanguage( - interestedIn: TypedPipe[(UserId, (Int, Double))], - favEdges: TypedPipe[(UserId, TweetId, Timestamp)], - language: TypedPipe[(UserId, (Country, Language))] - ): TypedPipe[(Language, ClustersUserIsInterestedIn)] = { - // Engagement fav edges - val edges = favEdges.map { case (userId, tweetId, ts) => (userId, (tweetId, ts)) } - - // Language information for users - val userLanguage = language.map { - case (userId, (country, lang)) => (userId, lang) - } - val numUsersPerLanguage = userLanguage.map { - case (_, lang) => (lang, 1L) - }.sumByKey - - val embeddings = - interestedIn - .join(edges) // Join InterestedIn and user-tweet engagements - .map { - case (userId, ((clusterId, score), (_, _))) => { - (userId, (clusterId, score)) - } - } - .join(userLanguage) // Join and get cluster scores per language - .map { - case (userId, ((clusterId, score), lang)) => { - ((lang, clusterId), score) - } - } - .sumByKey // Sum the user embeddings per language based on the engagements - .map { case ((lang, clusterId), score) => (lang, (clusterId, score)) } - .join(numUsersPerLanguage) - // We compute the average cluster scores per language - .map { - case (lang, ((clusterId, score), count)) => (lang, (clusterId -> score / count)) - } - .group - .sortedReverseTake(numOfClustersPerLanguage)(Ordering - .by(_._2)) // Take top 400 clusters per language - .flatMap { - case (lang, clusterScores) => { - clusterScores.map { - case (clusterId, score) => (lang, (clusterId, score)) - } - } - }.mapValues { case (clusterId, score) => Map(clusterId -> score) } - - // Build the final SimClusters embeddings per language - embeddings.sumByKey.map { - case (lang, clusterToScore) => { - val clusterScores = clusterToScore.map { - case (clusterId, score) => - clusterId -> 
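// Editor's note: the averaged per-language logFav scores are packed back into the
// standard UserToInterestedInClusterScores / ClustersUserIsInterestedIn thrift
// shapes, so these global language embeddings reuse the same representation as
// per-user InterestedIn data.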
UserToInterestedInClusterScores(logFavScore = Some(score)) - } - (lang, ClustersUserIsInterestedIn(ModelVersion.Model20m145k2020.name, clusterScores)) - } - } - } - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - // Read the most recent InterestedIn snapshot from the past 21 days - val interestedIn = - InterestedInSources - .simClustersInterestedIn2020Source(dateRange.prepend(Days(21)), timeZone).forceToDisk - - // Get the user tweet fav engagement history from the past 2 days - val userTweetFavEdges = ExternalDataSources.userTweetFavoritesSource - - // Read user language from UserSource - val userLanguages = ExternalDataSources.userSource - - val globalEmbeddings = getGlobalSimClustersEmbeddingPerLanguage( - flattenAndFilterUserInterestedIn(interestedIn), - userTweetFavEdges, - userLanguages) - - // Write results as a key-val dataset - globalEmbeddings - .map { - case (lang, embeddings) => - KeyVal(lang, embeddings) - } - .writeDALVersionedKeyValExecution( - globalLanguageEmbeddingsKeyValDataset, - D.Suffix(outputHdfsDirectory) - ) - - // Write results as a thrift dataset - globalEmbeddings - .map { - case (lang, clusterUserIsInterestedIn) => - LanguageToClusters( - lang, - clusterUserIsInterestedIn.knownForModelVersion, - clusterUserIsInterestedIn.clusterIdToScores - ) - } - .writeDALSnapshotExecution( - globalLanguageEmbeddingsThriftDataset, - D.Daily, - D.Suffix(outputThriftHdfsDirectory), - D.Parquet, - dateRange.`end` - ) - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/LocaleEntitySimClustersEmbeddingV2Job.docx b/src/scala/com/twitter/simclusters_v2/scalding/embedding/LocaleEntitySimClustersEmbeddingV2Job.docx new file mode 100644 index 000000000..347a8a702 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/embedding/LocaleEntitySimClustersEmbeddingV2Job.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/LocaleEntitySimClustersEmbeddingV2Job.scala b/src/scala/com/twitter/simclusters_v2/scalding/embedding/LocaleEntitySimClustersEmbeddingV2Job.scala deleted file mode 100644 index baf604cba..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/embedding/LocaleEntitySimClustersEmbeddingV2Job.scala +++ /dev/null @@ -1,248 +0,0 @@ -package com.twitter.simclusters_v2.scalding.embedding - -import com.twitter.bijection.{Bufferable, Injection} -import com.twitter.recos.entities.thriftscala.{Entity, SemanticCoreEntity} -import com.twitter.scalding.{DateRange, Days, Duration, Execution, RichDate, TypedPipe, UniqueID} -import com.twitter.scalding_internal.dalv2.DALWrite._ -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.common._ -import com.twitter.simclusters_v2.hdfs_sources.{AdhocKeyValSources, EntityEmbeddingsSources} -import com.twitter.simclusters_v2.scalding.common.matrix.{SparseMatrix, SparseRowMatrix} -import com.twitter.simclusters_v2.scalding.embedding.common.EmbeddingUtil.ClusterId -import com.twitter.simclusters_v2.scalding.embedding.common.{ - EmbeddingUtil, - ExternalDataSources, - SimClustersEmbeddingBaseJob -} -import com.twitter.simclusters_v2.thriftscala.{ - EmbeddingType, - InternalId, - InternalIdEmbedding, - InternalIdWithScore, - LocaleEntityId, - ModelVersion, - SimClustersEmbeddingId -} -import com.twitter.wtf.entity_real_graph.thriftscala.{Edge, FeatureName} -import com.twitter.wtf.scalding.jobs.common.{AdhocExecutionApp, 
DataSources, ScheduledExecutionApp} -import java.util.TimeZone - -/** - * Scheduled production job which generates topic embeddings per locale based on Entity Real Graph. - * - * V2 Uses the log transform of the ERG favScores and the SimCluster InterestedIn scores. - * - * $ ./bazel bundle src/scala/com/twitter/simclusters_v2/scalding/embedding:locale_entity_simclusters_embedding_v2 - * $ capesospy-v2 update \ - --build_locally \ - --start_cron locale_entity_simclusters_embedding_v2 src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml - */ -object LocaleEntitySimClustersEmbeddingV2ScheduledApp - extends LocaleEntitySimClustersEmbeddingV2Job - with ScheduledExecutionApp { - - override val firstTime: RichDate = RichDate("2020-04-08") - - override val batchIncrement: Duration = Days(1) - - override def writeNounToClustersIndex( - output: TypedPipe[(LocaleEntity, Seq[(ClusterId, Double)])] - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - - output - .map { - case ((entityId, lang), clustersWithScores) => - KeyVal( - SimClustersEmbeddingId( - EmbeddingType.LogFavBasedLocaleSemanticCoreEntity, - ModelVersion.Model20m145kUpdated, - InternalId.LocaleEntityId(LocaleEntityId(entityId, lang)) - ), - SimClustersEmbedding(clustersWithScores).toThrift - ) - } - .writeDALVersionedKeyValExecution( - EntityEmbeddingsSources.LogFavSemanticCorePerLanguageSimClustersEmbeddingsDataset, - D.Suffix( - EmbeddingUtil.getHdfsPath( - isAdhoc = false, - isManhattanKeyVal = true, - ModelVersion.Model20m145kUpdated, - pathSuffix = "log_fav_erg_based_embeddings")) - ) - } - - override def writeClusterToNounsIndex( - output: TypedPipe[(ClusterId, Seq[(LocaleEntity, Double)])] - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - output - .map { - case (clusterId, nounsWithScore) => - KeyVal( - SimClustersEmbeddingId( - EmbeddingType.LogFavBasedLocaleSemanticCoreEntity, - ModelVersion.Model20m145kUpdated, - InternalId.ClusterId(clusterId) - ), - InternalIdEmbedding(nounsWithScore.map { - case ((entityId, lang), score) => - InternalIdWithScore( - InternalId.LocaleEntityId(LocaleEntityId(entityId, lang)), - score) - }) - ) - } - .writeDALVersionedKeyValExecution( - EntityEmbeddingsSources.LogFavReverseIndexSemanticCorePerLanguageSimClustersEmbeddingsDataset, - D.Suffix( - EmbeddingUtil.getHdfsPath( - isAdhoc = false, - isManhattanKeyVal = true, - ModelVersion.Model20m145kUpdated, - pathSuffix = "reverse_index_log_fav_erg_based_embeddings")) - ) - } -} - -/** - * $ ./bazel bundle src/scala/com/twitter/simclusters_v2/scalding/embedding:locale_entity_simclusters_embedding_v2-adhoc - * - * $ scalding remote run \ - --main-class com.twitter.simclusters_v2.scalding.embedding.LocaleEntitySimClustersEmbeddingV2AdhocApp \ - --target src/scala/com/twitter/simclusters_v2/scalding/embedding:locale_entity_simclusters_embedding_v2-adhoc \ - --user recos-platform --reducers 2000\ - -- --date 2020-04-06 - */ -object LocaleEntitySimClustersEmbeddingV2AdhocApp - extends LocaleEntitySimClustersEmbeddingV2Job - with AdhocExecutionApp { - - override def writeNounToClustersIndex( - output: TypedPipe[(LocaleEntity, Seq[(ClusterId, Double)])] - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - - output - .map { - case ((entityId, lang), clustersWithScores) => - SimClustersEmbeddingId( - EmbeddingType.LogFavBasedLocaleSemanticCoreEntity, - 
ModelVersion.Model20m145kUpdated, - InternalId.LocaleEntityId(LocaleEntityId(entityId, lang)) - ) -> SimClustersEmbedding(clustersWithScores).toThrift - - }.writeExecution( - AdhocKeyValSources.entityToClustersSource( - EmbeddingUtil.getHdfsPath( - isAdhoc = true, - isManhattanKeyVal = true, - ModelVersion.Model20m145kUpdated, - pathSuffix = "log_fav_erg_based_embeddings"))) - } - - override def writeClusterToNounsIndex( - output: TypedPipe[(ClusterId, Seq[(LocaleEntity, Double)])] - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - - output - .map { - case (clusterId, nounsWithScore) => - SimClustersEmbeddingId( - EmbeddingType.LogFavBasedLocaleSemanticCoreEntity, - ModelVersion.Model20m145kUpdated, - InternalId.ClusterId(clusterId) - ) -> - InternalIdEmbedding(nounsWithScore.map { - case ((entityId, lang), score) => - InternalIdWithScore( - InternalId.LocaleEntityId(LocaleEntityId(entityId, lang)), - score) - }) - } - .writeExecution( - AdhocKeyValSources.clusterToEntitiesSource( - EmbeddingUtil.getHdfsPath( - isAdhoc = true, - isManhattanKeyVal = true, - ModelVersion.Model20m145kUpdated, - pathSuffix = "reverse_index_log_fav_erg_based_embeddings"))) - } -} - -trait LocaleEntitySimClustersEmbeddingV2Job extends SimClustersEmbeddingBaseJob[LocaleEntity] { - - override val numClustersPerNoun = 100 - - override val numNounsPerClusters = 100 - - override val thresholdForEmbeddingScores: Double = 0.001 - - override val numReducersOpt: Option[Int] = Some(8000) - - private val DefaultERGHalfLifeInDays = 14 - - private val MinInterestedInLogFavScore = 0.0 - - implicit val inj: Injection[LocaleEntity, Array[Byte]] = Bufferable.injectionOf[LocaleEntity] - - override def prepareNounToUserMatrix( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): SparseMatrix[LocaleEntity, UserId, Double] = { - - val erg: TypedPipe[(SemanticCoreEntityId, (UserId, Double))] = - DataSources.entityRealGraphAggregationDataSetSource(dateRange.embiggen(Days(7))).flatMap { - case Edge( - userId, - Entity.SemanticCore(SemanticCoreEntity(entityId, _)), - consumerFeatures, - _, - _) if consumerFeatures.exists(_.exists(_.featureName == FeatureName.Favorites)) => - for { - features <- consumerFeatures - favFeatures <- features.find(_.featureName == FeatureName.Favorites) - ewmaMap <- favFeatures.featureValues.ewmaMap - favScore <- ewmaMap.get(DefaultERGHalfLifeInDays) - } yield (entityId, (userId, Math.log(favScore + 1))) - - case _ => None - } - - SparseMatrix[LocaleEntity, UserId, Double]( - erg - .hashJoin(ExternalDataSources.uttEntitiesSource().asKeys).map { - case (entityId, ((userId, score), _)) => (userId, (entityId, score)) - }.join(ExternalDataSources.userSource).map { - case (userId, ((entityId, score), (_, language))) => - ((entityId, language), userId, score) - } - ) - } - - override def prepareUserToClusterMatrix( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): SparseRowMatrix[UserId, ClusterId, Double] = { - SparseRowMatrix( - ExternalDataSources.simClustersInterestInLogFavSource(MinInterestedInLogFavScore), - isSkinnyMatrix = true - ) - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/LocaleEntitySimClustersEmbeddingsJob.docx b/src/scala/com/twitter/simclusters_v2/scalding/embedding/LocaleEntitySimClustersEmbeddingsJob.docx new file mode 100644 index 000000000..37a306593 Binary files /dev/null and 
b/src/scala/com/twitter/simclusters_v2/scalding/embedding/LocaleEntitySimClustersEmbeddingsJob.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/LocaleEntitySimClustersEmbeddingsJob.scala b/src/scala/com/twitter/simclusters_v2/scalding/embedding/LocaleEntitySimClustersEmbeddingsJob.scala deleted file mode 100644 index 06c66038c..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/embedding/LocaleEntitySimClustersEmbeddingsJob.scala +++ /dev/null @@ -1,437 +0,0 @@ -package com.twitter.simclusters_v2.scalding.embedding - -import com.twitter.dal.client.dataset.KeyValDALDataset -import com.twitter.recos.entities.thriftscala.Entity -import com.twitter.recos.entities.thriftscala.Hashtag -import com.twitter.recos.entities.thriftscala.SemanticCoreEntity -import com.twitter.scalding._ -import com.twitter.scalding_internal.dalv2.DALWrite._ -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.common.ModelVersions -import com.twitter.simclusters_v2.common.SimClustersEmbedding -import com.twitter.simclusters_v2.hdfs_sources.presto_hdfs_sources._ -import com.twitter.simclusters_v2.hdfs_sources.AdhocKeyValSources -import com.twitter.simclusters_v2.hdfs_sources.EntityEmbeddingsSources -import com.twitter.simclusters_v2.hdfs_sources.InterestedInSources -import com.twitter.simclusters_v2.scalding.embedding.LocaleEntitySimClustersEmbeddingsJob._ -import com.twitter.simclusters_v2.scalding.embedding.common.EmbeddingUtil -import com.twitter.simclusters_v2.scalding.embedding.common.ExternalDataSources -import com.twitter.simclusters_v2.scalding.embedding.common.EmbeddingUtil._ -import com.twitter.simclusters_v2.scalding.embedding.common.EntityEmbeddingUtil._ -import com.twitter.simclusters_v2.scalding.embedding.common.SimClustersEmbeddingJob._ -import com.twitter.simclusters_v2.thriftscala.{ - SimClustersEmbedding => ThriftSimClustersEmbedding, - _ -} -import com.twitter.wtf.entity_real_graph.common.EntityUtil -import com.twitter.wtf.entity_real_graph.thriftscala.Edge -import com.twitter.wtf.entity_real_graph.thriftscala.EntityType -import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp -import com.twitter.wtf.scalding.jobs.common.DataSources -import com.twitter.wtf.scalding.jobs.common.ScheduledExecutionApp -import java.util.TimeZone - -/** - * $ ./bazel bundle src/scala/com/twitter/simclusters_v2/scalding/embedding:entity_per_language_embeddings_job-adhoc - * - * ---------------------- Deploy to atla ---------------------- - * $ scalding remote run \ - --main-class com.twitter.simclusters_v2.scalding.embedding.LocaleEntitySimClustersEmbeddingAdhocApp \ - --target src/scala/com/twitter/simclusters_v2/scalding/embedding:entity_per_language_embeddings_job-adhoc \ - --user recos-platform \ - -- --date 2019-12-17 --model-version 20M_145K_updated --entity-type SemanticCore - */ -object LocaleEntitySimClustersEmbeddingAdhocApp extends AdhocExecutionApp { - - // Import implicits - - import EntityUtil._ - - def writeOutput( - embeddings: TypedPipe[(SimClustersEmbeddingId, (ClusterId, EmbeddingScore))], - topKEmbeddings: TypedPipe[(SimClustersEmbeddingId, Seq[(ClusterId, EmbeddingScore)])], - jobConfig: EntityEmbeddingsJobConfig - ): Execution[Unit] = { - - val toSimClusterEmbeddingExec = topKEmbeddings - .mapValues(SimClustersEmbedding.apply(_).toThrift) - .writeExecution( - AdhocKeyValSources.entityToClustersSource( - LocaleEntitySimClustersEmbeddingsJob.getHdfsPath( - isAdhoc = true, - isManhattanKeyVal = true, - 
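// Editor's note: see getHdfsPath at the bottom of this file: isReverseIndex toggles
// the reverse_index/ path segment and isLogFav the log_fav/ segment.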
isReverseIndex = false, - isLogFav = false, - jobConfig.modelVersion, - jobConfig.entityType))) - - val fromSimClusterEmbeddingExec = - toReverseIndexSimClusterEmbedding(embeddings, jobConfig.topK) - .writeExecution( - AdhocKeyValSources.clusterToEntitiesSource( - LocaleEntitySimClustersEmbeddingsJob.getHdfsPath( - isAdhoc = true, - isManhattanKeyVal = true, - isReverseIndex = true, - isLogFav = false, - jobConfig.modelVersion, - jobConfig.entityType))) - - Execution.zip(toSimClusterEmbeddingExec, fromSimClusterEmbeddingExec).unit - } - - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - - val jobConfig = EntityEmbeddingsJobConfig(args, isAdhoc = true) - - val numReducers = args.getOrElse("m", "2000").toInt - - /* - Can use the ERG daily dataset in the adhoc job for quick prototyping, note that there may be - issues with scaling the job when productionizing on ERG aggregated dataset. - */ - val userEntityMatrix: TypedPipe[(UserId, (Entity, Double))] = - getUserEntityMatrix( - jobConfig, - DataSources.entityRealGraphAggregationDataSetSource(dateRange.embiggen(Days(7))), - Some(ExternalDataSources.uttEntitiesSource()) - ).forceToDisk - - //determine which data source to use based on model version - val simClustersSource = jobConfig.modelVersion match { - case ModelVersion.Model20m145kUpdated => - InterestedInSources.simClustersInterestedInUpdatedSource(dateRange, timeZone) - case modelVersion => - throw new IllegalArgumentException( - s"SimClusters model version not supported ${modelVersion.name}") - } - - val entityPerLanguage = userEntityMatrix.join(ExternalDataSources.userSource).map { - case (userId, ((entity, score), (_, language))) => - ((entity, language), (userId, score)) - } - - val normalizedUserEntityMatrix = - getNormalizedTransposeInputMatrix(entityPerLanguage, numReducers = Some(numReducers)) - - val embeddings = computeEmbeddings[(Entity, String)]( - simClustersSource, - normalizedUserEntityMatrix, - scoreExtractors, - ModelVersion.Model20m145kUpdated, - toSimClustersEmbeddingId(jobConfig.modelVersion), - numReducers = Some(numReducers * 2) - ) - - val topKEmbeddings = - embeddings.group - .sortedReverseTake(jobConfig.topK)(Ordering.by(_._2)) - .withReducers(numReducers) - - writeOutput(embeddings, topKEmbeddings, jobConfig) - } -} - -/** - * $ ./bazel bundle src/scala/com/twitter/simclusters_v2/scalding/embedding:semantic_core_entity_embeddings_per_language_job - * $ capesospy-v2 update \ - --build_locally \ - --start_cron semantic_core_entity_embeddings_per_language_job src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml - */ -object LocaleEntitySimClustersEmbeddingScheduledApp extends ScheduledExecutionApp { - - // Import implicits - - import EmbeddingUtil._ - import EntityUtil._ - - override val firstTime: RichDate = RichDate("2019-10-22") - - override val batchIncrement: Duration = Days(7) - - private def writeOutput( - embeddings: TypedPipe[(SimClustersEmbeddingId, (ClusterId, EmbeddingScore))], - topKEmbeddings: TypedPipe[(SimClustersEmbeddingId, Seq[(ClusterId, EmbeddingScore)])], - jobConfig: EntityEmbeddingsJobConfig, - clusterEmbeddingsDataset: KeyValDALDataset[ - KeyVal[SimClustersEmbeddingId, ThriftSimClustersEmbedding] - ], - entityEmbeddingsDataset: KeyValDALDataset[KeyVal[SimClustersEmbeddingId, InternalIdEmbedding]] - )( - implicit dateRange: DateRange, - timeZone: TimeZone - ): Execution[Unit] = { - - val thriftSimClustersEmbedding = 
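// Editor's note: the same top-K embeddings are written twice below, once as a
// versioned key-val dataset for serving and once as a daily Presto snapshot for
// offline analysis; both carry the thrift SimClustersEmbedding payload.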
topKEmbeddings - .mapValues(SimClustersEmbedding.apply(_).toThrift) - - val writeSimClustersEmbeddingKeyValDataset = - thriftSimClustersEmbedding - .map { - case (entityId, topSimClusters) => KeyVal(entityId, topSimClusters) - } - .writeDALVersionedKeyValExecution( - clusterEmbeddingsDataset, - D.Suffix( - LocaleEntitySimClustersEmbeddingsJob.getHdfsPath( - isAdhoc = false, - isManhattanKeyVal = true, - isReverseIndex = false, - isLogFav = false, - jobConfig.modelVersion, - jobConfig.entityType)) - ) - - val writeSimClustersEmbeddingDataset = thriftSimClustersEmbedding - .map { - case (embeddingId, embedding) => SimClustersEmbeddingWithId(embeddingId, embedding) - } - .writeDALSnapshotExecution( - SemanticCorePerLanguageSimclustersEmbeddingsPrestoScalaDataset, - D.Daily, - D.Suffix( - LocaleEntitySimClustersEmbeddingsJob.getHdfsPath( - isAdhoc = false, - isManhattanKeyVal = false, - isReverseIndex = false, - isLogFav = false, - jobConfig.modelVersion, - jobConfig.entityType)), - D.EBLzo(), - dateRange.end - ) - - val thriftReversedSimclustersEmbeddings = - toReverseIndexSimClusterEmbedding(embeddings, jobConfig.topK) - - val writeReverseSimClustersEmbeddingKeyValDataset = - thriftReversedSimclustersEmbeddings - .map { - case (embeddingId, internalIdsWithScore) => - KeyVal(embeddingId, internalIdsWithScore) - } - .writeDALVersionedKeyValExecution( - entityEmbeddingsDataset, - D.Suffix( - LocaleEntitySimClustersEmbeddingsJob.getHdfsPath( - isAdhoc = false, - isManhattanKeyVal = true, - isReverseIndex = true, - isLogFav = false, - jobConfig.modelVersion, - jobConfig.entityType)) - ) - - val writeReverseSimClustersEmbeddingDataset = - thriftReversedSimclustersEmbeddings - .map { - case (embeddingId, embedding) => InternalIdEmbeddingWithId(embeddingId, embedding) - }.writeDALSnapshotExecution( - ReverseIndexSemanticCorePerLanguageSimclustersEmbeddingsPrestoScalaDataset, - D.Daily, - D.Suffix( - LocaleEntitySimClustersEmbeddingsJob.getHdfsPath( - isAdhoc = false, - isManhattanKeyVal = false, - isReverseIndex = true, - isLogFav = false, - jobConfig.modelVersion, - jobConfig.entityType)), - D.EBLzo(), - dateRange.end - ) - - Execution - .zip( - writeSimClustersEmbeddingDataset, - writeSimClustersEmbeddingKeyValDataset, - writeReverseSimClustersEmbeddingDataset, - writeReverseSimClustersEmbeddingKeyValDataset - ).unit - } - - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - - val jobConfig = EntityEmbeddingsJobConfig(args, isAdhoc = false) - - val embeddingsDataset = EntityEmbeddingsSources.getEntityEmbeddingsDataset( - jobConfig.entityType, - ModelVersions.toKnownForModelVersion(jobConfig.modelVersion), - isEmbeddingsPerLocale = true - ) - - val reverseIndexEmbeddingsDataset = - EntityEmbeddingsSources.getReverseIndexedEntityEmbeddingsDataset( - jobConfig.entityType, - ModelVersions.toKnownForModelVersion(jobConfig.modelVersion), - isEmbeddingsPerLocale = true - ) - - val userEntityMatrix: TypedPipe[(UserId, (Entity, Double))] = - getUserEntityMatrix( - jobConfig, - DataSources.entityRealGraphAggregationDataSetSource(dateRange.embiggen(Days(7))), - Some(ExternalDataSources.uttEntitiesSource()) - ).forceToDisk - - //determine which data source to use based on model version - val simClustersSource = jobConfig.modelVersion match { - case ModelVersion.Model20m145kUpdated => - InterestedInSources.simClustersInterestedInUpdatedSource(dateRange, timeZone) - case modelVersion => - throw new 
IllegalArgumentException( - s"SimClusters model version not supported ${modelVersion.name}") - } - - val entityPerLanguage = userEntityMatrix.join(ExternalDataSources.userSource).map { - case (userId, ((entity, score), (_, language))) => - ((entity, language), (userId, score)) - } - - val normalizedUserEntityMatrix = - getNormalizedTransposeInputMatrix(entityPerLanguage, numReducers = Some(3000)) - - val simClustersEmbedding = jobConfig.modelVersion match { - case ModelVersion.Model20m145kUpdated => - computeEmbeddings( - simClustersSource, - normalizedUserEntityMatrix, - scoreExtractors, - ModelVersion.Model20m145kUpdated, - toSimClustersEmbeddingId(ModelVersion.Model20m145kUpdated), - numReducers = Some(8000) - ) - case modelVersion => - throw new IllegalArgumentException( - s"SimClusters model version not supported ${modelVersion.name}") - } - - val topKEmbeddings = - simClustersEmbedding.group.sortedReverseTake(jobConfig.topK)(Ordering.by(_._2)) - - writeOutput( - simClustersEmbedding, - topKEmbeddings, - jobConfig, - embeddingsDataset, - reverseIndexEmbeddingsDataset) - } -} - -object LocaleEntitySimClustersEmbeddingsJob { - - def getUserEntityMatrix( - jobConfig: EntityEmbeddingsJobConfig, - entityRealGraphSource: TypedPipe[Edge], - semanticCoreEntityIdsToKeep: Option[TypedPipe[Long]], - applyLogTransform: Boolean = false - ): TypedPipe[(UserId, (Entity, Double))] = - jobConfig.entityType match { - case EntityType.SemanticCore => - semanticCoreEntityIdsToKeep match { - case Some(entityIdsToKeep) => - getEntityUserMatrix(entityRealGraphSource, jobConfig.halfLife, EntityType.SemanticCore) - .map { - case (entity, (userId, score)) => - entity match { - case Entity.SemanticCore(SemanticCoreEntity(entityId, _)) => - if (applyLogTransform) { - (entityId, (userId, (entity, Math.log(score + 1)))) - } else { - (entityId, (userId, (entity, score))) - } - case _ => - throw new IllegalArgumentException( - "Job config specified EntityType.SemanticCore, but non-semantic core entity was found.") - } - }.hashJoin(entityIdsToKeep.asKeys).values.map { - case ((userId, (entity, score)), _) => (userId, (entity, score)) - } - case _ => - getEntityUserMatrix(entityRealGraphSource, jobConfig.halfLife, EntityType.SemanticCore) - .map { case (entity, (userId, score)) => (userId, (entity, score)) } - } - case EntityType.Hashtag => - getEntityUserMatrix(entityRealGraphSource, jobConfig.halfLife, EntityType.Hashtag) - .map { case (entity, (userId, score)) => (userId, (entity, score)) } - case _ => - throw new IllegalArgumentException( - s"Argument [--entity-type] must be provided. 
Supported options [${EntityType.SemanticCore.name}, ${EntityType.Hashtag.name}]") - } - - def toSimClustersEmbeddingId( - modelVersion: ModelVersion - ): ((Entity, String), ScoreType.ScoreType) => SimClustersEmbeddingId = { - case ((Entity.SemanticCore(SemanticCoreEntity(entityId, _)), lang), ScoreType.FavScore) => - SimClustersEmbeddingId( - EmbeddingType.FavBasedSematicCoreEntity, - modelVersion, - InternalId.LocaleEntityId(LocaleEntityId(entityId, lang))) - case ((Entity.SemanticCore(SemanticCoreEntity(entityId, _)), lang), ScoreType.FollowScore) => - SimClustersEmbeddingId( - EmbeddingType.FollowBasedSematicCoreEntity, - modelVersion, - InternalId.LocaleEntityId(LocaleEntityId(entityId, lang))) - case ((Entity.SemanticCore(SemanticCoreEntity(entityId, _)), lang), ScoreType.LogFavScore) => - SimClustersEmbeddingId( - EmbeddingType.LogFavBasedLocaleSemanticCoreEntity, - modelVersion, - InternalId.LocaleEntityId(LocaleEntityId(entityId, lang))) - case ((Entity.Hashtag(Hashtag(hashtag)), _), ScoreType.FavScore) => - SimClustersEmbeddingId( - EmbeddingType.FavBasedHashtagEntity, - modelVersion, - InternalId.Hashtag(hashtag)) - case ((Entity.Hashtag(Hashtag(hashtag)), _), ScoreType.FollowScore) => - SimClustersEmbeddingId( - EmbeddingType.FollowBasedHashtagEntity, - modelVersion, - InternalId.Hashtag(hashtag)) - case (scoreType, entity) => - throw new IllegalArgumentException( - s"(ScoreType, Entity) ($scoreType, ${entity.toString}) not supported") - } - - /** - * Generates the output path for the Entity Embeddings Job. - * - * Example Adhoc: /user/recos-platform/processed/adhoc/simclusters_embeddings/hashtag_per_language/model_20m_145k_updated - * Example Prod: /atla/proc/user/cassowary/processed/simclusters_embeddings/semantic_core_per_language/model_20m_145k_updated - * - */ - def getHdfsPath( - isAdhoc: Boolean, - isManhattanKeyVal: Boolean, - isReverseIndex: Boolean, - isLogFav: Boolean, - modelVersion: ModelVersion, - entityType: EntityType - ): String = { - - val reverseIndex = if (isReverseIndex) "reverse_index/" else "" - - val logFav = if (isLogFav) "log_fav/" else "" - - val entityTypeSuffix = entityType match { - case EntityType.SemanticCore => "semantic_core_per_language" - case EntityType.Hashtag => "hashtag_per_language" - case _ => "unknown_per_language" - } - - val pathSuffix = s"$logFav$reverseIndex$entityTypeSuffix" - - EmbeddingUtil.getHdfsPath(isAdhoc, isManhattanKeyVal, modelVersion, pathSuffix) - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/ProducerEmbeddingsFromInterestedIn.docx b/src/scala/com/twitter/simclusters_v2/scalding/embedding/ProducerEmbeddingsFromInterestedIn.docx new file mode 100644 index 000000000..f1f038a7c Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/embedding/ProducerEmbeddingsFromInterestedIn.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/ProducerEmbeddingsFromInterestedIn.scala b/src/scala/com/twitter/simclusters_v2/scalding/embedding/ProducerEmbeddingsFromInterestedIn.scala deleted file mode 100644 index e78299d66..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/embedding/ProducerEmbeddingsFromInterestedIn.scala +++ /dev/null @@ -1,701 +0,0 @@ -package com.twitter.simclusters_v2.scalding.embedding - -import com.twitter.dal.client.dataset.KeyValDALDataset -import com.twitter.scalding._ -import com.twitter.scalding_internal.dalv2.DALWrite._ -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import 
com.twitter.simclusters_v2.common.ModelVersions -import com.twitter.simclusters_v2.hdfs_sources._ -import com.twitter.simclusters_v2.scalding.embedding.common.EmbeddingUtil._ -import com.twitter.simclusters_v2.scalding.embedding.common.SimClustersEmbeddingJob -import com.twitter.simclusters_v2.thriftscala._ -import com.twitter.wtf.scalding.jobs.common.{AdhocExecutionApp, ScheduledExecutionApp} -import java.util.TimeZone - -object ProducerEmbeddingsFromInterestedInBatchAppUtil { - import ProducerEmbeddingsFromInterestedIn._ - - val user = System.getenv("USER") - - val rootPath: String = s"/user/$user/manhattan_sequence_files" - - // Helps speed up the multiplication step which can get very big - val numReducersForMatrixMultiplication: Int = 12000 - - /** - * Given the producer x cluster matrix, key by producer / cluster individually, and write output - * to individual DAL datasets - */ - def writeOutput( - producerClusterEmbedding: TypedPipe[((ClusterId, UserId), Double)], - producerTopKEmbeddingsDataset: KeyValDALDataset[KeyVal[Long, TopSimClustersWithScore]], - clusterTopKProducersDataset: KeyValDALDataset[ - KeyVal[PersistedFullClusterId, TopProducersWithScore] - ], - producerTopKEmbeddingsPath: String, - clusterTopKProducersPath: String, - modelVersion: ModelVersion - ): Execution[Unit] = { - val keyedByProducer = - toSimClusterEmbedding(producerClusterEmbedding, topKClustersToKeep, modelVersion) - .map { case (userId, clusters) => KeyVal(userId, clusters) } - .writeDALVersionedKeyValExecution( - producerTopKEmbeddingsDataset, - D.Suffix(producerTopKEmbeddingsPath) - ) - - val keyedBySimCluster = fromSimClusterEmbedding( - producerClusterEmbedding, - topKUsersToKeep, - modelVersion - ).map { - case (clusterId, topProducers) => KeyVal(clusterId, topProducersToThrift(topProducers)) - } - .writeDALVersionedKeyValExecution( - clusterTopKProducersDataset, - D.Suffix(clusterTopKProducersPath) - ) - - Execution.zip(keyedByProducer, keyedBySimCluster).unit - } -} - -/** - * Base class for Fav based producer embeddings. 
Helps reuse the code for different model versions - */ -trait ProducerEmbeddingsFromInterestedInByFavScoreBase extends ScheduledExecutionApp { - import ProducerEmbeddingsFromInterestedIn._ - import ProducerEmbeddingsFromInterestedInBatchAppUtil._ - - def modelVersion: ModelVersion - - val producerTopKEmbeddingsByFavScorePathPrefix: String = - "/producer_top_k_simcluster_embeddings_by_fav_score_" - - val clusterTopKProducersByFavScorePathPrefix: String = - "/simcluster_embedding_top_k_producers_by_fav_score_" - - val minNumFavers: Int = minNumFaversForProducer - - def producerTopKSimclusterEmbeddingsByFavScoreDataset: KeyValDALDataset[ - KeyVal[Long, TopSimClustersWithScore] - ] - - def simclusterEmbeddingTopKProducersByFavScoreDataset: KeyValDALDataset[ - KeyVal[PersistedFullClusterId, TopProducersWithScore] - ] - - def getInterestedInFn: (DateRange, TimeZone) => TypedPipe[(Long, ClustersUserIsInterestedIn)] - - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - - val producerTopKEmbeddingsByFavScorePathUpdated: String = - rootPath + producerTopKEmbeddingsByFavScorePathPrefix + ModelVersions - .toKnownForModelVersion(modelVersion) - - val clusterTopKProducersByFavScorePathUpdated: String = - rootPath + clusterTopKProducersByFavScorePathPrefix + ModelVersions - .toKnownForModelVersion(modelVersion) - - val producerClusterEmbeddingByFavScore = getProducerClusterEmbedding( - getInterestedInFn(dateRange.embiggen(Days(5)), timeZone), - DataSources.userUserNormalizedGraphSource, - DataSources.userNormsAndCounts, - userToProducerFavScore, - userToClusterFavScore, // Fav score - _.faverCount.exists(_ > minNumFavers), - numReducersForMatrixMultiplication, - modelVersion, - cosineSimilarityThreshold - ).forceToDisk - - writeOutput( - producerClusterEmbeddingByFavScore, - producerTopKSimclusterEmbeddingsByFavScoreDataset, - simclusterEmbeddingTopKProducersByFavScoreDataset, - producerTopKEmbeddingsByFavScorePathUpdated, - clusterTopKProducersByFavScorePathUpdated, - modelVersion - ) - } -} - -/** - * Base class for Follow based producer embeddings. 
Helps reuse the code for different model versions - */ -trait ProducerEmbeddingsFromInterestedInByFollowScoreBase extends ScheduledExecutionApp { - import ProducerEmbeddingsFromInterestedIn._ - import ProducerEmbeddingsFromInterestedInBatchAppUtil._ - - def modelVersion: ModelVersion - - val producerTopKEmbeddingsByFollowScorePathPrefix: String = - "/producer_top_k_simcluster_embeddings_by_follow_score_" - - val clusterTopKProducersByFollowScorePathPrefix: String = - "/simcluster_embedding_top_k_producers_by_follow_score_" - - def producerTopKSimclusterEmbeddingsByFollowScoreDataset: KeyValDALDataset[ - KeyVal[Long, TopSimClustersWithScore] - ] - - def simclusterEmbeddingTopKProducersByFollowScoreDataset: KeyValDALDataset[ - KeyVal[PersistedFullClusterId, TopProducersWithScore] - ] - - def getInterestedInFn: (DateRange, TimeZone) => TypedPipe[(Long, ClustersUserIsInterestedIn)] - - val minNumFollowers: Int = minNumFollowersForProducer - - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - - val producerTopKEmbeddingsByFollowScorePath: String = - rootPath + producerTopKEmbeddingsByFollowScorePathPrefix + ModelVersions - .toKnownForModelVersion(modelVersion) - - val clusterTopKProducersByFollowScorePath: String = - rootPath + clusterTopKProducersByFollowScorePathPrefix + ModelVersions - .toKnownForModelVersion(modelVersion) - - val producerClusterEmbeddingByFollowScore = getProducerClusterEmbedding( - getInterestedInFn(dateRange.embiggen(Days(5)), timeZone), - DataSources.userUserNormalizedGraphSource, - DataSources.userNormsAndCounts, - userToProducerFollowScore, - userToClusterFollowScore, // Follow score - _.followerCount.exists(_ > minNumFollowers), - numReducersForMatrixMultiplication, - modelVersion, - cosineSimilarityThreshold - ).forceToDisk - - writeOutput( - producerClusterEmbeddingByFollowScore, - producerTopKSimclusterEmbeddingsByFollowScoreDataset, - simclusterEmbeddingTopKProducersByFollowScoreDataset, - producerTopKEmbeddingsByFollowScorePath, - clusterTopKProducersByFollowScorePath, - modelVersion - ) - } -} - -/** - capesospy-v2 update --build_locally --start_cron \ - --start_cron producer_embeddings_from_interested_in_by_fav_score \ - src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml - */ -object ProducerEmbeddingsFromInterestedInByFavScoreBatchApp - extends ProducerEmbeddingsFromInterestedInByFavScoreBase { - override def modelVersion: ModelVersion = ModelVersion.Model20m145kUpdated - - override def getInterestedInFn: ( - DateRange, - TimeZone - ) => TypedPipe[(UserId, ClustersUserIsInterestedIn)] = - InterestedInSources.simClustersInterestedInUpdatedSource - - override val firstTime: RichDate = RichDate("2019-09-10") - - override val batchIncrement: Duration = Days(7) - - override def producerTopKSimclusterEmbeddingsByFavScoreDataset: KeyValDALDataset[ - KeyVal[Long, TopSimClustersWithScore] - ] = - ProducerTopKSimclusterEmbeddingsByFavScoreUpdatedScalaDataset - - override def simclusterEmbeddingTopKProducersByFavScoreDataset: KeyValDALDataset[ - KeyVal[PersistedFullClusterId, TopProducersWithScore] - ] = - SimclusterEmbeddingTopKProducersByFavScoreUpdatedScalaDataset -} - -/** -capesospy-v2 update --build_locally --start_cron \ - --start_cron producer_embeddings_from_interested_in_by_fav_score_2020 \ - src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml - */ -object ProducerEmbeddingsFromInterestedInByFavScore2020BatchApp - extends 
ProducerEmbeddingsFromInterestedInByFavScoreBase { - override def modelVersion: ModelVersion = ModelVersion.Model20m145k2020 - - override def getInterestedInFn: ( - DateRange, - TimeZone - ) => TypedPipe[(UserId, ClustersUserIsInterestedIn)] = - InterestedInSources.simClustersInterestedIn2020Source - - override val firstTime: RichDate = RichDate("2021-03-01") - - override val batchIncrement: Duration = Days(7) - - override def producerTopKSimclusterEmbeddingsByFavScoreDataset: KeyValDALDataset[ - KeyVal[Long, TopSimClustersWithScore] - ] = - ProducerTopKSimclusterEmbeddingsByFavScore2020ScalaDataset - - override def simclusterEmbeddingTopKProducersByFavScoreDataset: KeyValDALDataset[ - KeyVal[PersistedFullClusterId, TopProducersWithScore] - ] = - SimclusterEmbeddingTopKProducersByFavScore2020ScalaDataset -} - -/** -capesospy-v2 update --build_locally --start_cron \ - --start_cron producer_embeddings_from_interested_in_by_fav_score_dec11 \ - src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml - */ -object ProducerEmbeddingsFromInterestedInByFavScoreDec11BatchApp - extends ProducerEmbeddingsFromInterestedInByFavScoreBase { - override def modelVersion: ModelVersion = ModelVersion.Model20m145kDec11 - - override def getInterestedInFn: ( - DateRange, - TimeZone - ) => TypedPipe[(UserId, ClustersUserIsInterestedIn)] = - InterestedInSources.simClustersInterestedInDec11Source - - override val firstTime: RichDate = RichDate("2019-11-18") - - override val batchIncrement: Duration = Days(7) - - override def producerTopKSimclusterEmbeddingsByFavScoreDataset: KeyValDALDataset[ - KeyVal[Long, TopSimClustersWithScore] - ] = - ProducerTopKSimclusterEmbeddingsByFavScoreScalaDataset - - override def simclusterEmbeddingTopKProducersByFavScoreDataset: KeyValDALDataset[ - KeyVal[PersistedFullClusterId, TopProducersWithScore] - ] = - SimclusterEmbeddingTopKProducersByFavScoreScalaDataset -} - -/** -capesospy-v2 update --build_locally --start_cron \ - --start_cron producer_embeddings_from_interested_in_by_follow_score \ - src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml - */ -object ProducerEmbeddingsFromInterestedInByFollowScoreBatchApp - extends ProducerEmbeddingsFromInterestedInByFollowScoreBase { - override def modelVersion: ModelVersion = ModelVersion.Model20m145kUpdated - - override def getInterestedInFn: ( - DateRange, - TimeZone - ) => TypedPipe[(UserId, ClustersUserIsInterestedIn)] = - InterestedInSources.simClustersInterestedInUpdatedSource - - override val firstTime: RichDate = RichDate("2019-09-10") - - override val batchIncrement: Duration = Days(7) - - override def producerTopKSimclusterEmbeddingsByFollowScoreDataset: KeyValDALDataset[ - KeyVal[Long, TopSimClustersWithScore] - ] = - ProducerTopKSimclusterEmbeddingsByFollowScoreUpdatedScalaDataset - - override def simclusterEmbeddingTopKProducersByFollowScoreDataset: KeyValDALDataset[ - KeyVal[PersistedFullClusterId, TopProducersWithScore] - ] = - SimclusterEmbeddingTopKProducersByFollowScoreUpdatedScalaDataset -} - -/** -capesospy-v2 update --build_locally --start_cron \ - --start_cron producer_embeddings_from_interested_in_by_follow_score_2020 \ - src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml - */ -object ProducerEmbeddingsFromInterestedInByFollowScore2020BatchApp - extends ProducerEmbeddingsFromInterestedInByFollowScoreBase { - override def modelVersion: ModelVersion = ModelVersion.Model20m145k2020 - - override def getInterestedInFn: ( - DateRange, - TimeZone - ) => TypedPipe[(UserId, 
ClustersUserIsInterestedIn)] = - InterestedInSources.simClustersInterestedIn2020Source - - override val firstTime: RichDate = RichDate("2021-03-01") - - override val batchIncrement: Duration = Days(7) - - override def producerTopKSimclusterEmbeddingsByFollowScoreDataset: KeyValDALDataset[ - KeyVal[Long, TopSimClustersWithScore] - ] = - ProducerTopKSimclusterEmbeddingsByFollowScore2020ScalaDataset - - override def simclusterEmbeddingTopKProducersByFollowScoreDataset: KeyValDALDataset[ - KeyVal[PersistedFullClusterId, TopProducersWithScore] - ] = - SimclusterEmbeddingTopKProducersByFollowScore2020ScalaDataset -} - -/** -capesospy-v2 update --build_locally --start_cron \ - --start_cron producer_embeddings_from_interested_in_by_follow_score_dec11 \ - src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml - */ -object ProducerEmbeddingsFromInterestedInByFollowScoreDec11BatchApp - extends ProducerEmbeddingsFromInterestedInByFollowScoreBase { - override def modelVersion: ModelVersion = ModelVersion.Model20m145kDec11 - - override def getInterestedInFn: ( - DateRange, - TimeZone - ) => TypedPipe[(UserId, ClustersUserIsInterestedIn)] = - InterestedInSources.simClustersInterestedInDec11Source - - override val firstTime: RichDate = RichDate("2019-11-18") - - override val batchIncrement: Duration = Days(7) - - override def producerTopKSimclusterEmbeddingsByFollowScoreDataset: KeyValDALDataset[ - KeyVal[Long, TopSimClustersWithScore] - ] = - ProducerTopKSimclusterEmbeddingsByFollowScoreScalaDataset - - override def simclusterEmbeddingTopKProducersByFollowScoreDataset: KeyValDALDataset[ - KeyVal[PersistedFullClusterId, TopProducersWithScore] - ] = - SimclusterEmbeddingTopKProducersByFollowScoreScalaDataset -} - -/** - * Adhoc job to calculate producer's simcluster embeddings, which essentially assigns interestedIn - * SimClusters to each producer, regardless of whether the producer has a knownFor assignment. 
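Before the run commands that follow, a minimal sketch of what this adhoc job computes, using plain Scala collections and hypothetical data rather than the Scalding pipeline shown below: a producer's embedding is the engagement-weighted sum of the interestedIn vectors of the users who engage with that producer.

object ProducerEmbeddingSketch {
  type UserId = Long
  type ClusterId = Int

  // Engagement-weighted sum of the interestedIn vectors of the engaging users.
  def producerEmbedding(
    engagements: Seq[(UserId, Double)], // (engaging user, user->producer score)
    interestedIn: Map[UserId, Map[ClusterId, Double]]
  ): Map[ClusterId, Double] =
    engagements
      .flatMap {
        case (userId, weight) =>
          interestedIn.getOrElse(userId, Map.empty[ClusterId, Double]).toSeq.map {
            case (clusterId, score) => clusterId -> weight * score
          }
      }
      .groupBy(_._1)
      .map { case (clusterId, scored) => clusterId -> scored.map(_._2).sum }

  def main(args: Array[String]): Unit = {
    val interestedIn = Map(
      1L -> Map(10 -> 0.8, 11 -> 0.2),
      2L -> Map(10 -> 0.5)
    )
    // Two users engage with the producer, with weights 1.0 and 2.0;
    // prints Map(10 -> 1.8, 11 -> 0.2) (iteration order may vary).
    println(producerEmbedding(Seq(1L -> 1.0, 2L -> 2.0), interestedIn))
  }
}

The real job additionally normalizes both matrices and drops producer-cluster pairs whose similarity falls below cosineSimilarityThreshold.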
- * -$ ./bazel bundle src/scala/com/twitter/simclusters_v2/scalding/embedding:producer_embeddings_from_interested_in-adhoc - - $ scalding remote run \ - --main-class com.twitter.simclusters_v2.scalding.embedding.ProducerEmbeddingsFromInterestedInAdhocApp \ - --target src/scala/com/twitter/simclusters_v2/scalding/embedding:producer_embeddings_from_interested_in-adhoc \ - --user cassowary --cluster bluebird-qus1 \ - --keytab /var/lib/tss/keys/fluffy/keytabs/client/cassowary.keytab \ - --principal service_account@TWITTER.BIZ \ - -- --date 2020-08-25 --model_version 20M_145K_updated \ - --outputDir /gcs/user/cassowary/adhoc/producerEmbeddings/ - - */ -object ProducerEmbeddingsFromInterestedInAdhocApp extends AdhocExecutionApp { - - import ProducerEmbeddingsFromInterestedIn._ - - private val numReducersForMatrixMultiplication = 12000 - - /** - * Calculates the embeddings and writes the results keyed by producers and clusters separately into - * individual locations - */ - private def runAdhocByScore( - interestedInClusters: TypedPipe[(Long, ClustersUserIsInterestedIn)], - userUserNormalGraph: TypedPipe[UserAndNeighbors], - userNormsAndCounts: TypedPipe[NormsAndCounts], - keyedByProducerSinkPath: String, - keyedByClusterSinkPath: String, - userToProducerScoringFn: NeighborWithWeights => Double, - userToClusterScoringFn: UserToInterestedInClusterScores => Double, - userFilter: NormsAndCounts => Boolean, - modelVersion: ModelVersion - )( - implicit uniqueID: UniqueID - ): Execution[Unit] = { - - val producerClusterEmbedding = getProducerClusterEmbedding( - interestedInClusters, - userUserNormalGraph, - userNormsAndCounts, - userToProducerScoringFn, - userToClusterScoringFn, - userFilter, - numReducersForMatrixMultiplication, - modelVersion, - cosineSimilarityThreshold - ).forceToDisk - - val keyByProducerExec = - toSimClusterEmbedding(producerClusterEmbedding, topKClustersToKeep, modelVersion) - .writeExecution( - AdhocKeyValSources.topProducerToClusterEmbeddingsSource(keyedByProducerSinkPath)) - - val keyByClusterExec = - fromSimClusterEmbedding(producerClusterEmbedding, topKUsersToKeep, modelVersion) - .map { case (clusterId, topProducers) => (clusterId, topProducersToThrift(topProducers)) } - .writeExecution( - AdhocKeyValSources.topClusterEmbeddingsToProducerSource(keyedByClusterSinkPath)) - - Execution.zip(keyByProducerExec, keyByClusterExec).unit - } - - // Calculate the embeddings using follow scores - private def runFollowScore( - interestedInClusters: TypedPipe[(Long, ClustersUserIsInterestedIn)], - userUserNormalGraph: TypedPipe[UserAndNeighbors], - userNormsAndCounts: TypedPipe[NormsAndCounts], - modelVersion: ModelVersion, - outputDir: String - )( - implicit uniqueID: UniqueID - ): Execution[Unit] = { - val keyByClusterSinkPath = outputDir + "keyedByCluster/byFollowScore_" + modelVersion - val keyByProducerSinkPath = outputDir + "keyedByProducer/byFollowScore_" + modelVersion - - runAdhocByScore( - interestedInClusters, - userUserNormalGraph, - userNormsAndCounts, - keyedByProducerSinkPath = keyByProducerSinkPath, - keyedByClusterSinkPath = keyByClusterSinkPath, - userToProducerScoringFn = userToProducerFollowScore, - userToClusterScoringFn = userToClusterFollowScore, - _.followerCount.exists(_ > minNumFollowersForProducer), - modelVersion - ) - } - - // Calculate the embeddings using fav scores - private def runFavScore( - interestedInClusters: TypedPipe[(Long, ClustersUserIsInterestedIn)], - userUserNormalGraph: TypedPipe[UserAndNeighbors], - userNormsAndCounts:
TypedPipe[NormsAndCounts], - modelVersion: ModelVersion, - outputDir: String - )( - implicit uniqueID: UniqueID - ): Execution[Unit] = { - val keyByClusterSinkPath = outputDir + "keyedByCluster/byFavScore_" + modelVersion - val keyByProducerSinkPath = outputDir + "keyedByProducer/byFavScore_" + modelVersion - - runAdhocByScore( - interestedInClusters, - userUserNormalGraph, - userNormsAndCounts, - keyedByProducerSinkPath = keyByProducerSinkPath, - keyedByClusterSinkPath = keyByClusterSinkPath, - userToProducerScoringFn = userToProducerFavScore, - userToClusterScoringFn = userToClusterFavScore, - _.faverCount.exists(_ > minNumFaversForProducer), - modelVersion - ) - } - - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - val outputDir = args("outputDir") - - val modelVersion = - ModelVersions.toModelVersion(args.required("model_version")) - - val interestedInClusters = modelVersion match { - case ModelVersion.Model20m145k2020 => - InterestedInSources.simClustersInterestedIn2020Source(dateRange, timeZone).forceToDisk - case ModelVersion.Model20m145kUpdated => - InterestedInSources.simClustersInterestedInUpdatedSource(dateRange, timeZone).forceToDisk - case _ => - InterestedInSources.simClustersInterestedInDec11Source(dateRange, timeZone).forceToDisk - } - - Execution - .zip( - runFavScore( - interestedInClusters, - DataSources.userUserNormalizedGraphSource, - DataSources.userNormsAndCounts, - modelVersion, - outputDir - ), - runFollowScore( - interestedInClusters, - DataSources.userUserNormalizedGraphSource, - DataSources.userNormsAndCounts, - modelVersion, - outputDir - ) - ).unit - } -} - -/** - * Computes the producer's interestedIn cluster embedding. i.e. If a tweet author (producer) is not - * associated with a KnownFor cluster, do a cross-product between - * [user, interestedIn] and [user, producer] to find the similarity matrix [interestedIn, producer]. 
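The object that follows does this multiplication with Scalding primitives (getProducerClusterEmbedding, backed by SimClustersEmbeddingJob.legacyMultiplyMatrices); both output datasets are then cut down with the same "top-K highest-scoring entries per key" pattern (sortedReverseTake in fromSimClusterEmbedding / toSimClusterEmbedding). A rough in-memory equivalent of that truncation step, assuming toy data:

object TopKPerKeySketch {
  // Keep only the k highest-scoring entities for each cluster.
  def topKPerCluster[T](
    rows: Seq[(Int, T, Double)], // (clusterId, entity, score)
    k: Int
  ): Map[Int, Seq[(T, Double)]] =
    rows
      .groupBy(_._1)
      .map {
        case (clusterId, group) =>
          clusterId -> group
            .map { case (_, entity, score) => (entity, score) }
            .sortBy(-_._2)
            .take(k)
      }
}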
- */ -object ProducerEmbeddingsFromInterestedIn { - val minNumFollowersForProducer: Int = 100 - val minNumFaversForProducer: Int = 100 - val topKUsersToKeep: Int = 300 - val topKClustersToKeep: Int = 60 - val cosineSimilarityThreshold: Double = 0.01 - - type ClusterId = Int - - def topProducersToThrift(producersWithScore: Seq[(UserId, Double)]): TopProducersWithScore = { - val thrift = producersWithScore.map { producer => - TopProducerWithScore(producer._1, producer._2) - } - TopProducersWithScore(thrift) - } - - def userToProducerFavScore(neighbor: NeighborWithWeights): Double = { - neighbor.favScoreHalfLife100DaysNormalizedByNeighborFaversL2.getOrElse(0.0) - } - - def userToProducerFollowScore(neighbor: NeighborWithWeights): Double = { - neighbor.followScoreNormalizedByNeighborFollowersL2.getOrElse(0.0) - } - - def userToClusterFavScore(clusterScore: UserToInterestedInClusterScores): Double = { - clusterScore.favScoreClusterNormalizedOnly.getOrElse(0.0) - } - - def userToClusterFollowScore(clusterScore: UserToInterestedInClusterScores): Double = { - clusterScore.followScoreClusterNormalizedOnly.getOrElse(0.0) - } - - def getUserSimClustersMatrix( - simClustersSource: TypedPipe[(UserId, ClustersUserIsInterestedIn)], - extractScore: UserToInterestedInClusterScores => Double, - modelVersion: ModelVersion - ): TypedPipe[(UserId, Seq[(Int, Double)])] = { - simClustersSource.collect { - case (userId, clusters) - if ModelVersions.toModelVersion(clusters.knownForModelVersion).equals(modelVersion) => - userId -> clusters.clusterIdToScores - .map { - case (clusterId, clusterScores) => - (clusterId, extractScore(clusterScores)) - }.toSeq.filter(_._2 > 0) - } - } - - /** - * Given a weighted user-producer engagement history matrix, as well as a - * weighted user-interestedInCluster matrix, do the matrix multiplication to yield a weighted - * producer-cluster embedding matrix - */ - def getProducerClusterEmbedding( - interestedInClusters: TypedPipe[(UserId, ClustersUserIsInterestedIn)], - userProducerEngagementGraph: TypedPipe[UserAndNeighbors], - userNormsAndCounts: TypedPipe[NormsAndCounts], - userToProducerScoringFn: NeighborWithWeights => Double, - userToClusterScoringFn: UserToInterestedInClusterScores => Double, - userFilter: NormsAndCounts => Boolean, // function to decide whether to compute embeddings for the user or not - numReducersForMatrixMultiplication: Int, - modelVersion: ModelVersion, - threshold: Double - )( - implicit uid: UniqueID - ): TypedPipe[((ClusterId, UserId), Double)] = { - val userSimClustersMatrix = getUserSimClustersMatrix( - interestedInClusters, - userToClusterScoringFn, - modelVersion - ) - - val userUserNormalizedGraph = getFilteredUserUserNormalizedGraph( - userProducerEngagementGraph, - userNormsAndCounts, - userToProducerScoringFn, - userFilter - ) - - SimClustersEmbeddingJob - .legacyMultiplyMatrices( - userUserNormalizedGraph, - userSimClustersMatrix, - numReducersForMatrixMultiplication - ) - .filter(_._2 >= threshold) - } - - def getFilteredUserUserNormalizedGraph( - userProducerEngagementGraph: TypedPipe[UserAndNeighbors], - userNormsAndCounts: TypedPipe[NormsAndCounts], - userToProducerScoringFn: NeighborWithWeights => Double, - userFilter: NormsAndCounts => Boolean - )( - implicit uid: UniqueID - ): TypedPipe[(UserId, (UserId, Double))] = { - val numUsersCount = Stat("num_users_with_engagements") - val userUserFilteredEdgeCount = Stat("num_filtered_user_user_engagements") - val validUsersCount = Stat("num_valid_users") - - val validUsers = 
userNormsAndCounts.collect { - case user if userFilter(user) => - validUsersCount.inc() - user.userId - } - - userProducerEngagementGraph - .flatMap { userAndNeighbors => - numUsersCount.inc() - userAndNeighbors.neighbors - .map { neighbor => - userUserFilteredEdgeCount.inc() - (neighbor.neighborId, (userAndNeighbors.userId, userToProducerScoringFn(neighbor))) - } - .filter(_._2._2 > 0.0) - } - .join(validUsers.asKeys) - .map { - case (neighborId, ((userId, score), _)) => - (userId, (neighborId, score)) - } - } - - def fromSimClusterEmbedding[T, E]( - resultMatrix: TypedPipe[((ClusterId, T), Double)], - topK: Int, - modelVersion: ModelVersion - ): TypedPipe[(PersistedFullClusterId, Seq[(T, Double)])] = { - resultMatrix - .map { - case ((clusterId, inputId), score) => (clusterId, (inputId, score)) - } - .group - .sortedReverseTake(topK)(Ordering.by(_._2)) - .map { - case (clusterId, topEntitiesWithScore) => - PersistedFullClusterId(modelVersion, clusterId) -> topEntitiesWithScore - } - } - - def toSimClusterEmbedding[T]( - resultMatrix: TypedPipe[((ClusterId, T), Double)], - topK: Int, - modelVersion: ModelVersion - )( - implicit ordering: Ordering[T] - ): TypedPipe[(T, TopSimClustersWithScore)] = { - resultMatrix - .map { - case ((clusterId, inputId), score) => (inputId, (clusterId, score)) - } - .group - //.withReducers(3000) // uncomment for producer-simclusters job - .sortedReverseTake(topK)(Ordering.by(_._2)) - .map { - case (inputId, topSimClustersWithScore) => - val topSimClusters = topSimClustersWithScore.map { - case (clusterId, score) => SimClusterWithScore(clusterId, score) - } - inputId -> TopSimClustersWithScore(topSimClusters, modelVersion) - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/SimilarUsersBySimClustersEmbedding.docx b/src/scala/com/twitter/simclusters_v2/scalding/embedding/SimilarUsersBySimClustersEmbedding.docx new file mode 100644 index 000000000..b5cafe72d Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/embedding/SimilarUsersBySimClustersEmbedding.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/SimilarUsersBySimClustersEmbedding.scala b/src/scala/com/twitter/simclusters_v2/scalding/embedding/SimilarUsersBySimClustersEmbedding.scala deleted file mode 100644 index c530614f7..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/embedding/SimilarUsersBySimClustersEmbedding.scala +++ /dev/null @@ -1,299 +0,0 @@ -package com.twitter.simclusters_v2.scalding.embedding - -import com.twitter.bijection.Injection -import com.twitter.bijection.scrooge.CompactScalaCodec -import com.twitter.hermit.candidate.thriftscala.Candidate -import com.twitter.hermit.candidate.thriftscala.Candidates -import com.twitter.scalding._ -import com.twitter.scalding.commons.source.VersionedKeyValSource -import com.twitter.scalding_internal.dalv2.DALWrite._ -import com.twitter.scalding_internal.dalv2._ -import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossClusterSameDC -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.common.CosineSimilarityUtil -import com.twitter.simclusters_v2.hdfs_sources._ -import com.twitter.simclusters_v2.thriftscala._ -import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp -import com.twitter.wtf.scalding.jobs.common.ScheduledExecutionApp -import java.util.TimeZone - -/** -capesospy-v2 update --build_locally --start_cron \ - --start_cron similar_users_by_simclusters_embeddings_job \ - 
src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml - */ -object SimilarUsersBySimClustersEmbeddingBatchApp extends ScheduledExecutionApp { - - override val firstTime: RichDate = RichDate("2019-07-10") - - override val batchIncrement: Duration = Days(7) - - private val outputByFav = - "/user/cassowary/manhattan_sequence_files/similar_users_by_simclusters_embeddings/by_fav" - private val outputByFollow = - "/user/cassowary/manhattan_sequence_files/similar_users_by_simclusters_embeddings/by_follow" - - private implicit val valueInj: CompactScalaCodec[Candidates] = CompactScalaCodec(Candidates) - - private val topClusterEmbeddingsByFavScore = DAL - .readMostRecentSnapshotNoOlderThan( - ProducerTopKSimclusterEmbeddingsByFavScoreUpdatedScalaDataset, - Days(14) - ) - .withRemoteReadPolicy(AllowCrossClusterSameDC) - .toTypedPipe - .map { clusterScorePair => clusterScorePair.key -> clusterScorePair.value } - - private val topProducersForClusterEmbeddingByFavScore = DAL - .readMostRecentSnapshotNoOlderThan( - SimclusterEmbeddingTopKProducersByFavScoreUpdatedScalaDataset, - Days(14) - ) - .withRemoteReadPolicy(AllowCrossClusterSameDC) - .toTypedPipe - .map { producerScoresPair => producerScoresPair.key -> producerScoresPair.value } - - private val topClusterEmbeddingsByFollowScore = DAL - .readMostRecentSnapshotNoOlderThan( - ProducerTopKSimclusterEmbeddingsByFollowScoreUpdatedScalaDataset, - Days(14) - ) - .withRemoteReadPolicy(AllowCrossClusterSameDC) - .toTypedPipe - .map { clusterScorePair => clusterScorePair.key -> clusterScorePair.value } - - private val topProducersForClusterEmbeddingByFollowScore = DAL - .readMostRecentSnapshotNoOlderThan( - SimclusterEmbeddingTopKProducersByFollowScoreUpdatedScalaDataset, - Days(14) - ) - .withRemoteReadPolicy(AllowCrossClusterSameDC) - .toTypedPipe - .map { producerScoresPair => producerScoresPair.key -> producerScoresPair.value } - - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - - Execution - .zip( - SimilarUsersBySimClustersEmbedding - .getTopUsersRelatedToUser( - topClusterEmbeddingsByFavScore, - topProducersForClusterEmbeddingByFavScore - ) - .map { case (key, value) => KeyVal(key, value) } - .writeDALVersionedKeyValExecution( - SimilarUsersByFavBasedProducerEmbeddingScalaDataset, - D.Suffix(outputByFav) - ), - SimilarUsersBySimClustersEmbedding - .getTopUsersRelatedToUser( - topClusterEmbeddingsByFollowScore, - topProducersForClusterEmbeddingByFollowScore - ) - .map { case (key, value) => KeyVal(key, value) } - .writeDALVersionedKeyValExecution( - SimilarUsersByFollowBasedProducerEmbeddingScalaDataset, - D.Suffix(outputByFollow) - ) - ).unit - } -} - -/** - * Adhoc job to compute similar users from the producer SimClusters embeddings: for each user, - * rank producers by cosine similarity between embeddings, and print the job counters on completion.
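Before the adhoc variant, a compact sketch of the similarity computation that getTopUsersRelatedToUser (defined at the bottom of this file) performs at scale, assuming small in-memory embeddings: L2-normalize each SimClusters vector, then rank candidate producers for a user by dot product, i.e. cosine similarity. This mirrors the CosineSimilarityUtil.normalize / dotProduct calls used below.

object SimilarUsersSketch {
  def normalize(v: Map[Int, Double]): Map[Int, Double] = {
    val norm = math.sqrt(v.values.map(x => x * x).sum)
    if (norm == 0.0) v else v.map { case (k, x) => k -> x / norm }
  }

  def dot(a: Map[Int, Double], b: Map[Int, Double]): Double =
    a.iterator.map { case (k, x) => x * b.getOrElse(k, 0.0) }.sum

  // Rank candidate producers for one user by cosine similarity.
  def topSimilar(
    user: Map[Int, Double],
    producers: Map[Long, Map[Int, Double]],
    topK: Int
  ): Seq[(Long, Double)] = {
    val u = normalize(user)
    producers.toSeq
      .map { case (id, emb) => id -> dot(u, normalize(emb)) }
      .sortBy(-_._2)
      .take(topK)
  }
}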
- * -./bazel bundle src/scala/com/twitter/simclusters_v2/scalding/embedding:similar_users_by_simclusters_embeddings-adhoc && \ - oscar hdfs --user recos-platform --screen --tee similar_users_by_simclusters_embeddings --bundle similar_users_by_simclusters_embeddings-adhoc \ - --tool com.twitter.simclusters_v2.scalding.embedding.SimilarUsersBySimClustersEmbeddingAdhocApp \ - -- --date 2019-07-10T00 2019-07-10T23 - */ -object SimilarUsersBySimClustersEmbeddingAdhocApp extends AdhocExecutionApp { - - private val outputByFav = - "/user/recos-platform/adhoc/similar_users_by_simclusters_embeddings/by_fav" - private val outputByFollow = - "/user/recos-platform/adhoc/similar_users_by_simclusters_embeddings/by_follow" - - private val topClusterEmbeddingsByFavScore = DAL - .readMostRecentSnapshotNoOlderThan( - ProducerTopKSimclusterEmbeddingsByFavScoreUpdatedScalaDataset, - Days(14) - ) - .withRemoteReadPolicy(AllowCrossClusterSameDC) - .toTypedPipe - .map { clusterScorePair => clusterScorePair.key -> clusterScorePair.value } - - private val topProducersForClusterEmbeddingByFavScore = DAL - .readMostRecentSnapshotNoOlderThan( - SimclusterEmbeddingTopKProducersByFavScoreUpdatedScalaDataset, - Days(14) - ) - .withRemoteReadPolicy(AllowCrossClusterSameDC) - .toTypedPipe - .map { producerScoresPair => producerScoresPair.key -> producerScoresPair.value } - - private val topClusterEmbeddingsByFollowScore = DAL - .readMostRecentSnapshotNoOlderThan( - ProducerTopKSimclusterEmbeddingsByFollowScoreUpdatedScalaDataset, - Days(14) - ) - .withRemoteReadPolicy(AllowCrossClusterSameDC) - .toTypedPipe - .map { clusterScorePair => clusterScorePair.key -> clusterScorePair.value } - - private val topProducersForClusterEmbeddingByFollowScore = DAL - .readMostRecentSnapshotNoOlderThan( - SimclusterEmbeddingTopKProducersByFollowScoreUpdatedScalaDataset, - Days(14) - ) - .withRemoteReadPolicy(AllowCrossClusterSameDC) - .toTypedPipe - .map { producerScoresPair => producerScoresPair.key -> producerScoresPair.value } - - implicit val candidatesInj: CompactScalaCodec[Candidates] = CompactScalaCodec(Candidates) - - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - - Execution - .zip( - SimilarUsersBySimClustersEmbedding - .getTopUsersRelatedToUser( - topClusterEmbeddingsByFavScore, - topProducersForClusterEmbeddingByFavScore).writeExecution( - VersionedKeyValSource[Long, Candidates](outputByFav)) - .getCounters - .flatMap { - case (_, counters) => - counters.toMap.toSeq - .sortBy(e => (e._1.group, e._1.counter)) - .foreach { - case (statKey, value) => - println(s"${statKey.group}\t${statKey.counter}\t$value") - } - Execution.unit - }, - SimilarUsersBySimClustersEmbedding - .getTopUsersRelatedToUser( - topClusterEmbeddingsByFollowScore, - topProducersForClusterEmbeddingByFollowScore).writeExecution( - VersionedKeyValSource[Long, Candidates](outputByFollow)) - .getCounters - .flatMap { - case (_, counters) => - counters.toMap.toSeq - .sortBy(e => (e._1.group, e._1.counter)) - .foreach { - case (statKey, value) => - println(s"${statKey.group}\t${statKey.counter}\t$value") - } - Execution.unit - } - ).unit - } -} - -object SimilarUsersBySimClustersEmbedding { - private val maxUsersPerCluster = 300 - private val maxClustersPerUser = 50 - private val topK = 100 - - def getTopUsersRelatedToUser( - clusterScores: TypedPipe[(Long, TopSimClustersWithScore)], - producerScores: TypedPipe[(PersistedFullClusterId, TopProducersWithScore)] - )( 
- implicit uniqueID: UniqueID - ): TypedPipe[(Long, Candidates)] = { - - val numUserUserPair = Stat("num_user_producer_pairs") - val numUserClusterPair = Stat("num_user_cluster_pairs") - val numClusterProducerPair = Stat("num_cluster_producer_pairs") - - val clusterToUserMap = - clusterScores.flatMap { - case (userId, topSimClustersWithScore) => - val targetUserClusters = - topSimClustersWithScore.topClusters.sortBy(-_.score).take(maxClustersPerUser) - - targetUserClusters.map { simClusterWithScore => - numUserClusterPair.inc() - simClusterWithScore.clusterId -> userId - } - } - - val clusterToProducerMap = producerScores.flatMap { - case (persistedFullClusterId, topProducersWithScore) => - numClusterProducerPair.inc() - val targetProducers = topProducersWithScore.topProducers - .sortBy(-_.score) - .take(maxUsersPerCluster) - targetProducers.map { topProducerWithScore => - persistedFullClusterId.clusterId -> topProducerWithScore.userId - } - } - - implicit val intInject: Int => Array[Byte] = Injection.int2BigEndian.toFunction - - val userToProducerMap = - clusterToUserMap.group - .sketch(2000) - .join(clusterToProducerMap.group) - .values - .distinct - .collect({ - //filter self-pair - case userPair if userPair._1 != userPair._2 => - numUserUserPair.inc() - userPair - }) - - val userEmbeddingsAllGrouped = clusterScores.map { - case (userId, topSimClustersWithScore) => - val targetUserClusters = - topSimClustersWithScore.topClusters.sortBy(-_.score).take(maxClustersPerUser) - val embedding = targetUserClusters.map { simClustersWithScore => - simClustersWithScore.clusterId -> simClustersWithScore.score - }.toMap - val embeddingNormalized = CosineSimilarityUtil.normalize(embedding) - userId -> embeddingNormalized - }.forceToDisk - - val userToProducerMapJoinWithEmbedding = - userToProducerMap - .join(userEmbeddingsAllGrouped) - .map { - case (user, (producer, userEmbedding)) => - producer -> (user, userEmbedding) - } - .join(userEmbeddingsAllGrouped) - .map { - case (producer, ((user, userEmbedding), producerEmbedding)) => - user -> (producer, CosineSimilarityUtil.dotProduct(userEmbedding, producerEmbedding)) - } - .group - .sortWithTake(topK)((a, b) => a._2 > b._2) - .map { - case (userId, candidatesList) => - val candidatesSeq = candidatesList - .map { - case (candidateId, score) => Candidate(candidateId, score) - } - userId -> Candidates(userId, candidatesSeq) - } - - userToProducerMapJoinWithEmbedding - } - -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/AbuseSimclusterFeaturesScaldingJob.docx b/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/AbuseSimclusterFeaturesScaldingJob.docx new file mode 100644 index 000000000..79e548fc0 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/AbuseSimclusterFeaturesScaldingJob.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/AbuseSimclusterFeaturesScaldingJob.scala b/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/AbuseSimclusterFeaturesScaldingJob.scala deleted file mode 100644 index a1d11e2a2..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/AbuseSimclusterFeaturesScaldingJob.scala +++ /dev/null @@ -1,178 +0,0 @@ -package com.twitter.simclusters_v2.scalding.embedding.abuse - -import com.twitter.scalding._ -import com.twitter.scalding.source.TypedText -import com.twitter.scalding_internal.dalv2.DALWrite.{D, _} -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import 
com.twitter.simclusters_v2.hdfs_sources.SearchAbuseSimclusterFeaturesManhattanScalaDataset -import com.twitter.simclusters_v2.scalding.common.matrix.SparseMatrix -import com.twitter.simclusters_v2.scalding.embedding.abuse.AbuseSimclusterFeaturesScaldingJob.buildKeyValDataSet -import com.twitter.simclusters_v2.scalding.embedding.abuse.AdhocAbuseSimClusterFeaturesScaldingJob.{ - abuseInteractionSearchGraph, - buildSearchAbuseScores, - impressionInteractionSearchGraph -} -import com.twitter.simclusters_v2.scalding.embedding.abuse.DataSources.getUserInterestedInSparseMatrix -import com.twitter.simclusters_v2.scalding.embedding.common.EmbeddingUtil -import com.twitter.simclusters_v2.scalding.embedding.common.EmbeddingUtil.{ClusterId, UserId} -import com.twitter.simclusters_v2.thriftscala.{ - ModelVersion, - SimClustersEmbedding, - SingleSideUserScores -} -import com.twitter.wtf.scalding.jobs.common.{AdhocExecutionApp, ScheduledExecutionApp} -import java.util.TimeZone - -object AbuseSimclusterFeaturesScaldingJob { - - val HealthyConsumerKey = "healthyConsumer" - val UnhealthyConsumerKey = "unhealthyConsumer" - val HealthyAuthorKey = "healthyAuthor" - val UnhealthyAuthorKey = "unhealthyAuthor" - - private[this] val EmptySimCluster = SimClustersEmbedding(List()) - - def buildKeyValDataSet( - normalizedSimClusterMatrix: SparseMatrix[UserId, ClusterId, Double], - unhealthyGraph: SparseMatrix[UserId, UserId, Double], - healthyGraph: SparseMatrix[UserId, UserId, Double] - ): TypedPipe[KeyVal[Long, SingleSideUserScores]] = { - - val searchAbuseScores = - buildSearchAbuseScores( - normalizedSimClusterMatrix, - unhealthyGraph = unhealthyGraph, - healthyGraph = healthyGraph - ) - - val pairedScores = SingleSideInteractionTransformation.pairScores( - Map( - HealthyConsumerKey -> searchAbuseScores.healthyConsumerClusterScores, - UnhealthyConsumerKey -> searchAbuseScores.unhealthyConsumerClusterScores, - HealthyAuthorKey -> searchAbuseScores.healthyAuthorClusterScores, - UnhealthyAuthorKey -> searchAbuseScores.unhealthyAuthorClusterScores - ) - ) - - pairedScores - .map { pairedScore => - val userPairInteractionFeatures = PairedInteractionFeatures( - healthyInteractionSimClusterEmbedding = - pairedScore.interactionScores.getOrElse(HealthyConsumerKey, EmptySimCluster), - unhealthyInteractionSimClusterEmbedding = - pairedScore.interactionScores.getOrElse(UnhealthyConsumerKey, EmptySimCluster) - ) - - val authorPairInteractionFeatures = PairedInteractionFeatures( - healthyInteractionSimClusterEmbedding = - pairedScore.interactionScores.getOrElse(HealthyAuthorKey, EmptySimCluster), - unhealthyInteractionSimClusterEmbedding = - pairedScore.interactionScores.getOrElse(UnhealthyAuthorKey, EmptySimCluster) - ) - - val value = SingleSideUserScores( - pairedScore.userId, - consumerHealthyScore = userPairInteractionFeatures.healthySum, - consumerUnhealthyScore = userPairInteractionFeatures.unhealthySum, - authorUnhealthyScore = authorPairInteractionFeatures.unhealthySum, - authorHealthyScore = authorPairInteractionFeatures.healthySum - ) - - KeyVal(pairedScore.userId, value) - } - } -} - -/** - * This job creates single-side features used to predict abuse reports in search. The features - * are written to Manhattan and available in the feature store. We expect that search will be able to use - * these features directly. They may be useful for other models as well.
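To make the feature shape concrete: each of the four fields of SingleSideUserScores is the sum of the scores in one healthy or unhealthy SimClusters embedding, once for the consumer side and once for the author side. A tiny sketch with hypothetical values, treating an embedding as a plain clusterId -> score map:

object SingleSideScoreSketch {
  // healthySum / unhealthySum reduce an embedding to the sum of its scores.
  def sum(embedding: Map[Int, Double]): Double = embedding.values.sum

  def main(args: Array[String]): Unit = {
    val healthyConsumer = Map(1 -> 0.7, 2 -> 0.3) // e.g. built from impressions
    val unhealthyConsumer = Map(1 -> 0.1)         // e.g. built from abuse reports
    println(sum(healthyConsumer))   // consumerHealthyScore   = 1.0
    println(sum(unhealthyConsumer)) // consumerUnhealthyScore = 0.1
  }
}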
- */ -object SearchAbuseSimclusterFeaturesScaldingJob extends ScheduledExecutionApp { - override def firstTime: RichDate = RichDate("2021-02-01") - - override def batchIncrement: Duration = - Days(7) - - private val OutputPath: String = EmbeddingUtil.getHdfsPath( - isAdhoc = false, - isManhattanKeyVal = true, - modelVersion = ModelVersion.Model20m145kUpdated, - pathSuffix = "search_abuse_simcluster_features" - ) - - def buildDataset( - )( - implicit dateRange: DateRange, - ): Execution[TypedPipe[KeyVal[Long, SingleSideUserScores]]] = { - Execution.getMode.map { implicit mode => - val normalizedSimClusterMatrix = getUserInterestedInSparseMatrix.rowL2Normalize - val abuseSearchGraph = abuseInteractionSearchGraph()(dateRange, mode) - val impressionSearchGraph = impressionInteractionSearchGraph()(dateRange, mode) - - buildKeyValDataSet(normalizedSimClusterMatrix, abuseSearchGraph, impressionSearchGraph) - } - } - - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - // Extend the date range to a total of 19 days. Search keeps 21 days of data. - val dateRangeSearchData = dateRange.prepend(Days(12)) - buildDataset()(dateRangeSearchData).flatMap { dataset => - dataset.writeDALVersionedKeyValExecution( - dataset = SearchAbuseSimclusterFeaturesManhattanScalaDataset, - pathLayout = D.Suffix(OutputPath) - ) - } - } -} - -/** - * You can check the logic of this job by running this query. - * - * scalding remote run \ - * --target src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse:abuse-prod \ - * --main-class com.twitter.simclusters_v2.scalding.embedding.abuse.AdhocSearchAbuseSimclusterFeaturesScaldingJob \ - * --hadoop-properties "mapreduce.job.split.metainfo.maxsize=-1" \ - * --cluster bluebird-qus1 --submitter hadoopnest-bluebird-1.qus1.twitter.com \ - * -- --date 2021-02-01 2021-02-02 \ - * --outputPath AdhocSearchAbuseSimclusterFeaturesScaldingJob-test1 - */ -object AdhocSearchAbuseSimclusterFeaturesScaldingJob extends AdhocExecutionApp { - def toTsv( - datasetExecution: Execution[TypedPipe[KeyVal[Long, SingleSideUserScores]]], - outputPath: String - ): Execution[Unit] = { - datasetExecution.flatMap { dataset => - dataset - .map { keyVal => - ( - keyVal.key, - keyVal.value.consumerHealthyScore, - keyVal.value.consumerUnhealthyScore, - keyVal.value.authorHealthyScore, - keyVal.value.authorUnhealthyScore - ) - } - .writeExecution(TypedText.tsv(outputPath)) - } - } - - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - toTsv( - SearchAbuseSimclusterFeaturesScaldingJob.buildDataset()(dateRange), - args("outputPath") - ) - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/AdhocAbuseSimClusterFeaturesScaldingJob.docx b/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/AdhocAbuseSimClusterFeaturesScaldingJob.docx new file mode 100644 index 000000000..e72155329 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/AdhocAbuseSimClusterFeaturesScaldingJob.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/AdhocAbuseSimClusterFeaturesScaldingJob.scala b/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/AdhocAbuseSimClusterFeaturesScaldingJob.scala deleted file mode 100644 index 245825b40..000000000 --- 
a/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/AdhocAbuseSimClusterFeaturesScaldingJob.scala +++ /dev/null @@ -1,217 +0,0 @@ -package com.twitter.simclusters_v2.scalding.embedding.abuse - -import com.twitter.ml.api.Feature -import com.twitter.ml.api.util.SRichDataRecord -import com.twitter.scalding.Args -import com.twitter.scalding.DateRange -import com.twitter.scalding.Execution -import com.twitter.scalding.UniqueID -import com.twitter.scalding._ -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.scalding_internal.dalv2.DALWrite.D -import com.twitter.scalding_internal.dalv2.DALWrite._ -import com.twitter.scalding_internal.dalv2.dataset.DAL.DALSourceBuilderExtension -import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossDC -import com.twitter.search.common.features.ExternalTweetFeature -import com.twitter.search.common.features.SearchContextFeature -import com.twitter.search.tweet_ranking.scalding.datasets.TweetEngagementRawTrainingDataDailyJavaDataset -import com.twitter.simclusters_v2.common.ClusterId -import com.twitter.simclusters_v2.hdfs_sources.AdhocAbuseSimclusterFeaturesScalaDataset -import com.twitter.simclusters_v2.scalding.common.matrix.SparseMatrix -import com.twitter.simclusters_v2.scalding.embedding.abuse.DataSources.NumBlocksP95 -import com.twitter.simclusters_v2.scalding.embedding.abuse.DataSources.getFlockBlocksSparseMatrix -import com.twitter.simclusters_v2.scalding.embedding.abuse.DataSources.getUserInterestedInSparseMatrix -import com.twitter.simclusters_v2.scalding.embedding.common.EmbeddingUtil.UserId -import com.twitter.simclusters_v2.scalding.embedding.common.EmbeddingUtil -import com.twitter.simclusters_v2.scalding.embedding.common.ExternalDataSources -import com.twitter.simclusters_v2.thriftscala.ModelVersion -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbedding -import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp -import com.twitter.wtf.scalding.jobs.common.CassowaryJob -import java.util.TimeZone - -object AdhocAbuseSimClusterFeatureKeys { - val AbuseAuthorSearchKey = "abuseAuthorSearch" - val AbuseUserSearchKey = "abuseUserSearch" - val ImpressionUserSearchKey = "impressionUserSearch" - val ImpressionAuthorSearchKey = "impressionAuthorSearch" - val FlockBlocksAuthorKey = "blocksAuthorFlockDataset" - val FlockBlocksUserKey = "blocksUserFlockDataset" - val FavScoresAuthorKey = "favsAuthorFromFavGraph" - val FavScoresUserKey = "favsUserFromFavGraph" -} - -/** - * Adhoc job that is still in development. The job builds features that are meant to be useful for - * search. - * - * Features are built from existing SimCluster representations and the interaction graphs. 
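A toy version of the graph extraction in abuseInteractionSearchGraph below, with a hypothetical flat record type standing in for the search data records: every reported (searcher, author) pair with valid ids becomes a unit-weight edge in a user-user interaction graph.

// Hypothetical stand-in for one search engagement data record.
final case class SearchRecord(searcherId: Long, authorId: Long, isReported: Boolean)

object InteractionGraphSketch {
  def abuseEdges(records: Seq[SearchRecord]): Seq[(Long, Long, Double)] =
    for {
      r <- records
      if r.isReported && r.searcherId != 0L && r.authorId != 0L
    } yield (r.searcherId, r.authorId, 1.0)
}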
- * - * Example command: - * scalding remote run \ - * --target src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse:abuse-adhoc \ - * --main-class com.twitter.simclusters_v2.scalding.embedding.abuse.AdhocAbuseSimClusterFeaturesScaldingJob \ - * --submitter hadoopnest1.atla.twitter.com --user cassowary \ - * --hadoop-properties "mapreduce.job.user.classpath.first=true" -- \ - * --hdfs --date 2020/11/24 2020/12/14 --partitionName second_run --dalEnvironment Prod - */ -object AdhocAbuseSimClusterFeaturesScaldingJob extends AdhocExecutionApp with CassowaryJob { - override def jobName: String = "AdhocAbuseScaldingJob" - - import AdhocAbuseSimClusterFeatureKeys._ - - val tweetAuthorFeature = new Feature.Discrete(ExternalTweetFeature.TWEET_AUTHOR_ID.getName) - val searcherIdFeature = new Feature.Discrete(SearchContextFeature.SEARCHER_ID.getName) - val isReportedFeature = new Feature.Binary(ExternalTweetFeature.IS_REPORTED.getName) - val HalfLifeInDaysForFavScore = 100 - - private val outputPathThrift: String = EmbeddingUtil.getHdfsPath( - isAdhoc = false, - isManhattanKeyVal = false, - modelVersion = ModelVersion.Model20m145kUpdated, - pathSuffix = "abuse_simcluster_features" - ) - - def searchDataRecords( - )( - implicit dateRange: DateRange, - mode: Mode - ) = { - DAL - .read(TweetEngagementRawTrainingDataDailyJavaDataset) - .withRemoteReadPolicy(AllowCrossDC) - .toDataSetPipe - .records - } - - def abuseInteractionSearchGraph( - )( - implicit dateRange: DateRange, - mode: Mode - ): SparseMatrix[UserId, UserId, Double] = { - val abuseMatrixEntries = searchDataRecords() - .flatMap { dataRecord => - val sDataRecord = SRichDataRecord(dataRecord) - val authorIdOption = sDataRecord.getFeatureValueOpt(tweetAuthorFeature) - val userIdOption = sDataRecord.getFeatureValueOpt(searcherIdFeature) - val isReportedOption = sDataRecord.getFeatureValueOpt(isReportedFeature) - - for { - isReported <- isReportedOption if isReported - authorId <- authorIdOption if authorId != 0 - userId <- userIdOption if userId != 0 - } yield { - (userId: UserId, authorId: UserId, 1.0) - } - } - SparseMatrix.apply[UserId, UserId, Double](abuseMatrixEntries) - } - - def impressionInteractionSearchGraph( - )( - implicit dateRange: DateRange, - mode: Mode - ): SparseMatrix[UserId, UserId, Double] = { - val impressionMatrixEntries = searchDataRecords - .flatMap { dataRecord => - val sDataRecord = SRichDataRecord(dataRecord) - val authorIdOption = sDataRecord.getFeatureValueOpt(tweetAuthorFeature) - val userIdOption = sDataRecord.getFeatureValueOpt(searcherIdFeature) - - for { - authorId <- authorIdOption if authorId != 0 - userId <- userIdOption if userId != 0 - } yield { - (userId: UserId, authorId: UserId, 1.0) - } - } - SparseMatrix.apply[UserId, UserId, Double](impressionMatrixEntries) - } - - case class SingleSideScores( - unhealthyConsumerClusterScores: TypedPipe[(UserId, SimClustersEmbedding)], - unhealthyAuthorClusterScores: TypedPipe[(UserId, SimClustersEmbedding)], - healthyConsumerClusterScores: TypedPipe[(UserId, SimClustersEmbedding)], - healthyAuthorClusterScores: TypedPipe[(UserId, SimClustersEmbedding)]) - - def buildSearchAbuseScores( - normalizedSimClusterMatrix: SparseMatrix[UserId, ClusterId, Double], - unhealthyGraph: SparseMatrix[UserId, UserId, Double], - healthyGraph: SparseMatrix[UserId, UserId, Double] - ): SingleSideScores = { - SingleSideScores( - unhealthyConsumerClusterScores = SingleSideInteractionTransformation - .clusterScoresFromGraphs(normalizedSimClusterMatrix, unhealthyGraph), - 
unhealthyAuthorClusterScores = SingleSideInteractionTransformation - .clusterScoresFromGraphs(normalizedSimClusterMatrix, unhealthyGraph.transpose), - healthyConsumerClusterScores = SingleSideInteractionTransformation - .clusterScoresFromGraphs(normalizedSimClusterMatrix, healthyGraph), - healthyAuthorClusterScores = SingleSideInteractionTransformation - .clusterScoresFromGraphs(normalizedSimClusterMatrix, healthyGraph.transpose) - ) - } - - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - Execution.getMode.flatMap { implicit mode => - val normalizedSimClusterMatrix = getUserInterestedInSparseMatrix.rowL2Normalize - - val abuseSearchGraph = abuseInteractionSearchGraph() - val impressionSearchGraph = impressionInteractionSearchGraph() - - val searchAbuseScores = buildSearchAbuseScores( - normalizedSimClusterMatrix, - unhealthyGraph = abuseSearchGraph, - healthyGraph = impressionSearchGraph) - - // Step 2a: Read FlockBlocks for unhealthy interactions and user-user-fav for healthy interactions - val flockBlocksSparseGraph = - getFlockBlocksSparseMatrix(NumBlocksP95, dateRange.prepend(Years(1))) - - val favSparseGraph = SparseMatrix.apply[UserId, UserId, Double]( - ExternalDataSources.getFavEdges(HalfLifeInDaysForFavScore)) - - val blocksAbuseScores = buildSearchAbuseScores( - normalizedSimClusterMatrix, - unhealthyGraph = flockBlocksSparseGraph, - healthyGraph = favSparseGraph - ) - - // Step 3. Combine all scores from different sources for users - val pairedScores = SingleSideInteractionTransformation.pairScores( - Map( - // User cluster scores built from the search abuse reports graph - AbuseUserSearchKey -> searchAbuseScores.unhealthyConsumerClusterScores, - // Author cluster scores built from the search abuse reports graph - AbuseAuthorSearchKey -> searchAbuseScores.unhealthyAuthorClusterScores, - // User cluster scores built from the search impression graph - ImpressionUserSearchKey -> searchAbuseScores.healthyConsumerClusterScores, - // Author cluster scores built from the search impression graph - ImpressionAuthorSearchKey -> searchAbuseScores.healthyAuthorClusterScores, - // User cluster scores built from flock blocks graph - FlockBlocksUserKey -> blocksAbuseScores.unhealthyConsumerClusterScores, - // Author cluster scores built from the flock blocks graph - FlockBlocksAuthorKey -> blocksAbuseScores.unhealthyAuthorClusterScores, - // User cluster scores built from the user-user fav graph - FavScoresUserKey -> blocksAbuseScores.healthyConsumerClusterScores, - // Author cluster scores built from the user-user fav graph - FavScoresAuthorKey -> blocksAbuseScores.healthyAuthorClusterScores - ) - ) - - pairedScores.writeDALSnapshotExecution( - AdhocAbuseSimclusterFeaturesScalaDataset, - D.Daily, - D.Suffix(outputPathThrift), - D.Parquet, - dateRange.`end`, - partitions = Set(D.Partition("partition", args("partitionName"), D.PartitionType.String)) - ) - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/BUILD b/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/BUILD deleted file mode 100644 index 0f162e417..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/BUILD +++ /dev/null @@ -1,74 +0,0 @@ -scala_library( - sources = [ - "*.scala", - ], - compiler_option_sets = ["fatal_warnings"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/twitter/algebird:core", - 
"graphstore/common:flock_blocks-java", - "src/java/com/twitter/search/common/features", - "src/scala/com/twitter/ml/api:api-base", - "src/scala/com/twitter/scalding_internal/dalv2", - "src/scala/com/twitter/scalding_internal/dalv2/dataset", - "src/scala/com/twitter/scalding_internal/source/lzo_scrooge", - "src/scala/com/twitter/search/tweet_ranking/scalding/datasets:tweet_engagement_raw_training_data_daily-java", - "src/scala/com/twitter/simclusters_v2/hdfs_sources", - "src/scala/com/twitter/simclusters_v2/hdfs_sources:adhoc_abuse_simcluster_features-scala", - "src/scala/com/twitter/simclusters_v2/hdfs_sources:adhoc_cross_simcluster_block_interaction_features-scala", - "src/scala/com/twitter/simclusters_v2/hdfs_sources:adhoc_cross_simcluster_fav_interaction_features-scala", - "src/scala/com/twitter/simclusters_v2/hdfs_sources:search_abuse_simcluster_features_manhattan-scala", - "src/scala/com/twitter/simclusters_v2/hdfs_sources:user_user_fav_graph-scala", - "src/scala/com/twitter/simclusters_v2/scalding/common", - "src/scala/com/twitter/simclusters_v2/scalding/common/matrix", - "src/scala/com/twitter/simclusters_v2/scalding/embedding", - "src/scala/com/twitter/wtf/entity_real_graph/common", - "src/scala/com/twitter/wtf/entity_real_graph/scalding/common", - "src/scala/com/twitter/wtf/scalding/jobs/common:cassowary_job", - "src/scala/com/twitter/wtf/scalding/jobs/common:execution_app", - "src/scala/com/twitter/wtf/scalding/jobs/common:sources", - "src/scala/com/twitter/wtf/scalding/jobs/common:stats_util", - ], -) - -hadoop_binary( - name = "abuse-adhoc", - main = "com.twitter.simclusters_v2.scalding.embedding.abuse.AdhocAbuseScaldingJob", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":abuse"], -) - -hadoop_binary( - name = "abuse-prod", - main = "com.twitter.simclusters_v2.scalding.embedding.abuse.SearchAbuseSimclusterFeaturesScaldingJob", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":abuse"], -) - -hadoop_binary( - name = "cross_simcluster-adhoc", - main = "com.twitter.simclusters_v2.scalding.embedding.abuse.CrossSimClusterFeaturesScaldingJob", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":abuse", - ], -) diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/BUILD.docx b/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/BUILD.docx new file mode 100644 index 000000000..6c1968b61 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/CrossSimClusterFeaturesScaldingJob.docx b/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/CrossSimClusterFeaturesScaldingJob.docx new file mode 100644 index 000000000..e16403c14 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/CrossSimClusterFeaturesScaldingJob.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/CrossSimClusterFeaturesScaldingJob.scala b/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/CrossSimClusterFeaturesScaldingJob.scala deleted file mode 100644 index f2ee98bd4..000000000 --- 
a/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/CrossSimClusterFeaturesScaldingJob.scala +++ /dev/null @@ -1,149 +0,0 @@ -package com.twitter.simclusters_v2.scalding.embedding.abuse - -import com.twitter.scalding.typed.TypedPipe -import com.twitter.scalding.Args -import com.twitter.scalding.DateRange -import com.twitter.scalding.Execution -import com.twitter.scalding.UniqueID -import com.twitter.scalding.Years -import com.twitter.simclusters_v2.scalding.common.matrix.SparseMatrix -import com.twitter.simclusters_v2.scalding.embedding.abuse.DataSources.NumBlocksP95 -import com.twitter.simclusters_v2.scalding.embedding.abuse.DataSources.getFlockBlocksSparseMatrix -import com.twitter.simclusters_v2.scalding.embedding.abuse.DataSources.getUserInterestedInTruncatedKMatrix -import com.twitter.scalding_internal.dalv2.DALWrite.D -import com.twitter.scalding_internal.dalv2.DALWrite._ -import com.twitter.simclusters_v2.scalding.embedding.common.EmbeddingUtil.ClusterId -import com.twitter.simclusters_v2.scalding.embedding.common.EmbeddingUtil.UserId -import com.twitter.simclusters_v2.scalding.embedding.common.EmbeddingUtil -import com.twitter.simclusters_v2.scalding.embedding.common.ExternalDataSources -import com.twitter.simclusters_v2.thriftscala.AdhocCrossSimClusterInteractionScores -import com.twitter.simclusters_v2.thriftscala.ClustersScore -import com.twitter.simclusters_v2.thriftscala.ModelVersion -import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp -import com.twitter.wtf.scalding.jobs.common.CassowaryJob -import com.twitter.simclusters_v2.hdfs_sources.AdhocCrossSimclusterBlockInteractionFeaturesScalaDataset -import com.twitter.simclusters_v2.hdfs_sources.AdhocCrossSimclusterFavInteractionFeaturesScalaDataset -import java.util.TimeZone - -/* -To run: -scalding remote run \ ---user cassowary \ ---submitter hadoopnest1.atla.twitter.com \ ---target src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse:cross_simcluster-adhoc \ ---main-class com.twitter.simclusters_v2.scalding.embedding.abuse.CrossSimClusterFeaturesScaldingJob \ ---submitter-memory 128192.megabyte --hadoop-properties "mapreduce.map.memory.mb=8192 mapreduce.map.java.opts='-Xmx7618M' mapreduce.reduce.memory.mb=8192 mapreduce.reduce.java.opts='-Xmx7618M'" \ --- \ ---date 2021-02-07 \ ---dalEnvironment Prod - */ - -object CrossSimClusterFeaturesUtil { - - /** - * To generate the interaction score for 2 simclusters c1 and c2 for all cluster combinations (I): - * a) Get C - user interestedIn matrix, User * Cluster - * b) Get INT - positive or negative interaction matrix, User * User - * c) Compute C^T*INT - * d) Finally, return C^T*INT*C - */ - def getCrossClusterScores( - userClusterMatrix: SparseMatrix[UserId, ClusterId, Double], - userInteractionMatrix: SparseMatrix[UserId, UserId, Double] - ): SparseMatrix[ClusterId, ClusterId, Double] = { - // intermediate = C^T*INT - val intermediateResult = userClusterMatrix.transpose.multiplySparseMatrix(userInteractionMatrix) - // return intermediate*C - intermediateResult.multiplySparseMatrix(userClusterMatrix) - } -} - -object CrossSimClusterFeaturesScaldingJob extends AdhocExecutionApp with CassowaryJob { - override def jobName: String = "AdhocAbuseCrossSimClusterFeaturesScaldingJob" - - private val outputPathBlocksThrift: String = EmbeddingUtil.getHdfsPath( - isAdhoc = false, - isManhattanKeyVal = false, - modelVersion = ModelVersion.Model20m145kUpdated, - pathSuffix = "abuse_cross_simcluster_block_features" - ) - - private val outputPathFavThrift: String = 
EmbeddingUtil.getHdfsPath( - isAdhoc = false, - isManhattanKeyVal = false, - modelVersion = ModelVersion.Model20m145kUpdated, - pathSuffix = "abuse_cross_simcluster_fav_features" - ) - - private val HalfLifeInDaysForFavScore = 100 - - // Adhoc jobs which use all user interestedIn simclusters (default=50) was failing - // Hence truncating the number of clusters - private val MaxNumClustersPerUser = 20 - - import CrossSimClusterFeaturesUtil._ - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - - val normalizedUserInterestedInMatrix: SparseMatrix[UserId, ClusterId, Double] = - getUserInterestedInTruncatedKMatrix(MaxNumClustersPerUser).rowL2Normalize - - //the below code is to get cross simcluster features from flockblocks - negative user-user interactions. - val flockBlocksMatrix: SparseMatrix[UserId, UserId, Double] = - getFlockBlocksSparseMatrix(NumBlocksP95, dateRange.prepend(Years(1))) - - val crossClusterBlockScores: SparseMatrix[ClusterId, ClusterId, Double] = - getCrossClusterScores(normalizedUserInterestedInMatrix, flockBlocksMatrix) - - val blockScores: TypedPipe[AdhocCrossSimClusterInteractionScores] = - crossClusterBlockScores.rowAsKeys - .mapValues(List(_)).sumByKey.toTypedPipe.map { - case (givingClusterId, receivingClustersWithScores) => - AdhocCrossSimClusterInteractionScores( - clusterId = givingClusterId, - clusterScores = receivingClustersWithScores.map { - case (cluster, score) => ClustersScore(cluster, score) - }) - } - - // get cross simcluster features from fav graph - positive user-user interactions - val favGraphMatrix: SparseMatrix[UserId, UserId, Double] = - SparseMatrix.apply[UserId, UserId, Double]( - ExternalDataSources.getFavEdges(HalfLifeInDaysForFavScore)) - - val crossClusterFavScores: SparseMatrix[ClusterId, ClusterId, Double] = - getCrossClusterScores(normalizedUserInterestedInMatrix, favGraphMatrix) - - val favScores: TypedPipe[AdhocCrossSimClusterInteractionScores] = - crossClusterFavScores.rowAsKeys - .mapValues(List(_)).sumByKey.toTypedPipe.map { - case (givingClusterId, receivingClustersWithScores) => - AdhocCrossSimClusterInteractionScores( - clusterId = givingClusterId, - clusterScores = receivingClustersWithScores.map { - case (cluster, score) => ClustersScore(cluster, score) - }) - } - // write both block and fav interaction matrices to hdfs in thrift format - Execution - .zip( - blockScores.writeDALSnapshotExecution( - AdhocCrossSimclusterBlockInteractionFeaturesScalaDataset, - D.Daily, - D.Suffix(outputPathBlocksThrift), - D.Parquet, - dateRange.`end`), - favScores.writeDALSnapshotExecution( - AdhocCrossSimclusterFavInteractionFeaturesScalaDataset, - D.Daily, - D.Suffix(outputPathFavThrift), - D.Parquet, - dateRange.`end`) - ).unit - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/DataSources.docx b/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/DataSources.docx new file mode 100644 index 000000000..8de1e18fb Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/DataSources.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/DataSources.scala b/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/DataSources.scala deleted file mode 100644 index 20c16abbf..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/DataSources.scala +++ /dev/null @@ -1,101 +0,0 @@ -package 
com.twitter.simclusters_v2.scalding.embedding.abuse - -import com.twitter.data.proto.Flock -import com.twitter.scalding.{DateOps, DateRange, Days, RichDate, UniqueID} -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.simclusters_v2.hdfs_sources.InterestedInSources -import com.twitter.simclusters_v2.scalding.common.matrix.SparseMatrix -import com.twitter.simclusters_v2.scalding.embedding.common.EmbeddingUtil.{ClusterId, UserId} -import com.twitter.simclusters_v2.scalding.embedding.common.ExternalDataSources -import graphstore.common.FlockBlocksJavaDataset -import java.util.TimeZone - -object DataSources { - - private val ValidEdgeStateId = 0 - val NumBlocksP95 = 49 - - /** - * Helper function to return Sparse Matrix of user's interestedIn clusters and fav scores - * @param dateRange - * @return - */ - def getUserInterestedInSparseMatrix( - implicit dateRange: DateRange, - timeZone: TimeZone - ): SparseMatrix[UserId, ClusterId, Double] = { - val simClusters = ExternalDataSources.simClustersInterestInSource - - val simClusterMatrixEntries = simClusters - .flatMap { keyVal => - keyVal.value.clusterIdToScores.flatMap { - case (clusterId, score) => - score.favScore.map { favScore => - (keyVal.key, clusterId, favScore) - } - } - } - - SparseMatrix.apply[UserId, ClusterId, Double](simClusterMatrixEntries) - } - - def getUserInterestedInTruncatedKMatrix( - topK: Int - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): SparseMatrix[UserId, ClusterId, Double] = { - SparseMatrix( - InterestedInSources - .simClustersInterestedInUpdatedSource(dateRange, timeZone) - .flatMap { - case (userId, clustersUserIsInterestedIn) => - val sortedAndTruncatedList = clustersUserIsInterestedIn.clusterIdToScores - .mapValues(_.favScore.getOrElse(0.0)).filter(_._2 > 0.0).toList.sortBy(-_._2).take( - topK) - sortedAndTruncatedList.map { - case (clusterId, score) => - (userId, clusterId, score) - } - } - ) - } - - /** - * Helper function to return SparseMatrix of user block interactions from the FlockBlocks - * dataset. All users with greater than numBlocks are filtered out - * @param dateRange - * @return - */ - def getFlockBlocksSparseMatrix( - maxNumBlocks: Int, - rangeForData: DateRange - )( - implicit dateRange: DateRange - ): SparseMatrix[UserId, UserId, Double] = { - implicit val tz: java.util.TimeZone = DateOps.UTC - val userGivingBlocks = SparseMatrix.apply[UserId, UserId, Double]( - DAL - .readMostRecentSnapshotNoOlderThan(FlockBlocksJavaDataset, Days(30)) - .toTypedPipe - .flatMap { data: Flock.Edge => - // Consider edges that are valid and have been updated in the past 1 year - if (data.getStateId == ValidEdgeStateId && - rangeForData.contains(RichDate(data.getUpdatedAt * 1000L))) { - Some((data.getSourceId, data.getDestinationId, 1.0)) - } else { - None - } - }) - // Find all users who give less than numBlocksP95 blocks. - // This is to remove those who might be responsible for automatically blocking users - // on the twitter platform. 
- val usersWithLegitBlocks = userGivingBlocks.rowL1Norms.collect { - case (userId, l1Norm) if l1Norm <= maxNumBlocks => - userId - } - // Retain only those users who give legit blocks (i.e. those who give at most maxNumBlocks blocks). - userGivingBlocks.filterRows(usersWithLegitBlocks) - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/PairedinteractionFeatures.docx b/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/PairedinteractionFeatures.docx new file mode 100644 index 000000000..8991704ff Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/PairedinteractionFeatures.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/PairedinteractionFeatures.scala b/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/PairedinteractionFeatures.scala deleted file mode 100644 index 645519d59..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/PairedinteractionFeatures.scala +++ /dev/null @@ -1,122 +0,0 @@ -package com.twitter.simclusters_v2.scalding.embedding.abuse - -import com.twitter.simclusters_v2.common.ClusterId -import com.twitter.simclusters_v2.thriftscala.{SimClusterWithScore, SimClustersEmbedding} -import com.twitter.util.Try - -object ClusterPair { - def apply( - clusterId: ClusterId, - healthyScore: Double, - unhealthyScore: Double - ): Option[ClusterPair] = { - if (healthyScore + unhealthyScore == 0.0) { - None - } else { - Some(new ClusterPair(clusterId, healthyScore, unhealthyScore)) - } - } -} - -case class ClusterPair private ( - clusterId: ClusterId, - healthyScore: Double, - unhealthyScore: Double) { - - def totalScores: Double = healthyScore + unhealthyScore - - def healthRatio: Double = unhealthyScore / (unhealthyScore + healthyScore) -} - -object PairedInteractionFeatures { - def smoothedHealthRatio( - unhealthySum: Double, - healthySum: Double, - smoothingFactor: Double, - prior: Double - ): Double = - (unhealthySum + smoothingFactor * prior) / (unhealthySum + healthySum + smoothingFactor) -} - -/** - * Class used to derive features for abuse models. We pair a healthy embedding with an unhealthy - * embedding. All the public methods on this class are derived features of these embeddings. - * - * @param healthyInteractionSimClusterEmbedding SimCluster embedding of healthy interactions (for - * instance favs or impressions) - * @param unhealthyInteractionSimClusterEmbedding SimCluster embedding of unhealthy interactions - * (for instance blocks or abuse reports) - */ -case class PairedInteractionFeatures( - healthyInteractionSimClusterEmbedding: SimClustersEmbedding, - unhealthyInteractionSimClusterEmbedding: SimClustersEmbedding) { - - private[this] val scorePairs: Seq[ClusterPair] = { - val clusterToScoreMap = healthyInteractionSimClusterEmbedding.embedding.map { - simClusterWithScore => - simClusterWithScore.clusterId -> simClusterWithScore.score - }.toMap - - unhealthyInteractionSimClusterEmbedding.embedding.flatMap { simClusterWithScore => - val clusterId = simClusterWithScore.clusterId - val positiveScoreOption = clusterToScoreMap.get(clusterId) - positiveScoreOption.flatMap { positiveScore => - ClusterPair(clusterId, positiveScore, simClusterWithScore.score) - } - } - } - - /** - * Get the pair of clusters with the most total interactions. - */ - val highestScoreClusterPair: Option[ClusterPair] = - Try(scorePairs.maxBy(_.totalScores)).toOption - - /** - * Get the pair of clusters with the highest unhealthy to healthy ratio. - */
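As an aside, a quick worked example of the smoothing formula defined in PairedInteractionFeatures above; all numbers are made up:

```scala
// smoothedHealthRatio = (unhealthy + smoothingFactor * prior) / (unhealthy + healthy + smoothingFactor)
val prior = 0.05
val smoothingFactor = 10.0

// Few observations: the raw ratio 2 / (2 + 8) = 0.2 is pulled toward the prior.
val few = PairedInteractionFeatures.smoothedHealthRatio(2.0, 8.0, smoothingFactor, prior)
assert(math.abs(few - 0.125) < 1e-9) // (2 + 0.5) / 20

// Ten times the observations: smoothing matters less, ~0.186, close to the raw 0.2.
val many = PairedInteractionFeatures.smoothedHealthRatio(20.0, 80.0, smoothingFactor, prior)
assert(math.abs(many - 20.5 / 110.0) < 1e-9)
```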
- val highestHealthRatioClusterPair: Option[ClusterPair] = - Try(scorePairs.maxBy(_.healthRatio)).toOption - - /** - * Get the pair of clusters with the lowest unhealthy to healthy ratio. - */ - val lowestHealthRatioClusterPair: Option[ClusterPair] = - Try(scorePairs.minBy(_.healthRatio)).toOption - - /** - * Get an embedding whose values are the ratio of unhealthy to healthy for that simcluster. - */ - val healthRatioEmbedding: SimClustersEmbedding = { - val scores = scorePairs.map { pair => - SimClusterWithScore(pair.clusterId, pair.healthRatio) - } - SimClustersEmbedding(scores) - } - - /** - * Sum of the healthy scores for all the simclusters. - */ - val healthySum: Double = healthyInteractionSimClusterEmbedding.embedding.map(_.score).sum - - /** - * Sum of the unhealthy scores for all the simclusters. - */ - val unhealthySum: Double = unhealthyInteractionSimClusterEmbedding.embedding.map(_.score).sum - - /** - * Ratio of unhealthy to healthy for all simclusters. - */ - val healthRatio: Double = unhealthySum / (unhealthySum + healthySum) - - /** - * Ratio of unhealthy to healthy for all simclusters that is smoothed toward the prior when - * we have fewer observations. - * - * @param smoothingFactor The higher this value, the more interactions we need to move the returned - * ratio. - * @param prior The unhealthy-to-healthy ratio across all interactions. - */ - def smoothedHealthRatio(smoothingFactor: Double, prior: Double): Double = - PairedInteractionFeatures.smoothedHealthRatio(unhealthySum, healthySum, smoothingFactor, prior) -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/SingleSideInteractionTransformation.docx b/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/SingleSideInteractionTransformation.docx new file mode 100644 index 000000000..11c4df067 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/SingleSideInteractionTransformation.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/SingleSideInteractionTransformation.scala b/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/SingleSideInteractionTransformation.scala deleted file mode 100644 index 95577139f..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/embedding/abuse/SingleSideInteractionTransformation.scala +++ /dev/null @@ -1,154 +0,0 @@ -package com.twitter.simclusters_v2.scalding.embedding.abuse - -import com.google.common.annotations.VisibleForTesting -import com.twitter.scalding._ -import com.twitter.simclusters_v2.scalding.common.matrix.SparseMatrix -import com.twitter.simclusters_v2.scalding.embedding.common.EmbeddingUtil.ClusterId -import com.twitter.simclusters_v2.scalding.embedding.common.EmbeddingUtil.UserId -import com.twitter.simclusters_v2.thriftscala.AdhocSingleSideClusterScores -import com.twitter.simclusters_v2.thriftscala.SimClusterWithScore -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbedding - -/** - * Logic for building a SimCluster representation of interaction signals. The purpose of this job is - * to model negative behavior (like abuse and blocks). - * - * This is a "SingleSide" job because we only consider one side of the interaction graph to - * build these features. So for instance we would keep track of which simclusters are most likely to - * get reported for abuse regardless of who reported it. Another job will be responsible for - * building the simcluster to simcluster interaction matrix as described in the doc. - */
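A toy rendering of the "distribute each interaction across the user's clusters" idea that the object below implements with distributed SparseMatrix operations; users, clusters, and counts here are hypothetical:

```scala
// L2-normalized user -> (cluster -> weight) memberships (hypothetical).
val userClusters: Map[Long, Map[Int, Double]] = Map(
  1L -> Map(101 -> 0.6, 102 -> 0.8), // 0.6^2 + 0.8^2 = 1.0
  2L -> Map(102 -> 1.0)
)
// Number of negative interactions (e.g. abuse reports) per user.
val interactionCounts: Map[Long, Double] = Map(1L -> 5.0, 2L -> 2.0)

// Cluster score = sum over users of membershipWeight * interactionCount.
val clusterScores: Map[Int, Double] =
  userClusters.toSeq
    .flatMap { case (user, clusters) =>
      val count = interactionCounts.getOrElse(user, 0.0)
      clusters.map { case (cluster, weight) => cluster -> weight * count }
    }
    .groupBy(_._1)
    .map { case (cluster, scores) => cluster -> scores.map(_._2).sum }
// Map(101 -> 3.0, 102 -> 6.0)
```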
-object SingleSideInteractionTransformation { - - /** - * Compute a score for every SimCluster. The SimCluster score is a count of the number of - * interactions for each SimCluster. For a user that has many SimClusters, we distribute each of - * their interactions across all of these SimClusters. - * - * @param normalizedUserSimClusters Sparse matrix of User-SimCluster scores. Users are rows and - * SimClusters are columns. This should already be L2-normalized. - * It is important that we normalize so that each interaction - * only adds 1 to the counts. - * @param interactionGraph Graph of interactions. Rows are the users, columns are not used. - * All values in this graph are assumed to be positive; they are the number of - * interactions. - * - * @return A SimClusterWithScore for each SimCluster that has a user with an interaction. - */ - def computeClusterFeatures( - normalizedUserSimClusters: SparseMatrix[UserId, ClusterId, Double], - interactionGraph: SparseMatrix[UserId, _, Double] - ): TypedPipe[SimClusterWithScore] = { - - val numReportsForUserEntries = interactionGraph.rowL1Norms.map { - // Turn into a vector where we use 1 as the column key for every entry. - case (user, count) => (user, 1, count) - } - - val numReportsForUser = SparseMatrix[UserId, Int, Double](numReportsForUserEntries) - - normalizedUserSimClusters.transpose - .multiplySparseMatrix(numReportsForUser) - .toTypedPipe - .map { - case (clusterId, _, clusterScore: Double) => - SimClusterWithScore(clusterId, clusterScore) - } - } - - /** - * Given that we have the score for each SimCluster and the user's SimClusters, create a - * representation of the user so that the new SimCluster scores are an estimate of the - * interactions for this user. - * - * @param normalizedUserSimClusters sparse matrix of User-SimCluster scores. Users are rows and - * SimClusters are columns. This should already be L2-normalized. - * @param simClusterFeatures For each SimCluster, a score associated with this interaction type. - * - * @return For each user, the SimClusters and scores for this interaction type. - */ - @VisibleForTesting - private[abuse] def computeUserFeaturesFromClusters( - normalizedUserSimClusters: SparseMatrix[UserId, ClusterId, Double], - simClusterFeatures: TypedPipe[SimClusterWithScore] - ): TypedPipe[(UserId, SimClustersEmbedding)] = { - - normalizedUserSimClusters.toTypedPipe - .map { - case (userId, clusterId, score) => - (clusterId, (userId, score)) - } - .group - // There are at most 140k SimClusters. They should fit in memory. - .hashJoin(simClusterFeatures.groupBy(_.clusterId)) - .map { - case (_, ((userId, score), singleSideClusterFeatures)) => - ( - userId, - List( - SimClusterWithScore( - singleSideClusterFeatures.clusterId, - singleSideClusterFeatures.score * score)) - ) - } - .sumByKey - .mapValues(SimClustersEmbedding.apply) - } - - /** - * Combines all the different SimClustersEmbedding for a user into one - * AdhocSingleSideClusterScores. - * - * @param interactionMap The key is an identifier for the embedding type. The typed pipe will have - * embeddings only for that type of embedding. - * @return Typed pipe with one AdhocSingleSideClusterScores per user. - */
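To see what the pairScores method defined just below produces, a rough in-memory analogue; interaction-type names and scores are hypothetical:

```scala
// Per-interaction-type user embeddings, as userId -> (clusterId -> score).
val favs: Map[Long, Map[Int, Double]] = Map(1L -> Map(101 -> 3.0))
val blocks: Map[Long, Map[Int, Double]] = Map(1L -> Map(101 -> 1.0), 2L -> Map(102 -> 2.0))
val interactionMap = Map("fav" -> favs, "block" -> blocks)

// One record per user keyed by interaction type - the in-memory analogue of
// AdhocSingleSideClusterScores(userId, interactionFeatureList.toMap).
val combined: Map[Long, Map[String, Map[Int, Double]]] =
  interactionMap.toSeq
    .flatMap { case (name, byUser) => byUser.map { case (user, emb) => (user, name -> emb) } }
    .groupBy(_._1)
    .map { case (user, entries) => user -> entries.map(_._2).toMap }
// Map(1 -> Map(fav -> ..., block -> ...), 2 -> Map(block -> ...))
```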
- def pairScores( - interactionMap: Map[String, TypedPipe[(UserId, SimClustersEmbedding)]] - ): TypedPipe[AdhocSingleSideClusterScores] = { - - val combinedInteractions = interactionMap - .map { - case (interactionTypeName, userInteractionFeatures) => - userInteractionFeatures.map { - case (userId, simClustersEmbedding) => - (userId, List((interactionTypeName, simClustersEmbedding))) - } - } - .reduce[TypedPipe[(UserId, List[(String, SimClustersEmbedding)])]] { - case (list1, list2) => - list1 ++ list2 - } - .group - .sumByKey - - combinedInteractions.toTypedPipe - .map { - case (userId, interactionFeatureList) => - AdhocSingleSideClusterScores( - userId, - interactionFeatureList.toMap - ) - } - } - - /** - * Given the SimCluster and interaction graphs, get the user representation for this interaction. - * See the documentation of the underlying methods for more details. - * - * @param normalizedUserSimClusters sparse matrix of User-SimCluster scores. Users are rows and - * SimClusters are columns. This should already be L2-normalized. - * @param interactionGraph Graph of interactions. Rows are the users, columns are not used. - * All values in this graph are assumed to be positive; they are the number of - * interactions. - * - * @return SimClustersEmbedding for all users in the given SimCluster graphs - */ - def clusterScoresFromGraphs( - normalizedUserSimClusters: SparseMatrix[UserId, ClusterId, Double], - interactionGraph: SparseMatrix[UserId, _, Double] - ): TypedPipe[(UserId, SimClustersEmbedding)] = { - val clusterFeatures = computeClusterFeatures(normalizedUserSimClusters, interactionGraph) - computeUserFeaturesFromClusters(normalizedUserSimClusters, clusterFeatures) - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/common/EmbeddingUtil.docx b/src/scala/com/twitter/simclusters_v2/scalding/embedding/common/EmbeddingUtil.docx new file mode 100644 index 000000000..82e11fd81 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/embedding/common/EmbeddingUtil.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/common/EmbeddingUtil.scala b/src/scala/com/twitter/simclusters_v2/scalding/embedding/common/EmbeddingUtil.scala deleted file mode 100644 index 9b1e45f89..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/embedding/common/EmbeddingUtil.scala +++ /dev/null @@ -1,114 +0,0 @@ -package com.twitter.simclusters_v2.scalding.embedding.common - -import com.twitter.simclusters_v2.thriftscala._ -import java.net.InetAddress -import java.net.UnknownHostException - -object EmbeddingUtil { - - type UserId = Long - type ClusterId = Int - type ProducerId = Long - type EmbeddingScore = Double - type SemanticCoreEntityId = Long - type HashtagId = String - type Language = String - - implicit val internalIdOrdering: Ordering[InternalId] = Ordering.by { - case InternalId.EntityId(id) => id.toString - case InternalId.Hashtag(strId) => strId - case InternalId.ClusterId(iid) => iid.toString - case InternalId.LocaleEntityId(LocaleEntityId(entityId, lang)) => lang + entityId.toString - } - - implicit val embeddingTypeOrdering: Ordering[EmbeddingType] = Ordering.by(_.getValue) - - /** - * This ordering holds the assumption that we would NEVER generate embeddings for two separate - * SimClusters KnownFor versions under the same dataset, so we do not need to group by - * model version.
- */ - implicit val SimClustersEmbeddingIdOrdering: Ordering[SimClustersEmbeddingId] = Ordering.by { - case SimClustersEmbeddingId(embeddingType, _, internalId) => (embeddingType, internalId) - } - - val ModelVersionPathMap: Map[ModelVersion, String] = Map( - ModelVersion.Model20m145kDec11 -> "model_20m_145k_dec11", - ModelVersion.Model20m145kUpdated -> "model_20m_145k_updated", - ModelVersion.Model20m145k2020 -> "model_20m_145k_2020" - ) - - /** - * Generates the HDFS output path in order to consolidate the offline embeddings datasets under - * a common directory pattern. - * Prepends "/gcs" if the detected data center is qus1. - * - * @param isAdhoc Whether the dataset was generated from an adhoc run - * @param isManhattanKeyVal Whether the dataset is written as KeyVal and is intended to be imported to Manhattan - * @param modelVersion The model version of SimClusters KnownFor that is used to generate the embedding - * @param pathSuffix Any additional path structure suffixed at the end of the path - * @return The consolidated HDFS path, for example: - * /user/cassowary/adhoc/manhattan_sequence_files/simclusters_embeddings/model_20m_145k_updated/... - */ - def getHdfsPath( - isAdhoc: Boolean, - isManhattanKeyVal: Boolean, - modelVersion: ModelVersion, - pathSuffix: String - ): String = { - val adhoc = if (isAdhoc) "adhoc/" else "" - - val user = System.getenv("USER") - - val gcs: String = - try { - InetAddress.getAllByName("metadata.google.internal") // throws Exception if not in GCP. - "/gcs" - } catch { - case _: UnknownHostException => "" - } - - val datasetType = if (isManhattanKeyVal) "manhattan_sequence_files" else "processed" - - val path = s"/user/$user/$adhoc$datasetType/simclusters_embeddings" - - s"$gcs${path}_${ModelVersionPathMap(modelVersion)}_$pathSuffix" - } - - def favScoreExtractor(u: UserToInterestedInClusterScores): (Double, ScoreType.ScoreType) = { - (u.favScoreClusterNormalizedOnly.getOrElse(0.0), ScoreType.FavScore) - } - - def followScoreExtractor(u: UserToInterestedInClusterScores): (Double, ScoreType.ScoreType) = { - (u.followScoreClusterNormalizedOnly.getOrElse(0.0), ScoreType.FollowScore) - } - - def logFavScoreExtractor(u: UserToInterestedInClusterScores): (Double, ScoreType.ScoreType) = { - (u.logFavScoreClusterNormalizedOnly.getOrElse(0.0), ScoreType.LogFavScore) - } - - // Define all scores to extract from the SimCluster InterestedIn source - val scoreExtractors: Seq[UserToInterestedInClusterScores => (Double, ScoreType.ScoreType)] = - Seq( - favScoreExtractor, - followScoreExtractor - ) - - object ScoreType extends Enumeration { - type ScoreType = Value - val FavScore: Value = Value(1) - val FollowScore: Value = Value(2) - val LogFavScore: Value = Value(3) - } - - @deprecated("Use 'common/ModelVersions'", "2019-09-04") - final val ModelVersion20M145KDec11: String = "20M_145K_dec11" - @deprecated("Use 'common/ModelVersions'", "2019-09-04") - final val ModelVersion20M145KUpdated: String = "20M_145K_updated" - - @deprecated("Use 'common/ModelVersions'", "2019-09-04") - final val ModelVersionMap: Map[String, ModelVersion] = Map( - ModelVersion20M145KDec11 -> ModelVersion.Model20m145kDec11, - ModelVersion20M145KUpdated -> ModelVersion.Model20m145kUpdated - ) -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/common/EntityEmbeddingUtil.docx b/src/scala/com/twitter/simclusters_v2/scalding/embedding/common/EntityEmbeddingUtil.docx new file mode 100644 index 000000000..4e66de1fe Binary files /dev/null and 
b/src/scala/com/twitter/simclusters_v2/scalding/embedding/common/EntityEmbeddingUtil.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/common/EntityEmbeddingUtil.scala b/src/scala/com/twitter/simclusters_v2/scalding/embedding/common/EntityEmbeddingUtil.scala deleted file mode 100644 index b9f715f2e..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/embedding/common/EntityEmbeddingUtil.scala +++ /dev/null @@ -1,79 +0,0 @@ -package com.twitter.simclusters_v2.scalding.embedding.common - -import com.twitter.recos.entities.thriftscala.Entity -import com.twitter.scalding.Args -import com.twitter.scalding.TypedPipe -import com.twitter.simclusters_v2.common.ModelVersions -import com.twitter.simclusters_v2.scalding.embedding.common.EmbeddingUtil.UserId -import com.twitter.simclusters_v2.thriftscala.ModelVersion -import com.twitter.wtf.entity_real_graph.common.EntityUtil -import com.twitter.wtf.entity_real_graph.thriftscala.Edge -import com.twitter.wtf.entity_real_graph.thriftscala.EntityType -import com.twitter.wtf.entity_real_graph.thriftscala.FeatureName - -object EntityEmbeddingUtil { - - def getEntityUserMatrix( - entityRealGraphSource: TypedPipe[Edge], - halfLife: HalfLifeScores.HalfLifeScoresType, - entityType: EntityType - ): TypedPipe[(Entity, (UserId, Double))] = { - entityRealGraphSource - .flatMap { - case Edge(userId, entity, consumerFeatures, _, _) - if consumerFeatures.exists(_.exists(_.featureName == FeatureName.Favorites)) && - EntityUtil.getEntityType(entity) == entityType => - for { - features <- consumerFeatures - favFeatures <- features.find(_.featureName == FeatureName.Favorites) - ewmaMap <- favFeatures.featureValues.ewmaMap - favScore <- ewmaMap.get(halfLife.id) - } yield (entity, (userId, favScore)) - - case _ => None - } - } - - object HalfLifeScores extends Enumeration { - type HalfLifeScoresType = Value - val OneDay: Value = Value(1) - val SevenDays: Value = Value(7) - val FourteenDays: Value = Value(14) - val ThirtyDays: Value = Value(30) - val SixtyDays: Value = Value(60) - } - - case class EntityEmbeddingsJobConfig( - topK: Int, - halfLife: HalfLifeScores.HalfLifeScoresType, - modelVersion: ModelVersion, - entityType: EntityType, - isAdhoc: Boolean) - - object EntityEmbeddingsJobConfig { - - def apply(args: Args, isAdhoc: Boolean): EntityEmbeddingsJobConfig = { - - val entityTypeArg = - EntityType.valueOf(args.getOrElse("entity-type", default = "")) match { - case Some(entityType) => entityType - case _ => - throw new IllegalArgumentException( - s"Argument [--entity-type] must be provided. 
Supported options [" + - s"${EntityType.SemanticCore.name}, ${EntityType.Hashtag.name}]") - } - - EntityEmbeddingsJobConfig( - topK = args.getOrElse("top-k", default = "100").toInt, - halfLife = HalfLifeScores(args.getOrElse("half-life", default = "14").toInt), - // Fail fast if there is no correct model-version argument - modelVersion = ModelVersions.toModelVersion( - args.getOrElse("model-version", ModelVersions.Model20M145K2020) - ), - // Fail fast if there is no correct entity-type argument - entityType = entityTypeArg, - isAdhoc = isAdhoc - ) - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/common/ExternalDataSources.docx b/src/scala/com/twitter/simclusters_v2/scalding/embedding/common/ExternalDataSources.docx new file mode 100644 index 000000000..155f521d3 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/embedding/common/ExternalDataSources.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/common/ExternalDataSources.scala b/src/scala/com/twitter/simclusters_v2/scalding/embedding/common/ExternalDataSources.scala deleted file mode 100644 index 729cb95d0..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/embedding/common/ExternalDataSources.scala +++ /dev/null @@ -1,565 +0,0 @@ -package com.twitter.simclusters_v2.scalding.embedding.common - -import com.twitter.algebird.Aggregator -import com.twitter.common.text.language.LocaleUtil -import com.twitter.escherbird.common.thriftscala.Locale -import com.twitter.escherbird.common.thriftscala.LocalizedUser -import com.twitter.escherbird.metadata.thriftscala.FullMetadata -import com.twitter.escherbird.scalding.source.FullMetadataSource -import com.twitter.escherbird.scalding.source.utt.UttSourceScalaDataset -import com.twitter.escherbird.utt.strato.thriftscala.SnapshotType -import com.twitter.escherbird.utt.thriftscala.UttEntityRecord -import com.twitter.interests_ds.jobs.interests_service.UserTopicRelationSnapshotScalaDataset -import com.twitter.interests.thriftscala.InterestRelationType -import com.twitter.interests.thriftscala.UserInterestsRelationSnapshot -import com.twitter.penguin.scalding.datasets.PenguinUserLanguagesScalaDataset -import com.twitter.scalding.DateOps -import com.twitter.scalding.DateRange -import com.twitter.scalding.Days -import com.twitter.scalding.Stat -import com.twitter.scalding.TypedPipe -import com.twitter.scalding.UniqueID -import com.twitter.scalding.ValuePipe -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation -import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossClusterSameDC -import com.twitter.scalding_internal.dalv2.remote_access.ProcAtla -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.common.UserId -import com.twitter.simclusters_v2.common._ -import com.twitter.simclusters_v2.hdfs_sources.SimclustersV2InterestedIn20M145KUpdatedScalaDataset -import com.twitter.simclusters_v2.hdfs_sources.UserUserFavGraphScalaDataset -import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossDC -import com.twitter.common_header.thriftscala.CommonHeader -import com.twitter.common_header.thriftscala.IdType -import com.twitter.common_header.thriftscala.VersionedCommonHeader -import flockdb_tools.datasets.flock.FlockBlocksEdgesScalaDataset -import flockdb_tools.datasets.flock.FlockFollowsEdgesScalaDataset -import flockdb_tools.datasets.flock.FlockReportAsAbuseEdgesScalaDataset 
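Referring back to EntityEmbeddingsJobConfig above, a rough sketch of how it would be built from command-line flags; the flag values are hypothetical, and omitting --model-version falls back to the Model20M145K2020 default:

```scala
import com.twitter.scalding.Args

// Hypothetical adhoc invocation.
val args = Args(Array("--entity-type", "SemanticCore", "--top-k", "100", "--half-life", "14"))
val config = EntityEmbeddingsJobConfig(args, isAdhoc = true)
// config.topK == 100; config.halfLife == HalfLifeScores.FourteenDays
```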
-import flockdb_tools.datasets.flock.FlockReportAsSpamEdgesScalaDataset -import twadoop_config.configuration.log_categories.group.search.AdaptiveSearchScalaDataset -import com.twitter.search.adaptive.scribing.thriftscala.AdaptiveSearchScribeLog -import twadoop_config.configuration.log_categories.group.timeline.TimelineServiceFavoritesScalaDataset -import tweetsource.common.UnhydratedFlatScalaDataset -import com.twitter.frigate.data_pipeline.magicrecs.magicrecs_notifications_lite.thriftscala.MagicRecsNotificationLite -import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn -import com.twitter.simclusters_v2.thriftscala.EdgeWithDecayedWeights -import com.twitter.timelineservice.thriftscala.ContextualizedFavoriteEvent -import com.twitter.timelineservice.thriftscala.FavoriteEventUnion -import com.twitter.tweetsource.common.thriftscala.UnhydratedFlatTweet -import com.twitter.usersource.snapshot.flat.UsersourceFlatScalaDataset -import com.twitter.wtf.entity_real_graph.scalding.common.DatasetConstants -import com.twitter.wtf.entity_real_graph.scalding.common.SemanticCoreFilters -import com.twitter.wtf.scalding.client_event_processing.thriftscala.InteractionDetails -import com.twitter.wtf.scalding.client_event_processing.thriftscala.InteractionType -import com.twitter.wtf.scalding.client_event_processing.thriftscala.TweetImpressionDetails -import com.twitter.frigate.data_pipeline.scalding.magicrecs.magicrecs_notification_lite.MagicrecsNotificationLite1DayLagScalaDataset -import com.twitter.iesource.thriftscala.InteractionEvent -import com.twitter.iesource.thriftscala.InteractionTargetType -import com.twitter.wtf.scalding.jobs.client_event_processing.UserInteractionScalaDataset -import java.util.TimeZone -import com.twitter.interests_ds.jobs.interests_service.UserInterestRelationSnapshotScalaDataset -import com.twitter.simclusters_v2.scalding.embedding.common.EmbeddingUtil.UserId -import com.twitter.scalding.typed.{ValuePipe => TypedValuePipe} -import com.twitter.tweetsource.common.thriftscala.UnhydratedTweet -import tweetsource.common.UnhydratedScalaDataset - -object ExternalDataSources { - val UTTDomain = 131L - val usersourceColumns = Set("id", "account_country_code", "language") - val ValidFlockEdgeStateId = 0 - - def getStandardLanguageCode(language: String): Option[String] = { - val locale = LocaleUtil.getLocaleOf(language) - if (locale == LocaleUtil.UNKNOWN) None else Some(locale.getLanguage) - } - - // Reads UTT Entity Records (`utt_source` dataset) - def getUttEntityRecords(implicit timeZone: TimeZone): TypedPipe[UttEntityRecord] = { - DAL - .readMostRecentSnapshotNoOlderThan(UttSourceScalaDataset, Days(14)) - .withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .toTypedPipe - } - - /** - * Extracts the KGO seeds from the UTT Entity Records. - * Uses the most recent "Stable" version by default unless specified otherwise. - * - * @param uttVersion UTT Version to use instead of the default value. 
- */ - def getLocaleProducerSeedIdsFromUttEntityRecords( - uttVersion: Option[Long] = None - )( - implicit timeZone: TimeZone, - uniqueId: UniqueID - ): TypedPipe[((TopicId, Language), Seq[UserId])] = { - - val topicLangPairCount = Stat("topic_lang_pair_count_all") - val topicLangPairCountEmptySeed = Stat("topic_lang_pair_count_empty_seed") - val topicLangPairCountLteOneSeed = Stat("topic_lang_pair_count_lte_one_seed") - val topicLangPairCountLteFiveSeeds = Stat("topic_lang_pair_count_lte_five_seeds") - val topicLangPairCountLteTenSeeds = Stat("topic_lang_pair_count_lte_ten_seeds") - - val uttEntityRecords: TypedPipe[UttEntityRecord] = getUttEntityRecords - - val uttVersionToUse: ValuePipe[Long] = uttVersion match { - case Some(uttVersionValue) => - TypedValuePipe(uttVersionValue) - case _ => // find the most recent "stable" version as recommended by the SemanticCore team - uttEntityRecords - .filter(_.snapshotType.exists(_ == SnapshotType.Stable)) - .map(_.version) - .distinct - .aggregate(Aggregator.min) // the most recent version is the smallest negative value - } - - val uttEntityRecordsSingleVersion: TypedPipe[UttEntityRecord] = - uttEntityRecords - .filterWithValue(uttVersionToUse) { - case (uttEntityRecord: UttEntityRecord, uttVersionOpt: Option[Long]) => - uttVersionOpt.contains(uttEntityRecord.version) - } - - uttEntityRecordsSingleVersion.flatMap { uttEntityRecord: UttEntityRecord => - val localizedUsers: Seq[LocalizedUser] = - uttEntityRecord.knownForUsers.flatMap(_.localizedUsers).getOrElse(Nil) - - val validLocalizedUsers: Seq[(TopicId, Language, UserId)] = - localizedUsers - .flatMap { - case LocalizedUser(userId: UserId, Some(Locale(Some(language: String), _)), _) => - Some((uttEntityRecord.entityId, language, userId)) - case _ => - None - } - - val localeProducerSeedIds: Seq[((TopicId, Language), Seq[UserId])] = validLocalizedUsers - .groupBy { - case (topicId: TopicId, language: Language, _) => - (topicId, language) - } - .mapValues(_.map(_._3).distinct) // values are distinct producerIds - .toSeq - - localeProducerSeedIds.foreach { // stats - case (_, seedIds: Seq[UserId]) => - topicLangPairCount.inc() - if (seedIds.isEmpty) topicLangPairCountEmptySeed.inc() - if (seedIds.length <= 1) topicLangPairCountLteOneSeed.inc() - if (seedIds.length <= 5) topicLangPairCountLteFiveSeeds.inc() - if (seedIds.length <= 10) topicLangPairCountLteTenSeeds.inc() - } - - localeProducerSeedIds - }.forceToDisk - } - - def uttEntitiesSource( - customFullMetadataSource: Option[TypedPipe[FullMetadata]] = None - )( - implicit dateRange: DateRange, - timeZone: TimeZone - ): TypedPipe[Long] = { - customFullMetadataSource - .getOrElse(fullMetadataSource) - .flatMap { - case fullMetadata if fullMetadata.domainId == UTTDomain => - for { - basicMetadata <- fullMetadata.basicMetadata - indexableFields <- basicMetadata.indexableFields - tags <- indexableFields.tags - if !SemanticCoreFilters.shouldFilterByTags(tags.toSet, DatasetConstants.stopTags) - } yield { - fullMetadata.entityId - } - case _ => None - } - } - - // Get followable topics from Escherbird - def uttFollowableEntitiesSource( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): TypedPipe[Long] = { - val followableEntityCount = Stat("followable_entities_count") - val FollowableTag = "utt:followable_topic" - fullMetadataSource - .flatMap { - case fullMetadata if fullMetadata.domainId == UTTDomain => - for { - basicMetadata <- fullMetadata.basicMetadata - indexableFields <- basicMetadata.indexableFields - tags <- 
indexableFields.tags - if tags.contains(FollowableTag) - } yield { - followableEntityCount.inc() - fullMetadata.entityId - } - case _ => None - } - } - - def fullMetadataSource( - implicit dateRange: DateRange, - timeZone: TimeZone - ): TypedPipe[FullMetadata] = { - TypedPipe - .from( - new FullMetadataSource(s"/atla/proc/${FullMetadataSource.DefaultHdfsPath}")()( - dateRange.embiggen(Days(7)))) - } - - def userSource(implicit timeZone: TimeZone): TypedPipe[(UserId, (Country, Language))] = - DAL - .readMostRecentSnapshotNoOlderThan(UsersourceFlatScalaDataset, Days(7)) - .withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .withColumns(usersourceColumns) - .toTypedPipe.flatMap { flatUser => - for { - userId <- flatUser.id - country <- flatUser.accountCountryCode - language <- flatUser.language - standardLang <- getStandardLanguageCode(language) - } yield { - (userId, country.toUpperCase, standardLang) - } - }.distinct - .map { case (user, country, lang) => user -> (country, lang) } - - // Build user language source from inferred languages (penguin_user_languages dataset) - def inferredUserConsumedLanguageSource( - implicit timeZone: TimeZone - ): TypedPipe[(UserId, Seq[(Language, Double)])] = { - DAL - .readMostRecentSnapshotNoOlderThan(PenguinUserLanguagesScalaDataset, Days(7)) - .withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .toTypedPipe - .map { kv => - val consumed = kv.value.consumed - .collect { - case scoredString if scoredString.weight > 0.001 => //throw away 5% outliers - (getStandardLanguageCode(scoredString.item), scoredString.weight) - }.collect { - case (Some(language), score) => (language, score) - } - (kv.key, consumed) - } - } - - def inferredUserProducedLanguageSource( - implicit timeZone: TimeZone - ): TypedPipe[(UserId, Seq[(Language, Double)])] = { - DAL - .readMostRecentSnapshotNoOlderThan(PenguinUserLanguagesScalaDataset, Days(7)) - .withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .toTypedPipe - .map { kv => - val produced = kv.value.produced - .collect { - case scoredString if scoredString.weight > 0.15 => //throw away 5% outliers - (getStandardLanguageCode(scoredString.item), scoredString.weight) - }.collect { - case (Some(language), score) => (language, score) - } - (kv.key, produced) - } - } - - def simClustersInterestInSource( - implicit dateRange: DateRange, - timeZone: TimeZone - ): TypedPipe[KeyVal[UserId, ClustersUserIsInterestedIn]] = { - DAL - .readMostRecentSnapshotNoOlderThan( - SimclustersV2InterestedIn20M145KUpdatedScalaDataset, - Days(30)) - .withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .toTypedPipe - } - - def simClustersInterestInLogFavSource( - minLogFavScore: Double - )( - implicit dateRange: DateRange, - timeZone: TimeZone - ): TypedPipe[(UserId, Map[ClusterId, Double])] = { - simClustersInterestInSource.map { - case KeyVal(userId, clustersUserIsInterestedIn) => - userId -> clustersUserIsInterestedIn.clusterIdToScores - .map { - case (clusterId, scores) => - clusterId -> scores.logFavScore.getOrElse(0.0) - } - .filter(_._2 > minLogFavScore) - .toMap - } - } - - def topicFollowGraphSource( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): TypedPipe[(TopicId, UserId)] = { - val userTopicFollowCount = Stat("user_topic_follow_count") - DAL - .readMostRecentSnapshotNoOlderThan(UserTopicRelationSnapshotScalaDataset, Days(7)) - .withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .toTypedPipe - .collect { - case userInterestsRelationSnapshot: UserInterestsRelationSnapshot - if 
userInterestsRelationSnapshot.interestType == "UTT" && - userInterestsRelationSnapshot.relation == InterestRelationType.Followed => - (userInterestsRelationSnapshot.interestId, userInterestsRelationSnapshot.userId) - } - .hashJoin(uttFollowableEntitiesSource.asKeys) - .map { - case (topic, (user, _)) => - userTopicFollowCount.inc() - (topic, user) - } - } - - def notInterestedTopicsSource( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): TypedPipe[(TopicId, UserId)] = { - val userNotInterestedInTopicsCount = Stat("user_not_interested_in_topics_count") - DAL - .readMostRecentSnapshotNoOlderThan( - UserInterestRelationSnapshotScalaDataset, - Days(7)).withRemoteReadPolicy(ExplicitLocation(ProcAtla)).toTypedPipe.collect { - case userInterestsRelationSnapshot: UserInterestsRelationSnapshot - if userInterestsRelationSnapshot.interestType == "UTT" && - userInterestsRelationSnapshot.relation == InterestRelationType.NotInterested => - (userInterestsRelationSnapshot.interestId, userInterestsRelationSnapshot.userId) - } - .hashJoin(uttFollowableEntitiesSource.asKeys) - .map { - case (topic, (user, _)) => - userNotInterestedInTopicsCount.inc() - (topic, user) - } - } - - def tweetSource( - implicit dateRange: DateRange - ): TypedPipe[UnhydratedTweet] = { - DAL - .read(UnhydratedScalaDataset, dateRange).withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .toTypedPipe - } - - def flatTweetsSource( - implicit dateRange: DateRange - ): TypedPipe[UnhydratedFlatTweet] = { - DAL - .read(UnhydratedFlatScalaDataset, dateRange) - .withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .toTypedPipe - } - - def userTweetFavoritesSource( - implicit dateRange: DateRange - ): TypedPipe[(UserId, TweetId, Timestamp)] = { - DAL - .read(TimelineServiceFavoritesScalaDataset, dateRange) - .withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .toTypedPipe - .flatMap { cfe: ContextualizedFavoriteEvent => - cfe.event match { - case FavoriteEventUnion.Favorite(fav) => - Some(fav.userId, fav.tweetId, fav.eventTimeMs) - case _ => - None - } - } - } - - def userTweetImpressionsSource( - dwellSec: Int = 1 - )( - implicit dateRange: DateRange - ): TypedPipe[(UserId, TweetId, Timestamp)] = { - DAL - .read(UserInteractionScalaDataset, dateRange) - .withRemoteReadPolicy(AllowCrossClusterSameDC) - .toTypedPipe - .flatMap { - case userInteraction - if userInteraction.interactionType == InteractionType.TweetImpressions => - userInteraction.interactionDetails match { - case InteractionDetails.TweetImpressionDetails( - TweetImpressionDetails(tweetId, _, dwellTimeInSecOpt)) - if dwellTimeInSecOpt.exists(_ >= dwellSec) => - Some(userInteraction.userId, tweetId, userInteraction.timeStamp) - case _ => - None - } - case _ => None - } - } - - def transformFavEdges( - input: TypedPipe[EdgeWithDecayedWeights], - halfLifeInDaysForFavScore: Int - )( - implicit uniqueID: UniqueID - ): TypedPipe[(Long, Long, Double)] = { - val numEdgesWithSpecifiedHalfLife = Stat( - s"num_edges_with_specified_half_life_${halfLifeInDaysForFavScore}_days") - val numEdgesWithoutSpecifiedHalfLife = Stat( - s"num_edges_without_specified_half_life_${halfLifeInDaysForFavScore}_days") - input - .flatMap { edge => - if (edge.weights.halfLifeInDaysToDecayedSums.contains(halfLifeInDaysForFavScore)) { - numEdgesWithSpecifiedHalfLife.inc() - // Look up the requested half-life rather than a hard-coded 100-day key, - // which would be wrong (or throw) for any other half-life value. - Some( - ( - edge.sourceId, - edge.destinationId, - edge.weights.halfLifeInDaysToDecayedSums(halfLifeInDaysForFavScore))) - } else { - numEdgesWithoutSpecifiedHalfLife.inc() - None - } - } - } - - def getFavEdges( - halfLifeInDaysForFavScore: Int -
)( - implicit dateRange: DateRange, - uniqueID: UniqueID - ): TypedPipe[(Long, Long, Double)] = { - implicit val tz: java.util.TimeZone = DateOps.UTC - transformFavEdges( - DAL - .readMostRecentSnapshotNoOlderThan(UserUserFavGraphScalaDataset, Days(14)) - .withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .toTypedPipe, - halfLifeInDaysForFavScore - ) - } - - def flockReportAsSpamSource( - )( - implicit dateRange: DateRange - ): TypedPipe[(UserId, UserId)] = { - DAL - .readMostRecentSnapshot(FlockReportAsSpamEdgesScalaDataset) - .toTypedPipe - .collect { - case edge if edge.state == ValidFlockEdgeStateId => - (edge.sourceId, edge.destinationId) - } - } - - def flockBlocksSource( - )( - implicit dateRange: DateRange - ): TypedPipe[(UserId, UserId)] = { - DAL - .readMostRecentSnapshot(FlockBlocksEdgesScalaDataset) - .toTypedPipe - .collect { - case edge if edge.state == ValidFlockEdgeStateId => - (edge.sourceId, edge.destinationId) - } - } - - def flockFollowsSource( - )( - implicit dateRange: DateRange - ): TypedPipe[(UserId, UserId)] = { - DAL - .readMostRecentSnapshot(FlockFollowsEdgesScalaDataset) - .toTypedPipe - .collect { - case edge if edge.state == ValidFlockEdgeStateId => - (edge.sourceId, edge.destinationId) - } - } - - def flockReportAsAbuseSource( - )( - implicit dateRange: DateRange - ): TypedPipe[(UserId, UserId)] = { - DAL - .readMostRecentSnapshot(FlockReportAsAbuseEdgesScalaDataset) - .toTypedPipe - .collect { - case edge if edge.state == ValidFlockEdgeStateId => - (edge.sourceId, edge.destinationId) - } - } - - def magicRecsNotficationOpenOrClickEventsSource( - implicit dateRange: DateRange - ): TypedPipe[MagicRecsNotificationLite] = { - DAL - .read(MagicrecsNotificationLite1DayLagScalaDataset, dateRange) - .toTypedPipe - .filter { entry => - // keep entries with a valid userId and tweetId, opened or clicked timestamp defined - val userIdExists = entry.targetUserId.isDefined - val tweetIdExists = entry.tweetId.isDefined - val openOrClickExists = - entry.openTimestampMs.isDefined || entry.ntabClickTimestampMs.isDefined - userIdExists && tweetIdExists && openOrClickExists - } - } - - def ieSourceTweetEngagementsSource(implicit dateRange: DateRange): TypedPipe[InteractionEvent] = { - DAL - .read( - com.twitter.iesource.processing.events.batch.ServerEngagementsScalaDataset, - dateRange).withColumns( - Set("targetId", "targetType", "engagingUserId", "details", "referenceTweet")) - .toTypedPipe - .filter { event => - // filter out logged out users because their favorites are less reliable - event.engagingUserId > 0L && event.targetType == InteractionTargetType.Tweet - } - } - - private def userIdFromBlenderAdaptiveScribeLog( - blenderAdaptiveLog: AdaptiveSearchScribeLog - ): Option[Long] = { - blenderAdaptiveLog.versionedCommonHeader match { - case VersionedCommonHeader.CommonHeader(CommonHeader.ServerHeader(serverHeader)) => - serverHeader.requestInfo match { - case Some(requestInfo) => requestInfo.ids.get(IdType.UserId).map(_.toLong) - case _ => None - } - case _ => None - } - } - - def adaptiveSearchScribeLogsSource( - implicit dateRange: DateRange - ): TypedPipe[(UserId, String)] = { - val searchData: TypedPipe[AdaptiveSearchScribeLog] = - DAL - .read(AdaptiveSearchScalaDataset, dateRange).toTypedPipe - - searchData - .flatMap({ scribeLog: AdaptiveSearchScribeLog => - for { - userId <- userIdFromBlenderAdaptiveScribeLog(scribeLog) - // filter out logged out search queries - if userId != 0 - queryString <- scribeLog.requestLog.flatMap(_.request).flatMap(_.rawQuery) - } yield 
{ - (userId, Set(queryString)) - } - }) - // if a user searches for the same query multiple times, there could be duplicates. - // De-dup them to get the distinct queries searched by a user - .sumByKey - .flatMap { - case (userId, distinctQuerySet) => - distinctQuerySet.map { query => - (userId, query) - } - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/common/SimClustersEmbeddingJob.docx b/src/scala/com/twitter/simclusters_v2/scalding/embedding/common/SimClustersEmbeddingJob.docx new file mode 100644 index 000000000..e4dc218cc Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/embedding/common/SimClustersEmbeddingJob.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/common/SimClustersEmbeddingJob.scala b/src/scala/com/twitter/simclusters_v2/scalding/embedding/common/SimClustersEmbeddingJob.scala deleted file mode 100644 index db5ba807d..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/embedding/common/SimClustersEmbeddingJob.scala +++ /dev/null @@ -1,248 +0,0 @@ -package com.twitter.simclusters_v2.scalding.embedding.common - -import com.twitter.scalding.{Args, DateRange, Execution, TypedPipe, UniqueID} -import com.twitter.simclusters_v2.common.ModelVersions -import com.twitter.simclusters_v2.scalding.common.matrix.{SparseMatrix, SparseRowMatrix} -import com.twitter.simclusters_v2.scalding.embedding.common.EmbeddingUtil._ -import com.twitter.simclusters_v2.thriftscala._ -import java.util.TimeZone - -/** - * This is the base job for computing SimClusters Embedding for any Noun Type on Twitter, such as - * Users, Tweets, Topics, Entities, Channels, etc. - * - * The most straightforward way to understand the SimClusters Embeddings for a Noun is that it is - * a weighted sum of SimClusters InterestedIn vectors from users who are interested in the Noun. - * So for a noun type, you only need to define `prepareNounToUserMatrix` to pass in a matrix which - * represents how much each user is interested in this noun. 
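A toy, in-memory version of that weighted sum; the noun, users, and scores are hypothetical, and the real trait below computes this as a distributed matrix product with L2 normalization of noun rows and cluster columns:

```scala
// noun -> (user -> affinity), e.g. how much each user favs a topic (hypothetical).
val nounToUser: Map[String, Map[Long, Double]] = Map("topicA" -> Map(1L -> 3.0, 2L -> 1.0))
// user -> (cluster -> InterestedIn score), hypothetical.
val userToCluster: Map[Long, Map[Int, Double]] = Map(
  1L -> Map(101 -> 0.9, 102 -> 0.1),
  2L -> Map(102 -> 0.7)
)

def l2Normalize(v: Map[Long, Double]): Map[Long, Double] = {
  val norm = math.sqrt(v.values.map(x => x * x).sum)
  v.map { case (k, x) => k -> x / norm }
}

// Noun embedding = sum over users of normalizedAffinity * userClusterVector.
val topicEmbedding: Map[Int, Double] =
  l2Normalize(nounToUser("topicA")).toSeq
    .flatMap { case (user, w) =>
      userToCluster.getOrElse(user, Map.empty).map { case (c, s) => c -> w * s }
    }
    .groupBy(_._1)
    .map { case (c, xs) => c -> xs.map(_._2).sum }
// Roughly Map(101 -> 0.85, 102 -> 0.32)
```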
- */ -trait SimClustersEmbeddingBaseJob[NounType] { - - def numClustersPerNoun: Int - - def numNounsPerClusters: Int - - def thresholdForEmbeddingScores: Double - - def numReducersOpt: Option[Int] = None - - def prepareNounToUserMatrix( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): SparseMatrix[NounType, UserId, Double] - - def prepareUserToClusterMatrix( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): SparseRowMatrix[UserId, ClusterId, Double] - - def writeNounToClustersIndex( - output: TypedPipe[(NounType, Seq[(ClusterId, Double)])] - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] - - def writeClusterToNounsIndex( - output: TypedPipe[(ClusterId, Seq[(NounType, Double)])] - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] - - def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - - val embeddingMatrix: SparseRowMatrix[NounType, ClusterId, Double] = - prepareNounToUserMatrix.rowL2Normalize - .multiplySkinnySparseRowMatrix( - prepareUserToClusterMatrix.colL2Normalize, - numReducersOpt - ) - .filter((_, _, v) => v > thresholdForEmbeddingScores) - - Execution - .zip( - writeNounToClustersIndex( - embeddingMatrix.sortWithTakePerRow(numClustersPerNoun)(Ordering.by(-_._2)) - ), - writeClusterToNounsIndex( - embeddingMatrix.sortWithTakePerCol(numNounsPerClusters)( - Ordering.by(-_._2) - ) - ) - ) - .unit - } - -} - -object SimClustersEmbeddingJob { - - /** - * Multiply the [user, cluster] and [user, T] matrices, and return the cross product. - */ - def computeEmbeddings[T]( - simClustersSource: TypedPipe[(UserId, ClustersUserIsInterestedIn)], - normalizedInputMatrix: TypedPipe[(UserId, (T, Double))], - scoreExtractors: Seq[UserToInterestedInClusterScores => (Double, ScoreType.ScoreType)], - modelVersion: ModelVersion, - toSimClustersEmbeddingId: (T, ScoreType.ScoreType) => SimClustersEmbeddingId, - numReducers: Option[Int] = None - ): TypedPipe[(SimClustersEmbeddingId, (ClusterId, Double))] = { - val userSimClustersMatrix = - getUserSimClustersMatrix(simClustersSource, scoreExtractors, modelVersion) - multiplyMatrices( - normalizedInputMatrix, - userSimClustersMatrix, - toSimClustersEmbeddingId, - numReducers) - } - - def getL2Norm[T]( - inputMatrix: TypedPipe[(T, (UserId, Double))], - numReducers: Option[Int] = None - )( - implicit ordering: Ordering[T] - ): TypedPipe[(T, Double)] = { - val l2Norm = inputMatrix - .mapValues { - case (_, score) => score * score - } - .sumByKey - .mapValues(math.sqrt) - - numReducers match { - case Some(reducers) => l2Norm.withReducers(reducers) - case _ => l2Norm - } - } - - def getNormalizedTransposeInputMatrix[T]( - inputMatrix: TypedPipe[(T, (UserId, Double))], - numReducers: Option[Int] = None - )( - implicit ordering: Ordering[T] - ): TypedPipe[(UserId, (T, Double))] = { - val inputWithNorm = inputMatrix.join(getL2Norm(inputMatrix, numReducers)) - - (numReducers match { - case Some(reducers) => inputWithNorm.withReducers(reducers) - case _ => inputWithNorm - }).map { - case (inputId, ((userId, favScore), norm)) => - (userId, (inputId, favScore / norm)) - } - } - - /** - * Matrix multiplication with the ability to tune the reducer size for better performance - */ - @Deprecated - def legacyMultiplyMatrices[T]( - normalizedTransposeInputMatrix: TypedPipe[(UserId, (T, Double))], - userSimClustersMatrix: 
TypedPipe[(UserId, Seq[(ClusterId, Double)])], - numReducers: Int // Matrix multiplication is expensive. Use this to tune performance - )( - implicit ordering: Ordering[T] - ): TypedPipe[((ClusterId, T), Double)] = { - normalizedTransposeInputMatrix - .join(userSimClustersMatrix) - .withReducers(numReducers) - .flatMap { - case (_, ((inputId, score), clustersWithScores)) => - clustersWithScores.map { - case (clusterId, clusterScore) => - ((clusterId, inputId), score * clusterScore) - } - } - .sumByKey - .withReducers(numReducers + 1) // +1 to distinguish this step from above in Dr. Scalding - } - - def multiplyMatrices[T]( - normalizedTransposeInputMatrix: TypedPipe[(UserId, (T, Double))], - userSimClustersMatrix: TypedPipe[(UserId, Seq[((ClusterId, ScoreType.ScoreType), Double)])], - toSimClustersEmbeddingId: (T, ScoreType.ScoreType) => SimClustersEmbeddingId, - numReducers: Option[Int] = None - ): TypedPipe[(SimClustersEmbeddingId, (ClusterId, Double))] = { - val inputJoinedWithSimClusters = numReducers match { - case Some(reducers) => - normalizedTransposeInputMatrix - .join(userSimClustersMatrix) - .withReducers(reducers) - case _ => - normalizedTransposeInputMatrix.join(userSimClustersMatrix) - } - - val matrixMultiplicationResult = inputJoinedWithSimClusters.flatMap { - case (_, ((inputId, inputScore), clustersWithScores)) => - clustersWithScores.map { - case ((clusterId, scoreType), clusterScore) => - ((clusterId, toSimClustersEmbeddingId(inputId, scoreType)), inputScore * clusterScore) - } - }.sumByKey - - (numReducers match { - case Some(reducers) => - matrixMultiplicationResult.withReducers(reducers + 1) - case _ => matrixMultiplicationResult - }).map { - case ((clusterId, embeddingId), score) => - (embeddingId, (clusterId, score)) - } - } - - def getUserSimClustersMatrix( - simClustersSource: TypedPipe[(UserId, ClustersUserIsInterestedIn)], - scoreExtractors: Seq[UserToInterestedInClusterScores => (Double, ScoreType.ScoreType)], - modelVersion: ModelVersion - ): TypedPipe[(UserId, Seq[((ClusterId, ScoreType.ScoreType), Double)])] = { - simClustersSource.map { - case (userId, clusters) - if ModelVersions.toModelVersion(clusters.knownForModelVersion) == modelVersion => - userId -> clusters.clusterIdToScores.flatMap { - case (clusterId, clusterScores) => - scoreExtractors.map { scoreExtractor => - scoreExtractor(clusterScores) match { - case (score, scoreType) => ((clusterId, scoreType), score) - } - } - }.toSeq - case (userId, _) => userId -> Nil - } - } - - def toReverseIndexSimClusterEmbedding( - embeddings: TypedPipe[(SimClustersEmbeddingId, (ClusterId, EmbeddingScore))], - topK: Int - ): TypedPipe[(SimClustersEmbeddingId, InternalIdEmbedding)] = { - embeddings - .map { - case (embeddingId, (clusterId, score)) => - ( - SimClustersEmbeddingId( - embeddingId.embeddingType, - embeddingId.modelVersion, - InternalId.ClusterId(clusterId)), - (embeddingId.internalId, score)) - } - .group - .sortedReverseTake(topK)(Ordering.by(_._2)) - .mapValues { topInternalIdsWithScore => - val internalIdsWithScore = topInternalIdsWithScore.map { - case (internalId, score) => InternalIdWithScore(internalId, score) - } - InternalIdEmbedding(internalIdsWithScore) - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/producer/AggregatableFavBasedProducerEmbeddings.docx b/src/scala/com/twitter/simclusters_v2/scalding/embedding/producer/AggregatableFavBasedProducerEmbeddings.docx new file mode 100644 index 000000000..1d2b7c059 Binary files /dev/null and 
b/src/scala/com/twitter/simclusters_v2/scalding/embedding/producer/AggregatableFavBasedProducerEmbeddings.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/producer/AggregatableFavBasedProducerEmbeddings.scala b/src/scala/com/twitter/simclusters_v2/scalding/embedding/producer/AggregatableFavBasedProducerEmbeddings.scala deleted file mode 100644 index e4c1d6f58..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/embedding/producer/AggregatableFavBasedProducerEmbeddings.scala +++ /dev/null @@ -1,278 +0,0 @@ -package com.twitter.simclusters_v2.scalding.embedding.producer - -import com.twitter.scalding._ -import com.twitter.scalding_internal.dalv2.DALWrite._ -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.scalding_internal.source.lzo_scrooge.FixedPathLzoScrooge -import com.twitter.simclusters_v2.hdfs_sources.{ - AggregatableProducerSimclustersEmbeddingsByFavScoreScalaDataset, - AggregatableProducerSimclustersEmbeddingsByFavScoreThriftScalaDataset, - AggregatableProducerSimclustersEmbeddingsByFavScore2020ScalaDataset, - AggregatableProducerSimclustersEmbeddingsByFavScore2020ThriftScalaDataset -} -import com.twitter.simclusters_v2.scalding.embedding.common.EmbeddingUtil -import com.twitter.simclusters_v2.thriftscala._ -import com.twitter.wtf.scalding.jobs.common.{AdhocExecutionApp, ScheduledExecutionApp} -import java.util.TimeZone - -/** - * See AggregatableProducerEmbeddingsBaseApp for an explanation of this job. - * - * Production job: -capesospy-v2 update aggregatable_producer_embeddings_by_fav_score src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml - */ -object AggregatableFavBasedProducerEmbeddingsScheduledApp - extends AggregatableFavBasedProducerEmbeddingsBaseApp - with ScheduledExecutionApp { - - override val modelVersion: ModelVersion = ModelVersion.Model20m145kUpdated - // Not using the EmbeddingUtil.getHdfsPath to preserve the previous functionality. 
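For reference, a sketch of the path EmbeddingUtil.getHdfsPath (shown earlier in this diff) would have produced for this job's settings; the exact string assumes USER=cassowary and a non-GCP data center, which is why the hard-coded outputPath below differs and the comment above mentions preserving the previous behavior:

```scala
EmbeddingUtil.getHdfsPath(
  isAdhoc = false,
  isManhattanKeyVal = true,
  modelVersion = ModelVersion.Model20m145kUpdated,
  pathSuffix = "producer_simclusters_aggregatable_embeddings_by_fav_score")
// => "/user/cassowary/manhattan_sequence_files/simclusters_embeddings_model_20m_145k_updated_producer_simclusters_aggregatable_embeddings_by_fav_score"
```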
- private val outputPath: String = - "/user/cassowary/manhattan_sequence_files/producer_simclusters_aggregatable_embeddings_by_fav_score" - - private val outputPathThrift: String = EmbeddingUtil.getHdfsPath( - isAdhoc = false, - isManhattanKeyVal = false, - modelVersion = modelVersion, - pathSuffix = "producer_simclusters_aggregatable_embeddings_by_fav_score_thrift" - ) - - override def firstTime: RichDate = RichDate("2020-05-11") - - override def batchIncrement: Duration = Days(7) - - override def writeToManhattan( - output: TypedPipe[KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]] - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - output - .writeDALVersionedKeyValExecution( - AggregatableProducerSimclustersEmbeddingsByFavScoreScalaDataset, - D.Suffix(outputPath), - version = ExplicitEndTime(dateRange.end) - ) - } - - override def writeToThrift( - output: TypedPipe[SimClustersEmbeddingWithId] - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - output - .writeDALSnapshotExecution( - dataset = AggregatableProducerSimclustersEmbeddingsByFavScoreThriftScalaDataset, - updateStep = D.Daily, - pathLayout = D.Suffix(outputPathThrift), - fmt = D.Parquet, - endDate = dateRange.end - ) - } -} - -/** - * Production job: -capesospy-v2 update --build_locally --start_cron aggregatable_producer_embeddings_by_fav_score_2020 src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml - */ -object AggregatableFavBasedProducerEmbeddings2020ScheduledApp - extends AggregatableFavBasedProducerEmbeddingsBaseApp - with ScheduledExecutionApp { - - override val modelVersion: ModelVersion = ModelVersion.Model20m145k2020 - // Not using the EmbeddingUtil.getHdfsPath to preserve the previous functionality. 
- private val outputPath: String = - "/user/cassowary/manhattan_sequence_files/producer_simclusters_aggregatable_embeddings_by_fav_score_20m145k2020" - - // getHdfsPath appends model version str to the pathSuffix - private val outputPathThrift: String = EmbeddingUtil.getHdfsPath( - isAdhoc = false, - isManhattanKeyVal = false, - modelVersion = modelVersion, - pathSuffix = "producer_simclusters_aggregatable_embeddings_by_fav_score_thrift" - ) - - override def firstTime: RichDate = RichDate("2021-03-04") - - override def batchIncrement: Duration = Days(7) - - override def writeToManhattan( - output: TypedPipe[KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]] - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - output - .writeDALVersionedKeyValExecution( - AggregatableProducerSimclustersEmbeddingsByFavScore2020ScalaDataset, - D.Suffix(outputPath), - version = ExplicitEndTime(dateRange.end) - ) - } - - override def writeToThrift( - output: TypedPipe[SimClustersEmbeddingWithId] - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - output - .writeDALSnapshotExecution( - dataset = AggregatableProducerSimclustersEmbeddingsByFavScore2020ThriftScalaDataset, - updateStep = D.Daily, - pathLayout = D.Suffix(outputPathThrift), - fmt = D.Parquet, - endDate = dateRange.end - ) - } -} - -/*** - * Adhoc job: - -scalding remote run --user recos-platform \ ---main-class com.twitter.simclusters_v2.scalding.embedding.producer.AggregatableFavBasedProducerEmbeddingsAdhocApp \ ---target src/scala/com/twitter/simclusters_v2/scalding/embedding/producer:aggregatable_fav_based_producer_embeddings_job-adhoc \ --- --date 2020-05-11 - - */ -object AggregatableFavBasedProducerEmbeddingsAdhocApp - extends AggregatableFavBasedProducerEmbeddingsBaseApp - with AdhocExecutionApp { - - override val modelVersion: ModelVersion = ModelVersion.Model20m145kUpdated - private val outputPath: String = EmbeddingUtil.getHdfsPath( - isAdhoc = false, - isManhattanKeyVal = true, - modelVersion = modelVersion, - pathSuffix = "producer_simclusters_aggregatable_embeddings_by_fav_score" - ) - - private val outputPathThrift: String = EmbeddingUtil.getHdfsPath( - isAdhoc = false, - isManhattanKeyVal = false, - modelVersion = modelVersion, - pathSuffix = "producer_simclusters_aggregatable_embeddings_by_fav_score_thrift" - ) - - override def writeToManhattan( - output: TypedPipe[KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]] - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - output - .flatMap { keyVal => - keyVal.value.embedding.map { simClusterWithScore => - ( - keyVal.key.embeddingType, - keyVal.key.modelVersion, - keyVal.key.internalId, - simClusterWithScore.clusterId, - simClusterWithScore.score - ) - } - } - .writeExecution( - // Write to TSV for easier debugging of the adhoc job. 
- TypedTsv(outputPath) - ) - } - - override def writeToThrift( - output: TypedPipe[SimClustersEmbeddingWithId] - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - output - .writeExecution( - new FixedPathLzoScrooge(outputPathThrift, SimClustersEmbeddingWithId) - ) - } -} - -/** -./bazel bundle src/scala/com/twitter/simclusters_v2/scalding/embedding/producer:aggregatable_fav_based_producer_embeddings_job_2020-adhoc -scalding remote run \ ---user cassowary \ ---keytab /var/lib/tss/keys/fluffy/keytabs/client/cassowary.keytab \ ---principal service_acoount@TWITTER.BIZ \ ---cluster bluebird-qus1 \ ---main-class com.twitter.simclusters_v2.scalding.embedding.producer.AggregatableFavBasedProducerEmbeddings2020AdhocApp \ ---target src/scala/com/twitter/simclusters_v2/scalding/embedding/producer:aggregatable_fav_based_producer_embeddings_job_2020-adhoc \ ---hadoop-properties "scalding.with.reducers.set.explicitly=true mapreduce.job.reduces=4000" \ --- --date 2020-06-28 - */ -object AggregatableFavBasedProducerEmbeddings2020AdhocApp - extends AggregatableFavBasedProducerEmbeddingsBaseApp - with AdhocExecutionApp { - - override val modelVersion: ModelVersion = ModelVersion.Model20m145k2020 - private val outputPath: String = EmbeddingUtil.getHdfsPath( - isAdhoc = false, - isManhattanKeyVal = true, - modelVersion = modelVersion, - pathSuffix = "producer_simclusters_aggregatable_embeddings_by_fav_score" - ) - - private val outputPathThrift: String = EmbeddingUtil.getHdfsPath( - isAdhoc = false, - isManhattanKeyVal = false, - modelVersion = modelVersion, - pathSuffix = "producer_simclusters_aggregatable_embeddings_by_fav_score_thrift" - ) - - override def writeToManhattan( - output: TypedPipe[KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]] - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - output - .flatMap { keyVal => - keyVal.value.embedding.map { simClusterWithScore => - ( - keyVal.key.embeddingType, - keyVal.key.modelVersion, - keyVal.key.internalId, - simClusterWithScore.clusterId, - simClusterWithScore.score - ) - } - } - .writeExecution( - // Write to TSV for easier debugging of the adhoc job. 
- TypedTsv(outputPath) - ) - } - - override def writeToThrift( - output: TypedPipe[SimClustersEmbeddingWithId] - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - output - .writeExecution( - new FixedPathLzoScrooge(outputPathThrift, SimClustersEmbeddingWithId) - ) - } -} - -trait AggregatableFavBasedProducerEmbeddingsBaseApp extends AggregatableProducerEmbeddingsBaseApp { - override val userToProducerScoringFn: NeighborWithWeights => Double = - _.favScoreHalfLife100Days.getOrElse(0.0) - override val userToClusterScoringFn: UserToInterestedInClusterScores => Double = - _.favScore.getOrElse(0.0) - override val embeddingType: EmbeddingType = EmbeddingType.AggregatableFavBasedProducer -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/producer/AggregatableFollowBasedProducerEmbeddings.docx b/src/scala/com/twitter/simclusters_v2/scalding/embedding/producer/AggregatableFollowBasedProducerEmbeddings.docx new file mode 100644 index 000000000..405a99eec Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/embedding/producer/AggregatableFollowBasedProducerEmbeddings.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/producer/AggregatableFollowBasedProducerEmbeddings.scala b/src/scala/com/twitter/simclusters_v2/scalding/embedding/producer/AggregatableFollowBasedProducerEmbeddings.scala deleted file mode 100644 index d18b66a7f..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/embedding/producer/AggregatableFollowBasedProducerEmbeddings.scala +++ /dev/null @@ -1,165 +0,0 @@ -package com.twitter.simclusters_v2.scalding.embedding.producer - -import com.twitter.scalding._ -import com.twitter.scalding_internal.dalv2.DALWrite._ -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.scalding_internal.source.lzo_scrooge.FixedPathLzoScrooge -import com.twitter.simclusters_v2.hdfs_sources.AggregatableProducerSimclustersEmbeddingsByFollowScore2020ScalaDataset -import com.twitter.simclusters_v2.hdfs_sources.AggregatableProducerSimclustersEmbeddingsByFollowScore2020ThriftScalaDataset -import com.twitter.simclusters_v2.scalding.embedding.common.EmbeddingUtil -import com.twitter.simclusters_v2.thriftscala.EmbeddingType -import com.twitter.simclusters_v2.thriftscala.ModelVersion -import com.twitter.simclusters_v2.thriftscala.NeighborWithWeights -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbedding -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingWithId -import com.twitter.simclusters_v2.thriftscala.UserToInterestedInClusterScores -import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp -import com.twitter.wtf.scalding.jobs.common.ScheduledExecutionApp -import java.util.TimeZone - -/** - * This file implements a new Producer SimClusters Embeddings. - * The differences with existing producer embeddings are: - * - * 1) the embedding scores are not normalized, so that one can aggregate multiple producer embeddings by adding them. - * 2) we use follow scores in the user-producer graph and user-simclusters graph. 
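To make point 1 concrete: because the scores keep their original scale, aggregating several producers is just element-wise addition of their embeddings. A minimal standalone sketch (plain Scala collections; the sparse Map representation and the toy scores are assumptions for illustration, not part of the job code):

object AdditiveAggregationSketch {
  type ClusterId = Int

  // Element-wise sum of two sparse embeddings.
  def add(a: Map[ClusterId, Double], b: Map[ClusterId, Double]): Map[ClusterId, Double] =
    (a.keySet ++ b.keySet).map { id =>
      id -> (a.getOrElse(id, 0.0) + b.getOrElse(id, 0.0))
    }.toMap

  def main(args: Array[String]): Unit = {
    val producerA = Map(1 -> 4.0, 7 -> 1.0) // hypothetical unnormalized scores
    val producerB = Map(1 -> 2.0, 9 -> 3.0)
    // Each producer keeps its magnitude in the sum; had the embeddings been
    // L2-normalized first, low-engagement producers would be over-weighted.
    println(add(producerA, producerB)) // Map(1 -> 6.0, 7 -> 1.0, 9 -> 3.0)
  }
}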
- */ - -/** - * Production job: -capesospy-v2 update --build_locally --start_cron aggregatable_producer_embeddings_by_follow_score_2020 src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml - */ -object AggregatableFollowBasedProducerEmbeddings2020ScheduledApp - extends AggregatableFollowBasedProducerEmbeddingsBaseApp - with ScheduledExecutionApp { - - override val modelVersion: ModelVersion = ModelVersion.Model20m145k2020 - // Not using the EmbeddingUtil.getHdfsPath to preserve the previous functionality. - private val outputPath: String = - "/user/cassowary/manhattan_sequence_files/producer_simclusters_aggregatable_embeddings_by_follow_score_20m145k2020" - - private val outputPathThrift: String = EmbeddingUtil.getHdfsPath( - isAdhoc = false, - isManhattanKeyVal = false, - modelVersion = modelVersion, - pathSuffix = "producer_simclusters_aggregatable_embeddings_by_follow_score_thrift" - ) - - override def batchIncrement: Duration = Days(7) - - override def firstTime: RichDate = RichDate("2021-11-10") - - override def writeToManhattan( - output: TypedPipe[KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]] - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - output - .writeDALVersionedKeyValExecution( - AggregatableProducerSimclustersEmbeddingsByFollowScore2020ScalaDataset, - D.Suffix(outputPath), - version = ExplicitEndTime(dateRange.end) - ) - } - - override def writeToThrift( - output: TypedPipe[SimClustersEmbeddingWithId] - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - output - .writeDALSnapshotExecution( - dataset = AggregatableProducerSimclustersEmbeddingsByFollowScore2020ThriftScalaDataset, - updateStep = D.Daily, - pathLayout = D.Suffix(outputPathThrift), - fmt = D.Parquet, - endDate = dateRange.end - ) - } -} - -/** -./bazel bundle src/scala/com/twitter/simclusters_v2/scalding/embedding/producer:aggregatable_follow_based_producer_embeddings_job_2020-adhoc -scalding remote run \ ---user cassowary \ ---keytab /var/lib/tss/keys/fluffy/keytabs/client/cassowary.keytab \ ---principal service_acoount@TWITTER.BIZ \ ---cluster bluebird-qus1 \ ---main-class com.twitter.simclusters_v2.scalding.embedding.producer.AggregatableFollowBasedProducerEmbeddings2020AdhocApp \ ---target src/scala/com/twitter/simclusters_v2/scalding/embedding/producer:aggregatable_follow_based_producer_embeddings_job_2020-adhoc \ ---hadoop-properties "scalding.with.reducers.set.explicitly=true mapreduce.job.reduces=4000" \ --- --date 2021-11-10 - */ - -object AggregatableFollowBasedProducerEmbeddings2020AdhocApp - extends AggregatableFollowBasedProducerEmbeddingsBaseApp - with AdhocExecutionApp { - - override val modelVersion: ModelVersion = ModelVersion.Model20m145k2020 - - private val outputPath: String = EmbeddingUtil.getHdfsPath( - isAdhoc = true, - isManhattanKeyVal = true, - modelVersion = modelVersion, - pathSuffix = "producer_simclusters_aggregatable_embeddings_by_follow_score" - ) - - private val outputPathThrift: String = EmbeddingUtil.getHdfsPath( - isAdhoc = true, - isManhattanKeyVal = false, - modelVersion = modelVersion, - pathSuffix = "producer_simclusters_aggregatable_embeddings_by_follow_score_thrift" - ) - - override def writeToManhattan( - output: TypedPipe[KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]] - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - output - .flatMap { keyVal => - 
keyVal.value.embedding.map { simClusterWithScore => - ( - keyVal.key.embeddingType, - keyVal.key.modelVersion, - keyVal.key.internalId, - simClusterWithScore.clusterId, - simClusterWithScore.score - ) - } - } - .writeExecution( - // Write to TSV for easier debugging of the adhoc job. - TypedTsv(outputPath) - ) - } - - override def writeToThrift( - output: TypedPipe[SimClustersEmbeddingWithId] - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - output - .writeExecution( - new FixedPathLzoScrooge(outputPathThrift, SimClustersEmbeddingWithId) - ) - } -} - -trait AggregatableFollowBasedProducerEmbeddingsBaseApp - extends AggregatableProducerEmbeddingsBaseApp { - override val userToProducerScoringFn: NeighborWithWeights => Double = - _.followScoreNormalizedByNeighborFollowersL2.getOrElse(0.0) - override val userToClusterScoringFn: UserToInterestedInClusterScores => Double = - _.followScoreClusterNormalizedOnly.getOrElse(0.0) - override val embeddingType: EmbeddingType = EmbeddingType.AggregatableFollowBasedProducer -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/producer/AggregatableLogFavBasedProducerEmbeddings.docx b/src/scala/com/twitter/simclusters_v2/scalding/embedding/producer/AggregatableLogFavBasedProducerEmbeddings.docx new file mode 100644 index 000000000..fe9e70717 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/embedding/producer/AggregatableLogFavBasedProducerEmbeddings.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/producer/AggregatableLogFavBasedProducerEmbeddings.scala b/src/scala/com/twitter/simclusters_v2/scalding/embedding/producer/AggregatableLogFavBasedProducerEmbeddings.scala deleted file mode 100644 index 8344043b5..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/embedding/producer/AggregatableLogFavBasedProducerEmbeddings.scala +++ /dev/null @@ -1,368 +0,0 @@ -package com.twitter.simclusters_v2.scalding.embedding.producer - -import com.twitter.scalding._ -import com.twitter.scalding_internal.dalv2.DALWrite._ -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.scalding_internal.source.lzo_scrooge.FixedPathLzoScrooge -import com.twitter.simclusters_v2.hdfs_sources.{ - AggregatableProducerSimclustersEmbeddingsByLogFavScoreScalaDataset, - AggregatableProducerSimclustersEmbeddingsByLogFavScoreThriftScalaDataset, - AggregatableProducerSimclustersEmbeddingsByLogFavScore2020ScalaDataset, - AggregatableProducerSimclustersEmbeddingsByLogFavScore2020ThriftScalaDataset, - AggregatableProducerSimclustersEmbeddingsByLogFavScoreRelaxedFavEngagementThreshold2020ScalaDataset, - AggregatableProducerSimclustersEmbeddingsByLogFavScoreRelaxedFavEngagementThreshold2020ThriftScalaDataset -} -import com.twitter.simclusters_v2.scalding.embedding.common.EmbeddingUtil -import com.twitter.simclusters_v2.thriftscala.{ - EmbeddingType, - ModelVersion, - NeighborWithWeights, - SimClustersEmbedding, - SimClustersEmbeddingId, - SimClustersEmbeddingWithId, - UserToInterestedInClusterScores -} -import com.twitter.wtf.scalding.jobs.common.{AdhocExecutionApp, ScheduledExecutionApp} -import java.util.TimeZone - -/** - * This file implements a new Producer SimClusters Embeddings. - * The differences with existing producer embeddings are: - * - * 1) the embedding scores are not normalized, so that one can aggregate multiple producer embeddings by adding them. 
- * 2) we use log-fav scores in the user-producer graph and user-simclusters graph.
- *    LogFav scores are smoother than the fav scores we previously used and are less sensitive to outliers.
- *    (Illustration, assuming a log(1 + x) transform; the actual logFavScore is computed
- *    upstream of this job: log1p(10.0) = 2.3978... while log1p(1000.0) = 6.9087..., so a
- *    100x outlier in raw fav counts shrinks to under 3x after the transform.)
- *
- *
- *
- * The main difference with other normalized embeddings is the `convertEmbeddingToAggregatableEmbeddings` function
- * where we multiply the normalized embedding by the producer's norm. The resulting embeddings are then
- * unnormalized and aggregatable.
- *
- */
-/**
- * Production job:
-capesospy-v2 update aggregatable_producer_embeddings_by_logfav_score src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml
- */
-object AggregatableLogFavBasedProducerEmbeddingsScheduledApp
-    extends AggregatableLogFavBasedProducerEmbeddingsBaseApp
-    with ScheduledExecutionApp {
-
-  override val modelVersion: ModelVersion = ModelVersion.Model20m145kUpdated
-  // Not using the EmbeddingUtil.getHdfsPath to preserve the previous functionality.
-  private val outputPath: String =
-    "/user/cassowary/manhattan_sequence_files/producer_simclusters_aggregatable_embeddings_by_logfav_score"
-
-  private val outputPathThrift: String = EmbeddingUtil.getHdfsPath(
-    isAdhoc = false,
-    isManhattanKeyVal = false,
-    modelVersion = modelVersion,
-    pathSuffix = "producer_simclusters_aggregatable_embeddings_by_logfav_score_thrift"
-  )
-
-  override def batchIncrement: Duration = Days(7)
-
-  override def firstTime: RichDate = RichDate("2020-04-05")
-
-  override def writeToManhattan(
-    output: TypedPipe[KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]]
-  )(
-    implicit dateRange: DateRange,
-    timeZone: TimeZone,
-    uniqueID: UniqueID
-  ): Execution[Unit] = {
-    output
-      .writeDALVersionedKeyValExecution(
-        AggregatableProducerSimclustersEmbeddingsByLogFavScoreScalaDataset,
-        D.Suffix(outputPath),
-        version = ExplicitEndTime(dateRange.end)
-      )
-  }
-
-  override def writeToThrift(
-    output: TypedPipe[SimClustersEmbeddingWithId]
-  )(
-    implicit dateRange: DateRange,
-    timeZone: TimeZone,
-    uniqueID: UniqueID
-  ): Execution[Unit] = {
-    output
-      .writeDALSnapshotExecution(
-        dataset = AggregatableProducerSimclustersEmbeddingsByLogFavScoreThriftScalaDataset,
-        updateStep = D.Daily,
-        pathLayout = D.Suffix(outputPathThrift),
-        fmt = D.Parquet,
-        endDate = dateRange.end
-      )
-  }
-}
-
-/**
- * Production job:
-capesospy-v2 update --build_locally --start_cron aggregatable_producer_embeddings_by_logfav_score_2020 src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml
- */
-object AggregatableLogFavBasedProducerEmbeddings2020ScheduledApp
-    extends AggregatableLogFavBasedProducerEmbeddingsBaseApp
-    with ScheduledExecutionApp {
-
-  override val modelVersion: ModelVersion = ModelVersion.Model20m145k2020
-  // Not using the EmbeddingUtil.getHdfsPath to preserve the previous functionality.
- private val outputPath: String = - "/user/cassowary/manhattan_sequence_files/producer_simclusters_aggregatable_embeddings_by_logfav_score_20m145k2020" - - private val outputPathThrift: String = EmbeddingUtil.getHdfsPath( - isAdhoc = false, - isManhattanKeyVal = false, - modelVersion = modelVersion, - pathSuffix = "producer_simclusters_aggregatable_embeddings_by_logfav_score_thrift" - ) - - override def batchIncrement: Duration = Days(7) - - override def firstTime: RichDate = RichDate("2021-03-05") - - override def writeToManhattan( - output: TypedPipe[KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]] - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - output - .writeDALVersionedKeyValExecution( - AggregatableProducerSimclustersEmbeddingsByLogFavScore2020ScalaDataset, - D.Suffix(outputPath), - version = ExplicitEndTime(dateRange.end) - ) - } - - override def writeToThrift( - output: TypedPipe[SimClustersEmbeddingWithId] - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - output - .writeDALSnapshotExecution( - dataset = AggregatableProducerSimclustersEmbeddingsByLogFavScore2020ThriftScalaDataset, - updateStep = D.Daily, - pathLayout = D.Suffix(outputPathThrift), - fmt = D.Parquet, - endDate = dateRange.end - ) - } -} - -/** - * Production job: -capesospy-v2 update --build_locally --start_cron aggregatable_producer_embeddings_by_logfav_score_relaxed_fav_engagement_threshold_2020 src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml - */ -object AggregatableLogFavBasedProducerEmbeddingsRelaxedFavEngagementThreshold2020ScheduledApp - extends AggregatableLogFavBasedProducerEmbeddingsBaseApp - with ScheduledExecutionApp { - - override val modelVersion: ModelVersion = ModelVersion.Model20m145k2020 - - override val embeddingType: EmbeddingType = EmbeddingType.RelaxedAggregatableLogFavBasedProducer - - // Relax fav engagement threshold - override val minNumFavers = 15 - - // Not using the EmbeddingUtil.getHdfsPath to preserve the previous functionality. 
- private val outputPath: String = - "/user/cassowary/manhattan_sequence_files/producer_simclusters_aggregatable_embeddings_by_logfav_score_relaxed_fav_engagement_threshold_20m145k2020" - - private val outputPathThrift: String = EmbeddingUtil.getHdfsPath( - isAdhoc = false, - isManhattanKeyVal = false, - modelVersion = modelVersion, - pathSuffix = - "producer_simclusters_aggregatable_embeddings_by_logfav_score_relaxed_fav_score_threshold_thrift" - ) - - override def batchIncrement: Duration = Days(7) - - override def firstTime: RichDate = RichDate("2021-07-26") - - override def writeToManhattan( - output: TypedPipe[KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]] - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - output - .writeDALVersionedKeyValExecution( - AggregatableProducerSimclustersEmbeddingsByLogFavScoreRelaxedFavEngagementThreshold2020ScalaDataset, - D.Suffix(outputPath), - version = ExplicitEndTime(dateRange.end) - ) - } - - override def writeToThrift( - output: TypedPipe[SimClustersEmbeddingWithId] - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - output - .writeDALSnapshotExecution( - dataset = - AggregatableProducerSimclustersEmbeddingsByLogFavScoreRelaxedFavEngagementThreshold2020ThriftScalaDataset, - updateStep = D.Daily, - pathLayout = D.Suffix(outputPathThrift), - fmt = D.Parquet, - endDate = dateRange.end - ) - } -} - -/*** - * Adhoc job: - -scalding remote run --user recos-platform \ ---main-class com.twitter.simclusters_v2.scalding.embedding.producer.AggregatableLogFavBasedProducerEmbeddingsAdhocApp \ ---target src/scala/com/twitter/simclusters_v2/scalding/embedding/producer:aggregatable_logfav_based_producer_embeddings_job-adhoc \ --- --date 2020-04-08 - - */ -object AggregatableLogFavBasedProducerEmbeddingsAdhocApp - extends AggregatableLogFavBasedProducerEmbeddingsBaseApp - with AdhocExecutionApp { - - override val modelVersion: ModelVersion = ModelVersion.Model20m145kUpdated - - private val outputPath: String = EmbeddingUtil.getHdfsPath( - isAdhoc = false, - isManhattanKeyVal = true, - modelVersion = modelVersion, - pathSuffix = "producer_simclusters_aggregatable_embeddings_by_log_fav_score" - ) - - private val outputPathThrift: String = EmbeddingUtil.getHdfsPath( - isAdhoc = false, - isManhattanKeyVal = false, - modelVersion = modelVersion, - pathSuffix = "producer_simclusters_aggregatable_embeddings_by_log_fav_score_thrift" - ) - - override def writeToManhattan( - output: TypedPipe[KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]] - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - output - .flatMap { keyVal => - keyVal.value.embedding.map { simClusterWithScore => - ( - keyVal.key.embeddingType, - keyVal.key.modelVersion, - keyVal.key.internalId, - simClusterWithScore.clusterId, - simClusterWithScore.score - ) - } - } - .writeExecution( - // Write to TSV for easier debugging of the adhoc job. 
- TypedTsv(outputPath) - ) - } - - override def writeToThrift( - output: TypedPipe[SimClustersEmbeddingWithId] - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - output - .writeExecution( - new FixedPathLzoScrooge(outputPathThrift, SimClustersEmbeddingWithId) - ) - } -} - -/** -./bazel bundle src/scala/com/twitter/simclusters_v2/scalding/embedding/producer:aggregatable_logfav_based_producer_embeddings_job_2020-adhoc -scalding remote run \ ---user cassowary \ ---keytab /var/lib/tss/keys/fluffy/keytabs/client/cassowary.keytab \ ---principal service_acoount@TWITTER.BIZ \ ---cluster bluebird-qus1 \ ---main-class com.twitter.simclusters_v2.scalding.embedding.producer.AggregatableLogFavBasedProducerEmbeddings2020AdhocApp \ ---target src/scala/com/twitter/simclusters_v2/scalding/embedding/producer:aggregatable_logfav_based_producer_embeddings_job_2020-adhoc \ ---hadoop-properties "scalding.with.reducers.set.explicitly=true mapreduce.job.reduces=4000" \ --- --date 2020-06-28 - */ - -object AggregatableLogFavBasedProducerEmbeddings2020AdhocApp - extends AggregatableLogFavBasedProducerEmbeddingsBaseApp - with AdhocExecutionApp { - - override val modelVersion: ModelVersion = ModelVersion.Model20m145k2020 - - private val outputPath: String = EmbeddingUtil.getHdfsPath( - isAdhoc = false, - isManhattanKeyVal = true, - modelVersion = modelVersion, - pathSuffix = "producer_simclusters_aggregatable_embeddings_by_log_fav_score" - ) - - private val outputPathThrift: String = EmbeddingUtil.getHdfsPath( - isAdhoc = false, - isManhattanKeyVal = false, - modelVersion = modelVersion, - pathSuffix = "producer_simclusters_aggregatable_embeddings_by_log_fav_score_thrift" - ) - - override def writeToManhattan( - output: TypedPipe[KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]] - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - output - .flatMap { keyVal => - keyVal.value.embedding.map { simClusterWithScore => - ( - keyVal.key.embeddingType, - keyVal.key.modelVersion, - keyVal.key.internalId, - simClusterWithScore.clusterId, - simClusterWithScore.score - ) - } - } - .writeExecution( - // Write to TSV for easier debugging of the adhoc job. 
-        TypedTsv(outputPath)
-      )
-  }
-
-  override def writeToThrift(
-    output: TypedPipe[SimClustersEmbeddingWithId]
-  )(
-    implicit dateRange: DateRange,
-    timeZone: TimeZone,
-    uniqueID: UniqueID
-  ): Execution[Unit] = {
-    output
-      .writeExecution(
-        new FixedPathLzoScrooge(outputPathThrift, SimClustersEmbeddingWithId)
-      )
-  }
-}
-
-trait AggregatableLogFavBasedProducerEmbeddingsBaseApp
-    extends AggregatableProducerEmbeddingsBaseApp {
-  override val userToProducerScoringFn: NeighborWithWeights => Double = _.logFavScore.getOrElse(0.0)
-  override val userToClusterScoringFn: UserToInterestedInClusterScores => Double =
-    _.logFavScore.getOrElse(0.0)
-  override val embeddingType: EmbeddingType = EmbeddingType.AggregatableLogFavBasedProducer
-}
diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/producer/AggregatableProducerEmbeddings.docx b/src/scala/com/twitter/simclusters_v2/scalding/embedding/producer/AggregatableProducerEmbeddings.docx
new file mode 100644
index 000000000..b66616660
Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/embedding/producer/AggregatableProducerEmbeddings.docx differ
diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/producer/AggregatableProducerEmbeddings.scala b/src/scala/com/twitter/simclusters_v2/scalding/embedding/producer/AggregatableProducerEmbeddings.scala
deleted file mode 100644
index cd6755328..000000000
--- a/src/scala/com/twitter/simclusters_v2/scalding/embedding/producer/AggregatableProducerEmbeddings.scala
+++ /dev/null
@@ -1,168 +0,0 @@
-package com.twitter.simclusters_v2.scalding.embedding.producer
-
-import com.twitter.scalding._
-import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
-import com.twitter.scalding_internal.source.lzo_scrooge.FixedPathLzoScrooge
-import com.twitter.simclusters_v2.hdfs_sources.{DataSources, InterestedInSources}
-import com.twitter.simclusters_v2.scalding.common.matrix.{SparseMatrix, SparseRowMatrix}
-import com.twitter.simclusters_v2.scalding.embedding.ProducerEmbeddingsFromInterestedIn
-import com.twitter.simclusters_v2.scalding.embedding.common.EmbeddingUtil.{
-  ClusterId,
-  ProducerId,
-  UserId
-}
-import com.twitter.simclusters_v2.scalding.embedding.common.SimClustersEmbeddingBaseJob
-import com.twitter.simclusters_v2.thriftscala.{EmbeddingType, _}
-import java.util.TimeZone
-
-/**
- * This file implements a new Producer SimClusters Embeddings.
- * The differences with existing producer embeddings are:
- *
- * 1) the embedding scores are not normalized, so that one can aggregate multiple producer embeddings by adding them.
- * 2) we use log-fav scores in the user-producer graph and user-simclusters graph.
- *    LogFav scores are smoother than the fav scores we previously used and are less sensitive to outliers.
- *
- *
- *
- * The main difference with other normalized embeddings is the `convertEmbeddingToAggregatableEmbeddings` function
- * where we multiply the normalized embedding by the producer's norm. The resulting embeddings are then
- * unnormalized and aggregatable.
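A standalone sketch of the denormalize-then-aggregate idea described above (plain collections and invented numbers; the real `convertEmbeddingToAggregatableEmbeddings` below does the norm multiplication as a TypedPipe join against `prepareNounToUserMatrix.rowL2Norms`):

object DenormalizeAndAggregateSketch {
  type ClusterId = Int

  // Undo the L2 normalization by scaling each score by the producer's row norm.
  def denormalize(embedding: Seq[(ClusterId, Double)], norm: Double): Seq[(ClusterId, Double)] =
    embedding.map { case (id, score) => (id, score * norm) }

  // Aggregate producers by summing their denormalized scores per cluster.
  def aggregate(embeddings: Seq[Seq[(ClusterId, Double)]]): Map[ClusterId, Double] =
    embeddings.flatten
      .groupBy { case (id, _) => id }
      .map { case (id, scores) => id -> scores.map(_._2).sum }

  def main(args: Array[String]): Unit = {
    // Two hypothetical producers: unit-norm embeddings plus their original norms.
    val p1 = denormalize(Seq(1 -> 0.8, 2 -> 0.6), norm = 10.0) // Seq(1 -> 8.0, 2 -> 6.0)
    val p2 = denormalize(Seq(2 -> 1.0), norm = 4.0) // Seq(2 -> 4.0)
    println(aggregate(Seq(p1, p2))) // Map(1 -> 8.0, 2 -> 10.0)
  }
}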
- *
- */
-trait AggregatableProducerEmbeddingsBaseApp extends SimClustersEmbeddingBaseJob[ProducerId] {
-
-  val userToProducerScoringFn: NeighborWithWeights => Double
-  val userToClusterScoringFn: UserToInterestedInClusterScores => Double
-  val modelVersion: ModelVersion
-
-  // Minimum engagement threshold
-  val minNumFavers: Int = ProducerEmbeddingsFromInterestedIn.minNumFaversForProducer
-
-  override def numClustersPerNoun: Int = 60
-
-  override def numNounsPerClusters: Int = 500 // this is not used for now
-
-  override def thresholdForEmbeddingScores: Double = 0.01
-
-  override def prepareNounToUserMatrix(
-    implicit dateRange: DateRange,
-    timeZone: TimeZone,
-    uniqueID: UniqueID
-  ): SparseMatrix[ProducerId, UserId, Double] = {
-
-    SparseMatrix(
-      ProducerEmbeddingsFromInterestedIn
-        .getFilteredUserUserNormalizedGraph(
-          DataSources.userUserNormalizedGraphSource,
-          DataSources.userNormsAndCounts,
-          userToProducerScoringFn,
-          _.faverCount.exists(
-            _ > minNumFavers
-          )
-        )
-        .map {
-          case (userId, (producerId, score)) =>
-            (producerId, userId, score)
-        })
-  }
-
-  override def prepareUserToClusterMatrix(
-    implicit dateRange: DateRange,
-    timeZone: TimeZone,
-    uniqueID: UniqueID
-  ): SparseRowMatrix[UserId, ClusterId, Double] = {
-    SparseRowMatrix(
-      ProducerEmbeddingsFromInterestedIn
-        .getUserSimClustersMatrix(
-          InterestedInSources
-            .simClustersInterestedInSource(modelVersion, dateRange.embiggen(Days(5)), timeZone),
-          userToClusterScoringFn,
-          modelVersion
-        )
-        .mapValues(_.toMap),
-      isSkinnyMatrix = true
-    )
-  }
-
-  // In order to make the embeddings aggregatable, we need to revert the normalization
-  // (multiply by the norms) that we applied when computing embeddings in the base job.
-  def convertEmbeddingToAggregatableEmbeddings(
-    embeddings: TypedPipe[(ProducerId, Seq[(ClusterId, Double)])]
-  )(
-    implicit dateRange: DateRange,
-    timeZone: TimeZone,
-    uniqueID: UniqueID
-  ): TypedPipe[(ProducerId, Seq[(ClusterId, Double)])] = {
-    embeddings.join(prepareNounToUserMatrix.rowL2Norms).map {
-      case (producerId, (embeddingVec, norm)) =>
-        producerId -> embeddingVec.map {
-          case (id, score) => (id, score * norm)
-        }
-    }
-  }
-
-  override final def writeClusterToNounsIndex(
-    output: TypedPipe[(ClusterId, Seq[(ProducerId, Double)])]
-  )(
-    implicit dateRange: DateRange,
-    timeZone: TimeZone,
-    uniqueID: UniqueID
-  ): Execution[Unit] = { Execution.unit } // we do not need this for now
-
-  /**
-   * Override this method to write the Manhattan dataset.
-   */
-  def writeToManhattan(
-    output: TypedPipe[KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]]
-  )(
-    implicit dateRange: DateRange,
-    timeZone: TimeZone,
-    uniqueID: UniqueID
-  ): Execution[Unit]
-
-  /**
-   * Override this method to write the Thrift dataset.
- */ - def writeToThrift( - output: TypedPipe[SimClustersEmbeddingWithId] - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] - - val embeddingType: EmbeddingType - - override final def writeNounToClustersIndex( - output: TypedPipe[(ProducerId, Seq[(ClusterId, Double)])] - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - val convertedEmbeddings = convertEmbeddingToAggregatableEmbeddings(output) - .map { - case (producerId, topSimClustersWithScore) => - val id = SimClustersEmbeddingId( - embeddingType = embeddingType, - modelVersion = modelVersion, - internalId = InternalId.UserId(producerId)) - - val embeddings = SimClustersEmbedding(topSimClustersWithScore.map { - case (clusterId, score) => SimClusterWithScore(clusterId, score) - }) - - SimClustersEmbeddingWithId(id, embeddings) - } - - val keyValuePairs = convertedEmbeddings.map { simClustersEmbeddingWithId => - KeyVal(simClustersEmbeddingWithId.embeddingId, simClustersEmbeddingWithId.embedding) - } - val manhattanExecution = writeToManhattan(keyValuePairs) - - val thriftExecution = writeToThrift(convertedEmbeddings) - - Execution.zip(manhattanExecution, thriftExecution).unit - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/producer/BUILD.bazel b/src/scala/com/twitter/simclusters_v2/scalding/embedding/producer/BUILD.bazel deleted file mode 100644 index d6ff0d162..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/embedding/producer/BUILD.bazel +++ /dev/null @@ -1,223 +0,0 @@ -scala_library( - sources = [ - "*.scala", - ], - platform = "java8", - tags = [ - "bazel-compatible", - "bazel-only", - ], - dependencies = [ - "escherbird/src/scala/com/twitter/escherbird/scalding/source", - "src/scala/com/twitter/onboarding/relevance/source:utt_account_recommendations-scala", - "src/scala/com/twitter/simclusters_v2/hdfs_sources", - "src/scala/com/twitter/simclusters_v2/hdfs_sources:aggregatable_producer_simclusters_embeddings_by_fav_score-scala", - "src/scala/com/twitter/simclusters_v2/hdfs_sources:aggregatable_producer_simclusters_embeddings_by_log_fav_score-scala", - "src/scala/com/twitter/simclusters_v2/hdfs_sources/presto_hdfs_sources", - "src/scala/com/twitter/simclusters_v2/scalding/common", - "src/scala/com/twitter/simclusters_v2/scalding/common/matrix", - "src/scala/com/twitter/simclusters_v2/scalding/embedding", - "src/scala/com/twitter/wtf/entity_real_graph/common", - "src/scala/com/twitter/wtf/entity_real_graph/scalding/common", - "src/scala/com/twitter/wtf/scalding/jobs/common:execution_app", - "src/scala/com/twitter/wtf/scalding/jobs/common:sources", - "src/scala/com/twitter/wtf/scalding/jobs/common:stats_util", - "src/thrift/com/twitter/hermit/candidate:hermit-candidate-scala", - "src/thrift/com/twitter/onboarding/relevance/candidates:candidates-scala", - "src/thrift/com/twitter/recos/entities:entities-thrift-scala", - "src/thrift/com/twitter/wtf/entity_real_graph:entity_real_graph-thrift-scala", - "usersource/snapshot/src/main/scala/com/twitter/usersource/snapshot/flat:usersource_flat-scala", - "usersource/snapshot/src/main/thrift/com/twitter/usersource/snapshot/flat:flat-scala", - ], -) - -hadoop_binary( - name = "aggregatable_logfav_based_producer_embeddings_job-adhoc", - main = "com.twitter.simclusters_v2.scalding.embedding.producer.AggregatableLogFavBasedProducerEmbeddingsAdhocApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - 
"bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":producer"], -) - -hadoop_binary( - name = "aggregatable_logfav_based_producer_embeddings_job_2020-adhoc", - main = "com.twitter.simclusters_v2.scalding.embedding.producer.AggregatableLogFavBasedProducerEmbeddings2020AdhocApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":producer"], -) - -hadoop_binary( - name = "aggregatable_follow_based_producer_embeddings_job_2020-adhoc", - main = "com.twitter.simclusters_v2.scalding.embedding.producer.AggregatableFollowBasedProducerEmbeddings2020AdhocApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":producer"], -) - -hadoop_binary( - name = "aggregatable_logfav_based_producer_embeddings_job", - main = "com.twitter.simclusters_v2.scalding.embedding.producer.AggregatableLogFavBasedProducerEmbeddingsScheduledApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":producer"], -) - -hadoop_binary( - name = "aggregatable_logfav_based_producer_embeddings_job_2020", - main = "com.twitter.simclusters_v2.scalding.embedding.producer.AggregatableLogFavBasedProducerEmbeddings2020ScheduledApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":producer"], -) - -hadoop_binary( - name = "aggregatable_follow_based_producer_embeddings_job_2020", - main = "com.twitter.simclusters_v2.scalding.embedding.producer.AggregatableFollowBasedProducerEmbeddings2020ScheduledApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":producer"], -) - -hadoop_binary( - name = "aggregatable_logfav_based_producer_embeddings_job_relaxed_fav_engagement_threshold_2020", - main = "com.twitter.simclusters_v2.scalding.embedding.producer.AggregatableLogFavBasedProducerEmbeddingsRelaxedFavEngagementThreshold2020ScheduledApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":producer"], -) - -hadoop_binary( - name = "aggregatable_fav_based_producer_embeddings_job-adhoc", - main = "com.twitter.simclusters_v2.scalding.embedding.producer.AggregatableFavBasedProducerEmbeddingsAdhocApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":producer"], -) - -hadoop_binary( - name = "aggregatable_fav_based_producer_embeddings_job", - main = "com.twitter.simclusters_v2.scalding.embedding.producer.AggregatableFavBasedProducerEmbeddingsScheduledApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":producer"], -) - -hadoop_binary( - name = "aggregatable_fav_based_producer_embeddings_job_2020-adhoc", - main = "com.twitter.simclusters_v2.scalding.embedding.producer.AggregatableFavBasedProducerEmbeddings2020AdhocApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":producer"], -) - 
-hadoop_binary( - name = "aggregatable_fav_based_producer_embeddings_job_2020", - main = "com.twitter.simclusters_v2.scalding.embedding.producer.AggregatableFavBasedProducerEmbeddings2020ScheduledApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":producer"], -) - -# Generated with `capesospy-v2 create_target aggregatable_producer_embeddings_by_logfav_score src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml`, config hash f8229a. -scalding_job( - name = "aggregatable_producer_embeddings_by_logfav_score", - main = "com.twitter.simclusters_v2.scalding.embedding.producer.AggregatableLogFavBasedProducerEmbeddingsScheduledApp", - config = [ - ("hadoop.combine-input", "true"), - ("hadoop.map.jvm.total-memory", "3072m"), - ("hadoop.queue", "cassowary.default"), - ("hadoop.reduce.jvm.total-memory", "3072m"), - ("hadoop.submitter.jvm.total-memory", "5120m"), - ("submitter.tier", "preemptible"), - ], - cron = "17 * * * *", - hadoop_cluster = "atla-proc3", - platform = "java8", - role = "cassowary", - runtime_platform = "java8", - tags = [ - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":producer"], -) - -# Generated with `capesospy-v2 create_target aggregatable_producer_embeddings_by_fav_score src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml`, config hash bc0103. -scalding_job( - name = "aggregatable_producer_embeddings_by_fav_score", - main = "com.twitter.simclusters_v2.scalding.embedding.producer.AggregatableFavBasedProducerEmbeddingsScheduledApp", - config = [ - ("hadoop.combine-input", "true"), - ("hadoop.map.jvm.total-memory", "3072m"), - ("hadoop.queue", "cassowary.default"), - ("hadoop.reduce.jvm.total-memory", "3072m"), - ("hadoop.submitter.jvm.total-memory", "5120m"), - ("submitter.tier", "preemptible"), - ], - cron = "17 * * * *", - hadoop_cluster = "atla-proc3", - platform = "java8", - role = "cassowary", - runtime_platform = "java8", - tags = [ - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":producer"], -) diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/producer/BUILD.docx b/src/scala/com/twitter/simclusters_v2/scalding/embedding/producer/BUILD.docx new file mode 100644 index 000000000..1a9de19d9 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/embedding/producer/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/BUILD b/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/BUILD deleted file mode 100644 index 2f9cb7ebe..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/BUILD +++ /dev/null @@ -1,196 +0,0 @@ -scala_library( - sources = [ - "*.scala", - "common/*.scala", - ], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "escherbird/src/scala/com/twitter/escherbird/scalding/source", - "interests-ds/src/main/scala/com/twitter/interests_ds/jobs/interests_service", - "interests-ds/src/main/scala/com/twitter/interests_ds/jobs/interests_service:user_topic_relation_snapshot-scala", - "src/scala/com/twitter/ml/featurestore/catalog/datasets/timelines:timelines-user-topic-aggregates", - "src/scala/com/twitter/onboarding/relevance/source:utt_account_recommendations-scala", - "src/scala/com/twitter/simclusters_v2/hdfs_sources", - "src/scala/com/twitter/simclusters_v2/hdfs_sources:simclusters_v2_embeddings_lite-scala", - 
"src/scala/com/twitter/simclusters_v2/hdfs_sources:user_topic_weighted_embedding-scala", - "src/scala/com/twitter/simclusters_v2/hdfs_sources/presto_hdfs_sources", - "src/scala/com/twitter/simclusters_v2/scalding/common", - "src/scala/com/twitter/simclusters_v2/scalding/common/matrix", - "src/scala/com/twitter/simclusters_v2/scalding/embedding", - "src/scala/com/twitter/timelines/prediction/common/aggregates:user_topic_aggregates-scala", - "src/scala/com/twitter/wtf/entity_real_graph/common", - "src/scala/com/twitter/wtf/entity_real_graph/scalding/common", - "src/scala/com/twitter/wtf/scalding/jobs/common:execution_app", - "src/scala/com/twitter/wtf/scalding/jobs/common:sources", - "src/scala/com/twitter/wtf/scalding/jobs/common:stats_util", - "timelines/data_processing/ml_util/aggregation_framework:common_types", - "timelines/data_processing/ml_util/aggregation_framework/conversion", - "timelines/data_processing/ml_util/aggregation_framework/metrics", - "timelines/data_processing/ml_util/aggregation_framework/scalding", - ], -) - -hadoop_binary( - name = "fav_tfg_topic_embeddings-adhoc", - main = "com.twitter.simclusters_v2.scalding.embedding.tfg.FavTfgTopicEmbeddingsAdhocApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":tfg"], -) - -hadoop_binary( - name = "fav_tfg_topic_embeddings_2020-adhoc", - main = "com.twitter.simclusters_v2.scalding.embedding.tfg.FavTfgTopicEmbeddings2020AdhocApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":tfg"], -) - -hadoop_binary( - name = "fav_tfg_topic_embeddings", - main = "com.twitter.simclusters_v2.scalding.embedding.tfg.FavTfgTopicEmbeddingsScheduledApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":tfg"], -) - -hadoop_binary( - name = "fav_tfg_topic_embeddings_2020", - main = "com.twitter.simclusters_v2.scalding.embedding.tfg.FavTfgTopicEmbeddings2020ScheduledApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":tfg"], -) - -hadoop_binary( - name = "logfav_tfg_topic_embeddings-adhoc", - main = "com.twitter.simclusters_v2.scalding.embedding.tfg.LogFavTfgTopicEmbeddingsAdhocApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":tfg"], -) - -hadoop_binary( - name = "logfav_tfg_topic_embeddings", - main = "com.twitter.simclusters_v2.scalding.embedding.tfg.LogFavTfgTopicEmbeddingsScheduledApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":tfg"], -) - -hadoop_binary( - name = "fav_inferred_lang_tfg_topic_embeddings-adhoc", - main = "com.twitter.simclusters_v2.scalding.embedding.tfg.FavInferredLanguageTfgBasedTopicEmbeddingsAdhocApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":tfg"], -) - -hadoop_binary( - name = "fav_inferred_lang_tfg_topic_embeddings", - main = "com.twitter.simclusters_v2.scalding.embedding.tfg.FavInferredLanguageTfgBasedTopicEmbeddingsScheduledApp", - platform = "java8", - 
runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":tfg"], -) - -hadoop_binary( - name = "fav_tfg_topic_embeddings_2020_copy", - main = "com.twitter.simclusters_v2.scalding.embedding.tfg.FavTfgTopicEmbeddings2020CopyScheduledApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":tfg"], -) - -scalding_job( - name = "fav_weighted_user_topic_tfg_embeddings_adhoc_job", - main = "com.twitter.simclusters_v2.scalding.embedding.tfg.EngagementWeightedTfgBasedTopicEmbeddingsAdhocJob", - config = [ - ("hadoop.reduce.jvm.total-memory", "8192m"), - ("hadoop.combine-input", "true"), - ( - "job.args", - ["--date 2021-10-28"], - ), - ], - hadoop_cluster = "atla-proc3", #"qus1-bluebird", - platform = "java8", - role = "cassowary", - runtime_platform = "java8", - tags = [ - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":tfg", - ], -) - -scalding_job( - name = "fav_weighted_user_topic_tfg_embeddings_batch_job", - main = "com.twitter.simclusters_v2.scalding.embedding.tfg.EngagementWeightedTfgBasedTopicEmbeddingsScheduleJob", - args = [], - config = [ - ("hadoop.reduce.jvm.total-memory", "8192m"), - ("hadoop.combine-input", "true"), - ("submitter.cluster", "atla"), - ], - cron = "0 1 * * *", - hadoop_cluster = "atla-proc3", - platform = "java8", - role = "cassowary", - runtime_platform = "java8", - tags = [ - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":tfg", - ], -) diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/BUILD.docx b/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/BUILD.docx new file mode 100644 index 000000000..efc719d28 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/EngagementWeightedTfgBasedTopicEmbeddingsJob.docx b/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/EngagementWeightedTfgBasedTopicEmbeddingsJob.docx new file mode 100644 index 000000000..59ec2ff0f Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/EngagementWeightedTfgBasedTopicEmbeddingsJob.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/EngagementWeightedTfgBasedTopicEmbeddingsJob.scala b/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/EngagementWeightedTfgBasedTopicEmbeddingsJob.scala deleted file mode 100644 index 5ce6284af..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/EngagementWeightedTfgBasedTopicEmbeddingsJob.scala +++ /dev/null @@ -1,310 +0,0 @@ -package com.twitter.simclusters_v2.scalding.embedding.tfg - -import com.twitter.dal.client.dataset.SnapshotDALDatasetBase -import com.twitter.ml.api.DataSetPipe -import com.twitter.ml.api.Feature.Continuous -import com.twitter.ml.api.constant.SharedFeatures -import com.twitter.ml.api.util.SRichDataRecord -import com.twitter.scalding.Execution -import com.twitter.scalding._ -import com.twitter.scalding.typed.UnsortedGrouped -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.scalding_internal.dalv2.DALWrite.D -import com.twitter.scalding_internal.dalv2.DALWrite.WriteExtension -import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossClusterSameDC -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import 
com.twitter.simclusters_v2.common.Country -import com.twitter.simclusters_v2.common.Language -import com.twitter.simclusters_v2.common.ModelVersions -import com.twitter.simclusters_v2.hdfs_sources.FavTfgTopicEmbeddings2020ScalaDataset -import com.twitter.simclusters_v2.hdfs_sources.UserTopicWeightedEmbeddingScalaDataset -import com.twitter.simclusters_v2.hdfs_sources.UserTopicWeightedEmbeddingParquetScalaDataset -import com.twitter.simclusters_v2.scalding.embedding.common.EmbeddingUtil -import com.twitter.simclusters_v2.scalding.embedding.common.ExternalDataSources -import com.twitter.simclusters_v2.thriftscala._ -import com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion._ -import com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationConfig -import com.twitter.timelines.prediction.features.common.TimelinesSharedFeatures -import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp -import com.twitter.wtf.scalding.jobs.common.DateRangeExecutionApp -import com.twitter.wtf.scalding.jobs.common.ScheduledExecutionApp -import java.util.TimeZone - -/** - * Jobs to generate Fav-based engagement weighted Topic-Follow-Graph (TFG) topic embeddings - * The job uses fav based TFG embeddings and fav based engagement to produce a new embedding - */ - -/** - * ./bazel bundle ... - * scalding workflow upload --jobs src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg:fav_weighted_user_topic_tfg_embeddings_adhoc_job --autoplay - */ -object EngagementWeightedTfgBasedTopicEmbeddingsAdhocJob - extends AdhocExecutionApp - with EngagementWeightedTfgBasedTopicEmbeddingsBaseJob { - override val outputByFav = - "/user/cassowary/adhoc/manhattan_sequence_files/simclusters_v2_embedding/user_tfgembedding/by_fav" - override val parquetOutputByFav = - "/user/cassowary/adhoc/processed/simclusters_v2_embedding/user_tfgembedding/by_fav/snapshot" -} - -/** - * ./bazel bundle ... 
- * scalding workflow upload --jobs src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg:fav_weighted_user_topic_tfg_embeddings_batch_job --autoplay - */ -object EngagementWeightedTfgBasedTopicEmbeddingsScheduleJob - extends ScheduledExecutionApp - with EngagementWeightedTfgBasedTopicEmbeddingsBaseJob { - override val firstTime: RichDate = RichDate("2021-10-03") - override val batchIncrement: Duration = Days(1) - override val outputByFav = - "/user/cassowary/manhattan_sequence_files/simclusters_v2_embedding/user_tfgembedding/by_fav" - override val parquetOutputByFav = - "/user/cassowary/processed/simclusters_v2_embedding/user_tfgembedding/by_fav/snapshot" -} - -trait EngagementWeightedTfgBasedTopicEmbeddingsBaseJob extends DateRangeExecutionApp { - - val outputByFav: String - val parquetOutputByFav: String - - //root path to read aggregate data - private val aggregateFeatureRootPath = - "/atla/proc2/user/timelines/processed/aggregates_v2" - - private val topKTopicsToKeep = 100 - - private val favContinuousFeature = new Continuous( - "user_topic_aggregate.pair.recap.engagement.is_favorited.any_feature.50.days.count") - - private val parquetDataSource: SnapshotDALDatasetBase[UserTopicWeightedEmbedding] = - UserTopicWeightedEmbeddingParquetScalaDataset - - def sortedTake[K](m: Map[K, Double], keysToKeep: Int): Map[K, Double] = { - m.toSeq.sortBy { case (k, v) => -v }.take(keysToKeep).toMap - } - - case class UserTopicEngagement( - userId: Long, - topicId: Long, - language: String, - country: String, //field is not used - favCount: Double) { - val userLanguageGroup: (Long, String) = (userId, language) - } - - def prepareUserToTopicEmbedding( - favTfgTopicEmbeddings: TypedPipe[(Long, String, SimClustersEmbedding)], - userTopicEngagementCount: TypedPipe[UserTopicEngagement] - )( - implicit uniqueID: UniqueID - ): TypedPipe[((Long, String), Map[Int, Double])] = { - val userTfgEmbeddingsStat = Stat("User Tfg Embeddings Count") - val userTopicTopKEngagementStat = Stat("User Topic Top K engagement count") - val userEngagementStat = Stat("User engagement count") - val tfgEmbeddingsStat = Stat("TFG Embedding Map count") - - //get only top K topics - val userTopKTopicEngagementCount: TypedPipe[UserTopicEngagement] = userTopicEngagementCount - .groupBy(_.userLanguageGroup) - .withReducers(499) - .withDescription("select topK topics") - .sortedReverseTake(topKTopicsToKeep)(Ordering.by(_.favCount)) - .values - .flatten - - //(userId, language), totalCount - val userLanguageEngagementCount: UnsortedGrouped[(Long, String), Double] = - userTopKTopicEngagementCount - .collect { - case UserTopicEngagement(userId, topicId, language, country, favCount) => - userTopicTopKEngagementStat.inc() - ((userId, language), favCount) - }.sumByKey - .withReducers(499) - .withDescription("fav count by user") - - //(topicId, language), (userId, favWeight) - val topicUserWithNormalizedWeights: TypedPipe[((Long, String), (Long, Double))] = - userTopKTopicEngagementCount - .groupBy(_.userLanguageGroup) - .join(userLanguageEngagementCount) - .withReducers(499) - .withDescription("join userTopic and user EngagementCount") - .collect { - case ((userId, language), (engagementData, totalCount)) => - userEngagementStat.inc() - ( - (engagementData.topicId, engagementData.language), - (userId, engagementData.favCount / totalCount) - ) - } - - // (topicId, language), embeddingMap - val tfgEmbeddingsMap: TypedPipe[((Long, String), Map[Int, Double])] = favTfgTopicEmbeddings - .map { - case (topicId, language, embedding) => - 
          tfgEmbeddingsStat.inc()
-          ((topicId, language), embedding.embedding.map(a => a.clusterId -> a.score).toMap)
-      }
-      .withDescription("convert SimClusters embedding to map")
-
-    // (userId, language), clusters
-    // Worked sketch of the weighting (illustrative numbers): each of a user's
-    // top-K topics contributes its TFG embedding scaled by favCount / totalFavCount.
-    // For topics T1 (favCount 3.0) and T2 (favCount 1.0) the weights are 0.75 and
-    // 0.25, so with embed(T1) = Map(5 -> 0.8) and embed(T2) = Map(5 -> 0.4, 9 -> 0.2)
-    // the user embedding is Map(5 -> 0.75 * 0.8 + 0.25 * 0.4, 9 -> 0.25 * 0.2)
-    //   = Map(5 -> 0.7, 9 -> 0.05).
-    val newUserTfgEmbedding = topicUserWithNormalizedWeights
-      .join(tfgEmbeddingsMap)
-      .withReducers(799)
-      .withDescription("join user | topic | favWeight * embedding")
-      .collect {
-        case ((topicId, language), ((userId, favWeight), embeddingMap)) =>
-          userTfgEmbeddingsStat.inc()
-          ((userId, language), embeddingMap.mapValues(_ * favWeight))
-      }
-      .sumByKey
-      .withReducers(799)
-      .withDescription("aggregate embedding by user")
-
-    newUserTfgEmbedding.toTypedPipe
-  }
-
-  def writeOutput(
-    newUserTfgEmbedding: TypedPipe[((Long, String), Map[Int, Double])],
-    outputPath: String,
-    parquetOutputPath: String,
-    modelVersion: String
-  )(
-    implicit uniqueID: UniqueID,
-    dateRange: DateRange
-  ): Execution[Unit] = {
-    val outputRecordStat = Stat("output record count")
-    val output = newUserTfgEmbedding
-      .map {
-        //language has been purposely ignored because the entire logic is based on the fact that
-        //user is mapped to a language. In future if a user is mapped to multiple languages then
-        //the final output needs to be keyed on (userId, language)
-        case ((userId, language), embeddingMap) =>
-          outputRecordStat.inc()
-          val clusterScores = embeddingMap.map {
-            case (clusterId, score) =>
-              clusterId -> UserToInterestedInClusterScores(favScore = Some(score))
-          }
-          KeyVal(userId, ClustersUserIsInterestedIn(modelVersion, clusterScores))
-      }
-
-    val keyValExec = output
-      .withDescription("write output keyval dataset")
-      .writeDALVersionedKeyValExecution(
-        UserTopicWeightedEmbeddingScalaDataset,
-        D.Suffix(outputPath))
-
-    val parquetExec = newUserTfgEmbedding
-      .map {
-        case ((userId, language), embeddingMap) =>
-          val clusterScores = embeddingMap.map {
-            case (clusterId, score) => ClustersScore(clusterId, score)
-          }
-          UserTopicWeightedEmbedding(userId, clusterScores.toSeq)
-      }
-      .withDescription("write output parquet dataset")
-      .writeDALSnapshotExecution(
-        parquetDataSource,
-        D.Daily,
-        D.Suffix(parquetOutputPath),
-        D.Parquet,
-        dateRange.end
-      )
-    Execution.zip(keyValExec, parquetExec).unit
-  }
-
-  override def runOnDateRange(
-    args: Args
-  )(
-    implicit dateRange: DateRange,
-    timeZone: TimeZone,
-    uniqueID: UniqueID
-  ): Execution[Unit] = {
-
-    val end = dateRange.start
-    val start = end - Days(21)
-    val featureDateRange = DateRange(start, end - Millisecs(1))
-    val outputPath = args.getOrElse("output_path", outputByFav)
-    val parquetOutputPath = args.getOrElse("parquet_output_path", parquetOutputByFav)
-    val modelVersion = ModelVersions.Model20M145K2020
-
-    //define stats counter
-    val favTfgTopicEmbeddingsStat = Stat("FavTfgTopicEmbeddings")
-    val userTopicEngagementStat = Stat("UserTopicEngagement")
-    val userTopicsStat = Stat("UserTopics")
-    val userLangStat = Stat("UserLanguage")
-
-    //get fav based tfg embeddings
-    //topic can have different languages and the clusters will be different
-    //current logic is to filter based on user language
-    // topicId, lang, embedding
-    val favTfgTopicEmbeddings: TypedPipe[(Long, String, SimClustersEmbedding)] = DAL
-      .readMostRecentSnapshot(FavTfgTopicEmbeddings2020ScalaDataset, featureDateRange)
-      .withRemoteReadPolicy(AllowCrossClusterSameDC)
-      .toTypedPipe
-      .collect {
-        case KeyVal(
-              SimClustersEmbeddingId(
-                embedType,
-                modelVersion,
-                InternalId.LocaleEntityId(LocaleEntityId(entityId, language))),
-              embedding) =>
favTfgTopicEmbeddingsStat.inc() - (entityId, language, embedding) - } - - /* - Ideally, if the timeline aggregate framework provided data with breakdown by language, - it could have been joined with (topic, language) embedding. Since, it is not possible - we fetch the language of the user from other sources. - This returns language for the user so that it could be joined with (topic, language) embedding. - `userSource` returns 1 language per user - `inferredUserConsumedLanguageSource` returns multiple languages with confidence values - */ - val userLangSource = ExternalDataSources.userSource - .map { - case (userId, (country, language)) => - userLangStat.inc() - (userId, (language, country)) - } - - //get userid, topicid, favcount as aggregated dataset - //currently there is no way to get language breakdown from the timeline aggregate framework. - val userTopicEngagementPipe: DataSetPipe = AggregatesV2MostRecentFeatureSource( - rootPath = aggregateFeatureRootPath, - storeName = "user_topic_aggregates", - aggregates = - Set(TimelinesAggregationConfig.userTopicAggregates).flatMap(_.buildTypedAggregateGroups()), - ).read - - val userTopicEngagementCount = userTopicEngagementPipe.records - .flatMap { record => - val sRichDataRecord = SRichDataRecord(record) - val userId: Long = sRichDataRecord.getFeatureValue(SharedFeatures.USER_ID) - val topicId: Long = sRichDataRecord.getFeatureValue(TimelinesSharedFeatures.TOPIC_ID) - val favCount: Double = sRichDataRecord - .getFeatureValueOpt(favContinuousFeature).map(_.toDouble).getOrElse(0.0) - userTopicEngagementStat.inc() - if (favCount > 0) { - List((userId, (topicId, favCount))) - } else None - }.join(userLangSource) - .collect { - case (userId, ((topicId, favCount), (language, country))) => - userTopicsStat.inc() - UserTopicEngagement(userId, topicId, language, country, favCount) - } - .withDescription("User Topic aggregated favcount") - - // combine user, topics, topic_embeddings - // and take weighted aggregate of the tfg embedding - val newUserTfgEmbedding = - prepareUserToTopicEmbedding(favTfgTopicEmbeddings, userTopicEngagementCount) - - writeOutput(newUserTfgEmbedding, outputPath, parquetOutputPath, modelVersion) - - } - -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/FavInferredLanguageTfgBasedTopicEmbeddings.docx b/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/FavInferredLanguageTfgBasedTopicEmbeddings.docx new file mode 100644 index 000000000..0d13e1e01 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/FavInferredLanguageTfgBasedTopicEmbeddings.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/FavInferredLanguageTfgBasedTopicEmbeddings.scala b/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/FavInferredLanguageTfgBasedTopicEmbeddings.scala deleted file mode 100644 index 14604af6a..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/FavInferredLanguageTfgBasedTopicEmbeddings.scala +++ /dev/null @@ -1,66 +0,0 @@ -package com.twitter.simclusters_v2.scalding.embedding.tfg - -import com.twitter.dal.client.dataset.KeyValDALDataset -import com.twitter.scalding._ -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.hdfs_sources.EntityEmbeddingsSources -import com.twitter.simclusters_v2.thriftscala.{ - EmbeddingType, - ModelVersion, - SimClustersEmbeddingId, - UserToInterestedInClusterScores, - SimClustersEmbedding => ThriftSimClustersEmbedding -} 
-import com.twitter.wtf.scalding.jobs.common.{AdhocExecutionApp, ScheduledExecutionApp} - -/** - * Apps to generate fav-based Topic-Follow-Graph (TFG) topic embeddings from inferred languages - * The fav-based embeddings are built from topic followers' fav-based InterestedIn - */ - -/** -./bazel bundle src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg:fav_inferred_lang_tfg_topic_embeddings-adhoc - scalding remote run \ - --user cassowary \ - --keytab /var/lib/tss/keys/fluffy/keytabs/client/cassowary.keytab \ - --principal service_acoount@TWITTER.BIZ \ - --cluster bluebird-qus1 \ - --main-class com.twitter.simclusters_v2.scalding.embedding.tfg.FavInferredLanguageTfgBasedTopicEmbeddingsAdhocApp \ - --target src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg:fav_inferred_lang_tfg_topic_embeddings-adhoc \ - --hadoop-properties "scalding.with.reducers.set.explicitly=true mapreduce.job.reduces=4000" \ - -- --date 2020-06-28 - */ -object FavInferredLanguageTfgBasedTopicEmbeddingsAdhocApp - extends InferredLanguageTfgBasedTopicEmbeddingsBaseApp - with AdhocExecutionApp { - override val isAdhoc: Boolean = true - override val embeddingType: EmbeddingType = EmbeddingType.FavInferredLanguageTfgTopic - override val embeddingSource: KeyValDALDataset[ - KeyVal[SimClustersEmbeddingId, ThriftSimClustersEmbedding] - ] = EntityEmbeddingsSources.FavInferredLanguageTfgTopicEmbeddingsDataset - override val pathSuffix: String = "fav_inferred_lang_tfg_topic_embeddings" - override val modelVersion: ModelVersion = ModelVersion.Model20m145kUpdated - override def scoreExtractor: UserToInterestedInClusterScores => Double = scores => - scores.favScore.getOrElse(0.0) -} - -/** -./bazel bundle src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg:fav_inferred_lang_tfg_topic_embeddings -capesospy-v2 update --build_locally --start_cron fav_inferred_lang_tfg_topic_embeddings src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml - */ -object FavInferredLanguageTfgBasedTopicEmbeddingsScheduledApp - extends InferredLanguageTfgBasedTopicEmbeddingsBaseApp - with ScheduledExecutionApp { - override val isAdhoc: Boolean = false - override val embeddingType: EmbeddingType = EmbeddingType.FavInferredLanguageTfgTopic - override val embeddingSource: KeyValDALDataset[ - KeyVal[SimClustersEmbeddingId, ThriftSimClustersEmbedding] - ] = EntityEmbeddingsSources.FavInferredLanguageTfgTopicEmbeddingsDataset - override val pathSuffix: String = "fav_inferred_lang_tfg_topic_embeddings" - override val modelVersion: ModelVersion = ModelVersion.Model20m145kUpdated - override def scoreExtractor: UserToInterestedInClusterScores => Double = scores => - scores.favScore.getOrElse(0.0) - - override val firstTime: RichDate = RichDate("2020-07-04") - override val batchIncrement: Duration = Days(1) -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/FavTfgBasedTopicEmbeddings.docx b/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/FavTfgBasedTopicEmbeddings.docx new file mode 100644 index 000000000..eaacc2c1f Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/FavTfgBasedTopicEmbeddings.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/FavTfgBasedTopicEmbeddings.scala b/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/FavTfgBasedTopicEmbeddings.scala deleted file mode 100644 index d3e2d6525..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/FavTfgBasedTopicEmbeddings.scala +++ 
/dev/null @@ -1,172 +0,0 @@ -package com.twitter.simclusters_v2.scalding.embedding.tfg - -import com.twitter.dal.client.dataset.KeyValDALDataset -import com.twitter.dal.client.dataset.SnapshotDALDatasetBase -import com.twitter.scalding._ -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.scalding_internal.dalv2.DALWrite.D -import com.twitter.scalding_internal.dalv2.DALWrite.WriteExtension -import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossClusterSameDC -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.hdfs_sources.EntityEmbeddingsSources -import com.twitter.simclusters_v2.scalding.embedding.common.EmbeddingUtil -import com.twitter.simclusters_v2.thriftscala.EmbeddingType -import com.twitter.simclusters_v2.thriftscala.ModelVersion -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId -import com.twitter.simclusters_v2.thriftscala.TfgTopicEmbeddings -import com.twitter.simclusters_v2.thriftscala.UserToInterestedInClusterScores -import com.twitter.simclusters_v2.thriftscala.{SimClustersEmbedding => ThriftSimClustersEmbedding} -import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp -import com.twitter.wtf.scalding.jobs.common.ScheduledExecutionApp -import java.util.TimeZone - -/** - * Jobs to generate Fav-based Topic-Follow-Graph (TFG) topic embeddings. - * A topic's fav-based TFG embedding is the sum of its followers' fav-based InterestedIn. - */ - -/** -./bazel bundle src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg:fav_tfg_topic_embeddings-adhoc - scalding remote run \ - --user cassowary \ - --keytab /var/lib/tss/keys/fluffy/keytabs/client/cassowary.keytab \ - --principal service_account@TWITTER.BIZ \ - --cluster bluebird-qus1 \ - --main-class com.twitter.simclusters_v2.scalding.embedding.tfg.FavTfgTopicEmbeddingsAdhocApp \ - --target src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg:fav_tfg_topic_embeddings-adhoc \ - --hadoop-properties "scalding.with.reducers.set.explicitly=true mapreduce.job.reduces=4000" \ - -- --date 2020-12-08 - */ -object FavTfgTopicEmbeddingsAdhocApp extends TfgBasedTopicEmbeddingsBaseApp with AdhocExecutionApp { - override val isAdhoc: Boolean = true - override val embeddingType: EmbeddingType = EmbeddingType.FavTfgTopic - override val embeddingSource: KeyValDALDataset[ - KeyVal[SimClustersEmbeddingId, ThriftSimClustersEmbedding] - ] = EntityEmbeddingsSources.FavTfgTopicEmbeddingsDataset - override val pathSuffix: String = "fav_tfg_topic_embedding" - override val modelVersion: ModelVersion = ModelVersion.Model20m145kUpdated - override val parquetDataSource: SnapshotDALDatasetBase[TfgTopicEmbeddings] = - EntityEmbeddingsSources.FavTfgTopicEmbeddingsParquetDataset - override def scoreExtractor: UserToInterestedInClusterScores => Double = scores => - scores.favScore.getOrElse(0.0) -} - -/** -./bazel bundle src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg:fav_tfg_topic_embeddings -capesospy-v2 update --build_locally --start_cron fav_tfg_topic_embeddings src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml - */ -object FavTfgTopicEmbeddingsScheduledApp - extends TfgBasedTopicEmbeddingsBaseApp - with ScheduledExecutionApp { - override val isAdhoc: Boolean = false - override val embeddingType: EmbeddingType = EmbeddingType.FavTfgTopic - override val embeddingSource: KeyValDALDataset[ - KeyVal[SimClustersEmbeddingId, ThriftSimClustersEmbedding] - ] = EntityEmbeddingsSources.FavTfgTopicEmbeddingsDataset - override val
pathSuffix: String = "fav_tfg_topic_embedding" - override val modelVersion: ModelVersion = ModelVersion.Model20m145kUpdated - override val parquetDataSource: SnapshotDALDatasetBase[TfgTopicEmbeddings] = - EntityEmbeddingsSources.FavTfgTopicEmbeddingsParquetDataset - override def scoreExtractor: UserToInterestedInClusterScores => Double = scores => - scores.favScore.getOrElse(0.0) - - override val firstTime: RichDate = RichDate("2020-05-25") - override val batchIncrement: Duration = Days(1) -} - -/** -./bazel bundle src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg:fav_tfg_topic_embeddings_2020-adhoc - scalding remote run \ - --user cassowary \ - --keytab /var/lib/tss/keys/fluffy/keytabs/client/cassowary.keytab \ - --principal service_acoount@TWITTER.BIZ \ - --cluster bluebird-qus1 \ - --main-class com.twitter.simclusters_v2.scalding.embedding.tfg.FavTfgTopicEmbeddings2020AdhocApp \ - --target src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg:fav_tfg_topic_embeddings_2020-adhoc \ - --hadoop-properties "scalding.with.reducers.set.explicitly=true mapreduce.job.reduces=4000" \ - -- --date 2020-12-08 - */ -object FavTfgTopicEmbeddings2020AdhocApp - extends TfgBasedTopicEmbeddingsBaseApp - with AdhocExecutionApp { - override val isAdhoc: Boolean = true - override val embeddingType: EmbeddingType = EmbeddingType.FavTfgTopic - override val embeddingSource: KeyValDALDataset[ - KeyVal[SimClustersEmbeddingId, ThriftSimClustersEmbedding] - ] = EntityEmbeddingsSources.FavTfgTopicEmbeddings2020Dataset - override val pathSuffix: String = "fav_tfg_topic_embedding" - override val modelVersion: ModelVersion = ModelVersion.Model20m145k2020 - override val parquetDataSource: SnapshotDALDatasetBase[TfgTopicEmbeddings] = - EntityEmbeddingsSources.FavTfgTopicEmbeddings2020ParquetDataset - override def scoreExtractor: UserToInterestedInClusterScores => Double = scores => - scores.favScore.getOrElse(0.0) -} - -/** -./bazel bundle src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg:fav_tfg_topic_embeddings_2020 -capesospy-v2 update --build_locally --start_cron fav_tfg_topic_embeddings_2020 src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml - */ -object FavTfgTopicEmbeddings2020ScheduledApp - extends TfgBasedTopicEmbeddingsBaseApp - with ScheduledExecutionApp { - override val isAdhoc: Boolean = false - override val embeddingType: EmbeddingType = EmbeddingType.FavTfgTopic - override val embeddingSource: KeyValDALDataset[ - KeyVal[SimClustersEmbeddingId, ThriftSimClustersEmbedding] - ] = EntityEmbeddingsSources.FavTfgTopicEmbeddings2020Dataset - override val pathSuffix: String = "fav_tfg_topic_embedding" - override val modelVersion: ModelVersion = ModelVersion.Model20m145k2020 - override val parquetDataSource: SnapshotDALDatasetBase[TfgTopicEmbeddings] = - EntityEmbeddingsSources.FavTfgTopicEmbeddings2020ParquetDataset - override def scoreExtractor: UserToInterestedInClusterScores => Double = scores => - scores.favScore.getOrElse(0.0) - - override val firstTime: RichDate = RichDate("2021-03-10") - override val batchIncrement: Duration = Days(1) -} - -/** -./bazel bundle src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg:fav_tfg_topic_embeddings_2020_copy -scalding scalding remote run --target src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg:fav_tfg_topic_embeddings_2020_copy - */ - -/** - * This is a copy job where we copy the previous version of TFG and write to a new one. - * The dependent dataset for TFG has been deleted. 
- * Instead of restarting the entire job, we create this temporary hacky solution to keep the TFG dataset alive until we deprecate topics. - * Having a stale TFG doesn't lead to a big quality concern b/c TFG is built from topic follows, which is relatively stable, - * and we don't have new topics anymore. - */ -object FavTfgTopicEmbeddings2020CopyScheduledApp extends ScheduledExecutionApp { - val isAdhoc: Boolean = false - val embeddingType: EmbeddingType = EmbeddingType.FavTfgTopic - val embeddingSource: KeyValDALDataset[ - KeyVal[SimClustersEmbeddingId, ThriftSimClustersEmbedding] - ] = EntityEmbeddingsSources.FavTfgTopicEmbeddings2020Dataset - val pathSuffix: String = "fav_tfg_topic_embedding" - val modelVersion: ModelVersion = ModelVersion.Model20m145k2020 - - override val firstTime: RichDate = RichDate("2023-01-20") - override val batchIncrement: Duration = Days(3) - - def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - DAL - .readMostRecentSnapshotNoOlderThan( - EntityEmbeddingsSources.FavTfgTopicEmbeddings2020Dataset, - Days(21)) - .withRemoteReadPolicy(AllowCrossClusterSameDC) - .toTypedPipe - .writeDALVersionedKeyValExecution( - EntityEmbeddingsSources.FavTfgTopicEmbeddings2020Dataset, - D.Suffix( - EmbeddingUtil - .getHdfsPath(isAdhoc = isAdhoc, isManhattanKeyVal = true, modelVersion, pathSuffix)) - ) - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/InferredLanguageTfgBasedTopicEmbeddingsBaseApp.docx b/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/InferredLanguageTfgBasedTopicEmbeddingsBaseApp.docx new file mode 100644 index 000000000..588f9ae5c Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/InferredLanguageTfgBasedTopicEmbeddingsBaseApp.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/InferredLanguageTfgBasedTopicEmbeddingsBaseApp.scala b/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/InferredLanguageTfgBasedTopicEmbeddingsBaseApp.scala deleted file mode 100644 index 2ee09cc8f..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/InferredLanguageTfgBasedTopicEmbeddingsBaseApp.scala +++ /dev/null @@ -1,194 +0,0 @@ -package com.twitter.simclusters_v2.scalding.embedding.tfg - -import com.twitter.bijection.{Bufferable, Injection} -import com.twitter.dal.client.dataset.KeyValDALDataset -import com.twitter.scalding._ -import com.twitter.scalding_internal.dalv2.DALWrite.{D, _} -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.common.{Country, Language, SimClustersEmbedding, TopicId} -import com.twitter.simclusters_v2.hdfs_sources.InterestedInSources -import com.twitter.simclusters_v2.scalding.common.matrix.{SparseMatrix, SparseRowMatrix} -import com.twitter.simclusters_v2.scalding.embedding.common.EmbeddingUtil.{UserId, _} -import com.twitter.simclusters_v2.scalding.embedding.common.{ - EmbeddingUtil, - ExternalDataSources, - SimClustersEmbeddingBaseJob -} -import com.twitter.simclusters_v2.thriftscala.{ - EmbeddingType, - InternalId, - ModelVersion, - SimClustersEmbeddingId, - UserToInterestedInClusterScores, - SimClustersEmbedding => ThriftSimClustersEmbedding, - TopicId => ThriftTopicId -} -import com.twitter.wtf.scalding.jobs.common.DateRangeExecutionApp -import java.util.TimeZone - -/** - * Base app to generate Topic-Follow-Graph (TFG) topic embeddings from inferred languages.
- * In this app, topic embeddings are keyed by (topic, language, country). - * Given a (topic t, country c, language l) tuple, the embedding is the sum of the - * InterestedIn embeddings of the topic followers whose inferred languages include l and whose account country is c. - * The language and the country fields in the keys are optional. - * The app will generate 1) country-language-based, 2) language-based, and 3) global embeddings in one dataset. - * It's up to the clients to decide which embeddings to use. - */ -trait InferredLanguageTfgBasedTopicEmbeddingsBaseApp - extends SimClustersEmbeddingBaseJob[(TopicId, Option[Language], Option[Country])] - with DateRangeExecutionApp { - - val isAdhoc: Boolean - val embeddingType: EmbeddingType - val embeddingSource: KeyValDALDataset[KeyVal[SimClustersEmbeddingId, ThriftSimClustersEmbedding]] - val pathSuffix: String - val modelVersion: ModelVersion - def scoreExtractor: UserToInterestedInClusterScores => Double - - override def numClustersPerNoun: Int = 50 - override def numNounsPerClusters: Int = 1 // not used for now. Set to an arbitrary number - override def thresholdForEmbeddingScores: Double = 0.001 - - implicit val inj: Injection[(TopicId, Option[Language], Option[Country]), Array[Byte]] = - Bufferable.injectionOf[(TopicId, Option[Language], Option[Country])] - - // Default to 10K, top 1% for (topic, country, language) follows - // Child classes may want to tune this number for their own use cases. - val minPerCountryFollowers = 10000 - val minFollowers = 100 - - def getTopicUsers( - topicFollowGraph: TypedPipe[(TopicId, UserId)], - userSource: TypedPipe[(UserId, (Country, Language))], - userLanguages: TypedPipe[(UserId, Seq[(Language, Double)])] - ): TypedPipe[((TopicId, Option[Language], Option[Country]), UserId, Double)] = { - topicFollowGraph - .map { case (topic, user) => (user, topic) } - .join(userSource) - .join(userLanguages) - .flatMap { - case (user, ((topic, (country, _)), scoredLangs)) => - scoredLangs.flatMap { - case (lang, score) => - Seq( - ((topic, Some(lang), Some(country)), user, score), // with language and country - ((topic, Some(lang), None), user, score) // with language - ) - } ++ Seq(((topic, None, None), user, 1.0)) // non-language - }.forceToDisk - } - - def getValidTopics( - topicUsers: TypedPipe[((TopicId, Option[Language], Option[Country]), UserId, Double)] - )( - implicit uniqueID: UniqueID - ): TypedPipe[(TopicId, Option[Language], Option[Country])] = { - val countryBasedTopics = Stat("country_based_topics") - val nonCountryBasedTopics = Stat("non_country_based_topics") - - val (countryBased, nonCountryBased) = topicUsers.partition { - case ((_, lang, country), _, _) => lang.isDefined && country.isDefined - } - - SparseMatrix(countryBased).rowL1Norms.collect { - case (key, l1Norm) if l1Norm >= minPerCountryFollowers => - countryBasedTopics.inc() - key - } ++ - SparseMatrix(nonCountryBased).rowL1Norms.collect { - case (key, l1Norm) if l1Norm >= minFollowers => - nonCountryBasedTopics.inc() - key - } - } - - override def prepareNounToUserMatrix( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): SparseMatrix[(TopicId, Option[Language], Option[Country]), UserId, Double] = { - val topicUsers = getTopicUsers( - ExternalDataSources.topicFollowGraphSource, - ExternalDataSources.userSource, - ExternalDataSources.inferredUserConsumedLanguageSource) - - SparseMatrix[(TopicId, Option[Language], Option[Country]), UserId, Double](topicUsers) - .filterRows(getValidTopics(topicUsers)) - } - - override def
prepareUserToClusterMatrix( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): SparseRowMatrix[UserId, ClusterId, Double] = - SparseRowMatrix( - InterestedInSources - .simClustersInterestedInSource(modelVersion, dateRange, timeZone) - .map { - case (userId, clustersUserIsInterestedIn) => - userId -> clustersUserIsInterestedIn.clusterIdToScores - .map { - case (clusterId, scores) => - clusterId -> scoreExtractor(scores) - } - .filter(_._2 > 0.0) - .toMap - }, - isSkinnyMatrix = true - ) - - override def writeNounToClustersIndex( - output: TypedPipe[((TopicId, Option[Language], Option[Country]), Seq[(ClusterId, Double)])] - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - val topicEmbeddingCount = Stat(s"topic_embedding_count") - - val tsvExec = - output - .map { - case ((entityId, language, country), clustersWithScores) => - (entityId, language, country, clustersWithScores.take(5).mkString(",")) - } - .shard(5) - .writeExecution(TypedTsv[(TopicId, Option[Language], Option[Country], String)]( - s"/user/recos-platform/adhoc/topic_embedding/$pathSuffix/${ModelVersionPathMap(modelVersion)}")) - - val keyValExec = output - .map { - case ((entityId, lang, country), clustersWithScores) => - topicEmbeddingCount.inc() - KeyVal( - SimClustersEmbeddingId( - embeddingType, - modelVersion, - InternalId.TopicId(ThriftTopicId(entityId, lang, country)) - ), - SimClustersEmbedding(clustersWithScores).toThrift - ) - } - .writeDALVersionedKeyValExecution( - embeddingSource, - D.Suffix( - EmbeddingUtil - .getHdfsPath(isAdhoc = isAdhoc, isManhattanKeyVal = true, modelVersion, pathSuffix)) - ) - if (isAdhoc) - Execution.zip(tsvExec, keyValExec).unit - else - keyValExec - } - - override def writeClusterToNounsIndex( - output: TypedPipe[(ClusterId, Seq[((TopicId, Option[Language], Option[Country]), Double)])] - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - Execution.unit // do not need this - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/LogFavTfgBasedTopicEmbeddings.docx b/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/LogFavTfgBasedTopicEmbeddings.docx new file mode 100644 index 000000000..3f145720d Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/LogFavTfgBasedTopicEmbeddings.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/LogFavTfgBasedTopicEmbeddings.scala b/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/LogFavTfgBasedTopicEmbeddings.scala deleted file mode 100644 index 1869b5c64..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/LogFavTfgBasedTopicEmbeddings.scala +++ /dev/null @@ -1,70 +0,0 @@ -package com.twitter.simclusters_v2.scalding.embedding.tfg - -import com.twitter.dal.client.dataset.{KeyValDALDataset, SnapshotDALDatasetBase} -import com.twitter.scalding._ -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.hdfs_sources.EntityEmbeddingsSources -import com.twitter.simclusters_v2.thriftscala.{ - EmbeddingType, - ModelVersion, - SimClustersEmbeddingId, - TfgTopicEmbeddings, - UserToInterestedInClusterScores, - SimClustersEmbedding => ThriftSimClustersEmbedding -} -import com.twitter.wtf.scalding.jobs.common.{AdhocExecutionApp, ScheduledExecutionApp} - -/** - * Jobs to generate Logfav-based Topic-Follow-Graph (TFG) topic embeddings. - * A topic's
logfav-based TFG embedding is the sum of its followers' logfav-based InterestedIn. - */ - -/** -./bazel bundle src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg:logfav_tfg_topic_embeddings-adhoc - scalding remote run \ - --user cassowary \ - --keytab /var/lib/tss/keys/fluffy/keytabs/client/cassowary.keytab \ - --principal service_account@TWITTER.BIZ \ - --cluster bluebird-qus1 \ - --main-class com.twitter.simclusters_v2.scalding.embedding.tfg.LogFavTfgTopicEmbeddingsAdhocApp \ - --target src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg:logfav_tfg_topic_embeddings-adhoc \ - --hadoop-properties "scalding.with.reducers.set.explicitly=true mapreduce.job.reduces=4000" \ - -- --date 2020-12-08 - */ -object LogFavTfgTopicEmbeddingsAdhocApp - extends TfgBasedTopicEmbeddingsBaseApp - with AdhocExecutionApp { - override val isAdhoc: Boolean = true - override val embeddingType: EmbeddingType = EmbeddingType.LogFavTfgTopic - override val embeddingSource: KeyValDALDataset[ - KeyVal[SimClustersEmbeddingId, ThriftSimClustersEmbedding] - ] = EntityEmbeddingsSources.LogFavTfgTopicEmbeddingsDataset - override val pathSuffix: String = "logfav_tfg_topic_embedding" - override val modelVersion: ModelVersion = ModelVersion.Model20m145kUpdated - override val parquetDataSource: SnapshotDALDatasetBase[TfgTopicEmbeddings] = - EntityEmbeddingsSources.LogFavTfgTopicEmbeddingsParquetDataset - override def scoreExtractor: UserToInterestedInClusterScores => Double = scores => - scores.logFavScore.getOrElse(0.0) -} - -/** -./bazel bundle src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg:logfav_tfg_topic_embeddings -capesospy-v2 update --build_locally --start_cron logfav_tfg_topic_embeddings src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml - */ -object LogFavTfgTopicEmbeddingsScheduledApp - extends TfgBasedTopicEmbeddingsBaseApp - with ScheduledExecutionApp { - override val isAdhoc: Boolean = false - override val embeddingType: EmbeddingType = EmbeddingType.LogFavTfgTopic - override val embeddingSource: KeyValDALDataset[ - KeyVal[SimClustersEmbeddingId, ThriftSimClustersEmbedding] - ] = EntityEmbeddingsSources.LogFavTfgTopicEmbeddingsDataset - override val pathSuffix: String = "logfav_tfg_topic_embedding" - override val modelVersion: ModelVersion = ModelVersion.Model20m145kUpdated - override def scoreExtractor: UserToInterestedInClusterScores => Double = scores => - scores.logFavScore.getOrElse(0.0) - override val parquetDataSource: SnapshotDALDatasetBase[TfgTopicEmbeddings] = - EntityEmbeddingsSources.LogFavTfgTopicEmbeddingsParquetDataset - override val firstTime: RichDate = RichDate("2020-05-25") - override val batchIncrement: Duration = Days(1) -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/README b/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/README deleted file mode 100644 index d08ff73f1..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/README +++ /dev/null @@ -1,7 +0,0 @@ -TFG stands for Topic Follow Graph. -The TFG topic embeddings are embeddings built from the Topic Follow Graph. -Each topic is represented by the sum of its followers' user InterestedIn embeddings (see the sketch below).
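For intuition, a minimal, self-contained Scala sketch of that sum (hypothetical simplified types; the production jobs compute this over sparse matrices rather than in-memory maps):

  // Each follower has a sparse InterestedIn vector: clusterId -> score.
  type Embedding = Map[Int, Double]

  // A topic's TFG embedding is the element-wise sum of its followers' vectors.
  def tfgEmbedding(followerEmbeddings: Seq[Embedding]): Embedding =
    followerEmbeddings
      .flatMap(_.toSeq)
      .groupBy(_._1)
      .map { case (clusterId, scores) => clusterId -> scores.map(_._2).sum }

  // tfgEmbedding(Seq(Map(1 -> 0.5), Map(1 -> 0.25, 7 -> 1.0)))
  //   == Map(1 -> 0.75, 7 -> 1.0)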
- -There are two types of embeddings: -logfav - topic embeddings built from followers' logfav-based InterestedIn -fav - topic embeddings built from followers' fav-based InterestedIn diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/README.docx b/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/README.docx new file mode 100644 index 000000000..c75c068f2 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/README.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/TfgBasedTopicEmbeddingsBaseApp.docx b/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/TfgBasedTopicEmbeddingsBaseApp.docx new file mode 100644 index 000000000..dbce486f9 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/TfgBasedTopicEmbeddingsBaseApp.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/TfgBasedTopicEmbeddingsBaseApp.scala b/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/TfgBasedTopicEmbeddingsBaseApp.scala deleted file mode 100644 index 2725bafb5..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/embedding/tfg/TfgBasedTopicEmbeddingsBaseApp.scala +++ /dev/null @@ -1,191 +0,0 @@ -package com.twitter.simclusters_v2.scalding.embedding.tfg - -import com.twitter.bijection.{Bufferable, Injection} -import com.twitter.dal.client.dataset.{KeyValDALDataset, SnapshotDALDatasetBase} -import com.twitter.scalding._ -import com.twitter.scalding_internal.dalv2.DALWrite.{D, _} -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.common.{Language, SimClustersEmbedding, TopicId} -import com.twitter.simclusters_v2.hdfs_sources.InterestedInSources -import com.twitter.simclusters_v2.scalding.common.matrix.{SparseMatrix, SparseRowMatrix} -import com.twitter.simclusters_v2.scalding.embedding.common.EmbeddingUtil.{UserId, _} -import com.twitter.simclusters_v2.scalding.embedding.common.{ - EmbeddingUtil, - ExternalDataSources, - SimClustersEmbeddingBaseJob -} -import com.twitter.simclusters_v2.thriftscala.{ - ClustersScore, - EmbeddingType, - TfgTopicEmbeddings, - InternalId, - LocaleEntityId, - ModelVersion, - SimClustersEmbeddingId, - UserToInterestedInClusterScores, - SimClustersEmbedding => ThriftSimClustersEmbedding, - TopicId => TID -} -import com.twitter.wtf.scalding.jobs.common.DateRangeExecutionApp - -import java.util.TimeZone - -/** - * Base app for the Topic-Follow-Graph (TFG) topic embeddings. - * A topic's TFG embedding is the sum of the InterestedIn embeddings of all the users who follow the topic. - */ -trait TfgBasedTopicEmbeddingsBaseApp - extends SimClustersEmbeddingBaseJob[(TopicId, Language)] - with DateRangeExecutionApp { - - val isAdhoc: Boolean - val embeddingType: EmbeddingType - val embeddingSource: KeyValDALDataset[KeyVal[SimClustersEmbeddingId, ThriftSimClustersEmbedding]] - val pathSuffix: String - val modelVersion: ModelVersion - val parquetDataSource: SnapshotDALDatasetBase[TfgTopicEmbeddings] - def scoreExtractor: UserToInterestedInClusterScores => Double - - override def numClustersPerNoun: Int = 50 - override def numNounsPerClusters: Int = 1 // not used for now.
Set to an arbitrary number - override def thresholdForEmbeddingScores: Double = 0.001 - - val minNumFollowers = 100 - - override def prepareNounToUserMatrix( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): SparseMatrix[(TopicId, Language), UserId, Double] = { - implicit val inj: Injection[(TopicId, Language), Array[Byte]] = - Bufferable.injectionOf[(TopicId, Language)] - - val topicLangUsers = ExternalDataSources.topicFollowGraphSource - .map { case (topic, user) => (user, topic) } - .join(ExternalDataSources.userSource) - .map { - case (user, (topic, (_, language))) => - ((topic, language), user, 1.0) - } - .forceToDisk - - val validTopicLang = - SparseMatrix(topicLangUsers).rowNnz.filter { - case (_, nzCount) => nzCount >= minNumFollowers - }.keys - - SparseMatrix[(TopicId, Language), UserId, Double](topicLangUsers).filterRows(validTopicLang) - } - - override def prepareUserToClusterMatrix( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): SparseRowMatrix[UserId, ClusterId, Double] = - SparseRowMatrix( - InterestedInSources - .simClustersInterestedInSource(modelVersion, dateRange, timeZone) - .map { - case (userId, clustersUserIsInterestedIn) => - userId -> clustersUserIsInterestedIn.clusterIdToScores - .map { - case (clusterId, scores) => - clusterId -> scoreExtractor(scores) - } - .filter(_._2 > 0.0) - .toMap - }, - isSkinnyMatrix = true - ) - - override def writeNounToClustersIndex( - output: TypedPipe[((TopicId, Language), Seq[(ClusterId, Double)])] - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - val topicEmbeddingCount = Stat(s"topic_embedding_count") - val user = System.getenv("USER") - val parquetExec = output - .map { - case ((entityId, language), clustersWithScores) => - TfgTopicEmbeddings( - TID( - entityId = entityId, - language = Some(language), - ), - clusterScore = clustersWithScores.map { - case (clusterId, score) => ClustersScore(clusterId, score) - } - ) - } - .writeDALSnapshotExecution( - parquetDataSource, - D.Daily, - D.Suffix( - EmbeddingUtil.getHdfsPath( - isAdhoc = isAdhoc, - isManhattanKeyVal = false, - modelVersion, - pathSuffix + "/snapshot")), - D.Parquet, - dateRange.end - ) - - val tsvExec = - output - .map { - case ((entityId, language), clustersWithScores) => - (entityId, language, clustersWithScores.mkString(";")) - } - .shard(10) - .writeExecution(TypedTsv[(TopicId, Language, String)]( - s"/user/$user/adhoc/topic_embedding/$pathSuffix/${ModelVersionPathMap(modelVersion)}")) - - val keyValExec = output - .flatMap { - case ((entityId, lang), clustersWithScores) => - topicEmbeddingCount.inc() - val embedding = SimClustersEmbedding(clustersWithScores).toThrift - Seq( - KeyVal( - SimClustersEmbeddingId( - embeddingType, - modelVersion, - InternalId.LocaleEntityId(LocaleEntityId(entityId, lang)) - ), - embedding - ), - KeyVal( - SimClustersEmbeddingId( - embeddingType, - modelVersion, - InternalId.TopicId(TID(entityId, Some(lang), country = None)) - ), - embedding - ), - ) - } - .writeDALVersionedKeyValExecution( - embeddingSource, - D.Suffix( - EmbeddingUtil - .getHdfsPath(isAdhoc = isAdhoc, isManhattanKeyVal = true, modelVersion, pathSuffix)) - ) - if (isAdhoc) - Execution.zip(tsvExec, keyValExec, parquetExec).unit - else - Execution.zip(keyValExec, parquetExec).unit - } - - override def writeClusterToNounsIndex( - output: TypedPipe[(ClusterId, Seq[((TopicId, Language), Double)])] - )( - implicit dateRange: DateRange, - timeZone:
TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - Execution.unit // do not need this - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/twice/BUILD.bazel b/src/scala/com/twitter/simclusters_v2/scalding/embedding/twice/BUILD.bazel deleted file mode 100644 index 86124f1ff..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/embedding/twice/BUILD.bazel +++ /dev/null @@ -1,166 +0,0 @@ -scala_library( - sources = [ - "*.scala", - "common/*.scala", - ], - compiler_option_sets = ["fatal_warnings"], - platform = "java8", - tags = [ - "bazel-compatible", - "bazel-only", - ], - dependencies = [ - "src/scala/com/twitter/simclusters_v2/common/clustering", - "src/scala/com/twitter/simclusters_v2/hdfs_sources:aggregatable_producer_simclusters_embeddings_by_log_fav_score-scala", - "src/scala/com/twitter/simclusters_v2/hdfs_sources:clusters_members_connected_components_ape_similarity-scala", - "src/scala/com/twitter/simclusters_v2/hdfs_sources:clusters_members_largest_dim_ape_similarity-scala", - "src/scala/com/twitter/simclusters_v2/hdfs_sources:clusters_members_largest_dim_ape_similarity_2_day_update-scala", - "src/scala/com/twitter/simclusters_v2/hdfs_sources:clusters_members_louvain_ape_similarity-scala", - "src/scala/com/twitter/simclusters_v2/hdfs_sources:interested_in_twice_by_largest_dim-scala", - "src/scala/com/twitter/simclusters_v2/hdfs_sources:interested_in_twice_by_largest_dim_2_day_update-scala", - "src/scala/com/twitter/simclusters_v2/hdfs_sources:interested_in_twice_by_largest_dim_fav_score-scala", - "src/scala/com/twitter/simclusters_v2/hdfs_sources:interested_in_twice_connected_components-scala", - "src/scala/com/twitter/simclusters_v2/hdfs_sources:interested_in_twice_louvain-scala", - "src/scala/com/twitter/simclusters_v2/hdfs_sources:user_user_normalized_graph-scala", - "src/scala/com/twitter/simclusters_v2/scalding/common", - "src/scala/com/twitter/simclusters_v2/scalding/embedding", - "src/scala/com/twitter/wtf/scalding/jobs/common:execution_app", - "src/scala/com/twitter/wtf/scalding/jobs/common:sources", - "src/scala/com/twitter/wtf/scalding/jobs/common:stats_util", - ], -) - -# ======================== -# ADHOC JOB CONFIGURATIONS -# Note: Please change mapreduce.job.reduces and --num-reducers together. 
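# For example, the adhoc job below keeps the two values in lockstep (both "4000"):
#   args = ["--date 2021-08-31", "--num-reducers 4000"]
#   hadoop_properties = [..., ("mapreduce.job.reduces", "4000"), ...]
# If they drift apart, the reducer count Hadoop actually uses will no longer match
# the parallelism the job was tuned for.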
-# ======================== -scalding_job( - name = "interested_in_twice_largest_dim-adhoc", - main = "com.twitter.simclusters_v2.scalding.embedding.twice.InterestedInTwiceLargestDimAdhocApp", - args = [ - "--date 2021-08-31", - "--num-reducers 4000", - ], - config = [ - ("hadoop.combine-input", "true"), - ("hadoop.map.jvm.total-memory", "3072m"), - ("hadoop.reduce.jvm.total-memory", "3072m"), - ("hadoop.submitter.jvm.total-memory", "5120m"), - ("submitter.tier", "preemptible"), - ], - hadoop_cluster = "qus1-bluebird", - hadoop_properties = [ - ("mapreduce.job.reduce.slowstart.completedmaps", "1.0"), - ("scalding.with.reducers.set.explicitly", "true"), - ("mapreduce.job.reduces", "4000"), - ("mapreduce.task.timeout", "0"), - ], - platform = "java8", - role = "cassowary", - runtime_platform = "java8", - tags = [ - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":twice"], -) - -scalding_job( - name = "interested_in_twice_largest_dim_fav_score-adhoc", - main = "com.twitter.simclusters_v2.scalding.embedding.twice.InterestedInTwiceLargestDimMaxFavScoreAdhocApp", - args = [ - "--date 2022-07-01", - "--num-reducers 4000", - ], - config = [ - ("hadoop.combine-input", "true"), - ("hadoop.map.jvm.total-memory", "3072m"), - ("hadoop.reduce.jvm.total-memory", "3072m"), - ("hadoop.submitter.jvm.total-memory", "5120m"), - ("submitter.tier", "preemptible"), - ], - hadoop_cluster = "qus1-bluebird", - hadoop_properties = [ - ("mapreduce.job.reduce.slowstart.completedmaps", "1.0"), - ("scalding.with.reducers.set.explicitly", "true"), - ("mapreduce.job.reduces", "4000"), - ("mapreduce.task.timeout", "0"), - ], - platform = "java8", - role = "cassowary", - runtime_platform = "java8", - tags = [ - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":twice"], -) - -scalding_job( - name = "interested_in_twice_louvain-adhoc", - main = "com.twitter.simclusters_v2.scalding.embedding.twice.InterestedInTwiceLouvainAdhocApp", - args = [ - "--date 2021-08-31", - "--num-reducers 4000", - "--cosine_similarity_threshold 0.5", - ], - config = [ - ("hadoop.combine-input", "true"), - ("hadoop.map.jvm.total-memory", "3072m"), - ("hadoop.reduce.jvm.total-memory", "3072m"), - ("hadoop.submitter.jvm.total-memory", "5120m"), - ("submitter.tier", "preemptible"), - ], - hadoop_cluster = "qus1-bluebird", - hadoop_properties = [ - ("mapreduce.job.reduce.slowstart.completedmaps", "1.0"), - ("scalding.with.reducers.set.explicitly", "true"), - ("mapreduce.job.reduces", "4000"), - ("mapreduce.task.timeout", "0"), - ], - platform = "java8", - role = "cassowary", - runtime_platform = "java8", - tags = [ - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":twice"], -) - -scalding_job( - name = "interested_in_twice_connected_components-adhoc", - main = "com.twitter.simclusters_v2.scalding.embedding.twice.InterestedInTwiceConnectedComponentsAdhocApp", - args = [ - "--date 2021-08-31", - "--num-reducers 4000", - "--cosine_similarity_threshold 0.5", - ], - config = [ - ("hadoop.combine-input", "true"), - ("hadoop.map.jvm.total-memory", "3072m"), - ("hadoop.reduce.jvm.total-memory", "3072m"), - ("hadoop.submitter.jvm.total-memory", "5120m"), - ("submitter.tier", "preemptible"), - ], - hadoop_cluster = "qus1-bluebird", - hadoop_properties = [ - ("mapreduce.job.reduce.slowstart.completedmaps", "1.0"), - ("scalding.with.reducers.set.explicitly", "true"), - ("mapreduce.job.reduces", "4000"), - ("mapreduce.task.timeout", "0"), - ], - platform = "java8", - role = "cassowary", - runtime_platform 
= "java8", - tags = [ - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":twice"], -) - -# ============================ -# SCHEDULED JOB CONFIGURATIONS -# Twice jobs have been descheduled -# ============================ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/twice/BUILD.docx b/src/scala/com/twitter/simclusters_v2/scalding/embedding/twice/BUILD.docx new file mode 100644 index 000000000..9ab63490d Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/embedding/twice/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/twice/InterestedInTwice.docx b/src/scala/com/twitter/simclusters_v2/scalding/embedding/twice/InterestedInTwice.docx new file mode 100644 index 000000000..5fecffd34 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/embedding/twice/InterestedInTwice.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/twice/InterestedInTwice.scala b/src/scala/com/twitter/simclusters_v2/scalding/embedding/twice/InterestedInTwice.scala deleted file mode 100644 index 5669f8bbd..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/embedding/twice/InterestedInTwice.scala +++ /dev/null @@ -1,454 +0,0 @@ -package com.twitter.simclusters_v2.scalding.embedding.twice - -import com.twitter.scalding.Args -import com.twitter.scalding.DateRange -import com.twitter.scalding.Days -import com.twitter.scalding.Duration -import com.twitter.scalding.Execution -import com.twitter.scalding.RichDate -import com.twitter.scalding.UniqueID -import com.twitter.simclusters_v2.common.SimClustersEmbedding -import com.twitter.simclusters_v2.common.clustering.ConnectedComponentsClusteringMethod -import com.twitter.simclusters_v2.common.clustering.LargestDimensionClusteringMethod -import com.twitter.simclusters_v2.common.clustering.LouvainClusteringMethod -import com.twitter.simclusters_v2.common.clustering.MedoidRepresentativeSelectionMethod -import com.twitter.simclusters_v2.common.clustering.MaxFavScoreRepresentativeSelectionMethod -import com.twitter.simclusters_v2.common.clustering.SimilarityFunctions -import com.twitter.simclusters_v2.hdfs_sources.ClustersMembersConnectedComponentsApeSimilarityScalaDataset -import com.twitter.simclusters_v2.hdfs_sources.ClustersMembersLargestDimApeSimilarity2DayUpdateScalaDataset -import com.twitter.simclusters_v2.hdfs_sources.ClustersMembersLargestDimApeSimilarityScalaDataset -import com.twitter.simclusters_v2.hdfs_sources.ClustersMembersLouvainApeSimilarityScalaDataset -import com.twitter.simclusters_v2.hdfs_sources.InterestedInTwiceByLargestDim2DayUpdateScalaDataset -import com.twitter.simclusters_v2.hdfs_sources.InterestedInTwiceByLargestDimScalaDataset -import com.twitter.simclusters_v2.hdfs_sources.InterestedInTwiceByLargestDimFavScoreScalaDataset -import com.twitter.simclusters_v2.hdfs_sources.InterestedInTwiceConnectedComponentsScalaDataset -import com.twitter.simclusters_v2.hdfs_sources.InterestedInTwiceLouvainScalaDataset -import com.twitter.simclusters_v2.scalding.embedding.twice.InterestedInTwiceBaseApp.ProducerEmbeddingSource -import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp -import com.twitter.wtf.scalding.jobs.common.ScheduledExecutionApp -import java.util.TimeZone - -/** - To build & deploy the TWICE scheduled jobs via workflows: - - scalding workflow upload \ - --workflow interested_in_twice-batch \ - --jobs 
src/scala/com/twitter/simclusters_v2/scalding/embedding/twice:interested_in_twice_largest_dim-batch,src/scala/com/twitter/simclusters_v2/scalding/embedding/twice:interested_in_twice_louvain-batch,src/scala/com/twitter/simclusters_v2/scalding/embedding/twice:interested_in_twice_connected_components-batch \ - --scm-paths "src/scala/com/twitter/simclusters_v2/scalding/embedding/twice/*" \ - --autoplay \ - - -> See workflow here: https://workflows.twitter.biz/workflow/cassowary/interested_in_twice-batch - - (Use `scalding workflow upload --help` for a breakdown of the different flags) - */ - -object InterestedInTwiceLargestDimScheduledApp - extends InterestedInTwiceBaseApp[SimClustersEmbedding] - with ScheduledExecutionApp { - - override def firstTime: RichDate = RichDate("2021-09-02") - override def batchIncrement: Duration = Days(7) - - override def producerProducerSimilarityFnForClustering: ( - SimClustersEmbedding, - SimClustersEmbedding - ) => Double = - SimilarityFunctions.simClustersMatchingLargestDimension - override def producerProducerSimilarityFnForClusterRepresentative: ( - SimClustersEmbedding, - SimClustersEmbedding - ) => Double = - SimilarityFunctions.simClustersCosineSimilarity - - /** - * Top-level method of this application.
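 *
 * For intuition, the two similarity functions configured above can be sketched as
 * follows over sparse clusterId -> score maps (a simplified sketch, not the
 * verbatim SimilarityFunctions implementation):
 * {{{
 * def matchingLargestDimension(a: Map[Int, Double], b: Map[Int, Double]): Double =
 *   if (a.nonEmpty && b.nonEmpty && a.maxBy(_._2)._1 == b.maxBy(_._2)._1) 1.0 else 0.0
 *
 * def cosineSimilarity(a: Map[Int, Double], b: Map[Int, Double]): Double = {
 *   val dot = a.iterator.map { case (k, v) => v * b.getOrElse(k, 0.0) }.sum
 *   val norms = math.sqrt(a.values.map(v => v * v).sum) *
 *     math.sqrt(b.values.map(v => v * v).sum)
 *   if (norms == 0.0) 0.0 else dot / norms
 * }
 * }}}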
- */ - def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueId: UniqueID - ): Execution[Unit] = { - - runScheduledApp( - new LargestDimensionClusteringMethod(), - new MaxFavScoreRepresentativeSelectionMethod[SimClustersEmbedding](), - ProducerEmbeddingSource.getAggregatableProducerEmbeddings, - "interested_in_twice_by_largest_dim_fav_score", - "clusters_members_largest_dim_ape_similarity", - InterestedInTwiceByLargestDimFavScoreScalaDataset, - ClustersMembersLargestDimApeSimilarityScalaDataset, - args.getOrElse("num-reducers", "4000").toInt - ) - - } - -} - -object InterestedInTwiceLouvainScheduledApp - extends InterestedInTwiceBaseApp[SimClustersEmbedding] - with ScheduledExecutionApp { - - override def firstTime: RichDate = RichDate("2021-09-02") - override def batchIncrement: Duration = Days(7) - - override def producerProducerSimilarityFnForClustering: ( - SimClustersEmbedding, - SimClustersEmbedding - ) => Double = - SimilarityFunctions.simClustersCosineSimilarity - override def producerProducerSimilarityFnForClusterRepresentative: ( - SimClustersEmbedding, - SimClustersEmbedding - ) => Double = - SimilarityFunctions.simClustersCosineSimilarity - - /** - * Top-level method of this application. - */ - def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueId: UniqueID - ): Execution[Unit] = { - - runScheduledApp( - new LouvainClusteringMethod( - args.required("cosine_similarity_threshold").toDouble, - args.optional("resolution_factor").map(_.toDouble)), - new MedoidRepresentativeSelectionMethod[SimClustersEmbedding]( - producerProducerSimilarityFnForClusterRepresentative), - ProducerEmbeddingSource.getAggregatableProducerEmbeddings, - "interested_in_twice_louvain", - "clusters_members_louvain_ape_similarity", - InterestedInTwiceLouvainScalaDataset, - ClustersMembersLouvainApeSimilarityScalaDataset, - args.getOrElse("num-reducers", "4000").toInt - ) - - } - -} - -object InterestedInTwiceConnectedComponentsScheduledApp - extends InterestedInTwiceBaseApp[SimClustersEmbedding] - with ScheduledExecutionApp { - - override def firstTime: RichDate = RichDate("2021-09-02") - override def batchIncrement: Duration = Days(7) - override def producerProducerSimilarityFnForClustering: ( - SimClustersEmbedding, - SimClustersEmbedding - ) => Double = - SimilarityFunctions.simClustersCosineSimilarity - override def producerProducerSimilarityFnForClusterRepresentative: ( - SimClustersEmbedding, - SimClustersEmbedding - ) => Double = - SimilarityFunctions.simClustersCosineSimilarity - - /** - * Top-level method of this application. - */ - def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueId: UniqueID - ): Execution[Unit] = { - - runScheduledApp( - new ConnectedComponentsClusteringMethod( - args.required("cosine_similarity_threshold").toDouble), - new MedoidRepresentativeSelectionMethod[SimClustersEmbedding]( - producerProducerSimilarityFnForClusterRepresentative), - ProducerEmbeddingSource.getAggregatableProducerEmbeddings, - "interested_in_twice_connected_components", - "clusters_members_connected_components_ape_similarity", - InterestedInTwiceConnectedComponentsScalaDataset, - ClustersMembersConnectedComponentsApeSimilarityScalaDataset, - args.getOrElse("num-reducers", "4000").toInt - ) - - } - -} - -/** Production Scalding job that calculates TWICE embeddings in a shorter period (every two days). 
- * - * Given that the input sources of TWICE are updated more frequently (e.g., user_user_graph is - * updated every 2 days), updating the TWICE embeddings every 2 days will better capture the interests of new - * users and the interest shift of existing users. - * - * To build & deploy the scheduled job via workflows: - * {{{ - * scalding workflow upload \ - * --workflow interested_in_twice_2_day_update-batch \ - * --jobs src/scala/com/twitter/simclusters_v2/scalding/embedding/twice:interested_in_twice_largest_dim_2_day_update-batch \ - * --scm-paths "src/scala/com/twitter/simclusters_v2/scalding/embedding/twice/*" \ - * --autoplay - * }}} - * - */ -object InterestedInTwiceLargestDim2DayUpdateScheduledApp - extends InterestedInTwiceBaseApp[SimClustersEmbedding] - with ScheduledExecutionApp { - - override def firstTime: RichDate = RichDate("2022-04-06") - override def batchIncrement: Duration = Days(2) - - override def producerProducerSimilarityFnForClustering: ( - SimClustersEmbedding, - SimClustersEmbedding - ) => Double = - SimilarityFunctions.simClustersMatchingLargestDimension - override def producerProducerSimilarityFnForClusterRepresentative: ( - SimClustersEmbedding, - SimClustersEmbedding - ) => Double = - SimilarityFunctions.simClustersCosineSimilarity - - /** - * Top-level method of this application. - */ - def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueId: UniqueID - ): Execution[Unit] = { - - runScheduledApp( - new LargestDimensionClusteringMethod(), - new MedoidRepresentativeSelectionMethod[SimClustersEmbedding]( - producerProducerSimilarityFnForClusterRepresentative), - ProducerEmbeddingSource.getAggregatableProducerEmbeddings, - "interested_in_twice_by_largest_dim_2_day_update", - "clusters_members_largest_dim_ape_similarity_2_day_update", - InterestedInTwiceByLargestDim2DayUpdateScalaDataset, - ClustersMembersLargestDimApeSimilarity2DayUpdateScalaDataset, - args.getOrElse("num-reducers", "4000").toInt - ) - } -} - -/** - -[Preferred way] To run a locally built adhoc job: - ./bazel bundle src/scala/com/twitter/simclusters_v2/scalding/embedding/twice:interested_in_twice_-adhoc - scalding remote run --target src/scala/com/twitter/simclusters_v2/scalding/embedding/twice:interested_in_twice_-adhoc - -To build and run an adhoc job with workflows: - scalding workflow upload \ - --workflow interested_in_twice-adhoc \ - --jobs src/scala/com/twitter/simclusters_v2/scalding/embedding/twice:interested_in_twice_largest_dim-adhoc,src/scala/com/twitter/simclusters_v2/scalding/embedding/twice:interested_in_twice_louvain-adhoc,src/scala/com/twitter/simclusters_v2/scalding/embedding/twice:interested_in_twice_connected_components-adhoc \ - --scm-paths "src/scala/com/twitter/simclusters_v2/scalding/embedding/twice/*" \ - --autoplay \ - - */ -object InterestedInTwiceLargestDimAdhocApp - extends InterestedInTwiceBaseApp[SimClustersEmbedding] - with AdhocExecutionApp { - - override def producerProducerSimilarityFnForClustering: ( - SimClustersEmbedding, - SimClustersEmbedding - ) => Double = - SimilarityFunctions.simClustersMatchingLargestDimension - override def producerProducerSimilarityFnForClusterRepresentative: ( - SimClustersEmbedding, - SimClustersEmbedding - ) => Double = - SimilarityFunctions.simClustersCosineSimilarity - - /** - * Top-level method of this application.
- */ - def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueId: UniqueID - ): Execution[Unit] = { - - runAdhocApp( - new LargestDimensionClusteringMethod(), - new MedoidRepresentativeSelectionMethod[SimClustersEmbedding]( - producerProducerSimilarityFnForClusterRepresentative), - ProducerEmbeddingSource.getAggregatableProducerEmbeddings, - "interested_in_twice_by_largest_dim", - "clusters_members_largest_dim_ape_similarity", - args.getOrElse("num-reducers", "4000").toInt - ) - - } -} - -object InterestedInTwiceLargestDimMaxFavScoreAdhocApp - extends InterestedInTwiceBaseApp[SimClustersEmbedding] - with AdhocExecutionApp { - - override def producerProducerSimilarityFnForClustering: ( - SimClustersEmbedding, - SimClustersEmbedding - ) => Double = - SimilarityFunctions.simClustersMatchingLargestDimension - override def producerProducerSimilarityFnForClusterRepresentative: ( - SimClustersEmbedding, - SimClustersEmbedding - ) => Double = - SimilarityFunctions.simClustersCosineSimilarity - - /** - * Top-level method of this application. - */ - def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueId: UniqueID - ): Execution[Unit] = { - - runAdhocApp( - new LargestDimensionClusteringMethod(), - new MaxFavScoreRepresentativeSelectionMethod[SimClustersEmbedding](), - ProducerEmbeddingSource.getAggregatableProducerEmbeddings, - "interested_in_twice_by_largest_dim_fav_score", - "clusters_members_largest_dim_ape_similarity", - args.getOrElse("num-reducers", "4000").toInt - ) - - } -} - -object InterestedInTwiceLouvainAdhocApp - extends InterestedInTwiceBaseApp[SimClustersEmbedding] - with AdhocExecutionApp { - - override def producerProducerSimilarityFnForClustering: ( - SimClustersEmbedding, - SimClustersEmbedding - ) => Double = - SimilarityFunctions.simClustersCosineSimilarity - override def producerProducerSimilarityFnForClusterRepresentative: ( - SimClustersEmbedding, - SimClustersEmbedding - ) => Double = - SimilarityFunctions.simClustersCosineSimilarity - - /** - * Top-level method of this application. - */ - def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueId: UniqueID - ): Execution[Unit] = { - - runAdhocApp( - new LouvainClusteringMethod( - args.required("cosine_similarity_threshold").toDouble, - args.optional("resolution_factor").map(_.toDouble)), - new MedoidRepresentativeSelectionMethod[SimClustersEmbedding]( - producerProducerSimilarityFnForClusterRepresentative), - ProducerEmbeddingSource.getAggregatableProducerEmbeddings, - "interested_in_twice_louvain", - "clusters_members_louvain_ape_similarity", - args.getOrElse("num-reducers", "4000").toInt - ) - - } -} - -object InterestedInTwiceConnectedComponentsAdhocApp - extends InterestedInTwiceBaseApp[SimClustersEmbedding] - with AdhocExecutionApp { - - override def producerProducerSimilarityFnForClustering: ( - SimClustersEmbedding, - SimClustersEmbedding - ) => Double = - SimilarityFunctions.simClustersCosineSimilarity - override def producerProducerSimilarityFnForClusterRepresentative: ( - SimClustersEmbedding, - SimClustersEmbedding - ) => Double = - SimilarityFunctions.simClustersCosineSimilarity - - /** - * Top-level method of this application. 
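 *
 * For intuition, max-fav-score representative selection can be sketched as picking
 * the cluster member with the highest fav score (an assumed simplification based on
 * the method name; the field name favScoreHalfLife100Days is likewise an assumption,
 * not the verbatim MaxFavScoreRepresentativeSelectionMethod implementation):
 * {{{
 * def selectByMaxFavScore(members: Seq[NeighborWithWeights]): UserId =
 *   members.maxBy(_.favScoreHalfLife100Days.getOrElse(0.0)).neighborId
 * }}}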
- */ - def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueId: UniqueID - ): Execution[Unit] = { - - runAdhocApp( - new ConnectedComponentsClusteringMethod( - args.required("cosine_similarity_threshold").toDouble), - new MedoidRepresentativeSelectionMethod[SimClustersEmbedding]( - producerProducerSimilarityFnForClusterRepresentative), - ProducerEmbeddingSource.getAggregatableProducerEmbeddings, - "interested_in_twice_connected_components", - "clusters_members_connected_components_ape_similarity", - args.getOrElse("num-reducers", "4000").toInt - ) - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/twice/InterestedInTwiceBaseApp.docx b/src/scala/com/twitter/simclusters_v2/scalding/embedding/twice/InterestedInTwiceBaseApp.docx new file mode 100644 index 000000000..92a7a6b07 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/embedding/twice/InterestedInTwiceBaseApp.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/embedding/twice/InterestedInTwiceBaseApp.scala b/src/scala/com/twitter/simclusters_v2/scalding/embedding/twice/InterestedInTwiceBaseApp.scala deleted file mode 100644 index 585f23630..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/embedding/twice/InterestedInTwiceBaseApp.scala +++ /dev/null @@ -1,495 +0,0 @@ -package com.twitter.simclusters_v2.scalding.embedding.twice - -import com.twitter.bijection.Injection -import com.twitter.dal.client.dataset.KeyValDALDataset -import com.twitter.scalding.DateRange -import com.twitter.scalding.Days -import com.twitter.scalding.Execution -import com.twitter.scalding.Stat -import com.twitter.scalding.TypedTsv -import com.twitter.scalding.UniqueID -import com.twitter.scalding.typed.TypedPipe -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.scalding_internal.dalv2.DALWrite._ -import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossDC -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.common.SimClustersEmbedding -import com.twitter.simclusters_v2.common.UserId -import com.twitter.simclusters_v2.common.clustering.ClusteringMethod -import com.twitter.simclusters_v2.common.clustering.ClusteringStatistics._ -import com.twitter.simclusters_v2.common.clustering.ClusterRepresentativeSelectionMethod -import com.twitter.simclusters_v2.common.clustering.ClusterRepresentativeSelectionStatistics._ -import com.twitter.simclusters_v2.hdfs_sources.ProducerEmbeddingSources -import com.twitter.simclusters_v2.hdfs_sources.UserUserGraphScalaDataset -import com.twitter.simclusters_v2.scalding.common.Util -import com.twitter.simclusters_v2.scalding.embedding.common.EmbeddingUtil -import com.twitter.simclusters_v2.thriftscala.EmbeddingType -import com.twitter.simclusters_v2.thriftscala.InternalId -import com.twitter.simclusters_v2.thriftscala.ModelVersion -import com.twitter.simclusters_v2.thriftscala.MultiEmbeddingType -import com.twitter.simclusters_v2.thriftscala.NeighborWithWeights -import com.twitter.simclusters_v2.thriftscala.OrderedClustersAndMembers -import com.twitter.simclusters_v2.thriftscala.ClusterMembers -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingIdWithScore -import com.twitter.simclusters_v2.thriftscala.SimClustersMultiEmbedding -import com.twitter.simclusters_v2.thriftscala.SimClustersMultiEmbedding.Ids -import com.twitter.simclusters_v2.thriftscala.SimClustersMultiEmbeddingByIds -import 
com.twitter.simclusters_v2.thriftscala.SimClustersMultiEmbeddingId -import com.twitter.simclusters_v2.thriftscala.UserAndNeighbors -import com.twitter.simclusters_v2.thriftscala.{ - SimClustersEmbeddingId => SimClustersEmbeddingIdThrift -} -import com.twitter.util.Stopwatch -import java.util.TimeZone -import scala.util.Random.shuffle - -/** - * Base app for computing User InterestedIn multi-embedding representation. - * TWICE: Capturing users’ long-term interests using multiple SimClusters embeddings. - * This job will - * - Randomly select K follow/fav actions for each user, - * - cluster the follow/fav actions for each user, - * - for each cluster, construct a representation (e.g. average or medoid). - * - * @tparam T type of producer embedding. e.g. SimClustersEmbedding - */ -trait InterestedInTwiceBaseApp[T] { - - import InterestedInTwiceBaseApp._ - - def modelVersion: ModelVersion = ModelVersion.Model20m145k2020 - - /** - * Function to output a similarity score (>= 0; the larger, the more similar), given two producer embeddings. - */ - def producerProducerSimilarityFnForClustering: (T, T) => Double - def producerProducerSimilarityFnForClusterRepresentative: (T, T) => Double - - // Sort clusters by decreasing size, fall back to entity ID to break ties - val clusterOrdering: Ordering[Set[Long]] = math.Ordering.by(c => (-c.size, c.min)) - - /** - * Read user-user graph. - */ - def getUserUserGraph( - implicit dateRange: DateRange, - timeZone: TimeZone - ): TypedPipe[UserAndNeighbors] = { - DAL - .readMostRecentSnapshot( - UserUserGraphScalaDataset - ) - .withRemoteReadPolicy(AllowCrossDC) - .toTypedPipe - } - - /** - * Randomly select up to maxNeighborsByUser neighbors for each user. - * Attempts to equally sample both follow and fav edges (e.g. maxNeighborsByUser/2 for each). - * However, if one type of edge is insufficient, backfill with the other type up to maxNeighborsByUser neighbors. - * @param userUserGraph User-User follow/fav graph. - * @param maxNeighborsByUser How many neighbors to keep for each user.
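 *
 * For example, a minimal sketch of the interleave-and-backfill selection with
 * hypothetical ids (the real method operates on NeighborWithWeights):
 * {{{
 * val follows = Seq("f1", "f2", "f3").map(Option(_))
 * val favs = Seq("v1").map(Option(_))
 * // zipAll pads the shorter side with None, so flattening interleaves the two
 * // edge types and backfills from the longer side once one runs out:
 * follows.zipAll(favs, None, None).flatMap { case (f, v) => Seq(f, v) }.flatten.take(3)
 * // == Seq("f1", "v1", "f2")
 * }}}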
- */ - def selectMaxProducersPerUser( - userUserGraph: TypedPipe[UserAndNeighbors], - maxNeighborsByUser: Int = MaxNeighborsByUser - )( - implicit uniqueID: UniqueID - ): TypedPipe[UserAndNeighbors] = { - - val numOfFollowEdgesStat = Stat(StatNumOfFollowEdges) - val numOfFavEdgesStat = Stat(StatNumOfFavEdges) - val numOfEdgesCumulativeFrequencyBeforeFilter = Util.CumulativeStat( - StatCFNumProducersPerConsumerBeforeFilter, - StatCFNumProducersPerConsumerBeforeFilterBuckets) - - userUserGraph.map { userAndNeighbors: UserAndNeighbors => - numOfEdgesCumulativeFrequencyBeforeFilter.incForValue(userAndNeighbors.neighbors.size) - - val (followEdges, favEdges) = - userAndNeighbors.neighbors.partition(_.isFollowed.contains(true)) - val randomFollowEdges = shuffle(followEdges) - val randomFavEdges = shuffle(favEdges) - - // interleave follow and fav edges, and select top k - val interleavedTopKEdges: Seq[NeighborWithWeights] = randomFollowEdges - .map(Some(_)) - .zipAll( - randomFavEdges.map(Some(_)), - None, - None - ) // default None value when one edge Seq is shorter than another - .flatMap { - case (followEdgeOpt, favEdgeOpt) => - Seq(followEdgeOpt, favEdgeOpt) - }.flatten - .take(maxNeighborsByUser) - - // edge stats - interleavedTopKEdges - .foreach { edge => - if (edge.isFollowed.contains(true)) numOfFollowEdgesStat.inc() - else numOfFavEdgesStat.inc() - } - - userAndNeighbors.copy(neighbors = interleavedTopKEdges) - } - } - - /** - * Get multi embedding for each user: - * - For each user, join their follow / fav - based neighbors to producer embeddings, - * - Group these neighbors into clusters using the specified clusteringMethod, - * - For each cluster, select the medoid as the representation. - * - * @param userUserGraph User-User follow/fav graph. - * @param producerEmbedding producer embedding dataset. e.g. simclusters embeddings, simhash, etc. - * @param clusteringMethod A method to group embeddings together. - * @param maxClustersPerUser How many clusters to keep per user. - * @param clusterRepresentativeSelectionMethod A method to select a cluster representative. - * @param numReducers How many reducers to use for sketch operation. 
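 *
 * For intuition, medoid selection can be sketched as follows (a simplified sketch;
 * the real logic lives in MedoidRepresentativeSelectionMethod): the medoid is the
 * cluster member with the highest total similarity to all members.
 * {{{
 * def medoid[T](members: Seq[T], similarity: (T, T) => Double): T =
 *   members.maxBy(m => members.map(similarity(m, _)).sum)
 * }}}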
- */ - def getMultiEmbeddingPerUser( - userUserGraph: TypedPipe[UserAndNeighbors], - producerEmbedding: TypedPipe[(UserId, T)], - clusteringMethod: ClusteringMethod, - maxClustersPerUser: Int = MaxClustersPerUser, - clusterRepresentativeSelectionMethod: ClusterRepresentativeSelectionMethod[T], - numReducers: Int - )( - implicit uniqueID: UniqueID - ): TypedPipe[(UserId, Seq[Set[UserId]], SimClustersMultiEmbedding)] = { - - val truncatedUserUserGraph: TypedPipe[UserAndNeighbors] = selectMaxProducersPerUser( - userUserGraph) - val validEdges: TypedPipe[(UserId, NeighborWithWeights)] = - truncatedUserUserGraph.flatMap { - case UserAndNeighbors(srcId, neighborsWithWeights) => - neighborsWithWeights.map { neighborWithWeights => - ( - neighborWithWeights.neighborId, // producerId - neighborWithWeights.copy(neighborId = srcId)) - } - } - - implicit val l2b: UserId => Array[Byte] = Injection.long2BigEndian - - val totalEdgesNonEmptyProducerEmbeddingsStat = Stat(StatTotalEdgesNonEmptyProducerEmbeddings) - val userClusterPairsBeforeTruncation = Stat(StatNumUserClusterPairsBeforeTruncation) - val userClusterPairsAfterTruncation = Stat(StatNumUserClusterPairsAfterTruncation) - val numUsers = Stat(StatNumUsers) - val numOfClustersCumulativeFrequencyBeforeFilter = - Util.CumulativeStat(StatCFNumOfClustersBeforeFilter, StatCFNumOfClustersBeforeFilterBuckets) - - // map each clustering statistic to a scalding.Stat - val clusteringStatsMap: Map[String, Stat] = Map( - StatSimilarityGraphTotalBuildTime -> Stat(StatSimilarityGraphTotalBuildTime), - StatClusteringAlgorithmRunTime -> Stat(StatClusteringAlgorithmRunTime), - StatMedoidSelectionTime -> Stat(StatMedoidSelectionTime) - ) - val cosineSimilarityCumulativeFrequencyBeforeFilter = Util.CumulativeStat( - StatCFCosineSimilarityBeforeFilter, - StatCFCosineSimilarityBeforeFilterBuckets) - - val clusterRepresentativeSelectionTime = Stat(StatClusterRepresentativeSelectionTime) - - validEdges - .sketch(numReducers) - .join(producerEmbedding) - .map { - case (producerId: UserId, (srcWithWeights: NeighborWithWeights, embedding)) => - totalEdgesNonEmptyProducerEmbeddingsStat.inc() - (srcWithWeights.neighborId, (srcWithWeights.copy(neighborId = producerId), embedding)) - } - .group - .toList - .map { - case (userId: UserId, embeddings: Seq[(NeighborWithWeights, T)]) => - numUsers.inc() - val embeddingsMap: Map[Long, T] = embeddings.map { - case (n: NeighborWithWeights, e) => (n.neighborId, e) - }.toMap - val weightsMap: Map[Long, NeighborWithWeights] = embeddings.map { - case (n: NeighborWithWeights, _) => (n.neighborId, n) - }.toMap - // 1. Cluster embeddings - val clusters: Set[Set[UserId]] = - clusteringMethod - .cluster[T]( - embeddingsMap, - producerProducerSimilarityFnForClustering, - // Map.get() returns an Option, so will not throw. - // Use .foreach() to filter out potential Nones. - (name, incr) => { - clusteringStatsMap.get(name).foreach(ctr => ctr.incBy(incr)) - if (name == StatComputedSimilarityBeforeFilter) - cosineSimilarityCumulativeFrequencyBeforeFilter.incForValue(incr) - } - ) - - // 2. Sort clusters - val sortedClusters: Seq[Set[UserId]] = clusters.toSeq.sorted(clusterOrdering) - - // 3. Keep only a max number of clusters (avoid OOM) - userClusterPairsBeforeTruncation.incBy(sortedClusters.size) - numOfClustersCumulativeFrequencyBeforeFilter.incForValue(sortedClusters.size) - val truncatedClusters = sortedClusters.take(maxClustersPerUser) - userClusterPairsAfterTruncation.incBy(truncatedClusters.size) - - // 4. 
Get list of cluster representatives - val truncatedIdWithScoreList: Seq[SimClustersEmbeddingIdWithScore] = - truncatedClusters.map { members: Set[UserId] => - val clusterRepresentationSelectionElapsed = Stopwatch.start() - val medoid: UserId = clusterRepresentativeSelectionMethod.selectClusterRepresentative( - members.map(id => weightsMap(id)), - embeddingsMap) - clusterRepresentativeSelectionTime.incBy( - clusterRepresentationSelectionElapsed().inMilliseconds) - - SimClustersEmbeddingIdWithScore( - id = SimClustersEmbeddingIdThrift( - EmbeddingType.TwiceUserInterestedIn, - modelVersion, - InternalId.UserId(medoid)), - score = members.size) - } - - ( - userId, - sortedClusters, - SimClustersMultiEmbedding.Ids( - SimClustersMultiEmbeddingByIds(ids = truncatedIdWithScoreList))) - } - } - - /** - * Write the output to disk as a TypedTsv. - */ - def writeOutputToTypedTSV( - output: TypedPipe[(UserId, Seq[Set[UserId]], SimClustersMultiEmbedding)], - userToClusterRepresentativesIndexOutputPath: String, - userToClusterMembersIndexOutputPath: String - ): Execution[(Unit, Unit)] = { - - // write the user -> cluster representatives index - val writeClusterRepresentatives = output - .collect { - case (userId: Long, _, Ids(ids)) => (userId, ids.ids) - } - //.shard(partitions = 1) - .writeExecution(TypedTsv[(UserId, Seq[SimClustersEmbeddingIdWithScore])]( - userToClusterRepresentativesIndexOutputPath)) - - // write the user -> cluster members index - val writeClusterMembers = output - .collect { - case (userId: Long, clusters: Seq[Set[UserId]], _) => (userId, clusters) - } - //.shard(partitions = 1) - .writeExecution(TypedTsv[(UserId, Seq[Set[UserId]])](userToClusterMembersIndexOutputPath)) - - Execution.zip(writeClusterRepresentatives, writeClusterMembers) - - } - - /** - * Write the output to disk as a KeyValDataset. - */ - def writeOutputToKeyValDataset( - output: TypedPipe[(UserId, Seq[Set[UserId]], SimClustersMultiEmbedding)], - embeddingType: MultiEmbeddingType, - userToClusterRepresentativesIndexDataset: KeyValDALDataset[ - KeyVal[SimClustersMultiEmbeddingId, SimClustersMultiEmbedding] - ], - userToClusterMembersIndexDataset: KeyValDALDataset[KeyVal[UserId, OrderedClustersAndMembers]], - userToClusterRepresentativesIndexOutputPath: String, - userToClusterMembersIndexOutputPath: String - )( - implicit dateRange: DateRange - ): Execution[(Unit, Unit)] = { - // write the user -> cluster representatives index - val writeClusterRepresentatives = output - .map { - case (userId: UserId, _, embeddings: SimClustersMultiEmbedding) => - KeyVal( - key = SimClustersMultiEmbeddingId( - embeddingType = embeddingType, - modelVersion = modelVersion, - internalId = InternalId.UserId(userId) - ), - value = embeddings - ) - } - .writeDALVersionedKeyValExecution( - userToClusterRepresentativesIndexDataset, - D.Suffix(userToClusterRepresentativesIndexOutputPath), - ExplicitEndTime(dateRange.end) - ) - - // write the user -> cluster members index - val writeClusterMembers = output - .map { - case (userId: UserId, clusters: Seq[Set[UserId]], _) => - KeyVal( - key = userId, - value = OrderedClustersAndMembers(clusters, Some(clusters.map(ClusterMembers(_))))) - } - .writeDALVersionedKeyValExecution( - userToClusterMembersIndexDataset, - D.Suffix(userToClusterMembersIndexOutputPath), - ExplicitEndTime(dateRange.end) - ) - - Execution.zip(writeClusterRepresentatives, writeClusterMembers) - } - - /** - * Main method for scheduled jobs. 
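The representative-selection step above delegates to a ClusterRepresentativeSelectionMethod. As a rough illustration of what selecting a medoid means, a minimal standalone sketch (assuming only a pairwise similarity function; this is not the production implementation):

object MedoidSketch {
  // The medoid is the member with the highest summed similarity to all other
  // members; assumes a non-empty cluster.
  def selectMedoid[T](members: Seq[T], similarity: (T, T) => Double): T =
    members.maxBy { candidate =>
      members.iterator
        .filter(_ != candidate)
        .map(other => similarity(candidate, other))
        .sum
    }
}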
- */ - def runScheduledApp( - clusteringMethod: ClusteringMethod, - clusterRepresentativeSelectionMethod: ClusterRepresentativeSelectionMethod[T], - producerEmbedding: TypedPipe[(UserId, T)], - userToClusterRepresentativesIndexPathSuffix: String, - userToClusterMembersIndexPathSuffix: String, - userToClusterRepresentativesIndexDataset: KeyValDALDataset[ - KeyVal[SimClustersMultiEmbeddingId, SimClustersMultiEmbedding] - ], - userToClusterMembersIndexDataset: KeyValDALDataset[KeyVal[UserId, OrderedClustersAndMembers]], - numReducers: Int - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueId: UniqueID - ): Execution[Unit] = { - - val userToClusterRepresentativesIndexOutputPath: String = EmbeddingUtil.getHdfsPath( - isAdhoc = false, - isManhattanKeyVal = true, - modelVersion = modelVersion, - pathSuffix = userToClusterRepresentativesIndexPathSuffix - ) - - val userToClusterMembersIndexOutputPath: String = EmbeddingUtil.getHdfsPath( - isAdhoc = false, - isManhattanKeyVal = true, - modelVersion = modelVersion, - pathSuffix = userToClusterMembersIndexPathSuffix - ) - - val execution = Execution.withId { implicit uniqueId => - val output: TypedPipe[(UserId, Seq[Set[UserId]], SimClustersMultiEmbedding)] = - getMultiEmbeddingPerUser( - userUserGraph = getUserUserGraph(dateRange.prepend(Days(30)), implicitly), - producerEmbedding = producerEmbedding, - clusteringMethod = clusteringMethod, - clusterRepresentativeSelectionMethod = clusterRepresentativeSelectionMethod, - numReducers = numReducers - ) - - writeOutputToKeyValDataset( - output = output, - embeddingType = MultiEmbeddingType.TwiceUserInterestedIn, - userToClusterRepresentativesIndexDataset = userToClusterRepresentativesIndexDataset, - userToClusterMembersIndexDataset = userToClusterMembersIndexDataset, - userToClusterRepresentativesIndexOutputPath = userToClusterRepresentativesIndexOutputPath, - userToClusterMembersIndexOutputPath = userToClusterMembersIndexOutputPath - ) - - } - - execution.unit - } - - /** - * Main method for adhoc jobs. 
- */ - def runAdhocApp( - clusteringMethod: ClusteringMethod, - clusterRepresentativeSelectionMethod: ClusterRepresentativeSelectionMethod[T], - producerEmbedding: TypedPipe[(UserId, T)], - userToClusterRepresentativesIndexPathSuffix: String, - userToClusterMembersIndexPathSuffix: String, - numReducers: Int - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueId: UniqueID - ): Execution[Unit] = { - - val userToClusterRepresentativesIndexOutputPath: String = EmbeddingUtil.getHdfsPath( - isAdhoc = true, - isManhattanKeyVal = false, - modelVersion = modelVersion, - pathSuffix = userToClusterRepresentativesIndexPathSuffix - ) - - val userToClusterMembersIndexOutputPath: String = EmbeddingUtil.getHdfsPath( - isAdhoc = true, - isManhattanKeyVal = false, - modelVersion = modelVersion, - pathSuffix = userToClusterMembersIndexPathSuffix - ) - - val execution = Execution.withId { implicit uniqueId => - val output: TypedPipe[(UserId, Seq[Set[UserId]], SimClustersMultiEmbedding)] = - getMultiEmbeddingPerUser( - userUserGraph = getUserUserGraph(dateRange.prepend(Days(30)), implicitly), - producerEmbedding = producerEmbedding, - clusteringMethod = clusteringMethod, - clusterRepresentativeSelectionMethod = clusterRepresentativeSelectionMethod, - numReducers = numReducers - ) - - writeOutputToTypedTSV( - output, - userToClusterRepresentativesIndexOutputPath, - userToClusterMembersIndexOutputPath) - } - - execution.unit - } - -} - -object InterestedInTwiceBaseApp { - - // Statistics - val StatNumOfFollowEdges = "num_of_follow_edges" - val StatNumOfFavEdges = "num_of_fav_edges" - val StatTotalEdgesNonEmptyProducerEmbeddings = "total_edges_with_non_empty_producer_embeddings" - val StatNumUserClusterPairsBeforeTruncation = "num_user_cluster_pairs_before_truncation" - val StatNumUserClusterPairsAfterTruncation = "num_user_cluster_pairs_after_truncation" - val StatNumUsers = "num_users" - // Cumulative Frequency - val StatCFNumProducersPerConsumerBeforeFilter = "num_producers_per_consumer_cf_before_filter" - val StatCFNumProducersPerConsumerBeforeFilterBuckets: Seq[Double] = - Seq(0, 10, 20, 50, 100, 500, 1000) - val StatCFCosineSimilarityBeforeFilter = "cosine_similarity_cf_before_filter" - val StatCFCosineSimilarityBeforeFilterBuckets: Seq[Double] = - Seq(0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100) - val StatCFNumOfClustersBeforeFilter = "num_of_clusters_cf_before_filter" - val StatCFNumOfClustersBeforeFilterBuckets: Seq[Double] = - Seq(1, 3, 5, 10, 15, 20, 50, 100, 200, 300, 500) - - val MaxClustersPerUser: Int = 10 - val MaxNeighborsByUser: Int = 500 - - object ProducerEmbeddingSource { - - /** - * Read log-fav based Aggregatable Producer embeddings dataset. 
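Util.CumulativeStat is defined elsewhere in the repo and not shown in this diff; one plausible reading of the bucket lists above is a cumulative-frequency counter in which each bucket b counts observations with value <= b. A hypothetical stand-in to make those semantics concrete:

final class CumulativeCounterSketch(buckets: Seq[Double]) {
  private val counts =
    scala.collection.mutable.Map.empty[Double, Long].withDefaultValue(0L)

  // Cumulative frequency: bucket b counts observations with value <= b.
  def incForValue(value: Double): Unit =
    buckets.filter(value <= _).foreach(b => counts(b) += 1L)

  def snapshot: Map[Double, Long] = counts.toMap
}

Under that reading, a consumer with 35 producers would increment the 50, 100, 500, and 1000 buckets of num_producers_per_consumer_cf_before_filter.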
- */ - def getAggregatableProducerEmbeddings( - implicit dateRange: DateRange, - timeZone: TimeZone - ): TypedPipe[(UserId, SimClustersEmbedding)] = - ProducerEmbeddingSources - .producerEmbeddingSource( - EmbeddingType.AggregatableLogFavBasedProducer, - ModelVersion.Model20m145k2020)(dateRange.prepend(Days(30))) - .mapValues(s => SimClustersEmbedding(s)) - - } - -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/evaluation/BUILD.bazel b/src/scala/com/twitter/simclusters_v2/scalding/evaluation/BUILD.bazel deleted file mode 100644 index 7615fcf43..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/evaluation/BUILD.bazel +++ /dev/null @@ -1,72 +0,0 @@ -scala_library( - sources = ["*.scala"], - platform = "java8", - tags = [ - "bazel-compatible", - "bazel-only", - ], - dependencies = [ - "src/scala/com/twitter/simclusters_v2/scalding/common", - "src/scala/com/twitter/simclusters_v2/scalding/offline_job", - ], -) - -hadoop_binary( - name = "tweet_evaluation_dummy_candidate_adhoc", - main = "com.twitter.simclusters_v2.scalding.DummyCandidateGenerationAdhocJob", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":evaluation", - ], -) - -hadoop_binary( - name = "tweet_evaluation_timelines_reference_adhoc", - main = "com.twitter.simclusters_v2.scalding.evaluation.AdhocTimelinesDataExtraction", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":evaluation", - ], -) - -hadoop_binary( - name = "tweet_evaluation_timelines_reference_batch", - main = "com.twitter.simclusters_v2.scalding.evaluation.ScheduledTimelinesDataExtractionBatch", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":evaluation", - ], -) - -hadoop_binary( - name = "simcluster_offline_eval_adhoc", - main = "com.twitter.simclusters_v2.scalding.evaluation.SimClustersEvaluationAdhocApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":evaluation", - ], -) diff --git a/src/scala/com/twitter/simclusters_v2/scalding/evaluation/BUILD.docx b/src/scala/com/twitter/simclusters_v2/scalding/evaluation/BUILD.docx new file mode 100644 index 000000000..d795ae2a5 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/evaluation/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/evaluation/CandidateEvaluationBase.docx b/src/scala/com/twitter/simclusters_v2/scalding/evaluation/CandidateEvaluationBase.docx new file mode 100644 index 000000000..53ee41985 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/evaluation/CandidateEvaluationBase.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/evaluation/CandidateEvaluationBase.scala b/src/scala/com/twitter/simclusters_v2/scalding/evaluation/CandidateEvaluationBase.scala deleted file mode 100644 index 24195fff7..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/evaluation/CandidateEvaluationBase.scala +++ /dev/null @@ -1,163 +0,0 @@ -package com.twitter.simclusters_v2.scalding.evaluation - -import com.twitter.core_workflows.user_model.thriftscala.CondensedUserState -import com.twitter.core_workflows.user_model.thriftscala.UserState -import 
com.twitter.pluck.source.core_workflows.user_model.CondensedUserStateScalaDataset
-import com.twitter.scalding._
-import com.twitter.scalding.source.TypedText
-import com.twitter.scalding_internal.dalv2.DAL
-import com.twitter.scalding_internal.job.TwitterExecutionApp
-import com.twitter.simclusters_v2.thriftscala.CandidateTweets
-import com.twitter.simclusters_v2.thriftscala.ReferenceTweets
-import scala.util.Random
-
-/**
- * Helper functions to provide user samples by sampling across user states.
- */
-object UserStateUserSampler {
-  def getSampleUsersByUserState(
-    userStateSource: TypedPipe[CondensedUserState],
-    validStates: Seq[UserState],
-    samplePercentage: Double
-  ): TypedPipe[(UserState, Long)] = {
-    assert(samplePercentage >= 0 && samplePercentage <= 1)
-    val validStateSet = validStates.toSet
-
-    userStateSource
-      .collect {
-        case data if data.userState.isDefined && validStateSet.contains(data.userState.get) =>
-          (data.userState.get, data.uid)
-      }
-      .filter(_ => Random.nextDouble() <= samplePercentage)
-      .forceToDisk
-  }
-
-  /**
-   * Given a list of strings corresponding to user states, convert them to the UserState type.
-   * If the input is empty, default to returning all available user states.
-   */
-  def parseUserStates(strStates: Seq[String]): Seq[UserState] = {
-    if (strStates.isEmpty) {
-      UserState.list
-    } else {
-      strStates.map { str =>
-        UserState
-          .valueOf(str).getOrElse(
-            throw new IllegalArgumentException(
-              s"Input user_states $str is invalid. Valid states are: " + UserState.list
-            )
-          )
-      }
-    }
-  }
-}
-
-/**
- * A variation of the evaluation base where target users are sampled by user states.
- * For each user state of interest (e.g. HEAVY_TWEETER), we run a separate evaluation call, and
- * output the evaluation results per user state. This is helpful when we want to horizontally
- * compare how users in different user states respond to the candidate tweets.
- */
-trait UserStateBasedEvaluationExecutionBase
-    extends CandidateEvaluationBase
-    with TwitterExecutionApp {
-
-  def referenceTweets: TypedPipe[ReferenceTweets]
-  def candidateTweets: TypedPipe[CandidateTweets]
-
-  override def job: Execution[Unit] = {
-    Execution.withId { implicit uniqueId =>
-      Execution.withArgs { args =>
-        implicit val dateRange: DateRange =
-          DateRange.parse(args.list("date"))(DateOps.UTC, DateParser.default)
-
-        val outputRootDir = args("outputDir")
-        val userStates: Seq[UserState] =
-          UserStateUserSampler.parseUserStates(args.list("user_states"))
-        val sampleRate = args.double("sample_rate")
-
-        // For each user state we are interested in, run separate executions and write
-        // the output into individual subdirectories
-        val userStateSource = DAL.read(CondensedUserStateScalaDataset).toTypedPipe
-        val userIdsByState =
-          UserStateUserSampler.getSampleUsersByUserState(userStateSource, userStates, sampleRate)
-        val executionsPerUserState = userStates.map { userState =>
-          val sampleUsers = userIdsByState.collect { case data if data._1 == userState => data._2 }
-          val outputPath = outputRootDir + "/" + userState + "/"
-
-          super
-            .runSampledEvaluation(sampleUsers, referenceTweets, candidateTweets)
-            .writeExecution(TypedText.csv(outputPath))
-        }
-        // Run the evaluation for each user state in parallel
-        Execution.sequence(executionsPerUserState).unit
-      }
-    }
-  }
-}
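A concrete app then only needs to supply the two abstract pipes, plus the evaluateResults metric inherited from CandidateEvaluationBase below. A hypothetical skeleton (the sources and the metric are placeholders, not real datasets):

object ExampleUserStateEvalApp extends UserStateBasedEvaluationExecutionBase {
  // Placeholder in-memory sources; a real app would read DAL datasets here.
  override def referenceTweets: TypedPipe[ReferenceTweets] =
    TypedPipe.from(Seq.empty[ReferenceTweets])
  override def candidateTweets: TypedPipe[CandidateTweets] =
    TypedPipe.from(Seq.empty[CandidateTweets])

  // Trivial stand-in metric: emit one row per candidate target user.
  override def evaluateResults(
    sampledReference: TypedPipe[ReferenceTweets],
    sampledCandidate: TypedPipe[CandidateTweets]
  ): TypedPipe[String] =
    sampledCandidate.map(_.targetUserId.toString)
}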
-
-/**
- * A basic flow for evaluating the quality of a set of candidate tweets, typically generated by an
- * algorithm (e.g. SimClusters), by comparing its engagement rates against a set of reference
- * tweets. The job goes through the following steps:
- *   1. Generate a group of target users on which we measure tweet engagements
- *   2. Collect tweets impressed by these users and their engagements on tweets from a labeled
- *      tweet source (e.g. Home Timeline engagement data), and form a reference set
- *   3. For each candidate tweet, collect the engagement rates from the reference set
- *   4. Run evaluation calculations (e.g. percentage of intersection, engagement rate, etc.)
- *
- * Each subclass is expected to provide 3 data sources: the sample users, the candidate tweet
- * source, and the reference tweet source.
- */
-trait CandidateEvaluationBase {
-  private def getSampledReferenceTweets(
-    referenceTweetEngagements: TypedPipe[ReferenceTweets],
-    sampleUsers: TypedPipe[Long]
-  ): TypedPipe[ReferenceTweets] = {
-    referenceTweetEngagements
-      .groupBy(_.targetUserId)
-      .join(sampleUsers.asKeys)
-      .map { case (targetUserId, (referenceEngagements, _)) => referenceEngagements }
-  }
-
-  private def getSampledCandidateTweets(
-    candidateTweets: TypedPipe[CandidateTweets],
-    sampleUsers: TypedPipe[Long]
-  ): TypedPipe[CandidateTweets] = {
-    candidateTweets
-      .groupBy(_.targetUserId)
-      .join(sampleUsers.asKeys)
-      .map { case (_, (tweets, _)) => tweets }
-  }
-
-  /**
-   * Evaluation function; should be overridden by implementing subclasses to suit individual
-   * objectives, such as Like engagement rate, CTR, etc.
-   * @param sampledReference
-   * @param sampledCandidate
-   */
-  def evaluateResults(
-    sampledReference: TypedPipe[ReferenceTweets],
-    sampledCandidate: TypedPipe[CandidateTweets]
-  ): TypedPipe[String]
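The two private helpers above implement a semijoin: groupBy(_.targetUserId).join(sampleUsers.asKeys) keeps exactly the records whose target user was sampled. An in-memory analogue of that idiom (names are illustrative):

object SemijoinSketch {
  def filterBySample[R](records: Seq[R], userOf: R => Long, sampleUsers: Set[Long]): Seq[R] =
    records.filter(record => sampleUsers.contains(userOf(record)))

  def main(args: Array[String]): Unit = {
    val records = Seq((1L, "a"), (2L, "b"), (3L, "c"))
    println(filterBySample[(Long, String)](records, _._1, Set(1L, 3L)))
    // => List((1,a), (3,c))
  }
}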
-
-  /**
-   * Given a list of target users, the reference tweet set, and the candidate tweet set,
-   * calculate the engagement rates on the reference set and the candidate set by these users.
-   * The evaluation results should be converted into an itemized format for these users.
-   * @param referenceTweets
-   * @param candidateTweets
-   * @return
-   */
-  def runSampledEvaluation(
-    targetUserSamples: TypedPipe[Long],
-    referenceTweets: TypedPipe[ReferenceTweets],
-    candidateTweets: TypedPipe[CandidateTweets]
-  ): TypedPipe[String] = {
-    val sampledCandidate = getSampledCandidateTweets(candidateTweets, targetUserSamples)
-    val referencePerUser = getSampledReferenceTweets(referenceTweets, targetUserSamples)
-
-    evaluateResults(referencePerUser, sampledCandidate)
-  }
-}
diff --git a/src/scala/com/twitter/simclusters_v2/scalding/evaluation/EvaluationMetricHelper.docx b/src/scala/com/twitter/simclusters_v2/scalding/evaluation/EvaluationMetricHelper.docx
new file mode 100644
index 000000000..3d4ed8532
Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/evaluation/EvaluationMetricHelper.docx differ
diff --git a/src/scala/com/twitter/simclusters_v2/scalding/evaluation/EvaluationMetricHelper.scala b/src/scala/com/twitter/simclusters_v2/scalding/evaluation/EvaluationMetricHelper.scala
deleted file mode 100644
index 50bc36538..000000000
--- a/src/scala/com/twitter/simclusters_v2/scalding/evaluation/EvaluationMetricHelper.scala
+++ /dev/null
@@ -1,540 +0,0 @@
-package com.twitter.simclusters_v2.scalding.evaluation
-
-import com.twitter.scalding.{Execution, TypedPipe, UniqueID}
-import com.twitter.simclusters_v2.thriftscala.{
-  CandidateTweet,
-  CandidateTweets,
-  ReferenceTweet,
-  ReferenceTweets,
-  TweetLabels
-}
-import com.twitter.algebird.Aggregator.size
-import com.twitter.scalding.typed.{CoGrouped, ValuePipe}
-import com.twitter.util.TwitterDateFormat
-import java.util.Calendar
-
-/**
- * Statistics about the number of users who have engaged with tweets
- */
-case class UserEngagerCounts(
-  numDistinctTargetUsers: Long,
-  numDistinctLikeEngagers: Long,
-  numDistinctRetweetEngagers: Long)
-
-/**
- * Tweet-side statistics, e.g. number of tweets, authors, etc.
- */
-case class TweetStats(
-  numTweets: Long,
-  numDistinctTweets: Long,
-  numDistinctAuthors: Option[Long],
-  avgScore: Option[Double])
-
-/**
- * Helper data container class for storing engagement counts
- */
-case class TweetEngagementCounts(like: Long, retweet: Long, click: Long, hasEngagement: Long)
-
-/**
- * Helper data container class for storing engagement rates
- */
-case class TweetEngagementRates(like: Double, retweet: Double, click: Double, hasEngagement: Double)
-
-case class LabelCorrelations(
-  pearsonCoefficientForLikes: Double,
-  cosineSimilarityGlobal: Double,
-  cosineSimilarityPerUserAvg: Double) {
-  private val f = java.text.NumberFormat.getInstance
-  def format(): String = {
-    Seq(
-      s"\tPearson Coefficient: ${f.format(pearsonCoefficientForLikes)}",
-      s"\tCosine similarity: ${f.format(cosineSimilarityGlobal)}",
-      s"\tAverage cosine similarity for all users: ${f.format(cosineSimilarityPerUserAvg)}"
-    ).mkString("\n")
-  }
-}
-
-/**
- * Helper tweet data container that can hold both the reference label engagements as well as the
- * recommendation algorithm's scores.
Helpful for evaluating joint data - */ -case class LabeledTweet( - targetUserId: Long, - tweetId: Long, - authorId: Long, - labels: TweetLabels, - algorithmScore: Option[Double]) - -case class LabeledTweetsResults( - tweetStats: TweetStats, - userEngagerCounts: UserEngagerCounts, - tweetEngagementCounts: TweetEngagementCounts, - tweetEngagementRates: TweetEngagementRates, - labelCorrelations: Option[LabelCorrelations] = None) { - private val f = java.text.NumberFormat.getInstance - - def format(title: String = ""): String = { - val str = Seq( - s"Number of tweets: ${f.format(tweetStats.numTweets)}", - s"Number of distinct tweets: ${f.format(tweetStats.numDistinctTweets)}", - s"Number of distinct users targeted: ${f.format(userEngagerCounts.numDistinctTargetUsers)}", - s"Number of distinct authors: ${tweetStats.numDistinctAuthors.map(f.format).getOrElse("N/A")}", - s"Average algorithm score of tweets: ${tweetStats.avgScore.map(f.format).getOrElse("N/A")}", - s"Engager counts:", - s"\tNumber of users who liked tweets: ${f.format(userEngagerCounts.numDistinctLikeEngagers)}", - s"\tNumber of users who retweeted tweets: ${f.format(userEngagerCounts.numDistinctRetweetEngagers)}", - s"Tweet engagement counts:", - s"\tNumber of Likes: ${f.format(tweetEngagementCounts.like)}", - s"\tNumber of Retweets: ${f.format(tweetEngagementCounts.retweet)}", - s"\tNumber of Clicks: ${f.format(tweetEngagementCounts.click)}", - s"\tNumber of tweets with any engagements: ${f.format(tweetEngagementCounts.hasEngagement)}", - s"Tweet engagement rates:", - s"\tRate of Likes: ${f.format(tweetEngagementRates.like * 100)}%", - s"\tRate of Retweets: ${f.format(tweetEngagementRates.retweet * 100)}%", - s"\tRate of Clicks: ${f.format(tweetEngagementRates.click * 100)}%", - s"\tRate of any engagement: ${f.format(tweetEngagementRates.hasEngagement * 100)}%" - ).mkString("\n") - - val correlations = labelCorrelations.map("\n" + _.format()).getOrElse("") - - s"$title\n$str$correlations" - } -} - -case class CandidateResults(tweetStats: TweetStats, numDistinctTargetUsers: Long) { - private val f = java.text.NumberFormat.getInstance - - def format(title: String = ""): String = { - val str = Seq( - s"Number of tweets: ${f.format(tweetStats.numTweets)}", - s"Number of distinct tweets: ${f.format(tweetStats.numDistinctTweets)}", - s"Number of distinct users targeted: ${f.format(numDistinctTargetUsers)}", - s"Number of distinct authors: ${tweetStats.numDistinctAuthors.map(f.format).getOrElse("N/A")}", - s"Average algorithm score of tweets: ${tweetStats.avgScore.map(f.format).getOrElse("N/A")}" - ).mkString("\n") - s"$title\n$str" - } -} - -/** - * Helper class for evaluating a given candidate tweet set against a reference tweet set. - * It provides aggregation evaluation metrics such as sum of engagements, rate of engagements, etc. - */ -object EvaluationMetricHelper { - private def toLong(bool: Boolean): Long = { - if (bool) 1L else 0L - } - - /** - * Core engagements are user actions that count towards core metrics, e.x. like, RT, etc - */ - private def hasCoreEngagements(labels: TweetLabels): Boolean = { - labels.isRetweeted || - labels.isLiked || - labels.isQuoted || - labels.isReplied - } - - /** - * Whether there are core engagements or click on the tweet - */ - private def hasCoreEngagementsOrClick(labels: TweetLabels): Boolean = { - hasCoreEngagements(labels) || labels.isClicked - } - - /** - * Return outer join of reference tweets and candidate tweets, keyed by (targetUserId, tweetId). 
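As the comment continuing below explains, this single outer join is reused to derive the intersection and the two one-sided views without re-joining. An in-memory sketch of that split, keyed by (targetUserId, tweetId):

object OuterJoinSplitSketch {
  type Key = (Long, Long) // (targetUserId, tweetId)

  def split[A, B](
    outer: Map[Key, (Option[A], Option[B])]
  ): (Map[Key, (A, B)], Map[Key, A], Map[Key, B]) = {
    val intersection = outer.collect { case (k, (Some(ref), Some(cand))) => k -> (ref, cand) }
    val referenceOnly = outer.collect { case (k, (Some(ref), None)) => k -> ref }
    val candidateOnly = outer.collect { case (k, (None, Some(cand))) => k -> cand }
    (intersection, referenceOnly, candidateOnly)
  }
}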
- * The output of this can then be reused to fetch the inner join / left / right join, - * without having to redo the expensive join - * - * NOTE: Assumes the uniqueness of keys (i.e. (targetId, tweetId)). Make sure to dedup tweetIds - * for each targetId, otherwise .join() will yield duplicate results. - */ - def outerJoinReferenceAndCandidate( - referencePipe: TypedPipe[ReferenceTweets], - candidatePipe: TypedPipe[CandidateTweets] - ): CoGrouped[(Long, Long), (Option[ReferenceTweet], Option[CandidateTweet])] = { - - val references = referencePipe - .flatMap { refTweets => - refTweets.impressedTweets.map { refTweet => - ((refTweets.targetUserId, refTweet.tweetId), refTweet) - } - } - - val candidates = candidatePipe - .flatMap { candTweets => - candTweets.recommendedTweets.map { candTweet => - ((candTweets.targetUserId, candTweet.tweetId), candTweet) - } - } - - references.outerJoin(candidates).withReducers(50) - } - - /** - * Convert reference tweets to labeled tweets. We do this so that we can re-use the common - * metric calculations for labeled tweets on reference tweets - */ - def getLabeledReference(referencePipe: TypedPipe[ReferenceTweets]): TypedPipe[LabeledTweet] = { - referencePipe - .flatMap { refTweets => - refTweets.impressedTweets.map { tweet => - // Reference tweets do not have scores - LabeledTweet(refTweets.targetUserId, tweet.tweetId, tweet.authorId, tweet.labels, None) - } - } - } - - def getUniqueCount[T](pipe: TypedPipe[T])(implicit ord: scala.Ordering[T]): Execution[Long] = { - pipe.distinct - .aggregate(size) - .toOptionExecution - .map(_.getOrElse(0L)) - } - - def countUniqueEngagedUsersBy( - labeledTweetsPipe: TypedPipe[LabeledTweet], - f: TweetLabels => Boolean - ): Execution[Long] = { - getUniqueCount[Long](labeledTweetsPipe.collect { case t if f(t.labels) => t.targetUserId }) - } - - def countUniqueLabeledTargetUsers(labeledTweetsPipe: TypedPipe[LabeledTweet]): Execution[Long] = { - getUniqueCount[Long](labeledTweetsPipe.map(_.targetUserId)) - } - - def countUniqueCandTargetUsers(candidatePipe: TypedPipe[CandidateTweets]): Execution[Long] = { - getUniqueCount[Long](candidatePipe.map(_.targetUserId)) - } - - def countUniqueLabeledAuthors(labeledTweetPipe: TypedPipe[LabeledTweet]): Execution[Long] = { - getUniqueCount[Long](labeledTweetPipe.map(_.authorId)) - } - - /** - * Helper function to calculate the basic engagement rates - */ - def getEngagementRate( - basicStats: TweetStats, - engagementCount: TweetEngagementCounts - ): TweetEngagementRates = { - val numTweets = basicStats.numTweets.toDouble - if (numTweets <= 0) throw new IllegalArgumentException("Invalid tweet counts") - val likeRate = engagementCount.like / numTweets - val rtRate = engagementCount.retweet / numTweets - val clickRate = engagementCount.click / numTweets - val engagementRate = engagementCount.hasEngagement / numTweets - TweetEngagementRates(likeRate, rtRate, clickRate, engagementRate) - } - - /** - * Helper function to calculate the basic stats for a pipe of candidate tweets - */ - def getTweetStatsForCandidateExec( - candidatePipe: TypedPipe[CandidateTweets] - ): Execution[TweetStats] = { - val pipe = candidatePipe.map { candTweets => - (candTweets.targetUserId, candTweets.recommendedTweets) - }.sumByKey // Dedup by targetId, in case there exists multiple entries. 
-
-    val distinctTweetPipe = pipe.flatMap(_._2.map(_.tweetId)).distinct.aggregate(size)
-
-    val otherStats = pipe
-      .map {
-        case (uid, recommendedTweets) =>
-          val scoreSum = recommendedTweets.flatMap(_.score).sum
-          (recommendedTweets.size.toLong, scoreSum)
-      }
-      .sum
-      .map {
-        case (numTweets, scoreSum) =>
-          if (numTweets <= 0) throw new IllegalArgumentException("Invalid tweet counts")
-          val avgScore = scoreSum / numTweets.toDouble
-          (numTweets, avgScore)
-      }
-    ValuePipe
-      .fold(distinctTweetPipe, otherStats) {
-        case (numDistinctTweet, (numTweets, avgScore)) =>
-          // no author-side information for candidate tweets yet
-          TweetStats(numTweets, numDistinctTweet, None, Some(avgScore))
-      }.getOrElseExecution(TweetStats(0L, 0L, None, None))
-  }
-
-  /**
-   * Helper function to count the total number of engagements
-   */
-  def getLabeledEngagementCountExec(
-    labeledTweets: TypedPipe[LabeledTweet]
-  ): Execution[TweetEngagementCounts] = {
-    labeledTweets
-      .map { labeledTweet =>
-        val like = toLong(labeledTweet.labels.isLiked)
-        val retweet = toLong(labeledTweet.labels.isRetweeted)
-        val click = toLong(labeledTweet.labels.isClicked)
-        val hasEngagement = toLong(hasCoreEngagementsOrClick(labeledTweet.labels))
-
-        (like, retweet, click, hasEngagement)
-      }
-      .sum
-      .map {
-        case (like, retweet, click, hasEngagement) =>
-          TweetEngagementCounts(like, retweet, click, hasEngagement)
-      }
-      .getOrElseExecution(TweetEngagementCounts(0L, 0L, 0L, 0L))
-  }
-
-  /**
-   * Count the total number of unique users who have engaged with tweets
-   */
-  def getTargetUserStatsForLabeledTweetsExec(
-    labeledTweetsPipe: TypedPipe[LabeledTweet]
-  ): Execution[UserEngagerCounts] = {
-    val numUniqueTargetUsersExec = countUniqueLabeledTargetUsers(labeledTweetsPipe)
-    val numUniqueLikeUsersExec =
-      countUniqueEngagedUsersBy(labeledTweetsPipe, labels => labels.isLiked)
-    val numUniqueRetweetUsersExec =
-      countUniqueEngagedUsersBy(labeledTweetsPipe, labels => labels.isRetweeted)
-
-    Execution
-      .zip(
-        numUniqueTargetUsersExec,
-        numUniqueLikeUsersExec,
-        numUniqueRetweetUsersExec
-      )
-      .map {
-        case (numTarget, like, retweet) =>
-          UserEngagerCounts(
-            numDistinctTargetUsers = numTarget,
-            numDistinctLikeEngagers = like,
-            numDistinctRetweetEngagers = retweet
-          )
-      }
-  }
-
-  /**
-   * Helper function to calculate the basic stats for a pipe of labeled tweets.
-   */
-  def getTweetStatsForLabeledTweetsExec(
-    labeledTweetPipe: TypedPipe[LabeledTweet]
-  ): Execution[TweetStats] = {
-    val uniqueAuthorsExec = countUniqueLabeledAuthors(labeledTweetPipe)
-
-    val uniqueTweetExec =
-      labeledTweetPipe.map(_.tweetId).distinct.aggregate(size).getOrElseExecution(0L)
-    val scoresExec = labeledTweetPipe
-      .map { t => (t.targetUserId, (1, t.algorithmScore.getOrElse(0.0))) }
-      .sumByKey // Dedup by targetId, in case multiple entries exist.
-      .map {
-        case (uid, (c1, c2)) =>
-          (c1.toLong, c2)
-      }
-      .sum
-      .map {
-        case (numTweets, scoreSum) =>
-          if (numTweets <= 0) throw new IllegalArgumentException("Invalid tweet counts")
-          val avgScore = scoreSum / numTweets.toDouble
-          (numTweets, Option(avgScore))
-      }
-      .getOrElseExecution((0L, None))
-
-    Execution
-      .zip(uniqueAuthorsExec, uniqueTweetExec, scoresExec)
-      .map {
-        case (numDistinctAuthors, numUniqueTweets, (numTweets, avgScores)) =>
-          TweetStats(numTweets, numUniqueTweets, Some(numDistinctAuthors), avgScores)
-      }
-  }
-
-  /**
-   * Print an update message to stdout when a step is done.
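The count executions above rely on scalding's tuple semigroup: (like, retweet, click, hasEngagement) indicator tuples are summed element-wise by .sum. A plain-Scala analogue of that fold:

object EngagementCountFoldSketch {
  def sumCounts(rows: Seq[(Long, Long, Long, Long)]): (Long, Long, Long, Long) =
    rows.foldLeft((0L, 0L, 0L, 0L)) {
      case ((likes, rts, clicks, engaged), (l, r, c, e)) =>
        (likes + l, rts + r, clicks + c, engaged + e)
    }
}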
- */ - private def printOnCompleteMsg(stepDescription: String, startTimeMillis: Long): Unit = { - val formatDate = TwitterDateFormat("yyyy-MM-dd hh:mm:ss") - val now = Calendar.getInstance().getTime - - val secondsSpent = (now.getTime - startTimeMillis) / 1000 - println( - s"- ${formatDate.format(now)}\tStep complete: $stepDescription\t " + - s"Time spent: ${secondsSpent / 60}m${secondsSpent % 60}s" - ) - } - - /** - * Calculate the metrics of a pipe of [[CandidateTweets]] - */ - private def getEvaluationResultsForCandidates( - candidatePipe: TypedPipe[CandidateTweets] - ): Execution[CandidateResults] = { - val tweetStatsExec = getTweetStatsForCandidateExec(candidatePipe) - val numDistinctTargetUsersExec = countUniqueCandTargetUsers(candidatePipe) - - Execution - .zip(tweetStatsExec, numDistinctTargetUsersExec) - .map { - case (tweetStats, numDistinctTargetUsers) => - CandidateResults(tweetStats, numDistinctTargetUsers) - } - } - - /** - * Calculate the metrics of a pipe of [[LabeledTweet]] - */ - private def getEvaluationResultsForLabeledTweets( - labeledTweetPipe: TypedPipe[LabeledTweet], - getLabelCorrelations: Boolean = false - ): Execution[LabeledTweetsResults] = { - val tweetStatsExec = getTweetStatsForLabeledTweetsExec(labeledTweetPipe) - val userStatsExec = getTargetUserStatsForLabeledTweetsExec(labeledTweetPipe) - val engagementCountExec = getLabeledEngagementCountExec(labeledTweetPipe) - - val correlationsExec = if (getLabelCorrelations) { - Execution - .zip( - LabelCorrelationsHelper.pearsonCoefficientForLike(labeledTweetPipe), - LabelCorrelationsHelper.cosineSimilarityForLike(labeledTweetPipe), - LabelCorrelationsHelper.cosineSimilarityForLikePerUser(labeledTweetPipe) - ).map { - case (pearsonCoeff, globalCos, avgCos) => - Some(LabelCorrelations(pearsonCoeff, globalCos, avgCos)) - } - } else { - ValuePipe(None).getOrElseExecution(None) // Empty pipe with a None value - } - - Execution - .zip(tweetStatsExec, engagementCountExec, userStatsExec, correlationsExec) - .map { - case (tweetStats, engagementCount, engagerCount, correlationsOpt) => - val engagementRate = getEngagementRate(tweetStats, engagementCount) - LabeledTweetsResults( - tweetStats, - engagerCount, - engagementCount, - engagementRate, - correlationsOpt) - } - } - - private def runAllEvalForCandidates( - candidatePipe: TypedPipe[CandidateTweets], - outerJoinPipe: TypedPipe[((Long, Long), (Option[ReferenceTweet], Option[CandidateTweet]))] - ): Execution[(CandidateResults, CandidateResults)] = { - val t0 = System.currentTimeMillis() - - val candidateNotInIntersectionPipe = - outerJoinPipe - .collect { - case ((targetUserId, _), (None, Some(candTweet))) => (targetUserId, Seq(candTweet)) - } - .sumByKey - .map { case (targetUserId, candTweets) => CandidateTweets(targetUserId, candTweets) } - .forceToDisk - - Execution - .zip( - getEvaluationResultsForCandidates(candidatePipe), - getEvaluationResultsForCandidates(candidateNotInIntersectionPipe) - ).onComplete(_ => printOnCompleteMsg("runAllEvalForCandidates()", t0)) - } - - private def runAllEvalForIntersection( - outerJoinPipe: TypedPipe[((Long, Long), (Option[ReferenceTweet], Option[CandidateTweet]))] - )( - implicit uniqueID: UniqueID - ): Execution[(LabeledTweetsResults, LabeledTweetsResults, LabeledTweetsResults)] = { - val t0 = System.currentTimeMillis() - val intersectionTweetsPipe = outerJoinPipe.collect { - case ((targetUserId, tweetId), (Some(refTweet), Some(candTweet))) => - LabeledTweet(targetUserId, tweetId, refTweet.authorId, refTweet.labels, 
candTweet.score) - }.forceToDisk - - val likedTweetsPipe = intersectionTweetsPipe.filter(_.labels.isLiked) - val notLikedTweetsPipe = intersectionTweetsPipe.filter(!_.labels.isLiked) - - Execution - .zip( - getEvaluationResultsForLabeledTweets(intersectionTweetsPipe, getLabelCorrelations = true), - getEvaluationResultsForLabeledTweets(likedTweetsPipe), - getEvaluationResultsForLabeledTweets(notLikedTweetsPipe) - ).onComplete(_ => printOnCompleteMsg("runAllEvalForIntersection()", t0)) - } - - private def runAllEvalForReferences( - referencePipe: TypedPipe[ReferenceTweets], - outerJoinPipe: TypedPipe[((Long, Long), (Option[ReferenceTweet], Option[CandidateTweet]))] - ): Execution[(LabeledTweetsResults, LabeledTweetsResults)] = { - val t0 = System.currentTimeMillis() - val labeledReferenceNotInIntersectionPipe = - outerJoinPipe.collect { - case ((targetUserId, _), (Some(refTweet), None)) => - LabeledTweet(targetUserId, refTweet.tweetId, refTweet.authorId, refTweet.labels, None) - }.forceToDisk - - Execution - .zip( - getEvaluationResultsForLabeledTweets(getLabeledReference(referencePipe)), - getEvaluationResultsForLabeledTweets(labeledReferenceNotInIntersectionPipe) - ).onComplete(_ => printOnCompleteMsg("runAllEvalForReferences()", t0)) - } - - def runAllEvaluations( - referencePipe: TypedPipe[ReferenceTweets], - candidatePipe: TypedPipe[CandidateTweets] - )( - implicit uniqueID: UniqueID - ): Execution[String] = { - val t0 = System.currentTimeMillis() - - // Force everything to disk to maximize data re-use - Execution - .zip( - referencePipe.forceToDiskExecution, - candidatePipe.forceToDiskExecution - ).flatMap { - case (referenceDiskPipe, candidateDiskPipe) => - outerJoinReferenceAndCandidate(referenceDiskPipe, candidateDiskPipe).forceToDiskExecution - .flatMap { outerJoinPipe => - val referenceResultsExec = runAllEvalForReferences(referenceDiskPipe, outerJoinPipe) - val intersectionResultsExec = runAllEvalForIntersection(outerJoinPipe) - val candidateResultsExec = runAllEvalForCandidates(candidateDiskPipe, outerJoinPipe) - - Execution - .zip( - referenceResultsExec, - intersectionResultsExec, - candidateResultsExec - ).map { - case ( - (allReference, referenceNotInIntersection), - (allIntersection, intersectionLiked, intersectionNotLiked), - (allCandidate, candidateNotInIntersection)) => - val timeSpent = (System.currentTimeMillis() - t0) / 1000 - val resultStr = Seq( - "===================================================", - s"Evaluation complete. Took ${timeSpent / 60}m${timeSpent % 60}s ", - allReference.format("-----Metrics for all Reference Tweets-----"), - referenceNotInIntersection.format( - "-----Metrics for Reference Tweets that are not in the intersection-----" - ), - allIntersection.format("-----Metrics for all Intersection Tweets-----"), - intersectionLiked.format("-----Metrics for Liked Intersection Tweets-----"), - intersectionNotLiked.format( - "-----Metrics for not Liked Intersection Tweets-----"), - allCandidate.format("-----Metrics for all Candidate Tweets-----"), - candidateNotInIntersection.format( - "-----Metrics for Candidate Tweets that are not in the intersection-----" - ), - "===================================================\n" - ).mkString("\n") - println(resultStr) - resultStr - } - .onComplete(_ => - printOnCompleteMsg( - "Evaluation complete. 
Check stdout or output logs for results.", - t0)) - } - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/evaluation/EvaluationReferenceDataExtraction.docx b/src/scala/com/twitter/simclusters_v2/scalding/evaluation/EvaluationReferenceDataExtraction.docx new file mode 100644 index 000000000..b92c9151c Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/evaluation/EvaluationReferenceDataExtraction.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/evaluation/EvaluationReferenceDataExtraction.scala b/src/scala/com/twitter/simclusters_v2/scalding/evaluation/EvaluationReferenceDataExtraction.scala deleted file mode 100644 index 47bcabe1b..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/evaluation/EvaluationReferenceDataExtraction.scala +++ /dev/null @@ -1,270 +0,0 @@ -package com.twitter.simclusters_v2.scalding.evaluation - -import com.twitter.ml.api.constant.SharedFeatures.AUTHOR_ID -import com.twitter.ml.api.constant.SharedFeatures.TIMESTAMP -import com.twitter.ml.api.constant.SharedFeatures.TWEET_ID -import com.twitter.ml.api.constant.SharedFeatures.USER_ID -import com.twitter.ml.api.DailySuffixFeatureSource -import com.twitter.ml.api.DataSetPipe -import com.twitter.ml.api.RichDataRecord -import com.twitter.scalding._ -import com.twitter.scalding_internal.dalv2.DALWrite._ -import com.twitter.scalding_internal.job.TwitterExecutionApp -import com.twitter.scalding_internal.job.analytics_batch.AnalyticsBatchExecution -import com.twitter.scalding_internal.job.analytics_batch.AnalyticsBatchExecutionArgs -import com.twitter.scalding_internal.job.analytics_batch.BatchDescription -import com.twitter.scalding_internal.job.analytics_batch.BatchFirstTime -import com.twitter.scalding_internal.job.analytics_batch.BatchIncrement -import com.twitter.scalding_internal.job.analytics_batch.TwitterScheduledExecutionApp -import com.twitter.simclusters_v2.hdfs_sources.TimelineDataExtractorFixedPathSource -import com.twitter.simclusters_v2.hdfs_sources._ -import com.twitter.simclusters_v2.thriftscala.DisplayLocation -import com.twitter.simclusters_v2.thriftscala.ReferenceTweet -import com.twitter.simclusters_v2.thriftscala.ReferenceTweets -import com.twitter.simclusters_v2.thriftscala.TweetLabels -import com.twitter.timelines.prediction.features.common.TimelinesSharedFeatures.IS_LINGER_IMPRESSION -import com.twitter.timelines.prediction.features.common.TimelinesSharedFeatures.SOURCE_AUTHOR_ID -import com.twitter.timelines.prediction.features.common.TimelinesSharedFeatures.SOURCE_TWEET_ID -import com.twitter.timelines.prediction.features.itl.ITLFeatures -import com.twitter.timelines.prediction.features.recap.RecapFeatures -import java.util.TimeZone - -/** - * A scheduled version of the job to parse Timelines data for impressed and engaged tweets. 
- capesospy-v2 update|create --start_cron tweet_evaluation_timelines_reference_batch src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml - */ -object ScheduledTimelinesDataExtractionBatch extends TwitterScheduledExecutionApp { - - val outputPath = "/user/cassowary/processed/tweet_evaluation_reference_set/timelines" - - private val firstTime: String = "2019-03-31" - private implicit val tz: TimeZone = DateOps.UTC - private implicit val parser: DateParser = DateParser.default - private val batchIncrement: Duration = Days(1) - - private val execArgs = AnalyticsBatchExecutionArgs( - batchDesc = BatchDescription(this.getClass.getName.replace("$", "")), - firstTime = BatchFirstTime(RichDate(firstTime)), - lastTime = None, - batchIncrement = BatchIncrement(batchIncrement) - ) - - override def scheduledJob: Execution[Unit] = AnalyticsBatchExecution(execArgs) { - implicit dateRange => - Execution.withId { implicit uniqueId => - Execution.withArgs { args => - val defaultSampleRate = 1.0 - val recaps = - TimelinesEngagementDataExtractor.readTimelinesRecapTweets( - recapTweets = - DailySuffixFeatureSource(TimelinesEngagementDataExtractor.RecapTweetHdfsPath).read, - sampleRate = defaultSampleRate - )(dateRange) - val recTweets = - TimelinesEngagementDataExtractor.readTimelinesRecTweets( - recTweets = - DailySuffixFeatureSource(TimelinesEngagementDataExtractor.RecTweetHdfsPath).read, - sampleRate = defaultSampleRate - )(dateRange) - - (recaps ++ recTweets).writeDALSnapshotExecution( - TweetEvaluationTimelinesReferenceSetScalaDataset, - D.Daily, - D.Suffix(outputPath), - D.EBLzo(), - dateRange.end - ) - } - } - } -} - -/** - * Ad-hoc version of the job to process a subset of the Timeline data, either to catch up with data - * on a particular day, or to generate human readable data for debugging. 
- ./bazel bundle src/scala/com/twitter/simclusters_v2/scalding/evaluation:tweet_evaluation_timelines_reference_adhoc - - oscar hdfs --screen --user cassowary --bundle tweet_evaluation_timelines_reference_adhoc \ - --tool com.twitter.simclusters_v2.scalding.evaluation.AdhocTimelinesDataExtraction \ - -- --date 2018-11-15 --output_dir /user/cassowary/your_ldap/test_htl_data/recap --sample_rate 0.01 \ - --recap --rectweet --output_tsv - */ -object AdhocTimelinesDataExtraction extends TwitterExecutionApp { - - @Override - def job: Execution[Unit] = { - Execution.withArgs { args => - implicit val dateRange: DateRange = - DateRange.parse(args.list("date"))(DateOps.UTC, DateParser.default) - - val outputDir = args("output_dir") - val readRecTweet = args.boolean("rectweet") - val readRecap = args.boolean("recap") - val sampleRate = args.double("sample_rate") - val useTsv = args.boolean("output_tsv") - - if (!readRecTweet && !readRecap) { - throw new IllegalArgumentException("Must read at least some data!") - } - val recTweets = if (readRecTweet) { - println("RecTweets are included in the dataset") - TimelinesEngagementDataExtractor.readTimelinesRecTweets( - recTweets = - DailySuffixFeatureSource(TimelinesEngagementDataExtractor.RecTweetHdfsPath).read, - sampleRate = sampleRate)(dateRange) - } else { - TypedPipe.empty - } - - val recaps = if (readRecap) { - println("Recaps are included in the dataset") - TimelinesEngagementDataExtractor.readTimelinesRecapTweets( - recapTweets = - DailySuffixFeatureSource(TimelinesEngagementDataExtractor.RecapTweetHdfsPath).read, - sampleRate = sampleRate - )(dateRange) - } else { - TypedPipe.empty - } - - val referenceTweets = recaps ++ recTweets - - if (useTsv) { - // Write in plain text in tsv format for human readability - referenceTweets - .map(t => (t.targetUserId, t.impressedTweets)) - .writeExecution(TypedTsv[(Long, Seq[ReferenceTweet])](outputDir)) - } else { - // Write in compact thrift lzo format - referenceTweets - .writeExecution(TimelineDataExtractorFixedPathSource(outputDir)) - } - } - } -} - -/** - * Base class to provide functions to parse tweet engagement data from Home Timeline's data. - * We are mainly interested in 2 tweet data sets from Home Timeline: - * 1. Recap tweet: Tweets + RTs from user's follow graph. We are interested in out of network RTs. - * 2. RecTweet: Out of network tweets not from user's follow graph. - */ -object TimelinesEngagementDataExtractor { - - val RecapTweetHdfsPath = "/atla/proc2/user/timelines/processed/suggests/recap/data_records" - val RecTweetHdfsPath = "/atla/proc2/user/timelines/processed/injections/rectweet/data_records" - - // Timelines name the same feature differently depending on the surface area (ex. recap vs rectweet). - // For each data source we extract the features with different feature names. 
Detail: - def toRecapTweetLabels(record: RichDataRecord): TweetLabels = { - val isClicked = record.getFeatureValue(RecapFeatures.IS_CLICKED) - val isFav = record.getFeatureValue(RecapFeatures.IS_FAVORITED) - val isRT = record.getFeatureValue(RecapFeatures.IS_RETWEETED) - val isQuoted = record.getFeatureValue(RecapFeatures.IS_QUOTED) - val isReplied = record.getFeatureValue(RecapFeatures.IS_REPLIED) - TweetLabels(isClicked, isFav, isRT, isQuoted, isReplied) - } - - def toRecTweetLabels(record: RichDataRecord): TweetLabels = { - // Refer to ITLFeatures for more labels - val isClicked = record.getFeatureValue(ITLFeatures.IS_CLICKED) - val isFav = record.getFeatureValue(ITLFeatures.IS_FAVORITED) - val isRT = record.getFeatureValue(ITLFeatures.IS_RETWEETED) - val isQuoted = record.getFeatureValue(ITLFeatures.IS_QUOTED) - val isReplied = record.getFeatureValue(ITLFeatures.IS_REPLIED) - TweetLabels(isClicked, isFav, isRT, isQuoted, isReplied) - } - - /** - * Return Recap tweets, which are in-network tweets. Here we only filter for Retweets of tweets - * that are outside the user's follow graph. - */ - def readTimelinesRecapTweets( - recapTweets: DataSetPipe, - sampleRate: Double - )( - implicit dateRange: DateRange - ): TypedPipe[ReferenceTweets] = { - // recapTweets are in network tweets. We want to discover RTs of OON tweets. - // For Retweets, we check IS_RETWEET and use SOURCE_TWEET_ID, and then check - // PROBABLY_FROM_FOLLOWED_AUTHOR, which filters in network tweet from user's top 1000 follow graph. - - recapTweets.richRecords - .sample(sampleRate) - .filter { record => - val isInDateRange = dateRange.contains(RichDate(record.getFeatureValue(TIMESTAMP).toLong)) - val isLingeredImpression = record.getFeatureValue(IS_LINGER_IMPRESSION) - val isInNetwork = - record.getFeatureValue(RecapFeatures.PROBABLY_FROM_FOLLOWED_AUTHOR) // approximate - val isRetweet = record.getFeatureValue(RecapFeatures.IS_RETWEET) - isRetweet && (!isInNetwork) && isInDateRange && isLingeredImpression - } - .flatMap { record => - for { - userId <- Option(record.getFeatureValue(USER_ID)).map(_.toLong) - sourceTweetId <- Option(record.getFeatureValue(SOURCE_TWEET_ID)).map( - _.toLong - ) // source tweetId is the RT id - sourceAuthorId <- Option(record.getFeatureValue(SOURCE_AUTHOR_ID)).map(_.toLong) - timestamp <- Option(record.getFeatureValue(TIMESTAMP)).map(_.toLong) - labels = toRecapTweetLabels(record) - } yield { - ( - userId, - Seq( - ReferenceTweet( - sourceTweetId, - sourceAuthorId, - timestamp, - DisplayLocation.TimelinesRecap, - labels)) - ) - } - } - .sumByKey - .map { case (uid, tweetSeq) => ReferenceTweets(uid, tweetSeq) } - } - - /** - * Return RecTweets, which are out of network tweets served in the Timeline. 
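The flatMap bodies above and below use a for-comprehension over Options, so a record missing any required feature is dropped rather than defaulted. A reduced sketch of the pattern (hypothetical record type):

object OptionExtractionSketch {
  final case class RawRecord(userId: Option[Long], tweetId: Option[Long])

  // Returns None, i.e. the record is dropped, if any required field is absent.
  def extract(record: RawRecord): Option[(Long, Long)] =
    for {
      userId <- record.userId
      tweetId <- record.tweetId
    } yield (userId, tweetId)
}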
- */ - def readTimelinesRecTweets( - recTweets: DataSetPipe, - sampleRate: Double - )( - implicit dateRange: DateRange - ): TypedPipe[ReferenceTweets] = { - // recTweets contain strictly out of network injection tweets - - recTweets.richRecords - .sample(sampleRate) - .filter { record => - val isInDateRange = dateRange.contains(RichDate(record.getFeatureValue(TIMESTAMP).toLong)) - val isLingeredImpression = record.getFeatureValue(IS_LINGER_IMPRESSION) - - isInDateRange && isLingeredImpression - } - .flatMap { record => - for { - userId <- Option(record.getFeatureValue(USER_ID)).map(_.toLong) - tweetId <- Option(record.getFeatureValue(TWEET_ID)).map(_.toLong) - authorId <- Option(record.getFeatureValue(AUTHOR_ID)).map(_.toLong) - timestamp <- Option(record.getFeatureValue(TIMESTAMP)).map(_.toLong) - labels = toRecTweetLabels(record) - } yield { - ( - userId, - Seq( - ReferenceTweet( - tweetId, - authorId, - timestamp, - DisplayLocation.TimelinesRectweet, - labels)) - ) - } - } - .sumByKey - .map { case (uid, tweetSeq) => ReferenceTweets(uid, tweetSeq) } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/evaluation/LabelCorrelationsHelper.docx b/src/scala/com/twitter/simclusters_v2/scalding/evaluation/LabelCorrelationsHelper.docx new file mode 100644 index 000000000..ff3516462 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/evaluation/LabelCorrelationsHelper.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/evaluation/LabelCorrelationsHelper.scala b/src/scala/com/twitter/simclusters_v2/scalding/evaluation/LabelCorrelationsHelper.scala deleted file mode 100644 index 8d86f5a6a..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/evaluation/LabelCorrelationsHelper.scala +++ /dev/null @@ -1,61 +0,0 @@ -package com.twitter.simclusters_v2.scalding.evaluation - -import com.twitter.algebird.AveragedValue -import com.twitter.scalding.Execution -import com.twitter.scalding.typed.TypedPipe -import com.twitter.simclusters_v2.scalding.common.Util - -/** - * Utility object for correlation measures between the algorithm scores and the user engagements, - * such as the number of Likes. - */ -object LabelCorrelationsHelper { - - private def toDouble(bool: Boolean): Double = { - if (bool) 1.0 else 0.0 - } - - /** - * Given a pipe of labeled tweets, calculate the cosine similarity between the algorithm scores - * and users' favorite engagements. - */ - def cosineSimilarityForLike(labeledTweets: TypedPipe[LabeledTweet]): Execution[Double] = { - labeledTweets - .map { tweet => (toDouble(tweet.labels.isLiked), tweet.algorithmScore.getOrElse(0.0)) } - .toIterableExecution.map { iter => Util.cosineSimilarity(iter.iterator) } - } - - /** - * Given a pipe of labeled tweets, calculate cosine similarity between algorithm score and users' - * favorites engagements, on a per user basis, and return the average of all cosine - * similarities across all users. - */ - def cosineSimilarityForLikePerUser(labeledTweets: TypedPipe[LabeledTweet]): Execution[Double] = { - val avg = AveragedValue.aggregator.composePrepare[(Unit, Double)](_._2) - - labeledTweets - .map { tweet => - ( - tweet.targetUserId, - Seq((toDouble(tweet.labels.isLiked), tweet.algorithmScore.getOrElse(0.0))) - ) - } - .sumByKey - .map { - case (userId, seq) => - ((), Util.cosineSimilarity(seq.iterator)) - } - .aggregate(avg) - .getOrElseExecution(0.0) - } - - /** - * Calculates the Pearson correlation coefficient for the algorithm scores and user's favorite - * engagement. 
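Util.cosineSimilarity is defined elsewhere in the repo; assuming the standard definition over paired (label, score) observations, the quantity computed by the helpers above is:

object CosineSketch {
  def cosine(pairs: Iterator[(Double, Double)]): Double = {
    val (dot, normX, normY) = pairs.foldLeft((0.0, 0.0, 0.0)) {
      case ((d, nx, ny), (x, y)) => (d + x * y, nx + x * x, ny + y * y)
    }
    if (normX == 0.0 || normY == 0.0) 0.0
    else dot / math.sqrt(normX * normY)
  }
}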
Note this function call triggers a writeToDisk execution. - */ - def pearsonCoefficientForLike(labeledTweets: TypedPipe[LabeledTweet]): Execution[Double] = { - labeledTweets - .map { tweet => (toDouble(tweet.labels.isLiked), tweet.algorithmScore.getOrElse(0.0)) } - .toIterableExecution.map { iter => Util.computeCorrelation(iter.iterator) } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/evaluation/SimClustersEvaluationAdhocApp.docx b/src/scala/com/twitter/simclusters_v2/scalding/evaluation/SimClustersEvaluationAdhocApp.docx new file mode 100644 index 000000000..6f0a64465 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/evaluation/SimClustersEvaluationAdhocApp.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/evaluation/SimClustersEvaluationAdhocApp.scala b/src/scala/com/twitter/simclusters_v2/scalding/evaluation/SimClustersEvaluationAdhocApp.scala deleted file mode 100644 index 3ddded4cf..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/evaluation/SimClustersEvaluationAdhocApp.scala +++ /dev/null @@ -1,210 +0,0 @@ -package com.twitter.simclusters_v2.scalding.evaluation - -import com.twitter.scalding._ -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation -import com.twitter.scalding_internal.dalv2.remote_access.ProcAtla -import com.twitter.scalding_internal.job.TwitterExecutionApp -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.candidate_source.ClusterRanker -import com.twitter.simclusters_v2.hdfs_sources.AdhocKeyValSources -import com.twitter.simclusters_v2.hdfs_sources.ClusterTopKTweetsHourlySuffixSource -import com.twitter.simclusters_v2.hdfs_sources.SimclustersV2InterestedInScalaDataset -import com.twitter.simclusters_v2.hdfs_sources.TweetEvaluationTimelinesReferenceSetScalaDataset -import com.twitter.simclusters_v2.scalding.common.Util -import com.twitter.simclusters_v2.thriftscala.CandidateTweet -import com.twitter.simclusters_v2.thriftscala.CandidateTweets -import com.twitter.simclusters_v2.thriftscala.ClusterTopKTweetsWithScores -import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn -import com.twitter.simclusters_v2.thriftscala.DisplayLocation -import com.twitter.simclusters_v2.thriftscala.ReferenceTweets -import com.twitter.simclusters_v2.scalding.offline_job.OfflineRecConfig -import com.twitter.simclusters_v2.scalding.offline_job.OfflineTweetRecommendation -import java.util.TimeZone - -/** - * Do evaluations for SimClusters' tweet recommendations by using offline datasets. - * The job does the following: - * 1. Take in a test date range, for which the offline simclusters rec will be evaluated - * 2. For all users that had tweet impressions in timelines during the period, generate offline - * SimClusters candidate tweets for these users - * 3. Run offline evaluation and return metrics - -./bazel bundle src/scala/com/twitter/simclusters_v2/scalding/evaluation:simcluster_offline_eval_adhoc - -Note: Never specify reference date range across more than 1 day! 
-oscar hdfs --user cassowary --screen --screen-detached --tee your_ldap/prod_percentile \ - --bundle simcluster_offline_eval_adhoc \ - --tool com.twitter.simclusters_v2.scalding.evaluation.SimClustersEvaluationAdhocApp \ - -- --cand_tweet_date 2019-03-04T00 2019-03-04T23 \ - --ref_tweet_date 2019-03-05T00 2019-03-05T01 \ - --timeline_tweet rectweet \ - --sample_rate 0.05 \ - --max_cand_tweets 16000000 \ - --min_tweet_score 0.0 \ - --user_interested_in_dir /user/frigate/your_ldap/interested_in_copiedFromAtlaProc_20190228 \ - --cluster_top_k_dir /user/cassowary/your_ldap/offline_simcluster_20190304/cluster_top_k_tweets \ - --output_dir /user/cassowary/your_ldap/prod_percentile \ - --toEmailAddress your_ldap@twitter.com \ - --testRunName TestingProdOn0305Data - */ -object SimClustersEvaluationAdhocApp extends TwitterExecutionApp { - private val maxTweetResults = 40 - private val maxClustersToQuery = 20 - - @Override - def job: Execution[Unit] = { - Execution.withArgs { args => - Execution.withId { implicit uniqueId => - implicit val tz: TimeZone = DateOps.UTC - implicit val dateParser: DateParser = DateParser.default - - val candTweetDateRange = DateRange.parse(args.list("cand_tweet_date")) - val refTweetDateRange = DateRange.parse(args.list("ref_tweet_date")) - val toEmailAddressOpt = args.optional("toEmailAddress") - val testRunName = args.optional("testRunName") - - println( - s"Using SimClusters tweets from ${candTweetDateRange.start} to ${candTweetDateRange.end}") - println(s"Using Timelines tweets on the day of ${refTweetDateRange.start}") - - // separate tweets from different display locations for now - val tweetType = args("timeline_tweet") match { - case "rectweet" => DisplayLocation.TimelinesRectweet - case "recap" => DisplayLocation.TimelinesRecap - case e => - throw new IllegalArgumentException(s"$e isn't a valid timeline display location") - } - - val sampleRate = args.double("sample_rate", 1.0) - val validRefPipe = getProdTimelineReference(tweetType, refTweetDateRange, sampleRate) - val targetUserPipe = validRefPipe.map { _.targetUserId } - - // Read a fixed-path in atla if provided, otherwise read prod data from atla for date range - val userInterestInPipe = args.optional("user_interested_in_dir") match { - case Some(fixedPath) => - println(s"user_interested_in_dir is provided at: $fixedPath. Reading fixed path data.") - TypedPipe.from(AdhocKeyValSources.interestedInSource(fixedPath)) - case _ => - println(s"user_interested_in_dir isn't provided. 
Reading prod data.") - interestedInProdSource(candTweetDateRange) - } - - // Offline simulation of this dataset - val clusterTopKDir = args("cluster_top_k_dir") - println(s"cluster_top_k_dir is defined at: $clusterTopKDir") - val clusterTopKPipe = TypedPipe.from( - ClusterTopKTweetsHourlySuffixSource(clusterTopKDir, candTweetDateRange) - ) - - // Configs for offline simcluster tweet recommendation - val maxTweetRecs = args.int("max_cand_tweets", 30000000) - val minTweetScoreThreshold = args.double("min_tweet_score", 0.0) - - val offlineRecConfig = OfflineRecConfig( - maxTweetRecs, - maxTweetResults, - maxClustersToQuery, - minTweetScoreThreshold, - ClusterRanker.RankByNormalizedFavScore - ) - println("SimClusters offline config: " + offlineRecConfig) - - getValidCandidate( - targetUserPipe, - userInterestInPipe, - clusterTopKPipe, - offlineRecConfig, - candTweetDateRange - ).flatMap { validCandPipe => - val outputDir = args("output_dir") - EvaluationMetricHelper.runAllEvaluations(validRefPipe, validCandPipe).map { results => - toEmailAddressOpt.foreach { address => - Util.sendEmail( - results, - "Results from tweet evaluation test bed " + testRunName.getOrElse(""), - address) - } - TypedPipe.from(Seq((results, ""))).writeExecution(TypedTsv[(String, String)](outputDir)) - } - } - } - } - } - - /** - * Given a pipe of raw timelines reference engagement data, collect the engagements that took - * place during the given date range, then sample these engagements - */ - private def getProdTimelineReference( - displayLocation: DisplayLocation, - batchDateRange: DateRange, - sampleRate: Double - )( - implicit tz: TimeZone - ): TypedPipe[ReferenceTweets] = { - // Snapshot data timestamps itself with the last possible time of the day. +1 day to cover it - val snapshotRange = DateRange(batchDateRange.start, batchDateRange.start + Days(1)) - val timelinesRefPipe = DAL - .readMostRecentSnapshot(TweetEvaluationTimelinesReferenceSetScalaDataset, snapshotRange) - .withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .toTypedPipe - - timelinesRefPipe - .flatMap { refTweets => - val tweets = refTweets.impressedTweets - .filter { refTweet => - refTweet.timestamp >= batchDateRange.start.timestamp && - refTweet.timestamp <= batchDateRange.end.timestamp && - refTweet.displayLocation == displayLocation - } - if (tweets.nonEmpty) { - Some(ReferenceTweets(refTweets.targetUserId, tweets)) - } else { - None - } - } - .sample(sampleRate) - } - - /** - * Given a list of target users, simulate SimCluster's online serving logic offline for these - * users, then convert them into [[CandidateTweets]] - */ - private def getValidCandidate( - targetUserPipe: TypedPipe[Long], - userIsInterestedInPipe: TypedPipe[(Long, ClustersUserIsInterestedIn)], - clusterTopKTweetsPipe: TypedPipe[ClusterTopKTweetsWithScores], - offlineConfig: OfflineRecConfig, - batchDateRange: DateRange - )( - implicit uniqueID: UniqueID - ): Execution[TypedPipe[CandidateTweets]] = { - OfflineTweetRecommendation - .getTopTweets(offlineConfig, targetUserPipe, userIsInterestedInPipe, clusterTopKTweetsPipe) - .map(_.map { - case (userId, scoredTweets) => - val tweets = scoredTweets.map { tweet => - CandidateTweet(tweet.tweetId, Some(tweet.score), Some(batchDateRange.start.timestamp)) - } - CandidateTweets(userId, tweets) - }) - } - - /** - * Read interested in key-val store from atla-proc from the given date range - */ - private def interestedInProdSource( - dateRange: DateRange - ): TypedPipe[(Long, ClustersUserIsInterestedIn)] = { - implicit val timeZone: 
TimeZone = DateOps.UTC - - DAL - .readMostRecentSnapshot(SimclustersV2InterestedInScalaDataset, dateRange.embiggen(Weeks(1))) - .withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .toTypedPipe - .map { - case KeyVal(key, value) => (key, value) - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/inferred_entities/BUILD.bazel b/src/scala/com/twitter/simclusters_v2/scalding/inferred_entities/BUILD.bazel deleted file mode 100644 index ed5c9d6b8..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/inferred_entities/BUILD.bazel +++ /dev/null @@ -1,74 +0,0 @@ -scala_library( - sources = ["*.scala"], - platform = "java8", - tags = [ - "bazel-compatible", - "bazel-only", - ], - dependencies = [ - "src/scala/com/twitter/onboarding/relevance/source:utt_account_recommendations-scala", - "src/scala/com/twitter/scalding_internal/dalv2", - "src/scala/com/twitter/simclusters_v2/scalding", - "src/scala/com/twitter/wtf/entity_real_graph/common", - ], -) - -hadoop_binary( - name = "inferred_entities_from_known_for-adhoc", - main = "com.twitter.simclusters_v2.scalding.inferred_entities.InferredKnownForSemanticCoreEntitiesAdhocApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":inferred_entities", - ], -) - -hadoop_binary( - name = "inferred_entities_from_known_for", - main = "com.twitter.simclusters_v2.scalding.inferred_entities.InferredKnownForSemanticCoreEntitiesBatchApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":inferred_entities", - ], -) - -hadoop_binary( - name = "inferred_entities_from_interested_in-adhoc", - main = "com.twitter.simclusters_v2.scalding.inferred_entities.InferredInterestedInSemanticCoreEntitiesAdhocApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":inferred_entities", - ], -) - -hadoop_binary( - name = "inferred_entities_from_interested_in", - main = "com.twitter.simclusters_v2.scalding.inferred_entities.InferredInterestedInSemanticCoreEntitiesBatchApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":inferred_entities", - ], -) diff --git a/src/scala/com/twitter/simclusters_v2/scalding/inferred_entities/BUILD.docx b/src/scala/com/twitter/simclusters_v2/scalding/inferred_entities/BUILD.docx new file mode 100644 index 000000000..a80afdd74 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/inferred_entities/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/inferred_entities/InferredEntities.docx b/src/scala/com/twitter/simclusters_v2/scalding/inferred_entities/InferredEntities.docx new file mode 100644 index 000000000..2b935c9e1 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/inferred_entities/InferredEntities.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/inferred_entities/InferredEntities.scala b/src/scala/com/twitter/simclusters_v2/scalding/inferred_entities/InferredEntities.scala deleted file mode 100644 index 67b1574b7..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/inferred_entities/InferredEntities.scala +++ /dev/null @@ -1,92 +0,0 @@ -package 
com.twitter.simclusters_v2.scalding.inferred_entities - -import com.twitter.scalding.DateRange -import com.twitter.scalding.Days -import com.twitter.scalding.typed.TypedPipe -import com.twitter.simclusters_v2.common.ClusterId -import com.twitter.simclusters_v2.common.ModelVersions -import com.twitter.simclusters_v2.common.UserId -import com.twitter.simclusters_v2.hdfs_sources.EntityEmbeddingsSources -import com.twitter.simclusters_v2.thriftscala.ClusterType -import com.twitter.simclusters_v2.thriftscala.EmbeddingType -import com.twitter.simclusters_v2.thriftscala.InferredEntity -import com.twitter.simclusters_v2.thriftscala.ModelVersion -import com.twitter.simclusters_v2.thriftscala.SemanticCoreEntityWithScore -import com.twitter.simclusters_v2.thriftscala.SimClustersInferredEntities -import com.twitter.simclusters_v2.thriftscala.SimClustersSource -import java.util.TimeZone - -/** - * Opt-out compliance for SimClusters means offering users an option to opt out of clusters that - * have inferred legible meanings. This file sets some of the data sources & thresholds from which - * the inferred entities are considered legible. One should always refer to the sources & constants - * here for SimClusters' inferred entity compliance work - */ -object InferredEntities { - val MHRootPath: String = - "/user/cassowary/manhattan_sequence_files/simclusters_v2_inferred_entities" - - // Convenience objects for defining cluster sources - val InterestedIn2020 = - SimClustersSource(ClusterType.InterestedIn, ModelVersion.Model20m145k2020) - - val Dec11KnownFor = SimClustersSource(ClusterType.KnownFor, ModelVersion.Model20m145kDec11) - - val UpdatedKnownFor = SimClustersSource(ClusterType.KnownFor, ModelVersion.Model20m145kUpdated) - - val KnownFor2020 = SimClustersSource(ClusterType.KnownFor, ModelVersion.Model20m145k2020) - - /** - * This is the threshold at which we consider a simcluster "legible" through an entity - */ - val MinLegibleEntityScore = 0.6 - - /** - * Query for the entity embeddings that are used for SimClusters compliance. 
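To make the 0.6 threshold concrete, a small illustration of the legibility gate (values are made up; field names follow the thrift usage below):

  val candidates = Seq(
    SemanticCoreEntityWithScore(entityId = 1L, score = 0.72), // legible: 0.72 >= MinLegibleEntityScore
    SemanticCoreEntityWithScore(entityId = 2L, score = 0.35)  // dropped
  )
  candidates.filter(_.score >= InferredEntities.MinLegibleEntityScore) // keeps entity 1 only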
We will use these - * entity embeddings for a cluster to allow a user to opt out of a cluster - */ - def getLegibleEntityEmbeddings( - dateRange: DateRange, - timeZone: TimeZone - ): TypedPipe[(ClusterId, Seq[SemanticCoreEntityWithScore])] = { - val entityEmbeddings = EntityEmbeddingsSources - .getReverseIndexedSemanticCoreEntityEmbeddingsSource( - EmbeddingType.FavBasedSematicCoreEntity, - ModelVersions.Model20M145K2020, // only support the latest 2020 model - dateRange.embiggen(Days(7)(timeZone)) // read 7 days before & after to give buffer - ) - filterEntityEmbeddingsByScore(entityEmbeddings, MinLegibleEntityScore) - } - - // Return entities whose score are above threshold - def filterEntityEmbeddingsByScore( - entityEmbeddings: TypedPipe[(ClusterId, Seq[SemanticCoreEntityWithScore])], - minEntityScore: Double - ): TypedPipe[(ClusterId, Seq[SemanticCoreEntityWithScore])] = { - entityEmbeddings.flatMap { - case (clusterId, entities) => - val validEntities = entities.filter { entity => entity.score >= minEntityScore } - if (validEntities.nonEmpty) { - Some((clusterId, validEntities)) - } else { - None - } - - } - } - - /** - * Given inferred entities from different sources, combine the results into job's output format - */ - def combineResults( - results: TypedPipe[(UserId, Seq[InferredEntity])]* - ): TypedPipe[(UserId, SimClustersInferredEntities)] = { - results - .reduceLeft(_ ++ _) - .sumByKey - .map { - case (userId, inferredEntities) => - (userId, SimClustersInferredEntities(inferredEntities)) - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/inferred_entities/InferredEntitiesFromInterestedIn.docx b/src/scala/com/twitter/simclusters_v2/scalding/inferred_entities/InferredEntitiesFromInterestedIn.docx new file mode 100644 index 000000000..661217232 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/inferred_entities/InferredEntitiesFromInterestedIn.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/inferred_entities/InferredEntitiesFromInterestedIn.scala b/src/scala/com/twitter/simclusters_v2/scalding/inferred_entities/InferredEntitiesFromInterestedIn.scala deleted file mode 100644 index 212d851e1..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/inferred_entities/InferredEntitiesFromInterestedIn.scala +++ /dev/null @@ -1,377 +0,0 @@ -package com.twitter.simclusters_v2.scalding.inferred_entities - -import com.twitter.algebird.Max -import com.twitter.scalding.Args -import com.twitter.scalding.DateRange -import com.twitter.scalding.Days -import com.twitter.scalding.Duration -import com.twitter.scalding.Execution -import com.twitter.scalding.RichDate -import com.twitter.scalding.TypedPipe -import com.twitter.scalding.TypedTsv -import com.twitter.scalding.UniqueID -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.scalding_internal.dalv2.DALWrite._ -import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation -import com.twitter.scalding_internal.dalv2.remote_access.ProcAtla -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.common.ClusterId -import com.twitter.simclusters_v2.common.UTTEntityId -import com.twitter.simclusters_v2.common.UserId -import com.twitter.simclusters_v2.hdfs_sources._ -import com.twitter.simclusters_v2.scalding.common.TypedRichPipe._ -import com.twitter.simclusters_v2.scalding.common.Util -import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn -import 
com.twitter.simclusters_v2.thriftscala.EntitySource
-import com.twitter.simclusters_v2.thriftscala.InferredEntity
-import com.twitter.simclusters_v2.thriftscala.SemanticCoreEntityWithScore
-import com.twitter.simclusters_v2.thriftscala.SimClustersInferredEntities
-import com.twitter.simclusters_v2.thriftscala.SimClustersSource
-import com.twitter.simclusters_v2.thriftscala.UserAndNeighbors
-import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp
-import com.twitter.wtf.scalding.jobs.common.ScheduledExecutionApp
-import java.util.TimeZone
-import com.twitter.onboarding.relevance.source.UttAccountRecommendationsScalaDataset
-import com.twitter.scalding_internal.dalv2.DALWrite.D
-import com.twitter.wtf.entity_real_graph.scalding.common.SemanticCoreFilters.getValidSemanticCoreEntities
-import com.twitter.wtf.entity_real_graph.scalding.common.DataSources
-
-/**
- * Infer interested-in entities for a given user. Depending on how and where the entity source
- * comes from, this can be achieved in a number of ways. For example, we can use
- * user->interested-in clusters and cluster->SemanticCore entity embeddings to derive
- * user->entity. Or, we can use a producer's UTT embeddings and the user-user engagement graph
- * to aggregate UTT engagement history.
- */
-object InferredEntitiesFromInterestedIn {
-
-  def getUserToKnownForUttEntities(
-    dateRange: DateRange,
-    maxUttEntitiesPerUser: Int
-  )(
-    implicit timeZone: TimeZone
-  ): TypedPipe[(UserId, Seq[(Long, Double)])] = {
-
-    val validEntities = getValidSemanticCoreEntities(
-      DataSources.semanticCoreMetadataSource(dateRange, timeZone)).distinct.map { entityId =>
-      Set(entityId)
-    }.sum
-
-    DAL
-      .readMostRecentSnapshot(UttAccountRecommendationsScalaDataset, dateRange)
-      .withRemoteReadPolicy(ExplicitLocation(ProcAtla))
-      .toTypedPipe
-      .flatMapWithValue(validEntities) {
-        // Keep only valid entities
-        case (KeyVal(interest, candidates), Some(validUTTEntities))
-            if validUTTEntities.contains(interest.uttID) =>
-          candidates.recommendations.map { rec =>
-            (rec.candidateUserID, (interest.uttID, rec.score.getOrElse(0.0)))
-          }
-        case _ => None
-      }
-      .group
-      .sortedReverseTake(maxUttEntitiesPerUser)(Ordering.by(_._2))
-      .toTypedPipe
-  }
-
-  def filterUTTEntities(
-    interestedInEntities: TypedPipe[(UserId, Seq[(UTTEntityId, Int)])],
-    minSocialProofThreshold: Int,
-    maxInterestsPerUser: Int
-  ): TypedPipe[(UserId, Seq[UTTEntityId])] = {
-
-    interestedInEntities
-      .map {
-        case (userId, entities) =>
-          val topEntities = entities
-            .filter(_._2 >= minSocialProofThreshold)
-            .sortBy(-_._2)
-            .take(maxInterestsPerUser)
-            .map(_._1)
-
-          (userId, topEntities)
-      }
-      .filter(_._2.nonEmpty)
-  }
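A quick worked example of the filter above (illustrative values): with minSocialProofThreshold = 10 and maxInterestsPerUser = 2,

  val entities = Seq((101L, 12), (202L, 9), (303L, 25), (404L, 11)) // (uttEntityId, social proof)
  val top = entities.filter(_._2 >= 10).sortBy(-_._2).take(2).map(_._1)
  // top == Seq(303L, 101L): entity 202 misses the social-proof bar, 404 loses the top-2 cut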
-  def getUserToUTTEntities(
-    userUserGraph: TypedPipe[UserAndNeighbors],
-    knownForEntities: TypedPipe[(UserId, Seq[UTTEntityId])]
-  )(
-    implicit uniqueId: UniqueID
-  ): TypedPipe[(UserId, Seq[(UTTEntityId, Int)])] = {
-    val flatEngagementGraph =
-      userUserGraph
-        .count("num_user_user_graph_records")
-        .flatMap { userAndNeighbors =>
-          userAndNeighbors.neighbors.flatMap { neighbor =>
-            val producerId = neighbor.neighborId
-            val hasFav = neighbor.favScoreHalfLife100Days.exists(_ > 0)
-            val hasFollow = neighbor.isFollowed.contains(true)
-
-            if (hasFav || hasFollow) {
-              Some((producerId, userAndNeighbors.userId))
-            } else {
-              None
-            }
-          }
-        }
-        .count("num_flat_user_user_graph_edges")
-
-    flatEngagementGraph
-      .join(knownForEntities.count("num_producer_to_entities"))
-      .withReducers(3000)
-      .flatMap {
-        case (producerId, (userId, entities)) =>
-          entities.map { entityId => ((userId, entityId), 1) }
-      }
-      .count("num_flat_user_to_entity")
-      .sumByKey
-      .withReducers(2999)
-      .toTypedPipe
-      .count("num_user_with_entities")
-      .collect {
-        case ((userId, uttEntityId), numEngagements) =>
-          (userId, Seq((uttEntityId, numEngagements)))
-      }
-      .sumByKey
-  }
-
-  /**
-   * Infer entities using user-interestedIn clusters and entity embeddings for those clusters,
-   * based on a threshold
-   */
-  def getInterestedInFromEntityEmbeddings(
-    userToInterestedIn: TypedPipe[(UserId, ClustersUserIsInterestedIn)],
-    clusterToEntities: TypedPipe[(ClusterId, Seq[SemanticCoreEntityWithScore])],
-    inferredFromCluster: Option[SimClustersSource],
-    inferredFromEntity: Option[EntitySource]
-  )(
-    implicit uniqueId: UniqueID
-  ): TypedPipe[(UserId, Seq[InferredEntity])] = {
-    val clusterToUsers = userToInterestedIn
-      .flatMap {
-        case (userId, clusters) =>
-          clusters.clusterIdToScores.map {
-            case (clusterId, score) =>
-              (clusterId, (userId, score))
-          }
-      }
-      .count("num_flat_user_to_interested_in_cluster")
-
-    clusterToUsers
-      .join(clusterToEntities)
-      .withReducers(3000)
-      .map {
-        case (clusterId, ((userId, interestedInScore), entitiesWithScores)) =>
-          (userId, entitiesWithScores)
-      }
-      .flatMap {
-        case (userId, entitiesWithScore) =>
-          // Dedup by entityId in case the user is associated with an entity via different clusters
-          entitiesWithScore.map { entity => (userId, Map(entity.entityId -> Max(entity.score))) }
-      }
-      .sumByKey
-      .map {
-        case (userId, entitiesWithMaxScore) =>
-          val inferredEntities = entitiesWithMaxScore.map { entityWithScore =>
-            InferredEntity(
-              entityId = entityWithScore._1,
-              score = entityWithScore._2.get,
-              simclusterSource = inferredFromCluster,
-              entitySource = inferredFromEntity
-            )
-          }.toSeq
-          (userId, inferredEntities)
-      }
-      .count("num_user_with_inferred_entities")
-  }
-}
-
-/**
-capesospy-v2 update --build_locally --start_cron \
-  inferred_entities_from_interested_in \
-  src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml
- */
-object InferredInterestedInSemanticCoreEntitiesBatchApp extends ScheduledExecutionApp {
-
-  override def firstTime: RichDate = RichDate("2023-01-01")
-
-  override def batchIncrement: Duration = Days(1)
-
-  private val outputPath = InferredEntities.MHRootPath + "/interested_in"
-
-  private val outputPathKeyedByCluster =
-    InferredEntities.MHRootPath + "/interested_in_keyed_by_cluster"
-
-  import InferredEntitiesFromInterestedIn._
-
-  override def runOnDateRange(
-    args: Args
-  )(
-    implicit dateRange: DateRange,
-    timeZone: TimeZone,
-    uniqueID: UniqueID
-  ): Execution[Unit] = {
-    val clusterToEntities = InferredEntities
-      .getLegibleEntityEmbeddings(dateRange, timeZone)
-      .count("num_legible_cluster_to_entities")
-      .forceToDisk
-
-    // Inferred interests; only the 2020 model version is supported.
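    // Dataflow of the inference below: join two relations keyed by cluster,
    //   (clusterId -> (userId, interestScore))  and  (clusterId -> legible entities),
    // then keep, per (userId, entityId), the maximum entity score across all shared
    // clusters (algebird Max semigroup):
    //   score(user, entity) = max over clusters c the user is interested in of entityScore(c, entity)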
-    val userToClusters2020 =
-      InterestedInSources.simClustersInterestedIn2020Source(dateRange, timeZone)
-
-    val inferredEntities2020 = getInterestedInFromEntityEmbeddings(
-      userToInterestedIn = userToClusters2020,
-      clusterToEntities = clusterToEntities,
-      inferredFromCluster = Some(InferredEntities.InterestedIn2020),
-      inferredFromEntity = Some(EntitySource.SimClusters20M145K2020EntityEmbeddingsByFavScore)
-    )(uniqueID)
-      .count("num_user_with_inferred_entities_2020")
-
-    val combinedInferredInterests =
-      InferredEntities.combineResults(inferredEntities2020)
-
-    // output cluster -> entity mapping
-    val clusterToEntityExec = clusterToEntities
-      .map {
-        case (clusterId, entities) =>
-          val inferredEntities = SimClustersInferredEntities(
-            entities.map(entity => InferredEntity(entity.entityId, entity.score))
-          )
-          KeyVal(clusterId, inferredEntities)
-      }
-      .writeDALVersionedKeyValExecution(
-        SimclustersInferredEntitiesFromInterestedInKeyedByClusterScalaDataset,
-        D.Suffix(outputPathKeyedByCluster)
-      )
-
-    // output user -> entity mapping
-    val userToEntityExec = combinedInferredInterests
-      .map { case (userId, entities) => KeyVal(userId, entities) }
-      .writeDALVersionedKeyValExecution(
-        SimclustersInferredEntitiesFromInterestedInScalaDataset,
-        D.Suffix(outputPath)
-      )
-
-    Execution.zip(clusterToEntityExec, userToEntityExec).unit
-  }
-}
-
-/**
-Adhoc debugging job. Uses the Entity Embeddings dataset to infer user interests.
-
-./bazel bundle src/scala/com/twitter/simclusters_v2/scalding/inferred_entities/ &&\
-scalding remote run \
-  --main-class com.twitter.simclusters_v2.scalding.inferred_entities.InferredInterestedInSemanticCoreEntitiesAdhocApp \
-  --target src/scala/com/twitter/simclusters_v2/scalding/inferred_entities:inferred_entities_from_interested_in-adhoc \
-  --user recos-platform \
-  -- --date 2019-11-11 --email your_ldap@twitter.com
- */
-object InferredInterestedInSemanticCoreEntitiesAdhocApp extends AdhocExecutionApp {
-  import InferredEntitiesFromInterestedIn._
-  override def runOnDateRange(
-    args: Args
-  )(
-    implicit dateRange: DateRange,
-    timeZone: TimeZone,
-    uniqueID: UniqueID
-  ): Execution[Unit] = {
-
-    val interestedIn = InterestedInSources.simClustersInterestedIn2020Source(dateRange, timeZone)
-
-    val clusterToEntities = InferredEntities
-      .getLegibleEntityEmbeddings(dateRange, timeZone)
-      .count("num_legible_cluster_to_entities")
-
-    // Debugging InterestedIn -> EntityEmbeddings approach
-    val interestedInFromEntityEmbeddings = getInterestedInFromEntityEmbeddings(
-      interestedIn,
-      clusterToEntities,
-      None,
-      None
-    )(uniqueID)
-
-    val distribution = Util
-      .printSummaryOfNumericColumn(
-        interestedInFromEntityEmbeddings.map { case (k, v) => v.size },
-        Some("# of interestedIn entities per user")
-      ).map { results =>
-        Util.sendEmail(results, "# of interestedIn entities per user", args.getOrElse("email", ""))
-      }
-
-    Execution
-      .zip(
-        distribution,
-        interestedInFromEntityEmbeddings
-          .writeExecution(
-            TypedTsv("/user/recos-platform/adhoc/debug/interested_in_from_entity_embeddings"))
-      ).unit
-  }
-}
-
-/**
- Adhoc debugging job. Runs through the UTT interest inference and analyzes the size & distribution
- of interests per user.
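The social proof analyzed here counts, per user, how many engaged producers are known for each UTT entity; a condensed illustration with hypothetical values:

  // user 7 follows/favs two producers known for UTT entity 42 and one known for entity 13
  val edges = Seq((7L, 42L), (7L, 42L), (7L, 13L)) // one (userId, uttEntityId) row per engaged producer
  val socialProof = edges.groupBy(identity).map { case (pair, rows) => (pair, rows.size) }
  // socialProof == Map((7L, 42L) -> 2, (7L, 13L) -> 1)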
- -./bazel bundle src/scala/com/twitter/simclusters_v2/scalding/inferred_entities/ &&\ -scalding remote run \ - --main-class com.twitter.simclusters_v2.scalding.inferred_entities.InferredUTTEntitiesFromInterestedInAdhocApp \ - --target src/scala/com/twitter/simclusters_v2/scalding/inferred_entities:inferred_entities_from_interested_in-adhoc \ - --user recos-platform \ - -- --date 2019-11-03 --email your_ldap@twitter.com - */ -object InferredUTTEntitiesFromInterestedInAdhocApp extends AdhocExecutionApp { - import InferredEntitiesFromInterestedIn._ - - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - - val employeeGraphPath = "/user/recos-platform/adhoc/employee_graph_from_user_user/" - val employeeGraph = TypedPipe.from(UserAndNeighborsFixedPathSource(employeeGraphPath)) - - val maxKnownForUttsPerProducer = 100 - val minSocialProofThreshold = 10 - val maxInferredInterestsPerUser = 500 - - // KnownFor UTT entities - val userToUttEntities = getUserToKnownForUttEntities( - dateRange.embiggen(Days(7)), - maxKnownForUttsPerProducer - ).map { case (userId, entities) => (userId, entities.map(_._1)) } - - val userToInterestsEngagementCounts = getUserToUTTEntities(employeeGraph, userToUttEntities) - - val topInterests = filterUTTEntities( - userToInterestsEngagementCounts, - minSocialProofThreshold, - maxInferredInterestsPerUser - ).count("num_users_with_inferred_interests") - - // Debugging UTT entities - val analysis = Util - .printSummaryOfNumericColumn( - topInterests.map { case (k, v) => v.size }, - Some( - "# of UTT entities per user, maxKnownForUtt=100, minSocialProof=10, maxInferredPerUser=500") - ).map { results => - Util.sendEmail(results, "# of UTT entities per user", args.getOrElse("email", "")) - } - - val outputPath = "/user/recos-platform/adhoc/inferred_utt_interests" - - Execution - .zip( - topInterests.writeExecution(TypedTsv(outputPath)), - analysis - ).unit - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/inferred_entities/InferredSemanticCoreEntitiesFromKnownFor.docx b/src/scala/com/twitter/simclusters_v2/scalding/inferred_entities/InferredSemanticCoreEntitiesFromKnownFor.docx new file mode 100644 index 000000000..cfa5ef85a Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/inferred_entities/InferredSemanticCoreEntitiesFromKnownFor.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/inferred_entities/InferredSemanticCoreEntitiesFromKnownFor.scala b/src/scala/com/twitter/simclusters_v2/scalding/inferred_entities/InferredSemanticCoreEntitiesFromKnownFor.scala deleted file mode 100644 index 2dfbd5f4b..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/inferred_entities/InferredSemanticCoreEntitiesFromKnownFor.scala +++ /dev/null @@ -1,244 +0,0 @@ -package com.twitter.simclusters_v2.scalding.inferred_entities - -import com.twitter.escherbird.metadata.thriftscala.FullMetadata -import com.twitter.scalding._ -import com.twitter.scalding.typed.TypedPipe -import com.twitter.scalding_internal.dalv2.DALWrite._ -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.common.ClusterId -import com.twitter.simclusters_v2.common.ModelVersions -import com.twitter.simclusters_v2.common.UserId -import com.twitter.simclusters_v2.hdfs_sources._ -import com.twitter.simclusters_v2.scalding.common.Util -import com.twitter.simclusters_v2.thriftscala._ -import 
com.twitter.wtf.entity_real_graph.scalding.common.{DataSources => ERGDataSources} -import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp -import com.twitter.wtf.scalding.jobs.common.ScheduledExecutionApp -import java.util.TimeZone - -/** - * Infer Known-For entities based on users' different variations of SimClusters Known-Fors. - * The basic idea is to look at the Known-For datasets (User, Cluster) and the entity embeddings - * (Cluster, Entities) to derive the (User, Entities). - */ -object InferredSemanticCoreEntitiesFromKnownFor { - - /** - * Given a (user, cluster) and (cluster, entity) mappings, generate (user, entity) mappings - */ - def getUserToEntities( - userToClusters: TypedPipe[(UserId, Seq[SimClusterWithScore])], - clusterToEntities: TypedPipe[(ClusterId, Seq[SemanticCoreEntityWithScore])], - inferredFromCluster: Option[SimClustersSource], - inferredFromEntity: Option[EntitySource], - minEntityScore: Double - ): TypedPipe[(UserId, Seq[InferredEntity])] = { - - val validClusterToEntities = clusterToEntities.flatMap { - case (clusterId, entities) => - entities.collect { - case entity if entity.score >= minEntityScore => - (clusterId, (entity.entityId, entity.score)) - } - } - - userToClusters - .flatMap { - case (userId, clusters) => - clusters.map { cluster => (cluster.clusterId, userId) } - } - .join(validClusterToEntities) - .map { - case (clusterId, (userId, (entityId, score))) => - ((userId, entityId), score) - } - // If a user is known for the same entity through multiple cluster-entity mappings, sum the scores - .sumByKey - .map { - case ((userId, entityId), score) => - (userId, Seq(InferredEntity(entityId, score, inferredFromCluster, inferredFromEntity))) - } - .sumByKey - } - -} - -/** -capesospy-v2 update --build_locally --start_cron \ - inferred_entities_from_known_for \ - src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml - */ -object InferredKnownForSemanticCoreEntitiesBatchApp extends ScheduledExecutionApp { - - import InferredSemanticCoreEntitiesFromKnownFor._ - - override def firstTime: RichDate = RichDate("2023-01-23") - - override def batchIncrement: Duration = Days(1) - - private val outputPath = InferredEntities.MHRootPath + "/known_for" - - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - - val clusterToEntities = EntityEmbeddingsSources - .getReverseIndexedSemanticCoreEntityEmbeddingsSource( - EmbeddingType.FavBasedSematicCoreEntity, - ModelVersions.Model20M145K2020, - dateRange.embiggen(Days(7)) // read 7 days before & after to give buffer - ) - .forceToDisk - - val userToEntities2020 = getUserToEntities( - ProdSources.getUpdatedKnownFor, - clusterToEntities, - Some(InferredEntities.KnownFor2020), - Some(EntitySource.SimClusters20M145K2020EntityEmbeddingsByFavScore), - InferredEntities.MinLegibleEntityScore - ) - - val userToEntities = InferredEntities.combineResults(userToEntities2020) - - userToEntities - .map { case (userId, entities) => KeyVal(userId, entities) } - .writeDALVersionedKeyValExecution( - SimclustersInferredEntitiesFromKnownForScalaDataset, - D.Suffix(outputPath) - ) - } -} - -/** -./bazel bundle src/scala/com/twitter/simclusters_v2/scalding/inferred_entities:inferred_entities_from_known_for-adhoc && \ - oscar hdfs --user recos-platform --screen --tee your_ldap-logs/ \ - --bundle inferred_entities_from_known_for-adhoc \ - --tool 
com.twitter.simclusters_v2.scalding.inferred_entities.InferredSemanticCoreEntitiesFromKnownForAdhocApp \ - -- --date 2019-11-02 --email your_ldap@twitter.com - */ -object InferredSemanticCoreEntitiesFromKnownForAdhocApp extends AdhocExecutionApp { - - private def readEntityEmbeddingsFromPath( - path: String - ): TypedPipe[(ClusterId, Seq[SemanticCoreEntityWithScore])] = { - TypedPipe - .from(AdhocKeyValSources.clusterToEntitiesSource(path)) - .map { - case (embeddingId, embedding) => - embeddingId.internalId match { - case InternalId.ClusterId(clusterId) => - val semanticCoreEntities = embedding.embedding.map { - case InternalIdWithScore(InternalId.EntityId(entityId), score) => - SemanticCoreEntityWithScore(entityId, score) - case _ => - throw new IllegalArgumentException( - "The value to the entity embeddings dataset isn't entityId" - ) - } - (clusterId, semanticCoreEntities) - case _ => - throw new IllegalArgumentException( - "The key to the entity embeddings dataset isn't clusterId" - ) - } - } - } - - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - import InferredSemanticCoreEntitiesFromKnownFor._ - - val entityIdToString: TypedPipe[(Long, String)] = - ERGDataSources.semanticCoreMetadataSource - .collect { - case FullMetadata(domainId, entityId, Some(basicMetadata), _, _, _) - if domainId == 131L && !basicMetadata.indexableFields.exists( - _.tags.exists(_.contains("utt:sensitive_interest"))) => - entityId -> basicMetadata.name - }.distinctBy(_._1) - - val clusterToEntitiesUpdated = EntityEmbeddingsSources - .getReverseIndexedSemanticCoreEntityEmbeddingsSource( - EmbeddingType.FavBasedSematicCoreEntity, - ModelVersions.Model20M145KUpdated, - dateRange.embiggen(Days(4)) // read 4 days before & after to give buffer - ) - .forceToDisk - - // Inferred entities based on Updated version's entity embeddings - val dec11UserToUpdatedEntities = getUserToEntities( - ProdSources.getDec11KnownFor, - clusterToEntitiesUpdated, - Some(InferredEntities.Dec11KnownFor), - Some(EntitySource.SimClusters20M145KUpdatedEntityEmbeddingsByFavScore), - InferredEntities.MinLegibleEntityScore - ) - - val updatedUserToUpdatedEntities = getUserToEntities( - ProdSources.getUpdatedKnownFor, - clusterToEntitiesUpdated, - Some(InferredEntities.UpdatedKnownFor), - Some(EntitySource.SimClusters20M145KUpdatedEntityEmbeddingsByFavScore), - InferredEntities.MinLegibleEntityScore - ) - - // Updated entities data - val entitiesPipe = ( - dec11UserToUpdatedEntities ++ updatedUserToUpdatedEntities - ).sumByKey - - val userToEntitiesWithString = entitiesPipe - .flatMap { - case (userId, entities) => - entities.map { entity => (entity.entityId, (userId, entity)) } - } - .hashJoin(entityIdToString) - .map { - case (entityId, ((userId, inferredEntity), entityStr)) => - (userId, Seq((entityStr, inferredEntity))) - } - .sumByKey - - val outputPath = "/user/recos-platform/adhoc/known_for_inferred_entities_updated" - - val scoreDistribution = Util - .printSummaryOfNumericColumn( - entitiesPipe.flatMap { case (k, v) => v.map(_.score) }, - Some("Distributions of scores, Updated version") - ).map { results => - Util.sendEmail( - results, - "Distributions of scores, Updated version", - args.getOrElse("email", "") - ) - } - - val coverageDistribution = Util - .printSummaryOfNumericColumn( - entitiesPipe.map { case (k, v) => v.size }, - Some("# of knownFor entities per user, Updated version") - ).map { results => - Util.sendEmail( - results, - 
"# of knownFor entities per user, Updated version", - args.getOrElse("email", "") - ) - } - - Execution - .zip( - userToEntitiesWithString.writeExecution(TypedTsv(outputPath)), - scoreDistribution, - coverageDistribution - ).unit - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/inferred_entities/ProdSources.docx b/src/scala/com/twitter/simclusters_v2/scalding/inferred_entities/ProdSources.docx new file mode 100644 index 000000000..b0652c653 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/inferred_entities/ProdSources.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/inferred_entities/ProdSources.scala b/src/scala/com/twitter/simclusters_v2/scalding/inferred_entities/ProdSources.scala deleted file mode 100644 index a1dc71ac8..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/inferred_entities/ProdSources.scala +++ /dev/null @@ -1,94 +0,0 @@ -package com.twitter.simclusters_v2.scalding.inferred_entities - -import com.twitter.scalding.{DateRange, Days, TypedPipe} -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.scalding_internal.dalv2.remote_access.{ExplicitLocation, ProcAtla} -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.common.{ModelVersions, SemanticCoreEntityId, UserId} -import com.twitter.simclusters_v2.hdfs_sources.{ - SimclustersInferredEntitiesFromKnownForScalaDataset, - SimclustersV2InterestedIn20M145KUpdatedScalaDataset, - SimclustersV2InterestedInScalaDataset, - SimclustersV2KnownFor20M145KDec11ScalaDataset, - SimclustersV2KnownFor20M145KUpdatedScalaDataset, - UserUserNormalizedGraphScalaDataset -} -import com.twitter.simclusters_v2.scalding.KnownForSources -import com.twitter.simclusters_v2.thriftscala.{ - EntitySource, - SimClusterWithScore, - SimClustersSource, - TopSimClustersWithScore, - UserAndNeighbors -} -import java.util.TimeZone - -/** - * Convenience functions to read data from prod. 
- */ -object ProdSources { - - // Returns the Dec11 KnownFor from production - def getDec11KnownFor(implicit tz: TimeZone): TypedPipe[(UserId, Seq[SimClusterWithScore])] = - KnownForSources - .readDALDataset( - SimclustersV2KnownFor20M145KDec11ScalaDataset, - Days(30), - ModelVersions.Model20M145KDec11) - .map { - case (userId, clustersArray) => - val clusters = clustersArray.map { - case (clusterId, score) => SimClusterWithScore(clusterId, score) - }.toSeq - (userId, clusters) - } - - // Returns the Updated KnownFor from production - def getUpdatedKnownFor(implicit tz: TimeZone): TypedPipe[(UserId, Seq[SimClusterWithScore])] = - KnownForSources - .readDALDataset( - SimclustersV2KnownFor20M145KUpdatedScalaDataset, - Days(30), - ModelVersions.Model20M145KUpdated - ) - .map { - case (userId, clustersArray) => - val clusters = clustersArray.map { - case (clusterId, score) => SimClusterWithScore(clusterId, score) - }.toSeq - (userId, clusters) - } - - def getInferredEntitiesFromKnownFor( - inferredFromCluster: SimClustersSource, - inferredFromEntity: EntitySource, - dateRange: DateRange - ): TypedPipe[(UserId, Seq[(SemanticCoreEntityId, Double)])] = { - DAL - .readMostRecentSnapshot(SimclustersInferredEntitiesFromKnownForScalaDataset, dateRange) - .withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .toTypedPipe - .map { - case KeyVal(userId, entities) => - val validEntities = - entities.entities - .collect { - case entity - if entity.entitySource.contains(inferredFromEntity) && - entity.simclusterSource.contains(inferredFromCluster) => - (entity.entityId, entity.score) - } - .groupBy(_._1) - .map { case (entityId, scores) => (entityId, scores.map(_._2).max) } - .toSeq - (userId, validEntities) - } - } - - def getUserUserEngagementGraph(dateRange: DateRange): TypedPipe[UserAndNeighbors] = { - DAL - .readMostRecentSnapshot(UserUserNormalizedGraphScalaDataset, dateRange) - .withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .toTypedPipe - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/mbcg/AllFeatures.docx b/src/scala/com/twitter/simclusters_v2/scalding/mbcg/AllFeatures.docx new file mode 100644 index 000000000..b72249546 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/mbcg/AllFeatures.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/mbcg/AllFeatures.scala b/src/scala/com/twitter/simclusters_v2/scalding/mbcg/AllFeatures.scala deleted file mode 100644 index 902c981bd..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/mbcg/AllFeatures.scala +++ /dev/null @@ -1,58 +0,0 @@ -package com.twitter.simclusters_v2.scalding.mbcg - -import com.google.common.collect.ImmutableSet -import com.twitter.dal.personal_data.thriftjava.PersonalDataType._ -import com.twitter.ml.api.DataType -import com.twitter.ml.api.Feature -import com.twitter.ml.api.Feature.SparseContinuous -import com.twitter.ml.api.Feature.Tensor -import com.twitter.ml.api.FeatureContext -import com.twitter.ml.api.constant.SharedFeatures -import java.util.{Map => JMap} - -/* -Features used for model-based candidate generation - */ -object TweetAllFeatures { - val tweetId = SharedFeatures.TWEET_ID - val tweetSimclusters = - new SparseContinuous( - "tweet.simcluster.log_fav_based_embedding.20m_145k_2020", - ImmutableSet.of(InferredInterests)) - .asInstanceOf[Feature[JMap[String, Double]]] - val authorF2vProducerEmbedding = - new Tensor( - "tweet.author_follow2vec.producer_embedding_200", - DataType.FLOAT - ) - - private val allFeatures: Seq[Feature[_]] = Seq( - tweetId, - 
tweetSimclusters, - authorF2vProducerEmbedding - ) - - val featureContext = new FeatureContext(allFeatures: _*) -} - -object UserAllFeatures { - val userId = SharedFeatures.USER_ID - val userSimclusters = - new SparseContinuous( - "user.iiape.log_fav_based_embedding.20m_145k_2020", - ImmutableSet.of(InferredInterests)) - .asInstanceOf[Feature[JMap[String, Double]]] - val userF2vConsumerEmbedding = - new Tensor( - "user.follow2vec.consumer_avg_fol_emb_200", - DataType.FLOAT - ) - - private val allFeatures: Seq[Feature[_]] = Seq( - userId, - userSimclusters, - userF2vConsumerEmbedding - ) - - val featureContext = new FeatureContext(allFeatures: _*) -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/mbcg/BUILD.bazel b/src/scala/com/twitter/simclusters_v2/scalding/mbcg/BUILD.bazel deleted file mode 100644 index 469a917be..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/mbcg/BUILD.bazel +++ /dev/null @@ -1,314 +0,0 @@ -scala_library( - sources = [ - "*.scala", - ], - platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - "3rdparty/jvm/com/twitter/algebird:core", - "3rdparty/jvm/com/twitter/algebird:util", - "3rdparty/jvm/com/twitter/storehaus:algebra", - "3rdparty/jvm/com/twitter/storehaus:core", - "3rdparty/src/jvm/com/twitter/scalding:args", - "3rdparty/src/jvm/com/twitter/scalding:commons", - "3rdparty/src/jvm/com/twitter/scalding:core", - "3rdparty/src/jvm/com/twitter/scalding:date", - "3rdparty/src/jvm/com/twitter/scalding:db", - "3rdparty/src/jvm/com/twitter/scalding:parquet", - "ann/src/main/scala/com/twitter/ann/hnsw", - "ann/src/main/scala/com/twitter/ann/scalding/offline", - "ann/src/main/scala/com/twitter/ann/util", - "geoduck/hadoop/scalding/datasets:userlocation-scala", - "iesource/common/src/main/scala/com/twitter/iesource/common/util", - "iesource/processing/events/src/main/scala/com/twitter/iesource/processing/events/batch" + - ":server_engagements-scala", - "iesource/thrift", - "src/java/com/twitter/ml/api/constant", - "src/scala/com/twitter/ml/api/util", - "src/scala/com/twitter/ml/featurestore/catalog/entities/core", - "src/scala/com/twitter/ml/featurestore/catalog/features/geo", - "src/scala/com/twitter/ml/featurestore/lib/batch", - "src/scala/com/twitter/scalding_internal/dalv2", - "src/scala/com/twitter/scalding_internal/dalv2/dataset", - "src/scala/com/twitter/scalding_internal/db", - "src/scala/com/twitter/scalding_internal/db/jdbc", - "src/scala/com/twitter/scalding_internal/error_handling", - "src/scala/com/twitter/scalding_internal/job", - "src/scala/com/twitter/scalding_internal/job/analytics_batch", - "src/scala/com/twitter/scalding_internal/multiformat", - "src/scala/com/twitter/scalding_internal/source", - "src/scala/com/twitter/scalding_internal/source/lzo_scrooge", - "src/scala/com/twitter/scalding_internal/typed", - "src/scala/com/twitter/simclusters_v2/hdfs_sources", - "src/scala/com/twitter/simclusters_v2/scalding/common", - "src/thrift/com/twitter/ml/api:data-java", - "src/thrift/com/twitter/ml/api:interpretable-model-java", - "tweetsource/public_tweets/src/main/scala/com/twitter/tweetsource/public_tweets:public_tweets-scala", - "twml/runtime/src/main/scala/com/twitter/twml/runtime/scalding", - "util/util-core:scala", - "util/util-stats/src/main/scala", - ], -) - -scalding_job( - name = "tweet-embedding-generation-adhoc-job", - main = "com.twitter.simclusters_v2.scalding.mbcg.TweetEmbeddingGenerationAdhocJob", - args = [ - "--dateRange 2021-10-30T00 
2021-10-30T01", - "--model_name model", - "--model_path hdfs:///atla/proc/user/cassowary/explore_mbcg/models/tfx_model_1104/1635973177/tweet_tower_with_signature", - "--concurrency_level 60", - "--embedding_dimension 128", - "--expected_elements 30000000", - "--max_M 20", - "--ef_construction 200", - "--tweet_embedding_name output", - "--ann_output_path hdfs:///atla/proc/user/cassowary/explore_mbcg/ann_index/test_11_04_adhoc", - ], - config = [ - ("hadoop.submitter.cpu", 60), - ("hadoop.submitter.jvm.total-memory", "256g"), - ("submitter.tier", "preemptible"), - ("hadoop.map.jvm.total-memory", "6144m"), - ], - hadoop_cluster = "atla-proc3", - platform = "java8", - role = "cassowary", - runtime_platform = "java8", - tags = [ - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":mbcg"], -) - -scalding_job( - name = "tweet-embedding-generation-batch-job", - main = "com.twitter.simclusters_v2.scalding.mbcg.TweetEmbeddingGenerationBatchJob", - args = [ - "--model_name model", - "--model_path hdfs:///atla/proc/user/cassowary/explore_mbcg/models/tfx_0119_1day_0110_3l_5e_f2v_gpu_resave/tweet_tower_with_signature", - "--concurrency_level 60", - "--embedding_dimension 128", - "--expected_elements 5000000", - "--max_M 40", - "--ef_construction 800", - "--tweet_embedding_name output", - "--f2v_input.feature_store_embedding Follow2VecProducerEmbedding200Dataset", - "--f2v_input.feature_store_major_version 20210708", - "--minFavCount 32", - "--ann_output_path hdfs:///atla/proc/user/cassowary/explore_mbcg/ann_index/0125_batch_index_f2v_minfav", - ], - config = [ - ("hadoop.submitter.cpu", 60), - ("hadoop.submitter.jvm.total-memory", "256g"), - ("hadoop.map.jvm.total-memory", "6144m"), - ("hadoop.submitter.disk", "100g"), - ], - cron = "*/5 * * * *", - hadoop_cluster = "atla-proc3", - platform = "java8", - role = "cassowary", - runtime_platform = "java8", - tags = [ - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":mbcg"], -) - -scalding_job( - name = "tweet-embedding-generation-batch-job-alternate", - main = "com.twitter.simclusters_v2.scalding.mbcg.TweetEmbeddingGenerationBatchJobAlternate", - args = [ - "--model_name model", - "--model_path hdfs:///atla/proc/user/cassowary/explore_mbcg/models/tfx_331_329_1e_128em_b128_hn10_all_gpu/tweet_tower_with_signature", - "--concurrency_level 60", - "--embedding_dimension 128", - "--expected_elements 5000000", - "--max_M 40", - "--ef_construction 800", - "--tweet_embedding_name output", - "--f2v_input.feature_store_embedding Follow2VecProducerEmbedding200Dataset", - "--f2v_input.feature_store_major_version 20210708", - "--minFavCount 100", - "--indexAllTweets", - "--ann_output_path hdfs:///atla/proc/user/cassowary/explore_mbcg/ann_index/0401_batch_index_f2v_cosine_all_tweets", - ], - config = [ - ("hadoop.submitter.cpu", 60), - ("hadoop.submitter.jvm.total-memory", "256g"), - ("hadoop.map.jvm.total-memory", "6144m"), - ("hadoop.submitter.disk", "100g"), - ], - contact = "no-reply@twitter.com", - cron = "*/5 * * * *", - hadoop_cluster = "atla-proc3", - platform = "java8", - role = "cassowary", - runtime_platform = "java8", - tags = [ - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":mbcg"], -) - -scalding_job( - name = "tweet-embedding-generation-batch-job-experimental", - main = "com.twitter.simclusters_v2.scalding.mbcg.TweetEmbeddingGenerationBatchJobExperimental", - args = [ - "--model_name model", - "--model_path 
hdfs:///atla/proc/user/cassowary/explore_mbcg/models/tfx_0127_1day_0110_3l_10e_128e_normf2v_nocosine_gpu/tweet_tower_with_signature", - "--concurrency_level 60", - "--embedding_dimension 128", - "--expected_elements 5000000", - "--max_M 40", - "--ef_construction 800", - "--tweet_embedding_name output", - "--f2v_input.feature_store_embedding Follow2VecProducerEmbedding200Dataset", - "--f2v_input.feature_store_major_version 20210708", - "--minFavCount 32", - "--ann_output_path hdfs:///atla/proc/user/cassowary/explore_mbcg/ann_index/0128_f2v_1week_batch_index", - ], - config = [ - ("hadoop.submitter.cpu", 60), - ("hadoop.submitter.jvm.total-memory", "256g"), - ("hadoop.map.jvm.total-memory", "6144m"), - ("hadoop.submitter.disk", "100g"), - ], - contact = "no-reply@twitter.com", - cron = "*/5 * * * *", - hadoop_cluster = "atla-proc3", - platform = "java8", - role = "cassowary", - runtime_platform = "java8", - tags = [ - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":mbcg"], -) - -scalding_job( - name = "user-embedding-generation-adhoc-job", - main = "com.twitter.simclusters_v2.scalding.mbcg.UserEmbeddingGenerationAdhocJob", - args = [ - "--dateRange 2021-12-01T00 2021-12-01T01", - "--model_path hdfs:///atla/proc/user/cassowary/explore_mbcg/models/tfx_1202_logs_100m_b64_hn10_1127_video_persistent/user_tower_with_signature", - "--embedding_dimension 128", - "--user_embedding_name output", - "--kvs_output_path /user/cassowary/explore_mbcg/user_kvs_store/1207_adhoc_model_store", - ], - config = [ - ("hadoop.submitter.cpu", 60), - ("hadoop.submitter.jvm.total-memory", "256g"), - ("submitter.tier", "preemptible"), - ("hadoop.map.jvm.total-memory", "6144m"), - ], - contact = "no-reply@twitter.com", - hadoop_cluster = "atla-proc3", - platform = "java8", - role = "cassowary", - runtime_platform = "java8", - tags = [ - "bazel-compatible:migrated", - "bazel-only", - "known-to-fail-jira:SD-20253", - ], - dependencies = [":mbcg"], -) - -scalding_job( - name = "user-embedding-generation-batch-job", - main = "com.twitter.simclusters_v2.scalding.mbcg.UserEmbeddingGenerationBatchJob", - args = [ - "--model_path hdfs:///atla/proc/user/cassowary/explore_mbcg/models/tfx_0119_1day_0110_3l_5e_f2v_gpu_resave/user_tower_with_signature", - "--embedding_dimension 128", - "--user_embedding_name output", - "--f2v_input.feature_store_embedding FollowBasedConsumerFollow2VecAvgEmbedding200Dataset", - "--f2v_input.feature_store_major_version 20210708", - "--kvs_output_path /user/cassowary/explore_mbcg/user_kvs_store/0125_refreshed_model_store_f2v", - ], - config = [ - ("hadoop.submitter.cpu", 60), - ("hadoop.submitter.jvm.total-memory", "256g"), - ("submitter.tier", "preemptible"), - ("hadoop.map.jvm.total-memory", "6144m"), - ], - contact = "no-reply@twitter.com", - cron = "*/30 * * * *", - hadoop_cluster = "atla-proc3", - platform = "java8", - role = "cassowary", - runtime_platform = "java8", - tags = [ - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":mbcg"], -) - -scalding_job( - name = "user-embedding-generation-batch-job-alternate", - main = "com.twitter.simclusters_v2.scalding.mbcg.UserEmbeddingGenerationBatchJobAlternate", - args = [ - "--model_path hdfs:///atla/proc/user/cassowary/explore_mbcg/models/tfx_331_329_1e_128em_b128_hn10_all_gpu/user_tower_with_signature", - "--embedding_dimension 128", - "--user_embedding_name output", - "--f2v_input.feature_store_embedding FollowBasedConsumerFollow2VecAvgEmbedding200Dataset", - "--f2v_input.feature_store_major_version 
20210708", - "--kvs_output_path /user/cassowary/explore_mbcg/user_kvs_store/0401_refreshed_model_store_all", - ], - config = [ - ("hadoop.submitter.cpu", 60), - ("hadoop.submitter.jvm.total-memory", "256g"), - ("submitter.tier", "preemptible"), - ("hadoop.map.jvm.total-memory", "6144m"), - ], - contact = "no-reply@twitter.com", - cron = "*/30 * * * *", - hadoop_cluster = "atla-proc3", - platform = "java8", - role = "cassowary", - runtime_platform = "java8", - tags = [ - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":mbcg"], -) - -scalding_job( - name = "user-embedding-generation-batch-job-experimental", - main = "com.twitter.simclusters_v2.scalding.mbcg.UserEmbeddingGenerationBatchJobExperimental", - args = [ - "--model_path hdfs:///atla/proc/user/cassowary/explore_mbcg/models/tfx_0127_1day_0110_3l_10e_128e_normf2v_nocosine_gpu/user_tower_with_signature", - "--embedding_dimension 128", - "--user_embedding_name output", - "--f2v_input.feature_store_embedding FollowBasedConsumerFollow2VecAvgEmbedding200Dataset", - "--f2v_input.feature_store_major_version 20210708", - "--kvs_output_path /user/cassowary/explore_mbcg/user_kvs_store/0328_f2v_cosine_all_tweets_model_store", - ], - config = [ - ("hadoop.submitter.cpu", 60), - ("hadoop.submitter.jvm.total-memory", "256g"), - ("submitter.tier", "preemptible"), - ("hadoop.map.jvm.total-memory", "6144m"), - ], - contact = "no-reply@twitter.com", - cron = "*/30 * * * *", - hadoop_cluster = "atla-proc3", - platform = "java8", - role = "cassowary", - runtime_platform = "java8", - tags = [ - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":mbcg"], -) diff --git a/src/scala/com/twitter/simclusters_v2/scalding/mbcg/BUILD.docx b/src/scala/com/twitter/simclusters_v2/scalding/mbcg/BUILD.docx new file mode 100644 index 000000000..eda14fa4a Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/mbcg/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/mbcg/RecordAdapters.docx b/src/scala/com/twitter/simclusters_v2/scalding/mbcg/RecordAdapters.docx new file mode 100644 index 000000000..7d87a5e8c Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/mbcg/RecordAdapters.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/mbcg/RecordAdapters.scala b/src/scala/com/twitter/simclusters_v2/scalding/mbcg/RecordAdapters.scala deleted file mode 100644 index e972a24ae..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/mbcg/RecordAdapters.scala +++ /dev/null @@ -1,79 +0,0 @@ -package com.twitter.simclusters_v2.scalding.mbcg - -import com.twitter.ml.api.DataRecord -import com.twitter.ml.api.embedding.Embedding -import com.twitter.ml.api.FeatureContext -import com.twitter.ml.api.FloatTensor -import com.twitter.ml.api.GeneralTensor -import com.twitter.ml.api.IRecordOneToOneAdapter -import com.twitter.ml.api.util.FDsl._ -import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn -import com.twitter.simclusters_v2.thriftscala.PersistentSimClustersEmbedding -import scala.collection.JavaConverters._ - -/* -Adapters to convert data from MBCG input sources into DataRecords - */ -object TweetSimclusterRecordAdapter - extends IRecordOneToOneAdapter[(Long, PersistentSimClustersEmbedding, Embedding[Float])] { - override def getFeatureContext: FeatureContext = TweetAllFeatures.featureContext - - override def adaptToDataRecord( - tweetFeatures: (Long, PersistentSimClustersEmbedding, Embedding[Float]) - ) = { - val dataRecord = new 
DataRecord()
-    val tweetId = tweetFeatures._1
-    val tweetEmbedding = tweetFeatures._2
-    val f2vEmbedding = tweetFeatures._3
-    val simclusterWithScores = tweetEmbedding.embedding.embedding
-      .map { simclusterWithScore =>
-        // Cluster ID and score for that cluster
-        (simclusterWithScore._1.toString, simclusterWithScore._2)
-      }.toMap.asJava
-
-    dataRecord.setFeatureValue(TweetAllFeatures.tweetId, tweetId)
-    dataRecord.setFeatureValue(TweetAllFeatures.tweetSimclusters, simclusterWithScores)
-    dataRecord.setFeatureValue(
-      TweetAllFeatures.authorF2vProducerEmbedding,
-      GeneralTensor.floatTensor(
-        new FloatTensor(f2vEmbedding.map(Double.box(_)).asJava)
-      )
-    )
-
-    dataRecord
-  }
-}
-
-object UserSimclusterRecordAdapter
-    extends IRecordOneToOneAdapter[(Long, ClustersUserIsInterestedIn, Embedding[Float])] {
-  // Feature context for the user-side features set below
-  override def getFeatureContext: FeatureContext = UserAllFeatures.featureContext
-
-  override def adaptToDataRecord(
-    userSimclusterEmbedding: (Long, ClustersUserIsInterestedIn, Embedding[Float])
-  ) = {
-    val dataRecord = new DataRecord()
-    val userId = userSimclusterEmbedding._1
-    val userEmbedding = userSimclusterEmbedding._2
-    val simclusterWithScores = userEmbedding.clusterIdToScores
-      .filter {
-        case (_, score) =>
-          score.logFavScore.map(_ >= 0.0).getOrElse(false)
-      }
-      .map {
-        case (clusterId, score) =>
-          (clusterId.toString, score.logFavScore.get)
-      }.toMap.asJava
-    val f2vEmbedding = userSimclusterEmbedding._3
-
-    dataRecord.setFeatureValue(UserAllFeatures.userId, userId)
-    dataRecord.setFeatureValue(UserAllFeatures.userSimclusters, simclusterWithScores)
-    dataRecord.setFeatureValue(
-      UserAllFeatures.userF2vConsumerEmbedding,
-      GeneralTensor.floatTensor(
-        new FloatTensor(f2vEmbedding.map(Double.box(_)).asJava)
-      )
-    )
-
-    dataRecord
-  }
-}
diff --git a/src/scala/com/twitter/simclusters_v2/scalding/mbcg/TweetEmbeddingGenerationJob.docx b/src/scala/com/twitter/simclusters_v2/scalding/mbcg/TweetEmbeddingGenerationJob.docx
new file mode 100644
index 000000000..a9e5ca9b2
Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/mbcg/TweetEmbeddingGenerationJob.docx differ
diff --git a/src/scala/com/twitter/simclusters_v2/scalding/mbcg/TweetEmbeddingGenerationJob.scala b/src/scala/com/twitter/simclusters_v2/scalding/mbcg/TweetEmbeddingGenerationJob.scala
deleted file mode 100644
index 717e07493..000000000
--- a/src/scala/com/twitter/simclusters_v2/scalding/mbcg/TweetEmbeddingGenerationJob.scala
+++ /dev/null
@@ -1,384 +0,0 @@
-package com.twitter.simclusters_v2.scalding.mbcg
-
-import com.twitter.ann.common.EntityEmbedding
-import com.twitter.ann.common.Cosine
-import com.twitter.ann.common.CosineDistance
-import com.twitter.ann.common.InnerProduct
-import com.twitter.ann.common.InnerProductDistance
-import com.twitter.ann.common.ReadWriteFuturePool
-import com.twitter.ann.hnsw.TypedHnswIndex
-import com.twitter.ann.util.IndexBuilderUtils
-import com.twitter.conversions.DurationOps._
-import com.twitter.cortex.deepbird.runtime.prediction_engine.TensorflowPredictionEngineConfig
-import com.twitter.cortex.ml.embeddings.common.TweetKind
-import com.twitter.cortex.ml.embeddings.common.UserKind
-import com.twitter.finagle.mtls.authentication.ServiceIdentifier
-import com.twitter.finagle.stats.NullStatsReceiver
-import com.twitter.iesource.common.util.InteractionEventUtils
-import com.twitter.iesource.processing.events.batch.ServerEngagementsScalaDataset
-import com.twitter.iesource.thriftscala.InteractionDetails
-import com.twitter.ml.api.embedding.Embedding
-import 
com.twitter.ml.api.FeatureUtil -import com.twitter.ml.api.constant.SharedFeatures -import com.twitter.ml.api.embedding.EmbeddingSerDe -import com.twitter.ml.api.thriftscala -import com.twitter.ml.api.thriftscala.{GeneralTensor => ThriftGeneralTensor} -import com.twitter.ml.api.util.FDsl._ -import com.twitter.ml.api.util.ScalaToJavaDataRecordConversions -import com.twitter.ml.featurestore.lib.TweetId -import com.twitter.ml.featurestore.lib.embedding.EmbeddingWithEntity -import com.twitter.scalding.Args -import com.twitter.scalding.DateParser -import com.twitter.scalding.DateRange -import com.twitter.scalding.Execution -import com.twitter.scalding.UniqueID -import com.twitter.scalding._ -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossDC -import com.twitter.scalding_internal.job.FutureHelper -import com.twitter.scalding_internal.job.TwitterExecutionApp -import com.twitter.scalding_internal.job.analytics_batch.AnalyticsBatchExecution -import com.twitter.scalding_internal.job.analytics_batch.AnalyticsBatchExecutionArgs -import com.twitter.scalding_internal.job.analytics_batch.BatchDescription -import com.twitter.scalding_internal.job.analytics_batch.BatchFirstTime -import com.twitter.scalding_internal.job.analytics_batch.BatchIncrement -import com.twitter.scalding_internal.job.analytics_batch.BatchWidth -import com.twitter.scalding_internal.job.analytics_batch.TwitterScheduledExecutionApp -import com.twitter.search.common.file.FileUtils -import com.twitter.simclusters_v2.scalding.common.LogFavBasedPersistentTweetEmbeddingMhExportSource -import com.twitter.simclusters_v2.thriftscala.PersistentSimClustersEmbedding -import com.twitter.tweetsource.common.thriftscala.MediaType -import com.twitter.tweetsource.public_tweets.PublicTweetsScalaDataset -import com.twitter.tweetsource.public_tweets.thriftscala.PublicTweet -import com.twitter.twml.runtime.scalding.TensorflowBatchPredictor -import com.twitter.twml.runtime.scalding.TensorflowBatchPredictor.ScaldingThreadingConfig -import com.twitter.util.FuturePool -import com.twitter.util.logging.Logger -import java.util.TimeZone -import java.util.concurrent.Executors - -/* -This class does the following: -1) Get tweet simcluster features from LogFavBasedPersistentTweetEmbeddingMhExportSource -2) Filter them down to English media tweets that aren't replies or quote tweets using TweetSource -3) Convert the remaining tweets into DataRecords using TweetSimclusterRecordAdapter -4) Run inference using a TF model exported with a DataRecord compatible serving signature -5) Create an ANN index from the generated tweet embeddings - */ -trait TweetEmbeddingGenerationTrait { - implicit val tz: TimeZone = DateOps.UTC - implicit val dp: DateParser = DateParser.default - implicit val updateHours = 4 - - private val inputNodeName = "request:0" - private val outputNodeName = "response:0" - private val functionSignatureName = "serve" - private val predictionRequestTimeout = 5.seconds - private val SupportedLanguages = Set("en") - private val tweetSourceLookback = Days(2) - - private val DEFAULT_F2V_VECTOR: Embedding[Float] = Embedding(Array.fill[Float](200)(0.0f)) - - def getPredictionEngine(modelName: String, modelPath: String): TensorflowBatchPredictor = { - val config = TensorflowPredictionEngineConfig( - modelName = modelName, - modelSource = modelPath, - threadingConfig = Some(ScaldingThreadingConfig), - defaultInputNode = inputNodeName, - defaultOutputNode = outputNodeName, - functionSignatureName = 
functionSignatureName, - statsReceiver = NullStatsReceiver - ) - TensorflowBatchPredictor(config, predictionRequestTimeout) - } - - def getEmbeddingWithEntity(tweetEmbeddingTensor: ThriftGeneralTensor, tweetId: Long) = { - tweetEmbeddingTensor match { - case ThriftGeneralTensor.RawTypedTensor(rawTensor) => - val embedding = EmbeddingSerDe.floatEmbeddingSerDe.fromThrift( - thriftscala.Embedding(Some(rawTensor)) - ) - EmbeddingWithEntity[TweetId](TweetId(tweetId), embedding) - case _ => throw new IllegalArgumentException("tensor is wrong type!") - } - } - - def buildAnnIndex( - pipe: TypedPipe[EmbeddingWithEntity[TweetId]], - args: Args - ): Execution[Unit] = { - def embeddingDimension: Int = args.int("embedding_dimension", 128) - def efConstruction: Int = args.int("ef_construction", 800) - def maxM: Int = args.int("max_M", 40) - val log: Logger = Logger(getClass) - val annOutputPath: String = args("ann_output_path") - - val embeddingWithEntity = pipe.map { - case EmbeddingWithEntity(tweetId, embedding) => - EntityEmbedding[TweetId](tweetId, embedding) - } - val concurrencyLevel = args.int("concurrency_level", 60) - val expectedElements = args.int("expected_elements", 30000000) - val threadPool = Executors.newFixedThreadPool(concurrencyLevel) - val hnswIndex = TypedHnswIndex.serializableIndex[TweetId, InnerProductDistance]( - embeddingDimension, - InnerProduct, - efConstruction, - maxM, - expectedElements, - TweetKind.byteInjection, - ReadWriteFuturePool(FuturePool.apply(threadPool)) - ) - - // Create a timestamped directory to use for recovery in case of index corruption - val timeStampedAnnOutputPath: String = annOutputPath + "/" + (System.currentTimeMillis() / 1000) - val timeStampedAnnOutputDirectory = FileUtils.getFileHandle(timeStampedAnnOutputPath) - - embeddingWithEntity.toIterableExecution - .flatMap { annEmbeddings => - val future = - IndexBuilderUtils.addToIndex(hnswIndex, annEmbeddings.toStream, concurrencyLevel) - val result = future.map { numberUpdates => - log.info(s"Performed $numberUpdates updates") - hnswIndex.toDirectory(timeStampedAnnOutputDirectory) - log.info(s"Finished writing to timestamped index directory - " + - s"$timeStampedAnnOutputDirectory") - } - FutureHelper.executionFrom(result).unit - }.onComplete { _ => - threadPool.shutdown() - Unit - } - } - - def getTweetSimclusterFeatures( - args: Args - )( - implicit dateRange: DateRange - ): TypedPipe[(Long, PersistentSimClustersEmbedding)] = { - val serviceIdEnv = args.getOrElse("sIdEnv", "prod") - val serviceIdRole = args.getOrElse("sIdRole", "cassowary") - val serviceIdZone = args.getOrElse("sIdZone", "atla") - val serviceIdName = args - .getOrElse("sIdName", "tweet-embedding-generation-batch-job") - val serviceId = ServiceIdentifier( - role = serviceIdRole, - service = serviceIdName, - environment = serviceIdEnv, - zone = serviceIdZone) - - val logFavBasedPersistentTweetEmbeddingSource = - new LogFavBasedPersistentTweetEmbeddingMhExportSource( - range = dateRange.prepend(Hours(24)), - serviceIdentifier = serviceId) - val tweetSimclusterEmbeddingTypedPipe = TypedPipe - .from(logFavBasedPersistentTweetEmbeddingSource) - .collect { - case ( - (tweetId, timestamp), - simclusterEmbedding: PersistentSimClustersEmbedding - ) if timestamp == 1L => // 1L corresponds to the LongestL2Norm simcluster embedding - (tweetId.toLong, simclusterEmbedding) - } - - tweetSimclusterEmbeddingTypedPipe - } - - def getTweetSource()(implicit dateRange: DateRange): TypedPipe[PublicTweet] = { - val recentTweets = DAL - 
.read(PublicTweetsScalaDataset, dateRange.prepend(tweetSourceLookback)) - .toTypedPipe - - recentTweets - } - - def isVideoTweet(tweet: PublicTweet): Boolean = { - tweet.media.exists { mediaSeq => - mediaSeq.exists { e => - e.mediaType.contains(MediaType.Video) - } - } - } - - def getEngagementFilteredTweets( - minFavCount: Long - )( - implicit dateRange: DateRange - ): TypedPipe[(Long, Int)] = { - val engagementFilteredTweetsPipe = DAL - .read(ServerEngagementsScalaDataset, dateRange.prepend(Days(2))).withRemoteReadPolicy( - AllowCrossDC).toTypedPipe - .collect { - case event if InteractionEventUtils.isTweetType(event) => - val targetTweetId = event.targetId - event.details match { - case InteractionDetails.Favorite(_) => (targetTweetId, 1) - case _ => (targetTweetId, 0) - } - } - .sumByKey - .map { - case (tweetId, count) => (tweetId, count) - } - .filter(_._2 >= minFavCount) - - engagementFilteredTweetsPipe - } - - def run(args: Args)(implicit dateRange: DateRange, idx: UniqueID) = { - val minFavCount = args.int("minFavCount", 32) - val indexAllTweets = args.boolean("indexAllTweets") - - val tweetSimclusterDataset = getTweetSimclusterFeatures(args) - val tweetSourceDataset = getTweetSource() - val engagementFilteredTweetsPipe = getEngagementFilteredTweets(minFavCount) - val inputEmbeddingFormat = UserKind.parser - .getEmbeddingFormat(args, "f2v_input", Some(dateRange.prepend(Days(14)))) - val f2vProducerEmbeddings = inputEmbeddingFormat.getEmbeddings - .map { - case EmbeddingWithEntity(userId, embedding) => (userId.userId, embedding) - } - - val engagementFilteredTweetInfoPipe = tweetSourceDataset - .groupBy(_.tweetId) - .join(engagementFilteredTweetsPipe.groupBy(_._1)) - .map { - case (tweetId, (tweetInfo, tweetFavCount)) => - (tweetId, tweetInfo) - } - - val filteredSimclustersPipe = tweetSimclusterDataset - .groupBy(_._1) - .join(engagementFilteredTweetInfoPipe.groupBy(_._1)) - .map { - case (tweetId, ((_, simclusterEmbedding), (_, tweetInfo))) => - (tweetId, simclusterEmbedding, tweetInfo) - } - .filter { - case (_, _, tweetInfo) => - tweetInfo.quotedTweetTweetId.isEmpty && - tweetInfo.inReplyToTweetId.isEmpty && - tweetInfo.language.exists(SupportedLanguages.contains) && - (indexAllTweets || (!tweetInfo.media.exists(_.isEmpty) && isVideoTweet(tweetInfo))) && - !tweetInfo.nsfwAdmin && - !tweetInfo.nsfwUser - } - .map { - case (tweetId, simclusterEmbedding, tweetInfo) => - (tweetInfo.userId, tweetId, simclusterEmbedding) - } - - val dataRecordsPipe = filteredSimclustersPipe - .groupBy(_._1) - .leftJoin(f2vProducerEmbeddings.groupBy(_._1)) - .values - .map { - case ((authorId1, tweetId, simclusterEmbedding), Some((authorId2, f2vEmbedding))) => - TweetSimclusterRecordAdapter.adaptToDataRecord( - (tweetId, simclusterEmbedding, f2vEmbedding)) - case ((authorId, tweetId, simclusterEmbedding), None) => - TweetSimclusterRecordAdapter.adaptToDataRecord( - (tweetId, simclusterEmbedding, DEFAULT_F2V_VECTOR)) - } - - val modelPath = args.getOrElse("model_path", "") - val batchPredictor = getPredictionEngine(modelName = "tweet_model", modelPath = modelPath) - val tweetIdFeature = SharedFeatures.TWEET_ID - val tweetEmbeddingName = args.getOrElse("tweet_embedding_name", "output") - - val outputPipe = batchPredictor.predict(dataRecordsPipe).map { - case (originalDataRecord, predictedDataRecord) => - val tweetId = originalDataRecord.getFeatureValue(tweetIdFeature) - val scalaPredictedDataRecord = - ScalaToJavaDataRecordConversions.javaDataRecord2ScalaDataRecord(predictedDataRecord) - val 
tweetEmbeddingTensor = - scalaPredictedDataRecord.tensors.get(FeatureUtil.featureIdForName(tweetEmbeddingName)) - val tweetEmbeddingWithEntity = getEmbeddingWithEntity(tweetEmbeddingTensor, tweetId) - tweetEmbeddingWithEntity - } - - buildAnnIndex(outputPipe, args) - } -} - -object TweetEmbeddingGenerationAdhocJob - extends TwitterExecutionApp - with TweetEmbeddingGenerationTrait { - - override def job: Execution[Unit] = - Execution.withId { implicit uid => - Execution.withArgs { args => - implicit val dateRange: DateRange = DateRange.parse(args.list("dateRange")) - run(args) - } - } -} - -object TweetEmbeddingGenerationBatchJob - extends TwitterScheduledExecutionApp - with TweetEmbeddingGenerationTrait { - - override def scheduledJob: Execution[Unit] = - Execution.withId { implicit uid => - Execution.withArgs { args => - implicit val tz: TimeZone = DateOps.UTC - val batchFirstTime = BatchFirstTime(RichDate("2021-10-28")(tz, DateParser.default)) - val analyticsArgs = AnalyticsBatchExecutionArgs( - batchDesc = BatchDescription(getClass.getName), - firstTime = batchFirstTime, - batchIncrement = BatchIncrement(Hours(updateHours)), - batchWidth = Some(BatchWidth(Hours(updateHours))) - ) - - AnalyticsBatchExecution(analyticsArgs) { implicit dateRange => - run(args) - } - } - } -} - -object TweetEmbeddingGenerationBatchJobAlternate - extends TwitterScheduledExecutionApp - with TweetEmbeddingGenerationTrait { - - override def scheduledJob: Execution[Unit] = - Execution.withId { implicit uid => - Execution.withArgs { args => - implicit val tz: TimeZone = DateOps.UTC - val batchFirstTime = BatchFirstTime(RichDate("2022-03-28")(tz, DateParser.default)) - val analyticsArgs = AnalyticsBatchExecutionArgs( - batchDesc = BatchDescription(getClass.getName), - firstTime = batchFirstTime, - batchIncrement = BatchIncrement(Hours(updateHours)), - batchWidth = Some(BatchWidth(Hours(updateHours))) - ) - - AnalyticsBatchExecution(analyticsArgs) { implicit dateRange => - run(args) - } - } - } -} - -object TweetEmbeddingGenerationBatchJobExperimental - extends TwitterScheduledExecutionApp - with TweetEmbeddingGenerationTrait { - - override def scheduledJob: Execution[Unit] = - Execution.withId { implicit uid => - Execution.withArgs { args => - implicit val tz: TimeZone = DateOps.UTC - val batchFirstTime = BatchFirstTime(RichDate("2021-12-12")(tz, DateParser.default)) - val analyticsArgs = AnalyticsBatchExecutionArgs( - batchDesc = BatchDescription(getClass.getName), - firstTime = batchFirstTime, - batchIncrement = BatchIncrement(Hours(updateHours)), - batchWidth = Some(BatchWidth(Hours(updateHours))) - ) - - AnalyticsBatchExecution(analyticsArgs) { implicit dateRange => - run(args) - } - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/mbcg/UserEmbeddingGenerationJob.docx b/src/scala/com/twitter/simclusters_v2/scalding/mbcg/UserEmbeddingGenerationJob.docx new file mode 100644 index 000000000..f7fecd5cf Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/mbcg/UserEmbeddingGenerationJob.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/mbcg/UserEmbeddingGenerationJob.scala b/src/scala/com/twitter/simclusters_v2/scalding/mbcg/UserEmbeddingGenerationJob.scala deleted file mode 100644 index f747764d9..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/mbcg/UserEmbeddingGenerationJob.scala +++ /dev/null @@ -1,270 +0,0 @@ -package com.twitter.simclusters_v2.scalding.mbcg - -import com.twitter.conversions.DurationOps._ -import 
com.twitter.cortex.deepbird.runtime.prediction_engine.TensorflowPredictionEngineConfig -import com.twitter.cortex.ml.embeddings.common.UserKind -import com.twitter.finagle.stats.NullStatsReceiver -import com.twitter.ml.api.FeatureUtil -import com.twitter.ml.api.constant.SharedFeatures -import com.twitter.ml.api.embedding.Embedding -import com.twitter.ml.api.thriftscala -import com.twitter.ml.api.thriftscala.{GeneralTensor => ThriftGeneralTensor} -import com.twitter.ml.api.util.FDsl._ -import com.twitter.ml.api.util.ScalaToJavaDataRecordConversions -import com.twitter.ml.featurestore.lib.embedding.EmbeddingWithEntity -import com.twitter.scalding.Args -import com.twitter.scalding.DateParser -import com.twitter.scalding.DateRange -import com.twitter.scalding.Execution -import com.twitter.scalding.UniqueID -import com.twitter.scalding._ -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.scalding_internal.dalv2.DALWrite.D -import com.twitter.scalding_internal.dalv2.DALWrite._ -import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossDC -import com.twitter.scalding_internal.job.TwitterExecutionApp -import com.twitter.scalding_internal.job.analytics_batch.AnalyticsBatchExecution -import com.twitter.scalding_internal.job.analytics_batch.AnalyticsBatchExecutionArgs -import com.twitter.scalding_internal.job.analytics_batch.BatchDescription -import com.twitter.scalding_internal.job.analytics_batch.BatchFirstTime -import com.twitter.scalding_internal.job.analytics_batch.BatchIncrement -import com.twitter.scalding_internal.job.analytics_batch.BatchWidth -import com.twitter.scalding_internal.job.analytics_batch.TwitterScheduledExecutionApp -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.hdfs_sources.AdhocKeyValSources -import com.twitter.simclusters_v2.hdfs_sources.ExploreMbcgUserEmbeddingsKvScalaDataset -import com.twitter.simclusters_v2.scalding.common.Util -import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn -import com.twitter.twml.runtime.scalding.TensorflowBatchPredictor -import com.twitter.twml.runtime.scalding.TensorflowBatchPredictor.ScaldingThreadingConfig -import com.twitter.usersource.snapshot.flat.UsersourceFlatScalaDataset -import com.twitter.usersource.snapshot.flat.thriftscala.FlatUser -import java.util.TimeZone - -/* -This class does the following: -1) Get user IIAPE Simcluster features that use LogFav scores -2) Filter them down to users whose accounts are not deactivated or suspended -3) Convert the remaining user Simclusters into DataRecords using UserSimclusterRecordAdapter -4) Run inference using a TF model exported with a DataRecord compatible serving signature -5) Write to MH using a KeyVal format - */ -trait UserEmbeddingGenerationTrait { - implicit val tz: TimeZone = DateOps.UTC - implicit val dp: DateParser = DateParser.default - implicit val updateHours = 12 - - private val inputNodeName = "request:0" - private val outputNodeName = "response:0" - private val functionSignatureName = "serve" - private val predictionRequestTimeout = 5.seconds - private val IIAPEHdfsPath: String = - "/atla/proc3/user/cassowary/manhattan_sequence_files/interested_in_from_ape/Model20m145k2020" - - private val DEFAULT_F2V_VECTOR: Embedding[Float] = Embedding(Array.fill[Float](200)(0.0f)) - - def getPredictionEngine(modelName: String, modelPath: String): TensorflowBatchPredictor = { - val config = TensorflowPredictionEngineConfig( - modelName = modelName, - modelSource = modelPath, - 
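// The input/output node names ("request:0"/"response:0") and the "serve" function
// signature configured below must match the DataRecord-compatible serving signature
// the TF model was exported with (step 4 of the pipeline described above).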
threadingConfig = Some(ScaldingThreadingConfig), - defaultInputNode = inputNodeName, - defaultOutputNode = outputNodeName, - functionSignatureName = functionSignatureName, - statsReceiver = NullStatsReceiver - ) - TensorflowBatchPredictor(config, predictionRequestTimeout) - } - - def getEmbeddingWithEntity(userEmbeddingTensor: ThriftGeneralTensor, userId: Long) = { - userEmbeddingTensor match { - case ThriftGeneralTensor.RawTypedTensor(rawTensor) => - val embedding = - thriftscala.Embedding(Some(rawTensor)) - KeyVal(userId, embedding) - case _ => throw new IllegalArgumentException("tensor is wrong type!") - } - } - - def writeUserEmbedding( - result: TypedPipe[KeyVal[Long, thriftscala.Embedding]], - args: Args - ): Execution[Unit] = { - result.writeDALVersionedKeyValExecution( - ExploreMbcgUserEmbeddingsKvScalaDataset, - D.Suffix( - args.getOrElse("kvs_output_path", "/user/cassowary/explore_mbcg/user_kvs_store/test") - ) - ) - } - - def getUserSimclusterFeatures( - args: Args - )( - implicit dateRange: DateRange - ): TypedPipe[(Long, ClustersUserIsInterestedIn)] = { - val userSimclusterEmbeddingTypedPipe = TypedPipe - .from(AdhocKeyValSources.interestedInSource(IIAPEHdfsPath)) - .collect { - case ( - userId, - iIAPE: ClustersUserIsInterestedIn - ) => - (userId.toLong, iIAPE) - } - - userSimclusterEmbeddingTypedPipe - } - - def getUserSource()(implicit dateRange: DateRange): TypedPipe[FlatUser] = { - val userSource = - DAL - .readMostRecentSnapshotNoOlderThan(UsersourceFlatScalaDataset, Days(7)) - .withRemoteReadPolicy(AllowCrossDC) - .toTypedPipe - - userSource - } - - def run(args: Args)(implicit dateRange: DateRange, id: UniqueID) = { - val userSimclusterDataset = getUserSimclusterFeatures(args) - val userSourceDataset = getUserSource() - - val inputEmbeddingFormat = UserKind.parser - .getEmbeddingFormat(args, "f2v_input", Some(dateRange.prepend(Days(14)))) - val f2vConsumerEmbeddings = inputEmbeddingFormat.getEmbeddings - .map { - case EmbeddingWithEntity(userId, embedding) => (userId.userId, embedding) - } - - val filteredUserPipe = userSimclusterDataset - .groupBy(_._1) - .join(userSourceDataset.groupBy(_.id.getOrElse(-1L))) - .map { - case (userId, ((_, simclusterEmbedding), userInfo)) => - (userId, simclusterEmbedding, userInfo) - } - .filter { - case (_, _, userInfo) => - !userInfo.deactivated.contains(true) && !userInfo.suspended - .contains(true) - } - .map { - case (userId, simclusterEmbedding, _) => - (userId, simclusterEmbedding) - } - - val dataRecordsPipe = filteredUserPipe - .groupBy(_._1) - .leftJoin(f2vConsumerEmbeddings.groupBy(_._1)) - .values - .map { - case ((userId1, simclusterEmbedding), Some((userId2, f2vEmbedding))) => - UserSimclusterRecordAdapter.adaptToDataRecord( - (userId1, simclusterEmbedding, f2vEmbedding)) - case ((userId, simclusterEmbedding), None) => - UserSimclusterRecordAdapter.adaptToDataRecord( - (userId, simclusterEmbedding, DEFAULT_F2V_VECTOR)) - } - - val modelPath = args.getOrElse("model_path", "") - val batchPredictor = getPredictionEngine(modelName = "tweet_model", modelPath = modelPath) - val userIdFeature = SharedFeatures.USER_ID - val userEmbeddingName = args.getOrElse("user_embedding_name", "output") - - val outputPipe = batchPredictor.predict(dataRecordsPipe).map { - case (originalDataRecord, predictedDataRecord) => - val userId = originalDataRecord.getFeatureValue(userIdFeature) - val scalaPredictedDataRecord = - ScalaToJavaDataRecordConversions.javaDataRecord2ScalaDataRecord(predictedDataRecord) - val userEmbeddingTensor = - 
scalaPredictedDataRecord.tensors.get(FeatureUtil.featureIdForName(userEmbeddingName)) - val userEmbeddingWithEntity = getEmbeddingWithEntity(userEmbeddingTensor, userId) - userEmbeddingWithEntity - } - - Util.printCounters(writeUserEmbedding(outputPipe, args)) - } -} - -object UserEmbeddingGenerationAdhocJob - extends TwitterExecutionApp - with UserEmbeddingGenerationTrait { - - override def job: Execution[Unit] = - Execution.withId { implicit uid => - Execution.withArgs { args => - implicit val dateRange: DateRange = DateRange.parse(args.list("dateRange")) - run(args) - } - } -} - -object UserEmbeddingGenerationBatchJob - extends TwitterScheduledExecutionApp - with UserEmbeddingGenerationTrait { - - override def scheduledJob: Execution[Unit] = - Execution.withId { implicit uid => - Execution.withArgs { args => - implicit val tz: TimeZone = DateOps.UTC - val batchFirstTime = BatchFirstTime(RichDate("2021-12-04")(tz, DateParser.default)) - val analyticsArgs = AnalyticsBatchExecutionArgs( - batchDesc = BatchDescription(getClass.getName), - firstTime = batchFirstTime, - batchIncrement = BatchIncrement(Hours(updateHours)), - batchWidth = Some(BatchWidth(Hours(updateHours))) - ) - - AnalyticsBatchExecution(analyticsArgs) { implicit dateRange => - run(args) - } - } - } -} - -object UserEmbeddingGenerationBatchJobAlternate - extends TwitterScheduledExecutionApp - with UserEmbeddingGenerationTrait { - - override def scheduledJob: Execution[Unit] = - Execution.withId { implicit uid => - Execution.withArgs { args => - implicit val tz: TimeZone = DateOps.UTC - val batchFirstTime = BatchFirstTime(RichDate("2022-03-28")(tz, DateParser.default)) - val analyticsArgs = AnalyticsBatchExecutionArgs( - batchDesc = BatchDescription(getClass.getName), - firstTime = batchFirstTime, - batchIncrement = BatchIncrement(Hours(updateHours)), - batchWidth = Some(BatchWidth(Hours(updateHours))) - ) - - AnalyticsBatchExecution(analyticsArgs) { implicit dateRange => - run(args) - } - } - } -} - -object UserEmbeddingGenerationBatchJobExperimental - extends TwitterScheduledExecutionApp - with UserEmbeddingGenerationTrait { - - override def scheduledJob: Execution[Unit] = - Execution.withId { implicit uid => - Execution.withArgs { args => - implicit val tz: TimeZone = DateOps.UTC - val batchFirstTime = BatchFirstTime(RichDate("2021-12-12")(tz, DateParser.default)) - val analyticsArgs = AnalyticsBatchExecutionArgs( - batchDesc = BatchDescription(getClass.getName), - firstTime = batchFirstTime, - batchIncrement = BatchIncrement(Hours(updateHours)), - batchWidth = Some(BatchWidth(Hours(updateHours))) - ) - - AnalyticsBatchExecution(analyticsArgs) { implicit dateRange => - run(args) - } - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/multi_type_graph/assemble_multi_type_graph/AssembleMultiTypeGraph.docx b/src/scala/com/twitter/simclusters_v2/scalding/multi_type_graph/assemble_multi_type_graph/AssembleMultiTypeGraph.docx new file mode 100644 index 000000000..07369b9bc Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/multi_type_graph/assemble_multi_type_graph/AssembleMultiTypeGraph.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/multi_type_graph/assemble_multi_type_graph/AssembleMultiTypeGraph.scala b/src/scala/com/twitter/simclusters_v2/scalding/multi_type_graph/assemble_multi_type_graph/AssembleMultiTypeGraph.scala deleted file mode 100644 index 5e289c6b8..000000000 --- 
a/src/scala/com/twitter/simclusters_v2/scalding/multi_type_graph/assemble_multi_type_graph/AssembleMultiTypeGraph.scala +++ /dev/null @@ -1,514 +0,0 @@ -package com.twitter.simclusters_v2.scalding -package multi_type_graph.assemble_multi_type_graph - -import com.twitter.bijection.scrooge.BinaryScalaCodec -import com.twitter.scalding_internal.job.RequiredBinaryComparators.ordSer -import com.twitter.scalding.typed.TypedPipe -import com.twitter.scalding.{DateRange, Days, Stat, UniqueID} -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.simclusters_v2.scalding.embedding.common.ExternalDataSources -import com.twitter.simclusters_v2.thriftscala.{ - LeftNode, - Noun, - RightNode, - RightNodeType, - RightNodeWithEdgeWeight -} -import java.util.TimeZone -import com.twitter.iesource.thriftscala.{InteractionEvent, InteractionType, ReferenceTweet} -import com.twitter.simclusters_v2.common.{Country, Language, TopicId, TweetId, UserId} -import com.twitter.usersource.snapshot.combined.UsersourceScalaDataset -import com.twitter.frigate.data_pipeline.magicrecs.magicrecs_notifications_lite.thriftscala.MagicRecsNotificationLite -import com.twitter.twadoop.user.gen.thriftscala.CombinedUser - -object AssembleMultiTypeGraph { - import Config._ - - implicit val nounOrdering: Ordering[Noun] = new Ordering[Noun] { - // We define an ordering for each noun type as specified in simclusters_v2/multi_type_graph.thrift - // Please make sure we don't remove anything here that's still a part of the union Noun thrift and - // vice versa, if we add a new noun type to thrift, an ordering for it needs to be added here as well. - def nounTypeOrder(noun: Noun): Int = noun match { - case _: Noun.UserId => 0 - case _: Noun.Country => 1 - case _: Noun.Language => 2 - case _: Noun.Query => 3 - case _: Noun.TopicId => 4 - case _: Noun.TweetId => 5 - } - - override def compare(x: Noun, y: Noun): Int = (x, y) match { - case (Noun.UserId(a), Noun.UserId(b)) => a compare b - case (Noun.Country(a), Noun.Country(b)) => a compare b - case (Noun.Language(a), Noun.Language(b)) => a compare b - case (Noun.Query(a), Noun.Query(b)) => a compare b - case (Noun.TopicId(a), Noun.TopicId(b)) => a compare b - case (Noun.TweetId(a), Noun.TweetId(b)) => a compare b - case (nounA, nounB) => nounTypeOrder(nounA) compare nounTypeOrder(nounB) - } - } - implicit val rightNodeTypeOrdering: Ordering[RightNodeType] = ordSer[RightNodeType] - - implicit val rightNodeTypeWithNounOrdering: Ordering[RightNode] = - new Ordering[RightNode] { - override def compare(x: RightNode, y: RightNode): Int = { - Ordering - .Tuple2(rightNodeTypeOrdering, nounOrdering) - .compare((x.rightNodeType, x.noun), (y.rightNodeType, y.noun)) - } - } - - def getUserTweetInteractionGraph( - tweetInteractionEvents: TypedPipe[InteractionEvent], - )( - implicit uniqueID: UniqueID - ): TypedPipe[(LeftNode, RightNodeWithEdgeWeight)] = { - val numUserTweetInteractionEntries = Stat("num_user_tweet_interaction_entries") - val numDistinctUserTweetInteractionEntries = Stat("num_distinct_user_tweet_interaction_entries") - val numFavedTweets = Stat("num_faved_tweets") - val numRepliedTweets = Stat("num_replied_tweets") - val numRetweetedTweets = Stat("num_retweeted_tweets") - val userTweetInteractionsByType: TypedPipe[((UserId, RightNodeType), TweetId)] = - tweetInteractionEvents - .flatMap { event => - val referenceTweet: Option[ReferenceTweet] = event.referenceTweet - val targetId: Long = event.targetId - val userId: Long = event.engagingUserId - - // To find the id of the tweet that 
was interacted with - // For likes, this is the targetId; for retweet or reply, it is the referenceTweet's id - // One thing to note is that for likes, referenceTweet is empty - val (tweetIdOpt, rightNodeTypeOpt) = { - event.interactionType match { - case Some(InteractionType.Favorite) => - // Only allow favorites on original tweets, not retweets, to avoid double-counting - // because we have retweet-type tweets in the data source as well - ( - if (referenceTweet.isEmpty) { - numFavedTweets.inc() - Some(targetId) - } else None, - Some(RightNodeType.FavTweet)) - case Some(InteractionType.Reply) => - numRepliedTweets.inc() - (referenceTweet.map(_.tweetId), Some(RightNodeType.ReplyTweet)) - case Some(InteractionType.Retweet) => - numRetweetedTweets.inc() - (referenceTweet.map(_.tweetId), Some(RightNodeType.RetweetTweet)) - case _ => (None, None) - } - } - for { - tweetId <- tweetIdOpt - rightNodeType <- rightNodeTypeOpt - } yield { - numUserTweetInteractionEntries.inc() - ((userId, rightNodeType), tweetId) - } - } - - userTweetInteractionsByType - .mapValues(Set(_)) - .sumByKey - .flatMap { - case ((userId, rightNodeType), tweetIdSet) => - tweetIdSet.map { tweetId => - numDistinctUserTweetInteractionEntries.inc() - ( - LeftNode.UserId(userId), - RightNodeWithEdgeWeight( - rightNode = RightNode(rightNodeType = rightNodeType, noun = Noun.TweetId(tweetId)), - weight = 1.0)) - } - } - } - - def getUserFavGraph( - userUserFavEdges: TypedPipe[(UserId, UserId, Double)] - )( - implicit uniqueID: UniqueID - ): TypedPipe[(LeftNode, RightNodeWithEdgeWeight)] = { - val numInputFavEdges = Stat("num_input_fav_edges") - userUserFavEdges.map { - case (srcId, destId, edgeWt) => - numInputFavEdges.inc() - ( - LeftNode.UserId(srcId), - RightNodeWithEdgeWeight( - rightNode = - RightNode(rightNodeType = RightNodeType.FavUser, noun = Noun.UserId(destId)), - weight = edgeWt)) - } - } - - def getUserFollowGraph( - userUserFollowEdges: TypedPipe[(UserId, UserId)] - )( - implicit uniqueID: UniqueID - ): TypedPipe[(LeftNode, RightNodeWithEdgeWeight)] = { - val numFlockFollowEdges = Stat("num_flock_follow_edges") - userUserFollowEdges.map { - case (srcId, destId) => - numFlockFollowEdges.inc() - ( - LeftNode.UserId(srcId), - RightNodeWithEdgeWeight( - rightNode = - RightNode(rightNodeType = RightNodeType.FollowUser, noun = Noun.UserId(destId)), - weight = 1.0)) - } - } - - def getUserBlockGraph( - userUserBlockEdges: TypedPipe[(UserId, UserId)] - )( - implicit uniqueID: UniqueID - ): TypedPipe[(LeftNode, RightNodeWithEdgeWeight)] = { - val numFlockBlockEdges = Stat("num_flock_block_edges") - userUserBlockEdges.map { - case (srcId, destId) => - numFlockBlockEdges.inc() - ( - LeftNode.UserId(srcId), - RightNodeWithEdgeWeight( - rightNode = - RightNode(rightNodeType = RightNodeType.BlockUser, noun = Noun.UserId(destId)), - weight = 1.0)) - } - } - - def getUserAbuseReportGraph( - userUserAbuseReportEdges: TypedPipe[(UserId, UserId)] - )( - implicit uniqueID: UniqueID - ): TypedPipe[(LeftNode, RightNodeWithEdgeWeight)] = { - val numFlockAbuseEdges = Stat("num_flock_abuse_edges") - userUserAbuseReportEdges.map { - case (srcId, destId) => - numFlockAbuseEdges.inc() - ( - LeftNode.UserId(srcId), - RightNodeWithEdgeWeight( - rightNode = - RightNode(rightNodeType = RightNodeType.AbuseReportUser, noun = Noun.UserId(destId)), - weight = 1.0)) - } - } - - def filterInvalidUsers( - flockEdges: TypedPipe[(UserId, UserId)], - validUsers: TypedPipe[UserId] - ): TypedPipe[(UserId, UserId)] = { - flockEdges - .join(validUsers.asKeys) - 
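// The first join keeps only edges whose source user is valid; the map below swaps
// the key to the destination user so the second join can validate it, then swaps back.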
// .withReducers(10000) - .map { - case (srcId, (destId, _)) => - (destId, srcId) - } - .join(validUsers.asKeys) - // .withReducers(10000) - .map { - case (destId, (srcId, _)) => - (srcId, destId) - } - } - - def getUserSpamReportGraph( - userUserSpamReportEdges: TypedPipe[(UserId, UserId)] - )( - implicit uniqueID: UniqueID - ): TypedPipe[(LeftNode, RightNodeWithEdgeWeight)] = { - val numFlockSpamEdges = Stat("num_flock_spam_edges") - userUserSpamReportEdges.map { - case (srcId, destId) => - numFlockSpamEdges.inc() - ( - LeftNode.UserId(srcId), - RightNodeWithEdgeWeight( - rightNode = - RightNode(rightNodeType = RightNodeType.SpamReportUser, noun = Noun.UserId(destId)), - weight = 1.0)) - } - } - - def getUserTopicFollowGraph( - topicUserFollowedByEdges: TypedPipe[(TopicId, UserId)] - )( - implicit uniqueID: UniqueID - ): TypedPipe[(LeftNode, RightNodeWithEdgeWeight)] = { - val numTFGEdges = Stat("num_tfg_edges") - topicUserFollowedByEdges.map { - case (topicId, userId) => - numTFGEdges.inc() - ( - LeftNode.UserId(userId), - RightNodeWithEdgeWeight( - rightNode = - RightNode(rightNodeType = RightNodeType.FollowTopic, noun = Noun.TopicId(topicId)), - weight = 1.0) - ) - } - } - - def getUserSignUpCountryGraph( - userSignUpCountryEdges: TypedPipe[(UserId, (Country, Language))] - )( - implicit uniqueID: UniqueID - ): TypedPipe[(LeftNode, RightNodeWithEdgeWeight)] = { - val numUserSourceEntriesRead = Stat("num_user_source_entries") - userSignUpCountryEdges.map { - case (userId, (country, lang)) => - numUserSourceEntriesRead.inc() - ( - LeftNode.UserId(userId), - RightNodeWithEdgeWeight( - rightNode = - RightNode(rightNodeType = RightNodeType.SignUpCountry, noun = Noun.Country(country)), - weight = 1.0)) - } - } - - def getMagicRecsNotifOpenOrClickTweetsGraph( - userMRNotifOpenOrClickEvents: TypedPipe[MagicRecsNotificationLite] - )( - implicit uniqueID: UniqueID - ): TypedPipe[(LeftNode, RightNodeWithEdgeWeight)] = { - val numNotifOpenOrClickEntries = Stat("num_notif_open_or_click") - userMRNotifOpenOrClickEvents.flatMap { entry => - numNotifOpenOrClickEntries.inc() - for { - userId <- entry.targetUserId - tweetId <- entry.tweetId - } yield { - ( - LeftNode.UserId(userId), - RightNodeWithEdgeWeight( - rightNode = RightNode( - rightNodeType = RightNodeType.NotifOpenOrClickTweet, - noun = Noun.TweetId(tweetId)), - weight = 1.0)) - } - } - } - - def getUserConsumedLanguagesGraph( - userConsumedLanguageEdges: TypedPipe[(UserId, Seq[(Language, Double)])] - )( - implicit uniqueID: UniqueID - ): TypedPipe[(LeftNode, RightNodeWithEdgeWeight)] = { - val numPenguinSourceEntriesRead = Stat("num_penguin_source_entries") - userConsumedLanguageEdges.flatMap { - case (userId, langWithWeights) => - numPenguinSourceEntriesRead.inc() - langWithWeights.map { - case (lang, weight) => - ( - LeftNode.UserId(userId), - RightNodeWithEdgeWeight( - rightNode = RightNode( - rightNodeType = RightNodeType.ConsumedLanguage, - noun = Noun.Language(lang)), - weight = weight)) - } - } - } - - def getSearchGraph( - userSearchQueryEdges: TypedPipe[(UserId, String)] - )( - implicit uniqueID: UniqueID - ): TypedPipe[(LeftNode, RightNodeWithEdgeWeight)] = { - val numSearchQueries = Stat("num_search_queries") - userSearchQueryEdges.map { - case (userId, query) => - numSearchQueries.inc() - ( - LeftNode.UserId(userId), - RightNodeWithEdgeWeight( - rightNode = - RightNode(rightNodeType = RightNodeType.SearchQuery, noun = Noun.Query(query)), - weight = 1.0)) - } - } - - def buildEmployeeGraph( - fullGraph: TypedPipe[(LeftNode, 
RightNodeWithEdgeWeight)] - )( - implicit uniqueID: UniqueID - ): TypedPipe[(LeftNode, RightNodeWithEdgeWeight)] = { - val numEmployeeEdges = Stat("num_employee_edges") - val employeeIds = Config.SampledEmployeeIds - fullGraph - .collect { - case (LeftNode.UserId(userId), rightNodeWithWeight) if employeeIds.contains(userId) => - numEmployeeEdges.inc() - (LeftNode.UserId(userId), rightNodeWithWeight) - } - } - - def getTruncatedGraph( - fullGraph: TypedPipe[(LeftNode, RightNodeWithEdgeWeight)], - topKWithFrequency: TypedPipe[(RightNodeType, Seq[(Noun, Double)])] - )( - implicit uniqueID: UniqueID - ): TypedPipe[(LeftNode, RightNodeWithEdgeWeight)] = { - val numEntriesTruncatedGraph = Stat("num_entries_truncated_graph") - val numTopKTruncatedNouns = Stat("num_topk_truncated_nouns") - - implicit val rightNodeSer: RightNode => Array[Byte] = BinaryScalaCodec(RightNode) - val topNouns: TypedPipe[RightNode] = topKWithFrequency - .flatMap { - case (rightNodeType, nounsList) => - nounsList - .map { - case (nounVal, aggregatedFrequency) => - numTopKTruncatedNouns.inc() - RightNode(rightNodeType, nounVal) - } - } - - fullGraph - .map { - case (leftNode, rightNodeWithWeight) => - (rightNodeWithWeight.rightNode, (leftNode, rightNodeWithWeight)) - } - .sketch(reducers = 5000) - .join(topNouns.asKeys.toTypedPipe) - .map { - case (rightNode, ((left, rightNodeWithWeight), _)) => - numEntriesTruncatedGraph.inc() - (left, rightNodeWithWeight) - } - } - - def getTopKRightNounsWithFrequencies( - fullGraph: TypedPipe[(LeftNode, RightNodeWithEdgeWeight)], - topKConfig: Map[RightNodeType, Int], - minFrequency: Int - )( - implicit uniqueID: UniqueID - ): TypedPipe[(RightNodeType, Seq[(Noun, Double)])] = { - val maxAcrossRightNounType: Int = topKConfig.valuesIterator.max - fullGraph - .map { - case (leftNode, rightNodeWithWeight) => - (rightNodeWithWeight.rightNode, 1.0) - } - .sumByKey - // .withReducers(20000) - .toTypedPipe - .filter(_._2 >= minFrequency) - .map { - case (rightNode, freq) => - (rightNode.rightNodeType, (rightNode.noun, freq)) - } - .group(rightNodeTypeOrdering) - // Note: if maxAcrossRightNounType is >15M, it might result in OOM on reducer - .sortedReverseTake(maxAcrossRightNounType)(Ordering.by(_._2)) - // An alternative to using group followed by sortedReverseTake is to define TopKMonoids, - // one for each RightNodeType to get the most frequent rightNouns - .map { - case (rightNodeType, nounsListWithFreq) => - val truncatedList = nounsListWithFreq - .sortBy(-_._2) - .take(topKConfig.getOrElse(rightNodeType, NumTopNounsForUnknownRightNodeType)) - (rightNodeType, truncatedList) - } - } - - def getValidUsers( - userSource: TypedPipe[CombinedUser] - )( - implicit uniqueID: UniqueID - ): TypedPipe[UserId] = { - val numValidUsers = Stat("num_valid_users") - userSource - .flatMap { u => - for { - user <- u.user - if user.id != 0 - safety <- user.safety - if !(safety.suspended || safety.deactivated) - } yield { - numValidUsers.inc() - user.id - } - } - } - - def getFullGraph( - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): TypedPipe[(LeftNode, RightNodeWithEdgeWeight)] = { - - // list of valid UserIds - to filter out deactivated or suspended user accounts - val userSource: TypedPipe[CombinedUser] = - DAL - .readMostRecentSnapshotNoOlderThan(UsersourceScalaDataset, Days(7)).toTypedPipe - val validUsers: TypedPipe[UserId] = getValidUsers(userSource).forceToDisk - - //Dataset read operations - - // ieSource tweet engagements data for tweet favs, replies, retweets - 
from last 14 days - val tweetSource: TypedPipe[InteractionEvent] = - ExternalDataSources.ieSourceTweetEngagementsSource(dateRange = - DateRange(dateRange.end - Days(14), dateRange.end)) - - // user-user fav edges - val userUserFavEdges: TypedPipe[(UserId, UserId, Double)] = - ExternalDataSources.getFavEdges(HalfLifeInDaysForFavScore) - - // user-user follow edges - val userUserFollowEdges: TypedPipe[(UserId, UserId)] = - filterInvalidUsers(ExternalDataSources.flockFollowsSource, validUsers) - - // user-user block edges - val userUserBlockEdges: TypedPipe[(UserId, UserId)] = - filterInvalidUsers(ExternalDataSources.flockBlocksSource, validUsers) - - // user-user abuse report edges - val userUserAbuseReportEdges: TypedPipe[(UserId, UserId)] = - filterInvalidUsers(ExternalDataSources.flockReportAsAbuseSource, validUsers) - - // user-user spam report edges - val userUserSpamReportEdges: TypedPipe[(UserId, UserId)] = - filterInvalidUsers(ExternalDataSources.flockReportAsSpamSource, validUsers) - - // user-signup country edges - val userSignUpCountryEdges: TypedPipe[(UserId, (Country, Language))] = - ExternalDataSources.userSource - - // user-consumed language edges - val userConsumedLanguageEdges: TypedPipe[(UserId, Seq[(Language, Double)])] = - ExternalDataSources.inferredUserConsumedLanguageSource - - // user-topic follow edges - val topicUserFollowedByEdges: TypedPipe[(TopicId, UserId)] = - ExternalDataSources.topicFollowGraphSource - - // user-MRNotifOpenOrClick events from last 7 days - val userMRNotifOpenOrClickEvents: TypedPipe[MagicRecsNotificationLite] = - ExternalDataSources.magicRecsNotficationOpenOrClickEventsSource(dateRange = - DateRange(dateRange.end - Days(7), dateRange.end)) - - // user-searchQuery strings from last 7 days - val userSearchQueryEdges: TypedPipe[(UserId, String)] = - ExternalDataSources.adaptiveSearchScribeLogsSource(dateRange = - DateRange(dateRange.end - Days(7), dateRange.end)) - - getUserTweetInteractionGraph(tweetSource) ++ - getUserFavGraph(userUserFavEdges) ++ - getUserFollowGraph(userUserFollowEdges) ++ - getUserBlockGraph(userUserBlockEdges) ++ - getUserAbuseReportGraph(userUserAbuseReportEdges) ++ - getUserSpamReportGraph(userUserSpamReportEdges) ++ - getUserSignUpCountryGraph(userSignUpCountryEdges) ++ - getUserConsumedLanguagesGraph(userConsumedLanguageEdges) ++ - getUserTopicFollowGraph(topicUserFollowedByEdges) ++ - getMagicRecsNotifOpenOrClickTweetsGraph(userMRNotifOpenOrClickEvents) ++ - getSearchGraph(userSearchQueryEdges) - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/multi_type_graph/assemble_multi_type_graph/AssembleMultiTypeGraphApp.docx b/src/scala/com/twitter/simclusters_v2/scalding/multi_type_graph/assemble_multi_type_graph/AssembleMultiTypeGraphApp.docx new file mode 100644 index 000000000..a5995a2bf Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/multi_type_graph/assemble_multi_type_graph/AssembleMultiTypeGraphApp.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/multi_type_graph/assemble_multi_type_graph/AssembleMultiTypeGraphApp.scala b/src/scala/com/twitter/simclusters_v2/scalding/multi_type_graph/assemble_multi_type_graph/AssembleMultiTypeGraphApp.scala deleted file mode 100644 index c341113fb..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/multi_type_graph/assemble_multi_type_graph/AssembleMultiTypeGraphApp.scala +++ /dev/null @@ -1,74 +0,0 @@ -package com.twitter.simclusters_v2.scalding -package multi_type_graph.assemble_multi_type_graph - -import 
com.twitter.dal.client.dataset.KeyValDALDataset -import com.twitter.dal.client.dataset.SnapshotDALDataset -import com.twitter.scalding.Days -import com.twitter.scalding.Duration -import com.twitter.scalding.RichDate -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.thriftscala.LeftNode -import com.twitter.simclusters_v2.thriftscala.RightNodeTypeStruct -import com.twitter.simclusters_v2.thriftscala.RightNodeWithEdgeWeightList -import com.twitter.simclusters_v2.thriftscala.NounWithFrequencyList -import com.twitter.simclusters_v2.thriftscala.MultiTypeGraphEdge -import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp -import com.twitter.wtf.scalding.jobs.common.ScheduledExecutionApp -import com.twitter.simclusters_v2.hdfs_sources._ - -/** -./bazel bundle src/scala/com/twitter/simclusters_v2/scalding/multi_type_graph/assemble_multi_type_graph:multi_type_graph-adhoc -scalding remote run \ ---user cassowary \ ---keytab /var/lib/tss/keys/fluffy/keytabs/client/cassowary.keytab \ ---principal service_account@TWITTER.BIZ \ ---cluster bluebird-qus1 \ ---main-class com.twitter.simclusters_v2.scalding.multi_type_graph.assemble_multi_type_graph.AssembleMultiTypeGraphAdhocApp \ ---target src/scala/com/twitter/simclusters_v2/scalding/multi_type_graph/assemble_multi_type_graph:multi_type_graph-adhoc \ ---hadoop-properties "mapreduce.reduce.memory.mb=8192 mapreduce.map.memory.mb=8192 mapreduce.map.java.opts='-Xmx7618M' mapreduce.reduce.java.opts='-Xmx7618M' mapreduce.task.timeout=3600000" \ -- --date 2021-07-10 --outputDir /gcs/user/cassowary/adhoc/your_ldap/multi_type/multi_type - -To run using scalding_job target: -scalding remote run --target src/scala/com/twitter/simclusters_v2/scalding/multi_type_graph/assemble_multi_type_graph:multi_type_graph-adhoc - */ - -object AssembleMultiTypeGraphAdhocApp extends AssembleMultiTypeGraphBaseApp with AdhocExecutionApp { - override val isAdhoc: Boolean = true - override val truncatedMultiTypeGraphMHOutputPath: String = "truncated_graph_mh" - override val topKRightNounsMHOutputPath: String = "top_k_right_nouns_mh" - override val fullMultiTypeGraphThriftOutputPath: String = "full_graph_thrift" - override val truncatedMultiTypeGraphKeyValDataset: KeyValDALDataset[ - KeyVal[LeftNode, RightNodeWithEdgeWeightList] - ] = TruncatedMultiTypeGraphAdhocScalaDataset - override val topKRightNounsKeyValDataset: KeyValDALDataset[ - KeyVal[RightNodeTypeStruct, NounWithFrequencyList] - ] = TopKRightNounsAdhocScalaDataset - override val fullMultiTypeGraphSnapshotDataset: SnapshotDALDataset[MultiTypeGraphEdge] = - FullMultiTypeGraphAdhocScalaDataset -} - -/** -To deploy the job: - -capesospy-v2 update --build_locally \ - --start_cron assemble_multi_type_graph \ - src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml - */ -object AssembleMultiTypeGraphBatchApp - extends AssembleMultiTypeGraphBaseApp - with ScheduledExecutionApp { - override val isAdhoc: Boolean = false - override val truncatedMultiTypeGraphMHOutputPath: String = "truncated_graph_mh" - override val topKRightNounsMHOutputPath: String = "top_k_right_nouns_mh" - override val fullMultiTypeGraphThriftOutputPath: String = "full_graph_thrift" - override val truncatedMultiTypeGraphKeyValDataset: KeyValDALDataset[ - KeyVal[LeftNode, RightNodeWithEdgeWeightList] - ] = TruncatedMultiTypeGraphScalaDataset - override val topKRightNounsKeyValDataset: KeyValDALDataset[ - KeyVal[RightNodeTypeStruct, NounWithFrequencyList] - ] = TopKRightNounsScalaDataset
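// Unlike the adhoc app above, the batch app binds to the production DAL datasets, so
// its outputs land under Config.RootPath / Config.RootThriftPath rather than the
// adhoc GCS prefix (Config.AdhocRootPrefix).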
override val fullMultiTypeGraphSnapshotDataset: SnapshotDALDataset[MultiTypeGraphEdge] = - FullMultiTypeGraphScalaDataset - override val firstTime: RichDate = RichDate("2021-08-21") - override val batchIncrement: Duration = Days(7) -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/multi_type_graph/assemble_multi_type_graph/AssembleMultiTypeGraphBaseApp.docx b/src/scala/com/twitter/simclusters_v2/scalding/multi_type_graph/assemble_multi_type_graph/AssembleMultiTypeGraphBaseApp.docx new file mode 100644 index 000000000..06ff67e78 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/multi_type_graph/assemble_multi_type_graph/AssembleMultiTypeGraphBaseApp.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/multi_type_graph/assemble_multi_type_graph/AssembleMultiTypeGraphBaseApp.scala b/src/scala/com/twitter/simclusters_v2/scalding/multi_type_graph/assemble_multi_type_graph/AssembleMultiTypeGraphBaseApp.scala deleted file mode 100644 index 4f645e522..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/multi_type_graph/assemble_multi_type_graph/AssembleMultiTypeGraphBaseApp.scala +++ /dev/null @@ -1,185 +0,0 @@ -package com.twitter.simclusters_v2.scalding -package multi_type_graph.assemble_multi_type_graph - -import com.twitter.dal.client.dataset.{KeyValDALDataset, SnapshotDALDataset} -import com.twitter.scalding.{Execution, _} -import com.twitter.scalding_internal.dalv2.DALWrite.{D, _} -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.scalding.common.TypedRichPipe.typedPipeToRichPipe -import com.twitter.simclusters_v2.scalding.common.Util -import com.twitter.simclusters_v2.thriftscala.{ - LeftNode, - Noun, - NounWithFrequency, - NounWithFrequencyList, - RightNodeType, - RightNodeTypeStruct, - RightNodeWithEdgeWeight, - RightNodeWithEdgeWeightList, - MultiTypeGraphEdge -} -import com.twitter.wtf.scalding.jobs.common.DateRangeExecutionApp -import java.util.TimeZone - -/** - * In this file, we assemble the multi_type_graph user-entity engagement signals - * - * It works as follows and the following datasets are produced as a result: - * - * 1. FullGraph (fullMultiTypeGraphSnapshotDataset) : reads datasets from multiple sources and generates - * a bipartite graph with LeftNode -> RightNode edges, capturing a user's engagement with varied entity types - * - * 2. TruncatedGraph (truncatedMultiTypeGraphKeyValDataset): a truncated version of the FullGraph - * where we only store the topK most frequently occurring RightNodes in the bipartite graph LeftNode -> RightNode - * - * 3. 
TopKNouns (topKRightNounsKeyValDataset): this stores the topK most frequent Nouns for each engagement type - * Please note that this dataset is currently only being used for the debugger to find which nodes we consider as the - * most frequently occurring, in FullGraph - */ - -trait AssembleMultiTypeGraphBaseApp extends DateRangeExecutionApp { - val truncatedMultiTypeGraphKeyValDataset: KeyValDALDataset[ - KeyVal[LeftNode, RightNodeWithEdgeWeightList] - ] - val topKRightNounsKeyValDataset: KeyValDALDataset[ - KeyVal[RightNodeTypeStruct, NounWithFrequencyList] - ] - val fullMultiTypeGraphSnapshotDataset: SnapshotDALDataset[MultiTypeGraphEdge] - val isAdhoc: Boolean - val truncatedMultiTypeGraphMHOutputPath: String - val topKRightNounsMHOutputPath: String - val fullMultiTypeGraphThriftOutputPath: String - - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - import Config._ - import AssembleMultiTypeGraph._ - - val numKeysInTruncatedGraph = Stat("num_keys_truncated_mts") - val numKeysInTopKNounsGraph = Stat("num_keys_topk_nouns_mts") - - val fullGraph: TypedPipe[(LeftNode, RightNodeWithEdgeWeight)] = - getFullGraph().count("num_entries_full_graph") - - val topKRightNodes: TypedPipe[(RightNodeType, Seq[(Noun, Double)])] = - getTopKRightNounsWithFrequencies( - fullGraph, - TopKConfig, - GlobalDefaultMinFrequencyOfRightNodeType) - - val truncatedGraph: TypedPipe[(LeftNode, RightNodeWithEdgeWeight)] = - getTruncatedGraph(fullGraph, topKRightNodes).count("num_entries_truncated_graph") - - // key transformations - truncated graph, keyed by LeftNode - val truncatedGraphKeyedBySrc: TypedPipe[(LeftNode, RightNodeWithEdgeWeightList)] = - truncatedGraph - .map { - case (LeftNode.UserId(userId), rightNodeWithWeight) => - userId -> List(rightNodeWithWeight) - } - .sumByKey - .map { - case (userId, rightNodeWithWeightList) => - (LeftNode.UserId(userId), RightNodeWithEdgeWeightList(rightNodeWithWeightList)) - } - - // key transformation - topK nouns, keyed by the RightNodeNounType - val topKNounsKeyedByType: TypedPipe[(RightNodeTypeStruct, NounWithFrequencyList)] = - topKRightNodes - .map { - case (rightNodeType, rightNounsWithScoresList) => - val nounsListWithFrequency: Seq[NounWithFrequency] = rightNounsWithScoresList - .map { - case (noun, aggregatedFrequency) => - NounWithFrequency(noun, aggregatedFrequency) - } - (RightNodeTypeStruct(rightNodeType), NounWithFrequencyList(nounsListWithFrequency)) - } - - //WriteExecs - truncated graph - val truncatedGraphTsvExec: Execution[Unit] = - truncatedGraphKeyedBySrc.writeExecution( - TypedTsv[(LeftNode, RightNodeWithEdgeWeightList)](AdhocRootPrefix + "truncated_graph_tsv")) - - val truncatedGraphDALExec: Execution[Unit] = truncatedGraphKeyedBySrc - .map { - case (leftNode, rightNodeWithWeightList) => - numKeysInTruncatedGraph.inc() - KeyVal(leftNode, rightNodeWithWeightList) - } - .writeDALVersionedKeyValExecution( - truncatedMultiTypeGraphKeyValDataset, - D.Suffix( - (if (!isAdhoc) - RootPath - else - AdhocRootPrefix) - + truncatedMultiTypeGraphMHOutputPath), - ExplicitEndTime(dateRange.`end`) - ) - - //WriteExec - topK rightnouns - val topKNounsTsvExec: Execution[Unit] = - topKNounsKeyedByType.writeExecution( - TypedTsv[(RightNodeTypeStruct, NounWithFrequencyList)]( - AdhocRootPrefix + "top_k_right_nouns_tsv")) - - // writing topKNouns MH dataset for debugger - val topKNounsDALExec: Execution[Unit] = topKNounsKeyedByType - .map { - case (engagementType, 
rightList) => - val rightListMH = - NounWithFrequencyList(rightList.nounWithFrequencyList.take(TopKRightNounsForMHDump)) - numKeysInTopKNounsGraph.inc() - KeyVal(engagementType, rightListMH) - } - .writeDALVersionedKeyValExecution( - topKRightNounsKeyValDataset, - D.Suffix( - (if (!isAdhoc) - RootPath - else - AdhocRootPrefix) - + topKRightNounsMHOutputPath), - ExplicitEndTime(dateRange.`end`) - ) - - //WriteExec - fullGraph - val fullGraphDALExec: Execution[Unit] = fullGraph - .map { - case (leftNode, rightNodeWithWeight) => - MultiTypeGraphEdge(leftNode, rightNodeWithWeight) - }.writeDALSnapshotExecution( - fullMultiTypeGraphSnapshotDataset, - D.Daily, - D.Suffix( - (if (!isAdhoc) - RootThriftPath - else - AdhocRootPrefix) - + fullMultiTypeGraphThriftOutputPath), - D.Parquet, - dateRange.`end` - ) - - if (isAdhoc) { - Util.printCounters( - Execution - .zip( - truncatedGraphTsvExec, - topKNounsTsvExec, - truncatedGraphDALExec, - topKNounsDALExec, - fullGraphDALExec).unit) - } else { - Util.printCounters( - Execution.zip(truncatedGraphDALExec, topKNounsDALExec, fullGraphDALExec).unit) - } - - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/multi_type_graph/assemble_multi_type_graph/BUILD b/src/scala/com/twitter/simclusters_v2/scalding/multi_type_graph/assemble_multi_type_graph/BUILD deleted file mode 100644 index 5afed4a7a..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/multi_type_graph/assemble_multi_type_graph/BUILD +++ /dev/null @@ -1,91 +0,0 @@ -scala_library( - sources = [ - "*.scala", - ], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":full_multi_type_graph_adhoc-scala", - ":top_k_right_nouns_adhoc-scala", - ":truncated_multi_type_graph_adhoc-scala", - "3rdparty/src/jvm/com/twitter/scalding:commons", - "3rdparty/src/jvm/com/twitter/scalding:core", - "src/scala/com/twitter/scalding_internal/job", - "src/scala/com/twitter/simclusters_v2/hdfs_sources", - "src/scala/com/twitter/simclusters_v2/scalding", - "src/thrift/com/twitter/twadoop/user/gen:gen-scala", - "usersource/snapshot/src/main/scala/com/twitter/usersource/snapshot/flat:usersource_flat-scala", - "usersource/snapshot/src/main/thrift/com/twitter/usersource/snapshot/flat:flat-scala", - ], -) - -scalding_job( - name = "multi_type_graph-adhoc", - main = "com.twitter.simclusters_v2.scalding.multi_type_graph.assemble_multi_type_graph.AssembleMultiTypeGraphAdhocApp", - config = [ - ("hadoop.map.jvm.total-memory", "8192m"), - ("hadoop.reduce.jvm.total-memory", "8192m"), - ("hadoop.submitter.jvm.total-memory", "8192m"), - ("hadoop.am.jvm.total-memory", "8192m"), - ( - "job.args", - [ - "--date 2021-07-14", - ], - ), - ], - hadoop_cluster = "qus1-bluebird", - hadoop_properties = [("mapreduce.task.timeout", "3600000")], - platform = "java8", - role = "cassowary", - runtime_platform = "java8", - tags = [ - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [":assemble_multi_type_graph"], -) - -create_datasets( - base_name = "truncated_multi_type_graph_adhoc", - key_type = "com.twitter.simclusters_v2.thriftscala.LeftNode", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.MultiTypeGraphInjections.truncatedMultiTypeGraphInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.RightNodeWithEdgeWeightList", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = 
"top_k_right_nouns_adhoc", - key_type = "com.twitter.simclusters_v2.thriftscala.RightNodeTypeStruct", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.MultiTypeGraphInjections.topKRightNounListInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.NounWithFrequencyList", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "full_multi_type_graph_adhoc", - java_schema = "com.twitter.simclusters_v2.thriftjava.MultiTypeGraphEdge", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.MultiTypeGraphEdge", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) diff --git a/src/scala/com/twitter/simclusters_v2/scalding/multi_type_graph/assemble_multi_type_graph/BUILD.docx b/src/scala/com/twitter/simclusters_v2/scalding/multi_type_graph/assemble_multi_type_graph/BUILD.docx new file mode 100644 index 000000000..a16585d6c Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/multi_type_graph/assemble_multi_type_graph/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/multi_type_graph/assemble_multi_type_graph/Config.docx b/src/scala/com/twitter/simclusters_v2/scalding/multi_type_graph/assemble_multi_type_graph/Config.docx new file mode 100644 index 000000000..cfac1c547 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/multi_type_graph/assemble_multi_type_graph/Config.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/multi_type_graph/assemble_multi_type_graph/Config.scala b/src/scala/com/twitter/simclusters_v2/scalding/multi_type_graph/assemble_multi_type_graph/Config.scala deleted file mode 100644 index c423262a5..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/multi_type_graph/assemble_multi_type_graph/Config.scala +++ /dev/null @@ -1,35 +0,0 @@ -package com.twitter.simclusters_v2.scalding -package multi_type_graph.assemble_multi_type_graph - -import com.twitter.simclusters_v2.thriftscala.RightNodeType - -object Config { - - val User = System.getenv("USER") - val RootPath: String = s"/user/$User/manhattan_sequence_files/multi_type_simclusters/" - val RootThriftPath: String = s"/user/$User/processed/multi_type_simclusters/" - val AdhocRootPrefix = s"/gcs/user/$User/adhoc/multi_type_simclusters/" - val HalfLifeInDaysForFavScore = 100 - val NumTopNounsForUnknownRightNodeType = 20 - val GlobalDefaultMinFrequencyOfRightNodeType = 100 - val TopKRightNounsForMHDump = 1000 - - // the topK most frequent nouns for each engagement type - val TopKConfig: Map[RightNodeType, Int] = Map( - RightNodeType.FollowUser -> 10000000, // 10M, current simclusters_v2 has this value set to 20M, providing this the most weight - RightNodeType.FavUser -> 5000000, - RightNodeType.BlockUser -> 1000000, - RightNodeType.AbuseReportUser -> 1000000, - RightNodeType.SpamReportUser -> 1000000, - RightNodeType.FollowTopic -> 5000, - RightNodeType.SignUpCountry -> 200, - RightNodeType.ConsumedLanguage -> 50, - RightNodeType.FavTweet -> 500000, - RightNodeType.ReplyTweet -> 500000, - RightNodeType.RetweetTweet -> 500000, - RightNodeType.NotifOpenOrClickTweet -> 500000, - 
RightNodeType.SearchQuery -> 500000 - ) - val SampledEmployeeIds: Set[Long] = - Set() -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/offline_job/BUILD.bazel b/src/scala/com/twitter/simclusters_v2/scalding/offline_job/BUILD.bazel deleted file mode 100644 index 95ccf5027..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/offline_job/BUILD.bazel +++ /dev/null @@ -1,126 +0,0 @@ -scala_library( - platform = "java8", - tags = [ - "bazel-compatible", - "bazel-only", - ], - dependencies = [ - "3rdparty/jvm/com/twitter/storehaus:algebra", - "3rdparty/jvm/com/twitter/storehaus:core", - "3rdparty/src/jvm/com/twitter/storehaus:algebra", - "3rdparty/src/jvm/com/twitter/storehaus:core", - "graphstore/common:flock_follows-java", - "snowflake:id", - "src/java/com/twitter/ml/api/constant", - "src/java/com/twitter/sbf/graph", - "src/scala/com/twitter/ml/api:api-base", - "src/scala/com/twitter/pluck/source/core_workflows/user_model:condensed_user_state-scala", - "src/scala/com/twitter/scalding_internal/dalv2", - "src/scala/com/twitter/scalding_internal/job", - "src/scala/com/twitter/scalding_internal/job/analytics_batch", - "src/scala/com/twitter/scalding_internal/source", - "src/scala/com/twitter/scalding_internal/source/lzo_scrooge", - "src/scala/com/twitter/simclusters_v2/candidate_source", - "src/scala/com/twitter/simclusters_v2/hdfs_sources", - "src/scala/com/twitter/simclusters_v2/scalding", - "src/scala/com/twitter/simclusters_v2/scalding/common", - "src/scala/com/twitter/simclusters_v2/summingbird/common", - "src/scala/com/twitter/timelines/prediction/features/common", - "src/scala/com/twitter/timelines/prediction/features/itl", - "src/scala/com/twitter/timelines/prediction/features/recap", - "src/scala/com/twitter/wtf/scalding/jobs/common:execution_app", - "src/thrift/com/twitter/hermit/candidate:hermit-candidate-scala", - "src/thrift/com/twitter/wtf/scalding/sims:sims-thrift-scala", - "twadoop_config/configuration/log_categories/group/timeline:timeline_service_favorites-scala", - "usersource/snapshot/src/main/scala/com/twitter/usersource/snapshot/flat:usersource_flat-scala", - "usersource/snapshot/src/main/thrift/com/twitter/usersource/snapshot/flat:flat-scala", - ], -) - -hadoop_binary( - name = "simclusters_offline_job-adhoc", - main = "com.twitter.simclusters_v2.scalding.offline_job.SimClustersOfflineJobAdhocApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":offline_job", - ], -) - -hadoop_binary( - name = "simclusters_offline_job", - main = "com.twitter.simclusters_v2.scalding.offline_job.SimClustersOfflineJobScheduledApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":offline_job", - ], -) - -hadoop_binary( - name = "simclusters_offline_job-repl", - main = "com.twitter.scalding_internal.repl.TwitterScaldingShell", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":offline_job", - "science/scalding/scripts:scalding-repl-deps", - ], -) - -hadoop_binary( - name = "dump_cluster_topk_job-adhoc", - main = "com.twitter.simclusters_v2.scalding.offline_job.DumpClusterTopKTweetsAdhoc", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - 
":offline_job", - ], -) - -# Generated with `capesospy-v2 create_target offline_tweet_job src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml`, config hash bb0831. -scalding_job( - name = "offline_tweet_job", - main = "com.twitter.simclusters_v2.scalding.offline_job.SimClustersOfflineJobScheduledApp", - config = [ - ("hadoop.combine-input", "true"), - ("hadoop.map.jvm.total-memory", "3072m"), - ("hadoop.queue", "cassowary.default"), - ("hadoop.reduce.jvm.total-memory", "3072m"), - ("hadoop.submitter.jvm.total-memory", "5120m"), - ("submitter.tier", "preemptible"), - ], - contact = "no-reply@twitter.com", - cron = "14 * * * *", - hadoop_cluster = "atla-proc3", - platform = "java8", - role = "cassowary", - runtime_platform = "java8", - tags = [ - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":offline_job", - ], -) diff --git a/src/scala/com/twitter/simclusters_v2/scalding/offline_job/BUILD.docx b/src/scala/com/twitter/simclusters_v2/scalding/offline_job/BUILD.docx new file mode 100644 index 000000000..9731677ea Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/offline_job/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/offline_job/OfflineTweetRecommendation.docx b/src/scala/com/twitter/simclusters_v2/scalding/offline_job/OfflineTweetRecommendation.docx new file mode 100644 index 000000000..196f6c48b Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/offline_job/OfflineTweetRecommendation.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/offline_job/OfflineTweetRecommendation.scala b/src/scala/com/twitter/simclusters_v2/scalding/offline_job/OfflineTweetRecommendation.scala deleted file mode 100644 index a34c2e972..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/offline_job/OfflineTweetRecommendation.scala +++ /dev/null @@ -1,176 +0,0 @@ -package com.twitter.simclusters_v2.scalding.offline_job - -import com.twitter.algebird.Aggregator.size -import com.twitter.algebird.{Aggregator, QTreeAggregatorLowerBound} -import com.twitter.scalding.{Execution, Stat, TypedPipe, UniqueID} -import com.twitter.simclusters_v2.candidate_source._ -import com.twitter.simclusters_v2.common.TweetId -import com.twitter.simclusters_v2.thriftscala.{ - ClusterTopKTweetsWithScores, - ClustersUserIsInterestedIn -} -import java.nio.ByteBuffer - -case class OfflineRecConfig( - maxTweetRecs: Int, // total number of tweet recs. - maxTweetsPerUser: Int, - maxClustersToQuery: Int, - minTweetScoreThreshold: Double, - rankClustersBy: ClusterRanker.Value) - -/** - * An offline simulation of the tweet rec logic in [[InterestedInTweetCandidateStore]]. - * The main difference is that instead of using Memcache, it uses an offline clusterTopK store as - * the tweet source. - * Also, instead of taking a single userId as input, it processes a pipe of users altogether. 
- */ -object OfflineTweetRecommendation { - - case class ScoredTweet(tweetId: TweetId, score: Double) { - - def toTuple: (TweetId, Double) = { - (tweetId, score) - } - } - - object ScoredTweet { - def apply(tuple: (TweetId, Double)): ScoredTweet = new ScoredTweet(tuple._1, tuple._2) - implicit val scoredOrdering: Ordering[ScoredTweet] = (x: ScoredTweet, y: ScoredTweet) => { - Ordering.Double.compare(x.score, y.score) - } - } - - def getTopTweets( - config: OfflineRecConfig, - targetUsersPipe: TypedPipe[Long], - userIsInterestedInPipe: TypedPipe[(Long, ClustersUserIsInterestedIn)], - clusterTopKTweetsPipe: TypedPipe[ClusterTopKTweetsWithScores] - )( - implicit uniqueID: UniqueID - ): Execution[TypedPipe[(Long, Seq[ScoredTweet])]] = { - val tweetRecommendedCount = Stat("NumTweetsRecomended") - val targetUserCount = Stat("NumTargetUsers") - val userWithRecsCount = Stat("NumUsersWithAtLeastTweetRec") - - // For every user, read the user's interested-in clusters and cluster's weights - val userClusterWeightPipe: TypedPipe[(Int, (Long, Double))] = - targetUsersPipe.asKeys - .join(userIsInterestedInPipe) - .flatMap { - case (userId, (_, clustersWithScores)) => - targetUserCount.inc() - val topClusters = ClusterRanker - .getTopKClustersByScore( - clustersWithScores.clusterIdToScores.toMap, - ClusterRanker.RankByNormalizedFavScore, - config.maxClustersToQuery - ).toList - topClusters.map { - case (clusterId, clusterWeightForUser) => - (clusterId, (userId, clusterWeightForUser)) - } - } - - // For every cluster, read the top tweets in the cluster, and their weights - val clusterTweetWeightPipe: TypedPipe[(Int, List[(Long, Double)])] = - clusterTopKTweetsPipe - .flatMap { cluster => - val tweets = - cluster.topKTweets.toList // Convert to a List, otherwise .flatMap dedups by clusterIds - .flatMap { - case (tid, persistedScores) => - val tweetWeight = persistedScores.score.map(_.value).getOrElse(0.0) - if (tweetWeight > 0) { - Some((tid, tweetWeight)) - } else { - None - } - } - if (tweets.nonEmpty) { - Some((cluster.clusterId, tweets)) - } else { - None - } - } - - // Collect all the tweets from clusters user is interested in - val recommendedTweetsPipe = userClusterWeightPipe - .sketch(4000)(cid => ByteBuffer.allocate(4).putInt(cid).array(), Ordering.Int) - .join(clusterTweetWeightPipe) - .flatMap { - case (_, ((userId, clusterWeight), tweetsPerCluster)) => - tweetsPerCluster.map { - case (tid, tweetWeight) => - val contribution = clusterWeight * tweetWeight - ((userId, tid), contribution) - } - } - .sumByKey - .withReducers(5000) - - // Filter by minimum score threshold - val scoreFilteredTweetsPipe = recommendedTweetsPipe - .collect { - case ((userId, tid), score) if score >= config.minTweetScoreThreshold => - (userId, ScoredTweet(tid, score)) - } - - // Rank top tweets for each user - val topTweetsPerUserPipe = scoreFilteredTweetsPipe.group - .sortedReverseTake(config.maxTweetsPerUser)(ScoredTweet.scoredOrdering) - .flatMap { - case (userId, tweets) => - userWithRecsCount.inc() - tweetRecommendedCount.incBy(tweets.size) - - tweets.map { t => (userId, t) } - } - .forceToDiskExecution - - val topTweetsPipe = topTweetsPerUserPipe - .flatMap { tweets => - approximateScoreAtTopK(tweets.map(_._2.score), config.maxTweetRecs).map { threshold => - tweets - .collect { - case (userId, tweet) if tweet.score >= threshold => - (userId, List(tweet)) - } - .sumByKey - .toTypedPipe - } - } - topTweetsPipe - } - - /** - * Returns the approximate score at the k'th top ranked record using sampling. 
- * This score can then be used to filter for the top K elements in a big pipe where - * K is too big to fit in memory. - * - */ - def approximateScoreAtTopK(pipe: TypedPipe[Double], topK: Int): Execution[Double] = { - val defaultScore = 0.0 - println("approximateScoreAtTopK: topK=" + topK) - pipe - .aggregate(size) - .getOrElseExecution(0L) - .flatMap { len => - println("approximateScoreAtTopK: len=" + len) - val topKPercentile = if (len == 0 || topK > len) 0 else 1 - topK.toDouble / len.toDouble - val randomSample = Aggregator.reservoirSample[Double](Math.max(100000, topK / 100)) - pipe - .aggregate(randomSample) - .getOrElseExecution(List.empty) - .flatMap { sample => - TypedPipe - .from(sample) - .aggregate(QTreeAggregatorLowerBound[Double](topKPercentile)) - .getOrElseExecution(defaultScore) - } - } - .map { score => - println("approximateScoreAtTopK: topK percentile score=" + score) - score - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/offline_job/SimClustersOfflineJob.docx b/src/scala/com/twitter/simclusters_v2/scalding/offline_job/SimClustersOfflineJob.docx new file mode 100644 index 000000000..d35663ef3 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/offline_job/SimClustersOfflineJob.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/offline_job/SimClustersOfflineJob.scala b/src/scala/com/twitter/simclusters_v2/scalding/offline_job/SimClustersOfflineJob.scala deleted file mode 100644 index 66b458b2a..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/offline_job/SimClustersOfflineJob.scala +++ /dev/null @@ -1,176 +0,0 @@ -package com.twitter.simclusters_v2.scalding.offline_job - -import com.twitter.scalding._ -import com.twitter.simclusters_v2.common._ -import com.twitter.simclusters_v2.summingbird.common.{Configs, SimClustersInterestedInUtil} -import com.twitter.simclusters_v2.thriftscala._ -import java.util.TimeZone - -object SimClustersOfflineJob { - import SimClustersOfflineJobUtil._ - import com.twitter.simclusters_v2.scalding.common.TypedRichPipe._ - - val modelVersionMap: Map[String, PersistedModelVersion] = Map( - ModelVersions.Model20M145KDec11 -> PersistedModelVersion.Model20m145kDec11, - ModelVersions.Model20M145KUpdated -> PersistedModelVersion.Model20m145kUpdated - ) - - /** - * Get a list of tweets that received at least one fav in the last tweetTtl Duration - */ - def getSubsetOfValidTweets(tweetTtl: Duration)(implicit dateRange: DateRange): TypedPipe[Long] = { - readTimelineFavoriteData(DateRange(dateRange.end - tweetTtl, dateRange.end)).map(_._2).distinct - } - - /** - * Note that this job will write several types of scores into the same data set. Please use filter - * to take the score types you need. 
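- *
- * For example, a hedged sketch of keeping only the normalized log-fav scores (here
- * `allScores` is a placeholder name for the pipe this method returns):
- * {{{
- * val logFavScores = allScores.filter(
- *   _.scoreType.contains(PersistedScoreType.NormalizedLogFav8HrHalfLife))
- * }}}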
- */ - def computeAggregatedTweetClusterScores( - dateRange: DateRange, - userInterestsData: TypedPipe[(Long, ClustersUserIsInterestedIn)], - favoriteData: TypedPipe[(UserId, TweetId, Timestamp)], - previousTweetClusterScores: TypedPipe[TweetAndClusterScores] - )( - implicit timeZone: TimeZone, - uniqueID: UniqueID - ): TypedPipe[TweetAndClusterScores] = { - - val latestTimeStamp = dateRange.end.timestamp - - val currentScores: TypedPipe[ - ((Long, Int, PersistedModelVersion, Option[PersistedScoreType]), PersistedScores) - ] = - favoriteData - .map { - case (userId, tweetId, timestamp) => - (userId, (tweetId, timestamp)) - } - .count("NumFavEvents") - .leftJoin(userInterestsData) - .withReducers(600) - .flatMap { - case (_, ((tweetId, timestamp), Some(userInterests))) => - val clustersWithScores = - SimClustersInterestedInUtil.topClustersWithScores(userInterests) - ( - for { - (clusterId, scores) <- clustersWithScores - if scores.favScore >= Configs.favScoreThresholdForUserInterest( - userInterests.knownForModelVersion) - } yield { - // write several types of scores - Seq( - ( - tweetId, - clusterId, - modelVersionMap(userInterests.knownForModelVersion), - Some(PersistedScoreType.NormalizedFav8HrHalfLife)) -> - // let the score decay to latestTimeStamp - persistedScoresMonoid.plus( - persistedScoresMonoid - .build(scores.clusterNormalizedFavScore, timestamp), - persistedScoresMonoid.build(0.0, latestTimeStamp) - ), - ( - tweetId, - clusterId, - modelVersionMap(userInterests.knownForModelVersion), - Some(PersistedScoreType.NormalizedFollow8HrHalfLife)) -> - // let the score decay to latestTimeStamp - persistedScoresMonoid.plus( - persistedScoresMonoid - .build(scores.clusterNormalizedFollowScore, timestamp), - persistedScoresMonoid.build(0.0, latestTimeStamp) - ), - ( - tweetId, - clusterId, - modelVersionMap(userInterests.knownForModelVersion), - Some(PersistedScoreType.NormalizedLogFav8HrHalfLife)) -> - // let the score decay to latestTimeStamp - persistedScoresMonoid.plus( - persistedScoresMonoid - .build(scores.clusterNormalizedLogFavScore, timestamp), - persistedScoresMonoid.build(0.0, latestTimeStamp) - ) - ) - } - ).flatten - case _ => - Nil - } - .count("NumTweetClusterScoreUpdates") - .sumByLocalKeys // there is a .sumByKey later, so just doing a local sum here. 
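-    // Aside (illustration only, not part of the original pipeline): summing in a zero-valued
-    // score stamped at latestTimeStamp is what advances a decayed score to the batch end, e.g.
-    //   persistedScoresMonoid.plus(
-    //     persistedScoresMonoid.build(s, favTimeMs),         // s, favTimeMs: placeholder names
-    //     persistedScoresMonoid.build(0.0, latestTimeStamp)) // == s decayed up to latestTimeStamp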
- - val previousScores: TypedPipe[ - ((Long, Int, PersistedModelVersion, Option[PersistedScoreType]), PersistedScores) - ] = - previousTweetClusterScores.map { v => - (v.tweetId, v.clusterId, v.modelVersion, v.scoreType) -> v.scores - } - - // add current scores and previous scores - (currentScores ++ previousScores).sumByKey - .withReducers(1000) - .map { - case ((tweetId, clusterId, modelVersion, scoreType), scores) => - TweetAndClusterScores(tweetId, clusterId, modelVersion, scores, scoreType) - } - .count("NumAggregatedTweetClusterScores") - } - - def computeTweetTopKClusters( - latestTweetClusterScores: TypedPipe[TweetAndClusterScores], - topK: Int = Configs.topKClustersPerTweet, - scoreThreshold: Double = Configs.scoreThresholdForEntityTopKClustersCache - )( - implicit timeZone: TimeZone, - uniqueID: UniqueID - ): TypedPipe[TweetTopKClustersWithScores] = { - latestTweetClusterScores - .flatMap { v => - val score = v.scores.score.map(_.value).getOrElse(0.0) - if (score < scoreThreshold) { - None - } else { - Some((v.tweetId, v.modelVersion, v.scoreType) -> (v.clusterId, v.scores)) - } - } - .count("NumAggregatedTweetClusterScoresAfterFilteringInTweetTopK") - .group - .sortedReverseTake(topK)(Ordering.by(_._2)) - .map { - case ((tweetId, modelVersion, scoreType), topKClusters) => - TweetTopKClustersWithScores(tweetId, modelVersion, topKClusters.toMap, scoreType) - } - .count("NumTweetTopK") - } - - def computeClusterTopKTweets( - latestTweetClusterScores: TypedPipe[TweetAndClusterScores], - topK: Int = Configs.topKTweetsPerCluster, - scoreThreshold: Double = Configs.scoreThresholdForClusterTopKTweetsCache - )( - implicit timeZone: TimeZone, - uniqueID: UniqueID - ): TypedPipe[ClusterTopKTweetsWithScores] = { - latestTweetClusterScores - .flatMap { v => - val score = v.scores.score.map(_.value).getOrElse(0.0) - if (score < scoreThreshold) { - None - } else { - Some((v.clusterId, v.modelVersion, v.scoreType) -> (v.tweetId, v.scores)) - } - } - .count("NumAggregatedTweetClusterScoresAfterFilteringInClusterTopK") - .group - .sortedReverseTake(topK)(Ordering.by(_._2)) - .map { - case ((clusterId, modelVersion, scoreType), topKTweets) => - ClusterTopKTweetsWithScores(clusterId, modelVersion, topKTweets.toMap, scoreType) - } - .count("NumClusterTopK") - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/offline_job/SimClustersOfflineJobAdhocApp.docx b/src/scala/com/twitter/simclusters_v2/scalding/offline_job/SimClustersOfflineJobAdhocApp.docx new file mode 100644 index 000000000..69ba315bc Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/offline_job/SimClustersOfflineJobAdhocApp.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/offline_job/SimClustersOfflineJobAdhocApp.scala b/src/scala/com/twitter/simclusters_v2/scalding/offline_job/SimClustersOfflineJobAdhocApp.scala deleted file mode 100644 index 32acbe020..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/offline_job/SimClustersOfflineJobAdhocApp.scala +++ /dev/null @@ -1,197 +0,0 @@ -package com.twitter.simclusters_v2.scalding.offline_job - -import com.twitter.scalding._ -import com.twitter.scalding_internal.job.TwitterExecutionApp -import com.twitter.simclusters_v2.common.ModelVersions -import com.twitter.simclusters_v2.hdfs_sources.AdhocKeyValSources -import com.twitter.simclusters_v2.hdfs_sources.ClusterTopKTweetsHourlySuffixSource -import com.twitter.simclusters_v2.hdfs_sources.TweetClusterScoresHourlySuffixSource -import 
com.twitter.simclusters_v2.hdfs_sources.TweetTopKClustersHourlySuffixSource -import com.twitter.simclusters_v2.scalding.common.Util -import com.twitter.simclusters_v2.scalding.offline_job.SimClustersOfflineJob._ -import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn -import java.util.TimeZone - -/** -scalding remote run --target src/scala/com/twitter/simclusters_v2/scalding/offline_job:simclusters_offline_job-adhoc \ ---user cassowary \ ---submitter hadoopnest2.atla.twitter.com \ ---main-class com.twitter.simclusters_v2.scalding.offline_job.SimClustersOfflineJobAdhocApp -- \ ---date 2019-08-10 --batch_hours 24 \ ---output_dir /user/cassowary/your_ldap/offline_simcluster_20190810 ---model_version 20M_145K_updated - */ -object SimClustersOfflineJobAdhocApp extends TwitterExecutionApp { - - import SimClustersOfflineJobUtil._ - import com.twitter.simclusters_v2.scalding.common.TypedRichPipe._ - - override def job: Execution[Unit] = - Execution.withId { implicit uniqueId => - Execution.withArgs { args: Args => - // required - val wholeDateRange: DateRange = DateRange.parse(args.list("date")) - val batchSize: Duration = Hours(args.int("batch_hours")) - - val outputDir = args("output_dir") - - val modelVersion = args.getOrElse("model_version", ModelVersions.Model20M145KUpdated) - - val scoringMethod = args.getOrElse("score", "logFav") - - val tweetClusterScoreOutputPath: String = outputDir + "/tweet_cluster_scores" - - val tweetTopKClustersOutputPath: String = outputDir + "/tweet_top_k_clusters" - - val clusterTopKTweetsOutputPath: String = outputDir + "/cluster_top_k_tweets" - - val fullInterestedInData: TypedPipe[(Long, ClustersUserIsInterestedIn)] = - args.optional("interested_in_path") match { - case Some(dir) => - println("Loading InterestedIn from supplied path " + dir) - TypedPipe.from(AdhocKeyValSources.interestedInSource(dir)) - case None => - println("Loading production InterestedIn data") - readInterestedInScalaDataset(wholeDateRange) - } - - val interestedInData: TypedPipe[(Long, ClustersUserIsInterestedIn)] = - fullInterestedInData.filter(_._2.knownForModelVersion == modelVersion) - - val debugExec = Execution.zip( - fullInterestedInData.printSummary("fullInterestedIn", numRecords = 20), - interestedInData.printSummary("interestedIn", numRecords = 20) - ) - - // recursive function to calculate batches one by one - def runBatch(batchDateRange: DateRange): Execution[Unit] = { - if (batchDateRange.start.timestamp > wholeDateRange.end.timestamp) { - Execution.unit // stops here - } else { - - val previousScores = if (batchDateRange.start == wholeDateRange.start) { - TypedPipe.from(Nil) - } else { - TypedPipe.from( - TweetClusterScoresHourlySuffixSource( - tweetClusterScoreOutputPath, - batchDateRange - batchSize - ) - ) - } - - val latestScores = computeAggregatedTweetClusterScores( - batchDateRange, - interestedInData, - readTimelineFavoriteData(batchDateRange), - previousScores - ) - - val writeLatestScoresExecution = { - Execution.zip( - latestScores.printSummary(name = "TweetEntityScores"), - latestScores - .writeExecution( - TweetClusterScoresHourlySuffixSource( - tweetClusterScoreOutputPath, - batchDateRange - ) - ) - ) - } - - val computeTweetTopKExecution = { - val tweetTopK = computeTweetTopKClusters(latestScores) - Execution.zip( - tweetTopK.printSummary(name = "TweetTopK"), - tweetTopK.writeExecution( - TweetTopKClustersHourlySuffixSource(tweetTopKClustersOutputPath, batchDateRange) - ) - ) - } - - val computeClusterTopKExecution = { - val clusterTopK = 
computeClusterTopKTweets(latestScores) - Execution.zip( - clusterTopK.printSummary(name = "ClusterTopK"), - clusterTopK.writeExecution( - ClusterTopKTweetsHourlySuffixSource(clusterTopKTweetsOutputPath, batchDateRange) - ) - ) - } - - Execution - .zip( - writeLatestScoresExecution, - computeTweetTopKExecution, - computeClusterTopKExecution - ).flatMap { _ => - // run next batch - runBatch(batchDateRange + batchSize) - } - } - } - - // start from the first batch - Util.printCounters( - Execution.zip( - debugExec, - runBatch( - DateRange(wholeDateRange.start, wholeDateRange.start + batchSize - Millisecs(1))) - ) - ) - } - } -} - -/** -For example: -scalding remote run --target src/scala/com/twitter/simclusters_v2/scalding/offline_job:dump_cluster_topk_job-adhoc \ ---user cassowary ---main-class com.twitter.simclusters_v2.scalding.offline_job.DumpClusterTopKTweetsAdhoc \ ---submitter hadoopnest2.atla.twitter.com -- \ ---date 2019-08-03 \ ---clusterTopKTweetsPath /atla/proc3/user/cassowary/processed/simclusters/cluster_top_k_tweets/ \ ---clusters 4446 - - */ -object DumpClusterTopKTweetsAdhoc extends TwitterExecutionApp { - - implicit val timeZone: TimeZone = DateOps.UTC - implicit val dateParser: DateParser = DateParser.default - - import com.twitter.simclusters_v2.scalding.common.TypedRichPipe._ - import com.twitter.simclusters_v2.summingbird.common.ThriftDecayedValueMonoid._ - - override def job: Execution[Unit] = - Execution.withId { implicit uniqueId => - Execution.withArgs { args: Args => - val date = DateRange.parse(args.list("date")) - val path = args("clusterTopKTweetsPath") - val input = TypedPipe.from(ClusterTopKTweetsHourlySuffixSource(path, date)) - val clusters = args.list("clusters").map(_.toInt).toSet - - val dvm = SimClustersOfflineJobUtil.thriftDecayedValueMonoid - if (clusters.isEmpty) { - input.printSummary("Cluster top k tweets") - } else { - input - .collect { - case rec if clusters.contains(rec.clusterId) => - val res = rec.topKTweets - .mapValues { x => - x.score - .map { y => - val enriched = new EnrichedThriftDecayedValue(y)(dvm) - enriched.decayToTimestamp(date.end.timestamp).value - }.getOrElse(0.0) - }.toList.sortBy(-_._2) - rec.clusterId + "\t" + Util.prettyJsonMapper - .writeValueAsString(res).replaceAll("\n", " ") - } - .toIterableExecution - .map { strings => println(strings.mkString("\n")) } - } - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/offline_job/SimClustersOfflineJobScheduledApp.docx b/src/scala/com/twitter/simclusters_v2/scalding/offline_job/SimClustersOfflineJobScheduledApp.docx new file mode 100644 index 000000000..288e6cb9d Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/offline_job/SimClustersOfflineJobScheduledApp.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/offline_job/SimClustersOfflineJobScheduledApp.scala b/src/scala/com/twitter/simclusters_v2/scalding/offline_job/SimClustersOfflineJobScheduledApp.scala deleted file mode 100644 index 8be6537d1..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/offline_job/SimClustersOfflineJobScheduledApp.scala +++ /dev/null @@ -1,113 +0,0 @@ -package com.twitter.simclusters_v2.scalding.offline_job - -import com.twitter.scalding._ -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.scalding_internal.dalv2.DALWrite._ -import com.twitter.simclusters_v2.hdfs_sources._ -import com.twitter.simclusters_v2.scalding.offline_job.SimClustersOfflineJob._ -import 
com.twitter.simclusters_v2.scalding.offline_job.SimClustersOfflineJobUtil._
-import com.twitter.simclusters_v2.thriftscala.TweetAndClusterScores
-import com.twitter.wtf.scalding.jobs.common.ScheduledExecutionApp
-import java.util.TimeZone
-
-/**
- * The offline job runs every 12 hours and saves three data sets to HDFS: tweet-cluster scores,
- * tweet -> top clusters, and cluster -> top tweets.
- *
- * capesospy-v2 update --build_locally --start_cron \
- * offline_tweet_job src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml
- */
-object SimClustersOfflineJobScheduledApp extends ScheduledExecutionApp {
-  import com.twitter.simclusters_v2.scalding.common.TypedRichPipe._
-
-  private val tweetClusterScoresDatasetPath: String =
-    "/user/cassowary/processed/simclusters/tweet_cluster_scores"
-  private val tweetTopKClustersDatasetPath: String =
-    "/user/cassowary/processed/simclusters/tweet_top_k_clusters"
-  private val clusterTopKTweetsDatasetPath: String =
-    "/user/cassowary/processed/simclusters/cluster_top_k_tweets"
-
-  override def batchIncrement: Duration = Hours(12)
-
-  override def firstTime: RichDate = RichDate("2020-05-25")
-
-  override def runOnDateRange(
-    args: Args
-  )(
-    implicit dateRange: DateRange,
-    timeZone: TimeZone,
-    uniqueID: UniqueID
-  ): Execution[Unit] = {
-
-    val previousTweetClusterScores: TypedPipe[TweetAndClusterScores] =
-      if (firstTime.timestamp == dateRange.start.timestamp) { // if it is the first batch
-        TypedPipe.from(Nil)
-      } else {
-        DAL
-          .readMostRecentSnapshot(
-            SimclustersOfflineTweetClusterScoresScalaDataset,
-            dateRange - batchIncrement
-          )
-          .toTypedPipe
-          .count("NumPreviousTweetClusterScores")
-      }
-
-    // We have to throw away old tweets somehow, otherwise the data set would grow without bound.
-    // We only keep the tweets that received at least 1 engagement in the last day.
- // This parameter can be adjusted - val tweetsToKeep = getSubsetOfValidTweets(Days(1)) - .count("NumTweetsToKeep") - - val updatedTweetClusterScores = computeAggregatedTweetClusterScores( - dateRange, - readInterestedInScalaDataset(dateRange), - readTimelineFavoriteData(dateRange), - previousTweetClusterScores - ).map { tweetClusterScore => - tweetClusterScore.tweetId -> tweetClusterScore - } - .count("NumUpdatedTweetClusterScoresBeforeFiltering") - .join(tweetsToKeep.asKeys) // filter out invalid tweets - .map { - case (_, (tweetClusterScore, _)) => tweetClusterScore - } - .count("NumUpdatedTweetClusterScores") - .forceToDisk - - val tweetTopKClusters = computeTweetTopKClusters(updatedTweetClusterScores) - .count("NumTweetTopKSaved") - val clusterTopKTweets = computeClusterTopKTweets(updatedTweetClusterScores) - .count("NumClusterTopKSaved") - - val writeTweetClusterScoresExec = updatedTweetClusterScores - .writeDALSnapshotExecution( - SimclustersOfflineTweetClusterScoresScalaDataset, - D.Hourly, // note that we use hourly in order to make it flexible for hourly batch size - D.Suffix(tweetClusterScoresDatasetPath), - D.EBLzo(), - dateRange.end - ) - - val writeTweetTopKClustersExec = tweetTopKClusters - .writeDALSnapshotExecution( - SimclustersOfflineTweetTopKClustersScalaDataset, - D.Hourly, // note that we use hourly in order to make it flexible for hourly batch size - D.Suffix(tweetTopKClustersDatasetPath), - D.EBLzo(), - dateRange.end - ) - - val writeClusterTopKTweetsExec = clusterTopKTweets - .writeDALSnapshotExecution( - SimclustersOfflineClusterTopKTweetsScalaDataset, - D.Hourly, // note that we use hourly in order to make it flexible for hourly batch size - D.Suffix(clusterTopKTweetsDatasetPath), - D.EBLzo(), - dateRange.end - ) - - Execution - .zip(writeTweetClusterScoresExec, writeTweetTopKClustersExec, writeClusterTopKTweetsExec) - .unit - } - -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/offline_job/SimClustersOfflineJobUtil.docx b/src/scala/com/twitter/simclusters_v2/scalding/offline_job/SimClustersOfflineJobUtil.docx new file mode 100644 index 000000000..8adb2691d Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/offline_job/SimClustersOfflineJobUtil.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/offline_job/SimClustersOfflineJobUtil.scala b/src/scala/com/twitter/simclusters_v2/scalding/offline_job/SimClustersOfflineJobUtil.scala deleted file mode 100644 index 50e91f5b4..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/offline_job/SimClustersOfflineJobUtil.scala +++ /dev/null @@ -1,97 +0,0 @@ -package com.twitter.simclusters_v2.scalding.offline_job - -import com.twitter.algebird.{DecayedValueMonoid, Monoid, OptionMonoid} -import com.twitter.algebird_internal.thriftscala.{DecayedValue => ThriftDecayedValue} -import com.twitter.scalding.{TypedPipe, _} -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.scalding_internal.dalv2.remote_access.{ExplicitLocation, ProcAtla} -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.common.{Timestamp, TweetId, UserId} -import com.twitter.simclusters_v2.hdfs_sources._ -import com.twitter.simclusters_v2.summingbird.common.{Configs, ThriftDecayedValueMonoid} -import com.twitter.simclusters_v2.thriftscala._ -import com.twitter.timelineservice.thriftscala.{ContextualizedFavoriteEvent, FavoriteEventUnion} -import java.util.TimeZone -import 
twadoop_config.configuration.log_categories.group.timeline.TimelineServiceFavoritesScalaDataset - -object SimClustersOfflineJobUtil { - - implicit val timeZone: TimeZone = DateOps.UTC - implicit val dateParser: DateParser = DateParser.default - - implicit val modelVersionOrdering: Ordering[PersistedModelVersion] = - Ordering.by(_.value) - - implicit val scoreTypeOrdering: Ordering[PersistedScoreType] = - Ordering.by(_.value) - - implicit val persistedScoresOrdering: Ordering[PersistedScores] = Ordering.by( - _.score.map(_.value).getOrElse(0.0) - ) - - implicit val decayedValueMonoid: DecayedValueMonoid = DecayedValueMonoid(0.0) - - implicit val thriftDecayedValueMonoid: ThriftDecayedValueMonoid = - new ThriftDecayedValueMonoid(Configs.HalfLifeInMs)(decayedValueMonoid) - - implicit val persistedScoresMonoid: PersistedScoresMonoid = - new PersistedScoresMonoid()(thriftDecayedValueMonoid) - - def readInterestedInScalaDataset( - implicit dateRange: DateRange - ): TypedPipe[(Long, ClustersUserIsInterestedIn)] = { - //read SimClusters InterestedIn datasets - DAL - .readMostRecentSnapshot( - SimclustersV2InterestedIn20M145KUpdatedScalaDataset, - dateRange.embiggen(Days(30)) - ) - .withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .toTypedPipe - .map { - case KeyVal(key, value) => (key, value) - } - } - - def readTimelineFavoriteData( - implicit dateRange: DateRange - ): TypedPipe[(UserId, TweetId, Timestamp)] = { - DAL - .read(TimelineServiceFavoritesScalaDataset, dateRange) // Note: this is a hourly source - .withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .toTypedPipe - .flatMap { cfe: ContextualizedFavoriteEvent => - cfe.event match { - case FavoriteEventUnion.Favorite(fav) => - Some((fav.userId, fav.tweetId, fav.eventTimeMs)) - case _ => - None - } - - } - } - - class PersistedScoresMonoid( - implicit thriftDecayedValueMonoid: ThriftDecayedValueMonoid) - extends Monoid[PersistedScores] { - - private val optionalThriftDecayedValueMonoid = - new OptionMonoid[ThriftDecayedValue]() - - override val zero: PersistedScores = PersistedScores() - - override def plus(x: PersistedScores, y: PersistedScores): PersistedScores = { - PersistedScores( - optionalThriftDecayedValueMonoid.plus( - x.score, - y.score - ) - ) - } - - def build(value: Double, timeInMs: Double): PersistedScores = { - PersistedScores(Some(thriftDecayedValueMonoid.build(value, timeInMs))) - } - } - -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/offline_job/adhoc/BUILD.bazel b/src/scala/com/twitter/simclusters_v2/scalding/offline_job/adhoc/BUILD.bazel deleted file mode 100644 index 437e716d4..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/offline_job/adhoc/BUILD.bazel +++ /dev/null @@ -1,81 +0,0 @@ -scala_library( - platform = "java8", - tags = ["bazel-only"], - dependencies = [ - "3rdparty/jvm/com/twitter/storehaus:algebra", - "3rdparty/jvm/com/twitter/storehaus:core", - "3rdparty/src/jvm/com/twitter/storehaus:algebra", - "3rdparty/src/jvm/com/twitter/storehaus:core", - "graphstore/common:flock_follows-java", - "snowflake:id", - "src/java/com/twitter/ml/api/constant", - "src/java/com/twitter/sbf/graph", - "src/scala/com/twitter/ml/api:api-base", - "src/scala/com/twitter/pluck/source/core_workflows/user_model:condensed_user_state-scala", - "src/scala/com/twitter/scalding_internal/dalv2", - "src/scala/com/twitter/scalding_internal/job", - "src/scala/com/twitter/scalding_internal/job/analytics_batch", - "src/scala/com/twitter/scalding_internal/source", - 
"src/scala/com/twitter/scalding_internal/source/lzo_scrooge", - "src/scala/com/twitter/simclusters_v2/candidate_source", - "src/scala/com/twitter/simclusters_v2/hdfs_sources", - "src/scala/com/twitter/simclusters_v2/scalding", - "src/scala/com/twitter/simclusters_v2/scalding/common", - "src/scala/com/twitter/simclusters_v2/scalding/offline_job", - "src/scala/com/twitter/simclusters_v2/summingbird/common", - "src/scala/com/twitter/timelines/prediction/features/common", - "src/scala/com/twitter/timelines/prediction/features/itl", - "src/scala/com/twitter/timelines/prediction/features/recap", - "src/scala/com/twitter/wtf/scalding/jobs/common:execution_app", - "src/thrift/com/twitter/hermit/candidate:hermit-candidate-scala", - "src/thrift/com/twitter/wtf/scalding/sims:sims-thrift-scala", - "twadoop_config/configuration/log_categories/group/timeline:timeline_service_favorites-scala", - "usersource/snapshot/src/main/scala/com/twitter/usersource/snapshot/flat:usersource_flat-scala", - "usersource/snapshot/src/main/thrift/com/twitter/usersource/snapshot/flat:flat-scala", - ], -) - -hadoop_binary( - name = "tweet_embedding-adhoc", - main = "com.twitter.simclusters_v2.scalding.offline_job.SimClustersTweetEmbeddingAdhocApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":adhoc", - ], -) - -hadoop_binary( - name = "tweet_embedding_evaluation_samples-adhoc", - main = "com.twitter.simclusters_v2.scalding.offline_job.TweetSimilarityEvaluationSamplingAdhocApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":adhoc", - ], -) - -hadoop_binary( - name = "tweet_embedding_evaluation-adhoc", - main = "com.twitter.simclusters_v2.scalding.offline_job.TweetSimilarityEvaluationAdhocApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":adhoc", - ], -) diff --git a/src/scala/com/twitter/simclusters_v2/scalding/offline_job/adhoc/BUILD.docx b/src/scala/com/twitter/simclusters_v2/scalding/offline_job/adhoc/BUILD.docx new file mode 100644 index 000000000..5e3d31f31 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/offline_job/adhoc/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/offline_job/adhoc/README b/src/scala/com/twitter/simclusters_v2/scalding/offline_job/adhoc/README deleted file mode 100644 index c5f963e67..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/offline_job/adhoc/README +++ /dev/null @@ -1,5 +0,0 @@ -To reproduce, you need to run - -1. SimClustersTweetEmbeddingAdhocApp to generate cluster -> top tweets and tweet -> top clusters data sets -2. TweetSimilarityEvaluationSamplingAdhocApp to sample a subset of tweets that you want to compute some metrics on -3. 
TweetSimilarityEvaluationAdhocApp to perform the evaluation diff --git a/src/scala/com/twitter/simclusters_v2/scalding/offline_job/adhoc/README.docx b/src/scala/com/twitter/simclusters_v2/scalding/offline_job/adhoc/README.docx new file mode 100644 index 000000000..c72f45866 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/offline_job/adhoc/README.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/offline_job/adhoc/SimClustersTweetEmbeddingAdhocApp.docx b/src/scala/com/twitter/simclusters_v2/scalding/offline_job/adhoc/SimClustersTweetEmbeddingAdhocApp.docx new file mode 100644 index 000000000..7c56fe6d4 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/offline_job/adhoc/SimClustersTweetEmbeddingAdhocApp.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/offline_job/adhoc/SimClustersTweetEmbeddingAdhocApp.scala b/src/scala/com/twitter/simclusters_v2/scalding/offline_job/adhoc/SimClustersTweetEmbeddingAdhocApp.scala deleted file mode 100644 index 5b2f382af..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/offline_job/adhoc/SimClustersTweetEmbeddingAdhocApp.scala +++ /dev/null @@ -1,211 +0,0 @@ -package com.twitter.simclusters_v2.scalding.offline_job.adhoc
-
-import com.twitter.bijection.{Bufferable, Injection}
-import com.twitter.scalding._
-import com.twitter.scalding.commons.source.VersionedKeyValSource
-import com.twitter.scalding_internal.dalv2.DAL
-import com.twitter.scalding_internal.dalv2.remote_access.{ExplicitLocation, ProcAtla}
-import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
-import com.twitter.simclusters_v2.common.{ClusterId, TweetId, UserId}
-import com.twitter.simclusters_v2.hdfs_sources.SimclustersV2InterestedIn20M145KUpdatedScalaDataset
-import com.twitter.simclusters_v2.scalding.common.matrix.{SparseMatrix, SparseRowMatrix}
-import com.twitter.simclusters_v2.scalding.offline_job.SimClustersOfflineJobUtil
-import com.twitter.simclusters_v2.summingbird.common.{Configs, SimClustersInterestedInUtil}
-import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn
-import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp
-import java.util.TimeZone
-
-/**
- * Adhoc job for computing Tweet SimClusters embeddings.
- * The output of this job includes two data sets: tweet -> top clusters (i.e., the Tweet embedding), and cluster -> top tweets.
- * These data sets are meant to be snapshots of the two indexes at the end of the dateRange you run.
- *
- * Note that you can also use the output from SimClustersOfflineJobScheduledApp for analysis purposes.
- * The outputs from that job may be closer to the data we use in production.
- * The benefit of this job is that it keeps the flexibility to experiment with different ideas.
- *
- * It is recommended to put at least 2 days in the --date (dateRange in the code) to make sure
- * there is enough engagement data for tweets that received most of their engagements in the last 1+ days.
- *
- *
- * There are several parameters to tune in the job. They are explained in the inline comments.
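- *
- * At its core, the job computes a tweet-by-cluster score matrix as a sparse matrix product.
- * An informal sketch using the names from the code below (not an additional code path):
- * {{{
- * // tweetClusterScore(t, c) = sum over users u of favWeight(u, t) * interestedIn(u, c)
- * val tweetClusterScoreMatrix =
- *   userTweetFavDataSubset.transpose.multiplySkinnySparseRowMatrix(userSimClustersInterestedInData)
- * }}}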
- *
- *
- * To run the job:
- scalding remote run \
- --target src/scala/com/twitter/simclusters_v2/scalding/offline_job/adhoc:tweet_embedding-adhoc \
- --user recos-platform \
- --reducers 1000 \
- --main-class com.twitter.simclusters_v2.scalding.offline_job.adhoc.SimClustersTweetEmbeddingAdhocApp -- \
- --date 2021-01-27 2021-01-28 \
- --score_type logFav \
- --output_dir /user/recos-platform/adhoc/tweet_embedding_01_27_28_unnormalized_t9
- */
-object SimClustersTweetEmbeddingAdhocApp extends AdhocExecutionApp {
-
-  import SimClustersOfflineJobUtil._
-
-  override def runOnDateRange(
-    args: Args
-  )(
-    implicit dateRange: DateRange,
-    timeZone: TimeZone,
-    uniqueID: UniqueID
-  ): Execution[Unit] = {
-
-    val outputDir = args("output_dir")
-
-    // which interestedIn score to use. logFav is what we use in production
-    val scoringMethod = args.getOrElse("score_type", "logFav")
-
-    // whether to use normalized scores in the cluster -> top tweets index.
-    // Currently, we do not do this in production. DO NOT turn it on unless you know what you are doing.
-    // NOTE that for scalding args, "--run_normalized" will just set the arg to true; even if you
-    // pass "--run_normalized false", it will still be true.
-    val usingNormalizedScoringFunction = args.boolean("run_normalized")
-
-    // filter out tweets that have fewer than X favs in the dateRange.
-    val tweetFavThreshold = args.long("tweet_fav_threshold", 0L)
-
-    // tweet -> top clusters will be saved in this subfolder
-    val tweetTopKClustersOutputPath: String = outputDir + "/tweet_top_k_clusters"
-
-    // cluster -> top tweets will be saved in this subfolder
-    val clusterTopKTweetsOutputPath: String = outputDir + "/cluster_top_k_tweets"
-
-    val interestedInData: TypedPipe[(Long, ClustersUserIsInterestedIn)] =
-      DAL
-        .readMostRecentSnapshot(
-          SimclustersV2InterestedIn20M145KUpdatedScalaDataset,
-          dateRange.embiggen(Days(14))
-        )
-        .withRemoteReadPolicy(ExplicitLocation(ProcAtla))
-        .toTypedPipe
-        .map {
-          case KeyVal(key, value) => (key, value)
-        }
-
-    // read user-tweet fav data and set each weight to a decayed value; weights are decayed to dateRange.end
-    val userTweetFavData: SparseMatrix[UserId, TweetId, Double] =
-      SparseMatrix(readTimelineFavoriteData(dateRange)).tripleApply {
-        case (userId, tweetId, timestamp) =>
-          (
-            userId,
-            tweetId,
-            thriftDecayedValueMonoid
-              .plus(
-                thriftDecayedValueMonoid.build(1.0, timestamp),
-                thriftDecayedValueMonoid.build(0.0, dateRange.end.timestamp)
-              )
-              .value)
-      }
-
-    // filter out tweets with fewer than x favs
-    val tweetSubset =
-      userTweetFavData.colNnz.filter(
-        _._2 > tweetFavThreshold.toDouble
-      ) // keep tweets with at least x favs
-
-    val userTweetFavDataSubset = userTweetFavData.filterCols(tweetSubset.keys)
-
-    // construct user-simclusters matrix
-    val userSimClustersInterestedInData: SparseRowMatrix[UserId, ClusterId, Double] =
-      SparseRowMatrix(
-        interestedInData.map {
-          case (userId, clusters) =>
-            val topClustersWithScores =
-              SimClustersInterestedInUtil
-                .topClustersWithScores(clusters)
-                .collect {
-                  case (clusterId, scores)
-                      if scores.favScore > Configs
-                        .favScoreThresholdForUserInterest(
-                          clusters.knownForModelVersion
-                        ) => // this is the same threshold used in the summingbird job
-                    scoringMethod match {
-                      case "fav" =>
-                        clusterId -> scores.clusterNormalizedFavScore
-                      case "follow" =>
-                        clusterId -> scores.clusterNormalizedFollowScore
-                      case "logFav" =>
-                        clusterId -> scores.clusterNormalizedLogFavScore
-                      case _ =>
-                        throw new IllegalArgumentException(
-                          "score_type can only be fav, follow or logFav")
-                    }
-                }
-                .filter(_._2 > 0.0)
-                .toMap
-            userId -> topClustersWithScores
-        },
-        isSkinnyMatrix = true
-      )
-
-    // multiply the tweet -> user matrix with the user -> cluster matrix to get the tweet -> cluster matrix
-    val tweetClusterScoreMatrix = if (usingNormalizedScoringFunction) {
-      userTweetFavDataSubset.transpose.rowL2Normalize
-        .multiplySkinnySparseRowMatrix(userSimClustersInterestedInData)
-    } else {
-      userTweetFavDataSubset.transpose.multiplySkinnySparseRowMatrix(
-        userSimClustersInterestedInData)
-    }
-
-    // get the tweet -> top clusters by taking the top K in each row
-    val tweetTopClusters = tweetClusterScoreMatrix
-      .sortWithTakePerRow(Configs.topKClustersPerTweet)(Ordering.by(-_._2))
-      .fork
-
-    // get the cluster -> top tweets by taking the top K in each column
-    val clusterTopTweets = tweetClusterScoreMatrix
-      .sortWithTakePerCol(Configs.topKTweetsPerCluster)(Ordering.by(-_._2))
-      .fork
-
-    // injections for saving a list
-    implicit val inj1: Injection[List[(Int, Double)], Array[Byte]] =
-      Bufferable.injectionOf[List[(Int, Double)]]
-    implicit val inj2: Injection[List[(Long, Double)], Array[Byte]] =
-      Bufferable.injectionOf[List[(Long, Double)]]
-
-    // save the data sets and also output to some tsv files for eyeballing the results
-    Execution
-      .zip(
-        tweetTopClusters
-          .mapValues(_.toList)
-          .writeExecution(
-            VersionedKeyValSource[TweetId, List[(ClusterId, Double)]](tweetTopKClustersOutputPath)
-          ),
-        tweetTopClusters
-          .map {
-            case (tweetId, topKClusters) =>
-              tweetId -> topKClusters
-                .map {
-                  case (clusterId, score) =>
-                    s"$clusterId:" + "%.3g".format(score)
-                }
-                .mkString(",")
-          }
-          .writeExecution(
-            TypedTsv(tweetTopKClustersOutputPath + "_tsv")
-          ),
-        tweetSubset.writeExecution(TypedTsv(tweetTopKClustersOutputPath + "_tweet_favs")),
-        clusterTopTweets
-          .mapValues(_.toList)
-          .writeExecution(
-            VersionedKeyValSource[ClusterId, List[(TweetId, Double)]](clusterTopKTweetsOutputPath)
-          ),
-        clusterTopTweets
-          .map {
-            case (clusterId, topKTweets) =>
-              clusterId -> topKTweets
-                .map {
-                  case (tweetId, score) => s"$tweetId:" + "%.3g".format(score)
"%.3g".format(score) - } - .mkString(",") - } - .writeExecution( - TypedTsv(clusterTopKTweetsOutputPath + "_tsv") - ) - ) - .unit - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/offline_job/adhoc/TweetSimilarityEvaluationAdhocApp.docx b/src/scala/com/twitter/simclusters_v2/scalding/offline_job/adhoc/TweetSimilarityEvaluationAdhocApp.docx new file mode 100644 index 000000000..58e53f64e Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/offline_job/adhoc/TweetSimilarityEvaluationAdhocApp.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/offline_job/adhoc/TweetSimilarityEvaluationAdhocApp.scala b/src/scala/com/twitter/simclusters_v2/scalding/offline_job/adhoc/TweetSimilarityEvaluationAdhocApp.scala deleted file mode 100644 index ea64df6e2..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/offline_job/adhoc/TweetSimilarityEvaluationAdhocApp.scala +++ /dev/null @@ -1,362 +0,0 @@ -package com.twitter.simclusters_v2.scalding.offline_job.adhoc - -import com.twitter.bijection.{Bufferable, Injection} -import com.twitter.scalding._ -import com.twitter.scalding.commons.source.VersionedKeyValSource -import com.twitter.simclusters_v2.common.{ClusterId, CosineSimilarityUtil, TweetId} -import com.twitter.simclusters_v2.scalding.common.matrix.SparseRowMatrix -import com.twitter.simclusters_v2.scalding.offline_job.SimClustersOfflineJobUtil -import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp -import java.util.TimeZone - -/** - * - * A job to sample some tweets for evaluation. - * - * we bucket tweets by the log(# of fav + 1) and randomly pick 1000 for each bucket for evaluation. - * - * to run the job: - * - scalding remote run \ - --target src/scala/com/twitter/simclusters_v2/scalding/offline_job/adhoc:tweet_embedding_evaluation_samples-adhoc \ - --user recos-platform \ - --reducers 1000 \ - --main-class com.twitter.simclusters_v2.scalding.offline_job.adhoc.TweetSimilarityEvaluationSamplingAdhocApp -- \ - --date 2021-01-27 2021-01-28 \ - --output /user/recos-platform/adhoc/tweet_embedding_01_27_28_sample_tweets - */ -object TweetSimilarityEvaluationSamplingAdhocApp extends AdhocExecutionApp { - - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - - val random = new java.util.Random(args.long("seed", 20200322L)) - - // # of tweets in each bucket - val topK = args.int("bucket_size", 1000) - - val output = args("output") - - SimClustersOfflineJobUtil - .readTimelineFavoriteData(dateRange) - .map { - case (_, tweetId, _) => - tweetId -> 1L - } - .sumByKey - .filter(_._2 >= 10L) // only consider tweets with more than 10 favs - .map { - case (tweetId, tweetFavs) => - val bucket = math.log10(tweetFavs + 1.0).toInt - bucket -> (tweetId, random.nextDouble()) - } - .group - .sortedReverseTake(topK)(Ordering.by(_._2)) - .flatMap { - case (bucket, tweets) => - val bucketSize = tweets.length - tweets.map { - case (tweetId, _) => - (tweetId, bucket, bucketSize) - } - } - .writeExecution( - TypedTsv[(Long, Int, Int)](output) - ) - - } -} - -/** - * - * A job for evaluating the performance of an approximate nearest neighbor search method with a brute - * force method. - * - * Evaluation method: - * - * After getting the embeddings for these tweets, we bucketize tweets based on the number of favs they have - * (i.e., math.log10(numFavors).toInt), and then randomly select 1000 tweets from each bucket. 
- * We do not include tweets with fewer than 10 favs. We compute the nearest neighbors (in terms of cosine similarity)
- * for these tweets using the brute-force method and use up to the top 100 neighbors with a cosine
- * similarity score > 0.8 for each tweet as ground-truth set G.
- *
- * We then compute the nearest neighbors for these tweets based on the approximate nearest neighbor
- * search: for each tweet, we find the top clusters, and then find the top tweets in each cluster as
- * potential candidates. We rank these potential candidates by their cosine similarity scores and
- * take the top 100 as prediction set P. We evaluate precision and recall using
- *
- * Precision = |P \intersect G| / |P|
- * Recall = |P \intersect G| / |G|
- *
- * Note that |P| and |G| can differ when not many neighbors are returned.
- *
- scalding remote run \
- --target src/scala/com/twitter/simclusters_v2/scalding/offline_job/adhoc:tweet_embedding_evaluation-adhoc \
- --user recos-platform \
- --reducers 1000 \
- --main-class com.twitter.simclusters_v2.scalding.offline_job.adhoc.TweetSimilarityEvaluationAdhocApp -- \
- --date 2021-01-27 \
- --tweet_top_k /user/recos-platform/adhoc/tweet_embedding_01_27_28_unnormalized_t9/tweet_top_k_clusters \
- --cluster_top_k /user/recos-platform/adhoc/tweet_embedding_01_27_28_unnormalized_t9/cluster_top_k_tweets \
- --tweets /user/recos-platform/adhoc/tweet_embedding_01_27_28_sample_tweets \
- --output /user/recos-platform/adhoc/tweet_embedding_evaluation_01_27_28_t05_k50_1
- */
-object TweetSimilarityEvaluationAdhocApp extends AdhocExecutionApp {
-
-  implicit val inj1: Injection[List[(Int, Double)], Array[Byte]] =
-    Bufferable.injectionOf[List[(Int, Double)]]
-  implicit val inj2: Injection[List[(Long, Double)], Array[Byte]] =
-    Bufferable.injectionOf[List[(Long, Double)]]
-
-  // Take the top 10 candidates; scores are scaled by 100 and truncated to Int
-  private def formatList(candidates: Seq[(TweetId, Double)]): Seq[(TweetId, Int)] = {
-    candidates.take(10).map {
-      case (clusterId, score) =>
-        (clusterId, (score * 100).toInt)
-    }
-  }
-
-  override def runOnDateRange(
-    args: Args
-  )(
-    implicit dateRange: DateRange,
-    timeZone: TimeZone,
-    uniqueID: UniqueID
-  ): Execution[Unit] = {
-
-    // path to read the tweet -> top clusters data set; should match the output of the SimClustersTweetEmbeddingAdhocApp job
-    val tweetTopKClustersPath = args("tweet_top_k")
-
-    // path to read the cluster -> top tweets data set; should match the output of the SimClustersTweetEmbeddingAdhocApp job
-    val clusterTopKTweetsPath = args("cluster_top_k")
-
-    // path to read the sampled tweets; should match the output of TweetSimilarityEvaluationSamplingAdhocApp
-    val tweetsPath = args("tweets")
-
-    // see the comment of this class. this is to determine which tweets should be ground truth
-    val threshold = args.double("threshold", 0.8)
-
-    // see the comment of this class. this caps how many neighbors are kept in the ground-truth set
-    val topK = args.int("topK", 100)
-
-    // output path for evaluation results
-    val output = args("output")
-
-    // read the tweet -> top clusters data set
-    val tweetTopKClusters: SparseRowMatrix[TweetId, ClusterId, Double] =
-      SparseRowMatrix(
-        TypedPipe
-          .from(
-            VersionedKeyValSource[TweetId, List[(ClusterId, Double)]](tweetTopKClustersPath)
-          )
-          .mapValues(_.filter(_._2 > 0.001).toMap),
-        isSkinnyMatrix = true
-      ).rowL2Normalize
-
-    // read the cluster -> top tweets data set
-    val clusterTopTweets: SparseRowMatrix[ClusterId, TweetId, Double] =
-      SparseRowMatrix(
-        TypedPipe
-          .from(
-            VersionedKeyValSource[ClusterId, List[(TweetId, Double)]](clusterTopKTweetsPath)
-          )
-          .mapValues(_.filter(_._2 > 0.02).toMap),
-        isSkinnyMatrix = false
-      )
-
-    // read the sampled tweets from TweetSimilarityEvaluationSamplingAdhocApp
-    val tweetSubset = TypedPipe.from(TypedTsv[(Long, Int, Int)](tweetsPath))
-
-    // the tweet -> top clusters for the sampled tweets
-    val tweetEmbeddingSubset =
-      tweetTopKClusters.filterRows(tweetSubset.map(_._1))
-
-    // compute the ground-truth top similar tweets for each sampled tweet.
-    // for each sampled tweet, we compute its similarity with every tweet in the tweet -> top clusters data set.
-    // we filter out those with a similarity score smaller than the threshold and keep the top k as the ground-truth similar tweets
-    val groundTruthData = tweetTopKClusters.toSparseMatrix
-      .multiplySkinnySparseRowMatrix(
-        tweetEmbeddingSubset.toSparseMatrix.transpose.toSparseRowMatrix(true),
-        numReducersOpt = Some(5000)
-      )
-      .toSparseMatrix
-      .transpose
-      .filter((_, _, v) => v > threshold)
-      .sortWithTakePerRow(topK)(Ordering.by(-_._2))
-
-    // compute approximate similar tweets for each sampled tweet.
-    // this is achieved by multiplying the "sampled_tweets -> top clusters" matrix with the "cluster -> top tweets" matrix.
-    // note that in the implementation, we first compute the transpose of this matrix in order to utilize the optimization done on skinny matrices
-    val predictionData = clusterTopTweets.toSparseMatrix.transpose
-      .multiplySkinnySparseRowMatrix(
-        tweetEmbeddingSubset.toSparseMatrix.transpose.toSparseRowMatrix(true),
-        numReducersOpt = Some(5000)
-      )
-      .toSparseMatrix
-      .transpose
-      .toTypedPipe
-      .map {
-        case (queryTweet, candidateTweet, _) =>
-          (queryTweet, candidateTweet)
-      }
-      .join(tweetEmbeddingSubset.toTypedPipe)
-      .map {
-        case (queryId, (candidateId, queryEmbedding)) =>
-          candidateId -> (queryId, queryEmbedding)
-      }
-      .join(tweetTopKClusters.toTypedPipe)
-      .map {
-        case (candidateId, ((queryId, queryEmbedding), candidateEmbedding)) =>
-          queryId -> (candidateId, CosineSimilarityUtil
-            .dotProduct(
-              queryEmbedding,
-              candidateEmbedding
-            ))
-      }
-      .filter(_._2._2 > threshold)
-      .group
-      .sortedReverseTake(topK)(Ordering.by(_._2))
-
-    // candidates that exist in the ground-truth set but not in the prediction set
-    val potentialData =
-      groundTruthData
-        .leftJoin(predictionData)
-        .map {
-          case (tweetId, (groundTruthCandidates, predictedCandidates)) =>
-            val predictedCandidateSet = predictedCandidates.toSeq.flatten.map(_._1).toSet
-            val potentialTweets = groundTruthCandidates.filterNot {
-              case (candidateId, _) =>
-                predictedCandidateSet.contains(candidateId)
-            }
-            (tweetId, potentialTweets)
-        }
-
-    val debuggingData =
-      groundTruthData
-        .leftJoin(predictionData)
-        .map {
-          case (tweetId, (groundTruthTweets, maybePredictedTweets)) =>
-            val predictedTweets = maybePredictedTweets.toSeq.flatten
-            val predictedTweetSet = predictedTweets.map(_._1).toSet
-            val potentialTweets = groundTruthTweets.filterNot {
-              case (candidateId, _) =>
-                predictedTweetSet.contains(candidateId)
-            }
-
-            (
-              tweetId,
-              Seq(
-                formatList(potentialTweets),
-                formatList(groundTruthTweets),
-                formatList(predictedTweets)))
-        }
-
-    // for each tweet, compare the approximate top-k and the ground-truth top-k.
-    // compute precision and recall, then average them per bucket.
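-    // Illustrative arithmetic (hypothetical numbers, not from this job): if |G| = 80, |P| = 100,
-    // and |P \intersect G| = 60, then precision = 60/100 = 0.6 and recall = 60/80 = 0.75.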
- val eval = tweetSubset - .map { - case (tweetId, bucket, bucketSize) => - tweetId -> (bucket, bucketSize) - } - .leftJoin(groundTruthData) - .leftJoin(predictionData) - .map { - case (_, (((bucket, bucketSize), groundTruthOpt), predictionOpt)) => - val groundTruth = groundTruthOpt.getOrElse(Nil).map(_._1) - val prediction = predictionOpt.getOrElse(Nil).map(_._1) - - assert(groundTruth.distinct.size == groundTruth.size) - assert(prediction.distinct.size == prediction.size) - - val intersection = groundTruth.toSet.intersect(prediction.toSet) - - val precision = - if (prediction.nonEmpty) - intersection.size.toDouble / prediction.size.toDouble - else 0.0 - val recall = - if (groundTruth.nonEmpty) - intersection.size.toDouble / groundTruth.size.toDouble - else 0.0 - - ( - bucket, - bucketSize) -> (groundTruth.size, prediction.size, intersection.size, precision, recall, 1.0) - } - .sumByKey - .map { - case ( - (bucket, bucketSize), - (groundTruthSum, predictionSum, interSectionSum, precisionSum, recallSum, count)) => - ( - bucket, - bucketSize, - groundTruthSum / count, - predictionSum / count, - interSectionSum / count, - precisionSum / count, - recallSum / count, - count) - } - - // output the eval results and some sample results for eyeballing - Execution - .zip( - eval - .writeExecution(TypedTsv(output)), - groundTruthData - .map { - case (tweetId, neighbors) => - tweetId -> neighbors - .map { - case (id, score) => s"$id:$score" - } - .mkString(",") - } - .writeExecution( - TypedTsv(args("output") + "_ground_truth") - ), - predictionData - .map { - case (tweetId, neighbors) => - tweetId -> neighbors - .map { - case (id, score) => s"$id:$score" - } - .mkString(",") - } - .writeExecution( - TypedTsv(args("output") + "_prediction") - ), - potentialData - .map { - case (tweetId, neighbors) => - tweetId -> neighbors - .map { - case (id, score) => s"$id:$score" - } - .mkString(",") - }.writeExecution( - TypedTsv(args("output") + "_potential") - ), - debuggingData - .map { - case (tweetId, candidateList) => - val value = candidateList - .map { candidates => - candidates - .map { - case (id, score) => - s"${id}D$score" - }.mkString("C") - }.mkString("B") - s"${tweetId}A$value" - }.writeExecution( - TypedTsv(args("output") + "_debugging") - ) - ) - .unit - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/offline_tweets/BUILD.bazel b/src/scala/com/twitter/simclusters_v2/scalding/offline_tweets/BUILD.bazel deleted file mode 100644 index f1293bac2..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/offline_tweets/BUILD.bazel +++ /dev/null @@ -1,27 +0,0 @@ -scala_library( - sources = [ - "*.scala", - ], - platform = "java8", - tags = ["bazel-only"], - dependencies = [ - "src/scala/com/twitter/simclusters_v2/scalding", - "tweetsource/public_tweets/src/main/scala/com/twitter/tweetsource/public_tweets:public_tweets-scala", - "usersource/snapshot/src/main/scala/com/twitter/usersource/snapshot/flat:usersource_flat-scala", - ], -) - -hadoop_binary( - name = "offline_cluster_top_media_tweets_20M_145K_2020-adhoc", - main = "com.twitter.simclusters_v2.scalding.offline_tweets.AdhocClusterTopTweetsJob", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":offline_tweets", - ], -) diff --git a/src/scala/com/twitter/simclusters_v2/scalding/offline_tweets/BUILD.docx b/src/scala/com/twitter/simclusters_v2/scalding/offline_tweets/BUILD.docx new file mode 100644 index 
000000000..3b0b06018 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/offline_tweets/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/offline_tweets/ClusterTopMediaTweetsJob.docx b/src/scala/com/twitter/simclusters_v2/scalding/offline_tweets/ClusterTopMediaTweetsJob.docx new file mode 100644 index 000000000..bcf77d297 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/offline_tweets/ClusterTopMediaTweetsJob.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/offline_tweets/ClusterTopMediaTweetsJob.scala b/src/scala/com/twitter/simclusters_v2/scalding/offline_tweets/ClusterTopMediaTweetsJob.scala deleted file mode 100644 index f966a6a93..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/offline_tweets/ClusterTopMediaTweetsJob.scala +++ /dev/null @@ -1,267 +0,0 @@ -package com.twitter.simclusters_v2.scalding.offline_tweets - -import com.twitter.algebird.Aggregator.size -import com.twitter.finagle.mtls.authentication.ServiceIdentifier -import com.twitter.scalding.typed.TypedPipe -import com.twitter.scalding.Args -import com.twitter.scalding.DateOps -import com.twitter.scalding.DateParser -import com.twitter.scalding.DateRange -import com.twitter.scalding.Days -import com.twitter.scalding.Duration -import com.twitter.scalding.Execution -import com.twitter.scalding.Hours -import com.twitter.scalding.RichDate -import com.twitter.scalding.TypedTsv -import com.twitter.scalding.UniqueID -import com.twitter.scalding_internal.dalv2.DALWrite.D -import com.twitter.scalding_internal.dalv2.DALWrite.WriteExtension -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.common.Timestamp -import com.twitter.simclusters_v2.common.TweetId -import com.twitter.simclusters_v2.hdfs_sources.DataPaths -import com.twitter.simclusters_v2.hdfs_sources.OfflineClusterTopMediaTweets20M145K2020ScalaDataset -import com.twitter.simclusters_v2.scalding.common.LogFavBasedPersistentTweetEmbeddingMhExportSource -import com.twitter.simclusters_v2.scalding.common.Util -import com.twitter.simclusters_v2.scalding.embedding.common.ExternalDataSources -import com.twitter.simclusters_v2.thriftscala.DayPartitionedClusterId -import com.twitter.simclusters_v2.thriftscala.PersistentSimClustersEmbedding -import com.twitter.simclusters_v2.thriftscala.TweetWithScore -import com.twitter.simclusters_v2.thriftscala.TweetsWithScore -import com.twitter.snowflake.id.SnowflakeId -import com.twitter.tweetsource.common.thriftscala.MediaType -import com.twitter.tweetsource.common.thriftscala.UnhydratedFlatTweet -import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp -import com.twitter.wtf.scalding.jobs.common.ScheduledExecutionApp -import java.util.TimeZone -import java.text.SimpleDateFormat - -object ClusterTopTweetsJob { - - def serviceIdentifier(zone: String, env: String): ServiceIdentifier = ServiceIdentifier( - role = "cassowary", - service = "offline_cluster_top_media_tweets_20M_145K_2020", - environment = env, - zone = zone - ) - - private def isMediaTweet(tweet: UnhydratedFlatTweet): Boolean = { - tweet.media.exists { mediaSeq => - mediaSeq.exists { e => - e.mediaType.contains(MediaType.Video) - } - } - } - - private val dateFormatter = new SimpleDateFormat("yyyy-MM-dd") - - def getClusterTopMediaTweets( - persistentEmbeddingPipe: TypedPipe[((TweetId, Timestamp), PersistentSimClustersEmbedding)], - tweetSourcePipe: TypedPipe[UnhydratedFlatTweet], - 
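// caps the number of (tweetId, score) pairs kept per (clusterId, dayPartition) key, enforced by the take() below -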
maxTweetsPerClusterPerPartition: Int - ): TypedPipe[(DayPartitionedClusterId, Seq[(TweetId, Double)])] = { - val mediaTweetsPipe = tweetSourcePipe.collect { - case tweet if isMediaTweet(tweet) => (tweet.tweetId, ()) - } - - val tweetEmbeddingsPipe: TypedPipe[(TweetId, (Int, Double))] = { - persistentEmbeddingPipe.collect { - case ((tweetId, timestamp), persistentEmbedding) - if timestamp == 1L => // 1L is the longest L2 embedding - - persistentEmbedding.embedding.embedding.map { clusterWithScore => - (tweetId, (clusterWithScore.clusterId, clusterWithScore.score)) - } - }.flatten - } - - mediaTweetsPipe - .join(tweetEmbeddingsPipe) - .withReducers(2000) - .map { - case (tweetId, ((), (clusterId, score))) => - val dayPartition = dateFormatter.format(SnowflakeId(tweetId).time.inMilliseconds) - ((clusterId, dayPartition), Seq((tweetId, score))) - } - .sumByKey - .mapValues(_.sortBy(-_._2).take(maxTweetsPerClusterPerPartition)) - .map { case ((cid, partition), values) => (DayPartitionedClusterId(cid, partition), values) } - } - - // Convert to Manhattan compatible format - def toKeyVal( - clusterTopTweets: TypedPipe[(DayPartitionedClusterId, Seq[(TweetId, Double)])], - ): TypedPipe[KeyVal[DayPartitionedClusterId, TweetsWithScore]] = { - clusterTopTweets.map { - case (key, tweetsWithScores) => - val thrift = tweetsWithScores.map { t => TweetWithScore(t._1, t._2) } - KeyVal(key, TweetsWithScore(thrift)) - } - } -} - -/** - * Scheduled job. Runs every couple of hours (check the .yaml for the exact cron schedule). - * Reads 21 days of tweets, and the most recent persistent tweet embeddings from a Manhattan dump. - * It outputs a clusterId -> List[tweetId] index. - -capesospy-v2 update --build_locally --start_cron \ -offline_cluster_top_media_tweets_20M_145K_2020 src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml - */ -object ClusterTopMediaTweets20M145K2020BatchJob extends ScheduledExecutionApp { - override def firstTime: RichDate = RichDate("2021-08-29") - - override def batchIncrement: Duration = Hours(3) - - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - - // the public tweet source keeps at most 21 days; read 1 day fewer to give some buffer - val lookbackDateRange = dateRange.prepend(Days(21)) - - val tweetSource: TypedPipe[UnhydratedFlatTweet] = - ExternalDataSources.flatTweetsSource(lookbackDateRange) - - val persistentEmbeddingPipe: TypedPipe[ - ((TweetId, Timestamp), PersistentSimClustersEmbedding) - ] = - TypedPipe.from( - new LogFavBasedPersistentTweetEmbeddingMhExportSource( - range = lookbackDateRange, - serviceIdentifier = ClusterTopTweetsJob.serviceIdentifier(args("zone"), args("env")) - )) - - val maxTweetsPerClusterPerPartition = 1200 - - val dailyClusterTopTweets = ClusterTopTweetsJob.getClusterTopMediaTweets( - persistentEmbeddingPipe, - tweetSource, - maxTweetsPerClusterPerPartition - ) - - val keyValPipe: TypedPipe[KeyVal[DayPartitionedClusterId, TweetsWithScore]] = - ClusterTopTweetsJob.toKeyVal(dailyClusterTopTweets) - - keyValPipe - .writeDALVersionedKeyValExecution( - OfflineClusterTopMediaTweets20M145K2020ScalaDataset, - D.Suffix(DataPaths.OfflineClusterTopMediaTweets2020DatasetPath) - ) - } -} - -/** -Adhoc debugging job.
Computes the cluster -> top media tweets index from persistent tweet embeddings and prints summary stats of the results - -./bazel bundle src/scala/com/twitter/simclusters_v2/scalding/offline_tweets/ &&\ -scalding remote run \ - --main-class com.twitter.simclusters_v2.scalding.offline_tweets.AdhocClusterTopMediaTweetsJob \ - --target src/scala/com/twitter/simclusters_v2/scalding/offline_tweets/:offline_cluster_top_media_tweets_20M_145K_2020-adhoc \ - --user cassowary \ - -- --output_dir /scratch_user/cassowary/your_ldap --date 2021-08-30 --zone atla --env prod --email your_ldap@twitter.com - */ -object AdhocClusterTopMediaTweetsJob extends AdhocExecutionApp { - - /** - * Run some stat analysis on the results, such as the number of tweets in a cluster, tweet score - * distributions, etc. - * - * Ideally works on 1 day of data only. If multiple days of data are passed in, it'll aggregate over - * multiple days anyway - */ - def analyzeClusterResults( - clusterTopTweets: TypedPipe[(DayPartitionedClusterId, Seq[(TweetId, Double)])] - ): Execution[String] = { - - val tweetSizeExec = Util.printSummaryOfNumericColumn( - clusterTopTweets.map { case (_, tweets) => tweets.size }, - columnName = Some("Tweet size distribution of clusters") - ) - - val scoreDistExec = Util.printSummaryOfNumericColumn( - clusterTopTweets.flatMap(_._2.map(_._2)), - columnName = Some("Score distribution of the tweets") - ) - - val numClustersExec = - clusterTopTweets.map(_._1._1).distinct.aggregate(size).getOrElseExecution(0L) - - val numTweetsExec = - clusterTopTweets.flatMap(_._2.map(_._1)).distinct.aggregate(size).getOrElseExecution(0L) - - Execution.zip(tweetSizeExec, scoreDistExec, numClustersExec, numTweetsExec).map { - case (tweetSizeDist, scoreDist, numClusters, numTweets) => - s""" - |Number of unique tweets = $numTweets - |Number of clusters = $numClusters - |------------------------ - |$tweetSizeDist - |------------------------ - |$scoreDist - |""".stripMargin - } - } - - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - val startTime = System.currentTimeMillis() - Execution.withArgs { args => - Execution.getMode.flatMap { implicit mode => - implicit val dateRange: DateRange = - DateRange.parse(args.list("date"))(DateOps.UTC, DateParser.default) - - val outputDir = args("output_dir") - - val maxTweetsPerCluster = 100 - - // the public tweet source keeps at most 21 days; read 1 day fewer to give some buffer - val lookbackDateRange = dateRange.prepend(Days(21)) - - val tweetSource: TypedPipe[UnhydratedFlatTweet] = - ExternalDataSources.flatTweetsSource(lookbackDateRange) - - val persistentEmbeddingPipe: TypedPipe[ - ((TweetId, Timestamp), PersistentSimClustersEmbedding) - ] = - TypedPipe.from( - new LogFavBasedPersistentTweetEmbeddingMhExportSource( - range = lookbackDateRange, - serviceIdentifier = ClusterTopTweetsJob.serviceIdentifier(args("zone"), args("env")) - )) - - val results = ClusterTopTweetsJob.getClusterTopMediaTweets( - persistentEmbeddingPipe, - tweetSource, - maxTweetsPerCluster - ) - analyzeClusterResults(results) - .flatMap { distributions => - val timeTakenMin = (System.currentTimeMillis() - startTime) / 60000 - val text = - s""" - | AdhocClusterTopMediaTweetsJob finished on: $dateRange. - | Time taken: $timeTakenMin minutes. - | maxTweetsPerCluster: $maxTweetsPerCluster.
- | output_dir: $outputDir - | - | $distributions - """.stripMargin - Util.sendEmail(text, "AdhocClusterTopMediaTweetsJob finished.", args("email")) - - results - .writeExecution(TypedTsv(outputDir)) - } - } - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/optout/BUILD.bazel b/src/scala/com/twitter/simclusters_v2/scalding/optout/BUILD.bazel deleted file mode 100644 index af06cdd1a..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/optout/BUILD.bazel +++ /dev/null @@ -1,81 +0,0 @@ -scala_library( - sources = ["*.scala"], - platform = "java8", - tags = [ - "bazel-compatible", - "bazel-only", - ], - dependencies = [ - "src/scala/com/twitter/octain/p13n/batch:p13n_preferences-scala", - "src/scala/com/twitter/simclusters_v2/hdfs_sources", - "src/scala/com/twitter/simclusters_v2/hdfs_sources:data_sources", - "src/scala/com/twitter/simclusters_v2/scalding", - "src/scala/com/twitter/simclusters_v2/scalding/common", - "src/scala/com/twitter/simclusters_v2/scalding/inferred_entities", - "src/scala/com/twitter/wtf/scalding/jobs/common:cassowary_job", - "src/scala/com/twitter/wtf/scalding/jobs/common:execution_app", - "src/scala/com/twitter/wtf/scalding/jobs/common:sources", - "src/scala/com/twitter/wtf/scalding/jobs/common:stats_util", - ], -) - -hadoop_binary( - name = "known_for_optout-adhoc", - main = "com.twitter.simclusters_v2.scalding.optout.KnownForOptOutAdhocJob", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":optout", - ], -) - -hadoop_binary( - name = "known_for_optout_daily", - main = "com.twitter.simclusters_v2.scalding.optout.KnownForOptOutDailyBatchJob", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":optout", - ], -) - -hadoop_binary( - name = "interested_in_optout-adhoc", - main = "com.twitter.simclusters_v2.scalding.optout.InterestedInOptOutAdhocJob", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - "known-to-fail-jira:SD-14439", - ], - dependencies = [ - ":optout", - ], -) - -hadoop_binary( - name = "interested_in_optout_daily", - main = "com.twitter.simclusters_v2.scalding.optout.InterestedInOptOutDailyBatchJob", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":optout", - ], -) diff --git a/src/scala/com/twitter/simclusters_v2/scalding/optout/BUILD.docx b/src/scala/com/twitter/simclusters_v2/scalding/optout/BUILD.docx new file mode 100644 index 000000000..d2cc91926 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/optout/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/optout/InterestedInOptOut.docx b/src/scala/com/twitter/simclusters_v2/scalding/optout/InterestedInOptOut.docx new file mode 100644 index 000000000..34726bbcb Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/optout/InterestedInOptOut.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/optout/InterestedInOptOut.scala b/src/scala/com/twitter/simclusters_v2/scalding/optout/InterestedInOptOut.scala deleted file mode 100644 index 3e24c7b0c..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/optout/InterestedInOptOut.scala +++ /dev/null @@ -1,269 +0,0 @@ -package 
com.twitter.simclusters_v2.scalding.optout - -import com.twitter.dal.client.dataset.{KeyValDALDataset, SnapshotDALDataset} -import com.twitter.scalding.{ - Args, - DateRange, - Days, - Duration, - Execution, - RichDate, - TypedPipe, - TypedTsv, - UniqueID -} -import com.twitter.scalding_internal.dalv2.DALWrite.D -import com.twitter.scalding_internal.dalv2.DALWrite._ -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.common.{ClusterId, ModelVersions, SemanticCoreEntityId, UserId} -import com.twitter.simclusters_v2.hdfs_sources._ -import com.twitter.simclusters_v2.scalding.inferred_entities.InferredEntities -import com.twitter.simclusters_v2.thriftscala.{ - ClusterType, - ClustersUserIsInterestedIn, - SemanticCoreEntityWithScore, - UserToInterestedInClusters -} -import com.twitter.wtf.scalding.jobs.common.{AdhocExecutionApp, ScheduledExecutionApp} -import com.twitter.simclusters_v2.scalding.common.TypedRichPipe._ -import com.twitter.simclusters_v2.scalding.common.Util -import java.util.TimeZone - -object InterestedInOptOut { - - def filterOptedOutInterestedIn( - interestedInPipe: TypedPipe[(UserId, ClustersUserIsInterestedIn)], - optedOutEntities: TypedPipe[(UserId, Set[SemanticCoreEntityId])], - clusterToEntities: TypedPipe[(ClusterId, Seq[SemanticCoreEntityWithScore])] - ): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = { - - val validInterestedIn = SimClustersOptOutUtil.filterOptedOutClusters( - userToClusters = interestedInPipe.mapValues(_.clusterIdToScores.keySet.toSeq), - optedOutEntities = optedOutEntities, - legibleClusters = clusterToEntities - ) - - interestedInPipe - .leftJoin(validInterestedIn) - .mapValues { - case (originalInterestedIn, validInterestedInOpt) => - val validInterestedIn = validInterestedInOpt.getOrElse(Seq()).toSet - - originalInterestedIn.copy( - clusterIdToScores = originalInterestedIn.clusterIdToScores.filterKeys(validInterestedIn) - ) - } - .filter(_._2.clusterIdToScores.nonEmpty) - } - - /** - * Writes InterestedIn data to HDFS - */ - def writeInterestedInOutputExecution( - interestedIn: TypedPipe[(UserId, ClustersUserIsInterestedIn)], - interestedInDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsInterestedIn]], - outputPath: String - ): Execution[Unit] = { - interestedIn - .map { case (k, v) => KeyVal(k, v) } - .writeDALVersionedKeyValExecution( - interestedInDataset, - D.Suffix(outputPath) - ) - } - - /** - * Convert InterestedIn to thrift structs, then write to HDFS - */ - def writeInterestedInThriftOutputExecution( - interestedIn: TypedPipe[(UserId, ClustersUserIsInterestedIn)], - modelVersion: String, - interestedInThriftDatset: SnapshotDALDataset[UserToInterestedInClusters], - thriftOutputPath: String, - dateRange: DateRange - ): Execution[Unit] = { - interestedIn - .map { - case (userId, clusters) => - UserToInterestedInClusters(userId, modelVersion, clusters.clusterIdToScores) - } - .writeDALSnapshotExecution( - interestedInThriftDatset, - D.Daily, - D.Suffix(thriftOutputPath), - D.EBLzo(), - dateRange.end - ) - } -} - -/** -capesospy-v2 update --build_locally --start_cron \ - --start_cron interested_in_optout_daily \ - src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml - */ -object InterestedInOptOutDailyBatchJob extends ScheduledExecutionApp { - - override def firstTime: RichDate = RichDate("2019-11-24") - - override def batchIncrement: Duration = Days(1) - - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: 
UniqueID - ): Execution[Unit] = { - - val userOptoutEntities = - SimClustersOptOutUtil - .getP13nOptOutSources(dateRange.embiggen(Days(4)), ClusterType.InterestedIn) - .count("num_users_with_optouts") - .forceToDisk - - val interestedIn2020Pipe = InterestedInSources - .simClustersRawInterestedIn2020Source(dateRange, timeZone) - .count("num_users_with_2020_interestedin") - - val interestedInLite2020Pipe = InterestedInSources - .simClustersRawInterestedInLite2020Source(dateRange, timeZone) - .count("num_users_with_2020_interestedin_lite") - - val clusterToEntities = InferredEntities - .getLegibleEntityEmbeddings(dateRange.prepend(Days(21)), timeZone) - .count("num_cluster_to_entities") - - val filtered2020InterestedIn = InterestedInOptOut - .filterOptedOutInterestedIn(interestedIn2020Pipe, userOptoutEntities, clusterToEntities) - .count("num_users_with_compliant_2020_interestedin") - - val write2020Exec = InterestedInOptOut.writeInterestedInOutputExecution( - filtered2020InterestedIn, - SimclustersV2InterestedIn20M145K2020ScalaDataset, - DataPaths.InterestedIn2020Path - ) - - val write2020ThriftExec = InterestedInOptOut.writeInterestedInThriftOutputExecution( - filtered2020InterestedIn, - ModelVersions.Model20M145K2020, - SimclustersV2UserToInterestedIn20M145K2020ScalaDataset, - DataPaths.InterestedIn2020ThriftPath, - dateRange - ) - - val sanityCheck2020Exec = SimClustersOptOutUtil.sanityCheckAndSendEmail( - oldNumClustersPerUser = interestedIn2020Pipe.map(_._2.clusterIdToScores.size), - newNumClustersPerUser = filtered2020InterestedIn.map(_._2.clusterIdToScores.size), - modelVersion = ModelVersions.Model20M145K2020, - alertEmail = SimClustersOptOutUtil.AlertEmail - ) - - val filtered2020InterestedInLite = InterestedInOptOut - .filterOptedOutInterestedIn(interestedInLite2020Pipe, userOptoutEntities, clusterToEntities) - .count("num_users_with_compliant_2020_interestedin_lite") - - val write2020LiteExec = InterestedInOptOut.writeInterestedInOutputExecution( - filtered2020InterestedInLite, - SimclustersV2InterestedInLite20M145K2020ScalaDataset, - DataPaths.InterestedInLite2020Path - ) - - val write2020LiteThriftExec = InterestedInOptOut.writeInterestedInThriftOutputExecution( - filtered2020InterestedInLite, - ModelVersions.Model20M145K2020, - SimclustersV2UserToInterestedInLite20M145K2020ScalaDataset, - DataPaths.InterestedInLite2020ThriftPath, - dateRange - ) - - val sanityCheck2020LiteExec = SimClustersOptOutUtil.sanityCheckAndSendEmail( - oldNumClustersPerUser = interestedInLite2020Pipe.map(_._2.clusterIdToScores.size), - newNumClustersPerUser = filtered2020InterestedInLite.map(_._2.clusterIdToScores.size), - modelVersion = ModelVersions.Model20M145K2020, - alertEmail = SimClustersOptOutUtil.AlertEmail - ) - - Util.printCounters( - Execution.zip( - Execution.zip( - write2020Exec, - write2020ThriftExec, - sanityCheck2020Exec), - Execution.zip( - write2020LiteExec, - write2020LiteThriftExec, - sanityCheck2020LiteExec - ) - ) - ) - } -} - -/** - * For debugging only. 
Does a filtering run and prints the differences before/after the opt out - - scalding remote run --target src/scala/com/twitter/simclusters_v2/scalding/optout:interested_in_optout-adhoc \ - --user cassowary --cluster bluebird-qus1 \ - --main-class com.twitter.simclusters_v2.scalding.optout.InterestedInOptOutAdhocJob -- \ - --keytab /var/lib/tss/keys/fluffy/keytabs/client/cassowary.keytab \ - --principal service_account@TWITTER.BIZ \ - -- \ - --outputDir /user/cassowary/adhoc/interestedin_optout \ - --date 2020-09-03 - */ -object InterestedInOptOutAdhocJob extends AdhocExecutionApp { - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - val outputDir = args("outputDir") - - val interestedInPipe = InterestedInSources - .simClustersInterestedInUpdatedSource(dateRange, timeZone) - .count("num_users_with_interestedin") - - val userOptoutEntities: TypedPipe[(UserId, Set[SemanticCoreEntityId])] = - SimClustersOptOutUtil - .getP13nOptOutSources(dateRange.embiggen(Days(4)), ClusterType.InterestedIn) - .count("num_users_with_optouts") - - val clusterToEntities = InferredEntities - .getLegibleEntityEmbeddings(dateRange, timeZone) - .count("num_cluster_to_entities") - - val filteredInterestedInPipe = InterestedInOptOut - .filterOptedOutInterestedIn( - interestedInPipe, - userOptoutEntities, - clusterToEntities - ) - .count("num_users_with_interestedin_after_optout") - - val output = interestedInPipe - .join(filteredInterestedInPipe) - .filter { - case (userId, (originalInterestedIn, filtered)) => - originalInterestedIn.clusterIdToScores != filtered.clusterIdToScores - } - .join(userOptoutEntities) - .map { - case (userId, ((originalInterestedIn, filtered), optoutEntities)) => - Seq( - "userId=" + userId, - "originalInterestedInVersion=" + originalInterestedIn.knownForModelVersion, - "originalInterestedIn=" + originalInterestedIn.clusterIdToScores.keySet, - "filteredInterestedInVersion=" + filtered.knownForModelVersion, - "filteredInterestedIn=" + filtered.clusterIdToScores.keySet, - "optoutEntities=" + optoutEntities - ).mkString("\t") - } - - Util.printCounters( - output.writeExecution(TypedTsv(outputDir)) - ) - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/optout/KnownForOptOut.docx b/src/scala/com/twitter/simclusters_v2/scalding/optout/KnownForOptOut.docx new file mode 100644 index 000000000..a4a644d23 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/optout/KnownForOptOut.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/optout/KnownForOptOut.scala b/src/scala/com/twitter/simclusters_v2/scalding/optout/KnownForOptOut.scala deleted file mode 100644 index 621e7f994..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/optout/KnownForOptOut.scala +++ /dev/null @@ -1,198 +0,0 @@ -package com.twitter.simclusters_v2.scalding.optout - -import com.twitter.scalding.Args -import com.twitter.scalding.DateRange -import com.twitter.scalding.Days -import com.twitter.scalding.Duration -import com.twitter.scalding.Execution -import com.twitter.scalding.RichDate -import com.twitter.scalding.TypedPipe -import com.twitter.scalding.TypedTsv -import com.twitter.scalding.UniqueID -import com.twitter.simclusters_v2.common.ClusterId -import com.twitter.simclusters_v2.common.SemanticCoreEntityId -import com.twitter.simclusters_v2.common.UserId -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.scalding_internal.dalv2.DALWrite._ -import
com.twitter.scalding_internal.dalv2.remote_access.AllowCrossClusterSameDC -import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation -import com.twitter.scalding_internal.dalv2.remote_access.ProcAtla -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.hdfs_sources._ -import com.twitter.simclusters_v2.thriftscala.ClusterType -import com.twitter.simclusters_v2.thriftscala.ClustersUserIsKnownFor -import com.twitter.simclusters_v2.thriftscala.SemanticCoreEntityWithScore -import com.twitter.simclusters_v2.thriftscala.UserToKnownForClusters -import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp -import com.twitter.wtf.scalding.jobs.common.ScheduledExecutionApp -import java.util.TimeZone -import com.twitter.simclusters_v2.scalding.common.TypedRichPipe._ -import com.twitter.simclusters_v2.scalding.common.Util -import com.twitter.simclusters_v2.scalding.inferred_entities.InferredEntities - -/** - * Creates opt-out compliant KnownFor datasets based on plain user -> KnownFor data and users' - * opt-out selections from YourTwitterData. In essence, we remove any cluster whose inferred - * entities were opted out by the user. - * The opted out KnownFor dataset should be the default dataset to be consumed, instead of the - * plain KnownFor, which is not opt-out compliant. - */ -object KnownForOptOut { - - def filterOptedOutKnownFor( - knownForPipe: TypedPipe[(UserId, ClustersUserIsKnownFor)], - optedOutEntities: TypedPipe[(UserId, Set[SemanticCoreEntityId])], - clusterToEntities: TypedPipe[(ClusterId, Seq[SemanticCoreEntityWithScore])] - ): TypedPipe[(UserId, ClustersUserIsKnownFor)] = { - - val validKnownFor = SimClustersOptOutUtil.filterOptedOutClusters( - userToClusters = knownForPipe.mapValues(_.clusterIdToScores.keySet.toSeq), - optedOutEntities = optedOutEntities, - legibleClusters = clusterToEntities - ) - - knownForPipe - .leftJoin(validKnownFor) - .mapValues { - case (originalKnownFors, validKnownForOpt) => - val validKnownFor = validKnownForOpt.getOrElse(Seq()).toSet - - originalKnownFors.copy( - clusterIdToScores = originalKnownFors.clusterIdToScores.filterKeys(validKnownFor) - ) - } - .filter(_._2.clusterIdToScores.nonEmpty) - } -} - -/** -capesospy-v2 update --build_locally --start_cron \ - --start_cron known_for_optout_daily \ - src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml - */ -object KnownForOptOutDailyBatchJob extends ScheduledExecutionApp { - override def firstTime: RichDate = RichDate("2021-03-29") - - override def batchIncrement: Duration = Days(1) - - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - - val optedOutEntitiesPipe = SimClustersOptOutUtil - .getP13nOptOutSources(dateRange.embiggen(Days(2)), ClusterType.KnownFor) - .forceToDisk - - val clusterToEntitiesPipe = InferredEntities.getLegibleEntityEmbeddings(dateRange, timeZone) - - val knownFor2020 = DAL - .readMostRecentSnapshot( - SimclustersV2RawKnownFor20M145K2020ScalaDataset, - dateRange.embiggen(Days(10))) - .withRemoteReadPolicy(AllowCrossClusterSameDC) - .toTypedPipe - .map { case KeyVal(k, v) => (k, v) } - .count("num_users_with_2020_knownfor") - - val filtered2020KnownForExec = { - val filtered2020KnownForData = KnownForOptOut - .filterOptedOutKnownFor( - knownForPipe = knownFor2020, - optedOutEntities = optedOutEntitiesPipe, - clusterToEntities = clusterToEntitiesPipe - ) - 
.count("num_users_with_compliant_2020_knownfor") - .forceToDisk - - Execution - .zip( - filtered2020KnownForData - .map { case (k, v) => KeyVal(k, v) } - .writeDALVersionedKeyValExecution( - SimclustersV2KnownFor20M145K2020ScalaDataset, - D.Suffix(DataPaths.KnownFor2020Path) - ), - filtered2020KnownForData - .map { - case (userId, ClustersUserIsKnownFor(modelVersion, clusters)) => - UserToKnownForClusters(userId, modelVersion, clusters) - } - .writeDALSnapshotExecution( - dataset = SimclustersV2KnownFor20M145K2020ThriftScalaDataset, - updateStep = D.Daily, - pathLayout = D.Suffix(DataPaths.KnownFor2020ThriftDatasetPath), - fmt = D.Parquet, - endDate = dateRange.end - ) - ).unit - } - - Util.printCounters(filtered2020KnownForExec) - - } -} - -/** - * For debugging only. Does a filtering run and prints the differences before/after the opt out -./bazel bundle src/scala/com/twitter/simclusters_v2/scalding/optout:known_for_optout-adhoc && \ - oscar hdfs --user recos-platform --screen --tee your_ldap \ - --bundle known_for_optout-adhoc \ - --tool com.twitter.simclusters_v2.scalding.optout.KnownForOptOutAdhocJob \ - -- --date 2019-10-12 - */ -object KnownForOptOutAdhocJob extends AdhocExecutionApp { - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - val knownForPipe = DAL - .readMostRecentSnapshotNoOlderThan(SimclustersV2RawKnownFor20M145KDec11ScalaDataset, Days(30)) - .withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .toTypedPipe - .map { case KeyVal(k, v) => (k, v) } - .count("num_users_with_knownfor") - - val userOptoutEntities: TypedPipe[(UserId, Set[SemanticCoreEntityId])] = - SimClustersOptOutUtil - .getP13nOptOutSources(dateRange.embiggen(Days(4)), ClusterType.KnownFor) - .count("num_users_with_optouts") - - val clusterToEntities = InferredEntities - .getLegibleEntityEmbeddings(dateRange, timeZone) - .count("num_cluster_to_entities") - - val filteredKnownForPipe = KnownForOptOut.filterOptedOutKnownFor( - knownForPipe, - userOptoutEntities, - clusterToEntities - ) - - val output = knownForPipe - .join(filteredKnownForPipe) - .collect { - case (userId, (originalKnownFor, filtered)) - if originalKnownFor.clusterIdToScores != filtered.clusterIdToScores => - (userId, (originalKnownFor, filtered)) - } - .join(userOptoutEntities) - .map { - case (userId, ((originalKnownFor, filtered), optoutEntities)) => - Seq( - "userId=" + userId, - "originalKnownFor=" + originalKnownFor, - "filteredKnownFor=" + filtered, - "optoutEntities=" + optoutEntities - ).mkString("\t") - } - - val outputPath = "/user/recos-platform/adhoc/knownfor_optout" - output.writeExecution(TypedTsv(outputPath)) - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/optout/SimClustersOptOutUtil.docx b/src/scala/com/twitter/simclusters_v2/scalding/optout/SimClustersOptOutUtil.docx new file mode 100644 index 000000000..17f930aeb Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/optout/SimClustersOptOutUtil.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/optout/SimClustersOptOutUtil.scala b/src/scala/com/twitter/simclusters_v2/scalding/optout/SimClustersOptOutUtil.scala deleted file mode 100644 index 3b9ad7779..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/optout/SimClustersOptOutUtil.scala +++ /dev/null @@ -1,166 +0,0 @@ -package com.twitter.simclusters_v2.scalding.optout - -import com.twitter.algebird.Aggregator.size -import
com.twitter.algebird.QTreeAggregatorLowerBound -import com.twitter.octain.identifiers.thriftscala.RawId -import com.twitter.octain.p13n.batch.P13NPreferencesScalaDataset -import com.twitter.octain.p13n.preferences.CompositeInterest -import com.twitter.scalding.DateRange -import com.twitter.scalding.Execution -import com.twitter.scalding.TypedPipe -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossClusterSameDC -import com.twitter.simclusters_v2.common.ClusterId -import com.twitter.simclusters_v2.common.SemanticCoreEntityId -import com.twitter.simclusters_v2.common.UserId -import com.twitter.simclusters_v2.scalding.common.Util -import com.twitter.simclusters_v2.thriftscala.ClusterType -import com.twitter.simclusters_v2.thriftscala.SemanticCoreEntityWithScore -import com.twitter.wtf.interest.thriftscala.Interest - -/** - * Opts out InterestedIn and KnownFor clusters based on the clusters' entity embeddings. If a user opted out an - * entity and the user is also interested in a cluster with that entity embedding, unlink the - * user from that cluster. - */ -object SimClustersOptOutUtil { - - /** - * Reads User's Your Twitter Data opt-out selections - */ - def getP13nOptOutSources( - dateRange: DateRange, - clusterType: ClusterType - ): TypedPipe[(UserId, Set[SemanticCoreEntityId])] = { - DAL - .readMostRecentSnapshot( - P13NPreferencesScalaDataset, - dateRange - ) - .withRemoteReadPolicy(AllowCrossClusterSameDC) - .toTypedPipe - .map { record => (record.id, record.preferences) } - .flatMap { - case (RawId.UserId(userId), p13nPreferences) => - val optedOutEntities = p13nPreferences.interestPreferences - .map { preference => - preference.disabledInterests - .collect { - case CompositeInterest.RecommendationInterest(recInterest) - if clusterType == ClusterType.InterestedIn => - recInterest.interest match { - case Interest.SemanticEntityInterest(semanticCoreInterest) => - Some(semanticCoreInterest.entityId) - case _ => - None - } - - case CompositeInterest.RecommendationKnownFor(recInterest) - if clusterType == ClusterType.KnownFor => - recInterest.interest match { - case Interest.SemanticEntityInterest(semanticCoreInterest) => - Some(semanticCoreInterest.entityId) - case _ => - None - } - }.flatten.toSet - }.getOrElse(Set.empty) - if (optedOutEntities.nonEmpty) { - Some((userId, optedOutEntities)) - } else { - None - } - case _ => - None - } - } - - /** - * Removes a user's clusters whose inferred entity embeddings are opted out. Users whose clusters - * are all filtered out are dropped from the pipe by the final nonEmpty filter.
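 *
 * As a concrete sketch (hypothetical IDs, not from the original doc): if a user has clusters
 * {c1, c2}, cluster c1's legible entity embedding contains entity e7, and the user has opted
 * out of e7, the result keeps only c2 for that user; if both clusters contained opted-out
 * entities, the user would be dropped entirely.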
- */ - def filterOptedOutClusters( - userToClusters: TypedPipe[(UserId, Seq[ClusterId])], - optedOutEntities: TypedPipe[(UserId, Set[SemanticCoreEntityId])], - legibleClusters: TypedPipe[(ClusterId, Seq[SemanticCoreEntityWithScore])] - ): TypedPipe[(UserId, Seq[ClusterId])] = { - - val inMemoryValidClusterToEntities = - legibleClusters - .mapValues(_.map(_.entityId).toSet) - .map(Map(_)).sum - - userToClusters - .leftJoin(optedOutEntities) - .mapWithValue(inMemoryValidClusterToEntities) { - case ((userId, (userClusters, optedOutEntitiesOpt)), validClusterToEntitiesOpt) => - val optedOutEntitiesSet = optedOutEntitiesOpt.getOrElse(Set.empty) - val validClusterToEntities = validClusterToEntitiesOpt.getOrElse(Map.empty) - - val clustersAfterOptOut = userClusters.filter { clusterId => - val isClusterOptedOut = validClusterToEntities - .getOrElse(clusterId, Set.empty) - .intersect(optedOutEntitiesSet) - .nonEmpty - !isClusterOptedOut - }.distinct - - (userId, clustersAfterOptOut) - } - .filter { _._2.nonEmpty } - } - - val AlertEmail = "no-reply@twitter.com" - - /** - * Does sanity check on the results, to make sure the opt out outputs are comparable to the - * raw version. If the delta in the number of users >= 0.1% or median of number of clusters per - * user >= 1%, send alert emails - */ - def sanityCheckAndSendEmail( - oldNumClustersPerUser: TypedPipe[Int], - newNumClustersPerUser: TypedPipe[Int], - modelVersion: String, - alertEmail: String - ): Execution[Unit] = { - val oldNumUsersExec = oldNumClustersPerUser.aggregate(size).toOptionExecution - val newNumUsersExec = newNumClustersPerUser.aggregate(size).toOptionExecution - - val oldMedianExec = oldNumClustersPerUser - .aggregate(QTreeAggregatorLowerBound(0.5)) - .toOptionExecution - - val newMedianExec = newNumClustersPerUser - .aggregate(QTreeAggregatorLowerBound(0.5)) - .toOptionExecution - - Execution - .zip(oldNumUsersExec, newNumUsersExec, oldMedianExec, newMedianExec) - .map { - case (Some(oldNumUsers), Some(newNumUsers), Some(oldMedian), Some(newMedian)) => - val deltaNum = (newNumUsers - oldNumUsers).toDouble / oldNumUsers.toDouble - val deltaMedian = (oldMedian - newMedian) / oldMedian - val message = - s"num users before optout=$oldNumUsers,\n" + - s"num users after optout=$newNumUsers,\n" + - s"median num clusters per user before optout=$oldMedian,\n" + - s"median num clusters per user after optout=$newMedian\n" - - println(message) - if (Math.abs(deltaNum) >= 0.001 || Math.abs(deltaMedian) >= 0.01) { - Util.sendEmail( - message, - s"Anomaly in $modelVersion opt out job. Please check cluster optout jobs in Eagleeye", - alertEmail - ) - } - case err => - Util.sendEmail( - err.toString(), - s"Anomaly in $modelVersion opt out job. 
Please check cluster optout jobs in Eagleeye", - alertEmail - ) - } - } - -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/BUILD b/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/BUILD deleted file mode 100644 index d048c58ca..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/BUILD +++ /dev/null @@ -1,168 +0,0 @@ -scala_library( - sources = ["*.scala"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "escherbird/src/scala/com/twitter/escherbird/scalding/jobs/exportentities:entities-scala", - "escherbird/src/scala/com/twitter/escherbird/scalding/source/utt:utt_source-scala", - "interests-ds/src/main/scala/com/twitter/interests_ds/jobs/interests_service", - "interests-ds/src/main/scala/com/twitter/interests_ds/jobs/interests_service:user_topic_relation_snapshot-scala", - "src/scala/com/twitter/scalding_internal/dalv2", - "src/scala/com/twitter/scalding_internal/job", - "src/scala/com/twitter/scalding_internal/job/analytics_batch", - "src/scala/com/twitter/scalding_internal/source", - "src/scala/com/twitter/scalding_internal/source/lzo_scrooge", - "src/scala/com/twitter/simclusters_v2/candidate_source", - "src/scala/com/twitter/simclusters_v2/hdfs_sources", - "src/scala/com/twitter/simclusters_v2/scalding", - "src/scala/com/twitter/wtf/scalding/jobs/common:em_util", - "timelines/data_processing/jobs/metrics/per_topic_metrics:per_topic_aggregate_engagement-scala", - ], -) - -hadoop_binary( - name = "geopopular_top_tweets_impressed_topics", - main = "com.twitter.simclusters_v2.scalding.topic_recommendations.GeoPopularTopicsBatchApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":topic_recommendations", - ], -) - -hadoop_binary( - name = "geopopular_top_tweets_impressed_topics_adhoc", - main = "com.twitter.simclusters_v2.scalding.topic_recommendations.GeoPopularTopicsAdhocApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":topic_recommendations", - ], -) - -hadoop_binary( - name = "similar_topics_from_topic_follow_graph", - main = "com.twitter.simclusters_v2.scalding.topic_recommendations.SimilarTopicsFromTopicFollowGraphScheduledApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":topic_recommendations", - ], -) - -hadoop_binary( - name = "similar_topics_from_topic_follow_graph-adhoc", - main = "com.twitter.simclusters_v2.scalding.topic_recommendations.SimilarTopicsFromTopicFollowGraphAdhocApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":topic_recommendations", - ], -) - -hadoop_binary( - name = "top_topics_for_producers_from_em", - main = "com.twitter.simclusters_v2.scalding.topic_recommendations.TopicsForProducersFromEMBatchApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":topic_recommendations", - ], -) - -hadoop_binary( - name = "top_topics_for_producers_from_em-adhoc", - main = "com.twitter.simclusters_v2.scalding.topic_recommendations.TopicsForProducersFromEMAdhocApp", - platform = "java8", - runtime_platform = 
"java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":topic_recommendations", - ], -) - -hadoop_binary( - name = "top_producers_for_topics_from_topic_follow_graph", - main = "com.twitter.simclusters_v2.scalding.topic_recommendations.ProducersForTopicsFromTopicFollowGraphBatchApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":topic_recommendations", - ], -) - -hadoop_binary( - name = "top_producers_for_topics_from_topic_follow_graph-adhoc", - main = "com.twitter.simclusters_v2.scalding.topic_recommendations.ProducersForTopicsFromTopicFollowGraphAdhocApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":topic_recommendations", - ], -) - -# Generated with `capesospy-v2 create_target popular_topics_per_country src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml`, config hash beffad. -scalding_job( - name = "popular_topics_per_country", - main = "com.twitter.simclusters_v2.scalding.topic_recommendations.GeoPopularTopicsBatchApp", - args = ["--maxTopics 100"], - config = [ - ("hadoop.combine-input", "true"), - ("hadoop.map.jvm.total-memory", "3072m"), - ("hadoop.queue", "cassowary.default"), - ("hadoop.reduce.jvm.total-memory", "3072m"), - ("hadoop.submitter.jvm.total-memory", "5120m"), - ("submitter.tier", "preemptible"), - ], - cron = "16 * * * *", - hadoop_cluster = "atla-proc3", - platform = "java8", - role = "cassowary", - runtime_platform = "java8", - tags = [ - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":topic_recommendations", - ], -) diff --git a/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/BUILD.docx b/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/BUILD.docx new file mode 100644 index 000000000..298e3ef0b Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/GeoPopularTopicsApp.docx b/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/GeoPopularTopicsApp.docx new file mode 100644 index 000000000..6357508a6 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/GeoPopularTopicsApp.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/GeoPopularTopicsApp.scala b/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/GeoPopularTopicsApp.scala deleted file mode 100644 index df4a0707c..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/GeoPopularTopicsApp.scala +++ /dev/null @@ -1,165 +0,0 @@ -package com.twitter.simclusters_v2.scalding.topic_recommendations - -import com.twitter.bijection.Bufferable -import com.twitter.bijection.Injection -import com.twitter.recos.entities.thriftscala.SemanticCoreEntity -import com.twitter.recos.entities.thriftscala.SemanticCoreEntityScoreList -import com.twitter.recos.entities.thriftscala.SemanticEntityScore -import com.twitter.scalding.commons.source.VersionedKeyValSource -import com.twitter.scalding.Execution -import com.twitter.scalding._ -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.scalding_internal.dalv2.DALWrite._ -import 
com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation -import com.twitter.scalding_internal.dalv2.remote_access.Proc2Atla -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.common.SemanticCoreEntityId -import com.twitter.simclusters_v2.hdfs_sources.GeopopularTopTweetImpressedTopicsScalaDataset -import com.twitter.timelines.per_topic_metrics.thriftscala.PerTopicAggregateEngagementMetric -import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp -import com.twitter.wtf.scalding.jobs.common.ScheduledExecutionApp -import java.util.TimeZone -import timelines.data_processing.jobs.metrics.per_topic_metrics.PerTopicAggregateEngagementScalaDataset - -/** - scalding remote run \ - --target src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations:geopopular_top_tweets_impressed_topics_adhoc \ - --main-class com.twitter.simclusters_v2.scalding.topic_recommendations.GeoPopularTopicsAdhocApp \ - --submitter hadoopnest1.atla.twitter.com --user recos-platform \ - -- \ - --date 2020-03-28 --output_dir /user/recos-platform/adhoc/your_ldap/topics_country_counts - */ -object GeoPopularTopicsAdhocApp extends AdhocExecutionApp { - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - val maxTopicsPerCountry = args.int("maxTopics", 2000) - val typedTsv = args.boolean("tsv") - implicit val inj: Injection[List[(SemanticCoreEntityId, Double)], Array[Byte]] = - Bufferable.injectionOf[List[(SemanticCoreEntityId, Double)]] - - val perTopicEngagementLogData = DAL - .read(PerTopicAggregateEngagementScalaDataset, dateRange.prepend(Days(7))) - .toTypedPipe - val topicsWithEngagement = - GeoPopularTopicsApp - .getPopularTopicsFromLogs(perTopicEngagementLogData, maxTopicsPerCountry) - .mapValues(_.toList) - - if (typedTsv) { - topicsWithEngagement.writeExecution( - TypedTsv("/user/recos-platform/adhoc/your_ldap/topics_country_counts_tsv") - ) - } else { - topicsWithEngagement.writeExecution( - VersionedKeyValSource[String, List[(SemanticCoreEntityId, Double)]](args("output_dir")) - ) - } - } -} - -/** - capesospy-v2 update --build_locally \ - --start_cron popular_topics_per_country \ - src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml - */ -object GeoPopularTopicsBatchApp extends ScheduledExecutionApp { - override val firstTime: RichDate = RichDate("2020-04-06") - - override val batchIncrement: Duration = Days(1) - - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - val maxTopicsPerCountry = args.int("maxTopics", 2000) - - val geoPopularTopicsPath: String = - "/user/cassowary/manhattan_sequence_files/geo_popular_top_tweet_impressed_topics" - - // Read engagement logs from the past 7 days - val perTopicEngagementLogData = DAL - .read(PerTopicAggregateEngagementScalaDataset, dateRange.prepend(Days(7))) - .withRemoteReadPolicy(ExplicitLocation(Proc2Atla)) - .toTypedPipe - - val topicsWithScores = - GeoPopularTopicsApp.getPopularTopicsFromLogs(perTopicEngagementLogData, maxTopicsPerCountry) - - val topicsWithEntityScores = topicsWithScores - .mapValues(_.map { - case (topicid, topicScore) => - SemanticEntityScore(SemanticCoreEntity(entityId = topicid), topicScore) - }) - .mapValues(SemanticCoreEntityScoreList(_)) - - val writeKeyValResultExec = topicsWithEntityScores - .map { case (country, topics) => KeyVal(country, topics) } -
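// write each country -> ranked-topics list as a versioned key-val dataset -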
.writeDALVersionedKeyValExecution( - GeopopularTopTweetImpressedTopicsScalaDataset, - D.Suffix(geoPopularTopicsPath) - ) - writeKeyValResultExec - } -} - -object GeoPopularTopicsApp { - - def getPopularTopicsFromLogs( - engagementLogs: TypedPipe[PerTopicAggregateEngagementMetric], - maxTopics: Int - )( - implicit uniqueId: UniqueID - ): TypedPipe[(String, Seq[(SemanticCoreEntityId, Double)])] = { - val numTopicEngagementsRead = Stat("num_topic_engagements_read") - val intermediate = engagementLogs - .map { - case PerTopicAggregateEngagementMetric( - topicId, - dateId, - country, - page, - item, - engagementType, - engagementCount, - algorithmType, - annotationType) => - numTopicEngagementsRead.inc() - ( - topicId, - dateId, - country, - page, - item, - engagementType, - engagementCount, - algorithmType, - annotationType) - } - - // We want to find the topics with the most impressed tweets in each country - // This will ensure that the topics suggested as recommendations also have tweets that can be recommended - intermediate - .collect { - case (topicId, _, Some(country), _, item, engagementType, engagementCount, _, _) - if item == "Tweet" && engagementType == "impression" => - ((country, topicId), engagementCount) - } - .sumByKey // returns country-wise engagements for topics - .map { - case ((country, topicId), totalEngagementCountryCount) => - (country, (topicId, totalEngagementCountryCount.toDouble)) - } - .group - .sortedReverseTake(maxTopics)(Ordering.by(_._2)) - .toTypedPipe - } - -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/ProducersForTopicsFromTopicFollowGraph.docx b/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/ProducersForTopicsFromTopicFollowGraph.docx new file mode 100644 index 000000000..217bfe89a Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/ProducersForTopicsFromTopicFollowGraph.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/ProducersForTopicsFromTopicFollowGraph.scala b/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/ProducersForTopicsFromTopicFollowGraph.scala deleted file mode 100644 index b6d6b567b..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/ProducersForTopicsFromTopicFollowGraph.scala +++ /dev/null @@ -1,206 +0,0 @@ -package com.twitter.simclusters_v2.scalding.topic_recommendations - -import com.twitter.bijection.Bufferable -import com.twitter.bijection.Injection -import com.twitter.recos.entities.thriftscala._ -import com.twitter.scalding._ -import com.twitter.scalding_internal.dalv2.DALWrite._ -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.common.Country -import com.twitter.simclusters_v2.common.Language -import com.twitter.simclusters_v2.common.TopicId -import com.twitter.simclusters_v2.common.UserId -import com.twitter.simclusters_v2.hdfs_sources.DataSources -import com.twitter.simclusters_v2.hdfs_sources.TopProducersForLocaleTopicsFromTopicFollowGraphScalaDataset -import com.twitter.simclusters_v2.scalding.common.matrix.SparseMatrix -import com.twitter.simclusters_v2.scalding.embedding.common.EmbeddingUtil.ProducerId -import com.twitter.simclusters_v2.scalding.embedding.common.ExternalDataSources -import com.twitter.simclusters_v2.thriftscala.UserAndNeighbors -import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp -import com.twitter.wtf.scalding.jobs.common.ScheduledExecutionApp 
-import java.util.TimeZone - -/** - * In this file, we compute the top producers for a topic from the Topic Follow Graph - * - * It works as follows: - * - * 1. Producer embedding: List of users who follow the producer's profile and follow at least one topic - * - * 2. Topic embedding: List of users who follow the topic - * - * 3. Score(producer, topic) = cosine similarity of the producer and topic embedding as defined above - * (i.e., the overlap of the two follower lists after each list is l2-normalized) - * - * 4. Please note that we compute the top producers for each topic locale. - */ - -/** -scalding remote run --user cassowary \ - --target src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations:top_producers_for_topics_from_topic_follow_graph-adhoc \ - --main-class com.twitter.simclusters_v2.scalding.topic_recommendations.ProducersForTopicsFromTopicFollowGraphAdhocApp \ - --submitter hadoopnest1.atla.twitter.com \ - -- --date 2021-01-06 --minActiveFollowers 400 --maxProducersPerTopic 50 \ - --output_dir_producers_per_topic /user/cassowary/adhoc/ldap/ttf_profile_pages_topics_to_producers - */ - -object ProducersForTopicsFromTopicFollowGraphAdhocApp extends AdhocExecutionApp { - - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - import ProducersForTopicsFromTopicFollowGraph._ - val outputDirProducersPerTopic = args("output_dir_producers_per_topic") - val minActiveFollowersForProducer = args.int("minActiveFollowers", 400) - val maxProducersPerTopicPerLocale = args.int("maxProducersPerTopic", 50) - val minTopicFollows = args.int("minTopicFollows", 100) - - val topicsFollowedByProducersFollowers = getTopicsFromProducersFollowers( - DataSources - .userUserNormalizedGraphSource(dateRange.prepend(Days(7))), - ExternalDataSources.topicFollowGraphSource, - ExternalDataSources.userSource, - ExternalDataSources.inferredUserConsumedLanguageSource, - minActiveFollowersForProducer, - minTopicFollows - ) - - sortAndGetTopProducersPerLocaleTopic( - topicsFollowedByProducersFollowers, - maxProducersPerTopicPerLocale).writeExecution(TypedTsv(outputDirProducersPerTopic)) - - } -} - -/** -capesospy-v2 update --build_locally \ - --start_cron top_producers_for_topics_from_topic_follow_graph \ - src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml - */ - -object ProducersForTopicsFromTopicFollowGraphBatchApp extends ScheduledExecutionApp { - override val firstTime: RichDate = RichDate("2020-10-01") - - override val batchIncrement: Duration = Days(1) - - private val topProducersForLocaleTopicsPath: String = - "/user/cassowary/manhattan_sequence_files/top_producers_for_topics_from_topic_follow_graph" - - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - import ProducersForTopicsFromTopicFollowGraph._ - val minActiveFollowersForProducer = args.int("minActiveFollowers", 400) - val maxProducersPerTopicPerLocale = args.int("maxProducersPerTopic", 50) - val minTopicFollows = args.int("minTopicFollows", 100) - - val topicsFollowedByProducersFollowers = getTopicsFromProducersFollowers( - DataSources - .userUserNormalizedGraphSource(dateRange.prepend(Days(7))), - ExternalDataSources.topicFollowGraphSource, - ExternalDataSources.userSource, - ExternalDataSources.inferredUserConsumedLanguageSource, - minActiveFollowersForProducer, - minTopicFollows - ) - - sortAndGetTopProducersPerLocaleTopic( - topicsFollowedByProducersFollowers, - maxProducersPerTopicPerLocale) - .map { -
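// key: (topicId, language, country); value: the ranked (producerId, score) list, converted to thrift KeyVal below -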
case ((topicId, languageOpt, countryOpt), producersWithScores) => - KeyVal( - SemanticCoreEntityWithLocale( - entityId = topicId, - context = Locale(language = languageOpt, country = countryOpt)), - UserScoreList(producersWithScores.map { - case (producerId, producerScore) => - UserWithScore(userId = producerId, score = producerScore) - }) - ) - }.writeDALVersionedKeyValExecution( - TopProducersForLocaleTopicsFromTopicFollowGraphScalaDataset, - D.Suffix(topProducersForLocaleTopicsPath), - version = ExplicitEndTime(dateRange.end) - ) - } -} - -object ProducersForTopicsFromTopicFollowGraph { - - implicit val sparseMatrixInj: Injection[ - (ProducerId, Option[Language], Option[Country]), - Array[Byte] - ] = - Bufferable.injectionOf[(ProducerId, Option[Language], Option[Country])] - - // This function takes the producer to topics map and generates the sorted and - // truncated top producers ranked list for each locale topic - def sortAndGetTopProducersPerLocaleTopic( - producerToTopics: TypedPipe[(ProducerId, (TopicId, Option[Language], Option[Country]), Double)], - maxProducersPerLocaleTopic: Int - )( - implicit uniqueID: UniqueID - ): TypedPipe[((TopicId, Option[Language], Option[Country]), List[(ProducerId, Double)])] = { - val numTopicsWithLocales = Stat("num_topics_with_locales") - producerToTopics - .map { - case (producerId, (topicId, languageOpt, countryOpt), score) => - ((topicId, languageOpt, countryOpt), Seq((producerId, score))) - } - .sumByKey.mapValues { producersList => - numTopicsWithLocales.inc() - producersList.sortBy(-_._2).take(maxProducersPerLocaleTopic).toList - }.toTypedPipe - } - - def getTopicsFromProducersFollowers( - userUserGraph: TypedPipe[UserAndNeighbors], - followedTopicsToUsers: TypedPipe[(TopicId, UserId)], - userSource: TypedPipe[(UserId, (Country, Language))], - userLanguages: TypedPipe[(UserId, Seq[(Language, Double)])], - minActiveFollowersForProducer: Int, - minTopicFollows: Int - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): TypedPipe[(ProducerId, (TopicId, Option[Language], Option[Country]), Double)] = { - - val usersFollowingTopics: TypedPipe[UserId] = followedTopicsToUsers.map(_._2).distinct - val producerToUsersSparseMatrix: SparseMatrix[ProducerId, UserId, Double] = - TopicsForProducersUtils - .getProducersToFollowedByUsersSparseMatrix( - userUserGraph, - minActiveFollowersForProducer).filterCols(usersFollowingTopics).rowL2Normalize - - val userToTopicsSparseSkinnyMatrix: SparseMatrix[ - UserId, - (TopicId, Option[Language], Option[Country]), - Double - ] = - TopicsForProducersUtils - .getFollowedTopicsToUserSparseMatrix( - followedTopicsToUsers, - userSource, - userLanguages, - minTopicFollows).rowL2Normalize.transpose - - // Obtain the Producer to Locale Topics Matrix - val producersToLocaleTopicsMatrix: SparseMatrix[ - ProducerId, - (TopicId, Option[Language], Option[Country]), - Double - ] = - producerToUsersSparseMatrix.multiplySparseMatrix(userToTopicsSparseSkinnyMatrix) - - producersToLocaleTopicsMatrix.toTypedPipe - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/SimilarTopicsFromTopicFollowGraphApp.docx b/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/SimilarTopicsFromTopicFollowGraphApp.docx new file mode 100644 index 000000000..43be9b88c Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/SimilarTopicsFromTopicFollowGraphApp.docx differ diff --git 
a/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/SimilarTopicsFromTopicFollowGraphApp.scala b/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/SimilarTopicsFromTopicFollowGraphApp.scala deleted file mode 100644 index 78bd6d658..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/SimilarTopicsFromTopicFollowGraphApp.scala +++ /dev/null @@ -1,222 +0,0 @@ -package com.twitter.simclusters_v2.scalding.topic_recommendations - -import com.twitter.escherbird.scalding.source.FullMetadataSource -import com.twitter.interests_ds.jobs.interests_service.UserTopicRelationSnapshotScalaDataset -import com.twitter.interests.thriftscala.InterestRelationType -import com.twitter.interests.thriftscala.UserInterestsRelationSnapshot -import com.twitter.recos.entities.thriftscala.SemanticCoreEntity -import com.twitter.recos.entities.thriftscala.SemanticCoreEntityScoreList -import com.twitter.recos.entities.thriftscala.SemanticEntityScore -import com.twitter.scalding._ -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.scalding_internal.dalv2.DALWrite._ -import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation -import com.twitter.scalding_internal.dalv2.remote_access.ProcAtla -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.common.SemanticCoreEntityId -import com.twitter.simclusters_v2.common.UserId -import com.twitter.simclusters_v2.hdfs_sources.SimilarTopicsFromTopicFollowGraphScalaDataset -import com.twitter.simclusters_v2.scalding.common.matrix.SparseRowMatrix -import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp -import com.twitter.wtf.scalding.jobs.common.ScheduledExecutionApp -import java.util.TimeZone - -/** - * In this file, we compute the similarities between topics based on how often they are co-followed - * by users. - * - * Similarity(i, j) = #co-follow(i,j) / sqrt(#follow(i) * #follow(j)) - * - * It works as follows: - * - * 1. it first reads the data set of user to topics follow graph, and construct a sparse matrix M with - * N rows and K columns, where N is the number of users, and K is the number of topics. - * In the matrix, M(u,i) = 1 if user u follows topic i; otherwise it is 0. In the sparse matrix, - * we only save non-zero elements. - * - * 2. we do l2-normalization for each column of the matrix M, to get a normalized version M'. - * - * 3. we get topic-topic similarity matrix S = M'.transpose.multiply(M'). The resulting matrix will - * contain the similarities between all topics, i.e., S(i,j) is the similarity we mentioned above. - * - * 4. for each topic, we only keep its K similar topics with largest similarity scores, while not - * including those with scores lower than a threshold. 
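- *
- * A worked toy example of the similarity above (an illustrative sketch, not
- * part of the original file): with three users following topics {A,B}, {A,B}
- * and {A,C}, we have #follow(A)=3, #follow(B)=2 and #co-follow(A,B)=2, so
- * Similarity(A,B) = 2 / sqrt(3 * 2) ≈ 0.816. In plain Scala, with an in-memory
- * user -> followed-topics map standing in for the SparseRowMatrix pipeline:
- * {{{
- * def coFollowSimilarity(follows: Map[Long, Set[String]]): Map[(String, String), Double] = {
- *   val topicCounts: Map[String, Int] =
- *     follows.values.flatten.groupBy(identity).mapValues(_.size).toMap
- *   val coCounts: Map[(String, String), Int] = follows.values.toSeq
- *     .flatMap { ts => for (i <- ts; j <- ts if i != j) yield (i, j) }
- *     .groupBy(identity).mapValues(_.size).toMap
- *   coCounts.map { case ((i, j), c) =>
- *     (i, j) -> c / math.sqrt(topicCounts(i).toDouble * topicCounts(j))
- *   }
- * }
- * }}}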
- * - */ -/** - * capesospy-v2 update --build_locally \ - * --start_cron similar_topics_from_topic_follow_graph \ - * src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml - */ -object SimilarTopicsFromTopicFollowGraphScheduledApp extends ScheduledExecutionApp { - import SimilarTopics._ - - private val outputPath: String = - "/user/cassowary/manhattan_sequence_files/similar_topics_from_topics_follow_graph" - - override def firstTime: RichDate = RichDate("2020-05-07") - - override def batchIncrement: Duration = Days(7) - - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - val numSimilarTopics = args.int("numSimilarTopics", default = 100) - val scoreThreshold = args.double("scoreThreshold", default = 0.01) - - val numOutputTopics = Stat("NumOutputTopics") - - computeSimilarTopics( - getExplicitFollowedTopics, - getFollowableTopics, - numSimilarTopics, - scoreThreshold) - .map { - case (topicId, similarTopics) => - numOutputTopics.inc() - KeyVal( - topicId, - SemanticCoreEntityScoreList(similarTopics.map { - case (similarTopicId, score) => - SemanticEntityScore(SemanticCoreEntity(similarTopicId), score) - })) - } - .writeDALVersionedKeyValExecution( - SimilarTopicsFromTopicFollowGraphScalaDataset, - D.Suffix(outputPath), - version = ExplicitEndTime(dateRange.end) - ) - } - -} - -/** - scalding remote run --user cassowary --reducers 2000 \ - --target src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations:similar_topics_from_topic_follow_graph-adhoc \ - --main-class com.twitter.simclusters_v2.scalding.topic_recommendations.SimilarTopicsFromTopicFollowGraphAdhocApp \ - --submitter hadoopnest1.atla.twitter.com \ - -- --date 2020-04-28 - */ -object SimilarTopicsFromTopicFollowGraphAdhocApp extends AdhocExecutionApp { - import SimilarTopics._ - - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - val numSimilarTopics = args.int("numSimilarTopics", default = 100) - val scoreThreshold = args.double("scoreThreshold", default = 0.01) - - val numOutputTopics = Stat("NumOutputTopics") - - computeSimilarTopics( - getExplicitFollowedTopics, - getFollowableTopics, - numSimilarTopics, - scoreThreshold) - .map { - case (topicId, similarTopics) => - numOutputTopics.inc() - topicId -> similarTopics - .collect { - case (similarTopic, score) if similarTopic != topicId => - s"$similarTopic:$score" - } - .mkString(",") - } - .writeExecution( - TypedTsv("/user/cassowary/adhoc/topic_recos/similar_topics") - ) - } - -} - -object SimilarTopics { - - val UTTDomain: Long = 131L - - val FollowableTag: String = "utt:followable_topic" - - def getFollowableTopics( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): TypedPipe[SemanticCoreEntityId] = { - val NumFollowableTopics = Stat("NumFollowableTopics") - - TypedPipe - .from( - new FullMetadataSource("/atla/proc" + FullMetadataSource.DefaultHdfsPath)()( - dateRange.embiggen(Days(7)))) - .flatMap { - case fullMetadata if fullMetadata.domainId == UTTDomain => - for { - basicMetadata <- fullMetadata.basicMetadata - indexableFields <- basicMetadata.indexableFields - tags <- indexableFields.tags - if tags.contains(FollowableTag) - } yield { - NumFollowableTopics.inc() - fullMetadata.entityId - } - case _ => None - } - .forceToDisk - } - - def getExplicitFollowedTopics( - implicit dateRange: DateRange, - timeZone: TimeZone, - 
uniqueID: UniqueID - ): TypedPipe[(UserId, Map[SemanticCoreEntityId, Double])] = { - - DAL - .readMostRecentSnapshotNoOlderThan(UserTopicRelationSnapshotScalaDataset, Days(7)) - .withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .toTypedPipe - .collect { - case userInterestsRelationSnapshot: UserInterestsRelationSnapshot - if userInterestsRelationSnapshot.interestType == "UTT" && - userInterestsRelationSnapshot.relation == InterestRelationType.Followed => - ( - userInterestsRelationSnapshot.userId, - Map(userInterestsRelationSnapshot.interestId -> 1.0)) - } - .sumByKey - } - - def computeSimilarTopics( - userTopicsFollowGraph: TypedPipe[(UserId, Map[SemanticCoreEntityId, Double])], - followableTopics: TypedPipe[SemanticCoreEntityId], - numSimilarTopics: Int, - scoreThreshold: Double - ): TypedPipe[(SemanticCoreEntityId, Seq[(SemanticCoreEntityId, Double)])] = { - val userTopicFollowGraph = - SparseRowMatrix[UserId, SemanticCoreEntityId, Double]( - userTopicsFollowGraph, - isSkinnyMatrix = true) - .filterCols(followableTopics) // filter out unfollowable topics - .colL2Normalize // normalization - // due to the small number of the topics, - // Scalding only allocates 1-2 mappers for the next step which makes it take unnecessarily long time. - // Changing it to 10 to make it a bit faster - .forceToDisk(numShardsOpt = Some(10)) - - userTopicFollowGraph - .transposeAndMultiplySkinnySparseRowMatrix(userTopicFollowGraph) - .filter { (i, j, v) => - // exclude topic itself from being considered as similar; also the similarity score should - // be larger than a threshold - i != j && v > scoreThreshold - } - .sortWithTakePerRow(numSimilarTopics)(Ordering.by(-_._2)) - } - -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/TopicsForProducersFromEM.docx b/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/TopicsForProducersFromEM.docx new file mode 100644 index 000000000..9b50145d4 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/TopicsForProducersFromEM.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/TopicsForProducersFromEM.scala b/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/TopicsForProducersFromEM.scala deleted file mode 100644 index 44eb83f1e..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/TopicsForProducersFromEM.scala +++ /dev/null @@ -1,261 +0,0 @@ -package com.twitter.simclusters_v2.scalding.topic_recommendations -import com.twitter.bijection.Bufferable -import com.twitter.bijection.Injection -import com.twitter.recos.entities.thriftscala._ -import com.twitter.scalding._ -import com.twitter.scalding_internal.dalv2.DALWrite._ -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.common.Country -import com.twitter.simclusters_v2.common.Language -import com.twitter.simclusters_v2.common.SemanticCoreEntityId -import com.twitter.simclusters_v2.common.TopicId -import com.twitter.simclusters_v2.common.UserId -import com.twitter.simclusters_v2.hdfs_sources.DataSources -import com.twitter.simclusters_v2.hdfs_sources.TopLocaleTopicsForProducerFromEmScalaDataset -import com.twitter.simclusters_v2.scalding.common.matrix.SparseMatrix -import com.twitter.simclusters_v2.scalding.embedding.common.EmbeddingUtil.ProducerId -import com.twitter.simclusters_v2.scalding.embedding.common.ExternalDataSources -import 
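// Illustrative note (not from the original file): colL2Normalize above scales
// every topic column to unit L2 norm, so the later transpose-multiply produces
// exactly the co-follow cosine scores. A plain-Scala sketch on a "skinny"
// matrix held as one Map per user row, assuming non-zero columns:
def colL2Normalize(rows: Seq[Map[String, Double]]): Seq[Map[String, Double]] = {
  val colNorms: Map[String, Double] = rows
    .flatMap(_.toSeq)
    .groupBy(_._1)
    .map { case (col, vs) => col -> math.sqrt(vs.map { case (_, v) => v * v }.sum) }
  rows.map(_.map { case (col, v) => col -> v / colNorms(col) })
}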
com.twitter.simclusters_v2.thriftscala.UserAndNeighbors -import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp -import com.twitter.wtf.scalding.jobs.common.ScheduledExecutionApp -import com.twitter.wtf.scalding.jobs.common.EMRunner -import java.util.TimeZone - -/** - * In this file, we compute the top topics for a producer to be shown on the Topics To Follow Module on Profile Pages - * - * The top topics for a producer are computed using the Expectation-Maximization (EM) approach - * - * It works as follows: - * - * 1. Obtain the background model distribution of number of followers for a topic - * - * 2. Obtain the domain model distribution of the number of producer's followers who follow a topic - * - * 4. Iteratively, use the Expectation-Maximization approach to get the best estimate of the domain model's topic distribution for a producer - * - * 5. for each producer, we only keep its top K topics with highest weights in the domain model's topic distribution after the EM step - * - * 6. Please note that we also store the locale info for each producer along with the topics - */ -/** -scalding remote run --user cassowary --reducers 2000 \ - --target src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations:top_topics_for_producers_from_em-adhoc \ - --main-class com.twitter.simclusters_v2.scalding.topic_recommendations.TopicsForProducersFromEMAdhocApp \ - --submitter hadoopnest1.atla.twitter.com \ - -- --date 2020-07-05 --minActiveFollowers 10000 --minTopicFollowsThreshold 100 --maxTopicsPerProducerPerLocale 50 \ - --output_dir_topics_per_producer /user/cassowary/adhoc/your_ldap/ttf_profile_pages_producers_to_topics - */ -object TopicsForProducersFromEMAdhocApp extends AdhocExecutionApp { - - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - import TopicsForProducersFromEM._ - val outputDirTopicsPerProducer = args("output_dir_topics_per_producer") - val minActiveFollowersForProducer = args.int("minActiveFollowers", 100) - val minTopicFollowsThreshold = args.int("minNumTopicFollows", 100) - val maxTopicsPerProducerPerLocale = args.int("maxTopicsPerProducer", 100) - val lambda = args.double("lambda", 0.95) - - val numEMSteps = args.int("numEM", 100) - - val topicsFollowedByProducersFollowers: TypedPipe[ - (ProducerId, (TopicId, Option[Language], Option[Country]), Double) - ] = getTopLocaleTopicsForProducersFromEM( - DataSources - .userUserNormalizedGraphSource(dateRange.prepend(Days(7))), - ExternalDataSources.topicFollowGraphSource, - ExternalDataSources.userSource, - ExternalDataSources.inferredUserConsumedLanguageSource, - minActiveFollowersForProducer, - minTopicFollowsThreshold, - lambda, - numEMSteps - ) - - val topTopicsPerLocaleProducerTsvExec = sortAndGetTopLocaleTopicsPerProducer( - topicsFollowedByProducersFollowers, - maxTopicsPerProducerPerLocale - ).writeExecution( - TypedTsv(outputDirTopicsPerProducer) - ) - - topTopicsPerLocaleProducerTsvExec - } -} - -/** -capesospy-v2 update --build_locally \ - --start_cron top_topics_for_producers_from_em \ - src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml - */ -object TopicsForProducersFromEMBatchApp extends ScheduledExecutionApp { - override val firstTime: RichDate = RichDate("2020-07-26") - - override val batchIncrement: Duration = Days(7) - - private val topTopicsPerProducerFromEMPath: String = - "/user/cassowary/manhattan_sequence_files/top_topics_for_producers_from_em" - - override def 
runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - import TopicsForProducersFromEM._ - - // threshold of the minimum number of active followers needed for a user to be considered as a producer - val minActiveFollowersForProducer = args.int("minActiveFollowers", 100) - - // threshold of the topic locale follows score needed for a topic to be considered as valid - val minTopicFollowsThreshold = args.int("minNumTopicFollows", 100) - - val maxTopicsPerProducer = args.int("maxTopicsPerProducer", 100) - - // lambda parameter for the EM algorithm - val lambda = args.double("lambda", 0.95) - - // number of EM iterations - val numEMSteps = args.int("numEM", 100) - - // (producer, locale) -> List<(topics, scores)> from Expectation Maximization approach - val topicsFollowedByProducersFollowers = getTopLocaleTopicsForProducersFromEM( - DataSources - .userUserNormalizedGraphSource(dateRange.prepend(Days(7))), - ExternalDataSources.topicFollowGraphSource, - ExternalDataSources.userSource, - ExternalDataSources.inferredUserConsumedLanguageSource, - minActiveFollowersForProducer, - minTopicFollowsThreshold, - lambda, - numEMSteps - ) - - val topLocaleTopicsForProducersFromEMKeyValExec = - sortAndGetTopLocaleTopicsPerProducer( - topicsFollowedByProducersFollowers, - maxTopicsPerProducer - ).map { - case ((producerId, languageOpt, countryOpt), topicsWithScores) => - KeyVal( - UserIdWithLocale( - userId = producerId, - locale = Locale(language = languageOpt, country = countryOpt)), - SemanticCoreEntityScoreList(topicsWithScores.map { - case (topicid, topicScore) => - SemanticEntityScore(SemanticCoreEntity(entityId = topicid), score = topicScore) - }) - ) - }.writeDALVersionedKeyValExecution( - TopLocaleTopicsForProducerFromEmScalaDataset, - D.Suffix(topTopicsPerProducerFromEMPath), - version = ExplicitEndTime(dateRange.end) - ) - topLocaleTopicsForProducersFromEMKeyValExec - } -} - -object TopicsForProducersFromEM { - - private val MinProducerTopicScoreThreshold = 0.0 - - implicit val sparseMatrixInj: Injection[ - (SemanticCoreEntityId, Option[Language], Option[Country]), - Array[Byte] - ] = - Bufferable.injectionOf[(SemanticCoreEntityId, Option[Language], Option[Country])] - - // This function takes the producer to topics map and generates the sorted and - // truncated top locale topics ranked list for each producer - def sortAndGetTopLocaleTopicsPerProducer( - producerToTopics: TypedPipe[(ProducerId, (TopicId, Option[Language], Option[Country]), Double)], - maxTopicsPerProducerPerLocale: Int - )( - implicit uniqueID: UniqueID - ): TypedPipe[((ProducerId, Option[Language], Option[Country]), List[(TopicId, Double)])] = { - val numProducersWithLocales = Stat("num_producers_with_locales") - producerToTopics - .map { - case (producerId, (topicId, languageOpt, countryOpt), score) => - ((producerId, languageOpt, countryOpt), Seq((topicId, score))) - }.sumByKey.mapValues { topicsList: Seq[(TopicId, Double)] => - numProducersWithLocales.inc() - topicsList - .filter(_._2 >= MinProducerTopicScoreThreshold).sortBy(-_._2).take( - maxTopicsPerProducerPerLocale).toList - }.toTypedPipe - } - - def getTopLocaleTopicsForProducersFromEM( - userUserGraph: TypedPipe[UserAndNeighbors], - followedTopicsToUsers: TypedPipe[(TopicId, UserId)], - userSource: TypedPipe[(UserId, (Country, Language))], - userLanguages: TypedPipe[(UserId, Seq[(Language, Double)])], - minActiveFollowersForProducer: Int, - minTopicFollowsThreshold: Int, - lambda: Double, 
- numEMSteps: Int - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): TypedPipe[(ProducerId, (TopicId, Option[Language], Option[Country]), Double)] = { - - // Obtain Producer To Users Matrix - val producersToUsersMatrix: SparseMatrix[ProducerId, UserId, Double] = - TopicsForProducersUtils.getProducersToFollowedByUsersSparseMatrix( - userUserGraph, - minActiveFollowersForProducer) - - // Obtain Users to TopicsWithLocales Matrix - val topicToUsersMatrix: SparseMatrix[ - (TopicId, Option[Language], Option[Country]), - UserId, - Double - ] = TopicsForProducersUtils.getFollowedTopicsToUserSparseMatrix( - followedTopicsToUsers, - userSource, - userLanguages, - minTopicFollowsThreshold) - - // Domain input probability distribution is the Map(topics->followers) per producer locale - val domainInputModel = producersToUsersMatrix - .multiplySparseMatrix(topicToUsersMatrix.transpose).toTypedPipe.map { - case (producerId, (topicId, languageOpt, countryOpt), dotProduct) => - ((producerId, languageOpt, countryOpt), Map(topicId -> dotProduct)) - }.sumByKey.toTypedPipe.map { - case ((producerId, languageOpt, countryOpt), topicsDomainInputMap) => - ((languageOpt, countryOpt), (producerId, topicsDomainInputMap)) - } - - // BackgroundModel is the Map(topics -> Expected value of the number of users who follow the topic) - val backgroundModel = topicToUsersMatrix.rowL1Norms.map { - case ((topicId, languageOpt, countryOpt), numFollowersOfTopic) => - ((languageOpt, countryOpt), Map(topicId -> numFollowersOfTopic)) - }.sumByKey - - val resultsFromEMForEachLocale = domainInputModel.hashJoin(backgroundModel).flatMap { - case ( - (languageOpt, countryOpt), - ((producerId, domainInputTopicFollowersMap), backgroundModelTopicFollowersMap)) => - val emScoredTopicsForEachProducerPerLocale = EMRunner.estimateDomainModel( - domainInputTopicFollowersMap, - backgroundModelTopicFollowersMap, - lambda, - numEMSteps) - - emScoredTopicsForEachProducerPerLocale.map { - case (topicId, topicScore) => - (producerId, (topicId, languageOpt, countryOpt), topicScore) - } - } - resultsFromEMForEachLocale - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/TopicsForProducersUtils.docx b/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/TopicsForProducersUtils.docx new file mode 100644 index 000000000..48f7012e1 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/TopicsForProducersUtils.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/TopicsForProducersUtils.scala b/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/TopicsForProducersUtils.scala deleted file mode 100644 index 94a2404b8..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/TopicsForProducersUtils.scala +++ /dev/null @@ -1,103 +0,0 @@ -package com.twitter.simclusters_v2.scalding.topic_recommendations -import com.twitter.bijection.{Bufferable, Injection} -import com.twitter.scalding._ -import com.twitter.simclusters_v2.common.{Country, Language, SemanticCoreEntityId, TopicId, UserId} -import com.twitter.simclusters_v2.scalding.common.matrix.SparseMatrix -import com.twitter.simclusters_v2.scalding.embedding.common.EmbeddingUtil.ProducerId -import com.twitter.simclusters_v2.thriftscala.UserAndNeighbors - -object TopicsForProducersUtils { - - implicit val sparseMatrixInj: Injection[ - (SemanticCoreEntityId, Option[Language], Option[Country]), - Array[Byte] - 
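// Illustrative sketch (not part of the original file): EMRunner.estimateDomainModel
// is not included in this diff, so below is the standard two-component mixture EM
// (domain model vs. fixed background model), under the assumption that `lambda`
// weights the domain component; the real implementation may differ in convention.
def estimateDomainModelSketch(
  observedCounts: Map[Long, Double],   // topicId -> followers among this producer's followers
  backgroundCounts: Map[Long, Double], // topicId -> expected follower count overall
  lambda: Double,
  numSteps: Int
): Map[Long, Double] = {
  require(observedCounts.nonEmpty && backgroundCounts.values.sum > 0.0)
  val bgTotal = backgroundCounts.values.sum
  val background = backgroundCounts.map { case (t, c) => t -> c / bgTotal }
  // initialize the domain model from the normalized observed counts
  var domain = {
    val total = observedCounts.values.sum
    observedCounts.map { case (t, c) => t -> c / total }
  }
  (1 to numSteps).foreach { _ =>
    // E-step: probability that each observed follow came from the domain model
    val z = observedCounts.keys.map { t =>
      val d = lambda * domain.getOrElse(t, 0.0)
      val b = (1 - lambda) * background.getOrElse(t, 0.0)
      t -> (if (d + b == 0.0) 0.0 else d / (d + b))
    }.toMap
    // M-step: re-estimate the domain model from the soft-assigned counts
    val weighted = observedCounts.map { case (t, c) => t -> c * z(t) }
    val total = weighted.values.sum
    if (total > 0.0) domain = weighted.map { case (t, c) => t -> c / total }
  }
  domain // topicId -> P(topic | producer) under the domain component
}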
] = - Bufferable.injectionOf[(SemanticCoreEntityId, Option[Language], Option[Country])] - - // This function provides the set of 'valid' topics, i.e. topics with at least a certain number of - // follows. This helps remove some noisy topic associations to producers in the dataset. - def getValidTopics( - topicUsers: TypedPipe[((TopicId, Option[Language], Option[Country]), UserId, Double)], - minTopicFollowsThreshold: Int - )( - implicit uniqueID: UniqueID - ): TypedPipe[(TopicId, Option[Language], Option[Country])] = { - val numValidTopics = Stat("num_valid_topics") - SparseMatrix(topicUsers).rowNnz.collect { - case (topicsWithLocaleKey, numFollows) if numFollows >= minTopicFollowsThreshold => - numValidTopics.inc() - topicsWithLocaleKey - } - } - - // Get the users with at least minNumUserFollowers followers - def getValidProducers( - userToFollowersEdges: TypedPipe[(UserId, UserId, Double)], - minNumUserFollowers: Int - )( - implicit uniqueID: UniqueID - ): TypedPipe[ProducerId] = { - val numProducersForTopics = Stat("num_producers_for_topics") - SparseMatrix(userToFollowersEdges).rowL1Norms.collect { - case (userId, l1Norm) if l1Norm >= minNumUserFollowers => - numProducersForTopics.inc() - userId - } - } - - // This function returns the Followed Topics to User Matrix - def getFollowedTopicsToUserSparseMatrix( - followedTopicsToUsers: TypedPipe[(TopicId, UserId)], - userCountryAndLanguage: TypedPipe[(UserId, (Country, Language))], - userLanguages: TypedPipe[(UserId, Seq[(Language, Double)])], - minTopicFollowsThreshold: Int - )( - implicit uniqueID: UniqueID - ): SparseMatrix[(TopicId, Option[Language], Option[Country]), UserId, Double] = { - val localeTopicsWithUsers: TypedPipe[ - ((TopicId, Option[Language], Option[Country]), UserId, Double) - ] = - followedTopicsToUsers - .map { case (topic, user) => (user, topic) } - .join(userCountryAndLanguage) - .join(userLanguages) - .withDescription("joining user locale information") - .flatMap { - case (user, ((topic, (country, _)), scoredLangs)) => - scoredLangs.flatMap { - case (lang, score) => - // To compute the top topics with/without language- and country-level personalization, - // the same dataset has 3 keys for each topicId (unless it gets filtered after): - // (TopicId, Language, Country), (TopicId, Language, None), (TopicId, None, None) - Seq( - ((topic, Some(lang), Some(country)), user, score), // with language and country - ((topic, Some(lang), None), user, score) // with language - ) - } ++ Seq(((topic, None, None), user, 1.0)) // no locale - } - SparseMatrix(localeTopicsWithUsers).filterRowsByMinSum(minTopicFollowsThreshold) - } - - // This function returns the Producers To User Followers Matrix - def getProducersToFollowedByUsersSparseMatrix( - userUserGraph: TypedPipe[UserAndNeighbors], - minActiveFollowers: Int, - )( - implicit uniqueID: UniqueID - ): SparseMatrix[ProducerId, UserId, Double] = { - - val numEdgesFromUsersToFollowers = Stat("num_edges_from_users_to_followers") - - val userToFollowersEdges: TypedPipe[(UserId, UserId, Double)] = - userUserGraph - .flatMap { userAndNeighbors => - userAndNeighbors.neighbors - .collect { - case neighbor if neighbor.isFollowed.getOrElse(false) => - numEdgesFromUsersToFollowers.inc() - (neighbor.neighborId, userAndNeighbors.userId, 1.0) - } - } - SparseMatrix(userToFollowersEdges).filterRowsByMinSum(minActiveFollowers) - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/model_based_topic_recommendations/BUILD 
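// Illustrative sketch (not from the original file): the filterRowsByMinSum
// pattern used throughout TopicsForProducersUtils keeps only rows (producers,
// or topic-locale keys) whose total edge weight clears a minimum, which is what
// turns raw follow edges into "valid" producers and topics. Plain collections
// stand in for SparseMatrix here:
def filterRowsByMinSum[R](
  edges: Seq[(R, Long, Double)], // (rowKey, colKey, weight)
  minSum: Double
): Seq[(R, Long, Double)] = {
  val rowSums: Map[R, Double] =
    edges.groupBy(_._1).map { case (r, es) => r -> es.map(_._3).sum }
  edges.filter { case (r, _, _) => rowSums(r) >= minSum }
}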
b/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/model_based_topic_recommendations/BUILD deleted file mode 100644 index e27970b99..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/model_based_topic_recommendations/BUILD +++ /dev/null @@ -1,70 +0,0 @@ -scala_library( - sources = ["*.scala"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":topic_recommendations_test_datarecords-java", - ":topic_recommendations_train_datarecords-java", - "escherbird/src/scala/com/twitter/escherbird/scalding/jobs/exportentities:entities-scala", - "interests-ds/src/main/scala/com/twitter/interests_ds/jobs/interests_service", - "interests-ds/src/main/scala/com/twitter/interests_ds/jobs/interests_service:user_topic_relation_snapshot-scala", - "src/java/com/twitter/ml/api/constant", - "src/scala/com/twitter/scalding_internal/dalv2", - "src/scala/com/twitter/scalding_internal/dalv2/dataset", - "src/scala/com/twitter/scalding_internal/job", - "src/scala/com/twitter/scalding_internal/job/analytics_batch", - "src/scala/com/twitter/scalding_internal/source", - "src/scala/com/twitter/scalding_internal/source/lzo_scrooge", - "src/scala/com/twitter/simclusters_v2/candidate_source", - "src/scala/com/twitter/simclusters_v2/hdfs_sources", - "src/scala/com/twitter/simclusters_v2/scalding/common", - "timelines/data_processing/jobs/metrics/per_topic_metrics:per_topic_aggregate_engagement-scala", - "twml/runtime/src/main/scala/com/twitter/twml/runtime/scalding", - ], -) - -hadoop_binary( - name = "training_data_for_topic_recommendations-adhoc", - main = "com.twitter.simclusters_v2.scalding.topic_recommendations.model_based_topic_recommendations.UserTopicFeatureHydrationAdhocApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":model_based_topic_recommendations", - ], -) - -hadoop_binary( - name = "training_data_for_topic_recommendations", - main = "com.twitter.simclusters_v2.scalding.topic_recommendations.model_based_topic_recommendations.UserTopicFeatureHydrationScheduledApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":model_based_topic_recommendations", - ], -) - -create_datarecord_datasets( - base_name = "topic_recommendations_train_datarecords", - platform = "java8", - role = "cassowary", - segment_type = "snapshot", - tags = ["bazel-compatible"], -) - -create_datarecord_datasets( - base_name = "topic_recommendations_test_datarecords", - platform = "java8", - role = "cassowary", - segment_type = "snapshot", - tags = ["bazel-compatible"], -) diff --git a/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/model_based_topic_recommendations/BUILD.docx b/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/model_based_topic_recommendations/BUILD.docx new file mode 100644 index 000000000..fe417fa8d Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/model_based_topic_recommendations/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/model_based_topic_recommendations/DataSources.docx b/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/model_based_topic_recommendations/DataSources.docx new file mode 100644 index 000000000..19a428e69 Binary files /dev/null and 
b/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/model_based_topic_recommendations/DataSources.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/model_based_topic_recommendations/DataSources.scala b/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/model_based_topic_recommendations/DataSources.scala deleted file mode 100644 index baa25590f..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/model_based_topic_recommendations/DataSources.scala +++ /dev/null @@ -1,74 +0,0 @@ -package com.twitter.simclusters_v2.scalding.topic_recommendations.model_based_topic_recommendations - -import com.twitter.scalding.{DateRange, Days, Stat, TypedPipe, UniqueID} -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.scalding_internal.dalv2.remote_access.{ExplicitLocation, Proc3Atla} -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.common.{Language, TopicId, UserId} -import com.twitter.simclusters_v2.hdfs_sources.FavTfgTopicEmbeddingsScalaDataset -import com.twitter.simclusters_v2.scalding.embedding.common.ExternalDataSources -import com.twitter.simclusters_v2.summingbird.stores.UserInterestedInReadableStore -import com.twitter.simclusters_v2.thriftscala.{ - EmbeddingType, - InternalId, - LocaleEntityId, - ModelVersion, - SimClustersEmbeddingId -} -import java.util.TimeZone - -/** - * DataSources object to read datasets for the model based topic recommendations - */ -object DataSources { - - private val topicEmbeddingDataset = FavTfgTopicEmbeddingsScalaDataset - private val topicEmbeddingType = EmbeddingType.FavTfgTopic - - /** - * Get user InterestedIn data, filter popular clusters and return fav-scores interestedIn embedding for user - */ - def getUserInterestedInData( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): TypedPipe[(UserId, Map[Int, Double])] = { - val numUserInterestedInInput = Stat("num_user_interested_in") - ExternalDataSources.simClustersInterestInSource - .map { - case KeyVal(userId, clustersUserIsInterestedIn) => - val clustersPostFiltering = clustersUserIsInterestedIn.clusterIdToScores.filter { - case (clusterId, clusterScores) => - // filter out popular clusters (i.e clusters with > 5M users interested in it) from the user embedding - clusterScores.numUsersInterestedInThisClusterUpperBound.exists( - _ < UserInterestedInReadableStore.MaxClusterSizeForUserInterestedInDataset) - } - numUserInterestedInInput.inc() - (userId, clustersPostFiltering.mapValues(_.favScore.getOrElse(0.0)).toMap) - } - } - - def getPerLanguageTopicEmbeddings( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): TypedPipe[((TopicId, Language), Map[Int, Double])] = { - val numTFGPerLanguageEmbeddings = Stat("num_per_language_tfg_embeddings") - DAL - .readMostRecentSnapshotNoOlderThan(topicEmbeddingDataset, Days(30)) - .withRemoteReadPolicy(ExplicitLocation(Proc3Atla)) - .toTypedPipe - .map { - case KeyVal(k, v) => (k, v) - }.collect { - case ( - SimClustersEmbeddingId( - embedType, - ModelVersion.Model20m145kUpdated, - InternalId.LocaleEntityId(LocaleEntityId(entityId, lang))), - embedding) if (embedType == topicEmbeddingType) => - numTFGPerLanguageEmbeddings.inc() - ((entityId, lang), embedding.embedding.map(_.toTuple).toMap) - }.forceToDisk - } -} diff --git 
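// Illustrative sketch (not from the original file): the InterestedIn
// post-filtering above drops clusters whose interested-in audience exceeds a
// size bound (5M in the comment) and keeps the fav score of the rest. The
// ClusterScoresLite fields below are simplified stand-ins for the thrift struct:
case class ClusterScoresLite(favScore: Option[Double], audienceUpperBound: Option[Long])

def filterPopularClusters(
  clusters: Map[Int, ClusterScoresLite],
  maxClusterSize: Long // e.g. the MaxClusterSizeForUserInterestedInDataset bound
): Map[Int, Double] =
  clusters
    .filter { case (_, s) => s.audienceUpperBound.exists(_ < maxClusterSize) }
    .map { case (id, s) => id -> s.favScore.getOrElse(0.0) }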
a/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/model_based_topic_recommendations/UserFeatures.docx b/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/model_based_topic_recommendations/UserFeatures.docx new file mode 100644 index 000000000..7cfd4105e Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/model_based_topic_recommendations/UserFeatures.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/model_based_topic_recommendations/UserFeatures.scala b/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/model_based_topic_recommendations/UserFeatures.scala deleted file mode 100644 index f2af4b62d..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/model_based_topic_recommendations/UserFeatures.scala +++ /dev/null @@ -1,57 +0,0 @@ -package com.twitter.simclusters_v2.scalding.topic_recommendations.model_based_topic_recommendations - -import com.twitter.ml.api.{Feature, FeatureContext} -import com.twitter.ml.api.constant.SharedFeatures - -object UserFeatures { - val UserIdFeature = SharedFeatures.USER_ID // User-id - - val UserSimClusterFeatures = - new Feature.SparseContinuous( - "user.simclusters.interested_in" - ) // User's interestedIn simcluster embedding - - val UserCountryFeature = new Feature.Text("user.country") // user's country code - - val UserLanguageFeature = new Feature.Text("user.language") // user's language - - val FollowedTopicIdFeatures = - new Feature.SparseBinary( - "followed_topics.id" - ) // SparseBinary features for the set of followed topics - - val NotInterestedTopicIdFeatures = - new Feature.SparseBinary( - "not_interested_topics.id" - ) // SparseBinary features for the set of not-interested topics - - val FollowedTopicSimClusterAvgFeatures = - new Feature.SparseContinuous( - "followed_topics.simclusters.avg" - ) // Average SimCluster Embedding of the followed topics - - val NotInterestedTopicSimClusterAvgFeatures = - new Feature.SparseContinuous( - "not_interested_topics.simclusters.avg" - ) // Average SimCluster Embedding of the not-interested topics - - val TargetTopicIdFeatures = new Feature.Discrete("target_topic.id") // target topic-id - - val TargetTopicSimClustersFeature = - new Feature.SparseContinuous( - "target_topic.simclusters" - ) // SimCluster embedding of the target topic - - val FeatureContext = new FeatureContext( - UserIdFeature, - UserSimClusterFeatures, - UserCountryFeature, - UserLanguageFeature, - FollowedTopicIdFeatures, - NotInterestedTopicIdFeatures, - FollowedTopicSimClusterAvgFeatures, - NotInterestedTopicSimClusterAvgFeatures, - TargetTopicIdFeatures, - TargetTopicSimClustersFeature - ) -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/model_based_topic_recommendations/UserTopicDataRecordAdapter.docx b/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/model_based_topic_recommendations/UserTopicDataRecordAdapter.docx new file mode 100644 index 000000000..fa918f461 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/model_based_topic_recommendations/UserTopicDataRecordAdapter.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/model_based_topic_recommendations/UserTopicDataRecordAdapter.scala b/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/model_based_topic_recommendations/UserTopicDataRecordAdapter.scala deleted 
file mode 100644 index 9e9c0378c..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/model_based_topic_recommendations/UserTopicDataRecordAdapter.scala +++ /dev/null @@ -1,64 +0,0 @@ -package com.twitter.simclusters_v2.scalding.topic_recommendations.model_based_topic_recommendations - -import com.twitter.ml.api.util.FDsl._ -import com.twitter.ml.api.{DataRecord, FeatureContext, IRecordOneToOneAdapter} - -case class UserTopicTrainingSample( - userId: Long, - followedTopics: Set[Long], - notInterestedTopics: Set[Long], - userCountry: String, - userLanguage: String, - targetTopicId: Int, - userInterestedInSimClusters: Map[Int, Double], - followedTopicsSimClusters: Map[Int, Double], - notInterestedTopicsSimClusters: Map[Int, Double]) - -class UserTopicDataRecordAdapter extends IRecordOneToOneAdapter[UserTopicTrainingSample] { - import UserFeatures._ - - /** - * Get its feature context used to annotate the data. - * - * @return feature context - */ - override def getFeatureContext: FeatureContext = UserFeatures.FeatureContext - - /** - * Adapt record of type T to DataRecord. - * - * @param record raw record of type T - * - * @return a DataRecord - * - * @throws com.twitter.ml.api.InvalidFeatureException - */ - override def adaptToDataRecord(record: UserTopicTrainingSample): DataRecord = { - val dr = new DataRecord() - - dr.setFeatureValue(UserIdFeature, record.userId) - dr.setFeatureValue( - UserSimClusterFeatures, - record.userInterestedInSimClusters.map { - case (id, score) => id.toString -> score - }) - dr.setFeatureValue(FollowedTopicIdFeatures, record.followedTopics.map(_.toString)) - dr.setFeatureValue(NotInterestedTopicIdFeatures, record.notInterestedTopics.map(_.toString)) - dr.setFeatureValue(UserCountryFeature, record.userCountry) - dr.setFeatureValue(UserLanguageFeature, record.userLanguage) - - dr.setFeatureValue( - FollowedTopicSimClusterAvgFeatures, - record.followedTopicsSimClusters.map { - case (id, score) => id.toString -> score - }) - - dr.setFeatureValue( - NotInterestedTopicSimClusterAvgFeatures, - record.notInterestedTopicsSimClusters.map { - case (id, score) => id.toString -> score - }) - dr.setFeatureValue(TargetTopicIdFeatures, record.targetTopicId.toLong) - dr.getRecord - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/model_based_topic_recommendations/UserTopicModellingTrainingDataCollectionJob.docx b/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/model_based_topic_recommendations/UserTopicModellingTrainingDataCollectionJob.docx new file mode 100644 index 000000000..90947c3a1 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/model_based_topic_recommendations/UserTopicModellingTrainingDataCollectionJob.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/model_based_topic_recommendations/UserTopicModellingTrainingDataCollectionJob.scala b/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/model_based_topic_recommendations/UserTopicModellingTrainingDataCollectionJob.scala deleted file mode 100644 index 49a73ca32..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/model_based_topic_recommendations/UserTopicModellingTrainingDataCollectionJob.scala +++ /dev/null @@ -1,449 +0,0 @@ -package com.twitter.simclusters_v2.scalding.topic_recommendations.model_based_topic_recommendations - -import com.twitter.algebird.Monoid -import 
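// Illustrative note (not from the original file): the adapter above encodes each
// simclusters embedding (Map[Int, Double]) as a SparseContinuous feature keyed by
// the stringified cluster id, and each topic-id set as a SparseBinary feature of
// stringified ids. The two encodings in isolation:
def toSparseContinuous(embedding: Map[Int, Double]): Map[String, Double] =
  embedding.map { case (clusterId, score) => clusterId.toString -> score }

def toSparseBinary(topicIds: Set[Long]): Set[String] =
  topicIds.map(_.toString)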
com.twitter.bijection.Injection -import com.twitter.dal.client.dataset.SnapshotDALDatasetBase -import com.twitter.ml.api.DataRecord -import com.twitter.ml.api._ -import com.twitter.scalding.TypedPipe -import com.twitter.scalding._ -import com.twitter.scalding_internal.dalv2.DALWrite.D -import com.twitter.scalding_internal.dalv2.dataset.DALWrite._ -import com.twitter.simclusters_v2.common.Country -import com.twitter.simclusters_v2.common.Language -import com.twitter.simclusters_v2.common.TopicId -import com.twitter.simclusters_v2.common.UserId -import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp -import com.twitter.wtf.scalding.jobs.common.ScheduledExecutionApp -import java.util.TimeZone -import scala.util.Random -import com.twitter.ml.api.util.FDsl._ -import com.twitter.scalding.source.DailySuffixCsv -import com.twitter.scalding.source.DailySuffixTypedTsv -import com.twitter.simclusters_v2.hdfs_sources.FavTfgTopicEmbeddingsScalaDataset -import com.twitter.simclusters_v2.scalding.embedding.common.ExternalDataSources -import com.twitter.simclusters_v2.thriftscala.EmbeddingType - -/** - This job is to obtain the training and test data for the model-based approach to topic recommendations: - Approach: - 1. Read FavTfgTopicEmbeddingsScalaDataset - to get topic simclusters embeddings for the followed and not interested in topics - 2. Read SimclustersV2InterestedIn20M145KUpdatedScalaDataset - to get user's interestedIn Simclusters embeddings - 3. Read UsersourceScalaDataset - to get user's countryCode and language - Use the datasets above to get the features for the model and generate DataRecords. - */ - -/* -To run: -scalding remote run --target src/scala/com/twitter/simclusters_v2/scalding/topic_recommendations/model_based_topic_recommendations:training_data_for_topic_recommendations-adhoc \ ---user cassowary \ ---submitter atla-aor-08-sr1 \ ---main-class com.twitter.simclusters_v2.scalding.topic_recommendations.model_based_topic_recommendations.UserTopicFeatureHydrationAdhocApp \ ---submitter-memory 128192.megabyte --hadoop-properties "mapreduce.map.memory.mb=8192 mapreduce.map.java.opts='-Xmx7618M' mapreduce.reduce.memory.mb=8192 mapreduce.reduce.java.opts='-Xmx7618M'" \ --- \ ---date 2020-10-14 \ ---outputDir "/user/cassowary/adhoc/your_ldap/user_topic_features_popular_clusters_filtered_oct_16" - */ - -object UserTopicFeatureHydrationAdhocApp extends AdhocExecutionApp { - - import UserTopicModellingJobUtils._ - - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - - val outputDir = args("outputDir") - val numDataRecordsTraining = Stat("num_data_records_training") - val numDataRecordsTesting = Stat("num_data_records_testing") - val testingRatio = args.double("testingRatio", 0.2) - - val (trainingDataSamples, testDataSamples, sortedVocab) = UserTopicModellingJobUtils.run( - ExternalDataSources.topicFollowGraphSource, - ExternalDataSources.notInterestedTopicsSource, - ExternalDataSources.userSource, - DataSources.getUserInterestedInData, - DataSources.getPerLanguageTopicEmbeddings, - testingRatio - ) - - val userTopicAdapter = new UserTopicDataRecordAdapter() - Execution - .zip( - convertTypedPipeToDataSetPipe( - trainingDataSamples.map { train => - numDataRecordsTraining.inc() - train - }, - userTopicAdapter) - .writeExecution( - DailySuffixFeatureSink(outputDir + "/training") - ), - convertTypedPipeToDataSetPipe( - testDataSamples.map { test => - numDataRecordsTesting.inc() - test - }, 
- userTopicAdapter) - .writeExecution( - DailySuffixFeatureSink(outputDir + "/testing") - ), - sortedVocab - .map { topicsWithSortedIndexes => - topicsWithSortedIndexes.map(_._1) - }.flatten.writeExecution(DailySuffixTypedTsv(outputDir + "/vocab")) - ).unit - } -} - -/** -capesospy-v2 update --build_locally \ - --start_cron training_data_for_topic_recommendations \ - src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml - */ - -object UserTopicFeatureHydrationScheduledApp extends ScheduledExecutionApp { - - import UserTopicModellingJobUtils._ - - private val outputPath: String = - "/user/cassowary/processed/user_topic_modelling" - - override def batchIncrement: Duration = Days(1) - - override def firstTime: RichDate = RichDate("2020-10-13") - - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - val testingRatio = args.double("testingRatio", 0.2) - - val (trainingDataSamples, testDataSamples, sortedVocab) = UserTopicModellingJobUtils.run( - ExternalDataSources.topicFollowGraphSource, - ExternalDataSources.notInterestedTopicsSource, - ExternalDataSources.userSource, - DataSources.getUserInterestedInData, - DataSources.getPerLanguageTopicEmbeddings, - testingRatio - ) - - val userTopicAdapter = new UserTopicDataRecordAdapter() - Execution - .zip( - getTrainTestExec( - trainingDataSamples, - testDataSamples, - TopicRecommendationsTrainDatarecordsJavaDataset, - TopicRecommendationsTestDatarecordsJavaDataset, - outputPath, - userTopicAdapter - ), - sortedVocab - .map { topicsWithSortedIndexes => - topicsWithSortedIndexes.map(_._1) - }.flatten.writeExecution(DailySuffixTypedTsv(outputPath + "/vocab")) - ).unit - - } -} - -object UserTopicModellingJobUtils { - - /** - * The main function that produces training and the test data - * - * @param topicFollowGraphSource user with followed topics from TFG - * @param notInterestedTopicsSource user with not interested in topics - * @param userSource user with country and language - * @param userInterestedInData user with interestedin simcluster embeddings - * @param topicPerLanguageEmbeddings topics with simcluster embeddings - * - * @return Tuple (trainingDataSamples, testingDataSamples, sortedTopicsVocab) - */ - def run( - topicFollowGraphSource: TypedPipe[(TopicId, UserId)], - notInterestedTopicsSource: TypedPipe[(TopicId, UserId)], - userSource: TypedPipe[(UserId, (Country, Language))], - userInterestedInData: TypedPipe[(UserId, Map[Int, Double])], - topicPerLanguageEmbeddings: TypedPipe[((TopicId, Language), Map[Int, Double])], - testingRatio: Double - )( - implicit uniqueID: UniqueID, - dateRange: DateRange, - timeZone: TimeZone - ): ( - TypedPipe[UserTopicTrainingSample], - TypedPipe[UserTopicTrainingSample], - TypedPipe[Seq[(TopicId, Int)]] - ) = { - val allFollowableTopics: TypedPipe[TopicId] = - topicFollowGraphSource.map(_._1).distinct - - val allFollowableTopicsWithMappedIds: TypedPipe[(TopicId, Int)] = - allFollowableTopics.groupAll.mapGroup { - case (_, topicIter) => - topicIter.zipWithIndex.map { - case (topicId, mappedId) => - (topicId, mappedId) - } - }.values - - val sortedVocab: TypedPipe[Seq[(TopicId, Int)]] = - allFollowableTopicsWithMappedIds.map(Seq(_)).map(_.sortBy(_._2)) - - val dataTrainingSamples: TypedPipe[UserTopicTrainingSample] = getDataSamplesFromTrainingData( - topicFollowGraphSource, - notInterestedTopicsSource, - userSource, - userInterestedInData, - topicPerLanguageEmbeddings, - 
allFollowableTopicsWithMappedIds - ) - val (trainSplit, testSplit) = splitByUser(dataTrainingSamples, testingRatio) - - (trainSplit, testSplit, sortedVocab) - } - - /** - * Split the data samples based on user_id into train and test data. This ensures that the same - * user's data records are not part of both train and test data. - */ - def splitByUser( - dataTrainingSamples: TypedPipe[UserTopicTrainingSample], - testingRatio: Double - ): (TypedPipe[UserTopicTrainingSample], TypedPipe[UserTopicTrainingSample]) = { - val (trainSplit, testSplit) = dataTrainingSamples - .map { currSmple => (currSmple.userId, currSmple) }.groupBy(_._1).partition(_ => - Random.nextDouble() > testingRatio) - val trainingData = trainSplit.values.map(_._2) - val testingData = testSplit.values.map(_._2) - (trainingData, testingData) - } - - /** - * To get the target topic for each training data sample for a user from the TopicFollowGraph - * - * @param topicFollowSource - * @return (UserId, Set(allFollowedTopicsExceptTargetTopic), targetTopic) - */ - def getTargetTopicsFromTFG( - topicFollowSource: TypedPipe[(TopicId, UserId)] - )( - implicit uniqueID: UniqueID - ): TypedPipe[(UserId, Set[TopicId], TopicId)] = { - val numTrainingSamples = Stat("num_positive_training_samples") - - val userFollowedTopics = topicFollowSource.swap - .map { - case (userId, topicId) => (userId, Set(topicId)) - }.sumByKey.toTypedPipe - - userFollowedTopics.flatMap { - case (userID, followedTopicsSet) => - followedTopicsSet.map { currFollowedTopic => - numTrainingSamples.inc() - val remainingTopics = followedTopicsSet - currFollowedTopic - (userID, remainingTopics, currFollowedTopic) - } - } - } - - /** - * Helper function that does the intermediate join operation between a user's followed, - * not-interested, interestedIn, country and language typedpipe sources, read from different sources. 
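- *
- * For example (illustrative, not from the original file), the leave-one-out
- * targets produced by getTargetTopicsFromTFG for a user following {A, B, C}
- * are three rows, each holding out one followed topic as the prediction target:
- * {{{
- * def leaveOneOut(userId: Long, followed: Set[Long]): Set[(Long, Set[Long], Long)] =
- *   followed.map { target => (userId, followed - target, target) }
- *
- * // leaveOneOut(42L, Set(1L, 2L, 3L)) ==
- * //   Set((42L, Set(2L, 3L), 1L), (42L, Set(1L, 3L), 2L), (42L, Set(1L, 2L), 3L))
- * }}}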
- */ - - def getFeaturesIntermediateJoin( - topicFollowGraphSource: TypedPipe[(TopicId, UserId)], - notInterestedTopicsSource: TypedPipe[(TopicId, UserId)], - allFollowableTopicsWithMappedIds: TypedPipe[(TopicId, Int)], - userCountryAndLanguage: TypedPipe[(UserId, (Country, Language))], - userInterestedInData: TypedPipe[(UserId, Map[Int, Double])] - )( - implicit uniqueID: UniqueID - ): TypedPipe[ - ( - UserId, - Set[TopicId], - Set[TopicId], - TopicId, - Int, - Country, - Language, - Map[Int, Double] - ) - ] = { - implicit val l2b: Long => Array[Byte] = Injection.long2BigEndian - - val userWithFollowedTargetTopics: TypedPipe[ - (UserId, Set[TopicId], TopicId) - ] = getTargetTopicsFromTFG(topicFollowGraphSource) - - val userWithNotInterestedTopics: TypedPipe[(UserId, Set[TopicId])] = - notInterestedTopicsSource.swap.mapValues(Set(_)).sumByKey.toTypedPipe - - userWithFollowedTargetTopics - .groupBy(_._1).leftJoin(userWithNotInterestedTopics).values.map { - case ((userId, followedTopics, targetFollowedTopic), notInterestedOpt) => - ( - userId, - followedTopics, - targetFollowedTopic, - notInterestedOpt.getOrElse(Set.empty[TopicId])) - } - .map { - case (userId, followedTopics, targetFollowedTopic, notInterestedTopics) => - (targetFollowedTopic, (userId, followedTopics, notInterestedTopics)) - }.join(allFollowableTopicsWithMappedIds).map { - case (targetTopic, ((userId, followedTopics, notInterestedTopics), targetTopicIdx)) => - (userId, followedTopics, notInterestedTopics, targetTopic, targetTopicIdx) - } - .groupBy(_._1).sketch(4000) - .join(userCountryAndLanguage - .groupBy(_._1)).sketch(4000).leftJoin(userInterestedInData) - .values.map { - case ( - ( - (userId, followedTopics, notInterestedTopics, targetTopic, targetTopicIdx), - (_, (userCountry, userLanguage)) - ), - userIntOpt) => - ( - userId, - followedTopics, - notInterestedTopics, - targetTopic, - targetTopicIdx, - userCountry, - userLanguage, - userIntOpt.getOrElse(Map.empty)) - } - } - - /** - * Helper function that aggregates user's followed topics, not-interested topics, - * country, language with join operations and generates the UserTopicTrainingSample - * for each DataRecord - */ - def getDataSamplesFromTrainingData( - topicFollowGraphSource: TypedPipe[(TopicId, UserId)], - notInterestedTopicsSource: TypedPipe[(TopicId, UserId)], - userCountryAndLanguage: TypedPipe[(UserId, (Country, Language))], - userInterestedInData: TypedPipe[(UserId, Map[Int, Double])], - topicPerLanguageEmbeddings: TypedPipe[((TopicId, Language), Map[Int, Double])], - allFollowableTopicsWithMappedIds: TypedPipe[(TopicId, Int)] - )( - implicit uniqueID: UniqueID - ): TypedPipe[UserTopicTrainingSample] = { - - implicit val l2b: Long => Array[Byte] = Injection.long2BigEndian - - val allTopicEmbeddingsMap: ValuePipe[Map[(TopicId, Language), Map[Int, Double]]] = - topicPerLanguageEmbeddings.map { - case (topicWithLang, embedding) => - Map(topicWithLang -> embedding) - }.sum - - val userWithFollowedAndNotInterestedTopics = getFeaturesIntermediateJoin( - topicFollowGraphSource, - notInterestedTopicsSource, - allFollowableTopicsWithMappedIds, - userCountryAndLanguage, - userInterestedInData) - - userWithFollowedAndNotInterestedTopics.flatMapWithValue(allTopicEmbeddingsMap) { - case ( - ( - userId, - followedTopics, - notInterestedTopics, - targetTopic, - targetTopicIdx, - userCountry, - userLanguage, - userInt), - Some(allTopicEmbeddings)) => - val averageFollowedTopicsSimClusters = Monoid - .sum(followedTopics.toSeq.map { topicId => - 
allTopicEmbeddings.getOrElse((topicId, userLanguage), Map.empty) - }).mapValues(v => - v / followedTopics.size) // average simcluster embedding of the followed topics - - val averageNotInterestedTopicsSimClusters = Monoid - .sum(notInterestedTopics.toSeq.map { topicId => - allTopicEmbeddings.getOrElse((topicId, userLanguage), Map.empty) - }).mapValues(v => - v / notInterestedTopics.size) // average simcluster embedding of the notInterested topics - - Some( - UserTopicTrainingSample( - userId, - followedTopics, - notInterestedTopics, - userCountry, - userLanguage, - targetTopicIdx, - userInt, - averageFollowedTopicsSimClusters, - averageNotInterestedTopicsSimClusters - ) - ) - - case _ => - None - } - } - - /** - * Write train and test data - */ - def getTrainTestExec( - trainingData: TypedPipe[UserTopicTrainingSample], - testingData: TypedPipe[UserTopicTrainingSample], - trainDataset: SnapshotDALDatasetBase[DataRecord], - testDataset: SnapshotDALDatasetBase[DataRecord], - outputPath: String, - adapter: IRecordOneToOneAdapter[UserTopicTrainingSample] - )( - implicit dateRange: DateRange - ): Execution[Unit] = { - val trainExec = - convertTypedPipeToDataSetPipe(trainingData, adapter) - .writeDALSnapshotExecution( - trainDataset, - D.Daily, - D.Suffix(s"$outputPath/training"), - D.EBLzo(), - dateRange.end) - val testExec = - convertTypedPipeToDataSetPipe(testingData, adapter) - .writeDALSnapshotExecution( - testDataset, - D.Daily, - D.Suffix(s"$outputPath/testing"), - D.EBLzo(), - dateRange.end) - Execution.zip(trainExec, testExec).unit - } - - /** - * To get the datasetPipe containing datarecords hydrated by datarecordAdapter - * @param userTrainingSamples - * @param adapter - * @return DataSetPipe - */ - def convertTypedPipeToDataSetPipe( - userTrainingSamples: TypedPipe[UserTopicTrainingSample], - adapter: IRecordOneToOneAdapter[UserTopicTrainingSample] - ): DataSetPipe = { - userTrainingSamples.toDataSetPipe(adapter) - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/BUILD b/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/BUILD deleted file mode 100644 index 0a1532588..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/BUILD +++ /dev/null @@ -1,234 +0,0 @@ -scala_library( - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":tweet_similarity_test_datarecords_120min-java", - ":tweet_similarity_test_datarecords_30min-java", - ":tweet_similarity_train_datarecords_120min-java", - ":tweet_similarity_train_datarecords_30min-java", - ":tweet_similarity_unhydrated_pairs_120min-scala", - ":tweet_similarity_unhydrated_pairs_30min-scala", - "3rdparty/jvm/com/twitter/storehaus:algebra", - "3rdparty/jvm/com/twitter/storehaus:core", - "dataproducts/insights/common/common", - "snowflake:id", - "src/java/com/twitter/ml/api/constant", - "src/scala/com/twitter/ads/dataservice_account/snapshot/jobs:db_snapshots_promoted_tweets-scala", - "src/scala/com/twitter/ml/api:api-base", - "src/scala/com/twitter/ml/featurestore/catalog/features/recommendations:aggregate", - "src/scala/com/twitter/ml/featurestore/lib/embedding", - "src/scala/com/twitter/scalding_internal/dalv2", - "src/scala/com/twitter/scalding_internal/dalv2/dataset", - "src/scala/com/twitter/scalding_internal/job", - "src/scala/com/twitter/scalding_internal/job/analytics_batch", - "src/scala/com/twitter/scalding_internal/source", - "src/scala/com/twitter/scalding_internal/source/lzo_scrooge", - "src/scala/com/twitter/simclusters_v2/common", - 
"src/scala/com/twitter/simclusters_v2/scalding", - "src/scala/com/twitter/simclusters_v2/tweet_similarity", - "src/scala/com/twitter/wtf/scalding/jobs/client_event_processing:user_interaction-scala", - "twadoop_config/configuration/log_categories/group/timeline:timeline_service_favorites-scala", - "tweetsource/common:unhydrated_flat-scala", - ], -) - -hadoop_binary( - name = "training_data_collection-adhoc", - main = "com.twitter.simclusters_v2.scalding.tweet_similarity.TrainingDataCollectionAdhocApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":tweet_similarity", - ], -) - -hadoop_binary( - name = "training_data_collection_30min", - main = "com.twitter.simclusters_v2.scalding.tweet_similarity.TrainingDataCollection30MinScheduledApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":tweet_similarity", - ], -) - -hadoop_binary( - name = "training_data_collection_120min", - main = "com.twitter.simclusters_v2.scalding.tweet_similarity.TrainingDataCollection120MinScheduledApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":tweet_similarity", - ], -) - -hadoop_binary( - name = "unhydrated_pair_collection-adhoc", - main = "com.twitter.simclusters_v2.scalding.tweet_similarity.UnhydratedPairsCollectionAdhocApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":tweet_similarity", - ], -) - -hadoop_binary( - name = "unhydrated_pair_collection_30min", - main = "com.twitter.simclusters_v2.scalding.tweet_similarity.UnhydratedPairsCollection30MinScheduledApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":tweet_similarity", - ], -) - -hadoop_binary( - name = "unhydrated_pair_collection_120min", - main = "com.twitter.simclusters_v2.scalding.tweet_similarity.UnhydratedPairsCollection120MinScheduledApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":tweet_similarity", - ], -) - -hadoop_binary( - name = "model_eval-adhoc", - main = "com.twitter.simclusters_v2.scalding.tweet_similarity.ModelEvalAdhocApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":tweet_similarity", - ], -) - -hadoop_binary( - name = "dataset_topk_analysis-adhoc", - main = "com.twitter.simclusters_v2.scalding.tweet_similarity.DatasetTopKAnalysisAdhocApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":tweet_similarity", - ], -) - -hadoop_binary( - name = "dataset_topk_analysis_dump-adhoc", - main = "com.twitter.simclusters_v2.scalding.tweet_similarity.DatasetTopKAnalysisDumpApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":tweet_similarity", - ], -) - -create_datarecord_datasets( - base_name = 
"tweet_similarity_train_datarecords_30min", - platform = "java8", - role = "cassowary", - segment_type = "partitioned", - tags = ["bazel-compatible"], -) - -create_datarecord_datasets( - base_name = "tweet_similarity_test_datarecords_30min", - platform = "java8", - role = "cassowary", - segment_type = "partitioned", - tags = ["bazel-compatible"], -) - -create_datarecord_datasets( - base_name = "tweet_similarity_train_datarecords_120min", - platform = "java8", - role = "cassowary", - segment_type = "partitioned", - tags = ["bazel-compatible"], -) - -create_datarecord_datasets( - base_name = "tweet_similarity_test_datarecords_120min", - platform = "java8", - role = "cassowary", - segment_type = "partitioned", - tags = ["bazel-compatible"], -) - -create_datasets( - base_name = "tweet_similarity_unhydrated_pairs_30min", - description = "30min coocurrence training pairs before feature hydration", - java_schema = "com.twitter.simclusters_v2.thriftjava.LabelledTweetPairs", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.LabelledTweetPairs", - segment_type = "partitioned", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "tweet_similarity_unhydrated_pairs_120min", - description = "120min coocurrence training pairs before feature hydration", - java_schema = "com.twitter.simclusters_v2.thriftjava.LabelledTweetPairs", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.LabelledTweetPairs", - segment_type = "partitioned", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) diff --git a/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/BUILD.docx b/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/BUILD.docx new file mode 100644 index 000000000..6fadea4aa Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/DatasetTopKAnalysisJob.docx b/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/DatasetTopKAnalysisJob.docx new file mode 100644 index 000000000..41ed1e34a Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/DatasetTopKAnalysisJob.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/DatasetTopKAnalysisJob.scala b/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/DatasetTopKAnalysisJob.scala deleted file mode 100644 index b277dd02e..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/DatasetTopKAnalysisJob.scala +++ /dev/null @@ -1,255 +0,0 @@ -package com.twitter.simclusters_v2.scalding.tweet_similarity - -import com.twitter.ml.api.DailySuffixFeatureSource -import com.twitter.ml.api.DataSetPipe -import com.twitter.ml.api.RichDataRecord -import com.twitter.scalding.typed.TypedPipe -import com.twitter.scalding.Execution -import com.twitter.scalding._ -import com.twitter.scalding_internal.job.TwitterExecutionApp -import com.twitter.simclusters_v2.common.TweetId -import com.twitter.simclusters_v2.scalding.common.Util -import 
com.twitter.simclusters_v2.tweet_similarity.TweetSimilarityFeatures -import java.util.TimeZone - -object DatasetTopKAnalysisJob { - - case class TweetPairWithStats( - queryTweet: TweetId, - candidateTweet: TweetId, - cooccurrenceCount: Double, - coengagementCount: Double, - coengagementRate: Double) - - def getCoocurrenceTweetPairs(dataset: DataSetPipe): TypedPipe[TweetPairWithStats] = { - val featureContext = dataset.featureContext - - dataset.records - .map { record => - val richDataRecord = new RichDataRecord(record, featureContext) - val coengaged = - if (richDataRecord - .getFeatureValue(TweetSimilarityFeatures.Label) - .booleanValue) 1 - else 0 - ( - ( - richDataRecord.getFeatureValue(TweetSimilarityFeatures.QueryTweetId).toLong, - richDataRecord.getFeatureValue(TweetSimilarityFeatures.CandidateTweetId).toLong), - (1, coengaged) - ) - }.sumByKey - .map { - case ((queryTweet, candidateTweet), (coocurrenceCount, coengagementCount)) => - TweetPairWithStats( - queryTweet, - candidateTweet, - coocurrenceCount.toDouble, - coengagementCount.toDouble, - coengagementCount.toDouble / coocurrenceCount.toDouble - ) - } - } - - def getQueryTweetToCounts(dataset: DataSetPipe): TypedPipe[(Long, (Int, Int))] = { - val featureContext = dataset.featureContext - dataset.records.map { record => - val richDataRecord = new RichDataRecord(record, featureContext) - val coengaged = - if (richDataRecord - .getFeatureValue(TweetSimilarityFeatures.Label) - .booleanValue) 1 - else 0 - ( - richDataRecord.getFeatureValue(TweetSimilarityFeatures.QueryTweetId).toLong, - (1, coengaged) - ) - }.sumByKey - } - - def printGlobalTopKTweetPairsBy( - tweetPairs: TypedPipe[TweetPairWithStats], - k: Int, - orderByFnt: TweetPairWithStats => Double - ): Execution[Unit] = { - val topKTweetPairs = - tweetPairs.groupAll - .sortedReverseTake(k)(Ordering.by(orderByFnt)) - .values - topKTweetPairs.toIterableExecution.map { s => - println(s.map(Util.prettyJsonMapper.writeValueAsString).mkString("\n")) - } - } - - def printTweetTopKTweetsBy( - groupedBy: Grouped[TweetId, TweetPairWithStats], - k: Int, - orderByFnt: TweetPairWithStats => Double, - descending: Boolean = true - ): Execution[Unit] = { - if (descending) { - println("TweetTopKTweets (descending order)") - groupedBy - .sortedReverseTake(k)(Ordering.by(orderByFnt)) - .toIterableExecution - .map { record => println(record.toString()) } - } else { - println("TweetTopKTweets (ascending order)") - groupedBy - .sortedTake(k)(Ordering.by(orderByFnt)) - .toIterableExecution - .map { record => println(record.toString()) } - } - } - - def printTweetPairStatsExec( - tweetPairs: TypedPipe[TweetPairWithStats], - k: Int - ): Execution[Unit] = { - Execution - .sequence( - Seq( - Util.printSummaryOfNumericColumn( - tweetPairs.map(_.cooccurrenceCount), - Some("Tweet-pair Coocurrence Count")), - printGlobalTopKTweetPairsBy( - tweetPairs, - k, - { tweetPairs => tweetPairs.cooccurrenceCount }), - Util.printSummaryOfNumericColumn( - tweetPairs.map(_.coengagementCount), - Some("Tweet-pair Coengagement Count")), - printGlobalTopKTweetPairsBy( - tweetPairs, - k, - { tweetPairs => tweetPairs.coengagementCount }), - Util.printSummaryOfNumericColumn( - tweetPairs.map(_.coengagementRate), - Some("Tweet-pair Coengagement Rate")), - printGlobalTopKTweetPairsBy(tweetPairs, k, { tweetPairs => tweetPairs.coengagementRate }) - ) - ).unit - } - - def printPerQueryStatsExec(dataset: DataSetPipe, k: Int): Execution[Unit] = { - val queryToCounts = getQueryTweetToCounts(dataset) - - val 
topKQueryTweetsByOccurrence = - queryToCounts.groupAll - .sortedReverseTake(k)(Ordering.by { case (_, (cooccurrenceCount, _)) => cooccurrenceCount }) - .values - - val topKQueryTweetsByEngagement = - queryToCounts.groupAll - .sortedReverseTake(k)(Ordering.by { case (_, (_, coengagementCount)) => coengagementCount }) - .values - - Execution - .sequence( - Seq( - Util.printSummaryOfNumericColumn( - queryToCounts.map(_._2._1), - Some("Per-query Total Cooccurrence Count")), - topKQueryTweetsByOccurrence.toIterableExecution.map { s => - println(s.map(Util.prettyJsonMapper.writeValueAsString).mkString("\n")) - }, - Util.printSummaryOfNumericColumn( - queryToCounts.map(_._2._2), - Some("Per-query Total Coengagement Count")), - topKQueryTweetsByEngagement.toIterableExecution.map { s => - println(s.map(Util.prettyJsonMapper.writeValueAsString).mkString("\n")) - } - ) - ).unit - } - - def runTweetTopKTweetsOutputExecs( - tweetPairs: TypedPipe[TweetPairWithStats], - k: Int, - outputPath: String - ): Execution[Unit] = { - tweetPairs - .groupBy(_.queryTweet) - .sortedReverseTake(k)(Ordering.by(_.coengagementRate)) - .writeExecution(TypedTsv(outputPath + "/topK_by_coengagement_rate")) - } -} - -/** To run: - scalding remote run --target src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity:dataset_topk_analysis-adhoc \ - --user cassowary \ - --submitter hadoopnest2.atla.twitter.com \ - --main-class com.twitter.simclusters_v2.scalding.tweet_similarity.DatasetTopKAnalysisAdhocApp -- \ - --date 2020-02-19 \ - --dataset_path /user/cassowary/adhoc/training_data/2020-02-19_class_balanced/train \ - --output_path /user/cassowary/adhoc/training_data/2020-02-19_class_balanced/train/analysis - * */ -object DatasetTopKAnalysisAdhocApp extends TwitterExecutionApp { - implicit val timeZone: TimeZone = DateOps.UTC - implicit val dateParser: DateParser = DateParser.default - - def job: Execution[Unit] = Execution.withId { implicit uniqueId => - Execution.withArgs { args: Args => - implicit val dateRange: DateRange = DateRange.parse(args.list("date")) - val dataset: DataSetPipe = DailySuffixFeatureSource(args("dataset_path")).read - val outputPath: String = args("output_path") - val topK: Int = args.int("top_K", default = 10) - - val tweetPairs = DatasetTopKAnalysisJob.getCoocurrenceTweetPairs(dataset) - - Execution - .zip( - DatasetTopKAnalysisJob.printTweetPairStatsExec(tweetPairs, topK), - DatasetTopKAnalysisJob.runTweetTopKTweetsOutputExecs(tweetPairs, topK, outputPath), - DatasetTopKAnalysisJob.printPerQueryStatsExec(dataset, topK) - ).unit - } - } -} - -/** To run: - scalding remote run --target src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity:dataset_topk_analysis-dump \ - --user cassowary \ - --submitter hadoopnest2.atla.twitter.com \ - --main-class com.twitter.simclusters_v2.scalding.tweet_similarity.DatasetTopKAnalysisDumpApp -- \ - --date 2020-02-01 \ - --dataset_path /user/cassowary/adhoc/training_data/2020-02-01/train \ - --tweets 1223105606757695490 \ - --top_K 100 - * */ -object DatasetTopKAnalysisDumpApp extends TwitterExecutionApp { - implicit val timeZone: TimeZone = DateOps.UTC - implicit val dateParser: DateParser = DateParser.default - - def job: Execution[Unit] = Execution.withId { implicit uniqueId => - Execution.withArgs { args: Args => - implicit val dateRange: DateRange = DateRange.parse(args.list("date")) - val dataset: DataSetPipe = DailySuffixFeatureSource(args("dataset_path")).read - val tweets = args.list("tweets").map(_.toLong).toSet - val topK: Int = 
args.int("top_K", default = 100) - - val tweetPairs = DatasetTopKAnalysisJob.getCoocurrenceTweetPairs(dataset) - - if (tweets.isEmpty) { - Execution.from(println("Empty query tweets")) - } else { - val filteredGroupby = tweetPairs - .filter { record => tweets.contains(record.queryTweet) } - .groupBy(_.queryTweet) - - Execution - .zip( - //Top K - DatasetTopKAnalysisJob - .printTweetTopKTweetsBy(filteredGroupby, topK, pair => pair.coengagementCount), - //Bottom K - DatasetTopKAnalysisJob.printTweetTopKTweetsBy( - filteredGroupby, - topK, - pair => pair.coengagementCount, - descending = false) - ).unit - } - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/TrainingDataCollectionJob.docx b/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/TrainingDataCollectionJob.docx new file mode 100644 index 000000000..0a00c7c7f Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/TrainingDataCollectionJob.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/TrainingDataCollectionJob.scala b/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/TrainingDataCollectionJob.scala deleted file mode 100644 index 93941a5da..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/TrainingDataCollectionJob.scala +++ /dev/null @@ -1,228 +0,0 @@ -package com.twitter.simclusters_v2.scalding.tweet_similarity - -import com.twitter.dal.client.dataset.TimePartitionedDALDataset -import com.twitter.ml.api.DataRecord -import com.twitter.ml.api.DataSetPipe -import com.twitter.scalding._ -import com.twitter.scalding.typed.TypedPipe -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation -import com.twitter.scalding_internal.dalv2.remote_access.Proc3Atla -import com.twitter.scalding_internal.job.TwitterExecutionApp -import com.twitter.simclusters_v2.hdfs_sources.TweetSimilarityUnhydratedPairsSource -import com.twitter.simclusters_v2.scalding.common.LogFavBasedPersistentTweetEmbeddingMhExportSource -import com.twitter.simclusters_v2.scalding.tweet_similarity.TweetPairLabelCollectionUtil.FeaturedTweet -import com.twitter.simclusters_v2.thriftscala.LabelledTweetPairs -import com.twitter.wtf.scalding.jobs.common.ScheduledExecutionApp -import java.util.TimeZone - -/** - * Hydrate tweet pairs with features - */ -object TrainingDataCollectionJob { - val LookbackDays = 2 //lookbackdays considered when looking for author information - val testLookbackHours = 2 //hours in test dataset if doing time-based train/test split - val testRatio = 0.1 //ratio for test dataset if doing query-based train/test split - - def getHydratedDataPipe( - dateRange: DateRange, - useAuthorFeatures: Boolean, - unhydratedPairs: TypedPipe[LabelledTweetPairs] - )( - implicit timeZone: TimeZone - ): DataSetPipe = { - - val persistentEmbeddingRecords = - TypedPipe.from(new LogFavBasedPersistentTweetEmbeddingMhExportSource(range = dateRange)) - - val tweetAuthorPairs = - TweetPairLabelCollectionUtil.getTweetAuthorPairs(dateRange.prepend(Days(LookbackDays))) - - val labelledPairs = unhydratedPairs - .map { labelledPair => - ( - FeaturedTweet( - labelledPair.queryFeaturedTweet.tweetId, - labelledPair.queryFeaturedTweet.timestamp, - None, - None), - FeaturedTweet( - labelledPair.candidateFeaturedTweet.tweetId, - labelledPair.candidateFeaturedTweet.timestamp, - None, - None), - labelledPair.label - ) - } - - 
TweetPairFeatureHydrationUtil.getDataSetPipeWithFeatures( - labelledPairs, - persistentEmbeddingRecords, - tweetAuthorPairs, - useAuthorFeatures) - } - - def getTrainTestExec( - dataSetPipe: DataSetPipe, - splitBy: Option[String], - trainDataset: TimePartitionedDALDataset[DataRecord], - testDataset: TimePartitionedDALDataset[DataRecord], - outputPath: String - )( - implicit timeZone: TimeZone, - dateRange: DateRange - ): Execution[Unit] = { - splitBy match { - case Some("time") => - TrainingDataCollectionUtil.getTrainTestByTimeExec( - dataSetPipe, - dateRange.end - Hours(testLookbackHours), - trainDataset, - testDataset, - outputPath)(dateRange) - case Some("query_tweet") => - TrainingDataCollectionUtil.getTrainTestByQueryExec( - dataSetPipe, - testRatio, - trainDataset, - testDataset, - outputPath)(dateRange) - // Default at no splitting - case _ => - TrainingDataCollectionUtil.getTrainTestByQueryExec( - dataSetPipe, - 0.0, - trainDataset, - testDataset, - outputPath)(dateRange) - } - } -} - -/** To run: -scalding remote run --target src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity:training_data_collection-adhoc \ ---user cassowary \ ---submitter hadoopnest2.atla.twitter.com \ ---hadoop-properties "mapreduce.reduce.java.opts=-Xmx8000m mapreduce.reduce.memory.mb=8000 scalding.with.reducers.set.explicitly=true mapreduce.job.reduces=2000 mapreduce.task.timeout=0" \ ---main-class com.twitter.simclusters_v2.scalding.tweet_similarity.TrainingDataCollectionAdhocApp -- \ ---date 2020-04-15 \ ---input_path /user/cassowary/adhoc/unhydrated_pairs/2020-04-15_30min/ \ ---output_path /user/cassowary/adhoc/training_data/2020-04-15_30min_2xneg_qtweet_split \ ---split_by query_tweet - * */ -object TrainingDataCollectionAdhocApp extends TwitterExecutionApp { - implicit val timeZone: TimeZone = DateOps.UTC - implicit val dateParser: DateParser = DateParser.default - - override def job: Execution[Unit] = - Execution.withId { implicit uniqueId => - Execution.withArgs { args: Args => - implicit val dateRange: DateRange = DateRange.parse(args.list("date")) - val useAuthorFeatures: Boolean = args.boolean("use_author_features") - val inputPath: String = args("input_path") - val outputPath: String = args("output_path") - val splitBy: Option[String] = args.optional("split_by") - - val labelledPairs = TypedPipe - .from(TweetSimilarityUnhydratedPairsSource(inputPath, dateRange)) - - val dataSetPipe = TrainingDataCollectionJob.getHydratedDataPipe( - dateRange, - useAuthorFeatures, - labelledPairs - ) - TrainingDataCollectionJob.getTrainTestExec( - dataSetPipe, - splitBy, - TweetSimilarityTrainDatarecords30MinJavaDataset, - TweetSimilarityTestDatarecords30MinJavaDataset, - outputPath - ) - } - } -} - -/** - capesospy-v2 update --build_locally --start_cron \ - training_data_collection_30min src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml - */ -object TrainingDataCollection30MinScheduledApp extends ScheduledExecutionApp { - - private val outputPath: String = - "/user/cassowary/processed/tweet_similarity/training_data_30min" - - override def batchIncrement: Duration = Hours(24) - - override def firstTime: RichDate = RichDate("2020-03-26") - - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - val useAuthorFeatures: Boolean = args.boolean("use_author_features") - val splitBy: Option[String] = args.optional("split_by") - - val unhydratedPairs = DAL - 
.read(TweetSimilarityUnhydratedPairs30MinScalaDataset, dateRange) - .withRemoteReadPolicy(ExplicitLocation(Proc3Atla)) - .toTypedPipe - - val dataSetPipe = TrainingDataCollectionJob.getHydratedDataPipe( - dateRange, - useAuthorFeatures, - unhydratedPairs - ) - TrainingDataCollectionJob.getTrainTestExec( - dataSetPipe, - splitBy, - TweetSimilarityTrainDatarecords30MinJavaDataset, - TweetSimilarityTestDatarecords30MinJavaDataset, - outputPath) - } -} - -/** -capesospy-v2 update --build_locally --start_cron \ - training_data_collection_120min src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml - */ -object TrainingDataCollection120MinScheduledApp extends ScheduledExecutionApp { - - private val outputPath: String = - "/user/cassowary/processed/tweet_similarity/training_data_120min" - - override def batchIncrement: Duration = Hours(24) - - override def firstTime: RichDate = RichDate("2020-03-26") - - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - val useAuthorFeatures: Boolean = args.boolean("use_author_features") - val splitBy: Option[String] = args.optional("split_by") - - val unhydratedPairs = DAL - .read(TweetSimilarityUnhydratedPairs120MinScalaDataset, dateRange) - .withRemoteReadPolicy(ExplicitLocation(Proc3Atla)) - .toTypedPipe - - val dataSetPipe = TrainingDataCollectionJob.getHydratedDataPipe( - dateRange, - useAuthorFeatures, - unhydratedPairs - ) - - TrainingDataCollectionJob.getTrainTestExec( - dataSetPipe, - splitBy, - TweetSimilarityTrainDatarecords120MinJavaDataset, - TweetSimilarityTestDatarecords120MinJavaDataset, - outputPath) - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/TrainingDataCollectionUtil.docx b/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/TrainingDataCollectionUtil.docx new file mode 100644 index 000000000..4b3ae7e0e Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/TrainingDataCollectionUtil.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/TrainingDataCollectionUtil.scala b/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/TrainingDataCollectionUtil.scala deleted file mode 100644 index 4fdc90ec4..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/TrainingDataCollectionUtil.scala +++ /dev/null @@ -1,138 +0,0 @@ -package com.twitter.simclusters_v2.scalding.tweet_similarity - -import com.twitter.dal.client.dataset.TimePartitionedDALDataset -import com.twitter.ml.api.util.FDsl._ -import com.twitter.ml.api.{DataRecord, DataSetPipe} -import com.twitter.scalding._ -import com.twitter.scalding_internal.dalv2.DALWrite.D -import com.twitter.scalding_internal.dalv2.dataset.DALWrite._ -import com.twitter.simclusters_v2.tweet_similarity.TweetSimilarityFeatures -import com.twitter.util.Time -import java.util.Random - -/** - * Collect training data for supervised tweet similarity - */ -object TrainingDataCollectionUtil { - - /** - * Split dataset into train and test based on time - * @param dataset: input dataset - * @param testStartDate: samples before/after testStartDate will be used for training/testing - * @return (train dataset, test dataset) - */ - def splitRecordsByTime( - dataset: DataSetPipe, - testStartDate: RichDate - ): (DataSetPipe, DataSetPipe) = { - val (leftRecords, rightRecords) = dataset.records.partition { record => - // record will be in training dataset when both 
tweets were engaged before testStartDate
-      (record.getFeatureValue(
-        TweetSimilarityFeatures.QueryTweetTimestamp) < testStartDate.timestamp) &&
-        (record.getFeatureValue(
-          TweetSimilarityFeatures.CandidateTweetTimestamp) < testStartDate.timestamp)
-    }
-    (
-      DataSetPipe(leftRecords, dataset.featureContext),
-      DataSetPipe(rightRecords, dataset.featureContext))
-  }
-
-  /**
-   * Split dataset into train and test randomly based on query
-   * @param dataset: input dataset
-   * @param testRatio: fraction of query tweets assigned to the test set
-   * @return (train dataset, test dataset)
-   */
-  def splitRecordsByQuery(dataset: DataSetPipe, testRatio: Double): (DataSetPipe, DataSetPipe) = {
-    // Use a single Random instance: a fresh time-seeded Random per query would
-    // yield near-identical draws for every query and break the split.
-    val random = new Random(Time.now.inMilliseconds)
-    val queryToRand = dataset.records
-      .map { record => record.getFeatureValue(TweetSimilarityFeatures.QueryTweetId) }
-      .distinct
-      .map { queryTweet => queryTweet -> random.nextDouble() }
-      .forceToDisk
-
-    val (trainRecords, testRecords) = dataset.records
-      .groupBy { record => record.getFeatureValue(TweetSimilarityFeatures.QueryTweetId) }
-      .join(queryToRand)
-      .values
-      .partition {
-        case (_, random) => random > testRatio
-      }
-
-    (
-      DataSetPipe(trainRecords.map { case (record, _) => record }, dataset.featureContext),
-      DataSetPipe(testRecords.map { case (record, _) => record }, dataset.featureContext))
-  }
-
-  /**
-   * Get the write exec for train and test datasets
-   * @param dataset: input dataset
-   * @param testStartDate: samples before/after testStartDate will be used for training/testing
-   * @param outputPath: output path for the train/test datasets
-   * @return the combined write execution
-   */
-  def getTrainTestByTimeExec(
-    dataset: DataSetPipe,
-    testStartDate: RichDate,
-    trainDataset: TimePartitionedDALDataset[DataRecord],
-    testDataset: TimePartitionedDALDataset[DataRecord],
-    outputPath: String
-  )(
-    implicit dateRange: DateRange
-  ): Execution[Unit] = {
-    val (trainDataSet, testDataSet) = splitRecordsByTime(dataset, testStartDate)
-    val trainExecution: Execution[Unit] = trainDataSet
-      .writeDALExecution(trainDataset, D.Daily, D.Suffix(s"$outputPath/train"), D.EBLzo())
-    val trainStatsExecution: Execution[Unit] =
-      getStatsExec(trainDataSet, s"$outputPath/train_stats")
-    val testExecution: Execution[Unit] = testDataSet
-      .writeDALExecution(testDataset, D.Daily, D.Suffix(s"$outputPath/test"), D.EBLzo())
-    val testStatsExecution: Execution[Unit] = getStatsExec(testDataSet, s"$outputPath/test_stats")
-    Execution.zip(trainExecution, trainStatsExecution, testExecution, testStatsExecution).unit
-  }
-
-  /**
-   * Get the write exec for train and test datasets
-   * @param dataset: input dataset
-   * @param testRatio: fraction of query tweets assigned to the test set
-   * @param outputPath: output path for the train/test datasets
-   * @return the combined write execution
-   */
-  def getTrainTestByQueryExec(
-    dataset: DataSetPipe,
-    testRatio: Double,
-    trainDataset: TimePartitionedDALDataset[DataRecord],
-    testDataset: TimePartitionedDALDataset[DataRecord],
-    outputPath: String
-  )(
-    implicit dateRange: DateRange
-  ): Execution[Unit] = {
-    val (trainDataSet, testDataSet) = splitRecordsByQuery(dataset, testRatio)
-    val trainExecution: Execution[Unit] = trainDataSet
-      .writeDALExecution(trainDataset, D.Daily, D.Suffix(s"$outputPath/train"), D.EBLzo())
-    val trainStatsExecution: Execution[Unit] =
-      getStatsExec(trainDataSet, s"$outputPath/train_stats")
-    val testExecution: Execution[Unit] = testDataSet
-      .writeDALExecution(testDataset, D.Daily, D.Suffix(s"$outputPath/test"), D.EBLzo())
-    val
testStatsExecution: Execution[Unit] = getStatsExec(testDataSet, s"$outputPath/test_stats") - Execution.zip(trainExecution, trainStatsExecution, testExecution, testStatsExecution).unit - } - - /** - * Get the exec for reporting dataset stats - * @param dataset: dataset of interest - * @param outputPath: path for outputting the stats - * @return exec - */ - def getStatsExec(dataset: DataSetPipe, outputPath: String): Execution[Unit] = { - dataset.records - .map { rec => - if (TweetSimilarityFeatures.isCoengaged(rec)) - "total_positive_records" -> 1L - else - "total_negative_records" -> 1L - } - .sumByKey - .shard(1) - .writeExecution(TypedTsv(outputPath)) - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/TweetPairFeatureHydrationUtil.docx b/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/TweetPairFeatureHydrationUtil.docx new file mode 100644 index 000000000..f485ce690 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/TweetPairFeatureHydrationUtil.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/TweetPairFeatureHydrationUtil.scala b/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/TweetPairFeatureHydrationUtil.scala deleted file mode 100644 index 458ea8525..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/TweetPairFeatureHydrationUtil.scala +++ /dev/null @@ -1,289 +0,0 @@ -package com.twitter.simclusters_v2.scalding.tweet_similarity - -import com.twitter.ml.api.util.FDsl._ -import com.twitter.ml.api.{DataRecord, DataRecordMerger, DataSetPipe, FeatureContext} -import com.twitter.ml.featurestore.lib.data.EntityIds.Entry -import com.twitter.ml.featurestore.lib.data.{EntityIds, FeatureValuesById, PredictionRecord} -import com.twitter.scalding.typed.TypedPipe -import com.twitter.simclusters_v2.common.SimClustersEmbedding._ -import com.twitter.simclusters_v2.tweet_similarity.ModelBasedTweetSimilaritySimClustersEmbeddingAdapter.{ - NormalizedCandidateEmbAdapter, - NormalizedQueryEmbAdapter -} -import com.twitter.simclusters_v2.tweet_similarity.{ - TweetSimilarityFeatures, - TweetSimilarityFeaturesStoreConfig -} -import com.twitter.simclusters_v2.common.{Timestamp, TweetId, UserId} -import com.twitter.simclusters_v2.scalding.tweet_similarity.TweetPairLabelCollectionUtil.FeaturedTweet -import com.twitter.simclusters_v2.thriftscala.{ - PersistentSimClustersEmbedding, - SimClustersEmbedding => ThriftSimClustersEmbedding -} - -object TweetPairFeatureHydrationUtil { - val QueryTweetConfig = new TweetSimilarityFeaturesStoreConfig("query_tweet_user_id") - val CandidateTweetConfig = new TweetSimilarityFeaturesStoreConfig("candidate_tweet_user_id") - val DataRecordMerger = new DataRecordMerger() - - /** - * Given persistentEmbeddings TypedPipe, extract tweetId, timestamp, and the embedding - * - * @param persistentEmbeddings TypedPipe of ((TweetId, Timestamp), PersistentSimClustersEmbedding), read from PersistentTweetEmbeddingMhExportSource - * - * @return Extracted TypedPipe of (TweetId, (Timestamp, SimClustersEmbedding)) - */ - def extractEmbeddings( - persistentEmbeddings: TypedPipe[((TweetId, Timestamp), PersistentSimClustersEmbedding)] - ): TypedPipe[(TweetId, (Timestamp, ThriftSimClustersEmbedding))] = { - persistentEmbeddings - .collect { - case ((tweetId, _), embedding) if embedding.metadata.updatedAtMs.isDefined => - (tweetId, (embedding.metadata.updatedAtMs.get, embedding.embedding)) - } - } - - /** - * Hydrate the tweet pairs 
with the latest persistent embeddings before engagement/impression. - * - * @param tweetPairs TypedPipe of the (userId, queryFeaturedTweet, candidateFeaturedTweet, label) - * @param persistentEmbeddings TypedPipe of persistentEmbeddings from PersistentTweetEmbeddingMhExportSource - * - * @return TypedPipe of the (userId, queryFeaturedTweet, candidateFeaturedTweet, label) with persistent embeddings set - */ - def getTweetPairsWithPersistentEmbeddings( - tweetPairs: TypedPipe[(FeaturedTweet, FeaturedTweet, Boolean)], - persistentEmbeddings: TypedPipe[((TweetId, Timestamp), PersistentSimClustersEmbedding)] - ): TypedPipe[(FeaturedTweet, FeaturedTweet, Boolean)] = { - val extractedEmbeddings = extractEmbeddings(persistentEmbeddings) - tweetPairs - .groupBy { - case (queryFeaturedTweet, _, _) => queryFeaturedTweet.tweet - } - .join(extractedEmbeddings) - .collect { - case ( - _, - ( - (queryFeaturedTweet, candidateFeaturedTweet, label), - (embeddingTimestamp, embedding))) - if embeddingTimestamp <= queryFeaturedTweet.timestamp => - ((queryFeaturedTweet, candidateFeaturedTweet), (embeddingTimestamp, embedding, label)) - } - .group - .maxBy(_._1) - .map { - case ((queryFeaturedTweet, candidateFeaturedTweet), (_, embedding, label)) => - ( - candidateFeaturedTweet.tweet, - (queryFeaturedTweet.copy(embedding = Some(embedding)), candidateFeaturedTweet, label) - ) - } - .join(extractedEmbeddings) - .collect { - case ( - _, - ( - (queryFeaturedTweet, candidateFeaturedTweet, label), - (embeddingTimestamp, embedding))) - if embeddingTimestamp <= candidateFeaturedTweet.timestamp => - ((queryFeaturedTweet, candidateFeaturedTweet), (embeddingTimestamp, embedding, label)) - } - .group - .maxBy(_._1) - .map { - case ((queryFeaturedTweet, candidateFeaturedTweet), (_, embedding, label)) => - (queryFeaturedTweet, candidateFeaturedTweet.copy(embedding = Some(embedding)), label) - } - } - - /** - * Get tweet pairs with the author userIds - * - * @param tweetPairs TypedPipe of (queryTweet, queryEmbedding, queryTimestamp, candidateTweet, candidateEmbedding, candidateTimestamp, label) - * @param tweetAuthorPairs TypedPipe of (tweetId, author userId) - * - * @return TypedPipe of (queryTweet, queryAuthor, queryEmbedding, queryTimestamp, candidateTweet, candidateAuthor, candidateEmbedding, candidateTimestamp, label) - */ - def getTweetPairsWithAuthors( - tweetPairs: TypedPipe[(FeaturedTweet, FeaturedTweet, Boolean)], - tweetAuthorPairs: TypedPipe[(TweetId, UserId)] - ): TypedPipe[(FeaturedTweet, FeaturedTweet, Boolean)] = { - tweetPairs - //keyed by queryTweet s.t. 
we get queryTweet's author after joining with tweetAuthorPairs - .groupBy { case (queryFeaturedTweet, _, _) => queryFeaturedTweet.tweet } - .join(tweetAuthorPairs) - .values - //keyed by candidateTweet - .groupBy { case ((_, candidateFeaturedTweet, _), _) => candidateFeaturedTweet.tweet } - .join(tweetAuthorPairs) - .values - .map { - case ( - ((queryFeaturedTweet, candidateFeaturedTweet, label), queryAuthor), - candidateAuthor) => - ( - queryFeaturedTweet.copy(author = Some(queryAuthor)), - candidateFeaturedTweet.copy(author = Some(candidateAuthor)), - label - ) - } - } - - /** - * Get tweet pairs with popularity counts - * - * @param tweetPairs TypedPipe of the (userId, queryFeaturedTweet, candidateFeaturedTweet, label) - * - * @return TypedPipe of the (userId, queryFeaturedTweet, candidateFeaturedTweet, tweetPairCount, queryTweetCount, label) - */ - def getTweetPairsWithCounts( - tweetPairs: TypedPipe[(FeaturedTweet, FeaturedTweet, Boolean)] - ): TypedPipe[(FeaturedTweet, FeaturedTweet, Long, Long, Boolean)] = { - val tweetPairCount = tweetPairs.groupBy { - case (queryFeaturedTweet, candidateFeaturedTweet, _) => - (queryFeaturedTweet.tweet, candidateFeaturedTweet.tweet) - }.size - - val queryTweetCount = tweetPairs.groupBy { - case (queryFeaturedTweet, _, _) => queryFeaturedTweet.tweet - }.size - - tweetPairs - .groupBy { - case (queryFeaturedTweet, candidateFeaturedTweet, _) => - (queryFeaturedTweet.tweet, candidateFeaturedTweet.tweet) - } - .join(tweetPairCount) - .values - .map { - case ((queryFeaturedTweet, candidateFeaturedTweet, label), tweetPairCount) => - (queryFeaturedTweet, candidateFeaturedTweet, tweetPairCount, label) - } - .groupBy { case (queryFeaturedTweet, _, _, _) => queryFeaturedTweet.tweet } - .join(queryTweetCount) - .values - .map { - case ( - (queryFeaturedTweet, candidateFeaturedTweet, tweetPairCount, label), - queryTweetCount) => - (queryFeaturedTweet, candidateFeaturedTweet, tweetPairCount, queryTweetCount, label) - } - } - - /** - * Get training data records - * - * @param tweetPairs TypedPipe of the (userId, queryFeaturedTweet, candidateFeaturedTweet, label) - * @param persistentEmbeddings TypedPipe of persistentEmbeddings from PersistentTweetEmbeddingMhExportSource - * @param tweetAuthorPairs TypedPipe of (tweetId, author userId) - * @param useAuthorFeatures whether to use author features or not - * - * @return DataSetPipe with features and label - */ - def getDataSetPipeWithFeatures( - tweetPairs: TypedPipe[(FeaturedTweet, FeaturedTweet, Boolean)], - persistentEmbeddings: TypedPipe[((TweetId, Timestamp), PersistentSimClustersEmbedding)], - tweetAuthorPairs: TypedPipe[(TweetId, UserId)], - useAuthorFeatures: Boolean - ): DataSetPipe = { - val featuredTweetPairs = - if (useAuthorFeatures) - getTweetPairsWithCounts( - getTweetPairsWithPersistentEmbeddings( - getTweetPairsWithAuthors(tweetPairs, tweetAuthorPairs), - persistentEmbeddings)) - else - getTweetPairsWithCounts( - getTweetPairsWithPersistentEmbeddings(tweetPairs, persistentEmbeddings)) - - DataSetPipe( - featuredTweetPairs.flatMap { - case (queryFeaturedTweet, candidateFeaturedTweet, tweetPairCount, queryTweetCount, label) => - getDataRecordWithFeatures( - queryFeaturedTweet, - candidateFeaturedTweet, - tweetPairCount, - queryTweetCount, - label) - }, - FeatureContext.merge( - TweetSimilarityFeatures.FeatureContext, - QueryTweetConfig.predictionRecordAdapter.getFeatureContext, - CandidateTweetConfig.predictionRecordAdapter.getFeatureContext - ) - ) - } - - /** - * Given raw features, return a 
DataRecord with all the features - * - * @param queryFeaturedTweet FeaturedTweet for query tweet - * @param candidateFeaturedTweet FeaturedTweet for candidate tweet - * @param tweetPairCount popularity count for the (query tweet, candidate tweet) pair - * @param queryTweetCount popularity count for each query tweet - * @param label true for positive and false for negative - * - * @return - */ - def getDataRecordWithFeatures( - queryFeaturedTweet: FeaturedTweet, - candidateFeaturedTweet: FeaturedTweet, - tweetPairCount: Long, - queryTweetCount: Long, - label: Boolean - ): Option[DataRecord] = { - - for { - queryEmbedding <- queryFeaturedTweet.embedding - candidateEmbedding <- candidateFeaturedTweet.embedding - } yield { - val featureDataRecord = NormalizedQueryEmbAdapter.adaptToDataRecord(queryEmbedding) - DataRecordMerger.merge( - featureDataRecord, - NormalizedCandidateEmbAdapter.adaptToDataRecord(candidateEmbedding)) - featureDataRecord.setFeatureValue( - TweetSimilarityFeatures.QueryTweetId, - queryFeaturedTweet.tweet) - featureDataRecord.setFeatureValue( - TweetSimilarityFeatures.CandidateTweetId, - candidateFeaturedTweet.tweet) - featureDataRecord.setFeatureValue( - TweetSimilarityFeatures.QueryTweetTimestamp, - queryFeaturedTweet.timestamp) - featureDataRecord.setFeatureValue( - TweetSimilarityFeatures.CandidateTweetTimestamp, - candidateFeaturedTweet.timestamp) - featureDataRecord.setFeatureValue( - TweetSimilarityFeatures.CosineSimilarity, - queryEmbedding.cosineSimilarity(candidateEmbedding)) - featureDataRecord.setFeatureValue(TweetSimilarityFeatures.TweetPairCount, tweetPairCount) - featureDataRecord.setFeatureValue(TweetSimilarityFeatures.QueryTweetCount, queryTweetCount) - featureDataRecord.setFeatureValue(TweetSimilarityFeatures.Label, label) - - if (queryFeaturedTweet.author.isDefined && candidateFeaturedTweet.author.isDefined) { - DataRecordMerger.merge( - featureDataRecord, - new DataRecord( - QueryTweetConfig.predictionRecordAdapter.adaptToDataRecord(PredictionRecord( - FeatureValuesById.empty, - EntityIds(Entry( - QueryTweetConfig.bindingIdentifier, - Set(com.twitter.ml.featurestore.lib.UserId(queryFeaturedTweet.author.get)))) - ))) - ) - DataRecordMerger.merge( - featureDataRecord, - new DataRecord( - CandidateTweetConfig.predictionRecordAdapter.adaptToDataRecord(PredictionRecord( - FeatureValuesById.empty, - EntityIds(Entry( - CandidateTweetConfig.bindingIdentifier, - Set(com.twitter.ml.featurestore.lib.UserId(candidateFeaturedTweet.author.get)))) - ))) - ) - } - featureDataRecord - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/TweetPairLabelCollectionUtil.docx b/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/TweetPairLabelCollectionUtil.docx new file mode 100644 index 000000000..0f8f8b609 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/TweetPairLabelCollectionUtil.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/TweetPairLabelCollectionUtil.scala b/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/TweetPairLabelCollectionUtil.scala deleted file mode 100644 index 26a479342..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/TweetPairLabelCollectionUtil.scala +++ /dev/null @@ -1,490 +0,0 @@ -package com.twitter.simclusters_v2.scalding.tweet_similarity - -import com.twitter.ads.entities.db.thriftscala.PromotedTweet -import com.twitter.dataproducts.estimation.ReservoirSampler -import 
com.twitter.scalding.typed.TypedPipe -import com.twitter.scalding.{DateRange, Execution, TypedTsv} -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.scalding_internal.dalv2.remote_access.{ExplicitLocation, Proc3Atla, ProcAtla} -import com.twitter.simclusters_v2.common.{SimClustersEmbedding, Timestamp, TweetId, UserId} -import com.twitter.simclusters_v2.scalding.common.Util -import com.twitter.simclusters_v2.scalding.embedding.common.ExternalDataSources -import com.twitter.simclusters_v2.thriftscala.{ - TweetTopKTweetsWithScore, - TweetWithScore, - TweetsWithScore -} -import com.twitter.timelineservice.thriftscala.{ContextualizedFavoriteEvent, FavoriteEventUnion} -import com.twitter.wtf.scalding.client_event_processing.thriftscala.{ - InteractionDetails, - InteractionType, - TweetImpressionDetails -} -import com.twitter.wtf.scalding.jobs.client_event_processing.UserInteractionScalaDataset -import java.util.Random -import scala.collection.mutable.ArrayBuffer -import scala.util.control.Breaks._ -import twadoop_config.configuration.log_categories.group.timeline.TimelineServiceFavoritesScalaDataset - -object TweetPairLabelCollectionUtil { - - case class FeaturedTweet( - tweet: TweetId, - timestamp: Timestamp, //engagement or impression time - author: Option[UserId], - embedding: Option[SimClustersEmbedding]) - extends Ordered[FeaturedTweet] { - - import scala.math.Ordered.orderingToOrdered - - def compare(that: FeaturedTweet): Int = - (this.tweet, this.timestamp, this.author) compare (that.tweet, that.timestamp, that.author) - } - - val MaxFavPerUser: Int = 100 - - /** - * Get all fav events within the given dateRange and where all users' out-degree <= maxOutDegree - * from TimelineServiceFavoritesScalaDataset - * - * @param dateRange date of interest - * @param maxOutgoingDegree max #degrees for the users of interests - * - * @return Filtered fav events, TypedPipe of (userid, tweetid, timestamp) tuples - */ - def getFavEvents( - dateRange: DateRange, - maxOutgoingDegree: Int - ): TypedPipe[(UserId, TweetId, Timestamp)] = { - val fullTimelineFavData: TypedPipe[ContextualizedFavoriteEvent] = - DAL - .read(TimelineServiceFavoritesScalaDataset, dateRange) - .withRemoteReadPolicy(ExplicitLocation(ProcAtla)) - .toTypedPipe - - val userTweetTuples = fullTimelineFavData - .flatMap { cfe: ContextualizedFavoriteEvent => - cfe.event match { - case FavoriteEventUnion.Favorite(fav) => - Some((fav.userId, (fav.tweetId, fav.eventTimeMs))) - case _ => - None - } - } - //Get users with the out-degree <= maxOutDegree first - val usersWithValidOutDegree = userTweetTuples - .groupBy(_._1) - .withReducers(1000) - .size - .filter(_._2 <= maxOutgoingDegree) - - // Keep only usersWithValidOutDegree in the graph - userTweetTuples - .join(usersWithValidOutDegree).map { - case (userId, ((tweetId, eventTime), _)) => (userId, tweetId, eventTime) - }.forceToDisk - } - - /** - * Get impression events where users stay at the tweets for more than one minute - * - * @param dateRange time range of interest - * - * @return - */ - def getImpressionEvents(dateRange: DateRange): TypedPipe[(UserId, TweetId, Timestamp)] = { - DAL - .read(UserInteractionScalaDataset, dateRange) - .withRemoteReadPolicy(ExplicitLocation(Proc3Atla)) - .toTypedPipe - .flatMap { - case userInteraction - if userInteraction.interactionType == InteractionType.TweetImpressions => - userInteraction.interactionDetails match { - case InteractionDetails.TweetImpressionDetails( - TweetImpressionDetails(tweetId, _, dwellTimeInSecOpt)) - if 
dwellTimeInSecOpt.exists(_ >= 1) => - Some(userInteraction.userId, tweetId, userInteraction.timeStamp) - case _ => - None - } - case _ => None - } - .forceToDisk - } - - /** - * Given an events dataset, return a filtered events limited to a given set of tweets - * - * @param events user fav events, a TypedPipe of (userid, tweetid, timestamp) tuples - * @param tweets tweets of interest - * - * @return Filtered fav events on the given tweets of interest only, TypedPipe of (userid, tweetid, timestamp) tuples - */ - def getFilteredEvents( - events: TypedPipe[(UserId, TweetId, Timestamp)], - tweets: TypedPipe[TweetId] - ): TypedPipe[(UserId, TweetId, Timestamp)] = { - events - .map { - case (userId, tweetId, eventTime) => (tweetId, (userId, eventTime)) - } - .join(tweets.asKeys) - .withReducers(1000) - .map { - case (tweetId, ((userId, eventTime), _)) => (userId, tweetId, eventTime) - } - } - - /** Get (tweetId, author userId) of a given dateRange - * - * @param dateRange time range of interest - * - * @return TypedPipe of (tweetId, userId) - */ - def getTweetAuthorPairs(dateRange: DateRange): TypedPipe[(TweetId, UserId)] = { - ExternalDataSources - .flatTweetsSource(dateRange) - .collect { - // Exclude retweets and quoted tweets - case record if record.shareSourceTweetId.isEmpty && record.quotedTweetTweetId.isEmpty => - (record.tweetId, record.userId) - } - } - - /** Given a set of tweets, get all non-promoted tweets from the given set - * - * @param promotedTweets TypedPipe of promoted tweets - * @param tweets tweets of interest - * - * @return TypedPipe of tweetId - */ - def getNonPromotedTweets( - promotedTweets: TypedPipe[PromotedTweet], - tweets: TypedPipe[TweetId] - ): TypedPipe[TweetId] = { - promotedTweets - .collect { - case promotedTweet if promotedTweet.tweetId.isDefined => promotedTweet.tweetId.get - } - .asKeys - .rightJoin(tweets.asKeys) - .withReducers(1000) - .filterNot(joined => joined._2._1.isDefined) //filter out those in promotedTweets - .keys - } - - /** - * Given a fav events dataset, return all distinct ordered tweet pairs, labelled by whether they are co-engaged or not - * Note we distinguish between (t1, t2) and (t2, t1) because o.w we introduce bias to training samples - * - * @param events user fav events, a TypedPipe of (userid, featuredTweet) tuples - * @param timeframe two tweets will be considered co-engaged if they are fav-ed within coengagementTimeframe - * @param isCoengaged if pairs are co-engaged - * - * @return labelled tweet pairs, TypedPipe of (userid, featuredTweet1, featuredTweet2, isCoengaged) tuples - */ - def getTweetPairs( - events: TypedPipe[(UserId, FeaturedTweet)], - timeframe: Long, - isCoengaged: Boolean - ): TypedPipe[(UserId, FeaturedTweet, FeaturedTweet, Boolean)] = { - events - .map { - case (userId, featuredTweet) => (userId, Seq(featuredTweet)) - } - .sumByKey - .flatMap { - case (userId, featuredTweets) if featuredTweets.size > 1 => - val sortedFeaturedTweet = featuredTweets.sortBy(_.timestamp) - // Get all distinct ordered pairs that happen within coengagementTimeframe - val distinctPairs = ArrayBuffer[(UserId, FeaturedTweet, FeaturedTweet, Boolean)]() - breakable { - for (i <- sortedFeaturedTweet.indices) { - for (j <- i + 1 until sortedFeaturedTweet.size) { - val featuredTweet1 = sortedFeaturedTweet(i) - val featuredTweet2 = sortedFeaturedTweet(j) - if (math.abs(featuredTweet1.timestamp - featuredTweet2.timestamp) <= timeframe) - distinctPairs ++= Seq( - (userId, featuredTweet1, featuredTweet2, isCoengaged), - (userId, featuredTweet2, 
featuredTweet1, isCoengaged)) - else - break - } - } - } - distinctPairs - case _ => Nil - } - } - - /** - * Get co-engaged tweet pairs - * - * @param favEvents user fav events, TypedPipe of (userid, tweetid, timestamp) - * @param tweets tweets to be considered - * @param coengagementTimeframe time window for two tweets to be considered as co-engaged - * - * @return TypedPipe of co-engaged tweet pairs - */ - def getCoengagedPairs( - favEvents: TypedPipe[(UserId, TweetId, Timestamp)], - tweets: TypedPipe[TweetId], - coengagementTimeframe: Long - ): TypedPipe[(UserId, FeaturedTweet, FeaturedTweet, Boolean)] = { - val userFeaturedTweetPairs = - getFilteredEvents(favEvents, tweets) - .map { - case (user, tweet, timestamp) => (user, FeaturedTweet(tweet, timestamp, None, None)) - } - - getTweetPairs(userFeaturedTweetPairs, coengagementTimeframe, isCoengaged = true) - } - - /** - * Get co-impressed tweet pairs - * - * @param impressionEvents tweet impression events, TypedPipe of (userid, tweetid, timestamp) - * @param tweets set of tweets considered to be part of co-impressed tweet pairs - * @param timeframe time window for two tweets to be considered as co-impressed - * - * @return TypedPipe of co-impressed tweet pairs - */ - def getCoimpressedPairs( - impressionEvents: TypedPipe[(UserId, TweetId, Timestamp)], - tweets: TypedPipe[TweetId], - timeframe: Long - ): TypedPipe[(UserId, FeaturedTweet, FeaturedTweet, Boolean)] = { - val userFeaturedTweetPairs = getFilteredEvents(impressionEvents, tweets) - .map { - case (user, tweet, timestamp) => (user, FeaturedTweet(tweet, timestamp, None, None)) - } - - getTweetPairs(userFeaturedTweetPairs, timeframe, isCoengaged = false) - } - - /** - * Consolidate co-engaged pairs and co-impressed pairs, and compute all the labelled tweet pairs - * Given a pair: - * label = 1 if co-engaged (whether or not it's co-impressed) - * label = 0 if co-impressed and not co-engaged - * - * @param coengagedPairs co-engaged tweet pairs, TypedPipe of (user, queryFeaturedTweet, candidateFeaturedTweet, label) - * @param coimpressedPairs co-impressed tweet pairs, TypedPipe of (user, queryFeaturedTweet, candidateFeaturedTweet, label) - * - * @return labelled tweet pairs, TypedPipe of (queryFeaturedTweet, candidateFeaturedTweet, label) tuples - */ - def computeLabelledTweetPairs( - coengagedPairs: TypedPipe[(UserId, FeaturedTweet, FeaturedTweet, Boolean)], - coimpressedPairs: TypedPipe[(UserId, FeaturedTweet, FeaturedTweet, Boolean)] - ): TypedPipe[(FeaturedTweet, FeaturedTweet, Boolean)] = { - (coengagedPairs ++ coimpressedPairs) - .groupBy { - case (userId, queryFeaturedTweet, candidateFeaturedTweet, _) => - (userId, queryFeaturedTweet.tweet, candidateFeaturedTweet.tweet) - } - // consolidate all the labelled pairs into one with the max label - // (label order: co-engagement = true > co-impression = false) - .maxBy { - case (_, _, _, label) => label - } - .values - .map { case (_, queryTweet, candidateTweet, label) => (queryTweet, candidateTweet, label) } - } - - /** - * Get a balanced-class sampling of tweet pairs. - * For each query tweet, we make sure the numbers of positives and negatives are equal. 
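- * (Worked example with illustrative numbers: a query tweet with 120 positive
- * and 35 negative pairs, under maxSamplesPerClass = 100, keeps
- * min(120, 35, 100) = 35 pairs from each class.)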
- * - * @param labelledPairs labelled tweet pairs, TypedPipe of (queryFeaturedTweet, candidateFeaturedTweet, label) tuples - * @param maxSamplesPerClass max number of samples per class - * - * @return sampled labelled pairs after balanced-class sampling - */ - def getQueryTweetBalancedClassPairs( - labelledPairs: TypedPipe[(FeaturedTweet, FeaturedTweet, Boolean)], - maxSamplesPerClass: Int - ): TypedPipe[(FeaturedTweet, FeaturedTweet, Boolean)] = { - val queryTweetToSampleCount = labelledPairs - .map { - case (queryTweet, _, label) => - if (label) (queryTweet.tweet, (1, 0)) else (queryTweet.tweet, (0, 1)) - } - .sumByKey - .map { - case (queryTweet, (posCount, negCount)) => - (queryTweet, Math.min(Math.min(posCount, negCount), maxSamplesPerClass)) - } - - labelledPairs - .groupBy { case (queryTweet, _, _) => queryTweet.tweet } - .join(queryTweetToSampleCount) - .values - .map { - case ((queryTweet, candidateTweet, label), samplePerClass) => - ((queryTweet.tweet, label, samplePerClass), (queryTweet, candidateTweet, label)) - } - .group - .mapGroup { - case ((_, _, samplePerClass), iter) => - val random = new Random(123L) - val sampler = - new ReservoirSampler[(FeaturedTweet, FeaturedTweet, Boolean)](samplePerClass, random) - iter.foreach { pair => sampler.sampleItem(pair) } - sampler.sample.toIterator - } - .values - } - - /** - * Given a user fav dataset, computes the similarity scores (based on engagers) between every tweet pairs - * - * @param events user fav events, a TypedPipe of (userid, tweetid, timestamp) tuples - * @param minInDegree min number of engagement count for the tweets - * @param coengagementTimeframe two tweets will be considered co-engaged if they are fav-ed within coengagementTimeframe - * - * @return tweet similarity based on engagers, a TypedPipe of (tweet1, tweet2, similarity_score) tuples - **/ - def getScoredCoengagedTweetPairs( - events: TypedPipe[(UserId, TweetId, Timestamp)], - minInDegree: Int, - coengagementTimeframe: Long - )( - ): TypedPipe[(TweetId, TweetWithScore)] = { - - // compute tweet norms (based on engagers) - // only keep tweets whose indegree >= minInDegree - val tweetNorms = events - .map { case (_, tweetId, _) => (tweetId, 1.0) } - .sumByKey //the number of engagers per tweetId - .filter(_._2 >= minInDegree) - .mapValues(math.sqrt) - - val edgesWithWeight = events - .map { - case (userId, tweetId, eventTime) => (tweetId, (userId, eventTime)) - } - .join(tweetNorms) - .map { - case (tweetId, ((userId, eventTime), norm)) => - (userId, Seq((tweetId, eventTime, 1 / norm))) - } - - // get cosine similarity - val tweetPairsWithWeight = edgesWithWeight.sumByKey - .flatMap { - case (_, tweets) if tweets.size > 1 => - allUniquePairs(tweets).flatMap { - case ((tweetId1, eventTime1, weight1), (tweetId2, eventTime2, weight2)) => - // consider only co-engagement happened within the given timeframe - if ((eventTime1 - eventTime2).abs <= coengagementTimeframe) { - if (tweetId1 > tweetId2) // each worker generate allUniquePairs in different orders, hence should standardize the pairs - Some(((tweetId2, tweetId1), weight1 * weight2)) - else - Some(((tweetId1, tweetId2), weight1 * weight2)) - } else { - None - } - case _ => - None - } - case _ => Nil - } - tweetPairsWithWeight.sumByKey - .flatMap { - case ((tweetId1, tweetId2), weight) => - Seq( - (tweetId1, TweetWithScore(tweetId2, weight)), - (tweetId2, TweetWithScore(tweetId1, weight)) - ) - case _ => Nil - } - } - - /** - * Get the write exec for per-query stats - * - * @param tweetPairs input dataset - * 
@param outputPath output path for the per-query stats
-   * @param identifier identifier for the tweetPairs dataset
-   *
-   * @return the combined write execution
-   */
-  def getPerQueryStatsExec(
-    tweetPairs: TypedPipe[(FeaturedTweet, FeaturedTweet, Boolean)],
-    outputPath: String,
-    identifier: String
-  ): Execution[Unit] = {
-    val queryTweetsToCounts = tweetPairs
-      .map {
-        case (queryTweet, _, label) =>
-          if (label) (queryTweet.tweet, (1, 0)) else (queryTweet.tweet, (0, 1))
-      }
-      .sumByKey
-      .map { case (queryTweet, (posCount, negCount)) => (queryTweet, posCount, negCount) }
-
-    Execution
-      .zip(
-        queryTweetsToCounts.writeExecution(
-          TypedTsv[(TweetId, Int, Int)](s"${outputPath}_$identifier")),
-        Util.printSummaryOfNumericColumn(
-          queryTweetsToCounts
-            .map { case (_, posCount, _) => posCount },
-          Some(s"Per-query Positive Count ($identifier)")),
-        Util.printSummaryOfNumericColumn(
-          queryTweetsToCounts
-            .map { case (_, _, negCount) => negCount },
-          Some(s"Per-query Negative Count ($identifier)"))
-      ).unit
-  }
-
-  /**
-   * Get the top K similar tweets key-val dataset
-   *
-   * @param allTweetPairs all tweet pairs with their similarity scores
-   * @param k the maximum number of top results for each tweet
-   *
-   * @return key-val top K results for each tweet
-   */
-  def getKeyValTopKSimilarTweets(
-    allTweetPairs: TypedPipe[(TweetId, TweetWithScore)],
-    k: Int
-  )(
-  ): TypedPipe[(TweetId, TweetsWithScore)] = {
-    allTweetPairs.group
-      .sortedReverseTake(k)(Ordering.by(_.score))
-      .map { case (tweetId, tweetWithScoreSeq) => (tweetId, TweetsWithScore(tweetWithScoreSeq)) }
-  }
-
-  /**
-   * Get the top K similar tweets dataset.
-   *
-   * @param allTweetPairs all tweet pairs with their similarity scores
-   * @param k the maximum number of top results for each tweet
-   *
-   * @return top K results for each tweet
-   */
-  def getTopKSimilarTweets(
-    allTweetPairs: TypedPipe[(TweetId, TweetWithScore)],
-    k: Int
-  )(
-  ): TypedPipe[TweetTopKTweetsWithScore] = {
-    allTweetPairs.group
-      .sortedReverseTake(k)(Ordering.by(_.score))
-      .map {
-        case (tweetId, tweetWithScoreSeq) =>
-          TweetTopKTweetsWithScore(tweetId, TweetsWithScore(tweetWithScoreSeq))
-      }
-  }
-
-  /**
-   * Given an input sequence, output all unique pairs in this sequence.
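-   * e.g. allUniquePairs(Seq(1, 2, 3)) yields (1,2), (1,3), (2,3).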
-   */
-  def allUniquePairs[T](input: Seq[T]): Stream[(T, T)] = {
-    input match {
-      case Nil => Stream.empty
-      case seq =>
-        seq.tail.toStream.map(a => (seq.head, a)) #::: allUniquePairs(seq.tail)
-    }
-  }
-}
diff --git a/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/UnhydratedPairsCollectionJob.docx b/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/UnhydratedPairsCollectionJob.docx
new file mode 100644
index 000000000..b13a01e6e
Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/UnhydratedPairsCollectionJob.docx differ
diff --git a/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/UnhydratedPairsCollectionJob.scala b/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/UnhydratedPairsCollectionJob.scala
deleted file mode 100644
index 626cc35a8..000000000
--- a/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/UnhydratedPairsCollectionJob.scala
+++ /dev/null
@@ -1,209 +0,0 @@
-package com.twitter.simclusters_v2.scalding.tweet_similarity
-
-import com.twitter.ads.dataservice_account.snapshot.jobs.DbSnapshotsPromotedTweetsScalaDataset
-import com.twitter.conversions.DurationOps._
-import com.twitter.dal.client.dataset.TimePartitionedDALDataset
-import com.twitter.scalding._
-import com.twitter.scalding_internal.dalv2.DAL
-import com.twitter.scalding_internal.dalv2.DALWrite._
-import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation
-import com.twitter.scalding_internal.dalv2.remote_access.ProcrevAtla
-import com.twitter.scalding_internal.job.TwitterExecutionApp
-import com.twitter.simclusters_v2.scalding.tweet_similarity.TweetPairLabelCollectionUtil.MaxFavPerUser
-import com.twitter.simclusters_v2.thriftscala.LabelledTweetPairs
-import com.twitter.simclusters_v2.thriftscala.{FeaturedTweet => FeaturedTweetThrift}
-import com.twitter.wtf.scalding.jobs.common.ScheduledExecutionApp
-import java.util.TimeZone
-
-/**
- * Collect unhydrated training pairs for supervised tweet similarity.
- * Here are the steps for this job:
- * 1) Consider non-promoted tweets that were created within the given lookback window
- * 2) From the tweets in 1), get co-engaged pairs
- * 3) Take all tweets appearing in 2), and get co-impressed pairs. Note that we take all tweets (not tweet pairs) in 2).
- * That is, a co-impressed pair (t1, t2) will be considered iff t1 appears in 2) and t2 appears in 2).
- * (t1, t2) doesn't need to appear as a pair in 2).
- * 4) Compute labels from the co-engaged pairs and co-impressed pairs.
- * A pair is labelled true if its user co-engaged with it, and false otherwise.
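- * (Illustrative example: if a user favs t1 and t2 within the timeframe,
- * (t1, t2) is labelled true; if the same user merely saw t1 and t3 together
- * without faving both, (t1, t3) is labelled false.)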
- */ -object UnhydratedPairsCollectionJob { - //tweets have to be created within dateRange - lookbackdays in order to be considered - val LookbackDays = 2 - - def getLabelledPairs( - dateRange: DateRange, - timeframe: Long, - maxSamplesPerClass: Int, - dalDataset: TimePartitionedDALDataset[LabelledTweetPairs], - outputPath: String - )( - implicit timeZone: TimeZone - ): Execution[Unit] = { - - val promotedTweets = DAL - .readMostRecentSnapshot(DbSnapshotsPromotedTweetsScalaDataset, dateRange) - .withRemoteReadPolicy(ExplicitLocation(ProcrevAtla)) - .toTypedPipe - - val tweetAuthorPairs = - TweetPairLabelCollectionUtil.getTweetAuthorPairs(dateRange.prepend(Days(LookbackDays))) - - val tweets = - TweetPairLabelCollectionUtil.getNonPromotedTweets(promotedTweets, tweetAuthorPairs.keys) - - val coengagedPairs = TweetPairLabelCollectionUtil.getCoengagedPairs( - TweetPairLabelCollectionUtil.getFavEvents(dateRange, MaxFavPerUser), - tweets, - timeframe) - - val engagedTweets = coengagedPairs.map { - // Consider only query tweet b/c coengagedPairs contains both (t1,t2) and (t2,t1) - case (_, queryFeaturedTweet, _, _) => queryFeaturedTweet.tweet - }.distinct - - val coimpressedPairs = TweetPairLabelCollectionUtil - .getCoimpressedPairs( - TweetPairLabelCollectionUtil.getImpressionEvents(dateRange), - engagedTweets, - timeframe) - - val rawLabelledPairs = - TweetPairLabelCollectionUtil.computeLabelledTweetPairs(coengagedPairs, coimpressedPairs) - - val labelledPairs = - if (maxSamplesPerClass > 0) - TweetPairLabelCollectionUtil.getQueryTweetBalancedClassPairs( - rawLabelledPairs, - maxSamplesPerClass) - else - rawLabelledPairs - - val perQueryStatsExec = - if (maxSamplesPerClass > 0) { - Execution - .zip( - TweetPairLabelCollectionUtil - .getPerQueryStatsExec(rawLabelledPairs, s"$outputPath/per_query_stats", "raw"), - TweetPairLabelCollectionUtil - .getPerQueryStatsExec(labelledPairs, s"$outputPath/per_query_stats", "final") - ).unit - } else { - TweetPairLabelCollectionUtil.getPerQueryStatsExec( - labelledPairs, - s"$outputPath/per_query_stats", - "final") - } - - Execution - .zip( - labelledPairs - .map { - case (queryFeaturedTweet, candidateFeaturedTweet, label) => - LabelledTweetPairs( - FeaturedTweetThrift( - tweetId = queryFeaturedTweet.tweet, - timestamp = queryFeaturedTweet.timestamp), - FeaturedTweetThrift( - tweetId = candidateFeaturedTweet.tweet, - timestamp = candidateFeaturedTweet.timestamp), - label - ) - } - .writeDALExecution(dalDataset, D.Daily, D.Suffix(outputPath), D.EBLzo())(dateRange), - perQueryStatsExec - ).unit - } -} - -/** To run: - * scalding remote run --target src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity:unhydrated_pair_collection-adhoc \ - --user cassowary \ - --submitter hadoopnest2.atla.twitter.com \ - --hadoop-properties "mapreduce.reduce.java.opts=-Xmx8000m mapreduce.reduce.memory.mb=8000 scalding.with.reducers.set.explicitly=true mapreduce.job.reduces=2000 mapreduce.task.timeout=0" \ - --main-class com.twitter.simclusters_v2.scalding.tweet_similarity.UnhydratedPairsCollectionAdhocApp -- \ - --date 2020-03-04 \ - --output_path /user/cassowary/adhoc/unhydrated_pairs/2020-03-04_class_balanced \ - --samples_per_query_tweet_class 2000 - * */ -object UnhydratedPairsCollectionAdhocApp extends TwitterExecutionApp { - implicit val timeZone: TimeZone = DateOps.UTC - implicit val dateParser: DateParser = DateParser.default - - override def job: Execution[Unit] = - Execution.withId { implicit uniqueId => - Execution.withArgs { args: Args => - implicit val 
dateRange: DateRange = DateRange.parse(args.list("date")) - val maxSamplesPerClass: Int = args.int("samples_per_query_tweet_class", default = 2000) - val timeframe: Int = 30 - val outputPath: String = s"${args("output_path")}_${timeframe}min" - - UnhydratedPairsCollectionJob.getLabelledPairs( - dateRange, - timeframe.minute.inMilliseconds, - maxSamplesPerClass, - TweetSimilarityUnhydratedPairs30MinScalaDataset, - outputPath - ) - } - } -} - -/** -capesospy-v2 update --build_locally --start_cron \ -unhydrated_pair_collection_30min src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml - */ -object UnhydratedPairsCollection30MinScheduledApp extends ScheduledExecutionApp { - - override def batchIncrement: Duration = Hours(24) - override def firstTime: RichDate = RichDate("2020-03-26") - - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - val maxSamplesPerClass: Int = args.int("samples_per_query_tweet_class", default = 2000) - val timeframe: Int = 30 - val outputPath: String = - s"/user/cassowary/processed/tweet_similarity/unhydrated_pairs_${timeframe}min" - - UnhydratedPairsCollectionJob.getLabelledPairs( - dateRange, - timeframe.minute.inMilliseconds, - maxSamplesPerClass, - TweetSimilarityUnhydratedPairs30MinScalaDataset, - outputPath) - } -} - -/** -capesospy-v2 update --build_locally --start_cron \ -unhydrated_pair_collection_120min src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml - */ -object UnhydratedPairsCollection120MinScheduledApp extends ScheduledExecutionApp { - - override def batchIncrement: Duration = Hours(24) - override def firstTime: RichDate = RichDate("2020-03-26") - - override def runOnDateRange( - args: Args - )( - implicit dateRange: DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - val maxSamplesPerClass: Int = args.int("samples_per_query_tweet_class", default = 2000) - val timeframe: Int = 120 - val outputPath: String = - s"/user/cassowary/processed/tweet_similarity/unhydrated_pairs_${timeframe}min" - - UnhydratedPairsCollectionJob.getLabelledPairs( - dateRange, - timeframe.minute.inMilliseconds, - maxSamplesPerClass, - TweetSimilarityUnhydratedPairs120MinScalaDataset, - outputPath) - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/evaluation/BUILD.bazel b/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/evaluation/BUILD.bazel deleted file mode 100644 index e231ab769..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/evaluation/BUILD.bazel +++ /dev/null @@ -1,40 +0,0 @@ -scala_library( - platform = "java8", - tags = ["bazel-only"], - dependencies = [ - "3rdparty/jvm/com/twitter/storehaus:algebra", - "3rdparty/jvm/com/twitter/storehaus:core", - "snowflake:id", - "src/java/com/twitter/ml/api/constant", - "src/scala/com/twitter/ml/api:api-base", - "src/scala/com/twitter/rux/landing_page/data_pipeline:labeled_rux_service_scribe-scala", - "src/scala/com/twitter/rux/landing_page/data_pipeline:landing_page_labeled_data_record-java", - "src/scala/com/twitter/scalding_internal/dalv2", - "src/scala/com/twitter/scalding_internal/dalv2/dataset", - "src/scala/com/twitter/scalding_internal/job", - "src/scala/com/twitter/scalding_internal/job/analytics_batch", - "src/scala/com/twitter/scalding_internal/source", - "src/scala/com/twitter/scalding_internal/source/lzo_scrooge", - "src/scala/com/twitter/simclusters_v2/common", - 
"src/scala/com/twitter/simclusters_v2/scalding", - "src/scala/com/twitter/simclusters_v2/scalding/common", - "src/scala/com/twitter/simclusters_v2/summingbird/common", - "src/scala/com/twitter/wtf/scalding/jobs/common:ddg_util", - "twml/runtime/src/main/scala/com/twitter/twml/runtime/scalding", - ], -) - -hadoop_binary( - name = "rux_landing_ddg_analysis-adhoc", - main = "com.twitter.simclusters_v2.scalding.tweet_similarity.evaluation.RUXLandingDdgAnalysisAdhocApp", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":evaluation", - ], -) diff --git a/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/evaluation/BUILD.docx b/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/evaluation/BUILD.docx new file mode 100644 index 000000000..f7f6404ba Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/evaluation/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/evaluation/ModelEvalAdhocApp.docx b/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/evaluation/ModelEvalAdhocApp.docx new file mode 100644 index 000000000..3e1896c87 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/evaluation/ModelEvalAdhocApp.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/evaluation/ModelEvalAdhocApp.scala b/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/evaluation/ModelEvalAdhocApp.scala deleted file mode 100644 index e0d848f95..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/evaluation/ModelEvalAdhocApp.scala +++ /dev/null @@ -1,91 +0,0 @@ -package com.twitter.simclusters_v2.scalding.tweet_similarity.evaluation - -import com.twitter.ml.api.Feature.Continuous -import com.twitter.ml.api.DailySuffixFeatureSource -import com.twitter.ml.api.DataSetPipe -import com.twitter.ml.api.RichDataRecord -import com.twitter.scalding._ -import com.twitter.scalding.typed.TypedPipe -import com.twitter.scalding_internal.job.TwitterExecutionApp -import com.twitter.simclusters_v2.tweet_similarity.TweetSimilarityFeatures -import com.twitter.twml.runtime.scalding.TensorflowBatchPredictor -import java.util.TimeZone - -/** - * Scalding execution app for scoring a Dataset against an exported Tensorflow model. - -** Arguments: - * dataset_path - Path for the dataset on hdfs - * date - Date for the dataset paths, required if Daily dataset. - * model_source - Path of the exported model on HDFS. Must start with hdfs:// scheme. 
- * output_path - Path of the output result file - -scalding remote run --target src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity:model_eval-adhoc \ ---user cassowary \ ---submitter hadoopnest2.atla.twitter.com \ ---main-class com.twitter.simclusters_v2.scalding.tweet_similarity.ModelEvalAdhocApp -- \ ---date 2020-02-19 \ ---dataset_path /user/cassowary/adhoc/training_data/2020-02-19_class_balanced/test \ ---model_path hdfs:///user/cassowary/tweet_similarity/2020-02-07-15-20-15/exported_models/1581253926 \ ---output_path /user/cassowary/adhoc/training_data/2020-02-19_class_balanced/test/prediction_v1 - **/ -object ModelEvalAdhocApp extends TwitterExecutionApp { - implicit val timeZone: TimeZone = DateOps.UTC - implicit val dateParser: DateParser = DateParser.default - - /** - * Get predictor for the given model path - * @param modelName name of the model - * @param modelSource path of the exported model on HDFS. Must start with hdfs:// scheme. - * @return - */ - def getPredictor(modelName: String, modelSource: String): TensorflowBatchPredictor = { - val defaultInputNode = "request:0" - val defaultOutputNode = "response:0" - TensorflowBatchPredictor(modelName, modelSource, defaultInputNode, defaultOutputNode) - } - - /** - * Given input pipe and predictor, return the predictions in TypedPipe - * @param dataset dataset for prediction - * @param batchPredictor predictor - * @return - */ - def getPrediction( - dataset: DataSetPipe, - batchPredictor: TensorflowBatchPredictor - ): TypedPipe[(Long, Long, Boolean, Double, Double)] = { - val featureContext = dataset.featureContext - val predictionFeature = new Continuous("output") - - batchPredictor - .predict(dataset.records) - .map { - case (originalDataRecord, predictedDataRecord) => - val prediction = new RichDataRecord(predictedDataRecord, featureContext) - .getFeatureValue(predictionFeature).toDouble - val richDataRecord = new RichDataRecord(originalDataRecord, featureContext) - ( - richDataRecord.getFeatureValue(TweetSimilarityFeatures.QueryTweetId).toLong, - richDataRecord.getFeatureValue(TweetSimilarityFeatures.CandidateTweetId).toLong, - richDataRecord.getFeatureValue(TweetSimilarityFeatures.Label).booleanValue, - richDataRecord.getFeatureValue(TweetSimilarityFeatures.CosineSimilarity).toDouble, - prediction - ) - } - } - - override def job: Execution[Unit] = - Execution.withId { implicit uniqueId => - Execution.withArgs { args: Args => - implicit val dateRange: DateRange = DateRange.parse(args.list("date")) - val outputPath: String = args("output_path") - val dataset: DataSetPipe = DailySuffixFeatureSource(args("dataset_path")).read - val modelSource: String = args("model_path") - val modelName: String = "tweet_similarity" - - getPrediction(dataset, getPredictor(modelName, modelSource)) - .writeExecution(TypedTsv[(Long, Long, Boolean, Double, Double)](outputPath)) - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/evaluation/RUXLandingDdgAnalysisAdhocApp.docx b/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/evaluation/RUXLandingDdgAnalysisAdhocApp.docx new file mode 100644 index 000000000..19ff39beb Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/evaluation/RUXLandingDdgAnalysisAdhocApp.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/evaluation/RUXLandingDdgAnalysisAdhocApp.scala b/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/evaluation/RUXLandingDdgAnalysisAdhocApp.scala 
deleted file mode 100644 index 8cb575ee5..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/evaluation/RUXLandingDdgAnalysisAdhocApp.scala +++ /dev/null @@ -1,82 +0,0 @@ -package com.twitter.simclusters_v2.scalding.tweet_similarity.evaluation - -import com.twitter.rux.landing_page.data_pipeline.LabeledRuxServiceScribeScalaDataset -import com.twitter.rux.landing_page.data_pipeline.thriftscala.LandingPageLabel -import com.twitter.rux.service.thriftscala.FocalObject -import com.twitter.rux.service.thriftscala.UserContext -import com.twitter.scalding._ -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.scalding_internal.job.TwitterExecutionApp -import com.twitter.simclusters_v2.common.TweetId -import com.twitter.simclusters_v2.common.UserId -import com.twitter.wtf.scalding.jobs.common.DDGUtil -import java.util.TimeZone - -/** To run: -scalding remote run --target src/scala/com/twitter/simclusters_v2/scalding/tweet_similarity/evaluation:rux_landing_ddg_analysis-adhoc \ ---user cassowary \ ---submitter hadoopnest2.atla.twitter.com \ ---main-class com.twitter.simclusters_v2.scalding.tweet_similarity.evaluation.RUXLandingDdgAnalysisAdhocApp -- \ ---date 2020-04-06 2020-04-13 \ ---ddg model_based_tweet_similarity_10254 \ ---version 1 \ ---output_path /user/cassowary/adhoc/ddg10254 - * */ -object RUXLandingDdgAnalysisAdhocApp extends TwitterExecutionApp { - override def job: Execution[Unit] = - Execution.withId { implicit uniqueId => - Execution.withArgs { args: Args => - implicit val timeZone: TimeZone = DateOps.UTC - implicit val dateParser: DateParser = DateParser.default - implicit val dateRange: DateRange = DateRange.parse(args.list("date")) - val ddgName: String = args("ddg") - val ddgVersion: String = args("version") - val outputPath: String = args("output_path") - val now = RichDate.now - - val ruxLabels = getLabeledRuxServiceScribe(dateRange).map { - case (userId, focalTweet, candidateTweet, impression, fav) => - userId -> (focalTweet, candidateTweet, impression, fav) - } - - // getUsersInDDG reads from a snapshot dataset. - // Just prepend dateRange so that we can look back far enough to make sure there is data. 
- DDGUtil - .getUsersInDDG(ddgName, ddgVersion.toInt)(DateRange(now - Days(7), now)).map { ddgUser => - ddgUser.userId -> (ddgUser.bucket, ddgUser.enterUserState.getOrElse("no_user_state")) - }.join(ruxLabels) - .map { - case (userId, ((bucket, state), (focalTweet, candidateTweet, impression, fav))) => - (userId, bucket, state, focalTweet, candidateTweet, impression, fav) - } - .writeExecution( - TypedTsv[(UserId, String, String, TweetId, TweetId, Int, Int)](s"$outputPath")) - } - } - - def getLabeledRuxServiceScribe( - dateRange: DateRange - ): TypedPipe[(UserId, TweetId, TweetId, Int, Int)] = { - DAL - .read(LabeledRuxServiceScribeScalaDataset, dateRange) - .toTypedPipe.map { record => - ( - record.ruxServiceScribe.userContext, - record.ruxServiceScribe.focalObject, - record.landingPageLabel) - }.flatMap { - case ( - Some(UserContext(Some(userId), _, _, _, _, _, _, _)), - Some(FocalObject.TweetId(tweet)), - Some(labels)) => - labels.map { - case LandingPageLabel.LandingPageFavoriteEvent(favEvent) => - // (focal tweet, impressed tweet, impression, fav) - (userId, tweet, favEvent.tweetId, 0, 1) - case LandingPageLabel.LandingPageImpressionEvent(impressionEvent) => - (userId, tweet, impressionEvent.tweetId, 1, 0) - } - case _ => Nil - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/update_known_for/BUILD.bazel b/src/scala/com/twitter/simclusters_v2/scalding/update_known_for/BUILD.bazel deleted file mode 100644 index d196a0d99..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/update_known_for/BUILD.bazel +++ /dev/null @@ -1,59 +0,0 @@ -scala_library( - sources = [ - "*.scala", - ], - platform = "java8", - tags = [ - "bazel-compatible", - "bazel-only", - ], - dependencies = [ - "src/java/com/twitter/sbf/core", - "src/java/com/twitter/sbf/graph", - "src/scala/com/twitter/simclusters_v2/hdfs_sources", - "src/scala/com/twitter/simclusters_v2/hdfs_sources:simclusters_v2_embeddings_lite-scala", - "src/scala/com/twitter/simclusters_v2/hdfs_sources/presto_hdfs_sources", - "src/scala/com/twitter/simclusters_v2/scalding", - "src/scala/com/twitter/simclusters_v2/scalding/common", - "src/scala/com/twitter/simclusters_v2/scalding/common/matrix", - "src/scala/com/twitter/wtf/entity_real_graph/common", - "src/scala/com/twitter/wtf/entity_real_graph/scalding/common", - "src/scala/com/twitter/wtf/scalding/jobs/common:execution_app", - "src/scala/com/twitter/wtf/scalding/jobs/common:sources", - "src/scala/com/twitter/wtf/scalding/jobs/common:stats_util", - "src/thrift/com/twitter/recos/entities:entities-thrift-scala", - "src/thrift/com/twitter/wtf/entity_real_graph:entity_real_graph-thrift-scala", - "usersource/snapshot/src/main/scala/com/twitter/usersource/snapshot/flat:usersource_flat-scala", - "usersource/snapshot/src/main/thrift/com/twitter/usersource/snapshot/flat:flat-scala", - ], -) - -hadoop_binary( - name = "update_known_for_20m_145k_2020-adhoc", - main = "com.twitter.simclusters_v2.scalding.update_known_for.UpdateKnownFor20M145K2020Adhoc", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":update_known_for", - ], -) - -hadoop_binary( - name = "update_known_for_20m_145k_2020", - main = "com.twitter.simclusters_v2.scalding.update_known_for.UpdateKnownFor20M145K2020", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":update_known_for", - ], -) diff
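A side note on the getLabeledRuxServiceScribe flatMap above: each scribe record fans out into one (impression, fav) indicator row per landing-page event. A simplified sketch of that shape, using hypothetical stand-ins for the rux thrift types (illustration only):

object RuxLabelRowsSketch {
  // Simplified, hypothetical stand-ins for the rux thrift types.
  sealed trait PageLabel
  final case class FavEvent(tweetId: Long) extends PageLabel
  final case class ImpressionEvent(tweetId: Long) extends PageLabel
  final case class ScribeRecord(
    userId: Option[Long],
    focalTweet: Option[Long],
    labels: Option[Seq[PageLabel]])

  // One record becomes one indicator row per event; malformed records drop out.
  def toRows(r: ScribeRecord): Seq[(Long, Long, Long, Int, Int)] =
    (r.userId, r.focalTweet, r.labels) match {
      case (Some(user), Some(focal), Some(labels)) =>
        labels.map {
          case FavEvent(t)        => (user, focal, t, 0, 1) // fav row
          case ImpressionEvent(t) => (user, focal, t, 1, 0) // impression row
        }
      case _ => Nil
    }
}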
--git a/src/scala/com/twitter/simclusters_v2/scalding/update_known_for/BUILD.docx b/src/scala/com/twitter/simclusters_v2/scalding/update_known_for/BUILD.docx new file mode 100644 index 000000000..7bb29da23 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/update_known_for/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/update_known_for/UpdateKnownFor20M145K2020.docx b/src/scala/com/twitter/simclusters_v2/scalding/update_known_for/UpdateKnownFor20M145K2020.docx new file mode 100644 index 000000000..6d42f7864 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/update_known_for/UpdateKnownFor20M145K2020.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/update_known_for/UpdateKnownFor20M145K2020.scala b/src/scala/com/twitter/simclusters_v2/scalding/update_known_for/UpdateKnownFor20M145K2020.scala deleted file mode 100644 index 07f070592..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/update_known_for/UpdateKnownFor20M145K2020.scala +++ /dev/null @@ -1,256 +0,0 @@ -package com.twitter.simclusters_v2.scalding.update_known_for - -import com.twitter.bijection.scrooge.BinaryScalaCodec -import com.twitter.hermit.candidate.thriftscala.Candidates -import com.twitter.logging.Logger -import com.twitter.pluck.source.cassowary.FollowingsCosineSimilaritiesManhattanSource -import com.twitter.scalding.typed.TypedPipe -import com.twitter.scalding.DateOps -import com.twitter.scalding.DateParser -import com.twitter.scalding.Days -import com.twitter.scalding.Execution -import com.twitter.scalding.RichDate -import com.twitter.scalding.TypedTsv -import com.twitter.scalding.UniqueID -import com.twitter.scalding._ -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.scalding_internal.dalv2.DALWrite.D -import com.twitter.scalding_internal.dalv2.DALWrite._ -import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossClusterSameDC -import com.twitter.scalding_internal.job.TwitterExecutionApp -import com.twitter.simclusters_v2.common.ClusterId -import com.twitter.simclusters_v2.common.ModelVersions -import com.twitter.simclusters_v2.common.UserId -import com.twitter.simclusters_v2.hdfs_sources.AdhocKeyValSources -import com.twitter.simclusters_v2.hdfs_sources.InternalDataPaths -import com.twitter.simclusters_v2.hdfs_sources.SimclustersV2KnownFor20M145KDec11ScalaDataset -import com.twitter.simclusters_v2.hdfs_sources.SimclustersV2KnownFor20M145KUpdatedScalaDataset -import com.twitter.simclusters_v2.hdfs_sources.SimclustersV2RawKnownFor20M145K2020ScalaDataset -import com.twitter.simclusters_v2.scalding.KnownForSources -import com.twitter.simclusters_v2.scalding.KnownForSources.fromKeyVal -import com.twitter.simclusters_v2.scalding.common.Util -import com.twitter.wtf.scalding.jobs.common.ScheduledExecutionApp -import java.util.TimeZone - -/** - * Scheduled job - * - * capesospy-v2 update --build_locally --start_cron update_known_for_20m_145k_2020 \ - * src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml - */ - -object UpdateKnownFor20M145K2020 extends ScheduledExecutionApp { - - override val firstTime: RichDate = RichDate("2020-10-04") - - override val batchIncrement: Duration = Days(7) - - private val tempLocationPath = "/user/cassowary/temp/simclusters_v2/known_for_20m_145k_2020" - - private val simsGraphPath = - "/atla/proc/user/cassowary/manhattan_sequence_files/approximate_cosine_similarity_follow" - - override def runOnDateRange( - args: Args - )( - implicit dateRange: 
DateRange, - timeZone: TimeZone, - uniqueID: UniqueID - ): Execution[Unit] = { - - Execution.getConfigMode.flatMap { - case (_, mode) => - implicit def valueCodec: BinaryScalaCodec[Candidates] = BinaryScalaCodec(Candidates) - // Step - 1 (DataProcessing): Parameters for getting mapped indices for user-ids - val minActiveFollowers = args.int("minActiveFollowers", 400) - val topK = args.int("topK", 20000000) - - // Step - 2 (DataProcessing): Parameters to remove users not in the topK most followed users from simsGraph - val maxNeighbors = args.int("maxNeighbors", 400) - - // Step - 3 (Final Clustering): Parameters to run the clustering algorithm - /* squareWeightsEnable is a boolean flag that changes the edge weights obtained from the - underlying sims graph - a) If false - edge weight between two neighbors is just their cosine similarity. - b) If true - edge weight = cosine_sim * cosine_sim * 10. The squaring makes the higher - weight edges relatively more important; this is based on the intuition that a neighbor - with cosine similarity of 0.1 is more than twice as important as a neighbor with - cosine similarity of 0.05. The multiplication with 10 brings the weights back into a - 'nicer' range since squaring will reduce their absolute value. - */ - val squareWeightsEnable = args.boolean("squareWeightsEnable") - - val maxEpochsForClustering = args.int("maxEpochs", 3) - val wtCoeff = args.double("wtCoeff", 10.0) - - val previousKnownFor: TypedPipe[(UserId, Array[(ClusterId, Float)])] = - fromKeyVal( - DAL - .readMostRecentSnapshot( - SimclustersV2RawKnownFor20M145K2020ScalaDataset, - dateRange.embiggen(Days(30))) - .withRemoteReadPolicy(AllowCrossClusterSameDC) - .toTypedPipe, - ModelVersions.Model20M145K2020 - ) - - UpdateKnownForSBFRunner - .runUpdateKnownFor( - TypedPipe - .from(FollowingsCosineSimilaritiesManhattanSource(simsGraphPath)) - .map(_._2), - minActiveFollowers, - topK, - maxNeighbors, - tempLocationPath, - previousKnownFor, - maxEpochsForClustering, - squareWeightsEnable, - wtCoeff, - mode - ) - .flatMap { updateKnownFor => - Execution - .zip( - KnownForSources - .toKeyVal(updateKnownFor, ModelVersions.Model20M145K2020) - .writeDALVersionedKeyValExecution( - SimclustersV2RawKnownFor20M145K2020ScalaDataset, - D.Suffix(InternalDataPaths.RawKnownFor2020Path) - ), - UpdateKnownForSBFRunner - .evaluateUpdatedKnownFor(updateKnownFor, previousKnownFor) - .flatMap { emailText => - Util - .sendEmail( - emailText, - s"Change in cluster assignments for new KnownFor ModelVersion: 20M145K2020", - "no-reply@twitter.com") - Execution.unit - } - ).unit - } - } - } -} -/* -knownFor Week-1: -scalding remote run \ ---target src/scala/com/twitter/simclusters_v2/scalding/update_known_for:update_known_for_20m_145k_2020-adhoc \ ---main-class com.twitter.simclusters_v2.scalding.update_known_for.UpdateKnownFor20M145K2020Adhoc \ ---submitter atla-aor-08-sr1 --user cassowary \ ---submitter-memory 128192.megabyte --hadoop-properties "mapreduce.map.memory.mb=8192 mapreduce.map.java.opts='-Xmx7618M' mapreduce.reduce.memory.mb=8192 mapreduce.reduce.java.opts='-Xmx7618M'" \ --- \ ---date 2020-08-30 --maxNeighbors 100 --minActiveFollowers 400 --topK 20000000 --numNodesPerCommunity 200 --maxEpochs 4 --squareWeightsEnable --wtCoeff 10.0 \ ---inputSimsDir /atla/proc/user/cassowary/manhattan_sequence_files/approximate_cosine_similarity_follow \ ---outputClusterDir /user/cassowary/adhoc/your_ldap/simclusters/clustering_outputs/output_clustering_assignments_2020_readAgain_v4_week_1 - -knownFor Week-2: -scalding
remote run \ ---target src/scala/com/twitter/simclusters_v2/scalding/update_known_for:update_known_for_20m_145k_2020-adhoc \ ---main-class com.twitter.simclusters_v2.scalding.update_known_for.UpdateKnownFor20M145K2020Adhoc \ ---submitter atla-aor-08-sr1 --user cassowary \ ---submitter-memory 128192.megabyte --hadoop-properties "mapreduce.map.memory.mb=8192 mapreduce.map.java.opts='-Xmx7618M' mapreduce.reduce.memory.mb=8192 mapreduce.reduce.java.opts='-Xmx7618M'" \ --- \ ---date 2020-08-30 --maxNeighbors 100 --minActiveFollowers 400 --topK 20000000 --numNodesPerCommunity 200 --maxEpochs 4 --squareWeightsEnable --wtCoeff 10.0 \ ---inputSimsDir /atla/proc/user/cassowary/manhattan_sequence_files/approximate_cosine_similarity_follow \ ---inputPreviousKnownForDataSet /user/cassowary/adhoc/your_ldap/simclusters/clustering_outputs/output_clustering_assignments_2020_readAgain_v4_week_1_KeyVal \ ---outputClusterDir /user/cassowary/adhoc/your_ldap/simclusters/clustering_outputs/output_clustering_assignments_2020_readAgain_v4_week_2 - */ - -object UpdateKnownFor20M145K2020Adhoc extends TwitterExecutionApp { - implicit val tz: java.util.TimeZone = DateOps.UTC - implicit val dp = DateParser.default - val log = Logger() - - def job: Execution[Unit] = - Execution.getConfigMode.flatMap { - case (config, mode) => - Execution.withId { implicit uniqueId => - val args = config.getArgs - - implicit def valueCodec: BinaryScalaCodec[Candidates] = BinaryScalaCodec(Candidates) - // Step - 1 (DataProcessing): Parameters for getting mapped indices for user-ids - val minActiveFollowers = args.int("minActiveFollowers", 400) - val topK = args.int("topK", 20000000) - - // Step - 2 (DataProcessing): Parameters to remove users not in the topK most followed users from simsGraph - val clusterAssignmentOutput = args("outputClusterDir") - val maxNeighbors = args.int("maxNeighbors", 400) - - // Step - 3 (Final Clustering): Parameters to run the clustering algorithm - val squareWeightsEnable = args.boolean("squareWeightsEnable") - - val maxEpochsForClustering = args.int("maxEpochs", 3) - val wtCoeff = args.double("wtCoeff", 10.0) - - val simsGraphPath = - "/atla/proc/user/cassowary/manhattan_sequence_files/approximate_cosine_similarity_follow" - // Read in the knownFor dataset, that can be used to initialize the clusters for this week. 
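To make the squareWeightsEnable comment in the scheduled job above concrete, here is the weight transform as a standalone toy sketch (it mirrors the currWt * currWt * 10 logic that attachClusterAssignments applies later in this diff):

object EdgeWeightSketch {
  // Identity weights, or cosine^2 * 10 to make stronger neighbors count more.
  def edgeWeight(cosine: Float, squareWeights: Boolean): Float =
    if (squareWeights) cosine * cosine * 10 else cosine

  def main(args: Array[String]): Unit = {
    // Linearly, 0.1 is 2x the weight of 0.05; after squaring it is 4x.
    println(edgeWeight(0.10f, squareWeights = true)) // ~0.1
    println(edgeWeight(0.05f, squareWeights = true)) // 0.025
  }
}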
- val inputPreviousKnownFor: TypedPipe[(Long, Array[(Int, Float)])] = - args.optional("inputPreviousKnownForDataSet") match { - case Some(inputKnownForDir) => - println( - "Input knownFors provided, using these as the initial cluster assignments for users") - TypedPipe - .from(AdhocKeyValSources.knownForSBFResultsDevelSource(inputKnownForDir)) - case None => - println( - "Using knownFor Assignments from prod as no previous assignment was provided in the input") - if (args.boolean("dec11")) { - KnownForSources - .fromKeyVal( - DAL - .readMostRecentSnapshotNoOlderThan( - SimclustersV2KnownFor20M145KDec11ScalaDataset, - Days(30)).withRemoteReadPolicy(AllowCrossClusterSameDC).toTypedPipe, - ModelVersions.Model20M145KDec11 - ) - } else { - KnownForSources - .fromKeyVal( - DAL - .readMostRecentSnapshotNoOlderThan( - SimclustersV2KnownFor20M145KUpdatedScalaDataset, - Days(30)).withRemoteReadPolicy(AllowCrossClusterSameDC).toTypedPipe, - ModelVersions.Model20M145KUpdated - ) - } - } - UpdateKnownForSBFRunner - .runUpdateKnownFor( - TypedPipe - .from(FollowingsCosineSimilaritiesManhattanSource(simsGraphPath)) - .map(_._2), - minActiveFollowers, - topK, - maxNeighbors, - clusterAssignmentOutput, - inputPreviousKnownFor, - maxEpochsForClustering, - squareWeightsEnable, - wtCoeff, - mode - ) - .flatMap { updateKnownFor => - Execution - .zip( - updateKnownFor - .mapValues(_.toList).writeExecution(TypedTsv(clusterAssignmentOutput)), - updateKnownFor.writeExecution(AdhocKeyValSources.knownForSBFResultsDevelSource( - clusterAssignmentOutput + "_KeyVal")), - UpdateKnownForSBFRunner - .evaluateUpdatedKnownFor(updateKnownFor, inputPreviousKnownFor) - .flatMap { emailText => - Util - .sendEmail( - emailText, - s"Change in cluster assignments for new KnownFor ModelVersion: 20M145K2020" + clusterAssignmentOutput, - "no-reply@twitter.com") - Execution.unit - } - ).unit - } - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scalding/update_known_for/UpdateKnownForSBFRunner.docx b/src/scala/com/twitter/simclusters_v2/scalding/update_known_for/UpdateKnownForSBFRunner.docx new file mode 100644 index 000000000..e68611952 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scalding/update_known_for/UpdateKnownForSBFRunner.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scalding/update_known_for/UpdateKnownForSBFRunner.scala b/src/scala/com/twitter/simclusters_v2/scalding/update_known_for/UpdateKnownForSBFRunner.scala deleted file mode 100644 index 952e88d13..000000000 --- a/src/scala/com/twitter/simclusters_v2/scalding/update_known_for/UpdateKnownForSBFRunner.scala +++ /dev/null @@ -1,685 +0,0 @@ -package com.twitter.simclusters_v2.scalding.update_known_for - -import com.twitter.algebird.Max -import com.twitter.hermit.candidate.thriftscala.Candidates -import com.twitter.sbf.core.AlgorithmConfig -import com.twitter.sbf.core.MHAlgorithm -import com.twitter.sbf.core.SparseBinaryMatrix -import com.twitter.sbf.core.SparseRealMatrix -import com.twitter.sbf.graph.Graph -import com.twitter.scalding.Days -import com.twitter.scalding.Execution -import com.twitter.scalding.Hdfs -import com.twitter.scalding.Mode -import com.twitter.scalding.Stat -import com.twitter.scalding.TypedTsv -import com.twitter.scalding.UniqueID -import com.twitter.scalding.commons.source.VersionedKeyValSource -import com.twitter.scalding.typed.TypedPipe -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation -import 
com.twitter.scalding_internal.dalv2.remote_access.ProcAtla -import com.twitter.simclusters_v2.common.ClusterId -import com.twitter.simclusters_v2.common.UserId -import com.twitter.simclusters_v2.hdfs_sources.AdhocKeyValSources -import com.twitter.simclusters_v2.scalding.CompareClusters -import com.twitter.simclusters_v2.scalding.KnownForSources -import com.twitter.simclusters_v2.scalding.TopUser -import com.twitter.simclusters_v2.scalding.TopUserWithMappedId -import com.twitter.simclusters_v2.scalding.TopUsersSimilarityGraph -import com.twitter.simclusters_v2.scalding.common.Util -import com.twitter.usersource.snapshot.flat.UsersourceFlatScalaDataset -import java.io.PrintWriter -import java.util.TimeZone -import org.apache.commons.math3.random.JDKRandomGenerator -import org.apache.commons.math3.random.RandomAdaptor -import org.apache.hadoop.fs.FileSystem -import org.apache.hadoop.fs.Path -import scala.collection.mutable - -object UpdateKnownForSBFRunner { - - /** - * The main logic of the job. It works as follows: - * - * 1. read the top 20M users, and convert their UserIds to an integer Id from 0 to 20M in order to use the clustering library - * 2. read the user similarity graph from Sims, and convert their UserIds to the same mapped integer Id - * 3. read the previous known_for data set for initialization of the clustering algorithm; - * for users without previous assignments, we randomly assign them to some unused clusters (if there are any). - * 4. run the clustering algorithm for x iterations (x = 4 in the prod setting) - * 5. output the clustering result as the new known_for. - * - */ - def runUpdateKnownFor( - simsGraph: TypedPipe[Candidates], - minActiveFollowers: Int, - topK: Int, - maxNeighbors: Int, - tempLocationPath: String, - previousKnownFor: TypedPipe[(UserId, Array[(ClusterId, Float)])], - maxEpochsForClustering: Int, - squareWeightsEnable: Boolean, - wtCoeff: Double, - mode: Mode - )( - implicit - uniqueId: UniqueID, - tz: TimeZone - ): Execution[TypedPipe[(UserId, Array[(ClusterId, Float)])]] = { - - val tempLocationPathSimsGraph = tempLocationPath + "/sims_graph" - val tempLocationPathMappedIds = tempLocationPath + "/mapped_user_ids" - val tempLocationPathClustering = tempLocationPath + "/clustering_output" - - val mappedIdsToUserIds: TypedPipe[(Int, UserId)] = - getTopFollowedUsersWithMappedIds(minActiveFollowers, topK) - .map { - case (id, mappedId) => - (mappedId, id) - } - .shard(partitions = topK / 1e5.toInt) - - val mappedSimsGraphInput: TypedPipe[(Int, List[(Int, Float)])] = - getMappedSimsGraph( - mappedIdsToUserIds, - simsGraph, - maxNeighbors - ) // The simsGraph here consists of the mapped Ids and mapped ngbr Ids and not the original userIds - - val mappedSimsGraphVersionedKeyVal: VersionedKeyValSource[Int, List[(Int, Float)]] = - AdhocKeyValSources.intermediateSBFResultsDevelSource(tempLocationPathSimsGraph) - val mappedIdsToUserIdsVersionedKeyVal: VersionedKeyValSource[Int, UserId] = - AdhocKeyValSources.mappedIndicesDevelSource(tempLocationPathMappedIds) - - // exec to write intermediate results for mapped Sims Graph and mappedIds - val mappedSimsGraphAndMappedIdsWriteExec: Execution[Unit] = Execution - .zip( - mappedSimsGraphInput.writeExecution(mappedSimsGraphVersionedKeyVal), - mappedIdsToUserIds.writeExecution(mappedIdsToUserIdsVersionedKeyVal) - ).unit - - mappedSimsGraphAndMappedIdsWriteExec.flatMap { _ => - // The simsGraph and the mappedIds from userId(long) -> mappedIds have - // to be written to a temporary location and read again before running
running - // the clustering algorithm. - - Execution - .zip( - readIntermediateExec( - TypedPipe.from(mappedSimsGraphVersionedKeyVal), - mode, - tempLocationPathSimsGraph), - readIntermediateExec( - TypedPipe.from(mappedIdsToUserIdsVersionedKeyVal), - mode, - tempLocationPathMappedIds) - ) - .flatMap { - case (mappedSimsGraphInputReadAgain, mappedIdsToUserIdsReadAgain) => - val previousKnownForMappedIdsAssignments: TypedPipe[(Int, List[(ClusterId, Float)])] = - getKnownForWithMappedIds( - previousKnownFor, - mappedIdsToUserIdsReadAgain, - ) - - val clusteringResults = getClusteringAssignments( - mappedSimsGraphInputReadAgain, - previousKnownForMappedIdsAssignments, - maxEpochsForClustering, - squareWeightsEnable, - wtCoeff - ) - clusteringResults - .flatMap { updatedKnownFor => - // convert the list of updated KnownFor to a TypedPipe - convertKnownForListToTypedPipe( - updatedKnownFor, - mode, - tempLocationPathClustering - ) - } - .flatMap { updatedKnownForTypedPipe => - // convert the mapped integer id to raw user ids - val updatedKnownFor = - updatedKnownForTypedPipe - .join(mappedIdsToUserIdsReadAgain) - .values - .swap - .mapValues(_.toArray) - - Execution.from(updatedKnownFor) - } - } - } - } - - /** - * Helper function to compare newKnownFor with the previous week knownFor assignments - */ - def evaluateUpdatedKnownFor( - newKnownFor: TypedPipe[(UserId, Array[(ClusterId, Float)])], - inputKnownFor: TypedPipe[(UserId, Array[(ClusterId, Float)])] - )( - implicit uniqueId: UniqueID - ): Execution[String] = { - - val minSizeOfBiggerClusterForComparison = 10 - - val compareClusterExec = CompareClusters.summarize( - CompareClusters.compare( - KnownForSources.transpose(inputKnownFor), - KnownForSources.transpose(newKnownFor), - minSizeOfBiggerCluster = minSizeOfBiggerClusterForComparison - )) - - val compareProducerExec = CompareClusters.compareClusterAssignments( - newKnownFor.mapValues(_.toList), - inputKnownFor.mapValues(_.toList) - ) - - Execution - .zip(compareClusterExec, compareProducerExec) - .map { - case (compareClusterResults, compareProducerResult) => - s"Cosine similarity distribution between cluster membership vectors for " + - s"clusters with at least $minSizeOfBiggerClusterForComparison members\n" + - Util.prettyJsonMapper - .writeValueAsString(compareClusterResults) + - "\n\n-------------------\n\n" + - "Custom counters:\n" + compareProducerResult + - "\n\n-------------------\n\n" - } - } - - /** - * - * Convert the list of updated KnownFor to a TypedPipe - * - * This step should have been done using TypedPipe.from(updatedKnownForList), however, due to the - * large size of the list, TypedPipe would throw out-of-memory exceptions. So we have to first - * dump it to a temp file on HDFS and using a customized read function to load to TypedPipe - * - */ - def convertKnownForListToTypedPipe( - updatedKnownForList: List[(Int, List[(ClusterId, Float)])], - mode: Mode, - temporaryOutputStringPath: String - ): Execution[TypedPipe[(Int, List[(ClusterId, Float)])]] = { - - val stringOutput = updatedKnownForList.map { - case (mappedUserId, clusterArray) => - assert(clusterArray.isEmpty || clusterArray.length == 1) - val str = if (clusterArray.nonEmpty) { - clusterArray.head._1 + " " + clusterArray.head._2 // each user is known for at most 1 cluster - } else { - "" - } - if (mappedUserId % 100000 == 0) - println(s"MappedIds:$mappedUserId ClusterAssigned$str") - s"$mappedUserId $str" - } - - // using Execution to enforce the order of the following 3 steps: - // 1. 
write the list of strings to a temp file on HDFS - // 2. read the strings to TypedPipe - // 3. delete the temp file - Execution - .from( - // write the output to HDFS; the data will be loaded to TypedPipe later; - // the reason for doing this is that we cannot just do TypedPipe.from(stringOutput), which - // results in OOM. - TopUsersSimilarityGraph.writeToHDFSIfHDFS( - stringOutput.toIterator, - mode, - temporaryOutputStringPath - ) - ) - .flatMap { _ => - println(s"Start loading the data from $temporaryOutputStringPath") - val clustersWithScores = TypedPipe.from(TypedTsv[String](temporaryOutputStringPath)).map { - mappedIdsWithArrays => - val strArray = mappedIdsWithArrays.trim().split("\\s+") - assert(strArray.length == 3 || strArray.length == 1) - val rowId = strArray(0).toInt - val clusterAssignment: List[(ClusterId, Float)] = - if (strArray.length > 1) { - List((strArray(1).toInt, strArray(2).toFloat)) - } else { - // the knownFors will have users with Array.empty as their assignment if - // the clustering step has empty results for that user. - Nil - } - - if (rowId % 100000 == 0) - println(s"rowId:$rowId ClusterAssigned: $clusterAssignment") - (rowId, clusterAssignment) - } - // return the dataset as an execution and delete the temp location - readIntermediateExec(clustersWithScores, mode, temporaryOutputStringPath) - } - } - - /** - * Helper function to read the dataset as an Execution and delete the temporary - * location on HDFS for PDP compliance - */ - def readIntermediateExec[K, V]( - dataset: TypedPipe[(K, V)], - mode: Mode, - tempLocationPath: String - ): Execution[TypedPipe[(K, V)]] = { - Execution - .from(dataset) - .flatMap { output => - // delete the temporary outputs for PDP compliance - mode match { - case Hdfs(_, conf) => - val fs = FileSystem.newInstance(conf) - if (fs.deleteOnExit(new Path(tempLocationPath))) { - println(s"Successfully deleted the temporary folder $tempLocationPath!") - } else { - println(s"Failed to delete the temporary folder $tempLocationPath!") - } - case _ => () - } - Execution.from(output) - } - }
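The temp-file round trip above hinges on a tiny line format: "mappedId clusterId score" for an assigned user, or a bare "mappedId" when the assignment is empty. A standalone sketch of the same encode/parse logic (toy code mirroring the split("\\s+") handling above):

object KnownForLineCodecSketch {
  def encode(row: (Int, List[(Int, Float)])): String = row match {
    case (id, (cluster, score) :: Nil) => s"$id $cluster $score"
    case (id, Nil)                     => s"$id"
    case _                             => sys.error("at most one cluster per user")
  }

  def decode(line: String): (Int, List[(Int, Float)]) = {
    val parts = line.trim.split("\\s+")
    require(parts.length == 1 || parts.length == 3)
    val assignment =
      if (parts.length == 3) List((parts(1).toInt, parts(2).toFloat)) else Nil
    (parts(0).toInt, assignment)
  }

  def main(args: Array[String]): Unit = {
    val rows = List((0, List((42, 0.9f))), (1, Nil))
    assert(rows.map(encode).map(decode) == rows) // round trip holds
  }
}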
- - /** - * Converts the userIDs in the sims graph to their mapped integer indices. - * All the users who do not have a mapping are filtered out of the sims graph input - * - * @param mappedUsers mapping of long userIDs to their integer indices - * @param allEdges sims graph - * @param maxNeighborsPerNode maximum number of neighbors kept for each user - * - * @return simsGraph of users and neighbors with their mapped integer ids - */ - def getMappedSimsGraph( - mappedUsers: TypedPipe[(Int, UserId)], - allEdges: TypedPipe[Candidates], - maxNeighborsPerNode: Int - )( - implicit uniqueId: UniqueID - ): TypedPipe[(Int, List[(Int, Float)])] = { - - val numEdgesAfterFirstJoin = Stat("num_edges_after_first_join") - val numEdgesAfterSecondJoin = Stat("num_edges_after_second_join") - val numEdgesLostTopKTruncated = Stat("num_edges_lost_topk_truncated") - val finalNumEdges = Stat("final_num_edges") - - val mappedUserIdsToIds: TypedPipe[(UserId, Int)] = mappedUsers.swap - allEdges - .map { cs => (cs.userId, cs.candidates) } - // filter the users not present in the mapped userIDs list - .join(mappedUserIdsToIds) - .withReducers(6000) - .flatMap { - case (id, (neighbors, mappedId)) => - val before = neighbors.size - val topKNeighbors = neighbors.sortBy(-_.score).take(maxNeighborsPerNode) - val after = topKNeighbors.size - numEdgesLostTopKTruncated.incBy(before - after) - topKNeighbors.map { candidate => - numEdgesAfterFirstJoin.inc() - (candidate.userId, (mappedId, candidate.score.toFloat)) - } - } - .join(mappedUserIdsToIds) - .withReducers(9000) - .flatMap { - case (id, ((mappedNeighborId, score), mappedId)) => - numEdgesAfterSecondJoin.inc() - // to make the graph symmetric, add those edges back that might have been filtered - // due to maxNeighborsPerNode for a user but not for its neighbors - List( - (mappedId, Map(mappedNeighborId -> Max(score))), - (mappedNeighborId, Map(mappedId -> Max(score))) - ) - } - .sumByKey - .withReducers(9100) - .map { - case (id, nbrMap) => - // Graph initialization expects neighbors to be sorted in ascending order of ids - val sorted = nbrMap.mapValues(_.get).toList.sortBy(_._1) - finalNumEdges.incBy(sorted.size) - (id, sorted) - } - } - - def getTopFollowedUsersWithMappedIds( - minActiveFollowers: Int, - topK: Int - )( - implicit uniqueId: UniqueID, - timeZone: TimeZone - ): TypedPipe[(Long, Int)] = { - val numTopUsersMappings = Stat("num_top_users_with_mapped_ids") - println("Going to include mappedIds in output") - TopUsersSimilarityGraph - .topUsersWithMappedIdsTopK( - DAL - .readMostRecentSnapshotNoOlderThan( - UsersourceFlatScalaDataset, - Days(30)).withRemoteReadPolicy(ExplicitLocation(ProcAtla)).toTypedPipe, - minActiveFollowers, - topK - ) - .map { - case TopUserWithMappedId(TopUser(id, activeFollowerCount, screenName), mappedId) => - numTopUsersMappings.inc() - (id, mappedId) - } - } - - /** - * Map the userIds in the knownFor dataset to their integer ids. - */ - def getKnownForWithMappedIds( - knownForDataset: TypedPipe[(UserId, Array[(ClusterId, Float)])], // original userId as the key - mappedIdsWithUserId: TypedPipe[(Int, UserId)] // mapped userId as the key - ): TypedPipe[(Int, List[(ClusterId, Float)])] = { - val userIdsAndTheirMappedIndices = mappedIdsWithUserId.map { - case (mappedId, originalId) => (originalId, mappedId) - } - knownForDataset.join(userIdsAndTheirMappedIndices).map { - case (userId, (userClusterArray, mappedUserId)) => - (mappedUserId, userClusterArray.toList) - } - }
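The symmetrization inside getMappedSimsGraph above (emit each truncated edge in both directions, merge duplicates by keeping the max score, sort each neighbor list by id) can be illustrated on plain in-memory Maps; a toy sketch, not the Scalding version:

object SymmetrizeSketch {
  // Downstream graph loading expects neighbor lists sorted by ascending id.
  def symmetrize(directed: Map[Int, List[(Int, Float)]]): Map[Int, List[(Int, Float)]] =
    directed.toSeq
      .flatMap { case (src, nbrs) =>
        nbrs.flatMap { case (dst, w) => Seq((src, (dst, w)), (dst, (src, w))) }
      }
      .groupBy(_._1)
      .map { case (id, edges) =>
        val bestPerNeighbor = edges
          .map(_._2)
          .groupBy(_._1)
          .map { case (nbr, ws) => (nbr, ws.map(_._2).max) } // max-merge, like Max(score)
        id -> bestPerNeighbor.toList.sortBy(_._1)
      }

  def main(args: Array[String]): Unit = {
    // The 1-2 edge appears with two scores; both directions end up with 0.5.
    println(symmetrize(Map(1 -> List((2, 0.3f)), 2 -> List((1, 0.5f)))))
  }
}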
- - /** - * Attach the cluster assignments from the knownFor dataset to the users in the mapped Sims graph. - */ - def attachClusterAssignments( - mappedSimsGraph: TypedPipe[(Int, List[(Int, Float)])], - knownForAssignments: TypedPipe[(Int, List[(ClusterId, Float)])], - squareWeights: Boolean - )( - implicit uniqueId: UniqueID - ): TypedPipe[(Int, Array[Int], Array[Float], List[(ClusterId, Float)])] = { - val numPopularUsersWithNoKnownForBefore = Stat( - "num_popular_users_with_no_knownfor_before_but_popular_now") - - val input = mappedSimsGraph.map { - case (id, nbrsList) => - val ngbrIds = nbrsList.map(_._1).toArray - val ngbrWts = if (squareWeights) { - nbrsList.map(_._2).map(currWt => currWt * currWt * 10).toArray - } else { - nbrsList.map(_._2).toArray - } - (id, ngbrIds, ngbrWts) - } - - // The input simsGraph consists of the most-followed (popular) users, some of whom might not - // have been knownFor users in the previous week. So we left join with the knownFor dataset; - // these newly popular users will not have any prior cluster assignments for this round of clustering - input - .groupBy(_._1) - .leftJoin(knownForAssignments.groupBy(_._1)) - .toTypedPipe - .map { - case (mappedUserId, ((mappedId, ngbrIds, ngbrWts), knownForResult)) => - val clustersList: List[(Int, Float)] = knownForResult match { - case Some(values) => values._2 - case None => - numPopularUsersWithNoKnownForBefore.inc() - List.empty - } - (mappedUserId, ngbrIds, ngbrWts, clustersList) - } - } - - /** - * Initialize the graph with users, neighbors, and edge weights. - */ - def getGraphFromSimsInput( - mappedSimsIter: Iterable[ - (Int, Array[Int], Array[Float], List[(ClusterId, Float)]) - ], - numUsers: Int - ): Graph = { - val nbrsIds: Array[Array[Int]] = new Array[Array[Int]](numUsers) - val nbrsWts: Array[Array[Float]] = new Array[Array[Float]](numUsers) - var numEdges = 0L - var numVertices = 0 - var numVerticesWithNoNgbrs = 0 - mappedSimsIter.foreach { - case (id, nbrArrayIds, nbArrayScores, _) => - nbrsIds(id) = nbrArrayIds - nbrsWts(id) = nbArrayScores - numEdges += nbrArrayIds.length - numVertices += 1 - if (numVertices % 100000 == 0) { - println(s"Done loading $numVertices vertices.
Edges so far: $numEdges") - } - } - - (0 until numUsers).foreach { i => - if (nbrsIds(i) == null) { - numVerticesWithNoNgbrs += 1 - nbrsIds(i) = Array[Int]() - nbrsWts(i) = Array[Float]() - } - } - - println( - s"Done loading graph with $numUsers nodes and $numEdges edges (counting each edge twice)") - println("Number of nodes with at least one neighbor is " + numVertices) - println("Number of nodes with no neighbors is " + numVerticesWithNoNgbrs) - new Graph(numUsers, numEdges / 2, nbrsIds, nbrsWts) - } - - /** - * Helper function that initializes users to clusters based on previous knownFor assignments; - * users with no previous assignments are randomly assigned to the empty clusters - */ - def initializeSparseBinaryMatrix( - graph: Graph, - mappedSimsGraphIter: Iterable[ - (Int, Array[Int], Array[Float], List[(ClusterId, Float)]) - ], // user with neighbors, neighbor wts and previous knownFor assignments - numUsers: Int, - numClusters: Int, - algoConfig: AlgorithmConfig, - ): SparseBinaryMatrix = { - var clustersSeenFromPreviousWeek: Set[Int] = Set.empty - var emptyClustersFromPreviousWeek: Set[Int] = Set.empty - var usersWithNoAssignmentsFromPreviousWeek: Set[Int] = Set.empty - mappedSimsGraphIter.foreach { - case (id, _, _, knownFor) => - if (knownFor.isEmpty) { - usersWithNoAssignmentsFromPreviousWeek += id - } - knownFor.foreach { - case (clusterId, _) => - clustersSeenFromPreviousWeek += clusterId - } - } - (1 to numClusters).foreach { i => - if (!clustersSeenFromPreviousWeek.contains(i)) emptyClustersFromPreviousWeek += i - } - var z = new SparseBinaryMatrix(numUsers, numClusters) - println("Going to initialize from previous KnownFor") - var zeroIndexedClusterIdsFromPreviousWeek: Set[Int] = Set.empty - for (clusterIdOneIndexed <- emptyClustersFromPreviousWeek) { - zeroIndexedClusterIdsFromPreviousWeek += (clusterIdOneIndexed - 1) - } - // Initialize z - users with no previous assignments are assigned to empty clusters - z.initFromSubsetOfRowsForSpecifiedColumns( - graph, - (gr: Graph, i: Integer) => algoConfig.rng.nextDouble, - zeroIndexedClusterIdsFromPreviousWeek.toArray, - usersWithNoAssignmentsFromPreviousWeek.toArray, - new PrintWriter(System.err) - ) - println("Initialized the empty clusters") - mappedSimsGraphIter.foreach { - case (id, _, _, knownFor) => - val currClustersForUserZeroIndexed = knownFor.map(_._1).map(x => x - 1) - // Users who have a previous cluster assignment are initialized with the same cluster - if (currClustersForUserZeroIndexed.nonEmpty) { - z.updateRow(id, currClustersForUserZeroIndexed.sorted.toArray) - } - } - println("Done initializing from previous knownFor assignment") - z - }
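The initialization bookkeeping above reduces to two small computations: which (1-indexed) clusters nobody was known for last week, and which users arrive with no assignment. A pure-Scala sketch, for illustration only:

object WarmStartSketch {
  def emptyClusters(prev: Map[Int, List[Int]], numClusters: Int): Set[Int] =
    (1 to numClusters).toSet -- prev.values.flatten.toSet

  def unassignedUsers(prev: Map[Int, List[Int]], users: Seq[Int]): Seq[Int] =
    users.filter(u => prev.getOrElse(u, Nil).isEmpty)

  def main(args: Array[String]): Unit = {
    val prev = Map(0 -> List(1), 1 -> List(3), 2 -> Nil)
    println(emptyClusters(prev, numClusters = 4))   // Set(2, 4)
    println(unassignedUsers(prev, Seq(0, 1, 2, 3))) // List(2, 3)
  }
}

These two sets are exactly what the job feeds to initFromSubsetOfRowsForSpecifiedColumns (after shifting cluster ids to be 0-indexed) before copying over the existing assignments.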
- - /** - * Optimize the sparseBinaryMatrix. This function runs the clustering epochs and computes the - * cluster assignments for the next week, based on the underlying user-user graph - */ - def optimizeSparseBinaryMatrix( - algoConfig: AlgorithmConfig, - graph: Graph, - z: SparseBinaryMatrix - ): SparseBinaryMatrix = { - val prec0 = MHAlgorithm.clusterPrecision(graph, z, 0, 1000, algoConfig.rng) - println("Precision of cluster 0:" + prec0.precision) - val prec1 = MHAlgorithm.clusterPrecision(graph, z, 1, 1000, algoConfig.rng) - println("Precision of cluster 1:" + prec1.precision) - val algo = new MHAlgorithm(algoConfig, graph, z, new PrintWriter(System.err)) - val optimizedZ = algo.optimize - optimizedZ - } - - /** - * Helper function that takes the heuristically scored association of user to a cluster - * and returns the knownFor result - * @param srm SparseRealMatrix with (row, col) score denoting the membership score of user in the cluster - * @return assignments of users (mapped integer indices) to clusters with knownFor scores. - */ - def getKnownForHeuristicScores(srm: SparseRealMatrix): List[(Int, List[(ClusterId, Float)])] = { - val knownForAssignmentsFromClusterScores = (0 until srm.getNumRows).map { rowId => - val rowWithIndices = srm.getColIdsForRow(rowId) - val rowWithScores = srm.getValuesForRow(rowId) - val allClustersWithScores: Array[(ClusterId, Float)] = - rowWithIndices.zip(rowWithScores).map { - case (colId, score) => (colId + 1, score.toFloat) - } - if (rowId % 100000 == 0) { - println("Inside outputIter:" + rowId + " " + srm.getNumRows) - } - - val clusterAssignmentWithMaxScore: List[(ClusterId, Float)] = - if (allClustersWithScores.length > 1) { - // if sparseBinaryMatrix z has rows with more than one non-zero column (i.e. a user - // initialized with more than one cluster), and the clustering algorithm does not find - // a better proposal for cluster assignment, the user's multi-cluster membership - // from the initialization step can persist. - // We found that this happens in ~0.1% of the knownFor users. Hence we choose the - // cluster with the highest score to handle such edge cases. - val result: (ClusterId, Float) = allClustersWithScores.maxBy(_._2) - println( - "Found a user with mappedId: %s with more than 1 cluster assignment:%s; Assigned to the best cluster: %s" - .format( - rowId.toString, - allClustersWithScores.mkString("Array(", ", ", ")"), - result - .toString())) - List(result) - } else { - allClustersWithScores.toList - } - (rowId, clusterAssignmentWithMaxScore) - } - knownForAssignmentsFromClusterScores.toList - }
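The multi-cluster edge case above boils down to collapsing a score row to its single best cluster; a tiny standalone sketch with the same 1-indexed cluster-id convention:

object BestClusterSketch {
  // Keep at most one (clusterId, score) pair: the highest-scoring one.
  def bestAssignment(scores: Array[(Int, Float)]): List[(Int, Float)] =
    if (scores.isEmpty) Nil else List(scores.maxBy(_._2))

  def main(args: Array[String]): Unit = {
    println(bestAssignment(Array(3 -> 0.2f, 7 -> 0.6f))) // List((7,0.6))
    println(bestAssignment(Array.empty[(Int, Float)]))   // List()
  }
}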
- - /** - * Function that computes the clustering assignments to users - * - * @param mappedSimsGraph user-user graph as input to clustering - * @param previousKnownForAssignments previous week clustering assignments - * @param maxEpochsForClustering maximum number of epochs to run the clustering algorithm - * @param squareWeights boolean flag that squares (and rescales) the edge weights in the sims graph - * @param wtCoeff weight coefficient passed to the clustering algorithm's config - * - * @return users with clusters assigned - */ - def getClusteringAssignments( - mappedSimsGraph: TypedPipe[(Int, List[(Int, Float)])], - previousKnownForAssignments: TypedPipe[(Int, List[(ClusterId, Float)])], - maxEpochsForClustering: Int, - squareWeights: Boolean, - wtCoeff: Double - )( - implicit uniqueId: UniqueID - ): Execution[List[(Int, List[(ClusterId, Float)])]] = { - - attachClusterAssignments( - mappedSimsGraph, - previousKnownForAssignments, - squareWeights).toIterableExecution.flatMap { mappedSimsGraphWithClustersIter => - val tic = System.currentTimeMillis - var maxVertexId = 0 - var maxClusterIdInPreviousAssignment = 0 - mappedSimsGraphWithClustersIter.foreach { - case (id, _, _, knownFor) => - maxVertexId = Math.max(id, maxVertexId) - knownFor.foreach { - case (clusterId, _) => - maxClusterIdInPreviousAssignment = - Math.max(clusterId, maxClusterIdInPreviousAssignment) - } - } - - val numUsersToCluster = - maxVertexId + 1 // since users were mapped to indices starting from 0 using zipWithIndex - println("Total number of topK users to be clustered this time:" + numUsersToCluster) - println( - "Total number of clusters in the previous knownFor assignment:" + maxClusterIdInPreviousAssignment) - println("Will set number of communities to " + maxClusterIdInPreviousAssignment) - - // Initialize the graph with users, neighbors and the corresponding edge weights - val graph = getGraphFromSimsInput(mappedSimsGraphWithClustersIter, numUsersToCluster) - val toc = System.currentTimeMillis() - println("Time to load the graph " + (toc - tic) / 1000.0 / 60.0 + " minutes") - - // define the algoConfig parameters - val algoConfig = new AlgorithmConfig() - .withCpu(16).withK(maxClusterIdInPreviousAssignment) - .withWtCoeff(wtCoeff.toDouble) - .withMaxEpoch(maxEpochsForClustering) - algoConfig.divideResultIntoConnectedComponents = false - algoConfig.minClusterSize = 1 - algoConfig.updateImmediately = true - algoConfig.rng = new RandomAdaptor(new JDKRandomGenerator(1)) - - // Initialize a sparseBinaryMatrix with users assigned to their previous week knownFor - // assignments. For those users who do not have a prior assignment, we assign - // the user (plus its neighbors from the graph) to the empty clusters.
- // Please note that this neighborhood-based initialization to empty clusters can - // have a few cases where the same user was assigned to more than one cluster - val z = initializeSparseBinaryMatrix( - graph, - mappedSimsGraphWithClustersIter, - numUsersToCluster, - maxClusterIdInPreviousAssignment, - algoConfig - ) - - // Run the epochs of the clustering algorithm to find the new cluster assignments - val tic2 = System.currentTimeMillis - val optimizedZ = optimizeSparseBinaryMatrix(algoConfig, graph, z) - val toc2 = System.currentTimeMillis - println("Time to optimize: %.2f seconds\n".format((toc2 - tic2) / 1000.0)) - println("Time to initialize & optimize: %.2f seconds\n".format((toc2 - toc) / 1000.0)) - - // Attach scores to the cluster assignments - val srm = MHAlgorithm.heuristicallyScoreClusterAssignments(graph, optimizedZ) - - // Get the knownFor assignments of users from the heuristic scores - // assigned based on the neighborhood of the user and their cluster assignments - // The returned result has userIDs in the mapped integer indices - val knownForAssignmentsFromClusterScores: List[(Int, List[(ClusterId, Float)])] = - getKnownForHeuristicScores(srm) - - Execution.from(knownForAssignmentsFromClusterScores) - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/common/BQGenerationUtil.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/common/BQGenerationUtil.docx new file mode 100644 index 000000000..8c1d5f61c Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/common/BQGenerationUtil.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/common/BQGenerationUtil.scala b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/common/BQGenerationUtil.scala deleted file mode 100644 index a433bc732..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/common/BQGenerationUtil.scala +++ /dev/null @@ -1,255 +0,0 @@ -package com.twitter.simclusters_v2.scio -package bq_generation.common - -import com.twitter.wtf.beam.bq_embedding_export.BQQueryUtils -import org.joda.time.DateTime - -object BQGenerationUtil { - // Consumer Embeddings BQ table details - val interestedInEmbeddings20M145K2020Table = BQTableDetails( - "twttr-bq-cassowary-prod", - "user", - "simclusters_v2_user_to_interested_in_20M_145K_2020", - ) - val mtsConsumerEmbeddingsFav90P20MTable = BQTableDetails( - "twttr-bq-cassowary-prod", - "user", - "mts_consumer_embeddings_fav90p_20m", - ) - - // Common SQL path - val TweetFavCountSQLPath = - s"/com/twitter/simclusters_v2/scio/bq_generation/sql/tweet_fav_count.sql" - - val NSFWTweetIdDenylistSQLPath = - s"/com/twitter/simclusters_v2/scio/bq_generation/sql/nsfw_tweet_denylist.sql" - - val ClusterTopTweetsIntersectionWithFavBasedIndexSQLPath = - s"/com/twitter/simclusters_v2/scio/bq_generation/sql/cluster_top_tweets_intersection_with_fav_based_index.sql" - - // Read InterestedIn 2020 - def getInterestedIn2020SQL( - queryDate: DateTime, - lookBackDays: Int - ): String = { - s""" - |SELECT userId, - | clusterIdToScores.key AS clusterId, - | clusterIdToScores.value.logFavScore AS userScore, - | clusterIdToScores.value.logFavScoreClusterNormalizedOnly AS clusterNormalizedLogFavScore, - |FROM `$interestedInEmbeddings20M145K2020Table`, UNNEST(clusterIdToScores) AS clusterIdToScores - |WHERE DATE(_PARTITIONTIME) = - | ( -- Get latest partition time - | SELECT MAX(DATE(_PARTITIONTIME)) latest_partition - | FROM `$interestedInEmbeddings20M145K2020Table` - | WHERE
Date(_PARTITIONTIME) BETWEEN - | DATE_SUB(Date("${queryDate}"), - | INTERVAL $lookBackDays DAY) AND DATE("$queryDate") - | ) - | AND clusterIdToScores.value.logFavScore > 0.0 # min score threshold for user embedding values - |""".stripMargin - } - - // Read MTS Consumer Embeddings - Fav90P20M config - def getMTSConsumerEmbeddingsFav90P20MSQL( - queryDate: DateTime, - lookBackDays: Int - ): String = { - // We read the most recent snapshot of MTS Consumer Embeddings Fav90P20M - s""" - |SELECT userId, - | clusterIdToScores.key AS clusterId, - | clusterIdToScores.value.logFavUserScore AS userScore, - | clusterIdToScores.value.logFavUserScoreClusterNormalized AS clusterNormalizedLogFavScore - | FROM `$mtsConsumerEmbeddingsFav90P20MTable`, UNNEST(embedding.clusterIdToScores) AS clusterIdToScores - |WHERE DATE(ingestionTime) = ( - | -- Get latest partition time - | SELECT MAX(DATE(ingestionTime)) latest_partition - | FROM `$mtsConsumerEmbeddingsFav90P20MTable` - | WHERE Date(ingestionTime) BETWEEN - | DATE_SUB(Date("${queryDate}"), - | INTERVAL $lookBackDays DAY) AND DATE("${queryDate}") - |) AND clusterIdToScores.value.logFavUserScore > 0.0 - |""".stripMargin - } - - /* - * For a specific tweet engagement, retrieve the user id, tweet id, and timestamp - * - * Return: - * String - UserId, TweetId and Timestamp table SQL string format - * Table Schema - * - userId: Long - * - tweetId: Long - * - tsMillis: Long - */ - def getUserTweetEngagementEventPairSQL( - startTime: DateTime, - endTime: DateTime, - userTweetEngagementEventPairSqlPath: String, - userTweetEngagementEventPairTemplateVariable: Map[String, String] - ): String = { - val templateVariables = Map( - "START_TIME" -> startTime.toString(), - "END_TIME" -> endTime.toString(), - "NO_OLDER_TWEETS_THAN_DATE" -> startTime.toString() - ) ++ userTweetEngagementEventPairTemplateVariable - BQQueryUtils.getBQQueryFromSqlFile(userTweetEngagementEventPairSqlPath, templateVariables) - } - - /* - * Retrieve tweets and the # of favs they got in a given time window - * - * Return: - * String - TweetId and fav count table SQL string format - * Table Schema - * - tweetId: Long - * - favCount: Long - */ - def getTweetIdWithFavCountSQL( - startTime: DateTime, - endTime: DateTime, - ): String = { - val templateVariables = - Map( - "START_TIME" -> startTime.toString(), - "END_TIME" -> endTime.toString(), - ) - BQQueryUtils.getBQQueryFromSqlFile(TweetFavCountSQLPath, templateVariables) - } - - /* - * From a given time window, retrieve tweetIds, optionally filtered by media type and by NSFW author - * - * Input: - * - startTime: DateTime - * - endTime: DateTime - * - filterMediaType: Option[Int] - * MediaType - * 1: Image - * 2: GIF - * 3: Video - * - filterNSFWAuthor: Boolean - * Whether we want to filter out NSFW tweet authors - * - * Return: - * String - TweetId table SQL string format - * Table Schema - * - tweetId: Long - */ - def getTweetIdWithMediaAndNSFWAuthorFilterSQL( - startTime: DateTime, - endTime: DateTime, - filterMediaType: Option[Int], - filterNSFWAuthor: Boolean - ): String = { - val sql = s""" - |SELECT DISTINCT tweetId - |FROM `twttr-bq-tweetsource-prod.user.unhydrated_flat` tweetsource, UNNEST(media) AS media - |WHERE (DATE(_PARTITIONTIME) >= DATE("${startTime}") AND DATE(_PARTITIONTIME) <= DATE("${endTime}")) AND - | timestamp_millis((1288834974657 + - | ((tweetId & 9223372036850581504) >> 22))) >= TIMESTAMP("${startTime}") - | AND timestamp_millis((1288834974657 + - | ((tweetId & 9223372036850581504) >> 22))) <= TIMESTAMP("${endTime}")
|""".stripMargin - - val filterMediaStr = filterMediaType match { - case Some(mediaType) => s" AND media.media_type =${mediaType}" - case _ => "" - } - val filterNSFWAuthorStr = if (filterNSFWAuthor) " AND nsfwUser = false" else "" - sql + filterMediaStr + filterNSFWAuthorStr - } - - /* - * From a given time window, retrieve tweetIds that fall into the NSFW deny list - * - * Input: - * - startTime: DateTime - * - endTime: DateTime - * - * Return: - * String - TweetId table SQL string format - * Table Schema - * - tweetId: Long - */ - def getNSFWTweetIdDenylistSQL( - startTime: DateTime, - endTime: DateTime, - ): String = { - val templateVariables = - Map( - "START_TIME" -> startTime.toString(), - "END_TIME" -> endTime.toString(), - ) - BQQueryUtils.getBQQueryFromSqlFile(NSFWTweetIdDenylistSQLPath, templateVariables) - } - - /* - * From a given cluster id to top k tweets table and a time window, - * (1) Retrieve the latest fav-based top tweets per cluster table within the time window - * (2) Inner join with the given table using cluster id and tweet id - * (3) Create the top k tweets per cluster table for the intersection - * - * Input: - * - startTime: DateTime - * - endTime: DateTime - * - topKTweetsForClusterKeySQL: String, a SQL query - * - * Return: - * String - TopKTweetsForClusterKey table SQL string format - * Table Schema - * - clusterId: Long - * - topKTweetsForClusterKey: (Long, Long) - * - tweetId: Long - * - tweetScore: Long - */ - def generateClusterTopTweetIntersectionWithFavBasedIndexSQL( - startTime: DateTime, - endTime: DateTime, - clusterTopKTweets: Int, - topKTweetsForClusterKeySQL: String - ): String = { - val templateVariables = - Map( - "START_TIME" -> startTime.toString(), - "END_TIME" -> endTime.toString(), - "CLUSTER_TOP_K_TWEETS" -> clusterTopKTweets.toString, - "CLUSTER_TOP_TWEETS_SQL" -> topKTweetsForClusterKeySQL - ) - BQQueryUtils.getBQQueryFromSqlFile( - ClusterTopTweetsIntersectionWithFavBasedIndexSQLPath, - templateVariables) - } - - /* - * Given a list of action types, build a string that indicates the user - * engaged with the tweet - * - * Example use case: We want to build a SQL query that specifies this user engaged - * with tweet with either fav or retweet actions. 
- *
- * Input:
- * - actionTypes: Seq("ServerTweetFav", "ServerTweetRetweet")
- * - booleanOperator: "OR"
- * Output: "ServerTweetFav.engaged = 1 OR ServerTweetRetweet.engaged = 1"
- *
- * Example SQL:
- * SELECT ServerTweetFav, ServerTweetRetweet
- * FROM table
- * WHERE ServerTweetFav.engaged = 1 OR ServerTweetRetweet.engaged = 1
- */
- def buildActionTypesEngagementIndicatorString(
- actionTypes: Seq[String],
- booleanOperator: String = "OR"
- ): String = {
- actionTypes.map(action => f"""${action}.engaged = 1""").mkString(f""" ${booleanOperator} """)
- }
-}
-
-// Field order follows the fully-qualified BigQuery reference project.dataset.table;
-// every call site passes a dataset as the second argument and a table as the third.
-case class BQTableDetails(
- projectName: String,
- datasetName: String,
- tableName: String) {
- override def toString: String = s"${projectName}.${datasetName}.${tableName}"
-}
diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/common/BUILD b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/common/BUILD
deleted file mode 100644
index 1ed000bc5..000000000
--- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/common/BUILD
+++ /dev/null
@@ -1,10 +0,0 @@
-scala_library(
- sources = [
- "*.scala",
- ],
- platform = "java8",
- tags = ["bazel-compatible"],
- dependencies = [
- "src/scala/com/twitter/wtf/beam/bq_embedding_export:bq_embedding_export_lib",
- ],
-)
diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/common/BUILD.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/common/BUILD.docx
new file mode 100644
index 000000000..ee5596fd5
Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/common/BUILD.docx differ
diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/common/IndexGenerationUtil.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/common/IndexGenerationUtil.docx
new file mode 100644
index 000000000..07684c7ea
Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/common/IndexGenerationUtil.docx differ
diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/common/IndexGenerationUtil.scala b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/common/IndexGenerationUtil.scala
deleted file mode 100644
index bfbc00e71..000000000
--- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/common/IndexGenerationUtil.scala
+++ /dev/null
@@ -1,63 +0,0 @@
-package com.twitter.simclusters_v2.scio
-package bq_generation.common
-
-import com.twitter.algebird_internal.thriftscala.DecayedValue
-import com.twitter.simclusters_v2.thriftscala.FullClusterId
-import com.twitter.simclusters_v2.thriftscala.ModelVersion
-import com.twitter.simclusters_v2.thriftscala.Scores
-import com.twitter.simclusters_v2.thriftscala.TopKTweetsWithScores
-import com.twitter.snowflake.id.SnowflakeId
-import org.apache.avro.generic.GenericRecord
-import org.apache.beam.sdk.io.gcp.bigquery.SchemaAndRecord
-import org.apache.beam.sdk.transforms.SerializableFunction
-import scala.collection.JavaConverters._
-
-object IndexGenerationUtil {
- // Function that parses [GenericRecord] results we read from BQ into [TopKTweetsForClusterKey]
- def parseClusterTopKTweetsFn(tweetEmbeddingsHalfLife: Int) =
- new SerializableFunction[SchemaAndRecord, TopKTweetsForClusterKey] {
- override def apply(record: SchemaAndRecord): TopKTweetsForClusterKey = {
- val genericRecord: GenericRecord = record.getRecord()
- TopKTweetsForClusterKey(
- clusterId = FullClusterId(
- modelVersion = ModelVersion.Model20m145k2020,
- clusterId = genericRecord.get("clusterId").toString.toInt
- ),
- topKTweetsWithScores =
- parseTopKTweetsForClusterKeyColumn(
- genericRecord,
- "topKTweetsForClusterKey",
- tweetEmbeddingsHalfLife),
- )
- }
- }
-
- // Function that parses the topKTweetsForClusterKey column into [TopKTweetsWithScores]
- def parseTopKTweetsForClusterKeyColumn(
- genericRecord: GenericRecord,
- columnName: String,
- tweetEmbeddingsHalfLife: Int
- ): TopKTweetsWithScores = {
- val tweetScorePairs: java.util.List[GenericRecord] =
- genericRecord.get(columnName).asInstanceOf[java.util.List[GenericRecord]]
- val tweetIdToScoresMap = tweetScorePairs.asScala
- .map((gr: GenericRecord) => {
- // Retrieve the tweetId and tweetScore
- val tweetId = gr.get("tweetId").toString.toLong
- val tweetScore = gr.get("tweetScore").toString.toDouble
-
- // Transform tweetScore into DecayedValue:
- // scaledTime = t * ln(2) / halfLife, so that combining two DecayedValues
- // scales the older score by 2^(-dt / halfLife); e.g. with the configured
- // 8h half-life, a tweet's score loses half its weight every 8 hours.
- // Ref: https://github.com/twitter/algebird/blob/develop/algebird-core/src/main/scala/com/twitter/algebird/DecayedValue.scala
- val scaledTime =
- SnowflakeId.unixTimeMillisFromId(tweetId) * math.log(2.0) / tweetEmbeddingsHalfLife
- val decayedValue = DecayedValue(tweetScore, scaledTime)
-
- // Update the TopTweets Map
- tweetId -> Scores(favClusterNormalized8HrHalfLifeScore = Some(decayedValue))
- }).toMap
- TopKTweetsWithScores(topTweetsByFavClusterNormalizedScore = Some(tweetIdToScoresMap))
- }
- case class TopKTweetsForClusterKey(
- clusterId: FullClusterId,
- topKTweetsWithScores: TopKTweetsWithScores)
-
-}
diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/BUILD b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/BUILD
deleted file mode 100644
index 441d8a98a..000000000
--- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/BUILD
+++ /dev/null
@@ -1,250 +0,0 @@
-scala_library(
- name = "ftr_bq_generation",
- sources = [
- "**/*.scala",
- ],
- tags = ["bazel-compatible"],
- dependencies = [
- "src/scala/com/twitter/simclusters_v2/common",
- "src/scala/com/twitter/simclusters_v2/scio/bq_generation/common",
- "src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet:offline_tweet_recommendations_decayed_sum-scala",
- "src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet:offline_tweet_recommendations_ftr_adhoc-scala",
- "src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet:offline_tweet_recommendations_ftrat5_pop_biased_1000-scala",
- "src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet:offline_tweet_recommendations_ftrat5_pop_biased_10000-scala",
- "src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet:simclusters_decayed_sum_cluster_to_tweet_index-scala",
- "src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet:simclusters_ftr_adhoc_cluster_to_tweet_index-scala",
- "src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet:simclusters_ftr_pop10000_rnkdecay11_cluster_to_tweet_index-scala",
- "src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet:simclusters_ftr_pop1000_rnkdecay11_cluster_to_tweet_index-scala",
- "src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet:simclusters_oon_ftr_adhoc_cluster_to_tweet_index-scala",
- "src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet:simclusters_oon_ftr_pop1000_rnkdecay_cluster_to_tweet_index-scala",
- "src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/sql",
- "src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql",
- "src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann:bq_generation",
- ],
-)
-
-jvm_binary(
- name = "ftr-tweet-adhoc-job",
- main =
"com.twitter.simclusters_v2.scio.bq_generation.ftr_tweet.FTRAdhocJob", - dependencies = [ - ":ftr_bq_generation", - ], -) - -jvm_binary( - name = "iikf2020-decayed-sum-batch-job", - main = "com.twitter.simclusters_v2.scio.bq_generation.ftr_tweet.IIKF2020DecayedSumBatchJobProd", - dependencies = [ - ":ftr_bq_generation", - ], -) - -jvm_binary( - name = "iikf2020-ftrat5-pop1000-batch-job", - main = "com.twitter.simclusters_v2.scio.bq_generation.ftr_tweet.IIKF2020FTRAt5Pop1000batchJobProd", - dependencies = [ - ":ftr_bq_generation", - ], -) - -jvm_binary( - name = "iikf2020-ftrat5-pop10000-batch-job", - main = "com.twitter.simclusters_v2.scio.bq_generation.ftr_tweet.IIKF2020FTRAt5Pop10000batchJobProd", - dependencies = [ - ":ftr_bq_generation", - ], -) - -create_datasets( - base_name = "offline_tweet_recommendations_ftr_adhoc", - key_type = "Long", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.MultiTypeGraphInjections.tweetRecommendationsInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.CandidateTweetsList", - scala_dependencies = [ - "src/scala/com/twitter/scalding_internal/multiformat/format", - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "offline_tweet_recommendations_decayed_sum", - key_type = "Long", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.MultiTypeGraphInjections.tweetRecommendationsInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.CandidateTweetsList", - scala_dependencies = [ - "src/scala/com/twitter/scalding_internal/multiformat/format", - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "offline_tweet_recommendations_ftrat5_pop_biased_1000", - key_type = "Long", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.MultiTypeGraphInjections.tweetRecommendationsInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.CandidateTweetsList", - scala_dependencies = [ - "src/scala/com/twitter/scalding_internal/multiformat/format", - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "offline_tweet_recommendations_ftrat5_pop_biased_10000", - key_type = "Long", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.MultiTypeGraphInjections.tweetRecommendationsInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.CandidateTweetsList", - scala_dependencies = [ - "src/scala/com/twitter/scalding_internal/multiformat/format", - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -jvm_binary( - name = "ftr-tweet-index-generation-adhoc-job", - main = "com.twitter.simclusters_v2.scio.bq_generation.ftr_tweet.FTRClusterToTweetIndexGenerationAdhoc", - dependencies = [ - ":ftr_bq_generation", - ], -) - -jvm_binary( - name = "oon-ftr-tweet-index-generation-adhoc-job", - main = "com.twitter.simclusters_v2.scio.bq_generation.ftr_tweet.OONFTRClusterToTweetIndexGenerationAdhoc", - dependencies = [ - ":ftr_bq_generation", - ], -) - -jvm_binary( - name = "ftr-tweet-index-generation-pop1000-rnkdecay11-job", - main = 
"com.twitter.simclusters_v2.scio.bq_generation.ftr_tweet.FTRPop1000RankDecay11ClusterToTweetIndexGenerationBatch", - dependencies = [ - ":ftr_bq_generation", - ], -) - -jvm_binary( - name = "ftr-tweet-index-generation-pop10000-rnkdecay11-job", - main = "com.twitter.simclusters_v2.scio.bq_generation.ftr_tweet.FTRPop10000RankDecay11ClusterToTweetIndexGenerationBatch", - dependencies = [ - ":ftr_bq_generation", - ], -) - -jvm_binary( - name = "oon-ftr-tweet-index-generation-pop1000-rnkdecay-job", - main = "com.twitter.simclusters_v2.scio.bq_generation.ftr_tweet.OONFTRPop1000RankDecayClusterToTweetIndexGenerationBatch", - dependencies = [ - ":ftr_bq_generation", - ], -) - -jvm_binary( - name = "ftr-tweet-index-generation-decayed-sum-job", - main = "com.twitter.simclusters_v2.scio.bq_generation.ftr_tweet.DecayedSumClusterToTweetIndexGenerationBatch", - dependencies = [ - ":ftr_bq_generation", - ], -) - -create_datasets( - base_name = "simclusters_ftr_adhoc_cluster_to_tweet_index", - key_type = "com.twitter.simclusters_v2.thriftscala.FullClusterId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ClusterTopTweetsInjection.clusterIdToTopKTweetsInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.TopKTweetsWithScores", - scala_dependencies = [ - "src/scala/com/twitter/scalding_internal/multiformat/format", - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "simclusters_oon_ftr_adhoc_cluster_to_tweet_index", - key_type = "com.twitter.simclusters_v2.thriftscala.FullClusterId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ClusterTopTweetsInjection.clusterIdToTopKTweetsInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.TopKTweetsWithScores", - scala_dependencies = [ - "src/scala/com/twitter/scalding_internal/multiformat/format", - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "simclusters_ftr_pop1000_rnkdecay11_cluster_to_tweet_index", - key_type = "com.twitter.simclusters_v2.thriftscala.FullClusterId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ClusterTopTweetsInjection.clusterIdToTopKTweetsInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.TopKTweetsWithScores", - scala_dependencies = [ - "src/scala/com/twitter/scalding_internal/multiformat/format", - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "simclusters_ftr_pop10000_rnkdecay11_cluster_to_tweet_index", - key_type = "com.twitter.simclusters_v2.thriftscala.FullClusterId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ClusterTopTweetsInjection.clusterIdToTopKTweetsInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.TopKTweetsWithScores", - scala_dependencies = [ - "src/scala/com/twitter/scalding_internal/multiformat/format", - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "simclusters_oon_ftr_pop1000_rnkdecay_cluster_to_tweet_index", - key_type = 
"com.twitter.simclusters_v2.thriftscala.FullClusterId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ClusterTopTweetsInjection.clusterIdToTopKTweetsInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.TopKTweetsWithScores", - scala_dependencies = [ - "src/scala/com/twitter/scalding_internal/multiformat/format", - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) - -create_datasets( - base_name = "simclusters_decayed_sum_cluster_to_tweet_index", - key_type = "com.twitter.simclusters_v2.thriftscala.FullClusterId", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.ClusterTopTweetsInjection.clusterIdToTopKTweetsInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.TopKTweetsWithScores", - scala_dependencies = [ - "src/scala/com/twitter/scalding_internal/multiformat/format", - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/BUILD.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/BUILD.docx new file mode 100644 index 000000000..71ec3f8be Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/Config.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/Config.docx new file mode 100644 index 000000000..180ea5daa Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/Config.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/Config.scala b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/Config.scala deleted file mode 100644 index f1b747c86..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/Config.scala +++ /dev/null @@ -1,43 +0,0 @@ -package com.twitter.simclusters_v2.scio.bq_generation.ftr_tweet - -object Config { - // Variables for MH output path - val FTRRootMHPath: String = "manhattan_sequence_files/ftr_tweet_embedding/" - val FTRAdhocpath: String = "adhoc/ftr_tweet_embedding/" - val IIKFFTRAdhocANNOutputPath: String = "ftr_tweets_test/your_ldap_test" - val IIKFFTRAt5Pop1000ANNOutputPath: String = "ftr_tweets/ftr_at_5_pop_biased_1000" - val IIKFFTRAt5Pop10000ANNOutputPath: String = "ftr_tweets/ftr_at_5_pop_biased_10000" - val IIKFDecayedSumANNOutputPath: String = "ftr_tweets/decayed_sum" - - val DecayedSumClusterToTweetIndexOutputPath = "ftr_cluster_to_tweet/decayed_sum" - - val FTRPop1000RankDecay11ClusterToTweetIndexOutputPath = - "ftr_cluster_to_tweet/ftr_pop1000_rnkdecay11" - val FTRPop10000RankDecay11ClusterToTweetIndexOutputPath = - "ftr_cluster_to_tweet/ftr_pop10000_rnkdecay11" - val OONFTRPop1000RankDecayClusterToTweetIndexOutputPath = - "oon_ftr_cluster_to_tweet/oon_ftr_pop1000_rnkdecay" - - // Variables for tweet embeddings generation - val TweetSampleRate = 1 // 100% sample rate - val EngSampleRate = 1 // engagement from 50% of users - val MinTweetFavs = 8 // min favs for tweets - val MinTweetImps = 50 // min impressions for tweets - val MaxTweetFTR = 0.5 // maximum tweet FTR, a way to combat spammy tweets - val MaxUserLogNImps = 5 // maximum number of impressions 1e5 for users - val 
MaxUserLogNFavs = 4 // maximum number of favs 1e4 for users - val MaxUserFTR = 0.3 // maximum user FTR, a way to combat accounts that fav everything - - val SimClustersTweetEmbeddingsGenerationHalfLife: Int = 28800000 // 8hrs in ms - val SimClustersTweetEmbeddingsGenerationEmbeddingLength = 15 - - // Variables for BQ ANN - val SimClustersANNTopNClustersPerSourceEmbedding: Int = 20 - val SimClustersANNTopMTweetsPerCluster: Int = 50 - val SimClustersANNTopKTweetsPerUserRequest: Int = 200 - - // Cluster-to-tweet index configs - val clusterTopKTweets: Int = 2000 - val maxTweetAgeHours: Int = 24 - val TweetEmbeddingHalfLife: Int = 28800000 // for usage in DecayedValue struct -} diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/FTRJob.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/FTRJob.docx new file mode 100644 index 000000000..ba60c729b Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/FTRJob.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/FTRJob.scala b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/FTRJob.scala deleted file mode 100644 index a6027d3e4..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/FTRJob.scala +++ /dev/null @@ -1,242 +0,0 @@ -package com.twitter.simclusters_v2.scio.bq_generation -package ftr_tweet - -import com.google.api.services.bigquery.model.TimePartitioning -import com.spotify.scio.ScioContext -import com.spotify.scio.coders.Coder -import com.twitter.beam.io.dal.DAL -import com.twitter.beam.io.fs.multiformat.PathLayout -import com.twitter.beam.job.DateRangeOptions -import com.twitter.conversions.DurationOps.richDurationFromInt -import com.twitter.dal.client.dataset.KeyValDALDataset -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.scio_internal.coders.ThriftStructLazyBinaryScroogeCoder -import com.twitter.scio_internal.job.ScioBeamJob -import com.twitter.scrooge.ThriftStruct -import com.twitter.simclusters_v2.scio.bq_generation.common.BQTableDetails -import com.twitter.simclusters_v2.scio.bq_generation.common.BQGenerationUtil.getInterestedIn2020SQL -import com.twitter.simclusters_v2.thriftscala.CandidateTweets -import com.twitter.simclusters_v2.thriftscala.CandidateTweetsList -import com.twitter.tcdc.bqblaster.beam.syntax._ -import com.twitter.tcdc.bqblaster.core.avro.TypedProjection -import com.twitter.tcdc.bqblaster.core.transform.RootTransform -import java.time.Instant -import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO -import com.twitter.simclusters_v2.thriftscala.CandidateTweet -import org.apache.avro.generic.GenericData -import scala.collection.mutable.ListBuffer -import org.apache.beam.sdk.io.gcp.bigquery.SchemaAndRecord -import org.apache.beam.sdk.transforms.SerializableFunction -import org.apache.avro.generic.GenericRecord -import com.twitter.wtf.beam.bq_embedding_export.BQQueryUtils - -trait FTRJob extends ScioBeamJob[DateRangeOptions] { - // Configs to set for different type of embeddings and jobs - val isAdhoc: Boolean - val outputTable: BQTableDetails - val keyValDatasetOutputPath: String - val tweetRecommentationsSnapshotDataset: KeyValDALDataset[KeyVal[Long, CandidateTweetsList]] - val scoreKey: String - val scoreColumn: String - - // Base configs - val projectId = "twttr-recos-ml-prod" - val environment: DAL.Env = if (isAdhoc) DAL.Environment.Dev else DAL.Environment.Prod - - override implicit def scroogeCoder[T <: 
ThriftStruct: Manifest]: Coder[T] = - ThriftStructLazyBinaryScroogeCoder.scroogeCoder - - override def configurePipeline(sc: ScioContext, opts: DateRangeOptions): Unit = { - // The time when the job is scheduled - val queryTimestamp = opts.interval.getEnd - - // Parse tweetId candidates column - def parseTweetIdColumn( - genericRecord: GenericRecord, - columnName: String - ): List[CandidateTweet] = { - val tweetIds: GenericData.Array[GenericRecord] = - genericRecord.get(columnName).asInstanceOf[GenericData.Array[GenericRecord]] - val results: ListBuffer[CandidateTweet] = new ListBuffer[CandidateTweet]() - tweetIds.forEach((sc: GenericRecord) => { - results += CandidateTweet( - tweetId = sc.get("tweetId").toString.toLong, - score = Some(sc.get("cosineSimilarityScore").toString.toDouble) - ) - }) - results.toList - } - - //Function that parses the GenericRecord results we read from BQ - val parseUserToTweetRecommendationsFunc = - new SerializableFunction[SchemaAndRecord, UserToTweetRecommendations] { - override def apply(record: SchemaAndRecord): UserToTweetRecommendations = { - val genericRecord: GenericRecord = record.getRecord - UserToTweetRecommendations( - userId = genericRecord.get("userId").toString.toLong, - tweetCandidates = parseTweetIdColumn(genericRecord, "tweets"), - ) - } - } - - val tweetEmbeddingTemplateVariables = - Map( - "START_TIME" -> queryTimestamp.minusDays(1).toString(), - "END_TIME" -> queryTimestamp.toString(), - "TWEET_SAMPLE_RATE" -> Config.TweetSampleRate.toString, - "ENG_SAMPLE_RATE" -> Config.EngSampleRate.toString, - "MIN_TWEET_FAVS" -> Config.MinTweetFavs.toString, - "MIN_TWEET_IMPS" -> Config.MinTweetImps.toString, - "MAX_TWEET_FTR" -> Config.MaxTweetFTR.toString, - "MAX_USER_LOG_N_IMPS" -> Config.MaxUserLogNImps.toString, - "MAX_USER_LOG_N_FAVS" -> Config.MaxUserLogNFavs.toString, - "MAX_USER_FTR" -> Config.MaxUserFTR.toString, - "TWEET_EMBEDDING_LENGTH" -> Config.SimClustersTweetEmbeddingsGenerationEmbeddingLength.toString, - "HALFLIFE" -> Config.SimClustersTweetEmbeddingsGenerationHalfLife.toString, - "SCORE_COLUMN" -> scoreColumn, - "SCORE_KEY" -> scoreKey, - ) - - val tweetEmbeddingSql = BQQueryUtils.getBQQueryFromSqlFile( - "/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/sql/ftr_tweet_embeddings.sql", - tweetEmbeddingTemplateVariables) - val consumerEmbeddingSql = getInterestedIn2020SQL(queryTimestamp, 14) - - val tweetRecommendationsTemplateVariables = - Map( - "CONSUMER_EMBEDDINGS_SQL" -> consumerEmbeddingSql, - "TWEET_EMBEDDINGS_SQL" -> tweetEmbeddingSql, - "TOP_N_CLUSTER_PER_SOURCE_EMBEDDING" -> Config.SimClustersANNTopNClustersPerSourceEmbedding.toString, - "TOP_M_TWEETS_PER_CLUSTER" -> Config.SimClustersANNTopMTweetsPerCluster.toString, - "TOP_K_TWEETS_PER_USER_REQUEST" -> Config.SimClustersANNTopKTweetsPerUserRequest.toString, - ) - val tweetRecommendationsSql = BQQueryUtils.getBQQueryFromSqlFile( - "/com/twitter/simclusters_v2/scio/bq_generation/sql/tweets_ann.sql", - tweetRecommendationsTemplateVariables) - - val tweetRecommendations = sc.customInput( - s"SimClusters FTR BQ ANN", - BigQueryIO - .read(parseUserToTweetRecommendationsFunc) - .fromQuery(tweetRecommendationsSql) - .usingStandardSql() - ) - - //Setup BQ writer - val ingestionTime = opts.getDate().value.getEnd.toDate - val bqFieldsTransform = RootTransform - .Builder() - .withPrependedFields("ingestionTime" -> TypedProjection.fromConstant(ingestionTime)) - val timePartitioning = new TimePartitioning() - 
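// The output BQ table is hour-partitioned on ingestionTime; partitions expire after 3 days.
- 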
.setType("HOUR").setField("ingestionTime").setExpirationMs(3.days.inMilliseconds) - val bqWriter = BigQueryIO - .write[CandidateTweets] - .to(outputTable.toString) - .withExtendedErrorInfo() - .withTimePartitioning(timePartitioning) - .withLoadJobProjectId(projectId) - .withThriftSupport(bqFieldsTransform.build(), AvroConverter.Legacy) - .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED) - .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND) - - // Save Tweet ANN results to BQ - tweetRecommendations - .map { userToTweetRecommendations => - { - CandidateTweets( - targetUserId = userToTweetRecommendations.userId, - recommendedTweets = userToTweetRecommendations.tweetCandidates) - } - } - .saveAsCustomOutput(s"WriteToBQTable - $outputTable", bqWriter) - - val RootMHPath: String = Config.FTRRootMHPath - val AdhocRootPath = Config.FTRAdhocpath - - // Save Tweet ANN results as KeyValSnapshotDataset - tweetRecommendations - .map { userToTweetRecommendations => - KeyVal( - userToTweetRecommendations.userId, - CandidateTweetsList(userToTweetRecommendations.tweetCandidates)) - }.saveAsCustomOutput( - name = "WriteFtrTweetRecommendationsToKeyValDataset", - DAL.writeVersionedKeyVal( - tweetRecommentationsSnapshotDataset, - PathLayout.VersionedPath(prefix = - ((if (!isAdhoc) - RootMHPath - else - AdhocRootPath) - + keyValDatasetOutputPath)), - instant = Instant.ofEpochMilli(opts.interval.getEndMillis - 1L), - environmentOverride = environment, - ) - ) - } - -} - -object FTRAdhocJob extends FTRJob { - override val isAdhoc = true - override val outputTable: BQTableDetails = - BQTableDetails("twttr-recos-ml-prod", "simclusters", "offline_tweet_recommendations_ftr_adhoc") - override val keyValDatasetOutputPath = Config.IIKFFTRAdhocANNOutputPath - - override val tweetRecommentationsSnapshotDataset: KeyValDALDataset[ - KeyVal[Long, CandidateTweetsList] - ] = - OfflineTweetRecommendationsFtrAdhocScalaDataset - override val scoreColumn = "ftrat5_decayed_pop_bias_1000_rank_decay_1_1_embedding" - override val scoreKey = "ftrat5_decayed_pop_bias_1000_rank_decay_1_1" -} - -object IIKF2020DecayedSumBatchJobProd extends FTRJob { - override val isAdhoc = false - override val outputTable: BQTableDetails = BQTableDetails( - "twttr-bq-cassowary-prod", - "user", - "offline_tweet_recommendations_decayed_sum" - ) - override val keyValDatasetOutputPath = Config.IIKFDecayedSumANNOutputPath - override val tweetRecommentationsSnapshotDataset: KeyValDALDataset[ - KeyVal[Long, CandidateTweetsList] - ] = - OfflineTweetRecommendationsDecayedSumScalaDataset - override val scoreColumn = "dec_sum_logfavScoreClusterNormalizedOnly_embedding" - override val scoreKey = "dec_sum_logfavScoreClusterNormalizedOnly" -} - -object IIKF2020FTRAt5Pop1000batchJobProd extends FTRJob { - override val isAdhoc = false - override val outputTable: BQTableDetails = BQTableDetails( - "twttr-bq-cassowary-prod", - "user", - "offline_tweet_recommendations_ftrat5_pop_biased_1000") - override val keyValDatasetOutputPath = Config.IIKFFTRAt5Pop1000ANNOutputPath - override val tweetRecommentationsSnapshotDataset: KeyValDALDataset[ - KeyVal[Long, CandidateTweetsList] - ] = - OfflineTweetRecommendationsFtrat5PopBiased1000ScalaDataset - override val scoreColumn = "ftrat5_decayed_pop_bias_1000_rank_decay_1_1_embedding" - override val scoreKey = "ftrat5_decayed_pop_bias_1000_rank_decay_1_1" -} - -object IIKF2020FTRAt5Pop10000batchJobProd extends FTRJob { - override val isAdhoc = false - override val outputTable: BQTableDetails 
= BQTableDetails( - "twttr-bq-cassowary-prod", - "user", - "offline_tweet_recommendations_ftrat5_pop_biased_10000") - override val keyValDatasetOutputPath = Config.IIKFFTRAt5Pop10000ANNOutputPath - override val tweetRecommentationsSnapshotDataset: KeyValDALDataset[ - KeyVal[Long, CandidateTweetsList] - ] = - OfflineTweetRecommendationsFtrat5PopBiased10000ScalaDataset - override val scoreColumn = "ftrat5_decayed_pop_bias_10000_rank_decay_1_1_embedding" - override val scoreKey = "ftrat5_decayed_pop_bias_10000_rank_decay_1_1" -} - -case class UserToTweetRecommendations( - userId: Long, - tweetCandidates: List[CandidateTweet]) diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/FtrClusterToTweetIndexGenerationJob.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/FtrClusterToTweetIndexGenerationJob.docx new file mode 100644 index 000000000..0de5f6289 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/FtrClusterToTweetIndexGenerationJob.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/FtrClusterToTweetIndexGenerationJob.scala b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/FtrClusterToTweetIndexGenerationJob.scala deleted file mode 100644 index d7560be53..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/FtrClusterToTweetIndexGenerationJob.scala +++ /dev/null @@ -1,264 +0,0 @@ -package com.twitter.simclusters_v2 -package scio.bq_generation.ftr_tweet - -import com.google.api.services.bigquery.model.TimePartitioning -import com.twitter.conversions.DurationOps.richDurationFromInt -import com.spotify.scio.ScioContext -import com.spotify.scio.coders.Coder -import com.twitter.beam.io.dal.DAL -import com.twitter.beam.io.dal.DAL.PathLayout -import com.twitter.simclusters_v2.scio.bq_generation.common.IndexGenerationUtil.parseClusterTopKTweetsFn -import java.time.Instant -import com.twitter.beam.job.DateRangeOptions -import com.twitter.dal.client.dataset.KeyValDALDataset -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.scio_internal.coders.ThriftStructLazyBinaryScroogeCoder -import com.twitter.scio_internal.job.ScioBeamJob -import com.twitter.scrooge.ThriftStruct -import com.twitter.simclusters_v2.scio.bq_generation.common.BQTableDetails -import com.twitter.simclusters_v2.thriftscala.ClusterIdToTopKTweetsWithScores -import com.twitter.simclusters_v2.thriftscala.FullClusterId -import com.twitter.simclusters_v2.thriftscala.TopKTweetsWithScores -import com.twitter.tcdc.bqblaster.beam.syntax._ -import com.twitter.tcdc.bqblaster.core.avro.TypedProjection -import com.twitter.tcdc.bqblaster.core.transform.RootTransform -import com.twitter.wtf.beam.bq_embedding_export.BQQueryUtils -import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO - -trait FTRClusterToTweetIndexGenerationJob extends ScioBeamJob[DateRangeOptions] { - val isAdhoc: Boolean - - val outputTable: BQTableDetails - val keyValDatasetOutputPath: String - val clusterToTweetIndexSnapshotDataset: KeyValDALDataset[ - KeyVal[FullClusterId, TopKTweetsWithScores] - ] - - // Base configs - val projectId = "twttr-recos-ml-prod" - val environment: DAL.Env = if (isAdhoc) DAL.Environment.Dev else DAL.Environment.Prod - - // Variables for Tweet Embedding SQL - val scoreKey: String - val scoreColumn: String - - // Variables for spam treatment - val maxTweetFTR: Double - val maxUserFTR: Double - - // Tweet embeddings parameters - val 
tweetEmbeddingsLength: Int = Config.SimClustersTweetEmbeddingsGenerationEmbeddingLength - - // Clusters-to-tweet index parameters - val clusterTopKTweets: Int = Config.clusterTopKTweets - val maxTweetAgeHours: Int = Config.maxTweetAgeHours - - override implicit def scroogeCoder[T <: ThriftStruct: Manifest]: Coder[T] = - ThriftStructLazyBinaryScroogeCoder.scroogeCoder - - override def configurePipeline(sc: ScioContext, opts: DateRangeOptions): Unit = { - // The time when the job is scheduled - val queryTimestamp = opts.interval.getEnd - - val tweetEmbeddingTemplateVariables = - Map( - "START_TIME" -> queryTimestamp.minusDays(1).toString(), - "END_TIME" -> queryTimestamp.toString(), - "TWEET_SAMPLE_RATE" -> Config.TweetSampleRate.toString, - "ENG_SAMPLE_RATE" -> Config.EngSampleRate.toString, - "MIN_TWEET_FAVS" -> Config.MinTweetFavs.toString, - "MIN_TWEET_IMPS" -> Config.MinTweetImps.toString, - "MAX_TWEET_FTR" -> maxTweetFTR.toString, - "MAX_USER_LOG_N_IMPS" -> Config.MaxUserLogNImps.toString, - "MAX_USER_LOG_N_FAVS" -> Config.MaxUserLogNFavs.toString, - "MAX_USER_FTR" -> maxUserFTR.toString, - "TWEET_EMBEDDING_LENGTH" -> Config.SimClustersTweetEmbeddingsGenerationEmbeddingLength.toString, - "HALFLIFE" -> Config.SimClustersTweetEmbeddingsGenerationHalfLife.toString, - "SCORE_COLUMN" -> scoreColumn, - "SCORE_KEY" -> scoreKey, - ) - val tweetEmbeddingSql = BQQueryUtils.getBQQueryFromSqlFile( - "/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/sql/ftr_tweet_embeddings.sql", - tweetEmbeddingTemplateVariables) - - val clusterTopTweetsTemplateVariables = - Map( - "CLUSTER_TOP_K_TWEETS" -> Config.clusterTopKTweets.toString, - "TWEET_EMBEDDING_SQL" -> tweetEmbeddingSql - ) - - val clusterTopTweetsSql = BQQueryUtils.getBQQueryFromSqlFile( - "/com/twitter/simclusters_v2/scio/bq_generation/sql/cluster_top_tweets.sql", - clusterTopTweetsTemplateVariables - ) - - // Generate SimClusters cluster-to-tweet index - val topKtweetsForClusterKey = sc.customInput( - s"SimClusters cluster-to-tweet index generation BQ job", - BigQueryIO - .read(parseClusterTopKTweetsFn(Config.TweetEmbeddingHalfLife)) - .fromQuery(clusterTopTweetsSql) - .usingStandardSql() - ) - - // Setup BQ writer - val ingestionTime = opts.getDate().value.getEnd.toDate - val bqFieldsTransform = RootTransform - .Builder() - .withPrependedFields("dateHour" -> TypedProjection.fromConstant(ingestionTime)) - val timePartitioning = new TimePartitioning() - .setType("HOUR").setField("dateHour").setExpirationMs(3.days.inMilliseconds) - val bqWriter = BigQueryIO - .write[ClusterIdToTopKTweetsWithScores] - .to(outputTable.toString) - .withExtendedErrorInfo() - .withTimePartitioning(timePartitioning) - .withLoadJobProjectId(projectId) - .withThriftSupport(bqFieldsTransform.build(), AvroConverter.Legacy) - .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED) - .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND) - - // Save SimClusters index to a BQ table - topKtweetsForClusterKey - .map { clusterIdToTopKTweets => - { - ClusterIdToTopKTweetsWithScores( - clusterId = clusterIdToTopKTweets.clusterId, - topKTweetsWithScores = clusterIdToTopKTweets.topKTweetsWithScores - ) - } - } - .saveAsCustomOutput(s"WriteToBQTable - $outputTable", bqWriter) - - // Save SimClusters index as a KeyValSnapshotDataset - topKtweetsForClusterKey - .map { clusterIdToTopKTweets => - KeyVal(clusterIdToTopKTweets.clusterId, clusterIdToTopKTweets.topKTweetsWithScores) - }.saveAsCustomOutput( - name = 
s"WriteClusterToKeyIndexToKeyValDataset at $keyValDatasetOutputPath", - DAL.writeVersionedKeyVal( - clusterToTweetIndexSnapshotDataset, - PathLayout.VersionedPath(prefix = - ((if (!isAdhoc) - Config.FTRRootMHPath - else - Config.FTRAdhocpath) - + keyValDatasetOutputPath)), - instant = Instant.ofEpochMilli(opts.interval.getEndMillis - 1L), - environmentOverride = environment, - ) - ) - } -} - -object FTRClusterToTweetIndexGenerationAdhoc extends FTRClusterToTweetIndexGenerationJob { - override val isAdhoc: Boolean = true - override val outputTable: BQTableDetails = - BQTableDetails( - "twttr-recos-ml-prod", - "simclusters", - "simcluster_adhoc_test_cluster_to_tweet_index") - override val keyValDatasetOutputPath: String = - "ftr_tweets_adhoc/ftr_cluster_to_tweet_adhoc" - override val clusterToTweetIndexSnapshotDataset: KeyValDALDataset[ - KeyVal[FullClusterId, TopKTweetsWithScores] - ] = SimclustersFtrAdhocClusterToTweetIndexScalaDataset - override val scoreColumn = "ftrat5_decayed_pop_bias_1000_rank_decay_1_1_embedding" - override val scoreKey = "ftrat5_decayed_pop_bias_1000_rank_decay_1_1" - override val maxUserFTR: Double = Config.MaxUserFTR - override val maxTweetFTR: Double = Config.MaxTweetFTR - -} - -object OONFTRClusterToTweetIndexGenerationAdhoc extends FTRClusterToTweetIndexGenerationJob { - override val isAdhoc: Boolean = true - override val outputTable: BQTableDetails = - BQTableDetails( - "twttr-recos-ml-prod", - "simclusters", - "simcluster_adhoc_test_cluster_to_tweet_index") - override val keyValDatasetOutputPath: String = - "oon_ftr_tweets_adhoc/oon_ftr_cluster_to_tweet_adhoc" - override val clusterToTweetIndexSnapshotDataset: KeyValDALDataset[ - KeyVal[FullClusterId, TopKTweetsWithScores] - ] = SimclustersOonFtrAdhocClusterToTweetIndexScalaDataset - override val scoreColumn = "oon_ftrat5_decayed_pop_bias_1000_rank_decay_embedding" - override val scoreKey = "oon_ftrat5_decayed_pop_bias_1000_rank_decay" - override val maxUserFTR: Double = Config.MaxUserFTR - override val maxTweetFTR: Double = Config.MaxTweetFTR -} - -object FTRPop1000RankDecay11ClusterToTweetIndexGenerationBatch - extends FTRClusterToTweetIndexGenerationJob { - override val isAdhoc: Boolean = false - override val outputTable: BQTableDetails = - BQTableDetails( - "twttr-bq-cassowary-prod", - "user", - "simclusters_ftr_pop1000_rnkdecay11_cluster_to_tweet_index") - override val keyValDatasetOutputPath: String = - Config.FTRPop1000RankDecay11ClusterToTweetIndexOutputPath - override val clusterToTweetIndexSnapshotDataset: KeyValDALDataset[ - KeyVal[FullClusterId, TopKTweetsWithScores] - ] = SimclustersFtrPop1000Rnkdecay11ClusterToTweetIndexScalaDataset - override val scoreColumn = "ftrat5_decayed_pop_bias_1000_rank_decay_1_1_embedding" - override val scoreKey = "ftrat5_decayed_pop_bias_1000_rank_decay_1_1" - override val maxUserFTR: Double = Config.MaxUserFTR - override val maxTweetFTR: Double = Config.MaxTweetFTR -} - -object FTRPop10000RankDecay11ClusterToTweetIndexGenerationBatch - extends FTRClusterToTweetIndexGenerationJob { - override val isAdhoc: Boolean = false - override val outputTable: BQTableDetails = - BQTableDetails( - "twttr-bq-cassowary-prod", - "user", - "simclusters_ftr_pop10000_rnkdecay11_cluster_to_tweet_index") - override val keyValDatasetOutputPath: String = - Config.FTRPop10000RankDecay11ClusterToTweetIndexOutputPath - override val clusterToTweetIndexSnapshotDataset: KeyValDALDataset[ - KeyVal[FullClusterId, TopKTweetsWithScores] - ] = 
SimclustersFtrPop10000Rnkdecay11ClusterToTweetIndexScalaDataset
- override val scoreColumn = "ftrat5_decayed_pop_bias_10000_rank_decay_1_1_embedding"
- override val scoreKey = "ftrat5_decayed_pop_bias_10000_rank_decay_1_1"
- override val maxUserFTR: Double = Config.MaxUserFTR
- override val maxTweetFTR: Double = Config.MaxTweetFTR
-}
-
-object OONFTRPop1000RankDecayClusterToTweetIndexGenerationBatch
- extends FTRClusterToTweetIndexGenerationJob {
- override val isAdhoc: Boolean = false
- override val outputTable: BQTableDetails =
- BQTableDetails(
- "twttr-bq-cassowary-prod",
- "user",
- "simclusters_oon_ftr_pop1000_rnkdecay_cluster_to_tweet_index")
- override val keyValDatasetOutputPath: String =
- Config.OONFTRPop1000RankDecayClusterToTweetIndexOutputPath
- override val clusterToTweetIndexSnapshotDataset: KeyValDALDataset[
- KeyVal[FullClusterId, TopKTweetsWithScores]
- ] = SimclustersOonFtrPop1000RnkdecayClusterToTweetIndexScalaDataset
- override val scoreColumn = "oon_ftrat5_decayed_pop_bias_1000_rank_decay_embedding"
- override val scoreKey = "oon_ftrat5_decayed_pop_bias_1000_rank_decay"
- override val maxUserFTR: Double = Config.MaxUserFTR
- override val maxTweetFTR: Double = Config.MaxTweetFTR
-}
-
-object DecayedSumClusterToTweetIndexGenerationBatch extends FTRClusterToTweetIndexGenerationJob {
- override val isAdhoc: Boolean = false
- override val outputTable: BQTableDetails =
- BQTableDetails(
- "twttr-bq-cassowary-prod",
- "user",
- "simclusters_decayed_sum_cluster_to_tweet_index")
- override val keyValDatasetOutputPath: String =
- Config.DecayedSumClusterToTweetIndexOutputPath
- override val clusterToTweetIndexSnapshotDataset: KeyValDALDataset[
- KeyVal[FullClusterId, TopKTweetsWithScores]
- ] = SimclustersDecayedSumClusterToTweetIndexScalaDataset
- override val scoreColumn = "dec_sum_logfavScoreClusterNormalizedOnly_embedding"
- override val scoreKey = "dec_sum_logfavScoreClusterNormalizedOnly"
- override val maxUserFTR = 1.0
- override val maxTweetFTR = 1.0
-}
diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/README.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/README.docx
new file mode 100644
index 000000000..6eec7e1e9
Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/README.docx differ
diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/README.md b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/README.md
deleted file mode 100644
index 4d9e7d081..000000000
--- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/README.md
+++ /dev/null
@@ -1,212 +0,0 @@
-# FTR Tweet embeddings
-
-export GCP_PROJECT_NAME='twttr-recos-ml-prod'
-
-## Running Adhoc jobs
-### Base ftrat5
-```
-rm dist/ftr-tweet-adhoc-job-bundle/ftr-tweet-adhoc-job.jar
-./bazel bundle src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet:ftr-tweet-adhoc-job && \
-bin/d6w create \
-${GCP_PROJECT_NAME}/us-central1/ftr-tweets-ann-adhoc-job \
-src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/ftr-tweets-ann-adhoc-job.d6w \
---jar dist/ftr-tweet-adhoc-job-bundle/ftr-tweet-adhoc-job.jar \
---bind=profile.project=${GCP_PROJECT_NAME} \
---bind=profile.user_name=your_ldap \
---bind=profile.build_target="src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet:ftr-tweet-index-generation-adhoc-job" \
---bind=profile.date="2022-08-26T12" \
---bind=profile.machine="n2-standard-2" \
---bind=profile.job_name="ftr-tweets-ann-adhoc-job" --ignore-existing -``` -### ClusterToTweet Index with base ftrat5 -``` -export GCP_PROJECT_NAME='twttr-recos-ml-prod' - -rm dist/ftr-tweet-index-generation-adhoc-job-bundle/ftr-tweet-index-generation-adhoc-job.jar -./bazel bundle src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet:ftr-tweet-index-generation-adhoc-job && \ -bin/d6w create \ -${GCP_PROJECT_NAME}/us-central1/ftr-tweet-index-generation-adhoc-job \ -src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/ftr-based-simclusters-index-generation-job.d6w \ ---jar dist/ftr-tweet-index-generation-adhoc-job-bundle/ftr-tweet-index-generation-adhoc-job.jar \ ---bind=profile.project=${GCP_PROJECT_NAME} \ ---bind=profile.user_name=your_ldap \ ---bind=profile.date="2022-08-27T12" \ ---bind=profile.machine="n2-standard-2" \ ---bind=profile.build_target="src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet:ftr-tweet-index-generation-adhoc-job" \ ---bind=profile.job_name="ftr-tweet-index-generation-adhoc-job" --ignore-existing -``` - -### OON ftrat5 -``` -rm dist/oon-ftr-tweet-index-generation-adhoc-job-bundle/oon-ftr-tweet-index-generation-adhoc-job.jar -./bazel bundle src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet:oon-ftr-tweet-index-generation-adhoc-job && \ -bin/d6w create \ -${GCP_PROJECT_NAME}/us-central1/oon-ftr-ann-adhoc-job \ -src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/ftr-based-simclusters-index-generation-job.d6w \ ---jar dist/oon-ftr-tweet-index-generation-adhoc-job-bundle/oon-ftr-tweet-index-generation-adhoc-job.jar \ ---bind=profile.project=${GCP_PROJECT_NAME} \ ---bind=profile.user_name=${USER} \ ---bind=profile.build_target="src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet:oon-ftr-tweet-index-generation-adhoc-job" \ ---bind=profile.date="2022-09-21T12" \ ---bind=profile.machine="n2-standard-2" \ ---bind=profile.job_name="oon-ftr-ann-adhoc-job" --ignore-existing -``` - - -## Scheduling jobs -### decayed_sum_job -``` -export SERVICE_ACCOUNT='cassowary' -export GCP_PROJECT_NAME='twttr-recos-ml-prod' -export PROJECT_DATE='2022-07-24T16' - -bin/d6w schedule \ -${GCP_PROJECT_NAME}/us-central1/iikf2020-decayed-sum-ann-batch-job \ -src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/iikf2020-decayed-sum-ann-batch-job.d6w \ ---bind=profile.project=${GCP_PROJECT_NAME} \ ---bind=profile.user_name=${SERVICE_ACCOUNT} \ ---bind=profile.machine="n2-highmem-4" \ ---bind=profile.job_name="iikf2020-decayed-sum-ann-batch-job" \ ---bind=profile.date=${PROJECT_DATE} \ ---bind=profile.environment=prod -``` - -### ftrat5 pop1000 - -``` -export SERVICE_ACCOUNT='cassowary' -export GCP_PROJECT_NAME='twttr-recos-ml-prod' -export PROJECT_DATE='2022-07-24T17' - -bin/d6w schedule \ -${GCP_PROJECT_NAME}/us-central1/iikf2020-ftrat5-pop1000-ann-batch-job \ -src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/iikf2020-ftrat5-pop1000-ann-batch-job.d6w \ ---bind=profile.project=${GCP_PROJECT_NAME} \ ---bind=profile.user_name=${SERVICE_ACCOUNT} \ ---bind=profile.machine="n2-highmem-4" \ ---bind=profile.job_name="iikf2020-ftrat5-pop1000-ann-batch-job" \ ---bind=profile.date=${PROJECT_DATE} \ ---bind=profile.environment=prod -``` - - -### ftrat5 pop10000 -``` -export SERVICE_ACCOUNT='cassowary' -export GCP_PROJECT_NAME='twttr-recos-ml-prod' -export PROJECT_DATE='2022-07-24T18' - -bin/d6w schedule \ -${GCP_PROJECT_NAME}/us-central1/iikf2020-ftrat5-pop10000-ann-batch-job \ 
-src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/iikf2020-ftrat5-pop10000-ann-batch-job.d6w \
---bind=profile.project=${GCP_PROJECT_NAME} \
---bind=profile.user_name=${SERVICE_ACCOUNT} \
---bind=profile.machine="n2-highmem-4" \
---bind=profile.job_name="iikf2020-ftrat5-pop10000-ann-batch-job" \
---bind=profile.date=${PROJECT_DATE} \
---bind=profile.environment=prod
-```
-
-### Deschedule
-```
-export SERVICE_ACCOUNT='cassowary'
-
-aurora cron deschedule atla/${SERVICE_ACCOUNT}/prod/twttr-recos-ml-prod-us-central1-iikf2020-decayed-sum-ann-batch-job
-aurora cron deschedule atla/${SERVICE_ACCOUNT}/prod/twttr-recos-ml-prod-us-central1-iikf2020-ftrat5-pop1000-ann-batch-job
-aurora cron deschedule atla/${SERVICE_ACCOUNT}/prod/twttr-recos-ml-prod-us-central1-iikf2020-ftrat5-pop10000-ann-batch-job
-```
-
-### pop1000-rnkdecay11
-```
-export SERVICE_ACCOUNT='cassowary'
-export GCP_PROJECT_NAME='twttr-recos-ml-prod'
-export PROJECT_DATE='2022-08-27T16'
-
-bin/d6w schedule \
-${GCP_PROJECT_NAME}/us-central1/ftr-pop1000-rnkdecay11-tweet-index-generation-batch-job \
-src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/ftr-based-simclusters-index-generation-job.d6w \
---bind=profile.project=${GCP_PROJECT_NAME} \
---bind=profile.user_name=${SERVICE_ACCOUNT} \
---bind=profile.machine="n2-standard-2" \
---bind=profile.job_name="ftr-pop1000-rnkdecay11-tweet-index-generation-batch-job" \
---bind=profile.build_target="src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet:ftr-tweet-index-generation-pop1000-rnkdecay11-job" \
---bind=profile.date=${PROJECT_DATE} \
---bind=profile.environment=prod
-```
-
-### pop10000-rnkdecay11
-```
-export SERVICE_ACCOUNT='cassowary'
-export GCP_PROJECT_NAME='twttr-recos-ml-prod'
-export PROJECT_DATE='2022-08-27T16'
-
-bin/d6w schedule \
-${GCP_PROJECT_NAME}/us-central1/ftr-pop10000-rnkdecay11-tweet-index-generation-batch-job \
-src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/ftr-based-simclusters-index-generation-job.d6w \
---bind=profile.project=${GCP_PROJECT_NAME} \
---bind=profile.user_name=${SERVICE_ACCOUNT} \
---bind=profile.machine="n2-standard-2" \
---bind=profile.job_name="ftr-pop10000-rnkdecay11-tweet-index-generation-batch-job" \
---bind=profile.build_target="src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet:ftr-tweet-index-generation-pop10000-rnkdecay11-job" \
---bind=profile.date=${PROJECT_DATE} \
---bind=profile.environment=prod
-```
-
-### decayed_sum
-```
-export SERVICE_ACCOUNT='cassowary'
-export GCP_PROJECT_NAME='twttr-recos-ml-prod'
-export PROJECT_DATE='2022-09-05T16'
-
-bin/d6w schedule \
-${GCP_PROJECT_NAME}/us-central1/decayed-sum-tweet-index-generation-batch-job \
-src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/ftr-based-simclusters-index-generation-job.d6w \
---bind=profile.project=${GCP_PROJECT_NAME} \
---bind=profile.user_name=${SERVICE_ACCOUNT} \
---bind=profile.machine="n2-standard-2" \
---bind=profile.job_name="decayed-sum-tweet-index-generation-batch-job" \
---bind=profile.build_target="src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet:ftr-tweet-index-generation-decayed-sum-job" \
---bind=profile.date=${PROJECT_DATE} \
---bind=profile.environment=prod
-```
-
-
-### OON ftrat5
-```
-export SERVICE_ACCOUNT='cassowary'
-export GCP_PROJECT_NAME='twttr-recos-ml-prod'
-export PROJECT_DATE='2022-09-21T16'
-
-bin/d6w schedule \
-${GCP_PROJECT_NAME}/us-central1/oon-ftr-pop1000-rnkdecay-tweet-index-generation-batch-job \
-src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/ftr-based-simclusters-index-generation-job.d6w \
---bind=profile.project=${GCP_PROJECT_NAME} \
---bind=profile.user_name=${SERVICE_ACCOUNT} \
---bind=profile.machine="n2-standard-2" \
---bind=profile.job_name="oon-ftr-pop1000-rnkdecay-tweet-index-generation-batch-job" \
---bind=profile.build_target="src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet:oon-ftr-tweet-index-generation-pop1000-rnkdecay-job" \
---bind=profile.date=${PROJECT_DATE} \
---bind=profile.environment=prod
-```
-
-### Deschedule
-```
-export SERVICE_ACCOUNT='cassowary'
-
-aurora cron deschedule atla/${SERVICE_ACCOUNT}/prod/twttr-recos-ml-prod-us-central1-ftr-pop1000-rnkdecay11-tweet-index-generation-batch-job
-
-aurora cron deschedule atla/${SERVICE_ACCOUNT}/prod/twttr-recos-ml-prod-us-central1-ftr-pop10000-rnkdecay11-tweet-index-generation-batch-job
-
-aurora cron deschedule atla/${SERVICE_ACCOUNT}/prod/twttr-recos-ml-prod-us-central1-decayed-sum-tweet-index-generation-batch-job
-
-aurora cron deschedule atla/${SERVICE_ACCOUNT}/prod/twttr-recos-ml-prod-us-central1-oon-ftr-pop1000-rnkdecay-tweet-index-generation-batch-job
-```
diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/ftr-based-simclusters-index-generation-job.d6w b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/ftr-based-simclusters-index-generation-job.d6w
deleted file mode 100644
index 39b2f16bf..000000000
--- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/ftr-based-simclusters-index-generation-job.d6w
+++ /dev/null
@@ -1,44 +0,0 @@
-class Profile(Struct):
- project = Default(String, 'twttr-recos-ml-prod')
- date = Required(String)
- build_target = Required(String)
- job_name = Required(String)
- environment = Default(String, 'dev')
- machine = Default(String, 'n2-standard-2')
-
-SimClustersIndexGenerationJob = Job(
- name='{{profile.job_name}}',
- project='{{profile.project}}',
- staging_bucket='{{profile.project}}',
- service_account='{{profile.user_name}}-shdw@twttr-dp-svc-accounts.iam.gserviceaccount.com',
- region='us-central1',
- worker_config=WorkerConfig(
- num_workers=2,
- worker_machine_type='{{profile.machine}}',
- worker_disk_type=WorkerDiskType('HDD'),
- ),
- extra_args={
- "date": '{{profile.date}}'
- },
- service_identifier='twtr:svc:{{profile.user_name}}:{{profile.job_name}}:{{profile.environment}}:{{profile.cluster}}',
- deployment_config=BatchDeploymentConfig(
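- # Every field below is filled in from the --bind=profile.* values supplied at submit time.
- 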
role='{{profile.user_name}}', - environment='{{profile.environment}}', - build_target='{{profile.build_target}}', - gcp_deployment_credentials='/var/lib/tss/keys/{{profile.user_name}}/cloud/gcp/dp/shadow.json', - statebird_config=StatebirdConfig( - batch_width='PT2H', - first_time='{{profile.date}}', - ), - workflow_config=WorkflowConfig( - play=True, - ), - timeout='PT4H', - parallelism=1 - ) -) - -jobs=[SimClustersIndexGenerationJob.bind(profile=Profile())] - - - diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/ftr-based-simclusters-index-generation-job.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/ftr-based-simclusters-index-generation-job.docx new file mode 100644 index 000000000..f623642de Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/ftr-based-simclusters-index-generation-job.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/ftr-tweets-ann-adhoc-job.d6w b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/ftr-tweets-ann-adhoc-job.d6w deleted file mode 100644 index cb9afafca..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/ftr-tweets-ann-adhoc-job.d6w +++ /dev/null @@ -1,36 +0,0 @@ -class Profile(Struct): - project = Required(String) - date = Required(String) - build_target = Required(String) - environment = Default(String, 'dev') - job_name = Default(String, 'ftr-recs-d6w-job') - machine = Default(String, 'n2-highmem-4') - -job = Job( - name='{{profile.job_name}}', - project='{{profile.project}}', - staging_bucket='{{profile.project}}', - service_account='{{profile.user_name}}-shdw@twttr-dp-svc-accounts.iam.gserviceaccount.com', - region='us-central1', - worker_config=WorkerConfig( - num_workers=2, - worker_machine_type='{{profile.machine}}', - worker_disk_type=WorkerDiskType('HDD'), - ), - extra_args={ - "date": '{{profile.date}}' - }, - service_identifier='twtr:svc:{{profile.user_name}}:{{profile.job_name}}:{{profile.environment}}:{{profile.cluster}}', - deployment_config=BatchDeploymentConfig( - role='{{profile.user_name}}', - build_target='{{profile.build_target}}', - gcp_deployment_credentials='/var/lib/tss/keys/{{profile.user_name}}/cloud/gcp/dp/shadow.json', - statebird_config=StatebirdConfig( - batch_width='PT4H', - first_time='{{profile.date}}', - ), - timeout='PT8H' - ) -) - -jobs=[job] diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/ftr-tweets-ann-adhoc-job.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/ftr-tweets-ann-adhoc-job.docx new file mode 100644 index 000000000..4aa512973 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/ftr-tweets-ann-adhoc-job.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/iikf2020-decayed-sum-ann-batch-job.d6w b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/iikf2020-decayed-sum-ann-batch-job.d6w deleted file mode 100644 index 759d8e0c2..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/iikf2020-decayed-sum-ann-batch-job.d6w +++ /dev/null @@ -1,35 +0,0 @@ -class Profile(Struct): - project = Required(String) - date = Required(String) - environment = Default(String, 'dev') - job_name = Default(String, 'ftr-recs-d6w-job') - machine = Default(String, 'n2-highmem-4') - -job = Job( - name='{{profile.job_name}}', - project='{{profile.project}}', - 
staging_bucket='{{profile.project}}', - service_account='{{profile.user_name}}-shdw@twttr-dp-svc-accounts.iam.gserviceaccount.com', - region='us-central1', - worker_config=WorkerConfig( - num_workers=2, - worker_machine_type='{{profile.machine}}', - worker_disk_type=WorkerDiskType('HDD'), - ), - extra_args={ - "date": '{{profile.date}}' - }, - service_identifier='twtr:svc:{{profile.user_name}}:{{profile.job_name}}:{{profile.environment}}:{{profile.cluster}}', - deployment_config=BatchDeploymentConfig( - role='{{profile.user_name}}', - build_target='src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet:iikf2020-decayed-sum-batch-job', - gcp_deployment_credentials='/var/lib/tss/keys/{{profile.user_name}}/cloud/gcp/dp/shadow.json', - statebird_config=StatebirdConfig( - batch_width='PT4H', - first_time='{{profile.date}}', - ), - timeout='PT8H', - ) -) - -jobs=[job] diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/iikf2020-decayed-sum-ann-batch-job.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/iikf2020-decayed-sum-ann-batch-job.docx new file mode 100644 index 000000000..8a19359d2 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/iikf2020-decayed-sum-ann-batch-job.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/iikf2020-ftrat5-pop1000-ann-batch-job.d6w b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/iikf2020-ftrat5-pop1000-ann-batch-job.d6w deleted file mode 100644 index 7c7001400..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/iikf2020-ftrat5-pop1000-ann-batch-job.d6w +++ /dev/null @@ -1,35 +0,0 @@ -class Profile(Struct): - project = Required(String) - date = Required(String) - environment = Default(String, 'dev') - job_name = Default(String, 'ftr-recs-d6w-job') - machine = Default(String, 'n2-highmem-4') - -job = Job( - name='{{profile.job_name}}', - project='{{profile.project}}', - staging_bucket='{{profile.project}}', - service_account='{{profile.user_name}}-shdw@twttr-dp-svc-accounts.iam.gserviceaccount.com', - region='us-central1', - worker_config=WorkerConfig( - num_workers=2, - worker_machine_type='{{profile.machine}}', - worker_disk_type=WorkerDiskType('HDD'), - ), - extra_args={ - "date": '{{profile.date}}' - }, - service_identifier='twtr:svc:{{profile.user_name}}:{{profile.job_name}}:{{profile.environment}}:{{profile.cluster}}', - deployment_config=BatchDeploymentConfig( - role='{{profile.user_name}}', - build_target='src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet:iikf2020-ftrat5-pop1000-batch-job', - gcp_deployment_credentials='/var/lib/tss/keys/{{profile.user_name}}/cloud/gcp/dp/shadow.json', - statebird_config=StatebirdConfig( - batch_width='PT4H', - first_time='{{profile.date}}', - ), - timeout='PT8H', - ) -) - -jobs=[job] diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/iikf2020-ftrat5-pop1000-ann-batch-job.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/iikf2020-ftrat5-pop1000-ann-batch-job.docx new file mode 100644 index 000000000..85852422a Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/iikf2020-ftrat5-pop1000-ann-batch-job.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/iikf2020-ftrat5-pop10000-ann-batch-job.d6w 
b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/iikf2020-ftrat5-pop10000-ann-batch-job.d6w deleted file mode 100644 index 24c594aa3..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/iikf2020-ftrat5-pop10000-ann-batch-job.d6w +++ /dev/null @@ -1,35 +0,0 @@ -class Profile(Struct): - project = Required(String) - date = Required(String) - environment = Default(String, 'dev') - job_name = Default(String, 'ftr-recs-d6w-job') - machine = Default(String, 'n2-highmem-4') - -job = Job( - name='{{profile.job_name}}', - project='{{profile.project}}', - staging_bucket='{{profile.project}}', - service_account='{{profile.user_name}}-shdw@twttr-dp-svc-accounts.iam.gserviceaccount.com', - region='us-central1', - worker_config=WorkerConfig( - num_workers=2, - worker_machine_type='{{profile.machine}}', - worker_disk_type=WorkerDiskType('HDD'), - ), - extra_args={ - "date": '{{profile.date}}' - }, - service_identifier='twtr:svc:{{profile.user_name}}:{{profile.job_name}}:{{profile.environment}}:{{profile.cluster}}', - deployment_config=BatchDeploymentConfig( - role='{{profile.user_name}}', - build_target='src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet:iikf2020-ftrat5-pop10000-batch-job', - gcp_deployment_credentials='/var/lib/tss/keys/{{profile.user_name}}/cloud/gcp/dp/shadow.json', - statebird_config=StatebirdConfig( - batch_width='PT4H', - first_time='{{profile.date}}', - ), - timeout='PT8H', - ) -) - -jobs=[job] diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/iikf2020-ftrat5-pop10000-ann-batch-job.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/iikf2020-ftrat5-pop10000-ann-batch-job.docx new file mode 100644 index 000000000..52d472b7b Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/iikf2020-ftrat5-pop10000-ann-batch-job.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/sql/BUILD b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/sql/BUILD deleted file mode 100644 index ba87e2b54..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/sql/BUILD +++ /dev/null @@ -1,3 +0,0 @@ -resources( - sources = ["*"], -) diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/sql/BUILD.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/sql/BUILD.docx new file mode 100644 index 000000000..588d56c57 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/sql/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/sql/ftr_tweet_embeddings.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/sql/ftr_tweet_embeddings.docx new file mode 100644 index 000000000..a142c6fb1 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/sql/ftr_tweet_embeddings.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/sql/ftr_tweet_embeddings.sql b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/sql/ftr_tweet_embeddings.sql deleted file mode 100644 index fe52dfedb..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/ftr_tweet/sql/ftr_tweet_embeddings.sql +++ /dev/null @@ -1,280 +0,0 @@ -WITH vars AS ( - SELECT - TIMESTAMP('{START_TIME}') AS start_time, - TIMESTAMP('{END_TIME}') AS end_time, - UNIX_MILLIS('{END_TIME}') AS currentTs, - 
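- -- NOTE: {HALFLIFE} is a duration in milliseconds. The decayed counts later in
- -- this query weight each engagement by POW(0.5, (currentTs - minTsMilli) / halfLife),
- -- so an engagement one half-life old counts 0.5 and one two half-lives old counts 0.25.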
{HALFLIFE} AS halfLife, - {TWEET_SAMPLE_RATE} AS tweet_sample_rate, - {ENG_SAMPLE_RATE} AS eng_user_sample_rate, - {MIN_TWEET_FAVS} AS min_tweet_favs, - {MIN_TWEET_IMPS} AS min_tweet_imps, - {MAX_USER_LOG_N_IMPS} AS max_user_log_n_imps, - {MAX_USER_LOG_N_FAVS} AS max_user_log_n_favs, - {MAX_USER_FTR} AS max_user_ftr, - {MAX_TWEET_FTR} AS max_tweet_ftr, - 700 AS MAX_EXPONENT, -- this is the maximum exponent one can have in bigquery - ), - -- step 1: get impressions and favs - impressions AS ( - SELECT - userIdentifier.userId AS user_id, - item.tweetInfo.actionTweetId AS tweet_id, - item.tweetInfo.actionTweetAuthorInfo.authorId AS author_id, - TRUE AS impressed, - MIN(eventMetadata.sourceTimestampMs) AS minTsMilli - FROM twttr-bql-unified-prod.unified_user_actions.streaming_unified_user_actions, vars - WHERE - actionType = "ClientTweetLingerImpression" - AND DATE(dateHour) BETWEEN DATE(vars.start_time) AND DATE(vars.end_time) - AND TIMESTAMP_MILLIS(eventMetadata.sourceTimestampMs) BETWEEN vars.start_time AND vars.end_time - AND MOD(ABS(farm_fingerprint(item.tweetInfo.actionTweetId || '')), vars.tweet_sample_rate) = 0 - AND MOD(ABS(farm_fingerprint(userIdentifier.userId || '')), vars.eng_user_sample_rate) = 0 - -- Apply tweet age filter here - AND timestamp_millis((1288834974657 + - ((item.tweetInfo.actionTweetId & 9223372036850581504) >> 22))) >= (vars.start_time) - GROUP BY 1, 2, 3 - ), - favs AS ( - SELECT - userIdentifier.userId AS user_id, - item.tweetInfo.actionTweetId AS tweet_id, - item.tweetInfo.actionTweetAuthorInfo.authorId AS author_id, - MIN(eventMetadata.sourceTimestampMs) AS minTsMilli, - -- get last action, and make sure that it's a fav - ARRAY_AGG(actionType ORDER BY eventMetadata.sourceTimestampMs DESC LIMIT 1)[OFFSET(0)] = "ServerTweetFav" AS favorited, - FROM `twttr-bql-unified-prod.unified_user_actions_engagements.streaming_unified_user_actions_engagements`, vars - WHERE - actionType IN ("ServerTweetFav", "ServerTweetUnfav") - AND DATE(dateHour) BETWEEN DATE(vars.start_time) AND DATE(vars.end_time) - AND TIMESTAMP_MILLIS(eventMetadata.sourceTimestampMs) BETWEEN vars.start_time AND vars.end_time - AND MOD(ABS(farm_fingerprint(item.tweetInfo.actionTweetId || '')), vars.tweet_sample_rate) = 0 - AND MOD(ABS(farm_fingerprint(userIdentifier.userId || '')), vars.eng_user_sample_rate) = 0 - -- Apply tweet age filter here - AND timestamp_millis((1288834974657 + - ((item.tweetInfo.actionTweetId & 9223372036850581504) >> 22))) >= (vars.start_time) - GROUP BY 1, 2, 3 - HAVING favorited - ), - eng_data AS ( - SELECT - user_id, tweet_id, author_id, impressions.minTsMilli, favorited, impressed - FROM impressions - LEFT JOIN favs USING(user_id, tweet_id, author_id) - ), - eligible_tweets AS ( - SELECT - tweet_id, - author_id, - COUNTIF(favorited) num_favs, - COUNTIF(impressed) num_imps, - COUNTIF(favorited) * 1.0 / COUNTIF(impressed) AS tweet_ftr, - ANY_VALUE(vars.min_tweet_favs) min_tweet_favs, - ANY_VALUE(vars.min_tweet_imps) min_tweet_imps, - ANY_VALUE(vars.max_tweet_ftr) max_tweet_ftr, - FROM eng_data, vars - GROUP BY 1, 2 - HAVING num_favs >= min_tweet_favs -- this is an aggressive filter to make the workflow efficient - AND num_imps >= min_tweet_imps - AND tweet_ftr <= max_tweet_ftr -- filter to combat spam - ), - eligible_users AS ( - SELECT - user_id, - CAST(LOG10(COUNTIF(impressed) + 1) AS INT64) log_n_imps, - CAST(LOG10(COUNTIF(favorited) + 1) AS INT64) log_n_favs, - ANY_VALUE(vars.max_user_log_n_imps) max_user_log_n_imps, - ANY_VALUE(vars.max_user_log_n_favs) 
max_user_log_n_favs, - ANY_VALUE(vars.max_user_ftr) max_user_ftr, - COUNTIF(favorited) * 1.0 / COUNTIF(impressed) user_ftr - from eng_data, vars - GROUP BY 1 - HAVING - log_n_imps < max_user_log_n_imps - AND log_n_favs < max_user_log_n_favs - AND user_ftr < max_user_ftr - ), - eligible_eng_data AS ( - SELECT - user_id, - eng_data.author_id, - tweet_id, - minTsMilli, - favorited, - impressed - FROM eng_data - INNER JOIN eligible_tweets USING(tweet_id) - INNER JOIN eligible_users USING(user_id) - ), - follow_graph AS ( - SELECT userId, neighbor - FROM `twttr-bq-cassowary-prod.user.user_user_normalized_graph` user_user_graph, unnest(user_user_graph.neighbors) as neighbor - WHERE DATE(_PARTITIONTIME) = - ( -- Get latest partition time - SELECT MAX(DATE(_PARTITIONTIME)) latest_partition - FROM `twttr-bq-cassowary-prod.user.user_user_normalized_graph`, vars - WHERE Date(_PARTITIONTIME) BETWEEN - DATE_SUB(Date(vars.end_time), - INTERVAL 14 DAY) AND DATE(vars.end_time) - ) - AND neighbor.isFollowed is True - ), - extended_eligible_eng_data AS ( - SELECT - user_id, - tweet_id, - minTsMilli, - favorited, - impressed, - neighbor.neighborId is NULL as is_oon_eng - FROM eligible_eng_data left JOIN follow_graph ON (follow_graph.userId = eligible_eng_data.user_id AND follow_graph.neighbor.neighborId = eligible_eng_data.author_id) - ), - -- step 2: merge with iikf - iikf AS ( - SELECT - userId AS user_id, - - clusterIdToScore.key AS clusterId, - clusterIdToScore.value.favScore AS favScore, - clusterIdToScore.value.favScoreClusterNormalizedOnly AS favScoreClusterNormalizedOnly, - clusterIdToScore.value.favScoreProducerNormalizedOnly AS favScoreProducerNormalizedOnly, - - clusterIdToScore.value.logFavScore AS logFavScore, - clusterIdToScore.value.logfavScoreClusterNormalizedOnly AS logfavScoreClusterNormalizedOnly, -- probably no need for cluster normalization anymore - ROW_NUMBER() OVER (PARTITION BY userId ORDER BY clusterIdToScore.value.logFavScore DESC) AS uii_cluster_rank_logfavscore, - ROW_NUMBER() OVER (PARTITION BY userId ORDER BY clusterIdToScore.value.logfavScoreClusterNormalizedOnly DESC) AS uii_cluster_rank_logfavscoreclusternormalized, - FROM `twttr-bq-cassowary-prod.user.simclusters_v2_user_to_interested_in_20M_145K_2020`, UNNEST(clusterIdToScores) clusterIdToScore, vars - WHERE DATE(_PARTITIONTIME) = - (-- Get latest partition time - SELECT MAX(DATE(_PARTITIONTIME)) latest_partition - FROM `twttr-bq-cassowary-prod.user.simclusters_v2_user_to_interested_in_20M_145K_2020` - WHERE Date(_PARTITIONTIME) BETWEEN - DATE_SUB(Date(vars.end_time), - INTERVAL 14 DAY) AND DATE(vars.end_time) - ) - AND MOD(ABS(farm_fingerprint(userId || '')), vars.eng_user_sample_rate) = 0 - AND clusterIdToScore.value.logFavScore != 0 - ), - eng_w_uii AS ( - SELECT - T_IMP_FAV.user_id, - T_IMP_FAV.tweet_id, - T_IMP_FAV.impressed, - T_IMP_FAV.favorited, - T_IMP_FAV.minTsMilli, - T_IMP_FAV.is_oon_eng, - - IIKF.clusterId, - IIKF.logFavScore, - IIKF.logfavScoreClusterNormalizedOnly, - IIKF.uii_cluster_rank_logfavscore, - IIKF.uii_cluster_rank_logfavscoreclusternormalized, - FROM extended_eligible_eng_data T_IMP_FAV, vars - INNER JOIN iikf - ON T_IMP_FAV.user_id = IIKF.user_id - WHERE - T_IMP_FAV.impressed - ), - -- step 3: Calculate tweet embedding - tweet_cluster_agg AS ( - SELECT - tweet_id, - clusterId, - - SUM(IF(impressed, logFavScore, 0)) denom_logFavScore, - SUM(IF(favorited, logFavScore, 0)) nom_logFavScore, - - COUNTIF(impressed) n_imps, - COUNTIF(favorited) n_favs, - - COUNTIF(impressed AND 
uii_cluster_rank_logfavscore <= 5) n_imps_at_5,
-      COUNTIF(favorited AND uii_cluster_rank_logfavscore <= 5) n_favs_at_5,
-
-      COUNTIF(favorited AND uii_cluster_rank_logfavscore <= 5 AND is_oon_eng) n_oon_favs_at_5,
-      COUNTIF(impressed AND uii_cluster_rank_logfavscore <= 5 AND is_oon_eng) n_oon_imps_at_5,
-
-      SUM(IF(favorited AND uii_cluster_rank_logfavscore <= 5, 1, 0) * POW(0.5, (currentTs - minTsMilli) / vars.halfLife)) AS decayed_n_favs_at_5,
-      SUM(IF(impressed AND uii_cluster_rank_logfavscore <= 5, 1, 0) * POW(0.5, (currentTs - minTsMilli) / vars.halfLife)) AS decayed_n_imps_at_5,
-
-      SUM(IF(favorited, logfavScoreClusterNormalizedOnly, 0) * POW(0.5, (currentTs - minTsMilli) / vars.halfLife)) AS dec_sum_logfavScoreClusterNormalizedOnly,
-
-      MIN(minTsMilli) minTsMilli,
-
-    FROM eng_w_uii, vars
-    GROUP BY 1, 2
-  ),
-  tweet_cluster_intermediate AS (
-    SELECT
-      tweet_id,
-      clusterId,
-      minTsMilli,
-
-      n_imps,
-      n_favs,
-
-      n_favs_at_5,
-      n_imps_at_5,
-      n_oon_favs_at_5,
-      n_oon_imps_at_5,
-      decayed_n_favs_at_5,
-      decayed_n_imps_at_5,
-
-      denom_logFavScore,
-      nom_logFavScore,
-
-      dec_sum_logfavScoreClusterNormalizedOnly,
-
-      SAFE_DIVIDE(n_favs_at_5, n_imps_at_5) AS ftr_at_5,
-
-      SAFE_DIVIDE(n_oon_favs_at_5, n_oon_imps_at_5) AS ftr_oon_at_5,
-
-      row_number() OVER (PARTITION BY tweet_id ORDER BY nom_logFavScore DESC) cluster_nom_logFavScore_ranking,
-      row_number() OVER (PARTITION BY tweet_id ORDER BY dec_sum_logfavScoreClusterNormalizedOnly DESC) cluster_decSumLogFavClusterNormalized_ranking,
-    FROM tweet_cluster_agg
-  ),
-  tweet_e AS (
-    SELECT
-      tweet_id,
-
-      MIN(minTsMilli) first_serve_millis,
-      DATE(TIMESTAMP_MILLIS(MIN(minTsMilli))) date_first_serve,
-
-      ARRAY_AGG(STRUCT(
-        clusterId,
-        -- zero out clusters ranked beyond MAX_EXPONENT so the POW exponent stays within BigQuery's limit
-        ftr_at_5 * (2 / (1+EXP(-1* (decayed_n_favs_at_5/1000))) - 1) * IF(cluster_decSumLogFavClusterNormalized_ranking > MAX_EXPONENT, 0, 1.0/(POW(1.1, cluster_decSumLogFavClusterNormalized_ranking-1))) AS ftrat5_decayed_pop_bias_1000_rank_decay_1_1
-      ) ORDER BY ftr_at_5 * (2 / (1+EXP(-1* (decayed_n_favs_at_5/1000))) - 1) * IF(cluster_decSumLogFavClusterNormalized_ranking > MAX_EXPONENT, 0, 1.0/(POW(1.1, cluster_decSumLogFavClusterNormalized_ranking-1))) DESC LIMIT {TWEET_EMBEDDING_LENGTH}) ftrat5_decayed_pop_bias_1000_rank_decay_1_1_embedding,
-
-      ARRAY_AGG(STRUCT(
-        clusterId,
-        -- zero out clusters ranked beyond MAX_EXPONENT so the POW exponent stays within BigQuery's limit
-        ftr_at_5 * (2 / (1+EXP(-1* (decayed_n_favs_at_5/10000))) - 1) * IF(cluster_decSumLogFavClusterNormalized_ranking > MAX_EXPONENT, 0, 1.0/(POW(1.1, cluster_decSumLogFavClusterNormalized_ranking-1))) AS ftrat5_decayed_pop_bias_10000_rank_decay_1_1
-      ) ORDER BY ftr_at_5 * (2 / (1+EXP(-1* (decayed_n_favs_at_5/10000))) - 1) * IF(cluster_decSumLogFavClusterNormalized_ranking > MAX_EXPONENT, 0, 1.0/(POW(1.1, cluster_decSumLogFavClusterNormalized_ranking-1))) DESC LIMIT {TWEET_EMBEDDING_LENGTH}) ftrat5_decayed_pop_bias_10000_rank_decay_1_1_embedding,
-
-      ARRAY_AGG(STRUCT(
-        clusterId,
-        -- zero out clusters ranked beyond MAX_EXPONENT so the POW exponent stays within BigQuery's limit
-        ftr_oon_at_5 * (2 / (1+EXP(-1* (decayed_n_favs_at_5/1000))) - 1) * IF(cluster_nom_logFavScore_ranking > MAX_EXPONENT, 0, 1.0/(POW(1.1, cluster_nom_logFavScore_ranking-1))) AS oon_ftrat5_decayed_pop_bias_1000_rank_decay
-      ) ORDER BY ftr_oon_at_5 * (2 / (1+EXP(-1* (decayed_n_favs_at_5/1000))) - 1) * IF(cluster_nom_logFavScore_ranking > MAX_EXPONENT, 0, 1.0/(POW(1.1, cluster_nom_logFavScore_ranking-1))) DESC LIMIT {TWEET_EMBEDDING_LENGTH})
oon_ftrat5_decayed_pop_bias_1000_rank_decay_embedding, - - ARRAY_AGG(STRUCT( - clusterId, - dec_sum_logfavScoreClusterNormalizedOnly - ) ORDER BY dec_sum_logfavScoreClusterNormalizedOnly DESC LIMIT {TWEET_EMBEDDING_LENGTH}) dec_sum_logfavScoreClusterNormalizedOnly_embedding, - - FROM tweet_cluster_intermediate, vars - GROUP BY 1 - ), - tweet_e_unnest AS ( - SELECT - tweet_id AS tweetId, - clusterToScores.clusterId AS clusterId, - clusterToScores.{SCORE_KEY} tweetScore - FROM tweet_e, UNNEST({SCORE_COLUMN}) clusterToScores - WHERE clusterToScores.{SCORE_KEY} IS NOT NULL - AND clusterToScores.{SCORE_KEY} > 0 - ) - SELECT - tweetId, - clusterId, - tweetScore - FROM tweet_e_unnest diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/BUILD b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/BUILD deleted file mode 100644 index 681483b95..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/BUILD +++ /dev/null @@ -1,167 +0,0 @@ -scala_library( - name = "simclusters_index_generation", - sources = [ - "**/*.scala", - ], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources:ads_fav_based_simclusters_cluster_to_tweet_index-scala", - "src/scala/com/twitter/simclusters_v2/hdfs_sources:ads_fav_click_based_simclusters_cluster_to_tweet_index-scala", - "src/scala/com/twitter/simclusters_v2/hdfs_sources:fav_based_evergreen_content_simclusters_cluster_to_tweet_index-scala", - "src/scala/com/twitter/simclusters_v2/hdfs_sources:fav_based_simclusters_cluster_to_tweet_index-scala", - "src/scala/com/twitter/simclusters_v2/hdfs_sources:fav_based_video_simclusters_cluster_to_tweet_index-scala", - "src/scala/com/twitter/simclusters_v2/hdfs_sources:push_open_based_simclusters_cluster_to_tweet_index-scala", - "src/scala/com/twitter/simclusters_v2/hdfs_sources:reply_based_simclusters_cluster_to_tweet_index-scala", - "src/scala/com/twitter/simclusters_v2/hdfs_sources:retweet_based_simclusters_cluster_to_tweet_index-scala", - "src/scala/com/twitter/simclusters_v2/hdfs_sources:video_view_based_simclusters_cluster_to_tweet_index-scala", - "src/scala/com/twitter/simclusters_v2/scio/bq_generation/common", - "src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql", - "src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann:bq_generation", - "unified_user_actions/thrift/src/main/thrift/com/twitter/unified_user_actions:unified_user_actions-scala", - ], -) - -jvm_binary( - name = "fav-based-cluster-to-tweet-index-generation-adhoc-job", - main = "com.twitter.simclusters_v2.scio.bq_generation.simclusters_index_generation.FavBasedClusterToTweetIndexGenerationAdhocJob", - dependencies = [ - ":simclusters_index_generation", - ], -) - -jvm_binary( - name = "fav-based-cluster-to-tweet-index-generation-batch-job", - main = "com.twitter.simclusters_v2.scio.bq_generation.simclusters_index_generation.FavBasedClusterToTweetIndexGenerationBatchJob", - dependencies = [ - ":simclusters_index_generation", - ], -) - -jvm_binary( - name = "video-view-based-cluster-to-tweet-index-generation-adhoc-job", - main = "com.twitter.simclusters_v2.scio.bq_generation.simclusters_index_generation.VideoViewBasedClusterToTweetIndexGenerationAdhocJob", - dependencies = [ - ":simclusters_index_generation", - ], -) - -jvm_binary( - name = "video-view-based-cluster-to-tweet-index-generation-batch-job", - main = 
"com.twitter.simclusters_v2.scio.bq_generation.simclusters_index_generation.VideoViewBasedClusterToTweetIndexGenerationBatchJob", - dependencies = [ - ":simclusters_index_generation", - ], -) - -jvm_binary( - name = "retweet-based-cluster-to-tweet-index-generation-adhoc-job", - main = "com.twitter.simclusters_v2.scio.bq_generation.simclusters_index_generation.RetweetBasedClusterToTweetIndexGenerationAdhocJob", - dependencies = [ - ":simclusters_index_generation", - ], -) - -jvm_binary( - name = "retweet-based-cluster-to-tweet-index-generation-batch-job", - main = "com.twitter.simclusters_v2.scio.bq_generation.simclusters_index_generation.RetweetBasedClusterToTweetIndexGenerationBatchJob", - dependencies = [ - ":simclusters_index_generation", - ], -) - -jvm_binary( - name = "reply-based-cluster-to-tweet-index-generation-adhoc-job", - main = "com.twitter.simclusters_v2.scio.bq_generation.simclusters_index_generation.ReplyBasedClusterToTweetIndexGenerationAdhocJob", - dependencies = [ - ":simclusters_index_generation", - ], -) - -jvm_binary( - name = "reply-based-cluster-to-tweet-index-generation-batch-job", - main = "com.twitter.simclusters_v2.scio.bq_generation.simclusters_index_generation.ReplyBasedClusterToTweetIndexGenerationBatchJob", - dependencies = [ - ":simclusters_index_generation", - ], -) - -jvm_binary( - name = "push-open-based-cluster-to-tweet-index-generation-adhoc-job", - main = "com.twitter.simclusters_v2.scio.bq_generation.simclusters_index_generation.PushOpenBasedClusterToTweetIndexGenerationAdhocJob", - dependencies = [ - ":simclusters_index_generation", - ], -) - -jvm_binary( - name = "push-open-based-cluster-to-tweet-index-generation-batch-job", - main = "com.twitter.simclusters_v2.scio.bq_generation.simclusters_index_generation.PushOpenBasedClusterToTweetIndexGenerationBatchJob", - dependencies = [ - ":simclusters_index_generation", - ], -) - -jvm_binary( - name = "ads-fav-based-cluster-to-tweet-index-generation-adhoc-job", - main = "com.twitter.simclusters_v2.scio.bq_generation.simclusters_index_generation.AdsFavBasedClusterToTweetIndexGenerationAdhocJob", - dependencies = [ - ":simclusters_index_generation", - ], -) - -jvm_binary( - name = "ads-fav-based-cluster-to-tweet-index-generation-batch-job", - main = "com.twitter.simclusters_v2.scio.bq_generation.simclusters_index_generation.AdsFavBasedClusterToTweetIndexGenerationBatchJob", - dependencies = [ - ":simclusters_index_generation", - ], -) - -jvm_binary( - name = "ads-fav-click-based-cluster-to-tweet-index-generation-adhoc-job", - main = "com.twitter.simclusters_v2.scio.bq_generation.simclusters_index_generation.AdsFavClickBasedClusterToTweetIndexGenerationAdhocJob", - dependencies = [ - ":simclusters_index_generation", - ], -) - -jvm_binary( - name = "ads-fav-click-based-cluster-to-tweet-index-generation-batch-job", - main = "com.twitter.simclusters_v2.scio.bq_generation.simclusters_index_generation.AdsFavClickBasedClusterToTweetIndexGenerationBatchJob", - dependencies = [ - ":simclusters_index_generation", - ], -) - -jvm_binary( - name = "fav-based-evergreen-content-cluster-to-tweet-index-generation-adhoc-job", - main = "com.twitter.simclusters_v2.scio.bq_generation.simclusters_index_generation.FavBasedEvergreenContentClusterToTweetIndexGenerationAdhocJob", - dependencies = [ - ":simclusters_index_generation", - ], -) - -jvm_binary( - name = "fav-based-evergreen-content-cluster-to-tweet-index-generation-batch-job", - main = 
"com.twitter.simclusters_v2.scio.bq_generation.simclusters_index_generation.FavBasedEvergreenContentClusterToTweetIndexGenerationBatchJob", - dependencies = [ - ":simclusters_index_generation", - ], -) - -jvm_binary( - name = "fav-based-video-cluster-to-tweet-index-generation-adhoc-job", - main = "com.twitter.simclusters_v2.scio.bq_generation.simclusters_index_generation.FavBasedVideoClusterToTweetIndexGenerationAdhocJob", - dependencies = [ - ":simclusters_index_generation", - ], -) - -jvm_binary( - name = "fav-based-video-cluster-to-tweet-index-generation-batch-job", - main = "com.twitter.simclusters_v2.scio.bq_generation.simclusters_index_generation.FavBasedVideoClusterToTweetIndexGenerationBatchJob", - dependencies = [ - ":simclusters_index_generation", - ], -) diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/BUILD.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/BUILD.docx new file mode 100644 index 000000000..4029ca845 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/Config.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/Config.docx new file mode 100644 index 000000000..2d67cfa0a Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/Config.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/Config.scala b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/Config.scala deleted file mode 100644 index 5f44986e9..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/Config.scala +++ /dev/null @@ -1,82 +0,0 @@ -package com.twitter.simclusters_v2.scio.bq_generation.simclusters_index_generation - -object Config { - // Common Root Path - val RootMHPath: String = "manhattan_sequence_files/simclusters_to_tweet_index/" - val RootThriftPath: String = "processed/simclusters_to_tweet_index/" - val AdhocRootPath = "adhoc/simclusters_to_tweet_index/" - // cluster-to-tweet KeyVal Dataset Output Path - val FavBasedClusterToTweetIndexOutputPath = "fav_based_index" - val FavBasedEvergreenContentClusterToTweetIndexOutputPath = "fav_based_evergreen_index" - val FavBasedVideoClusterToTweetIndexOutputPath = "fav_based_video_index" - val VideoViewBasedClusterToTweetIndexOutputPath = "video_view_based_index" - val RetweetBasedClusterToTweetIndexOutputPath = "retweet_based_index" - val ReplyBasedClusterToTweetIndexOutputPath = "reply_based_index" - val PushOpenBasedClusterToTweetIndexOutputPath = "push_open_based_index" - val AdsFavBasedClusterToTweetIndexOutputPath = "ads_fav_based_index" - val AdsFavClickBasedClusterToTweetIndexOutputPath = "ads_fav_click_based_index" - - // SQL file path - val simclustersEngagementBasedIndexGenerationSQLPath = - s"/com/twitter/simclusters_v2/scio/bq_generation/sql/engagement_based_index_generation.sql" - val unifiedUserTweetActionPairGenerationSQLPath = - s"/com/twitter/simclusters_v2/scio/bq_generation/sql/unified_user_tweet_action_pair_generation.sql" - val combinedUserTweetActionPairGenerationSQLPath = - s"/com/twitter/simclusters_v2/scio/bq_generation/sql/combined_user_tweet_action_pair_generation.sql" - val adsUserTweetActionPairGenerationSQLPath = - 
s"/com/twitter/simclusters_v2/scio/bq_generation/sql/ads_user_tweet_action_pair_generation.sql" - val evergreenContentUserTweetActionPairGenerationSQLPath = - s"/com/twitter/simclusters_v2/scio/bq_generation/sql/evergreen_content_user_tweet_action_pair_generation.sql" - val favBasedVideoTweetActionPairGenerationSQLPath = - s"/com/twitter/simclusters_v2/scio/bq_generation/sql/user_video_tweet_fav_engagement_generation.sql" - - // Table name for server/client engagements - val clientEngagementTableName: String = "twttr-bq-iesource-prod.user.client_engagements" - val serverEngagementTableName: String = "twttr-bq-iesource-prod.user.server_engagements" - - // Tweet id column names from UUA - val actionTweetIdColumn: String = "item.tweetInfo.actionTweetId" - val retweetTweetIdColumn: String = "item.tweetInfo.retweetedTweetId" - val replyTweetIdColumn: String = "item.tweetInfo.inReplyToTweetId" - val pushTweetIdColumn: String = "item.notificationInfo.content.tweetNotification.tweetId" - - // Do not enable health or video filters by default - val enableHealthAndVideoFilters: Boolean = false - - // Do not enable top k tweets per cluster intersection with fav-based clusters - val enableIntersectionWithFavBasedClusterTopKTweetsIndex: Boolean = false - - // Min fav/interaction threshold - val minInteractionCount: Int = 50 - val minFavCount: Int = 50 - - // Tweet Embeddings configs - val tweetEmbeddingsLength: Int = 50 - val tweetEmbeddingsHalfLife: Int = 28800000 - - // Cluster-to-tweet index configs - val clusterTopKTweets: Int = 2000 - val maxTweetAgeHours: Int = 24 - val minEngagementPerCluster: Int = 0 - - // Placeholder action type for interactions that don't have undo events (e.g. video views) - val PlaceholderActionType: String = "PLACEHOLDER_ACTION_TYPE" - - // Ads event engagement type ids - val AdsFavEngagementTypeIds = Seq(8) // Fav promoted tweet - val AdsClickEngagementTypeIds = Seq( - 1, //URL - 42, // CARD_URL_CLICK - 53, // WEBSITE_CARD_CONTAINER_CLICK - 54, // WEBSITE_CARD_BUTTON_CLICK - 55, // WEBSITE_CARD_IMAGE_CLICK - 56, // WEBSITE_CARD_TITLE_CLICK - 69, // BUYNOW_CARD_CLICK - 70, // BUYNOW_PURCHASE_SUCCESS - 72, // VIDEO_CTA_URL_CLICK - 76, // VIDEO_AD_CTA_URL_CLICK - 80, // VIDEO_CONTENT_CTA_URL_CLICK - 84, // CL_OFFER_CARD_CLICK - ) - -} diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/EngagementEventBasedClusterToTweetIndexFromBQ.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/EngagementEventBasedClusterToTweetIndexFromBQ.docx new file mode 100644 index 000000000..518df1c6e Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/EngagementEventBasedClusterToTweetIndexFromBQ.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/EngagementEventBasedClusterToTweetIndexFromBQ.scala b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/EngagementEventBasedClusterToTweetIndexFromBQ.scala deleted file mode 100644 index 93d6c9ee7..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/EngagementEventBasedClusterToTweetIndexFromBQ.scala +++ /dev/null @@ -1,177 +0,0 @@ -package com.twitter.simclusters_v2.scio.bq_generation -package simclusters_index_generation - -import com.spotify.scio.ScioContext -import com.spotify.scio.values.SCollection -import 
com.twitter.simclusters_v2.scio.bq_generation.common.BQGenerationUtil.getNSFWTweetIdDenylistSQL
-import com.twitter.simclusters_v2.scio.bq_generation.common.BQGenerationUtil.getTweetIdWithFavCountSQL
-import com.twitter.simclusters_v2.scio.bq_generation.common.BQGenerationUtil.getTweetIdWithMediaAndNSFWAuthorFilterSQL
-import com.twitter.simclusters_v2.scio.bq_generation.common.BQGenerationUtil.getUserTweetEngagementEventPairSQL
-import com.twitter.simclusters_v2.scio.bq_generation.common.BQGenerationUtil.generateClusterTopTweetIntersectionWithFavBasedIndexSQL
-import com.twitter.simclusters_v2.scio.bq_generation.simclusters_index_generation.Config.simclustersEngagementBasedIndexGenerationSQLPath
-import com.twitter.simclusters_v2.scio.bq_generation.common.IndexGenerationUtil.TopKTweetsForClusterKey
-import com.twitter.simclusters_v2.scio.bq_generation.common.IndexGenerationUtil.parseClusterTopKTweetsFn
-import com.twitter.wtf.beam.bq_embedding_export.BQQueryUtils
-import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO
-import org.joda.time.DateTime
-
-object EngagementEventBasedClusterToTweetIndexFromBQ {
-
-  /*
-   * Reads the user-tweet-interaction table and applies a tweet fav count filter.
-   * Returns the post-processed table results in SQL string format
-   *
-   * Input:
-   *  - startTime: DateTime
-   *    The earliest timestamp from the user-tweet-interaction table
-   *  - endTime: DateTime
-   *    The latest timestamp from the user-tweet-interaction table
-   *  - minFavCount: Int
-   *    Minimum fav count a tweet must reach; the filter is applied only when this is > 0
-   *
-   * Return:
-   *  String - Post-processed table results in SQL string format
-   */
-  def getTweetInteractionTableWithFavCountFilter(
-    startTime: DateTime,
-    endTime: DateTime,
-    minFavCount: Int
-  ): String = {
-    if (minFavCount > 0) {
-      val tweetFavCountSQL = getTweetIdWithFavCountSQL(startTime, endTime)
-      s"""
-         | WITH tweet_fav_count AS (${tweetFavCountSQL})
-         | SELECT userId, tweetId, tsMillis
-         | FROM user_tweet_interaction_with_min_interaction_count_filter
-         | JOIN tweet_fav_count
-         | USING(tweetId)
-         | WHERE tweet_fav_count.favCount >= ${minFavCount}
-         |""".stripMargin
-    } else {
-      // Directly read from the table without applying any filters
-      s"SELECT userId, tweetId, tsMillis FROM user_tweet_interaction_with_min_interaction_count_filter"
-    }
-  }
-
-  /*
-   * Reads the user-tweet-interaction table and applies health and video filters if specified.
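-   * When the filters are enabled, only video tweets (MediaType 3) from
-   * non-NSFW authors are kept, and tweets on the NSFW tweet-id denylist are dropped.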
-   * Returns the post-processed table results in SQL string format
-   *
-   * Input:
-   *  - Input table schema (user_tweet_interaction_with_fav_count_filter):
-   *      userId: Long
-   *      tweetId: Long
-   *      tsMillis: Long
-   *  - startTime: DateTime
-   *    The earliest timestamp from the user-tweet-interaction table
-   *  - endTime: DateTime
-   *    The latest timestamp from the user-tweet-interaction table
-   *  - enableHealthAndVideoFilters: Boolean
-   *    Whether to enable the health and video-only filters
-   *
-   * Return:
-   *  String - Post-processed table results in SQL string format
-   */
-  def getTweetInteractionTableWithHealthFilter(
-    startTime: DateTime,
-    endTime: DateTime,
-    enableHealthAndVideoFilters: Boolean,
-  ): String = {
-    if (enableHealthAndVideoFilters) {
-      // Get SQL for tweets with media and NSFW filter
-      val tweetWithMediaAndNSFWAuthorFilterSQL = getTweetIdWithMediaAndNSFWAuthorFilterSQL(
-        startTime,
-        endTime,
-        filterMediaType = Some(3), // VideoTweets MediaType = 3
-        filterNSFWAuthor = true
-      )
-      // Get SQL for NSFW tweet id deny list
-      val nsfwTweetDenylistSQL = getNSFWTweetIdDenylistSQL(startTime, endTime)
-      // Combine the health filter SQLs
-      s"""
-         |SELECT userId, tweetId, tsMillis FROM user_tweet_interaction_with_fav_count_filter JOIN (
-         |  ${tweetWithMediaAndNSFWAuthorFilterSQL}
-         |  AND tweetId NOT IN (${nsfwTweetDenylistSQL})
-         |) USING(tweetId)
-         |""".stripMargin
-    } else {
-      // Directly read from the table without applying any filters
-      s"SELECT userId, tweetId, tsMillis FROM user_tweet_interaction_with_fav_count_filter"
-    }
-  }
-
-  def getTopKTweetsForClusterKeyBQ(
-    sc: ScioContext,
-    queryTimestamp: DateTime,
-    maxTweetAgeHours: Int,
-    consumerEmbeddingsSQL: String,
-    userTweetEngagementEventPairSqlPath: String,
-    userTweetEngagementEventPairTemplateVariable: Map[String, String],
-    enableHealthAndVideoFilters: Boolean,
-    enableFavClusterTopKTweetsIntersection: Boolean,
-    minInteractionCount: Int,
-    minFavCount: Int,
-    tweetEmbeddingsLength: Int,
-    tweetEmbeddingsHalfLife: Int,
-    minEngagementPerCluster: Int,
-    clusterTopKTweets: Int
-  ): SCollection[TopKTweetsForClusterKey] = {
-    // Define template variables which we would like to be replaced in the corresponding sql file
-    val startTime = queryTimestamp.minusHours(maxTweetAgeHours)
-    val endTime = queryTimestamp
-
-    val indexGenerationTemplateVariables =
-      Map(
-        "HALF_LIFE" -> tweetEmbeddingsHalfLife.toString,
-        "CURRENT_TS" -> queryTimestamp.toString(),
-        "START_TIME" -> startTime.toString(),
-        "END_TIME" -> endTime.toString(),
-        "USER_TWEET_ENGAGEMENT_TABLE_SQL" ->
-          getUserTweetEngagementEventPairSQL(
-            startTime,
-            endTime,
-            userTweetEngagementEventPairSqlPath,
-            userTweetEngagementEventPairTemplateVariable
-          ),
-        // Min interaction count filter
-        "MIN_INTERACTION_COUNT" -> minInteractionCount.toString,
-        // Min fav count filter
-        "TWEET_INTERACTION_WITH_FAV_COUNT_FILTER_SQL" -> getTweetInteractionTableWithFavCountFilter(
-          startTime,
-          endTime,
-          minFavCount
-        ),
-        // Health filter
-        "TWEET_INTERACTION_WITH_HEALTH_FILTER_SQL" -> getTweetInteractionTableWithHealthFilter(
-          startTime,
-          endTime,
-          enableHealthAndVideoFilters),
-        "CONSUMER_EMBEDDINGS_SQL" -> consumerEmbeddingsSQL,
-        "TWEET_EMBEDDING_LENGTH" -> tweetEmbeddingsLength.toString,
-        "MIN_ENGAGEMENT_PER_CLUSTER" -> minEngagementPerCluster.toString,
-        "CLUSTER_TOP_K_TWEETS" -> clusterTopKTweets.toString
-      )
-    val query = BQQueryUtils.getBQQueryFromSqlFile(
-      simclustersEngagementBasedIndexGenerationSQLPath,
-      indexGenerationTemplateVariables)
-
-    val postFilterQuery = if
(enableFavClusterTopKTweetsIntersection) { - generateClusterTopTweetIntersectionWithFavBasedIndexSQL( - startTime, - endTime, - clusterTopKTweets, - query) - } else { - query - } - // Generate SimClusters cluster-to-tweet index - sc.customInput( - s"SimClusters cluster-to-tweet index generation BQ job", - BigQueryIO - .read(parseClusterTopKTweetsFn(tweetEmbeddingsHalfLife)) - .fromQuery(postFilterQuery) - .usingStandardSql() - ) - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/EngagementEventBasedClusterToTweetIndexGenerationJob.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/EngagementEventBasedClusterToTweetIndexGenerationJob.docx new file mode 100644 index 000000000..77fc07b31 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/EngagementEventBasedClusterToTweetIndexGenerationJob.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/EngagementEventBasedClusterToTweetIndexGenerationJob.scala b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/EngagementEventBasedClusterToTweetIndexGenerationJob.scala deleted file mode 100644 index 46c2af2f0..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/EngagementEventBasedClusterToTweetIndexGenerationJob.scala +++ /dev/null @@ -1,659 +0,0 @@ -package com.twitter.simclusters_v2.scio.bq_generation -package simclusters_index_generation - -import com.google.api.services.bigquery.model.TimePartitioning -import com.spotify.scio.ScioContext -import com.spotify.scio.coders.Coder -import com.twitter.beam.io.dal.DAL -import com.twitter.beam.io.fs.multiformat.PathLayout -import com.twitter.beam.job.DateRangeOptions -import com.twitter.conversions.DurationOps.richDurationFromInt -import com.twitter.dal.client.dataset.KeyValDALDataset -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.scio_internal.coders.ThriftStructLazyBinaryScroogeCoder -import com.twitter.scio_internal.job.ScioBeamJob -import com.twitter.scrooge.ThriftStruct -import com.twitter.simclusters_v2.hdfs_sources.AdsFavBasedSimclustersClusterToTweetIndexScalaDataset -import com.twitter.simclusters_v2.hdfs_sources.AdsFavClickBasedSimclustersClusterToTweetIndexScalaDataset -import com.twitter.simclusters_v2.hdfs_sources.FavBasedEvergreenContentSimclustersClusterToTweetIndexScalaDataset -import com.twitter.simclusters_v2.hdfs_sources.FavBasedSimclustersClusterToTweetIndexScalaDataset -import com.twitter.simclusters_v2.hdfs_sources.FavBasedVideoSimclustersClusterToTweetIndexScalaDataset -import com.twitter.simclusters_v2.hdfs_sources.ReplyBasedSimclustersClusterToTweetIndexScalaDataset -import com.twitter.simclusters_v2.hdfs_sources.RetweetBasedSimclustersClusterToTweetIndexScalaDataset -import com.twitter.simclusters_v2.hdfs_sources.VideoViewBasedSimclustersClusterToTweetIndexScalaDataset -import com.twitter.simclusters_v2.hdfs_sources.PushOpenBasedSimclustersClusterToTweetIndexScalaDataset -import com.twitter.simclusters_v2.scio.bq_generation.common.BQGenerationUtil.buildActionTypesEngagementIndicatorString -import com.twitter.simclusters_v2.scio.bq_generation.common.BQGenerationUtil.getInterestedIn2020SQL -import com.twitter.simclusters_v2.scio.bq_generation.common.BQTableDetails -import 
com.twitter.simclusters_v2.scio.bq_generation.simclusters_index_generation.Config.AdsClickEngagementTypeIds -import com.twitter.simclusters_v2.scio.bq_generation.simclusters_index_generation.Config.AdsFavEngagementTypeIds -import com.twitter.simclusters_v2.scio.bq_generation.simclusters_index_generation.EngagementEventBasedClusterToTweetIndexFromBQ.getTopKTweetsForClusterKeyBQ -import com.twitter.simclusters_v2.thriftscala.ClusterIdToTopKTweetsWithScores -import com.twitter.simclusters_v2.thriftscala.FullClusterId -import com.twitter.simclusters_v2.thriftscala.TopKTweetsWithScores -import com.twitter.tcdc.bqblaster.beam.syntax._ -import com.twitter.tcdc.bqblaster.core.avro.TypedProjection -import com.twitter.tcdc.bqblaster.core.transform.RootTransform -import com.twitter.unified_user_actions.thriftscala.ActionType -import java.time.Instant -import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO -import org.joda.time.DateTime - -trait EngagementEventBasedClusterToTweetIndexGenerationJob extends ScioBeamJob[DateRangeOptions] { - // Configs to set for different type of embeddings and jobs - val isAdhoc: Boolean - val getConsumerEmbeddingsSQLFunc: (DateTime, Int) => String - val outputTable: BQTableDetails - val keyValDatasetOutputPath: String - val clusterToTweetIndexSnapshotDataset: KeyValDALDataset[ - KeyVal[FullClusterId, TopKTweetsWithScores] - ] - // Base configs - val projectId = "twttr-recos-ml-prod" - val environment: DAL.Env = if (isAdhoc) DAL.Environment.Dev else DAL.Environment.Prod - - // Point to different user tweet interaction table generation sql - // UUA-supported events: Config.unifiedUserTweetActionPairGenerationSQLPath - val userTweetEngagementEventPairSqlPath: String - lazy val userTweetEngagementEventPairTemplateVariable: Map[String, String] = Map.empty - - // Enable Video-only filters and health filters (for VideoViewBased embeddings) - val enableHealthAndVideoFilters: Boolean = Config.enableHealthAndVideoFilters - - val enableFavClusterTopKTweetsIntersection: Boolean = - Config.enableIntersectionWithFavBasedClusterTopKTweetsIndex - - // Min fav/interaction threshold - val minInteractionCount: Int = Config.minInteractionCount - val minFavCount: Int = Config.minFavCount - - // Tweet embeddings parameters - val tweetEmbeddingsLength: Int = Config.tweetEmbeddingsLength - val tweetEmbeddingsHalfLife: Int = Config.tweetEmbeddingsHalfLife - - // Clusters-to-tweet index parameters - val clusterTopKTweets: Int = Config.clusterTopKTweets - val maxTweetAgeHours: Int = Config.maxTweetAgeHours - val minEngagementPerCluster: Int = Config.minEngagementPerCluster - - override implicit def scroogeCoder[T <: ThriftStruct: Manifest]: Coder[T] = - ThriftStructLazyBinaryScroogeCoder.scroogeCoder - - override def configurePipeline(sc: ScioContext, opts: DateRangeOptions): Unit = { - // The time when the job is scheduled - val queryTimestamp = opts.interval.getEnd - - // Read consumer embeddings SQL - val consumerEmbeddingsSQL = getConsumerEmbeddingsSQLFunc(queryTimestamp, 21) - - // Generate SimClusters cluster-to-tweet index via BQ - val topKtweetsForClusterKey = - getTopKTweetsForClusterKeyBQ( - sc, - queryTimestamp, - maxTweetAgeHours, - consumerEmbeddingsSQL, - userTweetEngagementEventPairSqlPath, - userTweetEngagementEventPairTemplateVariable, - enableHealthAndVideoFilters, - enableFavClusterTopKTweetsIntersection, - minInteractionCount, - minFavCount, - tweetEmbeddingsLength, - tweetEmbeddingsHalfLife, - minEngagementPerCluster, - clusterTopKTweets - ) - - // Setup BQ writer - val 
ingestionTime = opts.getDate().value.getEnd.toDate - val bqFieldsTransform = RootTransform - .Builder() - .withPrependedFields("dateHour" -> TypedProjection.fromConstant(ingestionTime)) - val timePartitioning = new TimePartitioning() - .setType("HOUR").setField("dateHour").setExpirationMs(3.days.inMilliseconds) - val bqWriter = BigQueryIO - .write[ClusterIdToTopKTweetsWithScores] - .to(outputTable.toString) - .withExtendedErrorInfo() - .withTimePartitioning(timePartitioning) - .withLoadJobProjectId(projectId) - .withThriftSupport(bqFieldsTransform.build(), AvroConverter.Legacy) - .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED) - .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND) - - // Save SimClusters index to a BQ table - topKtweetsForClusterKey - .map { clusterIdToTopKTweets => - { - ClusterIdToTopKTweetsWithScores( - clusterId = clusterIdToTopKTweets.clusterId, - topKTweetsWithScores = clusterIdToTopKTweets.topKTweetsWithScores - ) - } - } - .saveAsCustomOutput(s"WriteToBQTable - ${outputTable}", bqWriter) - - // Save SimClusters index as a KeyValSnapshotDataset - topKtweetsForClusterKey - .map { clusterIdToTopKTweets => - KeyVal(clusterIdToTopKTweets.clusterId, clusterIdToTopKTweets.topKTweetsWithScores) - }.saveAsCustomOutput( - name = s"WriteClusterToKeyIndexToKeyValDataset at ${keyValDatasetOutputPath}", - DAL.writeVersionedKeyVal( - clusterToTweetIndexSnapshotDataset, - PathLayout.VersionedPath(prefix = - ((if (!isAdhoc) - Config.RootMHPath - else - Config.AdhocRootPath) - + keyValDatasetOutputPath)), - instant = Instant.ofEpochMilli(opts.interval.getEndMillis - 1L), - environmentOverride = environment, - ) - ) - } -} - -// This abstract class is used to define parameters specific to UUA events. -abstract class UUABasedClusterToTweetIndexGenerationJob - extends EngagementEventBasedClusterToTweetIndexGenerationJob { - // UUA Action types and column names - val contributingActionTypes: Seq[String] - val contributingActionReferenceTweetIdColumn: String = Config.actionTweetIdColumn - val undoActionTypes: Seq[String] - // Default undo tweet id is same as the actionTweetId (e.g. 
for favs these are the same tweet id) - val undoActionReferenceTweetIdColumn: String = Config.actionTweetIdColumn - - // Get the string that represents the list of undo event ids - lazy val undoActionTypesStr: String = { - // Populate the action type list with a placeholder action if its empty - val actionTypes = - if (undoActionTypes.nonEmpty) undoActionTypes - else Seq(Config.PlaceholderActionType) - convertActionTypesSeqToString(actionTypes) - } - - override lazy val userTweetEngagementEventPairTemplateVariable: Map[String, String] = { - Map( - "CONTRIBUTING_ACTION_TYPES_STR" -> convertActionTypesSeqToString(contributingActionTypes), - "CONTRIBUTING_ACTION_TWEET_ID_COLUMN" -> contributingActionReferenceTweetIdColumn, - "UNDO_ACTION_TYPES_STR" -> undoActionTypesStr, - "UNDO_ACTION_TWEET_ID_COLUMN" -> undoActionReferenceTweetIdColumn - ) - } - - /*** - * Convert a list of actions to a string that could be easily used in SQLs - * Example input: Seq("ServerTweetFav", "ClientTweetFav") - * output: "ServerTweetFav","ClientTweetFav" - * SQL use case: SELECT * FROM table WHERE actionType IN ("ServerTweetFav","ClientTweetFav") - */ - private def convertActionTypesSeqToString(actionTypes: Seq[String]): String = { - actionTypes.map(action => f"""\"${action}\"""").mkString(",") - } -} - -abstract class AdsClusterToTweetIndexGenerationJob - extends EngagementEventBasedClusterToTweetIndexGenerationJob { - // Ads contributing action types - fav, click, etc - val contributingActionTypes: Seq[Int] - - override lazy val userTweetEngagementEventPairTemplateVariable: Map[String, String] = { - Map( - "CONTRIBUTING_ACTION_TYPES_STR" -> convertActionTypesSeqToString(contributingActionTypes) - ) - } - private def convertActionTypesSeqToString(actionTypes: Seq[Int]): String = { - actionTypes.map(action => f"""${action}""").mkString(",") - } -} - -object FavBasedClusterToTweetIndexGenerationAdhocJob - extends UUABasedClusterToTweetIndexGenerationJob { - override val isAdhoc = true - override val getConsumerEmbeddingsSQLFunc = getInterestedIn2020SQL - override val userTweetEngagementEventPairSqlPath: String = - Config.unifiedUserTweetActionPairGenerationSQLPath - override val contributingActionTypes: Seq[String] = Seq(ActionType.ServerTweetFav.name) - override val undoActionTypes: Seq[String] = Seq(ActionType.ServerTweetUnfav.name) - override val minInteractionCount: Int = 8 - override val minFavCount: Int = 8 - override val outputTable = - BQTableDetails( - "twttr-recos-ml-prod", - "simclusters", - "simclusters_fav_based_cluster_to_tweet_index") - override val keyValDatasetOutputPath = Config.FavBasedClusterToTweetIndexOutputPath - override val clusterToTweetIndexSnapshotDataset: KeyValDALDataset[ - KeyVal[FullClusterId, TopKTweetsWithScores] - ] = - FavBasedSimclustersClusterToTweetIndexScalaDataset -} - -object FavBasedClusterToTweetIndexGenerationBatchJob - extends UUABasedClusterToTweetIndexGenerationJob { - override val isAdhoc = false - override val getConsumerEmbeddingsSQLFunc = getInterestedIn2020SQL - override val userTweetEngagementEventPairSqlPath: String = - Config.unifiedUserTweetActionPairGenerationSQLPath - override val contributingActionTypes: Seq[String] = Seq(ActionType.ServerTweetFav.name) - override val undoActionTypes: Seq[String] = Seq(ActionType.ServerTweetUnfav.name) - override val minInteractionCount: Int = 8 - override val minFavCount: Int = 8 - override val outputTable = - BQTableDetails( - "twttr-bq-cassowary-prod", - "user", - "simclusters_fav_based_cluster_to_tweet_index") - 
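-  // NOTE: the batch variant differs from the adhoc job above only in where it
-  // publishes: the shared twttr-bq-cassowary-prod "user" dataset instead of
-  // twttr-recos-ml-prod "simclusters". The KeyVal snapshot output is identical.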
override val keyValDatasetOutputPath = Config.FavBasedClusterToTweetIndexOutputPath - override val clusterToTweetIndexSnapshotDataset: KeyValDALDataset[ - KeyVal[FullClusterId, TopKTweetsWithScores] - ] = - FavBasedSimclustersClusterToTweetIndexScalaDataset -} - -object VideoViewBasedClusterToTweetIndexGenerationAdhocJob - extends UUABasedClusterToTweetIndexGenerationJob { - override val isAdhoc = true - override val getConsumerEmbeddingsSQLFunc = getInterestedIn2020SQL - override val userTweetEngagementEventPairSqlPath: String = - Config.unifiedUserTweetActionPairGenerationSQLPath - override val contributingActionTypes: Seq[String] = Seq( - ActionType.ClientTweetVideoPlayback50.name) - override val undoActionTypes: Seq[String] = Seq.empty - override val enableHealthAndVideoFilters: Boolean = true - override val outputTable = - BQTableDetails( - "twttr-recos-ml-prod", - "simclusters", - "simclusters_video_view_based_cluster_to_tweet_index") - override val keyValDatasetOutputPath = Config.VideoViewBasedClusterToTweetIndexOutputPath - override val clusterToTweetIndexSnapshotDataset: KeyValDALDataset[ - KeyVal[FullClusterId, TopKTweetsWithScores] - ] = - VideoViewBasedSimclustersClusterToTweetIndexScalaDataset -} - -object VideoViewBasedClusterToTweetIndexGenerationBatchJob - extends UUABasedClusterToTweetIndexGenerationJob { - override val isAdhoc = false - override val getConsumerEmbeddingsSQLFunc = getInterestedIn2020SQL - override val userTweetEngagementEventPairSqlPath: String = - Config.unifiedUserTweetActionPairGenerationSQLPath - override val contributingActionTypes: Seq[String] = Seq( - ActionType.ClientTweetVideoPlayback50.name) - override val undoActionTypes: Seq[String] = Seq.empty - override val enableHealthAndVideoFilters: Boolean = true - override val outputTable = - BQTableDetails( - "twttr-bq-cassowary-prod", - "user", - "simclusters_video_view_based_cluster_to_tweet_index") - override val keyValDatasetOutputPath = Config.VideoViewBasedClusterToTweetIndexOutputPath - override val clusterToTweetIndexSnapshotDataset: KeyValDALDataset[ - KeyVal[FullClusterId, TopKTweetsWithScores] - ] = - VideoViewBasedSimclustersClusterToTweetIndexScalaDataset -} - -object RetweetBasedClusterToTweetIndexGenerationAdhocJob - extends UUABasedClusterToTweetIndexGenerationJob { - override val isAdhoc = true - override val getConsumerEmbeddingsSQLFunc = getInterestedIn2020SQL - override val userTweetEngagementEventPairSqlPath: String = - Config.unifiedUserTweetActionPairGenerationSQLPath - override val contributingActionTypes: Seq[String] = Seq(ActionType.ServerTweetRetweet.name) - override val undoActionTypes: Seq[String] = Seq(ActionType.ServerTweetUnretweet.name) - override val undoActionReferenceTweetIdColumn: String = Config.retweetTweetIdColumn - override val outputTable = - BQTableDetails( - "twttr-recos-ml-prod", - "simclusters", - "simclusters_retweet_based_cluster_to_tweet_index") - override val keyValDatasetOutputPath = Config.RetweetBasedClusterToTweetIndexOutputPath - override val clusterToTweetIndexSnapshotDataset: KeyValDALDataset[ - KeyVal[FullClusterId, TopKTweetsWithScores] - ] = - RetweetBasedSimclustersClusterToTweetIndexScalaDataset -} - -object RetweetBasedClusterToTweetIndexGenerationBatchJob - extends UUABasedClusterToTweetIndexGenerationJob { - override val isAdhoc = false - override val getConsumerEmbeddingsSQLFunc = getInterestedIn2020SQL - override val userTweetEngagementEventPairSqlPath: String = - Config.unifiedUserTweetActionPairGenerationSQLPath - override val 
contributingActionTypes: Seq[String] = Seq(ActionType.ServerTweetRetweet.name) - override val undoActionTypes: Seq[String] = Seq(ActionType.ServerTweetUnretweet.name) - override val undoActionReferenceTweetIdColumn: String = Config.retweetTweetIdColumn - override val outputTable = - BQTableDetails( - "twttr-bq-cassowary-prod", - "user", - "simclusters_retweet_based_cluster_to_tweet_index") - override val keyValDatasetOutputPath = Config.RetweetBasedClusterToTweetIndexOutputPath - override val clusterToTweetIndexSnapshotDataset: KeyValDALDataset[ - KeyVal[FullClusterId, TopKTweetsWithScores] - ] = - RetweetBasedSimclustersClusterToTweetIndexScalaDataset -} - -object ReplyBasedClusterToTweetIndexGenerationAdhocJob - extends UUABasedClusterToTweetIndexGenerationJob { - override val isAdhoc = true - override val getConsumerEmbeddingsSQLFunc = getInterestedIn2020SQL - override val userTweetEngagementEventPairSqlPath: String = - Config.combinedUserTweetActionPairGenerationSQLPath - override val contributingActionTypes: Seq[String] = Seq(ActionType.ServerTweetReply.name) - override val undoActionTypes: Seq[String] = Seq(ActionType.ServerTweetDelete.name) - override val undoActionReferenceTweetIdColumn: String = Config.replyTweetIdColumn - override val minInteractionCount: Int = 8 - override val minFavCount: Int = 8 - override val minEngagementPerCluster: Int = 3 - // Add supplemental positive signals to the user tweet engagement event template - // We bundle each reply signal with a positive signal (fav or retweet) - val supplementalPositiveSignals: Seq[String] = - Seq(ActionType.ServerTweetFav.name, ActionType.ServerTweetRetweet.name) - override lazy val userTweetEngagementEventPairTemplateVariable: Map[String, String] = { - Map( - "CONTRIBUTING_ACTION_TYPE_STR" -> contributingActionTypes.head, - "UNDO_ACTION_TYPES_STR" -> undoActionTypesStr, - "UNDO_ACTION_TWEET_ID_COLUMN" -> undoActionReferenceTweetIdColumn, - "SUPPLEMENTAL_ACTION_TYPES_ENGAGEMENT_STR" -> buildActionTypesEngagementIndicatorString( - supplementalPositiveSignals) - ) - } - override val outputTable = - BQTableDetails( - "twttr-recos-ml-prod", - "simclusters", - "simclusters_reply_based_cluster_to_tweet_index") - override val keyValDatasetOutputPath = Config.ReplyBasedClusterToTweetIndexOutputPath - override val clusterToTweetIndexSnapshotDataset: KeyValDALDataset[ - KeyVal[FullClusterId, TopKTweetsWithScores] - ] = - ReplyBasedSimclustersClusterToTweetIndexScalaDataset -} - -object ReplyBasedClusterToTweetIndexGenerationBatchJob - extends UUABasedClusterToTweetIndexGenerationJob { - override val isAdhoc = false - override val getConsumerEmbeddingsSQLFunc = getInterestedIn2020SQL - override val userTweetEngagementEventPairSqlPath: String = - Config.combinedUserTweetActionPairGenerationSQLPath - override val contributingActionTypes: Seq[String] = Seq(ActionType.ServerTweetReply.name) - override val undoActionTypes: Seq[String] = Seq(ActionType.ServerTweetDelete.name) - override val undoActionReferenceTweetIdColumn: String = Config.replyTweetIdColumn - override val minInteractionCount: Int = 8 - override val minFavCount: Int = 8 - override val minEngagementPerCluster: Int = 3 - // Add supplemental positive signals to the user tweet engagement event template - // We bundle each reply signal with a positive signal (fav or retweet) - val supplementalPositiveSignals: Seq[String] = - Seq(ActionType.ServerTweetFav.name, ActionType.ServerTweetRetweet.name) - override lazy val userTweetEngagementEventPairTemplateVariable: Map[String, 
String] = { - Map( - "CONTRIBUTING_ACTION_TYPE_STR" -> contributingActionTypes.head, - "UNDO_ACTION_TYPES_STR" -> undoActionTypesStr, - "UNDO_ACTION_TWEET_ID_COLUMN" -> undoActionReferenceTweetIdColumn, - "SUPPLEMENTAL_ACTION_TYPES_ENGAGEMENT_STR" -> buildActionTypesEngagementIndicatorString( - supplementalPositiveSignals) - ) - } - override val outputTable = - BQTableDetails( - "twttr-bq-cassowary-prod", - "user", - "simclusters_reply_based_cluster_to_tweet_index") - override val keyValDatasetOutputPath = Config.ReplyBasedClusterToTweetIndexOutputPath - override val clusterToTweetIndexSnapshotDataset: KeyValDALDataset[ - KeyVal[FullClusterId, TopKTweetsWithScores] - ] = - ReplyBasedSimclustersClusterToTweetIndexScalaDataset -} - -object PushOpenBasedClusterToTweetIndexGenerationAdhocJob - extends UUABasedClusterToTweetIndexGenerationJob { - override val isAdhoc = true - override val getConsumerEmbeddingsSQLFunc = getInterestedIn2020SQL - override val userTweetEngagementEventPairSqlPath: String = - Config.unifiedUserTweetActionPairGenerationSQLPath - override val contributingActionTypes: Seq[String] = Seq(ActionType.ClientNotificationOpen.name) - override val contributingActionReferenceTweetIdColumn: String = Config.pushTweetIdColumn - override val undoActionTypes: Seq[String] = Seq.empty - override val minInteractionCount = 1 - override val minFavCount = 0 - override val enableFavClusterTopKTweetsIntersection = true - override val outputTable = - BQTableDetails( - "twttr-recos-ml-prod", - "simclusters", - "simclusters_push_open_based_cluster_to_tweet_index") - override val keyValDatasetOutputPath = Config.PushOpenBasedClusterToTweetIndexOutputPath - override val clusterToTweetIndexSnapshotDataset: KeyValDALDataset[ - KeyVal[FullClusterId, TopKTweetsWithScores] - ] = - PushOpenBasedSimclustersClusterToTweetIndexScalaDataset -} - -object PushOpenBasedClusterToTweetIndexGenerationBatchJob - extends UUABasedClusterToTweetIndexGenerationJob { - override val isAdhoc = false - override val getConsumerEmbeddingsSQLFunc = getInterestedIn2020SQL - override val userTweetEngagementEventPairSqlPath: String = - Config.unifiedUserTweetActionPairGenerationSQLPath - override val contributingActionTypes: Seq[String] = Seq(ActionType.ClientNotificationOpen.name) - override val contributingActionReferenceTweetIdColumn: String = Config.pushTweetIdColumn - override val undoActionTypes: Seq[String] = Seq.empty - override val minInteractionCount = 1 - override val minFavCount = 0 - override val enableFavClusterTopKTweetsIntersection = true - override val outputTable = - BQTableDetails( - "twttr-bq-cassowary-prod", - "user", - "simclusters_push_open_based_cluster_to_tweet_index") - override val keyValDatasetOutputPath = Config.PushOpenBasedClusterToTweetIndexOutputPath - override val clusterToTweetIndexSnapshotDataset: KeyValDALDataset[ - KeyVal[FullClusterId, TopKTweetsWithScores] - ] = - PushOpenBasedSimclustersClusterToTweetIndexScalaDataset -} - -object AdsFavBasedClusterToTweetIndexGenerationAdhocJob - extends AdsClusterToTweetIndexGenerationJob { - val isAdhoc: Boolean = true - val getConsumerEmbeddingsSQLFunc = getInterestedIn2020SQL - override val contributingActionTypes: Seq[Int] = AdsFavEngagementTypeIds // fav - override val tweetEmbeddingsHalfLife: Int = 345600000 // 4 days - // The earliest user tweet engagement event we consider is 7 days ago - // The tweet could be older than 7 days - override val maxTweetAgeHours: Int = 168 // 7 days - override val minInteractionCount: Int = 3 - override val 
minFavCount: Int = 3 - override val minEngagementPerCluster: Int = 2 - override val outputTable = - BQTableDetails( - "twttr-recos-ml-prod", - "simclusters", - "simclusters_ads_fav_based_cluster_to_tweet_index") - val keyValDatasetOutputPath: String = Config.AdsFavBasedClusterToTweetIndexOutputPath - val clusterToTweetIndexSnapshotDataset: KeyValDALDataset[ - KeyVal[FullClusterId, TopKTweetsWithScores] - ] = AdsFavBasedSimclustersClusterToTweetIndexScalaDataset - val userTweetEngagementEventPairSqlPath: String = - Config.adsUserTweetActionPairGenerationSQLPath -} -object AdsFavBasedClusterToTweetIndexGenerationBatchJob - extends AdsClusterToTweetIndexGenerationJob { - val isAdhoc: Boolean = false - val getConsumerEmbeddingsSQLFunc = getInterestedIn2020SQL - override val contributingActionTypes: Seq[Int] = AdsFavEngagementTypeIds // fav - override val tweetEmbeddingsHalfLife: Int = 345600000 // 4 days - // The earliest user tweet engagement event we consider is 7 days ago - // The tweet could be older than 7 days - override val maxTweetAgeHours: Int = 168 // 7 days - override val minInteractionCount: Int = 3 - override val minFavCount: Int = 3 - override val minEngagementPerCluster: Int = 2 - override val outputTable = - BQTableDetails( - "twttr-bq-cassowary-prod", - "user", - "simclusters_ads_fav_based_cluster_to_tweet_index") - val keyValDatasetOutputPath: String = Config.AdsFavBasedClusterToTweetIndexOutputPath - val clusterToTweetIndexSnapshotDataset: KeyValDALDataset[ - KeyVal[FullClusterId, TopKTweetsWithScores] - ] = AdsFavBasedSimclustersClusterToTweetIndexScalaDataset - val userTweetEngagementEventPairSqlPath: String = - Config.adsUserTweetActionPairGenerationSQLPath -} - -object AdsFavClickBasedClusterToTweetIndexGenerationAdhocJob - extends AdsClusterToTweetIndexGenerationJob { - val isAdhoc: Boolean = true - val getConsumerEmbeddingsSQLFunc = getInterestedIn2020SQL - override val contributingActionTypes: Seq[Int] = - AdsFavEngagementTypeIds ++ AdsClickEngagementTypeIds // fav + click - override val tweetEmbeddingsHalfLife: Int = 604800000 // 7 days - // The earliest user tweet engagement event we consider is 21 days ago - // The tweet could be older than 21 days - override val maxTweetAgeHours: Int = 504 // 21 days - override val minInteractionCount: Int = 3 - override val minFavCount: Int = 3 - override val minEngagementPerCluster: Int = 2 - override val outputTable = - BQTableDetails( - "twttr-recos-ml-prod", - "simclusters", - "simclusters_ads_fav_click_based_cluster_to_tweet_index") - val keyValDatasetOutputPath: String = Config.AdsFavClickBasedClusterToTweetIndexOutputPath - val clusterToTweetIndexSnapshotDataset: KeyValDALDataset[ - KeyVal[FullClusterId, TopKTweetsWithScores] - ] = AdsFavClickBasedSimclustersClusterToTweetIndexScalaDataset - val userTweetEngagementEventPairSqlPath: String = - Config.adsUserTweetActionPairGenerationSQLPath -} - -object AdsFavClickBasedClusterToTweetIndexGenerationBatchJob - extends AdsClusterToTweetIndexGenerationJob { - val isAdhoc: Boolean = false - val getConsumerEmbeddingsSQLFunc = getInterestedIn2020SQL - override val contributingActionTypes: Seq[Int] = - AdsFavEngagementTypeIds ++ AdsClickEngagementTypeIds // fav + click - override val tweetEmbeddingsHalfLife: Int = 604800000 // 7 days - // The earliest user tweet engagement event we consider is 21 days ago - // The tweet could be older than 21 days - override val maxTweetAgeHours: Int = 504 // 21 days - override val minInteractionCount: Int = 3 - override val minFavCount: Int = 3 - 
override val minEngagementPerCluster: Int = 2 - override val outputTable = - BQTableDetails( - "twttr-bq-cassowary-prod", - "user", - "simclusters_ads_fav_click_based_cluster_to_tweet_index") - val keyValDatasetOutputPath: String = Config.AdsFavClickBasedClusterToTweetIndexOutputPath - val clusterToTweetIndexSnapshotDataset: KeyValDALDataset[ - KeyVal[FullClusterId, TopKTweetsWithScores] - ] = AdsFavClickBasedSimclustersClusterToTweetIndexScalaDataset - val userTweetEngagementEventPairSqlPath: String = - Config.adsUserTweetActionPairGenerationSQLPath -} - -object FavBasedEvergreenContentClusterToTweetIndexGenerationAdhocJob - extends UUABasedClusterToTweetIndexGenerationJob { - override val isAdhoc = true - override val getConsumerEmbeddingsSQLFunc = getInterestedIn2020SQL - override val userTweetEngagementEventPairSqlPath: String = - Config.evergreenContentUserTweetActionPairGenerationSQLPath - override val contributingActionTypes: Seq[String] = Seq(ActionType.ServerTweetFav.name) - override val undoActionTypes: Seq[String] = Seq(ActionType.ServerTweetUnfav.name) - override val tweetEmbeddingsHalfLife: Int = 57600000 // 16 hours - override val maxTweetAgeHours: Int = 48 // 2 days - override val minInteractionCount: Int = 8 - override val minFavCount: Int = 0 - override val outputTable = - BQTableDetails( - "twttr-recos-ml-prod", - "simclusters", - "simclusters_fav_based_evergreen_content_cluster_to_tweet_index") - override val keyValDatasetOutputPath = - Config.FavBasedEvergreenContentClusterToTweetIndexOutputPath - override val clusterToTweetIndexSnapshotDataset: KeyValDALDataset[ - KeyVal[FullClusterId, TopKTweetsWithScores] - ] = - FavBasedEvergreenContentSimclustersClusterToTweetIndexScalaDataset -} - -object FavBasedEvergreenContentClusterToTweetIndexGenerationBatchJob - extends UUABasedClusterToTweetIndexGenerationJob { - override val isAdhoc = false - override val getConsumerEmbeddingsSQLFunc = getInterestedIn2020SQL - override val userTweetEngagementEventPairSqlPath: String = - Config.evergreenContentUserTweetActionPairGenerationSQLPath - override val contributingActionTypes: Seq[String] = Seq(ActionType.ServerTweetFav.name) - override val undoActionTypes: Seq[String] = Seq(ActionType.ServerTweetUnfav.name) - override val tweetEmbeddingsHalfLife: Int = 57600000 // 16 hours - override val maxTweetAgeHours: Int = 48 // 2 days - override val minInteractionCount: Int = 8 - override val minFavCount: Int = 0 - override val outputTable = - BQTableDetails( - "twttr-bq-cassowary-prod", - "user", - "simclusters_fav_based_evergreen_content_cluster_to_tweet_index") - override val keyValDatasetOutputPath = - Config.FavBasedEvergreenContentClusterToTweetIndexOutputPath - override val clusterToTweetIndexSnapshotDataset: KeyValDALDataset[ - KeyVal[FullClusterId, TopKTweetsWithScores] - ] = - FavBasedEvergreenContentSimclustersClusterToTweetIndexScalaDataset -} - -object FavBasedVideoClusterToTweetIndexGenerationAdhocJob - extends UUABasedClusterToTweetIndexGenerationJob { - override val isAdhoc = true - override val getConsumerEmbeddingsSQLFunc = getInterestedIn2020SQL - override val userTweetEngagementEventPairSqlPath: String = - Config.favBasedVideoTweetActionPairGenerationSQLPath - override val contributingActionTypes: Seq[String] = Seq(ActionType.ServerTweetFav.name) - override val undoActionTypes: Seq[String] = Seq(ActionType.ServerTweetUnfav.name) - override val minInteractionCount: Int = 8 - override val minFavCount: Int = 0 - override val outputTable = - BQTableDetails( - 
"twttr-recos-ml-prod", - "simclusters", - "simclusters_fav_based_video_cluster_to_tweet_index") - override val keyValDatasetOutputPath = - Config.FavBasedVideoClusterToTweetIndexOutputPath - override val clusterToTweetIndexSnapshotDataset: KeyValDALDataset[ - KeyVal[FullClusterId, TopKTweetsWithScores] - ] = - FavBasedVideoSimclustersClusterToTweetIndexScalaDataset -} - -object FavBasedVideoClusterToTweetIndexGenerationBatchJob - extends UUABasedClusterToTweetIndexGenerationJob { - override val isAdhoc = false - override val getConsumerEmbeddingsSQLFunc = getInterestedIn2020SQL - override val userTweetEngagementEventPairSqlPath: String = - Config.favBasedVideoTweetActionPairGenerationSQLPath - override val contributingActionTypes: Seq[String] = Seq(ActionType.ServerTweetFav.name) - override val undoActionTypes: Seq[String] = Seq(ActionType.ServerTweetUnfav.name) - override val minInteractionCount: Int = 8 - override val minFavCount: Int = 0 - override val outputTable = - BQTableDetails( - "twttr-bq-cassowary-prod", - "user", - "simclusters_fav_based_video_cluster_to_tweet_index") - override val keyValDatasetOutputPath = - Config.FavBasedVideoClusterToTweetIndexOutputPath - override val clusterToTweetIndexSnapshotDataset: KeyValDALDataset[ - KeyVal[FullClusterId, TopKTweetsWithScores] - ] = - FavBasedVideoSimclustersClusterToTweetIndexScalaDataset -} diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/README b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/README deleted file mode 100644 index 4b6a2dc16..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/README +++ /dev/null @@ -1,146 +0,0 @@ -# Adhoc SimClusters Cluster-to-tweet Index Generation Jobs -## Build and bundle the binaries - -``` - bazel bundle src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/... 
-``` - -## Run the adhoc jobs -### To run fav based cluster-to-tweet index generation job (adhoc): -bin/d6w create \ - twttr-recos-ml-prod/us-central1/fav-based-index-generation-adhoc-job \ - src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/engagement-event-based-simclusters-index-generation-job.d6w \ - --jar dist/fav-based-cluster-to-tweet-index-generation-adhoc-job-bundle/fav-based-cluster-to-tweet-index-generation-adhoc-job.jar \ - --bind=profile.user_name=your_ldap \ - --bind=profile.date="2022-07-15" \ - --bind=profile.frequency="PT1H" \ - --bind=profile.environment=dev \ - --bind=profile.job_name="fav-based-index-generation-adhoc-job" \ - --bind=profile.build_target="src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation:fav-based-cluster-to-tweet-index-generation-adhoc-job" - -### To run VideoView based cluster-to-tweet index generation job (adhoc): -bin/d6w create \ - twttr-recos-ml-prod/us-central1/video-view-based-index-generation-adhoc-job \ - src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/engagement-event-based-simclusters-index-generation-job.d6w \ - --jar dist/video-view-based-cluster-to-tweet-index-generation-adhoc-job-bundle/video-view-based-cluster-to-tweet-index-generation-adhoc-job.jar \ - --bind=profile.user_name=your_ldap \ - --bind=profile.date="2022-07-15" \ - --bind=profile.frequency="PT1H" \ - --bind=profile.environment=dev \ - --bind=profile.job_name="video-view-based-index-generation-adhoc-job" \ - --bind=profile.build_target="src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation:video-view-based-cluster-to-tweet-index-generation-adhoc-job" - -### To run retweet based cluster-to-tweet index generation job (adhoc): -bin/d6w create \ - twttr-recos-ml-prod/us-central1/retweet-based-index-generation-adhoc-job \ - src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/engagement-event-based-simclusters-index-generation-job.d6w \ - --jar dist/retweet-based-cluster-to-tweet-index-generation-adhoc-job-bundle/retweet-based-cluster-to-tweet-index-generation-adhoc-job.jar \ - --bind=profile.user_name=your_ldap \ - --bind=profile.date="2022-07-15" \ - --bind=profile.frequency="PT1H" \ - --bind=profile.environment=dev \ - --bind=profile.job_name="retweet-based-index-generation-adhoc-job" \ - --bind=profile.build_target="src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation:retweet-based-cluster-to-tweet-index-generation-adhoc-job" - -### To run reply based cluster-to-tweet index generation job (adhoc): -bin/d6w create \ - twttr-recos-ml-prod/us-central1/reply-based-index-generation-adhoc-job \ - src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/engagement-event-based-simclusters-index-generation-job.d6w \ - --jar dist/reply-based-cluster-to-tweet-index-generation-adhoc-job-bundle/reply-based-cluster-to-tweet-index-generation-adhoc-job.jar \ - --bind=profile.user_name=your_ldap \ - --bind=profile.date="2022-07-15" \ - --bind=profile.frequency="PT1H" \ - --bind=profile.environment=dev \ - --bind=profile.job_name="reply-based-index-generation-adhoc-job" \ - --bind=profile.build_target="src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation:reply-based-cluster-to-tweet-index-generation-adhoc-job" - -### To run push open based cluster-to-tweet index generation job (adhoc): -bin/d6w create \ - 
twttr-recos-ml-prod/us-central1/push-open-based-index-generation-adhoc-job \ - src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/engagement-event-based-simclusters-index-generation-job.d6w \ - --jar dist/push-open-based-cluster-to-tweet-index-generation-adhoc-job-bundle/push-open-based-cluster-to-tweet-index-generation-adhoc-job.jar \ - --bind=profile.user_name=your_ldap \ - --bind=profile.date="2022-10-06" \ - --bind=profile.frequency="PT1H" \ - --bind=profile.environment=dev \ - --bind=profile.job_name="push-open-based-index-generation-adhoc-job" \ - --bind=profile.build_target="src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation:push-open-based-cluster-to-tweet-index-generation-adhoc-job" - -# For prod scheduled Cluster-to-tweet Index Generation Jobs -### To run Fav based cluster-to-tweet index generation job (batch): - bin/d6w schedule \ - twttr-recos-ml-prod/us-central1/fav-based-index-generation-batch-job \ - src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/engagement-event-based-simclusters-index-generation-job.d6w \ - --bind=profile.user_name=cassowary \ - --bind=profile.environment=prod \ - --bind=profile.date="2022-07-19" \ - --bind=profile.frequency="PT1H" \ - --bind=profile.job_name="fav-based-index-generation-batch-job" \ - --bind=profile.build_target="src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation:fav-based-cluster-to-tweet-index-generation-batch-job" - -### To run VideoView based cluster-to-tweet index generation job (batch): - bin/d6w schedule \ - twttr-recos-ml-prod/us-central1/video-view-based-index-generation-batch-job \ - src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/engagement-event-based-simclusters-index-generation-job.d6w \ - --bind=profile.user_name=cassowary \ - --bind=profile.environment=prod \ - --bind=profile.date="2022-07-19" \ - --bind=profile.frequency="PT1H" \ - --bind=profile.job_name="video-view-based-index-generation-batch-job" \ - --bind=profile.build_target="src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation:video-view-based-cluster-to-tweet-index-generation-batch-job" - -### To run Retweet based cluster-to-tweet index generation job (batch): - bin/d6w schedule \ - twttr-recos-ml-prod/us-central1/retweet-based-index-generation-batch-job \ - src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/engagement-event-based-simclusters-index-generation-job.d6w \ - --bind=profile.user_name=cassowary \ - --bind=profile.environment=prod \ - --bind=profile.date="2022-07-19" \ - --bind=profile.frequency="PT1H" \ - --bind=profile.job_name="retweet-based-index-generation-batch-job" \ - --bind=profile.build_target="src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation:retweet-based-cluster-to-tweet-index-generation-batch-job" - -### To run Reply based cluster-to-tweet index generation job (batch): - bin/d6w schedule \ - twttr-recos-ml-prod/us-central1/reply-based-index-generation-batch-job \ - src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/engagement-event-based-simclusters-index-generation-job.d6w \ - --bind=profile.user_name=cassowary \ - --bind=profile.environment=prod \ - --bind=profile.date="2022-07-19" \ - --bind=profile.frequency="PT1H" \ - --bind=profile.job_name="reply-based-index-generation-batch-job" \ - 
--bind=profile.build_target="src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation:reply-based-cluster-to-tweet-index-generation-batch-job" - -### To run Push open based cluster-to-tweet index generation job (batch): - bin/d6w schedule \ - twttr-recos-ml-prod/us-central1/push-open-based-index-generation-batch-job \ - src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/engagement-event-based-simclusters-index-generation-job.d6w \ - --bind=profile.user_name=cassowary \ - --bind=profile.environment=prod \ - --bind=profile.date="2022-10-06" \ - --bind=profile.frequency="PT1H" \ - --bind=profile.job_name="push-open-based-index-generation-batch-job" \ - --bind=profile.build_target="src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation:push-open-based-cluster-to-tweet-index-generation-batch-job" - -### To run Ads Fav based cluster-to-tweet index generation job (batch): - bin/d6w schedule \ - twttr-recos-ml-prod/us-central1/ads-fav-based-index-generation-batch-job \ - src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/engagement-event-based-simclusters-index-generation-job.d6w \ - --bind=profile.user_name=cassowary \ - --bind=profile.environment=prod \ - --bind=profile.date="2022-10-06" \ - --bind=profile.frequency="PT3H" \ - --bind=profile.job_name="ads-fav-based-index-generation-batch-job" \ - --bind=profile.build_target="src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation:ads-fav-based-cluster-to-tweet-index-generation-batch-job" - -### To run Ads Fav Click based cluster-to-tweet index generation job (batch): - bin/d6w schedule \ - twttr-recos-ml-prod/us-central1/ads-fav-click-based-index-generation-batch-job \ - src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/engagement-event-based-simclusters-index-generation-job.d6w \ - --bind=profile.user_name=cassowary \ - --bind=profile.environment=prod \ - --bind=profile.date="2022-12-09" \ - --bind=profile.frequency="PT3H" \ - --bind=profile.job_name="ads-fav-click-based-index-generation-batch-job" \ - --bind=profile.build_target="src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation:ads-fav-click-based-cluster-to-tweet-index-generation-batch-job" - diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/README.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/README.docx new file mode 100644 index 000000000..f593a5fed Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/README.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/engagement-event-based-simclusters-index-generation-job.d6w b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/engagement-event-based-simclusters-index-generation-job.d6w deleted file mode 100644 index 8da4ba43c..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/engagement-event-based-simclusters-index-generation-job.d6w +++ /dev/null @@ -1,44 +0,0 @@ -class Profile(Struct): - project = Default(String, 'twttr-recos-ml-prod') - date = Required(String) - build_target = Required(String) - job_name = Required(String) - environment = Default(String, 'dev') - machine = Default(String, 'n2-standard-2') - frequency = 
Default(String, 'PT1H') - -SimClustersIndexGenerationJob = Job( - name='{{profile.job_name}}', - project='{{profile.project}}', - staging_bucket='{{profile.project}}', - service_account='{{profile.user_name}}-shdw@twttr-dp-svc-accounts.iam.gserviceaccount.com', - region='us-central1', - worker_config=WorkerConfig( - num_workers=2, - worker_machine_type='{{profile.machine}}', - worker_disk_type=WorkerDiskType('HDD'), - ), - extra_args={ - "date": '{{profile.date}}' - }, - service_identifier='twtr:svc:{{profile.user_name}}:{{profile.job_name}}:{{profile.environment}}:{{profile.cluster}}', - deployment_config=BatchDeploymentConfig( - role='{{profile.user_name}}', - environment='{{profile.environment}}', - build_target='{{profile.build_target}}', - gcp_deployment_credentials='/var/lib/tss/keys/{{profile.user_name}}/cloud/gcp/dp/shadow.json', - statebird_config=StatebirdConfig( - batch_width='{{profile.frequency}}', - first_time='{{profile.date}}', - ), - workflow_config=WorkflowConfig( - play=True, - ), - timeout='PT4H' - ) -) - -jobs=[SimClustersIndexGenerationJob.bind(profile=Profile())] - - - diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/engagement-event-based-simclusters-index-generation-job.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/engagement-event-based-simclusters-index-generation-job.docx new file mode 100644 index 000000000..43c347c99 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/simclusters_index_generation/engagement-event-based-simclusters-index-generation-job.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/BUILD b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/BUILD deleted file mode 100644 index ba87e2b54..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/BUILD +++ /dev/null @@ -1,3 +0,0 @@ -resources( - sources = ["*"], -) diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/BUILD.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/BUILD.docx new file mode 100644 index 000000000..588d56c57 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/ads_user_tweet_action_pair_generation.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/ads_user_tweet_action_pair_generation.docx new file mode 100644 index 000000000..137a15791 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/ads_user_tweet_action_pair_generation.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/ads_user_tweet_action_pair_generation.sql b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/ads_user_tweet_action_pair_generation.sql deleted file mode 100644 index c5f1e702a..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/ads_user_tweet_action_pair_generation.sql +++ /dev/null @@ -1,38 +0,0 @@ -WITH - vars AS ( - SELECT - TIMESTAMP("{START_TIME}") AS start_date, - TIMESTAMP("{END_TIME}") AS end_date - ), - - ads_engagement AS ( - SELECT - userId64 as userId, - promotedTweetId as tweetId, - UNIX_MILLIS(timestamp) AS tsMillis, - lineItemId - FROM `twttr-rev-core-data-prod.core_served_impressions.spend`, vars - WHERE TIMESTAMP(_batchEnd) >= vars.start_date AND TIMESTAMP(_batchEnd) <= vars.end_date - AND - engagementType IN 
({CONTRIBUTING_ACTION_TYPES_STR}) - AND lineItemObjective != 9 -- not pre-roll ads - ), - - line_items AS ( - SELECT - id AS lineItemId, - end_time.posixTime AS endTime - FROM - `twttr-rev-core-data-prod.rev_ads_production.line_items` - ) - - -SELECT - userId, - tweetId, - tsMillis -FROM ads_engagement JOIN line_items USING(lineItemId), vars -WHERE - line_items.endTime IS NULL - OR TIMESTAMP_MILLIS(line_items.endTime) >= vars.end_date - diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/cluster_top_tweets.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/cluster_top_tweets.docx new file mode 100644 index 000000000..c2eed4315 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/cluster_top_tweets.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/cluster_top_tweets.sql b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/cluster_top_tweets.sql deleted file mode 100644 index 9150e7c92..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/cluster_top_tweets.sql +++ /dev/null @@ -1,15 +0,0 @@ -WITH tweet_embedding AS ( --- Expected columns: --- tweetId, clusterId, tweetScore - {TWEET_EMBEDDING_SQL} -), -clusters_top_k_tweets AS ( - SELECT clusterId, ARRAY_AGG(STRUCT(tweetId, tweetScore) ORDER BY tweetScore DESC LIMIT {CLUSTER_TOP_K_TWEETS}) AS topKTweetsForClusterKey - FROM tweet_embedding - GROUP BY clusterId -) -SELECT - clusterId, - topKTweetsForClusterKey -FROM clusters_top_k_tweets - diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/cluster_top_tweets_intersection_with_fav_based_index.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/cluster_top_tweets_intersection_with_fav_based_index.docx new file mode 100644 index 000000000..671f6e730 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/cluster_top_tweets_intersection_with_fav_based_index.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/cluster_top_tweets_intersection_with_fav_based_index.sql b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/cluster_top_tweets_intersection_with_fav_based_index.sql deleted file mode 100644 index 52d13c154..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/cluster_top_tweets_intersection_with_fav_based_index.sql +++ /dev/null @@ -1,59 +0,0 @@ -WITH - cluster_top_tweets AS ( - {CLUSTER_TOP_TWEETS_SQL} - ), - - flatten_cluster_top_tweets AS ( - SELECT - clusterId, - tweet.tweetId, - tweet.tweetScore, - FROM cluster_top_tweets, UNNEST(topKTweetsForClusterKey) AS tweet - ), - ---- The fav-based dataset may be delayed or have missing partitions. ---- This query retrieves the dateHour of the latest partition available; the sketch below mirrors the intersection the following CTEs implement. 
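The CTEs that follow pin the fav-based index to its newest available partition, intersect each cluster's candidate tweets with the fav-based top tweets for the same cluster, and re-aggregate a top-K list per cluster. A minimal Scala sketch of that intersection-and-rerank, using hypothetical in-memory types (the production logic is only the BigQuery SQL in this file):

```scala
// Hypothetical in-memory mirror of the intersection CTEs in this query.
final case class ScoredTweet(tweetId: Long, tweetScore: Double)

def intersectAndRerank(
  candidateIndex: Map[Long, Seq[ScoredTweet]], // clusterId -> candidate top tweets
  favIndex: Map[Long, Set[Long]], // clusterId -> tweetIds from the fav-based index
  topK: Int
): Map[Long, Seq[ScoredTweet]] =
  candidateIndex.map {
    case (clusterId, tweets) =>
      val favTweetIds = favIndex.getOrElse(clusterId, Set.empty[Long])
      // Keep only tweets that also appear in the fav-based index for this
      // cluster, then take the K highest-scoring survivors.
      clusterId -> tweets
        .filter(t => favTweetIds.contains(t.tweetId))
        .sortBy(-_.tweetScore)
        .take(topK)
  }
```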
- latest_fav_cluster_to_tweet AS ( - SELECT - MAX(dateHour) AS latestTimestamp - FROM - `twttr-bq-cassowary-prod.user.simclusters_fav_based_cluster_to_tweet_index` - WHERE - TIMESTAMP(dateHour) >= TIMESTAMP("{START_TIME}") - AND TIMESTAMP(dateHour) <= TIMESTAMP("{END_TIME}") - ), - - flatten_fav_cluster_top_tweets AS ( - SELECT - clusterId.clusterId AS clusterId, - tweet.key AS tweetId - FROM - `twttr-bq-cassowary-prod.user.simclusters_fav_based_cluster_to_tweet_index`, - UNNEST(topKTweetsWithScores.topTweetsByFavClusterNormalizedScore) AS tweet, - latest_fav_cluster_to_tweet - WHERE - dateHour=latest_fav_cluster_to_tweet.latestTimestamp - ), - - flatten_cluster_top_tweets_intersection AS ( - SELECT - clusterId, - flatten_cluster_top_tweets.tweetId, - flatten_cluster_top_tweets.tweetScore - FROM - flatten_cluster_top_tweets - INNER JOIN - flatten_fav_cluster_top_tweets - USING(clusterId, tweetId) - ), - - processed_cluster_top_tweets AS ( - SELECT - clusterId, - ARRAY_AGG(STRUCT(tweetId, tweetScore) ORDER BY tweetScore DESC LIMIT {CLUSTER_TOP_K_TWEETS}) AS topKTweetsForClusterKey - FROM flatten_cluster_top_tweets_intersection - GROUP BY clusterId - ) - - SELECT * - FROM processed_cluster_top_tweets diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/combined_user_tweet_action_pair_generation.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/combined_user_tweet_action_pair_generation.docx new file mode 100644 index 000000000..6812578bb Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/combined_user_tweet_action_pair_generation.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/combined_user_tweet_action_pair_generation.sql b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/combined_user_tweet_action_pair_generation.sql deleted file mode 100644 index ed8880c11..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/combined_user_tweet_action_pair_generation.sql +++ /dev/null @@ -1,68 +0,0 @@ -WITH - vars AS ( - SELECT - TIMESTAMP("{START_TIME}") AS start_date, - TIMESTAMP("{END_TIME}") AS end_date, - TIMESTAMP("{NO_OLDER_TWEETS_THAN_DATE}") AS no_older_tweets_than_date - ), - - -- Get raw user-tweet interaction events from UUA - actions_unioned AS ( - SELECT - userIdentifier.userId AS userId, - item.tweetInfo.actionTweetId AS tweetId, - eventMetadata.sourceTimestampMs AS tsMillis, - CASE - WHEN actionType = "ServerTweetFav" THEN 1 - WHEN actionType = "ServerTweetUnfav" THEN -1 - END AS favAction, - CASE - WHEN actionType = "ServerTweetReply" THEN 1 - WHEN actionType = "ServerTweetDelete" THEN -1 - END AS replyAction, - CASE - WHEN actionType = "ServerTweetRetweet" THEN 1 - WHEN actionType = "ServerTweetUnretweet" THEN -1 - END AS retweetAction, - IF(actionType = "ClientTweetVideoPlayback50", 1, NULL) AS videoPlayback50Action - FROM `twttr-bql-unified-prod.unified_user_actions_engagements.streaming_unified_user_actions_engagements`, vars - WHERE (DATE(dateHour) >= DATE(vars.start_date) AND DATE(dateHour) <= DATE(vars.end_date)) - AND eventMetadata.sourceTimestampMs >= UNIX_MILLIS(vars.start_date) - AND eventMetadata.sourceTimestampMs <= UNIX_MILLIS(vars.end_date) - AND (actionType = "ServerTweetReply" - OR actionType = "ServerTweetRetweet" - OR actionType = "ServerTweetFav" - OR actionType = "ServerTweetUnfav" - OR actionType = "ClientTweetVideoPlayback50" - ) - ), - - user_tweet_action_pairs AS ( - SELECT - userId, - tweetId, - -- Get the most recent fav 
event - ARRAY_AGG(IF(favAction IS NOT NULL, STRUCT(favAction AS engaged, tsMillis), NULL) IGNORE NULLS ORDER BY tsMillis DESC LIMIT 1)[OFFSET(0)] as ServerTweetFav, - -- Get the most recent reply / delete event - ARRAY_AGG(IF(replyAction IS NOT NULL, STRUCT(replyAction AS engaged, tsMillis), NULL) IGNORE NULLS ORDER BY tsMillis DESC LIMIT 1)[OFFSET(0)] as ServerTweetReply, - -- Get the most recent retweet / unretweet event - ARRAY_AGG(IF(retweetAction IS NOT NULL, STRUCT(retweetAction AS engaged, tsMillis), NULL) IGNORE NULLS ORDER BY tsMillis DESC LIMIT 1)[OFFSET(0)] as ServerTweetRetweet, - -- Get the most recent video view event - ARRAY_AGG(IF(videoPlayback50Action IS NOT NULL, STRUCT(videoPlayback50Action AS engaged, tsMillis), NULL) IGNORE NULLS ORDER BY tsMillis DESC LIMIT 1)[OFFSET(0)] as ClientTweetVideoPlayback50 - FROM actions_unioned - GROUP BY userId, tweetId - ) - --- Combine signals --- Apply age filter in this step -SELECT - userId, - tweetId, - CAST({CONTRIBUTING_ACTION_TYPE_STR}.tsMillis AS FLOAT64) AS tsMillis -FROM user_tweet_action_pairs, vars -WHERE - {CONTRIBUTING_ACTION_TYPE_STR}.engaged = 1 - AND - ({SUPPLEMENTAL_ACTION_TYPES_ENGAGEMENT_STR}) - AND timestamp_millis((1288834974657 + - ((tweetId & 9223372036850581504) >> 22))) >= vars.no_older_tweets_than_date diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/engagement_based_index_generation.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/engagement_based_index_generation.docx new file mode 100644 index 000000000..760c4870c Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/engagement_based_index_generation.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/engagement_based_index_generation.sql b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/engagement_based_index_generation.sql deleted file mode 100644 index 5adce6f4b..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/engagement_based_index_generation.sql +++ /dev/null @@ -1,85 +0,0 @@ --- This SQL query generates the cluster to top k tweets index based on tweet engagements. --- The engagement type is decided by USER_TWEET_ENGAGEMENT_TABLE_SQL. --- (A sketch of the half-life decay used below follows.) 
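The query body below weights each engagement by an exponential half-life decay before summing per (tweet, cluster) pair, and halfLife = -1 disables the decay entirely, as the inline comment in the query notes. A minimal Scala sketch of that scoring rule, with hypothetical names (the 345600000 and 604800000 values in the ads jobs above are half-lives in milliseconds):

```scala
// Hypothetical sketch of the half-life scoring used by the SQL below: each
// engagement contributes score * 0.5 ^ ((currentTs - tsMillis) / halfLife),
// and halfLife == -1 means "no decay, take the plain sum".
final case class Engagement(clusterScore: Double, tsMillis: Long)

def decayedClusterScore(
  engagements: Seq[Engagement],
  currentTsMillis: Long,
  halfLifeMillis: Long // e.g. 345600000L = 4 days for the ads fav jobs
): Double =
  if (halfLifeMillis == -1L) engagements.map(_.clusterScore).sum
  else
    engagements.map { e =>
      val ageMillis = (currentTsMillis - e.tsMillis).toDouble
      e.clusterScore * math.pow(0.5, ageMillis / halfLifeMillis)
    }.sum
```

With this weighting, an engagement exactly one half-life old counts for half of a fresh one, so shorter half-lives (16 hours for the evergreen jobs, 4 days for ads favs) bias each cluster's top-K toward recent activity.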
- -with vars as ( - SELECT {HALF_LIFE} AS halfLife, -- Default: 8 hour halfLife in millis - UNIX_MILLIS("{CURRENT_TS}") AS currentTs, - ), - - user_tweet_engagement_pairs AS ( - {USER_TWEET_ENGAGEMENT_TABLE_SQL} - ), - - -- A sequence of filters to get eligible tweetIds for tweet embedding generation - -- Apply min interaction count filter - user_tweet_interaction_with_min_interaction_count_filter AS ( - SELECT userId, user_tweet_engagement_pairs.tweetId, tsMillis - FROM user_tweet_engagement_pairs, vars - JOIN ( - SELECT tweetId, COUNT(DISTINCT(userId)) AS interactionCount - FROM user_tweet_engagement_pairs - GROUP BY tweetId - HAVING interactionCount >= {MIN_INTERACTION_COUNT} -- Only generate tweet embeddings for tweets with >= {MIN_INTERACTION_COUNT} interactions - ) eligible_tweets USING(tweetId) - ), - - -- Apply min fav count filter - user_tweet_interaction_with_fav_count_filter AS ( - {TWEET_INTERACTION_WITH_FAV_COUNT_FILTER_SQL} - ), - - -- Apply health and video filter - user_tweet_interaction_with_health_filter AS ( - {TWEET_INTERACTION_WITH_HEALTH_FILTER_SQL} - ), - - -- Final filtered user tweet interaction table - -- Read the result from the last filter - user_tweet_interaction_processed_table AS ( - SELECT * - FROM user_tweet_interaction_with_health_filter - ), - - -- Read consumer embeddings - consumer_embeddings AS ( - {CONSUMER_EMBEDDINGS_SQL} - ), - - -- Update tweet cluster scores based on interaction events - tweet_cluster_scores AS ( - SELECT tweetId, - STRUCT( - clusterId, - CASE vars.halfLife - -- halfLife = -1 means there is no half life decay and we directly take the sum as the score - WHEN -1 THEN SUM(clusterNormalizedLogFavScore) - ELSE SUM(clusterNormalizedLogFavScore * POW(0.5, (currentTs - tsMillis) / vars.halfLife)) - END AS normalizedScore, - COUNT(*) AS engagementCount) - AS clusterIdToScores - FROM user_tweet_interaction_processed_table, vars - JOIN consumer_embeddings USING(userId) - GROUP BY tweetId, clusterId, vars.halfLife - ), - - -- Generate tweet embeddings - tweet_embeddings_with_top_clusters AS ( - SELECT tweetId, ARRAY_AGG( - clusterIdToScores - ORDER BY clusterIdToScores.normalizedScore DESC - LIMIT {TWEET_EMBEDDING_LENGTH} - ) AS clusterIdToScores - FROM tweet_cluster_scores - GROUP BY tweetId - ), - - clusters_top_k_tweets AS ( - SELECT clusterId, ARRAY_AGG(STRUCT(tweetId, normalizedScore AS tweetScore) ORDER BY normalizedScore DESC LIMIT {CLUSTER_TOP_K_TWEETS}) AS topKTweetsForClusterKey - FROM tweet_embeddings_with_top_clusters, UNNEST(clusterIdToScores) AS clusterIdToScores - WHERE engagementCount >= {MIN_ENGAGEMENT_PER_CLUSTER} - GROUP BY clusterId - ) - -SELECT * -FROM clusters_top_k_tweets - diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/evergreen_content_user_tweet_action_pair_generation.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/evergreen_content_user_tweet_action_pair_generation.docx new file mode 100644 index 000000000..74089d12f Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/evergreen_content_user_tweet_action_pair_generation.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/evergreen_content_user_tweet_action_pair_generation.sql b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/evergreen_content_user_tweet_action_pair_generation.sql deleted file mode 100644 index a23763a06..000000000 --- 
a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/evergreen_content_user_tweet_action_pair_generation.sql +++ /dev/null @@ -1,62 +0,0 @@ -WITH - vars AS ( - SELECT - TIMESTAMP("{START_TIME}") AS start_date, - TIMESTAMP("{END_TIME}") AS end_date, - ), - - -- Get raw user-tweet interaction events from UUA - raw_engagements AS ( - SELECT - userIdentifier.userId AS userId, - eventMetadata.sourceTimestampMs AS tsMillis, - CASE - WHEN actionType IN ({CONTRIBUTING_ACTION_TYPES_STR}) THEN {CONTRIBUTING_ACTION_TWEET_ID_COLUMN} - WHEN actionType IN ({UNDO_ACTION_TYPES_STR}) THEN {UNDO_ACTION_TWEET_ID_COLUMN} - END AS tweetId, - CASE - WHEN actionType IN ({CONTRIBUTING_ACTION_TYPES_STR}) THEN 1 - WHEN actionType IN ({UNDO_ACTION_TYPES_STR}) THEN -1 - END AS doOrUndo - FROM `twttr-bql-unified-prod.unified_user_actions_engagements.streaming_unified_user_actions_engagements`, vars - WHERE (DATE(dateHour) >= DATE(vars.start_date) AND DATE(dateHour) <= DATE(vars.end_date)) - AND eventMetadata.sourceTimestampMs >= UNIX_MILLIS(vars.start_date) - AND eventMetadata.sourceTimestampMs <= UNIX_MILLIS(vars.end_date) - AND (actionType IN ({CONTRIBUTING_ACTION_TYPES_STR}) - OR actionType IN ({UNDO_ACTION_TYPES_STR})) - ), - - -- Get evergreen tweet ids - evergreen_tweet_ids AS ( - SELECT - tweetId - FROM `twttr-recos-ml-prod.simclusters.evergreen_content_data` - WHERE TIMESTAMP(ts) = - ( -- Get latest partition time - SELECT MAX(TIMESTAMP(ts)) latest_partition - FROM `twttr-recos-ml-prod.simclusters.evergreen_content_data` - WHERE DATE(ts) BETWEEN - DATE_SUB(DATE("{END_TIME}"), - INTERVAL 14 DAY) AND DATE("{END_TIME}") - ) - ), - - -- Join evergreen content table - evergreen_tweets_engagements AS ( - SELECT raw_engagements.* - FROM raw_engagements JOIN evergreen_tweet_ids USING(tweetId) - ), - - -- Group by userId and tweetId - user_tweet_engagement_pairs AS ( - SELECT userId, tweetId, ARRAY_AGG(STRUCT(doOrUndo, tsMillis) ORDER BY tsMillis DESC LIMIT 1) AS details, COUNT(*) AS cnt - FROM evergreen_tweets_engagements - GROUP BY userId, tweetId - ) - --- Remove undo events -SELECT userId, tweetId, CAST(dt.tsMillis AS FLOAT64) AS tsMillis -FROM user_tweet_engagement_pairs, vars -CROSS JOIN UNNEST(details) AS dt -WHERE cnt <= 10 - AND dt.doOrUndo = 1 diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/nsfw_tweet_denylist.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/nsfw_tweet_denylist.docx new file mode 100644 index 000000000..1fdb468e2 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/nsfw_tweet_denylist.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/nsfw_tweet_denylist.sql b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/nsfw_tweet_denylist.sql deleted file mode 100644 index 472218075..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/nsfw_tweet_denylist.sql +++ /dev/null @@ -1,43 +0,0 @@ - SELECT DISTINCT tweetId - FROM `twttr-bq-tweetsource-prod.user.unhydrated_flat`, UNNEST(entity_annotations) AS ea - WHERE - (DATE(_PARTITIONTIME) >= DATE("{START_TIME}") AND DATE(_PARTITIONTIME) <= DATE("{END_TIME}")) AND - timestamp_millis((1288834974657 + - ((tweetId & 9223372036850581504) >> 22))) >= TIMESTAMP("{START_TIME}") - AND timestamp_millis((1288834974657 + - ((tweetId & 9223372036850581504) >> 22))) <= TIMESTAMP("{END_TIME}") - AND ( - ea.entityId IN ( - 883054128338878464, - 1453131634669019141, - 1470464132432347136, - 1167512219786997760, - 
1151588902739644416, - 1151920148661489664, - 1155582950991228928, - 738501328687628288, - 1047106191829028865 - ) - OR ( - ea.groupId IN (34, 35) # Cortex media understanding - AND ea.entityId IN ( - 1072916828484038657, - 1133752108212035585, - 1072916828488327170 - ) - ) - OR ( - ea.groupId IN (14) # Agatha Tweet Health Annotations - AND ea.entityId IN ( - 1242898721278324736, - 1230229436697473026, - 1230229470050603008 - ) - ) - OR ( - ea.groupId IN (10) - AND ea.entityId IN ( - 953701302608961536 - ) - ) - ) diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/tweet_embeddings_generation.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/tweet_embeddings_generation.docx new file mode 100644 index 000000000..3dc573fd1 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/tweet_embeddings_generation.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/tweet_embeddings_generation.sql b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/tweet_embeddings_generation.sql deleted file mode 100644 index ffd14729c..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/tweet_embeddings_generation.sql +++ /dev/null @@ -1,104 +0,0 @@ -with vars as ( - SELECT - UNIX_MILLIS("{QUERY_DATE}") AS currentTs, - TIMESTAMP("{START_TIME}") AS startTime, - TIMESTAMP("{END_TIME}") AS endTime, - {MIN_SCORE_THRESHOLD} AS tweetEmbeddingsMinClusterScore, - {HALF_LIFE} AS halfLife, - TIMESTAMP("{NO_OLDER_TWEETS_THAN_DATE}") AS noOlderTweetsThanDate -), - --- Get raw fav events -raw_favs AS ( - SELECT event.favorite.user_id AS userId, event.favorite.tweet_id AS tweetId, event.favorite.event_time_ms AS tsMillis, 1 AS favOrUnfav - FROM `twttr-bql-timeline-prod.timeline_service_favorites.timeline_service_favorites`, vars - WHERE (DATE(_PARTITIONTIME) = DATE(vars.startTime) OR DATE(_PARTITIONTIME) = DATE(vars.endTime)) AND - TIMESTAMP_MILLIS(event.favorite.event_time_ms) >= vars.startTime - AND TIMESTAMP_MILLIS(event.favorite.event_time_ms) <= vars.endTime - AND event.favorite IS NOT NULL -), - --- Get raw unfav events -raw_unfavs AS ( - SELECT event.unfavorite.user_id AS userId, event.unfavorite.tweet_id AS tweetId, event.unfavorite.event_time_ms AS tsMillis, -1 AS favOrUnfav - FROM `twttr-bql-timeline-prod.timeline_service_favorites.timeline_service_favorites`, vars - WHERE (DATE(_PARTITIONTIME) = DATE(vars.startTime) OR DATE(_PARTITIONTIME) = DATE(vars.endTime)) AND - -- Filter on the unfavorite timestamp: unfav events carry no favorite payload - TIMESTAMP_MILLIS(event.unfavorite.event_time_ms) >= vars.startTime - AND TIMESTAMP_MILLIS(event.unfavorite.event_time_ms) <= vars.endTime - AND event.unfavorite IS NOT NULL -), - --- Union fav and unfav events -favs_unioned AS ( - SELECT * FROM raw_favs - UNION ALL - SELECT * FROM raw_unfavs -), - --- Group by user and tweetId -user_tweet_fav_pairs AS ( - SELECT userId, tweetId, ARRAY_AGG(STRUCT(favOrUnfav, tsMillis) ORDER BY tsMillis DESC LIMIT 1) as details, count(*) as cnt - FROM favs_unioned - GROUP BY userId, tweetId -), - --- Remove unfav events -tweet_raw_favs_table AS ( - SELECT userId, tweetId, CAST(dt.tsMillis AS FLOAT64) AS tsMillis - FROM user_tweet_fav_pairs CROSS JOIN UNNEST(details) as dt - WHERE cnt < 3 AND dt.favOrUnfav = 1 -- cnt < 3 to remove crazy fav/unfav users -), - --- Get tweetIds that are eligible for tweet embeddings -tweet_favs_table AS ( - SELECT userId, tweet_raw_favs_table.tweetId, tsMillis - FROM tweet_raw_favs_table, vars - JOIN ( - SELECT tweetId, COUNT(DISTINCT(userId)) AS favCount - FROM 
tweet_raw_favs_table - GROUP BY tweetId - HAVING favCount >= 8 --we only generate tweet embeddings for tweets with >= 8 favs - ) eligible_tweets USING(tweetId) - -- Apply tweet age filter here - WHERE timestamp_millis((1288834974657 + ((tweet_raw_favs_table.tweetId & 9223372036850581504) >> 22))) >= vars.noOlderTweetsThanDate -), - --- Read consumer embeddings -consumer_embeddings AS ( - {CONSUMER_EMBEDDINGS_SQL} -), - --- Update tweet cluster scores based on fav events -tweet_cluster_scores AS ( - SELECT tweetId, - STRUCT( - clusterId, - CASE vars.halfLife - -- halfLife = -1 means there is no half life/decay and we directly take the sum as the score - WHEN -1 THEN SUM(clusterNormalizedLogFavScore) - ELSE SUM(clusterNormalizedLogFavScore * POW(0.5, (currentTs - tsMillis) / vars.halfLife)) - END AS clusterNormalizedLogFavScore, - COUNT(*) AS favCount) - AS clusterIdToScores - FROM tweet_favs_table, vars - JOIN consumer_embeddings USING(userId) - GROUP BY tweetId, clusterId, vars.halfLife -), - --- Generate tweet embeddings -tweet_embeddings_with_top_clusters AS ( - SELECT tweetId, ARRAY_AGG( - clusterIdToScores - ORDER BY clusterIdToScores.clusterNormalizedLogFavScore DESC - LIMIT {TWEET_EMBEDDING_LENGTH} - ) AS clusterIdToScores - FROM tweet_cluster_scores - GROUP BY tweetId -) - --- Return (tweetId, clusterId, tweetScore) pairs where tweetScore > tweetEmbeddingsMinClusterScore -SELECT tweetId, - clusterId, - clusterNormalizedLogFavScore AS tweetScore, clusterIdToScores -FROM tweet_embeddings_with_top_clusters, UNNEST(clusterIdToScores) AS clusterIdToScores, vars -WHERE clusterIdToScores.clusterNormalizedLogFavScore > vars.tweetEmbeddingsMinClusterScore diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/tweet_fav_count.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/tweet_fav_count.docx new file mode 100644 index 000000000..5c3529302 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/tweet_fav_count.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/tweet_fav_count.sql b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/tweet_fav_count.sql deleted file mode 100644 index 63b085937..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/tweet_fav_count.sql +++ /dev/null @@ -1,38 +0,0 @@ --- Calculate the fav counts for tweets within a given timeframe -with vars as ( - SELECT TIMESTAMP("{START_TIME}") AS start_date, - TIMESTAMP("{END_TIME}") AS end_date -), - -favs_unioned AS ( - SELECT - userIdentifier.userId AS userId, - item.tweetInfo.actionTweetId AS tweetId, - eventMetadata.sourceTimestampMs AS tsMillis, - CASE - WHEN actionType = "ServerTweetFav" THEN 1 - WHEN actionType = "ServerTweetUnfav" THEN -1 - END AS favOrUnfav - FROM `twttr-bql-unified-prod.unified_user_actions_engagements.streaming_unified_user_actions_engagements`, vars - WHERE (DATE(dateHour) >= DATE(vars.start_date) AND DATE(dateHour) <= DATE(vars.end_date)) - AND eventMetadata.sourceTimestampMs >= UNIX_MILLIS(vars.start_date) - AND eventMetadata.sourceTimestampMs <= UNIX_MILLIS(vars.end_date) - AND userIdentifier.userId IS NOT NULL - AND (actionType = "ServerTweetFav" OR actionType = "ServerTweetUnfav") -), - -user_tweet_fav_pairs AS ( - SELECT userId, tweetId, ARRAY_AGG(STRUCT(favOrUnfav, tsMillis) ORDER BY tsMillis DESC LIMIT 1) as details, count(*) as cnt - FROM favs_unioned - GROUP BY userId, tweetId -), - -tweet_raw_favs_table AS ( - SELECT userId, tweetId, CAST(dt.tsMillis AS 
FLOAT64) AS tsMillis - FROM user_tweet_fav_pairs CROSS JOIN UNNEST(details) as dt - WHERE cnt < 3 AND dt.favOrUnfav = 1 -) - -SELECT tweetId, COUNT(DISTINCT(userId)) AS favCount -FROM tweet_raw_favs_table -GROUP BY tweetId diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/tweets_ann.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/tweets_ann.docx new file mode 100644 index 000000000..fc452a92d Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/tweets_ann.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/tweets_ann.sql b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/tweets_ann.sql deleted file mode 100644 index f9eb10d2b..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/tweets_ann.sql +++ /dev/null @@ -1,64 +0,0 @@ --- (step 1) Read consumer embeddings -WITH consumer_embeddings AS ( - {CONSUMER_EMBEDDINGS_SQL} -), --- (step 1) Read tweet embeddings -tweet_embeddings AS ( - {TWEET_EMBEDDINGS_SQL} -), --- (step 1) Compute tweet embeddings norms (we will use this to compute cosine sims later) -tweet_embeddings_norm AS ( - SELECT tweetId, SUM(tweetScore * tweetScore) AS norm - FROM tweet_embeddings - GROUP BY tweetId - HAVING norm > 0.0 -), --- (step 2) Get top N clusters for each consumer embedding. N = 25 in prod -consumer_embeddings_top_n_clusters AS ( - SELECT userId, ARRAY_AGG(STRUCT(clusterId, userScore) ORDER BY userScore DESC LIMIT {TOP_N_CLUSTER_PER_SOURCE_EMBEDDING}) AS topClustersWithScores - FROM consumer_embeddings - GROUP BY userId -), --- (step 2) Get top M tweets for each cluster id. M = 100 in prod -clusters_top_m_tweets AS ( - SELECT clusterId, ARRAY_AGG(STRUCT(tweetId, tweetScore) ORDER BY tweetScore DESC LIMIT {TOP_M_TWEETS_PER_CLUSTER}) AS tweets - FROM tweet_embeddings - GROUP BY clusterId -), --- (step 3) Join the results, get top M * N tweets for each user -user_top_mn_tweets AS ( - SELECT userId, consumer_embedding_cluster_score_pairs.userScore AS userScore, clusters_top_m_tweets.clusterId AS clusterId, clusters_top_m_tweets.tweets AS tweets - FROM ( - SELECT userId, clusterId, userScore - FROM consumer_embeddings_top_n_clusters, UNNEST(topClustersWithScores) - ) AS consumer_embedding_cluster_score_pairs - JOIN clusters_top_m_tweets ON consumer_embedding_cluster_score_pairs.clusterId = clusters_top_m_tweets.clusterId -), --- (step 4) Compute the dot product between each user and tweet embedding pair -user_tweet_embedding_dot_product AS ( - SELECT userId, - tweetId, - SUM(userScore * tweetScore) AS dotProductScore - FROM user_top_mn_tweets, UNNEST(tweets) AS tweets - GROUP BY userId, tweetId -), --- (step 5) Compute similarity scores: dot product, cosine sim, log-cosine sim -user_tweet_embedding_similarity_scores AS ( - SELECT userId, - user_tweet_embedding_dot_product.tweetId AS tweetId, - dotProductScore, - SAFE_DIVIDE(dotProductScore, SQRT(tweet_embeddings_norm.norm)) AS cosineSimilarityScore, - SAFE_DIVIDE(dotProductScore, LN(1+tweet_embeddings_norm.norm)) AS logCosineSimilarityScore, - FROM user_tweet_embedding_dot_product - JOIN tweet_embeddings_norm ON user_tweet_embedding_dot_product.tweetId = tweet_embeddings_norm.tweetId -), --- (step 6) Get final top K tweets per user. 
K = 150 in prod -results AS ( - SELECT userId, ARRAY_AGG(STRUCT(tweetId, dotProductScore, cosineSimilarityScore, logCosineSimilarityScore) - ORDER BY logCosineSimilarityScore DESC LIMIT {TOP_K_TWEETS_PER_USER_REQUEST}) AS tweets - FROM user_tweet_embedding_similarity_scores - GROUP BY userId -) - -SELECT * -FROM results diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/unified_user_tweet_action_pair_generation.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/unified_user_tweet_action_pair_generation.docx new file mode 100644 index 000000000..3b0e82abb Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/unified_user_tweet_action_pair_generation.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/unified_user_tweet_action_pair_generation.sql b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/unified_user_tweet_action_pair_generation.sql deleted file mode 100644 index ad2e1d7bd..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/unified_user_tweet_action_pair_generation.sql +++ /dev/null @@ -1,45 +0,0 @@ -WITH - vars AS ( - SELECT - TIMESTAMP("{START_TIME}") AS start_date, - TIMESTAMP("{END_TIME}") AS end_date, - TIMESTAMP("{NO_OLDER_TWEETS_THAN_DATE}") AS no_older_tweets_than_date - ), - - -- Get raw user-tweet interaction events from UUA - interactions_unioned AS ( - SELECT - userIdentifier.userId AS userId, - eventMetadata.sourceTimestampMs AS tsMillis, - CASE - WHEN actionType IN ({CONTRIBUTING_ACTION_TYPES_STR}) THEN {CONTRIBUTING_ACTION_TWEET_ID_COLUMN} - WHEN actionType IN ({UNDO_ACTION_TYPES_STR}) THEN {UNDO_ACTION_TWEET_ID_COLUMN} - END AS tweetId, - CASE - WHEN actionType IN ({CONTRIBUTING_ACTION_TYPES_STR}) THEN 1 - WHEN actionType IN ({UNDO_ACTION_TYPES_STR}) THEN -1 - END AS doOrUndo - FROM `twttr-bql-unified-prod.unified_user_actions_engagements.streaming_unified_user_actions_engagements`, vars - WHERE (DATE(dateHour) >= DATE(vars.start_date) AND DATE(dateHour) <= DATE(vars.end_date)) - AND eventMetadata.sourceTimestampMs >= UNIX_MILLIS(vars.start_date) - AND eventMetadata.sourceTimestampMs <= UNIX_MILLIS(vars.end_date) - AND (actionType IN ({CONTRIBUTING_ACTION_TYPES_STR}) - OR actionType IN ({UNDO_ACTION_TYPES_STR})) - ), - - -- Group by userId and tweetId - user_tweet_interaction_pairs AS ( - SELECT userId, tweetId, ARRAY_AGG(STRUCT(doOrUndo, tsMillis) ORDER BY tsMillis DESC LIMIT 1) AS details, COUNT(*) AS cnt - FROM interactions_unioned - GROUP BY userId, tweetId - ) - --- Remove undo events --- Apply age filter in this step -SELECT userId, tweetId, CAST(dt.tsMillis AS FLOAT64) AS tsMillis -FROM user_tweet_interaction_pairs, vars -CROSS JOIN UNNEST(details) AS dt -WHERE cnt < 3 - AND dt.doOrUndo = 1 - AND timestamp_millis((1288834974657 + - ((tweetId & 9223372036850581504) >> 22))) >= vars.no_older_tweets_than_date
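Several of the queries above (the age filters in the unified, combined, evergreen, and tweet-embedding SQL, and the NSFW denylist) recover a tweet's creation time directly from its ID: tweet IDs are Snowflake IDs, so bits 22 and up hold milliseconds since the Twitter epoch, 1288834974657. A minimal Scala sketch of the same filter, with hypothetical names (9223372036850581504 is 0x7FFFFFFFFFC00000, the mask selecting those timestamp bits):

```scala
import java.time.Instant

// Sketch of the Snowflake-ID age filter the SQL above applies, e.g.
// timestamp_millis(1288834974657 + ((tweetId & 9223372036850581504) >> 22))
//   >= vars.no_older_tweets_than_date
object SnowflakeAge {
  private val TwitterEpochMillis = 1288834974657L
  private val TimestampMask = 9223372036850581504L // 0x7FFFFFFFFFC00000

  // Decode the creation time embedded in a tweet ID.
  def creationTime(tweetId: Long): Instant =
    Instant.ofEpochMilli(TwitterEpochMillis + ((tweetId & TimestampMask) >> 22))

  // Equivalent of the SQL predicate: keep tweets no older than the cutoff.
  def isRecentEnough(tweetId: Long, noOlderThan: Instant): Boolean =
    !creationTime(tweetId).isBefore(noOlderThan)
}
```

Filtering on the ID this way avoids a join against a tweet-creation table: the candidate row already carries everything needed to enforce the max-tweet-age configured in the jobs above.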