[docx] split commit for file 5000

Signed-off-by: Ari Archer <ari.web.xyz@gmail.com>
Ari Archer 2024-01-23 19:17:38 +02:00
parent c4b4b821a3
commit 2f5f511bb8
No known key found for this signature in database
GPG Key ID: A50D5B4B599AF8A2
394 changed files with 0 additions and 37240 deletions

View File

@ -1,32 +0,0 @@
package com.twitter.simclusters_v2.common
import com.twitter.simclusters_v2.common.SimClustersMultiEmbeddingId._
import com.twitter.simclusters_v2.thriftscala.SimClustersMultiEmbedding.{Ids, Values}
import com.twitter.simclusters_v2.thriftscala.{
SimClustersMultiEmbedding,
SimClustersEmbeddingId,
SimClustersMultiEmbeddingId
}
/**
* Helper methods for SimClustersMultiEmbedding
*/
object SimClustersMultiEmbedding {
// Convert a multiEmbedding to a list of (embeddingId, score)
def toSimClustersEmbeddingIdWithScores(
simClustersMultiEmbeddingId: SimClustersMultiEmbeddingId,
simClustersMultiEmbedding: SimClustersMultiEmbedding
): Seq[(SimClustersEmbeddingId, Double)] = {
simClustersMultiEmbedding match {
case Values(values) =>
values.embeddings.zipWithIndex.map {
case (embeddingWithScore, i) =>
(toEmbeddingId(simClustersMultiEmbeddingId, i), embeddingWithScore.score)
}
case Ids(ids) =>
ids.ids.map(_.toTuple)
}
}
}

View File

@ -1,96 +0,0 @@
package com.twitter.simclusters_v2.common
import com.twitter.simclusters_v2.thriftscala.{
EmbeddingType,
InternalId,
MultiEmbeddingType,
TopicId,
TopicSubId,
SimClustersEmbeddingId => ThriftEmbeddingId,
SimClustersMultiEmbeddingId => ThriftMultiEmbeddingId
}
/**
* Helper methods for SimClustersMultiEmbeddingId
*/
object SimClustersMultiEmbeddingId {
private val MultiEmbeddingTypeToEmbeddingType: Map[MultiEmbeddingType, EmbeddingType] =
Map(
MultiEmbeddingType.LogFavApeBasedMuseTopic -> EmbeddingType.LogFavApeBasedMuseTopic,
MultiEmbeddingType.TwiceUserInterestedIn -> EmbeddingType.TwiceUserInterestedIn,
)
private val EmbeddingTypeToMultiEmbeddingType: Map[EmbeddingType, MultiEmbeddingType] =
MultiEmbeddingTypeToEmbeddingType.map(_.swap)
def toEmbeddingType(multiEmbeddingType: MultiEmbeddingType): EmbeddingType = {
MultiEmbeddingTypeToEmbeddingType.getOrElse(
multiEmbeddingType,
throw new IllegalArgumentException(s"Invalid type: $multiEmbeddingType"))
}
def toMultiEmbeddingType(embeddingType: EmbeddingType): MultiEmbeddingType = {
EmbeddingTypeToMultiEmbeddingType.getOrElse(
embeddingType,
throw new IllegalArgumentException(s"Invalid type: $embeddingType")
)
}
/**
* Convert a SimClusters Multi-Embedding Id and SubId to SimClusters Embedding Id.
*/
def toEmbeddingId(
simClustersMultiEmbeddingId: ThriftMultiEmbeddingId,
subId: Int
): ThriftEmbeddingId = {
val internalId = simClustersMultiEmbeddingId.internalId match {
case InternalId.TopicId(topicId) =>
InternalId.TopicSubId(
TopicSubId(topicId.entityId, topicId.language, topicId.country, subId))
case _ =>
throw new IllegalArgumentException(
s"Invalid simClusters InternalId ${simClustersMultiEmbeddingId.internalId}")
}
ThriftEmbeddingId(
toEmbeddingType(simClustersMultiEmbeddingId.embeddingType),
simClustersMultiEmbeddingId.modelVersion,
internalId
)
}
/**
* Fetch a subId from a SimClusters EmbeddingId.
*/
def toSubId(simClustersEmbeddingId: ThriftEmbeddingId): Int = {
simClustersEmbeddingId.internalId match {
case InternalId.TopicSubId(topicSubId) =>
topicSubId.subId
case _ =>
throw new IllegalArgumentException(
s"Invalid SimClustersEmbeddingId InternalId type, $simClustersEmbeddingId")
}
}
/**
* Convert a SimClustersEmbeddingId to SimClustersMultiEmbeddingId.
* Only supports the multi-embedding-based EmbeddingTypes.
*/
def toMultiEmbeddingId(
simClustersEmbeddingId: ThriftEmbeddingId
): ThriftMultiEmbeddingId = {
simClustersEmbeddingId.internalId match {
case InternalId.TopicSubId(topicSubId) =>
ThriftMultiEmbeddingId(
toMultiEmbeddingType(simClustersEmbeddingId.embeddingType),
simClustersEmbeddingId.modelVersion,
InternalId.TopicId(TopicId(topicSubId.entityId, topicSubId.language, topicSubId.country))
)
case _ =>
throw new IllegalArgumentException(
s"Invalid SimClustersEmbeddingId InternalId type, $simClustersEmbeddingId")
}
}
}
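
A minimal round-trip sketch (not part of the original diff; the topic id values and the optional language/country field shapes are assumptions) showing how the helpers above move between a topic multi-embedding id and its per-sub-embedding ids:

import com.twitter.simclusters_v2.common.SimClustersMultiEmbeddingId._
import com.twitter.simclusters_v2.thriftscala._

// A topic multi-embedding id with hypothetical values.
val multiId = SimClustersMultiEmbeddingId(
  MultiEmbeddingType.LogFavApeBasedMuseTopic,
  ModelVersion.Model20m145k2020,
  InternalId.TopicId(TopicId(entityId = 123L, language = Some("en"), country = None))
)
// Sub-embedding #2 gets its own SimClustersEmbeddingId with a TopicSubId internal id.
val embeddingId = toEmbeddingId(multiId, subId = 2)
assert(toSubId(embeddingId) == 2)
// Converting back recovers the original multi-embedding id.
assert(toMultiEmbeddingId(embeddingId) == multiId)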

View File

@ -1,11 +0,0 @@
scala_library(
compiler_option_sets = ["fatal_warnings"],
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
"eventdetection/common/src/main/java/com/twitter/eventdetection/common/louvain",
"eventdetection/common/src/main/java/com/twitter/eventdetection/common/model",
"src/java/com/twitter/sbf/graph",
"src/scala/com/twitter/simclusters_v2/scalding/common",
],
)

View File

@ -1,30 +0,0 @@
package com.twitter.simclusters_v2.common.clustering
import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.thriftscala.NeighborWithWeights
/**
* Select a cluster member as cluster representative.
*/
trait ClusterRepresentativeSelectionMethod[T] {
/**
* The main external-facing method. Sub-classes should implement this method.
*
* @param cluster A set of NeighborWithWeights.
* @param embeddings A map of producer ID -> embedding.
*
* @return UserId of the member chosen as representative.
*/
def selectClusterRepresentative(
cluster: Set[NeighborWithWeights],
embeddings: Map[UserId, T]
): UserId
}
object ClusterRepresentativeSelectionStatistics {
// Statistics, to be imported where recorded.
val StatClusterRepresentativeSelectionTime = "cluster_representative_selection_total_time_ms"
}
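
An illustrative-only implementation of this trait (not in the original commit): a selector that ignores the embeddings and simply returns the smallest member id, just to show the contract. The real selectors appear in later files of this commit.

import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.thriftscala.NeighborWithWeights

class SmallestIdRepresentativeSelectionMethod[T] extends ClusterRepresentativeSelectionMethod[T] {
  def selectClusterRepresentative(
    cluster: Set[NeighborWithWeights],
    embeddings: Map[UserId, T]
  ): UserId =
    cluster.map(_.neighborId).min // neighborId is the member's user id; assumes a non-empty cluster
}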

View File

@ -1,34 +0,0 @@
package com.twitter.simclusters_v2.common.clustering
/**
* Partitions a set of entities into clusters.
* NOTE: The selection/construction of the cluster representatives (e.g. medoid, random, average) is implemented in ClusterRepresentativeSelectionMethod.scala
*/
trait ClusteringMethod {
/**
* The main external-facing method. Sub-classes should implement this method.
*
* @param embeddings map of entity IDs and corresponding embeddings
* @param similarityFn function that outputs a similarity score (>= 0; the larger, the more similar), given two embeddings
* @tparam T embedding type. e.g. SimClustersEmbedding
*
* @return A set of sets of entity IDs, each set representing a distinct cluster.
*/
def cluster[T](
embeddings: Map[Long, T],
similarityFn: (T, T) => Double,
recordStatCallback: (String, Long) => Unit = (_, _) => ()
): Set[Set[Long]]
}
object ClusteringStatistics {
// Statistics, to be imported where recorded.
val StatSimilarityGraphTotalBuildTime = "similarity_graph_total_build_time_ms"
val StatClusteringAlgorithmRunTime = "clustering_algorithm_total_run_time_ms"
val StatMedoidSelectionTime = "medoid_selection_total_time_ms"
val StatComputedSimilarityBeforeFilter = "computed_similarity_before_filter"
}
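
An illustrative-only implementation of the contract (not part of the original commit): every entity lands in a single cluster and the similarity function is ignored. The production methods follow in the next files.

class SingleClusterClusteringMethod extends ClusteringMethod {
  def cluster[T](
    embeddings: Map[Long, T],
    similarityFn: (T, T) => Double,
    recordStatCallback: (String, Long) => Unit = (_, _) => ()
  ): Set[Set[Long]] =
    if (embeddings.isEmpty) Set.empty else Set(embeddings.keySet)
}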

View File

@ -1,67 +0,0 @@
package com.twitter.simclusters_v2.common.clustering
import com.twitter.sbf.graph.ConnectedComponents
import com.twitter.sbf.graph.Graph
import com.twitter.util.Stopwatch
import it.unimi.dsi.fastutil.ints.IntSet
import scala.collection.SortedMap
import scala.jdk.CollectionConverters._
/**
* Aggregate entities into clusters such that a cluster contains all embeddings with a similarity
* above a configurable threshold to any other embedding.
*
* @param similarityThreshold: When building the edges between entities, edges with weight
* less than or equal to this threshold will be filtered out.
*/
class ConnectedComponentsClusteringMethod(
similarityThreshold: Double)
extends ClusteringMethod {
import ClusteringStatistics._
def cluster[T](
embeddings: Map[Long, T],
similarityFn: (T, T) => Double,
recordStatCallback: (String, Long) => Unit = (_, _) => ()
): Set[Set[Long]] = {
val timeSinceGraphBuildStart = Stopwatch.start()
// com.twitter.sbf.graph.Graph expects neighbors to be sorted in ascending order.
val sourcesById = SortedMap(embeddings.zipWithIndex.map {
case (source, idx) => idx -> source
}.toSeq: _*)
val neighbours = sourcesById.map {
case (srcIdx, (_, src)) =>
sourcesById
.collect {
case (dstIdx, (_, dst)) if srcIdx != dstIdx => // avoid self-edges
val similarity = similarityFn(src, dst)
recordStatCallback(
StatComputedSimilarityBeforeFilter,
(similarity * 100).toLong // preserve up to two decimal points
)
if (similarity > similarityThreshold)
Some(dstIdx)
else None
}.flatten.toArray
}.toArray
recordStatCallback(StatSimilarityGraphTotalBuildTime, timeSinceGraphBuildStart().inMilliseconds)
val timeSinceClusteringAlgRunStart = Stopwatch.start()
val nEdges = neighbours.map(_.length).sum / 2 // Graph expects count of undirected edges
val graph = new Graph(sourcesById.size, nEdges, neighbours)
val clusters = ConnectedComponents
.connectedComponents(graph).asScala.toSet
.map { i: IntSet => i.asScala.map(sourcesById(_)._1).toSet }
recordStatCallback(
StatClusteringAlgorithmRunTime,
timeSinceClusteringAlgRunStart().inMilliseconds)
clusters
}
}
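
A small usage sketch (not from the original diff; the toy vectors, the cosine helper and the threshold are illustrative): the two nearby vectors end up in one connected component, the third stays alone.

val cosine: (Seq[Double], Seq[Double]) => Double = (a, b) => {
  val dot = a.zip(b).map { case (x, y) => x * y }.sum
  val norm = math.sqrt(a.map(x => x * x).sum) * math.sqrt(b.map(x => x * x).sum)
  if (norm == 0.0) 0.0 else dot / norm
}
val embeddings: Map[Long, Seq[Double]] = Map(
  1L -> Seq(1.0, 0.0),
  2L -> Seq(0.9, 0.1),
  3L -> Seq(0.0, 1.0)
)
val method: ClusteringMethod = new ConnectedComponentsClusteringMethod(similarityThreshold = 0.5)
val clusters = method.cluster(embeddings, cosine)
// clusters == Set(Set(1L, 2L), Set(3L)): cosine(1, 2) is about 0.99 (> 0.5),
// while both edges to entity 3 fall below the threshold.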

View File

@ -1,33 +0,0 @@
package com.twitter.simclusters_v2.common.clustering
/**
* Groups entities by a single embedding dimension with the largest score.
*/
class LargestDimensionClusteringMethod extends ClusteringMethod {
/**
* @param embeddings map of entity IDs and corresponding embeddings
* @param similarityFn function that outputs discrete value (0.0 or 1.0).
* 1.0 if the dimensions of the highest score (weight) from two given embeddings match.
* 0.0 otherwise.
* e.g.
* case 1: E1=[0.0, 0.1, 0.6, 0.2], E2=[0.1, 0.3, 0.8, 0.0]. similarityFn(E1, E2)=1.0
* case 2: E1=[0.0, 0.1, 0.6, 0.2], E2=[0.1, 0.4, 0.2, 0.0]. similarityFn(E1, E2)=0.0
* @tparam T embedding type. e.g. SimClustersEmbedding
*
* @return A set of sets of entity IDs, each set representing a distinct cluster.
*/
override def cluster[T](
embeddings: Map[Long, T],
similarityFn: (T, T) => Double,
recordStatCallback: (String, Long) => Unit
): Set[Set[Long]] = {
// rely on clustering by connected component.
// similarityThreshold=0.1 because it's larger than 0.0 (similarityFn returns 0.0 if two embeddings
// don't share the largest dimension).
new ConnectedComponentsClusteringMethod(similarityThreshold = 0.1)
.cluster(embeddings, similarityFn, recordStatCallback)
}
}
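
A pairing sketch (assumed usage, not spelled out in this commit): this method is intended for a 0/1 similarity such as SimilarityFunctions.simClustersMatchingLargestDimension from a later file in this commit, so members cluster together exactly when their top SimClusters dimension matches.

import com.twitter.simclusters_v2.common.SimClustersEmbedding

def clusterByTopDimension(
  producerEmbeddings: Map[Long, SimClustersEmbedding]
): Set[Set[Long]] = {
  // Call through the trait so the default (no-op) recordStatCallback declared there applies.
  val method: ClusteringMethod = new LargestDimensionClusteringMethod
  method.cluster(producerEmbeddings, SimilarityFunctions.simClustersMatchingLargestDimension)
}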

View File

@ -1,236 +0,0 @@
package com.twitter.simclusters_v2.common.clustering
import com.twitter.eventdetection.common.louvain.LouvainDriver
import com.twitter.eventdetection.common.louvain.NetworkFactory
import com.twitter.eventdetection.common.model.Entity
import com.twitter.eventdetection.common.model.NetworkInput
import com.twitter.eventdetection.common.model.TextEntityValue
import com.twitter.util.Stopwatch
import scala.collection.JavaConverters._
import scala.math.max
/**
* Groups entities by the Louvain clustering method.
* @param similarityThreshold: When building the edges between entities, edges with weight
* less than or equal to this threshold will be filtered out.
* @param appliedResolutionFactor: If present, will be used to multiply the applied resolution
* parameter of the Louvain method by this factor.
* Note that the DEFAULT_MAX_RESOLUTION will not be applied.
*/
class LouvainClusteringMethod(
similarityThreshold: Double,
appliedResolutionFactor: Option[Double])
extends ClusteringMethod {
import ClusteringStatistics._
def cluster[T](
embeddings: Map[Long, T],
similarityFn: (T, T) => Double,
recordStatCallback: (String, Long) => Unit = (_, _) => ()
): Set[Set[Long]] = {
// 1. Build the graph on which to run Louvain:
// - Weigh edges by the similarity between the 2 embeddings,
// - Filter out edges with weight <= threshold.
val timeSinceGraphBuildStart = Stopwatch.start()
val edges: Seq[((Long, Long), Double)] = embeddings.toSeq
.combinations(2)
.map { pair: Seq[(Long, T)] => // pair of 2
val (user1, embedding1) = pair.head
val (user2, embedding2) = pair(1)
val similarity = similarityFn(embedding1, embedding2)
recordStatCallback(
StatComputedSimilarityBeforeFilter,
(similarity * 100).toLong // preserve up to two decimal places
)
((user1, user2), similarity)
}
.filter(_._2 > similarityThreshold)
.toSeq
recordStatCallback(StatSimilarityGraphTotalBuildTime, timeSinceGraphBuildStart().inMilliseconds)
// Check whether some entities have no incoming / outgoing edge at all;
// each of these forms its own size-1 cluster.
val individualClusters: Set[Long] = embeddings.keySet -- edges.flatMap {
case ((user1, user2), _) => Set(user1, user2)
}.toSet
// 2. LouvainDriver uses "Entity" as input, so build 2 mappings
// - Long (entity id) -> Entity
// - Entity -> Long (entity id)
val embeddingIdToEntity: Map[Long, Entity] = embeddings.map {
case (id, _) => id -> Entity(TextEntityValue(id.toString, Some(id.toString)), None)
}
val entityToEmbeddingId: Map[Entity, Long] = embeddingIdToEntity.map {
case (id, e) => e -> id
}
// 3. Create the list of NetworkInput on which to run LouvainDriver
val networkInputList = edges
.map {
case ((fromUserId: Long, toUserId: Long), weight: Double) =>
new NetworkInput(embeddingIdToEntity(fromUserId), embeddingIdToEntity(toUserId), weight)
}.toList.asJava
val timeSinceClusteringAlgRunStart = Stopwatch.start()
val networkDictionary = NetworkFactory.buildDictionary(networkInputList)
val network = NetworkFactory.buildNetwork(networkInputList, networkDictionary)
if (networkInputList.size() == 0) {
// handle case if no edge at all (only one entity or all entities are too far apart)
embeddings.keySet.map(e => Set(e))
} else {
// 4. Run clustering algorithm
val clusteredIds = appliedResolutionFactor match {
case Some(res) =>
LouvainDriver.clusterAppliedResolutionFactor(network, networkDictionary, res)
case None => LouvainDriver.cluster(network, networkDictionary)
}
recordStatCallback(
StatClusteringAlgorithmRunTime,
timeSinceClusteringAlgRunStart().inMilliseconds)
// 5. Post-processing
val atLeast2MembersClusters: Set[Set[Long]] = clusteredIds.asScala
.groupBy(_._2)
.mapValues(_.map { case (e, _) => entityToEmbeddingId(e) }.toSet)
.values.toSet
atLeast2MembersClusters ++ individualClusters.map { e => Set(e) }
}
}
def clusterWithSilhouette[T](
embeddings: Map[Long, T],
similarityFn: (T, T) => Double,
similarityFnForSil: (T, T) => Double,
recordStatCallback: (String, Long) => Unit = (_, _) => ()
): (Set[Set[Long]], Set[Set[(Long, Double)]]) = {
// 1. Build the graph on which to run Louvain:
// - Weigh edges by the similarity between the 2 embeddings,
// - Filter out edges with weight <= threshold.
val timeSinceGraphBuildStart = Stopwatch.start()
val edgesSimilarityMap = collection.mutable.Map[(Long, Long), Double]()
val edges: Seq[((Long, Long), Double)] = embeddings.toSeq
.combinations(2)
.map { pair: Seq[(Long, T)] => // pair of 2
val (user1, embedding1) = pair.head
val (user2, embedding2) = pair(1)
val similarity = similarityFn(embedding1, embedding2)
val similarityForSil = similarityFnForSil(embedding1, embedding2)
edgesSimilarityMap.put((user1, user2), similarityForSil)
edgesSimilarityMap.put((user2, user1), similarityForSil)
recordStatCallback(
StatComputedSimilarityBeforeFilter,
(similarity * 100).toLong // preserve up to two decimal places
)
((user1, user2), similarity)
}
.filter(_._2 > similarityThreshold)
.toSeq
recordStatCallback(StatSimilarityGraphTotalBuildTime, timeSinceGraphBuildStart().inMilliseconds)
// Check whether some entities have no incoming / outgoing edge at all;
// each of these forms its own size-1 cluster.
val individualClusters: Set[Long] = embeddings.keySet -- edges.flatMap {
case ((user1, user2), _) => Set(user1, user2)
}.toSet
// 2. LouvainDriver uses "Entity" as input, so build 2 mappings
// - Long (entity id) -> Entity
// - Entity -> Long (entity id)
val embeddingIdToEntity: Map[Long, Entity] = embeddings.map {
case (id, _) => id -> Entity(TextEntityValue(id.toString, Some(id.toString)), None)
}
val entityToEmbeddingId: Map[Entity, Long] = embeddingIdToEntity.map {
case (id, e) => e -> id
}
// 3. Create the list of NetworkInput on which to run LouvainDriver
val networkInputList = edges
.map {
case ((fromUserId: Long, toUserId: Long), weight: Double) =>
new NetworkInput(embeddingIdToEntity(fromUserId), embeddingIdToEntity(toUserId), weight)
}.toList.asJava
val timeSinceClusteringAlgRunStart = Stopwatch.start()
val networkDictionary = NetworkFactory.buildDictionary(networkInputList)
val network = NetworkFactory.buildNetwork(networkInputList, networkDictionary)
val clusters = if (networkInputList.size() == 0) {
// handle case if no edge at all (only one entity or all entities are too far apart)
embeddings.keySet.map(e => Set(e))
} else {
// 4. Run clustering algorithm
val clusteredIds = appliedResolutionFactor match {
case Some(res) =>
LouvainDriver.clusterAppliedResolutionFactor(network, networkDictionary, res)
case None => LouvainDriver.cluster(network, networkDictionary)
}
recordStatCallback(
StatClusteringAlgorithmRunTime,
timeSinceClusteringAlgRunStart().inMilliseconds)
// 5. Post-processing
val atLeast2MembersClusters: Set[Set[Long]] = clusteredIds.asScala
.groupBy(_._2)
.mapValues(_.map { case (e, _) => entityToEmbeddingId(e) }.toSet)
.values.toSet
atLeast2MembersClusters ++ individualClusters.map { e => Set(e) }
}
// Calculate silhouette metrics
val contactIdWithSilhouette = clusters.map {
case cluster =>
val otherClusters = clusters - cluster
cluster.map {
case contactId =>
if (otherClusters.isEmpty) {
(contactId, 0.0)
} else {
val otherSameClusterContacts = cluster - contactId
if (otherSameClusterContacts.isEmpty) {
(contactId, 0.0)
} else {
// calculate similarity of given userId with all other users in the same cluster
val a_i = otherSameClusterContacts.map {
case sameClusterContact =>
edgesSimilarityMap((contactId, sameClusterContact))
}.sum / otherSameClusterContacts.size
// calculate similarity of given userId to all other clusters, find the best nearest cluster
val b_i = otherClusters.map {
case otherCluster =>
otherCluster.map {
case otherClusterContact =>
edgesSimilarityMap((contactId, otherClusterContact))
}.sum / otherCluster.size
}.max
// silhouette (value) of one userId i
val s_i = (a_i - b_i) / max(a_i, b_i)
(contactId, s_i)
}
}
}
}
(clusters, contactIdWithSilhouette)
}
}
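
A usage sketch (the threshold and the choice to reuse cosine similarity for the silhouette metric are assumptions, not from this commit):

import com.twitter.simclusters_v2.common.SimClustersEmbedding

def louvainWithSilhouette(
  embeddings: Map[Long, SimClustersEmbedding]
): (Set[Set[Long]], Set[Set[(Long, Double)]]) = {
  val method = new LouvainClusteringMethod(
    similarityThreshold = 0.3,
    appliedResolutionFactor = None
  )
  // Silhouette values land in [-1, 1]; close to 1 means a member sits much nearer its own
  // cluster than the best other cluster (s_i = (a_i - b_i) / max(a_i, b_i), as computed above).
  method.clusterWithSilhouette(
    embeddings,
    SimilarityFunctions.simClustersCosineSimilarity,
    SimilarityFunctions.simClustersCosineSimilarity
  )
}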

View File

@ -1,21 +0,0 @@
package com.twitter.simclusters_v2.common.clustering
import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.thriftscala.NeighborWithWeights
class MaxFavScoreRepresentativeSelectionMethod[T] extends ClusterRepresentativeSelectionMethod[T] {
/**
* Identify the member with largest favScoreHalfLife100Days and return it.
*
* @param cluster A set of NeighborWithWeights.
* @param embeddings A map of producer ID -> embedding.
*/
def selectClusterRepresentative(
cluster: Set[NeighborWithWeights],
embeddings: Map[UserId, T],
): UserId = {
val key = cluster.maxBy { x: NeighborWithWeights => x.favScoreHalfLife100Days.getOrElse(0.0) }
key.neighborId
}
}

View File

@ -1,28 +0,0 @@
package com.twitter.simclusters_v2.common.clustering
import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.thriftscala.NeighborWithWeights
class MedoidRepresentativeSelectionMethod[T](
producerProducerSimilarityFn: (T, T) => Double)
extends ClusterRepresentativeSelectionMethod[T] {
/**
* Identify the medoid of a cluster and return it.
*
* @param cluster A set of NeighborWithWeights.
* @param embeddings A map of producer ID -> embedding.
*/
def selectClusterRepresentative(
cluster: Set[NeighborWithWeights],
embeddings: Map[UserId, T],
): UserId = {
val key = cluster.maxBy {
id1 => // maxBy because we use similarity, which gets larger as we get closer.
val v = embeddings(id1.neighborId)
cluster
.map(id2 => producerProducerSimilarityFn(v, embeddings(id2.neighborId))).sum
}
key.neighborId
}
}
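
A pairing sketch (assumed, not part of the original diff): select each cluster's medoid under cosine similarity between producer SimClusters embeddings.

import com.twitter.simclusters_v2.common.SimClustersEmbedding
import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.thriftscala.NeighborWithWeights

def medoidOf(
  cluster: Set[NeighborWithWeights],
  producerEmbeddings: Map[UserId, SimClustersEmbedding]
): UserId =
  new MedoidRepresentativeSelectionMethod[SimClustersEmbedding](
    SimilarityFunctions.simClustersCosineSimilarity)
    .selectClusterRepresentative(cluster, producerEmbeddings)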

View File

@ -1,32 +0,0 @@
package com.twitter.simclusters_v2.common.clustering
import com.twitter.simclusters_v2.common.SimClustersEmbedding
/**
* SimilarityFunctions provides commonly used similarity functions that this clustering library needs.
*/
object SimilarityFunctions {
def simClustersCosineSimilarity: (SimClustersEmbedding, SimClustersEmbedding) => Double =
(e1, e2) => e1.cosineSimilarity(e2)
def simClustersMatchingLargestDimension: (
SimClustersEmbedding,
SimClustersEmbedding
) => Double = (e1, e2) => {
val doesMatchLargestDimension: Boolean = e1
.topClusterIds(1)
.exists { id1 =>
e2.topClusterIds(1).contains(id1)
}
if (doesMatchLargestDimension) 1.0
else 0.0
}
def simClustersFuzzyJaccardSimilarity: (
SimClustersEmbedding,
SimClustersEmbedding
) => Double = (e1, e2) => {
e1.fuzzyJaccardSimilarity(e2)
}
}

View File

@ -1,12 +0,0 @@
# This package/target is separate from other simclusters common packages because the ml/api dep is
# large (350MB+). Having it as a separate target means that we can avoid bundling it with targets
# that do not need it.
scala_library(
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
"src/java/com/twitter/ml/api:api-base",
"src/scala/com/twitter/ml/api/util",
"src/scala/com/twitter/simclusters_v2/common",
],
)

View File

@ -1,39 +0,0 @@
package com.twitter.simclusters_v2.common.ml
import com.twitter.ml.api.Feature.Continuous
import com.twitter.ml.api.Feature.SparseContinuous
import com.twitter.ml.api._
import com.twitter.ml.api.util.FDsl._
import com.twitter.simclusters_v2.common.SimClustersEmbedding
class SimClustersEmbeddingAdapter(embeddingFeature: SparseContinuous)
extends IRecordOneToOneAdapter[SimClustersEmbedding] {
override def getFeatureContext: FeatureContext = new FeatureContext(embeddingFeature)
override def adaptToDataRecord(embedding: SimClustersEmbedding): DataRecord = {
val embeddingMap = embedding.embedding.map {
case (clusterId, score) =>
(clusterId.toString, score)
}.toMap
new DataRecord().setFeatureValue(embeddingFeature, embeddingMap)
}
}
class NormalizedSimClustersEmbeddingAdapter(
embeddingFeature: SparseContinuous,
normFeature: Continuous)
extends IRecordOneToOneAdapter[SimClustersEmbedding] {
override def getFeatureContext: FeatureContext = new FeatureContext(embeddingFeature, normFeature)
override def adaptToDataRecord(embedding: SimClustersEmbedding): DataRecord = {
val normalizedEmbedding = Map(
embedding.sortedClusterIds.map(_.toString).zip(embedding.normalizedSortedScores): _*)
val dataRecord = new DataRecord().setFeatureValue(embeddingFeature, normalizedEmbedding)
dataRecord.setFeatureValue(normFeature, embedding.l2norm)
}
}
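
A minimal sketch of adapting an embedding into a DataRecord (the feature name and the single-argument SparseContinuous constructor are assumptions; check the ml.api Feature definitions before relying on them):

import com.twitter.ml.api.DataRecord
import com.twitter.ml.api.Feature.SparseContinuous
import com.twitter.simclusters_v2.common.SimClustersEmbedding

val embeddingFeature = new SparseContinuous("simclusters.embedding") // hypothetical feature name
val adapter = new SimClustersEmbeddingAdapter(embeddingFeature)

def toDataRecord(embedding: SimClustersEmbedding): DataRecord =
  adapter.adaptToDataRecord(embedding) // one sparse feature: clusterId (as a string) -> score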

View File

@ -1,17 +0,0 @@
package com.twitter.simclusters_v2
package object common {
type TweetId = Long
type UserId = Long
type ClusterId = Int
type SemanticCoreEntityId = Long // Use TopicId if it's a Topic related project.
type UTTEntityId = Long
type Timestamp = Long
type Language = String
type Country = String
type LocaleEntity = (Long, Language)
type TopicId = Long
type GroupId = Long
type SpaceId = String
}

View File

@ -1,164 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources
import com.twitter.bijection.scrooge.BinaryScalaCodec
import com.twitter.bijection.scrooge.CompactScalaCodec
import com.twitter.bijection.Bufferable
import com.twitter.bijection.Injection
import com.twitter.hermit.candidate.thriftscala.Candidates
import com.twitter.scalding.DateRange
import com.twitter.scalding.commons.source.VersionedKeyValSource
import com.twitter.scalding_internal.source.lzo_scrooge.DailySuffixMostRecentLzoScrooge
import com.twitter.scalding_internal.source.lzo_scrooge.FixedPathLzoScrooge
import com.twitter.scalding_internal.source.lzo_scrooge.HourlySuffixMostRecentLzoScrooge
import com.twitter.simclusters_v2.thriftscala._
case class EdgeWithDecayedWtsFixedPathSource(path: String)
extends FixedPathLzoScrooge[EdgeWithDecayedWeights](path, EdgeWithDecayedWeights)
case class UserAndNeighborsFixedPathSource(path: String)
extends FixedPathLzoScrooge[UserAndNeighbors](path, UserAndNeighbors)
case class NormsAndCountsFixedPathSource(path: String)
extends FixedPathLzoScrooge[NormsAndCounts](path, NormsAndCounts)
case class UserToInterestedInClustersFixedPathSource(path: String)
extends FixedPathLzoScrooge[UserToInterestedInClusters](path, UserToInterestedInClusters)
case class TimelineDataExtractorFixedPathSource(path: String)
extends FixedPathLzoScrooge[ReferenceTweets](path, ReferenceTweets)
case class TweetClusterScoresHourlySuffixSource(path: String, override val dateRange: DateRange)
extends HourlySuffixMostRecentLzoScrooge[TweetAndClusterScores](path, dateRange)
case class TweetTopKClustersHourlySuffixSource(path: String, override val dateRange: DateRange)
extends HourlySuffixMostRecentLzoScrooge[TweetTopKClustersWithScores](
path,
dateRange
)
case class ClusterTopKTweetsHourlySuffixSource(path: String, override val dateRange: DateRange)
extends HourlySuffixMostRecentLzoScrooge[ClusterTopKTweetsWithScores](
path,
dateRange
)
case class TweetSimilarityUnhydratedPairsSource(path: String, override val dateRange: DateRange)
extends DailySuffixMostRecentLzoScrooge[LabelledTweetPairs](
path,
dateRange
)
case class WTFCandidatesSource(path: String)
extends FixedPathLzoScrooge[Candidates](path, Candidates)
case class EmbeddingsLiteSource(path: String)
extends FixedPathLzoScrooge[EmbeddingsLite](path, EmbeddingsLite)
object AdhocKeyValSources {
def interestedInSource(path: String): VersionedKeyValSource[Long, ClustersUserIsInterestedIn] = {
implicit val keyInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian
implicit val valInject: Injection[ClustersUserIsInterestedIn, Array[Byte]] =
CompactScalaCodec(ClustersUserIsInterestedIn)
VersionedKeyValSource[Long, ClustersUserIsInterestedIn](path)
}
def clusterDetailsSource(path: String): VersionedKeyValSource[(String, Int), ClusterDetails] = {
implicit val keyInject: Injection[(String, Int), Array[Byte]] =
Bufferable.injectionOf[(String, Int)]
implicit val valInject: Injection[ClusterDetails, Array[Byte]] =
CompactScalaCodec(ClusterDetails)
VersionedKeyValSource[(String, Int), ClusterDetails](path)
}
def bipartiteQualitySource(
path: String
): VersionedKeyValSource[(String, Int), BipartiteClusterQuality] = {
implicit val keyInject: Injection[(String, Int), Array[Byte]] =
Bufferable.injectionOf[(String, Int)]
implicit val valInject: Injection[BipartiteClusterQuality, Array[Byte]] =
CompactScalaCodec(BipartiteClusterQuality)
VersionedKeyValSource[(String, Int), BipartiteClusterQuality](path)
}
def entityToClustersSource(
path: String
): VersionedKeyValSource[SimClustersEmbeddingId, SimClustersEmbedding] = {
implicit val keyInject: Injection[SimClustersEmbeddingId, Array[Byte]] =
BinaryScalaCodec(SimClustersEmbeddingId)
implicit val valInject: Injection[SimClustersEmbedding, Array[Byte]] =
BinaryScalaCodec(SimClustersEmbedding)
VersionedKeyValSource[SimClustersEmbeddingId, SimClustersEmbedding](path)
}
def clusterToEntitiesSource(
path: String
): VersionedKeyValSource[SimClustersEmbeddingId, InternalIdEmbedding] = {
implicit val keyInject: Injection[SimClustersEmbeddingId, Array[Byte]] = BinaryScalaCodec(
SimClustersEmbeddingId)
implicit val valInject: Injection[InternalIdEmbedding, Array[Byte]] =
BinaryScalaCodec(InternalIdEmbedding)
VersionedKeyValSource[SimClustersEmbeddingId, InternalIdEmbedding](path)
}
// For storing producer-simclusters embeddings
def topProducerToClusterEmbeddingsSource(
path: String
): VersionedKeyValSource[Long, TopSimClustersWithScore] = {
implicit val keyInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian
implicit val valInject: Injection[TopSimClustersWithScore, Array[Byte]] =
CompactScalaCodec(TopSimClustersWithScore)
VersionedKeyValSource[Long, TopSimClustersWithScore](path)
}
// For storing producer-simclusters embeddings
def topClusterEmbeddingsToProducerSource(
path: String
): VersionedKeyValSource[PersistedFullClusterId, TopProducersWithScore] = {
implicit val keyInject: Injection[PersistedFullClusterId, Array[Byte]] =
CompactScalaCodec(PersistedFullClusterId)
implicit val valInject: Injection[TopProducersWithScore, Array[Byte]] =
CompactScalaCodec(TopProducersWithScore)
VersionedKeyValSource[PersistedFullClusterId, TopProducersWithScore](path)
}
def userToInferredEntitiesSource(
path: String
): VersionedKeyValSource[Long, SimClustersInferredEntities] = {
implicit val keyInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian
implicit val valInject: Injection[SimClustersInferredEntities, Array[Byte]] =
CompactScalaCodec(SimClustersInferredEntities)
VersionedKeyValSource[Long, SimClustersInferredEntities](path)
}
def knownForAdhocSource(path: String): VersionedKeyValSource[Long, ClustersUserIsKnownFor] = {
implicit val keyInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian
implicit val valInject: Injection[ClustersUserIsKnownFor, Array[Byte]] =
CompactScalaCodec(ClustersUserIsKnownFor)
VersionedKeyValSource[Long, ClustersUserIsKnownFor](path)
}
def knownForSBFResultsDevelSource(
path: String
): VersionedKeyValSource[Long, Array[(Int, Float)]] = {
implicit val keyInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian
implicit val valInject: Injection[Array[(Int, Float)], Array[Byte]] =
Bufferable.injectionOf[Array[(Int, Float)]]
VersionedKeyValSource[Long, Array[(Int, Float)]](path)
}
// injection to store the adjacency list in the mapped-indices space for users
def intermediateSBFResultsDevelSource(
path: String
): VersionedKeyValSource[Int, List[(Int, Float)]] = {
implicit val keyInject: Injection[Int, Array[Byte]] = Injection.int2BigEndian
implicit val valInject: Injection[List[(Int, Float)], Array[Byte]] =
Bufferable.injectionOf[List[(Int, Float)]]
VersionedKeyValSource[Int, List[(Int, Float)]](path)
}
def mappedIndicesDevelSource(path: String): VersionedKeyValSource[Int, Long] = {
implicit val keyInject: Injection[Int, Array[Byte]] = Injection.int2BigEndian
implicit val valInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian
VersionedKeyValSource[Int, Long](path)
}
}
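
A hypothetical adhoc write (the path is a placeholder; writeExecution is scalding's TypedPipe sink method): persist an interested-in pipe to a versioned key-val store for manual inspection.

import com.twitter.scalding.Execution
import com.twitter.scalding.typed.TypedPipe
import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn

def writeInterestedInAdhoc(
  interestedIn: TypedPipe[(Long, ClustersUserIsInterestedIn)]
): Execution[Unit] =
  interestedIn.writeExecution(
    AdhocKeyValSources.interestedInSource("/user/cassowary/adhoc/<ldap>/interested_in_debug"))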

File diff suppressed because it is too large

View File

@ -1,49 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources
object DataPaths {
val InterestedIn2020Path =
"/user/cassowary/manhattan_sequence_files/simclusters_v2_interested_in_20M_145K_2020"
val InterestedIn2020ThriftPath =
"/user/cassowary/manhattan_sequence_files/simclusters_v2_interested_in_20M_145K_2020_thrift"
val InterestedInLite2020Path =
"/user/cassowary/manhattan_sequence_files/simclusters_v2_interested_in_lite_20M_145K_2020"
val InterestedInLite2020ThriftPath =
"/user/cassowary/manhattan_sequence_files/simclusters_v2_interested_in_lite_20M_145K_2020_thrift"
val KnownFor2020Path =
"/user/cassowary/manhattan_sequence_files/simclusters_v2_known_for_20M_145K_2020"
// keep this inside /user/cassowary/manhattan_sequence_files/ to use the latest-3 retention policy
val KnownFor2020ThriftDatasetPath =
"/user/cassowary/manhattan_sequence_files/simclusters_v2_known_for_20M_145K_2020_thrift"
val OfflineClusterTopMediaTweets2020DatasetPath =
"/user/cassowary/manhattan_sequence_files/cluster_top_media_tweets_20M_145K_2020"
}
/**
* These should only be accessed from the simclusters_v2 data pipeline for intermediate data;
* they are not opt-out compliant and shouldn't be exposed externally.
*/
object InternalDataPaths {
// Internal versions, not to be read or written outside of simcluster_v2
private[simclusters_v2] val RawInterestedIn2020Path =
"/user/cassowary/manhattan_sequence_files/simclusters_v2_raw_interested_in_20M_145K_2020"
private[simclusters_v2] val RawInterestedInLite2020Path =
"/user/cassowary/manhattan_sequence_files/simclusters_v2_raw_interested_in_lite_20M_145K_2020"
private[simclusters_v2] val RawKnownForDec11Path =
"/user/cassowary/manhattan_sequence_files/simclusters_v2_raw_known_for_20M_145K_dec11"
private[simclusters_v2] val RawKnownForUpdatedPath =
"/user/cassowary/manhattan_sequence_files/simclusters_v2_raw_known_for_20M_145K_updated"
private[simclusters_v2] val RawKnownFor2020Path =
"/user/cassowary/manhattan_sequence_files/simclusters_v2_raw_known_for_20M_145K_2020"
}

View File

@ -1,39 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources
import com.twitter.scalding.DateOps
import com.twitter.scalding.DateRange
import com.twitter.scalding.Days
import com.twitter.scalding.TypedPipe
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation
import com.twitter.scalding_internal.dalv2.remote_access.ProcAtla
import com.twitter.simclusters_v2.thriftscala.NormsAndCounts
import com.twitter.simclusters_v2.thriftscala.UserAndNeighbors
import java.util.TimeZone
object DataSources {
/**
* Reads production normalized graph data from atla-proc
*/
def userUserNormalizedGraphSource(implicit dateRange: DateRange): TypedPipe[UserAndNeighbors] = {
DAL
.readMostRecentSnapshotNoOlderThan(UserUserNormalizedGraphScalaDataset, Days(14)(DateOps.UTC))
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
.toTypedPipe
}
/**
* Reads production user norms and counts data from atla-proc
*/
def userNormsAndCounts(
implicit dateRange: DateRange,
timeZone: TimeZone
): TypedPipe[NormsAndCounts] = {
DAL
.readMostRecentSnapshot(ProducerNormsAndCountsScalaDataset, dateRange.prepend(Days(14)))
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
.toTypedPipe
}
}

View File

@ -1,222 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources
import com.twitter.dal.client.dataset.KeyValDALDataset
import com.twitter.scalding.DateRange
import com.twitter.scalding.typed.TypedPipe
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossDC
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.simclusters_v2.thriftscala._
import com.twitter.wtf.entity_real_graph.thriftscala.EntityType
import com.twitter.simclusters_v2.common.ClusterId
import com.twitter.simclusters_v2.common.ModelVersions
object EntityEmbeddingsSources {
final val SemanticCoreSimClustersEmbeddingsDec11Dataset =
SemanticCoreSimclustersEmbeddingsScalaDataset
final val SemanticCoreSimClustersEmbeddingsUpdatedDataset =
SemanticCoreSimclustersEmbeddingsUpdatedScalaDataset
final val SemanticCoreSimClustersEmbeddings2020Dataset =
SemanticCoreSimclustersEmbeddings2020ScalaDataset
final val SemanticCorePerLanguageSimClustersEmbeddingsDataset =
SemanticCorePerLanguageSimclustersEmbeddingsScalaDataset
final val LogFavSemanticCorePerLanguageSimClustersEmbeddingsDataset =
LogFavSemanticCorePerLanguageSimclustersEmbeddingsScalaDataset
final val HashtagSimClustersEmbeddingsUpdatedDataset =
HashtagSimclustersEmbeddingsUpdatedScalaDataset
final val ReverseIndexSemanticCoreSimClustersEmbeddingsDec11Dataset =
ReverseIndexSemanticCoreSimclustersEmbeddingsScalaDataset
final val ReverseIndexSemanticCoreSimClustersEmbeddingsUpdatedDataset =
ReverseIndexSemanticCoreSimclustersEmbeddingsUpdatedScalaDataset
final val ReverseIndexSemanticCoreSimClustersEmbeddings2020Dataset =
ReverseIndexSemanticCoreSimclustersEmbeddings2020ScalaDataset
final val ReverseIndexSemanticCorePerLanguageSimClustersEmbeddingsDataset =
ReverseIndexSemanticCorePerLanguageSimclustersEmbeddingsScalaDataset
final val LogFavReverseIndexSemanticCorePerLanguageSimClustersEmbeddingsDataset =
LogFavReverseIndexSemanticCorePerLanguageSimclustersEmbeddingsScalaDataset
final val ReverseIndexHashtagSimClustersEmbeddingsUpdatedDataset =
ReverseIndexHashtagSimclustersEmbeddingsUpdatedScalaDataset
// Fav-based TFG topic embeddings built from user device languages
// Keyed by SimClustersEmbeddingId with InternalId.TopicId ((topic, language) pair, with country = None)
final val FavTfgTopicEmbeddingsDataset = FavTfgTopicEmbeddingsScalaDataset
final val FavTfgTopicEmbeddingsParquetDataset = FavTfgTopicEmbeddingsParquetScalaDataset
final val FavTfgTopicEmbeddings2020Dataset = FavTfgTopicEmbeddings2020ScalaDataset
final val FavTfgTopicEmbeddings2020ParquetDataset = FavTfgTopicEmbeddings2020ParquetScalaDataset
// Logfav-based TFG topic embeddings built from user device languages
// Keyed by SimClustersEmbeddingId with InternalId.LocaleEntityId ((topic, language) pair)
final val LogFavTfgTopicEmbeddingsDataset = LogFavTfgTopicEmbeddingsScalaDataset
final val LogFavTfgTopicEmbeddingsParquetDataset = LogFavTfgTopicEmbeddingsParquetScalaDataset
// Fav-based TFG topic embeddings built from inferred user consumed languages
// Keyed by SimClustersEmbeddingId with InternalId.TopicId ((topic, country, language) tuple)
final val FavInferredLanguageTfgTopicEmbeddingsDataset =
FavInferredLanguageTfgTopicEmbeddingsScalaDataset
private val validSemanticCoreEmbeddingTypes = Seq(
EmbeddingType.FavBasedSematicCoreEntity,
EmbeddingType.FollowBasedSematicCoreEntity
)
/**
* Given a fav/follow/etc embedding type and a ModelVersion, retrieve the corresponding dataset to
* (SemanticCore entityId -> List(clusterId)) from a certain dateRange.
*/
def getSemanticCoreEntityEmbeddingsSource(
embeddingType: EmbeddingType,
modelVersion: String,
dateRange: DateRange
): TypedPipe[(Long, SimClustersEmbedding)] = {
val dataSet = modelVersion match {
case ModelVersions.Model20M145KDec11 => SemanticCoreSimClustersEmbeddingsDec11Dataset
case ModelVersions.Model20M145KUpdated => SemanticCoreSimClustersEmbeddingsUpdatedDataset
case _ => throw new IllegalArgumentException(s"ModelVersion $modelVersion is not supported")
}
assert(validSemanticCoreEmbeddingTypes.contains(embeddingType))
entityEmbeddingsSource(dataSet, embeddingType, dateRange)
}
/**
* Given a fav/follow/etc embedding type and a ModelVersion, retrieve the corresponding dataset to
* (clusterId -> List(SemanticCore entityId)) from a certain dateRange.
*/
def getReverseIndexedSemanticCoreEntityEmbeddingsSource(
embeddingType: EmbeddingType,
modelVersion: String,
dateRange: DateRange
): TypedPipe[(ClusterId, Seq[SemanticCoreEntityWithScore])] = {
val dataSet = modelVersion match {
case ModelVersions.Model20M145KDec11 =>
ReverseIndexSemanticCoreSimClustersEmbeddingsDec11Dataset
case ModelVersions.Model20M145KUpdated =>
ReverseIndexSemanticCoreSimClustersEmbeddingsUpdatedDataset
case ModelVersions.Model20M145K2020 =>
ReverseIndexSemanticCoreSimClustersEmbeddings2020Dataset
case _ => throw new IllegalArgumentException(s"ModelVersion $modelVersion is not supported")
}
assert(validSemanticCoreEmbeddingTypes.contains(embeddingType))
reverseIndexedEntityEmbeddingsSource(dataSet, embeddingType, dateRange)
}
// Return the raw DAL dataset reference. Use this if you're writing to DAL.
def getEntityEmbeddingsDataset(
entityType: EntityType,
modelVersion: String,
isEmbeddingsPerLocale: Boolean = false
): KeyValDALDataset[KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]] = {
(entityType, modelVersion) match {
case (EntityType.SemanticCore, ModelVersions.Model20M145KDec11) =>
SemanticCoreSimClustersEmbeddingsDec11Dataset
case (EntityType.SemanticCore, ModelVersions.Model20M145KUpdated) =>
if (isEmbeddingsPerLocale) {
SemanticCorePerLanguageSimClustersEmbeddingsDataset
} else {
SemanticCoreSimClustersEmbeddingsUpdatedDataset
}
case (EntityType.SemanticCore, ModelVersions.Model20M145K2020) =>
SemanticCoreSimClustersEmbeddings2020Dataset
case (EntityType.Hashtag, ModelVersions.Model20M145KUpdated) =>
HashtagSimClustersEmbeddingsUpdatedDataset
case (entityType, modelVersion) =>
throw new IllegalArgumentException(
s"(Entity Type, ModelVersion) ($entityType, $modelVersion) not supported.")
}
}
// Return the raw DAL dataset reference. Use this if you're writing to DAL.
def getReverseIndexedEntityEmbeddingsDataset(
entityType: EntityType,
modelVersion: String,
isEmbeddingsPerLocale: Boolean = false
): KeyValDALDataset[KeyVal[SimClustersEmbeddingId, InternalIdEmbedding]] = {
(entityType, modelVersion) match {
case (EntityType.SemanticCore, ModelVersions.Model20M145KDec11) =>
ReverseIndexSemanticCoreSimClustersEmbeddingsDec11Dataset
case (EntityType.SemanticCore, ModelVersions.Model20M145KUpdated) =>
if (isEmbeddingsPerLocale) {
ReverseIndexSemanticCorePerLanguageSimClustersEmbeddingsDataset
} else {
ReverseIndexSemanticCoreSimClustersEmbeddingsUpdatedDataset
}
case (EntityType.SemanticCore, ModelVersions.Model20M145K2020) =>
ReverseIndexSemanticCoreSimClustersEmbeddings2020Dataset
case (EntityType.Hashtag, ModelVersions.Model20M145KUpdated) =>
ReverseIndexHashtagSimClustersEmbeddingsUpdatedDataset
case (entityType, modelVersion) =>
throw new IllegalArgumentException(
s"(Entity Type, ModelVersion) ($entityType, $modelVersion) not supported.")
}
}
private def entityEmbeddingsSource(
dataset: KeyValDALDataset[KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]],
embeddingType: EmbeddingType,
dateRange: DateRange
): TypedPipe[(Long, SimClustersEmbedding)] = {
val pipe = DAL
.readMostRecentSnapshot(dataset, dateRange)
.withRemoteReadPolicy(AllowCrossDC)
.toTypedPipe
filterEntityEmbeddingsByType(pipe, embeddingType)
}
private def reverseIndexedEntityEmbeddingsSource(
dataset: KeyValDALDataset[KeyVal[SimClustersEmbeddingId, InternalIdEmbedding]],
embeddingType: EmbeddingType,
dateRange: DateRange
): TypedPipe[(ClusterId, Seq[SemanticCoreEntityWithScore])] = {
val pipe = DAL
.readMostRecentSnapshot(dataset, dateRange)
.withRemoteReadPolicy(AllowCrossDC)
.toTypedPipe
filterReverseIndexedEntityEmbeddingsByType(pipe, embeddingType)
}
private[hdfs_sources] def filterEntityEmbeddingsByType(
pipe: TypedPipe[KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]],
embeddingType: EmbeddingType
): TypedPipe[(Long, SimClustersEmbedding)] = {
pipe.collect {
case KeyVal(
SimClustersEmbeddingId(_embeddingType, _, InternalId.EntityId(entityId)),
embedding
) if _embeddingType == embeddingType =>
(entityId, embedding)
}
}
private[hdfs_sources] def filterReverseIndexedEntityEmbeddingsByType(
pipe: TypedPipe[KeyVal[SimClustersEmbeddingId, InternalIdEmbedding]],
embeddingType: EmbeddingType
): TypedPipe[(ClusterId, Seq[SemanticCoreEntityWithScore])] = {
pipe.collect {
case KeyVal(
SimClustersEmbeddingId(_embeddingType, _, InternalId.ClusterId(clusterId)),
embedding
) if _embeddingType == embeddingType =>
val entitiesWithScores = embedding.embedding.collect {
case InternalIdWithScore(InternalId.EntityId(entityId), score) =>
SemanticCoreEntityWithScore(entityId, score)
}
(clusterId, entitiesWithScores)
}
}
}
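
A read sketch (the embedding type / model version pair is just one valid combination per the helpers above; the method name and arguments come straight from this file):

import com.twitter.scalding.DateRange
import com.twitter.scalding.typed.TypedPipe
import com.twitter.simclusters_v2.common.ModelVersions
import com.twitter.simclusters_v2.thriftscala.{EmbeddingType, SimClustersEmbedding}

def favBasedEntityEmbeddings(
  dateRange: DateRange
): TypedPipe[(Long, SimClustersEmbedding)] =
  EntityEmbeddingsSources.getSemanticCoreEntityEmbeddingsSource(
    EmbeddingType.FavBasedSematicCoreEntity,
    ModelVersions.Model20M145KUpdated,
    dateRange
  )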

View File

@ -1,178 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources
import com.twitter.dal.client.dataset.KeyValDALDataset
import com.twitter.scalding.{DateOps, DateRange, Days, TypedPipe}
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.remote_access.{ExplicitLocation, ProcAtla}
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.thriftscala.ModelVersion
import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn
import java.util.TimeZone
object InterestedInSources {
private val ModelVersionInterestedInDatasetMap: Map[ModelVersion, KeyValDALDataset[
KeyVal[UserId, ClustersUserIsInterestedIn]
]] = Map(
ModelVersion.Model20m145kDec11 -> SimclustersV2InterestedInScalaDataset,
ModelVersion.Model20m145kUpdated -> SimclustersV2InterestedIn20M145KUpdatedScalaDataset,
ModelVersion.Model20m145k2020 -> SimclustersV2InterestedIn20M145K2020ScalaDataset
)
/**
* Internal version, not PDP compliant, not to be used outside simclusters_v2
* Reads 20M145KDec11 production InterestedIn data from atla-proc, with a 14-day extended window
*/
private[simclusters_v2] def simClustersRawInterestedInDec11Source(
dateRange: DateRange,
timeZone: TimeZone
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
DAL
.readMostRecentSnapshot(
SimclustersV2RawInterestedIn20M145KDec11ScalaDataset,
dateRange.prepend(Days(14)(timeZone))
)
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
.toTypedPipe
.map {
case KeyVal(userId, clustersUserIsInterestedIn) =>
(userId, clustersUserIsInterestedIn)
}
}
/**
* Internal version, not PDP compliant, not to be used outside simclusters_v2
* Reads 20M145KUpdated InterestedIn data from atla-proc, with a 14-day extended window
*/
private[simclusters_v2] def simClustersRawInterestedInUpdatedSource(
dateRange: DateRange,
timeZone: TimeZone
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
DAL
.readMostRecentSnapshot(
SimclustersV2RawInterestedIn20M145KUpdatedScalaDataset,
dateRange.prepend(Days(14)(timeZone))
)
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
.toTypedPipe.map {
case KeyVal(userId, clustersUserIsInterestedIn) =>
(userId, clustersUserIsInterestedIn)
}
}
/**
* Internal version, not PDP compliant, not to be used outside simclusters_v2
* Reads 20M145K2020 InterestedIn data from atla-proc, with a 14-day extended window
*/
private[simclusters_v2] def simClustersRawInterestedIn2020Source(
dateRange: DateRange,
timeZone: TimeZone
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
DAL
.readMostRecentSnapshot(
SimclustersV2RawInterestedIn20M145K2020ScalaDataset,
dateRange.prepend(Days(14)(timeZone))
)
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
.toTypedPipe.map {
case KeyVal(userId, clustersUserIsInterestedIn) =>
(userId, clustersUserIsInterestedIn)
}
}
private[simclusters_v2] def simClustersRawInterestedInLite2020Source(
dateRange: DateRange,
timeZone: TimeZone
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
DAL
.readMostRecentSnapshot(
SimclustersV2RawInterestedInLite20M145K2020ScalaDataset,
dateRange.extend(Days(14)(timeZone)))
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
.toTypedPipe.map {
case KeyVal(userId, clustersUserIsInterestedIn) =>
(userId, clustersUserIsInterestedIn)
}
}
/**
* Reads 20M145KDec11 production InterestedIn data from atla-proc, with a 14-day extended window
*/
def simClustersInterestedInDec11Source(
dateRange: DateRange,
timeZone: TimeZone
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
DAL
.readMostRecentSnapshot(
SimclustersV2InterestedInScalaDataset,
dateRange.prepend(Days(14)(timeZone)))
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
.toTypedPipe.map {
case KeyVal(userId, clustersUserIsInterestedIn) =>
(userId, clustersUserIsInterestedIn)
}
}
/**
* Reads 20M145KUpdated InterestedIn data from atla-proc, with a 14-day extended window
*/
def simClustersInterestedInUpdatedSource(
dateRange: DateRange,
timeZone: TimeZone
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
DAL
.readMostRecentSnapshot(
SimclustersV2InterestedIn20M145KUpdatedScalaDataset,
dateRange.prepend(Days(14)(timeZone))
)
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
.toTypedPipe.map {
case KeyVal(userId, clustersUserIsInterestedIn) =>
(userId, clustersUserIsInterestedIn)
}
}
/**
* Reads 20M145K2020 InterestedIn data from atla-proc, with a 14-day extended window
*/
def simClustersInterestedIn2020Source(
dateRange: DateRange,
timeZone: TimeZone
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
DAL
.readMostRecentSnapshot(
SimclustersV2InterestedIn20M145K2020ScalaDataset,
dateRange.prepend(Days(14)(timeZone))
)
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
.toTypedPipe.map {
case KeyVal(userId, clustersUserIsInterestedIn) =>
(userId, clustersUserIsInterestedIn)
}
}
/**
* Reads InterestedIn data based on ModelVersion from atla-proc, with a 14-day extended window
*/
def simClustersInterestedInSource(
modelVersion: ModelVersion,
dateRange: DateRange,
timeZone: TimeZone
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
DAL
.readMostRecentSnapshot(
ModelVersionInterestedInDatasetMap(modelVersion),
dateRange.prepend(Days(14)(timeZone))
)
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
.toTypedPipe.map {
case KeyVal(userId, clustersUserIsInterestedIn) =>
(userId, clustersUserIsInterestedIn)
}
}
}
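
A read sketch (UTC and the 2020 model are illustrative choices): the model-version-keyed helper picks the right dataset and prepends the 14-day window internally.

import com.twitter.scalding.DateRange
import com.twitter.scalding.typed.TypedPipe
import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.thriftscala.{ClustersUserIsInterestedIn, ModelVersion}
import java.util.TimeZone

def interestedIn2020(
  dateRange: DateRange
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] =
  InterestedInSources.simClustersInterestedInSource(
    ModelVersion.Model20m145k2020,
    dateRange,
    TimeZone.getTimeZone("UTC")
  )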

View File

@ -1,86 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources
import com.twitter.scalding.DateRange
import com.twitter.scalding.TypedPipe
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossClusterSameDC
import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation
import com.twitter.scalding_internal.dalv2.remote_access.Proc3Atla
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.simclusters_v2.thriftscala.EmbeddingType
import com.twitter.simclusters_v2.thriftscala.InternalId
import com.twitter.simclusters_v2.thriftscala.ModelVersion
import com.twitter.simclusters_v2.thriftscala.SimClustersEmbedding
import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId
import com.twitter.simclusters_v2.thriftscala.TopSimClustersWithScore
object ProducerEmbeddingSources {
/**
* Helper function to retrieve producer SimClusters embeddings with the legacy `TopSimClustersWithScore`
* value type.
*/
def producerEmbeddingSourceLegacy(
embeddingType: EmbeddingType,
modelVersion: ModelVersion
)(
implicit dateRange: DateRange
): TypedPipe[(Long, TopSimClustersWithScore)] = {
val producerEmbeddingDataset = (embeddingType, modelVersion) match {
case (EmbeddingType.ProducerFollowBasedSemanticCoreEntity, ModelVersion.Model20m145kDec11) =>
ProducerTopKSimclusterEmbeddingsByFollowScoreScalaDataset
case (EmbeddingType.ProducerFavBasedSemanticCoreEntity, ModelVersion.Model20m145kDec11) =>
ProducerTopKSimclusterEmbeddingsByFavScoreScalaDataset
case (
EmbeddingType.ProducerFollowBasedSemanticCoreEntity,
ModelVersion.Model20m145kUpdated) =>
ProducerTopKSimclusterEmbeddingsByFollowScoreUpdatedScalaDataset
case (EmbeddingType.ProducerFavBasedSemanticCoreEntity, ModelVersion.Model20m145kUpdated) =>
ProducerTopKSimclusterEmbeddingsByFavScoreUpdatedScalaDataset
case (_, _) =>
throw new ClassNotFoundException(
"Unsupported embedding type: " + embeddingType + " and model version: " + modelVersion)
}
DAL
.readMostRecentSnapshot(producerEmbeddingDataset).withRemoteReadPolicy(
AllowCrossClusterSameDC)
.toTypedPipe.map {
case KeyVal(producerId, topSimClustersWithScore) =>
(producerId, topSimClustersWithScore)
}
}
def producerEmbeddingSource(
embeddingType: EmbeddingType,
modelVersion: ModelVersion
)(
implicit dateRange: DateRange
): TypedPipe[(Long, SimClustersEmbedding)] = {
val producerEmbeddingDataset = (embeddingType, modelVersion) match {
case (EmbeddingType.AggregatableLogFavBasedProducer, ModelVersion.Model20m145k2020) =>
AggregatableProducerSimclustersEmbeddingsByLogFavScore2020ScalaDataset
case (EmbeddingType.AggregatableFollowBasedProducer, ModelVersion.Model20m145k2020) =>
AggregatableProducerSimclustersEmbeddingsByFollowScore2020ScalaDataset
case (EmbeddingType.RelaxedAggregatableLogFavBasedProducer, ModelVersion.Model20m145k2020) =>
AggregatableProducerSimclustersEmbeddingsByLogFavScoreRelaxedFavEngagementThreshold2020ScalaDataset
case (_, _) =>
throw new ClassNotFoundException(
"Unsupported embedding type: " + embeddingType + " and model version: " + modelVersion)
}
DAL
.readMostRecentSnapshot(
producerEmbeddingDataset
)
.withRemoteReadPolicy(ExplicitLocation(Proc3Atla))
.toTypedPipe
.map {
case KeyVal(
SimClustersEmbeddingId(_, _, InternalId.UserId(producerId: Long)),
embedding: SimClustersEmbedding) =>
(producerId, embedding)
}
}
}
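
A read sketch (the embedding type / model version pair is one of the combinations handled above):

import com.twitter.scalding.DateRange
import com.twitter.scalding.typed.TypedPipe
import com.twitter.simclusters_v2.thriftscala.{EmbeddingType, ModelVersion, SimClustersEmbedding}

def logFavProducerEmbeddings(
  implicit dateRange: DateRange
): TypedPipe[(Long, SimClustersEmbedding)] =
  ProducerEmbeddingSources.producerEmbeddingSource(
    EmbeddingType.AggregatableLogFavBasedProducer,
    ModelVersion.Model20m145k2020
  )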

View File

@ -1,13 +0,0 @@
scala_library(
sources = ["*.scala"],
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
"src/scala/com/twitter/scalding_internal/multiformat/format",
"src/scala/com/twitter/simclusters_v2/common",
"src/thrift/com/twitter/hermit/candidate:hermit-candidate-scala",
"src/thrift/com/twitter/ml/api:embedding-scala",
"src/thrift/com/twitter/recos/entities:entities-thrift-scala",
"src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala",
],
)

View File

@ -1,16 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections
import com.twitter.bijection.Bufferable
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{
ScalaCompactThrift,
genericInjection
}
import com.twitter.simclusters_v2.thriftscala.ClusterDetails
object ClusterDetailsInjection {
val injection = KeyValInjection[(String, Int), ClusterDetails](
genericInjection(Bufferable.injectionOf[(String, Int)]),
ScalaCompactThrift(ClusterDetails)
)
}

View File

@ -1,13 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift
import com.twitter.simclusters_v2.thriftscala.{TweetsWithScore, DayPartitionedClusterId}
object ClusterTopMediaTweetsInjection {
val injection = KeyValInjection[DayPartitionedClusterId, TweetsWithScore](
ScalaCompactThrift(DayPartitionedClusterId),
ScalaCompactThrift(TweetsWithScore)
)
}

View File

@ -1,14 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift
import com.twitter.simclusters_v2.thriftscala.TopKTweetsWithScores
import com.twitter.simclusters_v2.thriftscala.FullClusterId
object ClusterTopTweetsInjection {
val clusterIdToTopKTweetsInjection = KeyValInjection[FullClusterId, TopKTweetsWithScores](
ScalaCompactThrift(FullClusterId),
ScalaCompactThrift(TopKTweetsWithScores)
)
}

View File

@ -1,16 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaBinaryThrift
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.Long2BigEndian
import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.thriftscala._
object ClusteringInjections {
final val OrderedClustersAndMembersInjection: KeyValInjection[
UserId,
OrderedClustersAndMembers
] =
KeyValInjection(Long2BigEndian, ScalaBinaryThrift(OrderedClustersAndMembers))
}

View File

@ -1,47 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaBinaryThrift
import com.twitter.simclusters_v2.thriftscala._
import com.twitter.ml.api.thriftscala.Embedding
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.Long2BigEndian
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift
object EntityEmbeddingsInjections {
final val EntitySimClustersEmbeddingInjection: KeyValInjection[
SimClustersEmbeddingId,
SimClustersEmbedding
] =
KeyValInjection(
ScalaBinaryThrift(SimClustersEmbeddingId),
ScalaBinaryThrift(SimClustersEmbedding)
)
final val InternalIdEmbeddingInjection: KeyValInjection[
SimClustersEmbeddingId,
InternalIdEmbedding
] =
KeyValInjection(
ScalaBinaryThrift(SimClustersEmbeddingId),
ScalaBinaryThrift(InternalIdEmbedding)
)
final val EntitySimClustersMultiEmbeddingInjection: KeyValInjection[
SimClustersMultiEmbeddingId,
SimClustersMultiEmbedding
] =
KeyValInjection(
ScalaBinaryThrift(SimClustersMultiEmbeddingId),
ScalaBinaryThrift(SimClustersMultiEmbedding)
)
final val UserMbcgEmbeddingInjection: KeyValInjection[
Long,
Embedding
] =
KeyValInjection[Long, Embedding](
Long2BigEndian,
ScalaCompactThrift(Embedding)
)
}

View File

@ -1,27 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{
Int2BigEndian,
Long2BigEndian,
ScalaCompactThrift
}
import com.twitter.simclusters_v2.thriftscala.SimClustersInferredEntities
object InferredEntitiesInjections {
final val InferredEntityInjection: KeyValInjection[Long, SimClustersInferredEntities] =
KeyValInjection(
Long2BigEndian,
ScalaCompactThrift(SimClustersInferredEntities)
)
final val InferredEntityKeyedByClusterInjection: KeyValInjection[
Int,
SimClustersInferredEntities
] =
KeyValInjection(
Int2BigEndian,
ScalaCompactThrift(SimClustersInferredEntities)
)
}

View File

@ -1,13 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.StringUtf8
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.Long2BigEndian
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift
import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn
object InterestedInInjection {
val injection = KeyValInjection(Long2BigEndian, ScalaCompactThrift(ClustersUserIsInterestedIn))
val languageInjection =
KeyValInjection(StringUtf8, ScalaCompactThrift(ClustersUserIsInterestedIn))
}

View File

@ -1,12 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{
Long2BigEndian,
ScalaCompactThrift
}
import com.twitter.simclusters_v2.thriftscala._
object KnownForInjection {
val injection = KeyValInjection(Long2BigEndian, ScalaCompactThrift(ClustersUserIsKnownFor))
}

View File

@ -1,31 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift
import com.twitter.simclusters_v2.thriftscala.LeftNode
import com.twitter.simclusters_v2.thriftscala.NounWithFrequencyList
import com.twitter.simclusters_v2.thriftscala.RightNode
import com.twitter.simclusters_v2.thriftscala.RightNodeTypeStruct
import com.twitter.simclusters_v2.thriftscala.RightNodeWithEdgeWeightList
import com.twitter.simclusters_v2.thriftscala.SimilarRightNodes
import com.twitter.simclusters_v2.thriftscala.CandidateTweetsList
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.Long2BigEndian
object MultiTypeGraphInjections {
final val truncatedMultiTypeGraphInjection =
KeyValInjection(ScalaCompactThrift(LeftNode), ScalaCompactThrift(RightNodeWithEdgeWeightList))
final val topKRightNounListInjection =
KeyValInjection(
ScalaCompactThrift(RightNodeTypeStruct),
ScalaCompactThrift(NounWithFrequencyList))
final val similarRightNodesInjection =
KeyValInjection[RightNode, SimilarRightNodes](
ScalaCompactThrift(RightNode),
ScalaCompactThrift(SimilarRightNodes)
)
final val tweetRecommendationsInjection =
KeyValInjection[Long, CandidateTweetsList](
Long2BigEndian,
ScalaCompactThrift(CandidateTweetsList)
)
}

View File

@ -1,45 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections
import com.twitter.hermit.candidate.thriftscala.Candidates
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{
Long2BigEndian,
ScalaBinaryThrift,
ScalaCompactThrift
}
import com.twitter.simclusters_v2.thriftscala.{
PersistedFullClusterId,
SimClustersEmbedding,
SimClustersEmbeddingId,
TopProducersWithScore,
TopSimClustersWithScore
}
object ProducerEmbeddingsInjections {
final val ProducerTopKSimClusterEmbeddingsInjection: KeyValInjection[
Long,
TopSimClustersWithScore
] =
KeyValInjection(
keyCodec = Long2BigEndian,
valueCodec = ScalaCompactThrift(TopSimClustersWithScore))
final val SimClusterEmbeddingTopKProducersInjection: KeyValInjection[
PersistedFullClusterId,
TopProducersWithScore
] =
KeyValInjection(
keyCodec = ScalaCompactThrift(PersistedFullClusterId),
valueCodec = ScalaCompactThrift(TopProducersWithScore))
final val SimilarUsersInjection: KeyValInjection[Long, Candidates] =
KeyValInjection(keyCodec = Long2BigEndian, valueCodec = ScalaCompactThrift(Candidates))
final val ProducerSimClustersEmbeddingInjection: KeyValInjection[
SimClustersEmbeddingId,
SimClustersEmbedding
] =
KeyValInjection(
keyCodec = ScalaBinaryThrift(SimClustersEmbeddingId),
valueCodec = ScalaBinaryThrift(SimClustersEmbedding))
}

View File

@ -1,53 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{
Long2BigEndian,
ScalaCompactThrift,
StringUtf8
}
import com.twitter.recos.entities.thriftscala.{
SemanticCoreEntityScoreList,
SemanticCoreEntityWithLocale,
UserIdWithLocale,
UserScoreList
}
object SemanticCoreEntitiesInjections {
final val StringToSemanticCoreEntityScoreListInjection: KeyValInjection[
String,
SemanticCoreEntityScoreList
] =
KeyValInjection(
StringUtf8,
ScalaCompactThrift(SemanticCoreEntityScoreList)
)
final val LongToSemanticCoreEntityScoreListInjection: KeyValInjection[
Long,
SemanticCoreEntityScoreList
] =
KeyValInjection(
Long2BigEndian,
ScalaCompactThrift(SemanticCoreEntityScoreList)
)
final val UserWithLocaleToSemanticCoreEntityScoreListInjection: KeyValInjection[
UserIdWithLocale,
SemanticCoreEntityScoreList
] =
KeyValInjection(
ScalaCompactThrift(UserIdWithLocale),
ScalaCompactThrift(SemanticCoreEntityScoreList)
)
final val SemanticCoreEntityWithLocaleToUsersScoreListInjection: KeyValInjection[
SemanticCoreEntityWithLocale,
UserScoreList
] =
KeyValInjection(
ScalaCompactThrift(SemanticCoreEntityWithLocale),
ScalaCompactThrift(UserScoreList)
)
}

View File

@ -1,12 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{
Long2BigEndian,
ScalaCompactThrift
}
import com.twitter.simclusters_v2.thriftscala.SingleSideUserScores
object SingleSideUserScoresInjection {
val injection = KeyValInjection(Long2BigEndian, ScalaCompactThrift(SingleSideUserScores))
}

View File

@ -1,60 +0,0 @@
scala_library(
sources = ["*.scala"],
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
":data_sources",
"3rdparty/src/jvm/com/twitter/scalding:core",
"src/scala/com/twitter/scalding_internal/dalv2",
"src/scala/com/twitter/scalding_internal/multiformat/format",
"src/scala/com/twitter/scalding_internal/source/lzo_scrooge",
"src/scala/com/twitter/simclusters_v2/common",
"src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala",
"src/thrift/com/twitter/wtf/entity_real_graph:entity_real_graph-thrift-scala",
],
)
scala_library(
name = "data_sources",
sources = [],
description = "DAL datasets we wish to expose externally",
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
":reverse_index_semantic_core_per_language_simclusters_embeddings_presto-scala",
":semantic_core_per_language_simclusters_embeddings_presto-scala",
"src/scala/com/twitter/simclusters_v2/common",
],
)
create_datasets(
base_name = "reverse_index_semantic_core_per_language_simclusters_embeddings_presto",
java_schema = "com.twitter.simclusters_v2.thriftjava.InternalIdEmbeddingWithId",
platform = "java8",
role = "cassowary",
scala_schema = "com.twitter.simclusters_v2.thriftscala.InternalIdEmbeddingWithId",
segment_type = "snapshot",
tags = ["bazel-compatible"],
java_dependencies = [
"src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java",
],
scala_dependencies = [
"src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala",
],
)
create_datasets(
base_name = "semantic_core_per_language_simclusters_embeddings_presto",
java_schema = "com.twitter.simclusters_v2.thriftjava.SimClustersEmbeddingWithId",
platform = "java8",
role = "cassowary",
scala_schema = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingWithId",
segment_type = "snapshot",
tags = ["bazel-compatible"],
java_dependencies = [
"src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java",
],
scala_dependencies = [
"src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala",
],
)

View File

@ -1,10 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.presto_hdfs_sources
object EntityEmbeddingsPrestoSources {
final val SemanticCorePerLanguageSimClustersEmbeddingsDataset =
SemanticCorePerLanguageSimclustersEmbeddingsPrestoScalaDataset
final val ReverseIndexSemanticCorePerLanguageSimClustersEmbeddingsDataset =
ReverseIndexSemanticCorePerLanguageSimclustersEmbeddingsPrestoScalaDataset
}

Binary file not shown. (Before: 61 KiB)
Binary file not shown. (Before: 66 KiB)
Binary file not shown. (Before: 26 KiB)
Binary file not shown. (Before: 71 KiB)
Binary file not shown. (Before: 233 KiB)
Binary file not shown. (Before: 70 KiB)

View File

@ -1,521 +0,0 @@
scala_library(
sources = ["*.scala"],
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
"3rdparty/jvm/com/fasterxml/jackson:jackson-module-scala",
"3rdparty/jvm/com/fasterxml/jackson/core:jackson-core",
"3rdparty/jvm/com/fasterxml/jackson/core:jackson-databind",
"3rdparty/jvm/com/fasterxml/jackson/module:jackson-module-scala",
"3rdparty/jvm/com/googlecode/matrix-toolkits-java",
"3rdparty/jvm/com/twitter/storehaus:algebra",
"3rdparty/jvm/com/twitter/storehaus:core",
"escherbird/src/scala/com/twitter/escherbird/scalding/source",
"flockdb-tools/datasets/flock:flock-follows-edges-scala",
"src/java/com/twitter/ml/api/constant",
"src/java/com/twitter/sbf/core",
"src/java/com/twitter/sbf/graph",
"src/scala/com/twitter/frigate/user_sampler/common",
"src/scala/com/twitter/ml/api:api-base",
"src/scala/com/twitter/ml/api/bq",
"src/scala/com/twitter/pluck/source/cassowary:sims",
"src/scala/com/twitter/pluck/source/core_workflows/user_model:condensed_user_state-scala",
"src/scala/com/twitter/scalding_internal/dalv2",
"src/scala/com/twitter/scalding_internal/job",
"src/scala/com/twitter/scalding_internal/job/analytics_batch",
"src/scala/com/twitter/scalding_internal/source",
"src/scala/com/twitter/scalding_internal/source/lzo_scrooge",
"src/scala/com/twitter/simclusters_v2/candidate_source",
"src/scala/com/twitter/simclusters_v2/hdfs_sources",
"src/scala/com/twitter/simclusters_v2/scalding/common",
"src/scala/com/twitter/simclusters_v2/summingbird/common",
"src/scala/com/twitter/timelines/prediction/features/common",
"src/scala/com/twitter/timelines/prediction/features/itl",
"src/scala/com/twitter/timelines/prediction/features/recap",
"src/scala/com/twitter/wtf/entity_real_graph/scalding/common",
"src/thrift/com/twitter/hermit/candidate:hermit-candidate-scala",
"src/thrift/com/twitter/wtf/scalding/sims:sims-thrift-scala",
"twadoop_config/configuration/log_categories/group/recos-platform:content_recommender_get_content_recommendations-scala",
"twadoop_config/configuration/log_categories/group/recos-platform:content_recommender_get_topic_tweets_recommendations-scala",
"twadoop_config/configuration/log_categories/group/timeline:timeline_service_favorites-scala",
"usersource/snapshot/src/main/scala/com/twitter/usersource/snapshot/flat:usersource_flat-scala",
"usersource/snapshot/src/main/thrift/com/twitter/usersource/snapshot/flat:flat-scala",
"util/util-core:util-core-util",
],
)
hadoop_binary(
name = "evd_cluster_similarity",
main = "com.twitter.simclusters_v2.scalding.EigenVectorsForClusterSimilarityAdhoc",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "cluster_evaluation",
main = "com.twitter.simclusters_v2.scalding.ClusterEvaluationAdhoc",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "cluster_evaluation_20m_145k",
main = "com.twitter.simclusters_v2.scalding.ClusterEvaluationFor20M145K",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "cluster_evaluation_20m_145k_2020",
main = "com.twitter.simclusters_v2.scalding.ClusterEvaluationFor20M145K2020",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "bp_cluster_evaluation",
main = "com.twitter.simclusters_v2.scalding.BipartiteClusterEvaluation",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "update_knownfor",
main = "com.twitter.simclusters_v2.scalding.UpdateKnownForAdhoc",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "update_knownfor_prod",
main = "com.twitter.simclusters_v2.scalding.UpdateKnownFor20M145K",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "cluster_details",
main = "com.twitter.simclusters_v2.scalding.ClusterDetailsBatch",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "cluster_details_20m_145k_updated",
main = "com.twitter.simclusters_v2.scalding.ClusterDetails20M145KUpdated",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "cluster_details_20m_145k_2020",
main = "com.twitter.simclusters_v2.scalding.ClusterDetails20M145K2020",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "cluster_details-adhoc",
main = "com.twitter.simclusters_v2.scalding.ClusterDetailsAdhoc",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "cluster_details-dump",
main = "com.twitter.simclusters_v2.scalding.DumpClusterDetailsAdhoc",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "interested_in",
main = "com.twitter.simclusters_v2.scalding.InterestedInFromKnownForBatch",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "interested_in_from_producer_embeddings",
main = "com.twitter.simclusters_v2.scalding.InterestedInFromProducerEmbeddingsBatchApp",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "employee_graph_from_user_user",
main = "com.twitter.simclusters_v2.scalding.EmployeeGraphFromUserUser",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "interested_in_20m_145k_updated",
main = "com.twitter.simclusters_v2.scalding.InterestedInFromKnownFor20M145KUpdated",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "interested_in_20m_145k_2020",
main = "com.twitter.simclusters_v2.scalding.InterestedInFromKnownFor20M145K2020",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "interested_in_lite_20m_145k_2020",
main = "com.twitter.simclusters_v2.scalding.InterestedInFromKnownForLite20M145K2020",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "interested_in_lite_20m_145k_2020-adhoc",
main = "com.twitter.simclusters_v2.scalding.InterestedInFromKnownForLite20M145K2020Adhoc",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "interested_in_from_ape_2020-adhoc",
main = "com.twitter.simclusters_v2.scalding.InterestedInFromAPE2020AdhocApp",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "interested_in_from_ape_2020",
main = "com.twitter.simclusters_v2.scalding.InterestedInFromAPE2020BatchApp",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "known_for_to_mh",
main = "com.twitter.simclusters_v2.scalding.KnownForToMHBatch",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "user_user_normalized_graph",
main = "com.twitter.simclusters_v2.scalding.UserUserNormalizedGraphBatch",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "user_user_graph",
main = "com.twitter.simclusters_v2.scalding.UserUserGraphBatch",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "user_user_graph-adhoc",
main = "com.twitter.simclusters_v2.scalding.UserUserGraphAdhoc",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "producer_norms_and_counts",
main = "com.twitter.simclusters_v2.scalding.ProducerNormsAndCountsBatch",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "fav_graph",
main = "com.twitter.simclusters_v2.scalding.UserUserFavGraphBatch",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "top_users_similarity_graph",
main = "com.twitter.simclusters_v2.scalding.TopUsersSimilarityGraphApp",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "top_users_only",
main = "com.twitter.simclusters_v2.scalding.TopUsersOnlyApp",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
hadoop_binary(
name = "dump_fav_graph_adhoc",
main = "com.twitter.simclusters_v2.scalding.DumpFavGraphAdhoc",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)
# Generated with `capesospy-v2 create_target interested_in_for_20M_145k_2020 src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml`, config hash 8f19bf.
scalding_job(
name = "interested_in_for_20M_145k_2020",
main = "com.twitter.simclusters_v2.scalding.InterestedInFromKnownFor20M145K2020",
args = ["--socialProofThreshold 2 --maxClustersPerUser 50"],
config = [
("hadoop.combine-input", "true"),
("hadoop.map.jvm.total-memory", "3072m"),
("hadoop.reduce.jvm.total-memory", "3072m"),
("hadoop.submitter.jvm.total-memory", "5120m"),
("submitter.tier", "preemptible"),
],
cron = "14 * * * *",
hadoop_cluster = "atla-proc",
platform = "java8",
role = "cassowary",
runtime_platform = "java8",
tags = [
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)

View File

@ -1,513 +0,0 @@
package com.twitter.simclusters_v2.scalding
import com.twitter.algebird.Aggregator
import com.twitter.algebird.Monoid
import com.twitter.scalding._
import com.twitter.scalding.commons.source.VersionedKeyValSource
import com.twitter.scalding.typed.TypedPipe
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation
import com.twitter.scalding_internal.dalv2.remote_access.ProcAtla
import com.twitter.scalding_internal.job.TwitterExecutionApp
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.simclusters_v2.hdfs_sources.AdhocKeyValSources
import com.twitter.simclusters_v2.hdfs_sources.NormsAndCountsFixedPathSource
import com.twitter.simclusters_v2.hdfs_sources.ProducerNormsAndCountsScalaDataset
import com.twitter.simclusters_v2.hdfs_sources.SimclustersV2InterestedInScalaDataset
import com.twitter.simclusters_v2.hdfs_sources.UserAndNeighborsFixedPathSource
import com.twitter.simclusters_v2.hdfs_sources.UserUserNormalizedGraphScalaDataset
import com.twitter.simclusters_v2.scalding.BipartiteClusterEvaluationClasses._
import com.twitter.simclusters_v2.scalding.common.TypedRichPipe._
import com.twitter.simclusters_v2.scalding.common.Util
import com.twitter.simclusters_v2.thriftscala.BipartiteClusterQuality
import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn
import com.twitter.simclusters_v2.thriftscala.NeighborWithWeights
import com.twitter.simclusters_v2.thriftscala.NormsAndCounts
import com.twitter.simclusters_v2.thriftscala.UserAndNeighbors
import scala.collection.JavaConverters._
object BipartiteClusterEvaluation extends TwitterExecutionApp {
implicit val tz: java.util.TimeZone = DateOps.UTC
implicit val dp = DateParser.default
private def getClusterL2Norms(
knownFor: TypedPipe[(Long, Array[(Int, Float)])]
): Execution[Map[Int, Float]] = {
knownFor
.flatMap {
case (_, clusterArray) =>
clusterArray.map {
case (clusterId, score) =>
Map(clusterId -> score * score)
}
}
.sum
.getExecution
.map(_.mapValues { x => math.sqrt(x).toFloat })
}
def l2NormalizeKnownFor(
knownFor: TypedPipe[(Long, Array[(Int, Float)])]
): Execution[TypedPipe[(Long, Array[(Int, Float)])]] = {
getClusterL2Norms(knownFor).map { clusterToNorms =>
knownFor.mapValues { clusterScoresArray =>
clusterScoresArray.map {
case (clusterId, score) =>
(clusterId, score / clusterToNorms(clusterId))
}
}
}
}
/**
* ./bazel bundle src/scala/com/twitter/simclusters_v2/scalding:bp_cluster_evaluation && \
* oscar hdfs --user frigate --host hadoopnest2.atla.twitter.com --bundle bp_cluster_evaluation \
* --tool com.twitter.simclusters_v2.scalding.BipartiteClusterEvaluation --screen --screen-detached \
* --tee logs/newBpQuality_updateUnnormalizedScores_interestedInUsing20190329Graph_evaluatedOn20190329Graph_run2 \
* -- --normsAndCountsDir /user/frigate/your_ldap/producerNormsAndCounts_20190330 \
* --graphInputDir /user/frigate/your_ldap/user_user_normalized_graph_copiedFromAtlaProc_20190329 \
* --knownForDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/knownFor \
* --interestedInDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/interestedInUsing20190329Graph \
* --outgoingVolumesResultsDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/bpQualityForInterestedInUsing20190329On20190329Graph_outgoingVolumes \
* --incomingVolumesResultsDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/bpQualityForInterestedInUsing20190329On20190329Graph_incomingVolumes \
* --outputDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/bpQualityForInterestedInUsing20190329On20190329Graph_perCluster \
* --toEmailAddress your_ldap@twitter.com --modelVersion 20M_145K_updated
*/
override def job: Execution[Unit] = Execution.getConfigMode.flatMap {
case (config, mode) =>
Execution.withId { implicit uniqueId =>
val args = config.getArgs
val interestedIn = args.optional("interestedInDir") match {
case Some(dir) =>
TypedPipe
.from(AdhocKeyValSources.interestedInSource(args("interestedInDir")))
case None =>
DAL
.readMostRecentSnapshotNoOlderThan(
SimclustersV2InterestedInScalaDataset,
Days(20)
)
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
.toTypedPipe
.map {
case KeyVal(key, value) => (key, value)
}
}
val inputKnownFor = args
.optional("knownForDir")
.map { location => KnownForSources.readKnownFor(location) }
.getOrElse(KnownForSources.knownFor_20M_Dec11_145K)
val modelVersion =
args.optional("modelVersion").getOrElse("20M_145K_dec11")
val useLogFavWeights = args.boolean("useLogFavWeights")
val shouldL2NormalizeKnownFor = args.boolean("l2NormalizeKnownFor")
val toEmailAddressOpt = args.optional("toEmailAddress")
val knownForExec = if (shouldL2NormalizeKnownFor) {
l2NormalizeKnownFor(inputKnownFor)
} else {
Execution.from(inputKnownFor)
}
val finalExec = knownForExec.flatMap { knownFor =>
val graph = args.optional("graphInputDir") match {
case Some(dir) =>
TypedPipe.from(UserAndNeighborsFixedPathSource(dir))
case None =>
DAL
.readMostRecentSnapshotNoOlderThan(UserUserNormalizedGraphScalaDataset, Days(20))
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
.toTypedPipe
}
val producerNormsAndCounts = args.optional("normsAndCountsDir") match {
case Some(dir) =>
              TypedPipe.from(NormsAndCountsFixedPathSource(dir))
case None =>
DAL
.readMostRecentSnapshotNoOlderThan(ProducerNormsAndCountsScalaDataset, Days(20))
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
.toTypedPipe
}
val clusterIncomingVolumesExec = loadOrMake(
computeClusterIncomingVolumes(knownFor, producerNormsAndCounts, useLogFavWeights),
modelVersion,
args("incomingVolumesResultsDir")
)
val resultsWithOutgoingVolumesExec = loadOrMake(
getResultsWithOutgoingVolumes(graph, interestedIn, useLogFavWeights),
modelVersion,
args("outgoingVolumesResultsDir")
)
val finalPerClusterResultsExec =
finalPerClusterResults(
knownFor,
interestedIn,
resultsWithOutgoingVolumesExec,
clusterIncomingVolumesExec)
.flatMap { pipe => loadOrMake(pipe, modelVersion, args("outputDir")) }
finalPerClusterResultsExec.flatMap { finalPerClusterResults =>
val perClusterResults = finalPerClusterResults.values
val distributionResultsExec = getClusterResultsSummary(perClusterResults).map {
case Some(summary) =>
"Summary of results across clusters: \n" +
Util.prettyJsonMapper.writeValueAsString(summary)
case _ =>
"No summary of results! The cluster level results pipe must be empty!"
}
val overallResultsExec = perClusterResults.sum.toOptionExecution.map {
case Some(overallQuality) =>
"Overall Quality: \n" +
Util.prettyJsonMapper.writeValueAsString(
printableBipartiteQuality(overallQuality)
)
case _ =>
"No overall quality! The cluster level results pipe must be empty!"
}
Execution.zip(distributionResultsExec, overallResultsExec).map {
case (distResults, overallResults) =>
toEmailAddressOpt.foreach { address =>
Util.sendEmail(
distResults + "\n" + overallResults,
"Bipartite cluster quality for " + modelVersion,
address
)
}
println(distResults + "\n" + overallResults)
}
}
}
Util.printCounters(finalExec)
}
}
def getResultsWithOutgoingVolumes(
graph: TypedPipe[UserAndNeighbors],
interestedIn: TypedPipe[(Long, ClustersUserIsInterestedIn)],
useLogFavWeights: Boolean
): TypedPipe[(Int, BipartiteClusterQuality)] = {
graph
.map { un => (un.userId, un.neighbors) }
// should this be a leftJoin? For now, leaving it as an inner join. If in the future,
// we want to compare two approaches with very different coverages on interestedIn, this
// could become a problem.
.join(interestedIn)
.withReducers(4000)
.flatMap {
case (userId, (neighbors, clusters)) =>
getBIResultsFromSingleUser(userId, neighbors, clusters, useLogFavWeights)
}
.sumByKey
.withReducers(600)
.map {
case (clusterId, bir) =>
(
clusterId,
BipartiteClusterQuality(
inClusterFollowEdges = Some(bir.inClusterWeights.isFollowEdge),
inClusterFavEdges = Some(bir.inClusterWeights.isFavEdge),
favWtSumOfInClusterFollowEdges = Some(bir.inClusterWeights.favWtIfFollowEdge),
favWtSumOfInClusterFavEdges = Some(bir.inClusterWeights.favWtIfFavEdge),
outgoingFollowEdges = Some(bir.totalOutgoingVolumes.isFollowEdge),
outgoingFavEdges = Some(bir.totalOutgoingVolumes.isFavEdge),
favWtSumOfOutgoingFollowEdges = Some(bir.totalOutgoingVolumes.favWtIfFollowEdge),
favWtSumOfOutgoingFavEdges = Some(bir.totalOutgoingVolumes.favWtIfFavEdge),
interestedInSize = Some(bir.interestedInSize),
sampledEdges = Some(
bir.edgeSample
.iterator()
.asScala
.toSeq
.map {
case (edge, data) => makeThriftSampledEdge(edge, data)
}
)
)
)
}
}
def getBIResultsFromSingleUser(
userId: Long,
neighbors: Seq[NeighborWithWeights],
clusters: ClustersUserIsInterestedIn,
useLogFavScores: Boolean
): List[(Int, BipartiteIntermediateResults)] = {
val neighborsToWeights = neighbors.map { neighborAndWeights =>
val isFollowEdge = neighborAndWeights.isFollowed match {
case Some(true) => 1.0
case _ => 0.0
}
val favScore = if (useLogFavScores) {
neighborAndWeights.logFavScore.getOrElse(0.0)
} else neighborAndWeights.favScoreHalfLife100Days.getOrElse(0.0)
val isFavEdge = math.min(1, math.ceil(favScore))
neighborAndWeights.neighborId -> Weights(
isFollowEdge,
isFavEdge,
favScore * isFollowEdge,
favScore
)
}.toMap
val outgoingVolumes = Monoid.sum(neighborsToWeights.values)(WeightsMonoid)
clusters.clusterIdToScores.toList.map {
case (clusterId, scoresStruct) =>
val inClusterNeighbors =
(scoresStruct.usersBeingFollowed.getOrElse(Nil) ++
scoresStruct.usersThatWereFaved.getOrElse(Nil)).toSet
val edgesForSampling = inClusterNeighbors.flatMap { neighborId =>
if (neighborsToWeights.contains(neighborId)) {
            Some(
              (
                (userId, neighborId),
                SampledEdgeData(
                  neighborsToWeights(neighborId).favWtIfFollowEdge,
                  neighborsToWeights(neighborId).favWtIfFavEdge,
                  scoresStruct.followScore.getOrElse(0.0),
                  scoresStruct.favScore.getOrElse(0.0)
                )
              )
            )
} else {
None
}
}
val inClusterWeights =
Monoid.sum(neighborsToWeights.filterKeys(inClusterNeighbors).values)(WeightsMonoid)
(
clusterId,
BipartiteIntermediateResults(
inClusterWeights,
outgoingVolumes,
1,
samplerMonoid.build(edgesForSampling)
))
}
}
def computeClusterIncomingVolumes(
knownFor: TypedPipe[(Long, Array[(Int, Float)])],
producerNormsAndCounts: TypedPipe[NormsAndCounts],
useLogFavWeights: Boolean
): TypedPipe[(Int, BipartiteClusterQuality)] = {
producerNormsAndCounts
.map { x => (x.userId, x) }
.join(knownFor)
.withReducers(100)
.flatMap {
case (userId, (normsAndCounts, clusters)) =>
clusters.map {
case (clusterId, _) =>
val followerCount =
normsAndCounts.followerCount.getOrElse(0L).toDouble
val faverCount = normsAndCounts.faverCount.getOrElse(0L).toDouble
val favWtSumOfIncomingFollows = if (useLogFavWeights) {
normsAndCounts.logFavWeightsOnFollowEdgesSum.getOrElse(0.0)
} else {
normsAndCounts.favWeightsOnFollowEdgesSum.getOrElse(0.0)
}
val favWtSumOfIncomingFavs = if (useLogFavWeights) {
normsAndCounts.logFavWeightsOnFavEdgesSum.getOrElse(0.0)
} else {
normsAndCounts.favWeightsOnFavEdgesSum.getOrElse(0.0)
}
(
clusterId,
BipartiteClusterQuality(
incomingFollowEdges = Some(followerCount),
incomingFavEdges = Some(faverCount),
favWtSumOfIncomingFollowEdges = Some(favWtSumOfIncomingFollows),
favWtSumOfIncomingFavEdges = Some(favWtSumOfIncomingFavs)
))
}
}
.sumByKey
.toTypedPipe
}
def loadOrMake(
pipe: TypedPipe[(Int, BipartiteClusterQuality)],
modelVersion: String,
path: String
): Execution[TypedPipe[(Int, BipartiteClusterQuality)]] = {
val mapped = pipe.map {
case (clusterId, struct) => ((modelVersion, clusterId), struct)
}
makeForKeyValSource(mapped, AdhocKeyValSources.bipartiteQualitySource(path), path).map { pipe =>
// discard model version
pipe.map { case ((_, clusterId), struct) => (clusterId, struct) }
}
}
def makeForKeyValSource[K, V](
pipe: TypedPipe[(K, V)],
dest: VersionedKeyValSource[K, V],
path: String
): Execution[TypedPipe[(K, V)]] =
Execution.getMode.flatMap { mode =>
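      // Checkpoint: if the versioned output at `path` already exists, read it back rather than
      // recompute; otherwise run `pipe` and write it through to `dest` first.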
if (dest.resourceExists(mode)) {
println(s"validated path $path")
Execution.from(TypedPipe.from(dest))
} else {
println(s"Could not load from $path")
pipe.writeThrough(dest)
}
}
def precisionOfWholeGraph(
knownFor: TypedPipe[(Long, Array[(Int, Float)])],
interestedIn: TypedPipe[(Long, ClustersUserIsInterestedIn)],
clusterIncomingVolumesExec: Execution[TypedPipe[(Int, BipartiteClusterQuality)]]
): Execution[Option[Double]] = {
val knownForSizeExec = knownFor.aggregate(Aggregator.size).toOptionExecution
val interestedInSizeExec =
interestedIn.aggregate(Aggregator.size).toOptionExecution
val numExec = clusterIncomingVolumesExec.flatMap { volumes =>
volumes.values.flatMap(_.favWtSumOfIncomingFavEdges).sum.toOptionExecution
}
Execution.zip(numExec, interestedInSizeExec, knownForSizeExec).map {
case (Some(num), Some(interestedInSize), Some(knownForSize)) =>
Some(num / interestedInSize / knownForSize)
case x @ _ =>
println("Precision of whole graph zip: " + x)
None
}
}
def finalPerClusterResults(
knownFor: TypedPipe[(Long, Array[(Int, Float)])],
interestedIn: TypedPipe[(Long, ClustersUserIsInterestedIn)],
resultsWithOutgoingVolumesExec: Execution[TypedPipe[(Int, BipartiteClusterQuality)]],
incomingVolumesExec: Execution[TypedPipe[(Int, BipartiteClusterQuality)]]
): Execution[TypedPipe[(Int, BipartiteClusterQuality)]] = {
val knownForTranspose = KnownForSources.transpose(knownFor)
val precisionOfWholeGraphExec =
precisionOfWholeGraph(knownFor, interestedIn, incomingVolumesExec)
Execution
.zip(resultsWithOutgoingVolumesExec, incomingVolumesExec, precisionOfWholeGraphExec)
.map {
case (resultsWithOutgoingVolumes, clusterIncomingVolumes, precisionOfWholeGraph) =>
println("Precision of whole graph " + precisionOfWholeGraph)
resultsWithOutgoingVolumes
.join(knownForTranspose)
.leftJoin(clusterIncomingVolumes)
.withReducers(500)
.map {
case (clusterId, ((outgoingVolumeQuality, knownForList), incomingVolumesOpt)) =>
val incomingVolumes =
incomingVolumesOpt.getOrElse(BipartiteClusterQuality())
val knownForMap = knownForList.toMap
(
clusterId,
getFullQuality(
outgoingVolumeQuality,
incomingVolumes,
knownForMap,
precisionOfWholeGraph))
}
}
}
def getFullQuality(
qualityWithOutgoingVolumes: BipartiteClusterQuality,
incomingVolumes: BipartiteClusterQuality,
knownFor: Map[Long, Float],
precisionOfWholeGraph: Option[Double]
): BipartiteClusterQuality = {
val newSampledEdges = qualityWithOutgoingVolumes.sampledEdges.map { sampledEdges =>
sampledEdges.map { sampledEdge =>
val knownForScore = knownFor.getOrElse(sampledEdge.followeeId, 0.0f)
sampledEdge.copy(
predictedFollowScore = sampledEdge.followScoreToCluster.map { x => x * knownForScore },
predictedFavScore = sampledEdge.favScoreToCluster.map { x => x * knownForScore }
)
}
}
val correlationOfFavWtIfFollow = newSampledEdges.map { samples =>
val pairs = samples.map { s =>
(s.predictedFollowScore.getOrElse(0.0), s.favWtIfFollowEdge.getOrElse(0.0))
}
Util.computeCorrelation(pairs.iterator)
}
val correlationOfFavWtIfFav = newSampledEdges.map { samples =>
val pairs = samples.map { s =>
(s.predictedFavScore.getOrElse(0.0), s.favWtIfFavEdge.getOrElse(0.0))
}
Util.computeCorrelation(pairs.iterator)
}
val relativePrecisionNum = {
if (qualityWithOutgoingVolumes.interestedInSize.exists(_ > 0) && knownFor.nonEmpty) {
qualityWithOutgoingVolumes.favWtSumOfInClusterFavEdges
.getOrElse(0.0) / qualityWithOutgoingVolumes.interestedInSize.get / knownFor.size
} else 0.0
}
val relativePrecision = if (precisionOfWholeGraph.exists(_ > 0.0)) {
Some(relativePrecisionNum / precisionOfWholeGraph.get)
} else None
qualityWithOutgoingVolumes.copy(
incomingFollowEdges = incomingVolumes.incomingFollowEdges,
incomingFavEdges = incomingVolumes.incomingFavEdges,
favWtSumOfIncomingFollowEdges = incomingVolumes.favWtSumOfIncomingFollowEdges,
favWtSumOfIncomingFavEdges = incomingVolumes.favWtSumOfIncomingFavEdges,
knownForSize = Some(knownFor.size),
correlationOfFavWtIfFollowWithPredictedFollow = correlationOfFavWtIfFollow,
correlationOfFavWtIfFavWithPredictedFav = correlationOfFavWtIfFav,
sampledEdges = newSampledEdges,
relativePrecisionUsingFavWtIfFav = relativePrecision,
averagePrecisionOfWholeGraphUsingFavWtIfFav = precisionOfWholeGraph
)
}
}
object DumpBpQuality extends TwitterExecutionApp {
def job: Execution[Unit] = Execution.getConfigMode.flatMap {
case (config, mode) =>
Execution.withId { implicit uniqueId =>
val args = config.getArgs
val inputDir = args("inputDir")
val clusters = args.list("clusters").map(_.toInt).toSet
val input =
TypedPipe
.from(AdhocKeyValSources.bipartiteQualitySource(inputDir))
.map {
case ((modelVersion, clusterId), quality) =>
(
(modelVersion, clusterId),
BipartiteClusterEvaluationClasses
.printableBipartiteQuality(quality))
}
if (clusters.isEmpty) {
input.printSummary("Bipartite quality")
} else {
input
.collect {
case rec @ ((_, clusterId), quality) if clusters(clusterId) =>
Util.prettyJsonMapper
.writeValueAsString(rec)
.replaceAll("\n", " ")
}
.toIterableExecution
.map { strings => println(strings.mkString("\n")) }
}
}
}
}
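
To make the relative precision reported by getFullQuality concrete, here is a small worked example with purely hypothetical numbers; it only restates the arithmetic already performed above (in-cluster fav weight divided by interestedIn size and knownFor size, then normalized by the whole-graph average from precisionOfWholeGraph):

// Hypothetical values, chosen only to illustrate the ratio computed in getFullQuality.
val favWtSumOfInClusterFavEdges = 500.0
val interestedInSize = 1000
val knownForSize = 25
val clusterPrecision = favWtSumOfInClusterFavEdges / interestedInSize / knownForSize // 0.02
val wholeGraphPrecision = 0.005 // would come from precisionOfWholeGraph
val relativePrecision = clusterPrecision / wholeGraphPrecision // 4.0: fav engagement lands in this cluster 4x more often than the graph-wide average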

View File

@ -1,316 +0,0 @@
package com.twitter.simclusters_v2.scalding
import com.twitter.algebird.{Monoid, OptionMonoid, Semigroup}
import com.twitter.algebird.mutable.PriorityQueueMonoid
import com.twitter.scalding.Execution
import com.twitter.scalding.typed.TypedPipe
import com.twitter.simclusters_v2.scalding.common.Util
import com.twitter.simclusters_v2.scalding.common.Util.Distribution
import com.twitter.simclusters_v2.thriftscala.{BipartiteClusterQuality, SampledEdge}
import java.util.PriorityQueue
import scala.collection.JavaConverters._
object BipartiteClusterEvaluationClasses {
case class Weights(
isFollowEdge: Double,
isFavEdge: Double,
favWtIfFollowEdge: Double,
favWtIfFavEdge: Double)
object WeightsMonoid extends Monoid[Weights] {
override def zero = Weights(0.0, 0.0, 0.0, 0.0)
override def plus(l: Weights, r: Weights): Weights = {
Weights(
l.isFollowEdge + r.isFollowEdge,
l.isFavEdge + r.isFavEdge,
l.favWtIfFollowEdge + r.favWtIfFollowEdge,
l.favWtIfFavEdge + r.favWtIfFavEdge
)
}
}
implicit val wm: Monoid[Weights] = WeightsMonoid
case class SampledEdgeData(
favWtIfFollowEdge: Double,
favWtIfFavEdge: Double,
followScoreToCluster: Double,
favScoreToCluster: Double)
implicit val samplerMonoid: PriorityQueueMonoid[((Long, Long), SampledEdgeData)] =
Util.reservoirSamplerMonoidForPairs[(Long, Long), SampledEdgeData](2000)(Util.edgeOrdering)
implicit val sampledEdgesMonoid: PriorityQueueMonoid[SampledEdge] =
Util.reservoirSamplerMonoid(
10000,
{ sampledEdge: SampledEdge => (sampledEdge.followerId, sampledEdge.followeeId) }
)(Util.edgeOrdering)
case class BipartiteIntermediateResults(
inClusterWeights: Weights,
totalOutgoingVolumes: Weights,
interestedInSize: Int,
edgeSample: PriorityQueue[((Long, Long), SampledEdgeData)]) {
override def toString: String = {
"BCR(%s, %s, %d, %s)".format(
inClusterWeights,
totalOutgoingVolumes,
interestedInSize,
edgeSample.iterator().asScala.toSeq.toString()
)
}
}
object BIRMonoid extends Monoid[BipartiteIntermediateResults] {
override def zero =
BipartiteIntermediateResults(WeightsMonoid.zero, WeightsMonoid.zero, 0, samplerMonoid.zero)
override def plus(
l: BipartiteIntermediateResults,
r: BipartiteIntermediateResults
): BipartiteIntermediateResults = {
BipartiteIntermediateResults(
WeightsMonoid.plus(l.inClusterWeights, r.inClusterWeights),
WeightsMonoid.plus(l.totalOutgoingVolumes, r.totalOutgoingVolumes),
l.interestedInSize + r.interestedInSize,
samplerMonoid.plus(l.edgeSample, r.edgeSample)
)
}
}
implicit val bIRMonoid: Monoid[BipartiteIntermediateResults] = BIRMonoid
def makeThriftSampledEdge(edge: (Long, Long), data: SampledEdgeData): SampledEdge = {
val (followerId, followeeId) = edge
SampledEdge(
followerId = followerId,
followeeId = followeeId,
favWtIfFollowEdge = Some(data.favWtIfFollowEdge),
favWtIfFavEdge = Some(data.favWtIfFavEdge),
followScoreToCluster = Some(data.followScoreToCluster),
favScoreToCluster = Some(data.favScoreToCluster)
)
}
object ClusterQualitySemigroup extends Semigroup[BipartiteClusterQuality] {
val doubleOM: Monoid[Option[Double]] = new OptionMonoid[Double]
val intOM: Monoid[Option[Int]] = new OptionMonoid[Int]
val longOM: Monoid[Option[Long]] = new OptionMonoid[Long]
override def plus(l: BipartiteClusterQuality, r: BipartiteClusterQuality) =
BipartiteClusterQuality(
inClusterFollowEdges = doubleOM.plus(l.inClusterFollowEdges, r.inClusterFollowEdges),
inClusterFavEdges = doubleOM.plus(l.inClusterFavEdges, r.inClusterFavEdges),
favWtSumOfInClusterFollowEdges = doubleOM
.plus(l.favWtSumOfInClusterFollowEdges, r.favWtSumOfInClusterFollowEdges),
favWtSumOfInClusterFavEdges = doubleOM
.plus(l.favWtSumOfInClusterFavEdges, r.favWtSumOfInClusterFavEdges),
outgoingFollowEdges = doubleOM.plus(l.outgoingFollowEdges, r.outgoingFollowEdges),
outgoingFavEdges = doubleOM.plus(l.outgoingFavEdges, r.outgoingFavEdges),
favWtSumOfOutgoingFollowEdges = doubleOM
.plus(l.favWtSumOfOutgoingFollowEdges, r.favWtSumOfOutgoingFollowEdges),
favWtSumOfOutgoingFavEdges = doubleOM
.plus(l.favWtSumOfOutgoingFavEdges, r.favWtSumOfOutgoingFavEdges),
incomingFollowEdges = doubleOM.plus(l.incomingFollowEdges, r.incomingFollowEdges),
incomingFavEdges = doubleOM.plus(l.incomingFavEdges, r.incomingFavEdges),
favWtSumOfIncomingFollowEdges = doubleOM
.plus(l.favWtSumOfIncomingFollowEdges, r.favWtSumOfIncomingFollowEdges),
favWtSumOfIncomingFavEdges = doubleOM
.plus(l.favWtSumOfIncomingFavEdges, r.favWtSumOfIncomingFavEdges),
interestedInSize = None,
sampledEdges = Some(
sampledEdgesMonoid
.plus(
sampledEdgesMonoid.build(l.sampledEdges.getOrElse(Nil)),
sampledEdgesMonoid.build(r.sampledEdges.getOrElse(Nil))
)
.iterator()
.asScala
.toSeq),
knownForSize = intOM.plus(l.knownForSize, r.knownForSize),
correlationOfFavWtIfFollowWithPredictedFollow = None,
correlationOfFavWtIfFavWithPredictedFav = None,
relativePrecisionUsingFavWtIfFav = None,
averagePrecisionOfWholeGraphUsingFavWtIfFav = l.averagePrecisionOfWholeGraphUsingFavWtIfFav
)
}
implicit val bcqSemigroup: Semigroup[BipartiteClusterQuality] =
ClusterQualitySemigroup
case class PrintableBipartiteQuality(
incomingFollowUnweightedRecall: String,
incomingFavUnweightedRecall: String,
incomingFollowWeightedRecall: String,
incomingFavWeightedRecall: String,
outgoingFollowUnweightedRecall: String,
outgoingFavUnweightedRecall: String,
outgoingFollowWeightedRecall: String,
outgoingFavWeightedRecall: String,
incomingFollowEdges: String,
incomingFavEdges: String,
favWtSumOfIncomingFollowEdges: String,
favWtSumOfIncomingFavEdges: String,
outgoingFollowEdges: String,
outgoingFavEdges: String,
favWtSumOfOutgoingFollowEdges: String,
favWtSumOfOutgoingFavEdges: String,
correlationOfFavWtIfFollow: String,
correlationOfFavWtIfFav: String,
relativePrecisionUsingFavWt: String,
averagePrecisionOfWholeGraphUsingFavWt: String,
interestedInSize: String,
knownForSize: String)
def printableBipartiteQuality(in: BipartiteClusterQuality): PrintableBipartiteQuality = {
def getRatio(numOpt: Option[Double], denOpt: Option[Double]): String = {
val r = if (denOpt.exists(_ > 0)) {
numOpt.getOrElse(0.0) / denOpt.get
} else 0.0
"%.3f".format(r)
}
val formatter = new java.text.DecimalFormat("###,###.#")
def denString(denOpt: Option[Double]): String =
formatter.format(denOpt.getOrElse(0.0))
val correlationOfFavWtIfFollow =
in.correlationOfFavWtIfFollowWithPredictedFollow match {
case None =>
in.sampledEdges.map { samples =>
val pairs = samples.map { s =>
(s.predictedFollowScore.getOrElse(0.0), s.favWtIfFollowEdge.getOrElse(0.0))
}
Util.computeCorrelation(pairs.iterator)
}
case x @ _ => x
}
val correlationOfFavWtIfFav =
in.correlationOfFavWtIfFavWithPredictedFav match {
case None =>
in.sampledEdges.map { samples =>
val pairs = samples.map { s =>
(s.predictedFavScore.getOrElse(0.0), s.favWtIfFavEdge.getOrElse(0.0))
}
Util.computeCorrelation(pairs.iterator)
}
case x @ _ => x
}
PrintableBipartiteQuality(
incomingFollowUnweightedRecall = getRatio(in.inClusterFollowEdges, in.incomingFollowEdges),
incomingFavUnweightedRecall = getRatio(in.inClusterFavEdges, in.incomingFavEdges),
incomingFollowWeightedRecall =
getRatio(in.favWtSumOfInClusterFollowEdges, in.favWtSumOfIncomingFollowEdges),
incomingFavWeightedRecall =
getRatio(in.favWtSumOfInClusterFavEdges, in.favWtSumOfIncomingFavEdges),
outgoingFollowUnweightedRecall = getRatio(in.inClusterFollowEdges, in.outgoingFollowEdges),
outgoingFavUnweightedRecall = getRatio(in.inClusterFavEdges, in.outgoingFavEdges),
outgoingFollowWeightedRecall =
getRatio(in.favWtSumOfInClusterFollowEdges, in.favWtSumOfOutgoingFollowEdges),
outgoingFavWeightedRecall =
getRatio(in.favWtSumOfInClusterFavEdges, in.favWtSumOfOutgoingFavEdges),
incomingFollowEdges = denString(in.incomingFollowEdges),
incomingFavEdges = denString(in.incomingFavEdges),
favWtSumOfIncomingFollowEdges = denString(in.favWtSumOfIncomingFollowEdges),
favWtSumOfIncomingFavEdges = denString(in.favWtSumOfIncomingFavEdges),
outgoingFollowEdges = denString(in.outgoingFollowEdges),
outgoingFavEdges = denString(in.outgoingFavEdges),
favWtSumOfOutgoingFollowEdges = denString(in.favWtSumOfOutgoingFollowEdges),
favWtSumOfOutgoingFavEdges = denString(in.favWtSumOfOutgoingFavEdges),
correlationOfFavWtIfFollow = "%.3f"
.format(correlationOfFavWtIfFollow.getOrElse(0.0)),
correlationOfFavWtIfFav = "%.3f"
.format(correlationOfFavWtIfFav.getOrElse(0.0)),
relativePrecisionUsingFavWt =
"%.2g".format(in.relativePrecisionUsingFavWtIfFav.getOrElse(0.0)),
averagePrecisionOfWholeGraphUsingFavWt =
"%.2g".format(in.averagePrecisionOfWholeGraphUsingFavWtIfFav.getOrElse(0.0)),
interestedInSize = in.interestedInSize.getOrElse(0).toString,
knownForSize = in.knownForSize.getOrElse(0).toString
)
}
case class ClusterResultsSummary(
numClustersWithZeroInterestedIn: Int,
numClustersWithZeroFollowWtRecall: Int,
numClustersWithZeroFavWtRecall: Int,
numClustersWithZeroFollowAndFavWtRecall: Int,
interestedInSizeDist: Distribution,
outgoingFollowWtRecallDist: Distribution,
outgoingFavWtRecallDist: Distribution,
incomingFollowWtRecallDist: Distribution,
incomingFavWtRecallDist: Distribution,
followCorrelationDist: Distribution,
favCorrelationDist: Distribution,
relativePrecisionDist: Distribution)
def getClusterResultsSummary(
perClusterResults: TypedPipe[BipartiteClusterQuality]
): Execution[Option[ClusterResultsSummary]] = {
perClusterResults
.map { clusterQuality =>
val printableQuality = printableBipartiteQuality(clusterQuality)
val isFollowRecallZero =
if (!clusterQuality.favWtSumOfInClusterFollowEdges
.exists(_ > 0)) 1
else 0
val isFavRecallZero =
if (!clusterQuality.favWtSumOfInClusterFavEdges.exists(_ > 0)) 1
else 0
(
if (!clusterQuality.interestedInSize.exists(_ > 0)) 1 else 0,
isFollowRecallZero,
isFavRecallZero,
isFavRecallZero * isFollowRecallZero,
clusterQuality.interestedInSize.toList.map(_.toDouble),
List(printableQuality.outgoingFollowWeightedRecall.toDouble),
List(printableQuality.outgoingFavWeightedRecall.toDouble),
List(printableQuality.incomingFollowWeightedRecall.toDouble),
List(printableQuality.incomingFavWeightedRecall.toDouble),
List(printableQuality.correlationOfFavWtIfFollow.toDouble),
List(printableQuality.correlationOfFavWtIfFav.toDouble),
List(printableQuality.relativePrecisionUsingFavWt.toDouble)
)
}
.sum
.toOptionExecution
.map { opt =>
opt.map {
case (
zeroInterestedIn,
zeroFollowRecall,
zeroFavRecall,
zeroFollowAndFavRecall,
interestedInSizeList,
outgoingFollowWtRecallList,
outgoingFavWtRecallList,
incomingFollowWtRecallList,
incomingFavWtRecallList,
followCorrelationList,
favCorrelationList,
relativePrecisionList
) =>
ClusterResultsSummary(
numClustersWithZeroInterestedIn = zeroInterestedIn,
numClustersWithZeroFollowWtRecall = zeroFollowRecall,
numClustersWithZeroFavWtRecall = zeroFavRecall,
numClustersWithZeroFollowAndFavWtRecall = zeroFollowAndFavRecall,
interestedInSizeDist = Util.distributionFromArray(interestedInSizeList.toArray),
outgoingFollowWtRecallDist = Util
.distributionFromArray(outgoingFollowWtRecallList.toArray),
outgoingFavWtRecallDist = Util.distributionFromArray(outgoingFavWtRecallList.toArray),
incomingFollowWtRecallDist = Util
.distributionFromArray(incomingFollowWtRecallList.toArray),
incomingFavWtRecallDist = Util.distributionFromArray(incomingFavWtRecallList.toArray),
followCorrelationDist = Util.distributionFromArray(followCorrelationList.toArray),
favCorrelationDist = Util.distributionFromArray(favCorrelationList.toArray),
relativePrecisionDist = Util.distributionFromArray(relativePrecisionList.toArray)
)
}
}
}
}
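
The edge sampling above is built from Algebird's PriorityQueueMonoid via the internal Util helpers. A minimal self-contained sketch of the same idea, assuming (as Util.edgeOrdering presumably does) that ordering elements by a hash of the edge makes "keep the k smallest" behave like an approximately uniform sample:

import com.twitter.algebird.mutable.PriorityQueueMonoid
import java.util.PriorityQueue
import scala.collection.JavaConverters._

object ReservoirSketch {
  // Order edges by a hash so that the k "smallest" elements form a roughly random subset.
  val sampleOrdering: Ordering[(Long, Long)] = Ordering.by { case (a, b) => (a * 31L + b).hashCode }
  // Monoid that keeps at most 3 elements per merge; merges can happen across map/reduce tasks.
  val sampler = new PriorityQueueMonoid[(Long, Long)](3)(sampleOrdering)

  def main(args: Array[String]): Unit = {
    val q1: PriorityQueue[(Long, Long)] = sampler.build(Seq((1L, 2L), (3L, 4L)))
    val q2: PriorityQueue[(Long, Long)] = sampler.build(Seq((5L, 6L), (7L, 8L), (9L, 10L)))
    val merged = sampler.plus(q1, q2) // still at most 3 sampled edges after combining
    println(merged.iterator().asScala.toList)
  }
}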

View File

@ -1,794 +0,0 @@
package com.twitter.simclusters_v2.scalding
import com.twitter.algebird.OptionMonoid
import com.twitter.algebird.QTree
import com.twitter.algebird.QTreeSemigroup
import com.twitter.algebird.Semigroup
import com.twitter.dal.client.dataset.KeyValDALDataset
import com.twitter.dal.client.dataset.SnapshotDALDataset
import com.twitter.hermit.candidate.thriftscala.Candidates
import com.twitter.pluck.source.cassowary.FollowingsCosineSimilaritiesManhattanSource
import com.twitter.pluck.source.cassowary.SimsCandidatesSource
import com.twitter.scalding._
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.DALWrite._
import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation
import com.twitter.scalding_internal.dalv2.remote_access.ProcAtla
import com.twitter.scalding_internal.job.TwitterExecutionApp
import com.twitter.scalding_internal.job.analytics_batch._
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.simclusters_v2.common.ModelVersions
import com.twitter.simclusters_v2.hdfs_sources._
import com.twitter.simclusters_v2.scalding.common.Util
import com.twitter.simclusters_v2.scalding.embedding.common.ExternalDataSources
import com.twitter.simclusters_v2.thriftscala._
import com.twitter.usersource.snapshot.flat.UsersourceFlatScalaDataset
import com.twitter.usersource.snapshot.flat.thriftscala.FlatUser
object ClusterDetailsJob {
case class Scores(followScore: Double, favScore: Double, logFavScore: Double)
case class IntermediateDetails(
numUsersWithAnyNonZeroScore: Int,
numUsersWithNonZeroFollowScore: Int,
numUsersWithNonZeroFavScore: Int,
favQTree: Option[QTree[Double]],
followQTree: Option[QTree[Double]],
logFavQTree: Option[QTree[Double]],
sumOfSquares: Scores,
sum: Scores,
min: Scores,
max: Scores)
case class InfoFromUserSource(
fractionMarkedNSFWUser: Double,
languageToFractionDeviceLanguage: Map[String, Double],
countryCodeToFractionKnownForWithCountryCode: Map[String, Double],
languageToFractionInferredLanguage: Map[String, Double])
def positiveMin(a: Double, b: Double) = {
if (math.min(a, b) == 0.0) math.max(a, b) else math.min(a, b)
}
case class ClusterDetailsSemigroup(implicit qtreeSemigroup: Semigroup[QTree[Double]])
extends Semigroup[IntermediateDetails] {
val optionMonoid: OptionMonoid[QTree[Double]] = new OptionMonoid[QTree[Double]]()
override def plus(
left: IntermediateDetails,
right: IntermediateDetails
): IntermediateDetails = {
IntermediateDetails(
left.numUsersWithAnyNonZeroScore + right.numUsersWithAnyNonZeroScore,
left.numUsersWithNonZeroFollowScore + right.numUsersWithNonZeroFollowScore,
left.numUsersWithNonZeroFavScore + right.numUsersWithNonZeroFavScore,
optionMonoid.plus(left.favQTree, right.favQTree),
optionMonoid.plus(left.followQTree, right.followQTree),
optionMonoid.plus(left.logFavQTree, right.logFavQTree),
Scores(
left.sumOfSquares.followScore + right.sumOfSquares.followScore,
left.sumOfSquares.favScore + right.sumOfSquares.favScore,
left.sumOfSquares.logFavScore + right.sumOfSquares.logFavScore
),
Scores(
left.sum.followScore + right.sum.followScore,
left.sum.favScore + right.sum.favScore,
left.sum.logFavScore + right.sum.logFavScore
),
Scores(
positiveMin(left.min.followScore, right.min.followScore),
positiveMin(left.min.favScore, right.min.favScore),
positiveMin(left.min.logFavScore, right.min.logFavScore)
),
Scores(
math.max(left.max.followScore, right.max.followScore),
math.max(left.max.favScore, right.max.favScore),
math.max(left.max.logFavScore, right.max.logFavScore)
)
)
}
}
def intermediateDetailsPipe(
input: TypedPipe[(Long, ClustersUserIsInterestedIn)],
qtreeSemigroupKParameter: Int
): TypedPipe[(Int, IntermediateDetails)] = {
implicit val qtSg: Semigroup[QTree[Double]] =
new QTreeSemigroup[Double](qtreeSemigroupKParameter)
implicit val cdSg: Semigroup[IntermediateDetails] = ClusterDetailsSemigroup()
input
.flatMap {
case (userId, clusterScoresStruct) =>
val clusterScoresArray = clusterScoresStruct.clusterIdToScores.toArray
clusterScoresArray.map {
case (clusterId, scoresStruct) =>
val followScore = scoresStruct.followScore.getOrElse(0.0)
val favScore = scoresStruct.favScore.getOrElse(0.0)
val logFavScore = scoresStruct.logFavScore.getOrElse(0.0)
(
clusterId,
IntermediateDetails(
numUsersWithAnyNonZeroScore = 1,
numUsersWithNonZeroFollowScore = if (followScore > 0) 1 else 0,
numUsersWithNonZeroFavScore = if (favScore > 0) 1 else 0,
favQTree = if (favScore > 0) Some(QTree(favScore)) else None,
followQTree = if (followScore > 0) Some(QTree(followScore)) else None,
logFavQTree = if (logFavScore > 0) Some(QTree(logFavScore)) else None,
sumOfSquares = Scores(
followScore * followScore,
favScore * favScore,
logFavScore * logFavScore),
sum = Scores(followScore, favScore, logFavScore),
min = Scores(followScore, favScore, logFavScore),
max = Scores(followScore, favScore, logFavScore)
)
)
}
}
.sumByKey
// Uncomment for adhoc job
//.withReducers(100)
.toTypedPipe
}
private def safeGetDoubleOpt(x: Option[Double]): Double = {
x.map { y => if (y.isNaN) 0 else y }.getOrElse(0)
}
private def getSimilaritiesForAllPairs(
input: TypedPipe[(Long, ClustersUserIsInterestedIn)]
)(
implicit uniqueID: UniqueID
): TypedPipe[((Int, Int), Scores)] = {
val allClusterPairsBeforeSumByKey = Stat("all_cluster_pairs_before_sum_by_key")
val clusterPairsWithin10Ratio = Stat("cluster_pairs_within_10_ratio")
val clusterPairsBeforeTopK = Stat("cluster_pairs_before_thresholding")
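    // Per user, emit every ordered pair of their interestedIn clusters weighted by the product of
    // the two cluster-normalized scores; summing these products over all users gives the cosine
    // similarity between the two clusters' interestedIn score vectors.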
input
.flatMap {
case (userId, clusterScoresStruct) =>
val clusterScoresArray = clusterScoresStruct.clusterIdToScores.toArray
(0 until clusterScoresArray.length).flatMap { i =>
(0 until clusterScoresArray.length).map { j =>
val (clusterI, scoresI) = clusterScoresArray(i)
val (clusterJ, scoresJ) = clusterScoresArray(j)
val ratioOfSizes =
scoresI.numUsersInterestedInThisClusterUpperBound.getOrElse(1).toDouble /
scoresJ.numUsersInterestedInThisClusterUpperBound.getOrElse(1).toDouble
allClusterPairsBeforeSumByKey.inc()
if (ratioOfSizes > 0.1 && ratioOfSizes < 10) {
clusterPairsWithin10Ratio.inc()
}
val followI = safeGetDoubleOpt(scoresI.followScoreClusterNormalizedOnly)
val followJ = safeGetDoubleOpt(scoresJ.followScoreClusterNormalizedOnly)
val follow = followI * followJ
val favI = safeGetDoubleOpt(scoresI.favScoreClusterNormalizedOnly)
val favJ = safeGetDoubleOpt(scoresJ.favScoreClusterNormalizedOnly)
val fav = favI * favJ
val logFavI = safeGetDoubleOpt(scoresI.logFavScoreClusterNormalizedOnly)
val logFavJ = safeGetDoubleOpt(scoresJ.logFavScoreClusterNormalizedOnly)
val logFav = logFavI * logFavJ
((clusterI, clusterJ), (follow, fav, logFav))
}
}
}
.sumByKey
// Uncomment for adhoc job
//.withReducers(600)
.map {
case (key, (follow, fav, logFav)) =>
clusterPairsBeforeTopK.inc()
(key, Scores(follow, fav, logFav))
}
}
private def keepTopNeighbors(
allPairs: TypedPipe[((Int, Int), Scores)],
cosineThreshold: Double
)(
implicit uniqueID: UniqueID
): TypedPipe[(Int, List[ClusterNeighbor])] = {
val clusterPairsMoreThanThreshold = Stat("cluster_pairs_cosine_gt_" + cosineThreshold)
val clusterPairsAfterTopK = Stat("cluster_pairs_after_topk")
val clustersWithFewNeighbors = Stat(s"clusters_with_fewer_than_100_neighbors")
val clustersWithManyNeighbors = Stat(s"clusters_with_more_than_100_neighbors")
allPairs
.flatMap {
case ((cI, cJ), Scores(followScore, favScore, logFavScore)) =>
if (followScore > cosineThreshold || logFavScore > cosineThreshold || favScore > cosineThreshold) {
clusterPairsMoreThanThreshold.inc()
Some((cI, ClusterNeighbor(cJ, Some(followScore), Some(favScore), Some(logFavScore))))
} else None
}
.group
.toList
// Uncomment for adhoc job
//.withReducers(40)
.map {
case (key, seq) =>
val finalSize = seq.size
clusterPairsAfterTopK.incBy(finalSize)
if (finalSize < 100) {
clustersWithFewNeighbors.inc()
} else {
clustersWithManyNeighbors.inc()
}
(
key,
seq.sortBy {
case cn: ClusterNeighbor =>
-(cn.followCosineSimilarity.getOrElse(0.0) + cn.logFavCosineSimilarity.getOrElse(
0.0)) / 2
})
}
}
def getTopSimilarClustersWithCosine(
input: TypedPipe[(Long, ClustersUserIsInterestedIn)],
cosineThreshold: Double
)(
implicit uniqueID: UniqueID
): TypedPipe[(Int, List[ClusterNeighbor])] = {
keepTopNeighbors(getSimilaritiesForAllPairs(input), cosineThreshold)
}
def getDistributionDetails(
qtree: QTree[Double],
sum: Double,
sumOfSquares: Double,
min: Double,
max: Double,
fullSize: Int
): DistributionDetails = {
val mean = sum / fullSize
// note that the below is the naive calculation, and not the sample standard dev formula
// that divides by n-1. I don't think it makes a difference at our scale whether we use n or n-1
// and I'd rather use the simpler one.
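    // (that is, stdDev = sqrt(sum(x^2)/n - (sum(x)/n)^2) with n = fullSize)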
val stdDev = math.sqrt(sumOfSquares / fullSize - mean * mean)
def getQB(percentile: Double): QuantileBounds = {
val (lb, ub) = qtree.quantileBounds(percentile)
QuantileBounds(lb, ub)
}
DistributionDetails(
mean = mean,
standardDeviation = Some(stdDev),
min = Some(min),
p25 = Some(getQB(0.25)),
p50 = Some(getQB(0.5)),
p75 = Some(getQB(0.75)),
p95 = Some(getQB(0.95)),
max = Some(max)
)
}
def keepCorrectModel(
input: TypedPipe[(Long, ClustersUserIsInterestedIn)],
modelVersionToKeep: String
)(
implicit uniqId: UniqueID
): TypedPipe[(Long, ClustersUserIsInterestedIn)] = {
val allRecords = Stat("all_input_records")
val withCorrectVersion = Stat("with_correct_version")
input.filter {
case (_, clusterScoresStruct) =>
// allRecords.inc()
val result = clusterScoresStruct.knownForModelVersion == modelVersionToKeep
// if (result) withCorrectVersion.inc()
result
}
}
def getInfoFromUserSource(
knownFor: TypedPipe[(Int, List[(Long, Float)])],
usersource: TypedPipe[FlatUser],
inferredLanguages: TypedPipe[(Long, Seq[(String, Double)])]
)(
implicit uniqId: UniqueID
): TypedPipe[(Int, InfoFromUserSource)] = {
val knownForUsers = knownFor.flatMap {
case (clusterId, userScoreList) =>
userScoreList.map {
case (userId, _) =>
(userId, clusterId)
}
}
usersource
.collect {
case fuser: FlatUser if fuser.id.isDefined =>
(
fuser.id.get,
(
fuser.accountCountryCode.getOrElse(""),
fuser.language.getOrElse(""),
fuser.nsfwUser.getOrElse(false)
))
}
.join(knownForUsers)
.leftJoin(inferredLanguages)
.map {
case (_, (((countryCode, language, nsfw), clusterId), inferredLangsOpt)) =>
val nsfwInt = if (nsfw) 1 else 0
(
clusterId,
(
1,
nsfwInt,
Map(language -> 1),
Map(countryCode -> 1),
inferredLangsOpt.getOrElse(Seq(("", 1.0))).toMap
)
)
}
.sumByKey
.mapValues {
case (
denominator,
nsfwNumerator,
languageNumeratorsMap,
countryNumeratorsMap,
inferredLangsNumeratorsMap) =>
InfoFromUserSource(
nsfwNumerator * 1.0 / denominator,
languageNumeratorsMap.mapValues { x => x * 1.0 / denominator },
countryNumeratorsMap.mapValues { x => x * 1.0 / denominator },
inferredLangsNumeratorsMap.mapValues { x => x * 1.0 / denominator }
)
}
}
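// Worked example (hypothetical counts): if a cluster's known-for users sum to denominator = 4,
// nsfwNumerator = 1 and languageNumeratorsMap = Map("en" -> 3, "ja" -> 1), the mapValues step
// above yields fractionMarkedNSFWUser = 0.25 and
// languageToFractionDeviceLanguage = Map("en" -> 0.75, "ja" -> 0.25).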
/**
* Run the cluster details job and return the details for each cluster
* @param input interestedIn data
* @param qtreeSemigroupKParameter parameter for calculating percentiles using qtree monoid (set to a small number, usually < 7)
* @param modelVersionToKeep which modelVersion to use from interestedIn dataset
* @param knownFor clusterId -> users known for this cluster and their scores
* @param knownForTranspose userId -> clusters this user is known for and their scores
* @param usersource user source
* @param inferredLanguageSource userId -> inferred languages and their scores
* @param simsGraph sims graph in the form of userId -> adjacency list
* @param cosineThreshold cosine threshold to include a cluster in the list of similar clusters for a given cluster
* @param uniqId for stats
* @return pipe with (modelVersion, clusterId) as the key and ClusterDetails struct as the value.
*/
def run(
input: TypedPipe[(Long, ClustersUserIsInterestedIn)],
qtreeSemigroupKParameter: Int,
modelVersionToKeep: String,
knownFor: TypedPipe[(Int, List[(Long, Float)])],
knownForTranspose: TypedPipe[(Long, Array[(Int, Float)])],
usersource: Option[TypedPipe[FlatUser]],
inferredLanguageSource: Option[TypedPipe[(Long, Seq[(String, Double)])]],
simsGraph: Option[TypedPipe[(Long, Map[Long, Float])]],
cosineThreshold: Double
)(
implicit uniqId: UniqueID
): Execution[TypedPipe[((String, Int), ClusterDetails)]] = {
val topSimilarClusters = getTopSimilarClustersWithCosine(input, cosineThreshold)
val infoFromUserSource: TypedPipe[(Int, InfoFromUserSource)] = (for {
us <- usersource
inferredLanguages <- inferredLanguageSource
} yield getInfoFromUserSource(knownFor, us, inferredLanguages)).getOrElse(TypedPipe.empty)
val clusterEvaluationExec = simsGraph match {
case Some(sg) =>
ClusterEvaluation.clusterLevelEvaluation(sg, knownForTranspose, "eval")
case None =>
val dummyPipe: TypedPipe[(Int, (Int, ClusterQuality))] = TypedPipe.empty
Execution.from(dummyPipe)
}
clusterEvaluationExec
.map { clusterIdToSizesAndQualities =>
val clusterQualities: TypedPipe[(Int, ClusterQuality)] =
clusterIdToSizesAndQualities.mapValues(_._2)
intermediateDetailsPipe(
keepCorrectModel(input, modelVersionToKeep),
qtreeSemigroupKParameter)
.leftJoin(topSimilarClusters)
.leftJoin(infoFromUserSource)
.leftJoin(clusterQualities)
.join(knownFor)
.map {
case (
clusterId,
(
(
((intermediateDetails, topSimilarNeighborsOpt), userSourceInfoOpt),
qualityOpt),
knownForUsers)
) =>
val knownForSorted = knownForUsers.sortBy(-_._2).map {
case (userId, score) =>
UserWithScore(userId, score)
}
(modelVersionToKeep, clusterId) ->
ClusterDetails(
numUsersWithAnyNonZeroScore = intermediateDetails.numUsersWithAnyNonZeroScore,
numUsersWithNonZeroFavScore = intermediateDetails.numUsersWithNonZeroFavScore,
numUsersWithNonZeroFollowScore =
intermediateDetails.numUsersWithNonZeroFollowScore,
favScoreDistributionDetails = intermediateDetails.favQTree.map { qt =>
getDistributionDetails(
qtree = qt,
sum = intermediateDetails.sum.favScore,
sumOfSquares = intermediateDetails.sumOfSquares.favScore,
min = intermediateDetails.min.favScore,
max = intermediateDetails.max.favScore,
fullSize = intermediateDetails.numUsersWithNonZeroFavScore
)
},
followScoreDistributionDetails = intermediateDetails.followQTree.map { qt =>
getDistributionDetails(
qtree = qt,
sum = intermediateDetails.sum.followScore,
sumOfSquares = intermediateDetails.sumOfSquares.followScore,
min = intermediateDetails.min.followScore,
max = intermediateDetails.max.followScore,
fullSize = intermediateDetails.numUsersWithNonZeroFollowScore
)
},
logFavScoreDistributionDetails = intermediateDetails.logFavQTree.map { qt =>
getDistributionDetails(
qtree = qt,
sum = intermediateDetails.sum.logFavScore,
sumOfSquares = intermediateDetails.sumOfSquares.logFavScore,
min = intermediateDetails.min.logFavScore,
max = intermediateDetails.max.logFavScore,
// note: user has non-zero fav score iff a user has non-zero log-fav score
fullSize = intermediateDetails.numUsersWithNonZeroFavScore
)
},
knownForUsersAndScores = Some(knownForSorted),
neighborClusters = topSimilarNeighborsOpt,
fractionKnownForMarkedNSFWUser = userSourceInfoOpt.map(_.fractionMarkedNSFWUser),
languageToFractionDeviceLanguage =
userSourceInfoOpt.map(_.languageToFractionDeviceLanguage),
countryCodeToFractionKnownForWithCountryCode =
userSourceInfoOpt.map(_.countryCodeToFractionKnownForWithCountryCode),
qualityMeasuredOnSimsGraph = qualityOpt,
languageToFractionInferredLanguage =
userSourceInfoOpt.map(_.languageToFractionInferredLanguage),
)
}
}
}
def getTruncatedSims(
sims: TypedPipe[Candidates],
maxNeighbors: Int
): TypedPipe[(Long, Map[Long, Float])] = {
sims.map { cands =>
(
cands.userId,
// These candidates are already sorted, but leaving it in just in case the behavior changes upstream
cands.candidates
.map { c => (c.userId, c.score.toFloat) }.sortBy(-_._2).take(maxNeighbors).toMap
)
}
}
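// Usage sketch (the path and parameter below are illustrative only):
// {{{
//   val sims = TypedPipe.from(WTFCandidatesSource("/some/sims/dir"))
//   val adjLists = getTruncatedSims(sims, maxNeighbors = 20) // userId -> top-20 neighbors by score
// }}}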
}
/**
scalding remote run --main-class com.twitter.simclusters_v2.scalding.ClusterDetailsAdhoc \
--target src/scala/com/twitter/simclusters_v2/scalding:cluster_details-adhoc \
--hadoop-properties "scalding.with.reducers.set.explicitly=true mapreduce.job.reduces=4000" \
--user recos-platform -- \
--date 2020-06-25 \
--dateForUserSource 2020-06-25 \
--includeUserSource \
--outputDir /user/recos-platform/adhoc/your_ldap/cluster_details_inferred_lang
*/
object ClusterDetailsAdhoc extends TwitterExecutionApp {
implicit val tz: java.util.TimeZone = DateOps.UTC
implicit val dp = DateParser.default
def job: Execution[Unit] =
Execution.getConfigMode.flatMap {
case (config, mode) =>
Execution.withId { implicit uniqueId =>
val args = config.getArgs
val date = DateRange.parse(args("dateForUserSource"))
val (knownFor, knownForTranspose) =
args
.optional("knownForDir").map { location =>
(
KnownForSources.transpose(KnownForSources.readKnownFor(location)),
KnownForSources.readKnownFor(location)
)
}.getOrElse(
(
KnownForSources.clusterToKnownFor_20M_145K_updated,
KnownForSources.knownFor_20M_145K_updated
)
)
val interestedIn = args
.optional("inputDir").map { interestedInInputDir =>
TypedPipe.from(AdhocKeyValSources.interestedInSource(interestedInInputDir))
}.getOrElse(
DAL
.readMostRecentSnapshotNoOlderThan(
SimclustersV2InterestedIn20M145KUpdatedScalaDataset,
Days(14))
.withRemoteReadPolicy(ExplicitLocation(ProcAtla))
.toTypedPipe
.map {
case KeyVal(userId, clustersUserIsInterestedIn) =>
(userId, clustersUserIsInterestedIn)
}
)
val userSourceOpt = if (args.boolean("includeUserSource")) {
Some(DAL.readMostRecentSnapshot(UsersourceFlatScalaDataset, date).toTypedPipe)
} else None
val inferredLanguagesOpt = if (args.boolean("includeUserSource")) {
Some(ExternalDataSources.inferredUserProducedLanguageSource)
} else None
val simsGraphOpt = args.optional("simsForEvalInputDir").map { sgDir =>
ClusterDetailsJob.getTruncatedSims(
TypedPipe.from(WTFCandidatesSource(sgDir)),
args.int("maxSimsNeighborsForEval", 20)
)
}
Util.printCounters(
ClusterDetailsJob
.run(
interestedIn,
args.int("qtreeSemigroupKParameter", 3),
args.getOrElse("modelVersion", "20M_145K_updated"),
knownFor,
knownForTranspose,
userSourceOpt,
inferredLanguagesOpt,
simsGraphOpt,
cosineThreshold = args.double("cosineThreshold", 0.01)
).flatMap(
_.writeExecution(AdhocKeyValSources.clusterDetailsSource(args("outputDir"))))
)
}
}
}
trait ClusterDetailsBatchTrait extends TwitterScheduledExecutionApp {
implicit val tz = DateOps.UTC
implicit val parser = DateParser.default
def firstTime: String
def batchIncrement: Duration
def manhattanOutputPath: String
def clusterDetailsLiteOutputPath: String
def modelVersion: String
def knownForDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]]
def interestedInDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsInterestedIn]]
def outputDataset: KeyValDALDataset[KeyVal[(String, Int), ClusterDetails]]
def clusterDetailsLiteOutputDataset: SnapshotDALDataset[ClusterDetailsLite]
private lazy val execArgs = AnalyticsBatchExecutionArgs(
batchDesc = BatchDescription(this.getClass.getName.replace("$", "")),
firstTime = BatchFirstTime(RichDate(firstTime)),
lastTime = None,
batchIncrement = BatchIncrement(batchIncrement)
)
override def scheduledJob: Execution[Unit] = AnalyticsBatchExecution(execArgs) {
implicit dateRange =>
Execution.withId { implicit uniqueId =>
Execution.withArgs { args =>
val qtreeSemigroupKParameter = args.int("qtreeSemigroupKParameter", 5)
val maxSimsNeighborsForEval = args.int("maxSimsNeighborsForEval", 20)
val knownForTranspose =
KnownForSources.fromKeyVal(
DAL.readMostRecentSnapshot(knownForDataset, dateRange.extend(Days(7))).toTypedPipe,
modelVersion)
val knownFor = KnownForSources.transpose(knownForTranspose)
val cosineThreshold = args.double("cosineThreshold", 0.01)
val interestedIn =
DAL
.readMostRecentSnapshot(interestedInDataset, dateRange.extend(Days(7)))
.toTypedPipe
.map {
case KeyVal(userId, clustersUserIsInterestedIn) =>
(userId, clustersUserIsInterestedIn)
}
val sims = if (modelVersion == ModelVersions.Model20M145K2020) {
// The model version 20m_145k_2020 uses approximate_cosine_follow as the input sims graph
// to cluster users. The same graph is used to evaluate the clusters
TypedPipe
.from(FollowingsCosineSimilaritiesManhattanSource())
.map(_._2)
} else {
TypedPipe.from(
SimsCandidatesSource()(
dateRange = dateRange,
suffixPath = "/classified_candidates_rollup"
))
}
val resultExec = ClusterDetailsJob
.run(
interestedIn,
qtreeSemigroupKParameter,
modelVersion,
knownFor,
knownForTranspose,
Some(DAL.readMostRecentSnapshot(UsersourceFlatScalaDataset, dateRange).toTypedPipe),
Some(ExternalDataSources.inferredUserProducedLanguageSource),
Some(
ClusterDetailsJob.getTruncatedSims(sims, maxNeighbors = maxSimsNeighborsForEval)),
cosineThreshold
).flatMap { resultUnmapped =>
val clusterDetailsExec = resultUnmapped
.map {
case (clusterKey, details) =>
KeyVal(clusterKey, details)
}.writeDALVersionedKeyValExecution(
outputDataset,
D.Suffix(manhattanOutputPath)
)
val clusterDetailsLiteExec =
resultUnmapped
.map {
case ((_, clusterId), details)
if modelVersion == ModelVersions.Model20M145KDec11 =>
ClusterDetailsLite(
FullClusterId(ModelVersion.Model20m145kDec11, clusterId),
details.numUsersWithAnyNonZeroScore,
details.numUsersWithNonZeroFollowScore,
details.numUsersWithNonZeroFavScore,
details.knownForUsersAndScores.getOrElse(Nil)
)
case ((_, clusterId), details)
if modelVersion == ModelVersions.Model20M145KUpdated =>
ClusterDetailsLite(
FullClusterId(ModelVersion.Model20m145kUpdated, clusterId),
details.numUsersWithAnyNonZeroScore,
details.numUsersWithNonZeroFollowScore,
details.numUsersWithNonZeroFavScore,
details.knownForUsersAndScores.getOrElse(Nil)
)
case ((_, clusterId), details)
if modelVersion == ModelVersions.Model20M145K2020 =>
ClusterDetailsLite(
FullClusterId(ModelVersion.Model20m145k2020, clusterId),
details.numUsersWithAnyNonZeroScore,
details.numUsersWithNonZeroFollowScore,
details.numUsersWithNonZeroFavScore,
details.knownForUsersAndScores.getOrElse(Nil)
)
}.writeDALSnapshotExecution(
clusterDetailsLiteOutputDataset,
D.Daily,
D.Suffix(clusterDetailsLiteOutputPath),
D.EBLzo(),
dateRange.end)
Execution.zip(clusterDetailsExec, clusterDetailsLiteExec)
}
Util.printCounters(resultExec)
}
}
}
}
object ClusterDetailsBatch extends ClusterDetailsBatchTrait {
override val firstTime: String = "2018-07-28"
override val batchIncrement: Duration = Days(7)
override val manhattanOutputPath: String =
"/user/cassowary/manhattan_sequence_files/simclusters_v2_cluster_details"
override val clusterDetailsLiteOutputPath: String =
"/user/cassowary/processed/simclusters_v2_cluster_details_lite"
override val modelVersion: String = ModelVersions.Model20M145KDec11
override val knownForDataset = SimclustersV2KnownFor20M145KDec11ScalaDataset
override val interestedInDataset = SimclustersV2InterestedInScalaDataset
override val outputDataset = SimclustersV2ClusterDetailsScalaDataset
override val clusterDetailsLiteOutputDataset =
SimclustersV2ClusterDetailsLiteScalaDataset
}
object ClusterDetails20M145KUpdated extends ClusterDetailsBatchTrait {
override val firstTime: String = "2019-06-16"
override val batchIncrement: Duration = Days(7)
override val manhattanOutputPath: String =
"/user/cassowary/manhattan_sequence_files/simclusters_v2_cluster_details_20m_145k_updated"
override val clusterDetailsLiteOutputPath: String =
"/user/cassowary/processed/simclusters_v2_cluster_details_lite_20m_145k_updated"
override val modelVersion: String = ModelVersions.Model20M145KUpdated
override val knownForDataset = SimclustersV2KnownFor20M145KUpdatedScalaDataset
override val interestedInDataset = SimclustersV2InterestedIn20M145KUpdatedScalaDataset
override val outputDataset = SimclustersV2ClusterDetails20M145KUpdatedScalaDataset
override val clusterDetailsLiteOutputDataset =
SimclustersV2ClusterDetailsLite20M145KUpdatedScalaDataset
}
/**
* capesospy-v2 update --build_locally --start_cron cluster_details_20m_145k_2020 \
* src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml
*/
object ClusterDetails20M145K2020 extends ClusterDetailsBatchTrait {
override val firstTime: String = "2020-10-15"
override val batchIncrement: Duration = Days(7)
override val manhattanOutputPath: String =
"/user/cassowary/manhattan_sequence_files/simclusters_v2_cluster_details_20m_145k_2020"
override val clusterDetailsLiteOutputPath: String =
"/user/cassowary/processed/simclusters_v2_cluster_details_lite_20m_145k_2020"
override val modelVersion: String = ModelVersions.Model20M145K2020
override val knownForDataset = SimclustersV2KnownFor20M145K2020ScalaDataset
override val interestedInDataset = SimclustersV2InterestedIn20M145K2020ScalaDataset
override val outputDataset = SimclustersV2ClusterDetails20M145K2020ScalaDataset
override val clusterDetailsLiteOutputDataset =
SimclustersV2ClusterDetailsLite20M145K2020ScalaDataset
}
/**
scalding remote run --main-class com.twitter.simclusters_v2.scalding.DumpClusterDetailsAdhoc \
--target src/scala/com/twitter/simclusters_v2/scalding:cluster_details-dump \
--user recos-platform -- \
--date 2020-06-25 \
--clusterIds 5542 129677 48645 \
--inputDir /user/recos-platform/adhoc/your_ldap/cluster_details_inferred_lang
*/
object DumpClusterDetailsAdhoc extends TwitterExecutionApp {
def job: Execution[Unit] =
Execution.getConfigMode.flatMap {
case (config, mode) =>
Execution.withId { implicit uniqueId =>
val args = config.getArgs
val clusters = args.list("clusterIds").map(_.toInt).toSet // or e.g. (1 to 2500).toSet to dump a range
TypedPipe
.from(AdhocKeyValSources.clusterDetailsSource(args("inputDir")))
.filter { case ((modelVersion, clusterId), details) => clusters.contains(clusterId) }
.toIterableExecution
.map { iter =>
iter.foreach { x => println(Util.prettyJsonMapper.writeValueAsString(x)) }
}
}
}
}
/**
* ./bazel bundle src/scala/com/twitter/simclusters_v2/scalding:cluster_details && \
* oscar hdfs --user cassowary --host hadoopnest2.atla.twitter.com --bundle cluster_details \
* --tool com.twitter.simclusters_v2.scalding.DumpClusterSimilaritiesAdhoc --screen --screen-detached \
* --tee your_ldap/dumpClusterSimilarities_20200103 -- \
* --inputDir /user/cassowary/manhattan_sequence_files/simclusters_v2_cluster_details_20m_145k_updated/ \
* --outputDir adhoc/your_ldap
*/
object DumpClusterSimilaritiesAdhoc extends TwitterExecutionApp {
def job: Execution[Unit] =
Execution.getConfigMode.flatMap {
case (config, mode) =>
Execution.withId { implicit uniqueId =>
val args = config.getArgs
TypedPipe
.from(AdhocKeyValSources.clusterDetailsSource(args("inputDir")))
.flatMap {
case ((_, clusterId), details) =>
details.neighborClusters.getOrElse(Nil).map { neighbor =>
val compositeScore = (neighbor.followCosineSimilarity
.getOrElse(0.0) + neighbor.favCosineSimilarity.getOrElse(0.0)) / 2
(
clusterId,
neighbor.clusterId,
"%.4f".format(compositeScore)
)
}
}.writeExecution(TypedTsv(args("outputDir")))
}
}
}

View File

@ -1,607 +0,0 @@
package com.twitter.simclusters_v2.scalding
import com.twitter.algebird.Monoid
import com.twitter.algebird.mutable.PriorityQueueMonoid
import com.twitter.dal.client.dataset.KeyValDALDataset
import com.twitter.pluck.source.cassowary.FollowingsCosineSimilaritiesManhattanSource
import com.twitter.scalding._
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.job.TwitterExecutionApp
import com.twitter.scalding_internal.job.analytics_batch._
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.simclusters_v2.common.ModelVersions
import com.twitter.simclusters_v2.hdfs_sources._
import com.twitter.simclusters_v2.scalding.common.TypedRichPipe._
import com.twitter.simclusters_v2.scalding.common.Util
import com.twitter.simclusters_v2.scalding.common.Util.Distribution
import com.twitter.simclusters_v2.thriftscala.ClusterQuality
import com.twitter.simclusters_v2.thriftscala.ClustersUserIsKnownFor
import com.twitter.usersource.snapshot.flat.UsersourceFlatScalaDataset
import java.util.PriorityQueue
import scala.collection.JavaConverters._
object ClusterEvaluation {
val samplerMonoid: PriorityQueueMonoid[((Long, Long), (Double, Double))] =
Util.reservoirSamplerMonoidForPairs[(Long, Long), (Double, Double)](5000)(Util.edgeOrdering)
case class ClusterResults(
numEdgesInsideCluster: Int,
wtOfEdgesInsideCluster: Double,
numEdgesOutsideCluster: Int,
wtOfEdgesOutsideCluster: Double,
originalWtAndProductOfNodeScoresSample: PriorityQueue[((Long, Long), (Double, Double))]) {
def clusterQuality(clusterSize: Int, averagePrecisionWholeGraph: Double): ClusterQuality = {
val unweightedRecallDenominator = numEdgesInsideCluster + numEdgesOutsideCluster
val unweightedRecall = if (unweightedRecallDenominator > 0) {
numEdgesInsideCluster.toDouble / unweightedRecallDenominator.toDouble
} else 0.0
val weightedRecallDenominator = wtOfEdgesInsideCluster + wtOfEdgesOutsideCluster
val weightedRecall = if (weightedRecallDenominator > 0) {
wtOfEdgesInsideCluster / weightedRecallDenominator
} else 0.0
val precision = if (clusterSize > 1) {
Some(wtOfEdgesInsideCluster / (clusterSize * (clusterSize - 1)))
} else Some(0.0)
val relativePrecision = if (averagePrecisionWholeGraph > 0) {
precision.flatMap { p => Some(p / averagePrecisionWholeGraph) }
} else Some(0.0)
ClusterQuality(
unweightedRecall = Some(unweightedRecall),
weightedRecall = Some(weightedRecall),
unweightedRecallDenominator = Some(unweightedRecallDenominator),
weightedRecallDenominator = Some(weightedRecallDenominator),
relativePrecisionNumerator = precision,
relativePrecision = relativePrecision,
weightAndProductOfNodeScoresCorrelation = Some(
Util.computeCorrelation(
originalWtAndProductOfNodeScoresSample.iterator.asScala.map(_._2)))
)
}
}
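// Worked example (hypothetical counts): a cluster of size 10 whose members have 30 edges of total
// weight 6.0 staying inside the cluster and 70 edges of weight 14.0 leaving it gets
// unweightedRecall = 30 / 100 = 0.3, weightedRecall = 6.0 / 20.0 = 0.3 and
// precision = 6.0 / (10 * 9) ~= 0.067; relativePrecision divides that precision by
// averagePrecisionWholeGraph.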
object ClusterResultsMonoid extends Monoid[ClusterResults] {
override def zero = ClusterResults(0, 0, 0, 0, samplerMonoid.zero)
override def plus(l: ClusterResults, r: ClusterResults) = ClusterResults(
l.numEdgesInsideCluster + r.numEdgesInsideCluster,
l.wtOfEdgesInsideCluster + r.wtOfEdgesInsideCluster,
l.numEdgesOutsideCluster + r.numEdgesOutsideCluster,
l.wtOfEdgesOutsideCluster + r.wtOfEdgesOutsideCluster,
samplerMonoid
.plus(l.originalWtAndProductOfNodeScoresSample, r.originalWtAndProductOfNodeScoresSample)
)
}
/**
* Evaluate the quality of a cluster.
* @param memberScores A map with the members of the cluster as the keys and their scores
* inside the cluster as values. The more central a member is inside the cluster,
* the higher its score is.
* @param membersAdjLists A map that gives the weighted neighbors of each member in the cluster.
*/
def evaluateCluster(
memberScores: Map[Long, Double],
membersAdjLists: Map[Long, Map[Long, Float]]
): ClusterResults = {
val resultsIter = membersAdjLists.flatMap {
case (fromNodeId, adjList) =>
val fromNodeWt = memberScores.getOrElse(fromNodeId, 0.0)
adjList.map {
case (toNodeId, edgeWt) =>
if (memberScores.contains(toNodeId)) {
val productOfMembershipScores = fromNodeWt * memberScores(toNodeId)
ClusterResults(
1,
edgeWt,
0,
0,
samplerMonoid.build(
((fromNodeId, toNodeId), (edgeWt.toDouble, productOfMembershipScores))))
} else {
ClusterResults(0, 0, 1, edgeWt, samplerMonoid.zero)
}
}
}
Monoid.sum(resultsIter)(ClusterResultsMonoid)
}
/**
* Evaluate each cluster with respect to the provided graph.
* @param graph graph represented via the adjacency lists of each node, needs to be symmetrized i.e. if u is in v's adjlist, then v needs to be in u's adjlist as well
* @param clusters cluster memberships of each node.
* @param statsPrefix convenience argument to act as prefix for stats counters
* @return key-value pipe with clusterId as key and (size of the cluster, quality struct) as value
*/
def clusterLevelEvaluation(
graph: TypedPipe[(Long, Map[Long, Float])],
clusters: TypedPipe[(Long, Array[(Int, Float)])],
statsPrefix: String = ""
)(
implicit uniqueId: UniqueID
): Execution[TypedPipe[(Int, (Int, ClusterQuality))]] = {
val numRealClusters = Stat(s"${statsPrefix}/numRealClusters")
val numFakeClusters = Stat(s"${statsPrefix}/numFakeClusters")
val numNodesAndEdgesExec = graph
.map {
case (nId, nbrMap) =>
(1L, nbrMap.size.toLong, nbrMap.values.sum.toDouble)
}.sum.getExecution
numNodesAndEdgesExec.map {
case (numNodes, numEdges, sumOfAllEdgeWts) =>
println("numNodes " + numNodes)
println("numEdges " + numEdges)
println("sumOfAllEdgeWts " + sumOfAllEdgeWts)
val numFakeClustersForUnassignedNodes = numNodes / 1e4
val averagePrecisionWholeGraph = sumOfAllEdgeWts / (numNodes * (numNodes - 1))
graph
.leftJoin(clusters)
// adjust the reducer count below as needed (e.g. for adhoc runs)
.withReducers(200)
.flatMap {
case (nodeId, (adjList, assignedClustersOpt)) =>
val nodeDegree = adjList.size.toLong
val nodeWeightedDegree = adjList.values.sum
assignedClustersOpt match {
case Some(assignedClusters) if assignedClusters.nonEmpty =>
assignedClusters.toList.map {
case (clusterId, scoreOfNodeInCluster) =>
(
clusterId,
(
Map(nodeId -> (scoreOfNodeInCluster.toDouble, adjList)),
1,
nodeDegree,
nodeWeightedDegree))
}
case _ =>
// For nodes that don't belong to any cluster, create a fake clusterId (0 or less)
// and add the node's statistics to that clusterId. We don't need the adjacency lists for
// unassigned nodes; we simply track how many edges are incident on those nodes, their weighted sum, etc.
val fakeClusterId =
(-1 * (math.abs(
Util.hashToLong(nodeId)) % numFakeClustersForUnassignedNodes)).toInt
List(
(
fakeClusterId,
(
Map.empty[Long, (Double, Map[Long, Float])],
1,
nodeDegree,
nodeWeightedDegree)))
}
}
.sumByKey
// adjust the reducer count below as needed (e.g. for adhoc runs)
.withReducers(60)
.map {
case (clusterId, (membersMap, clusterSize, volumeOfCluster, weightedVolumeOfCluster)) =>
if (clusterId > 0) {
numRealClusters.inc()
val scoresMap = membersMap.mapValues(_._1)
val adjListsMap = membersMap.mapValues(_._2)
val quality = evaluateCluster(scoresMap, adjListsMap)
.clusterQuality(clusterSize, averagePrecisionWholeGraph)
(clusterId, (clusterSize, quality))
} else {
// clusterId <= 0 means that this is a fake cluster.
numFakeClusters.inc()
(
clusterId,
(
clusterSize,
ClusterQuality(
unweightedRecallDenominator = Some(volumeOfCluster),
weightedRecallDenominator = Some(weightedVolumeOfCluster)
)
)
)
}
}
}
}
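// Example (hypothetical scale): with numNodes = 20,000,000 there are numNodes / 1e4 = 2,000 fake
// clusters, so each unassigned node hashes to a fakeClusterId in [-1999, 0]; those ids carry only
// degree statistics and end up with recall denominators but no recall numerators.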
case class OverallResults(
unweightedRecall: Double,
edgesInsideClusters: Long,
allEdges: Long,
allNodes: Int,
weightedRecall: Double,
wtOnEdgesInsideClusters: Double,
wtOnAllEdges: Double,
weightCorrelation: Double,
relativePrecision: Double,
numUnassignedNodes: Int,
numAssignedNodes: Int,
sizeDist: Distribution,
recallDist: Distribution,
weightedRecallDist: Distribution,
relativePrecisionDist: Distribution,
weightCorrelationDist: Distribution,
numClustersWithNegativeCorrelation: Double,
numClustersWithZeroRecall: Double,
numClustersWithLessThanOneRelativePrecision: Double,
numSingletonClusters: Int)
def summarizePerClusterResults(
perClusterResults: TypedPipe[(Int, (Int, ClusterQuality))]
): Execution[Option[OverallResults]] = {
perClusterResults
.map {
case (clusterId, (size, quality)) =>
val unweightedRecallDen = quality.unweightedRecallDenominator.getOrElse(0.0)
val unweightedRecallNum = quality.unweightedRecall.getOrElse(0.0) * unweightedRecallDen
val weightedRecallDen = quality.weightedRecallDenominator.getOrElse(0.0)
val weightedRecallNum = quality.weightedRecall.getOrElse(0.0) * weightedRecallDen
val weightCorrelationDen = size
val weightCorrelationNum =
weightCorrelationDen * quality.weightAndProductOfNodeScoresCorrelation
.getOrElse(0.0)
val relativePrecisionDen = size
val relativePrecisionNum = relativePrecisionDen * quality.relativePrecision.getOrElse(0.0)
val numClustersWithNegativeCorrelation =
if (weightCorrelationNum < 0 && clusterId > 0) 1 else 0
val numClustersWithLessThanOneRelativePrecision =
if (quality.relativePrecision.getOrElse(0.0) < 1 && clusterId > 0) 1 else 0
val numClustersWithZeroRecall = if (weightedRecallNum < 1e-5 && clusterId > 0) 1 else 0
val numUnassignedNodes = if (clusterId < 1) size else 0
val numAssignedNodes = if (clusterId > 0) size else 0
val numSingletonClusters = if (clusterId > 0 && size == 1) 1 else 0
(
unweightedRecallDen,
unweightedRecallNum,
weightedRecallDen,
weightedRecallNum,
weightCorrelationDen,
weightCorrelationNum,
relativePrecisionDen,
relativePrecisionNum,
numClustersWithNegativeCorrelation,
numClustersWithLessThanOneRelativePrecision,
numClustersWithZeroRecall,
List(size.toDouble),
List(quality.unweightedRecall.getOrElse(0.0)),
List(quality.weightedRecall.getOrElse(0.0)),
List(quality.relativePrecision.getOrElse(0.0)),
List(quality.weightAndProductOfNodeScoresCorrelation.getOrElse(0.0)),
numUnassignedNodes,
numAssignedNodes,
numSingletonClusters
)
}
.sum
.toOptionExecution
.map { opt =>
opt.map {
case (
unweightedRecallDen,
unweightedRecallNum,
weightedRecallDen,
weightedRecallNum,
weightCorrelationDen,
weightCorrelationNum,
relativePrecisionDen,
relativePrecisionNum,
numClustersWithNegativeCorrelation,
numClustersWithLessThanOneRelativePrecision,
numClustersWithZeroRecall,
sizeList,
unweightedRecallList,
weightedRecallList,
relativePrecisionList,
weightCorrelationList,
numUnassignedNodes,
numAssignedNodes,
numSingletonClusters) =>
OverallResults(
unweightedRecall = unweightedRecallNum / unweightedRecallDen,
edgesInsideClusters = unweightedRecallNum.toLong,
allEdges = unweightedRecallDen.toLong,
allNodes = numAssignedNodes + numUnassignedNodes,
weightedRecall = weightedRecallNum / weightedRecallDen,
wtOnEdgesInsideClusters = weightedRecallNum,
wtOnAllEdges = weightedRecallDen,
weightCorrelation = weightCorrelationNum / weightCorrelationDen,
relativePrecision = relativePrecisionNum / relativePrecisionDen,
numAssignedNodes = numAssignedNodes,
numUnassignedNodes = numUnassignedNodes,
sizeDist = Util.distributionFromArray(sizeList.toArray),
recallDist = Util.distributionFromArray(unweightedRecallList.toArray),
weightedRecallDist = Util.distributionFromArray(weightedRecallList.toArray),
weightCorrelationDist = Util.distributionFromArray(weightCorrelationList.toArray),
relativePrecisionDist = Util.distributionFromArray(relativePrecisionList.toArray),
numClustersWithNegativeCorrelation = numClustersWithNegativeCorrelation,
numClustersWithLessThanOneRelativePrecision =
numClustersWithLessThanOneRelativePrecision,
numClustersWithZeroRecall = numClustersWithZeroRecall,
numSingletonClusters = numSingletonClusters
)
}
}
}
/**
* @param graph Input similarity graph, needs to be symmetrized i.e. if u is in v's adjlist, then v needs to be in u's adjlist as well
* @param clusters cluster assignments to be evaluated
* @return summary of results
*/
def overallEvaluation(
graph: TypedPipe[(Long, Map[Long, Float])],
clusters: TypedPipe[(Long, Array[(Int, Float)])],
statsPrefix: String
)(
implicit uniqueId: UniqueID
): Execution[Option[OverallResults]] = {
clusterLevelEvaluation(graph, clusters, statsPrefix).flatMap(summarizePerClusterResults)
}
}
/**
* ./bazel bundle src/scala/com/twitter/simclusters_v2/scalding:cluster_evaluation && \
* oscar hdfs --user frigate --host hadoopnest1.atla.twitter.com --bundle cluster_evaluation \
* --tool com.twitter.simclusters_v2.scalding.ClusterEvaluationAdhoc --screen --screen-detached \
* --tee logs/clusterQualityFor_updatedUnnormalizedInputScores_usingSims20190318 -- \
* --simsInputDir /user/frigate/your_ldap/commonDirForClusterEvaluation/classifiedSims_20190314_copiedFromAtlaProc \
* --topK 20000000 --date 2019-03-18 --minActiveFollowers 400 \
* --topUsersDir /user/frigate/your_ldap/commonDirForClusterEvaluation/top20MUsers_minActiveFollowers400_20190215 \
* --maxSimsNeighborsForEval 40 \
* --preparedSimsGraph /user/frigate/your_ldap/commonDirForClusterEvaluation/symmetrized_classifiedSims20190318_top20MUsers \
* --outputDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/knownForClusterEvaluation \
* --knownForDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/knownFor
*/
object ClusterEvaluationAdhoc extends TwitterExecutionApp {
implicit val tz: java.util.TimeZone = DateOps.UTC
implicit val dp = DateParser.default
def job: Execution[Unit] =
Execution.getConfigMode.flatMap {
case (config, mode) =>
Execution.withId { implicit uniqueId =>
val args = config.getArgs
val knownFor = args
.optional("knownForDir").map { location =>
KnownForSources.readKnownFor(location)
}.getOrElse(KnownForSources.knownFor_20M_Dec11_145K)
val minActiveFollowers = args.int("minActiveFollowers", 400)
val topK = args.int("topK")
val date = DateRange.parse(args("date"))
val topUsersExec =
TopUsersSimilarityGraph
.topUsers(
DAL.readMostRecentSnapshot(UsersourceFlatScalaDataset, date).toTypedPipe,
minActiveFollowers,
topK
)
.map(_.id)
.count("num_top_users")
.make(TypedTsv(args("topUsersDir")))
val simsGraphExec = topUsersExec.flatMap { topUsers =>
TopUsersSimilarityGraph.makeGraph(
TopUsersSimilarityGraph.getSubgraphFromUserGroupedInput(
TypedPipe.from(WTFCandidatesSource(args("simsInputDir"))),
topUsers,
args.int("maxSimsNeighborsForEval", 40),
degreeThresholdForStat = 5
),
args("preparedSimsGraph")
)
}
val fullExec = simsGraphExec.flatMap { sims =>
ClusterEvaluation
.clusterLevelEvaluation(sims, knownFor, "eval")
.flatMap { clusterResultsPipe =>
val clusterResults = clusterResultsPipe.forceToDiskExecution
val outputExec = clusterResults.flatMap { pipe =>
pipe
.map {
case (clusterId, (clusterSize, quality)) =>
"%d\t%d\t%.2g\t%.2g\t%.1f\t%.2g\t%.2f\t%.2g\t%.2g"
.format(
clusterId,
clusterSize,
quality.unweightedRecall.getOrElse(0.0),
quality.weightedRecall.getOrElse(0.0),
quality.unweightedRecallDenominator.getOrElse(0.0),
quality.weightedRecallDenominator.getOrElse(0.0),
quality.relativePrecision.getOrElse(0.0),
quality.relativePrecisionNumerator.getOrElse(0.0),
quality.weightAndProductOfNodeScoresCorrelation.getOrElse(0.0)
)
}.writeExecution(TypedTsv(args("outputDir")))
}
val printExec = clusterResults.flatMap { pipe =>
ClusterEvaluation.summarizePerClusterResults(pipe).map {
case Some(res) =>
println("Overall results: " + Util.prettyJsonMapper.writeValueAsString(res))
case None =>
println("No overall results!!! Probably cluster results pipe is empty.")
}
}
Execution.zip(outputExec, printExec)
}
}
Util.printCounters(fullExec)
}
}
}
trait ClusterEvaluationBatch extends TwitterScheduledExecutionApp {
implicit val tz: java.util.TimeZone = DateOps.UTC
implicit val dp = DateParser.default
def firstTime: String
def batchDescription: String
def batchIncrement: Duration
private lazy val execArgs = AnalyticsBatchExecutionArgs(
batchDesc = BatchDescription(batchDescription),
firstTime = BatchFirstTime(RichDate(firstTime)),
lastTime = None,
batchIncrement = BatchIncrement(batchIncrement)
)
val emailAddress: String = "no-reply@twitter.com"
def knownForDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]]
def knownForModelVersion: String
def baselineKnownForDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]]
def baselineKnownForModelVersion: String
override def scheduledJob: Execution[Unit] =
AnalyticsBatchExecution(execArgs) { implicit dateRange =>
Execution.withId { implicit uniqueId =>
Execution.withArgs { args =>
val baselineKnownFor =
KnownForSources.fromKeyVal(
DAL
.readMostRecentSnapshot(baselineKnownForDALDataset, dateRange.prepend(Days(7)))
.toTypedPipe,
baselineKnownForModelVersion
)
val knownFor =
KnownForSources.fromKeyVal(
DAL
.readMostRecentSnapshot(knownForDALDataset, dateRange.prepend(Days(7)))
.toTypedPipe,
knownForModelVersion
)
val inputSimsGraph = TypedPipe
.from(FollowingsCosineSimilaritiesManhattanSource())
.map(_._2)
val minActiveFollowers = args.int("minActiveFollowers")
val topK = args.int("topK")
val maxSimsNeighborsForEval =
args.int("maxSimsNeighborsForEval", 40)
val topUsers = TopUsersSimilarityGraph
.topUsers(
DAL
.readMostRecentSnapshot(UsersourceFlatScalaDataset, dateRange)
.toTypedPipe,
minActiveFollowers,
topK
)
.map(_.id)
.count("num_top_users")
TopUsersSimilarityGraph
.getSubgraphFromUserGroupedInput(
fullGraph = inputSimsGraph,
usersToInclude = topUsers,
maxNeighborsPerNode = maxSimsNeighborsForEval,
degreeThresholdForStat = 2
)
.forceToDiskExecution
.flatMap { symmetrizedSims =>
val baselineResultsExec = ClusterEvaluation
.overallEvaluation(symmetrizedSims, baselineKnownFor, "baselineKnownForEval")
val newResultsExec = ClusterEvaluation
.overallEvaluation(symmetrizedSims, knownFor, "newKnownForEval")
val minSizeOfBiggerClusterForComparison = 10
val compareExec = CompareClusters.summarize(
CompareClusters.compare(
KnownForSources.transpose(baselineKnownFor),
KnownForSources.transpose(knownFor),
minSizeOfBiggerCluster = minSizeOfBiggerClusterForComparison
))
Execution
.zip(baselineResultsExec, newResultsExec, compareExec)
.map {
case (oldResults, newResults, compareResults) =>
val emailText =
s"Evaluation Results for baseline knownFor: $baselineKnownForModelVersion \n" +
Util.prettyJsonMapper.writeValueAsString(oldResults) +
"\n\n-------------------\n\n" +
s"Evaluation Results for new knownFor:$knownForModelVersion\n" +
Util.prettyJsonMapper.writeValueAsString(newResults) +
"\n\n-------------------\n\n" +
s"Cosine similarity distribution between $baselineKnownForModelVersion and " +
s"$knownForModelVersion cluster membership vectors for " +
s"clusters with at least $minSizeOfBiggerClusterForComparison members:\n" +
Util.prettyJsonMapper
.writeValueAsString(compareResults)
Util
.sendEmail(
emailText,
s"Evaluation results comparing $knownForModelVersion with baseline $baselineKnownForModelVersion",
emailAddress)
()
}
}
}
}
}
}
/**
* capesospy-v2 update --build_locally --start_cron cluster_evaluation_for_20M_145k \
* src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml
*/
object ClusterEvaluationFor20M145K extends ClusterEvaluationBatch {
override val firstTime: String = "2019-06-11"
override val batchIncrement: Duration = Days(7)
override val batchDescription = "com.twitter.simclusters_v2.scalding.ClusterEvaluationFor20M145K"
override val knownForDALDataset = SimclustersV2KnownFor20M145KUpdatedScalaDataset
override val knownForModelVersion = ModelVersions.Model20M145KUpdated
override val baselineKnownForDALDataset = SimclustersV2KnownFor20M145KDec11ScalaDataset
override val baselineKnownForModelVersion = ModelVersions.Model20M145KDec11
}
/**
* capesospy-v2 update --build_locally --start_cron cluster_evaluation_for_20M_145k_2020 \
* src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml
*/
object ClusterEvaluationFor20M145K2020 extends ClusterEvaluationBatch {
override val firstTime: String = "2021-01-25"
override val batchIncrement: Duration = Days(7)
override val batchDescription =
"com.twitter.simclusters_v2.scalding.ClusterEvaluationFor20M145K2020"
override val knownForDALDataset = SimclustersV2KnownFor20M145K2020ScalaDataset
override val knownForModelVersion = ModelVersions.Model20M145K2020
override val baselineKnownForDALDataset = SimclustersV2KnownFor20M145KUpdatedScalaDataset
override val baselineKnownForModelVersion = ModelVersions.Model20M145KUpdated
}

View File

@ -1,131 +0,0 @@
package com.twitter.simclusters_v2.scalding
import com.twitter.scalding.{DateOps, DateParser, Execution, Stat, TypedPipe, TypedTsv, UniqueID}
import com.twitter.scalding_internal.job.TwitterExecutionApp
import com.twitter.simclusters_v2.common.{ClusterId, UserId}
import com.twitter.simclusters_v2.scalding.common.Util
import com.twitter.simclusters_v2.scalding.common.Util.Distribution
object CompareClusters {
def norm(a: Iterable[Float]): Float = {
math
.sqrt(a.map { x => x * x }.sum).toFloat
}
def cosine(a: Map[Long, Float], b: Map[Long, Float]): Float = {
val intersect = a.toList.collect {
case (id, score) if b.contains(id) =>
score * b(id)
}
val dot = if (intersect.nonEmpty) intersect.sum else 0
val aNorm = norm(a.values)
val bNorm = norm(b.values)
if (aNorm > 0 && bNorm > 0) {
dot / aNorm / bNorm
} else 0
}
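// Minimal example (hypothetical membership vectors):
// {{{
//   val a = Map(1L -> 1.0f, 2L -> 1.0f)
//   val b = Map(2L -> 1.0f, 3L -> 1.0f)
//   cosine(a, b) // = 1.0 / (sqrt(2) * sqrt(2)) = 0.5
// }}}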
/**
* Compare two known-for data sets and generate stats on changes in cluster assignment
*/
def compareClusterAssignments(
newKnownFor: TypedPipe[(UserId, List[(ClusterId, Float)])],
oldKnownFor: TypedPipe[(UserId, List[(ClusterId, Float)])]
)(
implicit uniqueID: UniqueID
): Execution[String] = {
val emptyToSomething = Stat("no_assignment_to_some")
val somethingToEmpty = Stat("some_assignment_to_none")
val emptyToEmpty = Stat("empty_to_empty")
val sameCluster = Stat("same_cluster")
val diffCluster = Stat("diff_cluster")
val calculateStatExec = newKnownFor
.outerJoin(oldKnownFor)
.map {
case (userId, (newKnownForListOpt, oldKnownForListOpt)) =>
val newKnownFor = newKnownForListOpt.getOrElse(Nil)
val oldKnownFor = oldKnownForListOpt.getOrElse(Nil)
if (newKnownFor.nonEmpty && oldKnownFor.isEmpty) {
emptyToSomething.inc()
}
if (newKnownFor.isEmpty && oldKnownFor.nonEmpty) {
somethingToEmpty.inc()
}
if (newKnownFor.isEmpty && oldKnownFor.isEmpty) {
emptyToEmpty.inc()
}
if (newKnownFor.nonEmpty && oldKnownFor.nonEmpty) {
val newClusterId = newKnownFor.head._1
val oldClusterId = oldKnownFor.head._1
if (newClusterId == oldClusterId) {
sameCluster.inc()
} else {
diffCluster.inc()
}
}
userId
}
.toIterableExecution
Util.getCustomCountersString(calculateStatExec)
}
/**
* Compare two cluster assignments in terms of cosine similarity of corresponding clusters.
* Excludes clusters that are too small.
* @param knownForA first assignment: clusterId -> members and their scores
* @param knownForB second assignment: clusterId -> members and their scores
* @param minSizeOfBiggerCluster minimum size of the bigger of the two corresponding clusters for the pair to be compared (e.g. 10)
* @return pipe of (clusterId, cosine similarity of the two clusters' membership vectors)
*/
def compare(
knownForA: TypedPipe[(Int, List[(Long, Float)])],
knownForB: TypedPipe[(Int, List[(Long, Float)])],
minSizeOfBiggerCluster: Int
): TypedPipe[(Int, Float)] = {
knownForA
.outerJoin(knownForB)
.collect {
case (clusterId, (membersInAOpt, membersInBOpt))
if membersInAOpt.exists(_.size >= minSizeOfBiggerCluster) || membersInBOpt
.exists(_.size >= minSizeOfBiggerCluster) =>
val membersInA =
membersInAOpt.map(_.toMap).getOrElse(Map.empty[Long, Float])
val membersInB =
membersInBOpt.map(_.toMap).getOrElse(Map.empty[Long, Float])
(clusterId, cosine(membersInA, membersInB))
}
}
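// Usage sketch (tiny, illustrative inputs):
// {{{
//   val a = TypedPipe.from(Seq(1 -> List(10L -> 1.0f, 11L -> 1.0f)))
//   val b = TypedPipe.from(Seq(1 -> List(10L -> 1.0f, 12L -> 1.0f)))
//   compare(a, b, minSizeOfBiggerCluster = 2) // emits (1, 0.5f): the clusters share one of two members
// }}}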
def summarize(clusterToCosines: TypedPipe[(Int, Float)]): Execution[Option[Distribution]] = {
clusterToCosines.values.map(x => List(x)).sum.toOptionExecution.map { listOpt =>
listOpt.map { list => Util.distributionFromArray(list.map(_.toDouble).toArray) }
}
}
}
object CompareClustersAdhoc extends TwitterExecutionApp {
implicit val tz: java.util.TimeZone = DateOps.UTC
implicit val dp = DateParser.default
def job: Execution[Unit] =
Execution.getConfigMode.flatMap {
case (config, mode) =>
Execution.withId { implicit uniqueId =>
val args = config.getArgs
val knownForA = KnownForSources.transpose(KnownForSources.readKnownFor(args("knownForA")))
val knownForB = KnownForSources.transpose(KnownForSources.readKnownFor(args("knownForB")))
CompareClusters
.compare(knownForA, knownForB, minSizeOfBiggerCluster = 10)
.map { case (cId, cos) => "%d\t%.2f".format(cId, cos) }
.writeExecution(TypedTsv(args("outputDir")))
}
}
}

View File

@ -1,330 +0,0 @@
package com.twitter.simclusters_v2.scalding
import com.twitter.algebird.Monoid
import com.twitter.logging.Logger
import com.twitter.scalding.{Execution, TypedPipe, TypedTsv}
import com.twitter.scalding_internal.job.TwitterExecutionApp
import com.twitter.simclusters_v2.hdfs_sources.AdhocKeyValSources
import java.util
import no.uib.cipr.matrix.Matrix
import no.uib.cipr.matrix.sparse.{ArpackSym, LinkedSparseMatrix}
import scala.collection.JavaConverters._
object EigenVectorsForSparseSymmetric {
val log: Logger = Logger()
/**
* Construct a matrix from its rows, specified as a map. The outer map is indexed by rowId, and the inner maps are indexed by columnId.
* Note that the input matrix is intended to be symmetric.
*
* @param map A map specifying the rows of the matrix. The outer map is indexed by rowId, and the inner maps are indexed by columnId. Both rows and columns are zero-indexed.
* @param nRows number of rows in matrix
* @param nCols number of columns in matrix
*
* @return the constructed matrix
*/
def getMatrix(map: Map[Int, Map[Int, Double]], nRows: Int, nCols: Int): Matrix = {
val nonzeros = map.toSeq.flatMap {
case (i, subMap) =>
subMap.toSeq.map {
case (j, value) =>
(i, j, value)
}
}
getMatrix(nonzeros, nRows, nCols)
}
/**
* Construct matrix from iterable of the non-zero entries. Note that the input matrix is intended to be symmetric.
*
* @param nonzeros non-zeros in (i, j, v) format, where i is row, j is column, and v is value. Both rows and columns are zero-indexed.
* @param nRows number of rows in matrix
* @param nCols number of columns in matrix
*
* @return the constructed matrix
*/
def getMatrix(nonzeros: Iterable[(Int, Int, Double)], nRows: Int, nCols: Int): Matrix = {
val matrix = new LinkedSparseMatrix(nRows, nCols)
var numEntries = 0
var maxRow = 0
var maxCol = 0
nonzeros.foreach {
case (i, j, v) =>
if (i > maxRow) {
maxRow = i
}
if (j > maxCol) {
maxCol = j
}
numEntries += 1
matrix.set(i, j, v)
}
log.info(
"Finished building matrix with %d entries and maxRow %d and maxCol %d"
.format(numEntries, maxRow, maxCol))
matrix
}
/**
* Prints out various diagnostics about how much the given matrix differs from a perfectly
* symmetric matrix. If (i,j) and (j,i) are different, it sets both of them to be the max of the two.
* Call this function before invoking EVD.
*
* @param matrix Matrix which is modified (if need be) in place.
*/
def ensureMatrixIsSymmetric(matrix: Matrix): Unit = {
var numUnequalEntries = 0
var numEntriesDifferentBy1Percent = 0
var numEqualEntries = 0
var numUnequalDueToZero = 0
var maxUnequal = (0, 0, 0.0, 0.0)
matrix.iterator().asScala.foreach { entry =>
val curr = entry.get()
val opp = matrix.get(entry.column(), entry.row())
if (curr == opp) {
numEqualEntries += 1
} else {
numUnequalEntries += 1
if (opp == 0) {
numUnequalDueToZero += 1
}
if (opp != 0 && (math.abs(curr - opp) / math.min(curr, opp)) > 0.01) {
numEntriesDifferentBy1Percent += 1
}
if (opp != 0 && math.abs(curr - opp) > maxUnequal._4) {
maxUnequal = (entry.row(), entry.column(), curr, math.abs(curr - opp))
}
val max = math.max(curr, opp)
matrix.set(entry.column(), entry.row(), max)
matrix.set(entry.row(), entry.column(), max)
}
}
var numUnEqualPrinted = 0
matrix.iterator().asScala.foreach { entry =>
val opp = matrix.get(entry.column(), entry.row())
if (numUnEqualPrinted < 10 && entry.get() != opp) {
numUnEqualPrinted += 1
log.info(
"Entries for (%d, %d) are %s and %s"
.format(entry.row(), entry.column(), entry.get(), opp))
}
}
log.info(
"Num unequal entries: %d, num unequal due to zero: %d, num unequal by 1percent or more: %d, num equal entries: %d, maxUnequal: %s"
.format(
numUnequalEntries,
numUnequalDueToZero,
numEntriesDifferentBy1Percent,
numEqualEntries,
maxUnequal))
}
/**
* Get the top-k eigenvalues and eigenvectors of an input matrix, where "top" means largest in magnitude.
* Input matrix needs to be perfectly symmetric; if it's not, this function will fail.
*
* Many of the eigenvectors will have very small values along most of the dimensions, so this method
* retains only the larger entries in each eigenvector.
*
* @param matrix symmetric input matrix.
* @param k how many of the top eigenvectors to get.
* @param ratioToLargestCutoff An entry needs to be at least 1/ratioToLargestCutoff of the biggest entry in that vector to be retained.
*
* @return seq of (eigenvalue, eigenvector) pairs.
*/
def getTruncatedEVD(
matrix: Matrix,
k: Int,
ratioToLargestCutoff: Float
): Seq[(Double, Seq[(Int, Double)])] = {
val solver = new ArpackSym(matrix)
val resultsMap = solver.solve(k, ArpackSym.Ritz.LM).asScala.toMap
val results = resultsMap.toIndexedSeq.sortBy { case (eigValue, _) => -eigValue }
results.zipWithIndex.map {
case ((eigValue, denseVectorJava), index) =>
val denseVector = new Array[Double](denseVectorJava.size())
denseVector.indices.foreach { index => denseVector(index) = denseVectorJava.get(index) }
val denseVectorMax = denseVector.maxBy { entry => math.abs(entry) }
val cutOff = math.abs(denseVectorMax) / ratioToLargestCutoff
val significantEntries = denseVector.zipWithIndex
.filter { case (vectorEntry, _) => math.abs(vectorEntry) >= cutOff }
.sortBy { case (vectorEntry, _) => -1 * math.abs(vectorEntry) }
(eigValue.toDouble, significantEntries.toSeq.map(_.swap))
}
}
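// Usage sketch (toy symmetric matrix; sizes and values are illustrative):
// {{{
//   val m = getMatrix(Seq((0, 1, 0.5), (1, 0, 0.5), (1, 2, 0.3), (2, 1, 0.3)), nRows = 3, nCols = 3)
//   ensureMatrixIsSymmetric(m)
//   val evd = getTruncatedEVD(m, k = 2, ratioToLargestCutoff = 100f)
//   // evd: Seq of (eigenvalue, sparse eigenvector as (index, entry) pairs), largest |eigenvalue| first
// }}}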
/**
* Compute U*Diag*Ut - where Diag is a diagonal matrix, and U is a sparse matrix.
* This is primarily for testing - to make sure that the computed eigenvectors can be used to
* reconstruct the original matrix up to some reasonable approximation.
*
* @param diagToUColumns seq of (diagonal entries, associated column in U)
* @param cutoff cutoff for including a value in the result.
*
* @return result of multiplication, returned as a map of the rows in the results.
*/
def uTimesDiagTimesUT(
diagToUColumns: Seq[(Double, Seq[(Int, Double)])],
cutoff: Double
): Map[Int, Map[Int, Double]] = {
val result = new util.HashMap[Int, util.HashMap[Int, Double]]()
diagToUColumns.foreach {
case (diag, uColumn) =>
uColumn.foreach {
case (i, iVal) =>
uColumn.foreach {
case (j, jVal) =>
val prod = diag * iVal * jVal
if (result.containsKey(i)) {
val newVal = if (result.get(i).containsKey(j)) {
result.get(i).get(j) + prod
} else prod
result.get(i).put(j, newVal)
} else {
result.put(i, new util.HashMap[Int, Double])
result.get(i).put(j, prod)
}
}
}
}
val unfiltered = result.asScala.toMap.mapValues(_.asScala.toMap)
unfiltered
.mapValues { m => m.filter { case (_, value) => math.abs(value) >= cutoff } }
.filter { case (_, vector) => vector.nonEmpty }
}
/** Note: This requires a full EVD to correctly compute the inverse! :-( */
def getInverseFromEVD(
evd: Seq[(Double, Seq[(Int, Double)])],
cutoff: Double
): Map[Int, Map[Int, Double]] = {
val evdInverse = evd.map {
case (eigValue, eigVector) =>
(1.0 / eigValue, eigVector)
}
uTimesDiagTimesUT(evdInverse, cutoff)
}
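// Worked example (2x2 diagonal, exact): for the matrix [[2, 0], [0, 4]] the full EVD is
// Seq((2.0, Seq(0 -> 1.0)), (4.0, Seq(1 -> 1.0))), and getInverseFromEVD(evd, 0.0) returns
// Map(0 -> Map(0 -> 0.5), 1 -> Map(1 -> 0.25)), i.e. the inverse of the diagonal.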
}
object PCAProjectionMatrixAdhoc extends TwitterExecutionApp {
val log = Logger()
def job: Execution[Unit] =
Execution.getConfigMode.flatMap {
case (config, _) =>
Execution.withId { _ =>
val args = config.getArgs
val k = args.int("k", 100)
val ratioToLargestEntryInVectorCutoff = args.int("ratioToLargestEntryInVectorCutoff", 100)
val minClusterFavers = args.int("minClusterFavers", 1000)
val input = TypedPipe.from(AdhocKeyValSources.clusterDetailsSource(args("inputDir")))
val outputDir = args("outputDir")
val filteredClustersExec =
input
.collect {
case ((_, clusterId), details)
if details.numUsersWithNonZeroFavScore > minClusterFavers =>
clusterId
}
.toIterableExecution
.map { fc =>
val fcSet = fc.toSet
log.info("Number of clusters with favers more than %d is %d"
.format(minClusterFavers, fcSet.size))
fcSet
}
filteredClustersExec
.flatMap { filteredClusters =>
input.flatMap {
case ((_, clusterId), details) =>
if (filteredClusters(clusterId)) {
details.neighborClusters.getOrElse(Nil).collect {
case neighbor
if filteredClusters(
neighbor.clusterId) && neighbor.favCosineSimilarity.isDefined =>
(clusterId, neighbor.clusterId, neighbor.favCosineSimilarity.get)
}
} else Nil
}.toIterableExecution
}
.flatMap { edgesIter =>
val edges = edgesIter.toSeq
val oldIdToNewId = edges
.flatMap { case (i, j, _) => Seq(i, j) }
.distinct
.zipWithIndex
.toMap
val mapString = oldIdToNewId.toList
.take(5).map {
case (old, nw) =>
Seq(old, nw).mkString(" ")
}.mkString("\n")
log.info("A few entries of OldId to NewId map is")
log.info(mapString)
val newIdToOldId = oldIdToNewId.map(_.swap)
log.info(
"Num clusters after filtering out those with no neighbors with favers more than %d is %d"
.format(minClusterFavers, oldIdToNewId.size))
val newEdges = edges.map {
case (oldI, oldJ, value) =>
(oldIdToNewId(oldI), oldIdToNewId(oldJ), value)
}
log.info("Going to build matrix")
val matrix = EigenVectorsForSparseSymmetric.getMatrix(
newEdges,
oldIdToNewId.size,
oldIdToNewId.size)
EigenVectorsForSparseSymmetric.ensureMatrixIsSymmetric(matrix)
log.info("Going to solve now for %d eigenvalues".format(k))
val tic = System.currentTimeMillis()
val results = EigenVectorsForSparseSymmetric.getTruncatedEVD(
matrix,
k,
ratioToLargestEntryInVectorCutoff)
val toc = System.currentTimeMillis()
log.info("Finished solving in %.2f minutes".format((toc - tic) / 1000 / 60.0))
val eigValues = results.map(_._1).map { x => "%.3g".format(x) }.mkString(" ")
val eigValueNorm = math.sqrt(results.map(_._1).map(x => x * x).sum)
val matrixNorm = math.sqrt(matrix.iterator().asScala.map(_.get()).map(x => x * x).sum)
println(
"matrixNorm %s, eigValueNorm %s, explained fraction %s"
.format(matrixNorm, eigValueNorm, eigValueNorm / matrixNorm))
log.info("The eigenvalues are:")
log.info(eigValues)
val nnzInEigenVectors = results.map(_._2.size).sum
log.info("Average nnz per eigenvector using ratioToLargestCutoff %d is %.2g"
.format(ratioToLargestEntryInVectorCutoff, nnzInEigenVectors * 1.0 / results.size))
val transposedRaw = results.zipWithIndex.flatMap {
case ((_, eigVector), eigIndex) =>
eigVector.map {
case (index, vectorEntry) =>
val clusterId = newIdToOldId(index)
Map(clusterId -> List((eigIndex, vectorEntry)))
}
}
val transposed = Monoid.sum(transposedRaw).mapValues { rowForCluster =>
rowForCluster
.map {
case (dimId, weight) =>
"%d:%.2g".format(dimId, weight)
}.mkString(" ")
}
TypedPipe.from(transposed.toSeq).writeExecution(TypedTsv(outputDir))
}
}
}
}

View File

@ -1,332 +0,0 @@
package com.twitter.simclusters_v2.scalding
import com.twitter.dal.client.dataset.KeyValDALDataset
import com.twitter.dal.client.dataset.SnapshotDALDataset
import com.twitter.scalding._
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.DALWrite.D
import com.twitter.scalding_internal.dalv2.DALWrite.WriteExtension
import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossClusterSameDC
import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossDC
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.simclusters_v2.common.ClusterId
import com.twitter.simclusters_v2.common.ModelVersions
import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.hdfs_sources.AdhocKeyValSources
import com.twitter.simclusters_v2.hdfs_sources.AggregatableProducerSimclustersEmbeddingsByLogFavScore2020ScalaDataset
import com.twitter.simclusters_v2.hdfs_sources.SimclustersV2InterestedInFromAggregatableProducerEmbeddings20M145K2020ScalaDataset
import com.twitter.simclusters_v2.hdfs_sources.SimclustersV2UserToInterestedInFromAggregatableProducerEmbeddings20M145K2020ScalaDataset
import com.twitter.simclusters_v2.hdfs_sources.UserAndNeighborsFixedPathSource
import com.twitter.simclusters_v2.hdfs_sources.UserUserNormalizedGraphScalaDataset
import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn
import com.twitter.simclusters_v2.thriftscala.InternalId
import com.twitter.simclusters_v2.thriftscala.ModelVersion
import com.twitter.simclusters_v2.thriftscala.SimClustersEmbedding
import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId
import com.twitter.simclusters_v2.thriftscala.UserAndNeighbors
import com.twitter.simclusters_v2.thriftscala.UserToInterestedInClusterScores
import com.twitter.simclusters_v2.thriftscala.UserToInterestedInClusters
import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp
import com.twitter.wtf.scalding.jobs.common.ScheduledExecutionApp
import java.util.TimeZone
/**
* Production job for computing interestedIn data set from the aggregatable producer embeddings for the model version 20M145K2020.
* It writes the data set in KeyVal format to produce a MH DAL data set.
*
* A high level description of this job:
* - Read the APE dataset
* - Apply log1p to the scores from the above dataset, since the raw producer scores are large
* - Normalize the scores for each producer (offline benchmarking has shown better results from this step; a minimal sketch follows this comment)
* - Truncate the number of clusters for each producer from the APE dataset to reduce noise
* - Compute interestedIn
*
* To deploy the job:
*
* capesospy-v2 update --build_locally --start_cron interested_in_from_ape_2020 \
* src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml
*/
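// A minimal sketch (not the production code, which lives in
// InterestedInFromAggregatableProducerEmbeddingsBase.run) of the log1p + normalize step described
// above, for a single producer's cluster scores. The L2 normalization here is an assumption for
// illustration; the exact normalization is defined in the run helper.
// {{{
//   def log1pAndNormalize(scores: Seq[(Int, Double)]): Seq[(Int, Double)] = {
//     val damped = scores.map { case (clusterId, score) => (clusterId, math.log1p(score)) }
//     val norm = math.sqrt(damped.map { case (_, s) => s * s }.sum)
//     if (norm > 0) damped.map { case (clusterId, s) => (clusterId, s / norm) } else damped
//   }
//   // log1pAndNormalize(Seq(1 -> 100.0, 2 -> 10.0)) ~= Seq(1 -> 0.887, 2 -> 0.461)
// }}}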
object InterestedInFromAPE2020BatchApp extends InterestedInFromAggregatableProducerEmbeddingsBase {
override val firstTime: RichDate = RichDate("2021-03-03")
override val batchIncrement: Duration = Days(7)
override def modelVersion: ModelVersion = ModelVersion.Model20m145k2020
override def producerEmbeddingsInputKVDataset: KeyValDALDataset[
KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]
] = AggregatableProducerSimclustersEmbeddingsByLogFavScore2020ScalaDataset
override def interestedInFromAPEOutputKVDataset: KeyValDALDataset[
KeyVal[UserId, ClustersUserIsInterestedIn]
] = SimclustersV2InterestedInFromAggregatableProducerEmbeddings20M145K2020ScalaDataset
override def interestedInFromAPEOutputThriftDatset: SnapshotDALDataset[
UserToInterestedInClusters
] = SimclustersV2UserToInterestedInFromAggregatableProducerEmbeddings20M145K2020ScalaDataset
}
trait InterestedInFromAggregatableProducerEmbeddingsBase extends ScheduledExecutionApp {
def modelVersion: ModelVersion
def interestedInFromAPEOutputKVDataset: KeyValDALDataset[
KeyVal[UserId, ClustersUserIsInterestedIn]
]
def producerEmbeddingsInputKVDataset: KeyValDALDataset[
KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]
]
def interestedInFromAPEOutputThriftDatset: SnapshotDALDataset[UserToInterestedInClusters]
override def runOnDateRange(
args: Args
)(
implicit dateRange: DateRange,
timeZone: TimeZone,
uniqueID: UniqueID
): Execution[Unit] = {
//Input args for the run
val socialProofThreshold = args.int("socialProofThreshold", 2)
val maxClustersFromProducer = args.int("maxClustersPerProducer", 5)
val maxClustersPerUserFinalResult = args.int("maxInterestedInClustersPerUser", 200)
//Path variables
val interestedInFromProducersPath =
s"/user/cassowary/manhattan_sequence_files/interested_in_from_ape/" + modelVersion
val interestedInFromProducersThriftPath =
s"/user/cassowary/manhattan_sequence_files/interested_in_from_ape_thrift/" + modelVersion
val userUserGraph: TypedPipe[UserAndNeighbors] =
DAL
.readMostRecentSnapshotNoOlderThan(UserUserNormalizedGraphScalaDataset, Days(30))
.withRemoteReadPolicy(AllowCrossDC)
.toTypedPipe
val producerEmbeddings = DAL
.readMostRecentSnapshotNoOlderThan(
producerEmbeddingsInputKVDataset,
Days(30)).withRemoteReadPolicy(AllowCrossClusterSameDC).toTypedPipe.map {
case KeyVal(producer, embeddings) => (producer, embeddings)
}
val result = InterestedInFromAggregatableProducerEmbeddingsBase.run(
userUserGraph,
producerEmbeddings,
maxClustersFromProducer,
socialProofThreshold,
maxClustersPerUserFinalResult,
modelVersion)
val keyValExec =
result
.map { case (userId, clusters) => KeyVal(userId, clusters) }
.writeDALVersionedKeyValExecution(
interestedInFromAPEOutputKVDataset,
D.Suffix(interestedInFromProducersPath)
)
val thriftExec =
result
.map {
case (userId, clusters) =>
UserToInterestedInClusters(
userId,
ModelVersions.toKnownForModelVersion(modelVersion),
clusters.clusterIdToScores)
}
.writeDALSnapshotExecution(
interestedInFromAPEOutputThriftDataset,
D.Daily,
D.Suffix(interestedInFromProducersThriftPath),
D.EBLzo(),
dateRange.end
)
Execution.zip(keyValExec, thriftExec).unit
}
}
/**
* Adhoc job to generate the interestedIn from aggregatable producer embeddings for the model version 20M145K2020
*
* scalding remote run \
* --user cassowary \
* --keytab /var/lib/tss/keys/fluffy/keytabs/client/cassowary.keytab \
 * --principal service_account@TWITTER.BIZ \
* --cluster bluebird-qus1 \
* --main-class com.twitter.simclusters_v2.scalding.InterestedInFromAPE2020AdhocApp \
* --target src/scala/com/twitter/simclusters_v2/scalding:interested_in_from_ape_2020-adhoc \
* --hadoop-properties "mapreduce.map.memory.mb=8192 mapreduce.map.java.opts='-Xmx7618M' mapreduce.reduce.memory.mb=8192 mapreduce.reduce.java.opts='-Xmx7618M'" \
* -- --outputDir /gcs/user/cassowary/adhoc/your_ldap/interested_in_from_ape_2020_keyval --date 2021-03-05
*/
object InterestedInFromAPE2020AdhocApp extends AdhocExecutionApp {
override def runOnDateRange(
args: Args
)(
implicit dateRange: DateRange,
timeZone: TimeZone,
uniqueID: UniqueID
): Execution[Unit] = {
val outputDir = args("outputDir")
val socialProofThreshold = args.int("socialProofThreshold", 2)
val maxClustersPerUserFinalResult = args.int("maxInterestedInClustersPerUser", 200)
val maxClustersFromProducer = args.int("maxClustersFromProducer", 5)
val inputGraph = args.optional("graphInputDir") match {
case Some(inputDir) => TypedPipe.from(UserAndNeighborsFixedPathSource(inputDir))
case None =>
DAL
.readMostRecentSnapshotNoOlderThan(UserUserNormalizedGraphScalaDataset, Days(30))
.withRemoteReadPolicy(AllowCrossClusterSameDC)
.toTypedPipe
}
val producerEmbeddings = DAL
.readMostRecentSnapshotNoOlderThan(
AggregatableProducerSimclustersEmbeddingsByLogFavScore2020ScalaDataset,
Days(30)).withRemoteReadPolicy(AllowCrossClusterSameDC).toTypedPipe.map {
case KeyVal(producer, embeddings) => (producer, embeddings)
}
val result = InterestedInFromAggregatableProducerEmbeddingsBase.run(
inputGraph,
producerEmbeddings,
maxClustersFromProducer,
socialProofThreshold,
maxClustersPerUserFinalResult,
ModelVersion.Model20m145k2020)
result
.writeExecution(AdhocKeyValSources.interestedInSource(outputDir))
}
}
/**
* Helper functions
*/
object InterestedInFromAggregatableProducerEmbeddingsBase {
/**
* Helper function to prune the embeddings
* @param embeddingsWithScore embeddings
* @param maxClusters number of clusters to keep, per userId
* @param uniqueId for stats
* @return
*/
def getPrunedEmbeddings(
embeddingsWithScore: TypedPipe[(UserId, Seq[(ClusterId, Float)])],
maxClusters: Int
)(
implicit uniqueId: UniqueID
): TypedPipe[(UserId, Array[(ClusterId, Float)])] = {
val numProducerMappings = Stat("num_producer_embeddings_total")
val numProducersWithLargeClusterMappings = Stat(
"num_producers_with_more_clusters_than_threshold")
val numProducersWithSmallClusterMappings = Stat(
"num_producers_with_clusters_less_than_threshold")
val totalClustersCoverageProducerEmbeddings = Stat("num_clusters_total_producer_embeddings")
embeddingsWithScore.map {
case (producerId, clusterArray) =>
numProducerMappings.inc()
val clusterSize = clusterArray.size
totalClustersCoverageProducerEmbeddings.incBy(clusterSize)
val prunedList = if (clusterSize > maxClusters) {
numProducersWithLargeClusterMappings.inc()
clusterArray
.sortBy {
case (_, knownForScore) => -knownForScore
}.take(maxClusters)
} else {
numProducersWithSmallClusterMappings.inc()
clusterArray
}
(producerId, prunedList.toArray)
}
}
/**
 * Helper function to remove all scores except follow and logFav
* @param interestedInResult interestedIn clusters for a user
* @return
*/
def getInterestedInDiscardScores(
interestedInResult: TypedPipe[(UserId, List[(ClusterId, UserToInterestedInClusterScores)])]
): TypedPipe[(UserId, List[(ClusterId, UserToInterestedInClusterScores)])] = {
interestedInResult.map {
case (srcId, fullClusterList) =>
val fullClusterListWithDiscardedScores = fullClusterList.map {
case (clusterId, clusterDetails) =>
val clusterDetailsWithoutSocial = UserToInterestedInClusterScores(
// We are not planning to use the other scores except for logFav and follow.
// Hence, we set the others to None for now; we can add them back when needed.
followScore = clusterDetails.followScore,
logFavScore = clusterDetails.logFavScore,
logFavScoreClusterNormalizedOnly = clusterDetails.logFavScoreClusterNormalizedOnly
)
(clusterId, clusterDetailsWithoutSocial)
}
(srcId, fullClusterListWithDiscardedScores)
}
}
/**
* Helper function to normalize the embeddings
* @param embeddings cluster embeddings
* @return
*/
def getNormalizedEmbeddings(
embeddings: TypedPipe[(UserId, Seq[(ClusterId, Float)])]
): TypedPipe[(UserId, Seq[(ClusterId, Float)])] = {
embeddings.map {
case (userId, clustersWithScores) =>
val l2norm = math.sqrt(clustersWithScores.map(_._2).map(score => score * score).sum)
(
userId,
clustersWithScores.map {
case (clusterId, score) => (clusterId, (score / l2norm).toFloat)
})
}
}
def run(
userUserGraph: TypedPipe[UserAndNeighbors],
producerEmbeddings: TypedPipe[(SimClustersEmbeddingId, SimClustersEmbedding)],
maxClustersFromProducer: Int,
socialProofThreshold: Int,
maxClustersPerUserFinalResult: Int,
modelVersion: ModelVersion
)(
implicit uniqueId: UniqueID
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
import InterestedInFromKnownFor._
val producerEmbeddingsWithScore: TypedPipe[(UserId, Seq[(ClusterId, Float)])] =
producerEmbeddings.map {
case (
SimClustersEmbeddingId(embeddingType, modelVersion, InternalId.UserId(producerId)),
simclusterEmbedding) =>
(
producerId,
simclusterEmbedding.embedding.map { simclusterWithScore =>
// The APE dataset has very high producer scores, hence we apply log to smooth them out
// before computing interestedIn
(simclusterWithScore.clusterId, math.log(1.0 + simclusterWithScore.score).toFloat)
})
}
val result = keepOnlyTopClusters(
getInterestedInDiscardScores(
attachNormalizedScores(
userClusterPairsWithoutNormalization(
userUserGraph,
getPrunedEmbeddings(
getNormalizedEmbeddings(producerEmbeddingsWithScore),
maxClustersFromProducer),
socialProofThreshold,
))),
maxClustersPerUserFinalResult,
ModelVersions.toKnownForModelVersion(modelVersion)
)
result
}
}
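/**
 * A minimal, framework-free sketch of the score preparation described above, assuming a toy
 * producer whose cluster ids and raw APE scores are made up: apply log(1 + score) to smooth the
 * high producer scores, L2-normalize per producer, then keep only the top clusters. It mirrors
 * what run() does with getNormalizedEmbeddings and getPrunedEmbeddings, but on plain collections
 * rather than TypedPipes.
 */
object ApeScorePreparationSketch {
  def prepareScores(clusters: Seq[(Int, Double)], maxClusters: Int): Seq[(Int, Float)] = {
    // log1p smoothing, since the raw APE producer scores are high
    val smoothed = clusters.map { case (clusterId, score) => (clusterId, math.log(1.0 + score)) }
    // L2-normalize the smoothed scores for this producer
    val l2norm = math.sqrt(smoothed.map { case (_, score) => score * score }.sum)
    val normalized = smoothed.map { case (clusterId, score) => (clusterId, (score / l2norm).toFloat) }
    // keep only the highest-scoring clusters to reduce noise
    normalized.sortBy { case (_, score) => -score }.take(maxClusters)
  }
  def main(args: Array[String]): Unit = {
    // a toy producer known for clusters 1, 2 and 3 with raw scores 100, 10 and 1
    println(prepareScores(Seq((1, 100.0), (2, 10.0), (3, 1.0)), maxClusters = 2))
  }
}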

View File

@@ -1,666 +0,0 @@
package com.twitter.simclusters_v2.scalding
import com.twitter.algebird.Semigroup
import com.twitter.bijection.Injection
import com.twitter.dal.client.dataset.KeyValDALDataset
import com.twitter.scalding.TypedPipe
import com.twitter.scalding._
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.DALWrite._
import com.twitter.scalding_internal.job.TwitterExecutionApp
import com.twitter.scalding_internal.job.analytics_batch.AnalyticsBatchExecution
import com.twitter.scalding_internal.job.analytics_batch.AnalyticsBatchExecutionArgs
import com.twitter.scalding_internal.job.analytics_batch.BatchDescription
import com.twitter.scalding_internal.job.analytics_batch.BatchFirstTime
import com.twitter.scalding_internal.job.analytics_batch.BatchIncrement
import com.twitter.scalding_internal.job.analytics_batch.TwitterScheduledExecutionApp
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.simclusters_v2.common.ClusterId
import com.twitter.simclusters_v2.common.ModelVersions
import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.hdfs_sources._
import com.twitter.simclusters_v2.scalding.common.Util
import com.twitter.simclusters_v2.thriftscala._
/**
* This file implements the job for computing users' interestedIn vector from KnownFor data set.
*
* It reads the UserUserNormalizedGraphScalaDataset to get user-user follow + fav graph, and then
* based on the known-for clusters of each followed/faved user, we calculate how much a user is
* interestedIn a cluster.
*/
/**
* Production job for computing interestedIn data set for the model version 20M145K2020.
*
* To deploy the job:
*
* capesospy-v2 update --build_locally --start_cron interested_in_for_20M_145k_2020 \
 *   src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml
*/
object InterestedInFromKnownFor20M145K2020 extends InterestedInFromKnownForBatchBase {
override val firstTime: String = "2020-10-06"
override val outputKVDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsInterestedIn]] =
SimclustersV2RawInterestedIn20M145K2020ScalaDataset
override val outputPath: String = InternalDataPaths.RawInterestedIn2020Path
override val knownForModelVersion: String = ModelVersions.Model20M145K2020
override val knownForDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]] =
SimclustersV2KnownFor20M145K2020ScalaDataset
}
/**
 * Base class for the main logic of computing interestedIn from the KnownFor data set.
*/
trait InterestedInFromKnownForBatchBase extends TwitterScheduledExecutionApp {
implicit val tz = DateOps.UTC
implicit val parser = DateParser.default
def firstTime: String
val batchIncrement: Duration = Days(7)
val lookBackDays: Duration = Days(30)
def outputKVDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsInterestedIn]]
def outputPath: String
def knownForModelVersion: String
def knownForDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]]
private lazy val execArgs = AnalyticsBatchExecutionArgs(
batchDesc = BatchDescription(this.getClass.getName.replace("$", "")),
firstTime = BatchFirstTime(RichDate(firstTime)),
lastTime = None,
batchIncrement = BatchIncrement(batchIncrement)
)
override def scheduledJob: Execution[Unit] = AnalyticsBatchExecution(execArgs) {
implicit dateRange =>
Execution.withId { implicit uniqueId =>
Execution.withArgs { args =>
val normalizedGraph =
DAL.readMostRecentSnapshot(UserUserNormalizedGraphScalaDataset).toTypedPipe
val knownFor = KnownForSources.fromKeyVal(
DAL.readMostRecentSnapshot(knownForDALDataset, dateRange.extend(Days(30))).toTypedPipe,
knownForModelVersion
)
val socialProofThreshold = args.int("socialProofThreshold", 2)
val maxClustersPerUser = args.int("maxClustersPerUser", 50)
val result = InterestedInFromKnownFor
.run(
normalizedGraph,
knownFor,
socialProofThreshold,
maxClustersPerUser,
knownForModelVersion
)
val writeKeyValResultExec = result
.map { case (userId, clusters) => KeyVal(userId, clusters) }
.writeDALVersionedKeyValExecution(
outputKVDataset,
D.Suffix(outputPath)
)
// read previous data set for validation purpose
val previousDataset = if (RichDate(firstTime).timestamp != dateRange.start.timestamp) {
DAL
.readMostRecentSnapshot(outputKVDataset, dateRange.prepend(lookBackDays)).toTypedPipe
.map {
case KeyVal(user, interestedIn) =>
(user, interestedIn)
}
} else {
TypedPipe.empty
}
Util.printCounters(
Execution
.zip(
writeKeyValResultExec,
InterestedInFromKnownFor.dataSetStats(result, "NewResult"),
InterestedInFromKnownFor.dataSetStats(previousDataset, "OldResult")
).unit
)
}
}
}
}
/**
* Adhoc job to compute user interestedIn.
*
* scalding remote run --target src/scala/com/twitter/simclusters_v2/scalding:interested_in_adhoc \
* --user recos-platform \
* --submitter hadoopnest2.atla.twitter.com \
* --main-class com.twitter.simclusters_v2.scalding.InterestedInFromKnownForAdhoc -- \
* --date 2019-08-26 --outputDir /user/recos-platform/adhoc/simclusters_interested_in_log_fav
*/
object InterestedInFromKnownForAdhoc extends TwitterExecutionApp {
def job: Execution[Unit] =
Execution.getConfigMode.flatMap {
case (config, mode) =>
Execution.withId { implicit uniqueId =>
val args = config.getArgs
val normalizedGraph = TypedPipe.from(
UserAndNeighborsFixedPathSource(args("graphInputDir"))
)
val socialProofThreshold = args.int("socialProofThreshold", 2)
val maxClustersPerUser = args.int("maxClustersPerUser", 20)
val knownForModelVersion = args("knownForModelVersion")
val knownFor = KnownForSources.readKnownFor(args("knownForInputDir"))
val outputSink = AdhocKeyValSources.interestedInSource(args("outputDir"))
Util.printCounters(
InterestedInFromKnownFor
.run(
normalizedGraph,
knownFor,
socialProofThreshold,
maxClustersPerUser,
knownForModelVersion
).writeExecution(outputSink)
)
}
}
}
/**
* Adhoc job to check the output of an adhoc interestedInSource.
*/
object DumpInterestedInAdhoc extends TwitterExecutionApp {
def job: Execution[Unit] =
Execution.getConfigMode.flatMap {
case (config, mode) =>
Execution.withId { implicit uniqueId =>
val args = config.getArgs
val users = args.list("users").map(_.toLong).toSet
val input = TypedPipe.from(AdhocKeyValSources.interestedInSource(args("inputDir")))
input.filter { case (userId, rec) => users.contains(userId) }.toIterableExecution.map {
s => println(s.map(Util.prettyJsonMapper.writeValueAsString).mkString("\n"))
}
}
}
}
/**
* Helper functions
*/
object InterestedInFromKnownFor {
private def ifNanMake0(x: Double): Double = if (x.isNaN) 0.0 else x
case class SrcClusterIntermediateInfo(
followScore: Double,
followScoreProducerNormalized: Double,
favScore: Double,
favScoreProducerNormalized: Double,
logFavScore: Double,
logFavScoreProducerNormalized: Double,
followSocialProof: List[Long],
favSocialProof: List[Long]) {
// overriding for the sake of unit tests
override def equals(obj: scala.Any): Boolean = {
obj match {
case that: SrcClusterIntermediateInfo =>
math.abs(followScore - that.followScore) < 1e-5 &&
math.abs(followScoreProducerNormalized - that.followScoreProducerNormalized) < 1e-5 &&
math.abs(favScore - that.favScore) < 1e-5 &&
math.abs(favScoreProducerNormalized - that.favScoreProducerNormalized) < 1e-5 &&
math.abs(logFavScore - that.logFavScore) < 1e-5 &&
math.abs(logFavScoreProducerNormalized - that.logFavScoreProducerNormalized) < 1e-5 &&
followSocialProof.toSet == that.followSocialProof.toSet &&
favSocialProof.toSet == that.favSocialProof.toSet
case _ => false
}
}
}
implicit object SrcClusterIntermediateInfoSemigroup
extends Semigroup[SrcClusterIntermediateInfo] {
override def plus(
left: SrcClusterIntermediateInfo,
right: SrcClusterIntermediateInfo
): SrcClusterIntermediateInfo = {
SrcClusterIntermediateInfo(
followScore = left.followScore + right.followScore,
followScoreProducerNormalized =
left.followScoreProducerNormalized + right.followScoreProducerNormalized,
favScore = left.favScore + right.favScore,
favScoreProducerNormalized =
left.favScoreProducerNormalized + right.favScoreProducerNormalized,
logFavScore = left.logFavScore + right.logFavScore,
logFavScoreProducerNormalized =
left.logFavScoreProducerNormalized + right.logFavScoreProducerNormalized,
followSocialProof =
Semigroup.plus(left.followSocialProof, right.followSocialProof).distinct,
favSocialProof = Semigroup.plus(left.favSocialProof, right.favSocialProof).distinct
)
}
}
/**
* @param adjacencyLists User-User follow/fav graph
* @param knownFor KnownFor data set. Each user can be known for several clusters with certain
* knownFor weights.
 * @param socialProofThreshold A user will only be interested in a cluster if they follow/fav at
 * least a certain number of users known for this cluster.
 * @param uniqueId required for the Stat counters
* @return
*/
def userClusterPairsWithoutNormalization(
adjacencyLists: TypedPipe[UserAndNeighbors],
knownFor: TypedPipe[(Long, Array[(Int, Float)])],
socialProofThreshold: Int
)(
implicit uniqueId: UniqueID
): TypedPipe[((Long, Int), SrcClusterIntermediateInfo)] = {
val edgesToUsersWithKnownFor = Stat("num_edges_to_users_with_known_for")
val srcDestClusterTriples = Stat("num_src_dest_cluster_triples")
val srcClusterPairsBeforeSocialProofThresholding =
Stat("num_src_cluster_pairs_before_social_proof_thresholding")
val srcClusterPairsAfterSocialProofThresholding =
Stat("num_src_cluster_pairs_after_social_proof_thresholding")
val edges = adjacencyLists.flatMap {
case UserAndNeighbors(srcId, neighborsWithWeights) =>
neighborsWithWeights.map { neighborWithWeights =>
(
neighborWithWeights.neighborId,
neighborWithWeights.copy(neighborId = srcId)
)
}
}
implicit val l2b: Long => Array[Byte] = Injection.long2BigEndian
edges
.sketch(4000)
.join(knownFor)
.flatMap {
case (destId, (srcWithWeights, clusterArray)) =>
edgesToUsersWithKnownFor.inc()
clusterArray.toList.map {
case (clusterId, knownForScoreF) =>
val knownForScore = math.max(0.0, knownForScoreF.toDouble)
srcDestClusterTriples.inc()
val followScore =
if (srcWithWeights.isFollowed.contains(true)) knownForScore else 0.0
val followScoreProducerNormalizedOnly =
srcWithWeights.followScoreNormalizedByNeighborFollowersL2.getOrElse(
0.0) * knownForScore
val favScore =
srcWithWeights.favScoreHalfLife100Days.getOrElse(0.0) * knownForScore
val favScoreProducerNormalizedOnly =
srcWithWeights.favScoreHalfLife100DaysNormalizedByNeighborFaversL2.getOrElse(
0.0) * knownForScore
val logFavScore = srcWithWeights.logFavScore.getOrElse(0.0) * knownForScore
val logFavScoreProducerNormalizedOnly = srcWithWeights.logFavScoreL2Normalized
.getOrElse(0.0) * knownForScore
val followSocialProof = if (srcWithWeights.isFollowed.contains(true)) {
List(destId)
} else Nil
val favSocialProof = if (srcWithWeights.favScoreHalfLife100Days.exists(_ > 0)) {
List(destId)
} else Nil
(
(srcWithWeights.neighborId, clusterId),
SrcClusterIntermediateInfo(
followScore,
followScoreProducerNormalizedOnly,
favScore,
favScoreProducerNormalizedOnly,
logFavScore,
logFavScoreProducerNormalizedOnly,
followSocialProof,
favSocialProof
)
)
}
}
.sumByKey
.withReducers(10000)
.filter {
case ((_, _), SrcClusterIntermediateInfo(_, _, _, _, _, _, followProof, favProof)) =>
srcClusterPairsBeforeSocialProofThresholding.inc()
val distinctSocialProof = (followProof ++ favProof).toSet
val result = distinctSocialProof.size >= socialProofThreshold
if (result) {
srcClusterPairsAfterSocialProofThresholding.inc()
}
result
}
}
/**
* Add the cluster-level l2 norm scores, and use them to normalize follow/fav scores.
*/
def attachNormalizedScores(
intermediate: TypedPipe[((Long, Int), SrcClusterIntermediateInfo)]
)(
implicit uniqueId: UniqueID
): TypedPipe[(Long, List[(Int, UserToInterestedInClusterScores)])] = {
def square(x: Double): Double = x * x
val clusterCountsAndNorms =
intermediate
.map {
case (
(_, clusterId),
SrcClusterIntermediateInfo(
followScore,
followScoreProducerNormalizedOnly,
favScore,
favScoreProducerNormalizedOnly,
logFavScore,
logFavScoreProducerNormalizedOnly,
_,
_
)
) =>
(
clusterId,
(
1,
square(followScore),
square(followScoreProducerNormalizedOnly),
square(favScore),
square(favScoreProducerNormalizedOnly),
square(logFavScore),
square(logFavScoreProducerNormalizedOnly)
)
)
}
.sumByKey
// .withReducers(100)
.map {
case (
clusterId,
(
cnt,
squareFollowScore,
squareFollowScoreProducerNormalizedOnly,
squareFavScore,
squareFavScoreProducerNormalizedOnly,
squareLogFavScore,
squareLogFavScoreProducerNormalizedOnly
)) =>
(
clusterId,
(
cnt,
math.sqrt(squareFollowScore),
math.sqrt(squareFollowScoreProducerNormalizedOnly),
math.sqrt(squareFavScore),
math.sqrt(squareFavScoreProducerNormalizedOnly),
math.sqrt(squareLogFavScore),
math.sqrt(squareLogFavScoreProducerNormalizedOnly)
))
}
implicit val i2b: Int => Array[Byte] = Injection.int2BigEndian
intermediate
.map {
case ((srcId, clusterId), clusterScoresTuple) =>
(clusterId, (srcId, clusterScoresTuple))
}
.sketch(reducers = 900)
.join(clusterCountsAndNorms)
.map {
case (
clusterId,
(
(
srcId,
SrcClusterIntermediateInfo(
followScore,
followScoreProducerNormalizedOnly,
favScore,
favScoreProducerNormalizedOnly,
logFavScore,
logFavScoreProducerNormalizedOnly, // not used for now
followProof,
favProof
)
),
(
cnt,
followNorm,
followProducerNormalizedNorm,
favNorm,
favProducerNormalizedNorm,
logFavNorm,
logFavProducerNormalizedNorm // not used for now
)
)
) =>
(
srcId,
List(
(
clusterId,
UserToInterestedInClusterScores(
followScore = Some(ifNanMake0(followScore)),
followScoreClusterNormalizedOnly = Some(ifNanMake0(followScore / followNorm)),
followScoreProducerNormalizedOnly =
Some(ifNanMake0(followScoreProducerNormalizedOnly)),
followScoreClusterAndProducerNormalized = Some(
ifNanMake0(followScoreProducerNormalizedOnly / followProducerNormalizedNorm)),
favScore = Some(ifNanMake0(favScore)),
favScoreClusterNormalizedOnly = Some(ifNanMake0(favScore / favNorm)),
favScoreProducerNormalizedOnly = Some(ifNanMake0(favScoreProducerNormalizedOnly)),
favScoreClusterAndProducerNormalized =
Some(ifNanMake0(favScoreProducerNormalizedOnly / favProducerNormalizedNorm)),
usersBeingFollowed = Some(followProof),
usersThatWereFaved = Some(favProof),
numUsersInterestedInThisClusterUpperBound = Some(cnt),
logFavScore = Some(ifNanMake0(logFavScore)),
logFavScoreClusterNormalizedOnly = Some(ifNanMake0(logFavScore / logFavNorm))
))
)
)
}
.sumByKey
// .withReducers(1000)
.toTypedPipe
}
/**
 * Aggregate cluster scores for each user; to be used instead of attachNormalizedScores
 * when we do not want to compute cluster-level L2 norm scores
*/
def groupClusterScores(
intermediate: TypedPipe[((Long, Int), SrcClusterIntermediateInfo)]
)(
implicit uniqueId: UniqueID
): TypedPipe[(Long, List[(Int, UserToInterestedInClusterScores)])] = {
intermediate
.map {
case (
(srcId, clusterId),
SrcClusterIntermediateInfo(
followScore,
followScoreProducerNormalizedOnly,
favScore,
favScoreProducerNormalizedOnly,
logFavScore,
logFavScoreProducerNormalizedOnly,
followProof,
favProof
)
) =>
(
srcId,
List(
(
clusterId,
UserToInterestedInClusterScores(
followScore = Some(ifNanMake0(followScore)),
followScoreProducerNormalizedOnly =
Some(ifNanMake0(followScoreProducerNormalizedOnly)),
favScore = Some(ifNanMake0(favScore)),
favScoreProducerNormalizedOnly = Some(ifNanMake0(favScoreProducerNormalizedOnly)),
usersBeingFollowed = Some(followProof),
usersThatWereFaved = Some(favProof),
logFavScore = Some(ifNanMake0(logFavScore)),
))
)
)
}
.sumByKey
.withReducers(1000)
.toTypedPipe
}
/**
* For each user, only keep up to a certain number of clusters.
* @param allInterests user with a list of interestedIn clusters.
* @param maxClustersPerUser number of clusters to keep for each user
* @param knownForModelVersion known for model version
 * @param uniqueId required for the Stat counters
* @return
*/
def keepOnlyTopClusters(
allInterests: TypedPipe[(Long, List[(Int, UserToInterestedInClusterScores)])],
maxClustersPerUser: Int,
knownForModelVersion: String
)(
implicit uniqueId: UniqueID
): TypedPipe[(Long, ClustersUserIsInterestedIn)] = {
val userClusterPairsBeforeUserTruncation =
Stat("num_user_cluster_pairs_before_user_truncation")
val userClusterPairsAfterUserTruncation =
Stat("num_user_cluster_pairs_after_user_truncation")
val usersWithALotOfClusters =
Stat(s"num_users_with_more_than_${maxClustersPerUser}_clusters")
allInterests
.map {
case (srcId, fullClusterList) =>
userClusterPairsBeforeUserTruncation.incBy(fullClusterList.size)
val truncatedClusters = if (fullClusterList.size > maxClustersPerUser) {
usersWithALotOfClusters.inc()
fullClusterList
.sortBy {
case (_, clusterScores) =>
(
-clusterScores.favScore.getOrElse(0.0),
-clusterScores.logFavScore.getOrElse(0.0),
-clusterScores.followScore.getOrElse(0.0),
-clusterScores.logFavScoreClusterNormalizedOnly.getOrElse(0.0),
-clusterScores.followScoreProducerNormalizedOnly.getOrElse(0.0)
)
}
.take(maxClustersPerUser)
} else {
fullClusterList
}
userClusterPairsAfterUserTruncation.incBy(truncatedClusters.size)
(srcId, ClustersUserIsInterestedIn(knownForModelVersion, truncatedClusters.toMap))
}
}
def run(
adjacencyLists: TypedPipe[UserAndNeighbors],
knownFor: TypedPipe[(UserId, Array[(ClusterId, Float)])],
socialProofThreshold: Int,
maxClustersPerUser: Int,
knownForModelVersion: String
)(
implicit uniqueId: UniqueID
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
keepOnlyTopClusters(
attachNormalizedScores(
userClusterPairsWithoutNormalization(
adjacencyLists,
knownFor,
socialProofThreshold
)
),
maxClustersPerUser,
knownForModelVersion
)
}
/**
 * Run the interestedIn job; cluster-normalized scores are not attached to the user's clusters.
*/
def runWithoutClusterNormalizedScores(
adjacencyLists: TypedPipe[UserAndNeighbors],
knownFor: TypedPipe[(UserId, Array[(ClusterId, Float)])],
socialProofThreshold: Int,
maxClustersPerUser: Int,
knownForModelVersion: String
)(
implicit uniqueId: UniqueID
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
keepOnlyTopClusters(
groupClusterScores(
userClusterPairsWithoutNormalization(
adjacencyLists,
knownFor,
socialProofThreshold
)
),
maxClustersPerUser,
knownForModelVersion
)
}
/**
 * Print out some basic stats of the data set to make sure things are not broken.
*/
def dataSetStats(
interestedInData: TypedPipe[(UserId, ClustersUserIsInterestedIn)],
dataSetName: String = ""
): Execution[Unit] = {
Execution
.zip(
Util.printSummaryOfNumericColumn(
interestedInData.map {
case (user, interestedIn) =>
interestedIn.clusterIdToScores.size
},
Some(s"$dataSetName UserInterestedIn Size")
),
Util.printSummaryOfNumericColumn(
interestedInData.flatMap {
case (user, interestedIn) =>
interestedIn.clusterIdToScores.map {
case (_, scores) =>
scores.favScore.getOrElse(0.0)
}
},
Some(s"$dataSetName UserInterestedIn favScore")
),
Util.printSummaryOfNumericColumn(
interestedInData.flatMap {
case (user, interestedIn) =>
interestedIn.clusterIdToScores.map {
case (_, scores) =>
scores.favScoreClusterNormalizedOnly.getOrElse(0.0)
}
},
Some(s"$dataSetName UserInterestedIn favScoreClusterNormalizedOnly")
),
Util.printSummaryOfNumericColumn(
interestedInData.flatMap {
case (user, interestedIn) =>
interestedIn.clusterIdToScores.map {
case (_, scores) =>
scores.logFavScoreClusterNormalizedOnly.getOrElse(0.0)
}
},
Some(s"$dataSetName UserInterestedIn logFavScoreClusterNormalizedOnly")
)
).unit
}
}
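/**
 * A self-contained sketch, using made-up scores, of how the implicit
 * SrcClusterIntermediateInfoSemigroup above combines two per-edge contributions for the same
 * (user, cluster) key during sumByKey: the scores are summed and the follow/fav social-proof
 * lists are concatenated with duplicates removed.
 */
object SrcClusterIntermediateInfoSemigroupSketch {
  import InterestedInFromKnownFor._
  def main(args: Array[String]): Unit = {
    // contribution from a producer the user follows
    val fromFollowedProducer = SrcClusterIntermediateInfo(
      followScore = 0.8,
      followScoreProducerNormalized = 0.4,
      favScore = 0.0,
      favScoreProducerNormalized = 0.0,
      logFavScore = 0.0,
      logFavScoreProducerNormalized = 0.0,
      followSocialProof = List(101L),
      favSocialProof = Nil
    )
    // contribution from a producer the user faved
    val fromFavedProducer = SrcClusterIntermediateInfo(
      followScore = 0.0,
      followScoreProducerNormalized = 0.0,
      favScore = 0.5,
      favScoreProducerNormalized = 0.25,
      logFavScore = 0.3,
      logFavScoreProducerNormalized = 0.15,
      followSocialProof = Nil,
      favSocialProof = List(102L)
    )
    // scores add up; the social-proof lists are merged and de-duplicated
    println(Semigroup.plus(fromFollowedProducer, fromFavedProducer))
  }
}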

View File

@@ -1,354 +0,0 @@
package com.twitter.simclusters_v2.scalding
import com.twitter.algebird.Semigroup
import com.twitter.bijection.Injection
import com.twitter.dal.client.dataset.KeyValDALDataset
import com.twitter.scalding._
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.DALWrite.{D, WriteExtension}
import com.twitter.scalding_internal.job.TwitterExecutionApp
import com.twitter.scalding_internal.job.analytics_batch.{
AnalyticsBatchExecution,
AnalyticsBatchExecutionArgs,
BatchDescription,
BatchFirstTime,
BatchIncrement,
TwitterScheduledExecutionApp
}
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.simclusters_v2.common.{ClusterId, ModelVersions, UserId}
import com.twitter.simclusters_v2.hdfs_sources.{
AdhocKeyValSources,
InternalDataPaths,
SimclustersV2KnownFor20M145K2020ScalaDataset,
SimclustersV2RawInterestedInLite20M145K2020ScalaDataset,
SimclustersV2RawInterestedIn20M145KUpdatedScalaDataset,
UserAndNeighborsFixedPathSource,
UserUserGraphScalaDataset
}
import com.twitter.simclusters_v2.scalding.common.Util
import com.twitter.simclusters_v2.thriftscala.{
ClustersUserIsInterestedIn,
ClustersUserIsKnownFor,
UserAndNeighbors,
UserToInterestedInClusterScores
}
import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp
import java.util.TimeZone
/**
* This file implements the job for computing users' interestedIn vector from KnownFor data set.
*
* It reads the UserUserGraphScalaDataset to get user-user follow + fav graph, and then
* based on the known-for clusters of each followed/faved user, we calculate how much a user is
* interestedIn a cluster.
*
 * The main differences between InterestedInFromKnownForLite and InterestedInFromKnownFor are
 * the following:
 * - We read the UserUserGraph dataset that does not contain the producer normalized scores
 * - We do not compute the cluster normalized scores for the clusters per user
 * - For social proof thresholding, we do not keep track of the entire list of follow and
 * fav social proofs, but rather make use of numFollowed and numFaved (this introduces
* some noise if follow and fav social proof contain the same users)
* - Store 200 clusters per user compared to 50 in IIKF
 * - Runs more frequently (every 2 days) compared to the weekly IIKF job
*/
/**
* Production job for computing interestedIn data set for the model version 20M145K2020.
*
* To deploy the job:
*
* capesospy-v2 update --build_locally --start_cron interested_in_lite_for_20M_145k_2020 \
 *   src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml
*/
object InterestedInFromKnownForLite20M145K2020 extends InterestedInFromKnownForLite {
override val firstTime: String = "2021-04-24"
override val outputKVDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsInterestedIn]] =
SimclustersV2RawInterestedInLite20M145K2020ScalaDataset
override val outputPath: String = InternalDataPaths.RawInterestedInLite2020Path
override val knownForModelVersion: String = ModelVersions.Model20M145K2020
override val knownForDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]] =
SimclustersV2KnownFor20M145K2020ScalaDataset
}
trait InterestedInFromKnownForLite extends TwitterScheduledExecutionApp {
implicit val tz = DateOps.UTC
implicit val parser = DateParser.default
def firstTime: String
val batchIncrement: Duration = Days(2)
val lookBackDays: Duration = Days(30)
def outputKVDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsInterestedIn]]
def outputPath: String
def knownForModelVersion: String
def knownForDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]]
private lazy val execArgs = AnalyticsBatchExecutionArgs(
batchDesc = BatchDescription(this.getClass.getName.replace("$", "")),
firstTime = BatchFirstTime(RichDate(firstTime)),
lastTime = None,
batchIncrement = BatchIncrement(batchIncrement)
)
override def scheduledJob: Execution[Unit] = AnalyticsBatchExecution(execArgs) {
implicit dateRange =>
Execution.withId { implicit uniqueId =>
Execution.withArgs { args =>
val userUserGraph =
DAL.readMostRecentSnapshot(UserUserGraphScalaDataset).toTypedPipe
val knownFor = KnownForSources.fromKeyVal(
DAL.readMostRecentSnapshot(knownForDALDataset, dateRange.extend(Days(30))).toTypedPipe,
knownForModelVersion
)
val socialProofThreshold = args.int("socialProofThreshold", 2)
val maxClustersPerUser = args.int("maxClustersPerUser", 200)
val result = InterestedInFromKnownForLite
.run(
userUserGraph,
knownFor,
socialProofThreshold,
maxClustersPerUser,
knownForModelVersion
)
val writeKeyValResultExec = result
.map {
case (userId, clusters) => KeyVal(userId, clusters)
}.writeDALVersionedKeyValExecution(
outputKVDataset,
D.Suffix(outputPath)
)
Util.printCounters(writeKeyValResultExec)
}
}
}
}
/**
* Adhoc job to compute user interestedIn.
*
* scalding remote run \
* --target src/scala/com/twitter/simclusters_v2/scalding:interested_in_lite_20m_145k_2020-adhoc \
* --main-class com.twitter.simclusters_v2.scalding.InterestedInFromKnownForLite20M145K2020Adhoc \
* --user cassowary --cluster bluebird-qus1 \
* --keytab /var/lib/tss/keys/fluffy/keytabs/client/cassowary.keytab \
 * --principal service_account@TWITTER.BIZ \
* -- \
* --outputDir /gcs/user/cassowary/adhoc/interested_in_from_knownfor_lite/ \
* --date 2020-08-25
*/
object InterestedInFromKnownForLite20M145K2020Adhoc extends AdhocExecutionApp {
override def runOnDateRange(
args: Args
)(
implicit dateRange: DateRange,
timeZone: TimeZone,
uniqueID: UniqueID
): Execution[Unit] = {
val userUserGraph = DAL.readMostRecentSnapshot(UserUserGraphScalaDataset).toTypedPipe
val socialProofThreshold = args.int("socialProofThreshold", 2)
val maxClustersPerUser = args.int("maxClustersPerUser", 200)
val knownForModelVersion = ModelVersions.Model20M145K2020
val knownFor = KnownForSources.fromKeyVal(
DAL
.readMostRecentSnapshotNoOlderThan(
SimclustersV2KnownFor20M145K2020ScalaDataset,
Days(30)).toTypedPipe,
knownForModelVersion
)
val outputSink = AdhocKeyValSources.interestedInSource(args("outputDir"))
Util.printCounters(
InterestedInFromKnownForLite
.run(
userUserGraph,
knownFor,
socialProofThreshold,
maxClustersPerUser,
knownForModelVersion
).writeExecution(outputSink)
)
}
}
object InterestedInFromKnownForLite {
private def ifNanMake0(x: Double): Double = if (x.isNaN) 0.0 else x
case class SrcClusterIntermediateInfo(
followScore: Double,
favScore: Double,
logFavScore: Double,
numFollowed: Int,
numFaved: Int) {
// helper function used for test cases
override def equals(obj: scala.Any): Boolean = {
obj match {
case that: SrcClusterIntermediateInfo =>
math.abs(followScore - that.followScore) < 1e-5 &&
math.abs(favScore - that.favScore) < 1e-5 &&
math.abs(logFavScore - that.logFavScore) < 1e-5 &&
numFollowed == that.numFollowed &&
numFaved == that.numFaved
case _ => false
}
}
}
implicit object SrcClusterIntermediateInfoSemigroup
extends Semigroup[SrcClusterIntermediateInfo] {
override def plus(
left: SrcClusterIntermediateInfo,
right: SrcClusterIntermediateInfo
): SrcClusterIntermediateInfo = {
SrcClusterIntermediateInfo(
followScore = left.followScore + right.followScore,
favScore = left.favScore + right.favScore,
logFavScore = left.logFavScore + right.logFavScore,
numFollowed = left.numFollowed + right.numFollowed,
numFaved = left.numFaved + right.numFaved
)
}
}
def run(
adjacencyLists: TypedPipe[UserAndNeighbors],
knownFor: TypedPipe[(UserId, Array[(ClusterId, Float)])],
socialProofThreshold: Int,
maxClustersPerUser: Int,
knownForModelVersion: String
)(
implicit uniqueId: UniqueID
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
InterestedInFromKnownFor.keepOnlyTopClusters(
groupClusterScores(
userClusterPairs(
adjacencyLists,
knownFor,
socialProofThreshold
)
),
maxClustersPerUser,
knownForModelVersion
)
}
def userClusterPairs(
adjacencyLists: TypedPipe[UserAndNeighbors],
knownFor: TypedPipe[(Long, Array[(Int, Float)])],
socialProofThreshold: Int
)(
implicit uniqueId: UniqueID
): TypedPipe[((Long, Int), SrcClusterIntermediateInfo)] = {
val edgesToUsersWithKnownFor = Stat("num_edges_to_users_with_known_for")
val srcDestClusterTriples = Stat("num_src_dest_cluster_triples")
val srcClusterPairsBeforeSocialProofThresholding =
Stat("num_src_cluster_pairs_before_social_proof_thresholding")
val srcClusterPairsAfterSocialProofThresholding =
Stat("num_src_cluster_pairs_after_social_proof_thresholding")
val edges = adjacencyLists.flatMap {
case UserAndNeighbors(srcId, neighborsWithWeights) =>
neighborsWithWeights.map { neighborWithWeights =>
(
neighborWithWeights.neighborId,
neighborWithWeights.copy(neighborId = srcId)
)
}
}
implicit val l2b: Long => Array[Byte] = Injection.long2BigEndian
edges
.sketch(4000)
.join(knownFor)
.flatMap {
case (destId, (srcWithWeights, clusterArray)) =>
edgesToUsersWithKnownFor.inc()
clusterArray.toList.map {
case (clusterId, knownForScoreF) =>
val knownForScore = math.max(0.0, knownForScoreF.toDouble)
srcDestClusterTriples.inc()
val followScore =
if (srcWithWeights.isFollowed.contains(true)) knownForScore else 0.0
val favScore =
srcWithWeights.favScoreHalfLife100Days.getOrElse(0.0) * knownForScore
val logFavScore = srcWithWeights.logFavScore.getOrElse(0.0) * knownForScore
val numFollowed = if (srcWithWeights.isFollowed.contains(true)) {
1
} else 0
val numFaved = if (srcWithWeights.favScoreHalfLife100Days.exists(_ > 0)) {
1
} else 0
(
(srcWithWeights.neighborId, clusterId),
SrcClusterIntermediateInfo(
followScore,
favScore,
logFavScore,
numFollowed,
numFaved
)
)
}
}
.sumByKey
.withReducers(10000)
.filter {
case ((_, _), SrcClusterIntermediateInfo(_, _, _, numFollowed, numFaved)) =>
srcClusterPairsBeforeSocialProofThresholding.inc()
// we do not remove duplicates
val socialProofSize = numFollowed + numFaved
val result = socialProofSize >= socialProofThreshold
if (result) {
srcClusterPairsAfterSocialProofThresholding.inc()
}
result
}
}
def groupClusterScores(
intermediate: TypedPipe[((Long, Int), SrcClusterIntermediateInfo)]
)(
implicit uniqueId: UniqueID
): TypedPipe[(Long, List[(Int, UserToInterestedInClusterScores)])] = {
implicit val i2b: Int => Array[Byte] = Injection.int2BigEndian
intermediate
.map {
case (
(srcId, clusterId),
SrcClusterIntermediateInfo(
followScore,
favScore,
logFavScore,
numFollowed,
numFaved
)) =>
(
srcId,
List(
(
clusterId,
UserToInterestedInClusterScores(
followScore = Some(ifNanMake0(followScore)),
favScore = Some(ifNanMake0(favScore)),
logFavScore = Some(ifNanMake0(logFavScore)),
numUsersBeingFollowed = Some(numFollowed),
numUsersThatWereFaved = Some(numFaved)
))
)
)
}
.sumByKey
// .withReducers(1000)
.toTypedPipe
}
}
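/**
 * A toy sketch, with made-up user ids, of the social-proof difference called out in the file
 * comment above: the full InterestedInFromKnownFor job thresholds on the number of distinct
 * users across the follow and fav social proofs, whereas the Lite variant thresholds on
 * numFollowed + numFaved, so one user who both follows and faves can pass a threshold of 2 in
 * Lite but not in the full job.
 */
object SocialProofThresholdingSketch {
  def main(args: Array[String]): Unit = {
    val socialProofThreshold = 2
    val followProof = List(101L) // user 101 follows a producer known for the cluster
    val favProof = List(101L) // the same user also faved that producer
    val keptByFullJob = (followProof ++ favProof).toSet.size >= socialProofThreshold // false
    val keptByLiteJob = (followProof.size + favProof.size) >= socialProofThreshold // true
    println(s"full job keeps the pair: $keptByFullJob, lite job keeps the pair: $keptByLiteJob")
  }
}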

View File

@@ -1,290 +0,0 @@
package com.twitter.simclusters_v2.scalding
import com.twitter.dal.client.dataset.KeyValDALDataset
import com.twitter.scalding.Execution
import com.twitter.scalding.TypedTsv
import com.twitter.scalding._
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.DALWrite._
import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation
import com.twitter.scalding_internal.dalv2.remote_access.ProcAtla
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.simclusters_v2.common.ModelVersions
import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.hdfs_sources.ProducerEmbeddingSources
import com.twitter.simclusters_v2.hdfs_sources.AdhocKeyValSources
import com.twitter.simclusters_v2.hdfs_sources.DataSources
import com.twitter.simclusters_v2.hdfs_sources.SimclustersV2InterestedInFromProducerEmbeddings20M145KUpdatedScalaDataset
import com.twitter.simclusters_v2.hdfs_sources.UserAndNeighborsFixedPathSource
import com.twitter.simclusters_v2.hdfs_sources.UserUserNormalizedGraphScalaDataset
import com.twitter.simclusters_v2.scalding.common.Util
import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn
import com.twitter.simclusters_v2.thriftscala.EmbeddingType
import com.twitter.simclusters_v2.thriftscala.SimClusterWithScore
import com.twitter.simclusters_v2.thriftscala.TopSimClustersWithScore
import com.twitter.simclusters_v2.thriftscala.UserToInterestedInClusterScores
import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp
import com.twitter.wtf.scalding.jobs.common.ScheduledExecutionApp
import java.util.TimeZone
import scala.util.Random
/**
* This file implements the job for computing users' interestedIn vector from the producerEmbeddings data set.
*
* It reads the UserUserNormalizedGraphScalaDataset to get user-user follow + fav graph, and then
* based on the producerEmbedding clusters of each followed/faved user, we calculate how much a user is
* interestedIn a cluster. To compute the engagement and determine the clusters for the user, we reuse
 * the functions defined in InterestedInFromKnownFor.
*
* Using producerEmbeddings instead of knownFor to obtain interestedIn increases the coverage (especially
* for medium and light users) and also the density of the cluster embeddings for the user.
*/
/**
* Adhoc job to generate the interestedIn from producer embeddings for the model version 20M145KUpdated
*
scalding remote run \
--target src/scala/com/twitter/simclusters_v2/scalding:interested_in_from_producer_embeddings \
--main-class com.twitter.simclusters_v2.scalding.InterestedInFromProducerEmbeddingsAdhocApp \
--user cassowary --cluster bluebird-qus1 \
--keytab /var/lib/tss/keys/fluffy/keytabs/client/cassowary.keytab \
--principal service_account@TWITTER.BIZ \
-- \
--outputDir /gcs/user/cassowary/adhoc/interested_in_from_prod_embeddings/ \
--date 2020-08-25 --typedTsv true
*/
object InterestedInFromProducerEmbeddingsAdhocApp extends AdhocExecutionApp {
override def runOnDateRange(
args: Args
)(
implicit dateRange: DateRange,
timeZone: TimeZone,
uniqueID: UniqueID
): Execution[Unit] = {
val outputDir = args("outputDir")
val inputGraph = args.optional("graphInputDir") match {
case Some(inputDir) => TypedPipe.from(UserAndNeighborsFixedPathSource(inputDir))
case None =>
DAL
.readMostRecentSnapshotNoOlderThan(UserUserNormalizedGraphScalaDataset, Days(30))
.toTypedPipe
}
val socialProofThreshold = args.int("socialProofThreshold", 2)
val maxClustersPerUserFinalResult = args.int("maxInterestedInClustersPerUser", 50)
val maxClustersFromProducer = args.int("maxClustersPerProducer", 25)
val typedTsvTag = args.boolean("typedTsv")
val embeddingType =
EmbeddingType.ProducerFavBasedSemanticCoreEntity
val modelVersion = ModelVersions.Model20M145KUpdated
val producerEmbeddings = ProducerEmbeddingSources
.producerEmbeddingSourceLegacy(embeddingType, ModelVersions.toModelVersion(modelVersion))(
dateRange.embiggen(Days(7)))
import InterestedInFromProducerEmbeddingsBatchApp._
val numProducerMappings = Stat("num_producer_embeddings_total")
val numProducersWithLargeClusterMappings = Stat(
"num_producers_with_more_clusters_than_threshold")
val numProducersWithSmallClusterMappings = Stat(
"num_producers_with_clusters_less_than_threshold")
val totalClustersCoverageProducerEmbeddings = Stat("num_clusters_total_producer_embeddings")
val producerEmbeddingsWithScore = producerEmbeddings.map {
case (userId: Long, topSimClusters: TopSimClustersWithScore) =>
(
userId,
topSimClusters.topClusters.toArray
.map {
case (simCluster: SimClusterWithScore) =>
(simCluster.clusterId, simCluster.score.toFloat)
}
)
}
val producerEmbeddingsPruned = producerEmbeddingsWithScore.map {
case (producerId, clusterArray) =>
numProducerMappings.inc()
val clusterSize = clusterArray.size
totalClustersCoverageProducerEmbeddings.incBy(clusterSize)
val prunedList = if (clusterSize > maxClustersFromProducer) {
numProducersWithLargeClusterMappings.inc()
clusterArray
.sortBy {
case (_, knownForScore) => -knownForScore
}.take(maxClustersFromProducer)
} else {
numProducersWithSmallClusterMappings.inc()
clusterArray
}
(producerId, prunedList)
}
val result = InterestedInFromKnownFor
.run(
inputGraph,
producerEmbeddingsPruned,
socialProofThreshold,
maxClustersPerUserFinalResult,
modelVersion
)
val resultWithoutSocial = getInterestedInDiscardSocial(result)
if (typedTsvTag) {
Util.printCounters(
resultWithoutSocial
.map {
case (userId: Long, clusters: ClustersUserIsInterestedIn) =>
(
userId,
clusters.clusterIdToScores.keys.toString()
)
}
.writeExecution(
TypedTsv(outputDir)
)
)
} else {
Util.printCounters(
resultWithoutSocial
.writeExecution(
AdhocKeyValSources.interestedInSource(outputDir)
)
)
}
}
}
/**
* Production job for computing interestedIn data set from the producer embeddings for the model version 20M145KUpdated.
 * It writes the data set in KeyVal format to produce an MH DAL data set.
*
* To deploy the job:
*
 * capesospy-v2 update --build_locally --start_cron interested_in_from_producer_embeddings \
 *   src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml
*/
object InterestedInFromProducerEmbeddingsBatchApp extends ScheduledExecutionApp {
override val firstTime: RichDate = RichDate("2019-11-01")
override val batchIncrement: Duration = Days(7)
def getPrunedEmbeddings(
producerEmbeddings: TypedPipe[(Long, TopSimClustersWithScore)],
maxClustersFromProducer: Int
): TypedPipe[(Long, TopSimClustersWithScore)] = {
producerEmbeddings.map {
case (producerId, producerClusters) =>
val prunedProducerClusters =
producerClusters.topClusters
.sortBy {
case simCluster => -simCluster.score.toFloat
}.take(maxClustersFromProducer)
(producerId, TopSimClustersWithScore(prunedProducerClusters, producerClusters.modelVersion))
}
}
def getInterestedInDiscardSocial(
interestedInFromProducersResult: TypedPipe[(UserId, ClustersUserIsInterestedIn)]
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
interestedInFromProducersResult.map {
case (srcId, fullClusterList) =>
val fullClusterListWithoutSocial = fullClusterList.clusterIdToScores.map {
case (clusterId, clusterDetails) =>
val clusterDetailsWithoutSocial = UserToInterestedInClusterScores(
followScore = clusterDetails.followScore,
followScoreClusterNormalizedOnly = clusterDetails.followScoreClusterNormalizedOnly,
followScoreProducerNormalizedOnly = clusterDetails.followScoreProducerNormalizedOnly,
followScoreClusterAndProducerNormalized =
clusterDetails.followScoreClusterAndProducerNormalized,
favScore = clusterDetails.favScore,
favScoreClusterNormalizedOnly = clusterDetails.favScoreClusterNormalizedOnly,
favScoreProducerNormalizedOnly = clusterDetails.favScoreProducerNormalizedOnly,
favScoreClusterAndProducerNormalized =
clusterDetails.favScoreClusterAndProducerNormalized,
// Social proof is currently not being used anywhere else, hence being discarded to reduce space for this dataset
usersBeingFollowed = None,
usersThatWereFaved = None,
numUsersInterestedInThisClusterUpperBound =
clusterDetails.numUsersInterestedInThisClusterUpperBound,
logFavScore = clusterDetails.logFavScore,
logFavScoreClusterNormalizedOnly = clusterDetails.logFavScoreClusterNormalizedOnly,
// Counts of the social proof are maintained
numUsersBeingFollowed = Some(clusterDetails.usersBeingFollowed.getOrElse(Nil).size),
numUsersThatWereFaved = Some(clusterDetails.usersThatWereFaved.getOrElse(Nil).size)
)
(clusterId, clusterDetailsWithoutSocial)
}
(
srcId,
ClustersUserIsInterestedIn(
fullClusterList.knownForModelVersion,
fullClusterListWithoutSocial))
}
}
override def runOnDateRange(
args: Args
)(
implicit dateRange: DateRange,
timeZone: TimeZone,
uniqueID: UniqueID
): Execution[Unit] = {
// Input args for the run
val socialProofThreshold = args.int("socialProofThreshold", 2)
val maxClustersFromProducer = args.int("maxClustersPerProducer", 25)
val maxClustersPerUserFinalResult = args.int("maxInterestedInClustersPerUser", 50)
// Path variables
val modelVersionUpdated = ModelVersions.toModelVersion(ModelVersions.Model20M145KUpdated)
val rootPath: String = s"/user/cassowary/manhattan_sequence_files"
val interestedInFromProducersPath =
rootPath + "/interested_in_from_producer_embeddings/" + modelVersionUpdated
// Input adjacency list and producer embeddings
val userUserNormalGraph =
DataSources.userUserNormalizedGraphSource(dateRange.prepend(Days(7))).forceToDisk
val outputKVDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsInterestedIn]] =
SimclustersV2InterestedInFromProducerEmbeddings20M145KUpdatedScalaDataset
val producerEmbeddings = ProducerEmbeddingSources
.producerEmbeddingSourceLegacy(
EmbeddingType.ProducerFavBasedSemanticCoreEntity,
modelVersionUpdated)(dateRange.embiggen(Days(7)))
val producerEmbeddingsPruned = getPrunedEmbeddings(producerEmbeddings, maxClustersFromProducer)
val producerEmbeddingsWithScore = producerEmbeddingsPruned.map {
case (userId: Long, topSimClusters: TopSimClustersWithScore) =>
(
userId,
topSimClusters.topClusters.toArray
.map {
case (simCluster: SimClusterWithScore) =>
(simCluster.clusterId, simCluster.score.toFloat)
}
)
}
val interestedInFromProducersResult =
InterestedInFromKnownFor.run(
userUserNormalGraph,
producerEmbeddingsWithScore,
socialProofThreshold,
maxClustersPerUserFinalResult,
modelVersionUpdated.toString
)
val interestedInFromProducersWithoutSocial =
getInterestedInDiscardSocial(interestedInFromProducersResult)
val writeKeyValResultExec = interestedInFromProducersWithoutSocial
.map { case (userId, clusters) => KeyVal(userId, clusters) }
.writeDALVersionedKeyValExecution(
outputKVDataset,
D.Suffix(interestedInFromProducersPath)
)
writeKeyValResultExec
}
}
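/**
 * A minimal sketch, with made-up scores, of what getInterestedInDiscardSocial above does to a
 * single cluster entry: the social-proof id lists are dropped and only their sizes are kept as
 * numUsersBeingFollowed / numUsersThatWereFaved, which shrinks the stored dataset while
 * preserving the counts.
 */
object DiscardSocialProofSketch {
  def main(args: Array[String]): Unit = {
    val withSocialProof = UserToInterestedInClusterScores(
      favScore = Some(0.42),
      usersBeingFollowed = Some(List(101L, 102L)),
      usersThatWereFaved = Some(List(103L))
    )
    val withoutSocialProof = UserToInterestedInClusterScores(
      favScore = withSocialProof.favScore,
      usersBeingFollowed = None,
      usersThatWereFaved = None,
      numUsersBeingFollowed = Some(withSocialProof.usersBeingFollowed.getOrElse(Nil).size),
      numUsersThatWereFaved = Some(withSocialProof.usersThatWereFaved.getOrElse(Nil).size)
    )
    println(withoutSocialProof)
  }
}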

Some files were not shown because too many files have changed in this diff