Mirror of https://github.com/twitter/the-algorithm.git (synced 2024-11-16 08:29:21 +01:00)

[docx] split commit for file 5000
Signed-off-by: Ari Archer <ari.web.xyz@gmail.com>
Parent: c4b4b821a3
Commit: 2f5f511bb8
Binary file not shown.
@@ -1,32 +0,0 @@
package com.twitter.simclusters_v2.common

import com.twitter.simclusters_v2.common.SimClustersMultiEmbeddingId._
import com.twitter.simclusters_v2.thriftscala.SimClustersMultiEmbedding.{Ids, Values}
import com.twitter.simclusters_v2.thriftscala.{
  SimClustersMultiEmbedding,
  SimClustersEmbeddingId,
  SimClustersMultiEmbeddingId
}

/**
 * Helper methods for SimClustersMultiEmbedding
 */
object SimClustersMultiEmbedding {

  // Convert a multiEmbedding to a list of (embeddingId, score)
  def toSimClustersEmbeddingIdWithScores(
    simClustersMultiEmbeddingId: SimClustersMultiEmbeddingId,
    simClustersMultiEmbedding: SimClustersMultiEmbedding
  ): Seq[(SimClustersEmbeddingId, Double)] = {
    simClustersMultiEmbedding match {
      case Values(values) =>
        values.embeddings.zipWithIndex.map {
          case (embeddingWithScore, i) =>
            (toEmbeddingId(simClustersMultiEmbeddingId, i), embeddingWithScore.score)
        }
      case Ids(ids) =>
        ids.ids.map(_.toTuple)
    }
  }

}
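Editor's note: for readers skimming the diff, here is a small, self-contained sketch of what the Values branch above does — it pairs each sub-embedding with its positional sub-id and keeps its score. The case classes below are simplified stand-ins for the Thrift types, not the real ones.

// Simplified stand-ins for the Thrift structures (hypothetical, illustration only).
case class EmbeddingWithScore(score: Double)
case class MultiEmbeddingValues(embeddings: Seq[EmbeddingWithScore])

// Mirrors the Values branch: the sub-embedding at position i becomes (subId = i, score).
def toSubIdWithScores(values: MultiEmbeddingValues): Seq[(Int, Double)] =
  values.embeddings.zipWithIndex.map { case (e, i) => (i, e.score) }

// toSubIdWithScores(MultiEmbeddingValues(Seq(EmbeddingWithScore(0.9), EmbeddingWithScore(0.4))))
// => Seq((0, 0.9), (1, 0.4))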
Binary file not shown.
@@ -1,96 +0,0 @@
package com.twitter.simclusters_v2.common

import com.twitter.simclusters_v2.thriftscala.{
  EmbeddingType,
  InternalId,
  MultiEmbeddingType,
  TopicId,
  TopicSubId,
  SimClustersEmbeddingId => ThriftEmbeddingId,
  SimClustersMultiEmbeddingId => ThriftMultiEmbeddingId
}

/**
 * Helper methods for SimClustersMultiEmbeddingId
 */
object SimClustersMultiEmbeddingId {

  private val MultiEmbeddingTypeToEmbeddingType: Map[MultiEmbeddingType, EmbeddingType] =
    Map(
      MultiEmbeddingType.LogFavApeBasedMuseTopic -> EmbeddingType.LogFavApeBasedMuseTopic,
      MultiEmbeddingType.TwiceUserInterestedIn -> EmbeddingType.TwiceUserInterestedIn,
    )

  private val EmbeddingTypeToMultiEmbeddingType: Map[EmbeddingType, MultiEmbeddingType] =
    MultiEmbeddingTypeToEmbeddingType.map(_.swap)

  def toEmbeddingType(multiEmbeddingType: MultiEmbeddingType): EmbeddingType = {
    MultiEmbeddingTypeToEmbeddingType.getOrElse(
      multiEmbeddingType,
      throw new IllegalArgumentException(s"Invalid type: $multiEmbeddingType"))
  }

  def toMultiEmbeddingType(embeddingType: EmbeddingType): MultiEmbeddingType = {
    EmbeddingTypeToMultiEmbeddingType.getOrElse(
      embeddingType,
      throw new IllegalArgumentException(s"Invalid type: $embeddingType")
    )
  }

  /**
   * Convert a SimClusters Multi-Embedding Id and SubId to SimClusters Embedding Id.
   */
  def toEmbeddingId(
    simClustersMultiEmbeddingId: ThriftMultiEmbeddingId,
    subId: Int
  ): ThriftEmbeddingId = {
    val internalId = simClustersMultiEmbeddingId.internalId match {
      case InternalId.TopicId(topicId) =>
        InternalId.TopicSubId(
          TopicSubId(topicId.entityId, topicId.language, topicId.country, subId))
      case _ =>
        throw new IllegalArgumentException(
          s"Invalid simClusters InternalId ${simClustersMultiEmbeddingId.internalId}")
    }
    ThriftEmbeddingId(
      toEmbeddingType(simClustersMultiEmbeddingId.embeddingType),
      simClustersMultiEmbeddingId.modelVersion,
      internalId
    )
  }

  /**
   * Fetch a subId from a SimClusters EmbeddingId.
   */
  def toSubId(simClustersEmbeddingId: ThriftEmbeddingId): Int = {
    simClustersEmbeddingId.internalId match {
      case InternalId.TopicSubId(topicSubId) =>
        topicSubId.subId
      case _ =>
        throw new IllegalArgumentException(
          s"Invalid SimClustersEmbeddingId InternalId type, $simClustersEmbeddingId")
    }
  }

  /**
   * Convert a SimClustersEmbeddingId to SimClustersMultiEmbeddingId.
   * Only support the Multi embedding based EmbeddingTypes.
   */
  def toMultiEmbeddingId(
    simClustersEmbeddingId: ThriftEmbeddingId
  ): ThriftMultiEmbeddingId = {
    simClustersEmbeddingId.internalId match {
      case InternalId.TopicSubId(topicSubId) =>
        ThriftMultiEmbeddingId(
          toMultiEmbeddingType(simClustersEmbeddingId.embeddingType),
          simClustersEmbeddingId.modelVersion,
          InternalId.TopicId(TopicId(topicSubId.entityId, topicSubId.language, topicSubId.country))
        )

      case _ =>
        throw new IllegalArgumentException(
          s"Invalid SimClustersEmbeddingId InternalId type, $simClustersEmbeddingId")
    }
  }

}
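Editor's note: the forward/reverse map pair above is a common way to keep a two-way mapping in sync from a single source of truth. A minimal, self-contained sketch of the same pattern (the keys and values below are hypothetical, not from the file):

// Forward map is the single source of truth; the reverse map is derived with .swap,
// so the two can never drift apart.
val codeToName: Map[Int, String] = Map(11 -> "Dec11", 20 -> "Model2020")
val nameToCode: Map[String, Int] = codeToName.map(_.swap)

// getOrElse with a thrown exception mirrors the fail-fast lookups in toEmbeddingType above.
def toName(code: Int): String =
  codeToName.getOrElse(code, throw new IllegalArgumentException(s"Invalid type: $code"))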
@@ -1,11 +0,0 @@
scala_library(
    compiler_option_sets = ["fatal_warnings"],
    platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        "eventdetection/common/src/main/java/com/twitter/eventdetection/common/louvain",
        "eventdetection/common/src/main/java/com/twitter/eventdetection/common/model",
        "src/java/com/twitter/sbf/graph",
        "src/scala/com/twitter/simclusters_v2/scalding/common",
    ],
)
Binary file not shown.
Binary file not shown.
@@ -1,30 +0,0 @@
package com.twitter.simclusters_v2.common.clustering

import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.thriftscala.NeighborWithWeights

/**
 * Select a cluster member as cluster representative.
 */
trait ClusterRepresentativeSelectionMethod[T] {

  /**
   * The main external-facing method. Sub-classes should implement this method.
   *
   * @param cluster    A set of NeighborWithWeights.
   * @param embeddings A map of producer ID -> embedding.
   *
   * @return UserId of the member chosen as representative.
   */
  def selectClusterRepresentative(
    cluster: Set[NeighborWithWeights],
    embeddings: Map[UserId, T]
  ): UserId

}

object ClusterRepresentativeSelectionStatistics {

  // Statistics, to be imported where recorded.
  val StatClusterRepresentativeSelectionTime = "cluster_representative_selection_total_time_ms"
}
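Editor's note: as a quick orientation to the trait, a deliberately trivial implementation sketch (not part of the original code) that ignores the embeddings and picks the member with the smallest user id; the real implementations further down use fav scores or medoid similarity instead.

// Hypothetical, illustration-only implementation of the trait above.
class SmallestIdRepresentativeSelectionMethod[T] extends ClusterRepresentativeSelectionMethod[T] {
  def selectClusterRepresentative(
    cluster: Set[NeighborWithWeights],
    embeddings: Map[UserId, T]
  ): UserId =
    cluster.map(_.neighborId).min // NeighborWithWeights.neighborId is the member's UserId
}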
Binary file not shown.
@@ -1,34 +0,0 @@
package com.twitter.simclusters_v2.common.clustering

/**
 * Partitions a set of entities into clusters.
 * NOTE: The selection/construction of the cluster representatives (e.g. medoid, random, average) is implemented in ClusterRepresentativeSelectionMethod.scala
 */
trait ClusteringMethod {

  /**
   * The main external-facing method. Sub-classes should implement this method.
   *
   * @param embeddings   map of entity IDs and corresponding embeddings
   * @param similarityFn function that outputs similarity (>=0, the larger, more similar), given two embeddings
   * @tparam T embedding type. e.g. SimClustersEmbedding
   *
   * @return A set of sets of entity IDs, each set representing a distinct cluster.
   */
  def cluster[T](
    embeddings: Map[Long, T],
    similarityFn: (T, T) => Double,
    recordStatCallback: (String, Long) => Unit = (_, _) => ()
  ): Set[Set[Long]]

}

object ClusteringStatistics {

  // Statistics, to be imported where recorded.
  val StatSimilarityGraphTotalBuildTime = "similarity_graph_total_build_time_ms"
  val StatClusteringAlgorithmRunTime = "clustering_algorithm_total_run_time_ms"
  val StatMedoidSelectionTime = "medoid_selection_total_time_ms"
  val StatComputedSimilarityBeforeFilter = "computed_similarity_before_filter"

}
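Editor's note: to make the contract concrete, a minimal sketch of an implementation (illustration only, not from the original code) that places every entity in its own singleton cluster:

// Hypothetical baseline implementation of the trait above: one singleton cluster per entity.
class SingletonClusteringMethod extends ClusteringMethod {
  override def cluster[T](
    embeddings: Map[Long, T],
    similarityFn: (T, T) => Double,
    recordStatCallback: (String, Long) => Unit = (_, _) => ()
  ): Set[Set[Long]] =
    embeddings.keySet.map(Set(_))
}

// Usage sketch with toy data (similarity grows as the values get closer):
// new SingletonClusteringMethod()
//   .cluster(Map(1L -> 0.0, 2L -> 1.0), (a: Double, b: Double) => 1.0 / (1.0 + math.abs(a - b)))
// => Set(Set(1L), Set(2L))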
Binary file not shown.
@@ -1,67 +0,0 @@
package com.twitter.simclusters_v2.common.clustering

import com.twitter.sbf.graph.ConnectedComponents
import com.twitter.sbf.graph.Graph
import com.twitter.util.Stopwatch
import it.unimi.dsi.fastutil.ints.IntSet
import scala.collection.SortedMap
import scala.jdk.CollectionConverters._

/**
 * Aggregate entities into clusters such that a cluster contains all embeddings with a similarity
 * above a configurable threshold to any other embedding.
 *
 * @param similarityThreshold: When building the edges between entities, edges with weight
 *                             less than or equal to this threshold will be filtered out.
 */
class ConnectedComponentsClusteringMethod(
  similarityThreshold: Double)
    extends ClusteringMethod {

  import ClusteringStatistics._

  def cluster[T](
    embeddings: Map[Long, T],
    similarityFn: (T, T) => Double,
    recordStatCallback: (String, Long) => Unit = (_, _) => ()
  ): Set[Set[Long]] = {

    val timeSinceGraphBuildStart = Stopwatch.start()
    // com.twitter.sbf.graph.Graph expects neighbors to be sorted in ascending order.
    val sourcesById = SortedMap(embeddings.zipWithIndex.map {
      case (source, idx) => idx -> source
    }.toSeq: _*)

    val neighbours = sourcesById.map {
      case (srcIdx, (_, src)) =>
        sourcesById
          .collect {
            case (dstIdx, (_, dst)) if srcIdx != dstIdx => // avoid self-edges
              val similarity = similarityFn(src, dst)
              recordStatCallback(
                StatComputedSimilarityBeforeFilter,
                (similarity * 100).toLong // preserve up to two decimal points
              )
              if (similarity > similarityThreshold)
                Some(dstIdx)
              else None
          }.flatten.toArray
    }.toArray

    recordStatCallback(StatSimilarityGraphTotalBuildTime, timeSinceGraphBuildStart().inMilliseconds)

    val timeSinceClusteringAlgRunStart = Stopwatch.start()
    val nEdges = neighbours.map(_.length).sum / 2 // Graph expects count of undirected edges
    val graph = new Graph(sourcesById.size, nEdges, neighbours)

    val clusters = ConnectedComponents
      .connectedComponents(graph).asScala.toSet
      .map { i: IntSet => i.asScala.map(sourcesById(_)._1).toSet }

    recordStatCallback(
      StatClusteringAlgorithmRunTime,
      timeSinceClusteringAlgRunStart().inMilliseconds)

    clusters
  }
}
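Editor's note: a small usage sketch for the class above (toy ids and vectors, cosine helper defined inline; not from the original code):

// Cosine similarity for plain dense vectors, used only for this illustration.
def cosine(a: Array[Double], b: Array[Double]): Double = {
  val dot = a.zip(b).map { case (x, y) => x * y }.sum
  val norm = math.sqrt(a.map(x => x * x).sum) * math.sqrt(b.map(x => x * x).sum)
  if (norm == 0.0) 0.0 else dot / norm
}

val toyEmbeddings: Map[Long, Array[Double]] = Map(
  1L -> Array(1.0, 0.0),
  2L -> Array(0.9, 0.1), // close to entity 1
  3L -> Array(0.0, 1.0)  // far from both
)

// Entities 1 and 2 end up in the same connected component; 3 stays a singleton.
val clusters: Set[Set[Long]] =
  new ConnectedComponentsClusteringMethod(similarityThreshold = 0.8)
    .cluster(toyEmbeddings, cosine)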
Binary file not shown.
@@ -1,33 +0,0 @@
package com.twitter.simclusters_v2.common.clustering

/**
 * Groups entities by a single embedding dimension with the largest score.
 */
class LargestDimensionClusteringMethod extends ClusteringMethod {

  /**
   * @param embeddings   map of entity IDs and corresponding embeddings
   * @param similarityFn function that outputs discrete value (0.0 or 1.0).
   *                     1.0 if the dimensions of the highest score (weight) from two given embeddings match.
   *                     0.0 otherwise.
   *                     e.g.
   *                     case 1: E1=[0.0, 0.1, 0.6, 0.2], E2=[0.1, 0.3, 0.8, 0.0]. similarityFn(E1, E2)=1.0
   *                     case 2: E1=[0.0, 0.1, 0.6, 0.2], E2=[0.1, 0.4, 0.2, 0.0]. similarityFn(E1, E2)=0.0
   * @tparam T embedding type. e.g. SimClustersEmbedding
   *
   * @return A set of sets of entity IDs, each set representing a distinct cluster.
   */
  override def cluster[T](
    embeddings: Map[Long, T],
    similarityFn: (T, T) => Double,
    recordStatCallback: (String, Long) => Unit
  ): Set[Set[Long]] = {

    // Rely on clustering by connected components.
    // similarityThreshold=0.1 because it's larger than 0.0 (similarityFn returns 0.0 if two
    // embeddings don't share the largest dimension).
    new ConnectedComponentsClusteringMethod(similarityThreshold = 0.1)
      .cluster(embeddings, similarityFn, recordStatCallback)
  }

}
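Editor's note: the doc comment's case 1 / case 2 behaviour can be reproduced with a small argmax-based similarity function over plain vectors (illustration only; the production code uses SimilarityFunctions.simClustersMatchingLargestDimension over SimClustersEmbedding):

// 1.0 when both vectors put their largest weight on the same dimension, else 0.0.
def matchingLargestDimension(a: Array[Double], b: Array[Double]): Double =
  if (a.indexOf(a.max) == b.indexOf(b.max)) 1.0 else 0.0

matchingLargestDimension(Array(0.0, 0.1, 0.6, 0.2), Array(0.1, 0.3, 0.8, 0.0)) // 1.0 (case 1)
matchingLargestDimension(Array(0.0, 0.1, 0.6, 0.2), Array(0.1, 0.4, 0.2, 0.0)) // 0.0 (case 2)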
Binary file not shown.
@@ -1,236 +0,0 @@
package com.twitter.simclusters_v2.common.clustering

import com.twitter.eventdetection.common.louvain.LouvainDriver
import com.twitter.eventdetection.common.louvain.NetworkFactory
import com.twitter.eventdetection.common.model.Entity
import com.twitter.eventdetection.common.model.NetworkInput
import com.twitter.eventdetection.common.model.TextEntityValue
import com.twitter.util.Stopwatch
import scala.collection.JavaConverters._
import scala.math.max

/**
 * Groups entities by the Louvain clustering method.
 * @param similarityThreshold: When building the edges between entities, edges with weight
 *                             less than or equal to this threshold will be filtered out.
 * @param appliedResolutionFactor: If present, will be used to multiply the applied resolution
 *                                 parameter of the Louvain method by this factor.
 *                                 Note that the DEFAULT_MAX_RESOLUTION will not be applied.
 */
class LouvainClusteringMethod(
  similarityThreshold: Double,
  appliedResolutionFactor: Option[Double])
    extends ClusteringMethod {

  import ClusteringStatistics._

  def cluster[T](
    embeddings: Map[Long, T],
    similarityFn: (T, T) => Double,
    recordStatCallback: (String, Long) => Unit = (_, _) => ()
  ): Set[Set[Long]] = {

    // 1. Build the graph on which to run Louvain:
    //   - Weigh edges by the similarity between the 2 embeddings,
    //   - Filter out edges with weight <= threshold.
    val timeSinceGraphBuildStart = Stopwatch.start()
    val edges: Seq[((Long, Long), Double)] = embeddings.toSeq
      .combinations(2)
      .map { pair: Seq[(Long, T)] => // pair of 2
        val (user1, embedding1) = pair.head
        val (user2, embedding2) = pair(1)
        val similarity = similarityFn(embedding1, embedding2)

        recordStatCallback(
          StatComputedSimilarityBeforeFilter,
          (similarity * 100).toLong // preserve up to two decimal places
        )

        ((user1, user2), similarity)
      }
      .filter(_._2 > similarityThreshold)
      .toSeq

    recordStatCallback(StatSimilarityGraphTotalBuildTime, timeSinceGraphBuildStart().inMilliseconds)

    // check if some entities do not have any incoming / outgoing edge
    // these are size-1 clusters (i.e. their own)
    val individualClusters: Set[Long] = embeddings.keySet -- edges.flatMap {
      case ((user1, user2), _) => Set(user1, user2)
    }.toSet

    // 2. LouvainDriver uses "Entity" as input, so build 2 mappings
    //   - Long (entity id) -> Entity
    //   - Entity -> Long (entity id)
    val embeddingIdToEntity: Map[Long, Entity] = embeddings.map {
      case (id, _) => id -> Entity(TextEntityValue(id.toString, Some(id.toString)), None)
    }
    val entityToEmbeddingId: Map[Entity, Long] = embeddingIdToEntity.map {
      case (id, e) => e -> id
    }

    // 3. Create the list of NetworkInput on which to run LouvainDriver
    val networkInputList = edges
      .map {
        case ((fromUserId: Long, toUserId: Long), weight: Double) =>
          new NetworkInput(embeddingIdToEntity(fromUserId), embeddingIdToEntity(toUserId), weight)
      }.toList.asJava

    val timeSinceClusteringAlgRunStart = Stopwatch.start()
    val networkDictionary = NetworkFactory.buildDictionary(networkInputList)
    val network = NetworkFactory.buildNetwork(networkInputList, networkDictionary)

    if (networkInputList.size() == 0) {
      // handle case if no edge at all (only one entity or all entities are too far apart)
      embeddings.keySet.map(e => Set(e))
    } else {
      // 4. Run clustering algorithm
      val clusteredIds = appliedResolutionFactor match {
        case Some(res) =>
          LouvainDriver.clusterAppliedResolutionFactor(network, networkDictionary, res)
        case None => LouvainDriver.cluster(network, networkDictionary)
      }

      recordStatCallback(
        StatClusteringAlgorithmRunTime,
        timeSinceClusteringAlgRunStart().inMilliseconds)

      // 5. Post-processing
      val atLeast2MembersClusters: Set[Set[Long]] = clusteredIds.asScala
        .groupBy(_._2)
        .mapValues(_.map { case (e, _) => entityToEmbeddingId(e) }.toSet)
        .values.toSet

      atLeast2MembersClusters ++ individualClusters.map { e => Set(e) }

    }
  }

  def clusterWithSilhouette[T](
    embeddings: Map[Long, T],
    similarityFn: (T, T) => Double,
    similarityFnForSil: (T, T) => Double,
    recordStatCallback: (String, Long) => Unit = (_, _) => ()
  ): (Set[Set[Long]], Set[Set[(Long, Double)]]) = {

    // 1. Build the graph on which to run Louvain:
    //   - Weigh edges by the similarity between the 2 embeddings,
    //   - Filter out edges with weight <= threshold.
    val timeSinceGraphBuildStart = Stopwatch.start()
    val edgesSimilarityMap = collection.mutable.Map[(Long, Long), Double]()

    val edges: Seq[((Long, Long), Double)] = embeddings.toSeq
      .combinations(2)
      .map { pair: Seq[(Long, T)] => // pair of 2
        val (user1, embedding1) = pair.head
        val (user2, embedding2) = pair(1)
        val similarity = similarityFn(embedding1, embedding2)
        val similarityForSil = similarityFnForSil(embedding1, embedding2)
        edgesSimilarityMap.put((user1, user2), similarityForSil)
        edgesSimilarityMap.put((user2, user1), similarityForSil)

        recordStatCallback(
          StatComputedSimilarityBeforeFilter,
          (similarity * 100).toLong // preserve up to two decimal places
        )

        ((user1, user2), similarity)
      }
      .filter(_._2 > similarityThreshold)
      .toSeq

    recordStatCallback(StatSimilarityGraphTotalBuildTime, timeSinceGraphBuildStart().inMilliseconds)

    // check if some entities do not have any incoming / outgoing edge
    // these are size-1 clusters (i.e. their own)
    val individualClusters: Set[Long] = embeddings.keySet -- edges.flatMap {
      case ((user1, user2), _) => Set(user1, user2)
    }.toSet

    // 2. LouvainDriver uses "Entity" as input, so build 2 mappings
    //   - Long (entity id) -> Entity
    //   - Entity -> Long (entity id)
    val embeddingIdToEntity: Map[Long, Entity] = embeddings.map {
      case (id, _) => id -> Entity(TextEntityValue(id.toString, Some(id.toString)), None)
    }
    val entityToEmbeddingId: Map[Entity, Long] = embeddingIdToEntity.map {
      case (id, e) => e -> id
    }

    // 3. Create the list of NetworkInput on which to run LouvainDriver
    val networkInputList = edges
      .map {
        case ((fromUserId: Long, toUserId: Long), weight: Double) =>
          new NetworkInput(embeddingIdToEntity(fromUserId), embeddingIdToEntity(toUserId), weight)
      }.toList.asJava

    val timeSinceClusteringAlgRunStart = Stopwatch.start()
    val networkDictionary = NetworkFactory.buildDictionary(networkInputList)
    val network = NetworkFactory.buildNetwork(networkInputList, networkDictionary)

    val clusters = if (networkInputList.size() == 0) {
      // handle case if no edge at all (only one entity or all entities are too far apart)
      embeddings.keySet.map(e => Set(e))
    } else {
      // 4. Run clustering algorithm
      val clusteredIds = appliedResolutionFactor match {
        case Some(res) =>
          LouvainDriver.clusterAppliedResolutionFactor(network, networkDictionary, res)
        case None => LouvainDriver.cluster(network, networkDictionary)
      }

      recordStatCallback(
        StatClusteringAlgorithmRunTime,
        timeSinceClusteringAlgRunStart().inMilliseconds)

      // 5. Post-processing
      val atLeast2MembersClusters: Set[Set[Long]] = clusteredIds.asScala
        .groupBy(_._2)
        .mapValues(_.map { case (e, _) => entityToEmbeddingId(e) }.toSet)
        .values.toSet

      atLeast2MembersClusters ++ individualClusters.map { e => Set(e) }

    }

    // Calculate silhouette metrics
    val contactIdWithSilhouette = clusters.map {
      case cluster =>
        val otherClusters = clusters - cluster

        cluster.map {
          case contactId =>
            if (otherClusters.isEmpty) {
              (contactId, 0.0)
            } else {
              val otherSameClusterContacts = cluster - contactId

              if (otherSameClusterContacts.isEmpty) {
                (contactId, 0.0)
              } else {
                // calculate similarity of given userId with all other users in the same cluster
                val a_i = otherSameClusterContacts.map {
                  case sameClusterContact =>
                    edgesSimilarityMap((contactId, sameClusterContact))
                }.sum / otherSameClusterContacts.size

                // calculate similarity of given userId to all other clusters, find the best nearest cluster
                val b_i = otherClusters.map {
                  case otherCluster =>
                    otherCluster.map {
                      case otherClusterContact =>
                        edgesSimilarityMap((contactId, otherClusterContact))
                    }.sum / otherCluster.size
                }.max

                // silhouette (value) of one userId i
                val s_i = (a_i - b_i) / max(a_i, b_i)
                (contactId, s_i)
              }
            }
        }
    }

    (clusters, contactIdWithSilhouette)
  }
}
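Editor's note: the silhouette value computed above follows the usual definition, except that a_i and b_i are mean similarities (larger is better) rather than mean distances, so a well-placed member still scores close to 1. A tiny worked example with hypothetical numbers:

import scala.math.max

// Member i: mean similarity to its own cluster a_i = 0.8,
// best mean similarity to any other cluster b_i = 0.2.
val a_i = 0.8
val b_i = 0.2
val s_i = (a_i - b_i) / max(a_i, b_i) // = 0.6 / 0.8 = 0.75, i.e. comfortably inside its cluster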
Binary file not shown.
@@ -1,21 +0,0 @@
package com.twitter.simclusters_v2.common.clustering

import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.thriftscala.NeighborWithWeights

class MaxFavScoreRepresentativeSelectionMethod[T] extends ClusterRepresentativeSelectionMethod[T] {

  /**
   * Identify the member with largest favScoreHalfLife100Days and return it.
   *
   * @param cluster    A set of NeighborWithWeights.
   * @param embeddings A map of producer ID -> embedding.
   */
  def selectClusterRepresentative(
    cluster: Set[NeighborWithWeights],
    embeddings: Map[UserId, T],
  ): UserId = {
    val key = cluster.maxBy { x: NeighborWithWeights => x.favScoreHalfLife100Days.getOrElse(0.0) }
    key.neighborId
  }
}
Binary file not shown.
@@ -1,28 +0,0 @@
package com.twitter.simclusters_v2.common.clustering

import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.thriftscala.NeighborWithWeights

class MedoidRepresentativeSelectionMethod[T](
  producerProducerSimilarityFn: (T, T) => Double)
    extends ClusterRepresentativeSelectionMethod[T] {

  /**
   * Identify the medoid of a cluster and return it.
   *
   * @param cluster    A set of NeighborWithWeights.
   * @param embeddings A map of producer ID -> embedding.
   */
  def selectClusterRepresentative(
    cluster: Set[NeighborWithWeights],
    embeddings: Map[UserId, T],
  ): UserId = {
    val key = cluster.maxBy {
      id1 => // maxBy because we use similarity, which gets larger as we get closer.
        val v = embeddings(id1.neighborId)
        cluster
          .map(id2 => producerProducerSimilarityFn(v, embeddings(id2.neighborId))).sum
    }
    key.neighborId
  }
}
Binary file not shown.
@@ -1,32 +0,0 @@
package com.twitter.simclusters_v2.common.clustering

import com.twitter.simclusters_v2.common.SimClustersEmbedding

/**
 * SimilarityFunctions provide commonly used similarity functions that this clustering library needs.
 */
object SimilarityFunctions {
  def simClustersCosineSimilarity: (SimClustersEmbedding, SimClustersEmbedding) => Double =
    (e1, e2) => e1.cosineSimilarity(e2)

  def simClustersMatchingLargestDimension: (
    SimClustersEmbedding,
    SimClustersEmbedding
  ) => Double = (e1, e2) => {
    val doesMatchLargestDimension: Boolean = e1
      .topClusterIds(1)
      .exists { id1 =>
        e2.topClusterIds(1).contains(id1)
      }

    if (doesMatchLargestDimension) 1.0
    else 0.0
  }

  def simClustersFuzzyJaccardSimilarity: (
    SimClustersEmbedding,
    SimClustersEmbedding
  ) => Double = (e1, e2) => {
    e1.fuzzyJaccardSimilarity(e2)
  }
}
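Editor's note: these functions are shaped to plug straight into the ClusteringMethod implementations above. A short wiring sketch (assumes a producerEmbeddings map is already loaded; the threshold value is arbitrary):

// Hypothetical wiring of the pieces defined in this package.
def clusterProducers(
  producerEmbeddings: Map[Long, SimClustersEmbedding]
): Set[Set[Long]] =
  new ConnectedComponentsClusteringMethod(similarityThreshold = 0.7)
    .cluster(producerEmbeddings, SimilarityFunctions.simClustersCosineSimilarity)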
@@ -1,12 +0,0 @@
# This package/target is separate from other simclusters common packages because the ml/api dep is
# large (350MB+). Having it as a separate target means that we can avoid bundling it with targets
# that do not need it.
scala_library(
    platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        "src/java/com/twitter/ml/api:api-base",
        "src/scala/com/twitter/ml/api/util",
        "src/scala/com/twitter/simclusters_v2/common",
    ],
)
BIN  src/scala/com/twitter/simclusters_v2/common/ml/BUILD.docx  Normal file
Binary file not shown.
Binary file not shown.
@@ -1,39 +0,0 @@
package com.twitter.simclusters_v2.common.ml

import com.twitter.ml.api.Feature.Continuous
import com.twitter.ml.api.Feature.SparseContinuous
import com.twitter.ml.api._
import com.twitter.ml.api.util.FDsl._
import com.twitter.simclusters_v2.common.SimClustersEmbedding

class SimClustersEmbeddingAdapter(embeddingFeature: SparseContinuous)
    extends IRecordOneToOneAdapter[SimClustersEmbedding] {

  override def getFeatureContext: FeatureContext = new FeatureContext(embeddingFeature)

  override def adaptToDataRecord(embedding: SimClustersEmbedding): DataRecord = {
    val embeddingMap = embedding.embedding.map {
      case (clusterId, score) =>
        (clusterId.toString, score)
    }.toMap

    new DataRecord().setFeatureValue(embeddingFeature, embeddingMap)
  }
}

class NormalizedSimClustersEmbeddingAdapter(
  embeddingFeature: SparseContinuous,
  normFeature: Continuous)
    extends IRecordOneToOneAdapter[SimClustersEmbedding] {

  override def getFeatureContext: FeatureContext = new FeatureContext(embeddingFeature, normFeature)

  override def adaptToDataRecord(embedding: SimClustersEmbedding): DataRecord = {

    val normalizedEmbedding = Map(
      embedding.sortedClusterIds.map(_.toString).zip(embedding.normalizedSortedScores): _*)

    val dataRecord = new DataRecord().setFeatureValue(embeddingFeature, normalizedEmbedding)
    dataRecord.setFeatureValue(normFeature, embedding.l2norm)
  }
}
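Editor's note: a minimal sketch of how an adapter like the one above is typically called, assuming a SparseContinuous feature definition already exists in the feature catalogue (the parameter name is hypothetical):

// Hypothetical call site: convert one embedding into a DataRecord for model scoring.
def toDataRecord(
  simClustersFeature: SparseContinuous, // assumed to be defined elsewhere in the feature config
  embedding: SimClustersEmbedding
): DataRecord =
  new SimClustersEmbeddingAdapter(simClustersFeature).adaptToDataRecord(embedding)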
BIN  src/scala/com/twitter/simclusters_v2/common/package.docx  Normal file
Binary file not shown.
@@ -1,17 +0,0 @@
package com.twitter.simclusters_v2

package object common {

  type TweetId = Long
  type UserId = Long
  type ClusterId = Int
  type SemanticCoreEntityId = Long // Use TopicId if it's a Topic related project.
  type UTTEntityId = Long
  type Timestamp = Long
  type Language = String
  type Country = String
  type LocaleEntity = (Long, Language)
  type TopicId = Long
  type GroupId = Long
  type SpaceId = String
}
Binary file not shown.
@@ -1,164 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources

import com.twitter.bijection.scrooge.BinaryScalaCodec
import com.twitter.bijection.scrooge.CompactScalaCodec
import com.twitter.bijection.Bufferable
import com.twitter.bijection.Injection
import com.twitter.hermit.candidate.thriftscala.Candidates
import com.twitter.scalding.DateRange
import com.twitter.scalding.commons.source.VersionedKeyValSource
import com.twitter.scalding_internal.source.lzo_scrooge.DailySuffixMostRecentLzoScrooge
import com.twitter.scalding_internal.source.lzo_scrooge.FixedPathLzoScrooge
import com.twitter.scalding_internal.source.lzo_scrooge.HourlySuffixMostRecentLzoScrooge
import com.twitter.simclusters_v2.thriftscala._

case class EdgeWithDecayedWtsFixedPathSource(path: String)
    extends FixedPathLzoScrooge[EdgeWithDecayedWeights](path, EdgeWithDecayedWeights)

case class UserAndNeighborsFixedPathSource(path: String)
    extends FixedPathLzoScrooge[UserAndNeighbors](path, UserAndNeighbors)

case class NormsAndCountsFixedPathSource(path: String)
    extends FixedPathLzoScrooge[NormsAndCounts](path, NormsAndCounts)

case class UserToInterestedInClustersFixedPathSource(path: String)
    extends FixedPathLzoScrooge[UserToInterestedInClusters](path, UserToInterestedInClusters)

case class TimelineDataExtractorFixedPathSource(path: String)
    extends FixedPathLzoScrooge[ReferenceTweets](path, ReferenceTweets)

case class TweetClusterScoresHourlySuffixSource(path: String, override val dateRange: DateRange)
    extends HourlySuffixMostRecentLzoScrooge[TweetAndClusterScores](path, dateRange)

case class TweetTopKClustersHourlySuffixSource(path: String, override val dateRange: DateRange)
    extends HourlySuffixMostRecentLzoScrooge[TweetTopKClustersWithScores](
      path,
      dateRange
    )

case class ClusterTopKTweetsHourlySuffixSource(path: String, override val dateRange: DateRange)
    extends HourlySuffixMostRecentLzoScrooge[ClusterTopKTweetsWithScores](
      path,
      dateRange
    )

case class TweetSimilarityUnhydratedPairsSource(path: String, override val dateRange: DateRange)
    extends DailySuffixMostRecentLzoScrooge[LabelledTweetPairs](
      path,
      dateRange
    )

case class WTFCandidatesSource(path: String)
    extends FixedPathLzoScrooge[Candidates](path, Candidates)

case class EmbeddingsLiteSource(path: String)
    extends FixedPathLzoScrooge[EmbeddingsLite](path, EmbeddingsLite)

object AdhocKeyValSources {
  def interestedInSource(path: String): VersionedKeyValSource[Long, ClustersUserIsInterestedIn] = {
    implicit val keyInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian
    implicit val valInject: Injection[ClustersUserIsInterestedIn, Array[Byte]] =
      CompactScalaCodec(ClustersUserIsInterestedIn)
    VersionedKeyValSource[Long, ClustersUserIsInterestedIn](path)
  }

  def clusterDetailsSource(path: String): VersionedKeyValSource[(String, Int), ClusterDetails] = {
    implicit val keyInject: Injection[(String, Int), Array[Byte]] =
      Bufferable.injectionOf[(String, Int)]
    implicit val valInject: Injection[ClusterDetails, Array[Byte]] =
      CompactScalaCodec(ClusterDetails)
    VersionedKeyValSource[(String, Int), ClusterDetails](path)
  }

  def bipartiteQualitySource(
    path: String
  ): VersionedKeyValSource[(String, Int), BipartiteClusterQuality] = {
    implicit val keyInject: Injection[(String, Int), Array[Byte]] =
      Bufferable.injectionOf[(String, Int)]
    implicit val valInject: Injection[BipartiteClusterQuality, Array[Byte]] =
      CompactScalaCodec(BipartiteClusterQuality)
    VersionedKeyValSource[(String, Int), BipartiteClusterQuality](path)
  }

  def entityToClustersSource(
    path: String
  ): VersionedKeyValSource[SimClustersEmbeddingId, SimClustersEmbedding] = {
    implicit val keyInject: Injection[SimClustersEmbeddingId, Array[Byte]] =
      BinaryScalaCodec(SimClustersEmbeddingId)
    implicit val valInject: Injection[SimClustersEmbedding, Array[Byte]] =
      BinaryScalaCodec(SimClustersEmbedding)
    VersionedKeyValSource[SimClustersEmbeddingId, SimClustersEmbedding](path)
  }

  def clusterToEntitiesSource(
    path: String
  ): VersionedKeyValSource[SimClustersEmbeddingId, InternalIdEmbedding] = {
    implicit val keyInject: Injection[SimClustersEmbeddingId, Array[Byte]] = BinaryScalaCodec(
      SimClustersEmbeddingId)
    implicit val valInject: Injection[InternalIdEmbedding, Array[Byte]] =
      BinaryScalaCodec(InternalIdEmbedding)
    VersionedKeyValSource[SimClustersEmbeddingId, InternalIdEmbedding](path)
  }

  // For storing producer-simclusters embeddings
  def topProducerToClusterEmbeddingsSource(
    path: String
  ): VersionedKeyValSource[Long, TopSimClustersWithScore] = {
    implicit val keyInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian
    implicit val valInject: Injection[TopSimClustersWithScore, Array[Byte]] =
      CompactScalaCodec(TopSimClustersWithScore)
    VersionedKeyValSource[Long, TopSimClustersWithScore](path)
  }

  // For storing producer-simclusters embeddings
  def topClusterEmbeddingsToProducerSource(
    path: String
  ): VersionedKeyValSource[PersistedFullClusterId, TopProducersWithScore] = {
    implicit val keyInject: Injection[PersistedFullClusterId, Array[Byte]] =
      CompactScalaCodec(PersistedFullClusterId)
    implicit val valInject: Injection[TopProducersWithScore, Array[Byte]] =
      CompactScalaCodec(TopProducersWithScore)
    VersionedKeyValSource[PersistedFullClusterId, TopProducersWithScore](path)
  }

  def userToInferredEntitiesSource(
    path: String
  ): VersionedKeyValSource[Long, SimClustersInferredEntities] = {
    implicit val keyInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian
    implicit val valInject: Injection[SimClustersInferredEntities, Array[Byte]] =
      CompactScalaCodec(SimClustersInferredEntities)
    VersionedKeyValSource[Long, SimClustersInferredEntities](path)
  }

  def knownForAdhocSource(path: String): VersionedKeyValSource[Long, ClustersUserIsKnownFor] = {
    implicit val keyInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian
    implicit val valInject: Injection[ClustersUserIsKnownFor, Array[Byte]] =
      CompactScalaCodec(ClustersUserIsKnownFor)
    VersionedKeyValSource[Long, ClustersUserIsKnownFor](path)
  }

  def knownForSBFResultsDevelSource(
    path: String
  ): VersionedKeyValSource[Long, Array[(Int, Float)]] = {
    implicit val keyInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian
    implicit val valInject: Injection[Array[(Int, Float)], Array[Byte]] =
      Bufferable.injectionOf[Array[(Int, Float)]]
    VersionedKeyValSource[Long, Array[(Int, Float)]](path)
  }

  // injection to store adjlist in the mapped indices space for users
  def intermediateSBFResultsDevelSource(
    path: String
  ): VersionedKeyValSource[Int, List[(Int, Float)]] = {
    implicit val keyInject: Injection[Int, Array[Byte]] = Injection.int2BigEndian
    implicit val valInject: Injection[List[(Int, Float)], Array[Byte]] =
      Bufferable.injectionOf[List[(Int, Float)]]
    VersionedKeyValSource[Int, List[(Int, Float)]](path)
  }

  def mappedIndicesDevelSource(path: String): VersionedKeyValSource[Int, Long] = {
    implicit val keyInject: Injection[Int, Array[Byte]] = Injection.int2BigEndian
    implicit val valInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian
    VersionedKeyValSource[Int, Long](path)
  }
}
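Editor's note: every helper in AdhocKeyValSources follows the same recipe — pick key and value Injections, put them in implicit scope, then build a VersionedKeyValSource. A hedged sketch of an extra helper in that style (not part of the original file):

// Hypothetical addition following the existing pattern: an adhoc key-val source keyed by
// user id with UserToInterestedInClusters values.
def userToInterestedInClustersAdhocSource(
  path: String
): VersionedKeyValSource[Long, UserToInterestedInClusters] = {
  implicit val keyInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian
  implicit val valInject: Injection[UserToInterestedInClusters, Array[Byte]] =
    CompactScalaCodec(UserToInterestedInClusters)
  VersionedKeyValSource[Long, UserToInterestedInClusters](path)
}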
File diff suppressed because it is too large.
BIN  src/scala/com/twitter/simclusters_v2/hdfs_sources/BUILD.docx  Normal file
Binary file not shown.
BIN  src/scala/com/twitter/simclusters_v2/hdfs_sources/DataPaths.docx  Normal file
Binary file not shown.
@@ -1,49 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources

object DataPaths {

  val InterestedIn2020Path =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_interested_in_20M_145K_2020"

  val InterestedIn2020ThriftPath =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_interested_in_20M_145K_2020_thrift"

  val InterestedInLite2020Path =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_interested_in_lite_20M_145K_2020"

  val InterestedInLite2020ThriftPath =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_interested_in_lite_20M_145K_2020_thrift"

  val KnownFor2020Path =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_known_for_20M_145K_2020"

  // keep this inside /user/cassowary/manhattan_sequence_files/ to use the latest-3 retention policy
  val KnownFor2020ThriftDatasetPath =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_known_for_20M_145K_2020_thrift"

  val OfflineClusterTopMediaTweets2020DatasetPath =
    "/user/cassowary/manhattan_sequence_files/cluster_top_media_tweets_20M_145K_2020"
}

/**
 * These should only be accessed from the simclusters_v2 data pipeline for intermediate data; they
 * are not opt-out compliant and shouldn't be exposed externally.
 */
object InternalDataPaths {
  // Internal versions, not to be read or written outside of simclusters_v2

  private[simclusters_v2] val RawInterestedIn2020Path =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_raw_interested_in_20M_145K_2020"

  private[simclusters_v2] val RawInterestedInLite2020Path =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_raw_interested_in_lite_20M_145K_2020"

  private[simclusters_v2] val RawKnownForDec11Path =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_raw_known_for_20M_145K_dec11"

  private[simclusters_v2] val RawKnownForUpdatedPath =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_raw_known_for_20M_145K_updated"

  private[simclusters_v2] val RawKnownFor2020Path =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_raw_known_for_20M_145K_2020"
}
Binary file not shown.
@@ -1,39 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources

import com.twitter.scalding.DateOps
import com.twitter.scalding.DateRange
import com.twitter.scalding.Days
import com.twitter.scalding.TypedPipe
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation
import com.twitter.scalding_internal.dalv2.remote_access.ProcAtla
import com.twitter.simclusters_v2.thriftscala.NormsAndCounts
import com.twitter.simclusters_v2.thriftscala.UserAndNeighbors
import java.util.TimeZone

object DataSources {

  /**
   * Reads production normalized graph data from atla-proc
   */
  def userUserNormalizedGraphSource(implicit dateRange: DateRange): TypedPipe[UserAndNeighbors] = {
    DAL
      .readMostRecentSnapshotNoOlderThan(UserUserNormalizedGraphScalaDataset, Days(14)(DateOps.UTC))
      .withRemoteReadPolicy(ExplicitLocation(ProcAtla))
      .toTypedPipe
  }

  /**
   * Reads production user norms and counts data from atla-proc
   */
  def userNormsAndCounts(
    implicit dateRange: DateRange,
    timeZone: TimeZone
  ): TypedPipe[NormsAndCounts] = {
    DAL
      .readMostRecentSnapshot(ProducerNormsAndCountsScalaDataset, dateRange.prepend(Days(14)))
      .withRemoteReadPolicy(ExplicitLocation(ProcAtla))
      .toTypedPipe
  }

}
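Editor's note: a hedged sketch of how such a source is typically consumed from a Scalding job (the surrounding job and the UserAndNeighbors field names, userId and neighbors, are assumptions based on the Thrift schema, not confirmed by this diff):

// Hypothetical call site inside a Scalding job where an implicit DateRange is in scope.
def neighborCounts(implicit dateRange: DateRange): TypedPipe[(Long, Int)] =
  DataSources.userUserNormalizedGraphSource
    .map(userAndNeighbors => (userAndNeighbors.userId, userAndNeighbors.neighbors.size))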
Binary file not shown.
@@ -1,222 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources

import com.twitter.dal.client.dataset.KeyValDALDataset
import com.twitter.scalding.DateRange
import com.twitter.scalding.typed.TypedPipe
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossDC
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.simclusters_v2.thriftscala._
import com.twitter.wtf.entity_real_graph.thriftscala.EntityType
import com.twitter.simclusters_v2.common.ClusterId
import com.twitter.simclusters_v2.common.ModelVersions

object EntityEmbeddingsSources {

  final val SemanticCoreSimClustersEmbeddingsDec11Dataset =
    SemanticCoreSimclustersEmbeddingsScalaDataset

  final val SemanticCoreSimClustersEmbeddingsUpdatedDataset =
    SemanticCoreSimclustersEmbeddingsUpdatedScalaDataset

  final val SemanticCoreSimClustersEmbeddings2020Dataset =
    SemanticCoreSimclustersEmbeddings2020ScalaDataset

  final val SemanticCorePerLanguageSimClustersEmbeddingsDataset =
    SemanticCorePerLanguageSimclustersEmbeddingsScalaDataset

  final val LogFavSemanticCorePerLanguageSimClustersEmbeddingsDataset =
    LogFavSemanticCorePerLanguageSimclustersEmbeddingsScalaDataset

  final val HashtagSimClustersEmbeddingsUpdatedDataset =
    HashtagSimclustersEmbeddingsUpdatedScalaDataset

  final val ReverseIndexSemanticCoreSimClustersEmbeddingsDec11Dataset =
    ReverseIndexSemanticCoreSimclustersEmbeddingsScalaDataset

  final val ReverseIndexSemanticCoreSimClustersEmbeddingsUpdatedDataset =
    ReverseIndexSemanticCoreSimclustersEmbeddingsUpdatedScalaDataset

  final val ReverseIndexSemanticCoreSimClustersEmbeddings2020Dataset =
    ReverseIndexSemanticCoreSimclustersEmbeddings2020ScalaDataset

  final val ReverseIndexSemanticCorePerLanguageSimClustersEmbeddingsDataset =
    ReverseIndexSemanticCorePerLanguageSimclustersEmbeddingsScalaDataset

  final val LogFavReverseIndexSemanticCorePerLanguageSimClustersEmbeddingsDataset =
    LogFavReverseIndexSemanticCorePerLanguageSimclustersEmbeddingsScalaDataset

  final val ReverseIndexHashtagSimClustersEmbeddingsUpdatedDataset =
    ReverseIndexHashtagSimclustersEmbeddingsUpdatedScalaDataset

  // Fav-based TFG topic embeddings built from user device languages
  // Keyed by SimClustersEmbeddingId with InternalId.TopicId ((topic, language) pair, with country = None)
  final val FavTfgTopicEmbeddingsDataset = FavTfgTopicEmbeddingsScalaDataset

  final val FavTfgTopicEmbeddingsParquetDataset = FavTfgTopicEmbeddingsParquetScalaDataset

  final val FavTfgTopicEmbeddings2020Dataset = FavTfgTopicEmbeddings2020ScalaDataset

  final val FavTfgTopicEmbeddings2020ParquetDataset = FavTfgTopicEmbeddings2020ParquetScalaDataset

  // Logfav-based TFG topic embeddings built from user device languages
  // Keyed by SimClustersEmbeddingId with InternalId.LocaleEntityId ((topic, language) pair)
  final val LogFavTfgTopicEmbeddingsDataset = LogFavTfgTopicEmbeddingsScalaDataset

  final val LogFavTfgTopicEmbeddingsParquetDataset = LogFavTfgTopicEmbeddingsParquetScalaDataset

  // Fav-based TFG topic embeddings built from inferred user consumed languages
  // Keyed by SimClustersEmbeddingId with InternalId.TopicId ((topic, country, language) tuple)
  final val FavInferredLanguageTfgTopicEmbeddingsDataset =
    FavInferredLanguageTfgTopicEmbeddingsScalaDataset

  private val validSemanticCoreEmbeddingTypes = Seq(
    EmbeddingType.FavBasedSematicCoreEntity,
    EmbeddingType.FollowBasedSematicCoreEntity
  )

  /**
   * Given a fav/follow/etc embedding type and a ModelVersion, retrieve the corresponding dataset to
   * (SemanticCore entityId -> List(clusterId)) from a certain dateRange.
   */
  def getSemanticCoreEntityEmbeddingsSource(
    embeddingType: EmbeddingType,
    modelVersion: String,
    dateRange: DateRange
  ): TypedPipe[(Long, SimClustersEmbedding)] = {
    val dataSet = modelVersion match {
      case ModelVersions.Model20M145KDec11 => SemanticCoreSimClustersEmbeddingsDec11Dataset
      case ModelVersions.Model20M145KUpdated => SemanticCoreSimClustersEmbeddingsUpdatedDataset
      case _ => throw new IllegalArgumentException(s"ModelVersion $modelVersion is not supported")
    }
    assert(validSemanticCoreEmbeddingTypes.contains(embeddingType))
    entityEmbeddingsSource(dataSet, embeddingType, dateRange)
  }

  /**
   * Given a fav/follow/etc embedding type and a ModelVersion, retrieve the corresponding dataset to
   * (clusterId -> List(SemanticCore entityId)) from a certain dateRange.
   */
  def getReverseIndexedSemanticCoreEntityEmbeddingsSource(
    embeddingType: EmbeddingType,
    modelVersion: String,
    dateRange: DateRange
  ): TypedPipe[(ClusterId, Seq[SemanticCoreEntityWithScore])] = {
    val dataSet = modelVersion match {
      case ModelVersions.Model20M145KDec11 =>
        ReverseIndexSemanticCoreSimClustersEmbeddingsDec11Dataset
      case ModelVersions.Model20M145KUpdated =>
        ReverseIndexSemanticCoreSimClustersEmbeddingsUpdatedDataset
      case ModelVersions.Model20M145K2020 =>
        ReverseIndexSemanticCoreSimClustersEmbeddings2020Dataset
      case _ => throw new IllegalArgumentException(s"ModelVersion $modelVersion is not supported")
    }

    assert(validSemanticCoreEmbeddingTypes.contains(embeddingType))
    reverseIndexedEntityEmbeddingsSource(dataSet, embeddingType, dateRange)
  }

  // Return the raw DAL dataset reference. Use this if you're writing to DAL.
  def getEntityEmbeddingsDataset(
    entityType: EntityType,
    modelVersion: String,
    isEmbeddingsPerLocale: Boolean = false
  ): KeyValDALDataset[KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]] = {
    (entityType, modelVersion) match {
      case (EntityType.SemanticCore, ModelVersions.Model20M145KDec11) =>
        SemanticCoreSimClustersEmbeddingsDec11Dataset
      case (EntityType.SemanticCore, ModelVersions.Model20M145KUpdated) =>
        if (isEmbeddingsPerLocale) {
          SemanticCorePerLanguageSimClustersEmbeddingsDataset
        } else {
          SemanticCoreSimClustersEmbeddingsUpdatedDataset
        }
      case (EntityType.SemanticCore, ModelVersions.Model20M145K2020) =>
        SemanticCoreSimClustersEmbeddings2020Dataset
      case (EntityType.Hashtag, ModelVersions.Model20M145KUpdated) =>
        HashtagSimClustersEmbeddingsUpdatedDataset
      case (entityType, modelVersion) =>
        throw new IllegalArgumentException(
          s"(Entity Type, ModelVersion) ($entityType, $modelVersion) not supported.")
    }
  }

  // Return the raw DAL dataset reference. Use this if you're writing to DAL.
  def getReverseIndexedEntityEmbeddingsDataset(
    entityType: EntityType,
    modelVersion: String,
    isEmbeddingsPerLocale: Boolean = false
  ): KeyValDALDataset[KeyVal[SimClustersEmbeddingId, InternalIdEmbedding]] = {
    (entityType, modelVersion) match {
      case (EntityType.SemanticCore, ModelVersions.Model20M145KDec11) =>
        ReverseIndexSemanticCoreSimClustersEmbeddingsDec11Dataset
      case (EntityType.SemanticCore, ModelVersions.Model20M145KUpdated) =>
        if (isEmbeddingsPerLocale) {
          ReverseIndexSemanticCorePerLanguageSimClustersEmbeddingsDataset
        } else {
          ReverseIndexSemanticCoreSimClustersEmbeddingsUpdatedDataset
        }
      case (EntityType.SemanticCore, ModelVersions.Model20M145K2020) =>
        ReverseIndexSemanticCoreSimClustersEmbeddings2020Dataset
      case (EntityType.Hashtag, ModelVersions.Model20M145KUpdated) =>
        ReverseIndexHashtagSimClustersEmbeddingsUpdatedDataset
      case (entityType, modelVersion) =>
        throw new IllegalArgumentException(
          s"(Entity Type, ModelVersion) ($entityType, $modelVersion) not supported.")
    }
  }

  private def entityEmbeddingsSource(
    dataset: KeyValDALDataset[KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]],
    embeddingType: EmbeddingType,
    dateRange: DateRange
  ): TypedPipe[(Long, SimClustersEmbedding)] = {
    val pipe = DAL
      .readMostRecentSnapshot(dataset, dateRange)
      .withRemoteReadPolicy(AllowCrossDC)
      .toTypedPipe
    filterEntityEmbeddingsByType(pipe, embeddingType)
  }

  private def reverseIndexedEntityEmbeddingsSource(
    dataset: KeyValDALDataset[KeyVal[SimClustersEmbeddingId, InternalIdEmbedding]],
    embeddingType: EmbeddingType,
    dateRange: DateRange
  ): TypedPipe[(ClusterId, Seq[SemanticCoreEntityWithScore])] = {
    val pipe = DAL
      .readMostRecentSnapshot(dataset, dateRange)
      .withRemoteReadPolicy(AllowCrossDC)
      .toTypedPipe
    filterReverseIndexedEntityEmbeddingsByType(pipe, embeddingType)
  }

  private[hdfs_sources] def filterEntityEmbeddingsByType(
    pipe: TypedPipe[KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]],
    embeddingType: EmbeddingType
  ): TypedPipe[(Long, SimClustersEmbedding)] = {
    pipe.collect {
      case KeyVal(
            SimClustersEmbeddingId(_embeddingType, _, InternalId.EntityId(entityId)),
            embedding
          ) if _embeddingType == embeddingType =>
        (entityId, embedding)
    }
  }

  private[hdfs_sources] def filterReverseIndexedEntityEmbeddingsByType(
    pipe: TypedPipe[KeyVal[SimClustersEmbeddingId, InternalIdEmbedding]],
    embeddingType: EmbeddingType
  ): TypedPipe[(ClusterId, Seq[SemanticCoreEntityWithScore])] = {
    pipe.collect {
      case KeyVal(
            SimClustersEmbeddingId(_embeddingType, _, InternalId.ClusterId(clusterId)),
            embedding
          ) if _embeddingType == embeddingType =>
        val entitiesWithScores = embedding.embedding.collect {
          case InternalIdWithScore(InternalId.EntityId(entityId), score) =>
            SemanticCoreEntityWithScore(entityId, score)
        }
        (clusterId, entitiesWithScores)
    }
  }
}
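Editor's note: a hedged usage sketch for the read path above (the embedding type and model version chosen here are arbitrary examples; the date range is assumed to come from the calling Scalding job):

// Hypothetical call site: fav-based SemanticCore entity embeddings for the updated model version.
def favEntityEmbeddings(dateRange: DateRange): TypedPipe[(Long, SimClustersEmbedding)] =
  EntityEmbeddingsSources.getSemanticCoreEntityEmbeddingsSource(
    embeddingType = EmbeddingType.FavBasedSematicCoreEntity,
    modelVersion = ModelVersions.Model20M145KUpdated,
    dateRange = dateRange
  )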
Binary file not shown.
@ -1,178 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources

import com.twitter.dal.client.dataset.KeyValDALDataset
import com.twitter.scalding.{DateOps, DateRange, Days, TypedPipe}
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.remote_access.{ExplicitLocation, ProcAtla}
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.thriftscala.ModelVersion
import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn
import java.util.TimeZone

object InterestedInSources {

  private val ModelVersionInterestedInDatasetMap: Map[ModelVersion, KeyValDALDataset[
    KeyVal[UserId, ClustersUserIsInterestedIn]
  ]] = Map(
    ModelVersion.Model20m145kDec11 -> SimclustersV2InterestedInScalaDataset,
    ModelVersion.Model20m145kUpdated -> SimclustersV2InterestedIn20M145KUpdatedScalaDataset,
    ModelVersion.Model20m145k2020 -> SimclustersV2InterestedIn20M145K2020ScalaDataset
  )

  /**
   * Internal version, not PDP compliant, not to be used outside simclusters_v2
   * Reads 20M145KDec11 production InterestedIn data from atla-proc, with a 14-day extended window
   */
  private[simclusters_v2] def simClustersRawInterestedInDec11Source(
    dateRange: DateRange,
    timeZone: TimeZone
  ): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
    DAL
      .readMostRecentSnapshot(
        SimclustersV2RawInterestedIn20M145KDec11ScalaDataset,
        dateRange.prepend(Days(14)(timeZone))
      )
      .withRemoteReadPolicy(ExplicitLocation(ProcAtla))
      .toTypedPipe
      .map {
        case KeyVal(userId, clustersUserIsInterestedIn) =>
          (userId, clustersUserIsInterestedIn)
      }
  }

  /**
   * Internal version, not PDP compliant, not to be used outside simclusters_v2
   * Reads 20M145KUpdated InterestedIn data from atla-proc, with a 14-day extended window
   */
  private[simclusters_v2] def simClustersRawInterestedInUpdatedSource(
    dateRange: DateRange,
    timeZone: TimeZone
  ): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
    DAL
      .readMostRecentSnapshot(
        SimclustersV2RawInterestedIn20M145KUpdatedScalaDataset,
        dateRange.prepend(Days(14)(timeZone))
      )
      .withRemoteReadPolicy(ExplicitLocation(ProcAtla))
      .toTypedPipe.map {
        case KeyVal(userId, clustersUserIsInterestedIn) =>
          (userId, clustersUserIsInterestedIn)
      }
  }

  /**
   * Internal version, not PDP compliant, not to be used outside simclusters_v2
   * Reads 20M145K2020 InterestedIn data from atla-proc, with a 14-day extended window
   */
  private[simclusters_v2] def simClustersRawInterestedIn2020Source(
    dateRange: DateRange,
    timeZone: TimeZone
  ): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
    DAL
      .readMostRecentSnapshot(
        SimclustersV2RawInterestedIn20M145K2020ScalaDataset,
        dateRange.prepend(Days(14)(timeZone))
      )
      .withRemoteReadPolicy(ExplicitLocation(ProcAtla))
      .toTypedPipe.map {
        case KeyVal(userId, clustersUserIsInterestedIn) =>
          (userId, clustersUserIsInterestedIn)
      }
  }

  private[simclusters_v2] def simClustersRawInterestedInLite2020Source(
    dateRange: DateRange,
    timeZone: TimeZone
  ): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
    DAL
      .readMostRecentSnapshot(
        SimclustersV2RawInterestedInLite20M145K2020ScalaDataset,
        dateRange.extend(Days(14)(timeZone)))
      .withRemoteReadPolicy(ExplicitLocation(ProcAtla))
      .toTypedPipe.map {
        case KeyVal(userId, clustersUserIsInterestedIn) =>
          (userId, clustersUserIsInterestedIn)
      }
  }

  /**
   * Reads 20M145KDec11 production InterestedIn data from atla-proc, with a 14-day extended window
   */
  def simClustersInterestedInDec11Source(
    dateRange: DateRange,
    timeZone: TimeZone
  ): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
    DAL
      .readMostRecentSnapshot(
        SimclustersV2InterestedInScalaDataset,
        dateRange.prepend(Days(14)(timeZone)))
      .withRemoteReadPolicy(ExplicitLocation(ProcAtla))
      .toTypedPipe.map {
        case KeyVal(userId, clustersUserIsInterestedIn) =>
          (userId, clustersUserIsInterestedIn)
      }
  }

  /**
   * Reads 20M145KUpdated InterestedIn data from atla-proc, with a 14-day extended window
   */
  def simClustersInterestedInUpdatedSource(
    dateRange: DateRange,
    timeZone: TimeZone
  ): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
    DAL
      .readMostRecentSnapshot(
        SimclustersV2InterestedIn20M145KUpdatedScalaDataset,
        dateRange.prepend(Days(14)(timeZone))
      )
      .withRemoteReadPolicy(ExplicitLocation(ProcAtla))
      .toTypedPipe.map {
        case KeyVal(userId, clustersUserIsInterestedIn) =>
          (userId, clustersUserIsInterestedIn)
      }
  }

  /**
   * Reads 20M145K2020 InterestedIn data from atla-proc, with a 14-day extended window
   */
  def simClustersInterestedIn2020Source(
    dateRange: DateRange,
    timeZone: TimeZone
  ): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
    DAL
      .readMostRecentSnapshot(
        SimclustersV2InterestedIn20M145K2020ScalaDataset,
        dateRange.prepend(Days(14)(timeZone))
      )
      .withRemoteReadPolicy(ExplicitLocation(ProcAtla))
      .toTypedPipe.map {
        case KeyVal(userId, clustersUserIsInterestedIn) =>
          (userId, clustersUserIsInterestedIn)
      }
  }

  /**
   * Reads InterestedIn data based on ModelVersion from atla-proc, with a 14-day extended window
   */
  def simClustersInterestedInSource(
    modelVersion: ModelVersion,
    dateRange: DateRange,
    timeZone: TimeZone
  ): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
    DAL
      .readMostRecentSnapshot(
        ModelVersionInterestedInDatasetMap(modelVersion),
        dateRange.prepend(Days(14)(timeZone))
      )
      .withRemoteReadPolicy(ExplicitLocation(ProcAtla))
      .toTypedPipe.map {
        case KeyVal(userId, clustersUserIsInterestedIn) =>
          (userId, clustersUserIsInterestedIn)
      }
  }

}
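All of the accessors above share one recipe: look up the dataset (directly or via ModelVersionInterestedInDatasetMap), read the most recent snapshot with a 14-day lookback, and unwrap each KeyVal into a (UserId, ClustersUserIsInterestedIn) tuple. As a rough sketch of how a downstream Scalding job might consume one of them, the adhoc app below counts distinct users with an InterestedIn embedding for the 2020 model; the app name, argument names, and output path are hypothetical, and only the call to simClustersInterestedInSource follows the file above.

package com.twitter.simclusters_v2.scalding.examples // hypothetical package

import com.twitter.scalding.{DateParser, DateRange, Execution, TypedTsv}
import com.twitter.scalding_internal.job.TwitterExecutionApp
import com.twitter.simclusters_v2.hdfs_sources.InterestedInSources
import com.twitter.simclusters_v2.thriftscala.ModelVersion
import java.util.TimeZone

object CountInterestedInUsersAdhoc extends TwitterExecutionApp {
  implicit val tz: TimeZone = TimeZone.getTimeZone("UTC")
  implicit val parser: DateParser = DateParser.default

  override def job: Execution[Unit] = Execution.getConfigMode.flatMap {
    case (config, _) =>
      val args = config.getArgs
      // Expects --date <start> <end> style arguments, as other adhoc jobs in this package do.
      implicit val dateRange: DateRange = DateRange.parse(args.list("date"))
      InterestedInSources
        .simClustersInterestedInSource(ModelVersion.Model20m145k2020, dateRange, tz)
        .keys // keep only the UserId, drop the ClustersUserIsInterestedIn payload
        .distinct
        .writeExecution(TypedTsv[Long](args("outputDir"))) // hypothetical output location
  }
}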
Binary file not shown.
@ -1,86 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources

import com.twitter.scalding.DateRange
import com.twitter.scalding.TypedPipe
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossClusterSameDC
import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation
import com.twitter.scalding_internal.dalv2.remote_access.Proc3Atla
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.simclusters_v2.thriftscala.EmbeddingType
import com.twitter.simclusters_v2.thriftscala.InternalId
import com.twitter.simclusters_v2.thriftscala.ModelVersion
import com.twitter.simclusters_v2.thriftscala.SimClustersEmbedding
import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId
import com.twitter.simclusters_v2.thriftscala.TopSimClustersWithScore

object ProducerEmbeddingSources {

  /**
   * Helper function to retrieve producer SimClusters embeddings with the legacy `TopSimClustersWithScore`
   * value type.
   */
  def producerEmbeddingSourceLegacy(
    embeddingType: EmbeddingType,
    modelVersion: ModelVersion
  )(
    implicit dateRange: DateRange
  ): TypedPipe[(Long, TopSimClustersWithScore)] = {
    val producerEmbeddingDataset = (embeddingType, modelVersion) match {
      case (EmbeddingType.ProducerFollowBasedSemanticCoreEntity, ModelVersion.Model20m145kDec11) =>
        ProducerTopKSimclusterEmbeddingsByFollowScoreScalaDataset
      case (EmbeddingType.ProducerFavBasedSemanticCoreEntity, ModelVersion.Model20m145kDec11) =>
        ProducerTopKSimclusterEmbeddingsByFavScoreScalaDataset
      case (
            EmbeddingType.ProducerFollowBasedSemanticCoreEntity,
            ModelVersion.Model20m145kUpdated) =>
        ProducerTopKSimclusterEmbeddingsByFollowScoreUpdatedScalaDataset
      case (EmbeddingType.ProducerFavBasedSemanticCoreEntity, ModelVersion.Model20m145kUpdated) =>
        ProducerTopKSimclusterEmbeddingsByFavScoreUpdatedScalaDataset
      case (_, _) =>
        throw new ClassNotFoundException(
          "Unsupported embedding type: " + embeddingType + " and model version: " + modelVersion)
    }

    DAL
      .readMostRecentSnapshot(producerEmbeddingDataset).withRemoteReadPolicy(
        AllowCrossClusterSameDC)
      .toTypedPipe.map {
        case KeyVal(producerId, topSimClustersWithScore) =>
          (producerId, topSimClustersWithScore)
      }
  }

  def producerEmbeddingSource(
    embeddingType: EmbeddingType,
    modelVersion: ModelVersion
  )(
    implicit dateRange: DateRange
  ): TypedPipe[(Long, SimClustersEmbedding)] = {
    val producerEmbeddingDataset = (embeddingType, modelVersion) match {
      case (EmbeddingType.AggregatableLogFavBasedProducer, ModelVersion.Model20m145k2020) =>
        AggregatableProducerSimclustersEmbeddingsByLogFavScore2020ScalaDataset
      case (EmbeddingType.AggregatableFollowBasedProducer, ModelVersion.Model20m145k2020) =>
        AggregatableProducerSimclustersEmbeddingsByFollowScore2020ScalaDataset
      case (EmbeddingType.RelaxedAggregatableLogFavBasedProducer, ModelVersion.Model20m145k2020) =>
        AggregatableProducerSimclustersEmbeddingsByLogFavScoreRelaxedFavEngagementThreshold2020ScalaDataset
      case (_, _) =>
        throw new ClassNotFoundException(
          "Unsupported embedding type: " + embeddingType + " and model version: " + modelVersion)
    }

    DAL
      .readMostRecentSnapshot(
        producerEmbeddingDataset
      )
      .withRemoteReadPolicy(ExplicitLocation(Proc3Atla))
      .toTypedPipe
      .map {
        case KeyVal(
              SimClustersEmbeddingId(_, _, InternalId.UserId(producerId: Long)),
              embedding: SimClustersEmbedding) =>
          (producerId, embedding)
      }
  }

}
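Both helpers take the embedding type and model version explicitly and expect the DateRange as an implicit. A minimal sketch of a caller follows; the wrapper object and the dates are illustrative only, and just the producerEmbeddingSource call itself mirrors the code above.

import com.twitter.scalding.{DateOps, DateParser, DateRange, RichDate, TypedPipe}
import com.twitter.simclusters_v2.hdfs_sources.ProducerEmbeddingSources
import com.twitter.simclusters_v2.thriftscala.{EmbeddingType, ModelVersion, SimClustersEmbedding}
import java.util.TimeZone

object ProducerEmbeddingSourceExample { // hypothetical caller
  implicit val tz: TimeZone = DateOps.UTC
  implicit val parser: DateParser = DateParser.default
  // Illustrative one-week window; a real job would derive this from its arguments.
  implicit val dateRange: DateRange = DateRange(RichDate("2023-01-01"), RichDate("2023-01-07"))

  val logFavBasedProducerEmbeddings: TypedPipe[(Long, SimClustersEmbedding)] =
    ProducerEmbeddingSources.producerEmbeddingSource(
      EmbeddingType.AggregatableLogFavBasedProducer,
      ModelVersion.Model20m145k2020)
}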
@ -1,13 +0,0 @@
scala_library(
    sources = ["*.scala"],
    platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        "src/scala/com/twitter/scalding_internal/multiformat/format",
        "src/scala/com/twitter/simclusters_v2/common",
        "src/thrift/com/twitter/hermit/candidate:hermit-candidate-scala",
        "src/thrift/com/twitter/ml/api:embedding-scala",
        "src/thrift/com/twitter/recos/entities:entities-thrift-scala",
        "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala",
    ],
)
Binary file not shown.
Binary file not shown.
@ -1,16 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections

import com.twitter.bijection.Bufferable
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{
  ScalaCompactThrift,
  genericInjection
}
import com.twitter.simclusters_v2.thriftscala.ClusterDetails

object ClusterDetailsInjection {
  val injection = KeyValInjection[(String, Int), ClusterDetails](
    genericInjection(Bufferable.injectionOf[(String, Int)]),
    ScalaCompactThrift(ClusterDetails)
  )
}
Binary file not shown.
@ -1,13 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections

import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift
import com.twitter.simclusters_v2.thriftscala.{TweetsWithScore, DayPartitionedClusterId}

object ClusterTopMediaTweetsInjection {

  val injection = KeyValInjection[DayPartitionedClusterId, TweetsWithScore](
    ScalaCompactThrift(DayPartitionedClusterId),
    ScalaCompactThrift(TweetsWithScore)
  )
}
Binary file not shown.
@ -1,14 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections

import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift
import com.twitter.simclusters_v2.thriftscala.TopKTweetsWithScores
import com.twitter.simclusters_v2.thriftscala.FullClusterId

object ClusterTopTweetsInjection {

  val clusterIdToTopKTweetsInjection = KeyValInjection[FullClusterId, TopKTweetsWithScores](
    ScalaCompactThrift(FullClusterId),
    ScalaCompactThrift(TopKTweetsWithScores)
  )
}
Binary file not shown.
@ -1,16 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections

import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaBinaryThrift
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.Long2BigEndian
import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.thriftscala._

object ClusteringInjections {

  final val OrderedClustersAndMembersInjection: KeyValInjection[
    UserId,
    OrderedClustersAndMembers
  ] =
    KeyValInjection(Long2BigEndian, ScalaBinaryThrift(OrderedClustersAndMembers))
}
Binary file not shown.
@ -1,47 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections

import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaBinaryThrift
import com.twitter.simclusters_v2.thriftscala._
import com.twitter.ml.api.thriftscala.Embedding
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.Long2BigEndian
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift

object EntityEmbeddingsInjections {

  final val EntitySimClustersEmbeddingInjection: KeyValInjection[
    SimClustersEmbeddingId,
    SimClustersEmbedding
  ] =
    KeyValInjection(
      ScalaBinaryThrift(SimClustersEmbeddingId),
      ScalaBinaryThrift(SimClustersEmbedding)
    )

  final val InternalIdEmbeddingInjection: KeyValInjection[
    SimClustersEmbeddingId,
    InternalIdEmbedding
  ] =
    KeyValInjection(
      ScalaBinaryThrift(SimClustersEmbeddingId),
      ScalaBinaryThrift(InternalIdEmbedding)
    )

  final val EntitySimClustersMultiEmbeddingInjection: KeyValInjection[
    SimClustersMultiEmbeddingId,
    SimClustersMultiEmbedding
  ] =
    KeyValInjection(
      ScalaBinaryThrift(SimClustersMultiEmbeddingId),
      ScalaBinaryThrift(SimClustersMultiEmbedding)
    )

  final val UserMbcgEmbeddingInjection: KeyValInjection[
    Long,
    Embedding
  ] =
    KeyValInjection[Long, Embedding](
      Long2BigEndian,
      ScalaCompactThrift(Embedding)
    )
}
Binary file not shown.
@ -1,27 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections

import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{
  Int2BigEndian,
  Long2BigEndian,
  ScalaCompactThrift
}
import com.twitter.simclusters_v2.thriftscala.SimClustersInferredEntities

object InferredEntitiesInjections {

  final val InferredEntityInjection: KeyValInjection[Long, SimClustersInferredEntities] =
    KeyValInjection(
      Long2BigEndian,
      ScalaCompactThrift(SimClustersInferredEntities)
    )

  final val InferredEntityKeyedByClusterInjection: KeyValInjection[
    Int,
    SimClustersInferredEntities
  ] =
    KeyValInjection(
      Int2BigEndian,
      ScalaCompactThrift(SimClustersInferredEntities)
    )
}
Binary file not shown.
@ -1,13 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections

import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.StringUtf8
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.Long2BigEndian
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift
import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn

object InterestedInInjection {
  val injection = KeyValInjection(Long2BigEndian, ScalaCompactThrift(ClustersUserIsInterestedIn))
  val languageInjection =
    KeyValInjection(StringUtf8, ScalaCompactThrift(ClustersUserIsInterestedIn))
}
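Each injection in this directory pairs a key codec with a Thrift value codec for one key-val dataset. The same pattern composes for tuple keys via Bufferable, as ClusterDetailsInjection above shows; below is a sketch for a hypothetical dataset keyed by (userId, language) pairs, reusing only codecs already used in these files.

import com.twitter.bijection.Bufferable
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{
  genericInjection,
  ScalaCompactThrift
}
import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn

// Hypothetical: an InterestedIn dataset keyed by (userId, language) rather than userId alone.
object UserLanguageInterestedInInjection {
  val injection = KeyValInjection[(Long, String), ClustersUserIsInterestedIn](
    genericInjection(Bufferable.injectionOf[(Long, String)]),
    ScalaCompactThrift(ClustersUserIsInterestedIn)
  )
}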
Binary file not shown.
@ -1,12 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections

import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{
  Long2BigEndian,
  ScalaCompactThrift
}
import com.twitter.simclusters_v2.thriftscala._

object KnownForInjection {
  val injection = KeyValInjection(Long2BigEndian, ScalaCompactThrift(ClustersUserIsKnownFor))
}
Binary file not shown.
@ -1,31 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections

import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift
import com.twitter.simclusters_v2.thriftscala.LeftNode
import com.twitter.simclusters_v2.thriftscala.NounWithFrequencyList
import com.twitter.simclusters_v2.thriftscala.RightNode
import com.twitter.simclusters_v2.thriftscala.RightNodeTypeStruct
import com.twitter.simclusters_v2.thriftscala.RightNodeWithEdgeWeightList
import com.twitter.simclusters_v2.thriftscala.SimilarRightNodes
import com.twitter.simclusters_v2.thriftscala.CandidateTweetsList
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.Long2BigEndian

object MultiTypeGraphInjections {
  final val truncatedMultiTypeGraphInjection =
    KeyValInjection(ScalaCompactThrift(LeftNode), ScalaCompactThrift(RightNodeWithEdgeWeightList))
  final val topKRightNounListInjection =
    KeyValInjection(
      ScalaCompactThrift(RightNodeTypeStruct),
      ScalaCompactThrift(NounWithFrequencyList))
  final val similarRightNodesInjection =
    KeyValInjection[RightNode, SimilarRightNodes](
      ScalaCompactThrift(RightNode),
      ScalaCompactThrift(SimilarRightNodes)
    )
  final val tweetRecommendationsInjection =
    KeyValInjection[Long, CandidateTweetsList](
      Long2BigEndian,
      ScalaCompactThrift(CandidateTweetsList)
    )
}
Binary file not shown.
@ -1,45 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections

import com.twitter.hermit.candidate.thriftscala.Candidates
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{
  Long2BigEndian,
  ScalaBinaryThrift,
  ScalaCompactThrift
}
import com.twitter.simclusters_v2.thriftscala.{
  PersistedFullClusterId,
  SimClustersEmbedding,
  SimClustersEmbeddingId,
  TopProducersWithScore,
  TopSimClustersWithScore
}

object ProducerEmbeddingsInjections {
  final val ProducerTopKSimClusterEmbeddingsInjection: KeyValInjection[
    Long,
    TopSimClustersWithScore
  ] =
    KeyValInjection(
      keyCodec = Long2BigEndian,
      valueCodec = ScalaCompactThrift(TopSimClustersWithScore))

  final val SimClusterEmbeddingTopKProducersInjection: KeyValInjection[
    PersistedFullClusterId,
    TopProducersWithScore
  ] =
    KeyValInjection(
      keyCodec = ScalaCompactThrift(PersistedFullClusterId),
      valueCodec = ScalaCompactThrift(TopProducersWithScore))

  final val SimilarUsersInjection: KeyValInjection[Long, Candidates] =
    KeyValInjection(keyCodec = Long2BigEndian, valueCodec = ScalaCompactThrift(Candidates))

  final val ProducerSimClustersEmbeddingInjection: KeyValInjection[
    SimClustersEmbeddingId,
    SimClustersEmbedding
  ] =
    KeyValInjection(
      keyCodec = ScalaBinaryThrift(SimClustersEmbeddingId),
      valueCodec = ScalaBinaryThrift(SimClustersEmbedding))
}
Binary file not shown.
@ -1,53 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections

import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{
  Long2BigEndian,
  ScalaCompactThrift,
  StringUtf8
}
import com.twitter.recos.entities.thriftscala.{
  SemanticCoreEntityScoreList,
  SemanticCoreEntityWithLocale,
  UserIdWithLocale,
  UserScoreList
}

object SemanticCoreEntitiesInjections {

  final val StringToSemanticCoreEntityScoreListInjection: KeyValInjection[
    String,
    SemanticCoreEntityScoreList
  ] =
    KeyValInjection(
      StringUtf8,
      ScalaCompactThrift(SemanticCoreEntityScoreList)
    )

  final val LongToSemanticCoreEntityScoreListInjection: KeyValInjection[
    Long,
    SemanticCoreEntityScoreList
  ] =
    KeyValInjection(
      Long2BigEndian,
      ScalaCompactThrift(SemanticCoreEntityScoreList)
    )

  final val UserWithLocaleToSemanticCoreEntityScoreListInjection: KeyValInjection[
    UserIdWithLocale,
    SemanticCoreEntityScoreList
  ] =
    KeyValInjection(
      ScalaCompactThrift(UserIdWithLocale),
      ScalaCompactThrift(SemanticCoreEntityScoreList)
    )

  final val SemanticCoreEntityWithLocaleToUsersScoreListInjection: KeyValInjection[
    SemanticCoreEntityWithLocale,
    UserScoreList
  ] =
    KeyValInjection(
      ScalaCompactThrift(SemanticCoreEntityWithLocale),
      ScalaCompactThrift(UserScoreList)
    )
}
Binary file not shown.
@ -1,12 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.injections

import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{
  Long2BigEndian,
  ScalaCompactThrift
}
import com.twitter.simclusters_v2.thriftscala.SingleSideUserScores

object SingleSideUserScoresInjection {
  val injection = KeyValInjection(Long2BigEndian, ScalaCompactThrift(SingleSideUserScores))
}
@ -1,60 +0,0 @@
scala_library(
    sources = ["*.scala"],
    platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        ":data_sources",
        "3rdparty/src/jvm/com/twitter/scalding:core",
        "src/scala/com/twitter/scalding_internal/dalv2",
        "src/scala/com/twitter/scalding_internal/multiformat/format",
        "src/scala/com/twitter/scalding_internal/source/lzo_scrooge",
        "src/scala/com/twitter/simclusters_v2/common",
        "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala",
        "src/thrift/com/twitter/wtf/entity_real_graph:entity_real_graph-thrift-scala",
    ],
)

scala_library(
    name = "data_sources",
    sources = [],
    description = "DAL datasets we wish to expose externally",
    platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        ":reverse_index_semantic_core_per_language_simclusters_embeddings_presto-scala",
        ":semantic_core_per_language_simclusters_embeddings_presto-scala",
        "src/scala/com/twitter/simclusters_v2/common",
    ],
)

create_datasets(
    base_name = "reverse_index_semantic_core_per_language_simclusters_embeddings_presto",
    java_schema = "com.twitter.simclusters_v2.thriftjava.InternalIdEmbeddingWithId",
    platform = "java8",
    role = "cassowary",
    scala_schema = "com.twitter.simclusters_v2.thriftscala.InternalIdEmbeddingWithId",
    segment_type = "snapshot",
    tags = ["bazel-compatible"],
    java_dependencies = [
        "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java",
    ],
    scala_dependencies = [
        "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala",
    ],
)

create_datasets(
    base_name = "semantic_core_per_language_simclusters_embeddings_presto",
    java_schema = "com.twitter.simclusters_v2.thriftjava.SimClustersEmbeddingWithId",
    platform = "java8",
    role = "cassowary",
    scala_schema = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingWithId",
    segment_type = "snapshot",
    tags = ["bazel-compatible"],
    java_dependencies = [
        "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java",
    ],
    scala_dependencies = [
        "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala",
    ],
)
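The two create_datasets targets above differ only in base_name and in the Java/Scala schema classes; registering another presto export of the same shape would follow the same recipe. As a sketch, with a hypothetical base name and the schema classes reused from above:

create_datasets(
    base_name = "fav_based_per_language_simclusters_embeddings_presto",  # hypothetical name
    java_schema = "com.twitter.simclusters_v2.thriftjava.SimClustersEmbeddingWithId",
    platform = "java8",
    role = "cassowary",
    scala_schema = "com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingWithId",
    segment_type = "snapshot",
    tags = ["bazel-compatible"],
    java_dependencies = [
        "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java",
    ],
    scala_dependencies = [
        "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala",
    ],
)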
Binary file not shown.
Binary file not shown.
@ -1,10 +0,0 @@
package com.twitter.simclusters_v2.hdfs_sources.presto_hdfs_sources

object EntityEmbeddingsPrestoSources {

  final val SemanticCorePerLanguageSimClustersEmbeddingsDataset =
    SemanticCorePerLanguageSimclustersEmbeddingsPrestoScalaDataset

  final val ReverseIndexSemanticCorePerLanguageSimClustersEmbeddingsDataset =
    ReverseIndexSemanticCorePerLanguageSimclustersEmbeddingsPrestoScalaDataset
}
Binary file not shown.
Before Width: | Height: | Size: 61 KiB |
Binary file not shown.
Before Width: | Height: | Size: 66 KiB |
Binary file not shown.
Before Width: | Height: | Size: 26 KiB |
Binary file not shown.
Before Width: | Height: | Size: 71 KiB |
Binary file not shown.
Before Width: | Height: | Size: 233 KiB |
Binary file not shown.
Before Width: | Height: | Size: 70 KiB |
@ -1,521 +0,0 @@
scala_library(
    sources = ["*.scala"],
    platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        "3rdparty/jvm/com/fasterxml/jackson:jackson-module-scala",
        "3rdparty/jvm/com/fasterxml/jackson/core:jackson-core",
        "3rdparty/jvm/com/fasterxml/jackson/core:jackson-databind",
        "3rdparty/jvm/com/fasterxml/jackson/module:jackson-module-scala",
        "3rdparty/jvm/com/googlecode/matrix-toolkits-java",
        "3rdparty/jvm/com/twitter/storehaus:algebra",
        "3rdparty/jvm/com/twitter/storehaus:core",
        "escherbird/src/scala/com/twitter/escherbird/scalding/source",
        "flockdb-tools/datasets/flock:flock-follows-edges-scala",
        "src/java/com/twitter/ml/api/constant",
        "src/java/com/twitter/sbf/core",
        "src/java/com/twitter/sbf/graph",
        "src/scala/com/twitter/frigate/user_sampler/common",
        "src/scala/com/twitter/ml/api:api-base",
        "src/scala/com/twitter/ml/api/bq",
        "src/scala/com/twitter/pluck/source/cassowary:sims",
        "src/scala/com/twitter/pluck/source/core_workflows/user_model:condensed_user_state-scala",
        "src/scala/com/twitter/scalding_internal/dalv2",
        "src/scala/com/twitter/scalding_internal/job",
        "src/scala/com/twitter/scalding_internal/job/analytics_batch",
        "src/scala/com/twitter/scalding_internal/source",
        "src/scala/com/twitter/scalding_internal/source/lzo_scrooge",
        "src/scala/com/twitter/simclusters_v2/candidate_source",
        "src/scala/com/twitter/simclusters_v2/hdfs_sources",
        "src/scala/com/twitter/simclusters_v2/scalding/common",
        "src/scala/com/twitter/simclusters_v2/summingbird/common",
        "src/scala/com/twitter/timelines/prediction/features/common",
        "src/scala/com/twitter/timelines/prediction/features/itl",
        "src/scala/com/twitter/timelines/prediction/features/recap",
        "src/scala/com/twitter/wtf/entity_real_graph/scalding/common",
        "src/thrift/com/twitter/hermit/candidate:hermit-candidate-scala",
        "src/thrift/com/twitter/wtf/scalding/sims:sims-thrift-scala",
        "twadoop_config/configuration/log_categories/group/recos-platform:content_recommender_get_content_recommendations-scala",
        "twadoop_config/configuration/log_categories/group/recos-platform:content_recommender_get_topic_tweets_recommendations-scala",
        "twadoop_config/configuration/log_categories/group/timeline:timeline_service_favorites-scala",
        "usersource/snapshot/src/main/scala/com/twitter/usersource/snapshot/flat:usersource_flat-scala",
        "usersource/snapshot/src/main/thrift/com/twitter/usersource/snapshot/flat:flat-scala",
        "util/util-core:util-core-util",
    ],
)

hadoop_binary(
    name = "evd_cluster_similarity",
    main = "com.twitter.simclusters_v2.scalding.EigenVectorsForClusterSimilarityAdhoc",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "cluster_evaluation",
    main = "com.twitter.simclusters_v2.scalding.ClusterEvaluationAdhoc",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "cluster_evaluation_20m_145k",
    main = "com.twitter.simclusters_v2.scalding.ClusterEvaluationFor20M145K",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "cluster_evaluation_20m_145k_2020",
    main = "com.twitter.simclusters_v2.scalding.ClusterEvaluationFor20M145K2020",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "bp_cluster_evaluation",
    main = "com.twitter.simclusters_v2.scalding.BipartiteClusterEvaluation",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "update_knownfor",
    main = "com.twitter.simclusters_v2.scalding.UpdateKnownForAdhoc",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "update_knownfor_prod",
    main = "com.twitter.simclusters_v2.scalding.UpdateKnownFor20M145K",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "cluster_details",
    main = "com.twitter.simclusters_v2.scalding.ClusterDetailsBatch",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "cluster_details_20m_145k_updated",
    main = "com.twitter.simclusters_v2.scalding.ClusterDetails20M145KUpdated",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "cluster_details_20m_145k_2020",
    main = "com.twitter.simclusters_v2.scalding.ClusterDetails20M145K2020",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "cluster_details-adhoc",
    main = "com.twitter.simclusters_v2.scalding.ClusterDetailsAdhoc",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "cluster_details-dump",
    main = "com.twitter.simclusters_v2.scalding.DumpClusterDetailsAdhoc",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "interested_in",
    main = "com.twitter.simclusters_v2.scalding.InterestedInFromKnownForBatch",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "interested_in_from_producer_embeddings",
    main = "com.twitter.simclusters_v2.scalding.InterestedInFromProducerEmbeddingsBatchApp",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "employee_graph_from_user_user",
    main = "com.twitter.simclusters_v2.scalding.EmployeeGraphFromUserUser",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "interested_in_20m_145k_updated",
    main = "com.twitter.simclusters_v2.scalding.InterestedInFromKnownFor20M145KUpdated",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "interested_in_20m_145k_2020",
    main = "com.twitter.simclusters_v2.scalding.InterestedInFromKnownFor20M145K2020",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "interested_in_lite_20m_145k_2020",
    main = "com.twitter.simclusters_v2.scalding.InterestedInFromKnownForLite20M145K2020",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "interested_in_lite_20m_145k_2020-adhoc",
    main = "com.twitter.simclusters_v2.scalding.InterestedInFromKnownForLite20M145K2020Adhoc",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "interested_in_from_ape_2020-adhoc",
    main = "com.twitter.simclusters_v2.scalding.InterestedInFromAPE2020AdhocApp",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "interested_in_from_ape_2020",
    main = "com.twitter.simclusters_v2.scalding.InterestedInFromAPE2020BatchApp",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "known_for_to_mh",
    main = "com.twitter.simclusters_v2.scalding.KnownForToMHBatch",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "user_user_normalized_graph",
    main = "com.twitter.simclusters_v2.scalding.UserUserNormalizedGraphBatch",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "user_user_graph",
    main = "com.twitter.simclusters_v2.scalding.UserUserGraphBatch",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "user_user_graph-adhoc",
    main = "com.twitter.simclusters_v2.scalding.UserUserGraphAdhoc",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "producer_norms_and_counts",
    main = "com.twitter.simclusters_v2.scalding.ProducerNormsAndCountsBatch",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "fav_graph",
    main = "com.twitter.simclusters_v2.scalding.UserUserFavGraphBatch",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "top_users_similarity_graph",
    main = "com.twitter.simclusters_v2.scalding.TopUsersSimilarityGraphApp",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "top_users_only",
    main = "com.twitter.simclusters_v2.scalding.TopUsersOnlyApp",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

hadoop_binary(
    name = "dump_fav_graph_adhoc",
    main = "com.twitter.simclusters_v2.scalding.DumpFavGraphAdhoc",
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible", "bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)

# Generated with `capesospy-v2 create_target interested_in_for_20M_145k_2020 src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml`, config hash 8f19bf.
scalding_job(
    name = "interested_in_for_20M_145k_2020",
    main = "com.twitter.simclusters_v2.scalding.InterestedInFromKnownFor20M145K2020",
    args = ["--socialProofThreshold 2 --maxClustersPerUser 50"],
    config = [
        ("hadoop.combine-input", "true"),
        ("hadoop.map.jvm.total-memory", "3072m"),
        ("hadoop.reduce.jvm.total-memory", "3072m"),
        ("hadoop.submitter.jvm.total-memory", "5120m"),
        ("submitter.tier", "preemptible"),
    ],
    cron = "14 * * * *",
    hadoop_cluster = "atla-proc",
    platform = "java8",
    role = "cassowary",
    runtime_platform = "java8",
    tags = ["bazel-compatible:migrated", "bazel-only"],
    dependencies = [":scalding"],
)
BIN
src/scala/com/twitter/simclusters_v2/scalding/BUILD.docx
Normal file
Binary file not shown.
Binary file not shown.
@ -1,513 +0,0 @@
package com.twitter.simclusters_v2.scalding

import com.twitter.algebird.Aggregator
import com.twitter.algebird.Monoid
import com.twitter.scalding._
import com.twitter.scalding.commons.source.VersionedKeyValSource
import com.twitter.scalding.typed.TypedPipe
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation
import com.twitter.scalding_internal.dalv2.remote_access.ProcAtla
import com.twitter.scalding_internal.job.TwitterExecutionApp
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.simclusters_v2.hdfs_sources.AdhocKeyValSources
import com.twitter.simclusters_v2.hdfs_sources.NormsAndCountsFixedPathSource
import com.twitter.simclusters_v2.hdfs_sources.ProducerNormsAndCountsScalaDataset
import com.twitter.simclusters_v2.hdfs_sources.SimclustersV2InterestedInScalaDataset
import com.twitter.simclusters_v2.hdfs_sources.UserAndNeighborsFixedPathSource
import com.twitter.simclusters_v2.hdfs_sources.UserUserNormalizedGraphScalaDataset
import com.twitter.simclusters_v2.scalding.BipartiteClusterEvaluationClasses._
import com.twitter.simclusters_v2.scalding.common.TypedRichPipe._
import com.twitter.simclusters_v2.scalding.common.Util
import com.twitter.simclusters_v2.thriftscala.BipartiteClusterQuality
import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn
import com.twitter.simclusters_v2.thriftscala.NeighborWithWeights
import com.twitter.simclusters_v2.thriftscala.NormsAndCounts
import com.twitter.simclusters_v2.thriftscala.UserAndNeighbors
import scala.collection.JavaConverters._

object BipartiteClusterEvaluation extends TwitterExecutionApp {

  implicit val tz: java.util.TimeZone = DateOps.UTC
  implicit val dp = DateParser.default

  private def getClusterL2Norms(
    knownFor: TypedPipe[(Long, Array[(Int, Float)])]
  ): Execution[Map[Int, Float]] = {
    knownFor
      .flatMap {
        case (_, clusterArray) =>
          clusterArray.map {
            case (clusterId, score) =>
              Map(clusterId -> score * score)
          }
      }
      .sum
      .getExecution
      .map(_.mapValues { x => math.sqrt(x).toFloat })
  }

  def l2NormalizeKnownFor(
    knownFor: TypedPipe[(Long, Array[(Int, Float)])]
  ): Execution[TypedPipe[(Long, Array[(Int, Float)])]] = {
    getClusterL2Norms(knownFor).map { clusterToNorms =>
      knownFor.mapValues { clusterScoresArray =>
        clusterScoresArray.map {
          case (clusterId, score) =>
            (clusterId, score / clusterToNorms(clusterId))
        }
      }
    }
  }

  /**
   * ./bazel bundle src/scala/com/twitter/simclusters_v2/scalding:bp_cluster_evaluation && \
   * oscar hdfs --user frigate --host hadoopnest2.atla.twitter.com --bundle bp_cluster_evaluation \
   * --tool com.twitter.simclusters_v2.scalding.BipartiteClusterEvaluation --screen --screen-detached \
   * --tee logs/newBpQuality_updateUnnormalizedScores_interestedInUsing20190329Graph_evaluatedOn20190329Graph_run2 \
   * -- --normsAndCountsDir /user/frigate/your_ldap/producerNormsAndCounts_20190330 \
   * --graphInputDir /user/frigate/your_ldap/user_user_normalized_graph_copiedFromAtlaProc_20190329 \
   * --knownForDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/knownFor \
   * --interestedInDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/interestedInUsing20190329Graph \
   * --outgoingVolumesResultsDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/bpQualityForInterestedInUsing20190329On20190329Graph_outgoingVolumes \
   * --incomingVolumesResultsDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/bpQualityForInterestedInUsing20190329On20190329Graph_incomingVolumes \
   * --outputDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/bpQualityForInterestedInUsing20190329On20190329Graph_perCluster \
   * --toEmailAddress your_ldap@twitter.com --modelVersion 20M_145K_updated
   */
  override def job: Execution[Unit] = Execution.getConfigMode.flatMap {
    case (config, mode) =>
      Execution.withId { implicit uniqueId =>
        val args = config.getArgs

        val interestedIn = args.optional("interestedInDir") match {
          case Some(dir) =>
            TypedPipe
              .from(AdhocKeyValSources.interestedInSource(args("interestedInDir")))
          case None =>
            DAL
              .readMostRecentSnapshotNoOlderThan(
                SimclustersV2InterestedInScalaDataset,
                Days(20)
              )
              .withRemoteReadPolicy(ExplicitLocation(ProcAtla))
              .toTypedPipe
              .map {
                case KeyVal(key, value) => (key, value)
              }
        }

        val inputKnownFor = args
          .optional("knownForDir")
          .map { location => KnownForSources.readKnownFor(location) }
          .getOrElse(KnownForSources.knownFor_20M_Dec11_145K)

        val modelVersion =
          args.optional("modelVersion").getOrElse("20M_145K_dec11")

        val useLogFavWeights = args.boolean("useLogFavWeights")

        val shouldL2NormalizeKnownFor = args.boolean("l2NormalizeKnownFor")

        val toEmailAddressOpt = args.optional("toEmailAddress")

        val knownForExec = if (shouldL2NormalizeKnownFor) {
          l2NormalizeKnownFor(inputKnownFor)
        } else {
          Execution.from(inputKnownFor)
        }

        val finalExec = knownForExec.flatMap { knownFor =>
          val graph = args.optional("graphInputDir") match {
            case Some(dir) =>
              TypedPipe.from(UserAndNeighborsFixedPathSource(dir))
            case None =>
              DAL
                .readMostRecentSnapshotNoOlderThan(UserUserNormalizedGraphScalaDataset, Days(20))
                .withRemoteReadPolicy(ExplicitLocation(ProcAtla))
                .toTypedPipe
          }

          val producerNormsAndCounts = args.optional("normsAndCountsDir") match {
            case Some(dir) =>
              TypedPipe.from(NormsAndCountsFixedPathSource(args(dir)))
            case None =>
              DAL
                .readMostRecentSnapshotNoOlderThan(ProducerNormsAndCountsScalaDataset, Days(20))
                .withRemoteReadPolicy(ExplicitLocation(ProcAtla))
                .toTypedPipe
          }

          val clusterIncomingVolumesExec = loadOrMake(
            computeClusterIncomingVolumes(knownFor, producerNormsAndCounts, useLogFavWeights),
            modelVersion,
            args("incomingVolumesResultsDir")
          )

          val resultsWithOutgoingVolumesExec = loadOrMake(
            getResultsWithOutgoingVolumes(graph, interestedIn, useLogFavWeights),
            modelVersion,
            args("outgoingVolumesResultsDir")
          )

          val finalPerClusterResultsExec =
            finalPerClusterResults(
              knownFor,
              interestedIn,
              resultsWithOutgoingVolumesExec,
              clusterIncomingVolumesExec)
              .flatMap { pipe => loadOrMake(pipe, modelVersion, args("outputDir")) }

          finalPerClusterResultsExec.flatMap { finalPerClusterResults =>
            val perClusterResults = finalPerClusterResults.values
            val distributionResultsExec = getClusterResultsSummary(perClusterResults).map {
              case Some(summary) =>
                "Summary of results across clusters: \n" +
                  Util.prettyJsonMapper.writeValueAsString(summary)
              case _ =>
                "No summary of results! The cluster level results pipe must be empty!"
            }

            val overallResultsExec = perClusterResults.sum.toOptionExecution.map {
              case Some(overallQuality) =>
                "Overall Quality: \n" +
                  Util.prettyJsonMapper.writeValueAsString(
                    printableBipartiteQuality(overallQuality)
                  )
              case _ =>
                "No overall quality! The cluster level results pipe must be empty!"
            }

            Execution.zip(distributionResultsExec, overallResultsExec).map {
              case (distResults, overallResults) =>
                toEmailAddressOpt.foreach { address =>
                  Util.sendEmail(
                    distResults + "\n" + overallResults,
                    "Bipartite cluster quality for " + modelVersion,
                    address
                  )
                }
                println(distResults + "\n" + overallResults)
            }
          }
        }
        Util.printCounters(finalExec)
      }
  }

  def getResultsWithOutgoingVolumes(
    graph: TypedPipe[UserAndNeighbors],
    interestedIn: TypedPipe[(Long, ClustersUserIsInterestedIn)],
    useLogFavWeights: Boolean
  ): TypedPipe[(Int, BipartiteClusterQuality)] = {
    graph
      .map { un => (un.userId, un.neighbors) }
      // should this be a leftJoin? For now, leaving it as an inner join. If in the future,
      // we want to compare two approaches with very different coverages on interestedIn, this
      // could become a problem.
      .join(interestedIn)
      .withReducers(4000)
      .flatMap {
        case (userId, (neighbors, clusters)) =>
          getBIResultsFromSingleUser(userId, neighbors, clusters, useLogFavWeights)
      }
      .sumByKey
      .withReducers(600)
      .map {
        case (clusterId, bir) =>
          (
            clusterId,
            BipartiteClusterQuality(
              inClusterFollowEdges = Some(bir.inClusterWeights.isFollowEdge),
              inClusterFavEdges = Some(bir.inClusterWeights.isFavEdge),
              favWtSumOfInClusterFollowEdges = Some(bir.inClusterWeights.favWtIfFollowEdge),
              favWtSumOfInClusterFavEdges = Some(bir.inClusterWeights.favWtIfFavEdge),
              outgoingFollowEdges = Some(bir.totalOutgoingVolumes.isFollowEdge),
              outgoingFavEdges = Some(bir.totalOutgoingVolumes.isFavEdge),
              favWtSumOfOutgoingFollowEdges = Some(bir.totalOutgoingVolumes.favWtIfFollowEdge),
              favWtSumOfOutgoingFavEdges = Some(bir.totalOutgoingVolumes.favWtIfFavEdge),
              interestedInSize = Some(bir.interestedInSize),
              sampledEdges = Some(
                bir.edgeSample
                  .iterator()
                  .asScala
                  .toSeq
                  .map {
                    case (edge, data) => makeThriftSampledEdge(edge, data)
                  }
              )
            )
          )
      }
  }

  def getBIResultsFromSingleUser(
    userId: Long,
    neighbors: Seq[NeighborWithWeights],
    clusters: ClustersUserIsInterestedIn,
    useLogFavScores: Boolean
  ): List[(Int, BipartiteIntermediateResults)] = {
    val neighborsToWeights = neighbors.map { neighborAndWeights =>
      val isFollowEdge = neighborAndWeights.isFollowed match {
        case Some(true) => 1.0
        case _ => 0.0
      }
      val favScore = if (useLogFavScores) {
        neighborAndWeights.logFavScore.getOrElse(0.0)
      } else neighborAndWeights.favScoreHalfLife100Days.getOrElse(0.0)
      val isFavEdge = math.min(1, math.ceil(favScore))
      neighborAndWeights.neighborId -> Weights(
        isFollowEdge,
        isFavEdge,
        favScore * isFollowEdge,
        favScore
      )
    }.toMap

    val outgoingVolumes = Monoid.sum(neighborsToWeights.values)(WeightsMonoid)

    clusters.clusterIdToScores.toList.map {
      case (clusterId, scoresStruct) =>
        val inClusterNeighbors =
          (scoresStruct.usersBeingFollowed.getOrElse(Nil) ++
            scoresStruct.usersThatWereFaved.getOrElse(Nil)).toSet
        val edgesForSampling = inClusterNeighbors.flatMap { neighborId =>
          if (neighborsToWeights.contains(neighborId)) {
            Some(
              (userId, neighborId),
              SampledEdgeData(
                neighborsToWeights(neighborId).favWtIfFollowEdge,
                neighborsToWeights(neighborId).favWtIfFavEdge,
                scoresStruct.followScore.getOrElse(0.0),
                scoresStruct.favScore.getOrElse(0.0)
              )
            )
          } else {
            None
          }
        }

        val inClusterWeights =
          Monoid.sum(neighborsToWeights.filterKeys(inClusterNeighbors).values)(WeightsMonoid)

        (
          clusterId,
          BipartiteIntermediateResults(
            inClusterWeights,
            outgoingVolumes,
            1,
            samplerMonoid.build(edgesForSampling)
          ))
    }
  }

  def computeClusterIncomingVolumes(
    knownFor: TypedPipe[(Long, Array[(Int, Float)])],
    producerNormsAndCounts: TypedPipe[NormsAndCounts],
    useLogFavWeights: Boolean
  ): TypedPipe[(Int, BipartiteClusterQuality)] = {
    producerNormsAndCounts
      .map { x => (x.userId, x) }
      .join(knownFor)
      .withReducers(100)
      .flatMap {
        case (userId, (normsAndCounts, clusters)) =>
          clusters.map {
            case (clusterId, _) =>
              val followerCount =
                normsAndCounts.followerCount.getOrElse(0L).toDouble
              val faverCount = normsAndCounts.faverCount.getOrElse(0L).toDouble
              val favWtSumOfIncomingFollows = if (useLogFavWeights) {
                normsAndCounts.logFavWeightsOnFollowEdgesSum.getOrElse(0.0)
|
|
||||||
} else {
|
|
||||||
normsAndCounts.favWeightsOnFollowEdgesSum.getOrElse(0.0)
|
|
||||||
}
|
|
||||||
val favWtSumOfIncomingFavs = if (useLogFavWeights) {
|
|
||||||
normsAndCounts.logFavWeightsOnFavEdgesSum.getOrElse(0.0)
|
|
||||||
} else {
|
|
||||||
normsAndCounts.favWeightsOnFavEdgesSum.getOrElse(0.0)
|
|
||||||
}
|
|
||||||
(
|
|
||||||
clusterId,
|
|
||||||
BipartiteClusterQuality(
|
|
||||||
incomingFollowEdges = Some(followerCount),
|
|
||||||
incomingFavEdges = Some(faverCount),
|
|
||||||
favWtSumOfIncomingFollowEdges = Some(favWtSumOfIncomingFollows),
|
|
||||||
favWtSumOfIncomingFavEdges = Some(favWtSumOfIncomingFavs)
|
|
||||||
))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
.sumByKey
|
|
||||||
.toTypedPipe
|
|
||||||
}
|
|
||||||
|
|
||||||
def loadOrMake(
|
|
||||||
pipe: TypedPipe[(Int, BipartiteClusterQuality)],
|
|
||||||
modelVersion: String,
|
|
||||||
path: String
|
|
||||||
): Execution[TypedPipe[(Int, BipartiteClusterQuality)]] = {
|
|
||||||
val mapped = pipe.map {
|
|
||||||
case (clusterId, struct) => ((modelVersion, clusterId), struct)
|
|
||||||
}
|
|
||||||
makeForKeyValSource(mapped, AdhocKeyValSources.bipartiteQualitySource(path), path).map { pipe =>
|
|
||||||
// discard model version
|
|
||||||
pipe.map { case ((_, clusterId), struct) => (clusterId, struct) }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
def makeForKeyValSource[K, V](
|
|
||||||
pipe: TypedPipe[(K, V)],
|
|
||||||
dest: VersionedKeyValSource[K, V],
|
|
||||||
path: String
|
|
||||||
): Execution[TypedPipe[(K, V)]] =
|
|
||||||
Execution.getMode.flatMap { mode =>
|
|
||||||
if (dest.resourceExists(mode)) {
|
|
||||||
println(s"validated path $path")
|
|
||||||
Execution.from(TypedPipe.from(dest))
|
|
||||||
} else {
|
|
||||||
println(s"Could not load from $path")
|
|
||||||
pipe.writeThrough(dest)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
def precisionOfWholeGraph(
|
|
||||||
knownFor: TypedPipe[(Long, Array[(Int, Float)])],
|
|
||||||
interestedIn: TypedPipe[(Long, ClustersUserIsInterestedIn)],
|
|
||||||
clusterIncomingVolumesExec: Execution[TypedPipe[(Int, BipartiteClusterQuality)]]
|
|
||||||
): Execution[Option[Double]] = {
|
|
||||||
val knownForSizeExec = knownFor.aggregate(Aggregator.size).toOptionExecution
|
|
||||||
val interestedInSizeExec =
|
|
||||||
interestedIn.aggregate(Aggregator.size).toOptionExecution
|
|
||||||
val numExec = clusterIncomingVolumesExec.flatMap { volumes =>
|
|
||||||
volumes.values.flatMap(_.favWtSumOfIncomingFavEdges).sum.toOptionExecution
|
|
||||||
}
|
|
||||||
Execution.zip(numExec, interestedInSizeExec, knownForSizeExec).map {
|
|
||||||
case (Some(num), Some(interestedInSize), Some(knownForSize)) =>
|
|
||||||
Some(num / interestedInSize / knownForSize)
|
|
||||||
case x @ _ =>
|
|
||||||
println("Precision of whole graph zip: " + x)
|
|
||||||
None
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
def finalPerClusterResults(
|
|
||||||
knownFor: TypedPipe[(Long, Array[(Int, Float)])],
|
|
||||||
interestedIn: TypedPipe[(Long, ClustersUserIsInterestedIn)],
|
|
||||||
resultsWithOutgoingVolumesExec: Execution[TypedPipe[(Int, BipartiteClusterQuality)]],
|
|
||||||
incomingVolumesExec: Execution[TypedPipe[(Int, BipartiteClusterQuality)]]
|
|
||||||
): Execution[TypedPipe[(Int, BipartiteClusterQuality)]] = {
|
|
||||||
val knownForTranspose = KnownForSources.transpose(knownFor)
|
|
||||||
|
|
||||||
val precisionOfWholeGraphExec =
|
|
||||||
precisionOfWholeGraph(knownFor, interestedIn, incomingVolumesExec)
|
|
||||||
|
|
||||||
Execution
|
|
||||||
.zip(resultsWithOutgoingVolumesExec, incomingVolumesExec, precisionOfWholeGraphExec)
|
|
||||||
.map {
|
|
||||||
case (resultsWithOutgoingVolumes, clusterIncomingVolumes, precisionOfWholeGraph) =>
|
|
||||||
println("Precision of whole graph " + precisionOfWholeGraph)
|
|
||||||
resultsWithOutgoingVolumes
|
|
||||||
.join(knownForTranspose)
|
|
||||||
.leftJoin(clusterIncomingVolumes)
|
|
||||||
.withReducers(500)
|
|
||||||
.map {
|
|
||||||
case (clusterId, ((outgoingVolumeQuality, knownForList), incomingVolumesOpt)) =>
|
|
||||||
val incomingVolumes =
|
|
||||||
incomingVolumesOpt.getOrElse(BipartiteClusterQuality())
|
|
||||||
val knownForMap = knownForList.toMap
|
|
||||||
(
|
|
||||||
clusterId,
|
|
||||||
getFullQuality(
|
|
||||||
outgoingVolumeQuality,
|
|
||||||
incomingVolumes,
|
|
||||||
knownForMap,
|
|
||||||
precisionOfWholeGraph))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
def getFullQuality(
|
|
||||||
qualityWithOutgoingVolumes: BipartiteClusterQuality,
|
|
||||||
incomingVolumes: BipartiteClusterQuality,
|
|
||||||
knownFor: Map[Long, Float],
|
|
||||||
precisionOfWholeGraph: Option[Double]
|
|
||||||
): BipartiteClusterQuality = {
|
|
||||||
val newSampledEdges = qualityWithOutgoingVolumes.sampledEdges.map { sampledEdges =>
|
|
||||||
sampledEdges.map { sampledEdge =>
|
|
||||||
val knownForScore = knownFor.getOrElse(sampledEdge.followeeId, 0.0f)
|
|
||||||
sampledEdge.copy(
|
|
||||||
predictedFollowScore = sampledEdge.followScoreToCluster.map { x => x * knownForScore },
|
|
||||||
predictedFavScore = sampledEdge.favScoreToCluster.map { x => x * knownForScore }
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
val correlationOfFavWtIfFollow = newSampledEdges.map { samples =>
|
|
||||||
val pairs = samples.map { s =>
|
|
||||||
(s.predictedFollowScore.getOrElse(0.0), s.favWtIfFollowEdge.getOrElse(0.0))
|
|
||||||
}
|
|
||||||
Util.computeCorrelation(pairs.iterator)
|
|
||||||
}
|
|
||||||
val correlationOfFavWtIfFav = newSampledEdges.map { samples =>
|
|
||||||
val pairs = samples.map { s =>
|
|
||||||
(s.predictedFavScore.getOrElse(0.0), s.favWtIfFavEdge.getOrElse(0.0))
|
|
||||||
}
|
|
||||||
Util.computeCorrelation(pairs.iterator)
|
|
||||||
}
|
|
||||||
val relativePrecisionNum = {
|
|
||||||
if (qualityWithOutgoingVolumes.interestedInSize.exists(_ > 0) && knownFor.nonEmpty) {
|
|
||||||
qualityWithOutgoingVolumes.favWtSumOfInClusterFavEdges
|
|
||||||
.getOrElse(0.0) / qualityWithOutgoingVolumes.interestedInSize.get / knownFor.size
|
|
||||||
} else 0.0
|
|
||||||
}
|
|
||||||
val relativePrecision = if (precisionOfWholeGraph.exists(_ > 0.0)) {
|
|
||||||
Some(relativePrecisionNum / precisionOfWholeGraph.get)
|
|
||||||
} else None
|
|
||||||
qualityWithOutgoingVolumes.copy(
|
|
||||||
incomingFollowEdges = incomingVolumes.incomingFollowEdges,
|
|
||||||
incomingFavEdges = incomingVolumes.incomingFavEdges,
|
|
||||||
favWtSumOfIncomingFollowEdges = incomingVolumes.favWtSumOfIncomingFollowEdges,
|
|
||||||
favWtSumOfIncomingFavEdges = incomingVolumes.favWtSumOfIncomingFavEdges,
|
|
||||||
knownForSize = Some(knownFor.size),
|
|
||||||
correlationOfFavWtIfFollowWithPredictedFollow = correlationOfFavWtIfFollow,
|
|
||||||
correlationOfFavWtIfFavWithPredictedFav = correlationOfFavWtIfFav,
|
|
||||||
sampledEdges = newSampledEdges,
|
|
||||||
relativePrecisionUsingFavWtIfFav = relativePrecision,
|
|
||||||
averagePrecisionOfWholeGraphUsingFavWtIfFav = precisionOfWholeGraph
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
object DumpBpQuality extends TwitterExecutionApp {
|
|
||||||
def job: Execution[Unit] = Execution.getConfigMode.flatMap {
|
|
||||||
case (config, mode) =>
|
|
||||||
Execution.withId { implicit uniqueId =>
|
|
||||||
val args = config.getArgs
|
|
||||||
val inputDir = args("inputDir")
|
|
||||||
|
|
||||||
val clusters = args.list("clusters").map(_.toInt).toSet
|
|
||||||
val input =
|
|
||||||
TypedPipe
|
|
||||||
.from(AdhocKeyValSources.bipartiteQualitySource(inputDir))
|
|
||||||
.map {
|
|
||||||
case ((modelVersion, clusterId), quality) =>
|
|
||||||
(
|
|
||||||
(modelVersion, clusterId),
|
|
||||||
BipartiteClusterEvaluationClasses
|
|
||||||
.printableBipartiteQuality(quality))
|
|
||||||
}
|
|
||||||
|
|
||||||
if (clusters.isEmpty) {
|
|
||||||
input.printSummary("Bipartite quality")
|
|
||||||
} else {
|
|
||||||
input
|
|
||||||
.collect {
|
|
||||||
case rec @ ((_, clusterId), quality) if clusters(clusterId) =>
|
|
||||||
Util.prettyJsonMapper
|
|
||||||
.writeValueAsString(rec)
|
|
||||||
.replaceAll("\n", " ")
|
|
||||||
}
|
|
||||||
.toIterableExecution
|
|
||||||
.map { strings => println(strings.mkString("\n")) }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
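For reference, the relative-precision figure computed in getFullQuality above boils down to a small calculation. The standalone sketch below restates it outside the Scalding pipeline; the names RelativePrecisionSketch, inClusterFavWt, interestedInSize, knownForSize and wholeGraphPrecision are illustrative stand-ins, not identifiers from the job.

// Minimal sketch (not part of the job) of the arithmetic behind relativePrecisionUsingFavWtIfFav:
// numerator = favWtSumOfInClusterFavEdges / interestedInSize / knownForSize,
// then normalized by the whole-graph precision when that value is positive.
object RelativePrecisionSketch {
  def relativePrecision(
    inClusterFavWt: Double,             // favWtSumOfInClusterFavEdges for one cluster
    interestedInSize: Int,              // users interested in the cluster
    knownForSize: Int,                  // users known for the cluster
    wholeGraphPrecision: Option[Double] // precisionOfWholeGraph, if computed
  ): Option[Double] = {
    val num =
      if (interestedInSize > 0 && knownForSize > 0) inClusterFavWt / interestedInSize / knownForSize
      else 0.0
    wholeGraphPrecision.filter(_ > 0.0).map(num / _)
  }
}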
Binary file not shown.
@ -1,316 +0,0 @@
package com.twitter.simclusters_v2.scalding

import com.twitter.algebird.{Monoid, OptionMonoid, Semigroup}
import com.twitter.algebird.mutable.PriorityQueueMonoid
import com.twitter.scalding.Execution
import com.twitter.scalding.typed.TypedPipe
import com.twitter.simclusters_v2.scalding.common.Util
import com.twitter.simclusters_v2.scalding.common.Util.Distribution
import com.twitter.simclusters_v2.thriftscala.{BipartiteClusterQuality, SampledEdge}
import java.util.PriorityQueue
import scala.collection.JavaConverters._

object BipartiteClusterEvaluationClasses {
  case class Weights(
    isFollowEdge: Double,
    isFavEdge: Double,
    favWtIfFollowEdge: Double,
    favWtIfFavEdge: Double)

  object WeightsMonoid extends Monoid[Weights] {
    override def zero = Weights(0.0, 0.0, 0.0, 0.0)

    override def plus(l: Weights, r: Weights): Weights = {
      Weights(
        l.isFollowEdge + r.isFollowEdge,
        l.isFavEdge + r.isFavEdge,
        l.favWtIfFollowEdge + r.favWtIfFollowEdge,
        l.favWtIfFavEdge + r.favWtIfFavEdge
      )
    }
  }

  implicit val wm: Monoid[Weights] = WeightsMonoid

  case class SampledEdgeData(
    favWtIfFollowEdge: Double,
    favWtIfFavEdge: Double,
    followScoreToCluster: Double,
    favScoreToCluster: Double)

  implicit val samplerMonoid: PriorityQueueMonoid[((Long, Long), SampledEdgeData)] =
    Util.reservoirSamplerMonoidForPairs[(Long, Long), SampledEdgeData](2000)(Util.edgeOrdering)

  implicit val sampledEdgesMonoid: PriorityQueueMonoid[SampledEdge] =
    Util.reservoirSamplerMonoid(
      10000,
      { sampledEdge: SampledEdge => (sampledEdge.followerId, sampledEdge.followeeId) }
    )(Util.edgeOrdering)

  case class BipartiteIntermediateResults(
    inClusterWeights: Weights,
    totalOutgoingVolumes: Weights,
    interestedInSize: Int,
    edgeSample: PriorityQueue[((Long, Long), SampledEdgeData)]) {
    override def toString: String = {
      "BCR(%s, %s, %d, %s)".format(
        inClusterWeights,
        totalOutgoingVolumes,
        interestedInSize,
        edgeSample.iterator().asScala.toSeq.toString()
      )
    }
  }

  object BIRMonoid extends Monoid[BipartiteIntermediateResults] {
    override def zero =
      BipartiteIntermediateResults(WeightsMonoid.zero, WeightsMonoid.zero, 0, samplerMonoid.zero)

    override def plus(
      l: BipartiteIntermediateResults,
      r: BipartiteIntermediateResults
    ): BipartiteIntermediateResults = {
      BipartiteIntermediateResults(
        WeightsMonoid.plus(l.inClusterWeights, r.inClusterWeights),
        WeightsMonoid.plus(l.totalOutgoingVolumes, r.totalOutgoingVolumes),
        l.interestedInSize + r.interestedInSize,
        samplerMonoid.plus(l.edgeSample, r.edgeSample)
      )
    }
  }

  implicit val bIRMonoid: Monoid[BipartiteIntermediateResults] = BIRMonoid

  def makeThriftSampledEdge(edge: (Long, Long), data: SampledEdgeData): SampledEdge = {
    val (followerId, followeeId) = edge
    SampledEdge(
      followerId = followerId,
      followeeId = followeeId,
      favWtIfFollowEdge = Some(data.favWtIfFollowEdge),
      favWtIfFavEdge = Some(data.favWtIfFavEdge),
      followScoreToCluster = Some(data.followScoreToCluster),
      favScoreToCluster = Some(data.favScoreToCluster)
    )
  }

  object ClusterQualitySemigroup extends Semigroup[BipartiteClusterQuality] {
    val doubleOM: Monoid[Option[Double]] = new OptionMonoid[Double]
    val intOM: Monoid[Option[Int]] = new OptionMonoid[Int]
    val longOM: Monoid[Option[Long]] = new OptionMonoid[Long]

    override def plus(l: BipartiteClusterQuality, r: BipartiteClusterQuality) =
      BipartiteClusterQuality(
        inClusterFollowEdges = doubleOM.plus(l.inClusterFollowEdges, r.inClusterFollowEdges),
        inClusterFavEdges = doubleOM.plus(l.inClusterFavEdges, r.inClusterFavEdges),
        favWtSumOfInClusterFollowEdges = doubleOM
          .plus(l.favWtSumOfInClusterFollowEdges, r.favWtSumOfInClusterFollowEdges),
        favWtSumOfInClusterFavEdges = doubleOM
          .plus(l.favWtSumOfInClusterFavEdges, r.favWtSumOfInClusterFavEdges),
        outgoingFollowEdges = doubleOM.plus(l.outgoingFollowEdges, r.outgoingFollowEdges),
        outgoingFavEdges = doubleOM.plus(l.outgoingFavEdges, r.outgoingFavEdges),
        favWtSumOfOutgoingFollowEdges = doubleOM
          .plus(l.favWtSumOfOutgoingFollowEdges, r.favWtSumOfOutgoingFollowEdges),
        favWtSumOfOutgoingFavEdges = doubleOM
          .plus(l.favWtSumOfOutgoingFavEdges, r.favWtSumOfOutgoingFavEdges),
        incomingFollowEdges = doubleOM.plus(l.incomingFollowEdges, r.incomingFollowEdges),
        incomingFavEdges = doubleOM.plus(l.incomingFavEdges, r.incomingFavEdges),
        favWtSumOfIncomingFollowEdges = doubleOM
          .plus(l.favWtSumOfIncomingFollowEdges, r.favWtSumOfIncomingFollowEdges),
        favWtSumOfIncomingFavEdges = doubleOM
          .plus(l.favWtSumOfIncomingFavEdges, r.favWtSumOfIncomingFavEdges),
        interestedInSize = None,
        sampledEdges = Some(
          sampledEdgesMonoid
            .plus(
              sampledEdgesMonoid.build(l.sampledEdges.getOrElse(Nil)),
              sampledEdgesMonoid.build(r.sampledEdges.getOrElse(Nil))
            )
            .iterator()
            .asScala
            .toSeq),
        knownForSize = intOM.plus(l.knownForSize, r.knownForSize),
        correlationOfFavWtIfFollowWithPredictedFollow = None,
        correlationOfFavWtIfFavWithPredictedFav = None,
        relativePrecisionUsingFavWtIfFav = None,
        averagePrecisionOfWholeGraphUsingFavWtIfFav = l.averagePrecisionOfWholeGraphUsingFavWtIfFav
      )
  }

  implicit val bcqSemigroup: Semigroup[BipartiteClusterQuality] =
    ClusterQualitySemigroup

  case class PrintableBipartiteQuality(
    incomingFollowUnweightedRecall: String,
    incomingFavUnweightedRecall: String,
    incomingFollowWeightedRecall: String,
    incomingFavWeightedRecall: String,
    outgoingFollowUnweightedRecall: String,
    outgoingFavUnweightedRecall: String,
    outgoingFollowWeightedRecall: String,
    outgoingFavWeightedRecall: String,
    incomingFollowEdges: String,
    incomingFavEdges: String,
    favWtSumOfIncomingFollowEdges: String,
    favWtSumOfIncomingFavEdges: String,
    outgoingFollowEdges: String,
    outgoingFavEdges: String,
    favWtSumOfOutgoingFollowEdges: String,
    favWtSumOfOutgoingFavEdges: String,
    correlationOfFavWtIfFollow: String,
    correlationOfFavWtIfFav: String,
    relativePrecisionUsingFavWt: String,
    averagePrecisionOfWholeGraphUsingFavWt: String,
    interestedInSize: String,
    knownForSize: String)

  def printableBipartiteQuality(in: BipartiteClusterQuality): PrintableBipartiteQuality = {
    def getRatio(numOpt: Option[Double], denOpt: Option[Double]): String = {
      val r = if (denOpt.exists(_ > 0)) {
        numOpt.getOrElse(0.0) / denOpt.get
      } else 0.0
      "%.3f".format(r)
    }

    val formatter = new java.text.DecimalFormat("###,###.#")

    def denString(denOpt: Option[Double]): String =
      formatter.format(denOpt.getOrElse(0.0))

    val correlationOfFavWtIfFollow =
      in.correlationOfFavWtIfFollowWithPredictedFollow match {
        case None =>
          in.sampledEdges.map { samples =>
            val pairs = samples.map { s =>
              (s.predictedFollowScore.getOrElse(0.0), s.favWtIfFollowEdge.getOrElse(0.0))
            }
            Util.computeCorrelation(pairs.iterator)
          }
        case x @ _ => x
      }

    val correlationOfFavWtIfFav =
      in.correlationOfFavWtIfFavWithPredictedFav match {
        case None =>
          in.sampledEdges.map { samples =>
            val pairs = samples.map { s =>
              (s.predictedFavScore.getOrElse(0.0), s.favWtIfFavEdge.getOrElse(0.0))
            }
            Util.computeCorrelation(pairs.iterator)
          }
        case x @ _ => x
      }

    PrintableBipartiteQuality(
      incomingFollowUnweightedRecall = getRatio(in.inClusterFollowEdges, in.incomingFollowEdges),
      incomingFavUnweightedRecall = getRatio(in.inClusterFavEdges, in.incomingFavEdges),
      incomingFollowWeightedRecall =
        getRatio(in.favWtSumOfInClusterFollowEdges, in.favWtSumOfIncomingFollowEdges),
      incomingFavWeightedRecall =
        getRatio(in.favWtSumOfInClusterFavEdges, in.favWtSumOfIncomingFavEdges),
      outgoingFollowUnweightedRecall = getRatio(in.inClusterFollowEdges, in.outgoingFollowEdges),
      outgoingFavUnweightedRecall = getRatio(in.inClusterFavEdges, in.outgoingFavEdges),
      outgoingFollowWeightedRecall =
        getRatio(in.favWtSumOfInClusterFollowEdges, in.favWtSumOfOutgoingFollowEdges),
      outgoingFavWeightedRecall =
        getRatio(in.favWtSumOfInClusterFavEdges, in.favWtSumOfOutgoingFavEdges),
      incomingFollowEdges = denString(in.incomingFollowEdges),
      incomingFavEdges = denString(in.incomingFavEdges),
      favWtSumOfIncomingFollowEdges = denString(in.favWtSumOfIncomingFollowEdges),
      favWtSumOfIncomingFavEdges = denString(in.favWtSumOfIncomingFavEdges),
      outgoingFollowEdges = denString(in.outgoingFollowEdges),
      outgoingFavEdges = denString(in.outgoingFavEdges),
      favWtSumOfOutgoingFollowEdges = denString(in.favWtSumOfOutgoingFollowEdges),
      favWtSumOfOutgoingFavEdges = denString(in.favWtSumOfOutgoingFavEdges),
      correlationOfFavWtIfFollow = "%.3f"
        .format(correlationOfFavWtIfFollow.getOrElse(0.0)),
      correlationOfFavWtIfFav = "%.3f"
        .format(correlationOfFavWtIfFav.getOrElse(0.0)),
      relativePrecisionUsingFavWt =
        "%.2g".format(in.relativePrecisionUsingFavWtIfFav.getOrElse(0.0)),
      averagePrecisionOfWholeGraphUsingFavWt =
        "%.2g".format(in.averagePrecisionOfWholeGraphUsingFavWtIfFav.getOrElse(0.0)),
      interestedInSize = in.interestedInSize.getOrElse(0).toString,
      knownForSize = in.knownForSize.getOrElse(0).toString
    )
  }

  case class ClusterResultsSummary(
    numClustersWithZeroInterestedIn: Int,
    numClustersWithZeroFollowWtRecall: Int,
    numClustersWithZeroFavWtRecall: Int,
    numClustersWithZeroFollowAndFavWtRecall: Int,
    interestedInSizeDist: Distribution,
    outgoingFollowWtRecallDist: Distribution,
    outgoingFavWtRecallDist: Distribution,
    incomingFollowWtRecallDist: Distribution,
    incomingFavWtRecallDist: Distribution,
    followCorrelationDist: Distribution,
    favCorrelationDist: Distribution,
    relativePrecisionDist: Distribution)

  def getClusterResultsSummary(
    perClusterResults: TypedPipe[BipartiteClusterQuality]
  ): Execution[Option[ClusterResultsSummary]] = {
    perClusterResults
      .map { clusterQuality =>
        val printableQuality = printableBipartiteQuality(clusterQuality)
        val isFollowRecallZero =
          if (!clusterQuality.favWtSumOfInClusterFollowEdges
              .exists(_ > 0)) 1
          else 0
        val isFavRecallZero =
          if (!clusterQuality.favWtSumOfInClusterFavEdges.exists(_ > 0)) 1
          else 0
        (
          if (!clusterQuality.interestedInSize.exists(_ > 0)) 1 else 0,
          isFollowRecallZero,
          isFavRecallZero,
          isFavRecallZero * isFollowRecallZero,
          clusterQuality.interestedInSize.toList.map(_.toDouble),
          List(printableQuality.outgoingFollowWeightedRecall.toDouble),
          List(printableQuality.outgoingFavWeightedRecall.toDouble),
          List(printableQuality.incomingFollowWeightedRecall.toDouble),
          List(printableQuality.incomingFavWeightedRecall.toDouble),
          List(printableQuality.correlationOfFavWtIfFollow.toDouble),
          List(printableQuality.correlationOfFavWtIfFav.toDouble),
          List(printableQuality.relativePrecisionUsingFavWt.toDouble)
        )
      }
      .sum
      .toOptionExecution
      .map { opt =>
        opt.map {
          case (
                zeroInterestedIn,
                zeroFollowRecall,
                zeroFavRecall,
                zeroFollowAndFavRecall,
                interestedInSizeList,
                outgoingFollowWtRecallList,
                outgoingFavWtRecallList,
                incomingFollowWtRecallList,
                incomingFavWtRecallList,
                followCorrelationList,
                favCorrelationList,
                relativePrecisionList
              ) =>
            ClusterResultsSummary(
              numClustersWithZeroInterestedIn = zeroInterestedIn,
              numClustersWithZeroFollowWtRecall = zeroFollowRecall,
              numClustersWithZeroFavWtRecall = zeroFavRecall,
              numClustersWithZeroFollowAndFavWtRecall = zeroFollowAndFavRecall,
              interestedInSizeDist = Util.distributionFromArray(interestedInSizeList.toArray),
              outgoingFollowWtRecallDist = Util
                .distributionFromArray(outgoingFollowWtRecallList.toArray),
              outgoingFavWtRecallDist = Util.distributionFromArray(outgoingFavWtRecallList.toArray),
              incomingFollowWtRecallDist = Util
                .distributionFromArray(incomingFollowWtRecallList.toArray),
              incomingFavWtRecallDist = Util.distributionFromArray(incomingFavWtRecallList.toArray),
              followCorrelationDist = Util.distributionFromArray(followCorrelationList.toArray),
              favCorrelationDist = Util.distributionFromArray(favCorrelationList.toArray),
              relativePrecisionDist = Util.distributionFromArray(relativePrecisionList.toArray)
            )
        }
      }
  }
}
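As a quick illustration of how the Weights monoid above combines per-edge indicators into per-cluster sums, a minimal sketch follows; the two example edges and their values are invented purely for illustration.

// Minimal sketch: WeightsMonoid.plus adds each field, so summing per-edge Weights yields
// edge counts and fav-weight sums for a cluster in one aggregation pass.
// The example edge values below are made up for illustration.
object WeightsMonoidSketch {
  import BipartiteClusterEvaluationClasses.{Weights, WeightsMonoid}

  def main(args: Array[String]): Unit = {
    val followOnlyEdge = Weights(isFollowEdge = 1.0, isFavEdge = 0.0, favWtIfFollowEdge = 0.0, favWtIfFavEdge = 0.0)
    val favEdge = Weights(isFollowEdge = 0.0, isFavEdge = 1.0, favWtIfFollowEdge = 0.0, favWtIfFavEdge = 2.5)
    val combined = WeightsMonoid.plus(followOnlyEdge, favEdge)
    println(combined) // Weights(1.0, 1.0, 0.0, 2.5)
  }
}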
Binary file not shown.
@ -1,794 +0,0 @@
package com.twitter.simclusters_v2.scalding

import com.twitter.algebird.OptionMonoid
import com.twitter.algebird.QTree
import com.twitter.algebird.QTreeSemigroup
import com.twitter.algebird.Semigroup
import com.twitter.dal.client.dataset.KeyValDALDataset
import com.twitter.dal.client.dataset.SnapshotDALDataset
import com.twitter.hermit.candidate.thriftscala.Candidates
import com.twitter.pluck.source.cassowary.FollowingsCosineSimilaritiesManhattanSource
import com.twitter.pluck.source.cassowary.SimsCandidatesSource
import com.twitter.scalding._
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.DALWrite._
import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation
import com.twitter.scalding_internal.dalv2.remote_access.ProcAtla
import com.twitter.scalding_internal.job.TwitterExecutionApp
import com.twitter.scalding_internal.job.analytics_batch._
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.simclusters_v2.common.ModelVersions
import com.twitter.simclusters_v2.hdfs_sources._
import com.twitter.simclusters_v2.scalding.common.Util
import com.twitter.simclusters_v2.scalding.embedding.common.ExternalDataSources
import com.twitter.simclusters_v2.thriftscala._
import com.twitter.usersource.snapshot.flat.UsersourceFlatScalaDataset
import com.twitter.usersource.snapshot.flat.thriftscala.FlatUser

object ClusterDetailsJob {
  case class Scores(followScore: Double, favScore: Double, logFavScore: Double)

  case class IntermediateDetails(
    numUsersWithAnyNonZeroScore: Int,
    numUsersWithNonZeroFollowScore: Int,
    numUsersWithNonZeroFavScore: Int,
    favQTree: Option[QTree[Double]],
    followQTree: Option[QTree[Double]],
    logFavQTree: Option[QTree[Double]],
    sumOfSquares: Scores,
    sum: Scores,
    min: Scores,
    max: Scores)

  case class InfoFromUserSource(
    fractionMarkedNSFWUser: Double,
    languageToFractionDeviceLanguage: Map[String, Double],
    countryCodeToFractionKnownForWithCountryCode: Map[String, Double],
    languageToFractionInferredLanguage: Map[String, Double])

  def positiveMin(a: Double, b: Double) = {
    if (math.min(a, b) == 0.0) math.max(a, b) else math.min(a, b)
  }

  case class ClusterDetailsSemigroup(implicit qtreeSemigroup: Semigroup[QTree[Double]])
      extends Semigroup[IntermediateDetails] {
    val optionMonoid: OptionMonoid[QTree[Double]] = new OptionMonoid[QTree[Double]]()
    override def plus(
      left: IntermediateDetails,
      right: IntermediateDetails
    ): IntermediateDetails = {
      IntermediateDetails(
        left.numUsersWithAnyNonZeroScore + right.numUsersWithAnyNonZeroScore,
        left.numUsersWithNonZeroFollowScore + right.numUsersWithNonZeroFollowScore,
        left.numUsersWithNonZeroFavScore + right.numUsersWithNonZeroFavScore,
        optionMonoid.plus(left.favQTree, right.favQTree),
        optionMonoid.plus(left.followQTree, right.followQTree),
        optionMonoid.plus(left.logFavQTree, right.logFavQTree),
        Scores(
          left.sumOfSquares.followScore + right.sumOfSquares.followScore,
          left.sumOfSquares.favScore + right.sumOfSquares.favScore,
          left.sumOfSquares.logFavScore + right.sumOfSquares.logFavScore
        ),
        Scores(
          left.sum.followScore + right.sum.followScore,
          left.sum.favScore + right.sum.favScore,
          left.sum.logFavScore + right.sum.logFavScore
        ),
        Scores(
          positiveMin(left.min.followScore, right.min.followScore),
          positiveMin(left.min.favScore, right.min.favScore),
          positiveMin(left.min.logFavScore, right.min.logFavScore)
        ),
        Scores(
          math.max(left.max.followScore, right.max.followScore),
          math.max(left.max.favScore, right.max.favScore),
          math.max(left.max.logFavScore, right.max.logFavScore)
        )
      )
    }
  }

  def intermediateDetailsPipe(
    input: TypedPipe[(Long, ClustersUserIsInterestedIn)],
    qtreeSemigroupKParameter: Int
  ): TypedPipe[(Int, IntermediateDetails)] = {
    implicit val qtSg: Semigroup[QTree[Double]] =
      new QTreeSemigroup[Double](qtreeSemigroupKParameter)
    implicit val cdSg: Semigroup[IntermediateDetails] = ClusterDetailsSemigroup()
    input
      .flatMap {
        case (userId, clusterScoresStruct) =>
          val clusterScoresArray = clusterScoresStruct.clusterIdToScores.toArray
          clusterScoresArray.map {
            case (clusterId, scoresStruct) =>
              val followScore = scoresStruct.followScore.getOrElse(0.0)
              val favScore = scoresStruct.favScore.getOrElse(0.0)
              val logFavScore = scoresStruct.logFavScore.getOrElse(0.0)
              (
                clusterId,
                IntermediateDetails(
                  numUsersWithAnyNonZeroScore = 1,
                  numUsersWithNonZeroFollowScore = if (followScore > 0) 1 else 0,
                  numUsersWithNonZeroFavScore = if (favScore > 0) 1 else 0,
                  favQTree = if (favScore > 0) Some(QTree(favScore)) else None,
                  followQTree = if (followScore > 0) Some(QTree(followScore)) else None,
                  logFavQTree = if (logFavScore > 0) Some(QTree(logFavScore)) else None,
                  sumOfSquares = Scores(
                    followScore * followScore,
                    favScore * favScore,
                    logFavScore * logFavScore),
                  sum = Scores(followScore, favScore, logFavScore),
                  min = Scores(followScore, favScore, logFavScore),
                  max = Scores(followScore, favScore, logFavScore)
                )
              )
          }
      }
      .sumByKey
      // Uncomment for adhoc job
      //.withReducers(100)
      .toTypedPipe
  }

  private def safeGetDoubleOpt(x: Option[Double]): Double = {
    x.map { y => if (y.isNaN) 0 else y }.getOrElse(0)
  }

  private def getSimilaritiesForAllPairs(
    input: TypedPipe[(Long, ClustersUserIsInterestedIn)]
  )(
    implicit uniqueID: UniqueID
  ): TypedPipe[((Int, Int), Scores)] = {
    val allClusterPairsBeforeSumByKey = Stat("all_cluster_pairs_before_sum_by_key")
    val clusterPairsWithin10Ratio = Stat("cluster_pairs_within_10_ratio")
    val clusterPairsBeforeTopK = Stat("cluster_pairs_before_thresholding")

    input
      .flatMap {
        case (userId, clusterScoresStruct) =>
          val clusterScoresArray = clusterScoresStruct.clusterIdToScores.toArray
          (0 until clusterScoresArray.length).flatMap { i =>
            (0 until clusterScoresArray.length).map { j =>
              val (clusterI, scoresI) = clusterScoresArray(i)
              val (clusterJ, scoresJ) = clusterScoresArray(j)
              val ratioOfSizes =
                scoresI.numUsersInterestedInThisClusterUpperBound.getOrElse(1).toDouble /
                  scoresJ.numUsersInterestedInThisClusterUpperBound.getOrElse(1).toDouble
              allClusterPairsBeforeSumByKey.inc()
              if (ratioOfSizes > 0.1 && ratioOfSizes < 10) {
                clusterPairsWithin10Ratio.inc()
              }
              val followI = safeGetDoubleOpt(scoresI.followScoreClusterNormalizedOnly)
              val followJ = safeGetDoubleOpt(scoresJ.followScoreClusterNormalizedOnly)
              val follow = followI * followJ
              val favI = safeGetDoubleOpt(scoresI.favScoreClusterNormalizedOnly)
              val favJ = safeGetDoubleOpt(scoresJ.favScoreClusterNormalizedOnly)
              val fav = favI * favJ
              val logFavI = safeGetDoubleOpt(scoresI.logFavScoreClusterNormalizedOnly)
              val logFavJ = safeGetDoubleOpt(scoresJ.logFavScoreClusterNormalizedOnly)
              val logFav = logFavI * logFavJ
              ((clusterI, clusterJ), (follow, fav, logFav))
            }
          }
      }
      .sumByKey
      // Uncomment for adhoc job
      //.withReducers(600)
      .map {
        case (key, (follow, fav, logFav)) =>
          clusterPairsBeforeTopK.inc()
          (key, Scores(follow, fav, logFav))
      }
  }

  private def keepTopNeighbors(
    allPairs: TypedPipe[((Int, Int), Scores)],
    cosineThreshold: Double
  )(
    implicit uniqueID: UniqueID
  ): TypedPipe[(Int, List[ClusterNeighbor])] = {
    val clusterPairsMoreThanThreshold = Stat("cluster_pairs_cosine_gt_" + cosineThreshold)
    val clusterPairsAfterTopK = Stat("cluster_pairs_after_topk")
    val clustersWithFewNeighbors = Stat(s"clusters_with_fewer_than_100_neighbors")
    val clustersWithManyNeighbors = Stat(s"clusters_with_more_than_100_neighbors")

    allPairs
      .flatMap {
        case ((cI, cJ), Scores(followScore, favScore, logFavScore)) =>
          if (followScore > cosineThreshold || logFavScore > cosineThreshold || favScore > cosineThreshold) {
            clusterPairsMoreThanThreshold.inc()
            Some((cI, ClusterNeighbor(cJ, Some(followScore), Some(favScore), Some(logFavScore))))
          } else None
      }
      .group
      .toList
      // Uncomment for adhoc job
      //.withReducers(40)
      .map {
        case (key, seq) =>
          val finalSize = seq.size
          clusterPairsAfterTopK.incBy(finalSize)
          if (finalSize < 100) {
            clustersWithFewNeighbors.inc()
          } else {
            clustersWithManyNeighbors.inc()
          }
          (
            key,
            seq.sortBy {
              case cn: ClusterNeighbor =>
                -(cn.followCosineSimilarity.getOrElse(0.0) + cn.logFavCosineSimilarity.getOrElse(
                  0.0)) / 2
            })
      }
  }

  def getTopSimilarClustersWithCosine(
    input: TypedPipe[(Long, ClustersUserIsInterestedIn)],
    cosineThreshold: Double
  )(
    implicit uniqueID: UniqueID
  ): TypedPipe[(Int, List[ClusterNeighbor])] = {
    keepTopNeighbors(getSimilaritiesForAllPairs(input), cosineThreshold)
  }

  def getDistributionDetails(
    qtree: QTree[Double],
    sum: Double,
    sumOfSquares: Double,
    min: Double,
    max: Double,
    fullSize: Int
  ): DistributionDetails = {
    val mean = sum / fullSize
    // note that the below is the naive calculation, and not the sample standard dev formula
    // that divides by n-1. I don't think it makes a difference at our scale whether we use n or n-1
    // and I'd rather use the simpler one.
    val stdDev = math.sqrt(sumOfSquares / fullSize - mean * mean)

    def getQB(percentile: Double): QuantileBounds = {
      val (lb, ub) = qtree.quantileBounds(percentile)
      QuantileBounds(lb, ub)
    }

    DistributionDetails(
      mean = mean,
      standardDeviation = Some(stdDev),
      min = Some(min),
      p25 = Some(getQB(0.25)),
      p50 = Some(getQB(0.5)),
      p75 = Some(getQB(0.75)),
      p95 = Some(getQB(0.95)),
      max = Some(max)
    )
  }

  def keepCorrectModel(
    input: TypedPipe[(Long, ClustersUserIsInterestedIn)],
    modelVersionToKeep: String
  )(
    implicit uniqId: UniqueID
  ): TypedPipe[(Long, ClustersUserIsInterestedIn)] = {
    val allRecords = Stat("all_input_records")
    val withCorrectVersion = Stat("with_correct_version")
    input.filter {
      case (_, clusterScoresStruct) =>
        // allRecords.inc()
        val result = clusterScoresStruct.knownForModelVersion == modelVersionToKeep
        // if (result) withCorrectVersion.inc()
        result
    }
  }

  def getInfoFromUserSource(
    knownFor: TypedPipe[(Int, List[(Long, Float)])],
    usersource: TypedPipe[FlatUser],
    inferredLanguages: TypedPipe[(Long, Seq[(String, Double)])]
  )(
    implicit uniqId: UniqueID
  ): TypedPipe[(Int, InfoFromUserSource)] = {
    val knownForUsers = knownFor.flatMap {
      case (clusterId, userScoreList) =>
        userScoreList.map {
          case (userId, _) =>
            (userId, clusterId)
        }
    }

    usersource
      .collect {
        case fuser: FlatUser if fuser.id.isDefined =>
          (
            fuser.id.get,
            (
              fuser.accountCountryCode.getOrElse(""),
              fuser.language.getOrElse(""),
              fuser.nsfwUser.getOrElse(false)
            ))
      }
      .join(knownForUsers)
      .leftJoin(inferredLanguages)
      .map {
        case (_, (((countryCode, language, nsfw), clusterId), inferredLangsOpt)) =>
          val nsfwInt = if (nsfw) 1 else 0
          (
            clusterId,
            (
              1,
              nsfwInt,
              Map(language -> 1),
              Map(countryCode -> 1),
              inferredLangsOpt.getOrElse(Seq(("", 1.0))).toMap
            )
          )
      }
      .sumByKey
      .mapValues {
        case (
              denominator,
              nsfwNumerator,
              languageNumeratorsMap,
              countryNumeratorsMap,
              inferredLangsNumeratorsMap) =>
          InfoFromUserSource(
            nsfwNumerator * 1.0 / denominator,
            languageNumeratorsMap.mapValues { x => x * 1.0 / denominator },
            countryNumeratorsMap.mapValues { x => x * 1.0 / denominator },
            inferredLangsNumeratorsMap.mapValues { x => x * 1.0 / denominator }
          )
      }
  }

  /**
   * Run the cluster details job and return the details for each cluster
   * @param input interestedIn data
   * @param qtreeSemigroupKParameter parameter for calculating percentiles using qtree monoid (set to a small number, usually < 7)
   * @param modelVersionToKeep which modelVersion to use from interestedIn dataset
   * @param knownFor clusterId -> users known for this cluster and their scores
   * @param knownForTranspose userId -> clusters this user is known for and their scores
   * @param usersource -> user source
   * @param simsGraph -> sims graph in the form of userId -> adjacency list
   * @param cosineThreshold -> cosine threshold to include a cluster in the list of similar clusters for a given cluster
   * @param uniqId
   * @return pipe with (modelVersion, clusterId) as the key and ClusterDetails struct as the value.
   */
  def run(
    input: TypedPipe[(Long, ClustersUserIsInterestedIn)],
    qtreeSemigroupKParameter: Int,
    modelVersionToKeep: String,
    knownFor: TypedPipe[(Int, List[(Long, Float)])],
    knownForTranspose: TypedPipe[(Long, Array[(Int, Float)])],
    usersource: Option[TypedPipe[FlatUser]],
    inferredLanguageSource: Option[TypedPipe[(Long, Seq[(String, Double)])]],
    simsGraph: Option[TypedPipe[(Long, Map[Long, Float])]],
    cosineThreshold: Double
  )(
    implicit uniqId: UniqueID
  ): Execution[TypedPipe[((String, Int), ClusterDetails)]] = {
    val topSimilarClusters = getTopSimilarClustersWithCosine(input, cosineThreshold)
    val infoFromUserSource: TypedPipe[(Int, InfoFromUserSource)] = (for {
      us <- usersource
      inferredLanguages <- inferredLanguageSource
    } yield getInfoFromUserSource(knownFor, us, inferredLanguages)).getOrElse(TypedPipe.empty)

    val clusterEvaluationExec = simsGraph match {
      case Some(sg) =>
        ClusterEvaluation.clusterLevelEvaluation(sg, knownForTranspose, "eval")
      case None =>
        val dummyPipe: TypedPipe[(Int, (Int, ClusterQuality))] = TypedPipe.empty
        Execution.from(dummyPipe)
    }

    clusterEvaluationExec
      .map { clusterIdToSizesAndQualities =>
        val clusterQualities: TypedPipe[(Int, ClusterQuality)] =
          clusterIdToSizesAndQualities.mapValues(_._2)
        intermediateDetailsPipe(
          keepCorrectModel(input, modelVersionToKeep),
          qtreeSemigroupKParameter)
          .leftJoin(topSimilarClusters)
          .leftJoin(infoFromUserSource)
          .leftJoin(clusterQualities)
          .join(knownFor)
          .map {
            case (
                  clusterId,
                  (
                    (
                      ((intermediateDetails, topSimilarNeighborsOpt), userSourceInfoOpt),
                      qualityOpt),
                    knownForUsers)
                ) =>
              val knownForSorted = knownForUsers.sortBy(-_._2).map {
                case (userId, score) =>
                  UserWithScore(userId, score)
              }
              (modelVersionToKeep, clusterId) ->
                ClusterDetails(
                  numUsersWithAnyNonZeroScore = intermediateDetails.numUsersWithAnyNonZeroScore,
                  numUsersWithNonZeroFavScore = intermediateDetails.numUsersWithNonZeroFavScore,
                  numUsersWithNonZeroFollowScore =
                    intermediateDetails.numUsersWithNonZeroFollowScore,
                  favScoreDistributionDetails = intermediateDetails.favQTree.map { qt =>
                    getDistributionDetails(
                      qtree = qt,
                      sum = intermediateDetails.sum.favScore,
                      sumOfSquares = intermediateDetails.sumOfSquares.favScore,
                      min = intermediateDetails.min.favScore,
                      max = intermediateDetails.max.favScore,
                      fullSize = intermediateDetails.numUsersWithNonZeroFavScore
                    )
                  },
                  followScoreDistributionDetails = intermediateDetails.followQTree.map { qt =>
                    getDistributionDetails(
                      qtree = qt,
                      sum = intermediateDetails.sum.followScore,
                      sumOfSquares = intermediateDetails.sumOfSquares.followScore,
                      min = intermediateDetails.min.followScore,
                      max = intermediateDetails.max.followScore,
                      fullSize = intermediateDetails.numUsersWithNonZeroFollowScore
                    )
                  },
                  logFavScoreDistributionDetails = intermediateDetails.logFavQTree.map { qt =>
                    getDistributionDetails(
                      qtree = qt,
                      sum = intermediateDetails.sum.logFavScore,
                      sumOfSquares = intermediateDetails.sumOfSquares.logFavScore,
                      min = intermediateDetails.min.logFavScore,
                      max = intermediateDetails.max.logFavScore,
                      // note: user has non-zero fav score iff a user has non-zero log-fav score
                      fullSize = intermediateDetails.numUsersWithNonZeroFavScore
                    )
                  },
                  knownForUsersAndScores = Some(knownForSorted),
                  neighborClusters = topSimilarNeighborsOpt,
                  fractionKnownForMarkedNSFWUser = userSourceInfoOpt.map(_.fractionMarkedNSFWUser),
                  languageToFractionDeviceLanguage =
                    userSourceInfoOpt.map(_.languageToFractionDeviceLanguage),
                  countryCodeToFractionKnownForWithCountryCode =
                    userSourceInfoOpt.map(_.countryCodeToFractionKnownForWithCountryCode),
                  qualityMeasuredOnSimsGraph = qualityOpt,
                  languageToFractionInferredLanguage =
                    userSourceInfoOpt.map(_.languageToFractionInferredLanguage),
                )
          }
      }
  }

  def getTruncatedSims(
    sims: TypedPipe[Candidates],
    maxNeighbors: Int
  ): TypedPipe[(Long, Map[Long, Float])] = {
    sims.map { cands =>
      (
        cands.userId,
        // These candidates are already sorted, but leaving it in just in case the behavior changes upstream
        cands.candidates
          .map { c => (c.userId, c.score.toFloat) }.sortBy(-_._2).take(maxNeighbors).toMap
      )
    }
  }
}

/**
scalding remote run --main-class com.twitter.simclusters_v2.scalding.ClusterDetailsAdhoc \
 --target src/scala/com/twitter/simclusters_v2/scalding:cluster_details-adhoc \
 --hadoop-properties "scalding.with.reducers.set.explicitly=true mapreduce.job.reduces=4000" \
 --user recos-platform -- \
 --date 2020-06-25 \
 --dateForUserSource 2020-06-25 \
 --includeUserSource \
 --outputDir /user/recos-platform/adhoc/your_ldap/cluster_details_inferred_lang
 */
object ClusterDetailsAdhoc extends TwitterExecutionApp {
  implicit val tz: java.util.TimeZone = DateOps.UTC
  implicit val dp = DateParser.default

  def job: Execution[Unit] =
    Execution.getConfigMode.flatMap {
      case (config, mode) =>
        Execution.withId { implicit uniqueId =>
          val args = config.getArgs
          val date = DateRange.parse(args("dateForUserSource"))
          val (knownFor, knownForTranspose) =
            args
              .optional("knownForDir").map { location =>
                (
                  KnownForSources.transpose(KnownForSources.readKnownFor(location)),
                  KnownForSources.readKnownFor(location)
                )
              }.getOrElse(
                (
                  KnownForSources.clusterToKnownFor_20M_145K_updated,
                  KnownForSources.knownFor_20M_145K_updated
                )
              )

          val interestedIn = args
            .optional("inputDir").map { interestedInInputDir =>
              TypedPipe.from(AdhocKeyValSources.interestedInSource(interestedInInputDir))
            }.getOrElse(
              DAL
                .readMostRecentSnapshotNoOlderThan(
                  SimclustersV2InterestedIn20M145KUpdatedScalaDataset,
                  Days(14))
                .withRemoteReadPolicy(ExplicitLocation(ProcAtla))
                .toTypedPipe
                .map {
                  case KeyVal(userId, clustersUserIsInterestedIn) =>
                    (userId, clustersUserIsInterestedIn)
                }
            )

          val userSourceOpt = if (args.boolean("includeUserSource")) {
            Some(DAL.readMostRecentSnapshot(UsersourceFlatScalaDataset, date).toTypedPipe)
          } else None

          val inferredLanguagesOpt = if (args.boolean("includeUserSource")) {
            Some(ExternalDataSources.inferredUserProducedLanguageSource)
          } else None

          val simsGraphOpt = args.optional("simsForEvalInputDir").map { sgDir =>
            ClusterDetailsJob.getTruncatedSims(
              TypedPipe.from(WTFCandidatesSource(sgDir)),
              args.int("maxSimsNeighborsForEval", 20)
            )
          }

          Util.printCounters(
            ClusterDetailsJob
              .run(
                interestedIn,
                args.int("qtreeSemigroupKParameter", 3),
                args.getOrElse("modelVersion", "20M_145K_updated"),
                knownFor,
                knownForTranspose,
                userSourceOpt,
                inferredLanguagesOpt,
                simsGraphOpt,
                cosineThreshold = args.double("cosineThreshold", 0.01)
              ).flatMap(
                _.writeExecution(AdhocKeyValSources.clusterDetailsSource(args("outputDir"))))
          )
        }
    }
}

trait ClusterDetailsBatchTrait extends TwitterScheduledExecutionApp {
  implicit val tz = DateOps.UTC
  implicit val parser = DateParser.default

  def firstTime: String
  def batchIncrement: Duration
  def manhattanOutputPath: String
  def clusterDetailsLiteOutputPath: String
  def modelVersion: String
  def knownForDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]]
  def interestedInDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsInterestedIn]]
  def outputDataset: KeyValDALDataset[KeyVal[(String, Int), ClusterDetails]]
  def clusterDetailsLiteOutputDataset: SnapshotDALDataset[ClusterDetailsLite]

  private lazy val execArgs = AnalyticsBatchExecutionArgs(
    batchDesc = BatchDescription(this.getClass.getName.replace("$", "")),
    firstTime = BatchFirstTime(RichDate(firstTime)),
    lastTime = None,
    batchIncrement = BatchIncrement(batchIncrement)
  )

  override def scheduledJob: Execution[Unit] = AnalyticsBatchExecution(execArgs) {
    implicit dateRange =>
      Execution.withId { implicit uniqueId =>
        Execution.withArgs { args =>
          val qtreeSemigroupKParameter = args.int("qtreeSemigroupKParameter", 5)
          val maxSimsNeighborsForEval = args.int("maxSimsNeighborsForEval", 20)
          val knownForTranspose =
            KnownForSources.fromKeyVal(
              DAL.readMostRecentSnapshot(knownForDataset, dateRange.extend(Days(7))).toTypedPipe,
              modelVersion)
          val knownFor = KnownForSources.transpose(knownForTranspose)
          val cosineThreshold = args.double("cosineThreshold", 0.01)
          val interestedIn =
            DAL
              .readMostRecentSnapshot(interestedInDataset, dateRange.extend(Days(7)))
              .toTypedPipe
              .map {
                case KeyVal(userId, clustersUserIsInterestedIn) =>
                  (userId, clustersUserIsInterestedIn)
              }
          val sims = if (modelVersion == ModelVersions.Model20M145K2020) {
            // The model version 20m_145k_2020 uses approximate_cosine_follow as the input sims graph
            // to cluster users. The same graph is used to evaluate the clusters
            TypedPipe
              .from(FollowingsCosineSimilaritiesManhattanSource())
              .map(_._2)
          } else {
            TypedPipe.from(
              SimsCandidatesSource()(
                dateRange = dateRange,
                suffixPath = "/classified_candidates_rollup"
              ))
          }
          val resultExec = ClusterDetailsJob
            .run(
              interestedIn,
              qtreeSemigroupKParameter,
              modelVersion,
              knownFor,
              knownForTranspose,
              Some(DAL.readMostRecentSnapshot(UsersourceFlatScalaDataset, dateRange).toTypedPipe),
              Some(ExternalDataSources.inferredUserProducedLanguageSource),
              Some(
                ClusterDetailsJob.getTruncatedSims(sims, maxNeighbors = maxSimsNeighborsForEval)),
              cosineThreshold
            ).flatMap { resultUnmapped =>
              val clusterDetailsExec = resultUnmapped
                .map {
                  case (clusterKey, details) =>
                    KeyVal(clusterKey, details)
                }.writeDALVersionedKeyValExecution(
                  outputDataset,
                  D.Suffix(manhattanOutputPath)
                )

              val clusterDetailsLiteExec =
                resultUnmapped
                  .map {
                    case ((_, clusterId), details)
                        if modelVersion == ModelVersions.Model20M145KDec11 =>
                      ClusterDetailsLite(
                        FullClusterId(ModelVersion.Model20m145kDec11, clusterId),
                        details.numUsersWithAnyNonZeroScore,
                        details.numUsersWithNonZeroFollowScore,
                        details.numUsersWithNonZeroFavScore,
                        details.knownForUsersAndScores.getOrElse(Nil)
                      )
                    case ((_, clusterId), details)
                        if modelVersion == ModelVersions.Model20M145KUpdated =>
                      ClusterDetailsLite(
                        FullClusterId(ModelVersion.Model20m145kUpdated, clusterId),
                        details.numUsersWithAnyNonZeroScore,
                        details.numUsersWithNonZeroFollowScore,
                        details.numUsersWithNonZeroFavScore,
                        details.knownForUsersAndScores.getOrElse(Nil)
                      )
                    case ((_, clusterId), details)
                        if modelVersion == ModelVersions.Model20M145K2020 =>
                      ClusterDetailsLite(
                        FullClusterId(ModelVersion.Model20m145k2020, clusterId),
                        details.numUsersWithAnyNonZeroScore,
                        details.numUsersWithNonZeroFollowScore,
                        details.numUsersWithNonZeroFavScore,
                        details.knownForUsersAndScores.getOrElse(Nil)
                      )
                  }.writeDALSnapshotExecution(
                    clusterDetailsLiteOutputDataset,
                    D.Daily,
                    D.Suffix(clusterDetailsLiteOutputPath),
                    D.EBLzo(),
                    dateRange.end)

              Execution.zip(clusterDetailsExec, clusterDetailsLiteExec)
            }

          Util.printCounters(resultExec)
        }
      }
  }

}

object ClusterDetailsBatch extends ClusterDetailsBatchTrait {
  override val firstTime: String = "2018-07-28"
  override val batchIncrement: Duration = Days(7)

  override val manhattanOutputPath: String =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_cluster_details"

  override val clusterDetailsLiteOutputPath: String =
    "/user/cassowary/processed/simclusters_v2_cluster_details_lite"

  override val modelVersion: String = ModelVersions.Model20M145KDec11
  override val knownForDataset = SimclustersV2KnownFor20M145KDec11ScalaDataset
  override val interestedInDataset = SimclustersV2InterestedInScalaDataset
  override val outputDataset = SimclustersV2ClusterDetailsScalaDataset
  override val clusterDetailsLiteOutputDataset =
    SimclustersV2ClusterDetailsLiteScalaDataset
}

object ClusterDetails20M145KUpdated extends ClusterDetailsBatchTrait {
  override val firstTime: String = "2019-06-16"
  override val batchIncrement: Duration = Days(7)

  override val manhattanOutputPath: String =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_cluster_details_20m_145k_updated"

  override val clusterDetailsLiteOutputPath: String =
    "/user/cassowary/processed/simclusters_v2_cluster_details_lite_20m_145k_updated"

  override val modelVersion: String = ModelVersions.Model20M145KUpdated
  override val knownForDataset = SimclustersV2KnownFor20M145KUpdatedScalaDataset
  override val interestedInDataset = SimclustersV2InterestedIn20M145KUpdatedScalaDataset
  override val outputDataset = SimclustersV2ClusterDetails20M145KUpdatedScalaDataset
  override val clusterDetailsLiteOutputDataset =
    SimclustersV2ClusterDetailsLite20M145KUpdatedScalaDataset
}

/**
 * capesospy-v2 update --build_locally --start_cron cluster_details_20m_145k_2020 \
 * src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml
 */
object ClusterDetails20M145K2020 extends ClusterDetailsBatchTrait {
  override val firstTime: String = "2020-10-15"
  override val batchIncrement: Duration = Days(7)

  override val manhattanOutputPath: String =
    "/user/cassowary/manhattan_sequence_files/simclusters_v2_cluster_details_20m_145k_2020"

  override val clusterDetailsLiteOutputPath: String =
    "/user/cassowary/processed/simclusters_v2_cluster_details_lite_20m_145k_2020"

  override val modelVersion: String = ModelVersions.Model20M145K2020
  override val knownForDataset = SimclustersV2KnownFor20M145K2020ScalaDataset
  override val interestedInDataset = SimclustersV2InterestedIn20M145K2020ScalaDataset
  override val outputDataset = SimclustersV2ClusterDetails20M145K2020ScalaDataset
  override val clusterDetailsLiteOutputDataset =
    SimclustersV2ClusterDetailsLite20M145K2020ScalaDataset
}

/**
scalding remote run --main-class com.twitter.simclusters_v2.scalding.DumpClusterDetailsAdhoc \
 --target src/scala/com/twitter/simclusters_v2/scalding:cluster_details-dump \
 --user recos-platform -- \
 --date 2020-06-25 \
 --clusterIds 5542 129677 48645 \
 --inputDir /user/recos-platform/adhoc/your_ldap/cluster_details_inferred_lang
 */
object DumpClusterDetailsAdhoc extends TwitterExecutionApp {
  def job: Execution[Unit] =
    Execution.getConfigMode.flatMap {
      case (config, mode) =>
        Execution.withId { implicit uniqueId =>
          val args = config.getArgs
          val clusters = args.list("clusterIds").map(_.toInt).toSet //(1 to 2500).toSet //
          TypedPipe
            .from(AdhocKeyValSources.clusterDetailsSource(args("inputDir")))
            .filter { case ((modelVersion, clusterId), details) => clusters.contains(clusterId) }
|
|
||||||
.toIterableExecution
|
|
||||||
.map { iter =>
|
|
||||||
iter.foreach { x => println(Util.prettyJsonMapper.writeValueAsString(x)) }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* ./bazel bundle src/scala/com/twitter/simclusters_v2/scalding:cluster_details && \
|
|
||||||
* oscar hdfs --user cassowary --host hadoopnest2.atla.twitter.com --bundle cluster_details \
|
|
||||||
* --tool com.twitter.simclusters_v2.scalding.DumpClusterSimilaritiesAdhoc --screen --screen-detached \
|
|
||||||
* --tee your_ldap/dumpClusterSimilarities_20200103 -- \
|
|
||||||
* --inputDir /user/cassowary/manhattan_sequence_files/simclusters_v2_cluster_details_20m_145k_updated/ \
|
|
||||||
* --outputDir adhoc/your_ldap
|
|
||||||
*/
|
|
||||||
object DumpClusterSimilaritiesAdhoc extends TwitterExecutionApp {
|
|
||||||
def job: Execution[Unit] =
|
|
||||||
Execution.getConfigMode.flatMap {
|
|
||||||
case (config, mode) =>
|
|
||||||
Execution.withId { implicit uniqueId =>
|
|
||||||
val args = config.getArgs
|
|
||||||
TypedPipe
|
|
||||||
.from(AdhocKeyValSources.clusterDetailsSource(args("inputDir")))
|
|
||||||
.flatMap {
|
|
||||||
case ((_, clusterId), details) =>
|
|
||||||
details.neighborClusters.getOrElse(Nil).map { neighbor =>
|
|
||||||
val compositeScore = (neighbor.followCosineSimilarity
|
|
||||||
.getOrElse(0.0) + neighbor.favCosineSimilarity.getOrElse(0.0)) / 2
|
|
||||||
(
|
|
||||||
clusterId,
|
|
||||||
neighbor.clusterId,
|
|
||||||
"%.4f".format(compositeScore)
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}.writeExecution(TypedTsv(args("outputDir")))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
Binary file not shown.
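A minimal sketch of the composite neighbor score used by DumpClusterSimilaritiesAdhoc above: it averages the follow-based and fav-based cosine similarities, treating a missing score as 0.0. The NeighborCluster shape below is an assumed stand-in for the generated thrift struct, included only for illustration.

// Illustrative only; not part of the deleted file.
case class NeighborCluster(
  clusterId: Int,
  followCosineSimilarity: Option[Double],
  favCosineSimilarity: Option[Double])

def compositeScore(n: NeighborCluster): Double =
  (n.followCosineSimilarity.getOrElse(0.0) + n.favCosineSimilarity.getOrElse(0.0)) / 2

// e.g. compositeScore(NeighborCluster(42, Some(0.8), None)) == 0.4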
@ -1,607 +0,0 @@
package com.twitter.simclusters_v2.scalding

import com.twitter.algebird.Monoid
import com.twitter.algebird.mutable.PriorityQueueMonoid
import com.twitter.dal.client.dataset.KeyValDALDataset
import com.twitter.pluck.source.cassowary.FollowingsCosineSimilaritiesManhattanSource
import com.twitter.scalding._
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.job.TwitterExecutionApp
import com.twitter.scalding_internal.job.analytics_batch._
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.simclusters_v2.common.ModelVersions
import com.twitter.simclusters_v2.hdfs_sources._
import com.twitter.simclusters_v2.scalding.common.TypedRichPipe._
import com.twitter.simclusters_v2.scalding.common.Util
import com.twitter.simclusters_v2.scalding.common.Util.Distribution
import com.twitter.simclusters_v2.thriftscala.ClusterQuality
import com.twitter.simclusters_v2.thriftscala.ClustersUserIsKnownFor
import com.twitter.usersource.snapshot.flat.UsersourceFlatScalaDataset
import java.util.PriorityQueue
import scala.collection.JavaConverters._

object ClusterEvaluation {

  val samplerMonoid: PriorityQueueMonoid[((Long, Long), (Double, Double))] =
    Util.reservoirSamplerMonoidForPairs[(Long, Long), (Double, Double)](5000)(Util.edgeOrdering)

  case class ClusterResults(
    numEdgesInsideCluster: Int,
    wtOfEdgesInsideCluster: Double,
    numEdgesOutsideCluster: Int,
    wtOfEdgesOutsideCluster: Double,
    originalWtAndProductOfNodeScoresSample: PriorityQueue[((Long, Long), (Double, Double))]) {
    def clusterQuality(clusterSize: Int, averagePrecisionWholeGraph: Double): ClusterQuality = {
      val unweightedRecallDenominator = numEdgesInsideCluster + numEdgesOutsideCluster
      val unweightedRecall = if (unweightedRecallDenominator > 0) {
        numEdgesInsideCluster.toDouble / unweightedRecallDenominator.toDouble
      } else 0.0

      val weightedRecallDenominator = wtOfEdgesInsideCluster + wtOfEdgesOutsideCluster
      val weightedRecall = if (weightedRecallDenominator > 0) {
        wtOfEdgesInsideCluster / weightedRecallDenominator
      } else 0.0

      val precision = if (clusterSize > 1) {
        Some(wtOfEdgesInsideCluster / (clusterSize * (clusterSize - 1)))
      } else Some(0.0)

      val relativePrecision = if (averagePrecisionWholeGraph > 0) {
        precision.flatMap { p => Some(p / averagePrecisionWholeGraph) }
      } else Some(0.0)

      ClusterQuality(
        unweightedRecall = Some(unweightedRecall),
        weightedRecall = Some(weightedRecall),
        unweightedRecallDenominator = Some(unweightedRecallDenominator),
        weightedRecallDenominator = Some(weightedRecallDenominator),
        relativePrecisionNumerator = precision,
        relativePrecision = relativePrecision,
        weightAndProductOfNodeScoresCorrelation = Some(
          Util.computeCorrelation(
            originalWtAndProductOfNodeScoresSample.iterator.asScala.map(_._2)))
      )
    }
  }

  object ClusterResultsMonoid extends Monoid[ClusterResults] {
    override def zero = ClusterResults(0, 0, 0, 0, samplerMonoid.zero)
    override def plus(l: ClusterResults, r: ClusterResults) = ClusterResults(
      l.numEdgesInsideCluster + r.numEdgesInsideCluster,
      l.wtOfEdgesInsideCluster + r.wtOfEdgesInsideCluster,
      l.numEdgesOutsideCluster + r.numEdgesOutsideCluster,
      l.wtOfEdgesOutsideCluster + r.wtOfEdgesOutsideCluster,
      samplerMonoid
        .plus(l.originalWtAndProductOfNodeScoresSample, r.originalWtAndProductOfNodeScoresSample)
    )
  }

  /**
   * Evaluate the quality of a cluster.
   * @param memberScores A map with the members of the cluster as the keys and their scores
   *                     inside the cluster as values. The more central a member is inside the cluster,
   *                     the higher its score is.
   * @param membersAdjLists A map that gives the weighted neighbors of each member in the cluster.
   */
  def evaluateCluster(
    memberScores: Map[Long, Double],
    membersAdjLists: Map[Long, Map[Long, Float]]
  ): ClusterResults = {
    val resultsIter = membersAdjLists.flatMap {
      case (fromNodeId, adjList) =>
        val fromNodeWt = memberScores.getOrElse(fromNodeId, 0.0)
        adjList.map {
          case (toNodeId, edgeWt) =>
            if (memberScores.contains(toNodeId)) {
              val productOfMembershipScores = fromNodeWt * memberScores(toNodeId)
              ClusterResults(
                1,
                edgeWt,
                0,
                0,
                samplerMonoid.build(
                  ((fromNodeId, toNodeId), (edgeWt.toDouble, productOfMembershipScores))))
            } else {
              ClusterResults(0, 0, 1, edgeWt, samplerMonoid.zero)
            }
        }
    }
    Monoid.sum(resultsIter)(ClusterResultsMonoid)
  }

  /**
   * Evaluate each cluster with respect to the provided graph.
   * @param graph graph represented via the adjacency lists of each node, needs to be symmetrized i.e. if u is in v's adjlist, then v needs to be in u's adjlist as well
   * @param clusters cluster memberships of each node.
   * @param statsPrefix convenience argument to act as prefix for stats counters
   * @return key-value pipe with clusterId as key and (size of the cluster, quality struct) as value
   */
  def clusterLevelEvaluation(
    graph: TypedPipe[(Long, Map[Long, Float])],
    clusters: TypedPipe[(Long, Array[(Int, Float)])],
    statsPrefix: String = ""
  )(
    implicit uniqueId: UniqueID
  ): Execution[TypedPipe[(Int, (Int, ClusterQuality))]] = {
    val numRealClusters = Stat(s"${statsPrefix}/numRealClusters")
    val numFakeClusters = Stat(s"${statsPrefix}/numFakeClusters")

    val numNodesAndEdgesExec = graph
      .map {
        case (nId, nbrMap) =>
          (1L, nbrMap.size.toLong, nbrMap.values.sum.toDouble)
      }.sum.getExecution

    numNodesAndEdgesExec.map {
      case (numNodes, numEdges, sumOfAllEdgeWts) =>
        println("numNodes " + numNodes)
        println("numEdges " + numEdges)
        println("sumOfAllEdgeWts " + sumOfAllEdgeWts)

        val numFakeClustersForUnassignedNodes = numNodes / 1e4

        val averagePrecisionWholeGraph = sumOfAllEdgeWts / (numNodes * (numNodes - 1))
        graph
          .leftJoin(clusters)
          // uncomment for adhoc job
          .withReducers(200)
          .flatMap {
            case (nodeId, (adjList, assignedClustersOpt)) =>
              val nodeDegree = adjList.size.toLong
              val nodeWeightedDegree = adjList.values.sum
              assignedClustersOpt match {
                case Some(assignedClusters) if assignedClusters.nonEmpty =>
                  assignedClusters.toList.map {
                    case (clusterId, scoreOfNodeInCluster) =>
                      (
                        clusterId,
                        (
                          Map(nodeId -> (scoreOfNodeInCluster.toDouble, adjList)),
                          1,
                          nodeDegree,
                          nodeWeightedDegree))
                  }
                case _ =>
                  // For nodes that don't belong to any cluster, create a fake clusterId (0 or lesser)
                  // and add the node's statistics to that clusterId. We don't need the adjacency lists for
                  // unassigned nodes, we'll simply track how many edges are incident on those nodes and their weighted sum etc
                  val fakeClusterId =
                    (-1 * (math.abs(
                      Util.hashToLong(nodeId)) % numFakeClustersForUnassignedNodes)).toInt
                  List(
                    (
                      fakeClusterId,
                      (
                        Map.empty[Long, (Double, Map[Long, Float])],
                        1,
                        nodeDegree,
                        nodeWeightedDegree)))
              }
          }
          .sumByKey
          // uncomment for adhoc job
          .withReducers(60)
          .map {
            case (clusterId, (membersMap, clusterSize, volumeOfCluster, weightedVolumeOfCluster)) =>
              if (clusterId > 0) {
                numRealClusters.inc()

                val scoresMap =
                  if (clusterId > 0) membersMap.mapValues(_._1) else Map.empty[Long, Double]
                val adjListsMap = membersMap.mapValues(_._2)

                val quality = evaluateCluster(scoresMap, adjListsMap)
                  .clusterQuality(clusterSize, averagePrecisionWholeGraph)

                (clusterId, (clusterSize, quality))
              } else {
                // clusterId <= 0 means that this is a fake cluster.
                numFakeClusters.inc()
                (
                  clusterId,
                  (
                    clusterSize,
                    ClusterQuality(
                      unweightedRecallDenominator = Some(volumeOfCluster),
                      weightedRecallDenominator = Some(weightedVolumeOfCluster)
                    )
                  )
                )
              }
          }
    }
  }

  case class OverallResults(
    unweightedRecall: Double,
    edgesInsideClusters: Long,
    allEdges: Long,
    allNodes: Int,
    weightedRecall: Double,
    wtOnEdgesInsideClusters: Double,
    wtOnAllEdges: Double,
    weightCorrelation: Double,
    relativePrecision: Double,
    numUnassignedNodes: Int,
    numAssignedNodes: Int,
    sizeDist: Distribution,
    recallDist: Distribution,
    weightedRecallDist: Distribution,
    relativePrecisionDist: Distribution,
    weightCorrelationDist: Distribution,
    numClustersWithNegativeCorrelation: Double,
    numClustersWithZeroRecall: Double,
    numClustersWithLessThanOneRelativePrecision: Double,
    numSingletonClusters: Int)

  def summarizePerClusterResults(
    perClusterResults: TypedPipe[(Int, (Int, ClusterQuality))]
  ): Execution[Option[OverallResults]] = {
    perClusterResults
      .map {
        case (clusterId, (size, quality)) =>
          val unweightedRecallDen = quality.unweightedRecallDenominator.getOrElse(0.0)
          val unweightedRecallNum = quality.unweightedRecall.getOrElse(0.0) * unweightedRecallDen
          val weightedRecallDen = quality.weightedRecallDenominator.getOrElse(0.0)
          val weightedRecallNum = quality.weightedRecall.getOrElse(0.0) * weightedRecallDen

          val weightCorrelationDen = size
          val weightCorrelationNum =
            weightCorrelationDen * quality.weightAndProductOfNodeScoresCorrelation
              .getOrElse(0.0)

          val relativePrecisionDen = size
          val relativePrecisionNum = relativePrecisionDen * quality.relativePrecision.getOrElse(0.0)

          val numClustersWithNegativeCorrelation =
            if (weightCorrelationNum < 0 && clusterId > 0) 1 else 0
          val numClustersWithLessThanOneRelativePrecision =
            if (quality.relativePrecision.getOrElse(0.0) < 1 && clusterId > 0) 1 else 0
          val numClustersWithZeroRecall = if (weightedRecallNum < 1e-5 && clusterId > 0) 1 else 0
          val numUnassignedNodes = if (clusterId < 1) size else 0
          val numAssignedNodes = if (clusterId > 0) size else 0
          val numSingletonClusters = if (clusterId > 0 && size == 1) 1 else 0

          (
            unweightedRecallDen,
            unweightedRecallNum,
            weightedRecallDen,
            weightedRecallNum,
            weightCorrelationDen,
            weightCorrelationNum,
            relativePrecisionDen,
            relativePrecisionNum,
            numClustersWithNegativeCorrelation,
            numClustersWithLessThanOneRelativePrecision,
            numClustersWithZeroRecall,
            List(size.toDouble),
            List(quality.unweightedRecall.getOrElse(0.0)),
            List(quality.weightedRecall.getOrElse(0.0)),
            List(quality.relativePrecision.getOrElse(0.0)),
            List(quality.weightAndProductOfNodeScoresCorrelation.getOrElse(0.0)),
            numUnassignedNodes,
            numAssignedNodes,
            numSingletonClusters
          )
      }
      .sum
      .toOptionExecution
      .map { opt =>
        opt.map {
          case (
                unweightedRecallDen,
                unweightedRecallNum,
                weightedRecallDen,
                weightedRecallNum,
                weightCorrelationDen,
                weightCorrelationNum,
                relativePrecisionDen,
                relativePrecisionNum,
                numClustersWithNegativeCorrelation,
                numClustersWithLessThanOneRelativePrecision,
                numClustersWithZeroRecall,
                sizeList,
                unweightedRecallList,
                weightedRecallList,
                relativePrecisionList,
                weightCorrelationList,
                numUnassignedNodes,
                numAssignedNodes,
                numSingletonClusters) =>
            OverallResults(
              unweightedRecall = unweightedRecallNum / unweightedRecallDen,
              edgesInsideClusters = unweightedRecallNum.toLong,
              allEdges = unweightedRecallDen.toLong,
              allNodes = numAssignedNodes + numUnassignedNodes,
              weightedRecall = weightedRecallNum / weightedRecallDen,
              wtOnEdgesInsideClusters = weightedRecallNum,
              wtOnAllEdges = weightedRecallDen,
              weightCorrelation = weightCorrelationNum / weightCorrelationDen,
              relativePrecision = relativePrecisionNum / relativePrecisionDen,
              numAssignedNodes = numAssignedNodes,
              numUnassignedNodes = numUnassignedNodes,
              sizeDist = Util.distributionFromArray(sizeList.toArray),
              recallDist = Util.distributionFromArray(unweightedRecallList.toArray),
              weightedRecallDist = Util.distributionFromArray(weightedRecallList.toArray),
              weightCorrelationDist = Util.distributionFromArray(weightCorrelationList.toArray),
              relativePrecisionDist = Util.distributionFromArray(relativePrecisionList.toArray),
              numClustersWithNegativeCorrelation = numClustersWithNegativeCorrelation,
              numClustersWithLessThanOneRelativePrecision =
                numClustersWithLessThanOneRelativePrecision,
              numClustersWithZeroRecall = numClustersWithZeroRecall,
              numSingletonClusters = numSingletonClusters
            )
        }
      }
  }

  /**
   * @param graph Input similarity graph, needs to be symmetrized i.e. if u is in v's adjlist, then v needs to be in u's adjlist as well
   * @param clusters cluster assignments to be evaluated
   * @return summary of results
   */
  def overallEvaluation(
    graph: TypedPipe[(Long, Map[Long, Float])],
    clusters: TypedPipe[(Long, Array[(Int, Float)])],
    statsPrefix: String
  )(
    implicit uniqueId: UniqueID
  ): Execution[Option[OverallResults]] = {
    clusterLevelEvaluation(graph, clusters, statsPrefix).flatMap(summarizePerClusterResults)
  }
}

/**
 * ./bazel bundle src/scala/com/twitter/simclusters_v2/scalding:cluster_evaluation && \
 * oscar hdfs --user frigate --host hadoopnest1.atla.twitter.com --bundle cluster_evaluation \
 * --tool com.twitter.simclusters_v2.scalding.ClusterEvaluationAdhoc --screen --screen-detached \
 * --tee logs/clusterQualityFor_updatedUnnormalizedInputScores_usingSims20190318 -- \
 * --simsInputDir /user/frigate/your_ldap/commonDirForClusterEvaluation/classifiedSims_20190314_copiedFromAtlaProc \
 * --topK 20000000 --date 2019-03-18 --minActiveFollowers 400 \
 * --topUsersDir /user/frigate/your_ldap/commonDirForClusterEvaluation/top20MUsers_minActiveFollowers400_20190215 \
 * --maxSimsNeighborsForEval 40 \
 * --preparedSimsGraph /user/frigate/your_ldap/commonDirForClusterEvaluation/symmetrized_classifiedSims20190318_top20MUsers \
 * --outputDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/knownForClusterEvaluation \
 * --knownForDir /user/frigate/your_ldap/dirFor_updatedKnownFor20M_145K_dec11_usingSims20190127_unnormalizedInputScores/knownFor
 */
object ClusterEvaluationAdhoc extends TwitterExecutionApp {
  implicit val tz: java.util.TimeZone = DateOps.UTC
  implicit val dp = DateParser.default

  def job: Execution[Unit] =
    Execution.getConfigMode.flatMap {
      case (config, mode) =>
        Execution.withId { implicit uniqueId =>
          val args = config.getArgs
          val knownFor = args
            .optional("knownForDir").map { location =>
              KnownForSources.readKnownFor(location)
            }.getOrElse(KnownForSources.knownFor_20M_Dec11_145K)

          val minActiveFollowers = args.int("minActiveFollowers", 400)
          val topK = args.int("topK")
          val date = DateRange.parse(args("date"))

          val topUsersExec =
            TopUsersSimilarityGraph
              .topUsers(
                DAL.readMostRecentSnapshot(UsersourceFlatScalaDataset, date).toTypedPipe,
                minActiveFollowers,
                topK
              )
              .map(_.id)
              .count("num_top_users")
              .make(TypedTsv(args("topUsersDir")))

          val simsGraphExec = topUsersExec.flatMap { topUsers =>
            TopUsersSimilarityGraph.makeGraph(
              TopUsersSimilarityGraph.getSubgraphFromUserGroupedInput(
                TypedPipe.from(WTFCandidatesSource(args("simsInputDir"))),
                topUsers,
                args.int("maxSimsNeighborsForEval", 40),
                degreeThresholdForStat = 5
              ),
              args("preparedSimsGraph")
            )
          }

          val fullExec = simsGraphExec.flatMap { sims =>
            ClusterEvaluation
              .clusterLevelEvaluation(sims, knownFor, "eval")
              .flatMap { clusterResultsPipe =>
                val clusterResults = clusterResultsPipe.forceToDiskExecution
                val outputExec = clusterResults.flatMap { pipe =>
                  pipe
                    .map {
                      case (clusterId, (clusterSize, quality)) =>
                        "%d\t%d\t%.2g\t%.2g\t%.1f\t%.2g\t%.2f\t%.2g\t%.2g"
                          .format(
                            clusterId,
                            clusterSize,
                            quality.unweightedRecall.getOrElse(0.0),
                            quality.weightedRecall.getOrElse(0.0),
                            quality.unweightedRecallDenominator.getOrElse(0.0),
                            quality.weightedRecallDenominator.getOrElse(0.0),
                            quality.relativePrecision.getOrElse(0.0),
                            quality.relativePrecisionNumerator.getOrElse(0.0),
                            quality.weightAndProductOfNodeScoresCorrelation.getOrElse(0.0)
                          )
                    }.writeExecution(TypedTsv(args("outputDir")))
                }

                val printExec = clusterResults.flatMap { pipe =>
                  ClusterEvaluation.summarizePerClusterResults(pipe).map {
                    case Some(res) =>
                      println("Overall results: " + Util.prettyJsonMapper.writeValueAsString(res))
                    case None =>
                      println("No overall results!!! Probably cluster results pipe is empty.")
                  }
                }

                Execution.zip(outputExec, printExec)
              }
          }

          Util.printCounters(fullExec)
        }
    }
}

trait ClusterEvaluationBatch extends TwitterScheduledExecutionApp {
  implicit val tz: java.util.TimeZone = DateOps.UTC
  implicit val dp = DateParser.default

  def firstTime: String

  def batchDescription: String

  def batchIncrement: Duration

  private lazy val execArgs = AnalyticsBatchExecutionArgs(
    batchDesc = BatchDescription(batchDescription),
    firstTime = BatchFirstTime(RichDate(firstTime)),
    lastTime = None,
    batchIncrement = BatchIncrement(batchIncrement)
  )

  val emailAddress: String = "no-reply@twitter.com"

  def knownForDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]]

  def knownForModelVersion: String

  def baselineKnownForDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]]

  def baselineKnownForModelVersion: String

  override def scheduledJob: Execution[Unit] =
    AnalyticsBatchExecution(execArgs) { implicit dateRange =>
      Execution.withId { implicit uniqueId =>
        Execution.withArgs { args =>
          val baselineKnownFor =
            KnownForSources.fromKeyVal(
              DAL
                .readMostRecentSnapshot(baselineKnownForDALDataset, dateRange.prepend(Days(7)))
                .toTypedPipe,
              baselineKnownForModelVersion
            )

          val knownFor =
            KnownForSources.fromKeyVal(
              DAL
                .readMostRecentSnapshot(knownForDALDataset, dateRange.prepend(Days(7)))
                .toTypedPipe,
              knownForModelVersion
            )

          val inputSimsGraph = TypedPipe
            .from(FollowingsCosineSimilaritiesManhattanSource())
            .map(_._2)

          val minActiveFollowers = args.int("minActiveFollowers")
          val topK = args.int("topK")
          val maxSimsNeighborsForEval =
            args.int("maxSimsNeighborsForEval", 40)

          val topUsers = TopUsersSimilarityGraph
            .topUsers(
              DAL
                .readMostRecentSnapshot(UsersourceFlatScalaDataset, dateRange)
                .toTypedPipe,
              minActiveFollowers,
              topK
            )
            .map(_.id)
            .count("num_top_users")

          TopUsersSimilarityGraph
            .getSubgraphFromUserGroupedInput(
              fullGraph = inputSimsGraph,
              usersToInclude = topUsers,
              maxNeighborsPerNode = maxSimsNeighborsForEval,
              degreeThresholdForStat = 2
            )
            .forceToDiskExecution
            .flatMap { symmetrizedSims =>
              val baselineResultsExec = ClusterEvaluation
                .overallEvaluation(symmetrizedSims, baselineKnownFor, "baselineKnownForEval")
              val newResultsExec = ClusterEvaluation
                .overallEvaluation(symmetrizedSims, knownFor, "newKnownForEval")
              val minSizeOfBiggerClusterForComparison = 10
              val compareExec = CompareClusters.summarize(
                CompareClusters.compare(
                  KnownForSources.transpose(baselineKnownFor),
                  KnownForSources.transpose(knownFor),
                  minSizeOfBiggerCluster = minSizeOfBiggerClusterForComparison
                ))

              Execution
                .zip(baselineResultsExec, newResultsExec, compareExec)
                .map {
                  case (oldResults, newResults, compareResults) =>
                    val emailText =
                      s"Evaluation Results for baseline knownFor: $baselineKnownForModelVersion \n" +
                        Util.prettyJsonMapper.writeValueAsString(oldResults) +
                        "\n\n-------------------\n\n" +
                        s"Evaluation Results for new knownFor:$knownForModelVersion\n" +
                        Util.prettyJsonMapper.writeValueAsString(newResults) +
                        "\n\n-------------------\n\n" +
                        s"Cosine similarity distribution between $baselineKnownForModelVersion and " +
                        s"$knownForModelVersion cluster membership vectors for " +
                        s"clusters with at least $minSizeOfBiggerClusterForComparison members:\n" +
                        Util.prettyJsonMapper
                          .writeValueAsString(compareResults)

                    Util
                      .sendEmail(
                        emailText,
                        s"Evaluation results comparing $knownForModelVersion with baseline $baselineKnownForModelVersion",
                        emailAddress)
                    ()
                }
            }
        }
      }
    }
}

/**
 * capesospy-v2 update --build_locally --start_cron cluster_evaluation_for_20M_145k \
 * src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml
 */
object ClusterEvaluationFor20M145K extends ClusterEvaluationBatch {
  override val firstTime: String = "2019-06-11"

  override val batchIncrement: Duration = Days(7)

  override val batchDescription = "com.twitter.simclusters_v2.scalding.ClusterEvaluationFor20M145K"

  override val knownForDALDataset = SimclustersV2KnownFor20M145KUpdatedScalaDataset

  override val knownForModelVersion = ModelVersions.Model20M145KUpdated

  override val baselineKnownForDALDataset = SimclustersV2KnownFor20M145KDec11ScalaDataset

  override val baselineKnownForModelVersion = ModelVersions.Model20M145KDec11
}

/**
 * capesospy-v2 update --build_locally --start_cron cluster_evaluation_for_20M_145k_2020 \
 * src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml
 */
object ClusterEvaluationFor20M145K2020 extends ClusterEvaluationBatch {
  override val firstTime: String = "2021-01-25"

  override val batchIncrement: Duration = Days(7)

  override val batchDescription =
    "com.twitter.simclusters_v2.scalding.ClusterEvaluationFor20M145K2020"

  override val knownForDALDataset = SimclustersV2KnownFor20M145K2020ScalaDataset

  override val knownForModelVersion = ModelVersions.Model20M145K2020

  override val baselineKnownForDALDataset = SimclustersV2KnownFor20M145KUpdatedScalaDataset

  override val baselineKnownForModelVersion = ModelVersions.Model20M145KUpdated
}
Binary file not shown.
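As a rough illustration of the per-cluster quality numbers computed in ClusterEvaluation above (unweighted and weighted recall over edges incident on the cluster, plus precision relative to the whole-graph average), here is a small self-contained sketch; the standalone function and its names are assumptions for illustration, not the job's API.

// Sketch: given edge counts/weights split into inside-cluster vs outside-cluster,
// compute the same style of recall and relative-precision numbers as clusterQuality.
def clusterQualitySketch(
  numEdgesInside: Int,
  wtEdgesInside: Double,
  numEdgesOutside: Int,
  wtEdgesOutside: Double,
  clusterSize: Int,
  avgPrecisionWholeGraph: Double
): (Double, Double, Double) = {
  val unweightedRecall =
    if (numEdgesInside + numEdgesOutside > 0)
      numEdgesInside.toDouble / (numEdgesInside + numEdgesOutside)
    else 0.0
  val weightedRecall =
    if (wtEdgesInside + wtEdgesOutside > 0) wtEdgesInside / (wtEdgesInside + wtEdgesOutside)
    else 0.0
  // precision = total inside weight over the number of ordered member pairs,
  // then normalized by the whole-graph average precision.
  val precision =
    if (clusterSize > 1) wtEdgesInside / (clusterSize * (clusterSize - 1)) else 0.0
  val relativePrecision =
    if (avgPrecisionWholeGraph > 0) precision / avgPrecisionWholeGraph else 0.0
  (unweightedRecall, weightedRecall, relativePrecision)
}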
@ -1,131 +0,0 @@
package com.twitter.simclusters_v2.scalding

import com.twitter.scalding.{DateOps, DateParser, Execution, Stat, TypedPipe, TypedTsv, UniqueID}
import com.twitter.scalding_internal.job.TwitterExecutionApp
import com.twitter.simclusters_v2.common.{ClusterId, UserId}
import com.twitter.simclusters_v2.scalding.common.Util
import com.twitter.simclusters_v2.scalding.common.Util.Distribution

object CompareClusters {
  def norm(a: Iterable[Float]): Float = {
    math
      .sqrt(a.map { x => x * x }.sum).toFloat
  }

  def cosine(a: Map[Long, Float], b: Map[Long, Float]): Float = {
    val intersect = a.toList.collect {
      case (id, score) if b.contains(id) =>
        score * b(id)
    }
    val dot = if (intersect.nonEmpty) intersect.sum else 0
    val aNorm = norm(a.values)
    val bNorm = norm(b.values)
    if (aNorm > 0 && bNorm > 0) {
      dot / aNorm / bNorm
    } else 0
  }

  /**
   * Compare two known-for data sets, and generate stats on changes in cluster assignment
   */
  def compareClusterAssignments(
    newKnownFor: TypedPipe[(UserId, List[(ClusterId, Float)])],
    oldKnownFor: TypedPipe[(UserId, List[(ClusterId, Float)])]
  )(
    implicit uniqueID: UniqueID
  ): Execution[String] = {

    val emptyToSomething = Stat("no_assignment_to_some")
    val somethingToEmpty = Stat("some_assignment_to_none")
    val emptyToEmpty = Stat("empty_to_empty")
    val sameCluster = Stat("same_cluster")
    val diffCluster = Stat("diff_cluster")

    val calculateStatExec = newKnownFor
      .outerJoin(oldKnownFor)
      .map {
        case (userId, (newKnownForListOpt, oldKnownForListOpt)) =>
          val newKnownFor = newKnownForListOpt.getOrElse(Nil)
          val oldKnownFor = oldKnownForListOpt.getOrElse(Nil)

          if (newKnownFor.nonEmpty && oldKnownFor.isEmpty) {
            emptyToSomething.inc()
          }
          if (newKnownFor.isEmpty && oldKnownFor.nonEmpty) {
            somethingToEmpty.inc()
          }
          if (newKnownFor.isEmpty && oldKnownFor.isEmpty) {
            emptyToEmpty.inc()
          }

          if (newKnownFor.nonEmpty && oldKnownFor.nonEmpty) {
            val newClusterId = newKnownFor.head._1
            val oldClusterId = oldKnownFor.head._1

            if (newClusterId == oldClusterId) {
              sameCluster.inc()
            } else {
              diffCluster.inc()
            }
          }
          userId
      }
      .toIterableExecution

    Util.getCustomCountersString(calculateStatExec)
  }

  /**
   * Compare two cluster assignments in terms of cosine similarity of corresponding clusters.
   * Excludes clusters which are too small
   * @param knownForA
   * @param knownForB
   * @param minSizeOfBiggerCluster Set to 10 or some such.
   * @return
   */
  def compare(
    knownForA: TypedPipe[(Int, List[(Long, Float)])],
    knownForB: TypedPipe[(Int, List[(Long, Float)])],
    minSizeOfBiggerCluster: Int
  ): TypedPipe[(Int, Float)] = {
    knownForA
      .outerJoin(knownForB)
      .collect {
        case (clusterId, (membersInAOpt, membersInBOpt))
            if membersInAOpt.exists(_.size >= minSizeOfBiggerCluster) || membersInBOpt
              .exists(_.size >= minSizeOfBiggerCluster) =>
          val membersInA =
            membersInAOpt.map(_.toMap).getOrElse(Map.empty[Long, Float])
          val membersInB =
            membersInBOpt.map(_.toMap).getOrElse(Map.empty[Long, Float])
          (clusterId, cosine(membersInA, membersInB))
      }
  }

  def summarize(clusterToCosines: TypedPipe[(Int, Float)]): Execution[Option[Distribution]] = {
    clusterToCosines.values.map(x => List(x)).sum.toOptionExecution.map { listOpt =>
      listOpt.map { list => Util.distributionFromArray(list.map(_.toDouble).toArray) }
    }
  }
}

object CompareClustersAdhoc extends TwitterExecutionApp {
  implicit val tz: java.util.TimeZone = DateOps.UTC
  implicit val dp = DateParser.default

  def job: Execution[Unit] =
    Execution.getConfigMode.flatMap {
      case (config, mode) =>
        Execution.withId { implicit uniqueId =>
          val args = config.getArgs

          val knownForA = KnownForSources.transpose(KnownForSources.readKnownFor(args("knownForA")))
          val knownForB = KnownForSources.transpose(KnownForSources.readKnownFor(args("knownForB")))

          CompareClusters
            .compare(knownForA, knownForB, minSizeOfBiggerCluster = 10)
            .map { case (cId, cos) => "%d\t%.2f".format(cId, cos) }
            .writeExecution(TypedTsv(args("outputDir")))
        }
    }
}
Binary file not shown.
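The cosine comparison in CompareClusters above treats each cluster as a sparse member-to-weight map; a hedged, dependency-free sketch of the same calculation (on Double maps rather than the job's Float maps, with illustrative names):

// Sketch of cosine similarity between two sparse vectors keyed by member id.
def cosineSketch(a: Map[Long, Double], b: Map[Long, Double]): Double = {
  val dot = a.iterator.collect { case (id, v) if b.contains(id) => v * b(id) }.sum
  def norm(m: Map[Long, Double]): Double = math.sqrt(m.values.map(v => v * v).sum)
  val denom = norm(a) * norm(b)
  if (denom > 0) dot / denom else 0.0
}

// e.g. cosineSketch(Map(1L -> 1.0, 2L -> 1.0), Map(2L -> 1.0)) is roughly 0.707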
@ -1,330 +0,0 @@
package com.twitter.simclusters_v2.scalding

import com.twitter.algebird.Monoid
import com.twitter.logging.Logger
import com.twitter.scalding.{Execution, TypedPipe, TypedTsv}
import com.twitter.scalding_internal.job.TwitterExecutionApp
import com.twitter.simclusters_v2.hdfs_sources.AdhocKeyValSources
import java.util
import no.uib.cipr.matrix.Matrix
import no.uib.cipr.matrix.sparse.{ArpackSym, LinkedSparseMatrix}
import scala.collection.JavaConverters._

object EigenVectorsForSparseSymmetric {
  val log: Logger = Logger()

  /**
   * Construct matrix from the rows of the matrix, specified as a map. The outer map is indexed by rowId, and the inner maps are indexed by columnId.
   * Note that the input matrix is intended to be symmetric.
   *
   * @param map A map specifying the rows of the matrix. The outer map is indexed by rowId, and the inner maps are indexed by columnId. Both rows and columns are zero-indexed.
   * @param nRows number of rows in matrix
   * @param nCols number of columns in matrix
   *
   * @return the constructed matrix
   */
  def getMatrix(map: Map[Int, Map[Int, Double]], nRows: Int, nCols: Int): Matrix = {
    val nonzeros = map.toSeq.flatMap {
      case (i, subMap) =>
        subMap.toSeq.map {
          case (j, value) =>
            (i, j, value)
        }
    }
    getMatrix(nonzeros, nRows, nCols)
  }

  /**
   * Construct matrix from iterable of the non-zero entries. Note that the input matrix is intended to be symmetric.
   *
   * @param nonzeros non-zeros in (i, j, v) format, where i is row, j is column, and v is value. Both rows and columns are zero-indexed.
   * @param nRows number of rows in matrix
   * @param nCols number of columns in matrix
   *
   * @return the constructed matrix
   */
  def getMatrix(nonzeros: Iterable[(Int, Int, Double)], nRows: Int, nCols: Int): Matrix = {
    val matrix = new LinkedSparseMatrix(nRows, nCols)
    var numEntries = 0
    var maxRow = 0
    var maxCol = 0

    nonzeros.foreach {
      case (i, j, v) =>
        if (i > maxRow) {
          maxRow = i
        }
        if (j > maxCol) {
          maxCol = j
        }
        numEntries += 1
        matrix.set(i, j, v)
    }
    log.info(
      "Finished building matrix with %d entries and maxRow %d and maxCol %d"
        .format(numEntries, maxRow, maxCol))

    matrix
  }

  /**
   * Prints out various diagnostics about how much the given matrix differs from a perfect
   * symmetric matrix. If (i,j) and (j,i) are different, it sets both of them to be the max of the two.
   * Call this function before invoking EVD.
   *
   * @param matrix Matrix which is modified (if need be) in place.
   */
  def ensureMatrixIsSymmetric(matrix: Matrix): Unit = {
    var numUnequalEntries = 0
    var numEntriesDifferentBy1Percent = 0
    var numEqualEntries = 0
    var numUnequalDueToZero = 0
    var maxUnequal = (0, 0, 0.0, 0.0)
    matrix.iterator().asScala.foreach { entry =>
      val curr = entry.get()
      val opp = matrix.get(entry.column(), entry.row())
      if (curr == opp) {
        numEqualEntries += 1
      } else {
        numUnequalEntries += 1
        if (opp == 0) {
          numUnequalDueToZero += 1
        }
        if (opp != 0 && (math.abs(curr - opp) / math.min(curr, opp)) > 0.01) {
          numEntriesDifferentBy1Percent += 1
        }
        if (opp != 0 && math.abs(curr - opp) > maxUnequal._4) {
          maxUnequal = (entry.row(), entry.column(), curr, math.abs(curr - opp))
        }
        val max = math.max(curr, opp)
        matrix.set(entry.column(), entry.row(), max)
        matrix.set(entry.row(), entry.column(), max)
      }
    }

    var numUnEqualPrinted = 0
    matrix.iterator().asScala.foreach { entry =>
      val opp = matrix.get(entry.column(), entry.row())
      if (numUnEqualPrinted < 10 && entry.get() != opp) {
        numUnEqualPrinted += 1
        log.info(
          "Entries for (%d, %d) are %s and %s"
            .format(entry.row(), entry.column(), entry.get(), opp))
      }
    }

    log.info(
      "Num unequal entries: %d, num unequal due to zero: %d, num unequal by 1percent or more: %d, num equal entries: %d, maxUnequal: %s"
        .format(
          numUnequalEntries,
          numUnequalDueToZero,
          numEntriesDifferentBy1Percent,
          numEqualEntries,
          maxUnequal))
  }

  /**
   * Get the top-k eigenvalues (largest magnitude) and eigenvectors for an input matrix.
   * Top eigenvalues means they're the largest in magnitude.
   * Input matrix needs to be perfectly symmetric; if it's not, this function will fail.
   *
   * Many of the eigenvectors will have very small values along most of the dimensions. This method also
   * only retains the bigger entries in an eigenvector.
   *
   * @param matrix symmetric input matrix.
   * @param k how many of the top eigenvectors to get.
   * @param ratioToLargestCutoff An entry needs to be at least 1/ratioToLargestCutoff of the biggest entry in that vector to be retained.
   *
   * @return seq of (eigenvalue, eigenvector) pairs.
   */
  def getTruncatedEVD(
    matrix: Matrix,
    k: Int,
    ratioToLargestCutoff: Float
  ): Seq[(Double, Seq[(Int, Double)])] = {
    val solver = new ArpackSym(matrix)
    val resultsMap = solver.solve(k, ArpackSym.Ritz.LM).asScala.toMap
    val results = resultsMap.toIndexedSeq.sortBy { case (eigValue, _) => -eigValue }
    results.zipWithIndex.map {
      case ((eigValue, denseVectorJava), index) =>
        val denseVector = new Array[Double](denseVectorJava.size())
        denseVector.indices.foreach { index => denseVector(index) = denseVectorJava.get(index) }
        val denseVectorMax = denseVector.maxBy { entry => math.abs(entry) }
        val cutOff = math.abs(denseVectorMax) / ratioToLargestCutoff
        val significantEntries = denseVector.zipWithIndex
          .filter { case (vectorEntry, _) => math.abs(vectorEntry) >= cutOff }
          .sortBy { case (vectorEntry, _) => -1 * math.abs(vectorEntry) }
        (eigValue.toDouble, significantEntries.toSeq.map(_.swap))
    }
  }

  /**
   * Compute U*Diag*Ut - where Diag is a diagonal matrix, and U is a sparse matrix.
   * This is primarily for testing - to make sure that the computed eigenvectors can be used to
   * reconstruct the original matrix up to some reasonable approximation.
   *
   * @param diagToUColumns seq of (diagonal entries, associated column in U)
   * @param cutoff cutoff for including a value in the result.
   *
   * @return result of multiplication, returned as a map of the rows in the results.
   */
  def uTimesDiagTimesUT(
    diagToUColumns: Seq[(Double, Seq[(Int, Double)])],
    cutoff: Double
  ): Map[Int, Map[Int, Double]] = {
    val result = new util.HashMap[Int, util.HashMap[Int, Double]]()
    diagToUColumns.foreach {
      case (diag, uColumn) =>
        uColumn.foreach {
          case (i, iVal) =>
            uColumn.foreach {
              case (j, jVal) =>
                val prod = diag * iVal * jVal
                if (result.containsKey(i)) {
                  val newVal = if (result.get(i).containsKey(j)) {
                    result.get(i).get(j) + prod
                  } else prod
                  result.get(i).put(j, newVal)
                } else {
                  result.put(i, new util.HashMap[Int, Double])
                  result.get(i).put(j, prod)
                }
            }
        }
    }
    val unfiltered = result.asScala.toMap.mapValues(_.asScala.toMap)
    unfiltered
      .mapValues { m => m.filter { case (_, value) => math.abs(value) >= cutoff } }
      .filter { case (_, vector) => vector.nonEmpty }
  }

  /** Note: This requires a full EVD to correctly compute the inverse! :-( */
  def getInverseFromEVD(
    evd: Seq[(Double, Seq[(Int, Double)])],
    cutoff: Double
  ): Map[Int, Map[Int, Double]] = {
    val evdInverse = evd.map {
      case (eigValue, eigVector) =>
        (1.0 / eigValue, eigVector)
    }
    uTimesDiagTimesUT(evdInverse, cutoff)
  }
}

object PCAProjectionMatrixAdhoc extends TwitterExecutionApp {
  val log = Logger()

  def job: Execution[Unit] =
    Execution.getConfigMode.flatMap {
      case (config, _) =>
        Execution.withId { _ =>
          val args = config.getArgs
          val k = args.int("k", 100)
          val ratioToLargestEntryInVectorCutoff = args.int("ratioToLargestEntryInVectorCutoff", 100)
          val minClusterFavers = args.int("minClusterFavers", 1000)
          val input = TypedPipe.from(AdhocKeyValSources.clusterDetailsSource(args("inputDir")))
          val outputDir = args("outputDir")

          val filteredClustersExec =
            input
              .collect {
                case ((_, clusterId), details)
                    if details.numUsersWithNonZeroFavScore > minClusterFavers =>
                  clusterId
              }
              .toIterableExecution
              .map { fc =>
                val fcSet = fc.toSet
                log.info("Number of clusters with favers more than %d is %d"
                  .format(minClusterFavers, fcSet.size))
                fcSet
              }

          filteredClustersExec
            .flatMap { filteredClusters =>
              input.flatMap {
                case ((_, clusterId), details) =>
                  if (filteredClusters(clusterId)) {
                    details.neighborClusters.getOrElse(Nil).collect {
                      case neighbor
                          if filteredClusters(
                            neighbor.clusterId) && neighbor.favCosineSimilarity.isDefined =>
                        (clusterId, neighbor.clusterId, neighbor.favCosineSimilarity.get)
                    }
                  } else Nil
              }.toIterableExecution
            }
            .flatMap { edgesIter =>
              val edges = edgesIter.toSeq
              val oldIdToNewId = edges
                .flatMap { case (i, j, _) => Seq(i, j) }
                .distinct
                .zipWithIndex
                .toMap

              val mapString = oldIdToNewId.toList
                .take(5).map {
                  case (old, nw) =>
                    Seq(old, nw).mkString(" ")
                }.mkString("\n")
              log.info("A few entries of OldId to NewId map is")
              log.info(mapString)

              val newIdToOldId = oldIdToNewId.map(_.swap)
              log.info(
                "Num clusters after filtering out those with no neighbors with favers more than %d is %d"
                  .format(minClusterFavers, oldIdToNewId.size))
              val newEdges = edges.map {
                case (oldI, oldJ, value) =>
                  (oldIdToNewId(oldI), oldIdToNewId(oldJ), value)
              }
              log.info("Going to build matrix")
              val matrix = EigenVectorsForSparseSymmetric.getMatrix(
                newEdges,
                oldIdToNewId.size,
                oldIdToNewId.size)
              EigenVectorsForSparseSymmetric.ensureMatrixIsSymmetric(matrix)

              log.info("Going to solve now for %d eigenvalues".format(k))
              val tic = System.currentTimeMillis()
              val results = EigenVectorsForSparseSymmetric.getTruncatedEVD(
                matrix,
                k,
                ratioToLargestEntryInVectorCutoff)
              val toc = System.currentTimeMillis()
              log.info("Finished solving in %.2f minutes".format((toc - tic) / 1000 / 60.0))

              val eigValues = results.map(_._1).map { x => "%.3g".format(x) }.mkString(" ")
              val eigValueNorm = math.sqrt(results.map(_._1).map(x => x * x).sum)
              val matrixNorm = math.sqrt(matrix.iterator().asScala.map(_.get()).map(x => x * x).sum)

              println(
                "matrixNorm %s, eigValueNorm %s, explained fraction %s"
                  .format(matrixNorm, eigValueNorm, eigValueNorm / matrixNorm))

              log.info("The eigenvalues are:")
              log.info(eigValues)

              val nnzInEigenVectors = results.map(_._2.size).sum
              log.info("Average nnz per eigenvector using ratioToLargestCutoff %d is %.2g"
                .format(ratioToLargestEntryInVectorCutoff, nnzInEigenVectors * 1.0 / results.size))
              val transposedRaw = results.zipWithIndex.flatMap {
                case ((_, eigVector), eigIndex) =>
                  eigVector.map {
                    case (index, vectorEntry) =>
                      val clusterId = newIdToOldId(index)
                      Map(clusterId -> List((eigIndex, vectorEntry)))
                  }
              }
              val transposed = Monoid.sum(transposedRaw).mapValues { rowForCluster =>
                rowForCluster
                  .map {
                    case (dimId, weight) =>
                      "%d:%.2g".format(dimId, weight)
                  }.mkString(" ")
              }
              TypedPipe.from(transposed.toSeq).writeExecution(TypedTsv(outputDir))
            }
        }
    }
}
Binary file not shown.
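The uTimesDiagTimesUT helper above reconstructs an approximation of the original matrix from the truncated eigendecomposition (the sum over eigenpairs of lambda * v * v^T). Below is a small sketch of the same idea using plain Scala maps; all names are illustrative assumptions, not the library's API.

// Sketch: reconstruct A ~= sum_k lambda_k * v_k * v_k^T from sparse eigenvectors,
// accumulating entries keyed by (row, column).
def reconstructSketch(
  eigenPairs: Seq[(Double, Seq[(Int, Double)])]
): Map[(Int, Int), Double] = {
  eigenPairs.foldLeft(Map.empty[(Int, Int), Double]) {
    case (acc, (lambda, vector)) =>
      vector.foldLeft(acc) {
        case (accInner, (i, vi)) =>
          vector.foldLeft(accInner) {
            case (accInnermost, (j, vj)) =>
              val key = (i, j)
              accInnermost.updated(key, accInnermost.getOrElse(key, 0.0) + lambda * vi * vj)
          }
      }
  }
}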
@ -1,332 +0,0 @@
package com.twitter.simclusters_v2.scalding

import com.twitter.dal.client.dataset.KeyValDALDataset
import com.twitter.dal.client.dataset.SnapshotDALDataset
import com.twitter.scalding._
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.DALWrite.D
import com.twitter.scalding_internal.dalv2.DALWrite.WriteExtension
import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossClusterSameDC
import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossDC
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.simclusters_v2.common.ClusterId
import com.twitter.simclusters_v2.common.ModelVersions
import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.hdfs_sources.AdhocKeyValSources
import com.twitter.simclusters_v2.hdfs_sources.AggregatableProducerSimclustersEmbeddingsByLogFavScore2020ScalaDataset
import com.twitter.simclusters_v2.hdfs_sources.SimclustersV2InterestedInFromAggregatableProducerEmbeddings20M145K2020ScalaDataset
import com.twitter.simclusters_v2.hdfs_sources.SimclustersV2UserToInterestedInFromAggregatableProducerEmbeddings20M145K2020ScalaDataset
import com.twitter.simclusters_v2.hdfs_sources.UserAndNeighborsFixedPathSource
import com.twitter.simclusters_v2.hdfs_sources.UserUserNormalizedGraphScalaDataset
import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn
import com.twitter.simclusters_v2.thriftscala.InternalId
import com.twitter.simclusters_v2.thriftscala.ModelVersion
import com.twitter.simclusters_v2.thriftscala.SimClustersEmbedding
import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId
import com.twitter.simclusters_v2.thriftscala.UserAndNeighbors
import com.twitter.simclusters_v2.thriftscala.UserToInterestedInClusterScores
import com.twitter.simclusters_v2.thriftscala.UserToInterestedInClusters
import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp
import com.twitter.wtf.scalding.jobs.common.ScheduledExecutionApp
import java.util.TimeZone

/**
 * Production job for computing interestedIn data set from the aggregatable producer embeddings for the model version 20M145K2020.
 * It writes the data set in KeyVal format to produce a MH DAL data set.
 *
 * A high level description of this job:
 * - Read the APE dataset
 * - Apply log1p to the scores from the above dataset as the scores for producers is high
 * - Normalize the scores for each producer (offline benchmarking has shown better results from this step.)
 * - Truncate the number of clusters for each producer from the APE dataset to reduce noise
 * - Compute interestedIn
 *
 * To deploy the job:
 *
 * capesospy-v2 update --build_locally --start_cron interested_in_from_ape_2020 \
 * src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml
 */
object InterestedInFromAPE2020BatchApp extends InterestedInFromAggregatableProducerEmbeddingsBase {

  override val firstTime: RichDate = RichDate("2021-03-03")

  override val batchIncrement: Duration = Days(7)

  override def modelVersion: ModelVersion = ModelVersion.Model20m145k2020

  override def producerEmbeddingsInputKVDataset: KeyValDALDataset[
    KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]
  ] = AggregatableProducerSimclustersEmbeddingsByLogFavScore2020ScalaDataset

  override def interestedInFromAPEOutputKVDataset: KeyValDALDataset[
    KeyVal[UserId, ClustersUserIsInterestedIn]
  ] = SimclustersV2InterestedInFromAggregatableProducerEmbeddings20M145K2020ScalaDataset

  override def interestedInFromAPEOutputThriftDatset: SnapshotDALDataset[
    UserToInterestedInClusters
  ] = SimclustersV2UserToInterestedInFromAggregatableProducerEmbeddings20M145K2020ScalaDataset
}

trait InterestedInFromAggregatableProducerEmbeddingsBase extends ScheduledExecutionApp {
  def modelVersion: ModelVersion

  def interestedInFromAPEOutputKVDataset: KeyValDALDataset[
    KeyVal[UserId, ClustersUserIsInterestedIn]
  ]

  def producerEmbeddingsInputKVDataset: KeyValDALDataset[
    KeyVal[SimClustersEmbeddingId, SimClustersEmbedding]
  ]

  def interestedInFromAPEOutputThriftDatset: SnapshotDALDataset[UserToInterestedInClusters]

  override def runOnDateRange(
    args: Args
  )(
    implicit dateRange: DateRange,
    timeZone: TimeZone,
    uniqueID: UniqueID
  ): Execution[Unit] = {
    // Input args for the run
    val socialProofThreshold = args.int("socialProofThreshold", 2)
    val maxClustersFromProducer = args.int("maxClustersPerProducer", 5)
    val maxClustersPerUserFinalResult = args.int("maxInterestedInClustersPerUser", 200)

    // Path variables
    val interestedInFromProducersPath =
      s"/user/cassowary/manhattan_sequence_files/interested_in_from_ape/" + modelVersion

    val interestedInFromProducersThriftPath =
      s"/user/cassowary/manhattan_sequence_files/interested_in_from_ape_thrift/" + modelVersion

    val userUserGraph: TypedPipe[UserAndNeighbors] =
      DAL
        .readMostRecentSnapshotNoOlderThan(UserUserNormalizedGraphScalaDataset, Days(30))
        .withRemoteReadPolicy(AllowCrossDC)
        .toTypedPipe

    val producerEmbeddings = DAL
      .readMostRecentSnapshotNoOlderThan(
        producerEmbeddingsInputKVDataset,
        Days(30)).withRemoteReadPolicy(AllowCrossClusterSameDC).toTypedPipe.map {
        case KeyVal(producer, embeddings) => (producer, embeddings)
      }

    val result = InterestedInFromAggregatableProducerEmbeddingsBase.run(
      userUserGraph,
      producerEmbeddings,
      maxClustersFromProducer,
      socialProofThreshold,
      maxClustersPerUserFinalResult,
      modelVersion)

    val keyValExec =
      result
        .map { case (userId, clusters) => KeyVal(userId, clusters) }
        .writeDALVersionedKeyValExecution(
          interestedInFromAPEOutputKVDataset,
          D.Suffix(interestedInFromProducersPath)
        )
    val thriftExec =
      result
        .map {
          case (userId, clusters) =>
            UserToInterestedInClusters(
              userId,
              ModelVersions.toKnownForModelVersion(modelVersion),
              clusters.clusterIdToScores)
        }
        .writeDALSnapshotExecution(
          interestedInFromAPEOutputThriftDatset,
          D.Daily,
          D.Suffix(interestedInFromProducersThriftPath),
          D.EBLzo(),
          dateRange.end
        )
    Execution.zip(keyValExec, thriftExec).unit
  }
}

/**
 * Adhoc job to generate the interestedIn from aggregatable producer embeddings for the model version 20M145K2020
 *
 * scalding remote run \
 * --user cassowary \
 * --keytab /var/lib/tss/keys/fluffy/keytabs/client/cassowary.keytab \
 * --principal service_acoount@TWITTER.BIZ \
 * --cluster bluebird-qus1 \
 * --main-class com.twitter.simclusters_v2.scalding.InterestedInFromAPE2020AdhocApp \
 * --target src/scala/com/twitter/simclusters_v2/scalding:interested_in_from_ape_2020-adhoc \
 * --hadoop-properties "mapreduce.map.memory.mb=8192 mapreduce.map.java.opts='-Xmx7618M' mapreduce.reduce.memory.mb=8192 mapreduce.reduce.java.opts='-Xmx7618M'" \
 * -- --outputDir /gcs/user/cassowary/adhoc/your_ldap/interested_in_from_ape_2020_keyval --date 2021-03-05
 */
object InterestedInFromAPE2020AdhocApp extends AdhocExecutionApp {
  override def runOnDateRange(
    args: Args
  )(
    implicit dateRange: DateRange,
    timeZone: TimeZone,
    uniqueID: UniqueID
  ): Execution[Unit] = {
    val outputDir = args("outputDir")
    val socialProofThreshold = args.int("socialProofThreshold", 2)
    val maxClustersPerUserFinalResult = args.int("maxInterestedInClustersPerUser", 200)
    val maxClustersFromProducer = args.int("maxClustersFromProducer", 5)
|
||||||
val inputGraph = args.optional("graphInputDir") match {
|
|
||||||
case Some(inputDir) => TypedPipe.from(UserAndNeighborsFixedPathSource(inputDir))
|
|
||||||
case None =>
|
|
||||||
DAL
|
|
||||||
.readMostRecentSnapshotNoOlderThan(UserUserNormalizedGraphScalaDataset, Days(30))
|
|
||||||
.withRemoteReadPolicy(AllowCrossClusterSameDC)
|
|
||||||
.toTypedPipe
|
|
||||||
}
|
|
||||||
|
|
||||||
val producerEmbeddings = DAL
|
|
||||||
.readMostRecentSnapshotNoOlderThan(
|
|
||||||
AggregatableProducerSimclustersEmbeddingsByLogFavScore2020ScalaDataset,
|
|
||||||
Days(30)).withRemoteReadPolicy(AllowCrossClusterSameDC).toTypedPipe.map {
|
|
||||||
case KeyVal(producer, embeddings) => (producer, embeddings)
|
|
||||||
}
|
|
||||||
|
|
||||||
val result = InterestedInFromAggregatableProducerEmbeddingsBase.run(
|
|
||||||
inputGraph,
|
|
||||||
producerEmbeddings,
|
|
||||||
maxClustersFromProducer,
|
|
||||||
socialProofThreshold,
|
|
||||||
maxClustersPerUserFinalResult,
|
|
||||||
ModelVersion.Model20m145k2020)
|
|
||||||
|
|
||||||
result
|
|
||||||
.writeExecution(AdhocKeyValSources.interestedInSource(outputDir))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Helper functions
|
|
||||||
*/
|
|
||||||
object InterestedInFromAggregatableProducerEmbeddingsBase {
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Helper function to prune the embeddings
|
|
||||||
* @param embeddingsWithScore embeddings
|
|
||||||
* @param maxClusters number of clusters to keep, per userId
|
|
||||||
* @param uniqueId for stats
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
def getPrunedEmbeddings(
|
|
||||||
embeddingsWithScore: TypedPipe[(UserId, Seq[(ClusterId, Float)])],
|
|
||||||
maxClusters: Int
|
|
||||||
)(
|
|
||||||
implicit uniqueId: UniqueID
|
|
||||||
): TypedPipe[(UserId, Array[(ClusterId, Float)])] = {
|
|
||||||
val numProducerMappings = Stat("num_producer_embeddings_total")
|
|
||||||
val numProducersWithLargeClusterMappings = Stat(
|
|
||||||
"num_producers_with_more_clusters_than_threshold")
|
|
||||||
val numProducersWithSmallClusterMappings = Stat(
|
|
||||||
"num_producers_with_clusters_less_than_threshold")
|
|
||||||
val totalClustersCoverageProducerEmbeddings = Stat("num_clusters_total_producer_embeddings")
|
|
||||||
embeddingsWithScore.map {
|
|
||||||
case (producerId, clusterArray) =>
|
|
||||||
numProducerMappings.inc()
|
|
||||||
val clusterSize = clusterArray.size
|
|
||||||
totalClustersCoverageProducerEmbeddings.incBy(clusterSize)
|
|
||||||
val prunedList = if (clusterSize > maxClusters) {
|
|
||||||
numProducersWithLargeClusterMappings.inc()
|
|
||||||
clusterArray
|
|
||||||
.sortBy {
|
|
||||||
case (_, knownForScore) => -knownForScore
|
|
||||||
}.take(maxClusters)
|
|
||||||
} else {
|
|
||||||
numProducersWithSmallClusterMappings.inc()
|
|
||||||
clusterArray
|
|
||||||
}
|
|
||||||
(producerId, prunedList.toArray)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* helper function to remove all scores except follow and logFav
|
|
||||||
* @param interestedInResult interestedIn clusters for a user
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
def getInterestedInDiscardScores(
|
|
||||||
interestedInResult: TypedPipe[(UserId, List[(ClusterId, UserToInterestedInClusterScores)])]
|
|
||||||
): TypedPipe[(UserId, List[(ClusterId, UserToInterestedInClusterScores)])] = {
|
|
||||||
interestedInResult.map {
|
|
||||||
case (srcId, fullClusterList) =>
|
|
||||||
val fullClusterListWithDiscardedScores = fullClusterList.map {
|
|
||||||
case (clusterId, clusterDetails) =>
|
|
||||||
val clusterDetailsWithoutSocial = UserToInterestedInClusterScores(
|
|
||||||
// We are not planning to use the other scores except for logFav and Follow.
|
|
||||||
// Hence, setting others as None for now, we can add them back when needed
|
|
||||||
followScore = clusterDetails.followScore,
|
|
||||||
logFavScore = clusterDetails.logFavScore,
|
|
||||||
logFavScoreClusterNormalizedOnly = clusterDetails.logFavScoreClusterNormalizedOnly
|
|
||||||
)
|
|
||||||
(clusterId, clusterDetailsWithoutSocial)
|
|
||||||
}
|
|
||||||
(srcId, fullClusterListWithDiscardedScores)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Helper function to normalize the embeddings
|
|
||||||
* @param embeddings cluster embeddings
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
def getNormalizedEmbeddings(
|
|
||||||
embeddings: TypedPipe[(UserId, Seq[(ClusterId, Float)])]
|
|
||||||
): TypedPipe[(UserId, Seq[(ClusterId, Float)])] = {
|
|
||||||
embeddings.map {
|
|
||||||
case (userId, clustersWithScores) =>
|
|
||||||
val l2norm = math.sqrt(clustersWithScores.map(_._2).map(score => score * score).sum)
|
|
||||||
(
|
|
||||||
userId,
|
|
||||||
clustersWithScores.map {
|
|
||||||
case (clusterId, score) => (clusterId, (score / l2norm).toFloat)
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
def run(
|
|
||||||
userUserGraph: TypedPipe[UserAndNeighbors],
|
|
||||||
producerEmbeddings: TypedPipe[(SimClustersEmbeddingId, SimClustersEmbedding)],
|
|
||||||
maxClustersFromProducer: Int,
|
|
||||||
socialProofThreshold: Int,
|
|
||||||
maxClustersPerUserFinalResult: Int,
|
|
||||||
modelVersion: ModelVersion
|
|
||||||
)(
|
|
||||||
implicit uniqueId: UniqueID
|
|
||||||
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
|
|
||||||
import InterestedInFromKnownFor._
|
|
||||||
|
|
||||||
val producerEmbeddingsWithScore: TypedPipe[(UserId, Seq[(ClusterId, Float)])] =
|
|
||||||
producerEmbeddings.map {
|
|
||||||
case (
|
|
||||||
SimClustersEmbeddingId(embeddingType, modelVersion, InternalId.UserId(producerId)),
|
|
||||||
simclusterEmbedding) =>
|
|
||||||
(
|
|
||||||
producerId,
|
|
||||||
simclusterEmbedding.embedding.map { simclusterWithScore =>
|
|
||||||
// APE dataset has very high producer scores, hence applying log to smoothen them out before
|
|
||||||
// computing interestedIn
|
|
||||||
(simclusterWithScore.clusterId, math.log(1.0 + simclusterWithScore.score).toFloat)
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
val result = keepOnlyTopClusters(
|
|
||||||
getInterestedInDiscardScores(
|
|
||||||
attachNormalizedScores(
|
|
||||||
userClusterPairsWithoutNormalization(
|
|
||||||
userUserGraph,
|
|
||||||
getPrunedEmbeddings(
|
|
||||||
getNormalizedEmbeddings(producerEmbeddingsWithScore),
|
|
||||||
maxClustersFromProducer),
|
|
||||||
socialProofThreshold,
|
|
||||||
))),
|
|
||||||
maxClustersPerUserFinalResult,
|
|
||||||
ModelVersions.toKnownForModelVersion(modelVersion)
|
|
||||||
)
|
|
||||||
result
|
|
||||||
}
|
|
||||||
}
|
|
Binary file not shown.
@ -1,666 +0,0 @@
|
|||||||
package com.twitter.simclusters_v2.scalding
|
|
||||||
|
|
||||||
import com.twitter.algebird.Semigroup
|
|
||||||
import com.twitter.bijection.Injection
|
|
||||||
import com.twitter.dal.client.dataset.KeyValDALDataset
|
|
||||||
import com.twitter.scalding.TypedPipe
|
|
||||||
import com.twitter.scalding._
|
|
||||||
import com.twitter.scalding_internal.dalv2.DAL
|
|
||||||
import com.twitter.scalding_internal.dalv2.DALWrite._
|
|
||||||
import com.twitter.scalding_internal.job.TwitterExecutionApp
|
|
||||||
import com.twitter.scalding_internal.job.analytics_batch.AnalyticsBatchExecution
|
|
||||||
import com.twitter.scalding_internal.job.analytics_batch.AnalyticsBatchExecutionArgs
|
|
||||||
import com.twitter.scalding_internal.job.analytics_batch.BatchDescription
|
|
||||||
import com.twitter.scalding_internal.job.analytics_batch.BatchFirstTime
|
|
||||||
import com.twitter.scalding_internal.job.analytics_batch.BatchIncrement
|
|
||||||
import com.twitter.scalding_internal.job.analytics_batch.TwitterScheduledExecutionApp
|
|
||||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
|
|
||||||
import com.twitter.simclusters_v2.common.ClusterId
|
|
||||||
import com.twitter.simclusters_v2.common.ModelVersions
|
|
||||||
import com.twitter.simclusters_v2.common.UserId
|
|
||||||
import com.twitter.simclusters_v2.hdfs_sources._
|
|
||||||
import com.twitter.simclusters_v2.scalding.common.Util
|
|
||||||
import com.twitter.simclusters_v2.thriftscala._
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This file implements the job for computing users' interestedIn vector from KnownFor data set.
|
|
||||||
*
|
|
||||||
* It reads the UserUserNormalizedGraphScalaDataset to get user-user follow + fav graph, and then
|
|
||||||
* based on the known-for clusters of each followed/faved user, we calculate how much a user is
|
|
||||||
* interestedIn a cluster.
|
|
||||||
*/
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Production job for computing interestedIn data set for the model version 20M145K2020.
|
|
||||||
*
|
|
||||||
* To deploy the job:
|
|
||||||
*
|
|
||||||
* capesospy-v2 update --build_locally --start_cron interested_in_for_20M_145k_2020 \
|
|
||||||
src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml
|
|
||||||
*/
|
|
||||||
object InterestedInFromKnownFor20M145K2020 extends InterestedInFromKnownForBatchBase {
|
|
||||||
override val firstTime: String = "2020-10-06"
|
|
||||||
override val outputKVDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsInterestedIn]] =
|
|
||||||
SimclustersV2RawInterestedIn20M145K2020ScalaDataset
|
|
||||||
override val outputPath: String = InternalDataPaths.RawInterestedIn2020Path
|
|
||||||
override val knownForModelVersion: String = ModelVersions.Model20M145K2020
|
|
||||||
override val knownForDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]] =
|
|
||||||
SimclustersV2KnownFor20M145K2020ScalaDataset
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* base class for the main logic of computing interestedIn from KnownFor data set.
|
|
||||||
*/
|
|
||||||
trait InterestedInFromKnownForBatchBase extends TwitterScheduledExecutionApp {
|
|
||||||
implicit val tz = DateOps.UTC
|
|
||||||
implicit val parser = DateParser.default
|
|
||||||
|
|
||||||
def firstTime: String
|
|
||||||
val batchIncrement: Duration = Days(7)
|
|
||||||
val lookBackDays: Duration = Days(30)
|
|
||||||
|
|
||||||
def outputKVDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsInterestedIn]]
|
|
||||||
def outputPath: String
|
|
||||||
def knownForModelVersion: String
|
|
||||||
def knownForDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]]
|
|
||||||
|
|
||||||
private lazy val execArgs = AnalyticsBatchExecutionArgs(
|
|
||||||
batchDesc = BatchDescription(this.getClass.getName.replace("$", "")),
|
|
||||||
firstTime = BatchFirstTime(RichDate(firstTime)),
|
|
||||||
lastTime = None,
|
|
||||||
batchIncrement = BatchIncrement(batchIncrement)
|
|
||||||
)
|
|
||||||
|
|
||||||
override def scheduledJob: Execution[Unit] = AnalyticsBatchExecution(execArgs) {
|
|
||||||
implicit dateRange =>
|
|
||||||
Execution.withId { implicit uniqueId =>
|
|
||||||
Execution.withArgs { args =>
|
|
||||||
val normalizedGraph =
|
|
||||||
DAL.readMostRecentSnapshot(UserUserNormalizedGraphScalaDataset).toTypedPipe
|
|
||||||
val knownFor = KnownForSources.fromKeyVal(
|
|
||||||
DAL.readMostRecentSnapshot(knownForDALDataset, dateRange.extend(Days(30))).toTypedPipe,
|
|
||||||
knownForModelVersion
|
|
||||||
)
|
|
||||||
|
|
||||||
val socialProofThreshold = args.int("socialProofThreshold", 2)
|
|
||||||
val maxClustersPerUser = args.int("maxClustersPerUser", 50)
|
|
||||||
|
|
||||||
val result = InterestedInFromKnownFor
|
|
||||||
.run(
|
|
||||||
normalizedGraph,
|
|
||||||
knownFor,
|
|
||||||
socialProofThreshold,
|
|
||||||
maxClustersPerUser,
|
|
||||||
knownForModelVersion
|
|
||||||
)
|
|
||||||
|
|
||||||
val writeKeyValResultExec = result
|
|
||||||
.map { case (userId, clusters) => KeyVal(userId, clusters) }
|
|
||||||
.writeDALVersionedKeyValExecution(
|
|
||||||
outputKVDataset,
|
|
||||||
D.Suffix(outputPath)
|
|
||||||
)
|
|
||||||
|
|
||||||
// read previous data set for validation purpose
|
|
||||||
val previousDataset = if (RichDate(firstTime).timestamp != dateRange.start.timestamp) {
|
|
||||||
DAL
|
|
||||||
.readMostRecentSnapshot(outputKVDataset, dateRange.prepend(lookBackDays)).toTypedPipe
|
|
||||||
.map {
|
|
||||||
case KeyVal(user, interestedIn) =>
|
|
||||||
(user, interestedIn)
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
TypedPipe.empty
|
|
||||||
}
|
|
||||||
|
|
||||||
Util.printCounters(
|
|
||||||
Execution
|
|
||||||
.zip(
|
|
||||||
writeKeyValResultExec,
|
|
||||||
InterestedInFromKnownFor.dataSetStats(result, "NewResult"),
|
|
||||||
InterestedInFromKnownFor.dataSetStats(previousDataset, "OldResult")
|
|
||||||
).unit
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Adhoc job to compute user interestedIn.
|
|
||||||
*
|
|
||||||
* scalding remote run --target src/scala/com/twitter/simclusters_v2/scalding:interested_in_adhoc \
|
|
||||||
* --user recos-platform \
|
|
||||||
* --submitter hadoopnest2.atla.twitter.com \
|
|
||||||
* --main-class com.twitter.simclusters_v2.scalding.InterestedInFromKnownForAdhoc -- \
|
|
||||||
* --date 2019-08-26 --outputDir /user/recos-platform/adhoc/simclusters_interested_in_log_fav
|
|
||||||
*/
|
|
||||||
object InterestedInFromKnownForAdhoc extends TwitterExecutionApp {
|
|
||||||
def job: Execution[Unit] =
|
|
||||||
Execution.getConfigMode.flatMap {
|
|
||||||
case (config, mode) =>
|
|
||||||
Execution.withId { implicit uniqueId =>
|
|
||||||
val args = config.getArgs
|
|
||||||
val normalizedGraph = TypedPipe.from(
|
|
||||||
UserAndNeighborsFixedPathSource(args("graphInputDir"))
|
|
||||||
)
|
|
||||||
val socialProofThreshold = args.int("socialProofThreshold", 2)
|
|
||||||
val maxClustersPerUser = args.int("maxClustersPerUser", 20)
|
|
||||||
val knownForModelVersion = args("knownForModelVersion")
|
|
||||||
val knownFor = KnownForSources.readKnownFor(args("knownForInputDir"))
|
|
||||||
|
|
||||||
val outputSink = AdhocKeyValSources.interestedInSource(args("outputDir"))
|
|
||||||
Util.printCounters(
|
|
||||||
InterestedInFromKnownFor
|
|
||||||
.run(
|
|
||||||
normalizedGraph,
|
|
||||||
knownFor,
|
|
||||||
socialProofThreshold,
|
|
||||||
maxClustersPerUser,
|
|
||||||
knownForModelVersion
|
|
||||||
).writeExecution(outputSink)
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Adhoc job to check the output of an adhoc interestedInSource.
|
|
||||||
*/
|
|
||||||
object DumpInterestedInAdhoc extends TwitterExecutionApp {
|
|
||||||
def job: Execution[Unit] =
|
|
||||||
Execution.getConfigMode.flatMap {
|
|
||||||
case (config, mode) =>
|
|
||||||
Execution.withId { implicit uniqueId =>
|
|
||||||
val args = config.getArgs
|
|
||||||
val users = args.list("users").map(_.toLong).toSet
|
|
||||||
val input = TypedPipe.from(AdhocKeyValSources.interestedInSource(args("inputDir")))
|
|
||||||
input.filter { case (userId, rec) => users.contains(userId) }.toIterableExecution.map {
|
|
||||||
s => println(s.map(Util.prettyJsonMapper.writeValueAsString).mkString("\n"))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Helper functions
|
|
||||||
*/
|
|
||||||
object InterestedInFromKnownFor {
|
|
||||||
private def ifNanMake0(x: Double): Double = if (x.isNaN) 0.0 else x
|
|
||||||
|
|
||||||
case class SrcClusterIntermediateInfo(
|
|
||||||
followScore: Double,
|
|
||||||
followScoreProducerNormalized: Double,
|
|
||||||
favScore: Double,
|
|
||||||
favScoreProducerNormalized: Double,
|
|
||||||
logFavScore: Double,
|
|
||||||
logFavScoreProducerNormalized: Double,
|
|
||||||
followSocialProof: List[Long],
|
|
||||||
favSocialProof: List[Long]) {
|
|
||||||
// overriding for the sake of unit tests
|
|
||||||
override def equals(obj: scala.Any): Boolean = {
|
|
||||||
obj match {
|
|
||||||
case that: SrcClusterIntermediateInfo =>
|
|
||||||
math.abs(followScore - that.followScore) < 1e-5 &&
|
|
||||||
math.abs(followScoreProducerNormalized - that.followScoreProducerNormalized) < 1e-5 &&
|
|
||||||
math.abs(favScore - that.favScore) < 1e-5 &&
|
|
||||||
math.abs(favScoreProducerNormalized - that.favScoreProducerNormalized) < 1e-5 &&
|
|
||||||
math.abs(logFavScore - that.logFavScore) < 1e-5 &&
|
|
||||||
math.abs(logFavScoreProducerNormalized - that.logFavScoreProducerNormalized) < 1e-5 &&
|
|
||||||
followSocialProof.toSet == that.followSocialProof.toSet &&
|
|
||||||
favSocialProof.toSet == that.favSocialProof.toSet
|
|
||||||
case _ => false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
implicit object SrcClusterIntermediateInfoSemigroup
|
|
||||||
extends Semigroup[SrcClusterIntermediateInfo] {
|
|
||||||
override def plus(
|
|
||||||
left: SrcClusterIntermediateInfo,
|
|
||||||
right: SrcClusterIntermediateInfo
|
|
||||||
): SrcClusterIntermediateInfo = {
|
|
||||||
SrcClusterIntermediateInfo(
|
|
||||||
followScore = left.followScore + right.followScore,
|
|
||||||
followScoreProducerNormalized =
|
|
||||||
left.followScoreProducerNormalized + right.followScoreProducerNormalized,
|
|
||||||
favScore = left.favScore + right.favScore,
|
|
||||||
favScoreProducerNormalized =
|
|
||||||
left.favScoreProducerNormalized + right.favScoreProducerNormalized,
|
|
||||||
logFavScore = left.logFavScore + right.logFavScore,
|
|
||||||
logFavScoreProducerNormalized =
|
|
||||||
left.logFavScoreProducerNormalized + right.logFavScoreProducerNormalized,
|
|
||||||
followSocialProof =
|
|
||||||
Semigroup.plus(left.followSocialProof, right.followSocialProof).distinct,
|
|
||||||
favSocialProof = Semigroup.plus(left.favSocialProof, right.favSocialProof).distinct
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @param adjacencyLists User-User follow/fav graph
|
|
||||||
* @param knownFor KnownFor data set. Each user can be known for several clusters with certain
|
|
||||||
* knownFor weights.
|
|
||||||
* @param socialProofThreshold A user will only be interested in a cluster if they follow/fav at
|
|
||||||
* least certain number of users known for this cluster.
|
|
||||||
* @param uniqueId required for these Stat
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
def userClusterPairsWithoutNormalization(
|
|
||||||
adjacencyLists: TypedPipe[UserAndNeighbors],
|
|
||||||
knownFor: TypedPipe[(Long, Array[(Int, Float)])],
|
|
||||||
socialProofThreshold: Int
|
|
||||||
)(
|
|
||||||
implicit uniqueId: UniqueID
|
|
||||||
): TypedPipe[((Long, Int), SrcClusterIntermediateInfo)] = {
|
|
||||||
val edgesToUsersWithKnownFor = Stat("num_edges_to_users_with_known_for")
|
|
||||||
val srcDestClusterTriples = Stat("num_src_dest_cluster_triples")
|
|
||||||
val srcClusterPairsBeforeSocialProofThresholding =
|
|
||||||
Stat("num_src_cluster_pairs_before_social_proof_thresholding")
|
|
||||||
val srcClusterPairsAfterSocialProofThresholding =
|
|
||||||
Stat("num_src_cluster_pairs_after_social_proof_thresholding")
|
|
||||||
|
|
||||||
val edges = adjacencyLists.flatMap {
|
|
||||||
case UserAndNeighbors(srcId, neighborsWithWeights) =>
|
|
||||||
neighborsWithWeights.map { neighborWithWeights =>
|
|
||||||
(
|
|
||||||
neighborWithWeights.neighborId,
|
|
||||||
neighborWithWeights.copy(neighborId = srcId)
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
implicit val l2b: Long => Array[Byte] = Injection.long2BigEndian
|
|
||||||
|
|
||||||
edges
|
|
||||||
.sketch(4000)
|
|
||||||
.join(knownFor)
|
|
||||||
.flatMap {
|
|
||||||
case (destId, (srcWithWeights, clusterArray)) =>
|
|
||||||
edgesToUsersWithKnownFor.inc()
|
|
||||||
clusterArray.toList.map {
|
|
||||||
case (clusterId, knownForScoreF) =>
|
|
||||||
val knownForScore = math.max(0.0, knownForScoreF.toDouble)
|
|
||||||
|
|
||||||
srcDestClusterTriples.inc()
|
|
||||||
val followScore =
|
|
||||||
if (srcWithWeights.isFollowed.contains(true)) knownForScore else 0.0
|
|
||||||
val followScoreProducerNormalizedOnly =
|
|
||||||
srcWithWeights.followScoreNormalizedByNeighborFollowersL2.getOrElse(
|
|
||||||
0.0) * knownForScore
|
|
||||||
val favScore =
|
|
||||||
srcWithWeights.favScoreHalfLife100Days.getOrElse(0.0) * knownForScore
|
|
||||||
|
|
||||||
val favScoreProducerNormalizedOnly =
|
|
||||||
srcWithWeights.favScoreHalfLife100DaysNormalizedByNeighborFaversL2.getOrElse(
|
|
||||||
0.0) * knownForScore
|
|
||||||
|
|
||||||
val logFavScore = srcWithWeights.logFavScore.getOrElse(0.0) * knownForScore
|
|
||||||
|
|
||||||
val logFavScoreProducerNormalizedOnly = srcWithWeights.logFavScoreL2Normalized
|
|
||||||
.getOrElse(0.0) * knownForScore
|
|
||||||
|
|
||||||
val followSocialProof = if (srcWithWeights.isFollowed.contains(true)) {
|
|
||||||
List(destId)
|
|
||||||
} else Nil
|
|
||||||
val favSocialProof = if (srcWithWeights.favScoreHalfLife100Days.exists(_ > 0)) {
|
|
||||||
List(destId)
|
|
||||||
} else Nil
|
|
||||||
|
|
||||||
(
|
|
||||||
(srcWithWeights.neighborId, clusterId),
|
|
||||||
SrcClusterIntermediateInfo(
|
|
||||||
followScore,
|
|
||||||
followScoreProducerNormalizedOnly,
|
|
||||||
favScore,
|
|
||||||
favScoreProducerNormalizedOnly,
|
|
||||||
logFavScore,
|
|
||||||
logFavScoreProducerNormalizedOnly,
|
|
||||||
followSocialProof,
|
|
||||||
favSocialProof
|
|
||||||
)
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
.sumByKey
|
|
||||||
.withReducers(10000)
|
|
||||||
.filter {
|
|
||||||
case ((_, _), SrcClusterIntermediateInfo(_, _, _, _, _, _, followProof, favProof)) =>
|
|
||||||
srcClusterPairsBeforeSocialProofThresholding.inc()
|
|
||||||
val distinctSocialProof = (followProof ++ favProof).toSet
|
|
||||||
val result = distinctSocialProof.size >= socialProofThreshold
|
|
||||||
if (result) {
|
|
||||||
srcClusterPairsAfterSocialProofThresholding.inc()
|
|
||||||
}
|
|
||||||
result
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Add the cluster-level l2 norm scores, and use them to normalize follow/fav scores.
|
|
||||||
*/
|
|
||||||
def attachNormalizedScores(
|
|
||||||
intermediate: TypedPipe[((Long, Int), SrcClusterIntermediateInfo)]
|
|
||||||
)(
|
|
||||||
implicit uniqueId: UniqueID
|
|
||||||
): TypedPipe[(Long, List[(Int, UserToInterestedInClusterScores)])] = {
|
|
||||||
|
|
||||||
def square(x: Double): Double = x * x
|
|
||||||
|
|
||||||
val clusterCountsAndNorms =
|
|
||||||
intermediate
|
|
||||||
.map {
|
|
||||||
case (
|
|
||||||
(_, clusterId),
|
|
||||||
SrcClusterIntermediateInfo(
|
|
||||||
followScore,
|
|
||||||
followScoreProducerNormalizedOnly,
|
|
||||||
favScore,
|
|
||||||
favScoreProducerNormalizedOnly,
|
|
||||||
logFavScore,
|
|
||||||
logFavScoreProducerNormalizedOnly,
|
|
||||||
_,
|
|
||||||
_
|
|
||||||
)
|
|
||||||
) =>
|
|
||||||
(
|
|
||||||
clusterId,
|
|
||||||
(
|
|
||||||
1,
|
|
||||||
square(followScore),
|
|
||||||
square(followScoreProducerNormalizedOnly),
|
|
||||||
square(favScore),
|
|
||||||
square(favScoreProducerNormalizedOnly),
|
|
||||||
square(logFavScore),
|
|
||||||
square(logFavScoreProducerNormalizedOnly)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
}
|
|
||||||
.sumByKey
|
|
||||||
// .withReducers(100)
|
|
||||||
.map {
|
|
||||||
case (
|
|
||||||
clusterId,
|
|
||||||
(
|
|
||||||
cnt,
|
|
||||||
squareFollowScore,
|
|
||||||
squareFollowScoreProducerNormalizedOnly,
|
|
||||||
squareFavScore,
|
|
||||||
squareFavScoreProducerNormalizedOnly,
|
|
||||||
squareLogFavScore,
|
|
||||||
squareLogFavScoreProducerNormalizedOnly
|
|
||||||
)) =>
|
|
||||||
(
|
|
||||||
clusterId,
|
|
||||||
(
|
|
||||||
cnt,
|
|
||||||
math.sqrt(squareFollowScore),
|
|
||||||
math.sqrt(squareFollowScoreProducerNormalizedOnly),
|
|
||||||
math.sqrt(squareFavScore),
|
|
||||||
math.sqrt(squareFavScoreProducerNormalizedOnly),
|
|
||||||
math.sqrt(squareLogFavScore),
|
|
||||||
math.sqrt(squareLogFavScoreProducerNormalizedOnly)
|
|
||||||
))
|
|
||||||
}
|
|
||||||
|
|
||||||
implicit val i2b: Int => Array[Byte] = Injection.int2BigEndian
|
|
||||||
|
|
||||||
intermediate
|
|
||||||
.map {
|
|
||||||
case ((srcId, clusterId), clusterScoresTuple) =>
|
|
||||||
(clusterId, (srcId, clusterScoresTuple))
|
|
||||||
}
|
|
||||||
.sketch(reducers = 900)
|
|
||||||
.join(clusterCountsAndNorms)
|
|
||||||
.map {
|
|
||||||
case (
|
|
||||||
clusterId,
|
|
||||||
(
|
|
||||||
(
|
|
||||||
srcId,
|
|
||||||
SrcClusterIntermediateInfo(
|
|
||||||
followScore,
|
|
||||||
followScoreProducerNormalizedOnly,
|
|
||||||
favScore,
|
|
||||||
favScoreProducerNormalizedOnly,
|
|
||||||
logFavScore,
|
|
||||||
logFavScoreProducerNormalizedOnly, // not used for now
|
|
||||||
followProof,
|
|
||||||
favProof
|
|
||||||
)
|
|
||||||
),
|
|
||||||
(
|
|
||||||
cnt,
|
|
||||||
followNorm,
|
|
||||||
followProducerNormalizedNorm,
|
|
||||||
favNorm,
|
|
||||||
favProducerNormalizedNorm,
|
|
||||||
logFavNorm,
|
|
||||||
logFavProducerNormalizedNorm // not used for now
|
|
||||||
)
|
|
||||||
)
|
|
||||||
) =>
|
|
||||||
(
|
|
||||||
srcId,
|
|
||||||
List(
|
|
||||||
(
|
|
||||||
clusterId,
|
|
||||||
UserToInterestedInClusterScores(
|
|
||||||
followScore = Some(ifNanMake0(followScore)),
|
|
||||||
followScoreClusterNormalizedOnly = Some(ifNanMake0(followScore / followNorm)),
|
|
||||||
followScoreProducerNormalizedOnly =
|
|
||||||
Some(ifNanMake0(followScoreProducerNormalizedOnly)),
|
|
||||||
followScoreClusterAndProducerNormalized = Some(
|
|
||||||
ifNanMake0(followScoreProducerNormalizedOnly / followProducerNormalizedNorm)),
|
|
||||||
favScore = Some(ifNanMake0(favScore)),
|
|
||||||
favScoreClusterNormalizedOnly = Some(ifNanMake0(favScore / favNorm)),
|
|
||||||
favScoreProducerNormalizedOnly = Some(ifNanMake0(favScoreProducerNormalizedOnly)),
|
|
||||||
favScoreClusterAndProducerNormalized =
|
|
||||||
Some(ifNanMake0(favScoreProducerNormalizedOnly / favProducerNormalizedNorm)),
|
|
||||||
usersBeingFollowed = Some(followProof),
|
|
||||||
usersThatWereFaved = Some(favProof),
|
|
||||||
numUsersInterestedInThisClusterUpperBound = Some(cnt),
|
|
||||||
logFavScore = Some(ifNanMake0(logFavScore)),
|
|
||||||
logFavScoreClusterNormalizedOnly = Some(ifNanMake0(logFavScore / logFavNorm))
|
|
||||||
))
|
|
||||||
)
|
|
||||||
)
|
|
||||||
}
|
|
||||||
.sumByKey
|
|
||||||
// .withReducers(1000)
|
|
||||||
.toTypedPipe
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* aggregate cluster scores for each user, to be used instead of attachNormalizedScores
|
|
||||||
* when we donot want to compute cluster-level l2 norm scores
|
|
||||||
*/
|
|
||||||
def groupClusterScores(
|
|
||||||
intermediate: TypedPipe[((Long, Int), SrcClusterIntermediateInfo)]
|
|
||||||
)(
|
|
||||||
implicit uniqueId: UniqueID
|
|
||||||
): TypedPipe[(Long, List[(Int, UserToInterestedInClusterScores)])] = {
|
|
||||||
|
|
||||||
intermediate
|
|
||||||
.map {
|
|
||||||
case (
|
|
||||||
(srcId, clusterId),
|
|
||||||
SrcClusterIntermediateInfo(
|
|
||||||
followScore,
|
|
||||||
followScoreProducerNormalizedOnly,
|
|
||||||
favScore,
|
|
||||||
favScoreProducerNormalizedOnly,
|
|
||||||
logFavScore,
|
|
||||||
logFavScoreProducerNormalizedOnly,
|
|
||||||
followProof,
|
|
||||||
favProof
|
|
||||||
)
|
|
||||||
) =>
|
|
||||||
(
|
|
||||||
srcId,
|
|
||||||
List(
|
|
||||||
(
|
|
||||||
clusterId,
|
|
||||||
UserToInterestedInClusterScores(
|
|
||||||
followScore = Some(ifNanMake0(followScore)),
|
|
||||||
followScoreProducerNormalizedOnly =
|
|
||||||
Some(ifNanMake0(followScoreProducerNormalizedOnly)),
|
|
||||||
favScore = Some(ifNanMake0(favScore)),
|
|
||||||
favScoreProducerNormalizedOnly = Some(ifNanMake0(favScoreProducerNormalizedOnly)),
|
|
||||||
usersBeingFollowed = Some(followProof),
|
|
||||||
usersThatWereFaved = Some(favProof),
|
|
||||||
logFavScore = Some(ifNanMake0(logFavScore)),
|
|
||||||
))
|
|
||||||
)
|
|
||||||
)
|
|
||||||
}
|
|
||||||
.sumByKey
|
|
||||||
.withReducers(1000)
|
|
||||||
.toTypedPipe
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* For each user, only keep up to a certain number of clusters.
|
|
||||||
* @param allInterests user with a list of interestedIn clusters.
|
|
||||||
* @param maxClustersPerUser number of clusters to keep for each user
|
|
||||||
* @param knownForModelVersion known for model version
|
|
||||||
* @param uniqueId required for these Stat
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
def keepOnlyTopClusters(
|
|
||||||
allInterests: TypedPipe[(Long, List[(Int, UserToInterestedInClusterScores)])],
|
|
||||||
maxClustersPerUser: Int,
|
|
||||||
knownForModelVersion: String
|
|
||||||
)(
|
|
||||||
implicit uniqueId: UniqueID
|
|
||||||
): TypedPipe[(Long, ClustersUserIsInterestedIn)] = {
|
|
||||||
val userClusterPairsBeforeUserTruncation =
|
|
||||||
Stat("num_user_cluster_pairs_before_user_truncation")
|
|
||||||
val userClusterPairsAfterUserTruncation =
|
|
||||||
Stat("num_user_cluster_pairs_after_user_truncation")
|
|
||||||
val usersWithALotOfClusters =
|
|
||||||
Stat(s"num_users_with_more_than_${maxClustersPerUser}_clusters")
|
|
||||||
|
|
||||||
allInterests
|
|
||||||
.map {
|
|
||||||
case (srcId, fullClusterList) =>
|
|
||||||
userClusterPairsBeforeUserTruncation.incBy(fullClusterList.size)
|
|
||||||
val truncatedClusters = if (fullClusterList.size > maxClustersPerUser) {
|
|
||||||
usersWithALotOfClusters.inc()
|
|
||||||
fullClusterList
|
|
||||||
.sortBy {
|
|
||||||
case (_, clusterScores) =>
|
|
||||||
(
|
|
||||||
-clusterScores.favScore.getOrElse(0.0),
|
|
||||||
-clusterScores.logFavScore.getOrElse(0.0),
|
|
||||||
-clusterScores.followScore.getOrElse(0.0),
|
|
||||||
-clusterScores.logFavScoreClusterNormalizedOnly.getOrElse(0.0),
|
|
||||||
-clusterScores.followScoreProducerNormalizedOnly.getOrElse(0.0)
|
|
||||||
)
|
|
||||||
}
|
|
||||||
.take(maxClustersPerUser)
|
|
||||||
} else {
|
|
||||||
fullClusterList
|
|
||||||
}
|
|
||||||
userClusterPairsAfterUserTruncation.incBy(truncatedClusters.size)
|
|
||||||
(srcId, ClustersUserIsInterestedIn(knownForModelVersion, truncatedClusters.toMap))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
def run(
|
|
||||||
adjacencyLists: TypedPipe[UserAndNeighbors],
|
|
||||||
knownFor: TypedPipe[(UserId, Array[(ClusterId, Float)])],
|
|
||||||
socialProofThreshold: Int,
|
|
||||||
maxClustersPerUser: Int,
|
|
||||||
knownForModelVersion: String
|
|
||||||
)(
|
|
||||||
implicit uniqueId: UniqueID
|
|
||||||
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
|
|
||||||
keepOnlyTopClusters(
|
|
||||||
attachNormalizedScores(
|
|
||||||
userClusterPairsWithoutNormalization(
|
|
||||||
adjacencyLists,
|
|
||||||
knownFor,
|
|
||||||
socialProofThreshold
|
|
||||||
)
|
|
||||||
),
|
|
||||||
maxClustersPerUser,
|
|
||||||
knownForModelVersion
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* run the interestedIn job, cluster normalized scores are not attached to user's clusters.
|
|
||||||
*/
|
|
||||||
def runWithoutClusterNormalizedScores(
|
|
||||||
adjacencyLists: TypedPipe[UserAndNeighbors],
|
|
||||||
knownFor: TypedPipe[(UserId, Array[(ClusterId, Float)])],
|
|
||||||
socialProofThreshold: Int,
|
|
||||||
maxClustersPerUser: Int,
|
|
||||||
knownForModelVersion: String
|
|
||||||
)(
|
|
||||||
implicit uniqueId: UniqueID
|
|
||||||
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
|
|
||||||
keepOnlyTopClusters(
|
|
||||||
groupClusterScores(
|
|
||||||
userClusterPairsWithoutNormalization(
|
|
||||||
adjacencyLists,
|
|
||||||
knownFor,
|
|
||||||
socialProofThreshold
|
|
||||||
)
|
|
||||||
),
|
|
||||||
maxClustersPerUser,
|
|
||||||
knownForModelVersion
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* print out some basic stats of the data set to make sure things are not broken
|
|
||||||
*/
|
|
||||||
def dataSetStats(
|
|
||||||
interestedInData: TypedPipe[(UserId, ClustersUserIsInterestedIn)],
|
|
||||||
dataSetName: String = ""
|
|
||||||
): Execution[Unit] = {
|
|
||||||
|
|
||||||
Execution
|
|
||||||
.zip(
|
|
||||||
Util.printSummaryOfNumericColumn(
|
|
||||||
interestedInData.map {
|
|
||||||
case (user, interestedIn) =>
|
|
||||||
interestedIn.clusterIdToScores.size
|
|
||||||
},
|
|
||||||
Some(s"$dataSetName UserInterestedIn Size")
|
|
||||||
),
|
|
||||||
Util.printSummaryOfNumericColumn(
|
|
||||||
interestedInData.flatMap {
|
|
||||||
case (user, interestedIn) =>
|
|
||||||
interestedIn.clusterIdToScores.map {
|
|
||||||
case (_, scores) =>
|
|
||||||
scores.favScore.getOrElse(0.0)
|
|
||||||
}
|
|
||||||
},
|
|
||||||
Some(s"$dataSetName UserInterestedIn favScore")
|
|
||||||
),
|
|
||||||
Util.printSummaryOfNumericColumn(
|
|
||||||
interestedInData.flatMap {
|
|
||||||
case (user, interestedIn) =>
|
|
||||||
interestedIn.clusterIdToScores.map {
|
|
||||||
case (_, scores) =>
|
|
||||||
scores.favScoreClusterNormalizedOnly.getOrElse(0.0)
|
|
||||||
}
|
|
||||||
},
|
|
||||||
Some(s"$dataSetName UserInterestedIn favScoreClusterNormalizedOnly")
|
|
||||||
),
|
|
||||||
Util.printSummaryOfNumericColumn(
|
|
||||||
interestedInData.flatMap {
|
|
||||||
case (user, interestedIn) =>
|
|
||||||
interestedIn.clusterIdToScores.map {
|
|
||||||
case (_, scores) =>
|
|
||||||
scores.logFavScoreClusterNormalizedOnly.getOrElse(0.0)
|
|
||||||
}
|
|
||||||
},
|
|
||||||
Some(s"$dataSetName UserInterestedIn logFavScoreClusterNormalizedOnly")
|
|
||||||
)
|
|
||||||
).unit
|
|
||||||
}
|
|
||||||
}
|
|
Binary file not shown.
@ -1,354 +0,0 @@
|
|||||||
package com.twitter.simclusters_v2.scalding
|
|
||||||
|
|
||||||
import com.twitter.algebird.Semigroup
|
|
||||||
import com.twitter.bijection.Injection
|
|
||||||
import com.twitter.dal.client.dataset.KeyValDALDataset
|
|
||||||
import com.twitter.scalding._
|
|
||||||
import com.twitter.scalding_internal.dalv2.DAL
|
|
||||||
import com.twitter.scalding_internal.dalv2.DALWrite.{D, WriteExtension}
|
|
||||||
import com.twitter.scalding_internal.job.TwitterExecutionApp
|
|
||||||
import com.twitter.scalding_internal.job.analytics_batch.{
|
|
||||||
AnalyticsBatchExecution,
|
|
||||||
AnalyticsBatchExecutionArgs,
|
|
||||||
BatchDescription,
|
|
||||||
BatchFirstTime,
|
|
||||||
BatchIncrement,
|
|
||||||
TwitterScheduledExecutionApp
|
|
||||||
}
|
|
||||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
|
|
||||||
import com.twitter.simclusters_v2.common.{ClusterId, ModelVersions, UserId}
|
|
||||||
import com.twitter.simclusters_v2.hdfs_sources.{
|
|
||||||
AdhocKeyValSources,
|
|
||||||
InternalDataPaths,
|
|
||||||
SimclustersV2KnownFor20M145K2020ScalaDataset,
|
|
||||||
SimclustersV2RawInterestedInLite20M145K2020ScalaDataset,
|
|
||||||
SimclustersV2RawInterestedIn20M145KUpdatedScalaDataset,
|
|
||||||
UserAndNeighborsFixedPathSource,
|
|
||||||
UserUserGraphScalaDataset
|
|
||||||
}
|
|
||||||
import com.twitter.simclusters_v2.scalding.common.Util
|
|
||||||
import com.twitter.simclusters_v2.thriftscala.{
|
|
||||||
ClustersUserIsInterestedIn,
|
|
||||||
ClustersUserIsKnownFor,
|
|
||||||
UserAndNeighbors,
|
|
||||||
UserToInterestedInClusterScores
|
|
||||||
}
|
|
||||||
import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp
|
|
||||||
import java.util.TimeZone
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This file implements the job for computing users' interestedIn vector from KnownFor data set.
|
|
||||||
*
|
|
||||||
* It reads the UserUserGraphScalaDataset to get user-user follow + fav graph, and then
|
|
||||||
* based on the known-for clusters of each followed/faved user, we calculate how much a user is
|
|
||||||
* interestedIn a cluster.
|
|
||||||
*
|
|
||||||
* The main differences of the InterestedInFromKnownForLite compared to InterestedInFromKnownFor are
|
|
||||||
* the following:
|
|
||||||
* - We read the UserUserGraph dataset that doesnot contain the producer normalized scores
|
|
||||||
* - We donot compute the cluster normalized scores for the clusters per user
|
|
||||||
* - For social proof thresholding, we donot keep track of the entire list of follow and
|
|
||||||
* fav social proofs but rather make use of numFollowSocial and numFavSocial (this introduces
|
|
||||||
* some noise if follow and fav social proof contain the same users)
|
|
||||||
* - Store 200 clusters per user compared to 50 in IIKF
|
|
||||||
* - Runs more frequently compared to weekly in IIKF
|
|
||||||
*/
|
|
||||||
/**
|
|
||||||
* Production job for computing interestedIn data set for the model version 20M145K2020.
|
|
||||||
*
|
|
||||||
* To deploy the job:
|
|
||||||
*
|
|
||||||
* capesospy-v2 update --build_locally --start_cron interested_in_lite_for_20M_145k_2020 \
|
|
||||||
src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc.yaml
|
|
||||||
*/
|
|
||||||
object InterestedInFromKnownForLite20M145K2020 extends InterestedInFromKnownForLite {
|
|
||||||
override val firstTime: String = "2021-04-24"
|
|
||||||
override val outputKVDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsInterestedIn]] =
|
|
||||||
SimclustersV2RawInterestedInLite20M145K2020ScalaDataset
|
|
||||||
override val outputPath: String = InternalDataPaths.RawInterestedInLite2020Path
|
|
||||||
override val knownForModelVersion: String = ModelVersions.Model20M145K2020
|
|
||||||
override val knownForDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]] =
|
|
||||||
SimclustersV2KnownFor20M145K2020ScalaDataset
|
|
||||||
}
|
|
||||||
trait InterestedInFromKnownForLite extends TwitterScheduledExecutionApp {
|
|
||||||
implicit val tz = DateOps.UTC
|
|
||||||
implicit val parser = DateParser.default
|
|
||||||
|
|
||||||
def firstTime: String
|
|
||||||
val batchIncrement: Duration = Days(2)
|
|
||||||
val lookBackDays: Duration = Days(30)
|
|
||||||
|
|
||||||
def outputKVDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsInterestedIn]]
|
|
||||||
def outputPath: String
|
|
||||||
def knownForModelVersion: String
|
|
||||||
def knownForDALDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsKnownFor]]
|
|
||||||
|
|
||||||
private lazy val execArgs = AnalyticsBatchExecutionArgs(
|
|
||||||
batchDesc = BatchDescription(this.getClass.getName.replace("$", "")),
|
|
||||||
firstTime = BatchFirstTime(RichDate(firstTime)),
|
|
||||||
lastTime = None,
|
|
||||||
batchIncrement = BatchIncrement(batchIncrement)
|
|
||||||
)
|
|
||||||
|
|
||||||
override def scheduledJob: Execution[Unit] = AnalyticsBatchExecution(execArgs) {
|
|
||||||
implicit dateRange =>
|
|
||||||
Execution.withId { implicit uniqueId =>
|
|
||||||
Execution.withArgs { args =>
|
|
||||||
val userUserGraph =
|
|
||||||
DAL.readMostRecentSnapshot(UserUserGraphScalaDataset).toTypedPipe
|
|
||||||
val knownFor = KnownForSources.fromKeyVal(
|
|
||||||
DAL.readMostRecentSnapshot(knownForDALDataset, dateRange.extend(Days(30))).toTypedPipe,
|
|
||||||
knownForModelVersion
|
|
||||||
)
|
|
||||||
|
|
||||||
val socialProofThreshold = args.int("socialProofThreshold", 2)
|
|
||||||
val maxClustersPerUser = args.int("maxClustersPerUser", 200)
|
|
||||||
|
|
||||||
val result = InterestedInFromKnownForLite
|
|
||||||
.run(
|
|
||||||
userUserGraph,
|
|
||||||
knownFor,
|
|
||||||
socialProofThreshold,
|
|
||||||
maxClustersPerUser,
|
|
||||||
knownForModelVersion
|
|
||||||
)
|
|
||||||
|
|
||||||
val writeKeyValResultExec = result
|
|
||||||
.map {
|
|
||||||
case (userId, clusters) => KeyVal(userId, clusters)
|
|
||||||
}.writeDALVersionedKeyValExecution(
|
|
||||||
outputKVDataset,
|
|
||||||
D.Suffix(outputPath)
|
|
||||||
)
|
|
||||||
Util.printCounters(writeKeyValResultExec)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Adhoc job to compute user interestedIn.
|
|
||||||
*
|
|
||||||
* scalding remote run \
|
|
||||||
* --target src/scala/com/twitter/simclusters_v2/scalding:interested_in_lite_20m_145k_2020-adhoc \
|
|
||||||
* --main-class com.twitter.simclusters_v2.scalding.InterestedInFromKnownForLite20M145K2020Adhoc \
|
|
||||||
* --user cassowary --cluster bluebird-qus1 \
|
|
||||||
* --keytab /var/lib/tss/keys/fluffy/keytabs/client/cassowary.keytab \
|
|
||||||
* --principal service_acoount@TWITTER.BIZ \
|
|
||||||
* -- \
|
|
||||||
* --outputDir /gcs/user/cassowary/adhoc/interested_in_from_knownfor_lite/ \
|
|
||||||
* --date 2020-08-25
|
|
||||||
*/
|
|
||||||
object InterestedInFromKnownForLite20M145K2020Adhoc extends AdhocExecutionApp {
|
|
||||||
override def runOnDateRange(
|
|
||||||
args: Args
|
|
||||||
)(
|
|
||||||
implicit dateRange: DateRange,
|
|
||||||
timeZone: TimeZone,
|
|
||||||
uniqueID: UniqueID
|
|
||||||
): Execution[Unit] = {
|
|
||||||
val userUserGraph = DAL.readMostRecentSnapshot(UserUserGraphScalaDataset).toTypedPipe
|
|
||||||
val socialProofThreshold = args.int("socialProofThreshold", 2)
|
|
||||||
val maxClustersPerUser = args.int("maxClustersPerUser", 200)
|
|
||||||
val knownForModelVersion = ModelVersions.Model20M145K2020
|
|
||||||
val knownFor = KnownForSources.fromKeyVal(
|
|
||||||
DAL
|
|
||||||
.readMostRecentSnapshotNoOlderThan(
|
|
||||||
SimclustersV2KnownFor20M145K2020ScalaDataset,
|
|
||||||
Days(30)).toTypedPipe,
|
|
||||||
knownForModelVersion
|
|
||||||
)
|
|
||||||
|
|
||||||
val outputSink = AdhocKeyValSources.interestedInSource(args("outputDir"))
|
|
||||||
Util.printCounters(
|
|
||||||
InterestedInFromKnownForLite
|
|
||||||
.run(
|
|
||||||
userUserGraph,
|
|
||||||
knownFor,
|
|
||||||
socialProofThreshold,
|
|
||||||
maxClustersPerUser,
|
|
||||||
knownForModelVersion
|
|
||||||
).writeExecution(outputSink)
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
object InterestedInFromKnownForLite {
|
|
||||||
private def ifNanMake0(x: Double): Double = if (x.isNaN) 0.0 else x
|
|
||||||
|
|
||||||
case class SrcClusterIntermediateInfo(
|
|
||||||
followScore: Double,
|
|
||||||
favScore: Double,
|
|
||||||
logFavScore: Double,
|
|
||||||
numFollowed: Int,
|
|
||||||
numFaved: Int) {
|
|
||||||
|
|
||||||
// helper function used for test cases
|
|
||||||
override def equals(obj: scala.Any): Boolean = {
|
|
||||||
obj match {
|
|
||||||
case that: SrcClusterIntermediateInfo =>
|
|
||||||
math.abs(followScore - that.followScore) < 1e-5 &&
|
|
||||||
math.abs(favScore - that.favScore) < 1e-5 &&
|
|
||||||
math.abs(logFavScore - that.logFavScore) < 1e-5 &&
|
|
||||||
numFollowed == that.numFollowed &&
|
|
||||||
numFaved == that.numFaved
|
|
||||||
case _ => false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
implicit object SrcClusterIntermediateInfoSemigroup
|
|
||||||
extends Semigroup[SrcClusterIntermediateInfo] {
|
|
||||||
override def plus(
|
|
||||||
left: SrcClusterIntermediateInfo,
|
|
||||||
right: SrcClusterIntermediateInfo
|
|
||||||
): SrcClusterIntermediateInfo = {
|
|
||||||
SrcClusterIntermediateInfo(
|
|
||||||
followScore = left.followScore + right.followScore,
|
|
||||||
favScore = left.favScore + right.favScore,
|
|
||||||
logFavScore = left.logFavScore + right.logFavScore,
|
|
||||||
numFollowed = left.numFollowed + right.numFollowed,
|
|
||||||
numFaved = left.numFaved + right.numFaved
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
def run(
|
|
||||||
adjacencyLists: TypedPipe[UserAndNeighbors],
|
|
||||||
knownFor: TypedPipe[(UserId, Array[(ClusterId, Float)])],
|
|
||||||
socialProofThreshold: Int,
|
|
||||||
maxClustersPerUser: Int,
|
|
||||||
knownForModelVersion: String
|
|
||||||
)(
|
|
||||||
implicit uniqueId: UniqueID
|
|
||||||
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
|
|
||||||
InterestedInFromKnownFor.keepOnlyTopClusters(
|
|
||||||
groupClusterScores(
|
|
||||||
userClusterPairs(
|
|
||||||
adjacencyLists,
|
|
||||||
knownFor,
|
|
||||||
socialProofThreshold
|
|
||||||
)
|
|
||||||
),
|
|
||||||
maxClustersPerUser,
|
|
||||||
knownForModelVersion
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
def userClusterPairs(
|
|
||||||
adjacencyLists: TypedPipe[UserAndNeighbors],
|
|
||||||
knownFor: TypedPipe[(Long, Array[(Int, Float)])],
|
|
||||||
socialProofThreshold: Int
|
|
||||||
)(
|
|
||||||
implicit uniqueId: UniqueID
|
|
||||||
): TypedPipe[((Long, Int), SrcClusterIntermediateInfo)] = {
|
|
||||||
val edgesToUsersWithKnownFor = Stat("num_edges_to_users_with_known_for")
|
|
||||||
val srcDestClusterTriples = Stat("num_src_dest_cluster_triples")
|
|
||||||
val srcClusterPairsBeforeSocialProofThresholding =
|
|
||||||
Stat("num_src_cluster_pairs_before_social_proof_thresholding")
|
|
||||||
val srcClusterPairsAfterSocialProofThresholding =
|
|
||||||
Stat("num_src_cluster_pairs_after_social_proof_thresholding")
|
|
||||||
|
|
||||||
val edges = adjacencyLists.flatMap {
|
|
||||||
case UserAndNeighbors(srcId, neighborsWithWeights) =>
|
|
||||||
neighborsWithWeights.map { neighborWithWeights =>
|
|
||||||
(
|
|
||||||
neighborWithWeights.neighborId,
|
|
||||||
neighborWithWeights.copy(neighborId = srcId)
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
implicit val l2b: Long => Array[Byte] = Injection.long2BigEndian
|
|
||||||
|
|
||||||
edges
|
|
||||||
.sketch(4000)
|
|
||||||
.join(knownFor)
|
|
||||||
.flatMap {
|
|
||||||
case (destId, (srcWithWeights, clusterArray)) =>
|
|
||||||
edgesToUsersWithKnownFor.inc()
|
|
||||||
clusterArray.toList.map {
|
|
||||||
case (clusterId, knownForScoreF) =>
|
|
||||||
val knownForScore = math.max(0.0, knownForScoreF.toDouble)
|
|
||||||
|
|
||||||
srcDestClusterTriples.inc()
|
|
||||||
val followScore =
|
|
||||||
if (srcWithWeights.isFollowed.contains(true)) knownForScore else 0.0
|
|
||||||
val favScore =
|
|
||||||
srcWithWeights.favScoreHalfLife100Days.getOrElse(0.0) * knownForScore
|
|
||||||
val logFavScore = srcWithWeights.logFavScore.getOrElse(0.0) * knownForScore
|
|
||||||
val numFollowed = if (srcWithWeights.isFollowed.contains(true)) {
|
|
||||||
1
|
|
||||||
} else 0
|
|
||||||
|
|
||||||
val numFaved = if (srcWithWeights.favScoreHalfLife100Days.exists(_ > 0)) {
|
|
||||||
1
|
|
||||||
} else 0
|
|
||||||
|
|
||||||
(
|
|
||||||
(srcWithWeights.neighborId, clusterId),
|
|
||||||
SrcClusterIntermediateInfo(
|
|
||||||
followScore,
|
|
||||||
favScore,
|
|
||||||
logFavScore,
|
|
||||||
numFollowed,
|
|
||||||
numFaved
|
|
||||||
)
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
.sumByKey
|
|
||||||
.withReducers(10000)
|
|
||||||
.filter {
|
|
||||||
case ((_, _), SrcClusterIntermediateInfo(_, _, _, numFollowed, numFaved)) =>
|
|
||||||
srcClusterPairsBeforeSocialProofThresholding.inc()
|
|
||||||
// we donot remove duplicates
|
|
||||||
val socialProofSize = numFollowed + numFaved
|
|
||||||
val result = socialProofSize >= socialProofThreshold
|
|
||||||
if (result) {
|
|
||||||
srcClusterPairsAfterSocialProofThresholding.inc()
|
|
||||||
}
|
|
||||||
result
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
def groupClusterScores(
|
|
||||||
intermediate: TypedPipe[((Long, Int), SrcClusterIntermediateInfo)]
|
|
||||||
)(
|
|
||||||
implicit uniqueId: UniqueID
|
|
||||||
): TypedPipe[(Long, List[(Int, UserToInterestedInClusterScores)])] = {
|
|
||||||
|
|
||||||
implicit val i2b: Int => Array[Byte] = Injection.int2BigEndian
|
|
||||||
|
|
||||||
intermediate
|
|
||||||
.map {
|
|
||||||
case (
|
|
||||||
(srcId, clusterId),
|
|
||||||
SrcClusterIntermediateInfo(
|
|
||||||
followScore,
|
|
||||||
favScore,
|
|
||||||
logFavScore,
|
|
||||||
numFollowed,
|
|
||||||
numFaved
|
|
||||||
)) =>
|
|
||||||
(
|
|
||||||
srcId,
|
|
||||||
List(
|
|
||||||
(
|
|
||||||
clusterId,
|
|
||||||
UserToInterestedInClusterScores(
|
|
||||||
followScore = Some(ifNanMake0(followScore)),
|
|
||||||
favScore = Some(ifNanMake0(favScore)),
|
|
||||||
logFavScore = Some(ifNanMake0(logFavScore)),
|
|
||||||
numUsersBeingFollowed = Some(numFollowed),
|
|
||||||
numUsersThatWereFaved = Some(numFaved)
|
|
||||||
))
|
|
||||||
)
|
|
||||||
)
|
|
||||||
}
|
|
||||||
.sumByKey
|
|
||||||
// .withReducers(1000)
|
|
||||||
.toTypedPipe
|
|
||||||
}
|
|
||||||
}
|
|
Binary file not shown.
@ -1,290 +0,0 @@
|
|||||||
package com.twitter.simclusters_v2.scalding
|
|
||||||
|
|
||||||
import com.twitter.dal.client.dataset.KeyValDALDataset
|
|
||||||
import com.twitter.scalding.Execution
|
|
||||||
import com.twitter.scalding.TypedTsv
|
|
||||||
import com.twitter.scalding._
|
|
||||||
import com.twitter.scalding_internal.dalv2.DAL
|
|
||||||
import com.twitter.scalding_internal.dalv2.DALWrite._
|
|
||||||
import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation
|
|
||||||
import com.twitter.scalding_internal.dalv2.remote_access.ProcAtla
|
|
||||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
|
|
||||||
import com.twitter.simclusters_v2.common.ModelVersions
|
|
||||||
import com.twitter.simclusters_v2.common.UserId
|
|
||||||
import com.twitter.simclusters_v2.hdfs_sources.ProducerEmbeddingSources
|
|
||||||
import com.twitter.simclusters_v2.hdfs_sources.AdhocKeyValSources
|
|
||||||
import com.twitter.simclusters_v2.hdfs_sources.DataSources
|
|
||||||
import com.twitter.simclusters_v2.hdfs_sources.SimclustersV2InterestedInFromProducerEmbeddings20M145KUpdatedScalaDataset
|
|
||||||
import com.twitter.simclusters_v2.hdfs_sources.UserAndNeighborsFixedPathSource
|
|
||||||
import com.twitter.simclusters_v2.hdfs_sources.UserUserNormalizedGraphScalaDataset
|
|
||||||
import com.twitter.simclusters_v2.scalding.common.Util
|
|
||||||
import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn
|
|
||||||
import com.twitter.simclusters_v2.thriftscala.EmbeddingType
|
|
||||||
import com.twitter.simclusters_v2.thriftscala.SimClusterWithScore
|
|
||||||
import com.twitter.simclusters_v2.thriftscala.TopSimClustersWithScore
|
|
||||||
import com.twitter.simclusters_v2.thriftscala.UserToInterestedInClusterScores
|
|
||||||
import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp
|
|
||||||
import com.twitter.wtf.scalding.jobs.common.ScheduledExecutionApp
|
|
||||||
import java.util.TimeZone
|
|
||||||
import scala.util.Random
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This file implements the job for computing users' interestedIn vector from the producerEmbeddings data set.
|
|
||||||
*
|
|
||||||
* It reads the UserUserNormalizedGraphScalaDataset to get user-user follow + fav graph, and then
|
|
||||||
* based on the producerEmbedding clusters of each followed/faved user, we calculate how much a user is
|
|
||||||
* interestedIn a cluster. To compute the engagement and determine the clusters for the user, we reuse
|
|
||||||
* the functions defined in InterestedInKnownFor.
|
|
||||||
*
|
|
||||||
* Using producerEmbeddings instead of knownFor to obtain interestedIn increases the coverage (especially
|
|
||||||
* for medium and light users) and also the density of the cluster embeddings for the user.
|
|
||||||
*/
|
|
||||||
/**
|
|
||||||
* Adhoc job to generate the interestedIn from producer embeddings for the model version 20M145KUpdated
|
|
||||||
*
|
|
||||||
scalding remote run \
|
|
||||||
--target src/scala/com/twitter/simclusters_v2/scalding:interested_in_from_producer_embeddings \
|
|
||||||
--main-class com.twitter.simclusters_v2.scalding.InterestedInFromProducerEmbeddingsAdhocApp \
|
|
||||||
--user cassowary --cluster bluebird-qus1 \
|
|
||||||
--keytab /var/lib/tss/keys/fluffy/keytabs/client/cassowary.keytab \
|
|
||||||
--principal service_acoount@TWITTER.BIZ \
|
|
||||||
-- \
|
|
||||||
--outputDir /gcs/user/cassowary/adhoc/interested_in_from_prod_embeddings/ \
|
|
||||||
--date 2020-08-25 --typedTsv true
|
|
||||||
*/
|
|
||||||
object InterestedInFromProducerEmbeddingsAdhocApp extends AdhocExecutionApp {
  override def runOnDateRange(
    args: Args
  )(
    implicit dateRange: DateRange,
    timeZone: TimeZone,
    uniqueID: UniqueID
  ): Execution[Unit] = {

    val outputDir = args("outputDir")
    val inputGraph = args.optional("graphInputDir") match {
      case Some(inputDir) => TypedPipe.from(UserAndNeighborsFixedPathSource(inputDir))
      case None =>
        DAL
          .readMostRecentSnapshotNoOlderThan(UserUserNormalizedGraphScalaDataset, Days(30))
          .toTypedPipe
    }
    val socialProofThreshold = args.int("socialProofThreshold", 2)
    val maxClustersPerUserFinalResult = args.int("maxInterestedInClustersPerUser", 50)
    val maxClustersFromProducer = args.int("maxClustersPerProducer", 25)
    val typedTsvTag = args.boolean("typedTsv")

    val embeddingType =
      EmbeddingType.ProducerFavBasedSemanticCoreEntity
    val modelVersion = ModelVersions.Model20M145KUpdated
    val producerEmbeddings = ProducerEmbeddingSources
      .producerEmbeddingSourceLegacy(embeddingType, ModelVersions.toModelVersion(modelVersion))(
        dateRange.embiggen(Days(7)))

    import InterestedInFromProducerEmbeddingsBatchApp._

    val numProducerMappings = Stat("num_producer_embeddings_total")
    val numProducersWithLargeClusterMappings = Stat(
      "num_producers_with_more_clusters_than_threshold")
    val numProducersWithSmallClusterMappings = Stat(
      "num_producers_with_clusters_less_than_threshold")
    val totalClustersCoverageProducerEmbeddings = Stat("num_clusters_total_producer_embeddings")

    // Flatten each producer's top SimClusters into (clusterId, score) pairs.
    val producerEmbeddingsWithScore = producerEmbeddings.map {
      case (userId: Long, topSimClusters: TopSimClustersWithScore) =>
        (
          userId,
          topSimClusters.topClusters.toArray
            .map {
              case (simCluster: SimClusterWithScore) =>
                (simCluster.clusterId, simCluster.score.toFloat)
            }
        )
    }
    // Keep only the top maxClustersFromProducer clusters per producer (by score) to bound the fan-out.
    val producerEmbeddingsPruned = producerEmbeddingsWithScore.map {
      case (producerId, clusterArray) =>
        numProducerMappings.inc()
        val clusterSize = clusterArray.size
        totalClustersCoverageProducerEmbeddings.incBy(clusterSize)
        val prunedList = if (clusterSize > maxClustersFromProducer) {
          numProducersWithLargeClusterMappings.inc()
          clusterArray
            .sortBy {
              case (_, knownForScore) => -knownForScore
            }.take(maxClustersFromProducer)
        } else {
          numProducersWithSmallClusterMappings.inc()
          clusterArray
        }
        (producerId, prunedList)
    }

    val result = InterestedInFromKnownFor
      .run(
        inputGraph,
        producerEmbeddingsPruned,
        socialProofThreshold,
        maxClustersPerUserFinalResult,
        modelVersion
      )

    val resultWithoutSocial = getInterestedInDiscardSocial(result)

    if (typedTsvTag) {
      Util.printCounters(
        resultWithoutSocial
          .map {
            case (userId: Long, clusters: ClustersUserIsInterestedIn) =>
              (
                userId,
                clusters.clusterIdToScores.keys.toString()
              )
          }
          .writeExecution(
            TypedTsv(outputDir)
          )
      )
    } else {
      Util.printCounters(
        resultWithoutSocial
          .writeExecution(
            AdhocKeyValSources.interestedInSource(outputDir)
          )
      )
    }
  }
}

/**
 * Production job for computing the interestedIn data set from the producer embeddings for the
 * model version 20M145KUpdated.
 * It writes the data set in KeyVal format to produce a MH DAL data set.
 *
 * To deploy the job:
 *
 * capesospy-v2 update --build_locally --start_cron
 *   interested_in_from_producer_embeddings
 *   src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml
 */
object InterestedInFromProducerEmbeddingsBatchApp extends ScheduledExecutionApp {
  override val firstTime: RichDate = RichDate("2019-11-01")

  override val batchIncrement: Duration = Days(7)

  def getPrunedEmbeddings(
    producerEmbeddings: TypedPipe[(Long, TopSimClustersWithScore)],
    maxClustersFromProducer: Int
  ): TypedPipe[(Long, TopSimClustersWithScore)] = {
    producerEmbeddings.map {
      case (producerId, producerClusters) =>
        val prunedProducerClusters =
          producerClusters.topClusters
            .sortBy {
              case simCluster => -simCluster.score.toFloat
            }.take(maxClustersFromProducer)
        (producerId, TopSimClustersWithScore(prunedProducerClusters, producerClusters.modelVersion))
    }
  }

  def getInterestedInDiscardSocial(
    interestedInFromProducersResult: TypedPipe[(UserId, ClustersUserIsInterestedIn)]
  ): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
    interestedInFromProducersResult.map {
      case (srcId, fullClusterList) =>
        val fullClusterListWithoutSocial = fullClusterList.clusterIdToScores.map {
          case (clusterId, clusterDetails) =>
            val clusterDetailsWithoutSocial = UserToInterestedInClusterScores(
              followScore = clusterDetails.followScore,
              followScoreClusterNormalizedOnly = clusterDetails.followScoreClusterNormalizedOnly,
              followScoreProducerNormalizedOnly = clusterDetails.followScoreProducerNormalizedOnly,
              followScoreClusterAndProducerNormalized =
                clusterDetails.followScoreClusterAndProducerNormalized,
              favScore = clusterDetails.favScore,
              favScoreClusterNormalizedOnly = clusterDetails.favScoreClusterNormalizedOnly,
              favScoreProducerNormalizedOnly = clusterDetails.favScoreProducerNormalizedOnly,
              favScoreClusterAndProducerNormalized =
                clusterDetails.favScoreClusterAndProducerNormalized,
              // Social proof is not currently used anywhere else, so the user lists are discarded
              // to reduce the size of this dataset (see DiscardSocialProofSketch at the end of
              // this file for a simplified illustration).
              usersBeingFollowed = None,
              usersThatWereFaved = None,
              numUsersInterestedInThisClusterUpperBound =
                clusterDetails.numUsersInterestedInThisClusterUpperBound,
              logFavScore = clusterDetails.logFavScore,
              logFavScoreClusterNormalizedOnly = clusterDetails.logFavScoreClusterNormalizedOnly,
              // Only the counts of the social proof are retained
              numUsersBeingFollowed = Some(clusterDetails.usersBeingFollowed.getOrElse(Nil).size),
              numUsersThatWereFaved = Some(clusterDetails.usersThatWereFaved.getOrElse(Nil).size)
            )
            (clusterId, clusterDetailsWithoutSocial)
        }
        (
          srcId,
          ClustersUserIsInterestedIn(
            fullClusterList.knownForModelVersion,
            fullClusterListWithoutSocial))
    }
  }

  override def runOnDateRange(
    args: Args
  )(
    implicit dateRange: DateRange,
    timeZone: TimeZone,
    uniqueID: UniqueID
  ): Execution[Unit] = {

    // Input args for the run
    val socialProofThreshold = args.int("socialProofThreshold", 2)
    val maxClustersFromProducer = args.int("maxClustersPerProducer", 25)
    val maxClustersPerUserFinalResult = args.int("maxInterestedInClustersPerUser", 50)

    // Path variables
    val modelVersionUpdated = ModelVersions.toModelVersion(ModelVersions.Model20M145KUpdated)
    val rootPath: String = s"/user/cassowary/manhattan_sequence_files"
    val interestedInFromProducersPath =
      rootPath + "/interested_in_from_producer_embeddings/" + modelVersionUpdated

    // Input adjacency list and producer embeddings
    val userUserNormalGraph =
      DataSources.userUserNormalizedGraphSource(dateRange.prepend(Days(7))).forceToDisk
    val outputKVDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsInterestedIn]] =
      SimclustersV2InterestedInFromProducerEmbeddings20M145KUpdatedScalaDataset
    val producerEmbeddings = ProducerEmbeddingSources
      .producerEmbeddingSourceLegacy(
        EmbeddingType.ProducerFavBasedSemanticCoreEntity,
        modelVersionUpdated)(dateRange.embiggen(Days(7)))

    val producerEmbeddingsPruned = getPrunedEmbeddings(producerEmbeddings, maxClustersFromProducer)
    val producerEmbeddingsWithScore = producerEmbeddingsPruned.map {
      case (userId: Long, topSimClusters: TopSimClustersWithScore) =>
        (
          userId,
          topSimClusters.topClusters.toArray
            .map {
              case (simCluster: SimClusterWithScore) =>
                (simCluster.clusterId, simCluster.score.toFloat)
            }
        )
    }

    val interestedInFromProducersResult =
      InterestedInFromKnownFor.run(
        userUserNormalGraph,
        producerEmbeddingsWithScore,
        socialProofThreshold,
        maxClustersPerUserFinalResult,
        modelVersionUpdated.toString
      )

    val interestedInFromProducersWithoutSocial =
      getInterestedInDiscardSocial(interestedInFromProducersResult)

    val writeKeyValResultExec = interestedInFromProducersWithoutSocial
      .map { case (userId, clusters) => KeyVal(userId, clusters) }
      .writeDALVersionedKeyValExecution(
        outputKVDataset,
        D.Suffix(interestedInFromProducersPath)
      )
    writeKeyValResultExec
  }

}
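
// Illustrative sketch only; this object is not part of the production job. SimplifiedClusterScores
// is a hypothetical stand-in for the thrift UserToInterestedInClusterScores: it shows how
// getInterestedInDiscardSocial keeps every score but replaces the social-proof user lists with
// just their sizes, which is what shrinks the stored dataset.
object DiscardSocialProofSketch {
  final case class SimplifiedClusterScores(
    favScore: Option[Double],
    usersThatWereFaved: Option[Seq[Long]], // social-proof list: dropped in the output
    numUsersThatWereFaved: Option[Int] // only its size is retained
  )

  def discardSocialProof(scores: SimplifiedClusterScores): SimplifiedClusterScores =
    scores.copy(
      usersThatWereFaved = None,
      numUsersThatWereFaved = Some(scores.usersThatWereFaved.getOrElse(Nil).size)
    )
}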