the-algorithm/src/scala/com/twitter/simclusters_v2/scalding/InterestedInFromProducerEmbeddingsAdhocApp.scala

package com.twitter.simclusters_v2.scalding
import com.twitter.dal.client.dataset.KeyValDALDataset
import com.twitter.scalding.Execution
import com.twitter.scalding.TypedTsv
import com.twitter.scalding._
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.DALWrite._
import com.twitter.scalding_internal.dalv2.remote_access.ExplicitLocation
import com.twitter.scalding_internal.dalv2.remote_access.ProcAtla
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.simclusters_v2.common.ModelVersions
import com.twitter.simclusters_v2.common.UserId
import com.twitter.simclusters_v2.hdfs_sources.ProducerEmbeddingSources
import com.twitter.simclusters_v2.hdfs_sources.AdhocKeyValSources
import com.twitter.simclusters_v2.hdfs_sources.DataSources
import com.twitter.simclusters_v2.hdfs_sources.SimclustersV2InterestedInFromProducerEmbeddings20M145KUpdatedScalaDataset
import com.twitter.simclusters_v2.hdfs_sources.UserAndNeighborsFixedPathSource
import com.twitter.simclusters_v2.hdfs_sources.UserUserNormalizedGraphScalaDataset
import com.twitter.simclusters_v2.scalding.common.Util
import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn
import com.twitter.simclusters_v2.thriftscala.EmbeddingType
import com.twitter.simclusters_v2.thriftscala.SimClusterWithScore
import com.twitter.simclusters_v2.thriftscala.TopSimClustersWithScore
import com.twitter.simclusters_v2.thriftscala.UserToInterestedInClusterScores
import com.twitter.wtf.scalding.jobs.common.AdhocExecutionApp
import com.twitter.wtf.scalding.jobs.common.ScheduledExecutionApp
import java.util.TimeZone
import scala.util.Random
/**
* This file implements the job for computing users' interestedIn vector from the producerEmbeddings data set.
*
* It reads the UserUserNormalizedGraphScalaDataset to get the user-user follow + fav graph, and
* then, based on the producerEmbedding clusters of each followed/faved user, calculates how much a
* user is interested in a cluster. To compute the engagement and determine the clusters for the
* user, it reuses the functions defined in InterestedInFromKnownFor.
*
* Using producerEmbeddings instead of knownFor to obtain interestedIn increases the coverage (especially
* for medium and light users) and also the density of the cluster embeddings for the user.
*/
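/**
 * Illustrative sketch only (hypothetical, simplified data; not part of the job): the core idea is
 * that a user's interestedIn clusters come from summing the cluster scores of the producers the
 * user follows/favs. The real aggregation, including follow vs. fav weights, social proof
 * thresholds and score normalization, is done by InterestedInFromKnownFor.run below.
 *
 * {{{
 * // user 1 follows/favs producers 10 and 20
 * val followedProducers = Seq(10L, 20L)
 * // producer -> (clusterId, score), i.e. the shape of producerEmbeddingsWithScore
 * val producerClusters = Map(
 *   10L -> Seq((1, 0.9f), (2, 0.4f)),
 *   20L -> Seq((2, 0.7f))
 * )
 * // summing per cluster gives user 1's raw interestedIn: cluster 1 -> 0.9, cluster 2 -> ~1.1
 * val interestedIn = followedProducers
 *   .flatMap(p => producerClusters.getOrElse(p, Nil))
 *   .groupBy(_._1)
 *   .map { case (clusterId, scores) => clusterId -> scores.map(_._2).sum }
 * }}}
 */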
/**
* Adhoc job to generate the interestedIn from producer embeddings for the model version 20M145KUpdated
*
scalding remote run \
--target src/scala/com/twitter/simclusters_v2/scalding:interested_in_from_producer_embeddings \
--main-class com.twitter.simclusters_v2.scalding.InterestedInFromProducerEmbeddingsAdhocApp \
--user cassowary --cluster bluebird-qus1 \
--keytab /var/lib/tss/keys/fluffy/keytabs/client/cassowary.keytab \
--principal service_account@TWITTER.BIZ \
-- \
--outputDir /gcs/user/cassowary/adhoc/interested_in_from_prod_embeddings/ \
--date 2020-08-25 --typedTsv true
*/
object InterestedInFromProducerEmbeddingsAdhocApp extends AdhocExecutionApp {
override def runOnDateRange(
args: Args
)(
implicit dateRange: DateRange,
timeZone: TimeZone,
uniqueID: UniqueID
): Execution[Unit] = {
val outputDir = args("outputDir")
val inputGraph = args.optional("graphInputDir") match {
case Some(inputDir) => TypedPipe.from(UserAndNeighborsFixedPathSource(inputDir))
case None =>
DAL
.readMostRecentSnapshotNoOlderThan(UserUserNormalizedGraphScalaDataset, Days(30))
.toTypedPipe
}
val socialProofThreshold = args.int("socialProofThreshold", 2)
val maxClustersPerUserFinalResult = args.int("maxInterestedInClustersPerUser", 50)
val maxClustersFromProducer = args.int("maxClustersPerProducer", 25)
val typedTsvTag = args.boolean("typedTsv")
val embeddingType =
EmbeddingType.ProducerFavBasedSemanticCoreEntity
val modelVersion = ModelVersions.Model20M145KUpdated
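// Read the fav-based producer embeddings for the 20M145KUpdated model; the date range is widened
// (embiggen) by 7 days so the job tolerates late or missing partitions.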
val producerEmbeddings = ProducerEmbeddingSources
.producerEmbeddingSourceLegacy(embeddingType, ModelVersions.toModelVersion(modelVersion))(
dateRange.embiggen(Days(7)))
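// Reuse getInterestedInDiscardSocial and the other helpers from the batch app defined below.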
import InterestedInFromProducerEmbeddingsBatchApp._
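// Counters tracking producer embedding coverage and how often per-producer pruning kicks in.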
val numProducerMappings = Stat("num_producer_embeddings_total")
val numProducersWithLargeClusterMappings = Stat(
"num_producers_with_more_clusters_than_threshold")
val numProducersWithSmallClusterMappings = Stat(
"num_producers_with_clusters_less_than_threshold")
val totalClustersCoverageProducerEmbeddings = Stat("num_clusters_total_producer_embeddings")
val producerEmbeddingsWithScore = producerEmbeddings.map {
case (userId: Long, topSimClusters: TopSimClustersWithScore) =>
(
userId,
topSimClusters.topClusters.toArray
.map {
case (simCluster: SimClusterWithScore) =>
(simCluster.clusterId, simCluster.score.toFloat)
}
)
}
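// Keep at most maxClustersFromProducer clusters per producer, dropping the lowest-scoring ones.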
val producerEmbeddingsPruned = producerEmbeddingsWithScore.map {
case (producerId, clusterArray) =>
numProducerMappings.inc()
val clusterSize = clusterArray.size
totalClustersCoverageProducerEmbeddings.incBy(clusterSize)
val prunedList = if (clusterSize > maxClustersFromProducer) {
numProducersWithLargeClusterMappings.inc()
clusterArray
.sortBy {
case (_, knownForScore) => -knownForScore
}.take(maxClustersFromProducer)
} else {
numProducersWithSmallClusterMappings.inc()
clusterArray
}
(producerId, prunedList)
}
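// Reuse the knownFor-based interestedIn computation, feeding it the pruned producer embeddings
// in place of knownFor.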
val result = InterestedInFromKnownFor
.run(
inputGraph,
producerEmbeddingsPruned,
socialProofThreshold,
maxClustersPerUserFinalResult,
modelVersion
)
val resultWithoutSocial = getInterestedInDiscardSocial(result)
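// With --typedTsv, write a human-readable TSV of (userId, clusterIds) for debugging; otherwise
// write the thrift key-val adhoc source.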
if (typedTsvTag) {
Util.printCounters(
resultWithoutSocial
.map {
case (userId: Long, clusters: ClustersUserIsInterestedIn) =>
(
userId,
clusters.clusterIdToScores.keys.toString()
)
}
.writeExecution(
TypedTsv(outputDir)
)
)
} else {
Util.printCounters(
resultWithoutSocial
.writeExecution(
AdhocKeyValSources.interestedInSource(outputDir)
)
)
}
}
}
/**
* Production job for computing the interestedIn data set from the producer embeddings for the model version 20M145KUpdated.
* It writes the data set in KeyVal format to produce an MH (Manhattan) DAL data set.
*
* To deploy the job:
*
capesospy-v2 update --build_locally \
--start_cron interested_in_from_producer_embeddings \
src/scala/com/twitter/simclusters_v2/capesos_config/atla_proc3.yaml
*/
object InterestedInFromProducerEmbeddingsBatchApp extends ScheduledExecutionApp {
override val firstTime: RichDate = RichDate("2019-11-01")
override val batchIncrement: Duration = Days(7)
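/**
* Prunes each producer embedding to its highest-scoring clusters.
*
* For example (hypothetical scores), with maxClustersFromProducer = 2 an embedding with clusters
* scored (c1 -> 0.1, c2 -> 0.8, c3 -> 0.5) is pruned to (c2 -> 0.8, c3 -> 0.5).
*/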
def getPrunedEmbeddings(
producerEmbeddings: TypedPipe[(Long, TopSimClustersWithScore)],
maxClustersFromProducer: Int
): TypedPipe[(Long, TopSimClustersWithScore)] = {
producerEmbeddings.map {
case (producerId, producerClusters) =>
val prunedProducerClusters =
producerClusters.topClusters
.sortBy {
case simCluster => -simCluster.score.toFloat
}.take(maxClustersFromProducer)
(producerId, TopSimClustersWithScore(prunedProducerClusters, producerClusters.modelVersion))
}
}
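/**
* Drops the social proof user lists (usersBeingFollowed, usersThatWereFaved) from every cluster's
* scores to reduce the size of the dataset, while keeping their counts and all other scores
* unchanged.
*/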
def getInterestedInDiscardSocial(
interestedInFromProducersResult: TypedPipe[(UserId, ClustersUserIsInterestedIn)]
): TypedPipe[(UserId, ClustersUserIsInterestedIn)] = {
interestedInFromProducersResult.map {
case (srcId, fullClusterList) =>
val fullClusterListWithoutSocial = fullClusterList.clusterIdToScores.map {
case (clusterId, clusterDetails) =>
val clusterDetailsWithoutSocial = UserToInterestedInClusterScores(
followScore = clusterDetails.followScore,
followScoreClusterNormalizedOnly = clusterDetails.followScoreClusterNormalizedOnly,
followScoreProducerNormalizedOnly = clusterDetails.followScoreProducerNormalizedOnly,
followScoreClusterAndProducerNormalized =
clusterDetails.followScoreClusterAndProducerNormalized,
favScore = clusterDetails.favScore,
favScoreClusterNormalizedOnly = clusterDetails.favScoreClusterNormalizedOnly,
favScoreProducerNormalizedOnly = clusterDetails.favScoreProducerNormalizedOnly,
favScoreClusterAndProducerNormalized =
clusterDetails.favScoreClusterAndProducerNormalized,
// Social proof is currently not used anywhere else, so it is discarded to reduce the size of this dataset
usersBeingFollowed = None,
usersThatWereFaved = None,
numUsersInterestedInThisClusterUpperBound =
clusterDetails.numUsersInterestedInThisClusterUpperBound,
logFavScore = clusterDetails.logFavScore,
logFavScoreClusterNormalizedOnly = clusterDetails.logFavScoreClusterNormalizedOnly,
// Counts of the social proof are maintained
numUsersBeingFollowed = Some(clusterDetails.usersBeingFollowed.getOrElse(Nil).size),
numUsersThatWereFaved = Some(clusterDetails.usersThatWereFaved.getOrElse(Nil).size)
)
(clusterId, clusterDetailsWithoutSocial)
}
(
srcId,
ClustersUserIsInterestedIn(
fullClusterList.knownForModelVersion,
fullClusterListWithoutSocial))
}
}
override def runOnDateRange(
args: Args
)(
implicit dateRange: DateRange,
timeZone: TimeZone,
uniqueID: UniqueID
): Execution[Unit] = {
// Input args for the run
val socialProofThreshold = args.int("socialProofThreshold", 2)
val maxClustersFromProducer = args.int("maxClustersPerProducer", 25)
val maxClustersPerUserFinalResult = args.int("maxInterestedInClustersPerUser", 50)
// Path variables
val modelVersionUpdated = ModelVersions.toModelVersion(ModelVersions.Model20M145KUpdated)
val rootPath: String = s"/user/cassowary/manhattan_sequence_files"
val interestedInFromProducersPath =
rootPath + "/interested_in_from_producer_embeddings/" + modelVersionUpdated
// Input adjacency list and producer embeddings
val userUserNormalGraph =
DataSources.userUserNormalizedGraphSource(dateRange.prepend(Days(7))).forceToDisk
val outputKVDataset: KeyValDALDataset[KeyVal[Long, ClustersUserIsInterestedIn]] =
SimclustersV2InterestedInFromProducerEmbeddings20M145KUpdatedScalaDataset
val producerEmbeddings = ProducerEmbeddingSources
.producerEmbeddingSourceLegacy(
EmbeddingType.ProducerFavBasedSemanticCoreEntity,
modelVersionUpdated)(dateRange.embiggen(Days(7)))
val producerEmbeddingsPruned = getPrunedEmbeddings(producerEmbeddings, maxClustersFromProducer)
val producerEmbeddingsWithScore = producerEmbeddingsPruned.map {
case (userId: Long, topSimClusters: TopSimClustersWithScore) =>
(
userId,
topSimClusters.topClusters.toArray
.map {
case (simCluster: SimClusterWithScore) =>
(simCluster.clusterId, simCluster.score.toFloat)
}
)
}
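// Same aggregation as the adhoc app: reuse InterestedInFromKnownFor with the pruned producer
// embeddings standing in for knownFor.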
val interestedInFromProducersResult =
InterestedInFromKnownFor.run(
userUserNormalGraph,
producerEmbeddingsWithScore,
socialProofThreshold,
maxClustersPerUserFinalResult,
modelVersionUpdated.toString
)
val interestedInFromProducersWithoutSocial =
getInterestedInDiscardSocial(interestedInFromProducersResult)
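// Write KeyVal(userId, clusters) as a versioned key-val DAL dataset under interestedInFromProducersPath.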
val writeKeyValResultExec = interestedInFromProducersWithoutSocial
.map { case (userId, clusters) => KeyVal(userId, clusters) }
.writeDALVersionedKeyValExecution(
outputKVDataset,
D.Suffix(interestedInFromProducersPath)
)
writeKeyValResultExec
}
}