62 lines
2.3 KiB
Scala
62 lines
2.3 KiB
Scala
package com.twitter.timelines.prediction.features.simcluster
|
|
|
|
import com.twitter.dal.personal_data.thriftjava.PersonalDataType._
|
|
import com.twitter.finagle.stats.StatsReceiver
|
|
import com.twitter.ml.api.Feature._
|
|
import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn
|
|
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup
|
|
import scala.collection.JavaConverters._
|
|
|
|
class SimclusterFeaturesHelper(statsReceiver: StatsReceiver) {
|
|
import SimclusterFeatures._
|
|
|
|
private[this] val scopedStatsReceiver = statsReceiver.scope(getClass.getSimpleName)
|
|
private[this] val invalidSimclusterModelVersion = scopedStatsReceiver
|
|
.counter("invalidSimclusterModelVersion")
|
|
|
|
def fromUserClusterInterestsPair(
|
|
userInterestClustersPair: (Long, ClustersUserIsInterestedIn)
|
|
): Option[SimclusterFeatures] = {
|
|
val (userId, userInterestClusters) = userInterestClustersPair
|
|
if (userInterestClusters.knownForModelVersion == SIMCLUSTER_MODEL_VERSION) {
|
|
val userInterestClustersFavScores = for {
|
|
(clusterId, scores) <- userInterestClusters.clusterIdToScores
|
|
favScore <- scores.favScore
|
|
} yield (clusterId.toString, favScore)
|
|
Some(
|
|
SimclusterFeatures(
|
|
userId,
|
|
userInterestClusters.knownForModelVersion,
|
|
userInterestClustersFavScores.toMap
|
|
)
|
|
)
|
|
} else {
|
|
// We maintain this counter to make sure that the hardcoded modelVersion we are using is correct.
|
|
invalidSimclusterModelVersion.incr
|
|
None
|
|
}
|
|
}
|
|
}
|
|
|
|
object SimclusterFeatures {
|
|
// Check http://go/simclustersv2runbook for production versions
|
|
// Our models are trained for this specific model version only.
|
|
val SIMCLUSTER_MODEL_VERSION = "20M_145K_dec11"
|
|
val prefix = s"simcluster.v2.$SIMCLUSTER_MODEL_VERSION"
|
|
|
|
val SIMCLUSTER_USER_INTEREST_CLUSTER_SCORES = new SparseContinuous(
|
|
s"$prefix.user_interest_cluster_scores",
|
|
Set(EngagementScore, InferredInterests).asJava
|
|
)
|
|
val SIMCLUSTER_USER_INTEREST_CLUSTER_IDS = new SparseBinary(
|
|
s"$prefix.user_interest_cluster_ids",
|
|
Set(InferredInterests).asJava
|
|
)
|
|
val SIMCLUSTER_MODEL_VERSION_METADATA = new Text("meta.simcluster_version")
|
|
}
|
|
|
|
case class SimclusterFeatures(
|
|
userId: Long,
|
|
modelVersion: String,
|
|
interestClusterScoresMap: Map[String, Double])
|