mirror of
https://github.com/twitter/the-algorithm.git
synced 2024-06-14 23:28:56 +02:00
ef4c5eb65e
Please note we have force-pushed a new initial commit in order to remove some publicly-available Twitter user information. Note that this process may be required in the future.
209 lines
7.9 KiB
Scala
209 lines
7.9 KiB
Scala
package com.twitter.follow_recommendations.common.candidate_sources.base
|
|
|
|
import com.twitter.conversions.DurationOps._
|
|
import com.twitter.finagle.stats.NullStatsReceiver
|
|
import com.twitter.finagle.stats.StatsReceiver
|
|
import com.twitter.finagle.util.DefaultTimer
|
|
import com.twitter.follow_recommendations.common.candidate_sources.base.RealGraphExpansionRepository.DefaultScore
|
|
import com.twitter.follow_recommendations.common.candidate_sources.base.RealGraphExpansionRepository.MaxNumIntermediateNodesToKeep
|
|
import com.twitter.follow_recommendations.common.candidate_sources.base.RealGraphExpansionRepository.FirstDegreeCandidatesTimeout
|
|
import com.twitter.follow_recommendations.common.models.CandidateUser
|
|
import com.twitter.follow_recommendations.common.models._
|
|
import com.twitter.onboarding.relevance.features.ymbii.ExpansionCandidateScores
|
|
import com.twitter.onboarding.relevance.features.ymbii.RawYMBIICandidateFeatures
|
|
import com.twitter.onboarding.relevance.store.thriftscala.CandidatesFollowedV1
|
|
import com.twitter.product_mixer.core.functional_component.candidate_source.CandidateSource
|
|
import com.twitter.product_mixer.core.model.common.identifier.CandidateSourceIdentifier
|
|
import com.twitter.stitch.Stitch
|
|
import com.twitter.strato.client.Fetcher
|
|
import com.twitter.util.Duration
|
|
import scala.collection.immutable
|
|
import scala.util.control.NonFatal
|
|
|
|
/**
 * A candidate reached by expanding one or more intermediate (first-degree) candidates.
 *
 * @param userID   id of the expansion candidate
 * @param score    aggregated score computed for this candidate during re-ranking
 * @param features raw feature record carrying the intermediate nodes that led to this candidate
 */
private final case class InterestExpansionCandidate(
  userID: Long,
  score: Double,
  features: RawYMBIICandidateFeatures
)
|
|
|
|
/**
 * Candidate source that takes the candidates produced by a prioritized list of underlying
 * candidate sources, looks each of them up in `realgraphExpansionStore`, and returns the
 * accounts found there, re-scored and sorted by descending score.
 *
 * @param realgraphExpansionStore        fetcher keyed by an underlying candidate id
 * @param identifier                     identifier of this candidate source
 * @param statsReceiver                  receiver used for the counters emitted by this class
 * @param maxUnderlyingCandidatesToQuery max number of underlying candidates to look up in the store
 * @param maxCandidatesToReturn          max number of expansion candidates returned by `apply`
 * @param overrideUnderlyingTimeout      timeout applied to each underlying source; falls back to
 *                                       `FirstDegreeCandidatesTimeout` when empty
 * @param appendSocialProof              when true, a follow proof listing the intermediate
 *                                       candidate ids is attached to every returned candidate
 */
abstract class RealGraphExpansionRepository[Request](
  realgraphExpansionStore: Fetcher[Long, Unit, CandidatesFollowedV1],
  override val identifier: CandidateSourceIdentifier,
  statsReceiver: StatsReceiver = NullStatsReceiver,
  maxUnderlyingCandidatesToQuery: Int = 50,
  maxCandidatesToReturn: Int = 40,
  overrideUnderlyingTimeout: Option[Duration] = None,
  appendSocialProof: Boolean = false)
    extends CandidateSource[Request, CandidateUser] {

  /** Underlying candidate sources, in priority order (the first non-empty result is used). */
  val underlyingCandidateSource: Seq[CandidateSource[Request, CandidateUser]]

  private val stats = statsReceiver.scope(this.getClass.getSimpleName).scope(identifier.name)

  private val underlyingCandidateSourceFailureStats =
    stats.scope("underlying_candidate_source_failure")

  /**
   * Produces expansion candidates for the given request.
   *
   * Every underlying source is invoked with a timeout; a non-fatal failure of a source is
   * counted and treated as an empty result. The candidates of the first source (in list order)
   * that returned anything are sorted by descending score, truncated to
   * `maxUnderlyingCandidatesToQuery`, looked up in `realgraphExpansionStore`, re-ranked and
   * truncated to `maxCandidatesToReturn`.
   *
   * @param request the request forwarded unchanged to every underlying source
   * @return the re-ranked expansion candidates, best first
   */
  def apply(request: Request): Stitch[Seq[CandidateUser]] = {
    val underlyingTimeout: Duration =
      overrideUnderlyingTimeout.getOrElse(FirstDegreeCandidatesTimeout)

    val perSourceCandidates: Seq[Stitch[Seq[CandidateUser]]] =
      underlyingCandidateSource.map { source =>
        source(request)
          .within(underlyingTimeout)(DefaultTimer)
          .handle {
            case NonFatal(e) =>
              // Record which source failed and how, then carry on as if it returned nothing.
              underlyingCandidateSourceFailureStats
                .counter(source.identifier.name, e.getClass.getSimpleName)
                .incr()
              Seq.empty
          }
      }

    Stitch.collect(perSourceCandidates).flatMap { candidatesPerSource =>
      // The first algorithm in the list has the highest priority. If it returned nothing, fall
      // back to the next one, and so on. Once an algorithm is chosen, only its top few
      // candidates are expanded through the store.
      val chosen = candidatesPerSource
        .zip(underlyingCandidateSource)
        .find { case (found, _) => found.nonEmpty }

      val underlyingAlgorithmUsed: Option[CandidateSourceIdentifier] =
        chosen.map { case (_, source) => source.identifier }

      chosen.foreach {
        case (_, source) =>
          stats.scope("underlyingAlgorithmUsedScope").counter(source.identifier.name).incr()
      }

      // Keep at most maxUnderlyingCandidatesToQuery candidates, highest score first.
      val topUnderlyingCandidates = chosen
        .map { case (found, _) => found }
        .getOrElse(Seq.empty)
        .sortBy(_.score.getOrElse(DefaultScore))(Ordering.Double.reverse)
        .take(maxUnderlyingCandidatesToQuery)

      val underlyingScoreById: Map[Long, Double] =
        topUnderlyingCandidates.map { c => c.id -> c.score.getOrElse(DefaultScore) }.toMap

      Stitch
        .traverse(underlyingScoreById.keySet.toSeq) { underlyingId =>
          realgraphExpansionStore.fetch(underlyingId).map { result => underlyingId -> result.v }
        }
        .map { fetched =>
          rerankCandidateExpansions(underlyingScoreById, fetched.toMap)
            .take(maxCandidatesToReturn)
            .map(toCandidateUser(_, underlyingAlgorithmUsed))
        }
    }
  }

  /**
   * Converts a re-ranked expansion candidate into the [[CandidateUser]] returned to callers.
   *
   * @param candidate               the re-ranked expansion candidate
   * @param underlyingAlgorithmUsed identifier of the underlying source whose candidates were
   *                                expanded, if any
   */
  private def toCandidateUser(
    candidate: InterestExpansionCandidate,
    underlyingAlgorithmUsed: Option[CandidateSourceIdentifier]
  ): CandidateUser = {
    val socialProofReason =
      if (!appendSocialProof) {
        None
      } else {
        val socialProofIds =
          candidate.features.expansionCandidateScores.map(_.intermediateCandidateId)
        val followProof = FollowProof(socialProofIds, socialProofIds.size)
        Some(Reason(Some(AccountProof(followProof = Some(followProof)))))
      }

    val sourceDetails = UserCandidateSourceDetails(
      primaryCandidateSource = Some(identifier),
      candidateSourceFeatures = Map(identifier -> Seq(candidate.features))
    )

    CandidateUser(
      id = candidate.userID,
      score = Some(candidate.score),
      reason = socialProofReason,
      userCandidateSourceDetails = Some(sourceDetails)
    ).addAddressBookMetadataIfAvailable(underlyingAlgorithmUsed.toSeq)
  }

  /**
   * Expands underlying candidates, returning the expansion candidates in sorted order.
   *
   * Each expansion candidate is scored as the sum, over every underlying candidate that leads
   * to it, of (underlying candidate score * expansion candidate score). At most
   * `MaxNumIntermediateNodesToKeep` intermediate nodes (ordered by intermediate candidate id)
   * are kept in the features of each result, while the reported node count is the full count.
   *
   * @param underlyingCandidatesMap A map from underlying candidate id to score
   * @param expansionCandidateMap A map from underlying candidate id to optional expansion candidates
   * @return A sequence of expansion candidates and associated scores, sorted by descending score
   */
  private def rerankCandidateExpansions(
    underlyingCandidatesMap: Map[Long, Double],
    expansionCandidateMap: Map[Long, Option[CandidatesFollowedV1]]
  ): Seq[InterestExpansionCandidate] = {

    // One (expansion candidate id -> edge scores) pair per underlying/expansion candidate pair.
    val edgesByExpansionId: Seq[(Long, ExpansionCandidateScores)] =
      underlyingCandidatesMap.toSeq.flatMap {
        case (underlyingId, underlyingScore) =>
          val followed = expansionCandidateMap.get(underlyingId) match {
            case Some(Some(stored)) => stored.candidatesFollowed
            case _ => Seq.empty
          }
          followed.map { expansion =>
            expansion.candidateID -> ExpansionCandidateScores(
              underlyingId,
              Some(underlyingScore),
              Some(expansion.score)
            )
          }
      }

    // Merge the intermediate nodes of the same expansion candidate and total up its score.
    val scored: Seq[(Long, Seq[ExpansionCandidateScores], Double)] =
      edgesByExpansionId.groupBy(_._1).toSeq.map {
        case (expansionId, pairs) =>
          val edges = pairs.map(_._2).sortBy(_.intermediateCandidateId)
          val totalScore = edges.map { edge =>
            edge.scoreFromUserToIntermediateCandidate.getOrElse(DefaultScore) *
              edge.scoreFromIntermediateToExpansionCandidate.getOrElse(DefaultScore)
          }.sum
          (expansionId, edges, totalScore)
      }

    // Highest total score first.
    scored.sortBy(_._3)(Ordering[Double].reverse).map {
      case (expansionId, edges, totalScore) =>
        InterestExpansionCandidate(
          expansionId,
          totalScore,
          RawYMBIICandidateFeatures(
            edges.size,
            edges.take(MaxNumIntermediateNodesToKeep).to[immutable.Seq])
        )
    }
  }

}
|
|
|
|
object RealGraphExpansionRepository {

  /** Score substituted wherever a candidate or edge carries no score. */
  private val DefaultScore: Double = 0.0d

  /** Upper bound on the intermediate nodes retained in each expansion candidate's features. */
  private val MaxNumIntermediateNodesToKeep: Int = 20

  /** Timeout applied to each underlying candidate source when no override is supplied. */
  private val FirstDegreeCandidatesTimeout: Duration = 250.milliseconds

}
|