the-algorithm/src/scala/com/twitter/simclusters_v2/scio/common/ExternalDataSources.scala
twitter-team ef4c5eb65e Twitter Recommendation Algorithm
Please note we have force-pushed a new initial commit in order to remove some publicly-available Twitter user information. Note that this process may be required in the future.
2023-03-31 17:36:31 -05:00

302 lines
10 KiB
Scala

package com.twitter.simclusters_v2.scio.common
import com.spotify.scio.ScioContext
import com.spotify.scio.values.SCollection
import com.twitter.beam.io.dal.DAL
import com.twitter.common.util.Clock
import com.twitter.common_header.thriftscala.CommonHeader
import com.twitter.common_header.thriftscala.IdType
import com.twitter.common_header.thriftscala.VersionedCommonHeader
import com.twitter.frigate.data_pipeline.magicrecs.magicrecs_notifications_lite.thriftscala.MagicRecsNotificationLite
import com.twitter.frigate.data_pipeline.scalding.magicrecs.magicrecs_notification_lite.MagicrecsNotificationLite1DayLagScalaDataset
import com.twitter.iesource.thriftscala.InteractionEvent
import com.twitter.iesource.thriftscala.InteractionTargetType
import com.twitter.interests_ds.jobs.interests_service.UserTopicRelationSnapshotScalaDataset
import com.twitter.interests.thriftscala.InterestRelationType
import com.twitter.interests.thriftscala.UserInterestsRelationSnapshot
import com.twitter.penguin.scalding.datasets.PenguinUserLanguagesScalaDataset
import com.twitter.search.adaptive.scribing.thriftscala.AdaptiveSearchScribeLog
import com.twitter.simclusters_v2.hdfs_sources.UserUserFavGraphScalaDataset
import com.twitter.simclusters_v2.scalding.embedding.common.ExternalDataSources.ValidFlockEdgeStateId
import com.twitter.simclusters_v2.scalding.embedding.common.ExternalDataSources.getStandardLanguageCode
import com.twitter.twadoop.user.gen.thriftscala.CombinedUser
import flockdb_tools.datasets.flock.FlockBlocksEdgesScalaDataset
import flockdb_tools.datasets.flock.FlockFollowsEdgesScalaDataset
import flockdb_tools.datasets.flock.FlockReportAsAbuseEdgesScalaDataset
import flockdb_tools.datasets.flock.FlockReportAsSpamEdgesScalaDataset
import org.joda.time.Interval
import com.twitter.simclusters_v2.thriftscala.EdgeWithDecayedWeights
import com.twitter.usersource.snapshot.combined.UsersourceScalaDataset
import com.twitter.usersource.snapshot.flat.UsersourceFlatScalaDataset
import com.twitter.util.Duration
import twadoop_config.configuration.log_categories.group.search.AdaptiveSearchScalaDataset
object ExternalDataSources {
/**
 * Reads `UsersourceScalaDataset` from the Prod DAL environment via
 * `DAL.readMostRecentSnapshotNoOlderThan`.
 *
 * @param noOlderThan age bound handed to the DAL snapshot lookup (default: 7 days)
 * @param sc          Scio context the read step ("ReadUserSource") is registered on
 * @return the snapshot's `CombinedUser` records, with no filtering applied here
 */
def userSource(
  noOlderThan: Duration = Duration.fromDays(7)
)(
  implicit sc: ScioContext
): SCollection[CombinedUser] = {
  val latestUserSnapshot = DAL.readMostRecentSnapshotNoOlderThan(
    UsersourceScalaDataset,
    noOlderThan,
    Clock.SYSTEM_CLOCK,
    DAL.Environment.Prod
  )
  sc.customInput("ReadUserSource", latestUserSnapshot)
}
/**
 * Reads `UsersourceFlatScalaDataset` from the Prod DAL environment and emits the distinct
 * (userId, account country code) pairs found in it.
 *
 * Records for which `id` or `accountCountryCode` is absent produce no output.
 * Country codes are upper-cased before de-duplication.
 *
 * @param noOlderThan age bound handed to the DAL snapshot lookup (default: 7 days)
 * @param sc          Scio context the read step ("ReadUserCountrySource") is registered on
 * @return distinct (userId, upper-cased country code) pairs
 */
def userCountrySource(
  noOlderThan: Duration = Duration.fromDays(7)
)(
  implicit sc: ScioContext
): SCollection[(Long, String)] = {
  sc.customInput(
      "ReadUserCountrySource",
      DAL
        .readMostRecentSnapshotNoOlderThan(
          UsersourceFlatScalaDataset,
          noOlderThan,
          Clock.SYSTEM_CLOCK,
          DAL.Environment.Prod
        )
    ).flatMap { flatUser =>
      for {
        userId <- flatUser.id
        country <- flatUser.accountCountryCode
      } yield {
        // Use Locale.ROOT so the result does not depend on the worker JVM's default locale
        // (e.g. a Turkish default locale would turn "in" into "İN" instead of "IN").
        (userId, country.toUpperCase(java.util.Locale.ROOT))
      }
    }.distinct
}
/**
 * Reads `UserUserFavGraphScalaDataset` from the Prod DAL environment via
 * `DAL.readMostRecentSnapshotNoOlderThan`.
 *
 * @param noOlderThan age bound handed to the DAL snapshot lookup (default: 14 days)
 * @param sc          Scio context the read step ("ReadUserUserFavSource") is registered on
 * @return the snapshot's `EdgeWithDecayedWeights` records, with no filtering applied here
 */
def userUserFavSource(
  noOlderThan: Duration = Duration.fromDays(14)
)(
  implicit sc: ScioContext
): SCollection[EdgeWithDecayedWeights] = {
  val latestFavGraphSnapshot = DAL.readMostRecentSnapshotNoOlderThan(
    UserUserFavGraphScalaDataset,
    noOlderThan,
    Clock.SYSTEM_CLOCK,
    DAL.Environment.Prod
  )
  sc.customInput("ReadUserUserFavSource", latestFavGraphSnapshot)
}
/**
 * Reads `PenguinUserLanguagesScalaDataset` from the Prod DAL environment and, for every
 * key/value record, keeps the consumed languages that pass the weight threshold and that
 * `getStandardLanguageCode` can resolve.
 *
 * @param noOlderThan age bound handed to the DAL snapshot lookup (default: 7 days)
 * @param sc          Scio context the read step is registered on
 * @return one (record key, Seq of (standard language code, weight)) element per input record;
 *         the Seq may be empty when no language survives the filters
 */
def inferredUserConsumedLanguageSource(
  noOlderThan: Duration = Duration.fromDays(7)
)(
  implicit sc: ScioContext
): SCollection[(Long, Seq[(String, Double)])] = {
  val languageSnapshot = DAL.readMostRecentSnapshotNoOlderThan(
    PenguinUserLanguagesScalaDataset,
    noOlderThan,
    Clock.SYSTEM_CLOCK,
    DAL.Environment.Prod
  )
  sc.customInput("ReadInferredUserConsumedLanguageSource", languageSnapshot)
    .map { kv =>
      val languagesWithWeights = kv.value.consumed
        // Weights at or below 0.001 are discarded; the original author describes this
        // as throwing away 5% outliers.
        .filter(_.weight > 0.001)
        // Languages without a standard code (None) are silently skipped.
        .flatMap { scored =>
          getStandardLanguageCode(scored.item).map(language => (language, scored.weight))
        }
      (kv.key, languagesWithWeights)
    }
}
/**
 * Reads `FlockBlocksEdgesScalaDataset` from the Prod DAL environment and keeps only the
 * edges whose `state` equals `ValidFlockEdgeStateId`.
 *
 * @param noOlderThan age bound handed to the DAL snapshot lookup (default: 7 days)
 * @param sc          Scio context the read step ("ReadFlockBlock") is registered on
 * @return (sourceId, destinationId) of every valid edge
 */
def flockBlockSource(
  noOlderThan: Duration = Duration.fromDays(7)
)(
  implicit sc: ScioContext
): SCollection[(Long, Long)] = {
  val blockEdges = DAL.readMostRecentSnapshotNoOlderThan(
    FlockBlocksEdgesScalaDataset,
    noOlderThan,
    Clock.SYSTEM_CLOCK,
    DAL.Environment.Prod
  )
  sc.customInput("ReadFlockBlock", blockEdges)
    .filter(_.state == ValidFlockEdgeStateId)
    .map(validEdge => (validEdge.sourceId, validEdge.destinationId))
}
/**
 * Reads `FlockFollowsEdgesScalaDataset` from the Prod DAL environment and keeps only the
 * edges whose `state` equals `ValidFlockEdgeStateId`.
 *
 * @param noOlderThan age bound handed to the DAL snapshot lookup (default: 7 days)
 * @param sc          Scio context the read step ("ReadFlockFollow") is registered on
 * @return (sourceId, destinationId) of every valid edge
 */
def flockFollowSource(
  noOlderThan: Duration = Duration.fromDays(7)
)(
  implicit sc: ScioContext
): SCollection[(Long, Long)] = {
  val followEdges = DAL.readMostRecentSnapshotNoOlderThan(
    FlockFollowsEdgesScalaDataset,
    noOlderThan,
    Clock.SYSTEM_CLOCK,
    DAL.Environment.Prod
  )
  sc.customInput("ReadFlockFollow", followEdges)
    .filter(_.state == ValidFlockEdgeStateId)
    .map(validEdge => (validEdge.sourceId, validEdge.destinationId))
}
/**
 * Reads `FlockReportAsAbuseEdgesScalaDataset` from the Prod DAL environment and keeps only
 * the edges whose `state` equals `ValidFlockEdgeStateId`.
 *
 * @param noOlderThan age bound handed to the DAL snapshot lookup (default: 7 days)
 * @param sc          Scio context the read step is registered on
 * @return (sourceId, destinationId) of every valid edge
 */
def flockReportAsAbuseSource(
  noOlderThan: Duration = Duration.fromDays(7)
)(
  implicit sc: ScioContext
): SCollection[(Long, Long)] = {
  val reportAsAbuseEdges = DAL.readMostRecentSnapshotNoOlderThan(
    FlockReportAsAbuseEdgesScalaDataset,
    noOlderThan,
    Clock.SYSTEM_CLOCK,
    DAL.Environment.Prod
  )
  // NOTE(review): the "Java" suffix in this step name differs from the sibling flock readers;
  // it is kept unchanged because the step name is visible at runtime.
  sc.customInput("ReadFlockReportAsAbuseJava", reportAsAbuseEdges)
    .filter(_.state == ValidFlockEdgeStateId)
    .map(validEdge => (validEdge.sourceId, validEdge.destinationId))
}
/**
 * Reads `FlockReportAsSpamEdgesScalaDataset` from the Prod DAL environment and keeps only
 * the edges whose `state` equals `ValidFlockEdgeStateId`.
 *
 * @param noOlderThan age bound handed to the DAL snapshot lookup (default: 7 days)
 * @param sc          Scio context the read step ("ReadFlockReportAsSpam") is registered on
 * @return (sourceId, destinationId) of every valid edge
 */
def flockReportAsSpamSource(
  noOlderThan: Duration = Duration.fromDays(7)
)(
  implicit sc: ScioContext
): SCollection[(Long, Long)] = {
  val reportAsSpamEdges = DAL.readMostRecentSnapshotNoOlderThan(
    FlockReportAsSpamEdgesScalaDataset,
    noOlderThan,
    Clock.SYSTEM_CLOCK,
    DAL.Environment.Prod
  )
  sc.customInput("ReadFlockReportAsSpam", reportAsSpamEdges)
    .filter(_.state == ValidFlockEdgeStateId)
    .map(validEdge => (validEdge.sourceId, validEdge.destinationId))
}
/**
 * Reads `ServerEngagementsScalaDataset` for the given interval from the Prod DAL environment
 * and keeps only interaction events that target a tweet and carry a positive engaging user id.
 *
 * @param interval time range passed to `DAL.read`
 * @param sc       Scio context the read step is registered on
 * @return the filtered `InteractionEvent`s
 */
def ieSourceTweetEngagementsSource(
  interval: Interval
)(
  implicit sc: ScioContext
): SCollection[InteractionEvent] = {
  val serverEngagements = DAL.read(
    com.twitter.iesource.processing.events.batch.ServerEngagementsScalaDataset,
    interval,
    DAL.Environment.Prod
  )
  sc.customInput("ReadIeSourceTweetEngagementsSource", serverEngagements)
    .filter { event =>
      // Logged-out users are dropped because their favorites are less reliable.
      val fromLoggedInUser = event.engagingUserId > 0L
      val targetsTweet = event.targetType == InteractionTargetType.Tweet
      fromLoggedInUser && targetsTweet
    }
}
/**
 * Reads `UserTopicRelationSnapshotScalaDataset` from the Prod DAL environment and keeps the
 * relations whose `interestType` is "UTT" and whose `relation` is
 * `InterestRelationType.Followed`.
 *
 * Note the element order of the result: the interest id comes first, the user id second.
 *
 * @param noOlderThan age bound handed to the DAL snapshot lookup (default: 7 days)
 * @param sc          Scio context the read step is registered on
 * @return (interestId, userId) pairs for followed UTT interests
 */
def topicFollowGraphSource(
  noOlderThan: Duration = Duration.fromDays(7)
)(
  implicit sc: ScioContext
): SCollection[(Long, Long)] = {
  // This differs slightly from topicFollowGraphSource in
  // src/scala/com/twitter/simclusters_v2/scalding/embedding/common/ExternalDataSources.scala:
  // no additional hashJoin on uttFollowableEntitiesSource is performed here.
  val topicRelationSnapshot = DAL.readMostRecentSnapshotNoOlderThan(
    UserTopicRelationSnapshotScalaDataset,
    noOlderThan,
    Clock.SYSTEM_CLOCK,
    DAL.Environment.Prod
  )
  sc.customInput("ReadTopicFollowGraphSource", topicRelationSnapshot)
    .collect {
      case relation: UserInterestsRelationSnapshot
          if relation.interestType == "UTT" &&
            relation.relation == InterestRelationType.Followed =>
        (relation.interestId, relation.userId)
    }
}
/**
 * Reads `MagicrecsNotificationLite1DayLagScalaDataset` for the given interval from the Prod
 * DAL environment and keeps the notifications that have a target user id, a tweet id, and at
 * least one of the open / ntab-click timestamps defined.
 *
 * @param interval time range passed to `DAL.read`
 * @param sc       Scio context the read step is registered on
 * @return the filtered `MagicRecsNotificationLite` records
 */
def magicRecsNotficationOpenOrClickEventsSource(
  interval: Interval
)(
  implicit sc: ScioContext
): SCollection[MagicRecsNotificationLite] = {
  val notifications =
    DAL.read(MagicrecsNotificationLite1DayLagScalaDataset, interval, DAL.Environment.Prod)
  sc.customInput("ReadMagicRecsNotficationOpenOrClickEventsSource", notifications)
    .filter { entry =>
      // Both identifiers are required to attribute the event.
      val hasUserAndTweet = entry.targetUserId.isDefined && entry.tweetId.isDefined
      // Either an open or an ntab click counts as engagement.
      val wasOpenedOrClicked =
        entry.openTimestampMs.isDefined || entry.ntabClickTimestampMs.isDefined
      hasUserAndTweet && wasOpenedOrClicked
    }
}
/**
 * Reads `AdaptiveSearchScalaDataset` for the given interval from the Prod DAL environment and
 * emits the distinct (userId, raw query string) pairs found in the scribe logs.
 *
 * A log entry contributes a pair only when a user id can be extracted from its header, that
 * id is non-zero, and the request log carries a raw query.
 *
 * @param interval time range passed to `DAL.read`
 * @param sc       Scio context the read step is registered on
 * @return each (userId, query) combination exactly once
 */
def adaptiveSearchScribeLogsSource(
  interval: Interval
)(
  implicit sc: ScioContext
): SCollection[(Long, String)] = {
  sc.customInput(
      "ReadAdaptiveSearchScribeLogsSource",
      DAL.read(AdaptiveSearchScalaDataset, interval, DAL.Environment.Prod)
    )
    .flatMap { scribeLog: AdaptiveSearchScribeLog =>
      for {
        userId <- userIdFromBlenderAdaptiveScribeLog(scribeLog)
        // filter out logged out search queries
        if userId != 0L
        queryString <- scribeLog.requestLog.flatMap(_.request).flatMap(_.rawQuery)
      } yield (userId, queryString)
    }
    // A user may search for the same query multiple times; de-duplicate on the
    // (userId, query) pair so that no single user's queries have to be gathered under one key.
    .distinct
}
/**
 * Extracts the user id from an adaptive-search scribe log.
 *
 * A value is returned only when the log's versioned header is a `CommonHeader` wrapping a
 * `ServerHeader`, that header has `requestInfo`, and its `ids` map holds an entry for
 * `IdType.UserId`. Every other shape yields `None`.
 *
 * @param blenderAdaptiveLog the scribe log to inspect
 * @return the `IdType.UserId` entry converted with `toLong`, if present
 */
private def userIdFromBlenderAdaptiveScribeLog(
  blenderAdaptiveLog: AdaptiveSearchScribeLog
): Option[Long] =
  blenderAdaptiveLog.versionedCommonHeader match {
    case VersionedCommonHeader.CommonHeader(CommonHeader.ServerHeader(serverHeader)) =>
      serverHeader.requestInfo.flatMap { requestInfo =>
        // NOTE(review): if the id values are strings, toLong would throw on a non-numeric
        // value -- confirm the value type of `ids`.
        requestInfo.ids.get(IdType.UserId).map(_.toLong)
      }
    case _ => None
  }
}