the-algorithm/tweetypie/server/src/main/scala/com/twitter/tweetypie/hydrator/CopyFromSourceTweet.scala
twitter-team 01dbfee4c0 Open-sourcing Tweetypie
Tweetypie is the core Tweet service that handles the reading and writing of Tweet data.
2023-05-19 16:20:06 -05:00

230 lines
9.6 KiB
Scala

package com.twitter.tweetypie
package hydrator
import com.twitter.tweetypie.core._
import com.twitter.tweetypie.repository.TweetQuery
import com.twitter.tweetypie.tweettext.TweetText
import com.twitter.tweetypie.thriftscala._
object CopyFromSourceTweet {
/**
 * Builds the `ValueHydrator` applied to retweets. When the `TweetData` carries a source
 * tweet result, the retweet is rebuilt through `copy` and reported as modified; when it
 * does not, the `TweetData` is returned unmodified.
 */
def hydrator: ValueHydrator[TweetData, TweetQuery.Options] =
  ValueHydrator.map { (tweetData, _) =>
    tweetData.sourceTweetResult match {
      case Some(srcResult) =>
        val hydrated = copy(srcResult.value.tweet, tweetData.tweet)
        ValueState.modified(tweetData.copy(tweet = hydrated))
      case None =>
        ValueState.unmodified(tweetData)
    }
  }
/**
 * Produces a new version of the retweet `dst` populated with data from its source tweet
 * `src`. Entities can't simply be copied across, because:
 *
 *  - the retweet text starts with an "RT @user" prefix, which adds one extra mention entity;
 *  - the retweet text may be cut off at the end and so may lack part of the source tweet's
 *    text; the cut can land in the middle of an entity;
 *  - the retweet text may use a different unicode normalization, which changes code point
 *    indices, so entities aren't offset by a constant amount equal to the RT prefix;
 *  - a url entity may become a media entity when hydrated, and the retweet's url entities
 *    may not be hydrated, so a media entity in the source tweet may correspond to an
 *    unhydrated url entity in the retweet;
 *  - several media entities may correspond to one url entity, because a tweet can carry
 *    multiple photos.
 *
 * Throws `NoSuchElementException` if either tweet has no `coreData`.
 */
def copy(src: Tweet, dst: Tweet): Tweet = {
  val srcCore = src.coreData.get
  val dstCore = dst.coreData.get

  // length of the retweet text, measured in code points
  val rtText = getText(dst)
  val max = rtText.codePointCount(0, rtText.length).toShort

  // Entities of each tweet as one list ordered by fromIndex. The retweet's leading entity
  // (the mention from the RT prefix) is left out of the matching and re-attached below.
  val fromSource = getWrappedEntities(src)
  val fromRetweet = getWrappedEntities(dst).drop(1)

  // Re-index the source entities against the retweet, then restore the original per-type
  // ordering (the order of media entities matters to clients).
  val ordered = merge(fromSource, fromRetweet, max).sortBy(_.position)

  // split the merged list back into one list per entity type
  val mergedMentions = ordered.collect { case WrappedMentionEntity(m, _) => m }
  val mergedHashtags = ordered.collect { case WrappedHashtagEntity(h, _) => h }
  val mergedCashtags = ordered.collect { case WrappedCashtagEntity(c, _) => c }
  val mergedUrls = ordered.collect { case WrappedUrlEntity(u, _) => u }
  val mergedMedia = ordered.collect { case WrappedMediaEntity(m, _) => m }

  // the mention that was excluded from the matching above
  val rtPrefixMention = getMentions(dst).take(1)
  val mediaWithSource = mergedMedia.map(updateSourceStatusId(src.id, getUserId(src)))
  val takedownCodes =
    mergeTakedowns(
      TweetLenses.takedownCountryCodes.get(src),
      TweetLenses.takedownCountryCodes.get(dst)
    )
  val mergedCore =
    dstCore.copy(
      hasMedia = srcCore.hasMedia,
      hasTakedown = dstCore.hasTakedown || srcCore.hasTakedown
    )

  // assemble the retweet: merged entities plus the fields taken directly from the source
  dst.copy(
    coreData = Some(mergedCore),
    mentions = Some(rtPrefixMention ++ mergedMentions),
    hashtags = Some(mergedHashtags),
    cashtags = Some(mergedCashtags),
    urls = Some(mergedUrls),
    media = Some(mediaWithSource),
    quotedTweet = src.quotedTweet,
    card2 = src.card2,
    cards = src.cards,
    language = src.language,
    mediaTags = src.mediaTags,
    spamLabel = src.spamLabel,
    takedownCountryCodes = takedownCodes,
    conversationControl = src.conversationControl,
    exclusiveTweetControl = src.exclusiveTweetControl
  )
}
/**
 * Combines any number of optional takedown lists into a single de-duplicated, sorted list.
 * The result is `None` only when every input is `None`; an input that is defined but empty
 * still leads to a `Some`.
 */
private def mergeTakedowns(takedowns: Option[Seq[CountryCode]]*): Option[Seq[CountryCode]] = {
  // keep only the lists that are actually defined
  val definedLists = takedowns.flatten
  if (definedLists.isEmpty) None
  else Some(definedLists.flatten.distinct.sorted)
}
/**
 * Returns a function that guarantees a media entity carries both a source_status_id and a
 * source_user_id, which a retweet's media must never lack.
 */
private def updateSourceStatusId(
  srcTweetId: TweetId,
  srcUserId: UserId
): MediaEntity => MediaEntity = { media =>
  media.sourceStatusId match {
    case Some(_) =>
      // a sourceStatusId that is already set marks "pasted media"; its sourceStatusId /
      // sourceUserId are already correct and are not changed by retweeting
      media
    case None =>
      // point the media at the source tweet, keeping any sourceUserId already present
      media.copy(
        sourceStatusId = Some(srcTweetId),
        sourceUserId = media.sourceUserId.orElse(Some(srcUserId))
      )
  }
}
/**
 * Attempts to match up entities from the source tweet with entities from the retweet,
 * and to use the source tweet entities but shifted to the retweet entity indices. Matching
 * stops at the first pair whose normalized text differs (typically an entity truncated at
 * the end of the retweet text); that entity and all following entities are dropped.
 *
 * @param srcEntities source tweet entities, sorted by fromIndex
 * @param rtEntities  retweet entities, sorted by fromIndex, without the RT-prefix mention
 * @param maxIndex    code point length of the retweet text; the matching logic does not
 *                    consult it, it is only threaded through the recursion
 * @return the matched source entities, re-indexed to the retweet's positions
 */
private def merge(
  srcEntities: List[WrappedEntity],
  rtEntities: List[WrappedEntity],
  maxIndex: Short
): List[WrappedEntity] =
  (srcEntities, rtEntities) match {
    case (Nil, _) =>
      // no more source tweet entities. either everything matched, or the retweet still has
      // entities left over, which can happen if a text truncation turns something invalid
      // like #tag1#tag2 or @mention1@mention2 into a valid entity. drop the leftovers.
      Nil

    case (_, Nil) =>
      // no more retweet entities, which means the remaining source entities were truncated.
      Nil

    case (srcHead :: _, rtHead :: _) =>
      if (srcHead.normalizedText != rtHead.normalizedText) {
        // a mismatch is most likely caused by truncation of the retweet entity, so stop
        // here. only exact matches on normalized text are accepted; no prefix matching.
        Nil
      } else {
        // there could be multiple media entities for the same t.co url, so take the whole
        // contiguous group of source entities sharing srcHead's fromIndex and move each of
        // them onto the indices of the matching retweet entity.
        val srcGroup = srcEntities.takeWhile(_.fromIndex == srcHead.fromIndex)
        val shiftedGroup = srcGroup.map(_.shift(rtHead.fromIndex, rtHead.toIndex))

        // skip past the matched group on both sides before continuing. srcGroup always
        // contains srcHead, so every recursive call consumes at least one source entity.
        val remainingSrc = srcEntities.drop(srcGroup.size)
        val remainingRt = rtEntities.dropWhile(_.fromIndex == rtHead.fromIndex)

        shiftedGroup ++ merge(remainingSrc, remainingRt, maxIndex)
      }
  }
/**
 * Collects every url, media, mention, hashtag and cashtag entity of `tweet`, wraps each in
 * the matching `WrappedEntity` subclass together with its index within its own entity
 * list, and returns them all as one list sorted by fromIndex.
 */
private def getWrappedEntities(tweet: Tweet): List[WrappedEntity] = {
  val urls =
    getUrls(tweet).zipWithIndex.map { case (entity, idx) => WrappedUrlEntity(entity, idx) }
  val media =
    getMedia(tweet).zipWithIndex.map { case (entity, idx) => WrappedMediaEntity(entity, idx) }
  val mentions =
    getMentions(tweet).zipWithIndex.map { case (entity, idx) => WrappedMentionEntity(entity, idx) }
  val hashtags =
    getHashtags(tweet).zipWithIndex.map { case (entity, idx) => WrappedHashtagEntity(entity, idx) }
  val cashtags =
    getCashtags(tweet).zipWithIndex.map { case (entity, idx) => WrappedCashtagEntity(entity, idx) }

  // concatenation order (urls, media, mentions, hashtags, cashtags) decides how entities
  // with equal fromIndex are ordered relative to each other after the sort
  (urls ++ media ++ mentions ++ hashtags ++ cashtags)
    .sortBy(_.fromIndex)
    .toList
}
/**
 * Common wrapper for the thrift entity classes, which have no shared parent class. Wrapping
 * lets entities of different types live in a single list and gives them a uniform way of
 * shifting indices.
 */
private sealed abstract class WrappedEntity(
  val fromIndex: Short,
  val toIndex: Short,
  val rawText: String
) {

  /** the original position of the entity within its own entity group */
  val position: Int

  /** `rawText` passed through `TweetText.nfcNormalize` and then lower-cased */
  val normalizedText: String = {
    val nfc = TweetText.nfcNormalize(rawText)
    nfc.toLowerCase
  }

  /** returns this wrapper with the underlying entity moved to the given indices */
  def shift(fromIndex: Short, toIndex: Short): WrappedEntity
}
/** Wraps a `UrlEntity`; the entity's `url` serves as the raw text. */
private case class WrappedUrlEntity(entity: UrlEntity, position: Int)
    extends WrappedEntity(entity.fromIndex, entity.toIndex, entity.url) {

  /** Moves the wrapped url entity to the given indices, keeping `position`. */
  override def shift(fromIndex: Short, toIndex: Short): WrappedUrlEntity = {
    val moved = entity.copy(fromIndex = fromIndex, toIndex = toIndex)
    WrappedUrlEntity(moved, position)
  }
}
/** Wraps a `MediaEntity`; the entity's `url` serves as the raw text. */
private case class WrappedMediaEntity(entity: MediaEntity, position: Int)
    extends WrappedEntity(entity.fromIndex, entity.toIndex, entity.url) {

  /** Moves the wrapped media entity to the given indices, keeping `position`. */
  override def shift(fromIndex: Short, toIndex: Short): WrappedMediaEntity = {
    val moved = entity.copy(fromIndex = fromIndex, toIndex = toIndex)
    WrappedMediaEntity(moved, position)
  }
}
/** Wraps a `MentionEntity`; the entity's `screenName` serves as the raw text. */
private case class WrappedMentionEntity(entity: MentionEntity, position: Int)
    extends WrappedEntity(entity.fromIndex, entity.toIndex, entity.screenName) {

  /** Moves the wrapped mention entity to the given indices, keeping `position`. */
  override def shift(fromIndex: Short, toIndex: Short): WrappedMentionEntity = {
    val moved = entity.copy(fromIndex = fromIndex, toIndex = toIndex)
    WrappedMentionEntity(moved, position)
  }
}
/** Wraps a `HashtagEntity`; the entity's `text` serves as the raw text. */
private case class WrappedHashtagEntity(entity: HashtagEntity, position: Int)
    extends WrappedEntity(entity.fromIndex, entity.toIndex, entity.text) {

  /** Moves the wrapped hashtag entity to the given indices, keeping `position`. */
  override def shift(fromIndex: Short, toIndex: Short): WrappedHashtagEntity = {
    val moved = entity.copy(fromIndex = fromIndex, toIndex = toIndex)
    WrappedHashtagEntity(moved, position)
  }
}
/** Wraps a `CashtagEntity`; the entity's `text` serves as the raw text. */
private case class WrappedCashtagEntity(entity: CashtagEntity, position: Int)
    extends WrappedEntity(entity.fromIndex, entity.toIndex, entity.text) {

  /** Moves the wrapped cashtag entity to the given indices, keeping `position`. */
  override def shift(fromIndex: Short, toIndex: Short): WrappedCashtagEntity = {
    val moved = entity.copy(fromIndex = fromIndex, toIndex = toIndex)
    WrappedCashtagEntity(moved, position)
  }
}
}