the-algorithm/tweetypie/common/src/scala/com/twitter/tweetypie/tflock/TFlockIndexer.scala

/** Copyright 2010 Twitter, Inc. */
package com.twitter.tweetypie
package tflock

import com.twitter.finagle.stats.Counter
import com.twitter.flockdb.client._
import com.twitter.flockdb.client.thriftscala.Priority
import com.twitter.snowflake.id.SnowflakeId
import com.twitter.tweetypie.serverutil.StoredCard
import com.twitter.tweetypie.thriftscala._
import com.twitter.util.Future
import scala.collection.mutable.ListBuffer

object TFlockIndexer {

  /**
   * Printable names for some edge types currently defined in [[com.twitter.flockdb.client]].
   * Used to defined stats counters for adding edges.
   */
  val graphNames: Map[Int, String] =
    Map(
      CardTweetsGraph.id -> "card_tweets",
      ConversationGraph.id -> "conversation",
      DirectedAtUserIdGraph.id -> "directed_at_user_id",
      InvitedUsersGraph.id -> "invited_users",
      MediaTimelineGraph.id -> "media_timeline",
      MentionsGraph.id -> "mentions",
      NarrowcastSentTweetsGraph.id -> "narrowcast_sent_tweets",
      NullcastedTweetsGraph.id -> "nullcasted_tweets",
      QuotersGraph.id -> "quoters",
      QuotesGraph.id -> "quotes",
      QuoteTweetsIndexGraph.id -> "quote_tweets_index",
      RepliesToTweetsGraph.id -> "replies_to_tweets",
      RetweetsByMeGraph.id -> "retweets_by_me",
      RetweetsGraph.id -> "retweets",
      RetweetsOfMeGraph.id -> "retweets_of_me",
      RetweetSourceGraph.id -> "retweet_source",
      TweetsRetweetedGraph.id -> "tweets_retweeted",
      UserTimelineGraph.id -> "user_timeline",
      CreatorSubscriptionTimelineGraph.id -> "creator_subscription_timeline",
      CreatorSubscriptionMediaTimelineGraph.id -> "creator_subscription_image_timeline",
    )

  /**
   * On edge deletion, edges are either archived permanently or retained for 3 months, based on
   * the retention policy in the above confluence page.
   *
   * These two retention policies correspond to the two deletion techniques: archive and remove.
   * We call removeEdges for edges with a short retention policy and archiveEdges for edges with
   * a permanent retention policy.
   */
  val graphsWithRemovedEdges: Seq[Int] =
    Seq(
      CardTweetsGraph.id,
      CuratedTimelineGraph.id,
      CuratedTweetsGraph.id,
      DirectedAtUserIdGraph.id,
      MediaTimelineGraph.id,
      MutedConversationsGraph.id,
      QuotersGraph.id,
      QuotesGraph.id,
      QuoteTweetsIndexGraph.id,
      ReportedTweetsGraph.id,
      RetweetsOfMeGraph.id,
      RetweetSourceGraph.id,
      SoftLikesGraph.id,
      TweetsRetweetedGraph.id,
      CreatorSubscriptionTimelineGraph.id,
      CreatorSubscriptionMediaTimelineGraph.id,
    )

  /**
   * These edges should be left in place when bounced tweets are deleted.
   * These edges are removed during hard deletion.
   *
   * This is done so external teams (timelines) can execute on these edges for
   * tombstone feature.
   */
  val bounceDeleteGraphIds: Set[Int] =
    Set(
      UserTimelineGraph.id,
      ConversationGraph.id
    )

  def makeCounters(stats: StatsReceiver, operation: String): Map[Int, Counter] = {
    TFlockIndexer.graphNames
      .mapValues(stats.scope(_).counter(operation))
      .withDefaultValue(stats.scope("unknown").counter(operation))
  }
}

/**
 * @param backgroundIndexingPriority specifies the queue to use for
 *   background indexing operations. This is useful for making the
 *   effects of background indexing operations (such as deleting edges
 *   for deleted Tweets) available sooner in testing scenarios
 *   (end-to-end tests or development instances). It is set to
 *   Priority.Low in production to reduce the load on high priority
 *   queues that we use for prominently user-visible operations.
 */
class TFlockIndexer(
  tflock: TFlockClient,
  hasMedia: Tweet => Boolean,
  backgroundIndexingPriority: Priority,
  stats: StatsReceiver)
    extends TweetIndexer {
  private[this] val FutureNil = Future.Nil

  private[this] val archiveCounters = TFlockIndexer.makeCounters(stats, "archive")
  private[this] val removeCounters = TFlockIndexer.makeCounters(stats, "remove")
  private[this] val insertCounters = TFlockIndexer.makeCounters(stats, "insert")
  private[this] val negateCounters = TFlockIndexer.makeCounters(stats, "negate")

  private[this] val foregroundIndexingPriority: Priority = Priority.High

  override def createIndex(tweet: Tweet): Future[Unit] =
    createEdges(tweet, isUndelete = false)

  override def undeleteIndex(tweet: Tweet): Future[Unit] =
    createEdges(tweet, isUndelete = true)

  private[this] case class PartitionedEdges(
    longRetention: Seq[ExecuteEdge[StatusGraph]] = Nil,
    shortRetention: Seq[ExecuteEdge[StatusGraph]] = Nil,
    negate: Seq[ExecuteEdge[StatusGraph]] = Nil,
    ignore: Seq[ExecuteEdge[StatusGraph]] = Nil)

  private[this] def partitionEdgesForDelete(
    edges: Seq[ExecuteEdge[StatusGraph]],
    isBounceDelete: Boolean
  ) =
    edges.foldLeft(PartitionedEdges()) {
      // Two dependees of UserTimelineGraph edge states to satisfy: timelines & safety tools.
      // Timelines show bounce-deleted tweets as tombstones; regular deletes are not shown.
      //   - i.e. timelineIds = UserTimelineGraph(Normal || Negative)
      // Safety tools show deleted tweets to authorized internal review agents
      //   - i.e. deletedIds = UserTimelineGraph(Removed || Negative)
      case (partitionedEdges, edge) if isBounceDelete && edge.graphId == UserTimelineGraph.id =>
        partitionedEdges.copy(negate = edge +: partitionedEdges.negate)

      case (partitionedEdges, edge) if isBounceDelete && edge.graphId == ConversationGraph.id =>
        // Bounce-deleted tweets remain rendered as tombstones in conversations, so do not modify
        // the ConversationGraph edge state
        partitionedEdges.copy(ignore = edge +: partitionedEdges.ignore)

      case (partitionedEdges, edge)
          if TFlockIndexer.graphsWithRemovedEdges.contains(edge.graphId) =>
        partitionedEdges.copy(shortRetention = edge +: partitionedEdges.shortRetention)

      case (partitionedEdges, edge) =>
        partitionedEdges.copy(longRetention = edge +: partitionedEdges.longRetention)
    }

  override def deleteIndex(tweet: Tweet, isBounceDelete: Boolean): Future[Unit] =
    for {
      edges <- getEdges(tweet, isCreate = false, isDelete = true, isUndelete = false)
      partitionedEdges = partitionEdgesForDelete(edges, isBounceDelete)
      () <-
        Future
          .join(
            tflock
              .archiveEdges(partitionedEdges.longRetention, backgroundIndexingPriority)
              .onSuccess(_ =>
                partitionedEdges.longRetention.foreach(e => archiveCounters(e.graphId).incr())),
            tflock
              .removeEdges(partitionedEdges.shortRetention, backgroundIndexingPriority)
              .onSuccess(_ =>
                partitionedEdges.shortRetention.foreach(e => removeCounters(e.graphId).incr())),
            tflock
              .negateEdges(partitionedEdges.negate, backgroundIndexingPriority)
              .onSuccess(_ =>
                partitionedEdges.negate.foreach(e => negateCounters(e.graphId).incr()))
          )
          .unit
    } yield ()

  /**
   * This operation is called when a user is put into or taken out of
   * a state in which their retweets should no longer be visible
   * (e.g. suspended or ROPO).
   */
  override def setRetweetVisibility(retweetId: TweetId, setVisible: Boolean): Future[Unit] = {
    val retweetEdge = Seq(ExecuteEdge(retweetId, RetweetsGraph, None, Reverse))

    if (setVisible) {
      tflock
        .insertEdges(retweetEdge, backgroundIndexingPriority)
        .onSuccess(_ => insertCounters(RetweetsGraph.id).incr())
    } else {
      tflock
        .archiveEdges(retweetEdge, backgroundIndexingPriority)
        .onSuccess(_ => archiveCounters(RetweetsGraph.id).incr())
    }
  }

  private[this] def createEdges(tweet: Tweet, isUndelete: Boolean): Future[Unit] =
    for {
      edges <- getEdges(tweet = tweet, isCreate = true, isDelete = false, isUndelete = isUndelete)
      () <- tflock.insertEdges(edges, foregroundIndexingPriority)
    } yield {
      // Count all the edges we've successfully added:
      edges.foreach(e => insertCounters(e.graphId).incr())
    }

  private[this] def addRTEdges(
    tweet: Tweet,
    share: Share,
    isCreate: Boolean,
    edges: ListBuffer[ExecuteEdge[StatusGraph]],
    futureEdges: ListBuffer[Future[Seq[ExecuteEdge[StatusGraph]]]]
  ): Unit = {

    edges += RetweetsOfMeGraph.edge(share.sourceUserId, tweet.id)
    edges += RetweetsByMeGraph.edge(getUserId(tweet), tweet.id)
    edges += RetweetsGraph.edge(share.sourceStatusId, tweet.id)

    if (isCreate) {
      edges += ExecuteEdge(
        sourceId = getUserId(tweet),
        graph = RetweetSourceGraph,
        destinationIds = Some(Seq(share.sourceStatusId)),
        direction = Forward,
        position = Some(SnowflakeId(tweet.id).time.inMillis)
      )
      edges.append(TweetsRetweetedGraph.edge(share.sourceUserId, share.sourceStatusId))
    } else {
      edges += RetweetSourceGraph.edge(getUserId(tweet), share.sourceStatusId)

      // if this is the last retweet we need to remove it from the source user's
      // tweets retweeted graph
      futureEdges.append(
        tflock.count(RetweetsGraph.from(share.sourceStatusId)).flatMap { count =>
          if (count <= 1) {
            tflock.selectAll(RetweetsGraph.from(share.sourceStatusId)).map { tweets =>
              if (tweets.size <= 1)
                Seq(TweetsRetweetedGraph.edge(share.sourceUserId, share.sourceStatusId))
              else
                Nil
            }
          } else {
            FutureNil
          }
        }
      )
    }
  }

  private[this] def addReplyEdges(
    tweet: Tweet,
    edges: ListBuffer[ExecuteEdge[StatusGraph]]
  ): Unit = {
    getReply(tweet).foreach { reply =>
      reply.inReplyToStatusId.flatMap { inReplyToStatusId =>
        edges += RepliesToTweetsGraph.edge(inReplyToStatusId, tweet.id)

        // only index conversationId if this is a reply to another tweet
        TweetLenses.conversationId.get(tweet).map { conversationId =>
          edges += ConversationGraph.edge(conversationId, tweet.id)
        }
      }
    }
  }

  private[this] def addDirectedAtEdges(
    tweet: Tweet,
    edges: ListBuffer[ExecuteEdge[StatusGraph]]
  ): Unit = {
    TweetLenses.directedAtUser.get(tweet).foreach { directedAtUser =>
      edges += DirectedAtUserIdGraph.edge(directedAtUser.userId, tweet.id)
    }
  }

  private[this] def addMentionEdges(
    tweet: Tweet,
    edges: ListBuffer[ExecuteEdge[StatusGraph]]
  ): Unit = {
    getMentions(tweet)
      .flatMap(_.userId).foreach { mention =>
        edges += MentionsGraph.edge(mention, tweet.id)
      }
  }

  private[this] def addQTEdges(
    tweet: Tweet,
    edges: ListBuffer[ExecuteEdge[StatusGraph]],
    futureEdges: ListBuffer[Future[Seq[ExecuteEdge[StatusGraph]]]],
    isCreate: Boolean
  ): Unit = {
    val userId = getUserId(tweet)

    tweet.quotedTweet.foreach { quotedTweet =>
      // Regardless of tweet creates/deletes, we add the corresponding edges to the
      // following two graphs. Note that we're handling the case for
      // the QuotersGraph slightly differently in the tweet delete case.
      edges.append(QuotesGraph.edge(quotedTweet.userId, tweet.id))
      edges.append(QuoteTweetsIndexGraph.edge(quotedTweet.tweetId, tweet.id))
      if (isCreate) {
        // As mentioned above, for tweet creates we go ahead and add an edge
        // to the QuotersGraph without any additional checks.
        edges.append(QuotersGraph.edge(quotedTweet.tweetId, userId))
      } else {
        // For tweet deletes, we only add an edge to be deleted from the
        // QuotersGraph if the tweeting user isn't quoting the tweet anymore
        // i.e. if a user has quoted a tweet multiple times, we only delete
        // an edge from the QuotersGraph if they've deleted all the quotes,
        // otherwise an edge should exist by definition of what the QuotersGraph
        // represents.

        // Note: There can be a potential edge case here due to a race condition
        // in the following scenario.
        // i)   A quotes a tweet T twice resulting in tweets T1 and T2.
        // ii)  There should exist edges in the QuotersGraph from T -> A and T1 <-> T, T2 <-> T in
        //      the QuoteTweetsIndexGraph, but one of the edges haven't been written
        //      to the QuoteTweetsIndex graph in TFlock yet.
        // iii) In this scenario, we shouldn't really be deleting an edge as we're doing below.
        // The approach that we're taking below is a "best effort" approach similar to what we
        // currently do for RTs.

        // Find all the quotes of the quoted tweet from the quoting user
        val quotesFromQuotingUser = QuoteTweetsIndexGraph
          .from(quotedTweet.tweetId)
          .intersect(UserTimelineGraph.from(userId))
        futureEdges.append(
          tflock
            .count(quotesFromQuotingUser).flatMap { count =>
              // If this is the last quote of the quoted tweet from the quoting user,
              // we go ahead and delete the edge from the QuotersGraph.
              if (count <= 1) {
                tflock.selectAll(quotesFromQuotingUser).map { tweets =>
                  if (tweets.size <= 1) {
                    Seq(QuotersGraph.edge(quotedTweet.tweetId, userId))
                  } else {
                    Nil
                  }
                }
              } else {
                FutureNil
              }
            }
        )
      }
    }
  }

  private[this] def addCardEdges(
    tweet: Tweet,
    edges: ListBuffer[ExecuteEdge[StatusGraph]]
  ): Unit = {
    // Note that we are indexing only the TOO "stored" cards
    // (cardUri=card://<cardId>). Rest of the cards are ignored here.
    tweet.cardReference
      .collect {
        case StoredCard(id) =>
          edges.append(CardTweetsGraph.edge(id, tweet.id))
      }.getOrElse(())
  }

  // Note: on undelete, this method restores all archived edges, including those that may have
  // been archived prior to the delete. This is incorrect behavior but in practice rarely
  // causes problems, as undeletes are so rare.
  private[this] def addEdgesForDeleteOrUndelete(
    tweet: Tweet,
    edges: ListBuffer[ExecuteEdge[StatusGraph]]
  ): Unit = {
    edges.appendAll(
      Seq(
        MentionsGraph.edges(tweet.id, None, Reverse),
        RepliesToTweetsGraph.edges(tweet.id, None)
      )
    )

    // When we delete or undelete a conversation control root Tweet we want to archive or restore
    // all the edges in InvitedUsersGraph from the Tweet id.
    if (hasConversationControl(tweet) && isConversationRoot(tweet)) {
      edges.append(InvitedUsersGraph.edges(tweet.id, None))
    }
  }

  private[this] def addSimpleEdges(
    tweet: Tweet,
    edges: ListBuffer[ExecuteEdge[StatusGraph]]
  ): Unit = {
    if (TweetLenses.nullcast.get(tweet)) {
      edges.append(NullcastedTweetsGraph.edge(getUserId(tweet), tweet.id))
    } else if (TweetLenses.narrowcast.get(tweet).isDefined) {
      edges.append(NarrowcastSentTweetsGraph.edge(getUserId(tweet), tweet.id))
    } else {
      edges.append(UserTimelineGraph.edge(getUserId(tweet), tweet.id))

      if (hasMedia(tweet))
        edges.append(MediaTimelineGraph.edge(getUserId(tweet), tweet.id))

      // Index root creator subscription tweets.
      // Ignore replies because those are not necessarily visible to a user who subscribes to tweet author
      val isRootTweet: Boolean = tweet.coreData match {
        case Some(c) => c.reply.isEmpty && c.share.isEmpty
        case None => true
      }

      if (tweet.exclusiveTweetControl.isDefined && isRootTweet) {
        edges.append(CreatorSubscriptionTimelineGraph.edge(getUserId(tweet), tweet.id))

        if (hasMedia(tweet))
          edges.append(CreatorSubscriptionMediaTimelineGraph.edge(getUserId(tweet), tweet.id))
      }
    }
  }

  /**
   * Issues edges for each mention of user in a conversation-controlled tweet. This way InvitedUsers
   * graph accumulates complete set of ids for @mention-invited users, by conversation id.
   */
  private def invitedUsersEdgesForCreate(
    tweet: Tweet,
    edges: ListBuffer[ExecuteEdge[StatusGraph]]
  ): Unit = {
    val conversationId: Long = getConversationId(tweet).getOrElse(tweet.id)
    val mentions: Seq[UserId] = getMentions(tweet).flatMap(_.userId)
    edges.appendAll(mentions.map(userId => InvitedUsersGraph.edge(conversationId, userId)))
  }

  /**
   * Issues edges of InviteUsersGraph that ought to be deleted for a conversation controlled reply.
   * These are mentions of users in the given tweet, only if the user was not mentioned elsewhere
   * in the conversation. This way for a conversation, InvitedUsersGraph would always hold a set
   * of all users invited to the conversation, and an edge is removed only after the last mention of
   * a user is deleted.
   */
  private def invitedUsersEdgesForDelete(
    tweet: Tweet,
    futureEdges: ListBuffer[Future[Seq[ExecuteEdge[StatusGraph]]]]
  ): Unit = {
    getConversationId(tweet).foreach { conversationId: Long =>
      val mentions: Seq[UserId] = getMentions(tweet).flatMap(_.userId)
      mentions.foreach { userId =>
        val tweetIdsWithinConversation = ConversationGraph.from(conversationId)
        val tweetIdsThatMentionUser = MentionsGraph.from(userId)
        futureEdges.append(
          tflock
            .selectAll(
              query = tweetIdsThatMentionUser.intersect(tweetIdsWithinConversation),
              limit = Some(2), // Just need to know if it is >1 or <=1, so 2 are enough.
              pageSize = None // Provide default, otherwise Mockito complains
            ).map { tweetIds: Seq[Long] =>
              if (tweetIds.size <= 1) {
                Seq(InvitedUsersGraph.edge(conversationId, userId))
              } else {
                Nil
              }
            }
        )
      }
    }
  }

  private def hasInviteViaMention(tweet: Tweet): Boolean = {
    tweet.conversationControl match {
      case Some(ConversationControl.ByInvitation(controls)) =>
        controls.inviteViaMention.getOrElse(false)
      case Some(ConversationControl.Community(controls)) =>
        controls.inviteViaMention.getOrElse(false)
      case Some(ConversationControl.Followers(followers)) =>
        followers.inviteViaMention.getOrElse(false)
      case _ =>
        false
    }
  }

  private def hasConversationControl(tweet: Tweet): Boolean =
    tweet.conversationControl.isDefined

  // If a Tweet has a ConversationControl, it must have a ConversationId associated with it so we
  // can compare the ConversationId with the current Tweet ID to determine if it's the root of the
  // conversation. See ConversationIdHydrator for more details
  private def isConversationRoot(tweet: Tweet): Boolean =
    getConversationId(tweet).get == tweet.id

  private def addInvitedUsersEdges(
    tweet: Tweet,
    isCreate: Boolean,
    isUndelete: Boolean,
    edges: ListBuffer[ExecuteEdge[StatusGraph]],
    futureEdges: ListBuffer[Future[Seq[ExecuteEdge[StatusGraph]]]]
  ): Unit = {
    if (hasConversationControl(tweet)) {
      if (isCreate) {
        if (isConversationRoot(tweet) && !isUndelete) {
          // For root Tweets, only add edges for original creates, not for undeletes.
          // Undeletes are handled by addEdgesForDeleteOrUndelete.
          invitedUsersEdgesForCreate(tweet, edges)
        }
        if (!isConversationRoot(tweet) && hasInviteViaMention(tweet)) {
          // For replies, only add edges when the conversation control is in inviteViaMention mode.
          invitedUsersEdgesForCreate(tweet, edges)
        }
      } else {
        if (!isConversationRoot(tweet)) {
          invitedUsersEdgesForDelete(tweet, futureEdges)
        }
      }
    }
  }

  private[this] def getEdges(
    tweet: Tweet,
    isCreate: Boolean,
    isDelete: Boolean,
    isUndelete: Boolean
  ): Future[Seq[ExecuteEdge[StatusGraph]]] = {
    val edges = ListBuffer[ExecuteEdge[StatusGraph]]()
    val futureEdges = ListBuffer[Future[Seq[ExecuteEdge[StatusGraph]]]]()

    addSimpleEdges(tweet, edges)
    getShare(tweet) match {
      case Some(share) => addRTEdges(tweet, share, isCreate, edges, futureEdges)
      case _ =>
        addInvitedUsersEdges(tweet, isCreate, isUndelete, edges, futureEdges)
        addReplyEdges(tweet, edges)
        addDirectedAtEdges(tweet, edges)
        addMentionEdges(tweet, edges)
        addQTEdges(tweet, edges, futureEdges, isCreate)
        addCardEdges(tweet, edges)
        if (isDelete || isUndelete) {
          addEdgesForDeleteOrUndelete(tweet, edges)
        }
    }

    Future
      .collect(futureEdges)
      .map { moreEdges => (edges ++= moreEdges.flatten).toList }
  }
}