the-algorithm/tweetypie/common/src/scala/com/twitter/tweetypie/storage/TweetUtils.scala

package com.twitter.tweetypie.storage

import com.twitter.logging.Logger
import com.twitter.scrooge.TFieldBlob
import com.twitter.snowflake.id.SnowflakeId
import com.twitter.storage.client.manhattan.kv.DeniedManhattanException
import com.twitter.storage.client.manhattan.kv.ManhattanException
import com.twitter.tweetypie.storage.Response._
import com.twitter.tweetypie.storage_internal.thriftscala.StoredTweet
import com.twitter.util.Return
import com.twitter.util.Throw
import com.twitter.util.Try

object TweetUtils {
  // Logger used by the helpers in this object (getFieldResponses, getStoredTweetBlobs).
  // Note that it is registered under the TweetStorageLibrary name, not under TweetUtils.
  val log: Logger = Logger("com.twitter.tweetypie.storage.TweetStorageLibrary")
  import FieldResponseCodec.ValueNotFoundException

  /**
   * Decides whether a stored tweet is usable.
   *
   * On rare occasions tweets with userId=0 have been observed, most likely left behind by a
   * failed or partial delete. Such tweets are considered invalid, and invalid tweets are
   * returned to callers as not found.
   *
   * @param tweet the stored tweet to check
   * @return true only when the user id is present and non-zero and the text, createdVia and
   *         createdAtSec fields are all non-empty
   */
  def isValid(tweet: StoredTweet): Boolean =
    tweet.userId match {
      // A real (non-zero) author is a precondition for looking at anything else.
      case Some(authorId) if authorId != 0 =>
        tweet.text.nonEmpty && tweet.createdVia.nonEmpty && tweet.createdAtSec.nonEmpty
      case _ =>
        false
    }

  /**
   * Collects the ids of the scrubbed fields found among the records read for a tweet
   * (typically the result of reading the entire tweet prefix).
   *
   * @param records The sequence of MH records for the given tweetId
   *
   * @return The set of field ids taken from every record whose lkey is a ScrubbedFieldKey
   */
  private[tweetypie] def extractScrubbedFields(records: Seq[TweetManhattanRecord]): Set[Short] =
    records.flatMap { record =>
      record.lkey match {
        case TweetKey.LKey.ScrubbedFieldKey(scrubbedId) => Some(scrubbedId)
        case _ => None
      }
    }.toSet

  // The required tweet field ids with the tweet id field itself removed.
  // buildValidStoredTweet uses this set to check that a stored tweet carries data for
  // at least one of these fields.
  private[tweetypie] val expectedFields =
    TweetFields.requiredFieldIds.toSet - TweetFields.tweetIdField

  /**
   * Determines a creation timestamp for a tweet when it is not certain that the tweetId is a
   * snowflake id.
   *
   * The timestamp embedded in the id is preferred. Only when the id yields none is the core
   * fields record looked up and decoded, and its createdAtSec converted to milliseconds.
   *
   * @param tweetId A tweetId you want the timestamp for.
   * @param records Tbird_mh records keyed on tweetId, one of which should be the
   * core fields record.
   * @return A milliseconds timestamp if one could be found.
   */
  private[tweetypie] def creationTimeFromTweetIdOrMHRecords(
    tweetId: Long,
    records: Seq[TweetManhattanRecord]
  ): Option[Long] = {
    // Fallback path: evaluated lazily, i.e. only if the id carries no timestamp.
    def millisFromCoreFields: Option[Long] =
      for {
        coreRecord <- records.find(_.lkey == TweetKey.LKey.CoreFieldsKey)
        createdAtSec <- CoreFieldsCodec
          .fromTFieldBlob(TFieldBlobCodec.fromByteBuffer(coreRecord.value.contents))
          .createdAtSec
      } yield createdAtSec * 1000

    SnowflakeId.unixTimeMillisOptFromId(tweetId).orElse(millisFromCoreFields)
  }

  /**
   * Builds the TweetResponse for a tweet from the manhattan results of its individual fields.
   *
   * A field counts as successful when its result is a Return or a Throw carrying a
   * ValueNotFoundException. The overall response code is chosen by the first rule that applies:
   *  - Success: there is at least one field and every field was successful
   *  - OverCapacity: some field failed with a DeniedManhattanException (rate limited)
   *  - Failure: no field was successful (this is also the outcome for an empty fieldResults)
   *  - Partial: some, but not all, fields were successful
   *
   * @param callerName The name of the caller function. Used for error messages
   * @param tweetId Id of the Tweet for which TweetResponse is being built
   * @param fieldResults Outcome of the manhattan operation for each field id
   *
   * @return TweetResponse object carrying the overall code and the per-field responses
   */
  private[tweetypie] def buildTweetResponse(
    callerName: String,
    tweetId: Long,
    fieldResults: Map[FieldId, Try[Unit]]
  ): TweetResponse = {
    val outcomes = fieldResults.values

    // A missing value is not an error, so it is tallied together with the plain successes.
    val succeeded = outcomes.count {
      case Return(_) | Throw(_: ValueNotFoundException) => true
      case _ => false
    }

    val fieldResponsesMap = getFieldResponses(callerName, tweetId, fieldResults)

    // One rate limited field is enough to consider the entire tweet rate limited.
    def rateLimited: Boolean = outcomes.exists {
      case Throw(_: DeniedManhattanException) => true
      case _ => false
    }

    val overallCode =
      if (succeeded > 0 && succeeded == fieldResults.size) TweetResponseCode.Success
      else if (rateLimited) TweetResponseCode.OverCapacity
      else if (succeeded == 0) TweetResponseCode.Failure
      else TweetResponseCode.Partial

    TweetResponse(tweetId, overallCode, Some(fieldResponsesMap))
  }

  /**
   * Converts the per-field manhattan results of a tweet into a Map[FieldId, FieldResponse].
   *
   * A Return becomes a Success response. A Throw becomes a response carrying an error message
   * that names the caller, the field key and the exception: ManhattanExceptions are translated
   * by FieldResponseCodec.fromThrowable, anything else becomes an Error response. Failures are
   * logged at error level, except for ValueNotFoundException.
   *
   * @param callerName The name of the caller function. Used for error messages
   * @param tweetId Id of the tweet the fields belong to
   * @param fieldResults Outcome of the manhattan operation for each field id
   *
   * @return One FieldResponse per entry of fieldResults, keyed by the same field id
   */
  private[tweetypie] def getFieldResponses(
    callerName: String,
    tweetId: TweetId,
    fieldResults: Map[FieldId, Try[_]]
  ): Map[FieldId, FieldResponse] = {
    // Only invoked for failed fields, so the key is never rendered on the success path.
    def failureMessage(fieldId: FieldId, cause: Throwable): String = {
      val renderedKey = TweetKey.fieldKey(tweetId, fieldId).toString
      s"Exception in $callerName. Key: $renderedKey. Error: $cause"
    }

    fieldResults.map {
      case (fieldId, outcome) =>
        val response = outcome match {
          case Return(_) =>
            FieldResponse(FieldResponseCode.Success, None)

          case Throw(mhException: ManhattanException) =>
            val message = failureMessage(fieldId, mhException)
            mhException match {
              case _: ValueNotFoundException => // ValueNotFound is not an error
              case _ => log.error(message)
            }
            FieldResponseCodec.fromThrowable(mhException, Some(message))

          case Throw(other) =>
            val message = failureMessage(fieldId, other)
            log.error(message)
            FieldResponse(FieldResponseCode.Error, Some(message))
        }
        fieldId -> response
    }
  }

  /**
   * Helper function to build a TweetResponse object when being rate limited. It's possible that
   * only some of the fields got rate limited, so the response indicates which fields got
   * processed successfully, and which encountered some sort of error.
   *
   * The overall code is always OverCapacity, regardless of the individual field results.
   *
   * @param callerName name of API calling this function. Used for error messages
   * @param tweetId Tweet id
   * @param fieldResponses outcome of the operation for each field id; every entry is converted
   *                       into a FieldResponse by getFieldResponses
   *
   * @return The TweetResponse object
   */
  private[tweetypie] def buildTweetOverCapacityResponse(
    callerName: String,
    tweetId: Long,
    fieldResponses: Map[FieldId, Try[Unit]]
  ): TweetResponse = {
    val fieldResponsesMap = getFieldResponses(callerName, tweetId, fieldResponses)
    TweetResponse(tweetId, TweetResponseCode.OverCapacity, Some(fieldResponsesMap))
  }

  /**
   * Assembles a StoredTweet from the manhattan records read for a tweet.
   *
   * Every blob returned by getStoredTweetBlobs is set on the tweet. The blob stored under
   * TweetFields.rootCoreFieldId gets special treatment: its value is a serialized/packed version
   * of all core fields, so it is unpacked into one TFieldBlob per core field first.
   *
   * No error is raised here when the core fields are absent. The caller is expected to handle
   * that condition (i.e. if it finds no values for the core fields, it should assume that the
   * tweet is not found).
   *
   * @param tweetId id given to the resulting StoredTweet
   * @param records the manhattan records to build the tweet from
   * @param includeScrubbed passed on to getStoredTweetBlobs
   */
  private[tweetypie] def buildStoredTweet(
    tweetId: TweetId,
    records: Seq[TweetManhattanRecord],
    includeScrubbed: Boolean = false,
  ): StoredTweet = {
    val fieldBlobs = getStoredTweetBlobs(records, includeScrubbed).flatMap {
      case packedCore if packedCore.id == TweetFields.rootCoreFieldId =>
        CoreFieldsCodec.unpackFields(packedCore).values.toSeq
      case single =>
        Seq(single)
    }

    fieldBlobs.foldLeft(StoredTweet(tweetId)) { (tweet, blob) => tweet.setField(blob) }
  }

  /**
   * Builds a StoredTweet from the records (scrubbed fields are not included) and returns it
   * only when it is usable.
   *
   * @param tweetId id of the tweet to build
   * @param records the manhattan records to build the tweet from
   * @return the tweet when it has a blob for at least one of the expectedFields and passes
   *         isValid; None otherwise
   */
  private[tweetypie] def buildValidStoredTweet(
    tweetId: TweetId,
    records: Seq[TweetManhattanRecord]
  ): Option[StoredTweet] =
    Some(buildStoredTweet(tweetId, records)).filter { candidate =>
      candidate.getFieldBlobs(expectedFields).nonEmpty && isValid(candidate)
    }

  /**
   * Return a TFieldBlob for each StoredTweet field defined in this set of records.
   *
   * Only records keyed by an LKey.FieldKey are decoded; all other records are skipped. A
   * decoded blob whose field id differs from the id in its key raises an AssertionError. A
   * VersionMismatchError raised while decoding is logged and then rethrown.
   *
   * @param records the manhattan records to extract the blobs from
   * @param includeScrubbed when false, result will not include scrubbed fields even
   *                        if the data is present in the set of records.
   */
  private[tweetypie] def getStoredTweetBlobs(
    records: Seq[TweetManhattanRecord],
    includeScrubbed: Boolean = false,
  ): Seq[TFieldBlob] = {
    val scrubbedIds = extractScrubbedFields(records)

    records.flatMap { record =>
      record.key match {
        case fullKey @ TweetKey(_, fieldKey: TweetKey.LKey.FieldKey) =>
          val wanted = includeScrubbed || !scrubbedIds.contains(fieldKey.fieldId)
          if (!wanted) {
            // Scrubbed field that the caller did not ask for.
            None
          } else {
            try {
              val blob = TFieldBlobCodec.fromByteBuffer(record.value.contents)
              if (blob.field.id == fieldKey.fieldId) {
                Some(blob)
              } else {
                throw new AssertionError(
                  s"Blob stored for $fullKey has unexpected id ${blob.field.id}"
                )
              }
            } catch {
              case e: VersionMismatchError =>
                log.error(
                  s"Failed to decode bytebuffer for $fullKey: ${e.getMessage}"
                )
                throw e
            }
          }
        case _ => None
      }
    }
  }

  /**
   * Collapses a sequence of tries into a single one, giving rate limiting failures priority.
   *
   * It's important to bubble up rate limiting exceptions, as they would likely be the root cause
   * of other issues (timeouts etc.). So the sequence is scanned for one first and, if found, that
   * particular failure is the one returned.
   *
   * @param seqOfTries The sequence of tries which may contain within it a rate limit exception
   *
   * @return the first Throw(e: DeniedManhattanException) of the sequence if there is one;
   *         otherwise the outcome of Try.collect over the sequence with its value discarded,
   *         so the operation is considered successful only if all individual tries succeeded
   */
  private[tweetypie] def collectWithRateLimitCheck(seqOfTries: Seq[Try[Unit]]): Try[Unit] = {
    val firstDenied: Option[Try[Unit]] = seqOfTries.collectFirst {
      case denied @ Throw(_: DeniedManhattanException) => denied
    }

    firstDenied.getOrElse(Try.collect(seqOfTries).map(_ => ()))
  }
}