the-algorithm/tweetypie/common/src/scala/com/twitter/tweetypie/storage/TweetUtils.scala

266 lines
10 KiB
Scala

package com.twitter.tweetypie.storage
import com.twitter.logging.Logger
import com.twitter.scrooge.TFieldBlob
import com.twitter.snowflake.id.SnowflakeId
import com.twitter.storage.client.manhattan.kv.DeniedManhattanException
import com.twitter.storage.client.manhattan.kv.ManhattanException
import com.twitter.tweetypie.storage.Response._
import com.twitter.tweetypie.storage_internal.thriftscala.StoredTweet
import com.twitter.util.Return
import com.twitter.util.Throw
import com.twitter.util.Try
object TweetUtils {
// Logger used by the helpers in this object. Note that it is registered under the
// "com.twitter.tweetypie.storage.TweetStorageLibrary" name rather than under TweetUtils.
val log: Logger = Logger("com.twitter.tweetypie.storage.TweetStorageLibrary")
import FieldResponseCodec.ValueNotFoundException
/**
 * Decides whether a stored tweet should be treated as a real tweet.
 *
 * On rare occasions tweets with userId=0 have been observed, most likely left behind by a
 * failed or partial delete. Such tweets are considered invalid, and invalid tweets are
 * returned to callers as not found.
 *
 * @param tweet the stored tweet to inspect
 * @return true only when userId is present and non-zero, and text, createdVia and
 *         createdAtSec are all non-empty
 */
def isValid(tweet: StoredTweet): Boolean = {
  val hasRealUser = tweet.userId.exists(id => id != 0)
  // Kept as a def so the remaining checks are only evaluated when the user check passes.
  def hasCoreContent =
    tweet.text.nonEmpty && tweet.createdVia.nonEmpty && tweet.createdAtSec.nonEmpty
  hasRealUser && hasCoreContent
}
/**
 * Collects the ids of scrubbed fields from the records returned by reading the entire
 * tweet prefix.
 *
 * Only records whose lkey is a ScrubbedFieldKey contribute; all other records are ignored.
 *
 * @param records The sequence of MH records for the given tweetId
 *
 * @return The set of scrubbed field ids
 */
private[tweetypie] def extractScrubbedFields(records: Seq[TweetManhattanRecord]): Set[Short] =
  records.flatMap { record =>
    record.lkey match {
      case TweetKey.LKey.ScrubbedFieldKey(fieldId) => Some(fieldId)
      case _ => None
    }
  }.toSet
// Every id in TweetFields.requiredFieldIds except TweetFields.tweetIdField.
// buildValidStoredTweet passes this set to StoredTweet.getFieldBlobs when deciding
// whether a rebuilt tweet should be returned.
private[tweetypie] val expectedFields =
TweetFields.requiredFieldIds.toSet - TweetFields.tweetIdField
/**
 * Find the creation timestamp for a tweetId, for use when you aren't sure that the tweetId
 * is a snowflake id.
 *
 * The timestamp is taken from the id itself when SnowflakeId can extract one. Otherwise the
 * core fields record is looked up in `records`, decoded, and its createdAtSec value is
 * converted from seconds to milliseconds.
 *
 * @param tweetId A tweetId you want the timestamp for.
 * @param records Tbird_mh records keyed on tweetId, one of which should be the
 *                core fields record.
 * @return A milliseconds timestamp if one could be found.
 */
private[tweetypie] def creationTimeFromTweetIdOrMHRecords(
  tweetId: Long,
  records: Seq[TweetManhattanRecord]
): Option[Long] = {
  // Fallback path; only evaluated when the id carries no timestamp (orElse is by-name).
  def fromCoreFieldsRecord: Option[Long] =
    records
      .find(record => record.lkey == TweetKey.LKey.CoreFieldsKey)
      .flatMap { coreRecord =>
        val blob = TFieldBlobCodec.fromByteBuffer(coreRecord.value.contents)
        CoreFieldsCodec.fromTFieldBlob(blob).createdAtSec.map(_ * 1000)
      }

  SnowflakeId.unixTimeMillisOptFromId(tweetId).orElse(fromCoreFieldsRecord)
}
/**
 * Turns the per-field manhattan results of a tweet into a TweetResponse.
 *
 * A field counts as successful when its result is a Return or a Throw carrying a
 * ValueNotFoundException. The overall code is then chosen as follows:
 *  - Success: there is at least one field and every field was successful
 *  - OverCapacity: otherwise, if any field failed with a DeniedManhattanException
 *  - Failure: otherwise, if no field was successful (this includes an empty map)
 *  - Partial: otherwise, i.e. some but not all fields were successful
 *
 * The per-field responses are produced by getFieldResponses, which also logs errors.
 *
 * @param callerName   The name of the caller function. Used for error messages
 * @param tweetId      Id of the Tweet for which TweetResponse is being built
 * @param fieldResults Map from field id to the result obtained for that field
 *
 * @return TweetResponse object
 */
private[tweetypie] def buildTweetResponse(
  callerName: String,
  tweetId: Long,
  fieldResults: Map[FieldId, Try[Unit]]
): TweetResponse = {
  // A value that was not found is treated the same as a successful result here.
  val successCount = fieldResults.values.count {
    case Return(_) => true
    case Throw(_: ValueNotFoundException) => true
    case _ => false
  }

  val fieldResponsesMap = getFieldResponses(callerName, tweetId, fieldResults)

  val everyFieldSucceeded = successCount > 0 && successCount == fieldResults.size

  // Only consulted when at least one field did not succeed.
  def anyFieldRateLimited: Boolean = fieldResults.values.exists {
    case Throw(_: DeniedManhattanException) => true
    case _ => false
  }

  val overallCode =
    if (everyFieldSucceeded) TweetResponseCode.Success
    // A single rate limited field makes the entire tweet count as rate limited.
    else if (anyFieldRateLimited) TweetResponseCode.OverCapacity
    // Nothing succeeded at all.
    else if (successCount == 0) TweetResponseCode.Failure
    // Some fields succeeded and some did not.
    else TweetResponseCode.Partial

  TweetResponse(tweetId, overallCode, Some(fieldResponsesMap))
}
/**
 * Converts per-field manhattan results into a Map[FieldId, FieldResponse].
 *
 * - A Return becomes a Success response with no message.
 * - A Throw carrying a ManhattanException is converted by FieldResponseCodec.fromThrowable.
 *   It is logged as an error unless it is a ValueNotFoundException.
 * - Any other Throw is logged as an error and becomes an Error response.
 *
 * @param callerName   name of the calling function, included in error messages
 * @param tweetId      id of the tweet the fields belong to, used to render the field key
 * @param fieldResults map from field id to the result obtained for that field
 *
 * @return one FieldResponse per entry of fieldResults
 */
private[tweetypie] def getFieldResponses(
  callerName: String,
  tweetId: TweetId,
  fieldResults: Map[FieldId, Try[_]]
): Map[FieldId, FieldResponse] =
  fieldResults.map {
    case (fieldId, outcome) =>
      // The key string is only rendered on the failure paths.
      def describe(cause: Throwable): String = {
        val keyStr = TweetKey.fieldKey(tweetId, fieldId).toString
        s"Exception in $callerName. Key: $keyStr. Error: $cause"
      }

      val response: FieldResponse = outcome match {
        case Throw(mhError: ManhattanException) =>
          val message = describe(mhError)
          val isMissingValue = mhError match {
            case _: ValueNotFoundException => true
            case _ => false
          }
          // ValueNotFound is not an error, so it is not logged.
          if (!isMissingValue) log.error(message)
          FieldResponseCodec.fromThrowable(mhError, Some(message))

        case Throw(other) =>
          val message = describe(other)
          log.error(message)
          FieldResponse(FieldResponseCode.Error, Some(message))

        case Return(_) =>
          FieldResponse(FieldResponseCode.Success, None)
      }

      fieldId -> response
  }
/**
 * Helper function to build a TweetResponse object when being rate limited. It's possible
 * that only some of the fields got rate limited, so the response still reports, per field,
 * which ones were processed successfully and which encountered some sort of error.
 *
 * The overall code is always OverCapacity, regardless of the individual field results.
 * The per-field responses are produced by getFieldResponses, which also logs errors.
 *
 * @param callerName     name of API calling this function. Used for error messages
 * @param tweetId        Tweet id
 * @param fieldResponses map from field id to the result obtained for that field
 *
 * @return The TweetResponse object
 */
private[tweetypie] def buildTweetOverCapacityResponse(
  callerName: String,
  tweetId: Long,
  fieldResponses: Map[FieldId, Try[Unit]]
): TweetResponse = {
  val fieldResponsesMap = getFieldResponses(callerName, tweetId, fieldResponses)
  TweetResponse(tweetId, TweetResponseCode.OverCapacity, Some(fieldResponsesMap))
}
/**
 * Assemble a StoredTweet from a Seq of records by setting every decoded field blob on an
 * initially empty StoredTweet(tweetId).
 *
 * The core fields get special treatment: the blob whose id is TweetFields.rootCoreFieldId
 * holds a serialized/packed version of all core fields, so it is unpacked into its
 * individual TFieldBlobs before being applied.
 *
 * No error is raised here when the core fields are absent. Callers are expected to treat a
 * tweet without core field values as not found.
 *
 * @param tweetId         id of the tweet being built
 * @param records         records read for this tweet
 * @param includeScrubbed forwarded to getStoredTweetBlobs; when false, scrubbed fields are
 *                        left out
 */
private[tweetypie] def buildStoredTweet(
  tweetId: TweetId,
  records: Seq[TweetManhattanRecord],
  includeScrubbed: Boolean = false,
): StoredTweet = {
  val storedBlobs = getStoredTweetBlobs(records, includeScrubbed)

  val individualBlobs = storedBlobs.flatMap {
    // The root core field is a container: expand it into one blob per core field.
    case packed if packed.id == TweetFields.rootCoreFieldId =>
      CoreFieldsCodec.unpackFields(packed).values.toSeq
    case plain =>
      Seq(plain)
  }

  individualBlobs.foldLeft(StoredTweet(tweetId)) { (tweet, blob) =>
    tweet.setField(blob)
  }
}
/**
 * Build a StoredTweet from the given records (scrubbed fields excluded) and return it only
 * if it passes validation.
 *
 * The tweet is kept when getFieldBlobs(expectedFields) yields a non-empty result and
 * isValid accepts it; otherwise None is returned.
 *
 * @param tweetId id of the tweet being built
 * @param records records read for this tweet
 * @return the stored tweet, or None when it fails either check
 */
private[tweetypie] def buildValidStoredTweet(
  tweetId: TweetId,
  records: Seq[TweetManhattanRecord]
): Option[StoredTweet] =
  Some(buildStoredTweet(tweetId, records)).filter { candidate =>
    candidate.getFieldBlobs(expectedFields).nonEmpty && isValid(candidate)
  }
/**
 * Decode a TFieldBlob for each StoredTweet field defined in this set of records.
 *
 * Only records keyed by an LKey.FieldKey are considered; every other record is skipped.
 *
 * A VersionMismatchError raised while decoding is logged and then rethrown. An
 * AssertionError is thrown when a decoded blob's field id differs from the field id in its
 * key.
 *
 * @param records         records read for a tweet
 * @param includeScrubbed when false, result will not include scrubbed fields even
 *                        if the data is present in the set of records.
 * @return the decoded blobs, in the order of their records
 */
private[tweetypie] def getStoredTweetBlobs(
  records: Seq[TweetManhattanRecord],
  includeScrubbed: Boolean = false,
): Seq[TFieldBlob] = {
  val scrubbedIds = extractScrubbedFields(records)

  records.flatMap { record =>
    record.key match {
      // Field records are decoded unless their field is scrubbed and was not asked for.
      case tweetKey @ TweetKey(_, fieldKey: TweetKey.LKey.FieldKey)
          if includeScrubbed || !scrubbedIds.contains(fieldKey.fieldId) =>
        try {
          val decoded = TFieldBlobCodec.fromByteBuffer(record.value.contents)
          val storedId = decoded.field.id
          if (storedId == fieldKey.fieldId) {
            Some(decoded)
          } else {
            throw new AssertionError(
              s"Blob stored for $tweetKey has unexpected id $storedId"
            )
          }
        } catch {
          case mismatch: VersionMismatchError =>
            log.error(
              s"Failed to decode bytebuffer for $tweetKey: ${mismatch.getMessage}"
            )
            throw mismatch
        }

      case _ =>
        None
    }
  }
}
/**
 * Collapse a sequence of tries into a single one, giving priority to rate limiting.
 *
 * It's important to bubble up rate limiting exceptions as they would likely be the root
 * cause for other issues (timeouts etc.), so the sequence is scanned for this particular
 * exception first and, if found, that Throw is returned as is.
 *
 * @param seqOfTries The sequence of tries which may contain within it a rate limit exception
 *
 * @return the first Throw carrying a DeniedManhattanException if there is one; otherwise
 *         the outcome of Try.collect over the sequence, with its value replaced by Unit
 */
private[tweetypie] def collectWithRateLimitCheck(seqOfTries: Seq[Try[Unit]]): Try[Unit] = {
  def isRateLimited(result: Try[Unit]): Boolean = result match {
    case Throw(_: DeniedManhattanException) => true
    case _ => false
  }

  seqOfTries.find(isRateLimited) match {
    case Some(rateLimited) =>
      rateLimited
    case None =>
      // Without rate limiting, defer to Try.collect to combine the individual tries.
      Try.collect(seqOfTries).map(_ => ())
  }
}
}