the-algorithm/tweetypie/common/src/scala/com/twitter/tweetypie/storage/Codecs.scala

243 lines
11 KiB
Scala

package com.twitter.tweetypie.storage
import com.twitter.bijection.Conversion.asMethod
import com.twitter.bijection.Injection
import com.twitter.scrooge.TFieldBlob
import com.twitter.storage.client.manhattan.kv._
import com.twitter.tweetypie.storage.Response.FieldResponse
import com.twitter.tweetypie.storage.Response.FieldResponseCode
import com.twitter.tweetypie.storage_internal.thriftscala.CoreFields
import com.twitter.tweetypie.storage_internal.thriftscala.InternalTweet
import com.twitter.tweetypie.storage_internal.thriftscala.StoredTweet
import java.io.ByteArrayOutputStream
import java.nio.ByteBuffer
import org.apache.thrift.protocol.TBinaryProtocol
import org.apache.thrift.transport.TIOStreamTransport
import org.apache.thrift.transport.TMemoryInputTransport
import scala.collection.immutable
import scala.util.control.NoStackTrace
// NOTE: All field ids and Tweet structure in this file correspond to the StoredTweet struct ONLY
object ByteArrayCodec {
def toByteBuffer(byteArray: Array[Byte]): ByteBuffer = byteArray.as[ByteBuffer]
def fromByteBuffer(buffer: ByteBuffer): Array[Byte] = buffer.as[Array[Byte]]
}
object StringCodec {
private val string2ByteBuffer = Injection.connect[String, Array[Byte], ByteBuffer]
def toByteBuffer(strValue: String): ByteBuffer = string2ByteBuffer(strValue)
def fromByteBuffer(buffer: ByteBuffer): String = string2ByteBuffer.invert(buffer).get
}
/**
* Terminology
* -----------
* Tweet id field : The field number of 'tweetId' in the 'Tweet' thrift structure (i.e "1")
*
* First AdditionalField id : The ID if the first additional field in 'Tweet' thrift structure. All field Ids less than this are
* considered internal and all the ids greater than or equal to this field id are considered 'Additional fields'.
* This is set to 100.
*
* Internal Fields : Fields with ids [1 to firstAdditionalFieldid) (excluding firstAdditionalFieldId)
*
* Core fields : (Subset of Internal fields)- Fields with ids [1 to 8, 19]. These fields are "packed" together and stored
* under a single key. This key is referred to as "CoreFieldsKey" (see @TweetKeyType.CoreFieldsKey).
* Note: Actually field 1 is skipped when packing as this field is the tweet id and it need not be
* explicitly stored since the pkey already contains the tweet Id)
*
* Root Core field id : The field id under which the packed core fields are stored in Manhattan. (This is field Id "1")
*
* Required fields : (Subset of Core fields) - Fields with ids [1 to 5] that MUST be present on every tweet.
*
* Additional Fields : All fields with field ids >= 'firstAdditionalFieldId'
*
* Compiled Additional fields : (Subset of Additional Fields) - All fields that the storage library knows about
* (i.e present on the latest storage_internal.thrift that is compiled-in).
*
* Passthrough fields : (Subset of Additional Fields) - The fields on storage_internal.thrift that the storage library is NOT aware of
* These field ids are is obtained looking at the "_passThroughFields" member of the scrooge-generated
* 'Tweet' object.
*
* coreFieldsIdInInternalTweet: This is the field id of the core fields (the only field) in the Internal Tweet struct
*/
object TweetFields {
val firstAdditionalFieldId: Short = 100
val tweetIdField: Short = 1
val geoFieldId: Short = 9
// The field under which all the core field values are stored (in serialized form).
val rootCoreFieldId: Short = 1
val coreFieldIds: immutable.IndexedSeq[FieldId] = {
val quotedTweetFieldId: Short = 19
(1 to 8).map(_.toShort) ++ Seq(quotedTweetFieldId)
}
val requiredFieldIds: immutable.IndexedSeq[FieldId] = (1 to 5).map(_.toShort)
val coreFieldsIdInInternalTweet: Short = 1
val compiledAdditionalFieldIds: Seq[FieldId] =
StoredTweet.metaData.fields.filter(_.id >= firstAdditionalFieldId).map(_.id)
val internalFieldIds: Seq[FieldId] =
StoredTweet.metaData.fields.filter(_.id < firstAdditionalFieldId).map(_.id)
val nonCoreInternalFields: Seq[FieldId] =
(internalFieldIds.toSet -- coreFieldIds.toSet).toSeq
def getAdditionalFieldIds(tweet: StoredTweet): Seq[FieldId] =
compiledAdditionalFieldIds ++ tweet._passthroughFields.keys.toSeq
}
/**
* Helper object to convert TFieldBlob to ByteBuffer that gets stored in Manhattan.
*
* The following is the format in which the TFieldBlob gets stored:
* [Version][TField][TFieldBlob]
*/
object TFieldBlobCodec {
val BinaryProtocolFactory: TBinaryProtocol.Factory = new TBinaryProtocol.Factory()
val FormatVersion = 1.0
def toByteBuffer(tFieldBlob: TFieldBlob): ByteBuffer = {
val baos = new ByteArrayOutputStream()
val prot = BinaryProtocolFactory.getProtocol(new TIOStreamTransport(baos))
prot.writeDouble(FormatVersion)
prot.writeFieldBegin(tFieldBlob.field)
prot.writeBinary(ByteArrayCodec.toByteBuffer(tFieldBlob.data))
ByteArrayCodec.toByteBuffer(baos.toByteArray)
}
def fromByteBuffer(buffer: ByteBuffer): TFieldBlob = {
val byteArray = ByteArrayCodec.fromByteBuffer(buffer)
val prot = BinaryProtocolFactory.getProtocol(new TMemoryInputTransport(byteArray))
val version = prot.readDouble()
if (version != FormatVersion) {
throw new VersionMismatchError(
"Version mismatch in decoding ByteBuffer to TFieldBlob. " +
"Actual version: " + version + ". Expected version: " + FormatVersion
)
}
val tField = prot.readFieldBegin()
val dataBuffer = prot.readBinary()
val data = ByteArrayCodec.fromByteBuffer(dataBuffer)
TFieldBlob(tField, data)
}
}
/**
* Helper object to help convert 'CoreFields' object to/from TFieldBlob (and also to construct
* 'CoreFields' object from a 'StoredTweet' object)
*
* More details:
* - A subset of fields on the 'StoredTweet' thrift structure (2-8,19) are 'packaged' and stored
* together as a serialized TFieldBlob object under a single key in Manhattan (see TweetKeyCodec
* helper object above for more details).
*
* - To make the packing/unpacking the fields to/from TFieldBlob object, we created the following
* two helper thrift structures 'CoreFields' and 'InternalTweet'
*
* // The field Ids and types here MUST exactly match field Ids on 'StoredTweet' thrift structure.
* struct CoreFields {
* 2: optional i64 user_id
* ...
* 8: optional i64 contributor_id
* ...
* 19: optional StoredQuotedTweet stored_quoted_tweet
*
* }
*
* // The field id of core fields MUST be "1"
* struct InternalTweet {
* 1: CoreFields coreFields
* }
*
* - Given the above two structures, packing/unpacking fields (2-8,19) on StoredTweet object into a TFieldBlob
* becomes very trivial:
* For packing:
* (i) Copy fields (2-8,19) from StoredTweet object to a new CoreFields object
* (ii) Create a new InternalTweet object with the 'CoreFields' object constructed in step (i) above
* (iii) Extract field "1" as a TFieldBlob from InternalField (by calling the scrooge generated "getFieldBlob(1)"
* function on the InternalField objecton
*
* For unpacking:
* (i) Create an empty 'InternalField' object
* (ii) Call scrooge-generated 'setField' by passing the tFieldBlob blob (created by packing steps above)
* (iii) Doing step (ii) above will create a hydrated 'CoreField' object that can be accessed by 'coreFields'
* member of 'InternalTweet' object.
*/
object CoreFieldsCodec {
val coreFieldIds: Seq[FieldId] = CoreFields.metaData.fields.map(_.id)
// "Pack" the core fields i.e converts 'CoreFields' object to "packed" tFieldBlob (See description
// above for more details)
def toTFieldBlob(coreFields: CoreFields): TFieldBlob = {
InternalTweet(Some(coreFields)).getFieldBlob(TweetFields.coreFieldsIdInInternalTweet).get
}
// "Unpack" the core fields from a packed TFieldBlob into a CoreFields object (see description above for
// more details)
def fromTFieldBlob(tFieldBlob: TFieldBlob): CoreFields = {
InternalTweet().setField(tFieldBlob).coreFields.get
}
// "Unpack" the core fields from a packed TFieldBlob into a Map of core-fieldId-> TFieldBlob
def unpackFields(tFieldBlob: TFieldBlob): Map[Short, TFieldBlob] =
fromTFieldBlob(tFieldBlob).getFieldBlobs(coreFieldIds)
// Create a 'CoreFields' thrift object from 'Tweet' thrift object.
def fromTweet(tweet: StoredTweet): CoreFields = {
// As mentioned above, the field ids and types on the 'CoreFields' struct exactly match the
// corresponding fields on StoredTweet structure. So it is safe to call .getField() on Tweet object and
// and pass the returned tFleldBlob a 'setField' on 'CoreFields' object.
coreFieldIds.foldLeft(CoreFields()) {
case (core, fieldId) =>
tweet.getFieldBlob(fieldId) match {
case None => core
case Some(tFieldBlob) => core.setField(tFieldBlob)
}
}
}
}
/**
* Helper object to convert ManhattanException to FieldResponseCode thrift object
*/
object FieldResponseCodeCodec {
import FieldResponseCodec.ValueNotFoundException
def fromManhattanException(mhException: ManhattanException): FieldResponseCode = {
mhException match {
case _: ValueNotFoundException => FieldResponseCode.ValueNotFound
case _: InternalErrorManhattanException => FieldResponseCode.Error
case _: InvalidRequestManhattanException => FieldResponseCode.InvalidRequest
case _: DeniedManhattanException => FieldResponseCode.Error
case _: UnsatisfiableManhattanException => FieldResponseCode.Error
case _: TimeoutManhattanException => FieldResponseCode.Timeout
}
}
}
/**
* Helper object to construct FieldResponse thrift object from an Exception.
* This is typically called to convert 'ManhattanException' object to 'FieldResponse' thrift object
*/
object FieldResponseCodec {
class ValueNotFoundException extends ManhattanException("Value not found!") with NoStackTrace
private[storage] val NotFound = new ValueNotFoundException
def fromThrowable(e: Throwable, additionalMsg: Option[String] = None): FieldResponse = {
val (respCode, errMsg) = e match {
case mhException: ManhattanException =>
(FieldResponseCodeCodec.fromManhattanException(mhException), mhException.getMessage)
case _ => (FieldResponseCode.Error, e.getMessage)
}
val respMsg = additionalMsg.map(_ + ". " + errMsg).orElse(Some(errMsg.toString))
FieldResponse(respCode, respMsg)
}
}