[docx] split commit for file 5800

Signed-off-by: Ari Archer <ari.web.xyz@gmail.com>
This commit is contained in:
Ari Archer 2024-01-23 19:19:52 +02:00
parent be139a2dd1
commit dedacccd1f
No known key found for this signature in database
GPG Key ID: A50D5B4B599AF8A2
400 changed files with 0 additions and 26994 deletions

View File

@ -1,150 +0,0 @@
package com.twitter.tweetypie.storage
import com.twitter.finagle.stats.StatsReceiver
import com.twitter.stitch.Stitch
import com.twitter.storage.client.manhattan.kv.DeniedManhattanException
import com.twitter.tweetypie.storage.Response.TweetResponseCode
import com.twitter.tweetypie.storage.TweetUtils._
import com.twitter.tweetypie.storage_internal.thriftscala.StoredTweet
import com.twitter.tweetypie.thriftscala.DeletedTweet
import scala.util.control.NonFatal
/** Deletion status of a tweet, as reported in a [[DeletedTweetResponse]]. */
sealed trait DeleteState

object DeleteState {

  /**
   * Deleted, but not yet permanently removed from Manhattan. A tweet in this state may be
   * undeleted.
   */
  case object SoftDeleted extends DeleteState

  /**
   * Deleted after being bounced for violating the Twitter Rules, but not yet permanently removed
   * from Manhattan. A tweet in this state may NOT be undeleted.
   */
  case object BounceDeleted extends DeleteState

  /** Permanently removed from Manhattan. */
  case object HardDeleted extends DeleteState

  /** Manhattan holds nothing that tells this tweet id apart from one that never existed. */
  case object NotFound extends DeleteState

  /** The tweet exists and is not in any deleted state. */
  case object NotDeleted extends DeleteState
}
/**
 * Result of looking up the deletion state of a single tweet.
 *
 * @param tweetId         id of the tweet this result is about
 * @param overallResponse outcome code of the lookup itself
 * @param deleteState     deletion state determined for the tweet
 * @param tweet           tweet data accompanying the result, if any
 */
case class DeletedTweetResponse(
  tweetId: TweetId,
  overallResponse: TweetResponseCode,
  deleteState: DeleteState,
  tweet: Option[DeletedTweet]
)
/**
 * Factory for the [[TweetStorageClient.GetDeletedTweets]] operation, which classifies each
 * requested tweet id by deletion state using the records read from Manhattan.
 */
object GetDeletedTweetsHandler {

  /**
   * @param read  reads the Manhattan records stored under a tweet id
   * @param stats receives the request-width stat
   * @return a function producing one [[DeletedTweetResponse]] per positive tweet id; ids that are
   *         zero or negative are dropped before anything is read
   */
  def apply(
    read: ManhattanOperations.Read,
    stats: StatsReceiver
  ): TweetStorageClient.GetDeletedTweets =
    (unfilteredTweetIds: Seq[TweetId]) => {
      val validIds = unfilteredTweetIds.filter(id => id > 0)
      Stats.addWidthStat("getDeletedTweets", "tweetIds", validIds.size, stats)

      Stitch.collect(
        validIds.map { id =>
          read(id)
            .map(records => classify(id, records))
            .handle(failureResponse(id))
        }
      )
    }

  /**
   * Derives the response for one tweet from its Manhattan records. The most recent state record
   * decides between the deleted states; without a deleted state the tweet counts as not found
   * when none of the expected fields are stored, and as not deleted otherwise.
   */
  private def classify(
    tweetId: TweetId,
    mhRecords: Seq[TweetManhattanRecord]
  ): DeletedTweetResponse = {
    val storedTweet = buildStoredTweet(tweetId, mhRecords)

    TweetStateRecord.mostRecent(mhRecords) match {
      case Some(soft: TweetStateRecord.SoftDeleted) => softDeleted(soft, storedTweet)
      case Some(bounced: TweetStateRecord.BounceDeleted) => bounceDeleted(bounced, storedTweet)
      case Some(hard: TweetStateRecord.HardDeleted) => hardDeleted(hard, storedTweet)
      case _ =>
        if (storedTweet.getFieldBlobs(expectedFields).isEmpty) notFound(tweetId)
        else notDeleted(tweetId, storedTweet)
    }
  }

  /**
   * Maps a failed read onto a response instead of failing the whole batch: a denied Manhattan
   * request becomes OverCapacity, any other non-fatal error is logged and becomes Failure. Both
   * carry the NotFound delete state and no tweet.
   */
  private def failureResponse(
    tweetId: TweetId
  ): PartialFunction[Throwable, DeletedTweetResponse] = {
    case _: DeniedManhattanException =>
      DeletedTweetResponse(tweetId, TweetResponseCode.OverCapacity, DeleteState.NotFound, None)

    case NonFatal(ex) =>
      TweetUtils.log.warning(
        ex,
        s"Unhandled exception in GetDeletedTweetsHandler for tweetId: $tweetId"
      )
      DeletedTweetResponse(tweetId, TweetResponseCode.Failure, DeleteState.NotFound, None)
  }

  private def notFound(tweetId: TweetId): DeletedTweetResponse =
    DeletedTweetResponse(
      tweetId = tweetId,
      overallResponse = TweetResponseCode.Success,
      deleteState = DeleteState.NotFound,
      tweet = None
    )

  /**
   * Shared shape of the soft- and bounce-deleted responses: the stored tweet converted to a
   * deleted tweet, stamped with the time the state record was created.
   */
  private def deletedWithTimestamp(
    tweetId: TweetId,
    state: DeleteState,
    deletedAtMsec: Long,
    storedTweet: StoredTweet
  ): DeletedTweetResponse = {
    val deletedTweet =
      StorageConversions.toDeletedTweet(storedTweet).copy(deletedAtMsec = Some(deletedAtMsec))
    DeletedTweetResponse(tweetId, TweetResponseCode.Success, state, Some(deletedTweet))
  }

  private def softDeleted(
    record: TweetStateRecord.SoftDeleted,
    storedTweet: StoredTweet
  ): DeletedTweetResponse =
    deletedWithTimestamp(record.tweetId, DeleteState.SoftDeleted, record.createdAt, storedTweet)

  private def bounceDeleted(
    record: TweetStateRecord.BounceDeleted,
    storedTweet: StoredTweet
  ): DeletedTweetResponse =
    deletedWithTimestamp(record.tweetId, DeleteState.BounceDeleted, record.createdAt, storedTweet)

  /**
   * For a hard-deleted tweet the record's creation time is the hard-deletion time, and its
   * `deletedAt` is reported as the deletion time.
   */
  private def hardDeleted(
    record: TweetStateRecord.HardDeleted,
    storedTweet: StoredTweet
  ): DeletedTweetResponse = {
    val deletedTweet =
      StorageConversions
        .toDeletedTweet(storedTweet)
        .copy(
          hardDeletedAtMsec = Some(record.createdAt),
          deletedAtMsec = Some(record.deletedAt)
        )
    DeletedTweetResponse(
      record.tweetId,
      TweetResponseCode.Success,
      DeleteState.HardDeleted,
      Some(deletedTweet)
    )
  }

  /**
   * notDeleted returns a tweet to simplify tweetypie.handler.UndeleteTweetHandler
   */
  private def notDeleted(tweetId: TweetId, storedTweet: StoredTweet): DeletedTweetResponse =
    DeletedTweetResponse(
      tweetId = tweetId,
      overallResponse = TweetResponseCode.Success,
      deleteState = DeleteState.NotDeleted,
      tweet = Some(StorageConversions.toDeletedTweet(storedTweet))
    )
}

View File

@ -1,126 +0,0 @@
package com.twitter.tweetypie.storage
import com.twitter.conversions.DurationOps._
import com.twitter.finagle.stats.StatsReceiver
import com.twitter.stitch.Stitch
import com.twitter.stitch.StitchSeqGroup
import com.twitter.tweetypie.storage.TweetStorageClient.GetStoredTweet
import com.twitter.tweetypie.storage.TweetStorageClient.GetStoredTweet.Error
import com.twitter.tweetypie.storage.TweetStorageClient.GetStoredTweet.Response._
import com.twitter.tweetypie.storage.TweetUtils._
import com.twitter.tweetypie.thriftscala.Tweet
import com.twitter.util.Time
import com.twitter.util.Try
import scala.collection.mutable
/**
 * Factory for the [[GetStoredTweet]] operation, which reads everything Manhattan holds for a
 * tweet id and reports the tweet together with its state records, scrubbed fields and any
 * consistency errors detected in the stored data.
 */
object GetStoredTweetHandler {

  /**
   * Extractor that matches a most-recent state record which is soft, hard or bounce deleted and
   * yields that record; any other state, or no state, does not match.
   */
  private[this] object DeletedState {
    def unapply(stateRecord: Option[TweetStateRecord]): Option[TweetStateRecord] =
      stateRecord match {
        case state @ (Some(_: TweetStateRecord.SoftDeleted) | Some(
              _: TweetStateRecord.HardDeleted) | Some(_: TweetStateRecord.BounceDeleted)) =>
          state
        case _ => None
      }
  }

  /**
   * Time of the (soft) deletion in milliseconds, if the given state is a deleted state. For a
   * hard-deleted record this is its `deletedAt`, not the time the hard-delete record was created.
   */
  private[this] def deletedAtMs(stateRecord: Option[TweetStateRecord]): Option[Long] =
    stateRecord match {
      case Some(d: TweetStateRecord.SoftDeleted) => Some(d.createdAt)
      case Some(d: TweetStateRecord.BounceDeleted) => Some(d.createdAt)
      case Some(d: TweetStateRecord.HardDeleted) => Some(d.deletedAt)
      case _ => None
    }

  /**
   * Converts the Manhattan records of one tweet into a [[GetStoredTweet.Response]] and counts the
   * outcome on `statsReceiver`.
   *
   * @param tweetId       id the records were read for
   * @param mhRecords     every record stored under that id
   * @param statsReceiver receiver on which exactly one outcome counter is incremented
   */
  private[this] def tweetResponseFromRecords(
    tweetId: TweetId,
    mhRecords: Seq[TweetManhattanRecord],
    statsReceiver: StatsReceiver,
  ): GetStoredTweet.Response = {
    val detected = mutable.Buffer[Error]()

    val hasStoredTweetFields: Boolean = mhRecords.exists {
      case TweetManhattanRecord(TweetKey(_, _: TweetKey.LKey.FieldKey), _) => true
      case _ => false
    }

    // Only attempt to build a tweet when at least one field record exists; a build failure is
    // reported as corruption rather than propagated.
    val storedTweet =
      if (hasStoredTweetFields) {
        Try(buildStoredTweet(tweetId, mhRecords, includeScrubbed = true))
          .onFailure(_ => detected.append(Error.TweetIsCorrupt))
          .toOption
      } else {
        None
      }

    val scrubbedFields: Set[FieldId] = extractScrubbedFields(mhRecords)
    val tweet: Option[Tweet] = storedTweet.map(StorageConversions.fromStoredTweetAllowInvalid)
    val stateRecords: Seq[TweetStateRecord] = TweetStateRecord.fromTweetMhRecords(mhRecords)
    val tweetState: Option[TweetStateRecord] = TweetStateRecord.mostRecent(mhRecords)

    storedTweet.foreach { stored =>
      val storedExpectedFields = stored.getFieldBlobs(expectedFields)
      val missingExpectedFields = expectedFields.filterNot(storedExpectedFields.contains)
      if (missingExpectedFields.nonEmpty || !isValid(stored)) {
        detected.append(Error.TweetFieldsMissingOrInvalid)
      }

      // Fields marked as scrubbed must not have data stored any more.
      val invalidScrubbedFields = stored.getFieldBlobs(scrubbedFields).keys
      if (invalidScrubbedFields.nonEmpty) {
        detected.append(Error.ScrubbedFieldsPresent)
      }

      // Tweet data is still present although the deletion happened more than 14 days ago.
      val hardDeleteCutoffMs = Time.now.inMilliseconds - 14.days.inMilliseconds
      if (deletedAtMs(tweetState).exists(_ < hardDeleteCutoffMs)) {
        detected.append(Error.TweetShouldBeHardDeleted)
      }
    }

    val err = Option(detected.toList).filter(_.nonEmpty)

    (tweet, tweetState, err) match {
      case (None, None, None) =>
        statsReceiver.counter("not_found").incr()
        NotFound(tweetId)

      case (None, Some(hardDeleted: TweetStateRecord.HardDeleted), None) =>
        statsReceiver.counter("hard_deleted").incr()
        HardDeleted(tweetId, Some(hardDeleted), stateRecords, scrubbedFields)

      case (None, Some(_), None) =>
        // A state record other than HardDeleted exists but no tweet fields are stored. This
        // combination was previously unmatched and escaped as a scala.MatchError; report it as a
        // failure so callers still receive the state records.
        statsReceiver.counter("failed").incr()
        Failed(
          tweetId,
          tweetState,
          stateRecords,
          scrubbedFields,
          List(Error.TweetFieldsMissingOrInvalid)
        )

      case (None, _, Some(errors)) =>
        statsReceiver.counter("failed").incr()
        Failed(tweetId, tweetState, stateRecords, scrubbedFields, errors)

      case (Some(found), _, Some(errors)) =>
        statsReceiver.counter("found_invalid").incr()
        FoundWithErrors(found, tweetState, stateRecords, scrubbedFields, errors)

      case (Some(found), DeletedState(state), None) =>
        statsReceiver.counter("deleted").incr()
        FoundDeleted(found, Some(state), stateRecords, scrubbedFields)

      case (Some(found), _, None) =>
        statsReceiver.counter("found").incr()
        Found(found, tweetState, stateRecords, scrubbedFields)
    }
  }

  /**
   * @param read          reads the Manhattan records stored under a tweet id
   * @param statsReceiver receives the batch-width stat and, under the "getStoredTweet" scope, the
   *                      per-tweet outcome counters
   * @return the operation; ids that are zero or negative yield `Stitch.NotFound` without a read
   */
  def apply(read: ManhattanOperations.Read, statsReceiver: StatsReceiver): GetStoredTweet = {
    // Scoped once here instead of on every request.
    val responseStats = statsReceiver.scope("getStoredTweet")

    object mhGroup extends StitchSeqGroup[TweetId, Seq[TweetManhattanRecord]] {
      override def run(tweetIds: Seq[TweetId]): Stitch[Seq[Seq[TweetManhattanRecord]]] = {
        Stats.addWidthStat("getStoredTweet", "tweetIds", tweetIds.size, statsReceiver)
        Stitch.traverse(tweetIds)(read(_))
      }
    }

    tweetId =>
      if (tweetId <= 0) {
        Stitch.NotFound
      } else {
        Stitch
          .call(tweetId, mhGroup)
          .map(mhRecords => tweetResponseFromRecords(tweetId, mhRecords, responseStats))
      }
  }
}

View File

@ -1,167 +0,0 @@
package com.twitter.tweetypie.storage
import com.twitter.conversions.DurationOps._
import com.twitter.finagle.stats.Counter
import com.twitter.finagle.stats.NullStatsReceiver
import com.twitter.finagle.stats.StatsReceiver
import com.twitter.logging.Logger
import com.twitter.snowflake.id.SnowflakeId
import com.twitter.stitch.Stitch
import com.twitter.stitch.StitchSeqGroup
import com.twitter.storage.client.manhattan.kv.DeniedManhattanException
import com.twitter.storage.client.manhattan.kv.ManhattanException
import com.twitter.tweetypie.storage.TweetStateRecord.BounceDeleted
import com.twitter.tweetypie.storage.TweetStateRecord.HardDeleted
import com.twitter.tweetypie.storage.TweetStateRecord.SoftDeleted
import com.twitter.tweetypie.storage.TweetStorageClient.GetTweet
import com.twitter.tweetypie.storage.TweetUtils._
import com.twitter.util.Duration
import com.twitter.util.Return
import com.twitter.util.Throw
import com.twitter.util.Time
/**
 * Factory for the [[GetTweet]] operation and the conversion of Manhattan records into a
 * [[GetTweet.Response]].
 */
object GetTweetHandler {
  private[this] val logger = Logger(getClass)

  //////////////////////////////////////////////////
  // Racy reads are logged so they can be validated later.

  /** Reads of tweets no older than this are logged as racy reads. */
  val RacyTweetWindow: Duration = 10.seconds

  /**
   * A read this soon after the tweet was created would usually be expected to be served from
   * cache, so reaching storage suggests the tweet is prone to consistency issues. What Manhattan
   * holds at the time of the read is logged for later analysis.
   *
   * The log line is tab separated: a marker, the tweet id, the tweet age in milliseconds, then
   * one `lkey[:timestampMs]` entry per record. Only snowflake ids are considered.
   */
  private[this] def logRacyRead(tweetId: TweetId, records: Seq[TweetManhattanRecord]): Unit =
    if (SnowflakeId.isSnowflakeId(tweetId)) {
      val age = Time.now.since(SnowflakeId(tweetId).time)
      if (age <= RacyTweetWindow) {
        val recordEntries = records.map { rec =>
          // The timestamp tells later on whether a value should have been present: keys written
          // in one write are expected to share a timestamp, keys from separate writes generally
          // differ. Manhattan treats the timestamp as optional, though one is always expected.
          val timestampSuffix =
            rec.value.timestamp.map(ts => s":${ts.inMilliseconds}").getOrElse("")
          s"\t${rec.lkey}$timestampSuffix"
        }
        // The age is part of the line for analysis purposes.
        logger.info(
          s"racy_tweet_read\t$tweetId\t${age.inMilliseconds}${recordEntries.mkString}"
        )
      }
    }

  /**
   * Turns the Manhattan records of one tweet into a [[GetTweet.Response]].
   *
   * @param tweetId       id the records were read for
   * @param mhRecords     every record stored under that id
   * @param statsReceiver receives the "valid" / "invalid" / "expected_fields_not_present"
   *                      counters whenever building a tweet is attempted
   */
  def tweetResponseFromRecords(
    tweetId: TweetId,
    mhRecords: Seq[TweetManhattanRecord],
    statsReceiver: StatsReceiver = NullStatsReceiver
  ): GetTweet.Response = {

    // A tweet is only returnable when at least one expected field is stored and the stored tweet
    // is valid, no matter which additional fields are present.
    def returnableTweet = {
      val stored = buildStoredTweet(tweetId, mhRecords)
      if (stored.getFieldBlobs(expectedFields).isEmpty) {
        // None of the fields listed in `expectedFields` is stored for this tweet.
        log.info(s"Expected Fields Not Present Tweet Id: $tweetId")
        statsReceiver.counter("expected_fields_not_present").incr()
        None
      } else if (!isValid(stored)) {
        log.info(s"Invalid Tweet Id: $tweetId")
        statsReceiver.counter("invalid").incr()
        None
      } else {
        statsReceiver.counter("valid").incr()
        Some(StorageConversions.fromStoredTweet(stored))
      }
    }

    if (mhRecords.isEmpty) {
      GetTweet.Response.NotFound
    } else {
      TweetStateRecord.mostRecent(mhRecords) match {
        // Soft- and hard-deleted tweets are answered without trying to build a Tweet.
        case Some(_: SoftDeleted) | Some(_: HardDeleted) =>
          GetTweet.Response.Deleted

        // Every other state needs an attempt to build a Tweet, which may fail.
        case latestState =>
          logRacyRead(tweetId, mhRecords)
          val tweet = returnableTweet
          latestState match {
            // A bounce-deleted response carries the Tweet so that callers can read data from it
            // (e.g. the hard delete daemon requires conversationId and userId). There are no
            // plans for the Tweetypie server to make use of the returned tweet at this time.
            case Some(_: BounceDeleted) =>
              tweet.fold[GetTweet.Response](GetTweet.Response.Deleted)(
                GetTweet.Response.BounceDeleted(_))
            case _ =>
              tweet.fold[GetTweet.Response](GetTweet.Response.NotFound)(
                GetTweet.Response.Found(_))
          }
      }
    }
  }

  /**
   * @param read          reads the Manhattan records stored under a tweet id
   * @param statsReceiver receives the batch-width stat and, under the "getTweet" scope, the
   *                      outcome counters
   * @return the operation; ids that are zero or negative yield `Stitch.NotFound` without a read,
   *         and a denied Manhattan request is surfaced as `RateLimited`
   */
  def apply(read: ManhattanOperations.Read, statsReceiver: StatsReceiver): GetTweet = {

    object stats {
      val getTweetScope = statsReceiver.scope("getTweet")
      val deniedCounter: Counter = getTweetScope.counter("mh_denied")
      val mhExceptionCounter: Counter = getTweetScope.counter("mh_exception")
      val nonFatalExceptionCounter: Counter = getTweetScope.counter("non_fatal_exception")
      val notFoundCounter: Counter = getTweetScope.counter("not_found")
    }

    object mhGroup extends StitchSeqGroup[TweetId, Seq[TweetManhattanRecord]] {
      override def run(tweetIds: Seq[TweetId]): Stitch[Seq[Seq[TweetManhattanRecord]]] = {
        Stats.addWidthStat("getTweet", "tweetIds", tweetIds.size, statsReceiver)
        Stitch.traverse(tweetIds)(read(_))
      }
    }

    tweetId =>
      if (tweetId <= 0) {
        Stitch.NotFound
      } else {
        Stitch
          .call(tweetId, mhGroup)
          .map(records => tweetResponseFromRecords(tweetId, records, stats.getTweetScope))
          .liftToTry
          .map {
            case notFound @ Return(GetTweet.Response.NotFound) =>
              stats.notFoundCounter.incr()
              notFound

            case success @ Return(_) => success

            // Manhattan denied the request
            case Throw(denied: DeniedManhattanException) =>
              stats.deniedCounter.incr()
              Throw(RateLimited("", denied))

            // Any other Manhattan error is passed through
            case failure @ Throw(_: ManhattanException) =>
              stats.mhExceptionCounter.incr()
              failure

            // Anything else is logged and passed through
            case failure @ Throw(ex) =>
              stats.nonFatalExceptionCounter.incr()
              TweetUtils.log
                .warning(ex, s"Unhandled exception in GetTweetHandler for tweetId: $tweetId")
              failure
          }
          .lowerFromTry
      }
  }
}

View File

@ -1,153 +0,0 @@
package com.twitter.tweetypie.storage
import com.twitter.finagle.stats.StatsReceiver
import com.twitter.stitch.Stitch
import com.twitter.tweetypie.storage.TweetKey.LKey.ForceAddedStateKey
import com.twitter.tweetypie.storage.TweetStorageClient.HardDeleteTweet
import com.twitter.tweetypie.storage.TweetStorageClient.HardDeleteTweet.Response._
import com.twitter.tweetypie.storage.TweetUtils._
import com.twitter.util.Return
import com.twitter.util.Throw
import com.twitter.util.Time
import com.twitter.util.Try
/** Factory for the hard-delete operation, which permanently removes a deleted tweet's data. */
object HardDeleteTweetHandler {

  /**
   * Tells whether the record under `key` is deleted permanently when its tweet is removed: the
   * core, internal and additional field keys, and the soft-deletion, bounce-deletion, undeletion
   * and force-added state keys. Every other key is kept.
   */
  private[storage] def isKeyToBeDeleted(key: TweetKey): Boolean =
    key.lKey match {
      case TweetKey.LKey.CoreFieldsKey => true
      case TweetKey.LKey.InternalFieldsKey(_) => true
      case TweetKey.LKey.AdditionalFieldsKey(_) => true
      case TweetKey.LKey.SoftDeletionStateKey => true
      case TweetKey.LKey.BounceDeletionStateKey => true
      case TweetKey.LKey.UnDeletionStateKey => true
      case TweetKey.LKey.ForceAddedStateKey => true
      case _ => false
    }

  /**
   * Hard deleting consists of two actions: writing the hard-deletion record and removing the
   * tweet data. The data removal is attempted whenever anything is done at all; the record only
   * has to be written when the tweet does not have one yet.
   *
   * The name of the tweet's current state (or "no_tweet_state_record") is counted on `stats`.
   *
   * @return `Return(Some(record))` with the HardDeleted record to write when the tweet is soft
   *         or bounce deleted, `Return(None)` when it is already hard deleted, and
   *         `Throw(NotDeleted)` when the tweet is not in a deleted state
   */
  private[storage] def getHardDeleteStateRecord(
    tweetId: TweetId,
    records: Seq[TweetManhattanRecord],
    mhTimestamp: Time,
    stats: StatsReceiver
  ): Try[Option[TweetStateRecord.HardDeleted]] = {
    val latestState = TweetStateRecord.mostRecent(records)
    stats.counter(latestState.fold("no_tweet_state_record")(_.name)).incr()

    latestState match {
      case None =>
        Throw(NotDeleted(tweetId, None))

      case Some(_: TweetStateRecord.Undeleted) =>
        Throw(NotDeleted(tweetId, Some(TweetKey.LKey.UnDeletionStateKey)))

      case Some(_: TweetStateRecord.ForceAdded) =>
        Throw(NotDeleted(tweetId, Some(ForceAddedStateKey)))

      case Some(_: TweetStateRecord.HardDeleted) =>
        Return(None)

      case Some(
            deletion @ (TweetStateRecord.SoftDeleted(_, _) |
            TweetStateRecord.BounceDeleted(_, _))) =>
        val hardDeleted = TweetStateRecord.HardDeleted(
          tweetId = tweetId,
          // For hard deletes in Manhattan, createdAt holds the hard-deletion timestamp ...
          createdAt = mhTimestamp.inMillis,
          // ... and deletedAt holds the soft-deletion timestamp.
          deletedAt = deletion.createdAt
        )
        Return(Some(hardDeleted))
    }
  }

  /**
   * The returned function answers with HardDeleteTweet.Response.Deleted when the data associated
   * with the tweet is deleted, whether by this request or by an earlier one.
   *
   * The most recently added state record determines the tweet's state. Data is only deleted for
   * tweets whose state is soft deleted, bounce deleted or hard deleted; for an already
   * hard-deleted tweet this removes any lkeys that were not deleted previously. For any other
   * state the NotDeleted response is returned and the "cancelled" counter is incremented.
   *
   * @param read   reads the Manhattan records stored under a tweet id
   * @param insert writes the hard-deletion state record
   * @param delete deletes a single key with the given timestamp
   * @param scribe notified after the hard-deletion record has been written successfully
   * @param stats  parent receiver; everything is reported under its "hardDeleteTweet" scope
   */
  def apply(
    read: ManhattanOperations.Read,
    insert: ManhattanOperations.Insert,
    delete: ManhattanOperations.Delete,
    scribe: Scribe,
    stats: StatsReceiver
  ): TweetId => Stitch[HardDeleteTweet.Response] = {
    val scopedStats = stats.scope("hardDeleteTweet")
    val cancelledCounter = scopedStats.counter("cancelled")
    val beforeStateStats = scopedStats.scope("before_state")

    // Deletes all keys; the individual outcomes are combined by collectWithRateLimitCheck.
    def removeRecords(keys: Seq[TweetKey], mhTimestamp: Time): Stitch[Unit] = {
      val deletions = keys.map(key => delete(key, Some(mhTimestamp)).liftToTry)
      Stitch
        .collect(deletions)
        .map(collectWithRateLimitCheck)
        .lowerFromTry
    }

    // Writes the hard-deletion record when there is one to write, then scribes the removal.
    def writeRecord(record: Option[TweetStateRecord.HardDeleted]): Stitch[Unit] =
      record.fold[Stitch[Unit]](Stitch.Unit) { hardDeleted =>
        insert(hardDeleted.toTweetMhRecord).onSuccess { _ =>
          scribe.logRemoved(
            hardDeleted.tweetId,
            Time.fromMilliseconds(hardDeleted.createdAt),
            isSoftDeleted = false
          )
        }
      }

    tweetId =>
      read(tweetId).flatMap { records =>
        val hardDeletedAt = Time.now
        val doomedKeys: Seq[TweetKey] =
          records.collect { case rec if isKeyToBeDeleted(rec.key) => rec.key }

        getHardDeleteStateRecord(tweetId, records, hardDeletedAt, beforeStateStats) match {
          case Throw(notDeleted: NotDeleted) =>
            cancelledCounter.incr()
            Stitch.value(notDeleted)

          case Throw(e) => Stitch.exception(e) // this should never happen

          case Return(stateRecord) =>
            Stitch
              .join(
                writeRecord(stateRecord),
                removeRecords(doomedKeys, hardDeletedAt)
              ).map { _ =>
                // For a non-snowflake tweet id that was hard deleted before, there is no
                // coreData record left to take the tweet creation time from, so
                // createdAtMillis will be None.
                Deleted(
                  // when the tweet was hard deleted
                  deletedAtMillis = Some(hardDeletedAt.inMillis),
                  // when the tweet itself was created (not when the deletion record was)
                  createdAtMillis =
                    TweetUtils.creationTimeFromTweetIdOrMHRecords(tweetId, records)
                )
              }
        }
      }
  }
}

View File

@ -1,228 +0,0 @@
package com.twitter.tweetypie.storage
import com.google.common.base.CaseFormat
import com.twitter.conversions.DurationOps._
import com.twitter.finagle.mtls.authentication.ServiceIdentifier
import com.twitter.scrooge.TFieldBlob
import com.twitter.scrooge.ThriftStructFieldInfo
import com.twitter.stitch.Stitch
import com.twitter.storage.client.manhattan.kv._
import com.twitter.tweetypie.additionalfields.AdditionalFields
import com.twitter.tweetypie.storage.ManhattanOperations.Read
import com.twitter.tweetypie.storage.TweetUtils._
import com.twitter.tweetypie.storage_internal.thriftscala.StoredTweet
import com.twitter.tweetypie.thriftscala.{Tweet => TweetypieTweet}
import com.twitter.util.Duration
import com.twitter.util.Future
import com.twitter.util.Return
import com.twitter.util.Throw
import diffshow.Container
import diffshow.DiffShow
import diffshow.Expr
import org.apache.commons.codec.binary.Base64
import scala.util.Try
import shapeless.Cached
import shapeless.Strict
/**
 * Used by the Tweetypie Console to inspect the content of a tweet's fields in Manhattan.
 *
 * @param svcIdentifier service identifier handed to the Manhattan client's mTLS parameters
 */
class InspectFields(svcIdentifier: ServiceIdentifier) {
  val mhApplicationId = "tbird_mh"
  val mhDatasetName = "tbird_mh"
  val mhDestinationName = "/s/manhattan/cylon.native-thrift"
  val mhTimeout: Duration = 5000.milliseconds

  val localMhEndpoint: ManhattanKVEndpoint = {
    val client = ManhattanKVClient(
      mhApplicationId,
      mhDestinationName,
      ManhattanKVClientMtlsParams(svcIdentifier))
    ManhattanKVEndpointBuilder(client)
      .defaultGuarantee(Guarantee.SoftDcReadMyWrites)
      .defaultMaxTimeout(mhTimeout)
      .build()
  }

  val readOperation: Read = new ManhattanOperations(mhDatasetName, localMhEndpoint).read

  /**
   * Reads every Manhattan record of the tweet and renders them as a human-readable table. A
   * failed read is rendered as the exception's `toString` instead of failing the future.
   */
  def lookup(tweetId: Long): Future[String] =
    Stitch.run(
      readOperation(tweetId).liftToTry.map {
        case Throw(e) => e.toString
        case Return(mhRecords) =>
          prettyPrintManhattanRecords(tweetId, TweetKey.padTweetIdStr(tweetId), mhRecords)
      }
    )

  /** Reads every Manhattan record of the tweet and assembles the StoredTweet from them. */
  def storedTweet(tweetId: Long): Future[StoredTweet] =
    Stitch.run(
      readOperation(tweetId).liftToTry.map {
        case Throw(e) => throw e
        case Return(mhRecords) => buildStoredTweet(tweetId, mhRecords)
      }
    )

  /**
   * Renders the records as a header line with the pkey followed by one row per record with the
   * columns key, field name and content. Continuation lines of multi-line content are indented
   * to the content column. Renders "Not Found" when there are no records.
   */
  private[this] def prettyPrintManhattanRecords(
    tweetId: Long,
    pkey: String,
    mhRecords: Seq[TweetManhattanRecord]
  ): String =
    if (mhRecords.isEmpty) {
      "Not Found"
    } else {
      val rows = getFormattedManhattanRecords(tweetId, mhRecords)
      // Each column is as wide as its longest value plus two.
      val keyWidth = rows.map(_.key.length).max + 2
      val nameWidth = rows.map(_.fieldName.length).max + 2
      val rowFormat = s" %-${keyWidth}s %-${nameWidth}s %s"
      val continuationPrefix = "\n" + rowFormat.format("", "", "")

      val renderedRows = rows.map { row =>
        rowFormat.format(row.key, row.fieldName, row.content.replaceAll("\n", continuationPrefix))
      }
      s"/tbird_mh/$pkey/\n${renderedRows.mkString("\n")}"
    }

  /** Decodes every record into a printable row and orders the rows by key. */
  private[this] def getFormattedManhattanRecords(
    tweetId: Long,
    mhRecords: Seq[TweetManhattanRecord]
  ): Seq[FormattedManhattanRecord] = {
    val storedTweet = buildStoredTweet(tweetId, mhRecords).copy(updatedAt = None)
    val tweetypieTweet: Option[TweetypieTweet] =
      Try(StorageConversions.fromStoredTweet(storedTweet)).toOption
    // Field blobs of the stored tweet, indexed by field name.
    val blobsByName: Map[String, TFieldBlob] =
      getStoredTweetBlobs(mhRecords).map(blob => getFieldName(blob.field.id) -> blob).toMap

    val rows = mhRecords.map { record =>
      val lkey = record.key.lKey
      FormattedManhattanRecord(
        key = lkey.toString,
        fieldName = getFieldName(lkey),
        content = prettyPrintManhattanValue(
          lkey,
          record.value,
          storedTweet,
          tweetypieTweet,
          tweetId,
          blobsByName
        )
      )
    }
    // Dropping the leading "e" of "external" makes those keys sort after the "internal" ones.
    rows.sortBy(_.key.replace("external", "xternal"))
  }

  private[this] def getFieldNameFromThrift(
    fieldId: Short,
    fieldInfos: List[ThriftStructFieldInfo]
  ): String =
    fieldInfos
      .collectFirst { case info if info.tfield.id == fieldId => info.tfield.name }
      .getOrElse("<UNKNOWN FIELD>")

  // NOTE(review): not referenced anywhere in this class; indexing the second "/"-separated
  // segment throws when the lkey has fewer than two segments.
  private[this] def isLkeyScrubbedField(lkey: String): Boolean = {
    val segments = lkey.split("/")
    segments(1) == "scrubbed_fields"
  }

  /** Field name for a field lkey; the empty string for every other kind of lkey. */
  private[this] def getFieldName(lkey: TweetKey.LKey): String =
    lkey match {
      case fieldKey: TweetKey.LKey.FieldKey => getFieldName(fieldKey.fieldId)
      case _ => ""
    }

  /**
   * Field id 1 is the core fields; additional field ids are named after the Tweet thrift
   * struct, all remaining ids after the StoredTweet thrift struct.
   */
  private[this] def getFieldName(fieldId: Short): String =
    if (fieldId == 1) "core_fields"
    else {
      val fieldInfos =
        if (AdditionalFields.isAdditionalFieldId(fieldId)) TweetypieTweet.fieldInfos
        else StoredTweet.fieldInfos
      getFieldNameFromThrift(fieldId, fieldInfos)
    }

  /**
   * Decodes the value according to the kind of its lkey: metadata as JSON, fields through their
   * thrift blob. Whatever cannot be decoded falls back to the raw content as base64.
   */
  private[this] def prettyPrintManhattanValue(
    lkey: TweetKey.LKey,
    mhValue: TweetManhattanValue,
    storedTweet: StoredTweet,
    tweetypieTweet: Option[TweetypieTweet],
    tweetId: Long,
    tfieldBlobs: Map[String, TFieldBlob]
  ): String = {
    val decoded: Option[String] = lkey match {
      case _: TweetKey.LKey.MetadataKey =>
        decodeMetadata(mhValue)
      case fieldKey: TweetKey.LKey.FieldKey =>
        tfieldBlobs
          .get(getFieldName(fieldKey.fieldId))
          .map(decodeField(tweetId, _, storedTweet, tweetypieTweet))
      case _ =>
        None
    }
    decoded.getOrElse(rawContentAsBase64(mhValue))
  }

  // Last resort: the raw bytes as a base64 string.
  // NOTE(review): `array` exposes the buffer's whole backing array — confirm that the buffers
  // handed out by the Manhattan client are array-backed and not sliced.
  private[this] def rawContentAsBase64(mhValue: TweetManhattanValue): String = {
    val bytes = mhValue.contents.array
    if (bytes.isEmpty) "<NO DATA>" else Base64.encodeBase64String(bytes)
  }

  /** The value parsed as JSON and rendered via `toString`; None when it cannot be parsed. */
  private[this] def decodeMetadata(mhValue: TweetManhattanValue): Option[String] =
    Try(Json.decode(ByteArrayCodec.fromByteBuffer(mhValue.contents)).toString).toOption

  /** Renders one field blob using the struct that the field id belongs to. */
  private[this] def decodeField(
    tweetId: Long,
    blob: TFieldBlob,
    storedTweet: StoredTweet,
    tweetypieTweet: Option[TweetypieTweet]
  ): String = {
    val fieldId = blob.field.id
    if (fieldId == 1) coreFields(storedTweet)
    else if (AdditionalFields.isAdditionalFieldId(fieldId))
      decodeTweetWithOneField(TweetypieTweet(tweetId).setField(blob))
    else
      decodeTweetWithOneField(StoredTweet(tweetId).setField(blob))
  }

  // Takes a Tweet or StoredTweet that has a single field set and renders that field's value
  private[this] def decodeTweetWithOneField[T](
    tweetWithOneField: T
  )(
    implicit ev: Cached[Strict[DiffShow[T]]]
  ): String = {
    val config = diffshow.Config(hideFieldWithEmptyVal = true)
    val shown: Expr = config.transform(DiffShow.show(tweetWithOneField))
    // matches a Tweet or StoredTweet with exactly two values, the first of which is the id
    val fieldValue = shown.transform {
      case Container(_, List(diffshow.Field("id", _), diffshow.Field(_, value))) => value
    }
    config.exprPrinter.apply(fieldValue, width = 80).render
  }

  private[this] def coreFields(storedTweet: StoredTweet): String = {
    val core = CoreFieldsCodec.fromTweet(storedTweet)
    diffshow.show(core, hideFieldWithEmptyVal = true)
  }

  // NOTE(review): not referenced anywhere in this class.
  private[this] def toCamelCase(s: String): String =
    CaseFormat.LOWER_UNDERSCORE.to(CaseFormat.LOWER_CAMEL, s)
}
/**
 * One printable row describing a Manhattan record.
 *
 * @param key       textual form of the record's lkey
 * @param fieldName name of the field the record holds; empty when the lkey is not a field key
 * @param content   decoded, human-readable content of the record
 */
case class FormattedManhattanRecord(
  key: String,
  fieldName: String,
  content: String
)

View File

@ -1,17 +0,0 @@
package com.twitter.tweetypie.storage
import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.module.scala.DefaultScalaModule
/** JSON encoding and decoding of string-keyed maps, backed by Jackson with the Scala module. */
object Json {
  val TimestampKey = "timestamp"
  val SoftDeleteTimestampKey = "softdelete_timestamp"

  // registerModule returns the mapper it was called on, so the field is ready once assigned.
  private val mapper: ObjectMapper = new ObjectMapper().registerModule(DefaultScalaModule)

  /** Serializes the map to JSON bytes. */
  def encode(m: Map[String, Any]): Array[Byte] =
    mapper.writeValueAsBytes(m)

  /** Parses JSON bytes into a map. */
  def decode(arr: Array[Byte]): Map[String, Any] =
    mapper.readValue(arr, classOf[Map[String, Any]])
}

View File

@ -1,103 +0,0 @@
package com.twitter.tweetypie.storage
import com.twitter.bijection.Injection
import com.twitter.io.Buf
import com.twitter.stitch.Stitch
import com.twitter.storage.client.manhattan.bijections.Bijections.BufInjection
import com.twitter.storage.client.manhattan.kv.ManhattanKVEndpoint
import com.twitter.storage.client.manhattan.kv.impl.DescriptorP1L1
import com.twitter.storage.client.manhattan.kv.impl.Component
import com.twitter.storage.client.manhattan.kv.{impl => mh}
import com.twitter.storage.client.manhattan.bijections.Bijections.StringInjection
import com.twitter.util.Time
import java.nio.ByteBuffer
import scala.util.control.NonFatal
/** A single Manhattan record of a tweet: its key together with the stored value. */
case class TweetManhattanRecord(key: TweetKey, value: TweetManhattanValue) {
  def pkey: TweetId = key.tweetId
  def lkey: TweetKey.LKey = key.lKey

  /**
   * Human-readable rendering that still carries all of the record's information. It is not
   * meant to be parsed by machines.
   *
   * Metadata values are shown as text, field values as their thrift blob with hex content, and
   * values under unknown keys as hex. A value that fails to decode is shown as hex together
   * with the reason.
   *
   * The conversion is relatively expensive, so beware of using it in hot code paths.
   */
  override def toString: String = {
    def rawHex: String = Buf.slowHexString(Buf.ByteBuffer.Shared(value.contents))

    def decoded: String =
      key.lKey match {
        case _: TweetKey.LKey.MetadataKey =>
          StringCodec.fromByteBuffer(value.contents)

        case _: TweetKey.LKey.FieldKey =>
          val blob = TFieldBlobCodec.fromByteBuffer(value.contents)
          s"TFieldBlob(${blob.field}, 0x${Buf.slowHexString(blob.content)})"

        case TweetKey.LKey.Unknown(_) =>
          s"0x$rawHex"
      }

    val renderedContents =
      try decoded
      catch {
        case NonFatal(e) => s"0x$rawHex (failed to decode due to $e)"
      }

    s"$key => ${value.copy(contents = renderedContents)}"
  }
}
/** Function types of the Manhattan operations on tweets, plus the key and value descriptors. */
object ManhattanOperations {
  import scala.util.{Success, Try}

  /** Reads every record stored under a tweet id. */
  type Read = TweetId => Stitch[Seq[TweetManhattanRecord]]

  /** Writes a single record. */
  type Insert = TweetManhattanRecord => Stitch[Unit]

  /** Deletes the record under a key, with an optional timestamp. */
  type Delete = (TweetKey, Option[Time]) => Stitch[Unit]

  /** Deletes everything stored under a tweet id. */
  type DeleteRange = TweetId => Stitch[Unit]

  /** Tweet id to its padded string form; the inverse parses the string as a Long. */
  object PkeyInjection extends Injection[TweetId, String] {
    override def apply(tweetId: TweetId): String = TweetKey.padTweetIdStr(tweetId)
    override def invert(str: String): Try[TweetId] = Try(str.toLong)
  }

  case class InvalidLkey(lkeyStr: String) extends Exception

  /** Lkey to its string form; the inverse always succeeds with whatever `fromString` yields. */
  object LkeyInjection extends Injection[TweetKey.LKey, String] {
    override def apply(lkey: TweetKey.LKey): String = lkey.toString
    override def invert(str: String): Try[TweetKey.LKey] = Success(TweetKey.LKey.fromString(str))
  }

  val KeyDescriptor: DescriptorP1L1.EmptyKey[TweetId, TweetKey.LKey] = {
    val pkeyComponent = Component(PkeyInjection.andThen(StringInjection))
    val lkeyComponent = Component(LkeyInjection.andThen(StringInjection))
    mh.KeyDescriptor(pkeyComponent, lkeyComponent)
  }

  val ValueDescriptor: mh.ValueDescriptor.EmptyValue[ByteBuffer] =
    mh.ValueDescriptor(BufInjection)
}
/**
 * The tweet operations bound to one dataset and one Manhattan endpoint.
 *
 * @param dataset    name of the Manhattan dataset every key is scoped to
 * @param mhEndpoint endpoint that executes the operations
 */
class ManhattanOperations(dataset: String, mhEndpoint: ManhattanKVEndpoint) {
  import ManhattanOperations._

  // Key prefix covering everything stored under one tweet id in this dataset.
  private[this] def pkey(tweetId: TweetId) = KeyDescriptor.withDataset(dataset).withPkey(tweetId)

  /** Slices everything under the tweet's pkey and wraps each entry in a record. */
  def read: Read =
    tweetId =>
      mhEndpoint
        .slice(pkey(tweetId).under(), ValueDescriptor)
        .map { entries =>
          entries.map {
            case (key, value) => TweetManhattanRecord(TweetKey(key.pkey, key.lkey), value)
          }
        }

  /** Inserts the record's value under the record's own key. */
  def insert: Insert =
    record => {
      val TweetKey(tweetId, lKey) = record.key
      mhEndpoint.insert(pkey(tweetId).withLkey(lKey), ValueDescriptor.withValue(record.value))
    }

  /** Deletes the single key, passing the optional timestamp on to the endpoint. */
  def delete: Delete =
    (key, time) => {
      val target = pkey(key.tweetId).withLkey(key.lKey)
      mhEndpoint.delete(target, time)
    }

  /** Deletes everything under the tweet's pkey. */
  def deleteRange: DeleteRange =
    tweetId => mhEndpoint.deleteRange(pkey(tweetId).under())
}

View File

@ -1,451 +0,0 @@
package com.twitter.tweetypie.storage
import com.twitter.conversions.DurationOps._
import com.twitter.finagle.mtls.authentication.EmptyServiceIdentifier
import com.twitter.finagle.mtls.authentication.ServiceIdentifier
import com.twitter.finagle.ssl.OpportunisticTls
import com.twitter.finagle.stats.NullStatsReceiver
import com.twitter.finagle.stats.StatsReceiver
import com.twitter.logging.BareFormatter
import com.twitter.logging.Level
import com.twitter.logging.ScribeHandler
import com.twitter.logging._
import com.twitter.stitch.Stitch
import com.twitter.storage.client.manhattan.bijections.Bijections._
import com.twitter.storage.client.manhattan.kv._
import com.twitter.storage.client.manhattan.kv.impl.ValueDescriptor
import com.twitter.tweetypie.client_id.ClientIdHelper
import com.twitter.tweetypie.storage.Scribe.ScribeHandlerFactory
import com.twitter.tweetypie.storage.TweetStorageClient.BounceDelete
import com.twitter.tweetypie.storage.TweetStorageClient.GetTweet
import com.twitter.tweetypie.storage.TweetStorageClient.HardDeleteTweet
import com.twitter.tweetypie.thriftscala.Tweet
import com.twitter.tweetypie.util.StitchUtils
import com.twitter.util.Duration
import com.twitter.util.Return
import com.twitter.util.Throw
import scala.util.Random
/**
 * Companion of the [[ManhattanTweetStorageClient]] class: configuration model, defaults and
 * factory helpers for building a client.
 */
object ManhattanTweetStorageClient {
  object Config {

    /**
     * The Manhattan dataset where tweets are stored is not externally
     * configurable because writing tweets to a non-production dataset
     * requires great care. Staging instances using a different dataset will
     * write tweets to a non-production store, but will publish events, log to
     * HDFS, and cache data referencing tweets in that store which are not
     * accessible by the rest of the production cluster.
     *
     * In a completely isolated environment it should be safe to write to
     * other datasets for testing purposes.
     */
    val Dataset = "tbird_mh"

    /**
     * Once a tweet has been deleted it can only be undeleted within this time
     * window, after which [[UndeleteHandler]] will return an error on
     * undelete attempts.
     */
    val UndeleteWindowHours = 240

    /**
     * Default label used for underlying Manhattan Thrift client metrics
     *
     * The finagle client metrics will be exported at clnt/:label.
     */
    val ThriftClientLabel = "mh_cylon"

    /**
     * Return the corresponding Wily path for the Cylon cluster in the "other" DC
     *
     * @throws IllegalArgumentException if `zone` is not one of the known zones
     */
    def remoteDestination(zone: String): String =
      s"/srv#/prod/${remoteZone(zone)}/manhattan/cylon.native-thrift"

    // Maps a zone to its counterpart; "localhost" is treated like "atla".
    private def remoteZone(zone: String) = zone match {
      case "pdxa" => "atla"
      case "atla" | "localhost" => "pdxa"
      case _ =>
        throw new IllegalArgumentException(s"Cannot configure remote DC for unknown zone '$zone'")
    }
  }

  /**
   * @param applicationId Manhattan application id used for quota accounting
   * @param localDestination Wily path to local Manhattan cluster
   * @param localTimeout Overall timeout (including retries) for all reads/writes to local cluster
   * @param remoteDestination Wily path to remote Manhattan cluster, used for undelete and force add
   * @param remoteTimeout Overall timeout (including retries) for all reads/writes to remote cluster
   * @param undeleteWindowHours Amount of time during which a deleted tweet can be undeleted
   * @param thriftClientLabel Label used to scope stats for Manhattan Thrift client
   * @param maxRequestsPerBatch Configure the Stitch RequestGroup.Generator batch size
   * @param serviceIdentifier The ServiceIdentifier to use when making connections to a Manhattan cluster
   * @param opportunisticTlsLevel The level to use for opportunistic TLS for connections to the Manhattan cluster
   */
  case class Config(
    applicationId: String,
    localDestination: String,
    localTimeout: Duration,
    remoteDestination: String,
    remoteTimeout: Duration,
    undeleteWindowHours: Int = Config.UndeleteWindowHours,
    thriftClientLabel: String = Config.ThriftClientLabel,
    maxRequestsPerBatch: Int = Int.MaxValue,
    serviceIdentifier: ServiceIdentifier,
    opportunisticTlsLevel: OpportunisticTls.Level)

  /**
   * Sanitizes the input for APIs which take in a (Tweet, Seq[Field]) as input.
   *
   * NOTE: This function only applies sanity checks which are common to
   * all APIs which take in a (Tweet, Seq[Field]) as input. API specific
   * checks are not covered here.
   *
   * @param apiStitch the backing API call
   * @tparam T the output type of the backing API call
   * @return a stitch function which does some basic input sanity checking; it throws
   *         IllegalArgumentException (via `require`) when any field id is not positive
   */
  private[storage] def sanitizeTweetFields[T](
    apiStitch: (Tweet, Seq[Field]) => Stitch[T]
  ): (Tweet, Seq[Field]) => Stitch[T] =
    (tweet, fields) => {
      require(fields.forall(_.id > 0), s"Field ids ${fields} are not positive numbers")
      apiStitch(tweet, fields)
    }

  // Returns a handler that asynchronously logs messages to Scribe using the BareFormatter which
  // logs just the message without any additional metadata
  def scribeHandler(categoryName: String): HandlerFactory =
    ScribeHandler(
      formatter = BareFormatter,
      maxMessagesPerTransaction = 100,
      category = categoryName,
      level = Some(Level.TRACE)
    )

  /**
   * A Config appropriate for interactive sessions and scripts.
   */
  def develConfig(): Config = {
    // System.getenv returns null when USER is not set. Resolve it once, with a fallback, so
    // that neither the application id nor the service identifier is built from null.
    val user = Option(System.getenv("USER")).getOrElse("<unknown>")
    Config(
      applicationId = user + ".devel",
      localDestination = "/s/manhattan/cylon.native-thrift",
      localTimeout = 10.seconds,
      remoteDestination = "/s/manhattan/cylon.native-thrift",
      remoteTimeout = 10.seconds,
      undeleteWindowHours = Config.UndeleteWindowHours,
      thriftClientLabel = Config.ThriftClientLabel,
      maxRequestsPerBatch = Int.MaxValue,
      serviceIdentifier = ServiceIdentifier(user, "tweetypie", "devel", "local"),
      opportunisticTlsLevel = OpportunisticTls.Required
    )
  }

  /**
   * Build a Manhattan tweet storage client for use in interactive
   * sessions and scripts.
   */
  def devel(): TweetStorageClient =
    new ManhattanTweetStorageClient(
      develConfig(),
      NullStatsReceiver,
      ClientIdHelper.default
    )
}
/**
 * [[TweetStorageClient]] implementation backed by Manhattan.
 *
 * On construction it builds one Manhattan client/endpoint for the local destination and one
 * for the remote destination, then wires each storage API to its handler, wrapped in the
 * stats / exception-translation decorators defined below.
 *
 * @param config         destinations, timeouts and client settings
 * @param statsReceiver  root receiver for all metrics emitted by this client
 * @param clientIdHelper used to resolve the calling client id for per-client request counters
 */
class ManhattanTweetStorageClient(
  config: ManhattanTweetStorageClient.Config,
  statsReceiver: StatsReceiver,
  private val clientIdHelper: ClientIdHelper)
    extends TweetStorageClient {
  import ManhattanTweetStorageClient._

  lazy val scribeHandlerFactory: ScribeHandlerFactory = scribeHandler _
  val scribe: Scribe = new Scribe(scribeHandlerFactory, statsReceiver)

  /**
   * Builds a Manhattan KV client for `dest`, labelled `label`. mTLS parameters are only
   * supplied when the configured service identifier is not the empty one.
   */
  def mkClient(
    dest: String,
    label: String
  ): ManhattanKVClient = {
    val mhMtlsParams =
      if (config.serviceIdentifier == EmptyServiceIdentifier) NoMtlsParams
      else
        ManhattanKVClientMtlsParams(
          serviceIdentifier = config.serviceIdentifier,
          opportunisticTls = config.opportunisticTlsLevel
        )
    new ManhattanKVClient(
      config.applicationId,
      dest,
      mhMtlsParams,
      label,
      Seq(Experiments.ApertureLoadBalancer))
  }

  val localClient: ManhattanKVClient = mkClient(config.localDestination, config.thriftClientLabel)
  val localMhEndpoint: ManhattanKVEndpoint = ManhattanKVEndpointBuilder(localClient)
    .defaultGuarantee(Guarantee.SoftDcReadMyWrites)
    .defaultMaxTimeout(config.localTimeout)
    .maxRequestsPerBatch(config.maxRequestsPerBatch)
    .build()
  val localManhattanOperations = new ManhattanOperations(Config.Dataset, localMhEndpoint)

  // The remote client gets its own stats label; unlike the local endpoint, no
  // maxRequestsPerBatch is configured on the remote endpoint.
  val remoteClient: ManhattanKVClient =
    mkClient(config.remoteDestination, s"${config.thriftClientLabel}_remote")
  val remoteMhEndpoint: ManhattanKVEndpoint = ManhattanKVEndpointBuilder(remoteClient)
    .defaultGuarantee(Guarantee.SoftDcReadMyWrites)
    .defaultMaxTimeout(config.remoteTimeout)
    .build()
  val remoteManhattanOperations = new ManhattanOperations(Config.Dataset, remoteMhEndpoint)

  /**
   * Note: This translation is only useful for non-batch endpoints. Batch endpoints currently
   * represent failure without propagating an exception
   * (e.g. [[com.twitter.tweetypie.storage.Response.TweetResponseCode.Failure]]).
   */
  private[this] def translateExceptions(
    apiName: String,
    statsReceiver: StatsReceiver
  ): PartialFunction[Throwable, Throwable] = {
    case e: IllegalArgumentException => ClientError(e.getMessage, e)
    case e: DeniedManhattanException => RateLimited(e.getMessage, e)
    case e: VersionMismatchError =>
      // Counted, then passed through unchanged.
      statsReceiver.scope(apiName).counter("mh_version_mismatches").incr()
      e
    case e: InternalError =>
      // Logged, then passed through unchanged.
      TweetUtils.log.error(e, s"Error processing $apiName request: ${e.getMessage}")
      e
  }

  /**
   * Count requests per client id producing metrics of the form
   * .../clients/:root_client_id/requests
   *
   * The client id is resolved on every call; `ClientIdHelper.UnknownClientId` is used when
   * none is available.
   */
  def observeClientId[A, B](
    apiStitch: A => Stitch[B],
    statsReceiver: StatsReceiver,
    clientIdHelper: ClientIdHelper
  ): A => Stitch[B] = {
    val clients = statsReceiver.scope("clients")
    a => {
      val clientId = clientIdHelper.effectiveClientIdRoot.getOrElse(ClientIdHelper.UnknownClientId)
      clients.counter(clientId, "requests").incr()
      apiStitch(a)
    }
  }

  /**
   * Increment counters based on the overall response status of the returned [[GetTweet.Response]].
   *
   * All counters are created once, when the wrapper is built, not per request.
   */
  def observeGetTweetResponseCode[A](
    apiStitch: A => Stitch[GetTweet.Response],
    statsReceiver: StatsReceiver
  ): A => Stitch[GetTweet.Response] = {
    val scope = statsReceiver.scope("response_code")
    val success = scope.counter("success")
    val notFound = scope.counter("not_found")
    val failure = scope.counter("failure")
    val overCapacity = scope.counter("over_capacity")
    val deleted = scope.counter("deleted")
    val bounceDeleted = scope.counter("bounce_deleted")
    a =>
      apiStitch(a).respond {
        case Return(_: GetTweet.Response.Found) => success.incr()
        case Return(GetTweet.Response.NotFound) => notFound.incr()
        case Return(_: GetTweet.Response.BounceDeleted) => bounceDeleted.incr()
        case Return(GetTweet.Response.Deleted) => deleted.incr()
        case Throw(_: RateLimited) => overCapacity.incr()
        case Throw(_) => failure.incr()
      }
  }

  /**
   * We do 3 things here:
   *
   * - Bookkeeping for overall requests
   * - Bookkeeping for per api requests
   * - Translate exceptions
   *
   * @param apiName the API being called
   * @param apiStitch the implementation of the API
   * @tparam A template for input type of API
   * @tparam B template for output type of API
   * @return Function which executes the given API call
   */
  private[storage] def endpoint[A, B](
    apiName: String,
    apiStitch: A => Stitch[B]
  ): A => Stitch[B] = {
    val translateException = translateExceptions(apiName, statsReceiver)
    val observe = StitchUtils.observe[B](statsReceiver, apiName)
    a =>
      StitchUtils.translateExceptions(
        observe(apiStitch(a)),
        translateException
      )
  }

  /**
   * Two-argument variant of [[endpoint]]: tuples the arguments, applies the same wrapping,
   * and untuples the result.
   *
   * NOTE(review): `clientIdHelper` is accepted but not used by this method.
   */
  private[storage] def endpoint2[A, B, C](
    apiName: String,
    apiStitch: (A, B) => Stitch[C],
    clientIdHelper: ClientIdHelper
  ): (A, B) => Stitch[C] =
    Function.untupled(endpoint(apiName, apiStitch.tupled))

  val getTweet: TweetStorageClient.GetTweet = {
    val stats = statsReceiver.scope("getTweet")
    observeClientId(
      observeGetTweetResponseCode(
        endpoint(
          "getTweet",
          GetTweetHandler(
            read = localManhattanOperations.read,
            statsReceiver = stats
          )
        ),
        stats
      ),
      stats,
      clientIdHelper
    )
  }

  val getStoredTweet: TweetStorageClient.GetStoredTweet = {
    val stats = statsReceiver.scope("getStoredTweet")
    observeClientId(
      endpoint(
        "getStoredTweet",
        GetStoredTweetHandler(
          read = localManhattanOperations.read,
          statsReceiver = stats
        )
      ),
      stats,
      clientIdHelper
    )
  }

  val addTweet: TweetStorageClient.AddTweet =
    endpoint(
      "addTweet",
      AddTweetHandler(
        insert = localManhattanOperations.insert,
        scribe = scribe,
        stats = statsReceiver
      )
    )

  val updateTweet: TweetStorageClient.UpdateTweet =
    endpoint2(
      "updateTweet",
      ManhattanTweetStorageClient.sanitizeTweetFields(
        UpdateTweetHandler(
          insert = localManhattanOperations.insert,
          stats = statsReceiver
        )
      ),
      clientIdHelper
    )

  val softDelete: TweetStorageClient.SoftDelete =
    endpoint(
      "softDelete",
      SoftDeleteHandler(
        insert = localManhattanOperations.insert,
        scribe = scribe
      )
    )

  val bounceDelete: BounceDelete =
    endpoint(
      "bounceDelete",
      BounceDeleteHandler(
        insert = localManhattanOperations.insert,
        scribe = scribe
      )
    )

  // The only API that also writes to the remote cluster (via `remoteInsert`).
  val undelete: TweetStorageClient.Undelete =
    endpoint(
      "undelete",
      UndeleteHandler(
        read = localManhattanOperations.read,
        localInsert = localManhattanOperations.insert,
        remoteInsert = remoteManhattanOperations.insert,
        delete = localManhattanOperations.delete,
        undeleteWindowHours = config.undeleteWindowHours,
        stats = statsReceiver
      )
    )

  val getDeletedTweets: TweetStorageClient.GetDeletedTweets =
    endpoint(
      "getDeletedTweets",
      GetDeletedTweetsHandler(
        read = localManhattanOperations.read,
        stats = statsReceiver
      )
    )

  val deleteAdditionalFields: TweetStorageClient.DeleteAdditionalFields =
    endpoint2(
      "deleteAdditionalFields",
      DeleteAdditionalFieldsHandler(
        delete = localManhattanOperations.delete,
        stats = statsReceiver
      ),
      clientIdHelper
    )

  val scrub: TweetStorageClient.Scrub =
    endpoint2(
      "scrub",
      ScrubHandler(
        insert = localManhattanOperations.insert,
        delete = localManhattanOperations.delete,
        scribe = scribe,
        stats = statsReceiver
      ),
      clientIdHelper
    )

  val hardDeleteTweet: HardDeleteTweet =
    endpoint(
      "hardDeleteTweet",
      HardDeleteTweetHandler(
        read = localManhattanOperations.read,
        insert = localManhattanOperations.insert,
        delete = localManhattanOperations.delete,
        scribe = scribe,
        stats = statsReceiver
      )
    )

  /**
   * Issues a single `get` for a random key against the local endpoint and discards the value.
   */
  val ping: TweetStorageClient.Ping =
    () => {
      // Clear the sign bit instead of calling `.abs`: `Long.MinValue.abs` overflows and is
      // still negative, so `.abs` does not guarantee a non-negative key.
      val randomTweetId = Random.nextLong() & Long.MaxValue
      Stitch
        .run(
          localMhEndpoint
            .get(
              ManhattanOperations.KeyDescriptor
                .withDataset(Config.Dataset)
                .withPkey(randomTweetId)
                .withLkey(TweetKey.LKey.CoreFieldsKey), // could be any lkey
              ValueDescriptor(BufInjection)
            ).unit
        )
    }
}

View File

@ -1,30 +0,0 @@
package com.twitter.tweetypie.storage
/**
 * Response model shared by the tweet storage APIs: an overall per-tweet response code plus
 * optional per-additional-field responses.
 *
 * The codes are `case object`s so that they print as their own name (e.g. "Success") in logs
 * and test failures. The sealed traits extend `Product with Serializable` so that the
 * inferred common type of several codes is still just the trait.
 */
object Response {

  /**
   * @param tweetId                  id of the tweet this response refers to
   * @param overallResponse          overall outcome for the tweet
   * @param additionalFieldResponses optional per-field outcomes, keyed by a Short
   *                                 (presumably the field id; confirm against callers)
   */
  case class TweetResponse(
    tweetId: Long,
    overallResponse: TweetResponseCode,
    additionalFieldResponses: Option[Map[Short, FieldResponse]] = None)

  /** Overall outcome of a storage operation for one tweet. */
  sealed trait TweetResponseCode extends Product with Serializable
  object TweetResponseCode {
    case object Success extends TweetResponseCode
    case object Partial extends TweetResponseCode
    case object Failure extends TweetResponseCode
    case object OverCapacity extends TweetResponseCode
    case object Deleted extends TweetResponseCode
  }

  /**
   * @param code    outcome for a single field
   * @param message optional human-readable detail accompanying `code`
   */
  case class FieldResponse(code: FieldResponseCode, message: Option[String] = None)

  /** Outcome of a storage operation for one field. */
  sealed trait FieldResponseCode extends Product with Serializable
  object FieldResponseCode {
    case object Success extends FieldResponseCode
    case object InvalidRequest extends FieldResponseCode
    case object ValueNotFound extends FieldResponseCode
    case object Timeout extends FieldResponseCode
    case object Error extends FieldResponseCode
  }
}

View File

@ -1,85 +0,0 @@
package com.twitter.tweetypie.storage
import com.twitter.servo.util.FutureEffect
import com.twitter.finagle.stats.StatsReceiver
import com.twitter.logging._
import com.twitter.scrooge.BinaryThriftStructSerializer
import com.twitter.servo.util.{Scribe => ServoScribe}
import com.twitter.tweetypie.storage_internal.thriftscala._
import com.twitter.tbird.thriftscala.Added
import com.twitter.tbird.thriftscala.Removed
import com.twitter.tbird.thriftscala.Scrubbed
import com.twitter.util.Time
/**
 * Scribe is used to log tweet writes which are used to generate /tables/statuses in HDFS.
 *
 * {{{
 * Write    Scribe Category        Message
 * -----    ---------------        -------
 * add      tbird_add_status       com.twitter.tbird.thriftscala.Added
 * remove   tbird_remove_status    com.twitter.tbird.thriftscala.Removed
 * scrub    tbird_scrub_status     com.twitter.tbird.thriftscala.Scrubbed
 * }}}
 *
 * The thrift representation is encoded using binary thrift protocol format, followed by base64
 * encoding and converted to string using default character set (utf8). The logger uses
 * BareFormatter.
 *
 * The thrift ops are scribed only after the write API call has succeeded.
 *
 * The class is thread safe except for the initial configuration and registration routines,
 * and no exception is expected unless the java heap is out of memory.
 *
 * If an exception does get thrown, add/remove/scrub operations will fail and the client will
 * have to retry.
 *
 * @param factory       builds the logging handler for a given scribe category
 * @param statsReceiver receiver for the per-operation scribe counters
 */
class Scribe(factory: Scribe.ScribeHandlerFactory, statsReceiver: StatsReceiver) {
  import Scribe._

  // One serializer per scribed message type.
  private val addedCodec = BinaryThriftStructSerializer(Added)
  private val removedCodec = BinaryThriftStructSerializer(Removed)
  private val scrubbedCodec = BinaryThriftStructSerializer(Scrubbed)

  // Incremented once per logAdded / logRemoved / logScrubbed call.
  private val addsScribed = statsReceiver.counter("scribe/add/count")
  private val removesScribed = statsReceiver.counter("scribe/remove/count")
  private val scrubsScribed = statsReceiver.counter("scribe/scrub/count")

  val addHandler: FutureEffect[String] = ServoScribe(factory(scribeAddedCategory)())
  val removeHandler: FutureEffect[String] = ServoScribe(factory(scribeRemovedCategory)())
  val scrubHandler: FutureEffect[String] = ServoScribe(factory(scribeScrubbedCategory)())

  /** Scribes an `Added` message for `tweet`, stamped with the current time. */
  def logAdded(tweet: StoredTweet): Unit = {
    val event = Added(StatusConversions.toTBirdStatus(tweet), Time.now.inMilliseconds, Some(false))
    addHandler(addedCodec.toString(event))
    addsScribed.incr()
  }

  /** Scribes a `Removed` message for tweet `id` removed at `at`. */
  def logRemoved(id: Long, at: Time, isSoftDeleted: Boolean): Unit = {
    val event = Removed(id, at.inMilliseconds, Some(isSoftDeleted))
    removeHandler(removedCodec.toString(event))
    removesScribed.incr()
  }

  /** Scribes a `Scrubbed` message listing the columns `cols` scrubbed from tweet `id` at `at`. */
  def logScrubbed(id: Long, cols: Seq[Int], at: Time): Unit = {
    val event = Scrubbed(id, cols, at.inMilliseconds)
    scrubHandler(scrubbedCodec.toString(event))
    scrubsScribed.incr()
  }
}
// Companion of the Scribe class: the handler-factory type and the scribe category names.
object Scribe {
// Given a scribe category name, yields the HandlerFactory used to log to that category.
type ScribeHandlerFactory = (String) => HandlerFactory
/** WARNING: These categories are allow-listed. If you change them, the new categories must be
* allow-listed as well. Follow up with the CoreWorkflows team (CW) for that.
*/
// Category for scribed tweet additions.
private val scribeAddedCategory = "tbird_add_status"
// Category for scribed tweet removals.
private val scribeRemovedCategory = "tbird_remove_status"
// Category for scribed field scrubs.
private val scribeScrubbedCategory = "tbird_scrub_status"
}

View File

@ -1,71 +0,0 @@
package com.twitter.tweetypie.storage
import com.twitter.finagle.stats.StatsReceiver
import com.twitter.stitch.Stitch
import com.twitter.storage.client.manhattan.kv.ManhattanValue
import com.twitter.tweetypie.storage.TweetUtils._
import com.twitter.util.Time
/**
 * Provides scrub functionality: deletes the stored data of each scrubbed field and then
 * writes a metadata record marking the field as scrubbed.
 *
 * Right now only the geo field may be scrubbed. It should be simple to add more fields to the
 * allowlist if needed.
 */
object ScrubHandler {
  val scrubFieldsAllowlist: Set[Field] = Set(Field.Geo)

  /**
   * @param insert used to write the "scrubbed" marker records
   * @param delete used to delete the field data
   * @param scribe receives one scrub event per tweet once the whole request has succeeded
   * @param stats  receiver for the request-width stat
   * @return the scrub operation; it throws IllegalArgumentException (via `require`) when
   *         `columns` is empty, contains duplicates, or names a field outside the allowlist
   */
  def apply(
    insert: ManhattanOperations.Insert,
    delete: ManhattanOperations.Delete,
    scribe: Scribe,
    stats: StatsReceiver
  ): TweetStorageClient.Scrub =
    (unfilteredTweetIds: Seq[TweetId], columns: Seq[Field]) => {
      // Non-positive ids are dropped silently.
      val tweetIds = unfilteredTweetIds.filter(_ > 0)

      require(columns.nonEmpty, "Must specify fields to scrub")
      require(
        columns.distinct.size == columns.size,
        s"Duplicate fields to scrub specified: $columns"
      )
      require(
        columns.forall(scrubFieldsAllowlist),
        s"Cannot scrub $columns; scrubbable fields are restricted to $scrubFieldsAllowlist"
      )

      Stats.addWidthStat("scrub", "ids", tweetIds.size, stats)

      // A single timestamp is shared by every delete, marker record and scribe event.
      val mhTimestamp = Time.now

      // Scrubs one tweet: delete all field data first, and only then write the markers.
      def scrubTweet(tweetId: TweetId) = {
        val fieldDeletes = columns.map { field =>
          delete(TweetKey.fieldKey(tweetId, field.id), Some(mhTimestamp)).liftToTry
        }
        val allFieldsDeleted =
          Stitch.collect(fieldDeletes).map(collectWithRateLimitCheck).lowerFromTry

        val markersWritten = allFieldsDeleted.flatMap { _ =>
          val markerWrites = columns.map { field =>
            val marker = TweetManhattanRecord(
              TweetKey.scrubbedFieldKey(tweetId, field.id),
              ManhattanValue(StringCodec.toByteBuffer(""), Some(mhTimestamp))
            )
            insert(marker).liftToTry
          }
          Stitch.collect(markerWrites)
        }
        markersWritten.map(collectWithRateLimitCheck)
      }

      val perTweet = tweetIds.map(scrubTweet)

      Stitch
        .collect(perTweet)
        .map(collectWithRateLimitCheck)
        .lowerFromTry
        .onSuccess { _ =>
          val columnIds = columns.map(_.id.toInt)
          tweetIds.foreach(id => scribe.logScrubbed(id, columnIds, mhTimestamp))
        }
    }
}

View File

@ -1,20 +0,0 @@
package com.twitter.tweetypie.storage
import com.twitter.util.Time
/**
 * Builds the soft-delete operation: writes a soft-deleted state record for the tweet and,
 * once that write has succeeded, scribes the removal.
 */
object SoftDeleteHandler {

  /**
   * @param insert used to write the soft-deleted state record
   * @param scribe receives the removal event after a successful write
   */
  def apply(
    insert: ManhattanOperations.Insert,
    scribe: Scribe
  ): TweetStorageClient.SoftDelete = { tweetId =>
    // The same instant is stored in the state record and reported to scribe.
    val deletedAt = Time.now
    val stateRecord = TweetStateRecord.SoftDeleted(tweetId, deletedAt.inMillis).toTweetMhRecord
    val write = insert(stateRecord)
    write.onSuccess { _ =>
      scribe.logRemoved(tweetId, deletedAt, isSoftDeleted = true)
    }
  }
}

View File

@ -1,33 +0,0 @@
package com.twitter.tweetypie.storage
import com.twitter.finagle.stats.StatsReceiver
/** Small helpers for the metrics recorded by the storage APIs. */
object Stats {
  // Both helpers below are called per RPC call for most APIs, so we rely on the stats receiver
  // that is passed in to the library to do memoization.

  /** Adds `width` to the stat `rpcName/paramName/width`. */
  private[storage] def addWidthStat(
    rpcName: String,
    paramName: String,
    width: Int,
    stats: StatsReceiver
  ): Unit =
    stats.scope(rpcName, paramName).stat("width").add(width)

  /**
   * Increments, by `count`, the counter `rpcName/fields/<fieldId>/count` of every given field.
   * The idea is to expose the QPS for each additional field.
   */
  private[storage] def updatePerFieldQpsCounters(
    rpcName: String,
    fieldIds: Seq[FieldId],
    count: Int,
    stats: StatsReceiver
  ): Unit =
    for (fieldId <- fieldIds)
      stats.scope(rpcName, "fields", fieldId.toString).counter("count").incr(count)
}

View File

@ -1,129 +0,0 @@
package com.twitter.tweetypie.storage
import com.twitter.tweetypie.storage_internal.thriftscala._
import com.twitter.tbird.{thriftscala => tbird}
/** Conversions between [[StoredTweet]] and the tbird `Status` thrift struct used for scribing. */
object StatusConversions {

  // Unwraps a field that must be present in order to scribe the tweet. Fails with the same
  // exception type as `Option.get`, but the message names the tweet and the missing field.
  private def required[A](value: Option[A], fieldName: String, tweetId: Long): A =
    value.getOrElse(
      throw new NoSuchElementException(
        s"StoredTweet $tweetId is missing required field '$fieldName'"
      )
    )

  /**
   * This is used only in Scribe.scala, when scribing to tbird_add_status
   * Once we remove that, we can also remove this.
   *
   * @throws NoSuchElementException if userId, text, createdVia or createdAtSec is not set
   */
  def toTBirdStatus(tweet: StoredTweet): tbird.Status =
    tbird.Status(
      id = tweet.id,
      userId = required(tweet.userId, "userId", tweet.id),
      text = required(tweet.text, "text", tweet.id),
      createdVia = required(tweet.createdVia, "createdVia", tweet.id),
      createdAtSec = required(tweet.createdAtSec, "createdAtSec", tweet.id),
      reply = tweet.reply.map(toTBirdReply),
      share = tweet.share.map(toTBirdShare),
      contributorId = tweet.contributorId,
      geo = tweet.geo.map(toTBirdGeo),
      hasTakedown = tweet.hasTakedown.getOrElse(false),
      nsfwUser = tweet.nsfwUser.getOrElse(false),
      nsfwAdmin = tweet.nsfwAdmin.getOrElse(false),
      media = tweet.media.map(_.map(toTBirdMedia)).getOrElse(Seq()),
      narrowcast = tweet.narrowcast.map(toTBirdNarrowcast),
      nullcast = tweet.nullcast.getOrElse(false),
      trackingId = tweet.trackingId
    )

  /**
   * This is only used in a test, to verify that the above method `toTBirdStatus`
   * works, so we can't remove it as long as the above method exists.
   */
  def fromTBirdStatus(status: tbird.Status): StoredTweet = {
    StoredTweet(
      id = status.id,
      userId = Some(status.userId),
      text = Some(status.text),
      createdVia = Some(status.createdVia),
      createdAtSec = Some(status.createdAtSec),
      reply = status.reply.map(fromTBirdReply),
      share = status.share.map(fromTBirdShare),
      contributorId = status.contributorId,
      geo = status.geo.map(fromTBirdGeo),
      hasTakedown = Some(status.hasTakedown),
      nsfwUser = Some(status.nsfwUser),
      nsfwAdmin = Some(status.nsfwAdmin),
      media = Some(status.media.map(fromTBirdMedia)),
      narrowcast = status.narrowcast.map(fromTBirdNarrowcast),
      nullcast = Some(status.nullcast),
      trackingId = status.trackingId
    )
  }

  private def fromTBirdReply(reply: tbird.Reply): StoredReply =
    StoredReply(
      inReplyToStatusId = reply.inReplyToStatusId,
      inReplyToUserId = reply.inReplyToUserId
    )

  private def fromTBirdShare(share: tbird.Share): StoredShare =
    StoredShare(
      sourceStatusId = share.sourceStatusId,
      sourceUserId = share.sourceUserId,
      parentStatusId = share.parentStatusId
    )

  private def fromTBirdGeo(geo: tbird.Geo): StoredGeo =
    StoredGeo(
      latitude = geo.latitude,
      longitude = geo.longitude,
      geoPrecision = geo.geoPrecision,
      entityId = geo.entityId,
      // `toTBirdGeo` writes the name, so it has to be read back here for the round trip to
      // be lossless.
      name = geo.name
    )

  private def fromTBirdMedia(media: tbird.MediaEntity): StoredMediaEntity =
    StoredMediaEntity(
      id = media.id,
      mediaType = media.mediaType,
      width = media.width,
      height = media.height
    )

  // The tbird struct carries plain sequences; the stored form wraps each of them in Some.
  private def fromTBirdNarrowcast(narrowcast: tbird.Narrowcast): StoredNarrowcast =
    StoredNarrowcast(
      language = Some(narrowcast.language),
      location = Some(narrowcast.location),
      ids = Some(narrowcast.ids)
    )

  private def toTBirdReply(reply: StoredReply): tbird.Reply =
    tbird.Reply(
      inReplyToStatusId = reply.inReplyToStatusId,
      inReplyToUserId = reply.inReplyToUserId
    )

  private def toTBirdShare(share: StoredShare): tbird.Share =
    tbird.Share(
      sourceStatusId = share.sourceStatusId,
      sourceUserId = share.sourceUserId,
      parentStatusId = share.parentStatusId
    )

  private def toTBirdGeo(geo: StoredGeo): tbird.Geo =
    tbird.Geo(
      latitude = geo.latitude,
      longitude = geo.longitude,
      geoPrecision = geo.geoPrecision,
      entityId = geo.entityId,
      name = geo.name
    )

  private def toTBirdMedia(media: StoredMediaEntity): tbird.MediaEntity =
    tbird.MediaEntity(
      id = media.id,
      mediaType = media.mediaType,
      width = media.width,
      height = media.height
    )

  // Unset stored sequences become empty sequences in the tbird struct.
  private def toTBirdNarrowcast(narrowcast: StoredNarrowcast): tbird.Narrowcast =
    tbird.Narrowcast(
      language = narrowcast.language.getOrElse(Nil),
      location = narrowcast.location.getOrElse(Nil),
      ids = narrowcast.ids.getOrElse(Nil)
    )
}

View File

@ -1,346 +0,0 @@
package com.twitter.tweetypie.storage
import com.twitter.mediaservices.commons.tweetmedia.thriftscala._
import com.twitter.scrooge.TFieldBlob
import com.twitter.tweetypie.additionalfields.AdditionalFields
import com.twitter.tweetypie.storage_internal.thriftscala._
import com.twitter.tweetypie.thriftscala._
import com.twitter.tweetypie.util.TweetLenses
object StorageConversions {
private val tbTweetCompiledAdditionalFieldIds =
StoredTweet.metaData.fields.map(_.id).filter(AdditionalFields.isAdditionalFieldId)
def toStoredReply(reply: Reply, conversationId: Option[TweetId]): StoredReply =
StoredReply(
inReplyToStatusId = reply.inReplyToStatusId.getOrElse(0),
inReplyToUserId = reply.inReplyToUserId,
conversationId = conversationId
)
def toStoredShare(share: Share): StoredShare =
StoredShare(
share.sourceStatusId,
share.sourceUserId,
share.parentStatusId
)
def toStoredQuotedTweet(qt: QuotedTweet, text: String): Option[StoredQuotedTweet] =
qt.permalink
.filterNot { p =>
text.contains(p.shortUrl)
} // omit StoredQuotedTweet when url already in text
.map { p =>
StoredQuotedTweet(
qt.tweetId,
qt.userId,
p.shortUrl
)
}
def toStoredGeo(tweet: Tweet): Option[StoredGeo] =
TweetLenses.geoCoordinates.get(tweet) match {
case None =>
TweetLenses.placeId.get(tweet) match {
case None => None
case Some(placeId) =>
Some(
StoredGeo(
latitude = 0.0,
longitude = 0.0,
geoPrecision = 0,
entityId = 0,
name = Some(placeId)
)
)
}
case Some(coords) =>
Some(
StoredGeo(
latitude = coords.latitude,
longitude = coords.longitude,
geoPrecision = coords.geoPrecision,
entityId = if (coords.display) 2 else 0,
name = TweetLenses.placeId.get(tweet)
)
)
}
def toStoredMedia(mediaList: Seq[MediaEntity]): Seq[StoredMediaEntity] =
mediaList.filter(_.sourceStatusId.isEmpty).flatMap(toStoredMediaEntity)
def toStoredMediaEntity(media: MediaEntity): Option[StoredMediaEntity] =
media.sizes.find(_.sizeType == MediaSizeType.Orig).map { origSize =>
StoredMediaEntity(
id = media.mediaId,
mediaType = origSize.deprecatedContentType.value.toByte,
width = origSize.width.toShort,
height = origSize.height.toShort
)
}
// The language and ids fields are for compatibility with existing tweets stored in manhattan.
def toStoredNarrowcast(narrowcast: Narrowcast): StoredNarrowcast =
StoredNarrowcast(
language = Some(Seq.empty),
location = Some(narrowcast.location),
ids = Some(Seq.empty)
)
def toStoredAdditionalFields(from: Seq[TFieldBlob], to: StoredTweet): StoredTweet =
from.foldLeft(to) { case (t, f) => t.setField(f) }
def toStoredAdditionalFields(from: Tweet, to: StoredTweet): StoredTweet =
toStoredAdditionalFields(AdditionalFields.additionalFields(from), to)
def toStoredTweet(tweet: Tweet): StoredTweet = {
val storedTweet =
StoredTweet(
id = tweet.id,
userId = Some(TweetLenses.userId(tweet)),
text = Some(TweetLenses.text(tweet)),
createdVia = Some(TweetLenses.createdVia(tweet)),
createdAtSec = Some(TweetLenses.createdAt(tweet)),
reply =
TweetLenses.reply(tweet).map { r => toStoredReply(r, TweetLenses.conversationId(tweet)) },
share = TweetLenses.share(tweet).map(toStoredShare),
contributorId = tweet.contributor.map(_.userId),
geo = toStoredGeo(tweet),
hasTakedown = Some(TweetLenses.hasTakedown(tweet)),
nsfwUser = Some(TweetLenses.nsfwUser(tweet)),
nsfwAdmin = Some(TweetLenses.nsfwAdmin(tweet)),
media = tweet.media.map(toStoredMedia),
narrowcast = TweetLenses.narrowcast(tweet).map(toStoredNarrowcast),
nullcast = Some(TweetLenses.nullcast(tweet)),
trackingId = TweetLenses.trackingId(tweet),
quotedTweet = TweetLenses.quotedTweet(tweet).flatMap { qt =>
toStoredQuotedTweet(qt, TweetLenses.text(tweet))
}
)
toStoredAdditionalFields(tweet, storedTweet)
}
/**
* Does not need core data to be set. Constructs on disk tweet by avoiding the TweetLenses object
* and only extracting the specified fields.
*
* NOTE: Assumes that specified fields are set in the tweet.
*
* @param tpTweet Tweetypie Tweet to be converted
* @param fields the fields to be populated in the on disk Tweet
*
* @return an on disk Tweet which has only the specified fields set
*/
def toStoredTweetForFields(tpTweet: Tweet, fields: Set[Field]): StoredTweet = {
// Make sure all the passed in fields are known or additional fields
require(
(fields -- Field.AllUpdatableCompiledFields)
.forall(field => AdditionalFields.isAdditionalFieldId(field.id))
)
val storedTweet =
StoredTweet(
id = tpTweet.id,
geo = if (fields.contains(Field.Geo)) {
tpTweet.coreData.get.coordinates match {
case None =>
tpTweet.coreData.get.placeId match {
case None => None
case Some(placeId) =>
Some(
StoredGeo(
latitude = 0.0,
longitude = 0.0,
geoPrecision = 0,
entityId = 0,
name = Some(placeId)
)
)
}
case Some(coords) =>
Some(
StoredGeo(
latitude = coords.latitude,
longitude = coords.longitude,
geoPrecision = coords.geoPrecision,
entityId = if (coords.display) 2 else 0,
name = tpTweet.coreData.get.placeId
)
)
}
} else {
None
},
hasTakedown =
if (fields.contains(Field.HasTakedown))
Some(tpTweet.coreData.get.hasTakedown)
else
None,
nsfwUser =
if (fields.contains(Field.NsfwUser))
Some(tpTweet.coreData.get.nsfwUser)
else
None,
nsfwAdmin =
if (fields.contains(Field.NsfwAdmin))
Some(tpTweet.coreData.get.nsfwAdmin)
else
None
)
if (fields.map(_.id).exists(AdditionalFields.isAdditionalFieldId))
toStoredAdditionalFields(tpTweet, storedTweet)
else
storedTweet
}
def fromStoredReply(reply: StoredReply): Reply =
Reply(
Some(reply.inReplyToStatusId).filter(_ > 0),
reply.inReplyToUserId
)
def fromStoredShare(share: StoredShare): Share =
Share(
share.sourceStatusId,
share.sourceUserId,
share.parentStatusId
)
def fromStoredQuotedTweet(qt: StoredQuotedTweet): QuotedTweet =
QuotedTweet(
qt.tweetId,
qt.userId,
Some(
ShortenedUrl(
shortUrl = qt.shortUrl,
longUrl = "", // will be hydrated later via tweetypie's QuotedTweetRefUrlsHydrator
displayText = "" //will be hydrated later via tweetypie's QuotedTweetRefUrlsHydrator
)
)
)
def fromStoredGeo(geo: StoredGeo): GeoCoordinates =
GeoCoordinates(
latitude = geo.latitude,
longitude = geo.longitude,
geoPrecision = geo.geoPrecision,
display = geo.entityId == 2
)
def fromStoredMediaEntity(media: StoredMediaEntity): MediaEntity =
MediaEntity(
fromIndex = -1, // will get filled in later
toIndex = -1, // will get filled in later
url = null, // will get filled in later
mediaPath = "", // field is obsolete
mediaUrl = null, // will get filled in later
mediaUrlHttps = null, // will get filled in later
displayUrl = null, // will get filled in later
expandedUrl = null, // will get filled in later
mediaId = media.id,
nsfw = false,
sizes = Set(
MediaSize(
sizeType = MediaSizeType.Orig,
resizeMethod = MediaResizeMethod.Fit,
deprecatedContentType = MediaContentType(media.mediaType),
width = media.width,
height = media.height
)
)
)
def fromStoredNarrowcast(narrowcast: StoredNarrowcast): Narrowcast =
Narrowcast(
location = narrowcast.location.getOrElse(Seq())
)
def fromStoredTweet(storedTweet: StoredTweet): Tweet = {
val coreData =
TweetCoreData(
userId = storedTweet.userId.get,
text = storedTweet.text.get,
createdVia = storedTweet.createdVia.get,
createdAtSecs = storedTweet.createdAtSec.get,
reply = storedTweet.reply.map(fromStoredReply),
share = storedTweet.share.map(fromStoredShare),
hasTakedown = storedTweet.hasTakedown.getOrElse(false),
nsfwUser = storedTweet.nsfwUser.getOrElse(false),
nsfwAdmin = storedTweet.nsfwAdmin.getOrElse(false),
narrowcast = storedTweet.narrowcast.map(fromStoredNarrowcast),
nullcast = storedTweet.nullcast.getOrElse(false),
trackingId = storedTweet.trackingId,
conversationId = storedTweet.reply.flatMap(_.conversationId),
placeId = storedTweet.geo.flatMap(_.name),
coordinates = storedTweet.geo.map(fromStoredGeo),
hasMedia = if (storedTweet.media.exists(_.nonEmpty)) Some(true) else None
)
// retweets should never have their media, but some tweets incorrectly do.
val storedMedia = if (coreData.share.isDefined) Nil else storedTweet.media.toSeq
val tpTweet =
Tweet(
id = storedTweet.id,
coreData = Some(coreData),
contributor = storedTweet.contributorId.map(Contributor(_)),
media = Some(storedMedia.flatten.map(fromStoredMediaEntity)),
mentions = Some(Seq.empty),
urls = Some(Seq.empty),
cashtags = Some(Seq.empty),
hashtags = Some(Seq.empty),
quotedTweet = storedTweet.quotedTweet.map(fromStoredQuotedTweet)
)
fromStoredAdditionalFields(storedTweet, tpTweet)
}
def fromStoredTweetAllowInvalid(storedTweet: StoredTweet): Tweet = {
fromStoredTweet(
storedTweet.copy(
userId = storedTweet.userId.orElse(Some(-1L)),
text = storedTweet.text.orElse(Some("")),
createdVia = storedTweet.createdVia.orElse(Some("")),
createdAtSec = storedTweet.createdAtSec.orElse(Some(-1L))
))
}
/**
 * Copies additional-field blobs from `from` onto `to`.
 *
 * The blobs come from two places: the fields selected by `tbTweetCompiledAdditionalFieldIds`,
 * and the passthrough fields whose id satisfies `AdditionalFields.isAdditionalFieldId`. On an
 * id collision the passthrough entry wins, because it is the right-hand side of `++`.
 */
def fromStoredAdditionalFields(from: StoredTweet, to: Tweet): Tweet = {
  val compiledFields = from.getFieldBlobs(tbTweetCompiledAdditionalFieldIds)
  val passthroughFields =
    from._passthroughFields.filterKeys(AdditionalFields.isAdditionalFieldId)
  val mergedFields = compiledFields ++ passthroughFields
  mergedFields.values.foldLeft(to)((tweet, blob) => tweet.setField(blob))
}
/**
 * Builds a [[DeletedTweet]] from a [[StoredTweet]]. If a note-tweet field blob is present it is
 * decoded, and its `id` and `isExpandable` are copied onto the result.
 */
def toDeletedTweet(storedTweet: StoredTweet): DeletedTweet = {
  val decodedNoteTweet =
    storedTweet
      .getFieldBlob(Tweet.NoteTweetField.id)
      .map(blob => NoteTweet.decode(blob.read))

  DeletedTweet(
    id = storedTweet.id,
    userId = storedTweet.userId,
    text = storedTweet.text,
    createdAtSecs = storedTweet.createdAtSec,
    share = storedTweet.share.map(toDeletedShare),
    media = storedTweet.media.map(entities => entities.map(toDeletedMediaEntity)),
    noteTweetId = decodedNoteTweet.map(noteTweet => noteTweet.id),
    isExpandable = decodedNoteTweet.flatMap(noteTweet => noteTweet.isExpandable)
  )
}
/** Copies the three status/user ids of a [[StoredShare]] into a [[DeletedTweetShare]]. */
def toDeletedShare(storedShare: StoredShare): DeletedTweetShare = {
  val sourceStatus = storedShare.sourceStatusId
  val sourceUser = storedShare.sourceUserId
  val parentStatus = storedShare.parentStatusId
  DeletedTweetShare(
    sourceStatusId = sourceStatus,
    sourceUserId = sourceUser,
    parentStatusId = parentStatus
  )
}
/**
 * Copies the id, media type and dimensions of a [[StoredMediaEntity]] into a
 * [[DeletedTweetMediaEntity]].
 */
def toDeletedMediaEntity(storedMediaEntity: StoredMediaEntity): DeletedTweetMediaEntity = {
  val entityId = storedMediaEntity.id
  val entityType = storedMediaEntity.mediaType
  val entityWidth = storedMediaEntity.width
  val entityHeight = storedMediaEntity.height
  DeletedTweetMediaEntity(
    id = entityId,
    mediaType = entityType,
    width = entityWidth,
    height = entityHeight
  )
}
}

View File

@ -1,92 +0,0 @@
package com.twitter.tweetypie.storage
import com.twitter.util.Return
import com.twitter.util.Throw
import com.twitter.util.Time
import com.twitter.util.Try
import java.util.Arrays
import scala.util.control.NoStackTrace
import scala.util.control.NonFatal
/**
 * Selects which timestamp to read from a state record. `keyName` is the key under which that
 * timestamp is looked up in the record's decoded JSON value.
 */
sealed abstract class TimestampType(val keyName: String)

object TimestampType {

  /** The timestamp stored under the "timestamp" key. */
  object Default extends TimestampType(keyName = "timestamp")

  /** The timestamp stored under the "softdelete_timestamp" key. */
  object SoftDelete extends TimestampType(keyName = "softdelete_timestamp")
}
/**
* TimestampDecoder gets the timestamps associated with state records. The Manhattan timestamp is
* used for legacy records (with value "1"), otherwise the timestamp is extracted from the
* JSON value.
*
* See "Metadata" in README.md for further information about state records.
*/
object TimestampDecoder {

  /** The record value could not be decoded as JSON; `t` is the underlying failure. */
  case class UnparsableJson(msg: String, t: Throwable) extends Exception(msg, t) with NoStackTrace

  /** The decoded JSON has no entry for the requested timestamp key. */
  case class MissingJsonTimestamp(msg: String) extends Exception(msg) with NoStackTrace

  /** The requested timestamp key is present but its value is neither a Long nor an Integer. */
  case class UnexpectedJsonValue(msg: String) extends Exception(msg) with NoStackTrace

  /** A legacy record carries no Manhattan timestamp on its value. */
  case class MissingManhattanTimestamp(msg: String) extends Exception(msg) with NoStackTrace

  // Legacy state records hold just the single character '1' rather than a JSON document.
  private[storage] val LegacyValue: Array[Byte] = Array('1')

  /**
   * Cut-off below which a Manhattan timestamp is treated as mis-scaled.
   *
   * The first backfill of tweet data to Manhattan wrote timestamps in milliseconds where
   * nanoseconds were expected, so for those records `time.inNanoseconds` is really a
   * millisecond count and the timestamp appears to lie shortly after the epoch.
   *
   * For example, the deletion record for tweet 22225781 has Manhattan timestamp
   * 1970-01-01 00:23:24 +0000, whereas the deletion record for tweet 435404491999813632 has
   * Manhattan timestamp 2014-11-09 14:24:04 +0000.
   *
   * The threshold is the last millisecond value that was interpreted as nanoseconds, e.g.
   * Time.fromNanoseconds(1438387200000L) == 1970-01-01 00:23:58 +0000
   */
  private[storage] val BadTimestampThreshold = Time.at("1970-01-01 00:23:58 +0000")

  /** Decodes the requested timestamp from the value of `record`. */
  def decode(record: TweetManhattanRecord, tsType: TimestampType): Try[Long] =
    decode(record.value, tsType)

  /**
   * Decodes the requested timestamp from a Manhattan value: legacy records use the value's
   * Manhattan timestamp, every other record is read as JSON.
   */
  def decode(mhValue: TweetManhattanValue, tsType: TimestampType): Try[Long] = {
    val bytes = ByteArrayCodec.fromByteBuffer(mhValue.contents)
    if (!isLegacyRecord(bytes)) jsonTimestamp(bytes, tsType)
    else nativeManhattanTimestamp(mhValue)
  }

  private def isLegacyRecord(value: Array[Byte]) = Arrays.equals(value, LegacyValue)

  // Legacy path: the (corrected) Manhattan timestamp, or a failure when none is attached.
  private def nativeManhattanTimestamp(mhValue: TweetManhattanValue): Try[Long] =
    mhValue.timestamp match {
      case None =>
        Throw(MissingManhattanTimestamp(s"Manhattan timestamp missing in value $mhValue"))
      case Some(ts) =>
        Return(correctedTimestamp(ts))
    }

  // JSON path: parse the value, then look up the key named by tsType.
  private def jsonTimestamp(value: Array[Byte], tsType: TimestampType): Try[Long] = {
    val parsed =
      Try(Json.decode(value)).rescue {
        case NonFatal(e) => Throw(UnparsableJson(e.getMessage, e))
      }

    parsed.flatMap { fields =>
      fields.get(tsType.keyName) match {
        case None =>
          Throw(MissingJsonTimestamp(s"Missing key ${tsType.keyName} in record data $fields"))
        case Some(l: Long) =>
          Return(l)
        case Some(i: Integer) =>
          Return(i.toLong)
        case Some(_) =>
          Throw(
            UnexpectedJsonValue(s"Unexpected value for ${tsType.keyName} in record data $fields")
          )
      }
    }
  }

  /** Returns `t` unchanged unless it is below the threshold, in which case its nanosecond count is re-read as milliseconds. */
  def correctedTime(t: Time): Time =
    if (t >= BadTimestampThreshold) t else Time.fromMilliseconds(t.inNanoseconds)

  /** Same correction, starting from a raw nanosecond count. */
  def correctedTime(t: Long): Time = correctedTime(Time.fromNanoseconds(t))

  /** Millisecond value of `t`; below the threshold the nanosecond count already is that value. */
  def correctedTimestamp(t: Time): Long =
    if (t >= BadTimestampThreshold) t.inMilliseconds else t.inNanoseconds
}

View File

@ -1,164 +0,0 @@
package com.twitter.tweetypie.storage
/**
* Responsible for encoding/decoding Tweet records to/from Manhattan keys
*
* K/V Scheme:
* -----------
* [TweetId]
* /metadata
* /delete_state (a.k.a. hard delete)
* /soft_delete_state
* /bounce_delete_state
* /undelete_state
* /force_added_state
* /scrubbed_fields/
* /[ScrubbedFieldId_1]
* ..
* /[ScrubbedFieldId_M]
* /fields
* /internal
* /1
* /9
* ..
* /99
* /external
* /100
* ..
*
* IMPORTANT NOTE:
* 1) Field Ids 2 to 8 in the Tweet thrift struct are considered "core fields" and are 'packed'
* together into a TFieldBlob and stored under field id 1
* (i.e. [DatasetName]/[TweetId]/fields/internal/1). This is why we do not see keys from
* [DatasetName]/[TweetId]/fields/internal/2 to [DatasetName]/[TweetId]/fields/internal/8.
*
* 2) Also, the tweet id (which is the field id 1 in Tweet thrift structure) is not explicitly stored
* in Manhattan. There is no need to explicitly store it since it is a part of the Pkey
*/
case class TweetKey(tweetId: TweetId, lKey: TweetKey.LKey) {
  // Rendered as "/<pkey>/<lkey>", each part produced by the matching ManhattanOperations injection.
  override def toString: String = {
    val renderedPkey = ManhattanOperations.PkeyInjection(tweetId)
    val renderedLkey = ManhattanOperations.LkeyInjection(lKey)
    s"/$renderedPkey/$renderedLkey"
  }
}
// Companion of TweetKey: padding helpers, one factory method per kind of key, and the LKey
// hierarchy together with its string parser.
object TweetKey {
// Manhattan uses lexicographical order for keys. To make sure lexicographical order matches the
// numerical order, we should pad both tweet id and field ids with leading zeros.
// Since tweet id is long and field id is a short, the max width of each can be obtained by doing
// Long.MaxValue.toString.length and Short.MaxValue.toString.length respectively
// (19 and 5 digits, i.e. the format strings are "%019d" and "%05d").
private val TweetIdFormatStr = s"%0${Long.MaxValue.toString.length}d"
private val FieldIdFormatStr = s"%0${Short.MaxValue.toString.length}d"
private[storage] def padTweetIdStr(tweetId: Long): String = TweetIdFormatStr.format(tweetId)
private[storage] def padFieldIdStr(fieldId: Short): String = FieldIdFormatStr.format(fieldId)
// Factory methods: each pairs the tweet id with the corresponding LKey.
def coreFieldsKey(tweetId: TweetId): TweetKey = TweetKey(tweetId, LKey.CoreFieldsKey)
def hardDeletionStateKey(tweetId: TweetId): TweetKey =
TweetKey(tweetId, LKey.HardDeletionStateKey)
def softDeletionStateKey(tweetId: TweetId): TweetKey =
TweetKey(tweetId, LKey.SoftDeletionStateKey)
def bounceDeletionStateKey(tweetId: TweetId): TweetKey =
TweetKey(tweetId, LKey.BounceDeletionStateKey)
def unDeletionStateKey(tweetId: TweetId): TweetKey = TweetKey(tweetId, LKey.UnDeletionStateKey)
def forceAddedStateKey(tweetId: TweetId): TweetKey = TweetKey(tweetId, LKey.ForceAddedStateKey)
def scrubbedGeoFieldKey(tweetId: TweetId): TweetKey = TweetKey(tweetId, LKey.ScrubbedGeoFieldKey)
// Chooses the internal or additional variant from the field id (see LKey.FieldKey.apply).
def fieldKey(tweetId: TweetId, fieldId: FieldId): TweetKey =
TweetKey(tweetId, LKey.FieldKey(fieldId))
// The next two construct a specific variant directly; the variant's assert rejects a field id
// on the wrong side of TweetFields.firstAdditionalFieldId.
def internalFieldsKey(tweetId: TweetId, fieldId: FieldId): TweetKey =
TweetKey(tweetId, LKey.InternalFieldsKey(fieldId))
def additionalFieldsKey(tweetId: TweetId, fieldId: FieldId): TweetKey =
TweetKey(tweetId, LKey.AdditionalFieldsKey(fieldId))
def scrubbedFieldKey(tweetId: TweetId, fieldId: FieldId): TweetKey =
TweetKey(tweetId, LKey.ScrubbedFieldKey(fieldId))
// AllFieldsKeyPrefix: fields
// CoreFieldsKey: fields/internal/1 (Stores subset of StoredTweet fields which are
// "packed" into a single CoreFields record)
// HardDeletionStateKey: metadata/delete_state
// SoftDeletionStateKey: metadata/soft_delete_state
// BounceDeletionStateKey: metadata/bounce_delete_state
// UnDeletionStateKey: metadata/undelete_state
// ForceAddedStateKey: metadata/force_added_state
// FieldKey: fields/<group_name>/<padded_field_id> (where <group_name>
// is 'internal' for field ids < 100 and 'external' for all other
// fields ids)
// InternalFieldsKeyPrefix: fields/internal
// PKey: <empty string>
// ScrubbedFieldKey: metadata/scrubbed_fields/<padded_field_id>
// ScrubbedFieldKeyPrefix: metadata/scrubbed_fields
sealed abstract class LKey(override val toString: String)
object LKey {
// String building blocks. They are declared ahead of the vals further down
// (ScrubbedGeoFieldKey, CoreFieldsKey) whose construction reads them.
private val HardDeletionRecordLiteral = "delete_state"
private val SoftDeletionRecordLiteral = "soft_delete_state"
private val BounceDeletionRecordLiteral = "bounce_delete_state"
private val UnDeletionRecordLiteral = "undelete_state"
private val ForceAddRecordLiteral = "force_added_state"
private val ScrubbedFieldsGroup = "scrubbed_fields"
private val InternalFieldsGroup = "internal"
private val ExternalFieldsGroup = "external"
private val MetadataCategory = "metadata"
private val FieldsCategory = "fields"
private val InternalFieldsKeyPrefix = s"$FieldsCategory/$InternalFieldsGroup/"
private val ExternalFieldsKeyPrefix = s"$FieldsCategory/$ExternalFieldsGroup/"
private val ScrubbedFieldsKeyPrefix = s"$MetadataCategory/$ScrubbedFieldsGroup/"
// Keys under "metadata/": the five state keys and the per-field scrubbed markers.
sealed abstract class MetadataKey(metadataType: String)
extends LKey(s"$MetadataCategory/$metadataType")
sealed abstract class StateKey(stateType: String) extends MetadataKey(stateType)
case object HardDeletionStateKey extends StateKey(s"$HardDeletionRecordLiteral")
case object SoftDeletionStateKey extends StateKey(s"$SoftDeletionRecordLiteral")
case object BounceDeletionStateKey extends StateKey(s"$BounceDeletionRecordLiteral")
case object UnDeletionStateKey extends StateKey(s"$UnDeletionRecordLiteral")
case object ForceAddedStateKey extends StateKey(s"$ForceAddRecordLiteral")
case class ScrubbedFieldKey(fieldId: FieldId)
extends MetadataKey(s"$ScrubbedFieldsGroup/${padFieldIdStr(fieldId)}")
val ScrubbedGeoFieldKey: LKey.ScrubbedFieldKey = ScrubbedFieldKey(TweetFields.geoFieldId)
/**
* LKey that has one of many possible field ids. This generalizes over
* internal and additional fields keys.
*/
// NOTE(review): the `toString` passed to the LKey constructor below cannot be this instance's
// own (the instance does not exist yet), so it presumably resolves to an enclosing object's
// toString; either way the overriding val below supplies the real value. Confirm before
// touching. The override also reads the subclass's `fieldId` while FieldKey is being initialised.
sealed abstract class FieldKey(prefix: String) extends LKey(toString) {
def fieldId: FieldId
override val toString: String = prefix + padFieldIdStr(fieldId)
}
object FieldKey {
// Ids below TweetFields.firstAdditionalFieldId become internal keys, all others additional keys.
def apply(fieldId: FieldId): FieldKey =
fieldId match {
case id if id < TweetFields.firstAdditionalFieldId => InternalFieldsKey(fieldId)
case _ => AdditionalFieldsKey(fieldId)
}
}
case class InternalFieldsKey(fieldId: FieldId) extends FieldKey(InternalFieldsKeyPrefix) {
assert(fieldId < TweetFields.firstAdditionalFieldId)
}
case class AdditionalFieldsKey(fieldId: FieldId) extends FieldKey(ExternalFieldsKeyPrefix) {
assert(fieldId >= TweetFields.firstAdditionalFieldId)
}
val CoreFieldsKey: LKey.InternalFieldsKey = InternalFieldsKey(TweetFields.rootCoreFieldId)
// Fallback produced by fromString for strings that match none of the known key shapes.
case class Unknown private (str: String) extends LKey(str)
// Parses an lkey string. Exact matches are tried first, then the three prefixes; anything else
// becomes Unknown. A string that has a known prefix but a suffix that is not a valid Short makes
// `toShort` throw NumberFormatException, and a field id on the wrong side of
// TweetFields.firstAdditionalFieldId trips the assert in InternalFieldsKey/AdditionalFieldsKey.
def fromString(str: String): LKey = {
// The field id is whatever follows the prefix.
def extractFieldId(prefix: String): FieldId =
str.slice(prefix.length, str.length).toShort
str match {
case CoreFieldsKey.toString => CoreFieldsKey
case HardDeletionStateKey.toString => HardDeletionStateKey
case SoftDeletionStateKey.toString => SoftDeletionStateKey
case BounceDeletionStateKey.toString => BounceDeletionStateKey
case UnDeletionStateKey.toString => UnDeletionStateKey
case ForceAddedStateKey.toString => ForceAddedStateKey
case ScrubbedGeoFieldKey.toString => ScrubbedGeoFieldKey
case _ if str.startsWith(InternalFieldsKeyPrefix) =>
InternalFieldsKey(extractFieldId(InternalFieldsKeyPrefix))
case _ if str.startsWith(ExternalFieldsKeyPrefix) =>
AdditionalFieldsKey(extractFieldId(ExternalFieldsKeyPrefix))
case _ if str.startsWith(ScrubbedFieldsKeyPrefix) =>
ScrubbedFieldKey(extractFieldId(ScrubbedFieldsKeyPrefix))
case _ => Unknown(str)
}
}
}
}

View File

@ -1,90 +0,0 @@
package com.twitter.tweetypie.storage
import com.twitter.storage.client.manhattan.kv.ManhattanValue
import com.twitter.util.Time
/**
* A [[TweetStateRecord]] represents an action taken on a tweet and can be used to determine a tweet's state.
*
* The state is determined by the record with the most recent timestamp. In the absence of any
* record a tweet is considered found, which is to say the tweet has not been through the
* deletion process.
*
* The [[TweetStateRecord]] type is determined by the lkey of a tweet manhattan record:
* metadata/delete_state -> HardDeleted
* metadata/soft_delete_state -> SoftDeleted
* metadata/bounce_delete_state -> BounceDeleted
* metadata/undelete_state -> Undeleted
* metadata/force_added_state -> ForceAdded
*
* See the README in this directory for more details about the state of a tweet.
*/
sealed trait TweetStateRecord {
  // Supplied by each concrete record type.
  def tweetId: TweetId
  def createdAt: Long
  def name: String
  def stateKey: TweetKey.LKey.StateKey

  // What gets JSON-encoded into the Manhattan value; just the creation timestamp by default.
  def values: Map[String, Long] = Map("timestamp" -> createdAt)

  // Builds the Manhattan record for this state: keyed by (tweetId, stateKey), with the
  // JSON-encoded `values` as contents and `createdAt` (milliseconds) as the value timestamp.
  def toTweetMhRecord: TweetManhattanRecord = {
    val encodedValues = Json.encode(values)
    val mhTimestamp = Some(Time.fromMilliseconds(createdAt))
    val mhValue = ManhattanValue(ByteArrayCodec.toByteBuffer(encodedValues), mhTimestamp)
    TweetManhattanRecord(TweetKey(tweetId, stateKey), mhValue)
  }
}
object TweetStateRecord {

  /** Written when an offline job ultimately hard-deletes a soft-deleted or bounce-deleted tweet. */
  case class HardDeleted(tweetId: TweetId, createdAt: Long, deletedAt: Long)
      extends TweetStateRecord {
    def name = "hard_deleted"
    def stateKey = TweetKey.LKey.HardDeletionStateKey
    // The timestamp in the mh backend is the hard deletion timestamp; `deletedAt` is stored
    // next to it under "softdelete_timestamp".
    override def values = Map("timestamp" -> createdAt, "softdelete_timestamp" -> deletedAt)
  }

  /** Written when the user deletes a tweet; the tweet can still be undeleted in this state. */
  case class SoftDeleted(tweetId: TweetId, createdAt: Long) extends TweetStateRecord {
    def name = "soft_deleted"
    def stateKey = TweetKey.LKey.SoftDeletionStateKey
  }

  /** Written when go/bouncer deletes a tweet for violating Twitter Rules. It MAY NOT be undeleted. */
  case class BounceDeleted(tweetId: TweetId, createdAt: Long) extends TweetStateRecord {
    def name = "bounce_deleted"
    def stateKey = TweetKey.LKey.BounceDeletionStateKey
  }

  /** Written when an internal system undeletes a tweet. */
  case class Undeleted(tweetId: TweetId, createdAt: Long) extends TweetStateRecord {
    def name = "undeleted"
    def stateKey = TweetKey.LKey.UnDeletionStateKey
  }

  /** Written when a tweet is created through the forceAdd endpoint. */
  case class ForceAdded(tweetId: TweetId, createdAt: Long) extends TweetStateRecord {
    def name = "force_added"
    def stateKey = TweetKey.LKey.ForceAddedStateKey
  }

  /**
   * Interprets a Manhattan record as a state record, based on its lkey. Records whose lkey is
   * not one of the five state keys yield None. A timestamp that fails to decode becomes 0.
   */
  def fromTweetMhRecord(record: TweetManhattanRecord): Option[TweetStateRecord] = {
    // Decoding happens only inside a matching branch, so non-state records never pay for it.
    def timestampOf(tsType: TimestampType): Long =
      TimestampDecoder.decode(record, tsType).getOrElse(0L)

    record.lkey match {
      case TweetKey.LKey.HardDeletionStateKey =>
        Some(
          HardDeleted(
            record.pkey,
            timestampOf(TimestampType.Default),
            timestampOf(TimestampType.SoftDelete)
          )
        )
      case TweetKey.LKey.SoftDeletionStateKey =>
        Some(SoftDeleted(record.pkey, timestampOf(TimestampType.Default)))
      case TweetKey.LKey.BounceDeletionStateKey =>
        Some(BounceDeleted(record.pkey, timestampOf(TimestampType.Default)))
      case TweetKey.LKey.UnDeletionStateKey =>
        Some(Undeleted(record.pkey, timestampOf(TimestampType.Default)))
      case TweetKey.LKey.ForceAddedStateKey =>
        Some(ForceAdded(record.pkey, timestampOf(TimestampType.Default)))
      case _ =>
        None
    }
  }

  /** Keeps only the records that are state records, converted, in their original order. */
  def fromTweetMhRecords(records: Seq[TweetManhattanRecord]): Seq[TweetStateRecord] =
    records.flatMap(record => fromTweetMhRecord(record))

  /** The state record with the greatest `createdAt`, if there is any state record at all. */
  def mostRecent(records: Seq[TweetManhattanRecord]): Option[TweetStateRecord] = {
    val oldestFirst = fromTweetMhRecords(records).sortBy(state => state.createdAt)
    oldestFirst.lastOption
  }
}

View File

@ -1,201 +0,0 @@
package com.twitter.tweetypie.storage
import com.twitter.stitch.Stitch
import com.twitter.tweetypie.storage.Response.TweetResponse
import com.twitter.tweetypie.thriftscala.Tweet
import com.twitter.util.Future
/**
* Interface for reading and writing tweet data in Manhattan
*/
trait TweetStorageClient {
  import TweetStorageClient._

  // Every member is a function value; its type is an alias declared in the companion object.

  // Take a Tweet as input.
  def addTweet: AddTweet
  def updateTweet: UpdateTweet

  // Look up by tweet id.
  def getTweet: GetTweet
  def getStoredTweet: GetStoredTweet
  def getDeletedTweets: GetDeletedTweets

  // Take tweet ids together with a list of fields.
  def deleteAdditionalFields: DeleteAdditionalFields
  def scrub: Scrub

  // Deletion and undeletion, by tweet id.
  def softDelete: SoftDelete
  def bounceDelete: BounceDelete
  def hardDeleteTweet: HardDeleteTweet
  def undelete: Undelete

  // No arguments; completes with Unit.
  def ping: Ping
}
// Function-type aliases for every TweetStorageClient operation, together with the response
// types those operations return.
object TweetStorageClient {
type GetTweet = TweetId => Stitch[GetTweet.Response]
object GetTweet {
sealed trait Response
object Response {
case class Found(tweet: Tweet) extends Response
object NotFound extends Response
object Deleted extends Response
// On BounceDeleted, provide the full Tweet so that implementations
// (i.e. ManhattanTweetStorageClient) do not need to be aware of the specific tweet
// fields required by callers for proper processing of bounced deleted tweets.
case class BounceDeleted(tweet: Tweet) extends Response
}
}
type GetStoredTweet = TweetId => Stitch[GetStoredTweet.Response]
object GetStoredTweet {
// Problems that can be reported alongside a stored tweet; `message` doubles as toString.
sealed abstract class Error(val message: String) {
override def toString: String = message
}
object Error {
case object TweetIsCorrupt extends Error("stored tweet data is corrupt and cannot be decoded")
case object ScrubbedFieldsPresent
extends Error("stored tweet fields that should be scrubbed are still present")
case object TweetFieldsMissingOrInvalid
extends Error("expected tweet fields are missing or contain invalid values")
case object TweetShouldBeHardDeleted
extends Error("stored tweet that should be hard deleted is still present")
}
sealed trait Response
object Response {
// State and scrubbed-field information shared by every response except NotFound.
sealed trait StoredTweetMetadata {
def state: Option[TweetStateRecord]
def allStates: Seq[TweetStateRecord]
def scrubbedFields: Set[FieldId]
}
// Mixed into the responses that carry a list of detected problems.
sealed trait StoredTweetErrors {
def errs: Seq[Error]
}
/**
* Tweet data was found, possibly state records and/or scrubbed field records.
*/
sealed trait FoundAny extends Response with StoredTweetMetadata {
def tweet: Tweet
}
object FoundAny {
// Extractor over all FoundAny responses. FoundWithErrors must be matched first so that its
// errs are kept; every other FoundAny is reported with an empty error list.
def unapply(
response: Response
): Option[
(Tweet, Option[TweetStateRecord], Seq[TweetStateRecord], Set[FieldId], Seq[Error])
] =
response match {
case f: FoundWithErrors =>
Some((f.tweet, f.state, f.allStates, f.scrubbedFields, f.errs))
case f: FoundAny => Some((f.tweet, f.state, f.allStates, f.scrubbedFields, Seq.empty))
case _ => None
}
}
/**
* No records for this tweet id were found in storage
*/
case class NotFound(id: TweetId) extends Response
/**
* Data related to the Tweet id was found but could not be loaded successfully. The
* errs array contains details of the problems.
*/
case class Failed(
id: TweetId,
state: Option[TweetStateRecord],
allStates: Seq[TweetStateRecord],
scrubbedFields: Set[FieldId],
errs: Seq[Error],
) extends Response
with StoredTweetMetadata
with StoredTweetErrors
/**
* No Tweet data was found, and the most recent state record found is HardDeleted
*/
case class HardDeleted(
id: TweetId,
state: Option[TweetStateRecord.HardDeleted],
allStates: Seq[TweetStateRecord],
scrubbedFields: Set[FieldId],
) extends Response
with StoredTweetMetadata
/**
* Tweet data was found, and the most recent state record found, if any, is not
* any form of deletion record.
*/
case class Found(
tweet: Tweet,
state: Option[TweetStateRecord],
allStates: Seq[TweetStateRecord],
scrubbedFields: Set[FieldId],
) extends FoundAny
/**
* Tweet data was found, and the most recent state record found indicates deletion.
*/
case class FoundDeleted(
tweet: Tweet,
state: Option[TweetStateRecord],
allStates: Seq[TweetStateRecord],
scrubbedFields: Set[FieldId],
) extends FoundAny
/**
* Tweet data was found, however errors were detected in the stored data. Required
* fields may be missing from the Tweet struct (e.g. CoreData), stored fields that
* should be scrubbed remain present, or Tweets that should be hard-deleted remain
* in storage. The errs array contains details of the problems.
*/
case class FoundWithErrors(
tweet: Tweet,
state: Option[TweetStateRecord],
allStates: Seq[TweetStateRecord],
scrubbedFields: Set[FieldId],
errs: Seq[Error],
) extends FoundAny
with StoredTweetErrors
}
}
type HardDeleteTweet = TweetId => Stitch[HardDeleteTweet.Response]
type SoftDelete = TweetId => Stitch[Unit]
type BounceDelete = TweetId => Stitch[Unit]
object HardDeleteTweet {
sealed trait Response
object Response {
case class Deleted(deletedAtMillis: Option[Long], createdAtMillis: Option[Long])
extends Response
// NotDeleted is both a Response and a Throwable, so it can be returned or thrown.
case class NotDeleted(id: TweetId, ineligibleLKey: Option[TweetKey.LKey])
extends Throwable
with Response
}
}
type Undelete = TweetId => Stitch[Undelete.Response]
object Undelete {
// Only `code` is required; the tweet and both timestamps default to None.
case class Response(
code: UndeleteResponseCode,
tweet: Option[Tweet] = None,
createdAtMillis: Option[Long] = None,
archivedAtMillis: Option[Long] = None)
sealed trait UndeleteResponseCode
object UndeleteResponseCode {
object Success extends UndeleteResponseCode
object BackupNotFound extends UndeleteResponseCode
object NotCreated extends UndeleteResponseCode
}
}
type AddTweet = Tweet => Stitch[Unit]
type UpdateTweet = (Tweet, Seq[Field]) => Stitch[TweetResponse]
type GetDeletedTweets = Seq[TweetId] => Stitch[Seq[DeletedTweetResponse]]
type DeleteAdditionalFields = (Seq[TweetId], Seq[Field]) => Stitch[Seq[TweetResponse]]
type Scrub = (Seq[TweetId], Seq[Field]) => Stitch[Unit]
// Unlike every other operation, Ping returns a Future rather than a Stitch.
type Ping = () => Future[Unit]
}

View File

@ -1,34 +0,0 @@
package com.twitter.tweetypie.storage
import scala.util.control.NoStackTrace
/**
 * Common parent of the exceptions declared in this file. It is sealed, so the four case
 * classes below are its only subclasses.
 */
sealed abstract class TweetStorageException(message: String, cause: Throwable)
    extends Exception(message, cause)

/**
 * The request was malformed and failed an assertion in the code. Retrying it unchanged will
 * not help.
 */
case class ClientError(message: String, cause: Throwable)
    extends TweetStorageException(message, cause)
    with NoStackTrace

/**
 * The request was rejected, either by Manhattan or by the in-process rate limiter. Should not
 * be retried.
 */
case class RateLimited(message: String, cause: Throwable)
    extends TweetStorageException(message, cause)
    with NoStackTrace

/**
 * Corrupt tweets were requested from Manhattan. The cause is optional and defaults to null.
 */
case class VersionMismatchError(message: String, cause: Throwable = null)
    extends TweetStorageException(message, cause)
    with NoStackTrace

/**
 * Catch-all for every other unhandled exception. Unlike the classes above it does not mix in
 * NoStackTrace. The cause is optional and defaults to null.
 */
case class InternalError(message: String, cause: Throwable = null)
    extends TweetStorageException(message, cause)

View File

@ -1,265 +0,0 @@
package com.twitter.tweetypie.storage
import com.twitter.logging.Logger
import com.twitter.scrooge.TFieldBlob
import com.twitter.snowflake.id.SnowflakeId
import com.twitter.storage.client.manhattan.kv.DeniedManhattanException
import com.twitter.storage.client.manhattan.kv.ManhattanException
import com.twitter.tweetypie.storage.Response._
import com.twitter.tweetypie.storage_internal.thriftscala.StoredTweet
import com.twitter.util.Return
import com.twitter.util.Throw
import com.twitter.util.Try
object TweetUtils {
val log: Logger = Logger("com.twitter.tweetypie.storage.TweetStorageLibrary")
import FieldResponseCodec.ValueNotFoundException
/**
* It's rare, but we have seen tweets with userId=0, which is likely the result of a
* failed/partial delete. Treat these as invalid tweets, which are returned to callers
* as not found.
*/
def isValid(tweet: StoredTweet): Boolean = {
  // A user id must be present and non-zero; the other three fields only need to be present
  // (their contents are not inspected).
  val hasNonZeroUser = tweet.userId.exists(id => id != 0)
  hasNonZeroUser &&
  tweet.text.isDefined &&
  tweet.createdVia.isDefined &&
  tweet.createdAtSec.isDefined
}
/**
* Helper function to extract Scrubbed field Ids from the result returned by reading entire tweet prefix
* function.
*
* @param records The sequence of MH records for the given tweetId
*
* @return The set of scrubbed field ids
*/
private[tweetypie] def extractScrubbedFields(records: Seq[TweetManhattanRecord]): Set[Short] = {
  // Only records whose lkey is a ScrubbedFieldKey contribute; all others are ignored.
  val scrubbedIds = records.flatMap { record =>
    record.lkey match {
      case TweetKey.LKey.ScrubbedFieldKey(scrubbedId) => Some(scrubbedId)
      case _ => None
    }
  }
  scrubbedIds.toSet
}
// The required field ids with the tweet id field removed. buildValidStoredTweet asks a stored
// tweet for the blobs of these fields and rejects the tweet when none are present.
private[tweetypie] val expectedFields =
TweetFields.requiredFieldIds.toSet - TweetFields.tweetIdField
/**
* Find the timestamp from a tweetId and a list of MH records. This is used when
* you need a timestamp and you aren't sure that tweetId is a snowflake id.
*
* @param tweetId A tweetId you want the timestamp for.
* @param records Tbird_mh records keyed on tweetId, one of which should be the
* core fields record.
* @return A milliseconds timestamp if one could be found.
*/
private[tweetypie] def creationTimeFromTweetIdOrMHRecords(
  tweetId: Long,
  records: Seq[TweetManhattanRecord]
): Option[Long] = {
  // Fallback: decode the core fields record, if there is one, and convert its createdAtSec
  // from seconds to milliseconds.
  def coreFieldsCreationMillis: Option[Long] =
    records
      .find(record => record.lkey == TweetKey.LKey.CoreFieldsKey)
      .flatMap { coreRecord =>
        val coreBlob = TFieldBlobCodec.fromByteBuffer(coreRecord.value.contents)
        CoreFieldsCodec.fromTFieldBlob(coreBlob).createdAtSec.map(_ * 1000)
      }

  // orElse takes its argument by name, so the records are only inspected when the id itself
  // yields no timestamp.
  SnowflakeId.unixTimeMillisOptFromId(tweetId).orElse(coreFieldsCreationMillis)
}
/**
* Helper function used to parse manhattan results for fields in a tweet (given in the form of
* a map from field id to Try[Unit]) and build a TweetResponse object.
*
* @param callerName The name of the caller function. Used for error messages
* @param tweetId Id of the Tweet for which TweetResponse is being built
* @param fieldResults Map from field id to the Try[Unit] result for that field.
*
* @return TweetResponse object
*/
private[tweetypie] def buildTweetResponse(
  callerName: String,
  tweetId: Long,
  fieldResults: Map[FieldId, Try[Unit]]
): TweetResponse = {
  // A field counts as successful when its Try returned, or when it failed only because the
  // value was not found.
  val successCount = fieldResults.count {
    case (_, Return(_)) => true
    case (_, Throw(_: ValueNotFoundException)) => true
    case _ => false
  }

  val fieldResponsesMap = getFieldResponses(callerName, tweetId, fieldResults)

  // Success requires at least one field and no failures; an empty map is therefore not a success.
  val everyFieldSucceeded = successCount > 0 && successCount == fieldResults.size

  // Only consulted when not every field succeeded. One rate-limited field makes the whole
  // tweet count as rate limited.
  def anyFieldRateLimited: Boolean =
    fieldResults.values.exists {
      case Throw(_: DeniedManhattanException) => true
      case _ => false
    }

  val overallCode =
    if (everyFieldSucceeded) TweetResponseCode.Success
    else if (anyFieldRateLimited) TweetResponseCode.OverCapacity
    else if (successCount == 0) TweetResponseCode.Failure // nothing succeeded at all
    else TweetResponseCode.Partial // some fields succeeded, some did not

  TweetResponse(tweetId, overallCode, Some(fieldResponsesMap))
}
/**
* Helper function to convert manhattan results into a Map[FieldId, FieldResponse]
*
* @param fieldResults Map from field id to the Try result for that field.
*/
private[tweetypie] def getFieldResponses(
  callerName: String,
  tweetId: TweetId,
  fieldResults: Map[FieldId, Try[_]]
): Map[FieldId, FieldResponse] =
  fieldResults.map {
    case (fieldId, result) =>
      // Builds the error text on demand; the key string is never needed on the success path.
      def describeFailure(error: Throwable): String = {
        val keyStr = TweetKey.fieldKey(tweetId, fieldId).toString
        s"Exception in $callerName. Key: $keyStr. Error: $error"
      }

      val response = result match {
        case Return(_) =>
          FieldResponse(FieldResponseCode.Success, None)

        case Throw(mhException: ManhattanException) =>
          val errMsg = describeFailure(mhException)
          mhException match {
            case _: ValueNotFoundException => // a missing value is not an error, so no log
            case _ => log.error(errMsg)
          }
          FieldResponseCodec.fromThrowable(mhException, Some(errMsg))

        case Throw(other) =>
          val errMsg = describeFailure(other)
          log.error(errMsg)
          FieldResponse(FieldResponseCode.Error, Some(errMsg))
      }

      fieldId -> response
  }
/**
* Helper function to build a TweetResponse object when being rate limited. It's possible that only some of the fields
* got rate limited, so we indicate which fields got processed successfully, and which encountered some sort of error.
*
* @param tweetId Tweet id
* @param callerName name of API calling this function
* @param fieldResponses per-field results; each one is converted into a field response in the returned object
*
* @return The TweetResponse object
*/
private[tweetypie] def buildTweetOverCapacityResponse(
  callerName: String,
  tweetId: Long,
  fieldResponses: Map[FieldId, Try[Unit]]
) =
  // The overall code is always OverCapacity; the per-field breakdown is still included.
  TweetResponse(
    tweetId,
    TweetResponseCode.OverCapacity,
    Some(getFieldResponses(callerName, tweetId, fieldResponses))
  )
/**
* Build a StoredTweet from a Seq of records. Core fields are handled specially.
*/
private[tweetypie] def buildStoredTweet(
  tweetId: TweetId,
  records: Seq[TweetManhattanRecord],
  includeScrubbed: Boolean = false,
): StoredTweet = {
  val fieldBlobs = getStoredTweetBlobs(records, includeScrubbed).flatMap {
    // The blob stored under the root core field id is really a packed bundle of all core
    // fields, so it is expanded into one blob per core field. Nothing is thrown here when core
    // values turn out to be absent; the caller is expected to treat that as tweet not found.
    case packedCore if packedCore.id == TweetFields.rootCoreFieldId =>
      CoreFieldsCodec.unpackFields(packedCore).values.toSeq
    case plainBlob =>
      Seq(plainBlob)
  }

  // Start from a StoredTweet holding only the id and set every blob on it in turn.
  fieldBlobs.foldLeft(StoredTweet(tweetId))((tweet, blob) => tweet.setField(blob))
}
/**
 * Builds a StoredTweet from the records (scrubbed fields excluded) and returns it only if it
 * has at least one of the expected field blobs and passes [[isValid]]; otherwise None.
 */
private[tweetypie] def buildValidStoredTweet(
  tweetId: TweetId,
  records: Seq[TweetManhattanRecord]
): Option[StoredTweet] =
  Some(buildStoredTweet(tweetId, records)).filter { candidate =>
    candidate.getFieldBlobs(expectedFields).nonEmpty && isValid(candidate)
  }
/**
* Return a TFieldBlob for each StoredTweet field defined in this set of records.
* @param includeScrubbed when false, result will not include scrubbed fields even
* if the data is present in the set of records.
*/
private[tweetypie] def getStoredTweetBlobs(
  records: Seq[TweetManhattanRecord],
  includeScrubbed: Boolean = false,
): Seq[TFieldBlob] = {
  val scrubbedIds = extractScrubbedFields(records)

  // Decodes a single field record and checks that the id inside the blob agrees with the key
  // it was stored under. A VersionMismatchError is logged and then rethrown unchanged.
  def decodeBlob(
    fullKey: TweetKey,
    fieldKey: TweetKey.LKey.FieldKey,
    record: TweetManhattanRecord
  ): TFieldBlob =
    try {
      val decoded = TFieldBlobCodec.fromByteBuffer(record.value.contents)
      if (decoded.field.id != fieldKey.fieldId) {
        throw new AssertionError(
          s"Blob stored for $fullKey has unexpected id ${decoded.field.id}"
        )
      }
      decoded
    } catch {
      case e: VersionMismatchError =>
        log.error(
          s"Failed to decode bytebuffer for $fullKey: ${e.getMessage}"
        )
        throw e
    }

  records.flatMap { record =>
    record.key match {
      // Only field records are decoded, and scrubbed ones only when the caller asked for them.
      case fullKey @ TweetKey(_, fieldKey: TweetKey.LKey.FieldKey)
          if includeScrubbed || !scrubbedIds.contains(fieldKey.fieldId) =>
        Some(decodeBlob(fullKey, fieldKey, record))
      case _ =>
        None
    }
  }
}
/**
* It's important to bubble up rate limiting exceptions as they would likely be the root cause for other issues
* (timeouts etc.), so we scan for this particular exception, and if found, we bubble that up specifically
*
* @param seqOfTries The sequence of tries which may contain within it a rate limit exception
*
* @return if a rate limiting exn was detected, this will be a Throw(e: DeniedManhattanException)
* otherwise it will be a Return(_) only if all individual tries succeeded
*/
private[tweetypie] def collectWithRateLimitCheck(seqOfTries: Seq[Try[Unit]]): Try[Unit] = {
  // The first rate-limit failure, if any, takes precedence over every other outcome.
  val firstDenied = seqOfTries.collectFirst {
    case denied @ Throw(_: DeniedManhattanException) => denied
  }

  firstDenied match {
    case Some(denied) => denied
    // Otherwise the operation succeeds only if every individual try succeeded.
    case None => Try.collect(seqOfTries).map(_ => ())
  }
}
}

View File

@ -1,106 +0,0 @@
package com.twitter.tweetypie.storage
import com.twitter.finagle.stats.StatsReceiver
import com.twitter.stitch.Stitch
import com.twitter.tweetypie.storage.TweetStorageClient.Undelete
import com.twitter.tweetypie.storage.TweetUtils._
import com.twitter.util.Time
/**
 * Builds the storage-level `Undelete` operation.
 *
 * The operation reads the tweet's records, decides from the most recent state record whether the
 * tweet may be restored, optionally writes an "undeleted" state record to both data centers, and
 * finally deletes the soft-deletion state record.
 */
object UndeleteHandler {

  def apply(
    read: ManhattanOperations.Read,
    localInsert: ManhattanOperations.Insert,
    remoteInsert: ManhattanOperations.Insert,
    delete: ManhattanOperations.Delete,
    undeleteWindowHours: Int,
    stats: StatsReceiver
  ): Undelete = {

    // The response to hand back, paired with the state record to write (if any).
    type Plan = (Undelete.Response, Option[TweetManhattanRecord])

    // True while fewer than `undeleteWindowHours` whole hours have passed since `timestampMs`.
    def withinUndeleteWindow(timestampMs: Long) = {
      val elapsed = Time.now - Time.fromMilliseconds(timestampMs)
      elapsed.inHours < undeleteWindowHours
    }

    def prepareUndelete(
      tweetId: TweetId,
      records: Seq[TweetManhattanRecord]
    ): Plan = {
      val undeleteRecord =
        Some(TweetStateRecord.Undeleted(tweetId, Time.now.inMillis).toTweetMhRecord)

      // The tweet cannot be restored; nothing is written.
      def refused: Plan =
        (Undelete.Response(Undelete.UndeleteResponseCode.BackupNotFound), None)

      // The tweet is rebuilt from `records` and returned; `marker` is the record to write.
      def restored(archivedAt: Option[Long], marker: Option[TweetManhattanRecord]): Plan =
        (mkSuccessfulUndeleteResponse(tweetId, records, archivedAt), marker)

      TweetStateRecord.mostRecent(records) match {
        // A soft deletion without a usable timestamp cannot be checked against the window.
        case Some(TweetStateRecord.SoftDeleted(_, deletedAt)) if deletedAt <= 0 =>
          throw InternalError(s"Timestamp unavailable for $tweetId")

        // A soft deletion is undone only while it is still inside the undelete window.
        case Some(TweetStateRecord.SoftDeleted(_, deletedAt)) if withinUndeleteWindow(deletedAt) =>
          restored(Some(deletedAt), undeleteRecord)

        case Some(_: TweetStateRecord.SoftDeleted) =>
          refused

        // BounceDeleted tweets may not be undeleted. see go/bouncedtweet
        case Some(_: TweetStateRecord.HardDeleted | _: TweetStateRecord.BounceDeleted) =>
          refused

        // The original comment argues that a force-add should already have written its record
        // to both DCs, so no undelete record is written in that case.
        case Some(_: TweetStateRecord.ForceAdded) =>
          restored(None, None)

        // Already undeleted locally, or no state record at all: still write the undelete record.
        // We only know the local DC's winning record here; the remote DC's winning record may
        // still be a soft deletion, or a soft deletion record may be in flight.
        case Some(_: TweetStateRecord.Undeleted) | None =>
          restored(None, undeleteRecord)
      }
    }

    // Write the undelete record both locally and remotely to protect
    // against races with hard delete replication. We only need this
    // protection for the insertion of the undelete record.
    def multiInsert(record: TweetManhattanRecord): Stitch[Unit] = {
      val bothWrites = Seq(localInsert(record).liftToTry, remoteInsert(record).liftToTry)
      Stitch.collect(bothWrites).map(collectWithRateLimitCheck).lowerFromTry
    }

    def deleteSoftDeleteRecord(tweetId: TweetId): Stitch[Unit] =
      delete(TweetKey.softDeletionStateKey(tweetId), None)

    // NOTE(review): the soft-deletion record is deleted on every path, including the ones that
    // answer BackupNotFound — confirm that this is intended.
    tweetId =>
      for {
        storedRecords <- read(tweetId)
        (response, recordToWrite) = prepareUndelete(tweetId, storedRecords)
        _ <- Stitch.collect(recordToWrite.map(multiInsert)).unit
        _ <- deleteSoftDeleteRecord(tweetId)
      } yield response
  }

  /**
   * Builds a Success response carrying the tweet reconstructed from `records`.
   *
   * @param timestampOpt reported as `archivedAtMillis`; the caller passes the soft deletion's
   *                     timestamp when a soft deletion is being undone
   */
  private[storage] def mkSuccessfulUndeleteResponse(
    tweetId: TweetId,
    records: Seq[TweetManhattanRecord],
    timestampOpt: Option[Long] = None
  ) = {
    val restoredTweet = StorageConversions.fromStoredTweet(buildStoredTweet(tweetId, records))
    Undelete.Response(
      Undelete.UndeleteResponseCode.Success,
      Some(restoredTweet),
      archivedAtMillis = timestampOpt
    )
  }
}

View File

@ -1,64 +0,0 @@
package com.twitter.tweetypie.storage
import com.twitter.finagle.stats.StatsReceiver
import com.twitter.stitch.Stitch
import com.twitter.storage.client.manhattan.kv.DeniedManhattanException
import com.twitter.storage.client.manhattan.kv.ManhattanValue
import com.twitter.tweetypie.storage.TweetUtils._
import com.twitter.tweetypie.thriftscala.Tweet
import com.twitter.util.Throw
import com.twitter.util.Time
/**
 * Builds the storage-level `UpdateTweet` operation, which writes the requested fields of a tweet
 * as individual field records and reports a per-field result.
 */
object UpdateTweetHandler {

  def apply(
    insert: ManhattanOperations.Insert,
    stats: StatsReceiver
  ): TweetStorageClient.UpdateTweet = { (tpTweet: Tweet, fields: Seq[Field]) =>
    // NOTE(review): this hands a `Field` to `coreFieldIds.contains`; if `coreFieldIds` holds
    // field ids rather than `Field`s the check can never fail — confirm the element type.
    require(
      !fields.exists(TweetFields.coreFieldIds.contains(_)),
      "Core fields cannot be modified by calling updateTweet; use addTweet instead."
    )
    require(
      areAllFieldsDefined(tpTweet, fields),
      s"Input tweet $tpTweet does not have specified fields $fields set"
    )

    // One timestamp is shared by every field record written for this call.
    val now = Time.now
    val storedTweet = StorageConversions.toStoredTweetForFields(tpTweet, fields.toSet)
    val tweetId = storedTweet.id

    Stats.updatePerFieldQpsCounters("updateTweet", fields.map(_.id), 1, stats)

    // One insert per field, each lifted to a Try so a single failure does not hide the others.
    val pendingWrites = fields.map { field =>
      val fieldId = field.id
      // `.get` is backed by the `areAllFieldsDefined` precondition checked above.
      val blob = storedTweet.getFieldBlob(fieldId).get
      val record = TweetManhattanRecord(
        TweetKey.fieldKey(tweetId, fieldId),
        ManhattanValue(TFieldBlobCodec.toByteBuffer(blob), Some(now))
      )
      (fieldId, insert(record).liftToTry)
    }
    val (fieldIds, stitchesPerTweet) = pendingWrites.unzip

    Stitch.collect(stitchesPerTweet).map { seqOfTries =>
      val resultsByField = fieldIds.zip(seqOfTries).toMap

      // If even a single field was rate limited, we will send an overall OverCapacity TweetResponse
      val wasRateLimited = resultsByField.values.exists {
        case Throw(_: DeniedManhattanException) => true
        case _ => false
      }

      if (wasRateLimited)
        buildTweetOverCapacityResponse("updateTweets", tweetId, resultsByField)
      else
        buildTweetResponse("updateTweets", tweetId, resultsByField)
    }
  }

  // True when the stored form of `tpTweet`, restricted to `fields`, has a blob for each field.
  private def areAllFieldsDefined(tpTweet: Tweet, fields: Seq[Field]) = {
    val storedTweet = StorageConversions.toStoredTweetForFields(tpTweet, fields.toSet)
    fields.forall(field => storedTweet.getFieldBlob(field.id).isDefined)
  }
}

View File

@ -1,11 +0,0 @@
package com.twitter.tweetypie
import com.twitter.storage.client.manhattan.kv.ManhattanValue
import java.nio.ByteBuffer
/** Type aliases shared across the `storage` package. */
package object storage {
// Tweet ids are plain Longs.
type TweetId = Long
// Field ids are Shorts.
type FieldId = Short
// A Manhattan value whose contents are a raw ByteBuffer.
type TweetManhattanValue = ManhattanValue[ByteBuffer]
}

View File

@ -1,20 +0,0 @@
# Library target built from every .scala file matched by the `sources` glob.
scala_library(
sources = ["*.scala"],
compiler_option_sets = ["fatal_warnings"],
strict_deps = True,
tags = ["bazel-compatible"],
dependencies = [
"finagle/finagle-core/src/main",
"flock-client/src/main/scala",
"flock-client/src/main/thrift:thrift-scala",
"tweetypie/servo/util/src/main/scala",
"snowflake:id",
"src/thrift/com/twitter/gizmoduck:thrift-scala",
"src/thrift/com/twitter/servo:servo-exception-java",
"tweetypie/common/src/thrift/com/twitter/tweetypie:tweet-scala",
"tweetypie/server/src/main/scala/com/twitter/tweetypie",
"tweetypie/server/src/main/scala/com/twitter/tweetypie/serverutil",
"tweetypie/common/src/scala/com/twitter/tweetypie/util",
"util/util-core:scala",
],
)

View File

@ -1,532 +0,0 @@
/** Copyright 2010 Twitter, Inc. */
package com.twitter.tweetypie
package tflock
import com.twitter.finagle.stats.Counter
import com.twitter.flockdb.client._
import com.twitter.flockdb.client.thriftscala.Priority
import com.twitter.snowflake.id.SnowflakeId
import com.twitter.tweetypie.serverutil.StoredCard
import com.twitter.tweetypie.thriftscala._
import com.twitter.util.Future
import scala.collection.mutable.ListBuffer
/**
 * Static tables used by the [[TFlockIndexer]] class: printable graph names, the deletion policy
 * per graph, and a factory for per-graph stats counters.
 */
object TFlockIndexer {

  /**
   * Printable names for some edge types currently defined in [[com.twitter.flockdb.client]].
   * Used to define stats counters for adding edges.
   */
  val graphNames: Map[Int, String] =
    Map(
      CardTweetsGraph.id -> "card_tweets",
      ConversationGraph.id -> "conversation",
      DirectedAtUserIdGraph.id -> "directed_at_user_id",
      InvitedUsersGraph.id -> "invited_users",
      MediaTimelineGraph.id -> "media_timeline",
      MentionsGraph.id -> "mentions",
      NarrowcastSentTweetsGraph.id -> "narrowcast_sent_tweets",
      NullcastedTweetsGraph.id -> "nullcasted_tweets",
      QuotersGraph.id -> "quoters",
      QuotesGraph.id -> "quotes",
      QuoteTweetsIndexGraph.id -> "quote_tweets_index",
      RepliesToTweetsGraph.id -> "replies_to_tweets",
      RetweetsByMeGraph.id -> "retweets_by_me",
      RetweetsGraph.id -> "retweets",
      RetweetsOfMeGraph.id -> "retweets_of_me",
      RetweetSourceGraph.id -> "retweet_source",
      TweetsRetweetedGraph.id -> "tweets_retweeted",
      UserTimelineGraph.id -> "user_timeline",
      CreatorSubscriptionTimelineGraph.id -> "creator_subscription_timeline",
      CreatorSubscriptionMediaTimelineGraph.id -> "creator_subscription_image_timeline",
    )

  /**
   * On edge deletion, edges are either archived permanently or retained for 3 months, depending
   * on the retention policy of their graph. (The original comment pointed at a wiki page for
   * that policy; the link is not part of this file.)
   *
   * These two retention policies correspond to the two deletion techniques: archive and remove.
   * We call removeEdges for edges with a short retention policy and archiveEdges for edges with
   * a permanent retention policy. The graphs listed here are the ones whose edges are removed.
   */
  val graphsWithRemovedEdges: Seq[Int] =
    Seq(
      CardTweetsGraph.id,
      CuratedTimelineGraph.id,
      CuratedTweetsGraph.id,
      DirectedAtUserIdGraph.id,
      MediaTimelineGraph.id,
      MutedConversationsGraph.id,
      QuotersGraph.id,
      QuotesGraph.id,
      QuoteTweetsIndexGraph.id,
      ReportedTweetsGraph.id,
      RetweetsOfMeGraph.id,
      RetweetSourceGraph.id,
      SoftLikesGraph.id,
      TweetsRetweetedGraph.id,
      CreatorSubscriptionTimelineGraph.id,
      CreatorSubscriptionMediaTimelineGraph.id,
    )

  /**
   * These edges should be left in place when bounced tweets are deleted.
   * These edges are removed during hard deletion.
   *
   * This is done so external teams (timelines) can execute on these edges for
   * tombstone feature.
   */
  val bounceDeleteGraphIds: Set[Int] =
    Set(
      UserTimelineGraph.id,
      ConversationGraph.id
    )

  /**
   * Creates one counter per named graph for the given operation.
   *
   * @param stats     receiver under which each counter is scoped by its graph name
   * @param operation counter name, e.g. "insert" or "archive"
   * @return a map from graph id to counter; ids missing from [[graphNames]] resolve to a shared
   *         counter scoped under "unknown"
   */
  def makeCounters(stats: StatsReceiver, operation: String): Map[Int, Counter] = {
    val unknownCounter = stats.scope("unknown").counter(operation)
    // Build a strict map. `mapValues` on Scala 2.12 returns a lazy view, which would re-run
    // `stats.scope(name).counter(operation)` on every single counter lookup.
    TFlockIndexer.graphNames
      .map { case (graphId, name) => graphId -> stats.scope(name).counter(operation) }
      .withDefaultValue(unknownCounter)
  }
}
/**
* @param backgroundIndexingPriority specifies the queue to use for
* background indexing operations. This is useful for making the
* effects of background indexing operations (such as deleting edges
* for deleted Tweets) available sooner in testing scenarios
* (end-to-end tests or development instances). It is set to
* Priority.Low in production to reduce the load on high priority
* queues that we use for prominently user-visible operations.
*/
class TFlockIndexer(
tflock: TFlockClient,
hasMedia: Tweet => Boolean,
backgroundIndexingPriority: Priority,
stats: StatsReceiver)
extends TweetIndexer {
// Shared "no extra edges" result returned by the asynchronous edge lookups.
private[this] val FutureNil = Future.Nil
// Per-graph counters, one map per TFlock operation, incremented after the operation succeeds.
private[this] val archiveCounters = TFlockIndexer.makeCounters(stats, "archive")
private[this] val removeCounters = TFlockIndexer.makeCounters(stats, "remove")
private[this] val insertCounters = TFlockIndexer.makeCounters(stats, "insert")
private[this] val negateCounters = TFlockIndexer.makeCounters(stats, "negate")
// Edge inserts for create/undelete always use the high priority queue.
private[this] val foregroundIndexingPriority: Priority = Priority.High
// Inserts the edges for a newly created tweet.
override def createIndex(tweet: Tweet): Future[Unit] =
createEdges(tweet, isUndelete = false)
// Inserts the edges for a tweet that is being undeleted.
override def undeleteIndex(tweet: Tweet): Future[Unit] =
createEdges(tweet, isUndelete = true)
// Edges of a tweet being deleted, grouped by the operation deleteIndex applies to each group:
// longRetention edges are archived, shortRetention edges are removed, negate edges are negated,
// and ignore edges are left untouched.
private[this] case class PartitionedEdges(
longRetention: Seq[ExecuteEdge[StatusGraph]] = Nil,
shortRetention: Seq[ExecuteEdge[StatusGraph]] = Nil,
negate: Seq[ExecuteEdge[StatusGraph]] = Nil,
ignore: Seq[ExecuteEdge[StatusGraph]] = Nil)
// Sorts the edges of a tweet being deleted into the groups of PartitionedEdges.
// Each edge is prepended to its group, so groups end up in reverse input order.
private[this] def partitionEdgesForDelete(
  edges: Seq[ExecuteEdge[StatusGraph]],
  isBounceDelete: Boolean
) =
  edges.foldLeft(PartitionedEdges()) { (acc, edge) =>
    val graphId = edge.graphId
    if (isBounceDelete && graphId == UserTimelineGraph.id) {
      // Two dependees of UserTimelineGraph edge states to satisfy: timelines & safety tools.
      // Timelines show bounce-deleted tweets as tombstones; regular deletes are not shown.
      //   - i.e. timelineIds = UserTimelineGraph(Normal || Negative)
      // Safety tools show deleted tweets to authorized internal review agents
      //   - i.e. deletedIds = UserTimelineGraph(Removed || Negative)
      acc.copy(negate = edge +: acc.negate)
    } else if (isBounceDelete && graphId == ConversationGraph.id) {
      // Bounce-deleted tweets remain rendered as tombstones in conversations, so do not modify
      // the ConversationGraph edge state
      acc.copy(ignore = edge +: acc.ignore)
    } else if (TFlockIndexer.graphsWithRemovedEdges.contains(graphId)) {
      acc.copy(shortRetention = edge +: acc.shortRetention)
    } else {
      acc.copy(longRetention = edge +: acc.longRetention)
    }
  }
// Computes the tweet's edges for a delete, then archives, removes or negates them according to
// partitionEdgesForDelete. All three operations use the background priority, and the per-graph
// counters are bumped only for an operation that succeeded.
override def deleteIndex(tweet: Tweet, isBounceDelete: Boolean): Future[Unit] =
  getEdges(tweet, isCreate = false, isDelete = true, isUndelete = false).flatMap { edges =>
    val partitioned = partitionEdgesForDelete(edges, isBounceDelete)

    val archived =
      tflock
        .archiveEdges(partitioned.longRetention, backgroundIndexingPriority)
        .onSuccess { _ =>
          partitioned.longRetention.foreach(edge => archiveCounters(edge.graphId).incr())
        }

    val removed =
      tflock
        .removeEdges(partitioned.shortRetention, backgroundIndexingPriority)
        .onSuccess { _ =>
          partitioned.shortRetention.foreach(edge => removeCounters(edge.graphId).incr())
        }

    val negated =
      tflock
        .negateEdges(partitioned.negate, backgroundIndexingPriority)
        .onSuccess { _ =>
          partitioned.negate.foreach(edge => negateCounters(edge.graphId).incr())
        }

    Future.join(archived, removed, negated).unit
  }
/**
 * Called when a user enters or leaves a state in which their retweets
 * should no longer be visible (e.g. suspended or ROPO).
 *
 * Making the retweet visible inserts its RetweetsGraph edge; hiding it archives the edge.
 * Either way the background priority is used and the matching counter is bumped on success.
 */
override def setRetweetVisibility(retweetId: TweetId, setVisible: Boolean): Future[Unit] = {
  val retweetEdge = Seq(ExecuteEdge(retweetId, RetweetsGraph, None, Reverse))
  if (!setVisible)
    tflock
      .archiveEdges(retweetEdge, backgroundIndexingPriority)
      .onSuccess(_ => archiveCounters(RetweetsGraph.id).incr())
  else
    tflock
      .insertEdges(retweetEdge, backgroundIndexingPriority)
      .onSuccess(_ => insertCounters(RetweetsGraph.id).incr())
}
// Computes the edges for a create (or undelete) and inserts them at the foreground priority.
private[this] def createEdges(tweet: Tweet, isUndelete: Boolean): Future[Unit] =
  getEdges(tweet = tweet, isCreate = true, isDelete = false, isUndelete = isUndelete)
    .flatMap { newEdges =>
      tflock.insertEdges(newEdges, foregroundIndexingPriority).map { _ =>
        // Count all the edges we've successfully added:
        newEdges.foreach(edge => insertCounters(edge.graphId).incr())
      }
    }
// Adds the edges that describe a retweet.
//
// Three edges are always added. On create, a positioned RetweetSourceGraph edge and a
// TweetsRetweetedGraph edge are added as well. Otherwise the RetweetSourceGraph edge is added
// and the TweetsRetweetedGraph edge is queued in `futureEdges` only if this looks like the last
// remaining retweet of the source tweet.
private[this] def addRTEdges(
  tweet: Tweet,
  share: Share,
  isCreate: Boolean,
  edges: ListBuffer[ExecuteEdge[StatusGraph]],
  futureEdges: ListBuffer[Future[Seq[ExecuteEdge[StatusGraph]]]]
): Unit = {
  edges.append(RetweetsOfMeGraph.edge(share.sourceUserId, tweet.id))
  edges.append(RetweetsByMeGraph.edge(getUserId(tweet), tweet.id))
  edges.append(RetweetsGraph.edge(share.sourceStatusId, tweet.id))

  if (!isCreate) {
    edges.append(RetweetSourceGraph.edge(getUserId(tweet), share.sourceStatusId))

    // if this is the last retweet we need to remove it from the source user's
    // tweets retweeted graph
    val retweetsOfSource = RetweetsGraph.from(share.sourceStatusId)
    val lastRetweetEdges: Future[Seq[ExecuteEdge[StatusGraph]]] =
      tflock.count(retweetsOfSource).flatMap { count =>
        if (count > 1) {
          FutureNil
        } else {
          // The count says at most one retweet is left; re-check against the actual ids.
          tflock.selectAll(retweetsOfSource).map { retweetIds =>
            if (retweetIds.size > 1) Nil
            else Seq(TweetsRetweetedGraph.edge(share.sourceUserId, share.sourceStatusId))
          }
        }
      }
    futureEdges.append(lastRetweetEdges)
  } else {
    edges.append(
      ExecuteEdge(
        sourceId = getUserId(tweet),
        graph = RetweetSourceGraph,
        destinationIds = Some(Seq(share.sourceStatusId)),
        direction = Forward,
        position = Some(SnowflakeId(tweet.id).time.inMillis)
      )
    )
    edges.append(TweetsRetweetedGraph.edge(share.sourceUserId, share.sourceStatusId))
  }
}
// Adds the reply edge and, when the tweet has a conversation id, the conversation edge.
// Nothing is added unless the tweet is a reply to a specific tweet.
private[this] def addReplyEdges(
  tweet: Tweet,
  edges: ListBuffer[ExecuteEdge[StatusGraph]]
): Unit =
  for {
    reply <- getReply(tweet)
    inReplyToStatusId <- reply.inReplyToStatusId
  } {
    edges += RepliesToTweetsGraph.edge(inReplyToStatusId, tweet.id)
    // only index conversationId if this is a reply to another tweet
    TweetLenses.conversationId.get(tweet).foreach { conversationId =>
      edges += ConversationGraph.edge(conversationId, tweet.id)
    }
  }
// Adds a DirectedAtUserIdGraph edge when the tweet has a directed-at user.
private[this] def addDirectedAtEdges(
  tweet: Tweet,
  edges: ListBuffer[ExecuteEdge[StatusGraph]]
): Unit =
  for (directedAtUser <- TweetLenses.directedAtUser.get(tweet))
    edges.append(DirectedAtUserIdGraph.edge(directedAtUser.userId, tweet.id))
// Adds one MentionsGraph edge per mention that carries a user id.
private[this] def addMentionEdges(
  tweet: Tweet,
  edges: ListBuffer[ExecuteEdge[StatusGraph]]
): Unit = {
  val mentionedUserIds = getMentions(tweet).flatMap(_.userId)
  for (mentionedUserId <- mentionedUserIds)
    edges += MentionsGraph.edge(mentionedUserId, tweet.id)
}
// Adds the edges that describe a quote tweet. Nothing is added when the tweet quotes nothing.
private[this] def addQTEdges(
  tweet: Tweet,
  edges: ListBuffer[ExecuteEdge[StatusGraph]],
  futureEdges: ListBuffer[Future[Seq[ExecuteEdge[StatusGraph]]]],
  isCreate: Boolean
): Unit = {
  val userId = getUserId(tweet)

  tweet.quotedTweet.foreach { quoted =>
    // These two edges are issued for creates and deletes alike; only the QuotersGraph edge is
    // treated differently on the delete path.
    edges.append(QuotesGraph.edge(quoted.userId, tweet.id))
    edges.append(QuoteTweetsIndexGraph.edge(quoted.tweetId, tweet.id))

    // Delete path only. The QuotersGraph edge says "this user quotes that tweet", so it may
    // only go away once the user's last quote of the tweet is deleted.
    //
    // This is best effort (the same approach is used for retweets) and can race:
    //   i)   A quotes a tweet T twice, producing T1 and T2.
    //   ii)  QuotersGraph should hold T -> A and QuoteTweetsIndexGraph should hold T1 <-> T and
    //        T2 <-> T, but one of the index edges has not been written to TFlock yet.
    //   iii) Deleting one quote then looks like deleting the last one, and the QuotersGraph
    //        edge is issued for deletion although it should stay.
    def quotersEdgeIfLastQuote: Future[Seq[ExecuteEdge[StatusGraph]]] = {
      // All quotes of the quoted tweet made by the quoting user.
      val quotesFromQuotingUser =
        QuoteTweetsIndexGraph
          .from(quoted.tweetId)
          .intersect(UserTimelineGraph.from(userId))

      tflock.count(quotesFromQuotingUser).flatMap { count =>
        if (count > 1) {
          FutureNil
        } else {
          // The count suggests this is the last quote; re-check against the actual ids.
          tflock.selectAll(quotesFromQuotingUser).map { quoteIds =>
            if (quoteIds.size > 1) Nil
            else Seq(QuotersGraph.edge(quoted.tweetId, userId))
          }
        }
      }
    }

    if (isCreate) {
      // Creates add the QuotersGraph edge without any additional checks.
      edges.append(QuotersGraph.edge(quoted.tweetId, userId))
    } else {
      futureEdges.append(quotersEdgeIfLastQuote)
    }
  }
}
// Adds a CardTweetsGraph edge when the tweet's card reference matches StoredCard.
private[this] def addCardEdges(
  tweet: Tweet,
  edges: ListBuffer[ExecuteEdge[StatusGraph]]
): Unit =
  // Note that we are indexing only the TOO "stored" cards
  // (cardUri=card://<cardId>). Rest of the cards are ignored here.
  tweet.cardReference.foreach {
    case StoredCard(id) => edges.append(CardTweetsGraph.edge(id, tweet.id))
    case _ => ()
  }
// Adds the whole-node edge sets that are archived on delete and restored on undelete.
//
// Note: on undelete this restores all archived edges, including those that may have been
// archived prior to the delete. This is incorrect behavior but in practice rarely causes
// problems, as undeletes are so rare.
private[this] def addEdgesForDeleteOrUndelete(
  tweet: Tweet,
  edges: ListBuffer[ExecuteEdge[StatusGraph]]
): Unit = {
  val mentionEdges = MentionsGraph.edges(tweet.id, None, Reverse)
  val replyEdges = RepliesToTweetsGraph.edges(tweet.id, None)
  edges.append(mentionEdges)
  edges.append(replyEdges)

  // When we delete or undelete a conversation control root Tweet we want to archive or restore
  // all the edges in InvitedUsersGraph from the Tweet id.
  val isControlledRoot = hasConversationControl(tweet) && isConversationRoot(tweet)
  if (isControlledRoot)
    edges.append(InvitedUsersGraph.edges(tweet.id, None))
}
// Adds the author-to-tweet timeline edges. Nullcast tweets and narrowcast tweets each get a
// single dedicated edge; every other tweet goes into the user timeline and, when applicable,
// the media and creator-subscription timelines.
private[this] def addSimpleEdges(
  tweet: Tweet,
  edges: ListBuffer[ExecuteEdge[StatusGraph]]
): Unit = {
  if (TweetLenses.nullcast.get(tweet)) {
    edges.append(NullcastedTweetsGraph.edge(getUserId(tweet), tweet.id))
  } else if (TweetLenses.narrowcast.get(tweet).nonEmpty) {
    edges.append(NarrowcastSentTweetsGraph.edge(getUserId(tweet), tweet.id))
  } else {
    edges.append(UserTimelineGraph.edge(getUserId(tweet), tweet.id))
    if (hasMedia(tweet)) {
      edges.append(MediaTimelineGraph.edge(getUserId(tweet), tweet.id))
    }

    // Index root creator subscription tweets.
    // Ignore replies because those are not necessarily visible to a user who subscribes to
    // tweet author. A tweet without core data counts as a root tweet.
    val isRootTweet: Boolean =
      tweet.coreData.forall(core => core.reply.isEmpty && core.share.isEmpty)

    if (tweet.exclusiveTweetControl.isDefined && isRootTweet) {
      edges.append(CreatorSubscriptionTimelineGraph.edge(getUserId(tweet), tweet.id))
      if (hasMedia(tweet)) {
        edges.append(CreatorSubscriptionMediaTimelineGraph.edge(getUserId(tweet), tweet.id))
      }
    }
  }
}
/**
 * Issues one InvitedUsersGraph edge per mentioned user of a conversation-controlled tweet, so
 * that the graph accumulates the complete set of @mention-invited user ids per conversation id.
 * A tweet without a conversation id uses its own id as the conversation id.
 */
private def invitedUsersEdgesForCreate(
  tweet: Tweet,
  edges: ListBuffer[ExecuteEdge[StatusGraph]]
): Unit = {
  val conversationId: Long = getConversationId(tweet).getOrElse(tweet.id)
  val invitedUserIds: Seq[UserId] = getMentions(tweet).flatMap(_.userId)
  val invitationEdges =
    invitedUserIds.map(invitedUserId => InvitedUsersGraph.edge(conversationId, invitedUserId))
  edges ++= invitationEdges
}
/**
 * Queues the InvitedUsersGraph edges that ought to be deleted for a conversation-controlled
 * reply: one per user mentioned in the tweet, but only if that user is not mentioned elsewhere
 * in the conversation. This keeps InvitedUsersGraph holding every user invited to the
 * conversation, with an edge removed only after the last mention of a user is deleted.
 * Nothing is queued for a tweet without a conversation id.
 */
private def invitedUsersEdgesForDelete(
  tweet: Tweet,
  futureEdges: ListBuffer[Future[Seq[ExecuteEdge[StatusGraph]]]]
): Unit =
  for {
    conversationId <- getConversationId(tweet)
    userId <- getMentions(tweet).flatMap(_.userId)
  } {
    val tweetIdsWithinConversation = ConversationGraph.from(conversationId)
    val tweetIdsThatMentionUser = MentionsGraph.from(userId)
    val mentionsInConversation = tweetIdsThatMentionUser.intersect(tweetIdsWithinConversation)

    futureEdges.append(
      tflock
        .selectAll(
          query = mentionsInConversation,
          limit = Some(2), // Just need to know if it is >1 or <=1, so 2 are enough.
          pageSize = None // Provide default, otherwise Mockito complains
        ).map { mentioningTweetIds =>
          if (mentioningTweetIds.size > 1) Nil
          else Seq(InvitedUsersGraph.edge(conversationId, userId))
        }
    )
  }
// True when the tweet's conversation control is one of ByInvitation, Community or Followers and
// has inviteViaMention set to true. An unset flag, any other control, or no control is false.
private def hasInviteViaMention(tweet: Tweet): Boolean =
  tweet.conversationControl.exists {
    case ConversationControl.ByInvitation(controls) => controls.inviteViaMention.contains(true)
    case ConversationControl.Community(controls) => controls.inviteViaMention.contains(true)
    case ConversationControl.Followers(controls) => controls.inviteViaMention.contains(true)
    case _ => false
  }
// True when the tweet carries any conversation control setting.
private def hasConversationControl(tweet: Tweet): Boolean =
  tweet.conversationControl.nonEmpty
// If a Tweet has a ConversationControl, it must have a ConversationId associated with it so we
// can compare the ConversationId with the current Tweet ID to determine if it's the root of the
// conversation. See ConversationIdHydrator for more details
//
// NOTE(review): the `.get` throws when the tweet has no conversation id. Every call site in
// this class checks hasConversationControl first; confirm that this guarantees an id, since
// invitedUsersEdgesForCreate falls back to the tweet id instead of assuming one.
private def isConversationRoot(tweet: Tweet): Boolean =
getConversationId(tweet).get == tweet.id
// Maintains InvitedUsersGraph for conversation-controlled tweets; other tweets are ignored.
//
//   create of a root tweet   -> add edges, but only for original creates (undeletes are
//                               handled by addEdgesForDeleteOrUndelete)
//   create of a reply        -> add edges only in inviteViaMention mode
//   otherwise, for a reply   -> queue deletion of edges whose last mention disappears
private def addInvitedUsersEdges(
  tweet: Tweet,
  isCreate: Boolean,
  isUndelete: Boolean,
  edges: ListBuffer[ExecuteEdge[StatusGraph]],
  futureEdges: ListBuffer[Future[Seq[ExecuteEdge[StatusGraph]]]]
): Unit =
  if (hasConversationControl(tweet)) {
    val isRoot = isConversationRoot(tweet)
    if (isCreate) {
      if (isRoot) {
        if (!isUndelete) invitedUsersEdgesForCreate(tweet, edges)
      } else if (hasInviteViaMention(tweet)) {
        invitedUsersEdgesForCreate(tweet, edges)
      }
    } else if (!isRoot) {
      invitedUsersEdgesForDelete(tweet, futureEdges)
    }
  }
// Computes every edge affected by creating, deleting or undeleting the tweet.
//
// Edges known up front are gathered in one buffer; edges that need a TFlock lookup first are
// gathered as futures. The result holds the up-front edges followed by the looked-up ones.
private[this] def getEdges(
  tweet: Tweet,
  isCreate: Boolean,
  isDelete: Boolean,
  isUndelete: Boolean
): Future[Seq[ExecuteEdge[StatusGraph]]] = {
  val edges = ListBuffer.empty[ExecuteEdge[StatusGraph]]
  val futureEdges = ListBuffer.empty[Future[Seq[ExecuteEdge[StatusGraph]]]]

  addSimpleEdges(tweet, edges)

  getShare(tweet) match {
    case None =>
      // Not a retweet: index everything the tweet itself refers to.
      addInvitedUsersEdges(tweet, isCreate, isUndelete, edges, futureEdges)
      addReplyEdges(tweet, edges)
      addDirectedAtEdges(tweet, edges)
      addMentionEdges(tweet, edges)
      addQTEdges(tweet, edges, futureEdges, isCreate)
      addCardEdges(tweet, edges)
      if (isDelete || isUndelete) {
        addEdgesForDeleteOrUndelete(tweet, edges)
      }

    case Some(share) =>
      // A retweet only gets the retweet-specific edges.
      addRTEdges(tweet, share, isCreate, edges, futureEdges)
  }

  Future.collect(futureEdges).map { lookedUpEdges =>
    edges ++= lookedUpEdges.flatten
    edges.toList
  }
}
}

View File

@ -1,30 +0,0 @@
/** Copyright 2010 Twitter, Inc. */
package com.twitter.tweetypie
package tflock
import com.twitter.tweetypie.thriftscala.Tweet
import com.twitter.util.Future
/**
 * Hooks for keeping tweet indices in step with the tweet lifecycle.
 * Every method defaults to a no-op that returns an already completed Future.
 */
trait TweetIndexer {
/**
* Called at tweet-creation time, this method should set up all relevant indices on the tweet.
*/
def createIndex(tweet: Tweet): Future[Unit] = Future.Unit
/**
* Called at tweet-undelete time, this method should
* restore all relevant indices on the tweet.
*/
def undeleteIndex(tweet: Tweet): Future[Unit] = Future.Unit
/**
* Called at tweet-delete time, this method should archive all relevant indices on the tweet.
*
* @param isBounceDelete whether the tweet is being bounce-deleted
*/
def deleteIndex(tweet: Tweet, isBounceDelete: Boolean): Future[Unit] = Future.Unit
/**
* This method should archive or unarchive the retweet edge in TFlock RetweetsGraph.
*/
def setRetweetVisibility(retweetId: TweetId, visible: Boolean): Future[Unit] = Future.Unit
}

View File

@ -1,13 +0,0 @@
# Library target built from every .scala file matched by the `sources` glob.
scala_library(
sources = ["*.scala"],
compiler_option_sets = ["fatal_warnings"],
platform = "java8",
strict_deps = True,
tags = ["bazel-compatible"],
dependencies = [
"finagle/finagle-core/src/main",
"scrooge/scrooge-core/src/main/scala",
"tweetypie/common/src/thrift/com/twitter/tweetypie:service-scala",
"util/util-core:scala",
],
)

View File

@ -1,8 +0,0 @@
package com.twitter.tweetypie.thriftscala
import com.twitter.finagle.service.FailedService
/**
 * A TweetService client backed by a FailedService carrying
 * UnsupportedOperationException("not implemented").
 */
class NotImplementedTweetService
extends TweetService$FinagleClient(
new FailedService(new UnsupportedOperationException("not implemented"))
)

View File

@ -1,79 +0,0 @@
package com.twitter.tweetypie.thriftscala
import com.twitter.util.Future
/**
* A trait for TweetService implementations that wrap an underlying
* TweetService and need to modify only some of the methods.
*
* Every endpoint below forwards its request to `underlying` through `wrap`.
*/
trait TweetServiceProxy extends TweetService.MethodPerEndpoint {
// The service that every call is forwarded to.
protected def underlying: TweetService.MethodPerEndpoint
/**
* Default implementation simply passes through the Future but logic can be added to wrap each
* invocation to the underlying TweetService
*
* @param f the call to the underlying service; it is passed by name, so it is only
*          evaluated where an implementation of this method references it
*/
protected def wrap[A](f: => Future[A]): Future[A] =
f
override def getTweets(request: GetTweetsRequest): Future[Seq[GetTweetResult]] =
wrap(underlying.getTweets(request))
override def getTweetFields(request: GetTweetFieldsRequest): Future[Seq[GetTweetFieldsResult]] =
wrap(underlying.getTweetFields(request))
override def getTweetCounts(request: GetTweetCountsRequest): Future[Seq[GetTweetCountsResult]] =
wrap(underlying.getTweetCounts(request))
override def setAdditionalFields(request: SetAdditionalFieldsRequest): Future[Unit] =
wrap(underlying.setAdditionalFields(request))
override def deleteAdditionalFields(request: DeleteAdditionalFieldsRequest): Future[Unit] =
wrap(underlying.deleteAdditionalFields(request))
override def postTweet(request: PostTweetRequest): Future[PostTweetResult] =
wrap(underlying.postTweet(request))
override def postRetweet(request: RetweetRequest): Future[PostTweetResult] =
wrap(underlying.postRetweet(request))
override def unretweet(request: UnretweetRequest): Future[UnretweetResult] =
wrap(underlying.unretweet(request))
override def getDeletedTweets(
request: GetDeletedTweetsRequest
): Future[Seq[GetDeletedTweetResult]] =
wrap(underlying.getDeletedTweets(request))
override def deleteTweets(request: DeleteTweetsRequest): Future[Seq[DeleteTweetResult]] =
wrap(underlying.deleteTweets(request))
override def updatePossiblySensitiveTweet(
request: UpdatePossiblySensitiveTweetRequest
): Future[Unit] =
wrap(underlying.updatePossiblySensitiveTweet(request))
override def undeleteTweet(request: UndeleteTweetRequest): Future[UndeleteTweetResponse] =
wrap(underlying.undeleteTweet(request))
override def eraseUserTweets(request: EraseUserTweetsRequest): Future[Unit] =
wrap(underlying.eraseUserTweets(request))
override def incrTweetFavCount(request: IncrTweetFavCountRequest): Future[Unit] =
wrap(underlying.incrTweetFavCount(request))
override def deleteLocationData(request: DeleteLocationDataRequest): Future[Unit] =
wrap(underlying.deleteLocationData(request))
override def scrubGeo(request: GeoScrub): Future[Unit] =
wrap(underlying.scrubGeo(request))
override def takedown(request: TakedownRequest): Future[Unit] =
wrap(underlying.takedown(request))
override def flush(request: FlushRequest): Future[Unit] =
wrap(underlying.flush(request))
override def incrTweetBookmarkCount(request: IncrTweetBookmarkCountRequest): Future[Unit] =
wrap(underlying.incrTweetBookmarkCount(request))
}

View File

@ -1,15 +0,0 @@
# Library target built from every .scala file matched by the `sources` glob.
scala_library(
sources = ["*.scala"],
compiler_option_sets = ["fatal_warnings"],
strict_deps = True,
tags = ["bazel-compatible"],
dependencies = [
"tweetypie/servo/util",
"tweetypie/common/src/thrift/com/twitter/tweetypie:media-entity-scala",
"tweetypie/common/src/thrift/com/twitter/tweetypie:tweet-scala",
"tco-util",
"tweetypie/common/src/scala/com/twitter/tweetypie/tweettext",
"tweetypie/common/src/scala/com/twitter/tweetypie/util",
"twitter-text/lib/java/src/main/java/com/twitter/twittertext",
],
)

View File

@ -1,11 +0,0 @@
package com.twitter.tweetypie.thriftscala.entities
import com.twitter.tweetypie.thriftscala.CashtagEntity
import com.twitter.tweetypie.tweettext.TextEntity
/** TextEntity instance for CashtagEntity: reads the entity's index pair and rewrites it. */
object CashtagTextEntity extends TextEntity[CashtagEntity] {
override def fromIndex(entity: CashtagEntity): Short = entity.fromIndex
override def toIndex(entity: CashtagEntity): Short = entity.toIndex
// Returns a copy of the entity with both indices replaced; other fields are kept.
override def move(entity: CashtagEntity, fromIndex: Short, toIndex: Short): CashtagEntity =
entity.copy(fromIndex = fromIndex, toIndex = toIndex)
}

View File

@ -1,118 +0,0 @@
package com.twitter.tweetypie.thriftscala.entities
import com.twitter.servo.data.Mutation
import com.twitter.tco_util.TcoUrl
import com.twitter.tweetypie.thriftscala._
import com.twitter.tweetypie.thriftscala.entities.Implicits._
import com.twitter.tweetypie.tweettext.PartialHtmlEncoding
import com.twitter.tweetypie.tweettext.TextEntity
import com.twitter.tweetypie.tweettext.TextModification
import com.twitter.tweetypie.util.TweetLenses
import com.twitter.twittertext.Extractor
import scala.collection.JavaConverters._
/**
 * Functions that collect urls, mentions, hashtags, and cashtags from the
 * (HTML-encoded) text of tweets and messages.
 */
object EntityExtractor {

  // Only one configuration of com.twitter.twittertext.Extractor is ever used, so a
  // single shared instance is sufficient. The only available configuration option
  // is whether to extract URLs without protocols (defaults to true).
  private[this] val extractor = new Extractor

  // twitter-text works on unencoded text, whereas the text we store and process is
  // HTML-encoded. The TextModification produced here carries the decoded text (as
  // its `original`) for twitter-text to operate on, and lets us map the indices of
  // the extracted entities back onto the encoded text.
  private val htmlEncodedTextToEncodeModification: String => TextModification = { text =>
    val decoded = PartialHtmlEncoding.decodeWithModification(text)
    decoded.getOrElse(TextModification.identity(text)).inverse
  }

  private[this] val extractAllUrlsFromTextMod: TextModification => Seq[UrlEntity] =
    extractUrls(tcoOnly = false)

  /** Extracts every URL entity from HTML-encoded text. */
  val extractAllUrls: String => Seq[UrlEntity] =
    htmlEncodedTextToEncodeModification.andThen(extractAllUrlsFromTextMod)

  private[this] val extractTcoUrls: TextModification => Seq[UrlEntity] =
    extractUrls(tcoOnly = true)

  // Builds a URL extractor; when `tcoOnly` is set, only URLs accepted by
  // TcoUrl.isTcoUrl are kept.
  private[this] def extractUrls(tcoOnly: Boolean): TextModification => Seq[UrlEntity] =
    mkEntityExtractor[UrlEntity](
      text =>
        extractor
          .extractURLsWithIndices(text)
          .asScala
          .filter(found => !tcoOnly || TcoUrl.isTcoUrl(found.getValue)),
      (from, to, value) => UrlEntity(from, to, value)
    )

  private[this] val extractMentionsFromTextMod: TextModification => Seq[MentionEntity] =
    mkEntityExtractor[MentionEntity](
      text => extractor.extractMentionedScreennamesWithIndices(text).asScala,
      (from, to, value) => MentionEntity(from, to, value)
    )

  /** Extracts every mention entity from HTML-encoded text. */
  val extractMentions: String => Seq[MentionEntity] =
    htmlEncodedTextToEncodeModification.andThen(extractMentionsFromTextMod)

  private[this] val extractHashtagsFromTextMod: TextModification => Seq[HashtagEntity] =
    mkEntityExtractor[HashtagEntity](
      text => extractor.extractHashtagsWithIndices(text).asScala,
      (from, to, value) => HashtagEntity(from, to, value)
    )

  /** Extracts every hashtag entity from HTML-encoded text. */
  val extractHashtags: String => Seq[HashtagEntity] =
    htmlEncodedTextToEncodeModification.andThen(extractHashtagsFromTextMod)

  private[this] val extractCashtagsFromTextMod: TextModification => Seq[CashtagEntity] =
    mkEntityExtractor[CashtagEntity](
      text => extractor.extractCashtagsWithIndices(text).asScala,
      (from, to, value) => CashtagEntity(from, to, value)
    )

  /** Extracts every cashtag entity from HTML-encoded text. */
  val extractCashtags: String => Seq[CashtagEntity] =
    htmlEncodedTextToEncodeModification.andThen(extractCashtagsFromTextMod)

  // Shared plumbing for all entity kinds:
  //   1. run `extract` over the modification's `original` text,
  //   2. let twitter-text rewrite the indices of the found entities in place
  //      (modifyIndicesFromUTF16ToUnicode),
  //   3. build an `E` for each entity whose indices fit in a Short and whose value
  //      is non-null, and reindex it through the modification. Entities that fail
  //      any of these steps are dropped.
  private[this] def mkEntityExtractor[E: TextEntity](
    extract: String => Seq[Extractor.Entity],
    construct: (Short, Short, String) => E
  ): TextModification => Seq[E] = { htmlEncodedMod =>
    val found = extract(htmlEncodedMod.original)
    extractor.modifyIndicesFromUTF16ToUnicode(htmlEncodedMod.original, found.asJava)
    found.flatMap { e =>
      for {
        start <- asShort(e.getStart.intValue)
        end <- asShort(e.getEnd.intValue)
        if e.getValue != null
        reindexed <- htmlEncodedMod.reindexEntity(construct(start, end, e.getValue))
      } yield reindexed
    }
  }

  // Narrows an Int to a Short, yielding None when the value does not fit.
  private[this] def asShort(i: Int): Option[Short] =
    if (!i.isValidShort) None else Some(i.toShort)

  // Builds a tweet mutation that re-extracts mentions, hashtags and cashtags from
  // the tweet text; t.co URLs are re-extracted as well only when `extractUrls` is
  // set, otherwise the tweet's existing urls are kept.
  private[this] def mutation(extractUrls: Boolean): Mutation[Tweet] =
    Mutation { tweet =>
      val textMod = htmlEncodedTextToEncodeModification(TweetLenses.text.get(tweet))
      Some(
        tweet.copy(
          urls = if (extractUrls) Some(extractTcoUrls(textMod)) else tweet.urls,
          mentions = Some(extractMentionsFromTextMod(textMod)),
          hashtags = Some(extractHashtagsFromTextMod(textMod)),
          cashtags = Some(extractCashtagsFromTextMod(textMod))
        )
      )
    }

  /** Re-extracts mentions, hashtags and cashtags, leaving the tweet's urls untouched. */
  val mutationWithoutUrls: Mutation[Tweet] = mutation(extractUrls = false)

  /** Re-extracts t.co urls, mentions, hashtags and cashtags. */
  val mutationAll: Mutation[Tweet] = mutation(extractUrls = true)
}

View File

@ -1,11 +0,0 @@
package com.twitter.tweetypie.thriftscala.entities
import com.twitter.tweetypie.thriftscala.HashtagEntity
import com.twitter.tweetypie.tweettext.TextEntity
/**
 * [[TextEntity]] instance for [[HashtagEntity]]: reads the entity's index pair and
 * produces copies of the entity with a new index pair.
 */
object HashtagTextEntity extends TextEntity[HashtagEntity] {

  /** Returns the `fromIndex` field of the given hashtag entity. */
  override def fromIndex(entity: HashtagEntity): Short = {
    entity.fromIndex
  }

  /** Returns the `toIndex` field of the given hashtag entity. */
  override def toIndex(entity: HashtagEntity): Short = {
    entity.toIndex
  }

  /** Returns a copy of `entity` whose index pair is replaced by the supplied values. */
  override def move(entity: HashtagEntity, fromIndex: Short, toIndex: Short): HashtagEntity = {
    entity.copy(toIndex = toIndex, fromIndex = fromIndex)
  }
}

View File

@ -1,10 +0,0 @@
package com.twitter.tweetypie.thriftscala.entities
/**
 * Implicit values exposing the entity adapter objects of this package, so that a
 * single `import Implicits._` brings all of them into implicit scope.
 */
object Implicits {
  // Entities extracted from tweet text.
  implicit val urlTextEntity: UrlTextEntity.type = UrlTextEntity
  implicit val mentionTextEntity: MentionTextEntity.type = MentionTextEntity
  implicit val hashtagTextEntity: HashtagTextEntity.type = HashtagTextEntity
  implicit val cashtagTextEntity: CashtagTextEntity.type = CashtagTextEntity

  // Media entities and plain text ranges.
  implicit val mediaTextEntity: MediaTextEntity.type = MediaTextEntity
  implicit val textRangeTextEntity: TextRangeEntityAdapter.type = TextRangeEntityAdapter
}

View File

@ -1,11 +0,0 @@
package com.twitter.tweetypie.thriftscala.entities
import com.twitter.tweetypie.thriftscala.MediaEntity
import com.twitter.tweetypie.tweettext.TextEntity
/**
 * [[TextEntity]] instance for [[MediaEntity]]: reads the entity's index pair and
 * produces copies of the entity with a new index pair.
 */
object MediaTextEntity extends TextEntity[MediaEntity] {

  /** Returns the `fromIndex` field of the given media entity. */
  override def fromIndex(entity: MediaEntity): Short = {
    entity.fromIndex
  }

  /** Returns the `toIndex` field of the given media entity. */
  override def toIndex(entity: MediaEntity): Short = {
    entity.toIndex
  }

  /** Returns a copy of `entity` whose index pair is replaced by the supplied values. */
  override def move(entity: MediaEntity, fromIndex: Short, toIndex: Short): MediaEntity = {
    entity.copy(toIndex = toIndex, fromIndex = fromIndex)
  }
}

View File

@ -1,11 +0,0 @@
package com.twitter.tweetypie.thriftscala.entities
import com.twitter.tweetypie.thriftscala.MentionEntity
import com.twitter.tweetypie.tweettext.TextEntity
/**
 * [[TextEntity]] instance for [[MentionEntity]]: reads the entity's index pair and
 * produces copies of the entity with a new index pair.
 */
object MentionTextEntity extends TextEntity[MentionEntity] {

  /** Returns the `fromIndex` field of the given mention entity. */
  override def fromIndex(entity: MentionEntity): Short = {
    entity.fromIndex
  }

  /** Returns the `toIndex` field of the given mention entity. */
  override def toIndex(entity: MentionEntity): Short = {
    entity.toIndex
  }

  /** Returns a copy of `entity` whose index pair is replaced by the supplied values. */
  override def move(entity: MentionEntity, fromIndex: Short, toIndex: Short): MentionEntity = {
    entity.copy(toIndex = toIndex, fromIndex = fromIndex)
  }
}

View File

@ -1,11 +0,0 @@
package com.twitter.tweetypie.thriftscala.entities
import com.twitter.tweetypie.thriftscala.TextRange
import com.twitter.tweetypie.tweettext.TextEntity
/**
 * [[TextEntity]] instance for [[TextRange]]: reads the range's index pair as
 * `Short`s and produces copies of the range with a new index pair.
 *
 * NOTE(review): the `.toShort` conversions suggest that `TextRange` stores wider
 * indices than `Short`; values outside the `Short` range would be truncated by
 * them. Confirm against the thrift definition.
 */
object TextRangeEntityAdapter extends TextEntity[TextRange] {

  /** Returns the range's `fromIndex`, converted with `toShort`. */
  override def fromIndex(entity: TextRange): Short = {
    entity.fromIndex.toShort
  }

  /** Returns the range's `toIndex`, converted with `toShort`. */
  override def toIndex(entity: TextRange): Short = {
    entity.toIndex.toShort
  }

  /** Returns a copy of `entity` whose index pair is replaced by the supplied values. */
  override def move(entity: TextRange, fromIndex: Short, toIndex: Short): TextRange = {
    entity.copy(toIndex = toIndex, fromIndex = fromIndex)
  }
}

View File

@ -1,11 +0,0 @@
package com.twitter.tweetypie.thriftscala.entities
import com.twitter.tweetypie.thriftscala.UrlEntity
import com.twitter.tweetypie.tweettext.TextEntity
/**
 * [[TextEntity]] instance for [[UrlEntity]]: reads the entity's index pair and
 * produces copies of the entity with a new index pair.
 */
object UrlTextEntity extends TextEntity[UrlEntity] {

  /** Returns the `fromIndex` field of the given url entity. */
  override def fromIndex(entity: UrlEntity): Short = {
    entity.fromIndex
  }

  /** Returns the `toIndex` field of the given url entity. */
  override def toIndex(entity: UrlEntity): Short = {
    entity.toIndex
  }

  /** Returns a copy of `entity` whose index pair is replaced by the supplied values. */
  override def move(entity: UrlEntity, fromIndex: Short, toIndex: Short): UrlEntity = {
    entity.copy(toIndex = toIndex, fromIndex = fromIndex)
  }
}

View File

@ -1,16 +0,0 @@
# Scala library target built from every *.scala file in this directory, compiled
# with the "fatal_warnings" compiler option set and published as the
# com.twitter "tweetypie-tweettext" artifact.
scala_library(
sources = ["*.scala"],
compiler_option_sets = ["fatal_warnings"],
platform = "java8",
provides = scala_artifact(
org = "com.twitter",
name = "tweetypie-tweettext",
repo = artifactory,
),
strict_deps = True,
tags = ["bazel-compatible"],
# Direct dependencies: ICU4J and the twitter-text Java library.
dependencies = [
"3rdparty/jvm/com/ibm/icu:icu4j",
"twitter-text/lib/java/src/main/java/com/twitter/twittertext",
],
)

View File

@ -1,44 +0,0 @@
package com.twitter.tweetypie.tweettext
import com.ibm.icu.text.BreakIterator
/**
 * Adapts ICU's [[BreakIterator]] (character instance) to a scala [[Iterator]]
 * over the code-unit offsets of user-perceived characters in a String.
 */
object GraphemeIndexIterator {

  /**
   * Iterates over the indices in `s` that mark the END of a user-perceived
   * character (grapheme).
   */
  def ends(s: String): Iterator[Offset.CodeUnit] = {
    // Every grapheme start except the first one is also the end of the previous
    // grapheme; the final grapheme ends where the string ends.
    val laterStarts = starts(s).drop(1)
    laterStarts ++ Iterator(Offset.CodeUnit.length(s))
  }

  /**
   * Iterates over the indices in `s` that mark the START of a user-perceived
   * character (grapheme).
   */
  def starts(s: String): Iterator[Offset.CodeUnit] =
    new Iterator[Offset.CodeUnit] {
      private[this] val breaks = BreakIterator.getCharacterInstance()
      breaks.setText(s)

      // There is another grapheme as long as the current boundary lies before the
      // end of the string.
      override def hasNext: Boolean = breaks.current < s.length

      override def next: Offset.CodeUnit = {
        if (!hasNext) throw new IllegalArgumentException(s"${breaks.current()}, ${s.length}")
        // The value handed out is always the current boundary, i.e. the start of
        // the next grapheme; the break iterator is then advanced past it.
        val graphemeStart = breaks.current()
        breaks.next()
        Offset.CodeUnit(graphemeStart)
      }
    }
}

View File

@ -1,85 +0,0 @@
package com.twitter.tweetypie.tweettext
/**
 * An efficient converter of indices between code points and code units.
 *
 * The converter keeps a cursor (one matching pair of code unit / code point
 * offsets) that every conversion updates, so converting offsets that are close to
 * the previously converted one only has to scan the text in between. The cursor is
 * plain mutable state and no synchronization is performed here.
 */
class IndexConverter(text: String) {
  // Keep track of a single corresponding pair of code unit and code point
  // offsets so that we can re-use counting work if the next requested
  // entity is near the most recent entity.
  private var codePointIndex = 0
  // The code unit index should never split a surrogate pair.
  private var charIndex = 0

  /**
   * @param offset Index into the string measured in code units.
   * @return The code point index that corresponds to the specified code unit index.
   */
  def toCodePoints(offset: Offset.CodeUnit): Offset.CodePoint =
    Offset.CodePoint(codeUnitsToCodePoints(offset.toInt))

  /**
   * Converts a code unit index into a code point index and moves the cursor there.
   *
   * An index that points at the second code unit of a surrogate pair is treated as
   * if it pointed at the first code unit of that pair.
   *
   * @param charIndex Index into the string measured in code units.
   * @return The code point index that corresponds to the specified code unit index.
   * @throws IndexOutOfBoundsException if `charIndex` is negative or larger than the
   *                                   length of the text (raised by
   *                                   `String.codePointCount`).
   */
  def codeUnitsToCodePoints(charIndex: Int): Int = {
    // Snap the requested index to the start of its surrogate pair BEFORE counting.
    // Counting up to (or back from) the middle of a pair would treat the lone
    // surrogate as a code point of its own, which made backward moves come out one
    // code point too low.
    val splitsSurrogatePair =
      charIndex > 0 &&
        charIndex < text.length &&
        Character.isLowSurrogate(text.charAt(charIndex)) &&
        Character.isHighSurrogate(text.charAt(charIndex - 1))
    val target = if (splitsSurrogatePair) charIndex - 1 else charIndex

    if (target < this.charIndex) {
      this.codePointIndex -= text.codePointCount(target, this.charIndex)
    } else {
      this.codePointIndex += text.codePointCount(this.charIndex, target)
    }
    this.charIndex = target

    this.codePointIndex
  }

  /**
   * Converts a code point index into a code unit index and moves the cursor there.
   *
   * @param offset Index into the string measured in code points.
   * @return the corresponding code unit index
   */
  def toCodeUnits(offset: Offset.CodePoint): Offset.CodeUnit = {
    this.charIndex = text.offsetByCodePoints(charIndex, offset.toInt - this.codePointIndex)
    this.codePointIndex = offset.toInt
    Offset.CodeUnit(this.charIndex)
  }

  /**
   * @param codePointIndex Index into the string measured in code points.
   * @return the corresponding code unit index
   */
  def codePointsToCodeUnits(codePointIndex: Int): Int =
    toCodeUnits(Offset.CodePoint(codePointIndex)).toInt

  /**
   * Returns the substring that begins at code point `from` and extends to code
   * point `to`. Since String.substring works on code units, both offsets are first
   * converted from code points to code units.
   */
  def substring(from: Offset.CodePoint, to: Offset.CodePoint): String =
    text.substring(toCodeUnits(from).toInt, toCodeUnits(to).toInt)

  /**
   * Same as [[substring]], taking the code point offsets as plain `Int`s.
   */
  def substringByCodePoints(from: Int, to: Int): String =
    substring(Offset.CodePoint(from), Offset.CodePoint(to))

  /**
   * Returns the substring that begins at code point `from` and extends to the end
   * of the string. Since String.substring works on code units, the offset is first
   * converted from code points to code units.
   */
  def substringByCodePoints(from: Int): String = {
    val charFrom = codePointsToCodeUnits(from)
    text.substring(charFrom)
  }
}

View File

@ -1,253 +0,0 @@
package com.twitter.tweetypie.tweettext
import scala.collection.immutable
/**
 * An Offset is a typed index into a String.
 *
 * Instances of this type class know how to measure a stretch of text in their own
 * unit and order offsets by their integer value.
 */
trait Offset[T] extends Ordering[T] {

  /** The raw integer value of the offset. */
  def toInt(t: T): Int

  /** Measures, in units of `T`, the text between the code unit offsets `start` and `end`. */
  def count(text: String, start: Offset.CodeUnit, end: Offset.CodeUnit): T

  /** Orders offsets by their integer value. */
  def compare(t1: T, t2: T): Int = toInt(t1).compare(toInt(t2))

  /** Measures the whole of `input` in units of `T`. */
  def length(input: String): T = count(input, Offset.CodeUnit(0), Offset.CodeUnit.length(input))
}

object Offset {

  /**
   * UTF-16 code unit offsets are the native offsets for Java/Scala
   * Strings.
   */
  case class CodeUnit(toInt: Int) extends AnyVal with Ordered[CodeUnit] {
    def compare(other: CodeUnit): Int = toInt.compare(other.toInt)
    def +(other: CodeUnit): CodeUnit = CodeUnit(toInt + other.toInt)
    def -(other: CodeUnit): CodeUnit = CodeUnit(toInt - other.toInt)
    def min(other: CodeUnit): CodeUnit = if (toInt < other.toInt) this else other
    def max(other: CodeUnit): CodeUnit = if (toInt > other.toInt) this else other
    def incr: CodeUnit = CodeUnit(toInt + 1)
    def decr: CodeUnit = CodeUnit(toInt - 1)

    /** All offsets from this one (inclusive) up to `end` (exclusive). */
    def until(end: CodeUnit): immutable.IndexedSeq[CodeUnit] =
      toInt.until(end.toInt).map(CodeUnit(_))

    /**
     * Converts this `CodeUnit` to the equivalent `CodePoint` within the
     * given text.
     */
    def toCodePoint(text: String): CodePoint =
      CodePoint(text.codePointCount(0, toInt))

    /** The code unit offset reached by moving `codePoints` code points from this offset. */
    def offsetByCodePoints(text: String, codePoints: CodePoint): CodeUnit =
      CodeUnit(text.offsetByCodePoints(toInt, codePoints.toInt))
  }

  implicit object CodeUnit extends Offset[CodeUnit] {
    def toInt(u: CodeUnit): Int = u.toInt
    override def length(text: String): CodeUnit = CodeUnit(text.length)
    def count(text: String, start: CodeUnit, end: CodeUnit): CodeUnit = end - start
  }

  /**
   * Offsets in whole Unicode code points. Any CodePoint is a valid
   * offset into the String as long as it is >= 0 and less than the
   * number of code points in the string.
   */
  case class CodePoint(toInt: Int) extends AnyVal with Ordered[CodePoint] {
    def toShort: Short = toInt.toShort
    def compare(other: CodePoint): Int = toInt.compare(other.toInt)
    def +(other: CodePoint): CodePoint = CodePoint(toInt + other.toInt)
    def -(other: CodePoint): CodePoint = CodePoint(toInt - other.toInt)
    def min(other: CodePoint): CodePoint = if (toInt < other.toInt) this else other
    def max(other: CodePoint): CodePoint = if (toInt > other.toInt) this else other

    /** All offsets from this one (inclusive) up to `end` (exclusive). */
    def until(end: CodePoint): immutable.IndexedSeq[CodePoint] =
      toInt.until(end.toInt).map(CodePoint(_))

    /** Converts this `CodePoint` to the equivalent `CodeUnit` within the given text. */
    def toCodeUnit(text: String): CodeUnit =
      CodeUnit(text.offsetByCodePoints(0, toInt))
  }

  implicit object CodePoint extends Offset[CodePoint] {
    def toInt(p: CodePoint): Int = p.toInt

    def count(text: String, start: CodeUnit, end: CodeUnit): CodePoint =
      CodePoint(text.codePointCount(start.toInt, end.toInt))
  }

  /**
   * Offsets into the String as if the String were encoded as UTF-8. You
   * cannot use a [[Utf8]] offset to index a String, because not all
   * Utf8 indices are valid indices into the String.
   */
  case class Utf8(toInt: Int) extends AnyVal with Ordered[Utf8] {
    def compare(other: Utf8): Int = toInt.compare(other.toInt)
    def +(other: Utf8): Utf8 = Utf8(toInt + other.toInt)
    def -(other: Utf8): Utf8 = Utf8(toInt - other.toInt)
    def min(other: Utf8): Utf8 = if (toInt < other.toInt) this else other
    def max(other: Utf8): Utf8 = if (toInt > other.toInt) this else other
  }

  implicit object Utf8 extends Offset[Utf8] {
    def toInt(u: Utf8): Int = u.toInt

    /**
     * Count how many bytes this section of text would be when encoded as
     * UTF-8.
     */
    def count(s: String, start: CodeUnit, end: CodeUnit): Utf8 = {
      // Walks the text one code point at a time, summing the UTF-8 size of each.
      @annotation.tailrec
      def go(i: CodeUnit, byteLength: Utf8): Utf8 =
        if (i < end) {
          val cp = s.codePointAt(i.toInt)
          go(i + CodeUnit(Character.charCount(cp)), byteLength + forCodePoint(cp))
        } else {
          byteLength
        }

      go(start, Utf8(0))
    }

    /**
     * Unfortunately, there is no convenient API for finding out how many
     * bytes a unicode code point would take in UTF-8, so we have to
     * explicitly calculate it.
     *
     * @see http://en.wikipedia.org/wiki/UTF-8#Description
     */
    def forCodePoint(cp: Int): Utf8 =
      Utf8 {
        // if the code point is an unpaired surrogate, it will be converted
        // into a 1 byte replacement character
        if (Character.getType(cp) == Character.SURROGATE) 1
        else if (cp < 0x80) 1
        else if (cp < 0x800) 2
        else if (cp < 0x10000) 3
        else 4
      }
  }

  /**
   * Display units count what we consider a "character" in a
   * Tweet. [[DisplayUnit]] offsets are only valid for text that is
   * NFC-normalized (See: http://www.unicode.org/reports/tr15) and
   * HTML-encoded, though this interface cannot enforce that.
   *
   * Currently, a [[DisplayUnit]] is equivalent to a single Unicode code
   * point combined with treating "&lt;", "&gt;", and "&amp;" each as a
   * single character (since they are displayed as '<', '>', and '&'
   * respectively). This implementation is not directly exposed.
   *
   * It should be possible to change this definition without breaking
   * code that uses the [[DisplayUnit]] interface e.g. to count
   * user-perceived characters (graphemes) rather than code points,
   * though any change has to be made in concert with changing the
   * mobile client and Web implementations so that the user experience
   * of character counting remains consistent.
   */
  case class DisplayUnit(toInt: Int) extends AnyVal with Ordered[DisplayUnit] {
    def compare(other: DisplayUnit): Int = toInt.compare(other.toInt)
    def +(other: DisplayUnit): DisplayUnit = DisplayUnit(toInt + other.toInt)
    def -(other: DisplayUnit): DisplayUnit = DisplayUnit(toInt - other.toInt)
    def min(other: DisplayUnit): DisplayUnit = if (toInt < other.toInt) this else other
    def max(other: DisplayUnit): DisplayUnit = if (toInt > other.toInt) this else other
  }

  implicit object DisplayUnit extends Offset[DisplayUnit] {
    def toInt(d: DisplayUnit): Int = d.toInt

    /**
     * Returns the number of display units in the specified range of the
     * given text. See [[DisplayUnit]] for a description of what we
     * consider a display unit.
     *
     * The input string should already be NFC normalized to get
     * consistent results. If partially html encoded, it will correctly
     * count html entities as a single display unit.
     *
     * @param text the string containing the characters to count.
     * @param start the index to the first char of the text range
     * @param end the index after the last char of the text range; values beyond
     *            the end of the text are clamped to the length of the text.
     */
    def count(text: String, start: CodeUnit, end: CodeUnit): DisplayUnit = {
      val stop = end.min(CodeUnit.length(text))

      @annotation.tailrec
      def go(offset: CodeUnit, total: DisplayUnit): DisplayUnit =
        if (offset >= stop) total
        else go(offset + at(text, offset), total + DisplayUnit(1))

      go(start, DisplayUnit(0))
    }

    /**
     * Return the length of the display unit at the specified offset in
     * the (NFC-normalized, HTML-encoded) text.
     */
    def at(text: String, offset: CodeUnit): CodeUnit =
      CodeUnit {
        text.codePointAt(offset.toInt) match {
          case '&' =>
            if (text.regionMatches(offset.toInt, "&amp;", 0, 5)) 5
            else if (text.regionMatches(offset.toInt, "&lt;", 0, 4)) 4
            else if (text.regionMatches(offset.toInt, "&gt;", 0, 4)) 4
            else 1

          case cp => Character.charCount(cp)
        }
      }
  }

  /**
   * Ranges of offsets, useful for avoiding slicing entities.
   */
  sealed trait Ranges[T] {
    def contains(t: T): Boolean
  }

  object Ranges {
    // Each pair is an exclusive (lo, hi) range: it contains the offsets strictly
    // between lo and hi.
    private[this] case class Impl[T](toSeq: Seq[(T, T)])(implicit off: Offset[T])
        extends Ranges[T] {
      def contains(t: T): Boolean = toSeq.exists { case (lo, hi) => off.gt(t, lo) && off.lt(t, hi) }
    }

    /**
     * Non-inclusive range of offsets (matches values that are strictly
     * between `hi` and `lo`)
     */
    def between[T](lo: T, hi: T)(implicit off: Offset[T]): Ranges[T] =
      // The second condition guards the `lo + 1` above against Int overflow.
      if (off.toInt(hi) > off.toInt(lo) + 1 && off.toInt(lo) < Int.MaxValue) Impl(Seq((lo, hi)))
      else Impl(Nil)

    /**
     * The union of all of the specified ranges.
     */
    def all[T](ranges: Seq[Ranges[T]])(implicit off: Offset[T]): Ranges[T] =
      Impl(
        // Preprocess the ranges so that each contains check is as cheap
        // as possible.
        ranges
          .flatMap { case r: Impl[T] => r.toSeq }
          .sortBy(_._1)
          .foldLeft(Nil: List[(T, T)]) {
            // Overlapping ranges are merged. The merged range must end at the LARGER
            // of the two upper bounds: the later range may be nested inside the
            // earlier one, in which case taking its upper bound would shrink the
            // union.
            case ((a, b) :: out, (c, d)) if off.lt(c, b) => (a, off.max(b, d)) :: out
            case (out, r) => r :: out
          }
      )

    def Empty[T: Offset]: Ranges[T] = Impl[T](Nil)

    private[this] val HtmlEscapes = """&(?:amp|lt|gt);""".r

    /**
     * Match [[CodeUnit]]s that would split a HTML entity.
     */
    def htmlEntities(s: String): Ranges[CodeUnit] =
      // Each Match carries its own start/end, so the result does not depend on
      // when the underlying iterator is advanced.
      all(
        HtmlEscapes
          .findAllMatchIn(s)
          .map(m => between(CodeUnit(m.start), CodeUnit(m.end)))
          .toList
      )

    /** Builds the union of the exclusive ranges given as `(lo, hi)` code point pairs. */
    def fromCodePointPairs(pairs: Seq[(Int, Int)]): Ranges[CodePoint] =
      all(pairs.map { case (lo, hi) => between(CodePoint(lo), CodePoint(hi)) })
  }
}

View File

@ -1,55 +0,0 @@
package com.twitter.tweetypie.tweettext
/**
 * Partial HTML encoding and decoding of tweet text: only `<`, `>` and `&` are
 * converted to and from their entity forms.
 */
object PartialHtmlEncoding {

  /**
   * Replaces all `<`, `>`, and '&' chars with "&lt;", "&gt;", and "&amp;", respectively.
   *
   * Tweet text is HTML-encoded at tweet creation time, and is stored and processed in encoded form.
   */
  def encode(text: String): String = {
    // The encoded text is at least as long as the input, so start at that capacity.
    val buf = new StringBuilder(text.length)
    text.foreach {
      case '<' => buf.append("&lt;")
      case '>' => buf.append("&gt;")
      case '&' => buf.append("&amp;")
      case c => buf.append(c)
    }
    buf.toString
  }

  private val AmpLtRegex = "&lt;".r
  private val AmpGtRegex = "&gt;".r
  private val AmpAmpRegex = "&amp;".r

  /**
   * The opposite of encode, it replaces all "&lt;", "&gt;", and "&amp;" with
   * `<`, `>`, and '&', respectively. Text without any of these entities is
   * returned as is.
   */
  def decode(text: String): String =
    decodeWithModification(text).fold(text)(_.updated)

  /**
   * Decodes encoded entities, and returns a `TextModification` if the text was modified.
   */
  def decodeWithModification(text: String): Option[TextModification] =
    TextModification.replaceAll(
      text,
      AmpLtRegex -> "<",
      AmpGtRegex -> ">",
      AmpAmpRegex -> "&"
    )
}

View File

@ -1,251 +0,0 @@
package com.twitter.tweetypie.tweettext
import scala.util.matching.Regex
/**
 * Code used to convert raw user-provided text into an allowable form.
 */
object Preprocessor {
  import TweetText._
  import TextModification.replaceAll

  /**
   * Regex for dos-style line endings.
   */
  val DosLineEndingRegex: Regex = """\r\n""".r

  /**
   * Converts \r\n to just \n.
   */
  def normalizeNewlines(text: String): String =
    DosLineEndingRegex.replaceAllIn(text, "\n")

  /**
   * Characters to strip out of tweet text at write-time.
   */
  val unicodeCharsToStrip: Seq[Char] =
    Seq(
      '\uFFFE', '\uFEFF', // BOM
      '\uFFFF', // Special
      '\u200E', '\u200F', // ltr, rtl
      '\u202A', '\u202B', '\u202C', '\u202D', '\u202E', // Directional change
      '\u0000', '\u0001', '\u0002', '\u0003', '\u0004', '\u0005', '\u0006', '\u0007', '\u0008',
      '\u0009', '\u000B', '\u000C', '\u000E', '\u000F', '\u0010', '\u0011', '\u0012', '\u0013',
      '\u0014', '\u0015', '\u0016', '\u0017', '\u0018', '\u0019', '\u001A', '\u001B', '\u001C',
      '\u001D', '\u001E', '\u001F', '\u007F',
      '\u2065',
    )

  /** Character class matching any single one of `unicodeCharsToStrip`. */
  val UnicodeCharsToStripRegex: Regex = unicodeCharsToStrip.mkString("[", "", "]").r

  /**
   * Strips out control characters and other non-textual unicode chars that can break xml and/or
   * json rendering, or be used for exploits.
   */
  def stripControlCharacters(text: String): String =
    UnicodeCharsToStripRegex.replaceAllIn(text, "")

  val Tweetypie674UnicodeSequence: String =
    "\u0633\u0645\u064e\u0640\u064e\u0651\u0648\u064f\u0648\u064f\u062d\u062e " +
      "\u0337\u0334\u0310\u062e \u0337\u0334\u0310\u062e \u0337\u0334\u0310\u062e " +
      "\u0627\u0645\u0627\u0631\u062a\u064a\u062e \u0337\u0334\u0310\u062e"

  val Tweetypie674UnicodeRegex: Regex = Tweetypie674UnicodeSequence.r

  /**
   * Replaces each occurrence of `Tweetypie674UnicodeSequence` in the text with a
   * single REPLACEMENT CHARACTER (U+FFFD).
   *
   * Apple has a bug in its CoreText library. This aims to prevent
   * ios clients from being crashed when a tweet contains the specific
   * unicode sequence.
   */
  def avoidCoreTextBug(text: String): String =
    Tweetypie674UnicodeRegex.replaceAllIn(text, "\ufffd")

  /**
   * Replaces each occurrence of `Tweetypie674UnicodeSequence` in the text with a
   * single REPLACEMENT CHARACTER (U+FFFD), and returns a TextModification object
   * that provides information to also update entity indices.
   */
  def replaceCoreTextBugModification(text: String): Option[TextModification] =
    replaceAll(text, Tweetypie674UnicodeRegex, "\ufffd")

  // The write-path pipeline, applied in this order: NFC normalization, control
  // character stripping, trimming of leading/trailing blanks, newline
  // normalization, blank line collapsing, CoreText bug avoidance.
  private val preprocessor: String => String =
    ((s: String) => nfcNormalize(s))
      .andThen(stripControlCharacters _)
      .andThen(trimBlankCharacters _)
      .andThen(normalizeNewlines _)
      .andThen(collapseBlankLines _)
      .andThen(avoidCoreTextBug _)

  /**
   * Performs the text modifications that are necessary in the write-path before extracting URLs.
   */
  def preprocessText(text: String): String =
    preprocessor(text)

  /**
   * Replaces all `<`, `>`, and '&' chars with "&lt;", "&gt;", and "&amp;", respectively.
   *
   * The original purpose of this was presumably to prevent script injections when
   * displaying tweets without proper escaping. Currently, tweets are encoded before
   * they are stored in the database.
   *
   * Note that the pre-escaping of & < and > also happens in the rich text editor in javascript
   */
  def partialHtmlEncode(text: String): String =
    PartialHtmlEncoding.encode(text)

  /**
   * The opposite of partialHtmlEncode, it replaces all "&lt;", "&gt;", and "&amp;" with
   * `<`, `>`, and '&', respectively.
   */
  def partialHtmlDecode(text: String): String =
    PartialHtmlEncoding.decode(text)

  /**
   * Matches a (possibly empty) run of characters that always or often are rendered
   * as blank space. We use this to prevent users from inserting excess blank lines
   * and from tweeting effectively blank tweets.
   *
   * Note that these are not all semantically "whitespace", so this regex should not be used
   * to process non-blank text, e.g. to separate words.
   *
   * Codepoints below and the `\p{Z}` regex character property alias are defined in the Unicode
   * Character Database (UCD) at https://unicode.org/ucd/ and https://unicode.org/reports/tr44/
   *
   * The `\p{Z}` regex character property alias is defined specifically in UCD as:
   *
   * Zs | Space_Separator     | a space character (of various non-zero widths)
   * Zl | Line_Separator      | U+2028 LINE SEPARATOR only
   * Zp | Paragraph_Separator | U+2029 PARAGRAPH SEPARATOR only
   * Z  | Separator           | Zs | Zl | Zp
   * ref: https://unicode.org/reports/tr44/#GC_Values_Table
   *
   * The following characters are considered blank:
   *
   * U+0009 Horizontal Tab (included in \s)
   * U+000B Vertical Tab (included in \s)
   * U+000C Form feed (included in \s)
   * U+000D Carriage return (included in \s)
   * U+0020 space (included in \s)
   * U+0085 Next line (included in \u0085)
   * U+061C arabic letter mark (included in \u061C)
   * U+00A0 no-break space (included in \p{Z})
   * U+00AD soft-hyphen marker (included in \u00AD)
   * U+1680 ogham space mark (included in \p{Z})
   * U+180E mongolian vowel separator (included in \p{Z} on jdk8 and included in \u180E on jdk11)
   * U+2000 en quad (included in \p{Z})
   * U+2001 em quad (included in \p{Z})
   * U+2002 en space (included in \p{Z})
   * U+2003 em space (included in \p{Z})
   * U+2004 three-per-em space (included in \p{Z})
   * U+2005 four-per-em space (included in \p{Z})
   * U+2006 six-per-em space (included in \p{Z})
   * U+2007 figure space (included in \p{Z})
   * U+2008 punctuation space (included in \p{Z})
   * U+2009 thin space (included in \p{Z})
   * U+200A hair space (included in \p{Z})
   * U+200B zero-width (included in \u200B-\u200D)
   * U+200C zero-width non-joiner (included in \u200B-\u200D)
   * U+200D zero-width joiner (included in \u200B-\u200D)
   * U+2028 line separator (included in \p{Z})
   * U+2029 paragraph separator (included in \p{Z})
   * U+202F narrow no-break space (included in \p{Z})
   * U+205F medium mathematical space (included in \p{Z})
   * U+2061 function application (included in \u2061-\u2064)
   * U+2062 invisible times (included in \u2061-\u2064)
   * U+2063 invisible separator (included in \u2061-\u2064)
   * U+2064 invisible plus (included in \u2061-\u2064)
   * U+2066 left-to-right isolate (included in \u2066-\u2069)
   * U+2067 right-to-left isolate (included in \u2066-\u2069)
   * U+2068 first strong isolate (included in \u2066-\u2069)
   * U+2069 pop directional isolate (included in \u2066-\u2069)
   * U+206A inhibit symmetric swapping (included in \u206A-\u206F)
   * U+206B activate symmetric swapping (included in \u206A-\u206F)
   * U+206C inhibit arabic form shaping (included in \u206A-\u206F)
   * U+206D activate arabic form shaping (included in \u206A-\u206F)
   * U+206E national digit shapes (included in \u206A-\u206F)
   * U+206F nominal digit shapes (included in \u206A-\u206F)
   * U+2800 braille pattern blank (included in \u2800)
   * U+3164 hangul filler (see UCD Ignorable_Code_Point)
   * U+FFA0 halfwidth hangul filler (see UCD Ignorable_Code_Point)
   * U+3000 ideographic space (included in \p{Z})
   * U+FEFF zero-width no-break space (explicitly included in \uFEFF)
   */
  val BlankTextRegex: Regex =
    """[\s\p{Z}\u180E\u0085\u00AD\u061C\u200B-\u200D\u2061-\u2064\u2066-\u2069\u206A-\u206F\u2800\u3164\uFEFF\uFFA0]*""".r

  /**
   * Some of the above blank characters are valid at the start of a Tweet (and irrelevant at the end)
   * such as characters that change the direction of text. When trimming from the start
   * or end of text we use a smaller set of characters
   */
  val BlankWhenLeadingOrTrailingRegex: Regex = """[\s\p{Z}\u180E\u0085\u200B\uFEFF]*""".r

  /**
   * Matches consecutive blanks, starting at a newline.
   */
  val ConsecutiveBlankLinesRegex: Regex = ("""\n(""" + BlankTextRegex + """\n){2,}""").r

  val LeadingBlankCharactersRegex: Regex = ("^" + BlankWhenLeadingOrTrailingRegex).r
  val TrailingBlankCharactersRegex: Regex = (BlankWhenLeadingOrTrailingRegex + "$").r

  /**
   * Is the given text empty or contains nothing but whitespace?
   */
  def isBlank(text: String): Boolean =
    BlankTextRegex.pattern.matcher(text).matches()

  /**
   * See http://confluence.local.twitter.com/display/PROD/Displaying+line+breaks+in+Tweets
   *
   * Collapses consecutive blanks lines down to a single blank line. We can assume that
   * all newlines have already been normalized to just \n, so we don't have to worry about
   * \r\n.
   */
  def collapseBlankLinesModification(text: String): Option[TextModification] =
    replaceAll(text, ConsecutiveBlankLinesRegex, "\n\n")

  /** Same collapsing as `collapseBlankLinesModification`, returning only the resulting text. */
  def collapseBlankLines(text: String): String =
    ConsecutiveBlankLinesRegex.replaceAllIn(text, "\n\n")

  /**
   * Removes the blank characters matched by `BlankWhenLeadingOrTrailingRegex` from the
   * start and from the end of the text.
   */
  def trimBlankCharacters(text: String): String =
    TrailingBlankCharactersRegex.replaceFirstIn(
      LeadingBlankCharactersRegex.replaceFirstIn(text, ""),
      ""
    )

  /** Characters that are not visible on their own. Some of these are used in combination with
   * other visible characters, and therefore cannot be always stripped from tweets.
   */
  private[tweettext] val InvisibleCharacters: Seq[Char] =
    Seq(
      '\u2060', '\u2061', '\u2062', '\u2063', '\u2064', '\u206A', '\u206B', '\u206C', '\u206D',
      '\u206E', '\u206F', '\u200C',
      '\u200D', // non-printing chars with valid use in Arabic
      '\u2009', '\u200A', '\u200B', // include very skinny spaces too
      '\ufe00', '\ufe01', '\ufe02', '\ufe03', '\ufe04', '\ufe05', '\ufe06', '\ufe07', '\ufe08',
      '\ufe09', '\ufe0A', '\ufe0B', '\ufe0C', '\ufe0D', '\ufe0E', '\ufe0F',
    )

  /** Matches text that is non-empty and made up solely of `InvisibleCharacters`. */
  private[tweetypie] val InvisibleUnicodePattern: Regex =
    ("^[" + InvisibleCharacters.mkString + "]+$").r

  /** Is the given character one of `InvisibleCharacters`? */
  def isInvisibleChar(input: Char): Boolean =
    InvisibleCharacters.contains(input)

  /** If string is only "invisible characters", replace full string with whitespace.
   * The purpose of this method is to remove invisible characters when ONLY invisible characters
   * appear between two urls, which can be a security vulnerability due to misleading behavior. These
   * characters cannot be removed as a rule applied to the tweet, because they are used in
   * conjunction with other characters.
   */
  def replaceInvisiblesWithWhitespace(text: String): String =
    text match {
      // One space per code point of the invisible text.
      case invisible @ InvisibleUnicodePattern() => " " * TweetText.codePointLength(invisible)
      case other => other
    }
}

View File

@ -1,24 +0,0 @@
package com.twitter.tweetypie.tweettext
/**
 * A type class for entities found within a piece of tweet text: it exposes the
 * entity's index pair and can produce a copy of the entity at another index pair.
 */
trait TextEntity[T] {

  /** The `from` index of the entity. */
  def fromIndex(entity: T): Short

  /** The `to` index of the entity. */
  def toIndex(entity: T): Short

  /** The entity repositioned to the given index pair. */
  def move(entity: T, fromIndex: Short, toIndex: Short): T
}

/**
 * Function-style access to the [[TextEntity]] operations, resolving the instance
 * implicitly.
 */
object TextEntity {

  /** The `from` index of `entity`, as reported by its [[TextEntity]] instance. */
  def fromIndex[T: TextEntity](entity: T): Short = {
    val instance = implicitly[TextEntity[T]]
    instance.fromIndex(entity)
  }

  /** The `to` index of `entity`, as reported by its [[TextEntity]] instance. */
  def toIndex[T: TextEntity](entity: T): Short = {
    val instance = implicitly[TextEntity[T]]
    instance.toIndex(entity)
  }

  /** `entity` repositioned to the given index pair by its [[TextEntity]] instance. */
  def move[T: TextEntity](entity: T, fromIndex: Short, toIndex: Short): T = {
    val instance = implicitly[TextEntity[T]]
    instance.move(entity, fromIndex, toIndex)
  }

  /**
   * `entity` with both of its indices moved by `offset`. The sums are computed as
   * `Int`s and converted back with `toShort`.
   */
  def shift[T: TextEntity](entity: T, offset: Short): T = {
    val instance = implicitly[TextEntity[T]]
    val shiftedFrom = (instance.fromIndex(entity) + offset).toShort
    val shiftedTo = (instance.toIndex(entity) + offset).toShort
    instance.move(entity, shiftedFrom, shiftedTo)
  }
}

View File

@ -1,232 +0,0 @@
package com.twitter.tweetypie.tweettext
import scala.util.matching.Regex
object TextModification {

  /**
   * Lifts `text` into a `TextModification` in which `original` and `updated` are the same
   * string and `replacements` is empty.
   */
  def identity(text: String): TextModification =
    TextModification(original = text, updated = text, replacements = Nil)

  /**
   * Replaces every substring of `text` that matches `regex` with `substitution`.
   *
   * The offsets recorded in the result are code-point offsets rather than UTF-16 code-unit
   * offsets, so they can be used to update entity indices even when the text contains
   * surrogate pairs.
   *
   * @return `None` when the regex does not match anywhere in `text`, otherwise the
   *         modification describing the updated text and each replaced segment.
   */
  def replaceAll(text: String, regex: Regex, substitution: String): Option[TextModification] =
    replaceAll(text, regex -> substitution)

  /**
   * Replaces substrings that match any of the given `Regex`es with the corresponding
   * substitution string. Returns a `TextModification` that can be used to reindex entities.
   *
   * Matches from all regexes are applied in order of their start offset. A match that
   * begins inside text already consumed by an earlier match (which can only happen when
   * different regexes match overlapping regions) is skipped.
   *
   * @param text the text to rewrite
   * @param regexAndSubstitutions pairs of a pattern and the literal text that replaces
   *                              each of its matches
   * @return `None` when no regex matches, otherwise the resulting modification
   */
  def replaceAll(
    text: String,
    regexAndSubstitutions: (Regex, String)*
  ): Option[TextModification] = {
    val matches =
      (for {
        (r, s) <- regexAndSubstitutions
        m <- r.findAllIn(text).matchData
      } yield (m, s)).sortBy { case (m, _) => m.start }

    if (matches.isEmpty) {
      // no match found, return None to indicate no modifications made
      None
    } else {
      val replacements = List.newBuilder[TextReplacement]
      val indexConverter = new IndexConverter(text)
      // contains the retained text, built up as we walk through the regex matches
      val buf = new StringBuilder(text.length)
      // the number of code-points copied into buf
      var codePointsCopied = Offset.CodePoint(0)
      // always holds the start code-unit offset to copy to buf when we encounter
      // either a regex match or end-of-string.
      var anchor = 0

      import indexConverter.toCodePoints

      for ((m, sub) <- matches) {
        // A match starting before `anchor` overlaps a region that has already been
        // replaced. Without this guard `substring(anchor, m.start)` would be called with
        // begin > end and throw StringIndexOutOfBoundsException.
        if (m.start >= anchor) {
          val unchangedText = text.substring(anchor, m.start)
          val unchangedLen = Offset.CodePoint.length(unchangedText)
          val subLen = Offset.CodePoint.length(sub)

          // copies the text up to the regex match, plus the replacement string
          buf.append(unchangedText).append(sub)
          codePointsCopied += unchangedLen + subLen

          // the offsets indicate the indices of the matched string in the original
          // text, and the indices of the replacement string in the updated string
          replacements +=
            TextReplacement(
              originalFrom = toCodePoints(Offset.CodeUnit(m.start)),
              originalTo = toCodePoints(Offset.CodeUnit(m.end)),
              updatedFrom = codePointsCopied - subLen,
              updatedTo = codePointsCopied
            )

          anchor = m.end
        }
      }

      // copy whatever follows the last applied match
      buf.append(text.substring(anchor))
      Some(TextModification(text, buf.toString, replacements.result()))
    }
  }

  /**
   * Inserts a string at a specified code point offset.
   * Returns a `TextModification` that can be used to reindex entities.
   *
   * @param originalText the text to insert into
   * @param insertAt the code-point offset in `originalText` at which to insert
   * @param textToInsert the text to insert
   * @return a modification holding a single replacement whose original segment is empty
   *         (`originalFrom == originalTo == insertAt`) and whose updated segment covers
   *         the inserted text
   */
  def insertAt(
    originalText: String,
    insertAt: Offset.CodePoint,
    textToInsert: String
  ): TextModification = {
    // String.splitAt works on code units, so convert the code-point offset first.
    val insertAtCodeUnit = insertAt.toCodeUnit(originalText).toInt
    val (before, after) = originalText.splitAt(insertAtCodeUnit)
    val updatedText = s"$before$textToInsert$after"
    val textToInsertLength = TweetText.codePointLength(textToInsert)

    TextModification(
      original = originalText,
      updated = updatedText,
      replacements = List(
        TextReplacement.fromCodePoints(
          originalFrom = insertAt.toInt,
          originalTo = insertAt.toInt,
          updatedFrom = insertAt.toInt,
          updatedTo = insertAt.toInt + textToInsertLength
        ))
    )
  }
}
/**
 * Records the insertions, deletions and replacements applied to a string: the original
 * string, the updated string, and the list of `TextReplacement`s giving the indices of
 * every changed segment. With this information an offset into the original string can be
 * mapped to an offset into the updated string, provided the offset does not fall inside
 * one of the modified segments.
 *
 * All offsets are code-points, not UTF-16 code-units.
 */
case class TextModification(
  original: String,
  updated: String,
  replacements: List[TextReplacement]) {

  // Length of `original` in code points; used as the upper bound for reindexing.
  private val originalLen = Offset.CodePoint.length(original)

  /**
   * Maps an offset into the original string to the equivalent offset into the updated
   * string. Returns None when the offset lies inside a segment that was removed or
   * replaced.
   */
  def reindex(index: Offset.CodePoint): Option[Offset.CodePoint] =
    reindex(index, Offset.CodePoint(0), replacements)

  /**
   * Reindexes an entity of type T. Yields the moved entity, or None when either its
   * `fromIndex` or its `toIndex` cannot be mapped into the updated text.
   */
  def reindexEntity[T: TextEntity](e: T): Option[T] =
    reindex(Offset.CodePoint(TextEntity.fromIndex(e))).flatMap { newFrom =>
      // The end boundary is mapped through the position just before it (toIndex - 1),
      // and 1 is added back once that position has been reindexed.
      reindex(Offset.CodePoint(TextEntity.toIndex(e) - 1)).map { newLast =>
        TextEntity.move(e, newFrom.toShort, (newLast.toShort + 1).toShort)
      }
    }

  /**
   * Reindexes a sequence of entities of type T. Entities for which `reindexEntity`
   * yields None are dropped from the result.
   */
  def reindexEntities[T: TextEntity](es: Seq[T]): Seq[T] =
    es.flatMap(e => reindexEntity(e))

  /**
   * Swaps `original` and `updated` text and inverts every `TextReplacement`.
   */
  def inverse: TextModification = {
    val invertedReplacements = replacements.map(_.inverse)
    TextModification(updated, original, invertedReplacements)
  }

  // Walks the replacement list recursively. `shift` accumulates the length delta of every
  // replacement that ends at or before `index`; it is added to `index` once the walk
  // reaches a replacement that starts after `index`, or the end of the list.
  private def reindex(
    index: Offset.CodePoint,
    shift: Offset.CodePoint,
    reps: List[TextReplacement]
  ): Option[Offset.CodePoint] =
    reps match {
      case current :: remaining =>
        if (index < current.originalFrom) {
          // index precedes this replacement: only the shift gathered so far applies
          Some(index + shift)
        } else if (index < current.originalTo) {
          // index falls inside the replaced segment
          None
        } else {
          // index is past this replacement: account for its length change and continue
          reindex(index, shift + current.lengthDelta, remaining)
        }

      // no replacements left: the index must still lie within the original text
      case Nil if index.toInt >= 0 && index <= originalLen =>
        Some(index + shift)

      case Nil =>
        None
    }
}
object TextReplacement {

  /**
   * Builds a `TextReplacement` from plain `Int` code-point offsets by wrapping each of
   * them in an `Offset.CodePoint`.
   */
  def fromCodePoints(
    originalFrom: Int,
    originalTo: Int,
    updatedFrom: Int,
    updatedTo: Int
  ): TextReplacement = {
    val origStart = Offset.CodePoint(originalFrom)
    val origEnd = Offset.CodePoint(originalTo)
    val newStart = Offset.CodePoint(updatedFrom)
    val newEnd = Offset.CodePoint(updatedTo)
    TextReplacement(origStart, origEnd, newStart, newEnd)
  }
}
/**
 * Pairs a segment of text in one string with the segment that replaces it in an updated
 * version of that string. An empty replacement segment (`updatedTo == updatedFrom`) means
 * the original segment was removed.
 *
 * All offsets are code-points, not UTF16 code-units.
 *
 * `originalFrom` and `updatedFrom` are inclusive.
 * `originalTo` and `updatedTo` are exclusive.
 */
case class TextReplacement(
  originalFrom: Offset.CodePoint,
  originalTo: Offset.CodePoint,
  updatedFrom: Offset.CodePoint,
  updatedTo: Offset.CodePoint) {

  /** Length of the segment in the original text. */
  def originalLength: Offset.CodePoint = originalTo - originalFrom

  /** Length of the segment in the updated text. */
  def updatedLength: Offset.CodePoint = updatedTo - updatedFrom

  /** Change in length caused by this replacement: updated length minus original length. */
  def lengthDelta: Offset.CodePoint = updatedLength - originalLength

  /** Moves only the original segment by `offset`. */
  def shiftOriginal(offset: Offset.CodePoint): TextReplacement =
    TextReplacement(originalFrom + offset, originalTo + offset, updatedFrom, updatedTo)

  /** Moves only the updated segment by `offset`. */
  def shiftUpdated(offset: Offset.CodePoint): TextReplacement =
    TextReplacement(originalFrom, originalTo, updatedFrom + offset, updatedTo + offset)

  /** Moves both the original and the updated segment by `offset`. */
  def shift(offset: Offset.CodePoint): TextReplacement =
    shiftOriginal(offset).shiftUpdated(offset)

  /** Exchanges the roles of the original and the updated segment. */
  def inverse: TextReplacement =
    TextReplacement(updatedFrom, updatedTo, originalFrom, originalTo)
}

View File

@ -1,159 +0,0 @@
package com.twitter.tweetypie.tweettext
import com.twitter.tweetypie.tweettext.TweetText._
import com.twitter.twittertext.Extractor
import java.lang.Character
import scala.annotation.tailrec
import scala.collection.JavaConverters._
object Truncator {
// Unicode horizontal ellipsis (U+2026); the suffix used by truncateForRetweet.
val Ellipsis = "\u2026"
/**
* Truncate tweet text for a retweet. If the text is longer than
* either of the length limits, code points are cut off from the end
* of the text and replaced with an ellipsis ([[Ellipsis]]). We keep as much of the
* leading text as possible, subject to these constraints:
*
* - There are no more than `OriginalMaxDisplayLength` display units.
*
* - When converted to UTF-8, the result does not exceed `OriginalMaxUtf8Length` bytes.
*
* - We do not break within a single grapheme cluster.
*
* The input is assumed to be partial HTML-encoded and may or may
* not be NFC normalized. The result will be partial HTML-encoded
* and will be NFC normalized.
*/
def truncateForRetweet(input: String): String = truncateWithEllipsis(input, Ellipsis)
/**
* Truncate to [[com.twitter.tweetypie.tweettext.TweetText#OriginalMaxDisplayLength]] display
* units, using "..." as an ellipsis. The resulting text is guaranteed to pass our tweet length
* check, but it is not guaranteed to fit in a SMS message.
*/
def truncateForSms(input: String): String = truncateWithEllipsis(input, "...")
/**
* Check the length of the given text, and truncate it if it is longer
* than the allowed length for a Tweet. The result of this method will
* always have:
*
* - Display length <= OriginalMaxDisplayLength.
* - Length when encoded as UTF-8 <= OriginalMaxUtf8Length.
*
* If the input would violate this, then the text will be
* truncated. When the text is truncated, it will be truncated such
* that:
*
* - Grapheme clusters will not be split.
* - The last character before the ellipsis will not be a whitespace
* character.
* - The ellipsis text will be appended to the end.
*
* @param input the text to check; it is NFC normalized before measuring
* @param ellipsis the suffix appended when the text is truncated
* @return the normalized text, truncated and suffixed with `ellipsis` if it did not fit
*/
private[this] def truncateWithEllipsis(input: String, ellipsis: String): String = {
val text = nfcNormalize(input)
val truncateAt =
truncationPoint(text, OriginalMaxDisplayLength, OriginalMaxUtf8Length, Some(ellipsis))
// A truncation point at the very end of the text means nothing has to be cut off,
// so the normalized text is returned without an ellipsis.
if (truncateAt.codeUnitOffset.toInt == text.length) text
else text.take(truncateAt.codeUnitOffset.toInt) + ellipsis
}
/**
* Indicates a potential TruncationPoint in piece of text.
*
* @param codeUnitOffset the utf-16 code-unit offset of the truncation point
* @param codePointOffset the offset in code points
*/
case class TruncationPoint(codeUnitOffset: Offset.CodeUnit, codePointOffset: Offset.CodePoint)
/**
* Computes a TruncationPoint for the given text and length constraints. The result holds
* the code-unit and code-point offsets at which to cut the text. When the text fits within
* the given constraints, the returned offsets are those of the last break point reached,
* which callers such as `truncateWithEllipsis` compare against `text.length` to detect
* that no truncation is needed.
*
* Text should be NFC normalized first for best results.
*
* @param text the text to measure
* @param maxDisplayLength the maximum number of display units the full text may have
* @param maxByteLength the maximum number of UTF-8 bytes the full text may have
*
* @param withEllipsis if defined, then the truncation point will be computed so that there is
* space to append the given ellipsis string and to still remain within the limits.
*
* @param atomicUnits may contain a list of ranges that should be treated as atomic unit and
* not split. each tuple is half-open range in code points.
*/
def truncationPoint(
text: String,
maxDisplayLength: Int = OriginalMaxDisplayLength,
maxByteLength: Int = OriginalMaxUtf8Length,
withEllipsis: Option[String] = None,
atomicUnits: Offset.Ranges[Offset.CodePoint] = Offset.Ranges.Empty
): TruncationPoint = {
// Candidate positions are the offsets produced by GraphemeIndexIterator.ends, minus any
// that Offset.Ranges.htmlEntities(text) contains (presumably so that an HTML entity is
// never cut in half - confirm against Offset.Ranges).
val breakPoints =
GraphemeIndexIterator
.ends(text)
.filterNot(Offset.Ranges.htmlEntities(text).contains)
// Limits that apply to a truncated result: the full limits minus the room the
// ellipsis needs (zero when no ellipsis is requested).
val ellipsisDisplayUnits =
withEllipsis.map(Offset.DisplayUnit.length).getOrElse(Offset.DisplayUnit(0))
val maxTruncatedDisplayLength = Offset.DisplayUnit(maxDisplayLength) - ellipsisDisplayUnits
val ellipsisByteLength = withEllipsis.map(Offset.Utf8.length).getOrElse(Offset.Utf8(0))
val maxTruncatedByteLength = Offset.Utf8(maxByteLength) - ellipsisByteLength
// Current scan position, tracked in all four units.
var codeUnit = Offset.CodeUnit(0)
var codePoint = Offset.CodePoint(0)
var displayLength = Offset.DisplayUnit(0)
var byteLength = Offset.Utf8(0)
// Most recent position that was accepted as a valid place to truncate.
var truncateCodeUnit = codeUnit
var truncateCodePoint = codePoint
@tailrec def go(): TruncationPoint =
if (displayLength.toInt > maxDisplayLength || byteLength.toInt > maxByteLength) {
// the text scanned so far already exceeds a full limit: fall back to the last
// accepted truncation point
TruncationPoint(truncateCodeUnit, truncateCodePoint)
} else if (codeUnit != truncateCodeUnit &&
displayLength <= maxTruncatedDisplayLength &&
byteLength <= maxTruncatedByteLength &&
(codeUnit.toInt == 0 || !Character.isWhitespace(text.codePointBefore(codeUnit.toInt))) &&
!atomicUnits.contains(codePoint)) {
// we can advance the truncation point: the current position is new, leaves room for
// the ellipsis, is not preceded by whitespace and is not inside an atomic unit
truncateCodeUnit = codeUnit
truncateCodePoint = codePoint
go()
} else if (breakPoints.hasNext) {
// there are further truncation points to consider; move the scan position to the
// next break point and add the length of the skipped span in every unit
val nextCodeUnit = breakPoints.next
codePoint += Offset.CodePoint.count(text, codeUnit, nextCodeUnit)
displayLength += Offset.DisplayUnit.count(text, codeUnit, nextCodeUnit)
byteLength += Offset.Utf8.count(text, codeUnit, nextCodeUnit)
codeUnit = nextCodeUnit
go()
} else {
// break points are exhausted without exceeding a limit: report the current position
TruncationPoint(codeUnit, codePoint)
}
go()
}
/**
* Truncate the given text, avoiding chopping HTML entities and tweet
* entities. This should only be used for testing because it performs
* entity extraction, and so is very inefficient.
*
* @param input the text to truncate; it is NFC normalized first
* @param maxDisplayLength the maximum number of display units to keep
* @param maxByteLength the maximum number of UTF-8 bytes to keep
* @return the leading part of the normalized text up to the computed truncation point,
* with no ellipsis appended
*/
def truncateForTests(
input: String,
maxDisplayLength: Int = OriginalMaxDisplayLength,
maxByteLength: Int = OriginalMaxUtf8Length
): String = {
val text = nfcNormalize(input)
val extractor = new Extractor
val entities = extractor.extractEntitiesWithIndices(text)
// NOTE(review): the return value is unused, so this presumably rewrites the indices of
// `entities` in place from UTF-16 to code-point offsets - confirm against twitter-text.
extractor.modifyIndicesFromUTF16ToUnicode(text, entities)
// Every extracted entity becomes a range that truncationPoint must not split.
val avoid = Offset.Ranges.fromCodePointPairs(
entities.asScala.map(e => (e.getStart().intValue, e.getEnd().intValue))
)
val truncateAt = truncationPoint(text, maxDisplayLength, maxByteLength, None, avoid)
text.take(truncateAt.codeUnitOffset.toInt)
}
}

View File

@ -1,62 +0,0 @@
package com.twitter.tweetypie.tweettext
import java.text.Normalizer
object TweetText {

  /** Original maximum tweet length, measured after normalization. */
  private[tweetypie] val OriginalMaxDisplayLength: Int = 140

  /**
   * Maximum number of visible code points allowed in a tweet when its length is counted
   * by code points, measured after normalization.
   * See also [[MaxVisibleWeightedEmojiLength]].
   */
  private[tweetypie] val MaxVisibleWeightedLength: Int = 280

  /**
   * Maximum number of visible code points allowed in a tweet when its length is counted
   * by emoji, measured after normalization. See also [[MaxVisibleWeightedLength]].
   *
   *  - 140 is the max number of Emojis, visible, fully-weighted per Twitter's cramming rules
   *  - 10 is the max number of Code Points per Emoji
   */
  private[tweetypie] val MaxVisibleWeightedEmojiLength: Int = 140 * 10

  /**
   * Maximum number of bytes when truncating tweet text for a retweet. Originally this was
   * the max UTF-8 length when tweets were at most 140 characters.
   * See also [[OriginalMaxDisplayLength]].
   */
  private[tweetypie] val OriginalMaxUtf8Length: Int = 600

  /** Maximum number of bytes for tweet text using utf-8 encoding. */
  private[tweetypie] val MaxUtf8Length: Int = 5708

  /** Maximum number of mentions allowed in tweet text. Enforced at tweet creation time. */
  private[tweetypie] val MaxMentions: Int = 50

  /** Maximum number of urls allowed in tweet text. Enforced at tweet creation time. */
  private[tweetypie] val MaxUrls: Int = 10

  /** Maximum number of hashtags allowed in tweet text. Enforced at tweet creation time. */
  private[tweetypie] val MaxHashtags: Int = 50

  /** Maximum number of cashtags allowed in tweet text. Enforced at tweet creation time. */
  private[tweetypie] val MaxCashtags: Int = 50

  /** Maximum length of a hashtag (not including the '#'). */
  private[tweetypie] val MaxHashtagLength: Int = 100

  /**
   * Returns `text` normalized to Unicode Normalization Form C (NFC).
   */
  def nfcNormalize(text: String): String = {
    val targetForm = Normalizer.Form.NFC
    Normalizer.normalize(text, targetForm)
  }

  /**
   * Returns the number of "characters" in this text, as measured by
   * [[Offset.DisplayUnit]].
   */
  def displayLength(text: String): Int = {
    val displayUnits = Offset.DisplayUnit.length(text)
    displayUnits.toInt
  }

  /**
   * Returns the number of Unicode code points in this String.
   */
  def codePointLength(text: String): Int = {
    val codePoints = Offset.CodePoint.length(text)
    codePoints.toInt
  }
}

View File

@ -1,76 +0,0 @@
# Library target covering every *.scala source in this directory, published as the
# artifact com.twitter.tweetypie:util. No explicit `name` is given.
# NOTE(review): the target name presumably defaults to the directory name - confirm
# against the build tool's conventions.
scala_library(
sources = ["*.scala"],
# Compiler warnings fail the build for this target.
compiler_option_sets = ["fatal_warnings"],
platform = "java8",
provides = scala_artifact(
org = "com.twitter.tweetypie",
name = "util",
repo = artifactory,
),
strict_deps = True,
tags = ["bazel-compatible"],
dependencies = [
"//:scala-reflect",
"3rdparty/jvm/commons-codec",
"3rdparty/jvm/org/apache/thrift:libthrift",
"finagle/finagle-core/src/main",
"mediaservices/commons/src/main/thrift:thrift-scala",
"scrooge/scrooge-serializer/src/main/scala",
"tweetypie/servo/repo",
"tweetypie/servo/util",
"tweetypie/servo/util/src/main/scala:exception",
"src/scala/com/twitter/takedown/util",
"src/thrift/com/twitter/dataproducts:enrichments_profilegeo-scala",
"src/thrift/com/twitter/escherbird:media-annotation-structs-scala",
"src/thrift/com/twitter/expandodo:cards-scala",
"src/thrift/com/twitter/gizmoduck:thrift-scala",
"src/thrift/com/twitter/servo:servo-exception-scala",
"src/thrift/com/twitter/spam/rtf:safety-label-scala",
"tweetypie/common/src/thrift/com/twitter/tweetypie:deprecated-scala",
"tweetypie/common/src/thrift/com/twitter/tweetypie:service-scala",
"tweetypie/common/src/thrift/com/twitter/tweetypie:transient_context-scala",
"tweetypie/common/src/thrift/com/twitter/tweetypie:tweet-scala",
"stitch/stitch-core",
"tweet-util",
"util/util-core:scala",
],
)
# Narrower library target that compiles only EditControlUtil.scala and package.scala,
# published as the artifact com.twitter.tweetypie:util-EditControlUtil. Its dependency
# list is the same as the directory-wide library's, except that it omits
# tweetypie/servo/repo, tweetypie/servo/util and src/scala/com/twitter/takedown/util.
scala_library(
name = "EditControlUtil",
sources = [
"EditControlUtil.scala",
"package.scala",
],
# Compiler warnings fail the build for this target.
compiler_option_sets = ["fatal_warnings"],
platform = "java8",
provides = scala_artifact(
org = "com.twitter.tweetypie",
name = "util-EditControlUtil",
repo = artifactory,
),
strict_deps = True,
tags = ["bazel-compatible"],
dependencies = [
"//:scala-reflect",
"3rdparty/jvm/commons-codec",
"3rdparty/jvm/org/apache/thrift:libthrift",
"finagle/finagle-core/src/main",
"mediaservices/commons/src/main/thrift:thrift-scala",
"scrooge/scrooge-serializer/src/main/scala",
"tweetypie/servo/util/src/main/scala:exception",
"src/thrift/com/twitter/dataproducts:enrichments_profilegeo-scala",
"src/thrift/com/twitter/escherbird:media-annotation-structs-scala",
"src/thrift/com/twitter/expandodo:cards-scala",
"src/thrift/com/twitter/gizmoduck:thrift-scala",
"src/thrift/com/twitter/servo:servo-exception-scala",
"src/thrift/com/twitter/spam/rtf:safety-label-scala",
"tweetypie/common/src/thrift/com/twitter/tweetypie:deprecated-scala",
"tweetypie/common/src/thrift/com/twitter/tweetypie:service-scala",
"tweetypie/common/src/thrift/com/twitter/tweetypie:transient_context-scala",
"tweetypie/common/src/thrift/com/twitter/tweetypie:tweet-scala",
"stitch/stitch-core",
"tweet-util",
"util/util-core:scala",
],
)

Some files were not shown because too many files have changed in this diff Show More