72 lines
2.4 KiB
Scala
72 lines
2.4 KiB
Scala
package com.twitter.tweetypie.storage
|
|
|
|
import com.twitter.finagle.stats.StatsReceiver
|
|
import com.twitter.stitch.Stitch
|
|
import com.twitter.storage.client.manhattan.kv.ManhattanValue
|
|
import com.twitter.tweetypie.storage.TweetUtils._
|
|
import com.twitter.util.Time
|
|
|
|
/**
|
|
* Deletes data for the scrubbed field and writes a metadata record.
|
|
* Provides scrub functionality. Right now, we only allow the scrubbing of the geo field.
|
|
* It should be simple to add more fields to the allowlist if needed.
|
|
*/
|
|
object ScrubHandler {
|
|
|
|
val scrubFieldsAllowlist: Set[Field] = Set(Field.Geo)
|
|
|
|
def apply(
|
|
insert: ManhattanOperations.Insert,
|
|
delete: ManhattanOperations.Delete,
|
|
scribe: Scribe,
|
|
stats: StatsReceiver
|
|
): TweetStorageClient.Scrub =
|
|
(unfilteredTweetIds: Seq[TweetId], columns: Seq[Field]) => {
|
|
val tweetIds = unfilteredTweetIds.filter(_ > 0)
|
|
|
|
require(columns.nonEmpty, "Must specify fields to scrub")
|
|
require(
|
|
columns.toSet.size == columns.size,
|
|
s"Duplicate fields to scrub specified: $columns"
|
|
)
|
|
require(
|
|
columns.forall(scrubFieldsAllowlist.contains(_)),
|
|
s"Cannot scrub $columns; scrubbable fields are restricted to $scrubFieldsAllowlist"
|
|
)
|
|
|
|
Stats.addWidthStat("scrub", "ids", tweetIds.size, stats)
|
|
val mhTimestamp = Time.now
|
|
|
|
val stitches = tweetIds.map { tweetId =>
|
|
val deletionStitches = columns.map { field =>
|
|
val mhKeyToDelete = TweetKey.fieldKey(tweetId, field.id)
|
|
delete(mhKeyToDelete, Some(mhTimestamp)).liftToTry
|
|
}
|
|
|
|
val collectedStitch =
|
|
Stitch.collect(deletionStitches).map(collectWithRateLimitCheck).lowerFromTry
|
|
|
|
collectedStitch
|
|
.flatMap { _ =>
|
|
val scrubbedStitches = columns.map { column =>
|
|
val scrubbedKey = TweetKey.scrubbedFieldKey(tweetId, column.id)
|
|
val record =
|
|
TweetManhattanRecord(
|
|
scrubbedKey,
|
|
ManhattanValue(StringCodec.toByteBuffer(""), Some(mhTimestamp))
|
|
)
|
|
|
|
insert(record).liftToTry
|
|
}
|
|
|
|
Stitch.collect(scrubbedStitches)
|
|
}
|
|
.map(collectWithRateLimitCheck)
|
|
}
|
|
|
|
Stitch.collect(stitches).map(collectWithRateLimitCheck).lowerFromTry.onSuccess { _ =>
|
|
tweetIds.foreach { id => scribe.logScrubbed(id, columns.map(_.id.toInt), mhTimestamp) }
|
|
}
|
|
}
|
|
}
|