the-algorithm/tweetypie/common/src/scala/com/twitter/tweetypie/storage/ScrubHandler.scala

72 lines
2.4 KiB
Scala

package com.twitter.tweetypie.storage
import com.twitter.finagle.stats.StatsReceiver
import com.twitter.stitch.Stitch
import com.twitter.storage.client.manhattan.kv.ManhattanValue
import com.twitter.tweetypie.storage.TweetUtils._
import com.twitter.util.Time
/**
 * Scrubs individual fields from stored tweets.
 *
 * For each requested field the stored field record is deleted, and afterwards an empty
 * marker record is inserted under the field's scrubbed key. Only fields listed in
 * `scrubFieldsAllowlist` (currently just geo) may be scrubbed; supporting another field
 * should only require adding it to that set.
 */
object ScrubHandler {

  /** Fields that callers are allowed to scrub. */
  val scrubFieldsAllowlist: Set[Field] = Set(Field.Geo)

  /**
   * Builds the scrub operation.
   *
   * The returned function drops non-positive tweet ids, validates the requested fields
   * (throwing `IllegalArgumentException` via `require` on an empty list, duplicates, or a
   * field outside the allowlist), and then scrubs every remaining tweet id.
   *
   * @param insert invoked with one marker record per (tweet id, field) after that tweet's
   *               deletes have succeeded
   * @param delete invoked with each field key and the scrub timestamp
   * @param scribe receives one `logScrubbed` call per tweet id once the whole scrub succeeds
   * @param stats  receives the "scrub"/"ids" width stat for the filtered tweet ids
   */
  def apply(
    insert: ManhattanOperations.Insert,
    delete: ManhattanOperations.Delete,
    scribe: Scribe,
    stats: StatsReceiver
  ): TweetStorageClient.Scrub =
    (unfilteredTweetIds: Seq[TweetId], columns: Seq[Field]) => {
      val validIds = unfilteredTweetIds.filter(id => id > 0)
      requireScrubbable(columns)

      Stats.addWidthStat("scrub", "ids", validIds.size, stats)

      // A single timestamp is shared by every delete, marker record and scribe entry.
      val scrubTime = Time.now

      // Issues the deletes for every requested field of one tweet.
      def deleteFields(tweetId: TweetId) = {
        val attempts = columns.map { field =>
          delete(TweetKey.fieldKey(tweetId, field.id), Some(scrubTime)).liftToTry
        }
        Stitch.collect(attempts).map(collectWithRateLimitCheck).lowerFromTry
      }

      // Inserts an empty marker record for every requested field of one tweet.
      def writeMarkers(tweetId: TweetId) = {
        val attempts = columns.map { field =>
          val marker = TweetManhattanRecord(
            TweetKey.scrubbedFieldKey(tweetId, field.id),
            ManhattanValue(StringCodec.toByteBuffer(""), Some(scrubTime))
          )
          insert(marker).liftToTry
        }
        Stitch.collect(attempts)
      }

      // Markers are only written once the deletes for that tweet have succeeded.
      val perTweet = validIds.map { tweetId =>
        deleteFields(tweetId)
          .flatMap(_ => writeMarkers(tweetId))
          .map(collectWithRateLimitCheck)
      }

      val scrubAll = Stitch.collect(perTweet).map(collectWithRateLimitCheck).lowerFromTry
      scrubAll.onSuccess { _ =>
        for (id <- validIds) scribe.logScrubbed(id, columns.map(_.id.toInt), scrubTime)
      }
    }

  /** Rejects an empty, duplicated, or non-allowlisted set of fields. */
  private def requireScrubbable(columns: Seq[Field]): Unit = {
    require(columns.nonEmpty, "Must specify fields to scrub")
    require(
      columns.distinct.size == columns.size,
      s"Duplicate fields to scrub specified: $columns"
    )
    require(
      columns.forall(scrubFieldsAllowlist.contains),
      s"Cannot scrub $columns; scrubbable fields are restricted to $scrubFieldsAllowlist"
    )
  }
}