the-algorithm/tweetypie/common/src/scala/com/twitter/tweetypie/tweettext/TextModification.scala

233 lines
8.0 KiB
Scala

package com.twitter.tweetypie.tweettext
import scala.util.matching.Regex
object TextModification {

  /**
   * Lift a text into a TextModification where `original` and `updated` text are the same
   * and `replacements` is empty.
   */
  def identity(text: String): TextModification =
    TextModification(original = text, updated = text, replacements = Nil)

  /**
   * Replace each substring that matches the regex with the substitution string, returns a
   * TextModification object that contains the updated text and enough information to also
   * update entity indices.
   *
   * Surrogate pairs are taken into account: the returned TextModification object has
   * code-point offsets, instead of code-unit offsets.
   *
   * @param text the text to search
   * @param regex the pattern whose matches are replaced
   * @param substitution the string appended in place of every match
   * @return `None` if the regex does not match anywhere in `text`
   */
  def replaceAll(text: String, regex: Regex, substitution: String): Option[TextModification] =
    replaceAll(text, regex -> substitution)

  /**
   * Replaces substrings that match the given `Regex` with the corresponding substitution
   * string. Returns a `TextModification` that can be used to reindex entities.
   *
   * The matches of all regexes are collected and applied from left to right. The
   * substitution string is appended verbatim. If a match starts inside a region that an
   * earlier match has already consumed, it is skipped: the leftmost match wins, and for
   * matches starting at the same offset the pair listed first wins (the sort is stable).
   *
   * @param text the text to search
   * @param regexAndSubstitutions pairs of a pattern and the string that replaces its matches
   * @return `None` if none of the regexes match, otherwise the modification
   */
  def replaceAll(
    text: String,
    regexAndSubstitutions: (Regex, String)*
  ): Option[TextModification] = {
    val matches =
      (for {
        (r, s) <- regexAndSubstitutions
        m <- r.findAllIn(text).matchData
      } yield (m, s)).sortBy { case (m, _) => m.start }

    if (matches.isEmpty) {
      // no match found, return None to indicate no modifications made
      None
    } else {
      val replacements = List.newBuilder[TextReplacement]
      val indexConverter = new IndexConverter(text)
      // contains the retained text, built up as we walk through the regex matches
      val buf = new StringBuilder(text.length)
      // the number of code-points copied into buf
      var codePointsCopied = Offset.CodePoint(0)
      // always holds the start code-unit offset to copy to buf when we encounter
      // either a regex match or end-of-string.
      var anchor = 0

      import indexConverter.toCodePoints

      for ((m, sub) <- matches) {
        // Matches coming from different regexes may overlap. A match that begins before
        // `anchor` lies inside text that has already been replaced; processing it would
        // make `substring(anchor, m.start)` throw, so it is skipped instead.
        if (m.start >= anchor) {
          val unchangedText = text.substring(anchor, m.start)
          val unchangedLen = Offset.CodePoint.length(unchangedText)
          val subLen = Offset.CodePoint.length(sub)

          // copies the text up to the regex match run, plus the replacement string
          buf.append(unchangedText).append(sub)
          codePointsCopied += unchangedLen + subLen

          // the offsets indicate the indices of the matched string in the original
          // text, and the indices of the replacement string in the updated string
          replacements +=
            TextReplacement(
              originalFrom = toCodePoints(Offset.CodeUnit(m.start)),
              originalTo = toCodePoints(Offset.CodeUnit(m.end)),
              updatedFrom = codePointsCopied - subLen,
              updatedTo = codePointsCopied
            )

          anchor = m.end
        }
      }

      // copy whatever follows the last applied match
      buf.append(text.substring(anchor))
      Some(TextModification(text, buf.toString, replacements.result()))
    }
  }

  /**
   * Inserts a string at a specified code point offset.
   * Returns a `TextModification` that can be used to reindex entities.
   *
   * The single resulting replacement maps the empty original range
   * `[insertAt, insertAt)` to the updated range covering the inserted text.
   *
   * @param originalText the text to insert into
   * @param insertAt code-point offset into `originalText` at which the text is inserted
   * @param textToInsert the text to insert
   */
  def insertAt(
    originalText: String,
    insertAt: Offset.CodePoint,
    textToInsert: String
  ): TextModification = {
    // String.splitAt works on code units, so convert the code-point offset first
    val insertAtCodeUnit = insertAt.toCodeUnit(originalText).toInt
    val (before, after) = originalText.splitAt(insertAtCodeUnit)
    val updatedText = s"$before$textToInsert$after"
    val textToInsertLength = TweetText.codePointLength(textToInsert)

    TextModification(
      original = originalText,
      updated = updatedText,
      replacements = List(
        TextReplacement.fromCodePoints(
          originalFrom = insertAt.toInt,
          originalTo = insertAt.toInt,
          updatedFrom = insertAt.toInt,
          updatedTo = insertAt.toInt + textToInsertLength
        ))
    )
  }
}
/**
* Encodes information about insertions/deletions/replacements made to a string, providing
* the original string, the updated string, and a list of TextReplacement objects
* that encode the indices of the segments that were changed. Using this information,
* it is possible to map an offset into the original string to an offset into the updated
* string, assuming the text at the offset was not within one of the modified segments.
*
* All offsets are code-points, not UTF16 code-units.
*/
case class TextModification(
  original: String,
  updated: String,
  replacements: List[TextReplacement]) {

  // code-point length of `original`; the largest offset accepted once all
  // replacements have been walked
  private val originalLen = Offset.CodePoint.length(original)

  /**
   * Using an offset into the original String, computes the equivalent offset into the updated
   * string. If the offset falls within a segment that was removed/replaced, None is returned.
   *
   * NOTE(review): the range check (`0 <= index <= originalLen`) is only applied when the
   * offset lies at or beyond the end of every replacement. A negative offset that precedes
   * the first replacement is returned unchanged, whereas with an empty `replacements` list
   * it yields None. Confirm whether callers rely on this before tightening it.
   */
  def reindex(index: Offset.CodePoint): Option[Offset.CodePoint] =
    reindex(index, Offset.CodePoint(0), replacements)

  /**
   * Reindexes an entity of type T. Returns the updated entity, or None if either the `fromIndex`
   * or `toIndex` value is now out of range.
   *
   * `toIndex` is exclusive, so the last code point covered by the entity (`toIndex - 1`) is
   * the one that gets reindexed; the result is then turned back into an exclusive bound.
   */
  def reindexEntity[T: TextEntity](e: T): Option[T] =
    for {
      from <- reindex(Offset.CodePoint(TextEntity.fromIndex(e)))
      to <- reindex(Offset.CodePoint(TextEntity.toIndex(e) - 1))
    } yield TextEntity.move(e, from.toShort, (to.toShort + 1).toShort)

  /**
   * Reindexes a sequence of entities of type T. Some entities could be filtered
   * out if they span a region of text that has been removed.
   */
  def reindexEntities[T: TextEntity](es: Seq[T]): Seq[T] =
    for (e <- es; e2 <- reindexEntity(e)) yield e2

  /**
   * Swaps `original` and `updated` text and inverts all `TextReplacement` instances.
   */
  def inverse: TextModification =
    TextModification(updated, original, replacements.map(_.inverse))

  // recursively walks through the list of TextReplacement objects computing
  // offsets to add/subtract from 'shift', which accumulates all changes and
  // then gets added to index at the end.
  //
  // The annotation makes the compiler verify that the recursion is compiled into a loop,
  // so a long replacement list cannot overflow the stack.
  @scala.annotation.tailrec
  private def reindex(
    index: Offset.CodePoint,
    shift: Offset.CodePoint,
    reps: List[TextReplacement]
  ): Option[Offset.CodePoint] =
    reps match {
      case Nil =>
        // past every replacement: accept the offset only if it lies within the original text
        if (index.toInt >= 0 && index <= originalLen)
          Some(index + shift)
        else
          None
      case (r @ TextReplacement(fr, to, _, _)) :: tail =>
        // before this replacement: only the replacements already walked affect the offset
        if (index < fr) Some(index + shift)
        // inside the replaced segment: there is no equivalent offset in the updated text
        else if (index < to) None
        // after this replacement: account for its change in length and keep walking
        else reindex(index, shift + r.lengthDelta, tail)
    }
}
object TextReplacement {

  /**
   * Builds a `TextReplacement` from plain `Int` offsets by wrapping each of them in an
   * `Offset.CodePoint`.
   *
   * @param originalFrom start of the segment in the original text
   * @param originalTo end of the segment in the original text
   * @param updatedFrom start of the segment in the updated text
   * @param updatedTo end of the segment in the updated text
   */
  def fromCodePoints(
    originalFrom: Int,
    originalTo: Int,
    updatedFrom: Int,
    updatedTo: Int
  ): TextReplacement = {
    val origStart = Offset.CodePoint(originalFrom)
    val origEnd = Offset.CodePoint(originalTo)
    val newStart = Offset.CodePoint(updatedFrom)
    val newEnd = Offset.CodePoint(updatedTo)
    TextReplacement(origStart, origEnd, newStart, newEnd)
  }
}
/**
* Encodes the indices of a segment of text in one string that maps to a replacement
* segment in an updated version of the text. The replacement segment could be empty
* (updatedTo == updatedFrom), indicating the segment was removed.
*
* All offsets are code-points, not UTF16 code-units.
*
* `originalFrom` and `updatedFrom` are inclusive.
* `originalTo` and `updatedTo` are exclusive.
*/
case class TextReplacement(
  originalFrom: Offset.CodePoint,
  originalTo: Offset.CodePoint,
  updatedFrom: Offset.CodePoint,
  updatedTo: Offset.CodePoint) {

  /** Length of the segment in the original text (`originalTo - originalFrom`). */
  def originalLength: Offset.CodePoint = originalTo - originalFrom

  /** Length of the segment in the updated text (`updatedTo - updatedFrom`). */
  def updatedLength: Offset.CodePoint = updatedTo - updatedFrom

  /** How much longer the updated segment is than the original one. */
  def lengthDelta: Offset.CodePoint = updatedLength - originalLength

  /** Moves only the original-text offsets by `offset`, leaving the updated ones untouched. */
  def shiftOriginal(offset: Offset.CodePoint): TextReplacement =
    TextReplacement(originalFrom + offset, originalTo + offset, updatedFrom, updatedTo)

  /** Moves only the updated-text offsets by `offset`, leaving the original ones untouched. */
  def shiftUpdated(offset: Offset.CodePoint): TextReplacement =
    TextReplacement(originalFrom, originalTo, updatedFrom + offset, updatedTo + offset)

  /** Moves all four offsets by `offset`. */
  def shift(offset: Offset.CodePoint): TextReplacement =
    shiftOriginal(offset).shiftUpdated(offset)

  /** Swaps the original-text offsets with the updated-text offsets. */
  def inverse: TextReplacement =
    copy(
      originalFrom = updatedFrom,
      originalTo = updatedTo,
      updatedFrom = originalFrom,
      updatedTo = originalTo
    )
}