package com.twitter.tweetypie.tweettext import com.twitter.tweetypie.tweettext.TweetText._ import com.twitter.twittertext.Extractor import java.lang.Character import scala.annotation.tailrec import scala.collection.JavaConverters._ object Truncator { val Ellipsis = "\u2026" /** * Truncate tweet text for a retweet. If the text is longer than * either of the length limits, code points are cut off from the end * of the text and replaced with an ellipsis. We keep as much of the * leading text as possible, subject to these constraints: * * - There are no more than `MaxDisplayLength` characters. * * - When converted to UTF-8, the result does not exceed `MaxByteLength`. * * - We do not break within a single grapheme cluster. * * The input is assumed to be partial HTML-encoded and may or may * not be NFC normalized. The result will be partial HTML-encoded * and will be NFC normalized. */ def truncateForRetweet(input: String): String = truncateWithEllipsis(input, Ellipsis) /** * Truncate to [[com.twitter.tweetypie.tweettext.TweetText#OrginalMaxDisplayLength]] display * units, using "..." as an ellipsis. The resulting text is guaranteed to pass our tweet length * check, but it is not guaranteed to fit in a SMS message. */ def truncateForSms(input: String): String = truncateWithEllipsis(input, "...") /** * Check the length of the given text, and truncate it if it is longer * than the allowed length for a Tweet. The result of this method will * always have: * * - Display length <= OriginalMaxDisplayLength. * - Length when encoded as UTF-8 <= OriginalMaxUtf8Length. * * If the input would violate this, then the text will be * truncated. When the text is truncated, it will be truncated such * that: * * - Grapheme clusters will not be split. * - The last character before the ellipsis will not be a whitespace * character. * - The ellipsis text will be appended to the end. */ private[this] def truncateWithEllipsis(input: String, ellipsis: String): String = { val text = nfcNormalize(input) val truncateAt = truncationPoint(text, OriginalMaxDisplayLength, OriginalMaxUtf8Length, Some(ellipsis)) if (truncateAt.codeUnitOffset.toInt == text.length) text else text.take(truncateAt.codeUnitOffset.toInt) + ellipsis } /** * Indicates a potential TruncationPoint in piece of text. * * @param charOffset the utf-16 character offset of the truncation point * @param codePointOffset the offset in code points */ case class TruncationPoint(codeUnitOffset: Offset.CodeUnit, codePointOffset: Offset.CodePoint) /** * Computes a TruncationPoint for the given text and length constraints. If `truncated` on * the result is `false`, it means the text will fit within the given constraints without * truncation. Otherwise, the result indicates both the character and code-point offsets * at which to perform the truncation, and the resulting display length and byte length of * the truncated string. * * Text should be NFC normalized first for best results. * * @param withEllipsis if true, then the truncation point will be computed so that there is space * to append an ellipsis and to still remain within the limits. The ellipsis is not counted * in the returned display and byte lengths. * * @param atomicUnits may contain a list of ranges that should be treated as atomic unit and * not split. each tuple is half-open range in code points. */ def truncationPoint( text: String, maxDisplayLength: Int = OriginalMaxDisplayLength, maxByteLength: Int = OriginalMaxUtf8Length, withEllipsis: Option[String] = None, atomicUnits: Offset.Ranges[Offset.CodePoint] = Offset.Ranges.Empty ): TruncationPoint = { val breakPoints = GraphemeIndexIterator .ends(text) .filterNot(Offset.Ranges.htmlEntities(text).contains) val ellipsisDisplayUnits = withEllipsis.map(Offset.DisplayUnit.length).getOrElse(Offset.DisplayUnit(0)) val maxTruncatedDisplayLength = Offset.DisplayUnit(maxDisplayLength) - ellipsisDisplayUnits val ellipsisByteLength = withEllipsis.map(Offset.Utf8.length).getOrElse(Offset.Utf8(0)) val maxTruncatedByteLength = Offset.Utf8(maxByteLength) - ellipsisByteLength var codeUnit = Offset.CodeUnit(0) var codePoint = Offset.CodePoint(0) var displayLength = Offset.DisplayUnit(0) var byteLength = Offset.Utf8(0) var truncateCodeUnit = codeUnit var truncateCodePoint = codePoint @tailrec def go(): TruncationPoint = if (displayLength.toInt > maxDisplayLength || byteLength.toInt > maxByteLength) { TruncationPoint(truncateCodeUnit, truncateCodePoint) } else if (codeUnit != truncateCodeUnit && displayLength <= maxTruncatedDisplayLength && byteLength <= maxTruncatedByteLength && (codeUnit.toInt == 0 || !Character.isWhitespace(text.codePointBefore(codeUnit.toInt))) && !atomicUnits.contains(codePoint)) { // we can advance the truncation point truncateCodeUnit = codeUnit truncateCodePoint = codePoint go() } else if (breakPoints.hasNext) { // there are further truncation points to consider val nextCodeUnit = breakPoints.next codePoint += Offset.CodePoint.count(text, codeUnit, nextCodeUnit) displayLength += Offset.DisplayUnit.count(text, codeUnit, nextCodeUnit) byteLength += Offset.Utf8.count(text, codeUnit, nextCodeUnit) codeUnit = nextCodeUnit go() } else { TruncationPoint(codeUnit, codePoint) } go() } /** * Truncate the given text, avoiding chopping HTML entities and tweet * entities. This should only be used for testing because it performs * entity extraction, and so is very inefficient. */ def truncateForTests( input: String, maxDisplayLength: Int = OriginalMaxDisplayLength, maxByteLength: Int = OriginalMaxUtf8Length ): String = { val text = nfcNormalize(input) val extractor = new Extractor val entities = extractor.extractEntitiesWithIndices(text) extractor.modifyIndicesFromUTF16ToUnicode(text, entities) val avoid = Offset.Ranges.fromCodePointPairs( entities.asScala.map(e => (e.getStart().intValue, e.getEnd().intValue)) ) val truncateAt = truncationPoint(text, maxDisplayLength, maxByteLength, None, avoid) text.take(truncateAt.codeUnitOffset.toInt) } }