// the-algorithm/tweetypie/common/src/scala/com/twitter/tweetypie/matching/TokenSequence.scala
package com.twitter.tweetypie.matching
object TokenSequence {

  /**
   * Tests whether the characters of `s`, starting at index `offset` and
   * running to the end, are exactly the characters of `suffix`.
   *
   * Includes a fast path: when `offset` is zero and the two sequences are
   * the same reference (or compare equal), no character scan is needed.
   */
  def hasSuffixAt(s: CharSequence, suffix: CharSequence, offset: Int): Boolean = {
    val trivoiallyEqual = offset == 0 && (s.eq(suffix) || s == suffix)
    if (trivoiallyEqual) {
      true
    } else if (s.length - offset != suffix.length) {
      // Lengths differ, so the characters cannot all line up.
      false
    } else {
      // Scan character-by-character; stop at the first mismatch.
      var i = 0
      var stillMatching = true
      while (stillMatching && i < suffix.length) {
        stillMatching = s.charAt(offset + i) == suffix.charAt(i)
        i += 1
      }
      stillMatching
    }
  }

  /**
   * Do two [[CharSequence]]s contain the same characters?
   *
   * Plain equality is not enough here: distinct [[CharSequence]]
   * implementations (e.g. `String` vs `StringBuilder`) may hold identical
   * characters yet not consider each other equal.
   */
  def sameCharacters(s1: CharSequence, s2: CharSequence): Boolean =
    hasSuffixAt(s1, s2, 0)

  /**
   * Implements the product definition of a token matching a keyword:
   *
   *  - the token has exactly the keyword's characters, or
   *  - the token has exactly the keyword's characters after a leading
   *    '#' or '@' is dropped from the token.
   *
   * A keyword therefore matches its hashtag form, but a keyword that is
   * itself a hashtag only matches the hashtag form. Tokenization is
   * expected to have ruled out tokens/keywords with multiple leading '#'
   * characters, even though e.g. "##a" would match "#a" here.
   */
  def tokenMatches(token: CharSequence, keyword: CharSequence): Boolean =
    sameCharacters(token, keyword) || (token.length > 0 && {
      token.charAt(0) match {
        case '#' | '@' => hasSuffixAt(token, keyword, 1)
        case _ => false
      }
    })
}
/**
* A sequence of normalized tokens. The sequence depends on the locale
* in which the text was parsed and the version of the penguin library
* that was used at tokenization time.
*/
/**
 * A sequence of normalized tokens. The sequence depends on the locale
 * in which the text was parsed and the version of the penguin library
 * that was used at tokenization time.
 */
case class TokenSequence private[matching] (toIndexedSeq: IndexedSeq[CharSequence]) {
  import TokenSequence.tokenMatches

  // Positional access to the underlying tokens.
  private def apply(i: Int): CharSequence = toIndexedSeq(i)

  def isEmpty: Boolean = toIndexedSeq.isEmpty

  def nonEmpty: Boolean = toIndexedSeq.nonEmpty

  /**
   * Does the supplied sequence of keywords match a consecutive sequence
   * of tokens within this sequence?
   *
   * An empty keyword sequence always matches.
   */
  def containsKeywordSequence(keywords: TokenSequence): Boolean = {
    val keywordCount = keywords.toIndexedSeq.length
    // Last token index at which a full keyword run could still fit.
    val lastStart = toIndexedSeq.length - keywordCount

    // True when every keyword matches the token at the corresponding
    // position, beginning at token index `start`.
    def allMatchFrom(start: Int): Boolean = {
      var i = 0
      while (i < keywordCount && tokenMatches(this(start + i), keywords(i))) {
        i += 1
      }
      i == keywordCount
    }

    // Slide the candidate start position until a match is found or the
    // remaining window is too small.
    var start = 0
    var found = false
    while (!found && start <= lastStart) {
      found = allMatchFrom(start)
      start += 1
    }
    found
  }
}