93 lines
3.0 KiB
Scala
93 lines
3.0 KiB
Scala
package com.twitter.tweetypie.matching
|
|
|
|
object TokenSequence {

  /**
   * Is `suffix` a suffix of `s`, starting at `offset` in `s`?
   *
   * Fast-paths the whole-sequence case (`offset == 0`) via reference
   * identity and `==` before falling back to a character-by-character
   * comparison.
   */
  def hasSuffixAt(s: CharSequence, suffix: CharSequence, offset: Int): Boolean = {
    // Cheap shortcut: comparing the whole of `s` against itself (or an
    // equal sequence) needs no character scan.
    val triviallyEqual = offset == 0 && (s.eq(suffix) || s == suffix)
    triviallyEqual || {
      // The tail of `s` from `offset` must have exactly the suffix's
      // length, and every character must line up.
      suffix.length == (s.length - offset) &&
      (0 until suffix.length).forall(i => suffix.charAt(i) == s.charAt(offset + i))
    }
  }

  /**
   * Do two [[CharSequence]]s contain the same characters?
   *
   * [[CharSequence]] equality is not sufficient because
   * [[CharSequence]]s of different types may not consider other
   * [[CharSequence]]s containing the same characters equivalent.
   */
  def sameCharacters(s1: CharSequence, s2: CharSequence): Boolean =
    hasSuffixAt(s1, s2, 0)

  /**
   * This method implements the product definition of a token matching a
   * keyword. That definition is:
   *
   * - The token contains the same characters as the keyword.
   * - The token contains the same characters as the keyword after
   *   dropping a leading '#' or '@' from the token.
   *
   * The intention is that a keyword matches an identical hashtag, but
   * if the keyword itself is a hashtag, it only matches the hashtag
   * form.
   *
   * The tokenization process should rule out tokens or keywords that
   * start with multiple '#' characters, even though this implementation
   * allows for e.g. token "##a" to match "#a".
   */
  def tokenMatches(token: CharSequence, keyword: CharSequence): Boolean =
    sameCharacters(token, keyword) || {
      // Otherwise the token may only match by shedding a leading
      // hashtag/mention sigil; an empty token has nothing to shed.
      token.length > 0 && {
        val sigil = token.charAt(0)
        (sigil == '#' || sigil == '@') && hasSuffixAt(token, keyword, 1)
      }
    }
}
|
|
|
|
/**
 * A sequence of normalized tokens. The sequence depends on the locale
 * in which the text was parsed and the version of the penguin library
 * that was used at tokenization time.
 */
case class TokenSequence private[matching] (toIndexedSeq: IndexedSeq[CharSequence]) {
  import TokenSequence.tokenMatches

  private def apply(i: Int): CharSequence = toIndexedSeq(i)

  def isEmpty: Boolean = toIndexedSeq.isEmpty

  def nonEmpty: Boolean = toIndexedSeq.nonEmpty

  /**
   * Does the supplied sequence of keywords match a consecutive sequence
   * of tokens within this sequence?
   *
   * Performs a naive sliding-window scan: every viable start position
   * is tried until one aligns each keyword with a matching token. An
   * empty keyword sequence trivially matches; a keyword sequence longer
   * than this one never does.
   */
  def containsKeywordSequence(keywords: TokenSequence): Boolean = {
    val ks = keywords.toIndexedSeq
    // Last start offset at which `ks` could still fit; negative when
    // the keywords outnumber the tokens, making the range below empty.
    val lastStart = toIndexedSeq.length - ks.length

    (0 to lastStart).exists { start =>
      ks.indices.forall(i => tokenMatches(this(start + i), ks(i)))
    }
  }
}
|