// the-algorithm/tweetypie/common/src/scala/com/twitter/tweetypie/matching/TokenSequence.scala
package com.twitter.tweetypie.matching
object TokenSequence {

  /**
   * Tests whether the characters of `s`, starting at index `offset` and
   * running to the end, are exactly the characters of `suffix`.
   *
   * Includes a fast path: when `offset` is zero and the two sequences are
   * the same reference (or compare equal), no character scan is needed.
   */
  def hasSuffixAt(s: CharSequence, suffix: CharSequence, offset: Int): Boolean = {
    val trivoiallyEqual = offset == 0 && (s.eq(suffix) || s == suffix)
    if (trivoiallyEqual) {
      true
    } else if (s.length - offset != suffix.length) {
      // Lengths differ, so the characters cannot all line up.
      false
    } else {
      // Scan character-by-character; stop at the first mismatch.
      var i = 0
      var stillMatching = true
      while (stillMatching && i < suffix.length) {
        stillMatching = s.charAt(offset + i) == suffix.charAt(i)
        i += 1
      }
      stillMatching
    }
  }

  /**
   * Do two [[CharSequence]]s contain the same characters?
   *
   * Plain equality is not enough here: distinct [[CharSequence]]
   * implementations (e.g. `String` vs `StringBuilder`) may hold identical
   * characters yet not consider each other equal.
   */
  def sameCharacters(s1: CharSequence, s2: CharSequence): Boolean =
    hasSuffixAt(s1, s2, 0)

  /**
   * Implements the product definition of a token matching a keyword:
   *
   *  - the token has exactly the keyword's characters, or
   *  - the token has exactly the keyword's characters after a leading
   *    '#' or '@' is dropped from the token.
   *
   * A keyword therefore matches its hashtag form, but a keyword that is
   * itself a hashtag only matches the hashtag form. Tokenization is
   * expected to have ruled out tokens/keywords with multiple leading '#'
   * characters, even though e.g. "##a" would match "#a" here.
   */
  def tokenMatches(token: CharSequence, keyword: CharSequence): Boolean =
    sameCharacters(token, keyword) || (token.length > 0 && {
      token.charAt(0) match {
        case '#' | '@' => hasSuffixAt(token, keyword, 1)
        case _ => false
      }
    })
}
/**
* A sequence of normalized tokens. The sequence depends on the locale
* in which the text was parsed and the version of the penguin library
* that was used at tokenization time.
*/
/**
 * A sequence of normalized tokens. The sequence depends on the locale
 * in which the text was parsed and the version of the penguin library
 * that was used at tokenization time.
 */
case class TokenSequence private[matching] (toIndexedSeq: IndexedSeq[CharSequence]) {
  import TokenSequence.tokenMatches

  // Positional access to the underlying tokens.
  private def apply(i: Int): CharSequence = toIndexedSeq(i)

  def isEmpty: Boolean = toIndexedSeq.isEmpty

  def nonEmpty: Boolean = toIndexedSeq.nonEmpty

  /**
   * Does the supplied sequence of keywords match a consecutive sequence
   * of tokens within this sequence?
   *
   * An empty keyword sequence always matches.
   */
  def containsKeywordSequence(keywords: TokenSequence): Boolean = {
    val keywordCount = keywords.toIndexedSeq.length
    // Last token index at which a full keyword run could still fit.
    val lastStart = toIndexedSeq.length - keywordCount

    // True when every keyword matches the token at the corresponding
    // position, beginning at token index `start`.
    def allMatchFrom(start: Int): Boolean = {
      var i = 0
      while (i < keywordCount && tokenMatches(this(start + i), keywords(i))) {
        i += 1
      }
      i == keywordCount
    }

    // Slide the candidate start position until a match is found or the
    // remaining window is too small.
    var start = 0
    var found = false
    while (!found && start <= lastStart) {
      found = allMatchFrom(start)
      start += 1
    }
    found
  }
}