the-algorithm/tweetypie/common/src/scala/com/twitter/tweetypie/matching/TweetTokenizer.scala

package com.twitter.tweetypie.matching

import com.twitter.common.text.pipeline.TwitterLanguageIdentifier
import com.twitter.common_internal.text.version.PenguinVersion
import java.util.Locale

/**
 * A [[Tokenizer]] that detects the language of the text it is given before
 * tokenizing it, plus helpers for choosing a Tokenizer when the locale may or
 * may not be known in advance.
 */
object TweetTokenizer extends Tokenizer {

  /** A function that selects a Tokenizer from an optional, already-known locale. */
  type LocalePicking = Option[Locale] => Tokenizer

  /**
   * Build a [[LocalePicking]] function. When a locale is supplied, the
   * result is `Tokenizer.forLocale` for that locale; when it is absent,
   * the result is this language-detecting `TweetTokenizer`.
   */
  def localePicking: LocalePicking =
    maybeLocale =>
      maybeLocale.fold[Tokenizer](TweetTokenizer)(known => Tokenizer.forLocale(known))

  // Shared identifier used to detect the language of Tweet text. It is
  // declared before `DetectingDefault` so that it is initialized first.
  private[this] val langDetector =
    new TwitterLanguageIdentifier.Builder().buildForTweet()

  /**
   * Build a Tokenizer that first runs Tweet language detection on the
   * text and then tokenizes it with the Tokenizer obtained from
   * `Tokenizer.get` for the detected locale and the given version.
   * Callers that already know the locale of the text should call
   * `Tokenizer.get` directly, because it's much cheaper.
   *
   * @param version the Penguin version passed through to `Tokenizer.get`
   * @return a Tokenizer that identifies the language on every `tokenize` call
   */
  def get(version: PenguinVersion): Tokenizer =
    new Tokenizer {
      override def tokenize(text: String): TokenSequence = {
        // Detection runs per call; the locale is not cached between calls.
        val detected = langDetector.identify(text)
        val delegate = Tokenizer.get(detected.getLocale, version)
        delegate.tokenize(text)
      }
    }

  // Language-detecting Tokenizer bound to `Tokenizer.DefaultPenguinVersion`;
  // backs this object's own `tokenize`.
  private[this] val DetectingDefault: Tokenizer =
    get(Tokenizer.DefaultPenguinVersion)

  /**
   * Tokenize `tweetText` with Tweet language detection and
   * `Tokenizer.DefaultPenguinVersion`. If the language of the text is
   * already known, prefer `Tokenizer.forLocale`.
   *
   * @param tweetText the text to tokenize
   * @return the tokens produced for the detected locale
   */
  override def tokenize(tweetText: String): TokenSequence =
    DetectingDefault.tokenize(tweetText)
}