// Source: the-algorithm/tweetypie/common/src/scala/com/twitter/tweetypie/matching/TweetTokenizer.scala
//
// Listing metadata: 46 lines, 1.5 KiB, Scala

package com.twitter.tweetypie.matching
import com.twitter.common.text.pipeline.TwitterLanguageIdentifier
import com.twitter.common_internal.text.version.PenguinVersion
import java.util.Locale
object TweetTokenizer extends Tokenizer {

  /** A function that chooses a Tokenizer given an optional locale. */
  type LocalePicking = Option[Locale] => Tokenizer

  /**
   * Build a function that maps an optional locale to a Tokenizer.
   * A known locale yields `Tokenizer.forLocale(locale)`; an absent
   * locale yields this language-detecting TweetTokenizer.
   */
  def localePicking: LocalePicking =
    maybeLocale => maybeLocale.fold[Tokenizer](this)(known => Tokenizer.forLocale(known))

  // Single language identifier instance, built once via the builder's
  // `buildForTweet()` and reused by every detecting Tokenizer below.
  private[this] val tweetLangIdentifier =
    (new TwitterLanguageIdentifier.Builder).buildForTweet()

  /**
   * Tokenizer that identifies the language of each input text and
   * hands the text to the Tokenizer obtained for the detected locale
   * and the supplied Penguin version.
   */
  private final class LanguageDetecting(version: PenguinVersion) extends Tokenizer {
    override def tokenize(text: String): TokenSequence = {
      val detectedLocale = tweetLangIdentifier.identify(text).getLocale
      val delegate = Tokenizer.get(detectedLocale, version)
      delegate.tokenize(text)
    }
  }

  /**
   * Get a Tokenizer that runs Tweet language detection on each text
   * and tokenizes according to the detected locale. When the locale of
   * the tweet text is already known, use `Tokenizer.get` instead,
   * because it is much cheaper.
   */
  def get(version: PenguinVersion): Tokenizer =
    new LanguageDetecting(version)

  // Detecting Tokenizer bound to `Tokenizer.DefaultPenguinVersion`;
  // this is what `tokenize` delegates to.
  private[this] val Default = get(Tokenizer.DefaultPenguinVersion)

  /**
   * Tokenize the given text with Tweet language detection and
   * `Tokenizer.DefaultPenguinVersion`. If the language of the text is
   * already known, prefer `Tokenizer.forLocale`.
   */
  override def tokenize(tweetText: String): TokenSequence =
    Default.tokenize(tweetText)
}