46 lines
1.5 KiB
Scala
46 lines
1.5 KiB
Scala
package com.twitter.tweetypie.matching
|
|
|
|
import com.twitter.common.text.pipeline.TwitterLanguageIdentifier
|
|
import com.twitter.common_internal.text.version.PenguinVersion
|
|
import java.util.Locale
|
|
|
|
object TweetTokenizer extends Tokenizer {

  /** A function that chooses a [[Tokenizer]] given an optional, already-known locale. */
  type LocalePicking = Option[Locale] => Tokenizer

  /**
   * Build a [[LocalePicking]] function.
   *
   * When the caller supplies a locale, the returned function delegates to
   * `Tokenizer.forLocale`; when no locale is supplied it falls back to
   * this object, which detects the language of the text itself.
   */
  def localePicking: LocalePicking =
    maybeLocale =>
      maybeLocale match {
        case Some(knownLocale) => Tokenizer.forLocale(knownLocale)
        case None => TweetTokenizer
      }

  // One identifier instance, built once and shared by every Tokenizer
  // handed out by `get`.
  private[this] val languageDetector =
    new TwitterLanguageIdentifier.Builder().buildForTweet()

  /**
   * Create a Tokenizer that first runs Tweet language detection on the
   * text and then tokenizes it with the Tokenizer for the detected
   * locale and the requested Penguin version.
   *
   * Detection is performed on every `tokenize` call, so when the locale
   * of the tweet text is already known, `Tokenizer.get` is the much
   * cheaper choice.
   *
   * @param version the Penguin version passed through to `Tokenizer.get`
   * @return a Tokenizer that identifies the language of each input
   */
  def get(version: PenguinVersion): Tokenizer =
    new Tokenizer {
      override def tokenize(text: String): TokenSequence = {
        // Identify the language first, then hand off to the matching Tokenizer.
        val detectedLocale = languageDetector.identify(text).getLocale
        val delegate = Tokenizer.get(detectedLocale, version)
        delegate.tokenize(text)
      }
    }

  // Language-detecting Tokenizer pinned to `Tokenizer.DefaultPenguinVersion`;
  // backs this object's own `tokenize`.
  private[this] val DefaultTokenizer = get(Tokenizer.DefaultPenguinVersion)

  /**
   * Tokenize tweet text by detecting its language and applying
   * `Tokenizer.DefaultPenguinVersion`. If the language of the text is
   * already known, prefer `Tokenizer.forLocale`.
   *
   * @param tweetText the text to tokenize
   * @return the resulting token sequence
   */
  override def tokenize(tweetText: String): TokenSequence =
    DefaultTokenizer.tokenize(tweetText)
}
|