the-algorithm/tweetypie/common/src/scala/com/twitter/tweetypie/matching/Tokenizer.scala

157 lines
5.3 KiB
Scala

package com.twitter.tweetypie.matching
import com.twitter.common.text.language.LocaleUtil
import com.twitter.common_internal.text.pipeline.TwitterTextNormalizer
import com.twitter.common_internal.text.pipeline.TwitterTextTokenizer
import com.twitter.common_internal.text.version.PenguinVersion
import com.twitter.concurrent.Once
import com.twitter.io.StreamIO
import java.util.Locale
import scala.collection.JavaConverters._
/**
* Extract a sequence of normalized tokens from the input text. The
* normalization and tokenization are properly configured for keyword
* matching between texts.
*/
trait Tokenizer {
/**
* Extract the sequence of normalized tokens from `input`.
*
* @param input the text to tokenize
* @return the tokens extracted from `input`, as a [[TokenSequence]]
*/
def tokenize(input: String): TokenSequence
}
object Tokenizer {
/**
* The version of Penguin used to perform normalization and
* tokenization when a Penguin version is not explicitly specified
* (for example by `forLocale` and by the default warmup).
*
* If you cache tokenized text, be sure to store the version as well, to
* avoid comparing text that was normalized with different algorithms.
*/
val DefaultPenguinVersion: PenguinVersion = PenguinVersion.PENGUIN_6
/**
 * Build a [[Tokenizer]] for text whose locale is already known, using
 * [[DefaultPenguinVersion]].
 *
 * Because it does not have to perform language detection, the
 * resulting tokenizer is much more efficient than the Tweet or Query
 * tokenizer.
 *
 * @param locale the locale of the text that will be tokenized
 * @return a tokenizer for `locale` and the default Penguin version
 */
def forLocale(locale: Locale): Tokenizer =
  get(locale = locale, version = DefaultPenguinVersion)
/**
 * Look up the [[Tokenizer]] for a specific combination of locale and
 * Penguin library version.
 *
 * @param locale the locale of the text that will be tokenized
 * @param version the Penguin version to normalize and tokenize with
 * @return a tokenizer for `locale`, produced by the factory for `version`
 */
def get(locale: Locale, version: PenguinVersion): Tokenizer = {
  val factory = TokenizerFactories(version)
  factory.forLocale(locale)
}
/**
 * Owns the [[TwitterTextNormalizer]] and [[TwitterTextTokenizer]]
 * instances for a single Penguin version, and hands out [[Tokenizer]]s
 * that use them.
 */
private[this] class TokenizerFactory(version: PenguinVersion) {
  // A single normalizer is shared by all callers, since the normalizer
  // is thread-safe.
  private[this] val normalizer = {
    val normalizerBuilder = new TwitterTextNormalizer.Builder(version)
    normalizerBuilder.build()
  }

  // A TwitterTextTokenizer is relatively expensive to build and is not
  // thread safe, so every thread keeps its own instance in this
  // ThreadLocal.
  private[this] val local =
    new ThreadLocal[TwitterTextTokenizer] {
      override def initialValue: TwitterTextTokenizer = {
        val tokenizerBuilder = new TwitterTextTokenizer.Builder(version)
        tokenizerBuilder.build()
      }
    }

  /**
   * Create a [[Tokenizer]] bound to `locale` and to this factory's
   * [[PenguinVersion]].
   */
  def forLocale(locale: Locale): Tokenizer =
    new Tokenizer {
      override def tokenize(input: String): TokenSequence = {
        // Fetch the calling thread's token stream first, then feed it
        // the normalized text.
        val tokenStream = local.get.getTwitterTokenStreamFor(locale)
        val normalized = normalizer.normalize(input, locale)
        tokenStream.reset(normalized)

        // Drain the stream, collecting each term in order.
        val terms = IndexedSeq.newBuilder[CharSequence]
        while (tokenStream.incrementToken) {
          terms += tokenStream.term()
        }
        TokenSequence(terms.result())
      }
    }
}
/**
 * One TokenizerFactory per [[PenguinVersion]], all built eagerly.
 * There are only a small number of Penguin versions, so creating every
 * factory up front avoids managing mutable state.
 */
private[this] val TokenizerFactories: PenguinVersion => TokenizerFactory = {
  val factoryByVersion =
    for (version <- PenguinVersion.values)
      yield (version, new TokenizerFactory(version))
  factoryByVersion.toMap
}
/**
 * The distinct locales that warmup is run against. These locales are
 * mentioned in the logic of TwitterTextTokenizer and
 * TwitterTextNormalizer.
 */
private[this] val WarmUpLocales: Seq[Locale] = {
  val individualLocales =
    Seq(
      Locale.JAPANESE,
      Locale.KOREAN,
      LocaleUtil.UNKNOWN,
      LocaleUtil.THAI,
      LocaleUtil.ARABIC,
      LocaleUtil.SWEDISH
    )
  val candidates =
    Seq.concat(
      individualLocales,
      LocaleUtil.CHINESE_JAPANESE_LOCALES.asScala,
      LocaleUtil.CJK_LOCALES.asScala
    )
  // Passing through a Set drops locales that occur more than once.
  candidates.toSet.toArray.toSeq
}
/**
 * Load the default inputs that are used for warming up this library:
 * the lines of the `warmup-text.txt` resource, decoded as UTF-8.
 *
 * @return the lines of the warmup resource, one element per line
 * @throws IllegalStateException if the warmup resource cannot be found
 */
def warmUpCorpus(): Seq[String] = {
  val resourceName = "warmup-text.txt"
  // getResourceAsStream reports a missing resource by returning null
  // rather than throwing, so fail here with a message that names the
  // resource instead of letting a bare NullPointerException surface.
  val stream = Option(getClass.getResourceAsStream(resourceName)).getOrElse {
    throw new IllegalStateException(
      s"Warmup corpus resource '$resourceName' could not be found")
  }
  val bytes =
    try StreamIO.buffer(stream)
    finally stream.close()
  bytes.toString("UTF-8").linesIterator.toArray.toSeq
}
/**
 * Run the tokenization and mute-validation code paths of this library
 * over each of the given strings. In general, prefer [[warmUp]] to
 * this method.
 *
 * @param ver the Penguin version to exercise
 * @param texts the strings to run through the library
 */
def warmUpWith(ver: PenguinVersion, texts: Iterable[String]): Unit =
  for (txt <- texts) {
    // Known-locale paths: tokenize and validate once per warmup locale.
    for (loc <- WarmUpLocales) {
      Tokenizer.get(loc, ver).tokenize(txt)
      UserMutes.builder().withPenguinVersion(ver).withLocale(loc).validate(txt)
    }

    // Paths that are not given a locale, to exercise language detection.
    TweetTokenizer.get(ver).tokenize(txt)
    UserMutes.builder().withPenguinVersion(ver).validate(txt)
  }
// The default warmup: the default Penguin version run over the default
// warmup corpus. It is wrapped in `Once` so that the warmup routine
// only runs once, however many times this is invoked.
private[this] val warmUpOnce =
  Once {
    warmUpWith(DefaultPenguinVersion, warmUpCorpus())
  }
/**
* The creation of the first TwitterTextTokenizer is relatively
* expensive, and tokenizing some texts may cause significant
* initialization.
*
* This method exercises the functionality of this library
* with a range of texts in order to perform as much initialization as
* possible before the library is used in a latency-sensitive way.
* It delegates to a `Once`-wrapped call of `warmUpWith`, using
* `DefaultPenguinVersion` and the texts from `warmUpCorpus()`.
*
* The warmup routine will only run once. Subsequent invocations of
* `warmUp` will not do additional work, and will return once warmup is
* complete.
*
* The warmup will take on the order of seconds.
*/
def warmUp(): Unit = warmUpOnce()
}