the-algorithm/tweetypie/common/src/scala/com/twitter/tweetypie/thriftscala/entities/EntityExtractor.scala

119 lines
4.6 KiB
Scala

package com.twitter.tweetypie.thriftscala.entities
import com.twitter.servo.data.Mutation
import com.twitter.tco_util.TcoUrl
import com.twitter.tweetypie.thriftscala._
import com.twitter.tweetypie.thriftscala.entities.Implicits._
import com.twitter.tweetypie.tweettext.PartialHtmlEncoding
import com.twitter.tweetypie.tweettext.TextEntity
import com.twitter.tweetypie.tweettext.TextModification
import com.twitter.tweetypie.util.TweetLenses
import com.twitter.twittertext.Extractor
import scala.collection.JavaConverters._
/**
* Contains functions to collect urls, mentions, hashtags, and cashtags from the text of tweets and messages
*/
object EntityExtractor {
// We only use one configuration of com.twitter.twittertext.Extractor, so it's
// OK to share one global reference. The only available
// configuration option is whether to extract URLs without protocols
// (defaults to true)
private[this] val extractor = new Extractor
// The twitter-text library operates on unencoded text, but we store
// and process HTML-encoded text. The TextModification returned
// from this function contains the decoded text which we will operate on,
// but also provides us with the ability to map the indices on
// the twitter-text entities back to the entities on the encoded text.
private val htmlEncodedTextToEncodeModification: String => TextModification =
text =>
PartialHtmlEncoding
.decodeWithModification(text)
.getOrElse(TextModification.identity(text))
.inverse
private[this] val extractAllUrlsFromTextMod: TextModification => Seq[UrlEntity] =
extractUrls(false)
val extractAllUrls: String => Seq[UrlEntity] =
htmlEncodedTextToEncodeModification.andThen(extractAllUrlsFromTextMod)
private[this] val extractTcoUrls: TextModification => Seq[UrlEntity] =
extractUrls(true)
private[this] def extractUrls(tcoOnly: Boolean): TextModification => Seq[UrlEntity] =
mkEntityExtractor[UrlEntity](
extractor.extractURLsWithIndices(_).asScala.filter { e =>
if (tcoOnly) TcoUrl.isTcoUrl(e.getValue) else true
},
UrlEntity(_, _, _)
)
private[this] val extractMentionsFromTextMod: TextModification => Seq[MentionEntity] =
mkEntityExtractor[MentionEntity](
extractor.extractMentionedScreennamesWithIndices(_).asScala,
MentionEntity(_, _, _)
)
val extractMentions: String => Seq[MentionEntity] =
htmlEncodedTextToEncodeModification.andThen(extractMentionsFromTextMod)
private[this] val extractHashtagsFromTextMod: TextModification => Seq[HashtagEntity] =
mkEntityExtractor[HashtagEntity](
extractor.extractHashtagsWithIndices(_).asScala,
HashtagEntity(_, _, _)
)
val extractHashtags: String => Seq[HashtagEntity] =
htmlEncodedTextToEncodeModification.andThen(extractHashtagsFromTextMod)
private[this] val extractCashtagsFromTextMod: TextModification => Seq[CashtagEntity] =
mkEntityExtractor[CashtagEntity](
extractor.extractCashtagsWithIndices(_).asScala,
CashtagEntity(_, _, _)
)
val extractCashtags: String => Seq[CashtagEntity] =
htmlEncodedTextToEncodeModification.andThen(extractCashtagsFromTextMod)
private[this] def mkEntityExtractor[E: TextEntity](
extract: String => Seq[Extractor.Entity],
construct: (Short, Short, String) => E
): TextModification => Seq[E] =
htmlEncodedMod => {
val convert: Extractor.Entity => Option[E] =
e =>
for {
start <- asShort(e.getStart.intValue)
end <- asShort(e.getEnd.intValue)
if e.getValue != null
res <- htmlEncodedMod.reindexEntity(construct(start, end, e.getValue))
} yield res
val entities = extract(htmlEncodedMod.original)
extractor.modifyIndicesFromUTF16ToUnicode(htmlEncodedMod.original, entities.asJava)
entities.map(convert).flatten
}
private[this] def asShort(i: Int): Option[Short] =
if (i.isValidShort) Some(i.toShort) else None
private[this] def mutation(extractUrls: Boolean): Mutation[Tweet] =
Mutation { tweet =>
val htmlEncodedMod = htmlEncodedTextToEncodeModification(TweetLenses.text.get(tweet))
Some(
tweet.copy(
urls = if (extractUrls) Some(extractTcoUrls(htmlEncodedMod)) else tweet.urls,
mentions = Some(extractMentionsFromTextMod(htmlEncodedMod)),
hashtags = Some(extractHashtagsFromTextMod(htmlEncodedMod)),
cashtags = Some(extractCashtagsFromTextMod(htmlEncodedMod))
)
)
}
val mutationWithoutUrls: Mutation[Tweet] = mutation(false)
val mutationAll: Mutation[Tweet] = mutation(true)
}