119 lines
4.6 KiB
Scala
119 lines
4.6 KiB
Scala
package com.twitter.tweetypie.thriftscala.entities
|
|
|
|
import com.twitter.servo.data.Mutation
|
|
import com.twitter.tco_util.TcoUrl
|
|
import com.twitter.tweetypie.thriftscala._
|
|
import com.twitter.tweetypie.thriftscala.entities.Implicits._
|
|
import com.twitter.tweetypie.tweettext.PartialHtmlEncoding
|
|
import com.twitter.tweetypie.tweettext.TextEntity
|
|
import com.twitter.tweetypie.tweettext.TextModification
|
|
import com.twitter.tweetypie.util.TweetLenses
|
|
import com.twitter.twittertext.Extractor
|
|
import scala.collection.JavaConverters._
|
|
|
|
/**
|
|
* Contains functions to collect urls, mentions, hashtags, and cashtags from the text of tweets and messages
|
|
*/
|
|
object EntityExtractor {
|
|
// We only use one configuration of com.twitter.twittertext.Extractor, so it's
|
|
// OK to share one global reference. The only available
|
|
// configuration option is whether to extract URLs without protocols
|
|
// (defaults to true)
|
|
private[this] val extractor = new Extractor
|
|
|
|
// The twitter-text library operates on unencoded text, but we store
|
|
// and process HTML-encoded text. The TextModification returned
|
|
// from this function contains the decoded text which we will operate on,
|
|
// but also provides us with the ability to map the indices on
|
|
// the twitter-text entities back to the entities on the encoded text.
|
|
private val htmlEncodedTextToEncodeModification: String => TextModification =
|
|
text =>
|
|
PartialHtmlEncoding
|
|
.decodeWithModification(text)
|
|
.getOrElse(TextModification.identity(text))
|
|
.inverse
|
|
|
|
private[this] val extractAllUrlsFromTextMod: TextModification => Seq[UrlEntity] =
|
|
extractUrls(false)
|
|
|
|
val extractAllUrls: String => Seq[UrlEntity] =
|
|
htmlEncodedTextToEncodeModification.andThen(extractAllUrlsFromTextMod)
|
|
|
|
private[this] val extractTcoUrls: TextModification => Seq[UrlEntity] =
|
|
extractUrls(true)
|
|
|
|
private[this] def extractUrls(tcoOnly: Boolean): TextModification => Seq[UrlEntity] =
|
|
mkEntityExtractor[UrlEntity](
|
|
extractor.extractURLsWithIndices(_).asScala.filter { e =>
|
|
if (tcoOnly) TcoUrl.isTcoUrl(e.getValue) else true
|
|
},
|
|
UrlEntity(_, _, _)
|
|
)
|
|
|
|
private[this] val extractMentionsFromTextMod: TextModification => Seq[MentionEntity] =
|
|
mkEntityExtractor[MentionEntity](
|
|
extractor.extractMentionedScreennamesWithIndices(_).asScala,
|
|
MentionEntity(_, _, _)
|
|
)
|
|
|
|
val extractMentions: String => Seq[MentionEntity] =
|
|
htmlEncodedTextToEncodeModification.andThen(extractMentionsFromTextMod)
|
|
|
|
private[this] val extractHashtagsFromTextMod: TextModification => Seq[HashtagEntity] =
|
|
mkEntityExtractor[HashtagEntity](
|
|
extractor.extractHashtagsWithIndices(_).asScala,
|
|
HashtagEntity(_, _, _)
|
|
)
|
|
|
|
val extractHashtags: String => Seq[HashtagEntity] =
|
|
htmlEncodedTextToEncodeModification.andThen(extractHashtagsFromTextMod)
|
|
|
|
private[this] val extractCashtagsFromTextMod: TextModification => Seq[CashtagEntity] =
|
|
mkEntityExtractor[CashtagEntity](
|
|
extractor.extractCashtagsWithIndices(_).asScala,
|
|
CashtagEntity(_, _, _)
|
|
)
|
|
|
|
val extractCashtags: String => Seq[CashtagEntity] =
|
|
htmlEncodedTextToEncodeModification.andThen(extractCashtagsFromTextMod)
|
|
|
|
private[this] def mkEntityExtractor[E: TextEntity](
|
|
extract: String => Seq[Extractor.Entity],
|
|
construct: (Short, Short, String) => E
|
|
): TextModification => Seq[E] =
|
|
htmlEncodedMod => {
|
|
val convert: Extractor.Entity => Option[E] =
|
|
e =>
|
|
for {
|
|
start <- asShort(e.getStart.intValue)
|
|
end <- asShort(e.getEnd.intValue)
|
|
if e.getValue != null
|
|
res <- htmlEncodedMod.reindexEntity(construct(start, end, e.getValue))
|
|
} yield res
|
|
|
|
val entities = extract(htmlEncodedMod.original)
|
|
extractor.modifyIndicesFromUTF16ToUnicode(htmlEncodedMod.original, entities.asJava)
|
|
entities.map(convert).flatten
|
|
}
|
|
|
|
private[this] def asShort(i: Int): Option[Short] =
|
|
if (i.isValidShort) Some(i.toShort) else None
|
|
|
|
private[this] def mutation(extractUrls: Boolean): Mutation[Tweet] =
|
|
Mutation { tweet =>
|
|
val htmlEncodedMod = htmlEncodedTextToEncodeModification(TweetLenses.text.get(tweet))
|
|
|
|
Some(
|
|
tweet.copy(
|
|
urls = if (extractUrls) Some(extractTcoUrls(htmlEncodedMod)) else tweet.urls,
|
|
mentions = Some(extractMentionsFromTextMod(htmlEncodedMod)),
|
|
hashtags = Some(extractHashtagsFromTextMod(htmlEncodedMod)),
|
|
cashtags = Some(extractCashtagsFromTextMod(htmlEncodedMod))
|
|
)
|
|
)
|
|
}
|
|
|
|
val mutationWithoutUrls: Mutation[Tweet] = mutation(false)
|
|
val mutationAll: Mutation[Tweet] = mutation(true)
|
|
}
|