the-algorithm/src/java/com/twitter/search/common/relevance/entities/TwitterMessage.java

1268 lines
37 KiB
Java

package com.twitter.search.common.relevance.entities;
import java.text.DateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.ComparisonChain;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang3.builder.EqualsBuilder;
import org.apache.commons.lang3.builder.HashCodeBuilder;
import org.apache.commons.lang3.builder.ToStringBuilder;
import org.apache.lucene.analysis.TokenStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.common.text.language.LocaleUtil;
import com.twitter.common.text.pipeline.TwitterLanguageIdentifier;
import com.twitter.common.text.token.TokenizedCharSequence;
import com.twitter.common_internal.text.version.PenguinVersion;
import com.twitter.cuad.ner.plain.thriftjava.NamedEntity;
import com.twitter.search.common.indexing.thriftjava.ThriftExpandedUrl;
import com.twitter.search.common.relevance.features.TweetFeatures;
import com.twitter.search.common.relevance.features.TweetTextFeatures;
import com.twitter.search.common.relevance.features.TweetTextQuality;
import com.twitter.search.common.relevance.features.TweetUserFeatures;
import com.twitter.search.common.util.text.NormalizerHelper;
import com.twitter.service.spiderduck.gen.MediaTypes;
import com.twitter.tweetypie.thriftjava.ComposerSource;
import com.twitter.util.TwitterDateFormat;
/**
 * A representation of tweets used as an intermediate object during ingestion. As we proceed
 * in ingestion, we fill this object with data. We then convert it to ThriftVersionedEvents (which
 * itself represents a single tweet too, in different penguin versions potentially).
 *
 * <p>Note: {@link #hashCode()} is derived from the tweet id, while {@code equals} is not
 * overridden (identity equality). Use {@link #reflectionApproxEquals} to compare contents.
 */
public class TwitterMessage {
  private static final Logger LOG = LoggerFactory.getLogger(TwitterMessage.class);

  /**
   * An annotation identified by a (groupId, domainId, entityId) triple. Equality, hashing and
   * natural ordering are all defined over these three ids, compared in that order.
   */
  public static class EscherbirdAnnotation implements Comparable<EscherbirdAnnotation> {
    public final long groupId;
    public final long domainId;
    public final long entityId;

    public EscherbirdAnnotation(long groupId, long domainId, long entityId) {
      this.groupId = groupId;
      this.domainId = domainId;
      this.entityId = entityId;
    }

    @Override
    public boolean equals(Object o2) {
      if (o2 instanceof EscherbirdAnnotation) {
        EscherbirdAnnotation a2 = (EscherbirdAnnotation) o2;
        return groupId == a2.groupId && domainId == a2.domainId && entityId == a2.entityId;
      }
      return false;
    }

    @Override
    public int hashCode() {
      return new HashCodeBuilder()
          .append(groupId)
          .append(domainId)
          .append(entityId)
          .toHashCode();
    }

    @Override
    public int compareTo(EscherbirdAnnotation o) {
      return ComparisonChain.start()
          .compare(this.groupId, o.groupId)
          .compare(this.domainId, o.domainId)
          .compare(this.entityId, o.entityId)
          .result();
    }
  }

  private final List<EscherbirdAnnotation> escherbirdAnnotations = Lists.newArrayList();

  // tweet features for multiple penguin versions
  private static class VersionedTweetFeatures {
    // TweetFeatures populated by relevance classifiers, structure defined
    // in src/main/thrift/classifier.thrift.
    private TweetFeatures tweetFeatures = new TweetFeatures();
    private TokenizedCharSequence tokenizedCharSequence = null;
    private final Set<String> normalizedHashtags = Sets.newHashSet();

    public TweetFeatures getTweetFeatures() {
      return this.tweetFeatures;
    }

    public void setTweetFeatures(final TweetFeatures tweetFeatures) {
      this.tweetFeatures = tweetFeatures;
    }

    public TweetTextQuality getTweetTextQuality() {
      return this.tweetFeatures.getTweetTextQuality();
    }

    public TweetTextFeatures getTweetTextFeatures() {
      return this.tweetFeatures.getTweetTextFeatures();
    }

    public TweetUserFeatures getTweetUserFeatures() {
      return this.tweetFeatures.getTweetUserFeatures();
    }

    public TokenizedCharSequence getTokenizedCharSequence() {
      return this.tokenizedCharSequence;
    }

    public void setTokenizedCharSequence(TokenizedCharSequence sequence) {
      this.tokenizedCharSequence = sequence;
    }

    public Set<String> getNormalizedHashtags() {
      return this.normalizedHashtags;
    }

    public void addNormalizedHashtags(String normalizedHashtag) {
      this.normalizedHashtags.add(normalizedHashtag);
    }
  }

  // Sentinel values used by fields of this class to mean "no value has been set".
  public static final int INT_FIELD_NOT_PRESENT = -1;
  public static final long LONG_FIELD_NOT_PRESENT = -1;
  public static final double DOUBLE_FIELD_NOT_PRESENT = -1;

  // Upper bound enforced by setUserReputation().
  public static final int MAX_USER_REPUTATION = 100;

  private final long tweetId;

  private String text;
  private Date date;
  @Nonnull
  private Optional<TwitterMessageUser> optionalFromUser = Optional.empty();
  @Nonnull
  private Optional<TwitterMessageUser> optionalToUser = Optional.empty();
  private Locale locale = null;
  private Locale linkLocale = null;

  // Original source text.
  private String origSource;
  // Source with HTML tags removed and truncated.
  private String strippedSource;

  // Original location text.
  private String origLocation;

  // Location truncated for mysql field-width reasons (see TwitterMessageUtil.java).
  private String truncatedNormalizedLocation;

  // User's country
  private String fromUserLocCountry;

  private Integer followersCount = INT_FIELD_NOT_PRESENT;
  private boolean deleted = false;

  // Fields extracted from entities (in the JSON object)
  private List<TwitterMessageUser> mentions = new ArrayList<>();
  private Set<String> hashtags = Sets.newHashSet();

  // Lat/lon and region accuracy tuples extracted from tweet text, or null.
  private GeoObject geoLocation = null;
  private boolean uncodeableLocation = false;
  // This is set if the tweet is geotagged. (i.e. "geo" or "coordinate" section is present
  // in the json)
  private GeoObject geoTaggedLocation = null;

  private double userReputation = DOUBLE_FIELD_NOT_PRESENT;
  private boolean geocodeRequired = false;
  private boolean sensitiveContent = false;
  private boolean userProtected;
  private boolean userVerified;
  private boolean userBlueVerified;
  private TwitterRetweetMessage retweetMessage;
  private TwitterQuotedMessage quotedMessage;
  private List<String> places;
  // maps from original url (the t.co url) to ThriftExpandedUrl, which contains the
  // expanded url and the spiderduck response (canonicalLastHopUrl and mediatype)
  private final Map<String, ThriftExpandedUrl> expandedUrls;
  // maps the photo status id to the media url
  private Map<Long, String> photoUrls;
  private Optional<Long> inReplyToStatusId = Optional.empty();
  private Optional<Long> directedAtUserId = Optional.empty();

  private long conversationId = -1;

  // True if tweet is nullcasted.
  private boolean nullcast = false;

  // True if tweet is a self-threaded tweet
  private boolean selfThread = false;

  // If the tweet is a part of an exclusive conversation, the author who started
  // that conversation.
  private Optional<Long> exclusiveConversationAuthorId = Optional.empty();

  // tweet features map for multiple versions of penguin
  private Map<PenguinVersion, VersionedTweetFeatures> versionedTweetFeaturesMap;

  // Engagements count: favorites, retweets and replies
  private int numFavorites = 0;
  private int numRetweets = 0;
  private int numReplies = 0;

  // Card information
  private String cardName;
  private String cardDomain;
  private String cardTitle;
  private String cardDescription;
  private String cardLang;
  private String cardUrl;

  private String placeId;
  private String placeFullName;
  private String placeCountryCode;

  private Set<NamedEntity> namedEntities = Sets.newHashSet();

  // Spaces data
  private Set<String> spaceIds = Sets.newHashSet();
  private Set<TwitterMessageUser> spaceAdmins = Sets.newHashSet();
  private String spaceTitle;

  private Optional<ComposerSource> composerSource = Optional.empty();

  private final List<PotentialLocationObject> potentialLocations = Lists.newArrayList();

  // one or two penguin versions supported by this system
  private final List<PenguinVersion> supportedPenguinVersions;

  /**
   * Creates a message for the given tweet id.
   *
   * @param tweetId the id of the tweet; must not be null (it is unboxed).
   * @param supportedPenguinVersions the penguin versions to keep features for; must contain one
   *     or two entries. The list is stored as-is, not copied.
   * @throws IllegalArgumentException if the list is empty or has more than two entries.
   */
  public TwitterMessage(Long tweetId, List<PenguinVersion> supportedPenguinVersions) {
    this.tweetId = tweetId;
    // make sure we support at least one, but no more than two versions of penguin.
    // Validate before the list is used to build the per-version features map.
    Preconditions.checkArgument(supportedPenguinVersions.size() <= 2
        && supportedPenguinVersions.size() > 0);
    this.places = new ArrayList<>();
    this.expandedUrls = new LinkedHashMap<>();
    this.supportedPenguinVersions = supportedPenguinVersions;
    this.versionedTweetFeaturesMap = getVersionedTweetFeaturesMap();
  }

  /**
   * Replace to-user with in-reply-to user if needed.
   */
  public void replaceToUserWithInReplyToUserIfNeeded(
      String inReplyToScreenName, long inReplyToUserId) {
    if (shouldUseReplyUserAsToUser(optionalToUser, inReplyToUserId)) {
      TwitterMessageUser replyUser =
          TwitterMessageUser.createWithNamesAndId(inReplyToScreenName, "", inReplyToUserId);
      optionalToUser = Optional.of(replyUser);
    }
  }

  // To-user could have been inferred from the mention at the position 0.
  // But if there is an explicit in-reply-to user, we might need to use it as to-user instead.
  private static boolean shouldUseReplyUserAsToUser(
      Optional<TwitterMessageUser> currentToUser,
      long inReplyToUserId) {
    if (!currentToUser.isPresent()) {
      // There is no mention in the tweet that qualifies as to-user.
      return true;
    }

    // We already have a mention in the tweet that qualifies as to-user.
    TwitterMessageUser toUser = currentToUser.get();
    if (!toUser.getId().isPresent()) {
      // The to-user from the mention is a stub.
      return true;
    }

    long toUserId = toUser.getId().get();
    if (toUserId != inReplyToUserId) {
      // The to-user from the mention is different than the in-reply-to user,
      // use in-reply-to user instead.
      return true;
    }

    return false;
  }

  /** Returns the user reputation, or DOUBLE_FIELD_NOT_PRESENT if it was never set. */
  public double getUserReputation() {
    return userReputation;
  }

  /**
   * Sets the user reputation. Values above MAX_USER_REPUTATION are logged and clamped to
   * MAX_USER_REPUTATION; no lower bound is enforced.
   *
   * @return this message, to allow chaining.
   */
  public TwitterMessage setUserReputation(double newUserReputation) {
    if (newUserReputation > MAX_USER_REPUTATION) {
      LOG.warn("Out of bounds user reputation {} for status id {}", newUserReputation, tweetId);
      this.userReputation = MAX_USER_REPUTATION;
    } else {
      this.userReputation = newUserReputation;
    }
    return this;
  }

  /** Returns the tweet text, or null if it was never set. */
  public String getText() {
    return text;
  }

  public Optional<TwitterMessageUser> getOptionalToUser() {
    return optionalToUser;
  }

  public void setOptionalToUser(Optional<TwitterMessageUser> optionalToUser) {
    this.optionalToUser = optionalToUser;
  }

  public void setText(String text) {
    this.text = text;
  }

  public Date getDate() {
    return date;
  }

  public void setDate(Date date) {
    this.date = date;
  }

  /**
   * Sets the from-user.
   *
   * @throws NullPointerException if fromUser is null.
   */
  public void setFromUser(@Nonnull TwitterMessageUser fromUser) {
    Preconditions.checkNotNull(fromUser, "Don't set a null fromUser");
    optionalFromUser = Optional.of(fromUser);
  }

  /** Returns the from-user's screen name, or empty if there is no from-user or no screen name. */
  public Optional<String> getFromUserScreenName() {
    return optionalFromUser.flatMap(TwitterMessageUser::getScreenName);
  }

  /**
   * Sets the fromUserScreenName. If a from-user already exists it is replaced by a copy carrying
   * the new screen name; otherwise a new from-user is created from the screen name alone.
   */
  public void setFromUserScreenName(@Nonnull String fromUserScreenName) {
    TwitterMessageUser newFromUser = optionalFromUser.isPresent()
        ? optionalFromUser.get().copyWithScreenName(fromUserScreenName)
        : TwitterMessageUser.createWithScreenName(fromUserScreenName);

    optionalFromUser = Optional.of(newFromUser);
  }

  public Optional<TokenStream> getTokenizedFromUserScreenName() {
    return optionalFromUser.flatMap(TwitterMessageUser::getTokenizedScreenName);
  }

  public Optional<String> getFromUserDisplayName() {
    return optionalFromUser.flatMap(TwitterMessageUser::getDisplayName);
  }

  /**
   * Sets the fromUserDisplayName. If a from-user already exists it is replaced by a copy carrying
   * the new display name; otherwise a new from-user is created from the display name alone.
   */
  public void setFromUserDisplayName(@Nonnull String fromUserDisplayName) {
    TwitterMessageUser newFromUser = optionalFromUser.isPresent()
        ? optionalFromUser.get().copyWithDisplayName(fromUserDisplayName)
        : TwitterMessageUser.createWithDisplayName(fromUserDisplayName);

    optionalFromUser = Optional.of(newFromUser);
  }

  public Optional<Long> getFromUserTwitterId() {
    return optionalFromUser.flatMap(TwitterMessageUser::getId);
  }

  /**
   * Sets the fromUserId. If a from-user already exists it is replaced by a copy carrying the new
   * id; otherwise a new from-user is created from the id alone.
   */
  public void setFromUserId(long fromUserId) {
    TwitterMessageUser newFromUser = optionalFromUser.isPresent()
        ? optionalFromUser.get().copyWithId(fromUserId)
        : TwitterMessageUser.createWithId(fromUserId);

    optionalFromUser = Optional.of(newFromUser);
  }

  /** Returns the conversation id, or -1 if it was never set. */
  public long getConversationId() {
    return conversationId;
  }

  public void setConversationId(long conversationId) {
    this.conversationId = conversationId;
  }

  public boolean isUserProtected() {
    return this.userProtected;
  }

  public void setUserProtected(boolean userProtected) {
    this.userProtected = userProtected;
  }

  public boolean isUserVerified() {
    return this.userVerified;
  }

  public void setUserVerified(boolean userVerified) {
    this.userVerified = userVerified;
  }

  public boolean isUserBlueVerified() {
    return this.userBlueVerified;
  }

  public void setUserBlueVerified(boolean userBlueVerified) {
    this.userBlueVerified = userBlueVerified;
  }

  public void setIsSensitiveContent(boolean isSensitiveContent) {
    this.sensitiveContent = isSensitiveContent;
  }

  public boolean isSensitiveContent() {
    return this.sensitiveContent;
  }

  /** Returns the to-user; same value as {@link #getOptionalToUser()}. */
  public Optional<TwitterMessageUser> getToUserObject() {
    return optionalToUser;
  }

  /**
   * Sets the to-user.
   *
   * @throws NullPointerException if user is null.
   */
  public void setToUserObject(@Nonnull TwitterMessageUser user) {
    Preconditions.checkNotNull(user, "Don't set a null to-user");
    optionalToUser = Optional.of(user);
  }

  public Optional<Long> getToUserTwitterId() {
    return optionalToUser.flatMap(TwitterMessageUser::getId);
  }

  /**
   * Sets toUserId. If a to-user already exists it is replaced by a copy carrying the new id;
   * otherwise a new to-user is created from the id alone.
   */
  public void setToUserTwitterId(long toUserId) {
    TwitterMessageUser newToUser = optionalToUser.isPresent()
        ? optionalToUser.get().copyWithId(toUserId)
        : TwitterMessageUser.createWithId(toUserId);

    optionalToUser = Optional.of(newToUser);
  }

  /**
   * Returns the to-user's screen name in lower case, if present. Lower-casing uses Locale.ROOT so
   * the result does not depend on the JVM's default locale.
   */
  public Optional<String> getToUserLowercasedScreenName() {
    return optionalToUser
        .flatMap(TwitterMessageUser::getScreenName)
        .map(screenName -> screenName.toLowerCase(Locale.ROOT));
  }

  public Optional<String> getToUserScreenName() {
    return optionalToUser.flatMap(TwitterMessageUser::getScreenName);
  }

  /**
   * Sets toUserScreenName. If a to-user already exists it is replaced by a copy carrying the new
   * screen name; otherwise a new to-user is created from the screen name alone.
   *
   * @throws NullPointerException if screenName is null.
   */
  public void setToUserScreenName(@Nonnull String screenName) {
    Preconditions.checkNotNull(screenName, "Don't set a null to-user screenname");

    TwitterMessageUser newToUser = optionalToUser.isPresent()
        ? optionalToUser.get().copyWithScreenName(screenName)
        : TwitterMessageUser.createWithScreenName(screenName);

    optionalToUser = Optional.of(newToUser);
  }

  // to use from TweetEventParseHelper
  public void setDirectedAtUserId(Optional<Long> directedAtUserId) {
    this.directedAtUserId = directedAtUserId;
  }

  @VisibleForTesting
  public Optional<Long> getDirectedAtUserId() {
    return directedAtUserId;
  }

  /**
   * Returns the referenceAuthorId.
   */
  public Optional<Long> getReferenceAuthorId() {
    // The semantics of reference-author-id:
    // - if the tweet is a retweet, it should be the user id of the author of the original tweet
    // - else, if the tweet is directed at a user, it should be the id of the user it's directed at.
    // - else, if the tweet is a reply in a root self-thread, directed-at is not set, so it's
    //   the id of the user who started the self-thread.
    //
    // For definitive info on replies and directed-at, take a look at go/replies. To view these
    // for a certain tweet, use http://go/t.
    //
    // Note that if directed-at is set, reply is always set.
    // If reply is set, directed-at is not necessarily set.
    if (isRetweet() && retweetMessage.hasSharedUserTwitterId()) {
      long retweetedUserId = retweetMessage.getSharedUserTwitterId();
      return Optional.of(retweetedUserId);
    } else if (directedAtUserId.isPresent()) {
      // Why not replace directedAtUserId with reply and make this function depend
      // on the "reply" field of TweetCoreData?
      // Well, verified by counters, it seems for ~1% of tweets, which contain both directed-at
      // and reply, directed-at-user is different than the reply-to-user id. This happens in the
      // following case:
      //
      //       author / reply-to / directed-at
      //  T1   A        -          -
      //  T2   B        A          A
      //  T3   B        B          A
      //
      //  T2 is a reply to T1, T3 is a reply to T2.
      //
      // It's up to us to decide who this tweet is "referencing", but with the current code,
      // we choose that T3 is referencing user A.
      return directedAtUserId;
    } else {
      // This is the case of a root self-thread reply. directed-at is not set.
      Optional<Long> fromUserId = this.getFromUserTwitterId();
      Optional<Long> toUserId = this.getToUserTwitterId();

      if (fromUserId.isPresent() && fromUserId.equals(toUserId)) {
        return fromUserId;
      }
    }
    return Optional.empty();
  }

  public void setNumFavorites(int numFavorites) {
    this.numFavorites = numFavorites;
  }

  public void setNumRetweets(int numRetweets) {
    this.numRetweets = numRetweets;
  }

  public void setNumReplies(int numRepliess) {
    this.numReplies = numRepliess;
  }

  public void addEscherbirdAnnotation(EscherbirdAnnotation annotation) {
    escherbirdAnnotations.add(annotation);
  }

  /** Returns the live, mutable list of annotations (not a copy). */
  public List<EscherbirdAnnotation> getEscherbirdAnnotations() {
    return escherbirdAnnotations;
  }

  /** Returns the live, mutable list of potential locations (not a copy). */
  public List<PotentialLocationObject> getPotentialLocations() {
    return potentialLocations;
  }

  /** Replaces the current potential locations with the contents of the given collection. */
  public void setPotentialLocations(Collection<PotentialLocationObject> potentialLocations) {
    this.potentialLocations.clear();
    this.potentialLocations.addAll(potentialLocations);
  }

  @Override
  public String toString() {
    return ToStringBuilder.reflectionToString(this);
  }

  // Tweet language related getters and setters.

  /**
   * Returns the locale.
   * <p>
   * Note the getLocale() will never return null, this is for the convenience of text related
   * processing in the ingester. If you want the real locale, you need to check isSetLocale()
   * first to see if we really have any information about the locale of this tweet.
   */
  public Locale getLocale() {
    if (locale == null) {
      return TwitterLanguageIdentifier.UNKNOWN;
    } else {
      return locale;
    }
  }

  public void setLocale(Locale locale) {
    this.locale = locale;
  }

  /**
   * Determines if the locale is set.
   */
  public boolean isSetLocale() {
    return locale != null;
  }

  /**
   * Returns the language of the locale. E.g. zh. Returns null if the locale is not set.
   */
  public String getLanguage() {
    if (isSetLocale()) {
      return getLocale().getLanguage();
    } else {
      return null;
    }
  }

  /**
   * Returns the IETF BCP 47 Language Tag of the locale. E.g. zh-CN. Returns null if the locale
   * is not set.
   */
  public String getBCP47LanguageTag() {
    if (isSetLocale()) {
      return getLocale().toLanguageTag();
    } else {
      return null;
    }
  }

  /** Sets the locale from the given language; a null language leaves the locale unchanged. */
  public void setLanguage(String language) {
    if (language != null) {
      locale = LocaleUtil.getLocaleOf(language);
    }
  }

  // Tweet link language related getters and setters.
  public Locale getLinkLocale() {
    return linkLocale;
  }

  public void setLinkLocale(Locale linkLocale) {
    this.linkLocale = linkLocale;
  }

  /**
   * Returns the language of the link locale, or null if the link locale is not set.
   */
  public String getLinkLanguage() {
    if (this.linkLocale == null) {
      return null;
    } else {
      return this.linkLocale.getLanguage();
    }
  }

  public String getOrigSource() {
    return origSource;
  }

  public void setOrigSource(String origSource) {
    this.origSource = origSource;
  }

  public String getStrippedSource() {
    return strippedSource;
  }

  public void setStrippedSource(String strippedSource) {
    this.strippedSource = strippedSource;
  }

  public String getOrigLocation() {
    return origLocation;
  }

  /** Returns the truncated normalized location; same value as getTruncatedNormalizedLocation(). */
  public String getLocation() {
    return truncatedNormalizedLocation;
  }

  public void setOrigLocation(String origLocation) {
    this.origLocation = origLocation;
  }

  public void setTruncatedNormalizedLocation(String truncatedNormalizedLocation) {
    this.truncatedNormalizedLocation = truncatedNormalizedLocation;
  }

  public boolean hasFromUserLocCountry() {
    return fromUserLocCountry != null;
  }

  public String getFromUserLocCountry() {
    return fromUserLocCountry;
  }

  public void setFromUserLocCountry(String fromUserLocCountry) {
    this.fromUserLocCountry = fromUserLocCountry;
  }

  public String getTruncatedNormalizedLocation() {
    return truncatedNormalizedLocation;
  }

  /** Returns the followers count; INT_FIELD_NOT_PRESENT by default, and may be null if set so. */
  public Integer getFollowersCount() {
    return followersCount;
  }

  public void setFollowersCount(Integer followersCount) {
    this.followersCount = followersCount;
  }

  /**
   * Returns true if a followers count has been set. A null count (possible because the setter
   * takes a boxed Integer) is treated as "not present" rather than failing on unboxing.
   */
  public boolean hasFollowersCount() {
    return followersCount != null && followersCount != INT_FIELD_NOT_PRESENT;
  }

  public boolean isDeleted() {
    return deleted;
  }

  public void setDeleted(boolean deleted) {
    this.deleted = deleted;
  }

  /** Returns true if the card name is not blank. */
  public boolean hasCard() {
    return !StringUtils.isBlank(getCardName());
  }

  /** Returns a hash code derived solely from the tweet id. */
  @Override
  public int hashCode() {
    return Long.hashCode(getId());
  }

  /**
   * Parses the given date using the TwitterDateFormat.
   *
   * @return the parsed date, or null if parsing fails for any reason (best-effort by design).
   */
  public static Date parseDate(String date) {
    DateFormat parser = TwitterDateFormat.apply("EEE MMM d HH:mm:ss Z yyyy");
    try {
      return parser.parse(date);
    } catch (Exception e) {
      return null;
    }
  }

  public boolean hasGeoLocation() {
    return geoLocation != null;
  }

  public void setGeoLocation(GeoObject location) {
    this.geoLocation = location;
  }

  public GeoObject getGeoLocation() {
    return geoLocation;
  }

  public String getPlaceId() {
    return placeId;
  }

  public void setPlaceId(String placeId) {
    this.placeId = placeId;
  }

  public String getPlaceFullName() {
    return placeFullName;
  }

  public void setPlaceFullName(String placeFullName) {
    this.placeFullName = placeFullName;
  }

  public String getPlaceCountryCode() {
    return placeCountryCode;
  }

  public void setPlaceCountryCode(String placeCountryCode) {
    this.placeCountryCode = placeCountryCode;
  }

  public void setGeoTaggedLocation(GeoObject geoTaggedLocation) {
    this.geoTaggedLocation = geoTaggedLocation;
  }

  public GeoObject getGeoTaggedLocation() {
    return geoTaggedLocation;
  }

  /** Replaces the geo location with a new GeoObject built from the given coordinates. */
  public void setLatLon(double latitude, double longitude) {
    geoLocation = new GeoObject(latitude, longitude, null);
  }

  /** Returns the latitude of the geo location, or null if no geo location is set. */
  public Double getLatitude() {
    return hasGeoLocation() ? geoLocation.getLatitude() : null;
  }

  /** Returns the longitude of the geo location, or null if no geo location is set. */
  public Double getLongitude() {
    return hasGeoLocation() ? geoLocation.getLongitude() : null;
  }

  public boolean isUncodeableLocation() {
    return uncodeableLocation;
  }

  /** Marks the location as uncodeable. This class offers no way to clear the flag. */
  public void setUncodeableLocation() {
    uncodeableLocation = true;
  }

  /** Marks the message as requiring geocoding. This class offers no way to clear the flag. */
  public void setGeocodeRequired() {
    this.geocodeRequired = true;
  }

  public boolean isGeocodeRequired() {
    return geocodeRequired;
  }

  /** Returns the photo url map, or null if no photo url has been added yet. */
  public Map<Long, String> getPhotoUrls() {
    return photoUrls;
  }

  /**
   * Associates the given mediaUrl with the given photoStatusId. An existing association for the
   * same photoStatusId is kept (the first one wins).
   */
  public void addPhotoUrl(long photoStatusId, String mediaUrl) {
    if (photoUrls == null) {
      photoUrls = new LinkedHashMap<>();
    }
    photoUrls.putIfAbsent(photoStatusId, mediaUrl);
  }

  /** Returns the live map from original url to expanded url info (not a copy). */
  public Map<String, ThriftExpandedUrl> getExpandedUrlMap() {
    return expandedUrls;
  }

  public int getExpandedUrlMapSize() {
    return expandedUrls.size();
  }

  /**
   * Associates the given originalUrl with the given expandedUrl, replacing any previous entry.
   */
  public void addExpandedUrl(String originalUrl, ThriftExpandedUrl expandedUrl) {
    this.expandedUrls.put(originalUrl, expandedUrl);
  }

  /**
   * Replaces urls with resolved ones. For each registered url, the canonical last hop url is
   * preferred, then the expanded url; entries with neither are skipped.
   *
   * @return the text with every occurrence of each original url replaced, or null if the text
   *     has not been set.
   */
  public String getTextReplacedWithResolvedURLs() {
    if (text == null) {
      // Nothing to substitute into; matches what is returned when no urls are registered.
      return null;
    }
    String retText = text;
    for (Map.Entry<String, ThriftExpandedUrl> entry : expandedUrls.entrySet()) {
      ThriftExpandedUrl urlInfo = entry.getValue();
      String resolvedUrl;
      String canonicalLastHopUrl = urlInfo.getCanonicalLastHopUrl();
      String expandedUrl = urlInfo.getExpandedUrl();
      if (canonicalLastHopUrl != null) {
        resolvedUrl = canonicalLastHopUrl;
        LOG.debug("{} has canonical last hop url set", urlInfo);
      } else if (expandedUrl != null) {
        LOG.debug("{} has no canonical last hop url set, using expanded url instead", urlInfo);
        resolvedUrl = expandedUrl;
      } else {
        LOG.debug("{} has no canonical last hop url or expanded url set, skipping", urlInfo);
        continue;
      }
      retText = retText.replace(entry.getKey(), resolvedUrl);
    }
    return retText;
  }

  public long getId() {
    return tweetId;
  }

  public boolean isRetweet() {
    return retweetMessage != null;
  }

  public boolean hasQuote() {
    return quotedMessage != null;
  }

  /** Returns true if a to-user screen name, a to-user id or an in-reply-to status id is set. */
  public boolean isReply() {
    return getToUserScreenName().isPresent()
        || getToUserTwitterId().isPresent()
        || getInReplyToStatusId().isPresent();
  }

  public boolean isReplyToTweet() {
    return getInReplyToStatusId().isPresent();
  }

  public TwitterRetweetMessage getRetweetMessage() {
    return retweetMessage;
  }

  public void setRetweetMessage(TwitterRetweetMessage retweetMessage) {
    this.retweetMessage = retweetMessage;
  }

  public TwitterQuotedMessage getQuotedMessage() {
    return quotedMessage;
  }

  public void setQuotedMessage(TwitterQuotedMessage quotedMessage) {
    this.quotedMessage = quotedMessage;
  }

  public List<String> getPlaces() {
    return places;
  }

  public void addPlace(String place) {
    // Places are used for earlybird serialization
    places.add(place);
  }

  public Optional<Long> getInReplyToStatusId() {
    return inReplyToStatusId;
  }

  /**
   * Sets the in-reply-to status id.
   *
   * @throws IllegalArgumentException if the id is not positive.
   */
  public void setInReplyToStatusId(long inReplyToStatusId) {
    Preconditions.checkArgument(inReplyToStatusId > 0, "In-reply-to status ID should be positive");
    this.inReplyToStatusId = Optional.of(inReplyToStatusId);
  }

  public boolean getNullcast() {
    return nullcast;
  }

  public void setNullcast(boolean nullcast) {
    this.nullcast = nullcast;
  }

  public List<PenguinVersion> getSupportedPenguinVersions() {
    return supportedPenguinVersions;
  }

  // Looks up the features holder for the given version; throws NullPointerException if the
  // version is not one of the versions this message was created with.
  private VersionedTweetFeatures getVersionedTweetFeatures(PenguinVersion penguinVersion) {
    VersionedTweetFeatures versionedTweetFeatures = versionedTweetFeaturesMap.get(penguinVersion);
    return Preconditions.checkNotNull(versionedTweetFeatures);
  }

  public TweetFeatures getTweetFeatures(PenguinVersion penguinVersion) {
    return getVersionedTweetFeatures(penguinVersion).getTweetFeatures();
  }

  @VisibleForTesting
  // only used in Tests
  public void setTweetFeatures(PenguinVersion penguinVersion, TweetFeatures tweetFeatures) {
    getVersionedTweetFeatures(penguinVersion).setTweetFeatures(tweetFeatures);
  }

  public int getTweetSignature(PenguinVersion penguinVersion) {
    return getVersionedTweetFeatures(penguinVersion).getTweetTextFeatures().getSignature();
  }

  public TweetTextQuality getTweetTextQuality(PenguinVersion penguinVersion) {
    return getVersionedTweetFeatures(penguinVersion).getTweetTextQuality();
  }

  public TweetTextFeatures getTweetTextFeatures(PenguinVersion penguinVersion) {
    return getVersionedTweetFeatures(penguinVersion).getTweetTextFeatures();
  }

  public TweetUserFeatures getTweetUserFeatures(PenguinVersion penguinVersion) {
    return getVersionedTweetFeatures(penguinVersion).getTweetUserFeatures();
  }

  public TokenizedCharSequence getTokenizedCharSequence(PenguinVersion penguinVersion) {
    return getVersionedTweetFeatures(penguinVersion).getTokenizedCharSequence();
  }

  public void setTokenizedCharSequence(PenguinVersion penguinVersion,
                                       TokenizedCharSequence sequence) {
    getVersionedTweetFeatures(penguinVersion).setTokenizedCharSequence(sequence);
  }

  // True if the features contain multiple hash tags or multiple trends.
  // This is intended as an anti-trend-spam measure.
  public static boolean hasMultipleHashtagsOrTrends(TweetTextFeatures textFeatures) {
    // Allow at most 1 trend and 2 hashtags.
    return textFeatures.getTrendingTermsSize() > 1 || textFeatures.getHashtagsSize() > 2;
  }

  /**
   * Returns the expanded URLs.
   */
  public Collection<ThriftExpandedUrl> getExpandedUrls() {
    return expandedUrls.values();
  }

  /**
   * Returns the canonical last hop URLs.
   *
   * <p>The value of every registered url is added as-is, so the returned set contains null if
   * any registered url has no canonical last hop url.
   */
  public Set<String> getCanonicalLastHopUrls() {
    Set<String> result = new HashSet<>(expandedUrls.size());
    for (ThriftExpandedUrl url : expandedUrls.values()) {
      result.add(url.getCanonicalLastHopUrl());
    }
    return result;
  }

  public String getCardName() {
    return cardName;
  }

  public void setCardName(String cardName) {
    this.cardName = cardName;
  }

  public String getCardDomain() {
    return cardDomain;
  }

  public void setCardDomain(String cardDomain) {
    this.cardDomain = cardDomain;
  }

  public String getCardTitle() {
    return cardTitle;
  }

  public void setCardTitle(String cardTitle) {
    this.cardTitle = cardTitle;
  }

  public String getCardDescription() {
    return cardDescription;
  }

  public void setCardDescription(String cardDescription) {
    this.cardDescription = cardDescription;
  }

  public String getCardLang() {
    return cardLang;
  }

  public void setCardLang(String cardLang) {
    this.cardLang = cardLang;
  }

  public String getCardUrl() {
    return cardUrl;
  }

  public void setCardUrl(String cardUrl) {
    this.cardUrl = cardUrl;
  }

  public List<TwitterMessageUser> getMentions() {
    return this.mentions;
  }

  public void setMentions(List<TwitterMessageUser> mentions) {
    this.mentions = mentions;
  }

  /**
   * Returns the screen names of the mentions in lower case, as a transformed view of the mentions
   * list (built with Guava's Lists.transform). Lower-casing uses Locale.ROOT so the result does
   * not depend on the JVM's default locale.
   *
   * @throws IllegalStateException when an element without a screen name is transformed.
   */
  public List<String> getLowercasedMentions() {
    return Lists.transform(getMentions(), user -> {
      // This condition is also checked in addUserToMentions().
      Preconditions.checkState(user.getScreenName().isPresent(), "Invalid mention");
      return user.getScreenName().get().toLowerCase(Locale.ROOT);
    });
  }

  public Set<String> getHashtags() {
    return this.hashtags;
  }

  public Set<String> getNormalizedHashtags(PenguinVersion penguinVersion) {
    return getVersionedTweetFeatures(penguinVersion).getNormalizedHashtags();
  }

  public void addNormalizedHashtag(String normalizedHashtag, PenguinVersion penguinVersion) {
    getVersionedTweetFeatures(penguinVersion).addNormalizedHashtags(normalizedHashtag);
  }

  public Optional<ComposerSource> getComposerSource() {
    return composerSource;
  }

  /**
   * Sets the composer source.
   *
   * @throws NullPointerException if composerSource is null.
   */
  public void setComposerSource(ComposerSource composerSource) {
    Preconditions.checkNotNull(composerSource, "composerSource should not be null");
    this.composerSource = Optional.of(composerSource);
  }

  public boolean isSelfThread() {
    return selfThread;
  }

  public void setSelfThread(boolean selfThread) {
    this.selfThread = selfThread;
  }

  /** Returns true if an exclusive conversation author id has been set. */
  public boolean isExclusive() {
    return exclusiveConversationAuthorId.isPresent();
  }

  /**
   * Returns the exclusive conversation author id. Check {@link #isExclusive()} first.
   *
   * @throws java.util.NoSuchElementException if no exclusive conversation author id is set.
   */
  public long getExclusiveConversationAuthorId() {
    return exclusiveConversationAuthorId.get();
  }

  public void setExclusiveConversationAuthorId(long exclusiveConversationAuthorId) {
    this.exclusiveConversationAuthorId = Optional.of(exclusiveConversationAuthorId);
  }

  /**
   * Adds an expanded media url based on the given parameters. Does nothing if either url is
   * blank.
   */
  public void addExpandedMediaUrl(String originalUrl,
                                  String expandedUrl,
                                  @Nullable MediaTypes mediaType) {
    if (!StringUtils.isBlank(originalUrl) && !StringUtils.isBlank(expandedUrl)) {
      ThriftExpandedUrl thriftExpandedUrl = new ThriftExpandedUrl();
      if (mediaType != null) {
        thriftExpandedUrl.setMediaType(mediaType);
      }
      thriftExpandedUrl.setOriginalUrl(originalUrl);
      thriftExpandedUrl.setExpandedUrl(expandedUrl);  // This will be tokenized and indexed
      // Note that the mediaURL is not indexed. We could also index it, but it is not indexed
      // to reduce RAM usage.
      thriftExpandedUrl.setCanonicalLastHopUrl(expandedUrl); // This will be tokenized and indexed
      thriftExpandedUrl.setConsumerMedia(true);
      addExpandedUrl(originalUrl, thriftExpandedUrl);
    }
  }

  /**
   * Adds an expanded non-media url based on the given parameters. Does nothing if the original
   * url is blank; the expanded url is only recorded when it is not blank.
   */
  public void addExpandedNonMediaUrl(String originalUrl, String expandedUrl) {
    if (!StringUtils.isBlank(originalUrl)) {
      ThriftExpandedUrl thriftExpandedUrl = new ThriftExpandedUrl(originalUrl);
      if (!StringUtils.isBlank(expandedUrl)) {
        thriftExpandedUrl.setExpandedUrl(expandedUrl);
      }
      thriftExpandedUrl.setConsumerMedia(false);
      addExpandedUrl(originalUrl, thriftExpandedUrl);
    }
  }

  /**
   * Only used in tests.
   *
   * Simulates resolving compressed URLs, which is usually done by ResolveCompressedUrlsStage.
   */
  @VisibleForTesting
  public void replaceUrlsWithResolvedUrls(Map<String, String> resolvedUrls) {
    for (Map.Entry<String, ThriftExpandedUrl> urlEntry : expandedUrls.entrySet()) {
      String tcoUrl = urlEntry.getKey();
      if (resolvedUrls.containsKey(tcoUrl)) {
        ThriftExpandedUrl expandedUrl = urlEntry.getValue();
        expandedUrl.setCanonicalLastHopUrl(resolvedUrls.get(tcoUrl));
      }
    }
  }

  /**
   * Adds a mention for a user with the given screen name.
   */
  public void addMention(String screenName) {
    TwitterMessageUser user = TwitterMessageUser.createWithScreenName(screenName);
    addUserToMentions(user);
  }

  /**
   * Adds the given user to mentions.
   *
   * @throws IllegalArgumentException if the user has no screen name.
   */
  public void addUserToMentions(TwitterMessageUser user) {
    Preconditions.checkArgument(user.getScreenName().isPresent(), "Don't add invalid mentions");
    this.mentions.add(user);
  }

  /**
   * Adds the given hashtag, and its normalized form for every supported penguin version. The
   * normalization uses the locale returned by getLocale() at the time of the call.
   */
  public void addHashtag(String hashtag) {
    this.hashtags.add(hashtag);
    for (PenguinVersion penguinVersion : supportedPenguinVersions) {
      addNormalizedHashtag(NormalizerHelper.normalize(hashtag, getLocale(), penguinVersion),
          penguinVersion);
    }
  }

  // Builds a map holding one fresh VersionedTweetFeatures per supported penguin version.
  // Reads the field directly because this runs from the constructor.
  private Map<PenguinVersion, VersionedTweetFeatures> getVersionedTweetFeaturesMap() {
    Map<PenguinVersion, VersionedTweetFeatures> versionedMap =
        Maps.newEnumMap(PenguinVersion.class);
    for (PenguinVersion penguinVersion : supportedPenguinVersions) {
      versionedMap.put(penguinVersion, new VersionedTweetFeatures());
    }

    return versionedMap;
  }

  public int getNumFavorites() {
    return numFavorites;
  }

  public int getNumRetweets() {
    return numRetweets;
  }

  public int getNumReplies() {
    return numReplies;
  }

  public Set<NamedEntity> getNamedEntities() {
    return namedEntities;
  }

  public void addNamedEntity(NamedEntity namedEntity) {
    namedEntities.add(namedEntity);
  }

  public Set<String> getSpaceIds() {
    return spaceIds;
  }

  /** Replaces the space ids with a copy of the given set. */
  public void setSpaceIds(Set<String> spaceIds) {
    this.spaceIds = Sets.newHashSet(spaceIds);
  }

  public Set<TwitterMessageUser> getSpaceAdmins() {
    return spaceAdmins;
  }

  public void addSpaceAdmin(TwitterMessageUser admin) {
    spaceAdmins.add(admin);
  }

  public String getSpaceTitle() {
    return spaceTitle;
  }

  public void setSpaceTitle(String spaceTitle) {
    this.spaceTitle = spaceTitle;
  }

  // Order-insensitive comparison of two annotation lists: true if both hold the same
  // annotations with the same multiplicities. The input lists are not modified.
  private static boolean equals(List<EscherbirdAnnotation> l1, List<EscherbirdAnnotation> l2) {
    if (l1.size() != l2.size()) {
      return false;
    }
    // Sort copies so that element order in the original lists does not matter.
    EscherbirdAnnotation[] arr1 = l1.toArray(new EscherbirdAnnotation[0]);
    Arrays.sort(arr1);
    EscherbirdAnnotation[] arr2 = l2.toArray(new EscherbirdAnnotation[0]);
    Arrays.sort(arr2);
    return Arrays.equals(arr1, arr2);
  }

  /**
   * Compares the given messages using reflection and determines if they're approximately equal.
   *
   * <p>All fields are compared reflectively except the versioned tweet features map (not
   * compared at all), the two geo objects (compared with GeoObject.approxEquals), the escherbird
   * annotations (compared ignoring order) and any field named in additionalExcludeFields.
   */
  public static boolean reflectionApproxEquals(
      TwitterMessage a,
      TwitterMessage b,
      List<String> additionalExcludeFields) {
    List<String> excludeFields = Lists.newArrayList(
        "versionedTweetFeaturesMap",
        "geoLocation",
        "geoTaggedLocation",
        "escherbirdAnnotations"
    );
    excludeFields.addAll(additionalExcludeFields);

    return EqualsBuilder.reflectionEquals(a, b, excludeFields)
        && GeoObject.approxEquals(a.getGeoLocation(), b.getGeoLocation())
        && GeoObject.approxEquals(a.getGeoTaggedLocation(), b.getGeoTaggedLocation())
        && equals(a.getEscherbirdAnnotations(), b.getEscherbirdAnnotations());
  }

  /** Same as the three-argument overload with no additional excluded fields. */
  public static boolean reflectionApproxEquals(TwitterMessage a, TwitterMessage b) {
    return reflectionApproxEquals(a, b, Collections.emptyList());
  }
}