package com.twitter.search.common.relevance.entities; import java.text.DateFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.Date; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Optional; import java.util.Set; import javax.annotation.Nonnull; import javax.annotation.Nullable; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; import com.google.common.collect.ComparisonChain; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import com.google.common.collect.Sets; import org.apache.commons.lang.StringUtils; import org.apache.commons.lang3.builder.EqualsBuilder; import org.apache.commons.lang3.builder.HashCodeBuilder; import org.apache.commons.lang3.builder.ToStringBuilder; import org.apache.lucene.analysis.TokenStream; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.twitter.common.text.language.LocaleUtil; import com.twitter.common.text.pipeline.TwitterLanguageIdentifier; import com.twitter.common.text.token.TokenizedCharSequence; import com.twitter.common_internal.text.version.PenguinVersion; import com.twitter.cuad.ner.plain.thriftjava.NamedEntity; import com.twitter.search.common.indexing.thriftjava.ThriftExpandedUrl; import com.twitter.search.common.relevance.features.TweetFeatures; import com.twitter.search.common.relevance.features.TweetTextFeatures; import com.twitter.search.common.relevance.features.TweetTextQuality; import com.twitter.search.common.relevance.features.TweetUserFeatures; import com.twitter.search.common.util.text.NormalizerHelper; import com.twitter.service.spiderduck.gen.MediaTypes; import com.twitter.tweetypie.thriftjava.ComposerSource; import com.twitter.util.TwitterDateFormat; /** * A representation of tweets used as an intermediate object during ingestion. As we proceed * in ingestion, we fill this object with data. We then convert it to ThriftVersionedEvents (which * itself represents a single tweet too, in different penguin versions potentially). */ public class TwitterMessage { private static final Logger LOG = LoggerFactory.getLogger(TwitterMessage.class); public static class EscherbirdAnnotation implements Comparable { public final long groupId; public final long domainId; public final long entityId; public EscherbirdAnnotation(long groupId, long domainId, long entityId) { this.groupId = groupId; this.domainId = domainId; this.entityId = entityId; } @Override public boolean equals(Object o2) { if (o2 instanceof EscherbirdAnnotation) { EscherbirdAnnotation a2 = (EscherbirdAnnotation) o2; return groupId == a2.groupId && domainId == a2.domainId && entityId == a2.entityId; } return false; } @Override public int hashCode() { return new HashCodeBuilder() .append(groupId) .append(domainId) .append(entityId) .toHashCode(); } @Override public int compareTo(EscherbirdAnnotation o) { return ComparisonChain.start() .compare(this.groupId, o.groupId) .compare(this.domainId, o.domainId) .compare(this.entityId, o.entityId) .result(); } } private final List escherbirdAnnotations = Lists.newArrayList(); // tweet features for multiple penguin versions private static class VersionedTweetFeatures { // TweetFeatures populated by relevance classifiers, structure defined // in src/main/thrift/classifier.thrift. private TweetFeatures tweetFeatures = new TweetFeatures(); private TokenizedCharSequence tokenizedCharSequence = null; private Set normalizedHashtags = Sets.newHashSet(); public TweetFeatures getTweetFeatures() { return this.tweetFeatures; } public void setTweetFeatures(final TweetFeatures tweetFeatures) { this.tweetFeatures = tweetFeatures; } public TweetTextQuality getTweetTextQuality() { return this.tweetFeatures.getTweetTextQuality(); } public TweetTextFeatures getTweetTextFeatures() { return this.tweetFeatures.getTweetTextFeatures(); } public TweetUserFeatures getTweetUserFeatures() { return this.tweetFeatures.getTweetUserFeatures(); } public TokenizedCharSequence getTokenizedCharSequence() { return this.tokenizedCharSequence; } public void setTokenizedCharSequence(TokenizedCharSequence sequence) { this.tokenizedCharSequence = sequence; } public Set getNormalizedHashtags() { return this.normalizedHashtags; } public void addNormalizedHashtags(String normalizedHashtag) { this.normalizedHashtags.add(normalizedHashtag); } } public static final int INT_FIELD_NOT_PRESENT = -1; public static final long LONG_FIELD_NOT_PRESENT = -1; public static final double DOUBLE_FIELD_NOT_PRESENT = -1; public static final int MAX_USER_REPUTATION = 100; private final long tweetId; private String text; private Date date; @Nonnull private Optional optionalFromUser = Optional.empty(); @Nonnull private Optional optionalToUser = Optional.empty(); private Locale locale = null; private Locale linkLocale = null; // Original source text. private String origSource; // Source with HTML tags removed and truncated. private String strippedSource; // Original location text. private String origLocation; // Location truncated for mysql field-width reasons (see TwitterMessageUtil.java). private String truncatedNormalizedLocation; // User's country private String fromUserLocCountry; private Integer followersCount = INT_FIELD_NOT_PRESENT; private boolean deleted = false; // Fields extracted from entities (in the JSON object) private List mentions = new ArrayList<>(); private Set hashtags = Sets.newHashSet(); // Lat/lon and region accuracy tuples extracted from tweet text, or null. private GeoObject geoLocation = null; private boolean uncodeableLocation = false; // This is set if the tweet is geotagged. (i.e. "geo" or "coordinate" section is present // in the json) // This field has only a getter but no setter --- it is filled in when the json is parsed. private GeoObject geoTaggedLocation = null; private double userReputation = DOUBLE_FIELD_NOT_PRESENT; private boolean geocodeRequired = false; private boolean sensitiveContent = false; private boolean userProtected; private boolean userVerified; private boolean userBlueVerified; private TwitterRetweetMessage retweetMessage; private TwitterQuotedMessage quotedMessage; private List places; // maps from original url (the t.co url) to ThriftExpandedUrl, which contains the // expanded url and the spiderduck response (canoicalLastHopUrl and mediatype) private final Map expandedUrls; // maps the photo status id to the media url private Map photoUrls; private Optional inReplyToStatusId = Optional.empty(); private Optional directedAtUserId = Optional.empty(); private long conversationId = -1; // True if tweet is nullcasted. private boolean nullcast = false; // True if tweet is a self-threaded tweet private boolean selfThread = false; // If the tweet is a part of an exclusive conversation, the author who started // that conversation. private Optional exclusiveConversationAuthorId = Optional.empty(); // tweet features map for multiple versions of penguin private Map versionedTweetFeaturesMap; // Engagments count: favorites, retweets and replies private int numFavorites = 0; private int numRetweets = 0; private int numReplies = 0; // Card information private String cardName; private String cardDomain; private String cardTitle; private String cardDescription; private String cardLang; private String cardUrl; private String placeId; private String placeFullName; private String placeCountryCode; private Set namedEntities = Sets.newHashSet(); // Spaces data private Set spaceIds = Sets.newHashSet(); private Set spaceAdmins = Sets.newHashSet(); private String spaceTitle; private Optional composerSource = Optional.empty(); private final List potentialLocations = Lists.newArrayList(); // one or two penguin versions supported by this system private final List supportedPenguinVersions; public TwitterMessage(Long tweetId, List supportedPenguinVersions) { this.tweetId = tweetId; this.places = new ArrayList<>(); this.expandedUrls = new LinkedHashMap<>(); // make sure we support at least one, but no more than two versions of penguin this.supportedPenguinVersions = supportedPenguinVersions; this.versionedTweetFeaturesMap = getVersionedTweetFeaturesMap(); Preconditions.checkArgument(this.supportedPenguinVersions.size() <= 2 && this.supportedPenguinVersions.size() > 0); } /** * Replace to-user with in-reply-to user if needed. */ public void replaceToUserWithInReplyToUserIfNeeded( String inReplyToScreenName, long inReplyToUserId) { if (shouldUseReplyUserAsToUser(optionalToUser, inReplyToUserId)) { TwitterMessageUser replyUser = TwitterMessageUser.createWithNamesAndId(inReplyToScreenName, "", inReplyToUserId); optionalToUser = Optional.of(replyUser); } } // To-user could have been inferred from the mention at the position 0. // But if there is an explicit in-reply-to user, we might need to use it as to-user instead. private static boolean shouldUseReplyUserAsToUser( Optional currentToUser, long inReplyToUserId) { if (!currentToUser.isPresent()) { // There is no mention in the tweet that qualifies as to-user. return true; } // We already have a mention in the tweet that qualifies as to-user. TwitterMessageUser toUser = currentToUser.get(); if (!toUser.getId().isPresent()) { // The to-user from the mention is a stub. return true; } long toUserId = toUser.getId().get(); if (toUserId != inReplyToUserId) { // The to-user from the mention is different that the in-reply-to user, // use in-reply-to user instead. return true; } return false; } public double getUserReputation() { return userReputation; } /** * Sets the user reputation. */ public TwitterMessage setUserReputation(double newUserReputation) { if (newUserReputation > MAX_USER_REPUTATION) { LOG.warn("Out of bounds user reputation {} for status id {}", newUserReputation, tweetId); this.userReputation = (float) MAX_USER_REPUTATION; } else { this.userReputation = newUserReputation; } return this; } public String getText() { return text; } public Optional getOptionalToUser() { return optionalToUser; } public void setOptionalToUser(Optional optionalToUser) { this.optionalToUser = optionalToUser; } public void setText(String text) { this.text = text; } public Date getDate() { return date; } public void setDate(Date date) { this.date = date; } public void setFromUser(@Nonnull TwitterMessageUser fromUser) { Preconditions.checkNotNull(fromUser, "Don't set a null fromUser"); optionalFromUser = Optional.of(fromUser); } public Optional getFromUserScreenName() { return optionalFromUser.isPresent() ? optionalFromUser.get().getScreenName() : Optional.empty(); } /** * Sets the fromUserScreenName. */ public void setFromUserScreenName(@Nonnull String fromUserScreenName) { TwitterMessageUser newFromUser = optionalFromUser.isPresent() ? optionalFromUser.get().copyWithScreenName(fromUserScreenName) : TwitterMessageUser.createWithScreenName(fromUserScreenName); optionalFromUser = Optional.of(newFromUser); } public Optional getTokenizedFromUserScreenName() { return optionalFromUser.flatMap(TwitterMessageUser::getTokenizedScreenName); } public Optional getFromUserDisplayName() { return optionalFromUser.flatMap(TwitterMessageUser::getDisplayName); } /** * Sets the fromUserDisplayName. */ public void setFromUserDisplayName(@Nonnull String fromUserDisplayName) { TwitterMessageUser newFromUser = optionalFromUser.isPresent() ? optionalFromUser.get().copyWithDisplayName(fromUserDisplayName) : TwitterMessageUser.createWithDisplayName(fromUserDisplayName); optionalFromUser = Optional.of(newFromUser); } public Optional getFromUserTwitterId() { return optionalFromUser.flatMap(TwitterMessageUser::getId); } /** * Sets the fromUserId. */ public void setFromUserId(long fromUserId) { TwitterMessageUser newFromUser = optionalFromUser.isPresent() ? optionalFromUser.get().copyWithId(fromUserId) : TwitterMessageUser.createWithId(fromUserId); optionalFromUser = Optional.of(newFromUser); } public long getConversationId() { return conversationId; } public void setConversationId(long conversationId) { this.conversationId = conversationId; } public boolean isUserProtected() { return this.userProtected; } public void setUserProtected(boolean userProtected) { this.userProtected = userProtected; } public boolean isUserVerified() { return this.userVerified; } public void setUserVerified(boolean userVerified) { this.userVerified = userVerified; } public boolean isUserBlueVerified() { return this.userBlueVerified; } public void setUserBlueVerified(boolean userBlueVerified) { this.userBlueVerified = userBlueVerified; } public void setIsSensitiveContent(boolean isSensitiveContent) { this.sensitiveContent = isSensitiveContent; } public boolean isSensitiveContent() { return this.sensitiveContent; } public Optional getToUserObject() { return optionalToUser; } public void setToUserObject(@Nonnull TwitterMessageUser user) { Preconditions.checkNotNull(user, "Don't set a null to-user"); optionalToUser = Optional.of(user); } public Optional getToUserTwitterId() { return optionalToUser.flatMap(TwitterMessageUser::getId); } /** * Sets toUserId. */ public void setToUserTwitterId(long toUserId) { TwitterMessageUser newToUser = optionalToUser.isPresent() ? optionalToUser.get().copyWithId(toUserId) : TwitterMessageUser.createWithId(toUserId); optionalToUser = Optional.of(newToUser); } public Optional getToUserLowercasedScreenName() { return optionalToUser.flatMap(TwitterMessageUser::getScreenName).map(String::toLowerCase); } public Optional getToUserScreenName() { return optionalToUser.flatMap(TwitterMessageUser::getScreenName); } /** * Sets toUserScreenName. */ public void setToUserScreenName(@Nonnull String screenName) { Preconditions.checkNotNull(screenName, "Don't set a null to-user screenname"); TwitterMessageUser newToUser = optionalToUser.isPresent() ? optionalToUser.get().copyWithScreenName(screenName) : TwitterMessageUser.createWithScreenName(screenName); optionalToUser = Optional.of(newToUser); } // to use from TweetEventParseHelper public void setDirectedAtUserId(Optional directedAtUserId) { this.directedAtUserId = directedAtUserId; } @VisibleForTesting public Optional getDirectedAtUserId() { return directedAtUserId; } /** * Returns the referenceAuthorId. */ public Optional getReferenceAuthorId() { // The semantics of reference-author-id: // - if the tweet is a retweet, it should be the user id of the author of the original tweet // - else, if the tweet is directed at a user, it should be the id of the user it's directed at. // - else, if the tweet is a reply in a root self-thread, directed-at is not set, so it's // the id of the user who started the self-thread. // // For definitive info on replies and directed-at, take a look at go/replies. To view these // for a certain tweet, use http://go/t. // // Note that if directed-at is set, reply is always set. // If reply is set, directed-at is not necessarily set. if (isRetweet() && retweetMessage.hasSharedUserTwitterId()) { long retweetedUserId = retweetMessage.getSharedUserTwitterId(); return Optional.of(retweetedUserId); } else if (directedAtUserId.isPresent()) { // Why not replace directedAtUserId with reply and make this function depend // on the "reply" field of TweetCoreData? // Well, verified by counters, it seems for ~1% of tweets, which contain both directed-at // and reply, directed-at-user is different than the reply-to-user id. This happens in the // following case: // // author / reply-to / directed-at // T1 A - - // T2 B A A // T3 B B A // // T2 is a reply to T1, T3 is a reply to T2. // // It's up to us to decide who this tweet is "referencing", but with the current code, // we choose that T3 is referencing user A. return directedAtUserId; } else { // This is the case of a root self-thread reply. directed-at is not set. Optional fromUserId = this.getFromUserTwitterId(); Optional toUserId = this.getToUserTwitterId(); if (fromUserId.isPresent() && fromUserId.equals(toUserId)) { return fromUserId; } } return Optional.empty(); } public void setNumFavorites(int numFavorites) { this.numFavorites = numFavorites; } public void setNumRetweets(int numRetweets) { this.numRetweets = numRetweets; } public void setNumReplies(int numRepliess) { this.numReplies = numRepliess; } public void addEscherbirdAnnotation(EscherbirdAnnotation annotation) { escherbirdAnnotations.add(annotation); } public List getEscherbirdAnnotations() { return escherbirdAnnotations; } public List getPotentialLocations() { return potentialLocations; } public void setPotentialLocations(Collection potentialLocations) { this.potentialLocations.clear(); this.potentialLocations.addAll(potentialLocations); } @Override public String toString() { return ToStringBuilder.reflectionToString(this); } // Tweet language related getters and setters. /** * Returns the locale. *

* Note the getLocale() will never return null, this is for the convenience of text related * processing in the ingester. If you want the real locale, you need to check isSetLocale() * first to see if we really have any information about the locale of this tweet. */ public Locale getLocale() { if (locale == null) { return TwitterLanguageIdentifier.UNKNOWN; } else { return locale; } } public void setLocale(Locale locale) { this.locale = locale; } /** * Determines if the locate is set. */ public boolean isSetLocale() { return locale != null; } /** * Returns the language of the locale. E.g. zh */ public String getLanguage() { if (isSetLocale()) { return getLocale().getLanguage(); } else { return null; } } /** * Returns the IETF BCP 47 Language Tag of the locale. E.g. zh-CN */ public String getBCP47LanguageTag() { if (isSetLocale()) { return getLocale().toLanguageTag(); } else { return null; } } public void setLanguage(String language) { if (language != null) { locale = LocaleUtil.getLocaleOf(language); } } // Tweet link language related getters and setters. public Locale getLinkLocale() { return linkLocale; } public void setLinkLocale(Locale linkLocale) { this.linkLocale = linkLocale; } /** * Returns the language of the link locale. */ public String getLinkLanguage() { if (this.linkLocale == null) { return null; } else { return this.linkLocale.getLanguage(); } } public String getOrigSource() { return origSource; } public void setOrigSource(String origSource) { this.origSource = origSource; } public String getStrippedSource() { return strippedSource; } public void setStrippedSource(String strippedSource) { this.strippedSource = strippedSource; } public String getOrigLocation() { return origLocation; } public String getLocation() { return truncatedNormalizedLocation; } public void setOrigLocation(String origLocation) { this.origLocation = origLocation; } public void setTruncatedNormalizedLocation(String truncatedNormalizedLocation) { this.truncatedNormalizedLocation = truncatedNormalizedLocation; } public boolean hasFromUserLocCountry() { return fromUserLocCountry != null; } public String getFromUserLocCountry() { return fromUserLocCountry; } public void setFromUserLocCountry(String fromUserLocCountry) { this.fromUserLocCountry = fromUserLocCountry; } public String getTruncatedNormalizedLocation() { return truncatedNormalizedLocation; } public Integer getFollowersCount() { return followersCount; } public void setFollowersCount(Integer followersCount) { this.followersCount = followersCount; } public boolean hasFollowersCount() { return followersCount != INT_FIELD_NOT_PRESENT; } public boolean isDeleted() { return deleted; } public void setDeleted(boolean deleted) { this.deleted = deleted; } public boolean hasCard() { return !StringUtils.isBlank(getCardName()); } @Override public int hashCode() { return ((Long) getId()).hashCode(); } /** * Parses the given date using the TwitterDateFormat. */ public static Date parseDate(String date) { DateFormat parser = TwitterDateFormat.apply("EEE MMM d HH:mm:ss Z yyyy"); try { return parser.parse(date); } catch (Exception e) { return null; } } public boolean hasGeoLocation() { return geoLocation != null; } public void setGeoLocation(GeoObject location) { this.geoLocation = location; } public GeoObject getGeoLocation() { return geoLocation; } public String getPlaceId() { return placeId; } public void setPlaceId(String placeId) { this.placeId = placeId; } public String getPlaceFullName() { return placeFullName; } public void setPlaceFullName(String placeFullName) { this.placeFullName = placeFullName; } public String getPlaceCountryCode() { return placeCountryCode; } public void setPlaceCountryCode(String placeCountryCode) { this.placeCountryCode = placeCountryCode; } public void setGeoTaggedLocation(GeoObject geoTaggedLocation) { this.geoTaggedLocation = geoTaggedLocation; } public GeoObject getGeoTaggedLocation() { return geoTaggedLocation; } public void setLatLon(double latitude, double longitude) { geoLocation = new GeoObject(latitude, longitude, null); } public Double getLatitude() { return hasGeoLocation() ? geoLocation.getLatitude() : null; } public Double getLongitude() { return hasGeoLocation() ? geoLocation.getLongitude() : null; } public boolean isUncodeableLocation() { return uncodeableLocation; } public void setUncodeableLocation() { uncodeableLocation = true; } public void setGeocodeRequired() { this.geocodeRequired = true; } public boolean isGeocodeRequired() { return geocodeRequired; } public Map getPhotoUrls() { return photoUrls; } /** * Associates the given mediaUrl with the given photoStatusId. */ public void addPhotoUrl(long photoStatusId, String mediaUrl) { if (photoUrls == null) { photoUrls = new LinkedHashMap<>(); } photoUrls.putIfAbsent(photoStatusId, mediaUrl); } public Map getExpandedUrlMap() { return expandedUrls; } public int getExpandedUrlMapSize() { return expandedUrls.size(); } /** * Associates the given originalUrl with the given expanderUrl. */ public void addExpandedUrl(String originalUrl, ThriftExpandedUrl expandedUrl) { this.expandedUrls.put(originalUrl, expandedUrl); } /** * Replaces urls with resolved ones. */ public String getTextReplacedWithResolvedURLs() { String retText = text; for (Map.Entry entry : expandedUrls.entrySet()) { ThriftExpandedUrl urlInfo = entry.getValue(); String resolvedUrl; String canonicalLastHopUrl = urlInfo.getCanonicalLastHopUrl(); String expandedUrl = urlInfo.getExpandedUrl(); if (canonicalLastHopUrl != null) { resolvedUrl = canonicalLastHopUrl; LOG.debug("{} has canonical last hop url set", urlInfo); } else if (expandedUrl != null) { LOG.debug("{} has no canonical last hop url set, using expanded url instead", urlInfo); resolvedUrl = expandedUrl; } else { LOG.debug("{} has no canonical last hop url or expanded url set, skipping", urlInfo); continue; } retText = retText.replace(entry.getKey(), resolvedUrl); } return retText; } public long getId() { return tweetId; } public boolean isRetweet() { return retweetMessage != null; } public boolean hasQuote() { return quotedMessage != null; } public boolean isReply() { return getToUserScreenName().isPresent() || getToUserTwitterId().isPresent() || getInReplyToStatusId().isPresent(); } public boolean isReplyToTweet() { return getInReplyToStatusId().isPresent(); } public TwitterRetweetMessage getRetweetMessage() { return retweetMessage; } public void setRetweetMessage(TwitterRetweetMessage retweetMessage) { this.retweetMessage = retweetMessage; } public TwitterQuotedMessage getQuotedMessage() { return quotedMessage; } public void setQuotedMessage(TwitterQuotedMessage quotedMessage) { this.quotedMessage = quotedMessage; } public List getPlaces() { return places; } public void addPlace(String place) { // Places are used for earlybird serialization places.add(place); } public Optional getInReplyToStatusId() { return inReplyToStatusId; } public void setInReplyToStatusId(long inReplyToStatusId) { Preconditions.checkArgument(inReplyToStatusId > 0, "In-reply-to status ID should be positive"); this.inReplyToStatusId = Optional.of(inReplyToStatusId); } public boolean getNullcast() { return nullcast; } public void setNullcast(boolean nullcast) { this.nullcast = nullcast; } public List getSupportedPenguinVersions() { return supportedPenguinVersions; } private VersionedTweetFeatures getVersionedTweetFeatures(PenguinVersion penguinVersion) { VersionedTweetFeatures versionedTweetFeatures = versionedTweetFeaturesMap.get(penguinVersion); return Preconditions.checkNotNull(versionedTweetFeatures); } public TweetFeatures getTweetFeatures(PenguinVersion penguinVersion) { return getVersionedTweetFeatures(penguinVersion).getTweetFeatures(); } @VisibleForTesting // only used in Tests public void setTweetFeatures(PenguinVersion penguinVersion, TweetFeatures tweetFeatures) { versionedTweetFeaturesMap.get(penguinVersion).setTweetFeatures(tweetFeatures); } public int getTweetSignature(PenguinVersion penguinVersion) { return getVersionedTweetFeatures(penguinVersion).getTweetTextFeatures().getSignature(); } public TweetTextQuality getTweetTextQuality(PenguinVersion penguinVersion) { return getVersionedTweetFeatures(penguinVersion).getTweetTextQuality(); } public TweetTextFeatures getTweetTextFeatures(PenguinVersion penguinVersion) { return getVersionedTweetFeatures(penguinVersion).getTweetTextFeatures(); } public TweetUserFeatures getTweetUserFeatures(PenguinVersion penguinVersion) { return getVersionedTweetFeatures(penguinVersion).getTweetUserFeatures(); } public TokenizedCharSequence getTokenizedCharSequence(PenguinVersion penguinVersion) { return getVersionedTweetFeatures(penguinVersion).getTokenizedCharSequence(); } public void setTokenizedCharSequence(PenguinVersion penguinVersion, TokenizedCharSequence sequence) { getVersionedTweetFeatures(penguinVersion).setTokenizedCharSequence(sequence); } // True if the features contain multiple hash tags or multiple trends. // This is intended as an anti-trend-spam measure. public static boolean hasMultipleHashtagsOrTrends(TweetTextFeatures textFeatures) { // Allow at most 1 trend and 2 hashtags. return textFeatures.getTrendingTermsSize() > 1 || textFeatures.getHashtagsSize() > 2; } /** * Returns the expanded URLs. */ public Collection getExpandedUrls() { return expandedUrls.values(); } /** * Returns the canonical last hop URLs. */ public Set getCanonicalLastHopUrls() { Set result = new HashSet<>(expandedUrls.size()); for (ThriftExpandedUrl url : expandedUrls.values()) { result.add(url.getCanonicalLastHopUrl()); } return result; } public String getCardName() { return cardName; } public void setCardName(String cardName) { this.cardName = cardName; } public String getCardDomain() { return cardDomain; } public void setCardDomain(String cardDomain) { this.cardDomain = cardDomain; } public String getCardTitle() { return cardTitle; } public void setCardTitle(String cardTitle) { this.cardTitle = cardTitle; } public String getCardDescription() { return cardDescription; } public void setCardDescription(String cardDescription) { this.cardDescription = cardDescription; } public String getCardLang() { return cardLang; } public void setCardLang(String cardLang) { this.cardLang = cardLang; } public String getCardUrl() { return cardUrl; } public void setCardUrl(String cardUrl) { this.cardUrl = cardUrl; } public List getMentions() { return this.mentions; } public void setMentions(List mentions) { this.mentions = mentions; } public List getLowercasedMentions() { return Lists.transform(getMentions(), user -> { // This condition is also checked in addUserToMentions(). Preconditions.checkState(user.getScreenName().isPresent(), "Invalid mention"); return user.getScreenName().get().toLowerCase(); }); } public Set getHashtags() { return this.hashtags; } public Set getNormalizedHashtags(PenguinVersion penguinVersion) { return getVersionedTweetFeatures(penguinVersion).getNormalizedHashtags(); } public void addNormalizedHashtag(String normalizedHashtag, PenguinVersion penguinVersion) { getVersionedTweetFeatures(penguinVersion).addNormalizedHashtags(normalizedHashtag); } public Optional getComposerSource() { return composerSource; } public void setComposerSource(ComposerSource composerSource) { Preconditions.checkNotNull(composerSource, "composerSource should not be null"); this.composerSource = Optional.of(composerSource); } public boolean isSelfThread() { return selfThread; } public void setSelfThread(boolean selfThread) { this.selfThread = selfThread; } public boolean isExclusive() { return exclusiveConversationAuthorId.isPresent(); } public long getExclusiveConversationAuthorId() { return exclusiveConversationAuthorId.get(); } public void setExclusiveConversationAuthorId(long exclusiveConversationAuthorId) { this.exclusiveConversationAuthorId = Optional.of(exclusiveConversationAuthorId); } /** * Adds an expanded media url based on the given parameters. */ public void addExpandedMediaUrl(String originalUrl, String expandedUrl, @Nullable MediaTypes mediaType) { if (!StringUtils.isBlank(originalUrl) && !StringUtils.isBlank(expandedUrl)) { ThriftExpandedUrl thriftExpandedUrl = new ThriftExpandedUrl(); if (mediaType != null) { thriftExpandedUrl.setMediaType(mediaType); } thriftExpandedUrl.setOriginalUrl(originalUrl); thriftExpandedUrl.setExpandedUrl(expandedUrl); // This will be tokenized and indexed // Note that the mediaURL is not indexed. We could also index it, but it is not indexed // to reduce RAM usage. thriftExpandedUrl.setCanonicalLastHopUrl(expandedUrl); // This will be tokenized and indexed addExpandedUrl(originalUrl, thriftExpandedUrl); thriftExpandedUrl.setConsumerMedia(true); } } /** * Adds an expanded non-media url based on the given parameters. */ public void addExpandedNonMediaUrl(String originalUrl, String expandedUrl) { if (!StringUtils.isBlank(originalUrl)) { ThriftExpandedUrl thriftExpandedUrl = new ThriftExpandedUrl(originalUrl); if (!StringUtils.isBlank(expandedUrl)) { thriftExpandedUrl.setExpandedUrl(expandedUrl); } addExpandedUrl(originalUrl, thriftExpandedUrl); thriftExpandedUrl.setConsumerMedia(false); } } /** * Only used in tests. * * Simulates resolving compressed URLs, which is usually done by ResolveCompressedUrlsStage. */ @VisibleForTesting public void replaceUrlsWithResolvedUrls(Map resolvedUrls) { for (Map.Entry urlEntry : expandedUrls.entrySet()) { String tcoUrl = urlEntry.getKey(); if (resolvedUrls.containsKey(tcoUrl)) { ThriftExpandedUrl expandedUrl = urlEntry.getValue(); expandedUrl.setCanonicalLastHopUrl(resolvedUrls.get(tcoUrl)); } } } /** * Adds a mention for a user with the given screen name. */ public void addMention(String screenName) { TwitterMessageUser user = TwitterMessageUser.createWithScreenName(screenName); addUserToMentions(user); } /** * Adds the given user to mentions. */ public void addUserToMentions(TwitterMessageUser user) { Preconditions.checkArgument(user.getScreenName().isPresent(), "Don't add invalid mentions"); this.mentions.add(user); } /** * Adds the given hashtag. */ public void addHashtag(String hashtag) { this.hashtags.add(hashtag); for (PenguinVersion penguinVersion : supportedPenguinVersions) { addNormalizedHashtag(NormalizerHelper.normalize(hashtag, getLocale(), penguinVersion), penguinVersion); } } private Map getVersionedTweetFeaturesMap() { Map versionedMap = Maps.newEnumMap(PenguinVersion.class); for (PenguinVersion penguinVersion : getSupportedPenguinVersions()) { versionedMap.put(penguinVersion, new VersionedTweetFeatures()); } return versionedMap; } public int getNumFavorites() { return numFavorites; } public int getNumRetweets() { return numRetweets; } public int getNumReplies() { return numReplies; } public Set getNamedEntities() { return namedEntities; } public void addNamedEntity(NamedEntity namedEntity) { namedEntities.add(namedEntity); } public Set getSpaceIds() { return spaceIds; } public void setSpaceIds(Set spaceIds) { this.spaceIds = Sets.newHashSet(spaceIds); } public Set getSpaceAdmins() { return spaceAdmins; } public void addSpaceAdmin(TwitterMessageUser admin) { spaceAdmins.add(admin); } public String getSpaceTitle() { return spaceTitle; } public void setSpaceTitle(String spaceTitle) { this.spaceTitle = spaceTitle; } private static boolean equals(List l1, List l2) { EscherbirdAnnotation[] arr1 = l1.toArray(new EscherbirdAnnotation[l1.size()]); Arrays.sort(arr1); EscherbirdAnnotation[] arr2 = l1.toArray(new EscherbirdAnnotation[l2.size()]); Arrays.sort(arr2); return Arrays.equals(arr1, arr2); } /** * Compares the given messages using reflection and determines if they're approximately equal. */ public static boolean reflectionApproxEquals( TwitterMessage a, TwitterMessage b, List additionalExcludeFields) { List excludeFields = Lists.newArrayList( "versionedTweetFeaturesMap", "geoLocation", "geoTaggedLocation", "escherbirdAnnotations" ); excludeFields.addAll(additionalExcludeFields); return EqualsBuilder.reflectionEquals(a, b, excludeFields) && GeoObject.approxEquals(a.getGeoLocation(), b.getGeoLocation()) && GeoObject.approxEquals(a.getGeoTaggedLocation(), b.getGeoTaggedLocation()) && equals(a.getEscherbirdAnnotations(), b.getEscherbirdAnnotations()); } public static boolean reflectionApproxEquals(TwitterMessage a, TwitterMessage b) { return reflectionApproxEquals(a, b, Collections.emptyList()); } }