445 lines
17 KiB
Java
445 lines
17 KiB
Java
package com.twitter.search.common.relevance.entities;
|
|
|
|
import java.text.Normalizer;
|
|
import java.util.Locale;
import java.util.Map;
|
|
import java.util.NavigableMap;
|
|
import java.util.Set;
|
|
import java.util.TreeMap;
|
|
import java.util.concurrent.ConcurrentMap;
|
|
|
|
import com.google.common.annotations.VisibleForTesting;
|
|
import com.google.common.base.Preconditions;
|
|
import com.google.common.collect.Maps;
|
|
|
|
import org.apache.commons.lang.StringUtils;
|
|
import org.slf4j.Logger;
|
|
import org.slf4j.LoggerFactory;
|
|
|
|
import com.twitter.common.text.transformer.HTMLTagRemovalTransformer;
|
|
import com.twitter.common_internal.text.extractor.EmojiExtractor;
|
|
import com.twitter.search.common.metrics.SearchRateCounter;
|
|
import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser;
|
|
|
|
public final class TwitterMessageUtil {
|
|
/** Logger shared by all the static helpers of this class. */
private static final Logger LOG = LoggerFactory.getLogger(TwitterMessageUtil.class);

/** Not instantiable: this class only hosts static helpers. */
private TwitterMessageUtil() {
}

/** Stat counters keyed by field; populated by the static initializer of this class. */
@VisibleForTesting
static final ConcurrentMap<Field, Counters> COUNTERS_MAP = Maps.newConcurrentMap();
|
|
// We truncate the location string because we used to use a MySQL table to store the geocoding
|
|
// information. In the MySQL table, the location string was fix width of 30 characters.
|
|
// We have migrated to Manhattan and the location string is no longer limited to 30 character.
|
|
// However, in order to correctly lookup location geocode from Manhattan, we still need to
|
|
// truncate the location just like we did before.
|
|
private static final int MAX_LOCATION_LEN = 30;
|
|
|
|
// Note: we strip tags to index source, as typically source contains <a href=...> tags.
|
|
// Sometimes we get a source where stripping fails, as the URL in the tag was
|
|
// excessively long. We drop these sources, as there is little reason to index them.
|
|
private static final int MAX_SOURCE_LEN = 64;
|
|
|
|
private static HTMLTagRemovalTransformer tagRemovalTransformer = new HTMLTagRemovalTransformer();
|
|
|
|
private static final String STAT_PREFIX = "twitter_message_";
|
|
|
|
/**
 * The message fields that this class strips and/or truncates. Every constant also keys one
 * set of stat counters, whose names end with {@link #getNameForStats()}.
 */
public enum Field {
  FROM_USER_DISPLAY_NAME,
  NORMALIZED_LOCATION,
  ORIG_LOCATION,
  ORIG_SOURCE,
  SHARED_USER_DISPLAY_NAME,
  SOURCE,
  TEXT,
  TO_USER_SCREEN_NAME;

  /**
   * Returns the lower-cased constant name, used as the suffix of this field's stat names.
   *
   * <p>The conversion uses {@link Locale#ROOT} so that the exported stat names do not depend
   * on the JVM's default locale. With a Turkish default locale, for example, a plain
   * {@code toLowerCase()} maps 'I' to a dotless i (U+0131) and would turn
   * {@code ORIG_LOCATION} into a different, non-ASCII stat name.
   *
   * @return the constant name in lower case, e.g. {@code "orig_location"}
   */
  public String getNameForStats() {
    return name().toLowerCase(Locale.ROOT);
  }
}
|
|
|
|
@VisibleForTesting
|
|
static class Counters {
|
|
private final SearchRateCounter truncatedCounter;
|
|
private final SearchRateCounter tweetsWithStrippedSupplementaryCharsCounter;
|
|
private final SearchRateCounter strippedSupplementaryCharsCounter;
|
|
private final SearchRateCounter nonStrippedEmojiCharsCounter;
|
|
private final SearchRateCounter emojisAtTruncateBoundaryCounter;
|
|
|
|
Counters(Field field) {
|
|
String fieldNameForStats = field.getNameForStats();
|
|
truncatedCounter = SearchRateCounter.export(
|
|
STAT_PREFIX + "truncated_" + fieldNameForStats);
|
|
tweetsWithStrippedSupplementaryCharsCounter = SearchRateCounter.export(
|
|
STAT_PREFIX + "tweets_with_stripped_supplementary_chars_" + fieldNameForStats);
|
|
strippedSupplementaryCharsCounter = SearchRateCounter.export(
|
|
STAT_PREFIX + "stripped_supplementary_chars_" + fieldNameForStats);
|
|
nonStrippedEmojiCharsCounter = SearchRateCounter.export(
|
|
STAT_PREFIX + "non_stripped_emoji_chars_" + fieldNameForStats);
|
|
emojisAtTruncateBoundaryCounter = SearchRateCounter.export(
|
|
STAT_PREFIX + "emojis_at_truncate_boundary_" + fieldNameForStats);
|
|
}
|
|
|
|
SearchRateCounter getTruncatedCounter() {
|
|
return truncatedCounter;
|
|
}
|
|
|
|
SearchRateCounter getTweetsWithStrippedSupplementaryCharsCounter() {
|
|
return tweetsWithStrippedSupplementaryCharsCounter;
|
|
}
|
|
|
|
SearchRateCounter getStrippedSupplementaryCharsCounter() {
|
|
return strippedSupplementaryCharsCounter;
|
|
}
|
|
|
|
SearchRateCounter getNonStrippedEmojiCharsCounter() {
|
|
return nonStrippedEmojiCharsCounter;
|
|
}
|
|
|
|
SearchRateCounter getEmojisAtTruncateBoundaryCounter() {
|
|
return emojisAtTruncateBoundaryCounter;
|
|
}
|
|
}
|
|
|
|
// Export the counters of every field up front, so that COUNTERS_MAP has an entry for each
// Field constant before any of the helpers below looks one up.
static {
  for (Field statsField : Field.values()) {
    Counters countersForField = new Counters(statsField);
    COUNTERS_MAP.put(statsField, countersForField);
  }
}
|
|
|
|
// Note: the monorail enforces a limit of 15 characters for screen names,
|
|
// but some users with up to 20 character names were grandfathered-in. To allow
|
|
// those users to be searchable, support up to 20 chars.
|
|
private static final int MAX_SCREEN_NAME_LEN = 20;
|
|
|
|
// Note: we expect the current limit to be 10K. Also, all supplementary unicode characters (with
|
|
// the exception of emojis, maybe) will be removed and not counted as total length. Added alert
|
|
// for text truncation rate as well. SEARCH-9512
|
|
private static final int MAX_TWEET_TEXT_LEN = 10000;
|
|
|
|
@VisibleForTesting
|
|
static final SearchRateCounter FILTERED_NO_STATUS_ID =
|
|
SearchRateCounter.export(STAT_PREFIX + "filtered_no_status_id");
|
|
@VisibleForTesting
|
|
static final SearchRateCounter FILTERED_NO_FROM_USER =
|
|
SearchRateCounter.export(STAT_PREFIX + "filtered_no_from_user");
|
|
@VisibleForTesting
|
|
static final SearchRateCounter FILTERED_LONG_SCREEN_NAME =
|
|
SearchRateCounter.export(STAT_PREFIX + "filtered_long_screen_name");
|
|
@VisibleForTesting
|
|
static final SearchRateCounter FILTERED_NO_TEXT =
|
|
SearchRateCounter.export(STAT_PREFIX + "filtered_no_text");
|
|
@VisibleForTesting
|
|
static final SearchRateCounter FILTERED_NO_DATE =
|
|
SearchRateCounter.export(STAT_PREFIX + "filtered_no_date");
|
|
@VisibleForTesting
|
|
static final SearchRateCounter NULLCAST_TWEET =
|
|
SearchRateCounter.export(STAT_PREFIX + "filter_nullcast_tweet");
|
|
@VisibleForTesting
|
|
static final SearchRateCounter NULLCAST_TWEET_ACCEPTED =
|
|
SearchRateCounter.export(STAT_PREFIX + "nullcast_tweet_accepted");
|
|
@VisibleForTesting
|
|
static final SearchRateCounter INCONSISTENT_TWEET_ID_AND_CREATED_AT =
|
|
SearchRateCounter.export(STAT_PREFIX + "inconsistent_tweet_id_and_created_at_ms");
|
|
|
|
/** Strips the given source from the message with the given ID. */
|
|
private static String stripSource(String source, Long messageId) {
|
|
if (source == null) {
|
|
return null;
|
|
}
|
|
// Always strip emojis from sources: they don't really make sense in this field.
|
|
String strippedSource = stripSupplementaryChars(
|
|
tagRemovalTransformer.transform(source).toString(), Field.SOURCE, true);
|
|
if (strippedSource.length() > MAX_SOURCE_LEN) {
|
|
LOG.warn("Message "
|
|
+ messageId
|
|
+ " contains stripped source that exceeds MAX_SOURCE_LEN. Removing: "
|
|
+ strippedSource);
|
|
COUNTERS_MAP.get(Field.SOURCE).getTruncatedCounter().increment();
|
|
return null;
|
|
}
|
|
return strippedSource;
|
|
}
|
|
|
|
/**
|
|
* Strips and truncates the location of the message with the given ID.
|
|
*
|
|
*/
|
|
private static String stripAndTruncateLocation(String location) {
|
|
// Always strip emojis from locations: they don't really make sense in this field.
|
|
String strippedLocation = stripSupplementaryChars(location, Field.NORMALIZED_LOCATION, true);
|
|
return truncateString(strippedLocation, MAX_LOCATION_LEN, Field.NORMALIZED_LOCATION, true);
|
|
}
|
|
|
|
/**
|
|
* Sets the origSource and strippedSource fields on a TwitterMessage
|
|
*
|
|
*/
|
|
public static void setSourceOnMessage(TwitterMessage message, String modifiedDeviceSource) {
|
|
// Always strip emojis from sources: they don't really make sense in this field.
|
|
message.setOrigSource(stripSupplementaryChars(modifiedDeviceSource, Field.ORIG_SOURCE, true));
|
|
message.setStrippedSource(stripSource(modifiedDeviceSource, message.getId()));
|
|
}
|
|
|
|
/**
|
|
* Sets the origLocation to the stripped location, and sets
|
|
* the truncatedNormalizedLocation to the truncated and normalized location.
|
|
*/
|
|
public static void setAndTruncateLocationOnMessage(
|
|
TwitterMessage message,
|
|
String newOrigLocation) {
|
|
// Always strip emojis from locations: they don't really make sense in this field.
|
|
message.setOrigLocation(stripSupplementaryChars(newOrigLocation, Field.ORIG_LOCATION, true));
|
|
|
|
// Locations in the new locations table require additional normalization. It can also change
|
|
// the length of the string, so we must do this before truncation.
|
|
if (newOrigLocation != null) {
|
|
String normalized =
|
|
Normalizer.normalize(newOrigLocation, Normalizer.Form.NFKC).toLowerCase().trim();
|
|
message.setTruncatedNormalizedLocation(stripAndTruncateLocation(normalized));
|
|
} else {
|
|
message.setTruncatedNormalizedLocation(null);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Validates the given TwitterMessage.
|
|
*
|
|
* @param message The message to validate.
|
|
* @param stripEmojisForFields The set of fields for which emojis should be stripped.
|
|
* @param acceptNullcastMessage Determines if this message should be accepted, if it's a nullcast
|
|
* message.
|
|
* @return {@code true} if the given message is valid; {@code false} otherwise.
|
|
*/
|
|
public static boolean validateTwitterMessage(
|
|
TwitterMessage message,
|
|
Set<Field> stripEmojisForFields,
|
|
boolean acceptNullcastMessage) {
|
|
if (message.getNullcast()) {
|
|
NULLCAST_TWEET.increment();
|
|
if (!acceptNullcastMessage) {
|
|
LOG.info("Dropping nullcasted message " + message.getId());
|
|
return false;
|
|
}
|
|
NULLCAST_TWEET_ACCEPTED.increment();
|
|
}
|
|
|
|
if (!message.getFromUserScreenName().isPresent()
|
|
|| StringUtils.isBlank(message.getFromUserScreenName().get())) {
|
|
LOG.error("Message " + message.getId() + " contains no from user. Skipping.");
|
|
FILTERED_NO_FROM_USER.increment();
|
|
return false;
|
|
}
|
|
String fromUserScreenName = message.getFromUserScreenName().get();
|
|
|
|
if (fromUserScreenName.length() > MAX_SCREEN_NAME_LEN) {
|
|
LOG.warn("Message " + message.getId() + " has a user screen name longer than "
|
|
+ MAX_SCREEN_NAME_LEN + " characters: " + message.getFromUserScreenName()
|
|
+ ". Skipping.");
|
|
FILTERED_LONG_SCREEN_NAME.increment();
|
|
return false;
|
|
}
|
|
|
|
// Remove supplementary characters and truncate these text fields.
|
|
if (message.getFromUserDisplayName().isPresent()) {
|
|
message.setFromUserDisplayName(stripSupplementaryChars(
|
|
message.getFromUserDisplayName().get(),
|
|
Field.FROM_USER_DISPLAY_NAME,
|
|
stripEmojisForFields.contains(Field.FROM_USER_DISPLAY_NAME)));
|
|
}
|
|
if (message.getToUserScreenName().isPresent()) {
|
|
String strippedToUserScreenName = stripSupplementaryChars(
|
|
message.getToUserLowercasedScreenName().get(),
|
|
Field.TO_USER_SCREEN_NAME,
|
|
stripEmojisForFields.contains(Field.TO_USER_SCREEN_NAME));
|
|
message.setToUserScreenName(
|
|
truncateString(
|
|
strippedToUserScreenName,
|
|
MAX_SCREEN_NAME_LEN,
|
|
Field.TO_USER_SCREEN_NAME,
|
|
stripEmojisForFields.contains(Field.TO_USER_SCREEN_NAME)));
|
|
}
|
|
|
|
String strippedText = stripSupplementaryChars(
|
|
message.getText(),
|
|
Field.TEXT,
|
|
stripEmojisForFields.contains(Field.TEXT));
|
|
message.setText(truncateString(
|
|
strippedText,
|
|
MAX_TWEET_TEXT_LEN,
|
|
Field.TEXT,
|
|
stripEmojisForFields.contains(Field.TEXT)));
|
|
|
|
if (StringUtils.isBlank(message.getText())) {
|
|
FILTERED_NO_TEXT.increment();
|
|
return false;
|
|
}
|
|
|
|
if (message.getDate() == null) {
|
|
LOG.error("Message " + message.getId() + " contains no date. Skipping.");
|
|
FILTERED_NO_DATE.increment();
|
|
return false;
|
|
}
|
|
|
|
if (message.isRetweet()) {
|
|
return validateRetweetMessage(message.getRetweetMessage(), stripEmojisForFields);
|
|
}
|
|
|
|
// Track if both the snowflake ID and created at timestamp are consistent.
|
|
if (!SnowflakeIdParser.isTweetIDAndCreatedAtConsistent(message.getId(), message.getDate())) {
|
|
LOG.error("Found inconsistent tweet ID and created at timestamp: [messageID="
|
|
+ message.getId() + "], [messageDate=" + message.getDate() + "].");
|
|
INCONSISTENT_TWEET_ID_AND_CREATED_AT.increment();
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
private static boolean validateRetweetMessage(
|
|
TwitterRetweetMessage message, Set<Field> stripEmojisForFields) {
|
|
if (message.getSharedId() == null || message.getRetweetId() == null) {
|
|
LOG.error("Retweet Message contains a null twitter id. Skipping.");
|
|
FILTERED_NO_STATUS_ID.increment();
|
|
return false;
|
|
}
|
|
|
|
if (message.getSharedDate() == null) {
|
|
LOG.error("Retweet Message " + message.getRetweetId() + " contains no date. Skipping.");
|
|
return false;
|
|
}
|
|
|
|
// Remove supplementary characters from these text fields.
|
|
message.setSharedUserDisplayName(stripSupplementaryChars(
|
|
message.getSharedUserDisplayName(),
|
|
Field.SHARED_USER_DISPLAY_NAME,
|
|
stripEmojisForFields.contains(Field.SHARED_USER_DISPLAY_NAME)));
|
|
|
|
return true;
|
|
}
|
|
|
|
/**
|
|
* Strips non indexable chars from the text.
|
|
*
|
|
* Returns the resulting string, which may be the same object as the text argument when
|
|
* no stripping or truncation is necessary.
|
|
*
|
|
* Non-indexed characters are "supplementary unicode" that are not emojis. Note that
|
|
* supplementary unicode are still characters that seem worth indexing, as many characters
|
|
* in CJK languages are supplementary. However this would make the size of our index
|
|
* explode (~186k supplementary characters exist), so it's not feasible.
|
|
*
|
|
* @param text The text to strip
|
|
* @param field The field this text is from
|
|
* @param stripSupplementaryEmojis Whether or not to strip supplementary emojis. Note that this
|
|
* parameter name isn't 100% accurate. This parameter is meant to replicate behavior prior to
|
|
* adding support for *not* stripping supplementary emojis. The prior behavior would turn an
|
|
* emoji such as a keycap "1\uFE0F\u20E3" (http://www.iemoji.com/view/emoji/295/symbols/keycap-1)
|
|
* into just '1'. So the keycap emoji is not completely stripped, only the portion after the '1'.
|
|
*
|
|
*/
|
|
@VisibleForTesting
|
|
public static String stripSupplementaryChars(
|
|
String text,
|
|
Field field,
|
|
boolean stripSupplementaryEmojis) {
|
|
if (text == null || text.isEmpty()) {
|
|
return text;
|
|
}
|
|
|
|
// Initialize an empty map so that if we choose not to strip emojis,
|
|
// then no emojipositions will be found and we don't need a null
|
|
// check before checking if an emoji is at a certain spot.
|
|
NavigableMap<Integer, Integer> emojiPositions = new TreeMap<>();
|
|
|
|
if (!stripSupplementaryEmojis) {
|
|
emojiPositions = EmojiExtractor.getEmojiPositions(text);
|
|
}
|
|
|
|
StringBuilder strippedTextBuilder = new StringBuilder();
|
|
int sequenceStart = 0;
|
|
int i = 0;
|
|
while (i < text.length()) {
|
|
if (Character.isSupplementaryCodePoint(text.codePointAt(i))) {
|
|
// Check if this supplementary character is an emoji
|
|
if (!emojiPositions.containsKey(i)) {
|
|
// It's not an emoji, or we want to strip emojis, so strip it
|
|
|
|
// text[i] and text[i + 1] are part of a supplementary code point.
|
|
strippedTextBuilder.append(text.substring(sequenceStart, i));
|
|
sequenceStart = i + 2; // skip 2 chars
|
|
i = sequenceStart;
|
|
COUNTERS_MAP.get(field).getStrippedSupplementaryCharsCounter().increment();
|
|
} else {
|
|
// It's an emoji, keep it
|
|
i += emojiPositions.get(i);
|
|
COUNTERS_MAP.get(field).getNonStrippedEmojiCharsCounter().increment();
|
|
}
|
|
} else {
|
|
++i;
|
|
}
|
|
}
|
|
if (sequenceStart < text.length()) {
|
|
strippedTextBuilder.append(text.substring(sequenceStart));
|
|
}
|
|
|
|
String strippedText = strippedTextBuilder.toString();
|
|
if (strippedText.length() < text.length()) {
|
|
COUNTERS_MAP.get(field).getTweetsWithStrippedSupplementaryCharsCounter().increment();
|
|
}
|
|
return strippedText;
|
|
}
|
|
|
|
/**
|
|
* Truncates the given string to the given length.
|
|
*
|
|
* Note that we are truncating based on the # of UTF-16 characters a given emoji takes up.
|
|
* So if a single emoji takes up 4 UTF-16 characters, that counts as 4 for the truncation,
|
|
* not just 1.
|
|
*
|
|
* @param text The text to truncate
|
|
* @param maxLength The maximum length of the string after truncation
|
|
* @param field The field from which this string cames
|
|
* @param splitEmojisAtMaxLength If true, don't worry about emojis and just truncate at maxLength,
|
|
* potentially splitting them. If false, truncate before the emoji if truncating at maxLength
|
|
* would cause the emoji to be split.
|
|
*/
|
|
@VisibleForTesting
|
|
static String truncateString(
|
|
String text,
|
|
int maxLength,
|
|
Field field,
|
|
boolean splitEmojisAtMaxLength) {
|
|
Preconditions.checkArgument(maxLength > 0);
|
|
|
|
if ((text == null) || (text.length() <= maxLength)) {
|
|
return text;
|
|
}
|
|
|
|
int truncatePoint = maxLength;
|
|
NavigableMap<Integer, Integer> emojiPositions;
|
|
// If we want to consider emojis we should not strip on an emoji boundary.
|
|
if (!splitEmojisAtMaxLength) {
|
|
emojiPositions = EmojiExtractor.getEmojiPositions(text);
|
|
|
|
// Get the last emoji before maxlength.
|
|
Map.Entry<Integer, Integer> lastEmojiBeforeMaxLengthEntry =
|
|
emojiPositions.lowerEntry(maxLength);
|
|
|
|
if (lastEmojiBeforeMaxLengthEntry != null) {
|
|
int lowerEmojiEnd = lastEmojiBeforeMaxLengthEntry.getKey()
|
|
+ lastEmojiBeforeMaxLengthEntry.getValue();
|
|
|
|
// If the last emoji would be truncated, truncate before the last emoji.
|
|
if (lowerEmojiEnd > truncatePoint) {
|
|
truncatePoint = lastEmojiBeforeMaxLengthEntry.getKey();
|
|
COUNTERS_MAP.get(field).getEmojisAtTruncateBoundaryCounter().increment();
|
|
}
|
|
}
|
|
}
|
|
|
|
COUNTERS_MAP.get(field).getTruncatedCounter().increment();
|
|
return text.substring(0, truncatePoint);
|
|
}
|
|
}
|