the-algorithm/src/java/com/twitter/search/common/converter/earlybird/BasicIndexingConverter.java

648 lines
28 KiB
Java

package com.twitter.search.common.converter.earlybird;
import java.io.IOException;
import java.util.Date;
import java.util.List;
import java.util.Optional;
import javax.annotation.concurrent.NotThreadSafe;
import com.google.common.base.Preconditions;
import org.apache.commons.collections.CollectionUtils;
import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.common_internal.text.version.PenguinVersion;
import com.twitter.search.common.converter.earlybird.EncodedFeatureBuilder.TweetFeatureWithEncodeFeatures;
import com.twitter.search.common.indexing.thriftjava.Place;
import com.twitter.search.common.indexing.thriftjava.PotentialLocation;
import com.twitter.search.common.indexing.thriftjava.ProfileGeoEnrichment;
import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents;
import com.twitter.search.common.indexing.thriftjava.VersionedTweetFeatures;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser;
import com.twitter.search.common.relevance.entities.GeoObject;
import com.twitter.search.common.relevance.entities.TwitterMessage;
import com.twitter.search.common.relevance.entities.TwitterQuotedMessage;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.schema.base.Schema;
import com.twitter.search.common.schema.earlybird.EarlybirdCluster;
import com.twitter.search.common.schema.earlybird.EarlybirdEncodedFeatures;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
import com.twitter.search.common.schema.earlybird.EarlybirdThriftDocumentBuilder;
import com.twitter.search.common.schema.thriftjava.ThriftDocument;
import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent;
import com.twitter.search.common.schema.thriftjava.ThriftIndexingEventType;
import com.twitter.search.common.util.spatial.GeoUtil;
import com.twitter.search.common.util.text.NormalizerHelper;
import com.twitter.tweetypie.thriftjava.ComposerSource;
/**
* Converts a TwitterMessage into a ThriftVersionedEvents. This is only responsible for data that
* is available immediately when a Tweet is created. Some data, like URL data, isn't available
* immediately, and so it is processed later, in the DelayedIndexingConverter and sent as an
* update. In order to achieve this we create the document in 2 passes:
*
* 1. BasicIndexingConverter builds thriftVersionedEvents with the fields that do not require
* external services.
*
* 2. DelayedIndexingConverter builds all the document fields depending on external services, once
* those services have processed the relevant Tweet and we have retrieved that data.
*/
@NotThreadSafe
public class BasicIndexingConverter {
// Logger used for the warnings and errors emitted during conversion.
private static final Logger LOG = LoggerFactory.getLogger(BasicIndexingConverter.class);
// Incremented in buildBasicFields when the encoded features have IS_NULLCAST_FLAG set.
private static final SearchCounter NUM_NULLCAST_FEATURE_FLAG_SET_TWEETS =
SearchCounter.export("num_nullcast_feature_flag_set_tweets");
// Incremented in buildBasicFields when TwitterMessage.getNullcast() is true.
private static final SearchCounter NUM_NULLCAST_TWEETS =
SearchCounter.export("num_nullcast_tweets");
// Incremented in buildBasicFields when TwitterMessage.getNullcast() is false.
private static final SearchCounter NUM_NON_NULLCAST_TWEETS =
SearchCounter.export("num_non_nullcast_tweets");
// Incremented in fixCreatedAtTimeStampIfNecessary whenever the created-at time is replaced by
// the timestamp extracted from the tweet ID.
private static final SearchCounter ADJUSTED_BAD_CREATED_AT_COUNTER =
SearchCounter.export("adjusted_incorrect_created_at_timestamp");
// Incremented in fixCreatedAtTimeStampIfNecessary when the tweet ID and the created-at time are
// reported as inconsistent by SnowflakeIdParser.
private static final SearchCounter INCONSISTENT_TWEET_ID_AND_CREATED_AT_MS =
SearchCounter.export("inconsistent_tweet_id_and_created_at_ms");
// Incremented in buildBasicFields when TwitterMessage.isSelfThread() is true.
private static final SearchCounter NUM_SELF_THREAD_TWEETS =
SearchCounter.export("num_self_thread_tweets");
// Incremented in buildBasicFields when TwitterMessage.isExclusive() is true.
private static final SearchCounter NUM_EXCLUSIVE_TWEETS =
SearchCounter.export("num_exclusive_tweets");
// If a tweet carries a timestamp earlier than this threshold, we consider the timestamp invalid,
// because Twitter did not exist before Sun, 01 Jan 2006 00:00:00 GMT.
private static final long VALID_CREATION_TIME_THRESHOLD_MILLIS =
new DateTime(2006, 1, 1, 0, 0, 0, DateTimeZone.UTC).getMillis();
// Produces the encoded and versioned tweet features for each message / penguin version pair.
private final EncodedFeatureBuilder featureBuilder;
// Source of the schema snapshot taken at the start of every conversion.
private final Schema schema;
// Cluster the documents are built for; passed to buildBasicFields and
// buildNormalizedMinEngagementFields, which branch on it.
private final EarlybirdCluster cluster;
/**
 * Creates a converter for the given schema and cluster, with its own EncodedFeatureBuilder.
 *
 * @param schema schema from which a snapshot is taken for every converted message.
 * @param cluster Earlybird cluster the produced documents are built for.
 */
public BasicIndexingConverter(Schema schema, EarlybirdCluster cluster) {
  this.schema = schema;
  this.cluster = cluster;
  this.featureBuilder = new EncodedFeatureBuilder();
}
/**
 * Converts a TwitterMessage to ThriftVersionedEvents, which is a generic data structure that can
 * be consumed by Earlybird directly. One INSERT indexing event is produced per requested penguin
 * version, all built against the same schema snapshot.
 *
 * @param message the tweet to convert; must not be null.
 * @param strict forwarded to the retweet/reply field building, where it enforces that a reply to
 *     a tweet also carries the replied-to user.
 * @param penguinVersions the penguin versions to build a document for; must not be null.
 * @return the versioned events, keyed by the byte value of each penguin version.
 * @throws IOException if building one of the documents fails.
 * @throws NullPointerException if message or penguinVersions is null.
 */
public ThriftVersionedEvents convertMessageToThrift(
    TwitterMessage message,
    boolean strict,
    List<PenguinVersion> penguinVersions) throws IOException {
  Preconditions.checkNotNull(message);
  Preconditions.checkNotNull(penguinVersions);

  ThriftVersionedEvents versionedEvents = new ThriftVersionedEvents()
      .setId(message.getId());

  // A single snapshot is shared by all penguin versions so they see the same schema.
  ImmutableSchemaInterface schemaSnapshot = schema.getSchemaSnapshot();

  for (PenguinVersion penguinVersion : penguinVersions) {
    ThriftDocument document =
        buildDocumentForPenguinVersion(schemaSnapshot, message, strict, penguinVersion);

    ThriftIndexingEvent thriftIndexingEvent = new ThriftIndexingEvent()
        .setDocument(document)
        .setEventType(ThriftIndexingEventType.INSERT)
        .setSortId(message.getId());
    // The uid is only set when the author ID is known. ifPresent (rather than map) makes it
    // explicit that the call is made for its side effect only.
    message.getFromUserTwitterId().ifPresent(thriftIndexingEvent::setUid);

    versionedEvents.putToVersionedEvents(penguinVersion.getByteValue(), thriftIndexingEvent);
  }

  return versionedEvents;
}
/**
 * Builds the ThriftDocument of a tweet for a single penguin version: the tweet features are
 * computed first, then each group of fields is added to the document builder in turn.
 */
private ThriftDocument buildDocumentForPenguinVersion(
    ImmutableSchemaInterface schemaSnapshot,
    TwitterMessage message,
    boolean strict,
    PenguinVersion penguinVersion) throws IOException {
  TweetFeatureWithEncodeFeatures features =
      featureBuilder.createTweetFeaturesFromTwitterMessage(
          message, penguinVersion, schemaSnapshot);

  EarlybirdThriftDocumentBuilder docBuilder =
      buildBasicFields(message, schemaSnapshot, cluster, features);

  // Fields derived from the message and the versioned (per penguin version) features.
  buildUserFields(docBuilder, message, features.versionedFeatures, penguinVersion);
  buildGeoFields(docBuilder, message, features.versionedFeatures);
  buildRetweetAndReplyFields(docBuilder, message, strict);
  buildQuotesFields(docBuilder, message);
  buildVersionedFeatureFields(docBuilder, features.versionedFeatures);
  buildAnnotationFields(docBuilder, message);

  // Fields derived from the encoded features.
  buildNormalizedMinEngagementFields(docBuilder, features.encodedFeatures, cluster);

  buildDirectedAtFields(docBuilder, message);
  docBuilder.withSpaceIdFields(message.getSpaceIds());

  return docBuilder.build();
}
/**
 * Creates the document builder for a tweet and fills in its basic fields: ID, created-at time,
 * tweet signature, conversation ID, composer source, filter terms derived from the encoded
 * feature flags and from the message (nullcast, self-thread, exclusive), and language codes.
 *
 * @param message the tweet being converted.
 * @param schemaSnapshot the schema snapshot the document is built against.
 * @param cluster the cluster the document is built for.
 * @param tweetFeature the encoded, extended encoded and versioned features of the tweet.
 * @return a builder holding the basic fields, ready for further fields to be added.
 */
public static EarlybirdThriftDocumentBuilder buildBasicFields(
    TwitterMessage message,
    ImmutableSchemaInterface schemaSnapshot,
    EarlybirdCluster cluster,
    TweetFeatureWithEncodeFeatures tweetFeature) {
  final long tweetId = message.getId();
  final EarlybirdEncodedFeatures encodedFeatures = tweetFeature.encodedFeatures;

  // When no extended features were supplied, clusters for which isTwitterMemoryFormatCluster()
  // holds get a freshly created extended features object.
  EarlybirdEncodedFeatures extendedFeatures = tweetFeature.extendedEncodedFeatures;
  if (extendedFeatures == null && EarlybirdCluster.isTwitterMemoryFormatCluster(cluster)) {
    extendedFeatures = EarlybirdEncodedFeatures.newEncodedTweetFeatures(
        schemaSnapshot, EarlybirdFieldConstant.EXTENDED_ENCODED_TWEET_FEATURES_FIELD);
  }

  EarlybirdThriftDocumentBuilder builder = new EarlybirdThriftDocumentBuilder(
      encodedFeatures,
      extendedFeatures,
      new EarlybirdFieldConstants(),
      schemaSnapshot);

  builder.withID(tweetId);

  // A missing date is treated as 0, which lets the fix-up below try to recover the creation
  // time from the tweet ID. The created-at field is stored in seconds.
  final Date messageDate = message.getDate();
  final long rawCreatedAtMs = (messageDate != null) ? messageDate.getTime() : 0L;
  final long fixedCreatedAtMs = fixCreatedAtTimeStampIfNecessary(tweetId, rawCreatedAtMs);
  if (fixedCreatedAtMs > 0L) {
    builder.withCreatedAt((int) (fixedCreatedAtMs / 1000));
  }

  builder.withTweetSignature(tweetFeature.versionedFeatures.getTweetSignature());

  final long conversationId = message.getConversationId();
  if (conversationId > 0) {
    builder.withLongField(
        EarlybirdFieldConstant.CONVERSATION_ID_CSF.getFieldName(), conversationId);
    // We only index conversation ID when it is different from the tweet ID.
    if (tweetId != conversationId) {
      builder.withLongField(
          EarlybirdFieldConstant.CONVERSATION_ID_FIELD.getFieldName(), conversationId);
    }
  }

  message.getComposerSource().ifPresent(source -> {
    builder.withIntField(
        EarlybirdFieldConstant.COMPOSER_SOURCE.getFieldName(), source.getValue());
    if (source == ComposerSource.CAMERA) {
      builder.withCameraComposerSourceFlag();
    }
  });

  // Terms and flags driven by the encoded feature flags.
  if (encodedFeatures.isFlagSet(EarlybirdFieldConstant.FROM_VERIFIED_ACCOUNT_FLAG)) {
    builder.addFilterInternalFieldTerm(EarlybirdFieldConstant.VERIFIED_FILTER_TERM);
  }
  if (encodedFeatures.isFlagSet(EarlybirdFieldConstant.FROM_BLUE_VERIFIED_ACCOUNT_FLAG)) {
    builder.addFilterInternalFieldTerm(EarlybirdFieldConstant.BLUE_VERIFIED_FILTER_TERM);
  }
  if (encodedFeatures.isFlagSet(EarlybirdFieldConstant.IS_OFFENSIVE_FLAG)) {
    builder.withOffensiveFlag();
  }

  // Nullcast is tracked twice: once from the message (which also adds the filter term) and
  // once from the encoded feature flag (counter only).
  if (!message.getNullcast()) {
    NUM_NON_NULLCAST_TWEETS.increment();
  } else {
    NUM_NULLCAST_TWEETS.increment();
    builder.addFilterInternalFieldTerm(EarlybirdFieldConstant.NULLCAST_FILTER_TERM);
  }
  if (encodedFeatures.isFlagSet(EarlybirdFieldConstant.IS_NULLCAST_FLAG)) {
    NUM_NULLCAST_FEATURE_FLAG_SET_TWEETS.increment();
  }

  if (message.isSelfThread()) {
    builder.addFilterInternalFieldTerm(EarlybirdFieldConstant.SELF_THREAD_FILTER_TERM);
    NUM_SELF_THREAD_TWEETS.increment();
  }

  if (message.isExclusive()) {
    builder.addFilterInternalFieldTerm(EarlybirdFieldConstant.EXCLUSIVE_FILTER_TERM);
    builder.withLongField(
        EarlybirdFieldConstant.EXCLUSIVE_CONVERSATION_AUTHOR_ID_CSF.getFieldName(),
        message.getExclusiveConversationAuthorId());
    NUM_EXCLUSIVE_TWEETS.increment();
  }

  builder.withLanguageCodes(message.getLanguage(), message.getBCP47LanguageTag());
  return builder;
}
/**
 * Builds the user fields: the author's ID and screen name, the lowercased screen name of the
 * user the tweet is addressed to, and the tokenized author display name.
 *
 * @param builder the document builder the fields are added to.
 * @param message the tweet being converted.
 * @param versionedTweetFeatures the versioned features holding the display name token stream.
 * @param penguinVersion the penguin version used to normalize screen names.
 */
public static void buildUserFields(
    EarlybirdThriftDocumentBuilder builder,
    TwitterMessage message,
    VersionedTweetFeatures versionedTweetFeatures,
    PenguinVersion penguinVersion) {
  // Author ID, written both as an indexed field and as a CSF. A missing author ID is only
  // logged; the rest of the user fields are still built.
  Optional<Long> fromUserId = message.getFromUserTwitterId();
  if (fromUserId.isPresent()) {
    long authorId = fromUserId.get();
    builder
        .withLongField(EarlybirdFieldConstant.FROM_USER_ID_FIELD.getFieldName(), authorId)
        .withLongField(EarlybirdFieldConstant.FROM_USER_ID_CSF.getFieldName(), authorId);
  } else {
    // Parameterized logging: the message is only formatted if WARN is enabled.
    LOG.warn("fromUserTwitterId is not set in TwitterMessage! Status id: {}", message.getId());
  }

  // Author screen name: normalized form for the whitespace-tokenized and exact-match fields,
  // plus the camel-case tokenization when the message carries one.
  Optional<String> fromUserScreenName = message.getFromUserScreenName();
  if (fromUserScreenName.isPresent()) {
    String fromUser = fromUserScreenName.get();
    String normalizedFromUser =
        NormalizerHelper.normalizeWithUnknownLocale(fromUser, penguinVersion);

    builder
        .withWhiteSpaceTokenizedScreenNameField(
            EarlybirdFieldConstant.TOKENIZED_FROM_USER_FIELD.getFieldName(),
            normalizedFromUser)
        .withStringField(EarlybirdFieldConstant.FROM_USER_FIELD.getFieldName(),
            normalizedFromUser);

    if (message.getTokenizedFromUserScreenName().isPresent()) {
      builder.withCamelCaseTokenizedScreenNameField(
          EarlybirdFieldConstant.CAMELCASE_USER_HANDLE_FIELD.getFieldName(),
          fromUser,
          normalizedFromUser,
          message.getTokenizedFromUserScreenName().get());
    }
  }

  // Addressee screen name; an empty name is treated the same as an absent one.
  Optional<String> toUserScreenName = message.getToUserLowercasedScreenName();
  if (toUserScreenName.isPresent() && !toUserScreenName.get().isEmpty()) {
    builder.withStringField(
        EarlybirdFieldConstant.TO_USER_FIELD.getFieldName(),
        NormalizerHelper.normalizeWithUnknownLocale(toUserScreenName.get(), penguinVersion));
  }

  if (versionedTweetFeatures.isSetUserDisplayNameTokenStreamText()) {
    builder.withTokenStreamField(EarlybirdFieldConstant.TOKENIZED_USER_NAME_FIELD.getFieldName(),
        versionedTweetFeatures.getUserDisplayNameTokenStreamText(),
        versionedTweetFeatures.getUserDisplayNameTokenStream());
  }
}
/**
 * Builds the geo fields: the geo hash and lat/lon CSF from the tweet's geo location, the place
 * fields from the tokenized place, the profile geo fields from the profile geo enrichment, and
 * the places field from the message.
 *
 * @param builder the document builder the fields are added to.
 * @param message the tweet being converted.
 * @param versionedTweetFeatures the versioned features holding the tokenized place and the
 *     tokenized profile geo enrichment.
 * @throws IllegalArgumentException if a tokenized place lacks an ID or full name, or if a
 *     profile geo enrichment has unset or empty potential locations.
 */
public static void buildGeoFields(
    EarlybirdThriftDocumentBuilder builder,
    TwitterMessage message,
    VersionedTweetFeatures versionedTweetFeatures) {
  // The CSF coordinates stay at the illegal marker unless the location validates.
  double csfLatitude = GeoUtil.ILLEGAL_LATLON;
  double csfLongitude = GeoUtil.ILLEGAL_LATLON;

  final GeoObject geoLocation = message.getGeoLocation();
  if (geoLocation != null) {
    builder.withGeoField(EarlybirdFieldConstant.GEO_HASH_FIELD.getFieldName(),
        geoLocation.getLatitude(), geoLocation.getLongitude(), geoLocation.getAccuracy());

    if (geoLocation.getSource() != null) {
      builder.withStringField(EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(),
          EarlybirdFieldConstants.formatGeoType(geoLocation.getSource()));
    }

    if (GeoUtil.validateGeoCoordinates(
        geoLocation.getLatitude(), geoLocation.getLongitude())) {
      csfLatitude = geoLocation.getLatitude();
      csfLongitude = geoLocation.getLongitude();
    }
  }

  // See SEARCH-14317 for investigation on how much space the geo field uses in the archive
  // cluster. In lucene archives, this CSF is needed regardless of whether geoLocation is set.
  builder.withLatLonCSF(csfLatitude, csfLongitude);

  if (versionedTweetFeatures.isSetTokenizedPlace()) {
    final Place tokenizedPlace = versionedTweetFeatures.getTokenizedPlace();
    Preconditions.checkArgument(
        tokenizedPlace.isSetId(), "Place ID not set for tweet " + message.getId());
    Preconditions.checkArgument(
        tokenizedPlace.isSetFullName(), "Place full name not set for tweet " + message.getId());

    builder.addFilterInternalFieldTerm(EarlybirdFieldConstant.PLACE_ID_FIELD.getFieldName());
    builder
        .withStringField(
            EarlybirdFieldConstant.PLACE_ID_FIELD.getFieldName(), tokenizedPlace.getId())
        .withStringField(
            EarlybirdFieldConstant.PLACE_FULL_NAME_FIELD.getFieldName(),
            tokenizedPlace.getFullName());
    if (tokenizedPlace.isSetCountryCode()) {
      builder.withStringField(
          EarlybirdFieldConstant.PLACE_COUNTRY_CODE_FIELD.getFieldName(),
          tokenizedPlace.getCountryCode());
    }
  }

  if (versionedTweetFeatures.isSetTokenizedProfileGeoEnrichment()) {
    final ProfileGeoEnrichment enrichment =
        versionedTweetFeatures.getTokenizedProfileGeoEnrichment();
    Preconditions.checkArgument(
        enrichment.isSetPotentialLocations(),
        "ProfileGeoEnrichment.potentialLocations not set for tweet " + message.getId());

    final List<PotentialLocation> candidates = enrichment.getPotentialLocations();
    Preconditions.checkArgument(
        !candidates.isEmpty(),
        "Found tweet with an empty ProfileGeoEnrichment.potentialLocations: " + message.getId());

    builder.addFilterInternalFieldTerm(EarlybirdFieldConstant.PROFILE_GEO_FILTER_TERM);

    // Every potential location contributes whichever of country / region / locality it has.
    for (PotentialLocation candidate : candidates) {
      if (candidate.isSetCountryCode()) {
        builder.withStringField(
            EarlybirdFieldConstant.PROFILE_GEO_COUNTRY_CODE_FIELD.getFieldName(),
            candidate.getCountryCode());
      }
      if (candidate.isSetRegion()) {
        builder.withStringField(
            EarlybirdFieldConstant.PROFILE_GEO_REGION_FIELD.getFieldName(),
            candidate.getRegion());
      }
      if (candidate.isSetLocality()) {
        builder.withStringField(
            EarlybirdFieldConstant.PROFILE_GEO_LOCALITY_FIELD.getFieldName(),
            candidate.getLocality());
      }
    }
  }

  builder.withPlacesField(message.getPlaces());
}
/**
 * Builds the retweet and reply fields from a TwitterMessage. The four IDs involved are
 * extracted from the message, with -1 standing in for every ID that is absent, and handed to
 * the ID-based overload of this method.
 *
 * @param builder the document builder the fields are added to.
 * @param message the tweet being converted.
 * @param strict whether a reply to a tweet is required to also carry the replied-to user.
 */
public static void buildRetweetAndReplyFields(
    EarlybirdThriftDocumentBuilder builder,
    TwitterMessage message,
    boolean strict) {
  // Retweet IDs are only looked at when the message carries a retweet message.
  final boolean hasRetweetMessage = message.getRetweetMessage() != null;

  long sharedStatusId = -1;
  if (hasRetweetMessage && message.getRetweetMessage().getSharedId() != null) {
    sharedStatusId = message.getRetweetMessage().getSharedId();
  }

  long retweetUserId = -1;
  if (hasRetweetMessage && message.getRetweetMessage().hasSharedUserTwitterId()) {
    retweetUserId = message.getRetweetMessage().getSharedUserTwitterId();
  }

  // Reply IDs are only looked at when the message is a reply.
  long replyToStatusId = -1;
  long replyToUserId = -1;
  if (message.isReply()) {
    if (message.getInReplyToStatusId().isPresent()) {
      replyToStatusId = message.getInReplyToStatusId().get();
    }
    if (message.getToUserTwitterId().isPresent()) {
      replyToUserId = message.getToUserTwitterId().get();
    }
  }

  buildRetweetAndReplyFields(
      retweetUserId, sharedStatusId, replyToStatusId, replyToUserId, strict, builder);
}
/**
 * Builds the quote fields. The quote is only added when the message has a quoted message whose
 * quoted status ID and quoted user ID are both positive.
 *
 * @param builder the document builder the fields are added to.
 * @param message the tweet being converted.
 */
public static void buildQuotesFields(
    EarlybirdThriftDocumentBuilder builder,
    TwitterMessage message) {
  // Fetched once, so the null check and the reads below are made on the same object.
  TwitterQuotedMessage quoted = message.getQuotedMessage();
  if (quoted == null) {
    return;
  }

  if (quoted.getQuotedStatusId() > 0 && quoted.getQuotedUserId() > 0) {
    builder.withQuote(quoted.getQuotedStatusId(), quoted.getQuotedUserId());
  }
}
/**
 * Builds the directed-at fields. When the message has a positive directed-at user ID, that
 * user is recorded on the document and the directed-at filter term is added.
 *
 * @param builder the document builder the fields are added to.
 * @param message the tweet being converted.
 */
public static void buildDirectedAtFields(
    EarlybirdThriftDocumentBuilder builder,
    TwitterMessage message) {
  final boolean hasDirectedAtUser = message.getDirectedAtUserId().isPresent()
      && message.getDirectedAtUserId().get() > 0;
  if (!hasDirectedAtUser) {
    return;
  }

  builder.withDirectedAtUser(message.getDirectedAtUserId().get());
  builder.addFilterInternalFieldTerm(EarlybirdFieldConstant.DIRECTED_AT_FILTER_TERM);
}
/**
 * Builds the fields that come from the versioned features of a tweet: hashtags, mentions,
 * stocks, resolved link text, the tokenized tweet text, the source fields, and the markers for
 * smileys and question marks.
 *
 * @param builder the document builder the fields are added to.
 * @param versionedTweetFeatures the versioned features of the tweet.
 */
public static void buildVersionedFeatureFields(
    EarlybirdThriftDocumentBuilder builder,
    VersionedTweetFeatures versionedTweetFeatures) {
  final String internalField = EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName();
  final String textField = EarlybirdFieldConstant.TEXT_FIELD.getFieldName();

  builder
      .withHashtagsField(versionedTweetFeatures.getHashtags())
      .withMentionsField(versionedTweetFeatures.getMentions())
      .withStocksFields(versionedTweetFeatures.getStocks())
      .withResolvedLinksText(versionedTweetFeatures.getNormalizedResolvedUrlText())
      // The token stream itself is only passed along when it is set; the text always is.
      .withTokenStreamField(
          textField,
          versionedTweetFeatures.getTweetTokenStreamText(),
          versionedTweetFeatures.isSetTweetTokenStream()
              ? versionedTweetFeatures.getTweetTokenStream() : null)
      .withStringField(
          EarlybirdFieldConstant.SOURCE_FIELD.getFieldName(),
          versionedTweetFeatures.getSource())
      .withStringField(
          EarlybirdFieldConstant.NORMALIZED_SOURCE_FIELD.getFieldName(),
          versionedTweetFeatures.getNormalizedSource());

  // Smileys are recorded as terms of the internal field.
  if (versionedTweetFeatures.hasPositiveSmiley) {
    builder.withStringField(internalField, EarlybirdFieldConstant.HAS_POSITIVE_SMILEY);
  }
  if (versionedTweetFeatures.hasNegativeSmiley) {
    builder.withStringField(internalField, EarlybirdFieldConstant.HAS_NEGATIVE_SMILEY);
  }

  // The question mark marker is recorded as a term of the text field.
  if (versionedTweetFeatures.hasQuestionMark) {
    builder.withStringField(textField, EarlybirdThriftDocumentBuilder.QUESTION_MARK);
  }
}
/**
 * Builds the escherbird annotation fields for a tweet. Nothing is added when the message has
 * no annotations. Otherwise the entity ID field is added to the facet skip list, and each
 * annotation contributes three terms to that field: "group.domain.entity", "domain.entity"
 * and "entity".
 *
 * @param builder the document builder the fields are added to.
 * @param message the tweet being converted.
 */
public static void buildAnnotationFields(
    EarlybirdThriftDocumentBuilder builder,
    TwitterMessage message) {
  List<TwitterMessage.EscherbirdAnnotation> escherbirdAnnotations =
      message.getEscherbirdAnnotations();
  if (CollectionUtils.isEmpty(escherbirdAnnotations)) {
    return;
  }

  final String entityIdField = EarlybirdFieldConstant.ENTITY_ID_FIELD.getFieldName();
  builder.addFacetSkipList(entityIdField);

  for (TwitterMessage.EscherbirdAnnotation annotation : escherbirdAnnotations) {
    // The terms are built by concatenation rather than String.format("%d", ...): the latter
    // localizes digits according to the JVM default locale, so the indexed terms would differ
    // between machines with different locales. Concatenation always yields ASCII digits.
    String entity = String.valueOf(annotation.entityId);
    String domainEntity = annotation.domainId + "." + entity;
    String groupDomainEntity = annotation.groupId + "." + domainEntity;

    builder.withStringField(entityIdField, groupDomainEntity);
    builder.withStringField(entityIdField, domainEntity);
    builder.withStringField(entityIdField, entity);
  }
}
/**
 * Builds the retweet and reply fields of a document from raw IDs. An ID counts as present only
 * when it is greater than 0.
 *
 * <p>A tweet can be a reply to another tweet (in-reply-to-user-id and in-reply-to-status-id both
 * set), directed at a user (only in-reply-to-user-id set), or not a reply at all. Independently
 * of that, it may or may not be a retweet (retweet-user-id and retweet-status-id both set).
 * That gives six combinations, with the following intended outcome:
 *
 * <pre>
 * 1. Normal tweet (not a reply, not a retweet): none of the fields are set.
 *
 * 2. Reply to a tweet:
 *      IN_REPLY_TO_USER_ID_FIELD = in-reply-to-user-id
 *      SHARED_STATUS_ID_CSF      = in-reply-to-status-id
 *      IS_REPLY_FLAG set
 *
 * 3. Directed at a user:
 *      IN_REPLY_TO_USER_ID_FIELD = in-reply-to-user-id
 *      IS_REPLY_FLAG set
 *
 * 4. Retweet of a normal tweet:
 *      RETWEET_SOURCE_USER_ID_FIELD = retweet-user-id
 *      SHARED_STATUS_ID_CSF         = retweet-status-id
 *      IS_RETWEET_FLAG set
 *
 * 5. Retweet of a reply:
 *      RETWEET_SOURCE_USER_ID_FIELD = retweet-user-id
 *      SHARED_STATUS_ID_CSF         = retweet-status-id (retweet beats reply!)
 *      IS_RETWEET_FLAG set
 *      IN_REPLY_TO_USER_ID_FIELD    = in-reply-to-user-id
 *      IS_REPLY_FLAG NOT set
 *
 * 6. Retweet of a directed-at tweet:
 *      RETWEET_SOURCE_USER_ID_FIELD = retweet-user-id
 *      SHARED_STATUS_ID_CSF         = retweet-status-id
 *      IS_RETWEET_FLAG set
 *      IN_REPLY_TO_USER_ID_FIELD    = in-reply-to-user-id
 *      IS_REPLY_FLAG NOT set
 * </pre>
 *
 * <p>In other words: for a retweet SHARED_STATUS_ID_CSF is the retweet-status-id; otherwise, for
 * a reply to a tweet, it is the in-reply-to-status-id.
 *
 * @param retweetUserIdVal ID of the author of the retweeted tweet, or a non-positive value.
 * @param sharedStatusIdVal ID of the retweeted tweet, or a non-positive value.
 * @param inReplyToStatusIdVal ID of the tweet replied to, or a non-positive value.
 * @param inReplyToUserIdVal ID of the user replied to, or a non-positive value.
 * @param strict whether a reply to a tweet is required to also have a replied-to user.
 * @param builder the document builder the fields are added to.
 * @throws IllegalStateException if only one of the two retweet IDs is present, or if strict is
 *     set and a non-retweet has a replied-to tweet but no replied-to user.
 */
public static void buildRetweetAndReplyFields(
    long retweetUserIdVal,
    long sharedStatusIdVal,
    long inReplyToStatusIdVal,
    long inReplyToUserIdVal,
    boolean strict,
    EarlybirdThriftDocumentBuilder builder) {
  final boolean hasRetweetUser = retweetUserIdVal > 0;
  final boolean hasSharedStatus = sharedStatusIdVal > 0;
  final boolean hasReplyUser = inReplyToUserIdVal > 0;
  final boolean hasReplyStatus = inReplyToStatusIdVal > 0;

  // The two retweet IDs must be either both present or both absent.
  Preconditions.checkState(hasRetweetUser == hasSharedStatus);

  if (hasRetweetUser) {
    // Retweet: the reply flag and the reply's shared status ID are deliberately left alone, so
    // a retweet of a reply is not marked as a reply and keeps the fields set by the retweet.
    builder.withNativeRetweet(retweetUserIdVal, sharedStatusIdVal);
    if (hasReplyUser) {
      // Set IN_REPLY_TO_USER_ID_FIELD even if this is a retweet of a reply.
      builder.withInReplyToUserID(inReplyToUserIdVal);
    }
    return;
  }

  // Not a retweet from here on: the reply flag and the reply's IDs may be set.
  if (hasReplyStatus) {
    if (strict) {
      // Enforcing that if this is a reply to a tweet, then it also has a replied-to user.
      Preconditions.checkState(hasReplyUser);
    }
    builder.withReplyFlag();
    builder.withLongField(
        EarlybirdFieldConstant.SHARED_STATUS_ID_CSF.getFieldName(), inReplyToStatusIdVal);
    builder.withLongField(
        EarlybirdFieldConstant.IN_REPLY_TO_TWEET_ID_FIELD.getFieldName(), inReplyToStatusIdVal);
  }
  if (hasReplyUser) {
    builder.withReplyFlag();
    builder.withInReplyToUserID(inReplyToUserIdVal);
  }
}
/**
 * Builds the normalized minimum engagement fields (favorites, retweets, replies) from the
 * counts stored in the encoded features. The fields are only built for archive clusters; for
 * every other cluster this method does nothing.
 *
 * @param builder the document builder the fields are added to.
 * @param encodedFeatures the encoded features holding the engagement counts.
 * @param cluster the cluster the document is built for.
 * @throws IOException declared for failures while adding the fields.
 */
public static void buildNormalizedMinEngagementFields(
    EarlybirdThriftDocumentBuilder builder,
    EarlybirdEncodedFeatures encodedFeatures,
    EarlybirdCluster cluster) throws IOException {
  if (!EarlybirdCluster.isArchive(cluster)) {
    return;
  }

  // All three counts are read before any field is added.
  final int favorites = encodedFeatures.getFeatureValue(EarlybirdFieldConstant.FAVORITE_COUNT);
  final int retweets = encodedFeatures.getFeatureValue(EarlybirdFieldConstant.RETWEET_COUNT);
  final int replies = encodedFeatures.getFeatureValue(EarlybirdFieldConstant.REPLY_COUNT);

  builder.withNormalizedMinEngagementField(
      EarlybirdFieldConstant.NORMALIZED_FAVORITE_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD
          .getFieldName(),
      favorites);
  builder.withNormalizedMinEngagementField(
      EarlybirdFieldConstant.NORMALIZED_RETWEET_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD
          .getFieldName(),
      retweets);
  builder.withNormalizedMinEngagementField(
      EarlybirdFieldConstant.NORMALIZED_REPLY_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD
          .getFieldName(),
      replies);
}
/**
 * As seen in SEARCH-5617, we sometimes have an incorrect createdAt. This method tries to fix it
 * by extracting the creation time from the snowflake ID when possible.
 *
 * <p>A created-at time earlier than the valid creation time threshold, on a tweet whose ID is
 * above the snowflake lower bound, is replaced by the timestamp taken from the ID. In every
 * other case the given time is returned unchanged; if it is inconsistent with the ID, that is
 * logged and counted but not corrected.
 *
 * @param id the tweet ID.
 * @param createdAtMs the created-at time carried by the tweet, in milliseconds.
 * @return the created-at time to use, in milliseconds.
 */
public static long fixCreatedAtTimeStampIfNecessary(long id, long createdAtMs) {
  final boolean createdAtTooEarly = createdAtMs < VALID_CREATION_TIME_THRESHOLD_MILLIS;
  final boolean hasSnowflakeId = id > SnowflakeIdParser.SNOWFLAKE_ID_LOWER_BOUND;

  if (createdAtTooEarly && hasSnowflakeId) {
    // This tweet has a snowflake ID, and we can extract timestamp from the ID.
    ADJUSTED_BAD_CREATED_AT_COUNTER.increment();
    return SnowflakeIdParser.getTimestampFromTweetId(id);
  }

  if (!SnowflakeIdParser.isTweetIDAndCreatedAtConsistent(id, createdAtMs)) {
    LOG.error(
        "Found inconsistent tweet ID and created at timestamp: [statusID={}], [createdAtMs={}]",
        id, createdAtMs);
    INCONSISTENT_TWEET_ID_AND_CREATED_AT_MS.increment();
  }

  return createdAtMs;
}
}