the-algorithm/src/java/com/twitter/search/common/converter/earlybird/CombinedIndexingConverter.java

100 lines
4.2 KiB
Java

package com.twitter.search.common.converter.earlybird;
import java.io.IOException;
import java.util.List;
import javax.annotation.concurrent.NotThreadSafe;
import com.google.common.base.Preconditions;
import com.twitter.common_internal.text.version.PenguinVersion;
import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents;
import com.twitter.search.common.relevance.entities.TwitterMessage;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.schema.base.Schema;
import com.twitter.search.common.schema.earlybird.EarlybirdCluster;
import com.twitter.search.common.schema.earlybird.EarlybirdThriftDocumentBuilder;
import com.twitter.search.common.schema.thriftjava.ThriftDocument;
import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent;
import com.twitter.search.common.schema.thriftjava.ThriftIndexingEventType;
/**
 * CombinedIndexingConverter converts a {@link TwitterMessage} into {@link ThriftVersionedEvents}.
 *
 * <p>It is used in tests and in offline jobs, so all data is available on the TwitterMessage. This
 * means that we don't need to split up the ThriftVersionedEvents into basic events and update
 * events, like we do in the realtime pipeline using the BasicIndexingConverter and the
 * DelayedIndexingConverter.
 */
@NotThreadSafe
public class CombinedIndexingConverter {
  private final EncodedFeatureBuilder featureBuilder;
  private final Schema schema;
  private final EarlybirdCluster cluster;

  /**
   * Creates a converter bound to the given schema and cluster.
   *
   * @param schema the schema from which a snapshot is taken on every conversion
   * @param cluster the cluster passed through to the field builders
   */
  public CombinedIndexingConverter(Schema schema, EarlybirdCluster cluster) {
    this.featureBuilder = new EncodedFeatureBuilder();
    this.schema = schema;
    this.cluster = cluster;
  }

  /**
   * Converts a TwitterMessage to a Thrift representation.
   *
   * <p>One INSERT {@link ThriftIndexingEvent} is produced per requested penguin version and
   * stored in the result under that version's byte value. All versions are built against the
   * same schema snapshot, taken once at the start of the call. The event's uid is set only when
   * the message carries a from-user id.
   *
   * @param message the message to convert; must not be null
   * @param strict forwarded unchanged to the retweet/reply field builder
   * @param penguinVersions the penguin versions to build documents for; must not be null
   * @return the versioned events, with id equal to the message id
   * @throws IOException if building a document fails
   * @throws NullPointerException if {@code message} or {@code penguinVersions} is null
   */
  public ThriftVersionedEvents convertMessageToThrift(
      TwitterMessage message,
      boolean strict,
      List<PenguinVersion> penguinVersions) throws IOException {
    Preconditions.checkNotNull(message);
    Preconditions.checkNotNull(penguinVersions);

    ThriftVersionedEvents versionedEvents = new ThriftVersionedEvents()
        .setId(message.getId());

    // Take the snapshot once so every penguin version is built against the same schema.
    ImmutableSchemaInterface schemaSnapshot = schema.getSchemaSnapshot();

    for (PenguinVersion penguinVersion : penguinVersions) {
      ThriftDocument document =
          buildDocumentForPenguinVersion(schemaSnapshot, message, strict, penguinVersion);

      ThriftIndexingEvent thriftIndexingEvent = new ThriftIndexingEvent()
          .setDocument(document)
          .setEventType(ThriftIndexingEventType.INSERT)
          .setSortId(message.getId());
      // The setter is called purely for its side effect, so ifPresent (not map) is the right
      // Optional operation here.
      message.getFromUserTwitterId().ifPresent(thriftIndexingEvent::setUid);

      versionedEvents.putToVersionedEvents(penguinVersion.getByteValue(), thriftIndexingEvent);
    }

    return versionedEvents;
  }

  /**
   * Builds the complete ThriftDocument for a single penguin version by running both the basic
   * and the delayed field builders against one document builder.
   *
   * @param schemaSnapshot the schema snapshot used for feature encoding and the basic fields
   * @param message the message to read field values from
   * @param strict forwarded unchanged to the retweet/reply field builder
   * @param penguinVersion the penguin version the document is built for
   * @return the built document
   * @throws IOException if feature creation or any field builder fails
   */
  private ThriftDocument buildDocumentForPenguinVersion(
      ImmutableSchemaInterface schemaSnapshot,
      TwitterMessage message,
      boolean strict,
      PenguinVersion penguinVersion) throws IOException {
    EncodedFeatureBuilder.TweetFeatureWithEncodeFeatures tweetFeature =
        featureBuilder.createTweetFeaturesFromTwitterMessage(
            message, penguinVersion, schemaSnapshot);

    EarlybirdThriftDocumentBuilder builder =
        BasicIndexingConverter.buildBasicFields(message, schemaSnapshot, cluster, tweetFeature);

    // NOTE(review): the call order below is kept exactly as it was; whether the builders are
    // order-sensitive cannot be determined from this file.
    BasicIndexingConverter
        .buildUserFields(builder, message, tweetFeature.versionedFeatures, penguinVersion);
    BasicIndexingConverter.buildGeoFields(builder, message, tweetFeature.versionedFeatures);
    DelayedIndexingConverter.buildURLFields(builder, message, tweetFeature.encodedFeatures);
    BasicIndexingConverter.buildRetweetAndReplyFields(builder, message, strict);
    BasicIndexingConverter.buildQuotesFields(builder, message);
    BasicIndexingConverter.buildVersionedFeatureFields(builder, tweetFeature.versionedFeatures);
    DelayedIndexingConverter.buildCardFields(builder, message, penguinVersion);
    BasicIndexingConverter.buildAnnotationFields(builder, message);
    BasicIndexingConverter.buildNormalizedMinEngagementFields(
        builder, tweetFeature.encodedFeatures, cluster);
    DelayedIndexingConverter.buildNamedEntityFields(builder, message);
    BasicIndexingConverter.buildDirectedAtFields(builder, message);

    return builder.build();
  }
}