Compare commits

...

2 Commits

| Author | SHA1 | Message | Date |
| --- | --- | --- | --- |
| AdamFrack | 6c8bca84f4 | Merge ac9b4ea0a0 into 72eda9a24f | 2024-02-19 13:33:52 +00:00 |
| AdamFrack | ac9b4ea0a0 | fix twitter | 2024-02-19 14:33:48 +01:00 |
1510 changed files with 0 additions and 194245 deletions

View File

@@ -1,50 +0,0 @@
# Tweet Search System (Earlybird)
> **TL;DR** Tweet Search System (Earlybird) finds tweets from people you follow, ranks them, and serves them to Home.
## What is Tweet Search System (Earlybird)?
[Earlybird](http://notes.stephenholiday.com/Earlybird.pdf) is a **real-time search system** based on [Apache Lucene](https://lucene.apache.org/) to support the high volume of queries and content updates. The major use cases are Relevance Search (specifically, Text search) and Timeline In-network Tweet retrieval (or UserID based search). It is designed to enable the efficient indexing and querying of billions of tweets, and to provide low-latency search results, even with heavy query loads.
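To make the Lucene connection concrete, here is a minimal, hypothetical sketch of near-real-time indexing and search with plain Apache Lucene. This is illustrative only, not Earlybird code; the field names and setup are assumptions.
```java
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.ByteBuffersDirectory;

public class NrtSearchSketch {
  public static void main(String[] args) throws Exception {
    ByteBuffersDirectory dir = new ByteBuffersDirectory();
    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()));

    // Index a new "tweet" (hypothetical fields).
    Document doc = new Document();
    doc.add(new LongPoint("tweet_id", 42L));
    doc.add(new TextField("text", "real-time search at twitter", Store.YES));
    writer.addDocument(doc);

    // Open a near-real-time reader: the new document is searchable without a full commit.
    DirectoryReader reader = DirectoryReader.open(writer);
    IndexSearcher searcher = new IndexSearcher(reader);
    TopDocs hits = searcher.search(
        new QueryParser("text", new StandardAnalyzer()).parse("search"), 10);
    System.out.println("hits: " + hits.totalHits);
  }
}
```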
## How it is related to the Home Timeline Recommendation Algorithm
![in-network](img/in-network.png)
At Twitter, we use Tweet Search System (Earlybird) to do Home Timeline In-network Tweet retrieval: given a list of following users, find their recently posted tweets. Earlybird (Search Index) is the major candidate source for in-network tweets across Following tab and For You tab.
## High-level architecture
We split our entire tweet search index into three clusters: a **realtime** cluster indexing all public tweets posted in about the last 7 days, a **protected** cluster indexing all protected tweets for the same timeframe, and an **archive** cluster indexing all tweets ever posted, up to about two days ago.
Earlybird addresses the challenges of scaling real-time search by splitting each cluster across multiple **partitions**, each responsible for a portion of the index. The architecture uses a distributed *inverted index* that is sharded and replicated. This design allows for efficient index updates and query processing.
The system also employs an incremental indexing approach, enabling it to process and index new tweets in real time as they arrive. With a single-writer, multiple-reader structure, Earlybird handles a large number of concurrent real-time updates and queries, achieving high query throughput and low query latency while maintaining a high degree of index freshness.
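A rough sketch of the sharding idea follows, assuming simple modulo routing and an illustrative partition count; the real partitioning scheme is more involved.
```java
// Hypothetical routing sketch: each tweet lives in exactly one partition, which
// has a single writer and multiple replica readers.
public final class PartitionRouter {
  private final int numPartitions;

  public PartitionRouter(int numPartitions) { // e.g. 20; assumed value
    this.numPartitions = numPartitions;
  }

  public int partitionFor(long tweetId) {
    // Simple modulo routing for illustration only.
    return (int) Math.floorMod(tweetId, (long) numPartitions);
  }
}
// Writes for a tweet go to the single writer of partitionFor(tweetId);
// queries fan out to all partitions and their responses are merged.
```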
### Indexing
* Ingesters read tweets and user modifications from Kafka topics, extract fields and features from them, and write the extracted data to intermediate Kafka topics for Earlybirds to consume, index, and serve (a minimal sketch of this loop follows the diagram below).
* Feature Update Service feeds feature updates, such as up-to-date engagement counts (likes, retweets, replies), to Earlybird.
![indexing](img/indexing.png)
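The following is a hypothetical sketch of that ingester loop using the plain Kafka client API. The topic names, types, and extraction step are assumptions, not the actual ingester code.
```java
import java.time.Duration;
import java.util.Collections;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;

public class IngesterSketch {
  // Stand-in for the real converters (e.g. BasicIndexingConverter): turn a raw
  // tweet event into the extracted fields and features to be indexed.
  static byte[] extractFieldsAndFeatures(byte[] rawTweetEvent) {
    return rawTweetEvent; // placeholder
  }

  public static void run(KafkaConsumer<Long, byte[]> consumer,
                         KafkaProducer<Long, byte[]> producer) {
    consumer.subscribe(Collections.singletonList("raw_tweet_events")); // assumed topic
    while (true) {
      ConsumerRecords<Long, byte[]> records = consumer.poll(Duration.ofMillis(100));
      for (ConsumerRecord<Long, byte[]> record : records) {
        byte[] event = extractFieldsAndFeatures(record.value());
        // Intermediate topic that Earlybirds consume, index, and serve from.
        producer.send(new ProducerRecord<>("indexing_events", record.key(), event));
      }
    }
  }
}
```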
### Serving
Earlybird roots fan out requests to the different Earlybird clusters or partitions. Upon receiving responses from the clusters or partitions, the roots merge the responses before finally returning the merged response to the client.
![serving](img/serving.png)
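A root's fan-out-and-merge step might look roughly like this sketch; all types and names here are assumptions for illustration, not the actual root code.
```java
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.concurrent.CompletableFuture;

class RootSketch {
  interface PartitionClient {
    List<Hit> search(String query, int topK);
  }

  static class Hit {
    final long tweetId;
    final double score;
    Hit(long tweetId, double score) { this.tweetId = tweetId; this.score = score; }
  }

  List<Hit> search(String query, List<PartitionClient> partitions, int topK) {
    List<CompletableFuture<List<Hit>>> futures = new ArrayList<>();
    for (PartitionClient p : partitions) {
      // Fan out: each partition searches its shard of the index in parallel.
      futures.add(CompletableFuture.supplyAsync(() -> p.search(query, topK)));
    }
    List<Hit> merged = new ArrayList<>();
    for (CompletableFuture<List<Hit>> f : futures) {
      merged.addAll(f.join()); // Gather per-partition results.
    }
    // Merge: globally re-rank by score and keep the top K.
    merged.sort(Comparator.comparingDouble((Hit h) -> h.score).reversed());
    return merged.subList(0, Math.min(topK, merged.size()));
  }
}
```
Real roots also handle partial failures, timeouts, and early termination, which this sketch omits.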
## Use cases
1. Tweet Search
* Top search
* Latest search
![top](img/top-search.png)
2. Candidate generation
* Timeline (For You Tab, Following Tab)
* Notifications
![home](img/foryou.png)
## References
* "Earlybird: Real-Time Search at Twitter" (http://notes.stephenholiday.com/Earlybird.pdf)
* "Reducing search indexing latency to one second" (https://blog.twitter.com/engineering/en_us/topics/infrastructure/2020/reducing-search-indexing-latency-to-one-second)
* "Omnisearch index formats" (https://blog.twitter.com/engineering/en_us/topics/infrastructure/2016/omnisearch-index-formats)

View File

@@ -1 +0,0 @@
Contains code that is common to multiple earlybird services (ingesters, roots, and earlybird).

View File

@@ -1,57 +0,0 @@
java_library(
sources = ["*.java"],
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
"3rdparty/jvm/com/google/guava",
"3rdparty/jvm/com/google/inject:guice",
"3rdparty/jvm/com/twitter/elephantbird:core",
"3rdparty/jvm/geo/google:geoGoogle",
"3rdparty/jvm/joda-time",
"3rdparty/jvm/org/apache/hadoop:hadoop-client-default",
"3rdparty/jvm/org/apache/httpcomponents:httpcore",
"3rdparty/jvm/org/apache/lucene:lucene-core",
"3rdparty/jvm/org/apache/thrift:libthrift",
"3rdparty/jvm/org/apache/zookeeper:zookeeper-client",
"3rdparty/jvm/org/slf4j:slf4j-api",
"cuad/projects/ner/thrift/src/main/thrift:thrift-java",
"decider/src/main/scala",
"src/java/com/twitter/common/base",
"src/java/com/twitter/common/collections",
"src/java/com/twitter/common/text/language:locale-util",
"src/java/com/twitter/common/text/token",
"src/java/com/twitter/common/text/util:token-util",
"src/java/com/twitter/common_internal/text:text-penguin7",
"src/java/com/twitter/common_internal/text/version",
"src/java/com/twitter/search/common/config",
"src/java/com/twitter/search/common/constants",
"src/java/com/twitter/search/common/debug",
"src/java/com/twitter/search/common/decider",
"src/java/com/twitter/search/common/encoding/docvalues",
"src/java/com/twitter/search/common/encoding/features",
"src/java/com/twitter/search/common/metrics",
"src/java/com/twitter/search/common/partitioning/base",
"src/java/com/twitter/search/common/partitioning/snowflakeparser",
"src/java/com/twitter/search/common/relevance:entities_and_filters",
"src/java/com/twitter/search/common/relevance:text",
"src/java/com/twitter/search/common/relevance/features",
"src/java/com/twitter/search/common/schema",
"src/java/com/twitter/search/common/schema/base",
"src/java/com/twitter/search/common/schema/earlybird",
"src/java/com/twitter/search/common/util:longintconverter",
"src/java/com/twitter/search/common/util/analysis",
"src/java/com/twitter/search/common/util/lang",
"src/java/com/twitter/search/common/util/spatial",
"src/java/com/twitter/search/common/util/text",
"src/java/com/twitter/search/common/util/text/regex",
"src/java/com/twitter/search/common/util/thrift:thrift-utils",
"src/java/com/twitter/search/common/util/url",
"src/java/com/twitter/search/ingester/model",
"src/thrift/com/twitter/search/common:constants-java",
"src/thrift/com/twitter/search/common:indexing-java",
"src/thrift/com/twitter/search/common:schema-java",
"src/thrift/com/twitter/search/common/debug:debug-java",
"src/thrift/com/twitter/service/spiderduck/gen:metadata-store-java",
"src/thrift/com/twitter/tweetypie:tweet-java",
],
)

View File

@@ -1,647 +0,0 @@
package com.twitter.search.common.converter.earlybird;
import java.io.IOException;
import java.util.Date;
import java.util.List;
import java.util.Optional;
import javax.annotation.concurrent.NotThreadSafe;
import com.google.common.base.Preconditions;
import org.apache.commons.collections.CollectionUtils;
import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.common_internal.text.version.PenguinVersion;
import com.twitter.search.common.converter.earlybird.EncodedFeatureBuilder.TweetFeatureWithEncodeFeatures;
import com.twitter.search.common.indexing.thriftjava.Place;
import com.twitter.search.common.indexing.thriftjava.PotentialLocation;
import com.twitter.search.common.indexing.thriftjava.ProfileGeoEnrichment;
import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents;
import com.twitter.search.common.indexing.thriftjava.VersionedTweetFeatures;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser;
import com.twitter.search.common.relevance.entities.GeoObject;
import com.twitter.search.common.relevance.entities.TwitterMessage;
import com.twitter.search.common.relevance.entities.TwitterQuotedMessage;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.schema.base.Schema;
import com.twitter.search.common.schema.earlybird.EarlybirdCluster;
import com.twitter.search.common.schema.earlybird.EarlybirdEncodedFeatures;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
import com.twitter.search.common.schema.earlybird.EarlybirdThriftDocumentBuilder;
import com.twitter.search.common.schema.thriftjava.ThriftDocument;
import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent;
import com.twitter.search.common.schema.thriftjava.ThriftIndexingEventType;
import com.twitter.search.common.util.spatial.GeoUtil;
import com.twitter.search.common.util.text.NormalizerHelper;
import com.twitter.tweetypie.thriftjava.ComposerSource;
/**
* Converts a TwitterMessage into a ThriftVersionedEvents. This is only responsible for data that
* is available immediately when a Tweet is created. Some data, like URL data, isn't available
* immediately, and so it is processed later, in the DelayedIndexingConverter and sent as an
* update. In order to achieve this, we create the document in two passes:
*
* 1. BasicIndexingConverter builds thriftVersionedEvents with the fields that do not require
* external services.
*
* 2. DelayedIndexingConverter builds all the document fields depending on external services, once
* those services have processed the relevant Tweet and we have retrieved that data.
*/
@NotThreadSafe
public class BasicIndexingConverter {
private static final Logger LOG = LoggerFactory.getLogger(BasicIndexingConverter.class);
private static final SearchCounter NUM_NULLCAST_FEATURE_FLAG_SET_TWEETS =
SearchCounter.export("num_nullcast_feature_flag_set_tweets");
private static final SearchCounter NUM_NULLCAST_TWEETS =
SearchCounter.export("num_nullcast_tweets");
private static final SearchCounter NUM_NON_NULLCAST_TWEETS =
SearchCounter.export("num_non_nullcast_tweets");
private static final SearchCounter ADJUSTED_BAD_CREATED_AT_COUNTER =
SearchCounter.export("adjusted_incorrect_created_at_timestamp");
private static final SearchCounter INCONSISTENT_TWEET_ID_AND_CREATED_AT_MS =
SearchCounter.export("inconsistent_tweet_id_and_created_at_ms");
private static final SearchCounter NUM_SELF_THREAD_TWEETS =
SearchCounter.export("num_self_thread_tweets");
private static final SearchCounter NUM_EXCLUSIVE_TWEETS =
SearchCounter.export("num_exclusive_tweets");
// If a tweet carries a timestamp smaller than this threshold, we consider the timestamp invalid,
// because Twitter did not exist before Sun, 01 Jan 2006 00:00:00 GMT.
private static final long VALID_CREATION_TIME_THRESHOLD_MILLIS =
new DateTime(2006, 1, 1, 0, 0, 0, DateTimeZone.UTC).getMillis();
private final EncodedFeatureBuilder featureBuilder;
private final Schema schema;
private final EarlybirdCluster cluster;
public BasicIndexingConverter(Schema schema, EarlybirdCluster cluster) {
this.featureBuilder = new EncodedFeatureBuilder();
this.schema = schema;
this.cluster = cluster;
}
/**
* This function converts TwitterMessage to ThriftVersionedEvents, which is a generic data
* structure that can be consumed by Earlybird directly.
*/
public ThriftVersionedEvents convertMessageToThrift(
TwitterMessage message,
boolean strict,
List<PenguinVersion> penguinVersions) throws IOException {
Preconditions.checkNotNull(message);
Preconditions.checkNotNull(penguinVersions);
ThriftVersionedEvents versionedEvents = new ThriftVersionedEvents()
.setId(message.getId());
ImmutableSchemaInterface schemaSnapshot = schema.getSchemaSnapshot();
for (PenguinVersion penguinVersion : penguinVersions) {
ThriftDocument document =
buildDocumentForPenguinVersion(schemaSnapshot, message, strict, penguinVersion);
ThriftIndexingEvent thriftIndexingEvent = new ThriftIndexingEvent()
.setDocument(document)
.setEventType(ThriftIndexingEventType.INSERT)
.setSortId(message.getId());
message.getFromUserTwitterId().map(thriftIndexingEvent::setUid);
versionedEvents.putToVersionedEvents(penguinVersion.getByteValue(), thriftIndexingEvent);
}
return versionedEvents;
}
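/**
 * Builds the ThriftDocument for one Penguin version: extracts features from the
 * TwitterMessage and populates all field groups available at tweet creation time.
 */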
private ThriftDocument buildDocumentForPenguinVersion(
ImmutableSchemaInterface schemaSnapshot,
TwitterMessage message,
boolean strict,
PenguinVersion penguinVersion) throws IOException {
TweetFeatureWithEncodeFeatures tweetFeature =
featureBuilder.createTweetFeaturesFromTwitterMessage(
message, penguinVersion, schemaSnapshot);
EarlybirdThriftDocumentBuilder builder =
buildBasicFields(message, schemaSnapshot, cluster, tweetFeature);
buildUserFields(builder, message, tweetFeature.versionedFeatures, penguinVersion);
buildGeoFields(builder, message, tweetFeature.versionedFeatures);
buildRetweetAndReplyFields(builder, message, strict);
buildQuotesFields(builder, message);
buildVersionedFeatureFields(builder, tweetFeature.versionedFeatures);
buildAnnotationFields(builder, message);
buildNormalizedMinEngagementFields(builder, tweetFeature.encodedFeatures, cluster);
buildDirectedAtFields(builder, message);
builder.withSpaceIdFields(message.getSpaceIds());
return builder.build();
}
/**
* Build the basic fields for a tweet.
*/
public static EarlybirdThriftDocumentBuilder buildBasicFields(
TwitterMessage message,
ImmutableSchemaInterface schemaSnapshot,
EarlybirdCluster cluster,
TweetFeatureWithEncodeFeatures tweetFeature) {
EarlybirdEncodedFeatures extendedEncodedFeatures = tweetFeature.extendedEncodedFeatures;
if (extendedEncodedFeatures == null && EarlybirdCluster.isTwitterMemoryFormatCluster(cluster)) {
extendedEncodedFeatures = EarlybirdEncodedFeatures.newEncodedTweetFeatures(
schemaSnapshot, EarlybirdFieldConstant.EXTENDED_ENCODED_TWEET_FEATURES_FIELD);
}
EarlybirdThriftDocumentBuilder builder = new EarlybirdThriftDocumentBuilder(
tweetFeature.encodedFeatures,
extendedEncodedFeatures,
new EarlybirdFieldConstants(),
schemaSnapshot);
builder.withID(message.getId());
final Date createdAt = message.getDate();
long createdAtMs = createdAt == null ? 0L : createdAt.getTime();
createdAtMs = fixCreatedAtTimeStampIfNecessary(message.getId(), createdAtMs);
if (createdAtMs > 0L) {
builder.withCreatedAt((int) (createdAtMs / 1000));
}
builder.withTweetSignature(tweetFeature.versionedFeatures.getTweetSignature());
if (message.getConversationId() > 0) {
long conversationId = message.getConversationId();
builder.withLongField(
EarlybirdFieldConstant.CONVERSATION_ID_CSF.getFieldName(), conversationId);
// We only index conversation ID when it is different from the tweet ID.
if (message.getId() != conversationId) {
builder.withLongField(
EarlybirdFieldConstant.CONVERSATION_ID_FIELD.getFieldName(), conversationId);
}
}
if (message.getComposerSource().isPresent()) {
ComposerSource composerSource = message.getComposerSource().get();
builder.withIntField(
EarlybirdFieldConstant.COMPOSER_SOURCE.getFieldName(), composerSource.getValue());
if (composerSource == ComposerSource.CAMERA) {
builder.withCameraComposerSourceFlag();
}
}
EarlybirdEncodedFeatures encodedFeatures = tweetFeature.encodedFeatures;
if (encodedFeatures.isFlagSet(EarlybirdFieldConstant.FROM_VERIFIED_ACCOUNT_FLAG)) {
builder.addFilterInternalFieldTerm(EarlybirdFieldConstant.VERIFIED_FILTER_TERM);
}
if (encodedFeatures.isFlagSet(EarlybirdFieldConstant.FROM_BLUE_VERIFIED_ACCOUNT_FLAG)) {
builder.addFilterInternalFieldTerm(EarlybirdFieldConstant.BLUE_VERIFIED_FILTER_TERM);
}
if (encodedFeatures.isFlagSet(EarlybirdFieldConstant.IS_OFFENSIVE_FLAG)) {
builder.withOffensiveFlag();
}
if (message.getNullcast()) {
NUM_NULLCAST_TWEETS.increment();
builder.addFilterInternalFieldTerm(EarlybirdFieldConstant.NULLCAST_FILTER_TERM);
} else {
NUM_NON_NULLCAST_TWEETS.increment();
}
if (encodedFeatures.isFlagSet(EarlybirdFieldConstant.IS_NULLCAST_FLAG)) {
NUM_NULLCAST_FEATURE_FLAG_SET_TWEETS.increment();
}
if (message.isSelfThread()) {
builder.addFilterInternalFieldTerm(
EarlybirdFieldConstant.SELF_THREAD_FILTER_TERM);
NUM_SELF_THREAD_TWEETS.increment();
}
if (message.isExclusive()) {
builder.addFilterInternalFieldTerm(EarlybirdFieldConstant.EXCLUSIVE_FILTER_TERM);
builder.withLongField(
EarlybirdFieldConstant.EXCLUSIVE_CONVERSATION_AUTHOR_ID_CSF.getFieldName(),
message.getExclusiveConversationAuthorId());
NUM_EXCLUSIVE_TWEETS.increment();
}
builder.withLanguageCodes(message.getLanguage(), message.getBCP47LanguageTag());
return builder;
}
/**
* Build the user fields.
*/
public static void buildUserFields(
EarlybirdThriftDocumentBuilder builder,
TwitterMessage message,
VersionedTweetFeatures versionedTweetFeatures,
PenguinVersion penguinVersion) {
// 1. Set all the from user fields.
if (message.getFromUserTwitterId().isPresent()) {
builder.withLongField(EarlybirdFieldConstant.FROM_USER_ID_FIELD.getFieldName(),
message.getFromUserTwitterId().get())
// CSF
.withLongField(EarlybirdFieldConstant.FROM_USER_ID_CSF.getFieldName(),
message.getFromUserTwitterId().get());
} else {
LOG.warn("fromUserTwitterId is not set in TwitterMessage! Status id: " + message.getId());
}
if (message.getFromUserScreenName().isPresent()) {
String fromUser = message.getFromUserScreenName().get();
String normalizedFromUser =
NormalizerHelper.normalizeWithUnknownLocale(fromUser, penguinVersion);
builder
.withWhiteSpaceTokenizedScreenNameField(
EarlybirdFieldConstant.TOKENIZED_FROM_USER_FIELD.getFieldName(),
normalizedFromUser)
.withStringField(EarlybirdFieldConstant.FROM_USER_FIELD.getFieldName(),
normalizedFromUser);
if (message.getTokenizedFromUserScreenName().isPresent()) {
builder.withCamelCaseTokenizedScreenNameField(
EarlybirdFieldConstant.CAMELCASE_USER_HANDLE_FIELD.getFieldName(),
fromUser,
normalizedFromUser,
message.getTokenizedFromUserScreenName().get());
}
}
Optional<String> toUserScreenName = message.getToUserLowercasedScreenName();
if (toUserScreenName.isPresent() && !toUserScreenName.get().isEmpty()) {
builder.withStringField(
EarlybirdFieldConstant.TO_USER_FIELD.getFieldName(),
NormalizerHelper.normalizeWithUnknownLocale(toUserScreenName.get(), penguinVersion));
}
if (versionedTweetFeatures.isSetUserDisplayNameTokenStreamText()) {
builder.withTokenStreamField(EarlybirdFieldConstant.TOKENIZED_USER_NAME_FIELD.getFieldName(),
versionedTweetFeatures.getUserDisplayNameTokenStreamText(),
versionedTweetFeatures.getUserDisplayNameTokenStream());
}
}
/**
* Build the geo fields.
*/
public static void buildGeoFields(
EarlybirdThriftDocumentBuilder builder,
TwitterMessage message,
VersionedTweetFeatures versionedTweetFeatures) {
double lat = GeoUtil.ILLEGAL_LATLON;
double lon = GeoUtil.ILLEGAL_LATLON;
if (message.getGeoLocation() != null) {
GeoObject location = message.getGeoLocation();
builder.withGeoField(EarlybirdFieldConstant.GEO_HASH_FIELD.getFieldName(),
location.getLatitude(), location.getLongitude(), location.getAccuracy());
if (location.getSource() != null) {
builder.withStringField(EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(),
EarlybirdFieldConstants.formatGeoType(location.getSource()));
}
if (GeoUtil.validateGeoCoordinates(location.getLatitude(), location.getLongitude())) {
lat = location.getLatitude();
lon = location.getLongitude();
}
}
// See SEARCH-14317 for investigation of how much space the geo field uses in the archive cluster.
// In lucene archives, this CSF is needed regardless of whether geoLocation is set.
builder.withLatLonCSF(lat, lon);
if (versionedTweetFeatures.isSetTokenizedPlace()) {
Place place = versionedTweetFeatures.getTokenizedPlace();
Preconditions.checkArgument(place.isSetId(), "Place ID not set for tweet "
+ message.getId());
Preconditions.checkArgument(place.isSetFullName(),
"Place full name not set for tweet " + message.getId());
builder.addFilterInternalFieldTerm(EarlybirdFieldConstant.PLACE_ID_FIELD.getFieldName());
builder
.withStringField(EarlybirdFieldConstant.PLACE_ID_FIELD.getFieldName(), place.getId())
.withStringField(EarlybirdFieldConstant.PLACE_FULL_NAME_FIELD.getFieldName(),
place.getFullName());
if (place.isSetCountryCode()) {
builder.withStringField(EarlybirdFieldConstant.PLACE_COUNTRY_CODE_FIELD.getFieldName(),
place.getCountryCode());
}
}
if (versionedTweetFeatures.isSetTokenizedProfileGeoEnrichment()) {
ProfileGeoEnrichment profileGeoEnrichment =
versionedTweetFeatures.getTokenizedProfileGeoEnrichment();
Preconditions.checkArgument(
profileGeoEnrichment.isSetPotentialLocations(),
"ProfileGeoEnrichment.potentialLocations not set for tweet "
+ message.getId());
List<PotentialLocation> potentialLocations = profileGeoEnrichment.getPotentialLocations();
Preconditions.checkArgument(
!potentialLocations.isEmpty(),
"Found tweet with an empty ProfileGeoEnrichment.potentialLocations: "
+ message.getId());
builder.addFilterInternalFieldTerm(EarlybirdFieldConstant.PROFILE_GEO_FILTER_TERM);
for (PotentialLocation potentialLocation : potentialLocations) {
if (potentialLocation.isSetCountryCode()) {
builder.withStringField(
EarlybirdFieldConstant.PROFILE_GEO_COUNTRY_CODE_FIELD.getFieldName(),
potentialLocation.getCountryCode());
}
if (potentialLocation.isSetRegion()) {
builder.withStringField(EarlybirdFieldConstant.PROFILE_GEO_REGION_FIELD.getFieldName(),
potentialLocation.getRegion());
}
if (potentialLocation.isSetLocality()) {
builder.withStringField(EarlybirdFieldConstant.PROFILE_GEO_LOCALITY_FIELD.getFieldName(),
potentialLocation.getLocality());
}
}
}
builder.withPlacesField(message.getPlaces());
}
/**
* Build the retweet and reply fields.
*/
public static void buildRetweetAndReplyFields(
EarlybirdThriftDocumentBuilder builder,
TwitterMessage message,
boolean strict) {
long retweetUserIdVal = -1;
long sharedStatusIdVal = -1;
if (message.getRetweetMessage() != null) {
if (message.getRetweetMessage().getSharedId() != null) {
sharedStatusIdVal = message.getRetweetMessage().getSharedId();
}
if (message.getRetweetMessage().hasSharedUserTwitterId()) {
retweetUserIdVal = message.getRetweetMessage().getSharedUserTwitterId();
}
}
long inReplyToStatusIdVal = -1;
long inReplyToUserIdVal = -1;
if (message.isReply()) {
if (message.getInReplyToStatusId().isPresent()) {
inReplyToStatusIdVal = message.getInReplyToStatusId().get();
}
if (message.getToUserTwitterId().isPresent()) {
inReplyToUserIdVal = message.getToUserTwitterId().get();
}
}
buildRetweetAndReplyFields(
retweetUserIdVal,
sharedStatusIdVal,
inReplyToStatusIdVal,
inReplyToUserIdVal,
strict,
builder);
}
/**
* Build the quotes fields.
*/
public static void buildQuotesFields(
EarlybirdThriftDocumentBuilder builder,
TwitterMessage message) {
if (message.getQuotedMessage() != null) {
TwitterQuotedMessage quoted = message.getQuotedMessage();
if (quoted != null && quoted.getQuotedStatusId() > 0 && quoted.getQuotedUserId() > 0) {
builder.withQuote(quoted.getQuotedStatusId(), quoted.getQuotedUserId());
}
}
}
/**
* Build directed at field.
*/
public static void buildDirectedAtFields(
EarlybirdThriftDocumentBuilder builder,
TwitterMessage message) {
if (message.getDirectedAtUserId().isPresent() && message.getDirectedAtUserId().get() > 0) {
builder.withDirectedAtUser(message.getDirectedAtUserId().get());
builder.addFilterInternalFieldTerm(EarlybirdFieldConstant.DIRECTED_AT_FILTER_TERM);
}
}
/**
* Build the versioned features for a tweet.
*/
public static void buildVersionedFeatureFields(
EarlybirdThriftDocumentBuilder builder,
VersionedTweetFeatures versionedTweetFeatures) {
builder
.withHashtagsField(versionedTweetFeatures.getHashtags())
.withMentionsField(versionedTweetFeatures.getMentions())
.withStocksFields(versionedTweetFeatures.getStocks())
.withResolvedLinksText(versionedTweetFeatures.getNormalizedResolvedUrlText())
.withTokenStreamField(EarlybirdFieldConstant.TEXT_FIELD.getFieldName(),
versionedTweetFeatures.getTweetTokenStreamText(),
versionedTweetFeatures.isSetTweetTokenStream()
? versionedTweetFeatures.getTweetTokenStream() : null)
.withStringField(EarlybirdFieldConstant.SOURCE_FIELD.getFieldName(),
versionedTweetFeatures.getSource())
.withStringField(EarlybirdFieldConstant.NORMALIZED_SOURCE_FIELD.getFieldName(),
versionedTweetFeatures.getNormalizedSource());
// Internal fields for smileys and question marks
if (versionedTweetFeatures.hasPositiveSmiley) {
builder.withStringField(
EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(),
EarlybirdFieldConstant.HAS_POSITIVE_SMILEY);
}
if (versionedTweetFeatures.hasNegativeSmiley) {
builder.withStringField(
EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(),
EarlybirdFieldConstant.HAS_NEGATIVE_SMILEY);
}
if (versionedTweetFeatures.hasQuestionMark) {
builder.withStringField(EarlybirdFieldConstant.TEXT_FIELD.getFieldName(),
EarlybirdThriftDocumentBuilder.QUESTION_MARK);
}
}
/**
* Build the escherbird annotations for a tweet.
*/
public static void buildAnnotationFields(
EarlybirdThriftDocumentBuilder builder,
TwitterMessage message) {
List<TwitterMessage.EscherbirdAnnotation> escherbirdAnnotations =
message.getEscherbirdAnnotations();
if (CollectionUtils.isEmpty(escherbirdAnnotations)) {
return;
}
builder.addFacetSkipList(EarlybirdFieldConstant.ENTITY_ID_FIELD.getFieldName());
for (TwitterMessage.EscherbirdAnnotation annotation : escherbirdAnnotations) {
String groupDomainEntity = String.format("%d.%d.%d",
annotation.groupId, annotation.domainId, annotation.entityId);
String domainEntity = String.format("%d.%d", annotation.domainId, annotation.entityId);
String entity = String.format("%d", annotation.entityId);
builder.withStringField(EarlybirdFieldConstant.ENTITY_ID_FIELD.getFieldName(),
groupDomainEntity);
builder.withStringField(EarlybirdFieldConstant.ENTITY_ID_FIELD.getFieldName(),
domainEntity);
builder.withStringField(EarlybirdFieldConstant.ENTITY_ID_FIELD.getFieldName(),
entity);
}
}
/**
* Build the correct ThriftIndexingEvent's fields based on retweet and reply status.
*/
public static void buildRetweetAndReplyFields(
long retweetUserIdVal,
long sharedStatusIdVal,
long inReplyToStatusIdVal,
long inReplyToUserIdVal,
boolean strict,
EarlybirdThriftDocumentBuilder builder) {
Optional<Long> retweetUserId = Optional.of(retweetUserIdVal).filter(x -> x > 0);
Optional<Long> sharedStatusId = Optional.of(sharedStatusIdVal).filter(x -> x > 0);
Optional<Long> inReplyToUserId = Optional.of(inReplyToUserIdVal).filter(x -> x > 0);
Optional<Long> inReplyToStatusId = Optional.of(inReplyToStatusIdVal).filter(x -> x > 0);
// We have six combinations here. A Tweet can be
// 1) a reply to another tweet (then it has both in-reply-to-user-id and
// in-reply-to-status-id set),
// 2) directed-at a user (then it only has in-reply-to-user-id set),
// 3) not a reply at all.
// Additionally, it may or may not be a Retweet (if it is, then it has retweet-user-id and
// retweet-status-id set).
//
// We want to set some fields unconditionally, and some fields (reference-author-id and
// shared-status-id) depending on the reply/retweet combination.
//
// 1. Normal tweet (not a reply, not a retweet). None of the fields should be set.
//
// 2. Reply to a tweet (both in-reply-to-user-id and in-reply-to-status-id set).
// IN_REPLY_TO_USER_ID_FIELD should be set to in-reply-to-user-id
// SHARED_STATUS_ID_CSF should be set to in-reply-to-status-id
// IS_REPLY_FLAG should be set
//
// 3. Directed-at a user (only in-reply-to-user-id is set).
// IN_REPLY_TO_USER_ID_FIELD should be set to in-reply-to-user-id
// IS_REPLY_FLAG should be set
//
// 4. Retweet of a normal tweet (retweet-user-id and retweet-status-id are set).
// RETWEET_SOURCE_USER_ID_FIELD should be set to retweet-user-id
// SHARED_STATUS_ID_CSF should be set to retweet-status-id
// IS_RETWEET_FLAG should be set
//
// 5. Retweet of a reply (both in-reply-to-user-id and in-reply-to-status-id set,
// retweet-user-id and retweet-status-id are set).
// RETWEET_SOURCE_USER_ID_FIELD should be set to retweet-user-id
// SHARED_STATUS_ID_CSF should be set to retweet-status-id (retweet beats reply!)
// IS_RETWEET_FLAG should be set
// IN_REPLY_TO_USER_ID_FIELD should be set to in-reply-to-user-id
// IS_REPLY_FLAG should NOT be set
//
// 6. Retweet of a directed-at tweet (only in-reply-to-user-id is set,
// retweet-user-id and retweet-status-id are set).
// RETWEET_SOURCE_USER_ID_FIELD should be set to retweet-user-id
// SHARED_STATUS_ID_CSF should be set to retweet-status-id
// IS_RETWEET_FLAG should be set
// IN_REPLY_TO_USER_ID_FIELD should be set to in-reply-to-user-id
// IS_REPLY_FLAG should NOT be set
//
// In other words:
// SHARED_STATUS_ID_CSF logic: if this is a retweet SHARED_STATUS_ID_CSF should be set to
// retweet-status-id, otherwise if it's a reply to a tweet, it should be set to
// in-reply-to-status-id.
Preconditions.checkState(retweetUserId.isPresent() == sharedStatusId.isPresent());
if (retweetUserId.isPresent()) {
builder.withNativeRetweet(retweetUserId.get(), sharedStatusId.get());
if (inReplyToUserId.isPresent()) {
// Set IN_REPLY_TO_USER_ID_FIELD even if this is a retweet of a reply.
builder.withInReplyToUserID(inReplyToUserId.get());
}
} else {
// If this is a retweet of a reply, we don't want to mark it as a reply, or override fields
// set by the retweet logic.
// If we are in this branch, this is not a retweet. Potentially, we set the reply flag,
// and override shared-status-id and reference-author-id.
if (inReplyToStatusId.isPresent()) {
if (strict) {
// Enforcing that if this is a reply to a tweet, then it also has a replied-to user.
Preconditions.checkState(inReplyToUserId.isPresent());
}
builder.withReplyFlag();
builder.withLongField(
EarlybirdFieldConstant.SHARED_STATUS_ID_CSF.getFieldName(),
inReplyToStatusId.get());
builder.withLongField(
EarlybirdFieldConstant.IN_REPLY_TO_TWEET_ID_FIELD.getFieldName(),
inReplyToStatusId.get());
}
if (inReplyToUserId.isPresent()) {
builder.withReplyFlag();
builder.withInReplyToUserID(inReplyToUserId.get());
}
}
}
/**
* Build the engagement fields.
*/
public static void buildNormalizedMinEngagementFields(
EarlybirdThriftDocumentBuilder builder,
EarlybirdEncodedFeatures encodedFeatures,
EarlybirdCluster cluster) throws IOException {
if (EarlybirdCluster.isArchive(cluster)) {
int favoriteCount = encodedFeatures.getFeatureValue(EarlybirdFieldConstant.FAVORITE_COUNT);
int retweetCount = encodedFeatures.getFeatureValue(EarlybirdFieldConstant.RETWEET_COUNT);
int replyCount = encodedFeatures.getFeatureValue(EarlybirdFieldConstant.REPLY_COUNT);
builder
.withNormalizedMinEngagementField(
EarlybirdFieldConstant.NORMALIZED_FAVORITE_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD
.getFieldName(),
favoriteCount);
builder
.withNormalizedMinEngagementField(
EarlybirdFieldConstant.NORMALIZED_RETWEET_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD
.getFieldName(),
retweetCount);
builder
.withNormalizedMinEngagementField(
EarlybirdFieldConstant.NORMALIZED_REPLY_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD
.getFieldName(),
replyCount);
}
}
/**
* As seen in SEARCH-5617, we sometimes have incorrect createdAt. This method tries to fix them
* by extracting creation time from snowflake when possible.
*/
public static long fixCreatedAtTimeStampIfNecessary(long id, long createdAtMs) {
if (createdAtMs < VALID_CREATION_TIME_THRESHOLD_MILLIS
&& id > SnowflakeIdParser.SNOWFLAKE_ID_LOWER_BOUND) {
// This tweet has a snowflake ID, and we can extract timestamp from the ID.
ADJUSTED_BAD_CREATED_AT_COUNTER.increment();
return SnowflakeIdParser.getTimestampFromTweetId(id);
} else if (!SnowflakeIdParser.isTweetIDAndCreatedAtConsistent(id, createdAtMs)) {
LOG.error(
"Found inconsistent tweet ID and created at timestamp: [statusID={}], [createdAtMs={}]",
id, createdAtMs);
INCONSISTENT_TWEET_ID_AND_CREATED_AT_MS.increment();
}
return createdAtMs;
}
}

View File

@@ -1,99 +0,0 @@
package com.twitter.search.common.converter.earlybird;
import java.io.IOException;
import java.util.List;
import javax.annotation.concurrent.NotThreadSafe;
import com.google.common.base.Preconditions;
import com.twitter.common_internal.text.version.PenguinVersion;
import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents;
import com.twitter.search.common.relevance.entities.TwitterMessage;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.schema.base.Schema;
import com.twitter.search.common.schema.earlybird.EarlybirdCluster;
import com.twitter.search.common.schema.earlybird.EarlybirdThriftDocumentBuilder;
import com.twitter.search.common.schema.thriftjava.ThriftDocument;
import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent;
import com.twitter.search.common.schema.thriftjava.ThriftIndexingEventType;
/**
* CombinedIndexingConverter builds objects from TwitterMessage to ThriftVersionedEvent.
*
* It is used in tests and in offline jobs, so all data is available on the TwitterMessage. This
* means that we don't need to split up the ThriftVersionedEvents into basic events and update
* events, like we do in the realtime pipeline using the BasicIndexingConverter and the
* DelayedIndexingConverter.
*/
@NotThreadSafe
public class CombinedIndexingConverter {
private final EncodedFeatureBuilder featureBuilder;
private final Schema schema;
private final EarlybirdCluster cluster;
public CombinedIndexingConverter(Schema schema, EarlybirdCluster cluster) {
this.featureBuilder = new EncodedFeatureBuilder();
this.schema = schema;
this.cluster = cluster;
}
/**
* Converts a TwitterMessage to a Thrift representation.
*/
public ThriftVersionedEvents convertMessageToThrift(
TwitterMessage message,
boolean strict,
List<PenguinVersion> penguinVersions) throws IOException {
Preconditions.checkNotNull(message);
Preconditions.checkNotNull(penguinVersions);
ThriftVersionedEvents versionedEvents = new ThriftVersionedEvents()
.setId(message.getId());
ImmutableSchemaInterface schemaSnapshot = schema.getSchemaSnapshot();
for (PenguinVersion penguinVersion : penguinVersions) {
ThriftDocument document =
buildDocumentForPenguinVersion(schemaSnapshot, message, strict, penguinVersion);
ThriftIndexingEvent thriftIndexingEvent = new ThriftIndexingEvent()
.setDocument(document)
.setEventType(ThriftIndexingEventType.INSERT)
.setSortId(message.getId());
message.getFromUserTwitterId().map(thriftIndexingEvent::setUid);
versionedEvents.putToVersionedEvents(penguinVersion.getByteValue(), thriftIndexingEvent);
}
return versionedEvents;
}
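/**
 * Builds the document for one Penguin version by delegating to the
 * BasicIndexingConverter and DelayedIndexingConverter field builders.
 */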
private ThriftDocument buildDocumentForPenguinVersion(
ImmutableSchemaInterface schemaSnapshot,
TwitterMessage message,
boolean strict,
PenguinVersion penguinVersion) throws IOException {
EncodedFeatureBuilder.TweetFeatureWithEncodeFeatures tweetFeature =
featureBuilder.createTweetFeaturesFromTwitterMessage(
message, penguinVersion, schemaSnapshot);
EarlybirdThriftDocumentBuilder builder =
BasicIndexingConverter.buildBasicFields(message, schemaSnapshot, cluster, tweetFeature);
BasicIndexingConverter
.buildUserFields(builder, message, tweetFeature.versionedFeatures, penguinVersion);
BasicIndexingConverter.buildGeoFields(builder, message, tweetFeature.versionedFeatures);
DelayedIndexingConverter.buildURLFields(builder, message, tweetFeature.encodedFeatures);
BasicIndexingConverter.buildRetweetAndReplyFields(builder, message, strict);
BasicIndexingConverter.buildQuotesFields(builder, message);
BasicIndexingConverter.buildVersionedFeatureFields(builder, tweetFeature.versionedFeatures);
DelayedIndexingConverter.buildCardFields(builder, message, penguinVersion);
BasicIndexingConverter.buildAnnotationFields(builder, message);
BasicIndexingConverter.buildNormalizedMinEngagementFields(
builder, tweetFeature.encodedFeatures, cluster);
DelayedIndexingConverter.buildNamedEntityFields(builder, message);
BasicIndexingConverter.buildDirectedAtFields(builder, message);
return builder.build();
}
}

View File

@@ -1,594 +0,0 @@
package com.twitter.search.common.converter.earlybird;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import javax.annotation.Nullable;
import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import org.apache.commons.lang.StringUtils;
import org.apache.http.annotation.NotThreadSafe;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.common.text.token.TokenizedCharSequenceStream;
import com.twitter.common.text.util.TokenStreamSerializer;
import com.twitter.common_internal.text.version.PenguinVersion;
import com.twitter.cuad.ner.plain.thriftjava.NamedEntity;
import com.twitter.decider.Decider;
import com.twitter.search.common.constants.SearchCardType;
import com.twitter.search.common.decider.DeciderUtil;
import com.twitter.search.common.indexing.thriftjava.SearchCard2;
import com.twitter.search.common.indexing.thriftjava.ThriftExpandedUrl;
import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents;
import com.twitter.search.common.indexing.thriftjava.TwitterPhotoUrl;
import com.twitter.search.common.relevance.entities.TwitterMessage;
import com.twitter.search.common.relevance.entities.TwitterMessageUser;
import com.twitter.search.common.relevance.features.TweetTextFeatures;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.schema.base.Schema;
import com.twitter.search.common.schema.earlybird.EarlybirdEncodedFeatures;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants;
import com.twitter.search.common.schema.earlybird.EarlybirdThriftDocumentBuilder;
import com.twitter.search.common.schema.thriftjava.ThriftDocument;
import com.twitter.search.common.schema.thriftjava.ThriftField;
import com.twitter.search.common.schema.thriftjava.ThriftFieldData;
import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent;
import com.twitter.search.common.schema.thriftjava.ThriftIndexingEventType;
import com.twitter.search.common.util.lang.ThriftLanguageUtil;
import com.twitter.search.common.util.text.LanguageIdentifierHelper;
import com.twitter.search.common.util.text.NormalizerHelper;
import com.twitter.search.common.util.text.TokenizerHelper;
import com.twitter.search.common.util.text.TokenizerResult;
import com.twitter.search.common.util.text.TweetTokenStreamSerializer;
import com.twitter.service.spiderduck.gen.MediaTypes;
import com.twitter.search.common.metrics.SearchCounter;
/**
* Create and populate ThriftVersionedEvents from the URL data, card data, and named entities
* contained in a TwitterMessage. This data is delayed because these services take a few seconds
* to process tweets, and we want to send the basic data available in the BasicIndexingConverter as
* soon as possible, so we send the additional data a few seconds later, as an update.
*
* Prefer to add data and processing to the BasicIndexingConverter when possible. Only add data here
* if your data source _requires_ data from an external service AND the external service takes at
* least a few seconds to process new tweets.
*/
@NotThreadSafe
public class DelayedIndexingConverter {
private static final SearchCounter NUM_TWEETS_WITH_CARD_URL =
SearchCounter.export("tweets_with_card_url");
private static final SearchCounter NUM_TWEETS_WITH_NUMERIC_CARD_URI =
SearchCounter.export("tweets_with_numeric_card_uri");
private static final SearchCounter NUM_TWEETS_WITH_INVALID_CARD_URI =
SearchCounter.export("tweets_with_invalid_card_uri");
private static final SearchCounter TOTAL_URLS =
SearchCounter.export("total_urls_on_tweets");
private static final SearchCounter MEDIA_URLS_ON_TWEETS =
SearchCounter.export("media_urls_on_tweets");
private static final SearchCounter NON_MEDIA_URLS_ON_TWEETS =
SearchCounter.export("non_media_urls_on_tweets");
public static final String INDEX_URL_DESCRIPTION_AND_TITLE_DECIDER =
"index_url_description_and_title";
private static class ThriftDocumentWithEncodedTweetFeatures {
private final ThriftDocument document;
private final EarlybirdEncodedFeatures encodedFeatures;
public ThriftDocumentWithEncodedTweetFeatures(ThriftDocument document,
EarlybirdEncodedFeatures encodedFeatures) {
this.document = document;
this.encodedFeatures = encodedFeatures;
}
public ThriftDocument getDocument() {
return document;
}
public EarlybirdEncodedFeatures getEncodedFeatures() {
return encodedFeatures;
}
}
// The list of all the encoded_tweet_features flags that might be updated by this converter.
// No extended_encoded_tweet_features are updated (otherwise they should be in this list too).
private static final List<EarlybirdFieldConstants.EarlybirdFieldConstant> UPDATED_FLAGS =
Lists.newArrayList(
EarlybirdFieldConstants.EarlybirdFieldConstant.IS_OFFENSIVE_FLAG,
EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_LINK_FLAG,
EarlybirdFieldConstants.EarlybirdFieldConstant.IS_SENSITIVE_CONTENT,
EarlybirdFieldConstants.EarlybirdFieldConstant.TEXT_SCORE,
EarlybirdFieldConstants.EarlybirdFieldConstant.TWEET_SIGNATURE,
EarlybirdFieldConstants.EarlybirdFieldConstant.LINK_LANGUAGE,
EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_IMAGE_URL_FLAG,
EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_VIDEO_URL_FLAG,
EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_NEWS_URL_FLAG,
EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_EXPANDO_CARD_FLAG,
EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_MULTIPLE_MEDIA_FLAG,
EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_CARD_FLAG,
EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_VISIBLE_LINK_FLAG,
EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_CONSUMER_VIDEO_FLAG,
EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_PRO_VIDEO_FLAG,
EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_VINE_FLAG,
EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_PERISCOPE_FLAG,
EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_NATIVE_IMAGE_FLAG
);
private static final Logger LOG = LoggerFactory.getLogger(DelayedIndexingConverter.class);
private static final String AMPLIFY_CARD_NAME = "amplify";
private static final String PLAYER_CARD_NAME = "player";
private final EncodedFeatureBuilder featureBuilder = new EncodedFeatureBuilder();
private final Schema schema;
private final Decider decider;
public DelayedIndexingConverter(Schema schema, Decider decider) {
this.schema = schema;
this.decider = decider;
}
/**
* Converts the given message to two ThriftVersionedEvents instances: the first one is a feature
* update event for all link and card related flags, and the second one is the append event that
* might contain updates to all link and card related fields.
*
* We need to split the updates to fields and flags into two separate events because:
* - When a tweet is created, earlybirds get the "main" event, which does not have resolved URLs.
* - Then the earlybirds might get a feature update from the signal ingesters, marking the tweet
* as spam.
* - Then the ingesters resolve the URLs and send an update event. At this point, the ingesters
* need to send updates for link-related flags too (HAS_LINK_FLAG, etc.). And there are a few
* ways to do this:
* 1. Encode these flags into encoded_tweet_features and extended_encoded_tweet_features and
* add these fields to the update event. The problem is that earlybirds will then override
* the encoded_tweet_features and extended_encoded_tweet_features fields in the index for
* this tweet, which will override the feature update the earlybirds got earlier, which
* means that a spammy tweet might no longer be marked as spam in the index.
* 2. Send updates only for the flags that might've been updated by this converter. Since
* ThriftIndexingEvent already has a map of field -> value, it seems like the natural place
* to add these updates to. However, earlybirds can correctly process flag updates only if
* they come in a feature update event (PARTIAL_UPDATE). So we need to send the field
* updates in an OUT_OF_ORDER_UPDATE event, and the flag updates in a PARTIAL_UPDATE event.
*
* We need to send the feature update event before the append event to avoid issues like the one
* in SEARCH-30919 where tweets were returned from the card name field index before the HAS_CARD
* feature was updated to true.
*
* @param message The TwitterMessage to convert.
* @param penguinVersions The Penguin versions for which ThriftIndexingEvents should be created.
* @return An out of order update event for all link- and card-related fields and a feature update
* event for all link- and card-related flags.
*/
public List<ThriftVersionedEvents> convertMessageToOutOfOrderAppendAndFeatureUpdate(
TwitterMessage message, List<PenguinVersion> penguinVersions) {
Preconditions.checkNotNull(message);
Preconditions.checkNotNull(penguinVersions);
ThriftVersionedEvents featureUpdateVersionedEvents = new ThriftVersionedEvents();
ThriftVersionedEvents outOfOrderAppendVersionedEvents = new ThriftVersionedEvents();
ImmutableSchemaInterface schemaSnapshot = schema.getSchemaSnapshot();
for (PenguinVersion penguinVersion : penguinVersions) {
ThriftDocumentWithEncodedTweetFeatures documentWithEncodedFeatures =
buildDocumentForPenguinVersion(schemaSnapshot, message, penguinVersion);
ThriftIndexingEvent featureUpdateThriftIndexingEvent = new ThriftIndexingEvent();
featureUpdateThriftIndexingEvent.setEventType(ThriftIndexingEventType.PARTIAL_UPDATE);
featureUpdateThriftIndexingEvent.setUid(message.getId());
featureUpdateThriftIndexingEvent.setDocument(
buildFeatureUpdateDocument(documentWithEncodedFeatures.getEncodedFeatures()));
featureUpdateVersionedEvents.putToVersionedEvents(
penguinVersion.getByteValue(), featureUpdateThriftIndexingEvent);
ThriftIndexingEvent outOfOrderAppendThriftIndexingEvent = new ThriftIndexingEvent();
outOfOrderAppendThriftIndexingEvent.setDocument(documentWithEncodedFeatures.getDocument());
outOfOrderAppendThriftIndexingEvent.setEventType(ThriftIndexingEventType.OUT_OF_ORDER_APPEND);
message.getFromUserTwitterId().ifPresent(outOfOrderAppendThriftIndexingEvent::setUid);
outOfOrderAppendThriftIndexingEvent.setSortId(message.getId());
outOfOrderAppendVersionedEvents.putToVersionedEvents(
penguinVersion.getByteValue(), outOfOrderAppendThriftIndexingEvent);
}
featureUpdateVersionedEvents.setId(message.getId());
outOfOrderAppendVersionedEvents.setId(message.getId());
return Lists.newArrayList(featureUpdateVersionedEvents, outOfOrderAppendVersionedEvents);
}
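/**
 * Builds a document containing only the encoded-features flags this converter
 * may update, for delivery in a PARTIAL_UPDATE event.
 */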
private ThriftDocument buildFeatureUpdateDocument(EarlybirdEncodedFeatures encodedFeatures) {
ThriftDocument document = new ThriftDocument();
for (EarlybirdFieldConstants.EarlybirdFieldConstant flag : UPDATED_FLAGS) {
ThriftField field = new ThriftField();
field.setFieldConfigId(flag.getFieldId());
field.setFieldData(new ThriftFieldData().setIntValue(encodedFeatures.getFeatureValue(flag)));
document.addToFields(field);
}
return document;
}
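/**
 * Builds the delayed document for one Penguin version: URL, card, named entity,
 * and Space fields, along with the recomputed encoded features.
 */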
private ThriftDocumentWithEncodedTweetFeatures buildDocumentForPenguinVersion(
ImmutableSchemaInterface schemaSnapshot,
TwitterMessage message,
PenguinVersion penguinVersion) {
EarlybirdEncodedFeatures encodedFeatures = featureBuilder.createTweetFeaturesFromTwitterMessage(
message, penguinVersion, schemaSnapshot).encodedFeatures;
EarlybirdThriftDocumentBuilder builder = new EarlybirdThriftDocumentBuilder(
encodedFeatures,
null,
new EarlybirdFieldConstants(),
schemaSnapshot);
builder.setAddLatLonCSF(false);
builder.withID(message.getId());
buildFieldsFromUrlInfo(builder, message, penguinVersion, encodedFeatures);
buildCardFields(builder, message, penguinVersion);
buildNamedEntityFields(builder, message);
builder.withTweetSignature(message.getTweetSignature(penguinVersion));
buildSpaceAdminAndTitleFields(builder, message, penguinVersion);
builder.setAddEncodedTweetFeatures(false);
return new ThriftDocumentWithEncodedTweetFeatures(builder.build(), encodedFeatures);
}
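/** Adds a named entity field to the document for each named entity in the tweet. */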
public static void buildNamedEntityFields(
EarlybirdThriftDocumentBuilder builder, TwitterMessage message) {
for (NamedEntity namedEntity : message.getNamedEntities()) {
builder.withNamedEntity(namedEntity);
}
}
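/**
 * Builds all URL-derived fields: the resolved-links text, the URL and photo
 * fields, and the analyzed URL title/description fields.
 */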
private void buildFieldsFromUrlInfo(
EarlybirdThriftDocumentBuilder builder,
TwitterMessage message,
PenguinVersion penguinVersion,
EarlybirdEncodedFeatures encodedFeatures) {
// We need to update the RESOLVED_LINKS_TEXT_FIELD, since we might have new resolved URLs.
// Use the same logic as in EncodedFeatureBuilder.java.
TweetTextFeatures textFeatures = message.getTweetTextFeatures(penguinVersion);
String resolvedUrlsText = Joiner.on(" ").skipNulls().join(textFeatures.getResolvedUrlTokens());
builder.withResolvedLinksText(resolvedUrlsText);
buildURLFields(builder, message, encodedFeatures);
buildAnalyzedURLFields(builder, message, penguinVersion);
}
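/**
 * Indexes URL titles and descriptions (behind a decider), skipping consumer media
 * URLs whose title and description merely duplicate the tweet text.
 */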
private void buildAnalyzedURLFields(
EarlybirdThriftDocumentBuilder builder, TwitterMessage message, PenguinVersion penguinVersion
) {
TOTAL_URLS.add(message.getExpandedUrls().size());
if (DeciderUtil.isAvailableForRandomRecipient(
decider,
INDEX_URL_DESCRIPTION_AND_TITLE_DECIDER)) {
for (ThriftExpandedUrl expandedUrl : message.getExpandedUrls()) {
/*
Consumer Media URLs are added to the expanded URLs in
TweetEventParserHelper.addMediaEntitiesToMessage. These Twitter.com media URLs contain
the tweet text as the description and the title is "<User Name> on Twitter". This is
redundant information at best and misleading at worst. We will ignore these URLs to avoid
polluting the url_description and url_title field as well as saving space.
*/
if (!expandedUrl.isSetConsumerMedia() || !expandedUrl.isConsumerMedia()) {
NON_MEDIA_URLS_ON_TWEETS.increment();
if (expandedUrl.isSetDescription()) {
buildTweetTokenizerTokenizedField(builder,
EarlybirdFieldConstants.EarlybirdFieldConstant.URL_DESCRIPTION_FIELD.getFieldName(),
expandedUrl.getDescription(),
penguinVersion);
}
if (expandedUrl.isSetTitle()) {
buildTweetTokenizerTokenizedField(builder,
EarlybirdFieldConstants.EarlybirdFieldConstant.URL_TITLE_FIELD.getFieldName(),
expandedUrl.getTitle(),
penguinVersion);
}
} else {
MEDIA_URLS_ON_TWEETS.increment();
}
}
}
}
/**
* Build the URL based fields from a tweet.
*/
public static void buildURLFields(
EarlybirdThriftDocumentBuilder builder,
TwitterMessage message,
EarlybirdEncodedFeatures encodedFeatures
) {
Map<String, ThriftExpandedUrl> expandedUrlMap = message.getExpandedUrlMap();
for (ThriftExpandedUrl expandedUrl : expandedUrlMap.values()) {
if (expandedUrl.getMediaType() == MediaTypes.NATIVE_IMAGE) {
EncodedFeatureBuilder.addPhotoUrl(message, expandedUrl.getCanonicalLastHopUrl());
}
}
// Now add all Twitter photo links that came with the tweet's payload.
Map<Long, String> photos = message.getPhotoUrls();
List<TwitterPhotoUrl> photoURLs = new ArrayList<>();
if (photos != null) {
for (Map.Entry<Long, String> entry : photos.entrySet()) {
TwitterPhotoUrl photo = new TwitterPhotoUrl(entry.getKey());
String mediaUrl = entry.getValue();
if (mediaUrl != null) {
photo.setMediaUrl(mediaUrl);
}
photoURLs.add(photo);
}
}
try {
builder
.withURLs(Lists.newArrayList(expandedUrlMap.values()))
.withTwimgURLs(photoURLs);
} catch (IOException ioe) {
LOG.error("URL field creation threw an IOException", ioe);
}
if (encodedFeatures.isFlagSet(
EarlybirdFieldConstants.EarlybirdFieldConstant.IS_OFFENSIVE_FLAG)) {
builder.withOffensiveFlag();
}
if (encodedFeatures.isFlagSet(
EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_CONSUMER_VIDEO_FLAG)) {
builder.addFilterInternalFieldTerm(
EarlybirdFieldConstants.EarlybirdFieldConstant.CONSUMER_VIDEO_FILTER_TERM);
}
if (encodedFeatures.isFlagSet(
EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_PRO_VIDEO_FLAG)) {
builder.addFilterInternalFieldTerm(
EarlybirdFieldConstants.EarlybirdFieldConstant.PRO_VIDEO_FILTER_TERM);
}
if (encodedFeatures.isFlagSet(EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_VINE_FLAG)) {
builder.addFilterInternalFieldTerm(
EarlybirdFieldConstants.EarlybirdFieldConstant.VINE_FILTER_TERM);
}
if (encodedFeatures.isFlagSet(
EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_PERISCOPE_FLAG)) {
builder.addFilterInternalFieldTerm(
EarlybirdFieldConstants.EarlybirdFieldConstant.PERISCOPE_FILTER_TERM);
}
}
/**
* Build the card information inside ThriftIndexingEvent's fields.
*/
static void buildCardFields(EarlybirdThriftDocumentBuilder builder,
TwitterMessage message,
PenguinVersion penguinVersion) {
if (message.hasCard()) {
SearchCard2 card = buildSearchCardFromTwitterMessage(
message,
TweetTokenStreamSerializer.getTweetTokenStreamSerializer(),
penguinVersion);
buildCardFeatures(message.getId(), builder, card);
}
}
private static SearchCard2 buildSearchCardFromTwitterMessage(
TwitterMessage message,
TokenStreamSerializer streamSerializer,
PenguinVersion penguinVersion) {
SearchCard2 card = new SearchCard2();
card.setCardName(message.getCardName());
if (message.getCardDomain() != null) {
card.setCardDomain(message.getCardDomain());
}
if (message.getCardLang() != null) {
card.setCardLang(message.getCardLang());
}
if (message.getCardUrl() != null) {
card.setCardUrl(message.getCardUrl());
}
if (message.getCardTitle() != null && !message.getCardTitle().isEmpty()) {
String normalizedTitle = NormalizerHelper.normalize(
message.getCardTitle(), message.getLocale(), penguinVersion);
TokenizerResult result = TokenizerHelper.tokenizeTweet(
normalizedTitle, message.getLocale(), penguinVersion);
TokenizedCharSequenceStream tokenSeqStream = new TokenizedCharSequenceStream();
tokenSeqStream.reset(result.tokenSequence);
try {
card.setCardTitleTokenStream(streamSerializer.serialize(tokenSeqStream));
card.setCardTitleTokenStreamText(result.tokenSequence.toString());
} catch (IOException e) {
LOG.error("TwitterTokenStream serialization error! Could not serialize card title: "
+ result.tokenSequence);
card.unsetCardTitleTokenStream();
card.unsetCardTitleTokenStreamText();
}
}
if (message.getCardDescription() != null && !message.getCardDescription().isEmpty()) {
String normalizedDesc = NormalizerHelper.normalize(
message.getCardDescription(), message.getLocale(), penguinVersion);
TokenizerResult result = TokenizerHelper.tokenizeTweet(
normalizedDesc, message.getLocale(), penguinVersion);
TokenizedCharSequenceStream tokenSeqStream = new TokenizedCharSequenceStream();
tokenSeqStream.reset(result.tokenSequence);
try {
card.setCardDescriptionTokenStream(streamSerializer.serialize(tokenSeqStream));
card.setCardDescriptionTokenStreamText(result.tokenSequence.toString());
} catch (IOException e) {
LOG.error("TwitterTokenStream serialization error! Could not serialize card description: "
+ result.tokenSequence);
card.unsetCardDescriptionTokenStream();
card.unsetCardDescriptionTokenStreamText();
}
}
return card;
}
/**
* Builds card features.
*/
private static void buildCardFeatures(
long tweetId, EarlybirdThriftDocumentBuilder builder, SearchCard2 card) {
if (card == null) {
return;
}
builder
.withTokenStreamField(
EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_TITLE_FIELD.getFieldName(),
card.getCardTitleTokenStreamText(),
card.isSetCardTitleTokenStream() ? card.getCardTitleTokenStream() : null)
.withTokenStreamField(
EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_DESCRIPTION_FIELD.getFieldName(),
card.getCardDescriptionTokenStreamText(),
card.isSetCardDescriptionTokenStream() ? card.getCardDescriptionTokenStream() : null)
.withStringField(
EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_NAME_FIELD.getFieldName(),
card.getCardName())
.withIntField(
EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_TYPE_CSF_FIELD.getFieldName(),
SearchCardType.cardTypeFromStringName(card.getCardName()).getByteValue());
if (card.getCardLang() != null) {
builder.withStringField(
EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_LANG.getFieldName(),
card.getCardLang()).withIntField(
EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_LANG_CSF.getFieldName(),
ThriftLanguageUtil.getThriftLanguageOf(card.getCardLang()).getValue());
}
if (card.getCardDomain() != null) {
builder.withStringField(
EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_DOMAIN_FIELD.getFieldName(),
card.getCardDomain());
}
if (card.getCardUrl() != null) {
NUM_TWEETS_WITH_CARD_URL.increment();
if (card.getCardUrl().startsWith("card://")) {
String suffix = card.getCardUrl().replace("card://", "");
if (StringUtils.isNumeric(suffix)) {
NUM_TWEETS_WITH_NUMERIC_CARD_URI.increment();
builder.withLongField(
EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_URI_CSF.getFieldName(),
Long.parseLong(suffix));
LOG.debug(String.format(
"Good card URL for tweet %s: %s",
tweetId,
card.getCardUrl()));
} else {
NUM_TWEETS_WITH_INVALID_CARD_URI.increment();
LOG.debug(String.format(
"Card URL starts with \"card://\" but followed by non-numeric for tweet %s: %s",
tweetId,
card.getCardUrl()));
}
}
}
if (isCardVideo(card)) {
// Add into "internal" field so that this tweet is returned by filter:videos.
builder.addFacetSkipList(
EarlybirdFieldConstants.EarlybirdFieldConstant.VIDEO_LINKS_FIELD.getFieldName());
}
}
/**
* Determines if a card is a video.
*/
private static boolean isCardVideo(@Nullable SearchCard2 card) {
if (card == null) {
return false;
}
return AMPLIFY_CARD_NAME.equalsIgnoreCase(card.getCardName())
|| PLAYER_CARD_NAME.equalsIgnoreCase(card.getCardName());
}
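/** Builds the Space admin and Space title fields. */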
private void buildSpaceAdminAndTitleFields(
EarlybirdThriftDocumentBuilder builder,
TwitterMessage message,
PenguinVersion penguinVersion) {
buildSpaceAdminFields(builder, message.getSpaceAdmins(), penguinVersion);
// build the space title field.
buildTweetTokenizerTokenizedField(
builder,
EarlybirdFieldConstants.EarlybirdFieldConstant.SPACE_TITLE_FIELD.getFieldName(),
message.getSpaceTitle(),
penguinVersion);
}
private void buildSpaceAdminFields(
EarlybirdThriftDocumentBuilder builder,
Set<TwitterMessageUser> spaceAdmins,
PenguinVersion penguinVersion) {
for (TwitterMessageUser spaceAdmin : spaceAdmins) {
if (spaceAdmin.getScreenName().isPresent()) {
// build screen name (aka handle) fields.
String screenName = spaceAdmin.getScreenName().get();
String normalizedScreenName =
NormalizerHelper.normalizeWithUnknownLocale(screenName, penguinVersion);
builder.withStringField(
EarlybirdFieldConstants.EarlybirdFieldConstant.SPACE_ADMIN_FIELD.getFieldName(),
normalizedScreenName);
builder.withWhiteSpaceTokenizedScreenNameField(
EarlybirdFieldConstants
.EarlybirdFieldConstant.TOKENIZED_SPACE_ADMIN_FIELD.getFieldName(),
normalizedScreenName);
if (spaceAdmin.getTokenizedScreenName().isPresent()) {
builder.withCamelCaseTokenizedScreenNameField(
EarlybirdFieldConstants
.EarlybirdFieldConstant.CAMELCASE_TOKENIZED_SPACE_ADMIN_FIELD.getFieldName(),
screenName,
normalizedScreenName,
spaceAdmin.getTokenizedScreenName().get());
}
}
if (spaceAdmin.getDisplayName().isPresent()) {
buildTweetTokenizerTokenizedField(
builder,
EarlybirdFieldConstants
.EarlybirdFieldConstant.TOKENIZED_SPACE_ADMIN_DISPLAY_NAME_FIELD.getFieldName(),
spaceAdmin.getDisplayName().get(),
penguinVersion);
}
}
}
private void buildTweetTokenizerTokenizedField(
EarlybirdThriftDocumentBuilder builder,
String fieldName,
String text,
PenguinVersion penguinVersion) {
if (StringUtils.isNotEmpty(text)) {
Locale locale = LanguageIdentifierHelper
.identifyLanguage(text);
String normalizedText = NormalizerHelper.normalize(
text, locale, penguinVersion);
TokenizerResult result = TokenizerHelper
.tokenizeTweet(normalizedText, locale, penguinVersion);
TokenizedCharSequenceStream tokenSeqStream = new TokenizedCharSequenceStream();
tokenSeqStream.reset(result.tokenSequence);
TokenStreamSerializer streamSerializer =
TweetTokenStreamSerializer.getTweetTokenStreamSerializer();
try {
builder.withTokenStreamField(
fieldName,
result.tokenSequence.toString(),
streamSerializer.serialize(tokenSeqStream));
} catch (IOException e) {
LOG.error("TwitterTokenStream serialization error! Could not serialize: " + text);
}
}
}
}
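
The builders above share one normalize → tokenize → serialize pipeline. As a hedged sketch, that flow could be extracted into a standalone helper like the one below; `tokenizeAndSerialize` is a hypothetical name, and we assume `TokenStreamSerializer.serialize` returns the `byte[]` form stored in the thrift token-stream fields, as the setters above suggest.

```java
// A sketch of the shared pipeline: identify language, normalize, tokenize,
// then serialize the token sequence for storage in a thrift field.
// Hypothetical helper; uses the same utilities imported by the class above.
static byte[] tokenizeAndSerialize(String text, PenguinVersion penguinVersion)
    throws IOException {
  Locale locale = LanguageIdentifierHelper.identifyLanguage(text);
  String normalized = NormalizerHelper.normalize(text, locale, penguinVersion);
  TokenizerResult result = TokenizerHelper.tokenizeTweet(normalized, locale, penguinVersion);
  TokenizedCharSequenceStream tokenSeqStream = new TokenizedCharSequenceStream();
  tokenSeqStream.reset(result.tokenSequence);
  return TweetTokenStreamSerializer.getTweetTokenStreamSerializer().serialize(tokenSeqStream);
}
```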

View File

@ -1,531 +0,0 @@
package com.twitter.search.common.converter.earlybird;
import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.common.text.token.TokenizedCharSequence;
import com.twitter.common.text.token.TokenizedCharSequenceStream;
import com.twitter.common.text.util.TokenStreamSerializer;
import com.twitter.common_internal.text.version.PenguinVersion;
import com.twitter.search.common.indexing.thriftjava.Place;
import com.twitter.search.common.indexing.thriftjava.PotentialLocation;
import com.twitter.search.common.indexing.thriftjava.ProfileGeoEnrichment;
import com.twitter.search.common.indexing.thriftjava.ThriftExpandedUrl;
import com.twitter.search.common.indexing.thriftjava.VersionedTweetFeatures;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.relevance.entities.PotentialLocationObject;
import com.twitter.search.common.relevance.entities.TwitterMessage;
import com.twitter.search.common.relevance.features.FeatureSink;
import com.twitter.search.common.relevance.features.MutableFeatureNormalizers;
import com.twitter.search.common.relevance.features.RelevanceSignalConstants;
import com.twitter.search.common.relevance.features.TweetTextFeatures;
import com.twitter.search.common.relevance.features.TweetTextQuality;
import com.twitter.search.common.relevance.features.TweetUserFeatures;
import com.twitter.search.common.schema.base.FeatureConfiguration;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.schema.earlybird.EarlybirdEncodedFeatures;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
import com.twitter.search.common.util.lang.ThriftLanguageUtil;
import com.twitter.search.common.util.text.LanguageIdentifierHelper;
import com.twitter.search.common.util.text.NormalizerHelper;
import com.twitter.search.common.util.text.SourceNormalizer;
import com.twitter.search.common.util.text.TokenizerHelper;
import com.twitter.search.common.util.text.TokenizerResult;
import com.twitter.search.common.util.text.TweetTokenStreamSerializer;
import com.twitter.search.common.util.url.LinkVisibilityUtils;
import com.twitter.search.common.util.url.NativeVideoClassificationUtils;
import com.twitter.search.ingester.model.VisibleTokenRatioUtil;
/**
* EncodedFeatureBuilder helps to build encoded features for TwitterMessage.
*
* This is stateful, so it should only be used for one tweet at a time.
*/
public class EncodedFeatureBuilder {
private static final Logger LOG = LoggerFactory.getLogger(EncodedFeatureBuilder.class);
private static final SearchCounter NUM_TWEETS_WITH_INVALID_TWEET_ID_IN_PHOTO_URL =
SearchCounter.export("tweets_with_invalid_tweet_id_in_photo_url");
// TwitterTokenStream for converting a TokenizedCharSequence into a stream for serialization.
// This is stateful, so it should only be used for one tweet at a time.
private final TokenizedCharSequenceStream tokenSeqStream = new TokenizedCharSequenceStream();
// SUPPRESS CHECKSTYLE:OFF LineLength
private static final Pattern TWITTER_PHOTO_PERMA_LINK_PATTERN =
Pattern.compile("(?i:^(?:(?:https?\\:\\/\\/)?(?:www\\.)?)?twitter\\.com\\/(?:\\?[^#]+)?(?:#!?\\/?)?\\w{1,20}\\/status\\/(\\d+)\\/photo\\/\\d*$)");
private static final Pattern TWITTER_PHOTO_COPY_PASTE_LINK_PATTERN =
Pattern.compile("(?i:^(?:(?:https?\\:\\/\\/)?(?:www\\.)?)?twitter\\.com\\/(?:#!?\\/)?\\w{1,20}\\/status\\/(\\d+)\\/photo\\/\\d*$)");
// SUPPRESS CHECKSTYLE:ON LineLength
private static final VisibleTokenRatioUtil VISIBLE_TOKEN_RATIO = new VisibleTokenRatioUtil();
private static final Map<PenguinVersion, SearchCounter> SERIALIZE_FAILURE_COUNTERS_MAP =
Maps.newEnumMap(PenguinVersion.class);
static {
for (PenguinVersion penguinVersion : PenguinVersion.values()) {
SERIALIZE_FAILURE_COUNTERS_MAP.put(
penguinVersion,
SearchCounter.export(
"tokenstream_serialization_failure_" + penguinVersion.name().toLowerCase()));
}
}
public static class TweetFeatureWithEncodeFeatures {
public final VersionedTweetFeatures versionedFeatures;
public final EarlybirdEncodedFeatures encodedFeatures;
public final EarlybirdEncodedFeatures extendedEncodedFeatures;
public TweetFeatureWithEncodeFeatures(
VersionedTweetFeatures versionedFeatures,
EarlybirdEncodedFeatures encodedFeatures,
EarlybirdEncodedFeatures extendedEncodedFeatures) {
this.versionedFeatures = versionedFeatures;
this.encodedFeatures = encodedFeatures;
this.extendedEncodedFeatures = extendedEncodedFeatures;
}
}
/**
* Create tweet text features and the encoded features.
*
* @param message the tweet message
* @param penguinVersion the penguin version on which to base the features
* @param schemaSnapshot the schema associated with the features
* @return the text features and the encoded features
*/
public TweetFeatureWithEncodeFeatures createTweetFeaturesFromTwitterMessage(
TwitterMessage message,
PenguinVersion penguinVersion,
ImmutableSchemaInterface schemaSnapshot) {
VersionedTweetFeatures versionedTweetFeatures = new VersionedTweetFeatures();
// Write extendedPackedFeatures.
EarlybirdEncodedFeatures extendedEncodedFeatures =
createExtendedEncodedFeaturesFromTwitterMessage(message, penguinVersion, schemaSnapshot);
if (extendedEncodedFeatures != null) {
extendedEncodedFeatures
.writeExtendedFeaturesToVersionedTweetFeatures(versionedTweetFeatures);
}
setSourceAndNormalizedSource(
message.getStrippedSource(), versionedTweetFeatures, penguinVersion);
TweetTextFeatures textFeatures = message.getTweetTextFeatures(penguinVersion);
///////////////////////////////
// Add hashtags and mentions
textFeatures.getHashtags().forEach(versionedTweetFeatures::addToHashtags);
textFeatures.getMentions().forEach(versionedTweetFeatures::addToMentions);
///////////////////////////////
// Extract some extra information from the message text.
// Index stock symbols with $ prepended
textFeatures.getStocks().stream()
.filter(stock -> stock != null)
.forEach(stock -> versionedTweetFeatures.addToStocks(stock.toLowerCase()));
// Question marks
versionedTweetFeatures.setHasQuestionMark(textFeatures.hasQuestionMark());
// Smileys
versionedTweetFeatures.setHasPositiveSmiley(textFeatures.hasPositiveSmiley());
versionedTweetFeatures.setHasNegativeSmiley(textFeatures.hasNegativeSmiley());
TokenStreamSerializer streamSerializer =
TweetTokenStreamSerializer.getTweetTokenStreamSerializer();
TokenizedCharSequence tokenSeq = textFeatures.getTokenSequence();
tokenSeqStream.reset(tokenSeq);
int tokenPercent = VISIBLE_TOKEN_RATIO.extractAndNormalizeTokenPercentage(tokenSeqStream);
tokenSeqStream.reset(tokenSeq);
// Write packedFeatures.
EarlybirdEncodedFeatures encodedFeatures = createEncodedFeaturesFromTwitterMessage(
message, penguinVersion, schemaSnapshot, tokenPercent);
encodedFeatures.writeFeaturesToVersionedTweetFeatures(versionedTweetFeatures);
try {
versionedTweetFeatures.setTweetTokenStream(streamSerializer.serialize(tokenSeqStream));
versionedTweetFeatures.setTweetTokenStreamText(tokenSeq.toString());
} catch (IOException e) {
LOG.error("TwitterTokenStream serialization error! Could not serialize: "
+ tokenSeq.toString());
SERIALIZE_FAILURE_COUNTERS_MAP.get(penguinVersion).increment();
versionedTweetFeatures.unsetTweetTokenStream();
versionedTweetFeatures.unsetTweetTokenStreamText();
}
// User name features
if (message.getFromUserDisplayName().isPresent()) {
Locale locale = LanguageIdentifierHelper
.identifyLanguage(message.getFromUserDisplayName().get());
String normalizedDisplayName = NormalizerHelper.normalize(
message.getFromUserDisplayName().get(), locale, penguinVersion);
TokenizerResult result = TokenizerHelper
.tokenizeTweet(normalizedDisplayName, locale, penguinVersion);
tokenSeqStream.reset(result.tokenSequence);
try {
versionedTweetFeatures.setUserDisplayNameTokenStream(
streamSerializer.serialize(tokenSeqStream));
versionedTweetFeatures.setUserDisplayNameTokenStreamText(result.tokenSequence.toString());
} catch (IOException e) {
LOG.error("TwitterTokenStream serialization error! Could not serialize: "
+ message.getFromUserDisplayName().get());
SERIALIZE_FAILURE_COUNTERS_MAP.get(penguinVersion).increment();
versionedTweetFeatures.unsetUserDisplayNameTokenStream();
versionedTweetFeatures.unsetUserDisplayNameTokenStreamText();
}
}
String resolvedUrlsText = Joiner.on(" ").skipNulls().join(textFeatures.getResolvedUrlTokens());
versionedTweetFeatures.setNormalizedResolvedUrlText(resolvedUrlsText);
addPlace(message, versionedTweetFeatures, penguinVersion);
addProfileGeoEnrichment(message, versionedTweetFeatures, penguinVersion);
versionedTweetFeatures.setTweetSignature(message.getTweetSignature(penguinVersion));
return new TweetFeatureWithEncodeFeatures(
versionedTweetFeatures, encodedFeatures, extendedEncodedFeatures);
}
protected static void setSourceAndNormalizedSource(
String strippedSource,
VersionedTweetFeatures versionedTweetFeatures,
PenguinVersion penguinVersion) {
if (strippedSource != null && !strippedSource.isEmpty()) {
// Normalize source for the searchable field; replaces whitespace with underscores.
versionedTweetFeatures.setNormalizedSource(
SourceNormalizer.normalize(strippedSource, penguinVersion));
// source facet has simpler normalization.
Locale locale = LanguageIdentifierHelper.identifyLanguage(strippedSource);
versionedTweetFeatures.setSource(NormalizerHelper.normalizeKeepCase(
strippedSource, locale, penguinVersion));
}
}
/**
* Adds the given photo url to the thrift status if it is a twitter photo permalink.
* Returns true if this was indeed a twitter photo, false otherwise.
*/
public static boolean addPhotoUrl(TwitterMessage message, String photoPermalink) {
Matcher matcher = TWITTER_PHOTO_COPY_PASTE_LINK_PATTERN.matcher(photoPermalink);
if (!matcher.matches() || matcher.groupCount() < 1) {
matcher = TWITTER_PHOTO_PERMA_LINK_PATTERN.matcher(photoPermalink);
}
if (matcher.matches() && matcher.groupCount() == 1) {
// this is a native photo url which we need to store in a separate field
String idStr = matcher.group(1);
if (idStr != null) {
// idStr should be a valid tweet ID (and therefore, should fit into a Long), but we have
// tweets for which idStr is a long sequence of digits that does not fit into a Long.
try {
long photoStatusId = Long.parseLong(idStr);
message.addPhotoUrl(photoStatusId, null);
} catch (NumberFormatException e) {
LOG.warn("Found a tweet with a photo URL with an invalid tweet ID: " + message);
NUM_TWEETS_WITH_INVALID_TWEET_ID_IN_PHOTO_URL.increment();
}
}
return true;
}
return false;
}
private void addPlace(TwitterMessage message,
VersionedTweetFeatures versionedTweetFeatures,
PenguinVersion penguinVersion) {
String placeId = message.getPlaceId();
if (placeId == null) {
return;
}
// Tweet.Place.id and Tweet.Place.full_name are both required fields.
String placeFullName = message.getPlaceFullName();
Preconditions.checkNotNull(placeFullName, "Tweet.Place without full_name.");
Locale placeFullNameLocale = LanguageIdentifierHelper.identifyLanguage(placeFullName);
String normalizedPlaceFullName =
NormalizerHelper.normalize(placeFullName, placeFullNameLocale, penguinVersion);
String tokenizedPlaceFullName = StringUtils.join(
TokenizerHelper.tokenizeQuery(normalizedPlaceFullName, placeFullNameLocale, penguinVersion),
" ");
Place place = new Place(placeId, tokenizedPlaceFullName);
String placeCountryCode = message.getPlaceCountryCode();
if (placeCountryCode != null) {
Locale placeCountryCodeLocale = LanguageIdentifierHelper.identifyLanguage(placeCountryCode);
place.setCountryCode(
NormalizerHelper.normalize(placeCountryCode, placeCountryCodeLocale, penguinVersion));
}
versionedTweetFeatures.setTokenizedPlace(place);
}
private void addProfileGeoEnrichment(TwitterMessage message,
VersionedTweetFeatures versionedTweetFeatures,
PenguinVersion penguinVersion) {
List<PotentialLocationObject> potentialLocations = message.getPotentialLocations();
if (potentialLocations.isEmpty()) {
return;
}
List<PotentialLocation> thriftPotentialLocations = Lists.newArrayList();
for (PotentialLocationObject potentialLocation : potentialLocations) {
thriftPotentialLocations.add(potentialLocation.toThriftPotentialLocation(penguinVersion));
}
versionedTweetFeatures.setTokenizedProfileGeoEnrichment(
new ProfileGeoEnrichment(thriftPotentialLocations));
}
/** Returns the encoded features. */
public static EarlybirdEncodedFeatures createEncodedFeaturesFromTwitterMessage(
TwitterMessage message,
PenguinVersion penguinVersion,
ImmutableSchemaInterface schema,
int normalizedTokenPercentBucket) {
FeatureSink sink = new FeatureSink(schema);
// Static features
sink.setBooleanValue(EarlybirdFieldConstant.IS_RETWEET_FLAG, message.isRetweet())
.setBooleanValue(EarlybirdFieldConstant.IS_REPLY_FLAG, message.isReply())
.setBooleanValue(
EarlybirdFieldConstant.FROM_VERIFIED_ACCOUNT_FLAG, message.isUserVerified())
.setBooleanValue(
EarlybirdFieldConstant.FROM_BLUE_VERIFIED_ACCOUNT_FLAG, message.isUserBlueVerified())
.setBooleanValue(EarlybirdFieldConstant.IS_SENSITIVE_CONTENT, message.isSensitiveContent());
TweetTextFeatures textFeatures = message.getTweetTextFeatures(penguinVersion);
if (textFeatures != null) {
final FeatureConfiguration featureConfigNumHashtags = schema.getFeatureConfigurationByName(
EarlybirdFieldConstant.NUM_HASHTAGS.getFieldName());
final FeatureConfiguration featureConfigNumMentions = schema.getFeatureConfigurationByName(
EarlybirdFieldConstant.NUM_MENTIONS.getFieldName());
sink.setNumericValue(
EarlybirdFieldConstant.NUM_HASHTAGS,
Math.min(textFeatures.getHashtagsSize(), featureConfigNumHashtags.getMaxValue()))
.setNumericValue(
EarlybirdFieldConstant.NUM_MENTIONS,
Math.min(textFeatures.getMentionsSize(), featureConfigNumMentions.getMaxValue()))
.setBooleanValue(
EarlybirdFieldConstant.HAS_MULTIPLE_HASHTAGS_OR_TRENDS_FLAG,
TwitterMessage.hasMultipleHashtagsOrTrends(textFeatures))
.setBooleanValue(
EarlybirdFieldConstant.HAS_TREND_FLAG,
textFeatures.getTrendingTermsSize() > 0);
}
TweetTextQuality textQuality = message.getTweetTextQuality(penguinVersion);
if (textQuality != null) {
sink.setNumericValue(EarlybirdFieldConstant.TEXT_SCORE, textQuality.getTextScore());
sink.setBooleanValue(
EarlybirdFieldConstant.IS_OFFENSIVE_FLAG,
textQuality.hasBoolQuality(TweetTextQuality.BooleanQualityType.OFFENSIVE)
|| textQuality.hasBoolQuality(TweetTextQuality.BooleanQualityType.OFFENSIVE_USER)
// Note: if the JSON message's "possibly_sensitive" flag is set, we consider the tweet
// sensitive; it is currently filtered out in safe search mode via a hacky setup:
// earlybird does not create the _filter_sensitive_content field, only the
// _is_offensive field, which is used by the filter:safe operator
|| textQuality.hasBoolQuality(TweetTextQuality.BooleanQualityType.SENSITIVE));
if (textQuality.hasBoolQuality(TweetTextQuality.BooleanQualityType.SENSITIVE)) {
sink.setBooleanValue(EarlybirdFieldConstant.IS_SENSITIVE_CONTENT, true);
}
} else {
// We don't have a text score for whatever reason; set it to a sentinel value so the tweet
// won't be skipped by the scoring function.
sink.setNumericValue(EarlybirdFieldConstant.TEXT_SCORE,
RelevanceSignalConstants.UNSET_TEXT_SCORE_SENTINEL);
}
if (message.isSetLocale()) {
sink.setNumericValue(EarlybirdFieldConstant.LANGUAGE,
ThriftLanguageUtil.getThriftLanguageOf(message.getLocale()).getValue());
}
// User features
TweetUserFeatures userFeatures = message.getTweetUserFeatures(penguinVersion);
if (userFeatures != null) {
sink.setBooleanValue(EarlybirdFieldConstant.IS_USER_SPAM_FLAG, userFeatures.isSpam())
.setBooleanValue(EarlybirdFieldConstant.IS_USER_NSFW_FLAG, userFeatures.isNsfw())
.setBooleanValue(EarlybirdFieldConstant.IS_USER_BOT_FLAG, userFeatures.isBot());
}
if (message.getUserReputation() != TwitterMessage.DOUBLE_FIELD_NOT_PRESENT) {
sink.setNumericValue(EarlybirdFieldConstant.USER_REPUTATION,
(byte) message.getUserReputation());
} else {
sink.setNumericValue(EarlybirdFieldConstant.USER_REPUTATION,
RelevanceSignalConstants.UNSET_REPUTATION_SENTINEL);
}
sink.setBooleanValue(EarlybirdFieldConstant.IS_NULLCAST_FLAG, message.getNullcast());
// Realtime Ingestion does not write engagement features. Updater does that.
if (message.getNumFavorites() > 0) {
sink.setNumericValue(EarlybirdFieldConstant.FAVORITE_COUNT,
MutableFeatureNormalizers.BYTE_NORMALIZER.normalize(message.getNumFavorites()));
}
if (message.getNumRetweets() > 0) {
sink.setNumericValue(EarlybirdFieldConstant.RETWEET_COUNT,
MutableFeatureNormalizers.BYTE_NORMALIZER.normalize(message.getNumRetweets()));
}
if (message.getNumReplies() > 0) {
sink.setNumericValue(EarlybirdFieldConstant.REPLY_COUNT,
MutableFeatureNormalizers.BYTE_NORMALIZER.normalize(message.getNumReplies()));
}
sink.setNumericValue(EarlybirdFieldConstant.VISIBLE_TOKEN_RATIO, normalizedTokenPercentBucket);
EarlybirdEncodedFeatures encodedFeatures =
(EarlybirdEncodedFeatures) sink.getFeaturesForBaseField(
EarlybirdFieldConstant.ENCODED_TWEET_FEATURES_FIELD.getFieldName());
updateLinkEncodedFeatures(encodedFeatures, message);
return encodedFeatures;
}
/**
* Returns the extended encoded features.
*/
public static EarlybirdEncodedFeatures createExtendedEncodedFeaturesFromTwitterMessage(
TwitterMessage message,
PenguinVersion penguinVersion,
ImmutableSchemaInterface schema) {
FeatureSink sink = new FeatureSink(schema);
TweetTextFeatures textFeatures = message.getTweetTextFeatures(penguinVersion);
if (textFeatures != null) {
setExtendedEncodedFeatureIntValue(sink, schema,
EarlybirdFieldConstant.NUM_HASHTAGS_V2, textFeatures.getHashtagsSize());
setExtendedEncodedFeatureIntValue(sink, schema,
EarlybirdFieldConstant.NUM_MENTIONS_V2, textFeatures.getMentionsSize());
setExtendedEncodedFeatureIntValue(sink, schema,
EarlybirdFieldConstant.NUM_STOCKS, textFeatures.getStocksSize());
}
Optional<Long> referenceAuthorId = message.getReferenceAuthorId();
if (referenceAuthorId.isPresent()) {
setEncodedReferenceAuthorId(sink, referenceAuthorId.get());
}
return (EarlybirdEncodedFeatures) sink.getFeaturesForBaseField(
EarlybirdFieldConstant.EXTENDED_ENCODED_TWEET_FEATURES_FIELD.getFieldName());
}
/**
* Updates all URL-related features, based on the values stored in the given message.
*
* @param encodedFeatures The features to be updated.
* @param message The message.
*/
public static void updateLinkEncodedFeatures(
EarlybirdEncodedFeatures encodedFeatures, TwitterMessage message) {
if (message.getLinkLocale() != null) {
encodedFeatures.setFeatureValue(
EarlybirdFieldConstant.LINK_LANGUAGE,
ThriftLanguageUtil.getThriftLanguageOf(message.getLinkLocale()).getValue());
}
if (message.hasCard()) {
encodedFeatures.setFlag(EarlybirdFieldConstant.HAS_CARD_FLAG);
}
// Set HAS_IMAGE HAS_NEWS HAS_VIDEO etc. flags for expanded urls.
if (message.getExpandedUrlMapSize() > 0) {
encodedFeatures.setFlag(EarlybirdFieldConstant.HAS_LINK_FLAG);
for (ThriftExpandedUrl url : message.getExpandedUrlMap().values()) {
if (url.isSetMediaType()) {
switch (url.getMediaType()) {
case NATIVE_IMAGE:
encodedFeatures.setFlag(EarlybirdFieldConstant.HAS_IMAGE_URL_FLAG);
encodedFeatures.setFlag(EarlybirdFieldConstant.HAS_NATIVE_IMAGE_FLAG);
break;
case IMAGE:
encodedFeatures.setFlag(EarlybirdFieldConstant.HAS_IMAGE_URL_FLAG);
break;
case VIDEO:
encodedFeatures.setFlag(EarlybirdFieldConstant.HAS_VIDEO_URL_FLAG);
break;
case NEWS:
encodedFeatures.setFlag(EarlybirdFieldConstant.HAS_NEWS_URL_FLAG);
break;
case UNKNOWN:
break;
default:
throw new IllegalStateException("Unexpected enum value: " + url.getMediaType());
}
}
}
}
Set<String> canonicalLastHopUrlsStrings = message.getCanonicalLastHopUrls();
Set<String> expandedUrlsStrings = message.getExpandedUrls()
.stream()
.map(ThriftExpandedUrl::getExpandedUrl)
.collect(Collectors.toSet());
Set<String> expandedAndLastHopUrlsStrings = new HashSet<>();
expandedAndLastHopUrlsStrings.addAll(expandedUrlsStrings);
expandedAndLastHopUrlsStrings.addAll(canonicalLastHopUrlsStrings);
// Check both expanded and last hop url for consumer videos as consumer video urls are
// sometimes redirected to the url of the tweets containing the videos (SEARCH-42612).
if (NativeVideoClassificationUtils.hasConsumerVideo(expandedAndLastHopUrlsStrings)) {
encodedFeatures.setFlag(EarlybirdFieldConstant.HAS_CONSUMER_VIDEO_FLAG);
}
if (NativeVideoClassificationUtils.hasProVideo(canonicalLastHopUrlsStrings)) {
encodedFeatures.setFlag(EarlybirdFieldConstant.HAS_PRO_VIDEO_FLAG);
}
if (NativeVideoClassificationUtils.hasVine(canonicalLastHopUrlsStrings)) {
encodedFeatures.setFlag(EarlybirdFieldConstant.HAS_VINE_FLAG);
}
if (NativeVideoClassificationUtils.hasPeriscope(canonicalLastHopUrlsStrings)) {
encodedFeatures.setFlag(EarlybirdFieldConstant.HAS_PERISCOPE_FLAG);
}
if (LinkVisibilityUtils.hasVisibleLink(message.getExpandedUrls())) {
encodedFeatures.setFlag(EarlybirdFieldConstant.HAS_VISIBLE_LINK_FLAG);
}
}
private static void setExtendedEncodedFeatureIntValue(
FeatureSink sink,
ImmutableSchemaInterface schema,
EarlybirdFieldConstant field,
int value) {
boolean fieldInSchema = schema.hasField(field.getFieldName());
if (fieldInSchema) {
FeatureConfiguration featureConfig =
schema.getFeatureConfigurationByName(field.getFieldName());
sink.setNumericValue(field, Math.min(value, featureConfig.getMaxValue()));
}
}
private static void setEncodedReferenceAuthorId(FeatureSink sink, long referenceAuthorId) {
LongIntConverter.IntegerRepresentation ints =
LongIntConverter.convertOneLongToTwoInt(referenceAuthorId);
sink.setNumericValue(
EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_LEAST_SIGNIFICANT_INT, ints.leastSignificantInt);
sink.setNumericValue(
EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_MOST_SIGNIFICANT_INT, ints.mostSignificantInt);
}
}
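
As a quick illustration of what `addPhotoUrl` matches, here is the copy-paste permalink pattern applied to a made-up URL, written as if inside the class above (the URL and tweet ID are invented for the sketch):

```java
// Hypothetical input: a copy-pasted native photo permalink.
Matcher matcher = TWITTER_PHOTO_COPY_PASTE_LINK_PATTERN
    .matcher("https://twitter.com/jack/status/20/photo/1");
if (matcher.matches() && matcher.groupCount() == 1) {
  long photoStatusId = Long.parseLong(matcher.group(1)); // 20
}
```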

View File

@ -1,20 +0,0 @@
# Java library for docvalues and common stride field encoding utilities.
java_library(
sources = ["*.java"],
platform = "java8",
provides = artifact(
org = "com.twitter.search.common",
name = "encoding-docvalues",
repo = artifactory,
),
tags = ["bazel-compatible"],
dependencies = [
"3rdparty/jvm/com/google/guava",
"3rdparty/jvm/org/apache/lucene:lucene-analyzers-common",
"3rdparty/jvm/org/apache/lucene:lucene-core",
"3rdparty/jvm/org/apache/lucene:lucene-facet",
"3rdparty/jvm/org/apache/thrift:libthrift",
"src/java/com/twitter/search/common/schema/base",
"src/thrift/com/twitter/search/common:schema-java",
],
)

View File

@ -1,34 +0,0 @@
package com.twitter.search.common.encoding.docvalues;
public final class CSFTypeUtil {
private CSFTypeUtil() {
}
/**
* Converts an int into four big-endian bytes, stored into dest at the given value index.
*/
public static void convertToBytes(byte[] dest, int valueIndex, int value) {
int offset = valueIndex * Integer.BYTES;
dest[offset] = (byte) (value >>> 24);
dest[offset + 1] = (byte) (value >>> 16);
dest[offset + 2] = (byte) (value >>> 8);
dest[offset + 3] = (byte) value;
}
/**
* Converts bytes back into an int value. Inverse function of convertToBytes.
*/
public static int convertFromBytes(byte[] data, int startOffset, int valueIndex) {
// This should rarely happen: e.g., when we get a corrupt ThriftIndexingEvent, we insert a
// new blank Document. Such a document results in a length-0 BytesRef.
if (data.length == 0) {
return 0;
}
int offset = startOffset + valueIndex * Integer.BYTES;
return ((data[offset] & 0xFF) << 24)
| ((data[offset + 1] & 0xFF) << 16)
| ((data[offset + 2] & 0xFF) << 8)
| (data[offset + 3] & 0xFF);
}
}
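
A round trip through the two helpers makes the big-endian, four-bytes-per-value layout concrete (a sketch with made-up values):

```java
byte[] buffer = new byte[4 * Integer.BYTES];       // room for four packed ints
CSFTypeUtil.convertToBytes(buffer, 2, 0xCAFEBABE); // write value index 2
int roundTripped = CSFTypeUtil.convertFromBytes(buffer, 0, 2);
// roundTripped == 0xCAFEBABE: the big-endian encoding is lossless.
```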

View File

@ -1,17 +0,0 @@
# Java library for feature encoding and decoding utilities.
java_library(
sources = ["*.java"],
platform = "java8",
provides = artifact(
org = "com.twitter.search.common",
name = "encoding-features",
repo = artifactory,
),
tags = ["bazel-compatible"],
dependencies = [
"3rdparty/jvm/com/google/guava",
"3rdparty/jvm/org/apache/thrift:libthrift",
"src/java/com/twitter/search/common/schema/base",
"src/thrift/com/twitter/search/common:indexing-java",
],
)

View File

@ -1,73 +0,0 @@
package com.twitter.search.common.encoding.features;
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeMap;
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
/**
* Normalizes values to predefined bins.
* If the value to normalize is lower than the lowest bin defined, normalizes to Byte.MIN_VALUE.
*/
public class BinByteNormalizer extends ByteNormalizer {
private final TreeMap<Double, Byte> bins = Maps.newTreeMap();
private final TreeMap<Byte, Double> reverseBins = Maps.newTreeMap();
/**
* Constructs a normalizer using predefined bins.
* @param bins A mapping between the upper bound of a value and the bin it should normalize to.
* For example, providing a map with 2 entries, {5=>1, 10=>2}, will normalize as follows:
* values under 5: Byte.MIN_VALUE
* values between 5 and 10: 1
* values over 10: 2
*/
public BinByteNormalizer(final Map<Double, Byte> bins) {
Preconditions.checkNotNull(bins);
Preconditions.checkArgument(!bins.isEmpty(), "No bins provided");
Preconditions.checkArgument(hasIncreasingValues(bins));
this.bins.putAll(bins);
for (Map.Entry<Double, Byte> entry : bins.entrySet()) {
reverseBins.put(entry.getValue(), entry.getKey());
}
}
/**
* Checks that if key1 > key2, then val1 > val2 in the {@code map}.
*/
private static boolean hasIncreasingValues(final Map<Double, Byte> map) {
SortedSet<Double> orderedKeys = Sets.newTreeSet(map.keySet());
byte prev = Byte.MIN_VALUE;
for (Double key : orderedKeys) { // keep the key boxed to save an unboxing/reboxing cycle
byte cur = map.get(key);
if (cur <= prev) {
return false;
}
prev = cur;
}
return true;
}
@Override
public byte normalize(double val) {
Map.Entry<Double, Byte> lowerBound = bins.floorEntry(val);
return lowerBound == null
? Byte.MIN_VALUE
: lowerBound.getValue();
}
@Override
public double unnormLowerBound(byte norm) {
return reverseBins.get(reverseBins.floorKey(norm));
}
@Override
public double unnormUpperBound(byte norm) {
return norm == reverseBins.lastKey()
? Double.POSITIVE_INFINITY
: reverseBins.get(reverseBins.floorKey((byte) (1 + norm)));
}
}
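
The constructor javadoc's {5=>1, 10=>2} example, spelled out (a sketch):

```java
TreeMap<Double, Byte> bins = new TreeMap<>();
bins.put(5.0, (byte) 1);
bins.put(10.0, (byte) 2);
BinByteNormalizer normalizer = new BinByteNormalizer(bins);
normalizer.normalize(3);   // Byte.MIN_VALUE: below the lowest bin
normalizer.normalize(7);   // 1: falls in [5, 10)
normalizer.normalize(42);  // 2: at or above 10
```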

View File

@ -1,38 +0,0 @@
package com.twitter.search.common.encoding.features;
/**
* Interface for compressing unbounded float values to a signed byte. It includes both
* normalization of values and encoding of values in a byte.
*/
public abstract class ByteNormalizer {
public static byte intToUnsignedByte(int i) {
return (byte) i;
}
public static int unsignedByteToInt(byte b) {
return (int) b & 0xFF;
}
/**
* Returns the byte-compressed value of {@code val}.
*/
public abstract byte normalize(double val);
/**
* Returns a lower bound to the unnormalized range of {@code norm}.
*/
public abstract double unnormLowerBound(byte norm);
/**
* Returns an upper bound to the unnormalized range of {@code norm}.
*/
public abstract double unnormUpperBound(byte norm);
/**
* Returns true if the normalized value of {@code val} is different than the normalized value of
* {@code val - 1}
*/
public boolean changedNorm(double val) {
return normalize(val) != normalize(val - 1);
}
}

View File

@ -1,47 +0,0 @@
package com.twitter.search.common.encoding.features;
import com.google.common.base.Preconditions;
/**
* A byte normalizer that restricts the values to the given range before normalizing them.
*/
public class ClampByteNormalizer extends ByteNormalizer {
private final int minUnnormalizedValue;
private final int maxUnnormalizedValue;
/**
* Creates a new ClampByteNormalizer instance.
*
* @param minValue The smallest allowed unnormalized value.
* @param maxValue The largest allowed unnormalized value.
*/
public ClampByteNormalizer(int minUnnormalizedValue, int maxUnnormalizedValue) {
Preconditions.checkState(minUnnormalizedValue <= maxUnnormalizedValue);
Preconditions.checkState(minUnnormalizedValue >= 0);
Preconditions.checkState(maxUnnormalizedValue <= 255);
this.minUnnormalizedValue = minUnnormalizedValue;
this.maxUnnormalizedValue = maxUnnormalizedValue;
}
@Override
public byte normalize(double val) {
int adjustedValue = (int) val;
if (adjustedValue < minUnnormalizedValue) {
adjustedValue = minUnnormalizedValue;
}
if (adjustedValue > maxUnnormalizedValue) {
adjustedValue = maxUnnormalizedValue;
}
return ByteNormalizer.intToUnsignedByte(adjustedValue);
}
@Override
public double unnormLowerBound(byte norm) {
return ByteNormalizer.unsignedByteToInt(norm);
}
@Override
public double unnormUpperBound(byte norm) {
return ByteNormalizer.unsignedByteToInt(norm) + 1;
}
}
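
For example, clamping to [0, 100] (a sketch):

```java
ClampByteNormalizer clamp = new ClampByteNormalizer(0, 100);
clamp.normalize(-5);    // 0: clamped up to the minimum
clamp.normalize(42);    // 42, stored as an unsigned byte
clamp.normalize(1000);  // 100: clamped down to the maximum
clamp.unnormUpperBound(clamp.normalize(42)); // 43.0: exclusive upper bound
```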

View File

@ -1,58 +0,0 @@
package com.twitter.search.common.encoding.features;
/**
* Encodes multiple values (bytes or bits) into an integer.
*/
public class EncodedFeatures {
private int value;
public final void setSerializedValue(int val) {
this.value = val;
}
public final int getSerializedValue() {
return value;
}
// setByte is agnostic to signed / unsigned bytes.
protected final EncodedFeatures setByte(byte count, int bitshift, long inverseMask) {
value = (int) ((value & inverseMask) | ((count & 0xffL) << bitshift));
return this;
}
/**
* Sets the value but only if greater. setByteIfGreater assumes unsigned bytes.
*/
public final EncodedFeatures setByteIfGreater(byte newCount, int bitshift, long inversemask) {
if ((getByte(bitshift) & 0xff) < (newCount & 0xff)) {
setByte(newCount, bitshift, inversemask);
}
return this;
}
protected final int getByte(int bitshift) {
return (int) (((value & 0xffffffffL) >>> bitshift) & 0xffL);
}
protected final int getByteMasked(int bitshift, long mask) {
return (int) (((value & mask) >>> bitshift) & 0xffL);
}
protected final EncodedFeatures setBit(int bit, boolean flag) {
if (flag) {
value |= bit;
} else {
value &= ~bit;
}
return this;
}
protected final boolean getBit(int bit) {
return (value & bit) != 0;
}
@Override
public String toString() {
return String.format("%x", value);
}
}
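
EncodedFeatures is meant to be subclassed with named accessors over a fixed bit layout. A hedged sketch with an invented layout, a count in bits 0-7 and one flag in bit 8 (the class and layout are ours, for illustration only):

```java
// Hypothetical layout: bits 0-7 hold a count, bit 8 holds a boolean flag.
class ExampleEncodedFeatures extends EncodedFeatures {
  private static final int COUNT_SHIFT = 0;
  private static final long COUNT_INVERSE_MASK = ~0xFFL; // clears bits 0-7
  private static final int EXAMPLE_FLAG_BIT = 1 << 8;

  ExampleEncodedFeatures setCount(byte count) {
    setByte(count, COUNT_SHIFT, COUNT_INVERSE_MASK);
    return this;
  }

  int getCount() {
    return getByte(COUNT_SHIFT);
  }

  ExampleEncodedFeatures setExampleFlag(boolean flag) {
    setBit(EXAMPLE_FLAG_BIT, flag);
    return this;
  }

  boolean getExampleFlag() {
    return getBit(EXAMPLE_FLAG_BIT);
  }
}
```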

View File

@ -1,15 +0,0 @@
package com.twitter.search.common.encoding.features;
/**
* Interface for normalizing different feature values into an int. It provides a one-way
* translation analogous to com.twitter.search.common.encoding.features.ByteNormalizer and
* supports all the old normalizers. The difference is that we directly return the normalized
* int value (instead of converting from a byte).
*/
public interface IntNormalizer {
/**
* Returns the normalized value of {@code val}.
* The value may be byte-compressed or as-is depending on the normalizer type
*/
int normalize(double val);
}

View File

@ -1,159 +0,0 @@
package com.twitter.search.common.encoding.features;
import java.util.List;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.twitter.search.common.indexing.thriftjava.PackedFeatures;
import com.twitter.search.common.schema.base.FeatureConfiguration;
/**
* Class used to read/write integers encoded according to
* {@link com.twitter.search.common.schema.base.FeatureConfiguration}
*
* Implementations must override {@link #getInt(int pos)} and {@link #setInt(int pos, int value)}.
*/
public abstract class IntegerEncodedFeatures {
/**
* Returns the value at the given position.
*/
public abstract int getInt(int pos);
/**
* Sets the given value at the given position.
*/
public abstract void setInt(int pos, int value);
/**
* Gets the number of integers needed to hold all features.
* @return the number of integers to represent all features.
*/
public abstract int getNumInts();
/**
* Tests whether the given feature is true or non-zero. Useful for one-bit features.
* @param feature feature to examine
* @return true if feature is non-zero
*/
public boolean isFlagSet(FeatureConfiguration feature) {
return (getInt(feature.getValueIndex()) & feature.getBitMask()) != 0;
}
public IntegerEncodedFeatures setFlag(FeatureConfiguration feature) {
setInt(feature.getValueIndex(), getInt(feature.getValueIndex()) | feature.getBitMask());
return this;
}
public IntegerEncodedFeatures clearFlag(FeatureConfiguration feature) {
setInt(feature.getValueIndex(), getInt(feature.getValueIndex()) & feature.getInverseBitMask());
return this;
}
/**
* Sets a boolean flag.
*/
public IntegerEncodedFeatures setFlagValue(FeatureConfiguration feature, boolean value) {
if (value) {
setFlag(feature);
} else {
clearFlag(feature);
}
return this;
}
/**
* Get feature value
* @param feature feature to get
* @return the value of the feature
*/
public int getFeatureValue(FeatureConfiguration feature) {
return (getInt(feature.getValueIndex()) & feature.getBitMask())
>>> feature.getBitStartPosition();
}
/**
* Set feature value
* @param feature feature to modify
* @param value value to set.
*/
public IntegerEncodedFeatures setFeatureValue(FeatureConfiguration feature, int value) {
Preconditions.checkState(
value <= feature.getMaxValue(),
"Feature value, %s, is greater than the max value allowed for this feature. "
+ "Feature: %s, Max value: %s",
value, feature.getName(), feature.getMaxValue());
// Clear the value of the given feature in its int.
int temp = getInt(feature.getValueIndex()) & feature.getInverseBitMask();
// Set the new feature value. Applying the bit mask here ensures that other features in the
// same int are not modified by mistake.
temp |= (value << feature.getBitStartPosition()) & feature.getBitMask();
setInt(feature.getValueIndex(), temp);
return this;
}
/**
* Sets feature value if greater than current value
* @param feature feature to modify
* @param value new value
*/
public IntegerEncodedFeatures setFeatureValueIfGreater(FeatureConfiguration feature, int value) {
if (value > getFeatureValue(feature)) {
setFeatureValue(feature, value);
}
return this;
}
/**
* Increments a feature if it's not at its maximum value.
* @return whether the feature is incremented.
*/
public boolean incrementIfNotMaximum(FeatureConfiguration feature) {
int newValue = getFeatureValue(feature) + 1;
if (newValue <= feature.getMaxValue()) {
setFeatureValue(feature, newValue);
return true;
} else {
return false;
}
}
/**
* Copy these encoded features to a new PackedFeatures thrift struct.
*/
public PackedFeatures copyToPackedFeatures() {
return copyToPackedFeatures(new PackedFeatures());
}
/**
* Copy these encoded features to a PackedFeatures thrift struct.
*/
public PackedFeatures copyToPackedFeatures(PackedFeatures packedFeatures) {
Preconditions.checkNotNull(packedFeatures);
final List<Integer> integers = Lists.newArrayListWithCapacity(getNumInts());
for (int i = 0; i < getNumInts(); i++) {
integers.add(getInt(i));
}
packedFeatures.setDeprecated_featureConfigurationVersion(0);
packedFeatures.setFeatures(integers);
return packedFeatures;
}
/**
* Copy features from a packed features struct.
*/
public void readFromPackedFeatures(PackedFeatures packedFeatures) {
Preconditions.checkNotNull(packedFeatures);
List<Integer> ints = packedFeatures.getFeatures();
for (int i = 0; i < getNumInts(); i++) {
if (i < ints.size()) {
setInt(i, ints.get(i));
} else {
setInt(i, 0);
}
}
}
}
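
A minimal array-backed implementation is enough to exercise the FeatureConfiguration-driven accessors above (a sketch; the class name is ours):

```java
class ArrayBackedEncodedFeatures extends IntegerEncodedFeatures {
  private final int[] ints;

  ArrayBackedEncodedFeatures(int numInts) {
    this.ints = new int[numInts];
  }

  @Override
  public int getInt(int pos) {
    return ints[pos];
  }

  @Override
  public void setInt(int pos, int value) {
    ints[pos] = value;
  }

  @Override
  public int getNumInts() {
    return ints.length;
  }
}
```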

View File

@ -1,53 +0,0 @@
package com.twitter.search.common.encoding.features;
import com.google.common.base.Preconditions;
/**
* Normalizes values as follows:
* Positive numbers normalize to (1 + round(log_baseN(value))).
* Negative numbers throw.
* 0 will normalize to 0.
* The log base is 2 by default.
*/
public class LogByteNormalizer extends ByteNormalizer {
private static final double DEFAULT_BASE = 2;
private final double base;
private final double logBase;
public LogByteNormalizer(double base) {
Preconditions.checkArgument(base > 0);
this.base = base;
logBase = Math.log(base);
}
public LogByteNormalizer() {
this(DEFAULT_BASE);
}
@Override
public byte normalize(double val) {
if (val < 0) {
throw new IllegalArgumentException("Can't log-normalize negative value " + val);
} else if (val == 0) {
return 0;
} else {
long logVal = 1 + (long) Math.floor(Math.log(val) / logBase);
return logVal > Byte.MAX_VALUE ? Byte.MAX_VALUE : (byte) logVal;
}
}
@Override
public double unnormLowerBound(byte norm) {
return norm < 0
? Double.NEGATIVE_INFINITY
: Math.floor(Math.pow(base, norm - 1));
}
@Override
public double unnormUpperBound(byte norm) {
return norm == Byte.MAX_VALUE
? Double.POSITIVE_INFINITY
: Math.floor(Math.pow(base, norm));
}
}
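
With the default base of 2, the formula 1 + floor(log2(value)) gives, for example (a sketch):

```java
LogByteNormalizer log2 = new LogByteNormalizer();
log2.normalize(0);   // 0: zero normalizes to zero
log2.normalize(1);   // 1: 1 + floor(log2(1))
log2.normalize(16);  // 5: 1 + floor(log2(16))
log2.unnormLowerBound((byte) 5); // 16.0: floor(2^(5 - 1))
```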

View File

@ -1,51 +0,0 @@
package com.twitter.search.common.encoding.features;
import com.google.common.base.Preconditions;
/**
* A normalizer that normalizes a prediction score from a machine learning classifier, which
* ranges within [0.0, 1.0], to an integer value by multiplying by (10 ^ precision) and
* rounding. The lower the precision, the fewer bits it takes to encode the score.
* @see #precision
*
* This normalizer can also denormalize the normalized value from integer back to double using
* the same precision.
*/
public class PredictionScoreNormalizer {
private final int precision;
private final double normalizingBase;
public PredictionScoreNormalizer(int precision) {
this.precision = precision;
this.normalizingBase = Math.pow(10, this.precision);
}
/**
* Returns the normalized int value for prediction score {@code score} by multiplying
* by {@code normalizingBase} and rounding the result.
* @throws IllegalArgumentException when parameter {@code score} is not within [0.0, 1.0]
*/
public int normalize(double score) {
Preconditions.checkArgument(isScoreWithinRange(score));
return (int) Math.round(score * this.normalizingBase);
}
/**
* Converts the normalized int value back to a double score by dividing by {@code normalizingBase}
* @throws IllegalStateException when the denormalized value is not within [0.0, 1.0]
*/
public double denormalize(int normalizedScore) {
double denormalizedValue = normalizedScore / this.normalizingBase;
if (!isScoreWithinRange(denormalizedValue)) {
throw new IllegalStateException(
String.format("The denormalized value %s is not within [0.0, 1.0]", denormalizedValue)
);
}
return denormalizedValue;
}
private static boolean isScoreWithinRange(double score) {
return 0.0 <= score && score <= 1.0;
}
}
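
With precision 2, a score is stored as a rounded percentage (a sketch):

```java
PredictionScoreNormalizer scoreNormalizer = new PredictionScoreNormalizer(2);
int encoded = scoreNormalizer.normalize(0.8765);  // 88 == round(0.8765 * 100)
double decoded = scoreNormalizer.denormalize(88); // 0.88
```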

View File

@ -1,35 +0,0 @@
package com.twitter.search.common.encoding.features;
/**
* Normalizes using the logic described in {@link SingleBytePositiveFloatUtil}.
*/
public class SingleBytePositiveFloatNormalizer extends ByteNormalizer {
@Override
public byte normalize(double val) {
return SingleBytePositiveFloatUtil.toSingleBytePositiveFloat((float) val);
}
@Override
public double unnormLowerBound(byte norm) {
return SingleBytePositiveFloatUtil.toJavaFloat(norm);
}
/**
* Get the upper bound of the raw value for a normalized byte.
* @deprecated This is wrongly implemented, always use unnormLowerBound(),
* or use SmartIntegerNormalizer.
*/
@Override @Deprecated
public double unnormUpperBound(byte norm) {
return 1 + SingleBytePositiveFloatUtil.toJavaFloat(norm);
}
/**
* Returns the post-log2 unnormalized value. This is only used for some legacy Earlybird
* features and scoring functions.
*/
public double unnormAndLog2(byte norm) {
return SingleBytePositiveFloatUtil.toLog2Double(norm);
}
}

View File

@ -1,164 +0,0 @@
package com.twitter.search.common.encoding.features;
/**
* Util used to:
* - Encode a positive Java float into a single byte float
* - Decode a single byte into a positive Java float
*
* Configuration:
* - Exponent: higher 4 bits, base 10.
* - Mantissa: lower 4 bits, representing 1.0 to 9.0
* - Exponent bias is 1.
*
* Formula:
* MantissaValue * 10 ^ (Exponent - 1), with MantissaValue capped at 9.0
*
* Smallest float: 0.0 (0000 0000)
* Smallest positive float: 1.0 * 10^-1 (0000 0001)
* Largest float: 9.0 * 10^13 (1110 1111)
* Infinity: (1111 0000)
* NaN: (1111 1000)
*/
public final class SingleBytePositiveFloatUtil {
private SingleBytePositiveFloatUtil() { }
// 4 bits mantissa. Range [1.0, 10.0) is divided into 16 steps
public static final byte MAX_BYTE_VALUE = (byte) 0xEF;
public static final byte INFINITY = (byte) 0xF0;
public static final byte NOT_A_NUMBER = (byte) 0xF8;
private static final float STEP_SIZE = 1.0f;
private static final int EXPONENT_BIAS = 1;
private static final byte MIN_EXPONENT = -EXPONENT_BIAS;
private static final int MAX_EXPONENT = 14 - EXPONENT_BIAS;
private static final byte MANTISSA_MASK = 0x0F;
/**
* Converts the given float into a single byte floating point number.
* This is used in the updater and OK to be a bit slow.
*/
public static byte toSingleBytePositiveFloat(float f) {
if (f < 0) {
throw new UnsupportedOperationException(
"Cannot encode negative floats into SingleBytePostiveFloat.");
}
if (Float.compare(f, Float.POSITIVE_INFINITY) == 0) {
return INFINITY;
}
if (Float.compare(f, Float.NaN) == 0) {
return NOT_A_NUMBER;
}
int mantissa = 0;
int exponent = (int) Math.floor(Math.log10(f));
// Overflow (Number too large), just return the largest possible value
if (exponent > MAX_EXPONENT) {
return MAX_BYTE_VALUE;
}
// Underflow (Number too small), just return 0
if (exponent < MIN_EXPONENT) {
return 0;
}
int frac = Math.round(f / (float) Math.pow(10.0f, exponent) / STEP_SIZE);
mantissa = fractionToMantissaTable[frac];
return (byte) (((exponent + EXPONENT_BIAS) << 4) | mantissa);
}
/**
* Called in Earlybird per hit and needs to be fast.
*/
public static float toJavaFloat(byte b) {
return BYTE_TO_FLOAT_CONVERSION_TABLE[b & 0xff];
}
// Table used for converting the mantissa into a significand
private static float[] mantissaToFractionTable = {
// Decimal mantissa value
STEP_SIZE * 0, // 0000
STEP_SIZE * 1, // 0001
STEP_SIZE * 1, // 0010
STEP_SIZE * 2, // 0011
STEP_SIZE * 2, // 0100
STEP_SIZE * 3, // 0101
STEP_SIZE * 3, // 0110
STEP_SIZE * 4, // 0111
STEP_SIZE * 4, // 1000
STEP_SIZE * 5, // 1001
STEP_SIZE * 5, // 1010
STEP_SIZE * 6, // 1011
STEP_SIZE * 6, // 1100
STEP_SIZE * 7, // 1101
STEP_SIZE * 8, // 1110
STEP_SIZE * 9 // 1111
};
// Table used for converting fraction into mantissa.
// Reverse operation of the above
private static int[] fractionToMantissaTable = {
0, // 0
1, // 1
3, // 2
5, // 3
7, // 4
9, // 5
11, // 6
13, // 7
14, // 8
15, // 9
15, // 10 (Edge case: because we round the fraction, we can get 10 here.)
};
public static final byte LARGEST_FRACTION_UNDER_ONE = (byte) (toSingleBytePositiveFloat(1f) - 1);
/**
* Converts the given byte to java float.
*/
private static float toJavaFloatSlow(byte b) {
if (b == INFINITY) {
return Float.POSITIVE_INFINITY;
}
if ((b & 0xff) > (INFINITY & 0xff)) {
return Float.NaN;
}
int exponent = ((b & 0xff) >>> 4) - EXPONENT_BIAS;
int mantissa = b & MANTISSA_MASK;
return mantissaToFractionTable[mantissa] * (float) Math.pow(10.0f, exponent);
}
// Cached results from byte to float conversion
private static final float[] BYTE_TO_FLOAT_CONVERSION_TABLE = new float[256];
private static final double[] BYTE_TO_LOG2_CONVERSION_TABLE = new double[256];
private static final byte[] OLD_TO_NEW_BYTE_CONVERSION_TABLE = new byte[256];
static {
LogByteNormalizer normalizer = new LogByteNormalizer();
for (int i = 0; i < 256; i++) {
byte b = (byte) i;
BYTE_TO_FLOAT_CONVERSION_TABLE[i] = toJavaFloatSlow(b);
BYTE_TO_LOG2_CONVERSION_TABLE[i] =
0xff & normalizer.normalize(BYTE_TO_FLOAT_CONVERSION_TABLE[i]);
if (b == 0) {
OLD_TO_NEW_BYTE_CONVERSION_TABLE[i] = 0;
} else if (b > 0) {
OLD_TO_NEW_BYTE_CONVERSION_TABLE[i] =
toSingleBytePositiveFloat((float) normalizer.unnormLowerBound(b));
} else {
// should not get here.
OLD_TO_NEW_BYTE_CONVERSION_TABLE[i] = MAX_BYTE_VALUE;
}
}
}
/**
* Convert a normalized byte to the log2() version of its original value
*/
static double toLog2Double(byte b) {
return BYTE_TO_LOG2_CONVERSION_TABLE[b & 0xff];
}
}
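
The encoding is deliberately lossy: the 4-bit mantissa snaps the fraction to one of a few representable values. For example (a sketch):

```java
byte encoded = SingleBytePositiveFloatUtil.toSingleBytePositiveFloat(250f);
// 250 = 2.5 * 10^2; the 4-bit mantissa rounds the fraction 2.5 up to 3.0,
// so the decoded value is 3.0 * 10^2.
float decoded = SingleBytePositiveFloatUtil.toJavaFloat(encoded); // 300.0f
```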

View File

@ -1,150 +0,0 @@
package com.twitter.search.common.encoding.features;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
/**
* A smart integer normalizer that converts an integer of a known range to a small integer up to
* 8 bits long. This normalizer generates a boundary value array in the constructor as the buckets
* for different values.
* <p/>
* The normalized value has several nice properties:
* 1) it maintains the order of original value: if a > b, then normalize(a) > normalize(b).
* 2) the value 0 is always normalized to byte 0.
* 3) the normalized values are (almost) evenly distributed on the log scale
* 4) no waste in code space, all possible values representable by normalized bits are used,
* each corresponding to a different value.
*/
public class SmartIntegerNormalizer extends ByteNormalizer {
// The max value we want to support in this normalizer. If the input is larger than this value,
// it's normalized as if it's the maxValue.
private final int maxValue;
// Number of bits used for normalized value, the largest normalized value
// would be (1 << numBits) - 1.
private final int numBits;
// The inclusive lower bounds of all buckets. A normalized value k corresponds to original values
// in the inclusive-exclusive range
// [ boundaryValues[k], boundaryValues[k+1] )
private final int[] boundaryValues;
// The length of the boundaryValues array, or the number of buckets.
private final int length;
/**
* Construct a normalizer.
*
* @param maxValue max value it supports, must be positive. Anything larger than this
* is treated as maxValue.
* @param numBits number of bits to use for this normalization, between 1 and 8. The buckets
* give higher resolution to lower input values.
*/
public SmartIntegerNormalizer(int maxValue, int numBits) {
Preconditions.checkArgument(maxValue > 0);
Preconditions.checkArgument(numBits > 0 && numBits <= 8);
this.maxValue = maxValue;
this.numBits = numBits;
this.length = 1 << numBits;
this.boundaryValues = new int[length];
int index;
for (index = length - 1; index >= 0; --index) {
// values are evenly distributed on the log scale
int boundary = (int) Math.pow(maxValue, (double) index / length);
// we have more byte slots left than we have possible boundary values (buckets),
// just give consecutive boundary values to all remaining slots, starting from 0.
if (boundary <= index) {
break;
}
boundaryValues[index] = boundary;
}
if (index >= 0) {
for (int i = 1; i <= index; ++i) {
boundaryValues[i] = i;
}
}
boundaryValues[0] = 0; // the first one is always 0.
}
@Override
public byte normalize(double val) {
int intVal = (int) (val > maxValue ? maxValue : val);
return intToUnsignedByte(binarySearch(intVal, boundaryValues));
}
/**
* Returns the lower bound of the bucket represented by norm. This simply returns the boundary
* value indexed by the current norm.
*/
@Override
public double unnormLowerBound(byte norm) {
return boundaryValues[unsignedByteToInt(norm)];
}
/**
* Returns the upper bound of the bucket represented by norm. This returns the next boundary value
* minus 1. If norm represents the last bucket, it returns the maxValue.
*/
@Override
public double unnormUpperBound(byte norm) {
// if it's already the last possible normalized value, just return the corresponding last
// boundary value.
int intNorm = unsignedByteToInt(norm);
if (intNorm == length - 1) {
return maxValue;
}
return boundaryValues[intNorm + 1] - 1;
}
/**
* Does a binary search on the array and finds the index of the largest item that's no bigger than value.
*/
private static int binarySearch(int value, int[] array) {
// corner cases
if (value <= array[0]) {
return 0;
} else if (value >= array[array.length - 1]) {
return array.length - 1;
}
int left = 0;
int right = array.length - 1;
int pivot = (left + right) >> 1;
do {
int midVal = array[pivot];
if (value == midVal) {
break;
} else if (value > midVal) {
left = pivot;
} else {
right = pivot;
}
pivot = (left + right) >> 1;
} while (pivot != left);
return pivot;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder(String.format(
"Smart Integer Normalizer (numBits = %d, max = %d)\n",
this.numBits, this.maxValue));
for (int i = 0; i < this.length; i++) {
sb.append(String.format(
"[%2d] boundary = %6d, range [ %6d, %6d ), norm: %4d | %4d | %4d %s\n",
i, boundaryValues[i],
(int) unnormLowerBound(intToUnsignedByte(i)),
(int) unnormUpperBound(intToUnsignedByte(i)),
unsignedByteToInt(normalize(boundaryValues[i] - 1)),
unsignedByteToInt(normalize(boundaryValues[i])),
unsignedByteToInt(normalize(boundaryValues[i] + 1)),
i == boundaryValues[i] ? "*" : ""));
}
return sb.toString();
}
@VisibleForTesting
int[] getBoundaryValues() {
return boundaryValues;
}
}
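
Typical use is bucketing a bounded count, for instance counts up to one million packed into 8 bits (a sketch; the bounds follow the javadoc above):

```java
SmartIntegerNormalizer counts = new SmartIntegerNormalizer(1000000, 8);
byte norm = counts.normalize(12345);
double lower = counts.unnormLowerBound(norm); // inclusive lower bound of the bucket
double upper = counts.unnormUpperBound(norm); // upper bound of the bucket
// Order is preserved: normalize(a) >= normalize(b) whenever a >= b.
```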

View File

@ -1,25 +0,0 @@
java_library(
sources = ["*.java"],
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
"3rdparty/jvm/com/google/code/findbugs:jsr305",
"3rdparty/jvm/com/google/guava",
"3rdparty/jvm/com/google/inject:guice",
"3rdparty/jvm/org/apache/lucene:lucene-analyzers-common",
"3rdparty/jvm/org/apache/lucene:lucene-analyzers-smartcn",
"3rdparty/jvm/org/apache/lucene:lucene-core",
"3rdparty/jvm/org/apache/lucene:lucene-facet",
"3rdparty/jvm/org/apache/lucene:lucene-queries",
"3rdparty/jvm/org/apache/thrift:libthrift",
"3rdparty/jvm/org/apache/zookeeper:zookeeper-client",
"3rdparty/jvm/org/slf4j:slf4j-api",
"src/java/com/twitter/search/common/features",
"src/java/com/twitter/search/common/schema/base",
"src/java/com/twitter/search/common/schema/earlybird",
"src/java/com/twitter/search/common/util/analysis",
"src/java/com/twitter/search/queryparser",
"src/java/com/twitter/search/queryparser/query:core-query-nodes",
"src/java/com/twitter/search/queryparser/query/search:search-query-nodes",
],
)

View File

@ -1,27 +0,0 @@
package com.twitter.search.common.query;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.Query;
/**
* A class of utilities related to query boosts.
*/
public final class BoostUtils {
private BoostUtils() {
}
/**
* Wraps the given query into a BoostQuery, if {@code boost} is not equal to 1.0f.
*
* @param query The query.
* @param boost The boost.
* @return If {@code boost} is equal to 1.0f, then {@code query} is returned; otherwise,
* {@code query} is wrapped into a {@code BoostQuery} instance with the given boost.
*/
public static Query maybeWrapInBoostQuery(Query query, float boost) {
if (boost == 1.0f) {
return query;
}
return new BoostQuery(query, boost);
}
}
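
Usage (a sketch, using Lucene's TermQuery):

```java
// Assumes org.apache.lucene.index.Term and org.apache.lucene.search.TermQuery.
Query termQuery = new TermQuery(new Term("text", "earlybird"));
Query unchanged = BoostUtils.maybeWrapInBoostQuery(termQuery, 1.0f); // returned as-is
Query boosted = BoostUtils.maybeWrapInBoostQuery(termQuery, 2.5f);   // wrapped in a BoostQuery
```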

View File

@ -1,92 +0,0 @@
package com.twitter.search.common.query;
import java.util.Map;
import java.util.Set;
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import com.twitter.search.queryparser.query.BooleanQuery;
import com.twitter.search.queryparser.query.Conjunction;
import com.twitter.search.queryparser.query.Disjunction;
import com.twitter.search.queryparser.query.Operator;
import com.twitter.search.queryparser.query.Phrase;
import com.twitter.search.queryparser.query.Query;
import com.twitter.search.queryparser.query.QueryParserException;
import com.twitter.search.queryparser.query.QueryVisitor;
import com.twitter.search.queryparser.query.SpecialTerm;
import com.twitter.search.queryparser.query.Term;
import com.twitter.search.queryparser.query.annotation.Annotation;
/**
* Collects the nodes with a specified annotation type in the given query.
*/
public class CollectAnnotationsVisitor extends QueryVisitor<Boolean> {
protected final Annotation.Type type;
protected final Map<Query, Boolean> nodeToTypeMap = Maps.newIdentityHashMap();
public CollectAnnotationsVisitor(Annotation.Type type) {
this.type = Preconditions.checkNotNull(type);
}
@Override
public Boolean visit(Disjunction disjunction) throws QueryParserException {
return visitBooleanQuery(disjunction);
}
@Override
public Boolean visit(Conjunction conjunction) throws QueryParserException {
return visitBooleanQuery(conjunction);
}
@Override
public Boolean visit(Phrase phrase) throws QueryParserException {
return visitQuery(phrase);
}
@Override
public Boolean visit(Term term) throws QueryParserException {
return visitQuery(term);
}
@Override
public Boolean visit(Operator operator) throws QueryParserException {
return visitQuery(operator);
}
@Override
public Boolean visit(SpecialTerm special) throws QueryParserException {
return visitQuery(special);
}
protected boolean visitQuery(Query query) throws QueryParserException {
if (query.hasAnnotationType(type)) {
collectNode(query);
return true;
}
return false;
}
protected void collectNode(Query query) {
nodeToTypeMap.put(query, true);
}
protected boolean visitBooleanQuery(BooleanQuery query) throws QueryParserException {
boolean found = false;
if (query.hasAnnotationType(type)) {
collectNode(query);
found = true;
}
for (Query child : query.getChildren()) {
found |= child.accept(this);
}
return found;
}
public Set<Query> getNodes() {
return nodeToTypeMap.keySet();
}
}

View File

@ -1,89 +0,0 @@
package com.twitter.search.common.query;
import java.util.Map;
import java.util.Set;
import com.google.common.collect.Maps;
import com.twitter.search.queryparser.query.BooleanQuery;
import com.twitter.search.queryparser.query.Conjunction;
import com.twitter.search.queryparser.query.Disjunction;
import com.twitter.search.queryparser.query.Operator;
import com.twitter.search.queryparser.query.Phrase;
import com.twitter.search.queryparser.query.Query;
import com.twitter.search.queryparser.query.QueryParserException;
import com.twitter.search.queryparser.query.QueryVisitor;
import com.twitter.search.queryparser.query.SpecialTerm;
import com.twitter.search.queryparser.query.Term;
/**
* Collects the nodes with a specified query type in the given query.
*/
public class CollectQueryTypeVisitor extends QueryVisitor<Boolean> {
protected final Query.QueryType queryType;
protected final Map<Query, Boolean> nodeToTypeMap = Maps.newIdentityHashMap();
public CollectQueryTypeVisitor(Query.QueryType queryType) {
this.queryType = queryType;
}
@Override
public Boolean visit(Disjunction disjunction) throws QueryParserException {
return visitBooleanQuery(disjunction);
}
@Override
public Boolean visit(Conjunction conjunction) throws QueryParserException {
return visitBooleanQuery(conjunction);
}
@Override
public Boolean visit(Phrase phrase) throws QueryParserException {
return visitQuery(phrase);
}
@Override
public Boolean visit(Term term) throws QueryParserException {
return visitQuery(term);
}
@Override
public Boolean visit(Operator operator) throws QueryParserException {
return visitQuery(operator);
}
@Override
public Boolean visit(SpecialTerm special) throws QueryParserException {
return visitQuery(special);
}
public Set<Query> getCollectedNodes() {
return nodeToTypeMap.keySet();
}
protected boolean visitQuery(Query query) throws QueryParserException {
if (query.isTypeOf(queryType)) {
collectNode(query);
return true;
}
return false;
}
protected void collectNode(Query query) {
nodeToTypeMap.put(query, true);
}
protected boolean visitBooleanQuery(BooleanQuery query) throws QueryParserException {
boolean found = false;
if (query.isTypeOf(queryType)) {
collectNode(query);
found = true;
}
for (Query child : query.getChildren()) {
found |= child.accept(this);
}
return found;
}
}

View File

@ -1,13 +0,0 @@
package com.twitter.search.common.query;
import com.twitter.search.queryparser.query.annotation.Annotation;
/**
 * A visitor that collects the nodes that have the :v (variant) annotation.
*/
public class CollectVariantVisitor extends CollectAnnotationsVisitor {
public CollectVariantVisitor() {
super(Annotation.Type.VARIANT);
}
}

View File

@ -1,60 +0,0 @@
package com.twitter.search.common.query;
import java.io.IOException;
import java.util.Set;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.ConstantScoreScorer;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;
/**
* An abstract Weight implementation that can be used by all "filter" classes (Query instances that
* should not contribute to the overall query score).
*/
public abstract class DefaultFilterWeight extends Weight {
public DefaultFilterWeight(Query query) {
super(query);
}
@Override
public void extractTerms(Set<Term> terms) {
}
@Override
public Explanation explain(LeafReaderContext context, int doc) throws IOException {
Scorer scorer = scorer(context);
if ((scorer != null) && (scorer.iterator().advance(doc) == doc)) {
return Explanation.match(0f, "Match on id " + doc);
}
return Explanation.match(0f, "No match on id " + doc);
}
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
DocIdSetIterator disi = getDocIdSetIterator(context);
if (disi == null) {
return null;
}
return new ConstantScoreScorer(this, 0.0f, ScoreMode.COMPLETE_NO_SCORES, disi);
}
@Override
public boolean isCacheable(LeafReaderContext ctx) {
return false;
}
/**
* Returns the DocIdSetIterator over which the scorers created by this weight need to iterate.
*
* @param context The LeafReaderContext instance used to create the scorer.
*/
protected abstract DocIdSetIterator getDocIdSetIterator(LeafReaderContext context)
throws IOException;
}
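A sketch of a concrete subclass under toy assumptions: the iterator below matches every doc in the segment, whereas a real filter would return something cheap to skip through, such as a posting-list iterator or a cached bitset.

import java.io.IOException;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Query;

public class MatchAllFilterWeight extends DefaultFilterWeight {
  public MatchAllFilterWeight(Query query) {
    super(query);
  }

  @Override
  protected DocIdSetIterator getDocIdSetIterator(LeafReaderContext context) throws IOException {
    // Trivial iterator over every doc in this segment; real subclasses would
    // return an iterator backed by a posting list or a cached filter.
    return DocIdSetIterator.all(context.reader().maxDoc());
  }
}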

View File

@ -1,74 +0,0 @@
package com.twitter.search.common.query;
import java.io.IOException;
import java.util.Set;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.ConstantScoreScorer;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;
/**
 * A Lucene filter on top of a known doc ID.
 */
public class DocIdFilter extends Query {
private final int docid;
public DocIdFilter(int docid) {
this.docid = docid;
}
@Override
public Weight createWeight(
IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
return new Weight(this) {
@Override
public void extractTerms(Set<Term> terms) {
}
@Override
public Explanation explain(LeafReaderContext context, int doc) throws IOException {
Scorer scorer = scorer(context);
if ((scorer != null) && (scorer.iterator().advance(doc) == doc)) {
return Explanation.match(0f, "Match on id " + doc);
}
return Explanation.match(0f, "No match on id " + doc);
}
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
return new ConstantScoreScorer(this, 0.0f, scoreMode, new SingleDocDocIdSetIterator(docid));
}
@Override
public boolean isCacheable(LeafReaderContext ctx) {
return true;
}
};
}
@Override
public int hashCode() {
return docid;
}
@Override
public boolean equals(Object obj) {
if (!(obj instanceof DocIdFilter)) {
return false;
}
return docid == DocIdFilter.class.cast(obj).docid;
}
@Override
public String toString(String field) {
return "DOC_ID_FILTER[docId=" + docid + " + ]";
}
}
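A hedged usage sketch: intersecting an arbitrary query with DocIdFilter restricts scoring to one already-known Lucene doc ID, which is handy for explain/debug flows. The searcher and userQuery inputs are assumptions.

import java.io.IOException;

import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;

public final class DocIdFilterExample {
  /** Scores userQuery against a single, already-known Lucene doc ID. */
  public static TopDocs searchWithinDoc(IndexSearcher searcher, Query userQuery, int docId)
      throws IOException {
    BooleanQuery restricted = new BooleanQuery.Builder()
        .add(userQuery, BooleanClause.Occur.MUST)
        .add(new DocIdFilter(docId), BooleanClause.Occur.FILTER)
        .build();
    return searcher.search(restricted, 1);
  }
}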

View File

@ -1,48 +0,0 @@
package com.twitter.search.common.query;
/**
* When a hit (on a part of the query tree) occurs, this class is passed to HitAttributeCollector
* for collection.
*
* This implementation carries the following info:
* <ul>
* <li>The field that matched (the field ID is recorded)</li>
* <li>The query node that matched (the query node rank is recorded)</li>
* <li>The ID of the last doc that matched this query</li>
* </ul>
*
* Each IdentifiableQuery should be associated with one FieldRankHitInfo, which is passed to a
* HitAttributeCollector when a hit occurs.
*/
public class FieldRankHitInfo {
protected static final int UNSET_DOC_ID = -1;
private final int fieldId;
private final int rank;
private int docId = UNSET_DOC_ID;
public FieldRankHitInfo(int fieldId, int rank) {
this.fieldId = fieldId;
this.rank = rank;
}
public int getFieldId() {
return fieldId;
}
public int getRank() {
return rank;
}
public int getDocId() {
return docId;
}
public void setDocId(int docId) {
this.docId = docId;
}
public void resetDocId() {
this.docId = UNSET_DOC_ID;
}
}

View File

@ -1,205 +0,0 @@
package com.twitter.search.common.query;
import java.util.Collections;
import java.util.EnumSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.annotation.Nullable;
import com.google.common.base.Enums;
import com.google.common.base.Function;
import com.google.common.base.Functions;
import com.google.common.base.Predicates;
import com.google.common.collect.FluentIterable;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.search.common.schema.base.FieldWeightDefault;
import com.twitter.search.queryparser.query.Query;
import com.twitter.search.queryparser.query.QueryParserException;
import com.twitter.search.queryparser.query.annotation.Annotation;
import com.twitter.search.queryparser.query.annotation.FieldAnnotationUtils;
import com.twitter.search.queryparser.query.annotation.FieldNameWithBoost;
public final class FieldWeightUtil {
private static final Logger LOG = LoggerFactory.getLogger(FieldWeightUtil.class);
private FieldWeightUtil() {
}
/**
* Combines default field weight configuration with field annotations and returns a
* field-to-weight map.
*
* @param query The query whose annotations we will look into
* @param defaultFieldWeightMap field-to-FieldWeightDefault map
* @param enabledFieldWeightMap for optimization, this is the field-to-weight map inferred from
* the field-to-FieldWeightDefault map
 * @param fieldNameToTyped A function that can turn a string field name into a typed field
* @param <T> The typed field
*/
public static <T> ImmutableMap<T, Float> combineDefaultWithAnnotation(
Query query,
Map<T, FieldWeightDefault> defaultFieldWeightMap,
Map<T, Float> enabledFieldWeightMap,
Function<String, T> fieldNameToTyped) throws QueryParserException {
return combineDefaultWithAnnotation(
query,
defaultFieldWeightMap,
enabledFieldWeightMap,
fieldNameToTyped,
Collections.<MappableField, T>emptyMap(),
Functions.forMap(Collections.<T, String>emptyMap(), ""));
}
/**
* Combines default field weight configuration with field annotations and returns a
* field-to-weight map. Also maps generic mappable fields to field weight boosts and resolves them
*
* @param query The query whose annotations we will look into
* @param defaultFieldWeightMap field-to-FieldWeightDefault map
* @param enabledFieldWeightMap for optimization, this is the field-to-weight map inferred from
* the field-to-FieldWeightDefault map
 * @param fieldNameToTyped A function that can turn a string field name into a typed field
* @param mappableFieldMap mapping of mappable fields to the corresponding typed fields
* @param typedToFieldName A function that can turn a typed field into a string field name
* @param <T> The typed field
*
* Note: As a result of discussion on SEARCH-24029, we now allow replace and remove annotations
* on a single term. See http://go/fieldweight for info on field weight annotations.
*/
public static <T> ImmutableMap<T, Float> combineDefaultWithAnnotation(
Query query,
Map<T, FieldWeightDefault> defaultFieldWeightMap,
Map<T, Float> enabledFieldWeightMap,
Function<String, T> fieldNameToTyped,
Map<MappableField, T> mappableFieldMap,
Function<T, String> typedToFieldName) throws QueryParserException {
List<Annotation> fieldAnnotations = query.getAllAnnotationsOf(Annotation.Type.FIELD);
List<Annotation> mappableFieldAnnotations =
query.getAllAnnotationsOf(Annotation.Type.MAPPABLE_FIELD);
if (fieldAnnotations.isEmpty() && mappableFieldAnnotations.isEmpty()) {
return ImmutableMap.copyOf(enabledFieldWeightMap);
}
// Convert mapped fields to field annotations
Iterable<Annotation> fieldAnnotationsForMappedFields =
FluentIterable.from(mappableFieldAnnotations)
.transform(FieldWeightUtil.fieldAnnotationForMappableField(mappableFieldMap,
typedToFieldName))
.filter(Predicates.notNull());
Iterable<Annotation> annotations =
Iterables.concat(fieldAnnotationsForMappedFields, fieldAnnotations);
// Sanitize the field annotations first, remove the ones we don't know
// for REPLACE and REMOVE.
List<FieldNameWithBoost> sanitizedFields = Lists.newArrayList();
Set<FieldNameWithBoost.FieldModifier> seenModifierTypes =
EnumSet.noneOf(FieldNameWithBoost.FieldModifier.class);
for (Annotation annotation : annotations) {
FieldNameWithBoost fieldNameWithBoost = (FieldNameWithBoost) annotation.getValue();
T typedField = fieldNameToTyped.apply(fieldNameWithBoost.getFieldName());
FieldNameWithBoost.FieldModifier modifier = fieldNameWithBoost.getFieldModifier();
if (defaultFieldWeightMap.containsKey(typedField)) {
seenModifierTypes.add(modifier);
sanitizedFields.add(fieldNameWithBoost);
}
}
// Even if there is no mapping for a mapped annotation, a query that is replaced by an unknown
// mapping should not map to other fields, so we still need to detect a REPLACE annotation.
if (seenModifierTypes.isEmpty()
&& FieldAnnotationUtils.hasReplaceAnnotation(mappableFieldAnnotations)) {
seenModifierTypes.add(FieldNameWithBoost.FieldModifier.REPLACE);
}
boolean onlyHasReplace = seenModifierTypes.size() == 1
&& seenModifierTypes.contains(FieldNameWithBoost.FieldModifier.REPLACE);
// If we only have replace, start with an empty map, otherwise, start with all enabled fields.
Map<T, Float> actualMap = onlyHasReplace
? Maps.<T, Float>newLinkedHashMap()
: Maps.newLinkedHashMap(enabledFieldWeightMap);
// Go over all field annotations and apply them.
for (FieldNameWithBoost fieldAnnotation : sanitizedFields) {
T typedField = fieldNameToTyped.apply(fieldAnnotation.getFieldName());
FieldNameWithBoost.FieldModifier modifier = fieldAnnotation.getFieldModifier();
switch (modifier) {
case REMOVE:
actualMap.remove(typedField);
break;
case ADD:
case REPLACE:
if (fieldAnnotation.getBoost().isPresent()) {
actualMap.put(typedField, fieldAnnotation.getBoost().get());
} else {
// When annotation does not specify weight, use default weight
actualMap.put(
typedField,
defaultFieldWeightMap.get(typedField).getWeight());
}
break;
default:
throw new QueryParserException("Unknown field annotation type: " + fieldAnnotation);
}
}
return ImmutableMap.copyOf(actualMap);
}
public static ImmutableMap<String, Float> combineDefaultWithAnnotation(
Query query,
Map<String, FieldWeightDefault> defaultFieldWeightMap,
Map<String, Float> enabledFieldWeightMap) throws QueryParserException {
return combineDefaultWithAnnotation(
query, defaultFieldWeightMap, enabledFieldWeightMap, Functions.<String>identity());
}
/**
* Create an annotation of the FIELD type from annotations of the MAPPED_FIELD type
* @param mappableFieldMap mapping of mappable fields to the corresponding typed fields
* @param typedToFieldName A function that can turn a typed field into a string field name
* @param <T> The typed field
* @return an Annotation with the same modifier and boost for a FIELD as the incoming MAPPED_FIELD
* annotation
*/
private static <T> Function<Annotation, Annotation> fieldAnnotationForMappableField(
final Map<MappableField, T> mappableFieldMap,
final Function<T, String> typedToFieldName) {
return new Function<Annotation, Annotation>() {
@Nullable
@Override
public Annotation apply(Annotation mappableAnnotation) {
FieldNameWithBoost fieldNameWithBoost = (FieldNameWithBoost) mappableAnnotation.getValue();
MappableField mappedField =
Enums.getIfPresent(
MappableField.class,
fieldNameWithBoost.getFieldName().toUpperCase()).orNull();
T typedFieldName = mappableFieldMap.get(mappedField);
Annotation fieldAnnotation = null;
if (typedFieldName != null) {
String fieldName = typedToFieldName.apply(typedFieldName);
FieldNameWithBoost mappedFieldBoost =
new FieldNameWithBoost(
fieldName,
fieldNameWithBoost.getBoost(),
fieldNameWithBoost.getFieldModifier());
fieldAnnotation = Annotation.Type.FIELD.newInstance(mappedFieldBoost);
}
return fieldAnnotation;
}
};
}
}
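A minimal sketch of the entry point, assuming the caller already has a parsed query and the two default-weight maps. With no :f annotations the enabled map is returned unchanged; REMOVE drops a field, and ADD/REPLACE override its weight.

import java.util.Map;

import com.google.common.collect.ImmutableMap;

import com.twitter.search.common.schema.base.FieldWeightDefault;
import com.twitter.search.queryparser.query.Query;
import com.twitter.search.queryparser.query.QueryParserException;

public final class FieldWeightExample {
  /** Resolves the effective field-to-weight map for a parsed query. */
  public static ImmutableMap<String, Float> effectiveWeights(
      Query parsedQuery,
      Map<String, FieldWeightDefault> defaultFieldWeightMap,
      Map<String, Float> enabledFieldWeightMap) throws QueryParserException {
    return FieldWeightUtil.combineDefaultWithAnnotation(
        parsedQuery, defaultFieldWeightMap, enabledFieldWeightMap);
  }
}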

View File

@ -1,225 +0,0 @@
package com.twitter.search.common.query;
import java.io.IOException;
import java.util.Set;
import com.google.common.base.Preconditions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;
/**
* A pairing of a query and a filter. The hits traversal is driven by the query's DocIdSetIterator,
* and the filter is used only to do post-filtering. In other words, the filter is never used to
* find the next doc ID: it's only used to filter out the doc IDs returned by the query's
* DocIdSetIterator. This is useful when we need to have a conjunction between a query that can
* quickly iterate through doc IDs (eg. a posting list), and an expensive filter (eg. a filter based
* on the values stored in a CSF).
*
 * For example, let's say we want to build a query that returns all docs with at least 100 faves.
* 1. One option is to go with the [min_faves 100] query. This would be very expensive though,
* because this query would have to walk through every doc in the segment and for each one of
* them it would have to extract the number of faves from the forward index.
* 2. Another option is to go with a conjunction between this query and the HAS_ENGAGEMENT filter:
* (+[min_faves 100] +[cached_filter has_engagements]). The HAS_ENGAGEMENT filter could
* traverse the doc ID space faster (if it's backed by a posting list). But this approach would
* still be slow, because as soon as the HAS_ENGAGEMENT filter finds a doc ID, the conjunction
* scorer would trigger an advance(docID) call on the min_faves part of the query, which has
* the same problem as the first option.
* 3. Finally, a better option for this particular case would be to drive by the HAS_ENGAGEMENT
* filter (because it can quickly jump over all docs that do not have any engagement), and use
* the min_faves filter as a post-processing step, on a much smaller set of docs.
*/
public class FilteredQuery extends Query {
/**
* A doc ID predicate that determines if the given doc ID should be accepted.
*/
@FunctionalInterface
public static interface DocIdFilter {
/**
* Determines if the given doc ID should be accepted.
*/
boolean accept(int docId) throws IOException;
}
/**
* A factory for creating DocIdFilter instances based on a given LeafReaderContext instance.
*/
@FunctionalInterface
public static interface DocIdFilterFactory {
/**
* Returns a DocIdFilter instance for the given LeafReaderContext instance.
*/
DocIdFilter getDocIdFilter(LeafReaderContext context) throws IOException;
}
private static class FilteredQueryDocIdSetIterator extends DocIdSetIterator {
private final DocIdSetIterator queryScorerIterator;
private final DocIdFilter docIdFilter;
public FilteredQueryDocIdSetIterator(
DocIdSetIterator queryScorerIterator, DocIdFilter docIdFilter) {
this.queryScorerIterator = Preconditions.checkNotNull(queryScorerIterator);
this.docIdFilter = Preconditions.checkNotNull(docIdFilter);
}
@Override
public int docID() {
return queryScorerIterator.docID();
}
@Override
public int nextDoc() throws IOException {
int docId;
do {
docId = queryScorerIterator.nextDoc();
} while (docId != NO_MORE_DOCS && !docIdFilter.accept(docId));
return docId;
}
@Override
public int advance(int target) throws IOException {
int docId = queryScorerIterator.advance(target);
if (docId == NO_MORE_DOCS || docIdFilter.accept(docId)) {
return docId;
}
return nextDoc();
}
@Override
public long cost() {
return queryScorerIterator.cost();
}
}
private static class FilteredQueryScorer extends Scorer {
private final Scorer queryScorer;
private final DocIdFilter docIdFilter;
public FilteredQueryScorer(Weight weight, Scorer queryScorer, DocIdFilter docIdFilter) {
super(weight);
this.queryScorer = Preconditions.checkNotNull(queryScorer);
this.docIdFilter = Preconditions.checkNotNull(docIdFilter);
}
@Override
public int docID() {
return queryScorer.docID();
}
@Override
public float score() throws IOException {
return queryScorer.score();
}
@Override
public DocIdSetIterator iterator() {
return new FilteredQueryDocIdSetIterator(queryScorer.iterator(), docIdFilter);
}
@Override
public float getMaxScore(int upTo) throws IOException {
return queryScorer.getMaxScore(upTo);
}
}
private static class FilteredQueryWeight extends Weight {
private final Weight queryWeight;
private final DocIdFilterFactory docIdFilterFactory;
public FilteredQueryWeight(
FilteredQuery query, Weight queryWeight, DocIdFilterFactory docIdFilterFactory) {
super(query);
this.queryWeight = Preconditions.checkNotNull(queryWeight);
this.docIdFilterFactory = Preconditions.checkNotNull(docIdFilterFactory);
}
@Override
public void extractTerms(Set<Term> terms) {
queryWeight.extractTerms(terms);
}
@Override
public Explanation explain(LeafReaderContext context, int doc) throws IOException {
return queryWeight.explain(context, doc);
}
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
Scorer queryScorer = queryWeight.scorer(context);
if (queryScorer == null) {
return null;
}
return new FilteredQueryScorer(this, queryScorer, docIdFilterFactory.getDocIdFilter(context));
}
@Override
public boolean isCacheable(LeafReaderContext ctx) {
return queryWeight.isCacheable(ctx);
}
}
private final Query query;
private final DocIdFilterFactory docIdFilterFactory;
public FilteredQuery(Query query, DocIdFilterFactory docIdFilterFactory) {
this.query = Preconditions.checkNotNull(query);
this.docIdFilterFactory = Preconditions.checkNotNull(docIdFilterFactory);
}
public Query getQuery() {
return query;
}
@Override
public Query rewrite(IndexReader reader) throws IOException {
Query rewrittenQuery = query.rewrite(reader);
if (rewrittenQuery != query) {
return new FilteredQuery(rewrittenQuery, docIdFilterFactory);
}
return this;
}
@Override
public int hashCode() {
return query.hashCode() * 13 + docIdFilterFactory.hashCode();
}
@Override
public boolean equals(Object obj) {
if (!(obj instanceof FilteredQuery)) {
return false;
}
FilteredQuery filteredQuery = FilteredQuery.class.cast(obj);
return query.equals(filteredQuery.query)
&& docIdFilterFactory.equals(filteredQuery.docIdFilterFactory);
}
@Override
public String toString(String field) {
StringBuilder sb = new StringBuilder();
sb.append("FilteredQuery(")
.append(query)
.append(" -> ")
.append(docIdFilterFactory)
.append(")");
return sb.toString();
}
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
throws IOException {
Weight queryWeight = Preconditions.checkNotNull(query.createWeight(searcher, scoreMode, boost));
return new FilteredQueryWeight(this, queryWeight, docIdFilterFactory);
}
}
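A sketch of the min_faves example from the class comment, under the assumption of a plain Lucene numeric doc-values field named fave_count (Earlybird itself would read the count from a CSF): the cheap engagement query drives iteration, and the fave-count check runs only on its matches.

import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.Query;

public final class FilteredQueryExample {
  /** Drives iteration with the cheap query and post-filters on a per-doc numeric value. */
  public static Query minFaves(Query hasEngagementQuery, long minFaves) {
    return new FilteredQuery(hasEngagementQuery, context -> {
      // Hypothetical doc-values field; Earlybird would use its forward index instead.
      NumericDocValues faves = context.reader().getNumericDocValues("fave_count");
      return docId ->
          faves != null && faves.advanceExact(docId) && faves.longValue() >= minFaves;
    });
  }
}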

View File

@ -1,36 +0,0 @@
package com.twitter.search.common.query;
import java.io.IOException;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Weight;
public class FilteredScorer extends Scorer {
protected final Scorer inner;
public FilteredScorer(Weight weight, Scorer inner) {
super(weight);
this.inner = inner;
}
@Override
public float score() throws IOException {
return inner.score();
}
@Override
public int docID() {
return inner.docID();
}
@Override
public DocIdSetIterator iterator() {
return inner.iterator();
}
@Override
public float getMaxScore(int upTo) throws IOException {
return inner.getMaxScore(upTo);
}
}

View File

@ -1,101 +0,0 @@
package com.twitter.search.common.query;
import java.util.List;
import java.util.Map;
import java.util.function.BiFunction;
import java.util.function.Function;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.Query;
/**
 * Not thread-safe, but should be reused across different queries, unless the existing
 * collector is too small for a new, much larger serialized query.
*/
public class HitAttributeCollector {
private final List<FieldRankHitInfo> hitInfos = Lists.newArrayList();
private final BiFunction<Integer, Integer, FieldRankHitInfo> hitInfoSupplier;
private int docBase = 0;
public HitAttributeCollector() {
this.hitInfoSupplier = FieldRankHitInfo::new;
}
/**
 * Constructs a new {@code HitAttributeCollector} with the specified {@code FieldRankHitInfo}
* supplier.
*
* @param hitInfoSupplier function to supply a {@code FieldRankHitInfo} instance
*/
public HitAttributeCollector(BiFunction<Integer, Integer, FieldRankHitInfo> hitInfoSupplier) {
this.hitInfoSupplier = hitInfoSupplier;
}
/**
* Creates a new IdentifiableQuery for the given query, fieldId and rank, and "registers"
* the fieldId and the rank with this collector.
*
* @param query the query to be wrapped.
* @param fieldId the ID of the field to be searched.
* @param rank The rank of this query.
* @return A new IdentifiableQuery instance for the given query, fieldId and rank.
*/
public IdentifiableQuery newIdentifiableQuery(Query query, int fieldId, int rank) {
FieldRankHitInfo fieldRankHitInfo = hitInfoSupplier.apply(fieldId, rank);
hitInfos.add(fieldRankHitInfo);
return new IdentifiableQuery(query, fieldRankHitInfo, this);
}
public void clearHitAttributions(LeafReaderContext ctx, FieldRankHitInfo hitInfo) {
docBase = ctx.docBase;
hitInfo.resetDocId();
}
public void collectScorerAttribution(int docId, FieldRankHitInfo hitInfo) {
hitInfo.setDocId(docId + docBase);
}
/**
 * Should be called when a global hit occurs. Returns a hit attribution summary for the
 * whole query tree, restricted to the current doc (curDoc).
*
* @param docId docId passed in for checking against curDoc.
* @return Returns a map from node rank to a set of matching field IDs. This map does not contain
* entries for ranks that did not hit at all.
*/
public Map<Integer, List<Integer>> getHitAttribution(int docId) {
return getHitAttribution(docId, (fieldId) -> fieldId);
}
/**
 * Should be called when a global hit occurs. Returns a hit attribution summary for the
 * whole query tree, restricted to the current doc (curDoc).
*
* @param docId docId passed in for checking against curDoc.
* @param fieldIdFunc The mapping of field IDs to objects of type T.
* @return Returns a map from node rank to a set of matching objects (usually field IDs or names).
* This map does not contain entries for ranks that did not hit at all.
*/
public <T> Map<Integer, List<T>> getHitAttribution(int docId, Function<Integer, T> fieldIdFunc) {
int key = docId + docBase;
Map<Integer, List<T>> hitMap = Maps.newHashMap();
// Manually iterate through all hitInfos elements. It's slightly faster than using an Iterator.
for (FieldRankHitInfo hitInfo : hitInfos) {
if (hitInfo.getDocId() == key) {
int rank = hitInfo.getRank();
List<T> rankHits = hitMap.computeIfAbsent(rank, k -> Lists.newArrayList());
T fieldDescription = fieldIdFunc.apply(hitInfo.getFieldId());
rankHits.add(fieldDescription);
}
}
return hitMap;
}
}
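A minimal end-to-end sketch: wrap a leaf query via newIdentifiableQuery, run it through a searcher, then ask the collector which (rank, field) pairs matched a given doc. The term, field ID, and rank values are illustrative assumptions.

import java.util.List;
import java.util.Map;

import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;

public final class HitAttributeExample {
  public static Map<Integer, List<Integer>> hitFieldsForDoc(int docId) {
    HitAttributeCollector collector = new HitAttributeCollector();
    // Wrap the leaf query; fieldId 0 and rank 0 are illustrative.
    Query wrapped = collector.newIdentifiableQuery(
        new TermQuery(new Term("text", "earlybird")), 0, 0);
    // ... run `wrapped` through an IndexSearcher; while scoring, the collector
    // records the last doc ID on which each wrapped node matched ...
    return collector.getHitAttribution(docId); // rank -> field IDs that hit docId
  }
}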

View File

@ -1,102 +0,0 @@
package com.twitter.search.common.query;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import com.google.common.collect.Maps;
import com.twitter.search.queryparser.query.Query;
import static com.twitter.search.common.query.FieldRankHitInfo.UNSET_DOC_ID;
/**
* Generic helper class containing the data needed to set up and collect field hit attributions.
*/
public class HitAttributeHelper implements HitAttributeProvider {
private final HitAttributeCollector collector;
private final Function<Integer, String> fieldIdsToFieldNames;
// A mapping of query nodes to their rank IDs.
private final Map<Query, Integer> nodeToRankMap;
// This is meant to expand individual Query nodes into multiple ranks,
// for example, expanding a multi_term_disjunction to include a rank for each disjunction value.
private final Map<Query, List<Integer>> expandedNodeToRankMap;
// A single-entry cache for hit attribution, so we can reuse the most recent result. Only used
// when lastDocId matches.
private ThreadLocal<Map<Integer, List<String>>> lastHitAttrHolder = new ThreadLocal<>();
private ThreadLocal<Integer> lastDocIdHolder = ThreadLocal.withInitial(() -> UNSET_DOC_ID);
protected HitAttributeHelper(
HitAttributeCollector collector,
Function<Integer, String> fieldIdsToFieldNames,
Map<Query, Integer> nodeToRankMap,
Map<Query, List<Integer>> expandedNodeToRankMap) {
this.collector = collector;
this.fieldIdsToFieldNames = fieldIdsToFieldNames;
this.nodeToRankMap = nodeToRankMap;
this.expandedNodeToRankMap = expandedNodeToRankMap;
}
/**
* Constructs a new {@code HitAttributeHelper} with the specified {@code HitAttributeCollector}
* instance and fields.
*
* @param collector a collector instance
* @param fieldIdsToFieldNames a list of field names indexed by id
*/
public HitAttributeHelper(HitAttributeCollector collector, String[] fieldIdsToFieldNames) {
this(collector,
(fieldId) -> fieldIdsToFieldNames[fieldId],
Maps.newHashMap(),
Maps.newHashMap());
}
public HitAttributeCollector getFieldRankHitAttributeCollector() {
return collector;
}
/**
* Returns hit attribution information indexed by node rank
*
* @param docId the document id
* @return a mapping from the query's node rank to a list of field names that were hit.
*/
public Map<Integer, List<String>> getHitAttribution(int docId) {
// check cache first so we don't have to recompute the same thing.
if (lastDocIdHolder.get() == docId) {
return lastHitAttrHolder.get();
}
lastDocIdHolder.set(docId);
Map<Integer, List<String>> hitAttribution =
collector.getHitAttribution(docId, fieldIdsToFieldNames);
lastHitAttrHolder.set(hitAttribution);
return hitAttribution;
}
/**
 * Adds a new node and its respective rank to the helper's node-to-rank map.
 * If the node is already present, the existing rank is kept and the new rank is ignored.
*
* @param node the query node
* @param rank the rank associated with the node
*/
public void addNodeRank(Query node, int rank) {
// If there are two occurrences of the same term, map both to the first rank; they should get
// the same hits back.
if (!nodeToRankMap.containsKey(node)) {
nodeToRankMap.put(node, rank);
}
}
public Map<Query, Integer> getNodeToRankMap() {
return nodeToRankMap;
}
public Map<Query, List<Integer>> getExpandedNodeToRankMap() {
return expandedNodeToRankMap;
}
}

View File

@ -1,12 +0,0 @@
package com.twitter.search.common.query;
import java.util.List;
import java.util.Map;
/**
* The interface for objects that can provide hit attributes for a document.
*/
public interface HitAttributeProvider {
/** Returns the hit attributes for the given document. */
Map<Integer, List<String>> getHitAttribution(int docId);
}

View File

@ -1,378 +0,0 @@
package com.twitter.search.common.query;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BulkScorer;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.ConstantScoreScorer;
import org.apache.lucene.search.ConstantScoreWeight;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.DocIdSetBuilder;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.schema.base.IndexedNumericFieldSettings;
import com.twitter.search.common.util.analysis.LongTermAttributeImpl;
import com.twitter.search.common.util.analysis.SortableLongTermAttributeImpl;
import com.twitter.search.queryparser.query.QueryParserException;
/**
 * An extension of Lucene's MultiTermQuery that creates a disjunction of
 * long ID terms. Lucene rewrites the query based on the number of clauses
 * so that it executes as efficiently as possible.
*/
public class IDDisjunctionQuery extends MultiTermQuery {
private final List<Long> ids;
private final boolean useOrderPreservingEncoding;
/** Creates a new IDDisjunctionQuery instance. */
public IDDisjunctionQuery(List<Long> ids, String field, ImmutableSchemaInterface schemaSnapshot)
throws QueryParserException {
super(field);
this.ids = ids;
setRewriteMethod(new Rewrite());
if (!schemaSnapshot.hasField(field)) {
throw new QueryParserException(
"Tried to search a field which does not exist in schema: " + field);
}
IndexedNumericFieldSettings numericFieldSettings =
schemaSnapshot.getFieldInfo(field).getFieldType().getNumericFieldSettings();
if (numericFieldSettings == null) {
throw new QueryParserException("Requested id field is not numerical: " + field);
}
this.useOrderPreservingEncoding = numericFieldSettings.isUseSortableEncoding();
}
/**
 * Workaround for an issue where long terms are not valid UTF-8, so calling
 * toString on any TermQuery containing a long term may throw an exception.
*/
private class Rewrite extends RewriteMethod {
@Override
public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException {
Query result = new MultiTermQueryConstantScoreWrapper(
(IDDisjunctionQuery) query, useOrderPreservingEncoding);
return result;
}
}
@Override
protected TermsEnum getTermsEnum(final Terms terms, AttributeSource atts) throws IOException {
final Iterator<Long> it = this.ids.iterator();
final TermsEnum termsEnum = terms.iterator();
return new FilteredTermsEnum(termsEnum) {
private final BytesRef term = useOrderPreservingEncoding
? SortableLongTermAttributeImpl.newBytesRef()
: LongTermAttributeImpl.newBytesRef();
@Override protected AcceptStatus accept(BytesRef term) throws IOException {
return AcceptStatus.YES;
}
@Override public BytesRef next() throws IOException {
while (it.hasNext()) {
Long longTerm = it.next();
if (useOrderPreservingEncoding) {
SortableLongTermAttributeImpl.copyLongToBytesRef(term, longTerm);
} else {
LongTermAttributeImpl.copyLongToBytesRef(term, longTerm);
}
if (termsEnum.seekExact(term)) {
return term;
}
}
return null;
}
};
}
@Override
public String toString(String field) {
StringBuilder builder = new StringBuilder();
builder.append("IDDisjunction[").append(this.field).append(":");
for (Long id : this.ids) {
builder.append(id);
builder.append(",");
}
builder.setLength(builder.length() - 1);
builder.append("]");
return builder.toString();
}
private static class TermQueryWithToString extends TermQuery {
private final boolean useOrderPreservingEncoding;
public TermQueryWithToString(Term t, TermStates states, boolean useOrderPreservingEncoding) {
super(t, states);
this.useOrderPreservingEncoding = useOrderPreservingEncoding;
}
@Override
public String toString(String field) {
StringBuilder buffer = new StringBuilder();
if (!getTerm().field().equals(field)) {
buffer.append(getTerm().field());
buffer.append(":");
}
long longTerm;
BytesRef termBytes = getTerm().bytes();
if (useOrderPreservingEncoding) {
longTerm = SortableLongTermAttributeImpl.copyBytesRefToLong(termBytes);
} else {
longTerm = LongTermAttributeImpl.copyBytesRefToLong(termBytes);
}
buffer.append(longTerm);
return buffer.toString();
}
}
/**
* This class provides the functionality behind {@link MultiTermQuery#CONSTANT_SCORE_REWRITE}.
* It tries to rewrite per-segment as a boolean query that returns a constant score and otherwise
* fills a DocIdSet with matches and builds a Scorer on top of this DocIdSet.
*/
static final class MultiTermQueryConstantScoreWrapper extends Query {
// Disables the rewrite mode that scans all posting lists sequentially and collects the
// matching docs into a temporary DocIdSet. In Earlybird this mode is slower than a "normal"
// disjunctive BooleanQuery, due to early termination and the fact that everything is in memory.
private static final int BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD = 3000;
private static class TermAndState {
private final BytesRef term;
private final TermState state;
private final int docFreq;
private final long totalTermFreq;
TermAndState(BytesRef term, TermState state, int docFreq, long totalTermFreq) {
this.term = term;
this.state = state;
this.docFreq = docFreq;
this.totalTermFreq = totalTermFreq;
}
}
private static class WeightOrDocIdSet {
private final Weight weight;
private final DocIdSet docIdSet;
WeightOrDocIdSet(Weight weight) {
this.weight = Objects.requireNonNull(weight);
this.docIdSet = null;
}
WeightOrDocIdSet(DocIdSet docIdSet) {
this.docIdSet = docIdSet;
this.weight = null;
}
}
protected final IDDisjunctionQuery query;
private final boolean useOrderPreservingEncoding;
/**
* Wrap a {@link MultiTermQuery} as a Filter.
*/
protected MultiTermQueryConstantScoreWrapper(
IDDisjunctionQuery query,
boolean useOrderPreservingEncoding) {
this.query = query;
this.useOrderPreservingEncoding = useOrderPreservingEncoding;
}
@Override
public String toString(String field) {
// query.toString should be ok for the filter, too, if the query boost is 1.0f
return query.toString(field);
}
@Override
public boolean equals(Object obj) {
if (!(obj instanceof MultiTermQueryConstantScoreWrapper)) {
return false;
}
return query.equals(MultiTermQueryConstantScoreWrapper.class.cast(obj).query);
}
@Override
public int hashCode() {
return query == null ? 0 : query.hashCode();
}
/** Returns the field name for this query */
public String getField() {
return query.getField();
}
private List<Long> getIDs() {
return query.ids;
}
@Override
public Weight createWeight(
final IndexSearcher searcher,
final ScoreMode scoreMode,
final float boost) throws IOException {
return new ConstantScoreWeight(this, boost) {
/** Try to collect terms from the given terms enum and return true iff all
* terms could be collected. If {@code false} is returned, the enum is
* left positioned on the next term. */
private boolean collectTerms(LeafReaderContext context,
TermsEnum termsEnum,
List<TermAndState> terms) throws IOException {
final int threshold = Math.min(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD,
BooleanQuery.getMaxClauseCount());
for (int i = 0; i < threshold; ++i) {
final BytesRef term = termsEnum.next();
if (term == null) {
return true;
}
TermState state = termsEnum.termState();
terms.add(new TermAndState(BytesRef.deepCopyOf(term),
state,
termsEnum.docFreq(),
termsEnum.totalTermFreq()));
}
return termsEnum.next() == null;
}
/**
* On the given leaf context, try to either rewrite to a disjunction if
* there are few terms, or build a DocIdSet containing matching docs.
*/
private WeightOrDocIdSet rewrite(LeafReaderContext context)
throws IOException {
final Terms terms = context.reader().terms(query.getField());
if (terms == null) {
// field does not exist
return new WeightOrDocIdSet((DocIdSet) null);
}
final TermsEnum termsEnum = query.getTermsEnum(terms);
assert termsEnum != null;
PostingsEnum docs = null;
final List<TermAndState> collectedTerms = new ArrayList<>();
if (collectTerms(context, termsEnum, collectedTerms)) {
// build a boolean query
BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
for (TermAndState t : collectedTerms) {
final TermStates termStates = new TermStates(searcher.getTopReaderContext());
termStates.register(t.state, context.ord, t.docFreq, t.totalTermFreq);
final Term term = new Term(query.getField(), t.term);
bqBuilder.add(
new TermQueryWithToString(term, termStates, useOrderPreservingEncoding),
Occur.SHOULD);
}
Query q = BoostUtils.maybeWrapInBoostQuery(
new ConstantScoreQuery(bqBuilder.build()), score());
return new WeightOrDocIdSet(
searcher.rewrite(q).createWeight(searcher, scoreMode, boost));
}
// Too many terms: go back to the terms we already collected and start building
// the DocIdSet
DocIdSetBuilder builder = new DocIdSetBuilder(context.reader().maxDoc());
if (!collectedTerms.isEmpty()) {
TermsEnum termsEnum2 = terms.iterator();
for (TermAndState t : collectedTerms) {
termsEnum2.seekExact(t.term, t.state);
docs = termsEnum2.postings(docs, PostingsEnum.NONE);
builder.add(docs);
}
}
// Then keep filling the DocIdSet with remaining terms
do {
docs = termsEnum.postings(docs, PostingsEnum.NONE);
builder.add(docs);
} while (termsEnum.next() != null);
return new WeightOrDocIdSet(builder.build());
}
private Scorer scorer(DocIdSet set) throws IOException {
if (set == null) {
return null;
}
final DocIdSetIterator disi = set.iterator();
if (disi == null) {
return null;
}
return new ConstantScoreScorer(this, score(), ScoreMode.COMPLETE_NO_SCORES, disi);
}
@Override
public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
final WeightOrDocIdSet weightOrDocIdSet = rewrite(context);
if (weightOrDocIdSet.weight != null) {
return weightOrDocIdSet.weight.bulkScorer(context);
} else {
final Scorer scorer = scorer(weightOrDocIdSet.docIdSet);
if (scorer == null) {
return null;
}
return new DefaultBulkScorer(scorer);
}
}
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
final WeightOrDocIdSet weightOrDocIdSet = rewrite(context);
if (weightOrDocIdSet.weight != null) {
return weightOrDocIdSet.weight.scorer(context);
} else {
return scorer(weightOrDocIdSet.docIdSet);
}
}
@Override
public void extractTerms(Set<Term> terms) {
terms.addAll(getIDs()
.stream()
.map(id -> new Term(getField(), LongTermAttributeImpl.copyIntoNewBytesRef(id)))
.collect(Collectors.toSet()));
}
@Override
public boolean isCacheable(LeafReaderContext ctx) {
return false;
}
};
}
}
}
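A hedged construction sketch: the field name "id" is an assumption and must be a numeric long field in the supplied schema snapshot, otherwise the constructor throws QueryParserException.

import com.google.common.collect.ImmutableList;

import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.queryparser.query.QueryParserException;

public final class IdDisjunctionExample {
  public static IDDisjunctionQuery tweetIds(ImmutableSchemaInterface schemaSnapshot)
      throws QueryParserException {
    // "id" is a hypothetical numeric long field; the schema decides its encoding.
    return new IDDisjunctionQuery(ImmutableList.of(20L, 21L, 22L), "id", schemaSnapshot);
  }
}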

View File

@ -1,77 +0,0 @@
package com.twitter.search.common.query;
import java.io.IOException;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;
/**
* Query implementation adds attribute collection support for an underlying query.
*/
public class IdentifiableQuery extends Query {
protected final Query inner;
private final FieldRankHitInfo queryId;
private final HitAttributeCollector attrCollector;
public IdentifiableQuery(Query inner, FieldRankHitInfo queryId,
HitAttributeCollector attrCollector) {
this.inner = Preconditions.checkNotNull(inner);
this.queryId = queryId;
this.attrCollector = Preconditions.checkNotNull(attrCollector);
}
@Override
public Weight createWeight(
IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
Weight innerWeight = inner.createWeight(searcher, scoreMode, boost);
return new IdentifiableQueryWeight(this, innerWeight, queryId, attrCollector);
}
@Override
public Query rewrite(IndexReader reader) throws IOException {
Query rewritten = inner.rewrite(reader);
if (rewritten != inner) {
return new IdentifiableQuery(rewritten, queryId, attrCollector);
}
return this;
}
@Override
public int hashCode() {
return inner.hashCode() * 13 + (queryId == null ? 0 : queryId.hashCode());
}
@Override
public boolean equals(Object obj) {
if (!(obj instanceof IdentifiableQuery)) {
return false;
}
IdentifiableQuery identifiableQuery = IdentifiableQuery.class.cast(obj);
return inner.equals(identifiableQuery.inner)
&& (queryId == null
? identifiableQuery.queryId == null
: queryId.equals(identifiableQuery.queryId));
}
@Override
public String toString(String field) {
return inner.toString(field);
}
@VisibleForTesting
public Query getQueryForTest() {
return inner;
}
@VisibleForTesting
public FieldRankHitInfo getQueryIdForTest() {
return queryId;
}
}

View File

@ -1,60 +0,0 @@
package com.twitter.search.common.query;
import java.io.IOException;
import com.google.common.base.Preconditions;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Weight;
/**
* Scorer implementation that adds attribute collection support for an underlying query.
* Meant to be used in conjunction with {@link IdentifiableQuery}.
*/
public class IdentifiableQueryScorer extends FilteredScorer {
private final FieldRankHitInfo queryId;
private final HitAttributeCollector attrCollector;
public IdentifiableQueryScorer(Weight weight, Scorer inner, FieldRankHitInfo queryId,
HitAttributeCollector attrCollector) {
super(weight, inner);
this.queryId = queryId;
this.attrCollector = Preconditions.checkNotNull(attrCollector);
}
@Override
public DocIdSetIterator iterator() {
final DocIdSetIterator superDISI = super.iterator();
return new DocIdSetIterator() {
@Override
public int docID() {
return superDISI.docID();
}
@Override
public int nextDoc() throws IOException {
int docid = superDISI.nextDoc();
if (docid != NO_MORE_DOCS) {
attrCollector.collectScorerAttribution(docid, queryId);
}
return docid;
}
@Override
public int advance(int target) throws IOException {
int docid = superDISI.advance(target);
if (docid != NO_MORE_DOCS) {
attrCollector.collectScorerAttribution(docid, queryId);
}
return docid;
}
@Override
public long cost() {
return superDISI.cost();
}
};
}
}

View File

@ -1,58 +0,0 @@
package com.twitter.search.common.query;
import java.io.IOException;
import java.util.Set;
import com.google.common.base.Preconditions;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Weight;
/**
* Weight implementation that adds attribute collection support for an underlying query.
* Meant to be used in conjunction with {@link IdentifiableQuery}.
*/
public class IdentifiableQueryWeight extends Weight {
private final Weight inner;
private final FieldRankHitInfo queryId;
private final HitAttributeCollector attrCollector;
/** Creates a new IdentifiableQueryWeight instance. */
public IdentifiableQueryWeight(IdentifiableQuery query, Weight inner, FieldRankHitInfo queryId,
HitAttributeCollector attrCollector) {
super(query);
this.inner = inner;
this.queryId = queryId;
this.attrCollector = Preconditions.checkNotNull(attrCollector);
}
@Override
public Explanation explain(LeafReaderContext context, int doc)
throws IOException {
return inner.explain(context, doc);
}
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
attrCollector.clearHitAttributions(context, queryId);
Scorer innerScorer = inner.scorer(context);
if (innerScorer != null) {
return new IdentifiableQueryScorer(this, innerScorer, queryId, attrCollector);
} else {
return null;
}
}
@Override
public void extractTerms(Set<Term> terms) {
inner.extractTerms(terms);
}
@Override
public boolean isCacheable(LeafReaderContext ctx) {
return inner.isCacheable(ctx);
}
}

View File

@ -1,34 +0,0 @@
package com.twitter.search.common.query;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Maps;
/**
 * The indices may map the fields declared here to internal fields without exposing their schemas
 * to other services. This can be used, for example, to set boosts for URL-like fields in Earlybird
 * without direct knowledge of the internal Earlybird field name.
*/
public enum MappableField {
REFERRAL,
URL;
static {
ImmutableMap.Builder<MappableField, String> builder = ImmutableMap.builder();
for (MappableField mappableField : MappableField.values()) {
builder.put(mappableField, mappableField.toString().toLowerCase());
}
MAPPABLE_FIELD_TO_NAME_MAP = Maps.immutableEnumMap(builder.build());
}
private static final ImmutableMap<MappableField, String> MAPPABLE_FIELD_TO_NAME_MAP;
/** Returns the name of the given MappableField. */
public static String mappableFieldName(MappableField mappableField) {
return MAPPABLE_FIELD_TO_NAME_MAP.get(mappableField);
}
/** Returns the name of this MappableField. */
public String getName() {
return MAPPABLE_FIELD_TO_NAME_MAP.get(this);
}
}

View File

@ -1,61 +0,0 @@
package com.twitter.search.common.query;
import java.io.IOException;
import java.util.Iterator;
import java.util.Set;
import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
public class MultiTermDisjunctionQuery extends MultiTermQuery {
private final Set<BytesRef> values;
/** Creates a new MultiTermDisjunctionQuery instance. */
public MultiTermDisjunctionQuery(String field, Set<BytesRef> values) {
super(field);
this.values = values;
}
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts)
throws IOException {
final TermsEnum termsEnum = terms.iterator();
final Iterator<BytesRef> it = values.iterator();
return new FilteredTermsEnum(termsEnum) {
@Override protected AcceptStatus accept(BytesRef term) throws IOException {
return AcceptStatus.YES;
}
@Override public BytesRef next() throws IOException {
while (it.hasNext()) {
BytesRef termRef = it.next();
if (termsEnum.seekExact(termRef)) {
return termRef;
}
}
return null;
}
};
}
@Override
public String toString(String field) {
StringBuilder builder = new StringBuilder();
builder.append("MultiTermDisjunctionQuery[");
for (BytesRef termVal : this.values) {
builder.append(termVal);
builder.append(",");
}
builder.setLength(builder.length() - 1);
builder.append("]");
return builder.toString();
}
}
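A small usage sketch with made-up term bytes: the query matches any doc containing at least one of the given raw term values in the field.

import com.google.common.collect.ImmutableSet;

import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;

public final class MultiTermDisjunctionExample {
  public static Query anyOf(String field) {
    // Matches docs containing either raw term value in the given field.
    return new MultiTermDisjunctionQuery(
        field, ImmutableSet.of(new BytesRef("foo"), new BytesRef("bar")));
  }
}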

View File

@ -1,160 +0,0 @@
package com.twitter.search.common.query;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import com.google.common.collect.Sets;
import com.twitter.search.queryparser.query.Conjunction;
import com.twitter.search.queryparser.query.Disjunction;
import com.twitter.search.queryparser.query.Phrase;
import com.twitter.search.queryparser.query.Query;
import com.twitter.search.queryparser.query.QueryParserException;
import com.twitter.search.queryparser.query.SpecialTerm;
import com.twitter.search.queryparser.query.Term;
import com.twitter.search.queryparser.query.search.Link;
import com.twitter.search.queryparser.query.search.SearchOperator;
import com.twitter.search.queryparser.query.search.SearchQueryVisitor;
/**
 * Visitor that tracks the field hits of each query node.
 * Returns the common fields among conjunctions and the union of the fields among disjunctions.
*/
public final class QueryCommonFieldHitsVisitor extends SearchQueryVisitor<Set<String>> {
private static final Logger LOG = Logger.getLogger(QueryCommonFieldHitsVisitor.class.getName());
private Map<Query, Integer> nodeToRankMap;
private Map<Integer, List<String>> hitFieldsByRank;
/**
* Find query term hit intersections based on hitmap given by HitAttributeHelper
*
* @param hitAttributeHelper the HitAttributeHelper
* @param docID documentID
* @param query the query searched
* @return a set of hit fields in String representation
*/
public static Set<String> findIntersection(
HitAttributeHelper hitAttributeHelper,
int docID,
Query query) {
return findIntersection(hitAttributeHelper.getNodeToRankMap(),
hitAttributeHelper.getHitAttribution(docID),
query);
}
/**
* Find query term hit intersections based on hitmap given by HitAttributeHelper
*
* @param nodeToRankMap the map of query node to its integer rank value
* @param hitFieldsByRank map of rank to list of hit fields in String representation
* @param query the query searched
* @return a set of hit fields in String representation
*/
public static Set<String> findIntersection(
Map<Query, Integer> nodeToRankMap,
Map<Integer, List<String>> hitFieldsByRank,
Query query) {
QueryCommonFieldHitsVisitor visitor =
new QueryCommonFieldHitsVisitor(nodeToRankMap, hitFieldsByRank);
try {
Set<String> returnSet = query.accept(visitor);
return returnSet;
} catch (QueryParserException e) {
LOG.log(Level.SEVERE, "Could not find intersection for query [" + query + "]: ", e);
return Collections.emptySet();
}
}
private QueryCommonFieldHitsVisitor(Map<Query, Integer> nodeToRankMap,
Map<Integer, List<String>> hitFieldsByRank) {
this.nodeToRankMap = nodeToRankMap;
this.hitFieldsByRank = hitFieldsByRank;
}
@Override
public Set<String> visit(Disjunction disjunction) throws QueryParserException {
Set<String> fieldHitIntersections = Sets.newHashSet();
for (Query child : disjunction.getChildren()) {
fieldHitIntersections.addAll(child.accept(this));
}
return fieldHitIntersections;
}
@Override
public Set<String> visit(Conjunction conjunction) throws QueryParserException {
List<Query> children = conjunction.getChildren();
if (!children.isEmpty()) {
boolean initializedIntersections = false;
Set<String> fieldHitIntersections = Sets.newHashSet();
for (Query child : children) {
Set<String> hits = child.accept(this);
if (hits.isEmpty()) {
// if it is empty, it means this query node is not of term type
// and we do not include these in the field intersection
// eg. cache filters, proximity groups
continue;
}
if (!initializedIntersections) {
fieldHitIntersections.addAll(hits);
initializedIntersections = true;
} else {
fieldHitIntersections.retainAll(hits);
}
}
return fieldHitIntersections;
}
return Collections.emptySet();
}
@Override
public Set<String> visit(Term term) throws QueryParserException {
Set<String> fieldHitIntersections = Sets.newHashSet();
Integer rank = nodeToRankMap.get(term);
if (rank != null) {
List<String> fields = hitFieldsByRank.get(rank);
// for disjunction cases where a term may not have any hits
if (fields != null) {
fieldHitIntersections.addAll(fields);
}
}
return fieldHitIntersections;
}
@Override
public Set<String> visit(SpecialTerm specialTerm) throws QueryParserException {
    // This way of splitting @mentions ensures consistency with the way the Lucene query is built
    // in expertsearch.
if (specialTerm.getType() == SpecialTerm.Type.MENTION && specialTerm.getValue().contains("_")) {
Phrase phrase = new Phrase(specialTerm.getValue().split("_"));
return phrase.accept(this);
}
return specialTerm.toTermOrPhrase().accept(this);
}
@Override
public Set<String> visit(SearchOperator operator) throws QueryParserException {
return Collections.emptySet();
}
@Override
public Set<String> visit(Link link) throws QueryParserException {
return link.toPhrase().accept(this);
}
@Override
public Set<String> visit(Phrase phrase) throws QueryParserException {
    // All terms in the phrase should return the same hit fields, so just check the first one.
List<String> terms = phrase.getTerms();
if (!terms.isEmpty()) {
Term term = new Term(phrase.getTerms().get(0));
return term.accept(this);
}
return Collections.emptySet();
}
}
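A thin usage sketch: given the HitAttributeHelper used during the search and a matching doc, findIntersection returns the fields hit by every required part of the query.

import java.util.Set;

import com.twitter.search.queryparser.query.Query;

public final class CommonFieldHitsExample {
  public static Set<String> commonFields(
      HitAttributeHelper helper, int docId, Query parsedQuery) {
    // Conjunctions intersect their children's hit fields; disjunctions union them.
    return QueryCommonFieldHitsVisitor.findIntersection(helper, docId, parsedQuery);
  }
}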

View File

@ -1,81 +0,0 @@
package com.twitter.search.common.query;
import java.util.Collections;
import java.util.IdentityHashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import com.twitter.search.common.schema.base.Schema;
import com.twitter.search.queryparser.query.Query;
import com.twitter.search.queryparser.query.QueryParserException;
import com.twitter.search.queryparser.visitors.MultiTermDisjunctionRankVisitor;
import com.twitter.search.queryparser.visitors.NodeRankAnnotator;
import com.twitter.search.queryparser.visitors.QueryTreeIndex;
/**
* A helper class to collect field and query node hit attributions.
*/
public class QueryHitAttributeHelper extends HitAttributeHelper {
private final Query annotatedQuery;
protected QueryHitAttributeHelper(HitAttributeCollector collector,
Function<Integer, String> fieldIdsToFieldNames,
IdentityHashMap<Query, Integer> nodeToRankMap,
Query annotatedQuery,
Map<Query, List<Integer>> expandedRanksMap) {
super(collector, fieldIdsToFieldNames, nodeToRankMap, expandedRanksMap);
this.annotatedQuery = annotatedQuery;
}
/**
* Constructor specific for com.twitter.search.queryParser.query.Query
*
* This helper visits a parsed query to construct a node-to-rank mapping,
* and uses a schema to determine all of the possible fields to be tracked.
* A collector is then created.
*
* @param query the query for which we will collect hit attribution.
* @param schema the indexing schema.
*/
public static QueryHitAttributeHelper from(Query query, final Schema schema)
throws QueryParserException {
IdentityHashMap<Query, Integer> nodeToRankMap;
Query annotatedQuery;
// First see if the query already has node rank annotations on it. If so, we'll just use those
// to identify query nodes.
    // We enforce that all provided ranks are in the range [0, N-1] so as not to blow up the size
    // of the collection array.
QueryRankVisitor rankVisitor = new QueryRankVisitor();
if (query.accept(rankVisitor)) {
nodeToRankMap = rankVisitor.getNodeToRankMap();
annotatedQuery = query;
} else {
// Otherwise, we will assign all nodes in-order ranks, and use those to track per-node hit
// attribution
QueryTreeIndex queryTreeIndex = QueryTreeIndex.buildFor(query);
NodeRankAnnotator annotator = new NodeRankAnnotator(queryTreeIndex.getNodeToIndexMap());
annotatedQuery = query.accept(annotator);
nodeToRankMap = annotator.getUpdatedNodeToRankMap();
}
// Extract ranks for multi_term_disjunction operators
MultiTermDisjunctionRankVisitor multiTermDisjunctionRankVisitor =
new MultiTermDisjunctionRankVisitor(Collections.max(nodeToRankMap.values()));
annotatedQuery.accept(multiTermDisjunctionRankVisitor);
Map<Query, List<Integer>> expandedRanksMap =
multiTermDisjunctionRankVisitor.getMultiTermDisjunctionRankExpansionsMap();
return new QueryHitAttributeHelper(
new HitAttributeCollector(),
(fieldId) -> schema.getFieldName(fieldId),
nodeToRankMap,
annotatedQuery,
expandedRanksMap);
}
public Query getAnnotatedQuery() {
return annotatedQuery;
}
}

View File

@ -1,56 +0,0 @@
package com.twitter.search.common.query;
import java.util.IdentityHashMap;
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import com.twitter.search.queryparser.query.BooleanQuery;
import com.twitter.search.queryparser.query.Query;
import com.twitter.search.queryparser.query.QueryParserException;
import com.twitter.search.queryparser.query.annotation.Annotation;
import com.twitter.search.queryparser.visitors.DetectAnnotationVisitor;
/**
 * A visitor that collects node ranks from the :r annotation in the query.
*/
public class QueryRankVisitor extends DetectAnnotationVisitor {
private final IdentityHashMap<Query, Integer> nodeToRankMap = Maps.newIdentityHashMap();
public QueryRankVisitor() {
super(Annotation.Type.NODE_RANK);
}
@Override
protected boolean visitBooleanQuery(BooleanQuery query) throws QueryParserException {
if (query.hasAnnotationType(Annotation.Type.NODE_RANK)) {
collectNodeRank(query.getAnnotationOf(Annotation.Type.NODE_RANK).get(), query);
}
boolean found = false;
for (Query child : query.getChildren()) {
found |= child.accept(this);
}
return found;
}
@Override
protected boolean visitQuery(Query query) throws QueryParserException {
if (query.hasAnnotationType(Annotation.Type.NODE_RANK)) {
collectNodeRank(query.getAnnotationOf(Annotation.Type.NODE_RANK).get(), query);
return true;
}
return false;
}
private void collectNodeRank(Annotation anno, Query query) {
Preconditions.checkArgument(anno.getType() == Annotation.Type.NODE_RANK);
int rank = (Integer) anno.getValue();
nodeToRankMap.put(query, rank);
}
public IdentityHashMap<Query, Integer> getNodeToRankMap() {
return nodeToRankMap;
}
}
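A small sketch of the visitor in isolation, mirroring the first branch of `QueryHitAttributeHelper.from`; the `query` argument is assumed to be a parsed query tree.

```java
import com.twitter.search.queryparser.query.Query;
import com.twitter.search.queryparser.query.QueryParserException;

public final class RankVisitorSketch {
  /** Returns true iff the parsed query tree carries :r node-rank annotations. */
  static boolean hasNodeRanks(Query query) throws QueryParserException {
    QueryRankVisitor visitor = new QueryRankVisitor();
    boolean annotated = query.accept(visitor);
    if (annotated) {
      // Ranks are keyed by node identity, as collected from the :r annotations.
      System.out.println("node ranks: " + visitor.getNodeToRankMap());
    }
    return annotated;
  }
}
```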

View File

@ -1,51 +0,0 @@
package com.twitter.search.common.query;
import java.io.IOException;
import org.apache.lucene.search.DocIdSetIterator;
public class SingleDocDocIdSetIterator extends DocIdSetIterator {
// the only docid in the list
private final int doc;
private int docid = -1;
public SingleDocDocIdSetIterator(int doc) {
this.doc = doc;
}
@Override
public int docID() {
return docid;
}
@Override
public int nextDoc() throws IOException {
if (docid == -1) {
docid = doc;
} else {
docid = NO_MORE_DOCS;
}
return docid;
}
@Override
public int advance(int target) throws IOException {
if (docid == NO_MORE_DOCS) {
return docid;
} else if (doc < target) {
docid = NO_MORE_DOCS;
return docid;
} else {
docid = doc;
}
return docid;
}
@Override
public long cost() {
return 1;
}
}
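A quick illustration of the iteration contract: the iterator yields its single doc id exactly once, and `advance(target)` returns the doc only while `target <= doc`.

```java
import java.io.IOException;
import org.apache.lucene.search.DocIdSetIterator;

public final class SingleDocIteratorDemo {
  public static void main(String[] args) throws IOException {
    // nextDoc() first returns the single doc id, then exhausts.
    SingleDocDocIdSetIterator it = new SingleDocDocIdSetIterator(42);
    System.out.println(it.nextDoc()); // 42
    System.out.println(it.nextDoc()); // DocIdSetIterator.NO_MORE_DOCS

    // advance(target) returns the doc when target <= doc, else NO_MORE_DOCS.
    SingleDocDocIdSetIterator it2 = new SingleDocDocIdSetIterator(42);
    System.out.println(it2.advance(10)); // 42
    System.out.println(it2.advance(43)); // NO_MORE_DOCS
  }
}
```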

View File

@ -1,32 +0,0 @@
package com.twitter.search.common.query;
import java.util.Collections;
import java.util.List;
import java.util.Map;
/**
* A hit attribute provider based on the static data
*/
public class StaticHitAttributeProvider implements HitAttributeProvider {
private int currentDocId;
private Map<Integer, List<String>> currentHitAttr;
public StaticHitAttributeProvider() {
}
/**
* Sets a fake last doc ID and its hit attribution; this is only used to generate explanations.
*/
public void setCurrentHitAttr(int docId, Map<Integer, List<String>> hitAttr) {
this.currentDocId = docId;
this.currentHitAttr = hitAttr;
}
@Override
public Map<Integer, List<String>> getHitAttribution(int docId) {
if (docId == currentDocId) {
return currentHitAttr;
}
return Collections.emptyMap();
}
}
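A sketch of how the provider might be seeded when generating an explanation; the doc id and field names here are made up.

```java
import java.util.List;
import java.util.Map;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;

public final class StaticHitAttrDemo {
  public static void main(String[] args) {
    StaticHitAttributeProvider provider = new StaticHitAttributeProvider();
    // Hit attribution keyed by query node rank, valued by the fields that matched.
    Map<Integer, List<String>> hitAttr = ImmutableMap.of(
        0, ImmutableList.of("text"),
        1, ImmutableList.of("text", "hashtags"));
    provider.setCurrentHitAttr(123, hitAttr);
    System.out.println(provider.getHitAttribution(123)); // the map above
    System.out.println(provider.getHitAttribution(456)); // empty map
  }
}
```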

View File

@ -1,257 +0,0 @@
java_library(
name = "utils",
sources = ["utils/*.java"],
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
"3rdparty/jvm/com/google/guava",
"3rdparty/jvm/com/google/inject:guice",
"3rdparty/jvm/com/twitter/elephantbird:core",
"3rdparty/jvm/org/apache/hadoop:hadoop-client-default",
"3rdparty/jvm/org/apache/thrift:libthrift",
"src/java/com/twitter/common/base",
"src/java/com/twitter/common/text/language:locale-util",
"src/java/com/twitter/search/common/partitioning/snowflakeparser",
"src/java/com/twitter/search/common/relevance/features",
"src/java/com/twitter/search/common/schema/earlybird",
"src/java/com/twitter/search/common/tweetypie",
"src/thrift/com/twitter/search:earlybird-java",
"src/thrift/com/twitter/search/common:schema-java",
],
)
java_library(
name = "ranking",
sources = ["ranking/**/*.java"],
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
":utils",
"3rdparty/jvm/com/google/guava",
"3rdparty/jvm/com/google/inject:guice",
"3rdparty/jvm/com/twitter/elephantbird:core",
"3rdparty/jvm/org/apache/hadoop:hadoop-client-default",
"3rdparty/jvm/org/apache/thrift:libthrift",
"3rdparty/jvm/org/apache/zookeeper:zookeeper-client",
"3rdparty/jvm/org/slf4j:slf4j-api",
"src/java/com/twitter/common/base",
"src/java/com/twitter/search/common/logging",
"src/java/com/twitter/search/common/metrics",
"src/java/com/twitter/search/common/relevance/features",
"src/thrift/com/twitter/search:earlybird-java",
],
)
TRENDS_DATA_SERVICE_SOURCES = [
"TrendsThriftDataServiceManager.java",
"NGramCache.java",
]
java_library(
name = "trends-data-service",
sources = TRENDS_DATA_SERVICE_SOURCES,
platform = "java8",
provides = artifact(
org = "com.twitter.search.common.relevance",
name = "trends-data-service",
repo = artifactory,
),
tags = ["bazel-compatible"],
dependencies = [
"3rdparty/jvm/com/google/guava",
"3rdparty/jvm/com/google/inject:guice",
"3rdparty/jvm/io/netty:netty4-tcnative-boringssl-static",
"3rdparty/jvm/org/apache/thrift:libthrift",
"3rdparty/jvm/org/apache/zookeeper:zookeeper-client",
"finagle-internal/mtls/src/main/scala/com/twitter/finagle/mtls/authentication",
"finagle-internal/mtls/src/main/scala/com/twitter/finagle/mtls/client",
"finagle/finagle-core/src/main",
"finagle/finagle-thrift/src/main/java",
"finagle/finagle-thriftmux/src/main/scala",
"src/java/com/twitter/common/base",
"src/java/com/twitter/common_internal/text/version",
"src/java/com/twitter/penguin/search/filter",
"src/java/com/twitter/search/common/metrics",
"src/thrift/com/twitter/trends/plus:trends-plus-java",
"src/thrift/com/twitter/trends/service/gen:trends_service-java",
"src/thrift/com/twitter/trends/trending_content:trending-content-service-java",
"trends/trends_metadata/thrift/src/main/thrift/com/twitter/trends/trends_metadata:thrift-java",
"twitter-server-internal/src/main/scala",
"util/util-core:scala",
"util/util-stats/src/main/scala",
],
)
java_library(
name = "feature-update-reader",
sources = ["readers/*.java"],
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
"3rdparty/jvm/org/apache/bookkeeper:bookkeeper-server",
"3rdparty/jvm/org/apache/bookkeeper:bookkeeper-twitter-science-provider",
"3rdparty/jvm/org/apache/hadoop:hadoop-client-default",
"3rdparty/jvm/org/apache/thrift:libthrift",
"3rdparty/jvm/org/apache/zookeeper:zookeeper-client",
"3rdparty/jvm/org/slf4j:slf4j-api",
"src/java/com/twitter/common/base",
"src/java/com/twitter/common/util:system-mocks",
"src/java/com/twitter/search/common/metrics",
"src/java/com/twitter/search/common/util/io:record-reader-api",
"src/java/com/twitter/search/common/util/thrift:text-protocol",
"src/thrift/com/twitter/search/common:schema-java",
],
)
target(
dependencies = [
":feature-update-reader",
":trends-data-service",
"src/java/com/twitter/search/common/relevance/features",
],
)
java_library(
name = "config",
sources = ["config/**/*.java"],
platform = "java8",
provides = artifact(
org = "com.twitter.search.common.relevance",
name = "config",
repo = artifactory,
),
tags = ["bazel-compatible"],
dependencies = [
"3rdparty/jvm/org/apache/zookeeper:zookeeper-client",
"src/java/com/twitter/search/common/config",
"src/resources/com/twitter/search/common/relevance/config",
],
)
java_library(
name = "classifiers",
sources = ["classifiers/**/*.java"],
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
":config",
":entities_and_filters",
":trends-data-service",
"3rdparty/jvm/com/google/guava",
"3rdparty/jvm/com/google/inject:guice",
"3rdparty/jvm/com/twitter/elephantbird:core",
"3rdparty/jvm/commons-lang",
"3rdparty/jvm/geo/google:geoGoogle",
"3rdparty/jvm/org/apache/hadoop:hadoop-client-default",
"3rdparty/jvm/org/apache/thrift:libthrift",
"3rdparty/jvm/org/apache/zookeeper:zookeeper-client",
"finagle-internal/mtls/src/main/scala/com/twitter/finagle/mtls/authentication",
"src/java/com/twitter/common/base",
"src/java/com/twitter/common/collections",
"src/java/com/twitter/common/text/language:locale-util",
"src/java/com/twitter/common/text/token",
"src/java/com/twitter/common/text/transformer",
"src/java/com/twitter/common_internal/text:text-penguin7",
"src/java/com/twitter/common_internal/text/version",
"src/java/com/twitter/search/common/config",
"src/java/com/twitter/search/common/metrics",
"src/java/com/twitter/search/common/relevance/features",
"src/java/com/twitter/search/common/util/io/periodic",
"src/java/com/twitter/search/common/util/text",
"twitter-text/lib/java/src/main/java/com/twitter/twittertext",
],
)
java_library(
name = "text",
sources = ["text/**/*.java"],
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
":entities_and_filters",
"3rdparty/jvm/com/google/guava",
"src/java/com/twitter/common/text/token",
"src/java/com/twitter/common/text/util:char-seq-util",
"src/java/com/twitter/common_internal/text:text-penguin7",
"src/java/com/twitter/common_internal/text/version",
"src/java/com/twitter/search/common/relevance/features",
"src/java/com/twitter/search/common/util/text",
"src/java/com/twitter/search/common/util/text/regex",
"src/thrift/com/twitter/search/common:indexing-java",
],
)
java_library(
name = "scorers",
sources = ["scorers/**/*.java"],
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
":classifiers",
":config",
":entities_and_filters",
"3rdparty/jvm/com/google/guava",
"3rdparty/jvm/com/google/inject:guice",
"3rdparty/jvm/com/twitter/elephantbird:core",
"3rdparty/jvm/org/apache/hadoop:hadoop-client-default",
"3rdparty/jvm/org/apache/zookeeper:zookeeper-client",
"src/java/com/twitter/common/base",
"src/java/com/twitter/common_internal/text/version",
"src/java/com/twitter/search/common/encoding/features",
"src/java/com/twitter/search/common/metrics",
"src/java/com/twitter/search/common/relevance/features",
"src/java/com/twitter/search/common/schema/base",
"src/java/com/twitter/search/common/schema/earlybird",
],
)
java_library(
name = "entities_and_filters",
sources = [
"entities/**/*.java",
"filters/**/*.java",
],
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
"3rdparty/jvm/com/google/code/findbugs:jsr305",
"3rdparty/jvm/com/google/guava",
"3rdparty/jvm/com/google/inject:guice",
"3rdparty/jvm/commons-lang",
"3rdparty/jvm/geo/google:geoGoogle",
"3rdparty/jvm/org/apache/commons:commons-lang3",
"3rdparty/jvm/org/apache/lucene:lucene-core",
"3rdparty/jvm/org/apache/thrift:libthrift",
"3rdparty/jvm/org/slf4j:slf4j-api",
"cuad/projects/ner/thrift/src/main/thrift:thrift-java",
"decider/src/main/scala",
"src/java/com/twitter/common/text/extractor",
"src/java/com/twitter/common/text/language:locale-util",
"src/java/com/twitter/common/text/pipeline",
"src/java/com/twitter/common/text/token",
"src/java/com/twitter/common/text/transformer",
"src/java/com/twitter/common_internal/text:text-penguin7",
"src/java/com/twitter/common_internal/text/version",
"src/java/com/twitter/search/common/decider",
"src/java/com/twitter/search/common/encoding/features",
"src/java/com/twitter/search/common/metrics",
"src/java/com/twitter/search/common/partitioning/snowflakeparser",
"src/java/com/twitter/search/common/relevance/features",
"src/java/com/twitter/search/common/schema/earlybird",
"src/java/com/twitter/search/common/util/text",
"src/thrift/com/twitter/search/common:indexing-java",
"src/thrift/com/twitter/service/spiderduck/gen:metadata-store-java",
"src/thrift/com/twitter/tweetypie:tweet-java",
"util/util-core:scala",
],
)
java_library(
name = "scores",
sources = ["scores/**/*.java"],
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
"3rdparty/jvm/com/google/guava",
],
)

View File

@ -1,152 +0,0 @@
package com.twitter.search.common.relevance;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.cache.CacheBuilder;
import com.google.common.collect.ImmutableList;
import com.twitter.common_internal.text.version.PenguinVersion;
import com.twitter.penguin.search.filter.StringMatchFilter;
import com.twitter.util.Duration;
/**
* A cache of trending terms.
*/
public class NGramCache {
private static final int DEFAULT_MAX_CACHE_SIZE = 5000;
private static final long DEFAULT_CACHE_ITEM_TTL_SEC = 24 * 3600; // 1 day
private final PenguinVersion penguinVersion;
// Keys are trends. Values are empty strings.
private final Map<String, String> trendsCache;
private volatile StringMatchFilter trendsMatcher = null;
/**
* Extract Trends from a list of normalized tokens
*/
public List<String> extractTrendsFromNormalized(List<String> tokens) {
if (trendsMatcher == null) {
return Collections.emptyList();
}
ImmutableList.Builder<String> trends = ImmutableList.builder();
for (String trend : trendsMatcher.extractNormalized(tokens)) {
if (trendsCache.containsKey(trend)) {
trends.add(trend);
}
}
return trends.build();
}
/**
* Extract Trends from a list of tokens
*/
public List<String> extractTrendsFrom(List<String> tokens, Locale language) {
if (trendsMatcher == null) {
return Collections.emptyList();
}
return trendsMatcher.extract(language, tokens);
}
/**
* Extract Trends from a given CharSequence
*/
public List<String> extractTrendsFrom(CharSequence text, Locale language) {
if (trendsMatcher == null) {
return Collections.emptyList();
}
ImmutableList.Builder<String> trends = ImmutableList.builder();
for (String trend : trendsMatcher.extract(language, text)) {
if (trendsCache.containsKey(trend)) {
trends.add(trend);
}
}
return trends.build();
}
public long numTrendingTerms() {
return trendsCache.size();
}
public Set<String> getTrends() {
return trendsCache.keySet();
}
public void clear() {
trendsCache.clear();
trendsMatcher = null;
}
/** Adds all trends to this NGramCache. */
public void addAll(Iterable<String> trends) {
for (String trend : trends) {
trendsCache.put(trend, "");
}
trendsMatcher = new StringMatchFilter(trendsCache.keySet(), penguinVersion);
}
public static Builder builder() {
return new Builder();
}
public static class Builder {
private int maxCacheSize = DEFAULT_MAX_CACHE_SIZE;
private long cacheItemTTLSecs = DEFAULT_CACHE_ITEM_TTL_SEC; // 1 day
private PenguinVersion penguinVersion = PenguinVersion.PENGUIN_4;
public Builder maxCacheSize(int cacheSize) {
this.maxCacheSize = cacheSize;
return this;
}
public Builder cacheItemTTL(long cacheItemTTL) {
this.cacheItemTTLSecs = cacheItemTTL;
return this;
}
public Builder penguinVersion(PenguinVersion newPenguinVersion) {
this.penguinVersion = Preconditions.checkNotNull(newPenguinVersion);
return this;
}
/** Builds an NGramCache instance. */
public NGramCache build() {
return new NGramCache(
maxCacheSize,
Duration.apply(cacheItemTTLSecs, TimeUnit.SECONDS),
penguinVersion);
}
}
// Should be used only in tests that want to mock out this class.
@VisibleForTesting
public NGramCache() {
this(DEFAULT_MAX_CACHE_SIZE,
Duration.apply(DEFAULT_CACHE_ITEM_TTL_SEC, TimeUnit.SECONDS),
PenguinVersion.PENGUIN_4);
}
private NGramCache(int maxCacheSize, Duration cacheItemTTL, PenguinVersion penguinVersion) {
// we only have 1 refresher thread that writes to the cache
this.trendsCache = CacheBuilder.newBuilder()
.concurrencyLevel(1)
.expireAfterWrite(cacheItemTTL.inSeconds(), TimeUnit.SECONDS)
.maximumSize(maxCacheSize)
.<String, String>build()
.asMap();
this.penguinVersion = penguinVersion;
}
}
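A usage sketch with made-up trend strings. Exact matching behavior depends on `StringMatchFilter`, so single-token trends are used here to keep the example safe.

```java
import java.util.List;
import com.google.common.collect.ImmutableList;
import com.twitter.common_internal.text.version.PenguinVersion;

public final class NGramCacheDemo {
  public static void main(String[] args) {
    NGramCache cache = NGramCache.builder()
        .maxCacheSize(5000)
        .penguinVersion(PenguinVersion.PENGUIN_6)
        .build();
    // addAll() also (re)builds the internal StringMatchFilter.
    cache.addAll(ImmutableList.of("earthquake", "election"));
    // Tokens are assumed to already be normalized for this Penguin version.
    List<String> hits =
        cache.extractTrendsFromNormalized(ImmutableList.of("election", "night"));
    System.out.println(hits); // expected: [election]
  }
}
```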

View File

@ -1,353 +0,0 @@
package com.twitter.search.common.relevance;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Collectors;
import scala.runtime.BoxedUnit;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Sets;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.finagle.Service;
import com.twitter.finagle.ThriftMux;
import com.twitter.finagle.builder.ClientBuilder;
import com.twitter.finagle.builder.ClientConfig;
import com.twitter.finagle.mtls.authentication.ServiceIdentifier;
import com.twitter.finagle.mtls.client.MtlsClientBuilder;
import com.twitter.finagle.stats.DefaultStatsReceiver;
import com.twitter.finagle.thrift.ThriftClientRequest;
import com.twitter.search.common.metrics.RelevanceStats;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.trends.plus.Module;
import com.twitter.trends.plus.TrendsPlusRequest;
import com.twitter.trends.plus.TrendsPlusResponse;
import com.twitter.trends.service.gen.Location;
import com.twitter.trends.trending_content.thriftjava.TrendingContentService;
import com.twitter.trends.trends_metadata.thriftjava.TrendsMetadataService;
import com.twitter.util.Duration;
import com.twitter.util.Future;
import com.twitter.util.Try;
/**
* Manages trends data retrieved from trends thrift API and perform automatic refresh.
*/
public final class TrendsThriftDataServiceManager {
private static final Logger LOG =
LoggerFactory.getLogger(TrendsThriftDataServiceManager.class.getName());
private static final int DEFAULT_TIME_TO_KILL_SEC = 60;
@VisibleForTesting
protected static final Map<String, String> DEFAULT_TRENDS_PARAMS_MAP = ImmutableMap.of(
"MAX_ITEMS_TO_RETURN", "10"); // we only take top 10 for each woeid.
@VisibleForTesting
protected static final int MAX_TRENDS_PER_WOEID = 10;
private final Duration requestTimeout;
private final Duration refreshDelayDuration;
private final Duration reloadIntervalDuration;
private final int numRetries;
// a list of trends cache we want to update
private final List<NGramCache> trendsCacheList;
private final SearchCounter getAvailableSuccessCounter =
RelevanceStats.exportLong("trends_extractor_get_available_success");
private final SearchCounter getAvailableFailureCounter =
RelevanceStats.exportLong("trends_extractor_get_available_failure");
private final SearchCounter getTrendsSuccessCounter =
RelevanceStats.exportLong("trends_extractor_success_fetch");
private final SearchCounter getTrendsFailureCounter =
RelevanceStats.exportLong("trends_extractor_failed_fetch");
private final SearchCounter updateFailureCounter =
RelevanceStats.exportLong("trends_extractor_failed_update");
private final ServiceIdentifier serviceIdentifier;
private ScheduledExecutorService scheduler;
@VisibleForTesting
protected Service<ThriftClientRequest, byte[]> contentService;
protected TrendingContentService.ServiceToClient contentClient;
protected Service<ThriftClientRequest, byte[]> metadataService;
protected TrendsMetadataService.ServiceToClient metadataClient;
@VisibleForTesting
protected TrendsUpdater trendsUpdater;
/**
* Returns an instance of TrendsThriftDataServiceManager.
* @param serviceIdentifier The service that wants to call
* into Trend's services.
* @param numRetries The number of retries in the event of
* request failures.
* @param requestTimeout The amount of time we wait before we consider
*                       a request as failed.
* @param initTrendsCacheDelay How long to wait before the initial
* filling of the Trends cache in milliseconds.
* @param reloadInterval How often to refresh the cache with updated trends.
* @param trendsCacheList The cache of trends.
* @return An instance of TrendsThriftDataServiceManager configured
* with respect to the params provided.
*/
public static TrendsThriftDataServiceManager newInstance(
ServiceIdentifier serviceIdentifier,
int numRetries,
Duration requestTimeout,
Duration initTrendsCacheDelay,
Duration reloadInterval,
List<NGramCache> trendsCacheList) {
return new TrendsThriftDataServiceManager(
serviceIdentifier,
numRetries,
requestTimeout,
initTrendsCacheDelay,
reloadInterval,
trendsCacheList);
}
/**
* Starts auto refresh. Can be invoked after a stopAutoRefresh call to resume
* auto refreshing. Invoking it after shutDown is undefined.
*/
public synchronized void startAutoRefresh() {
if (scheduler == null) {
scheduler = Executors.newSingleThreadScheduledExecutor(
new ThreadFactoryBuilder().setDaemon(true).setNameFormat(
"trends-data-refresher[%d]").build());
scheduler.scheduleAtFixedRate(
trendsUpdater,
refreshDelayDuration.inSeconds(),
reloadIntervalDuration.inSeconds(),
TimeUnit.SECONDS);
}
}
/**
* Stop auto refresh. Wait for the current execution thread to finish.
* This is a blocking call.
*/
public synchronized void stopAutoRefresh() {
if (scheduler != null) {
scheduler.shutdown(); // Disable new tasks from being submitted
try {
// Wait a while for existing tasks to terminate
if (!scheduler.awaitTermination(DEFAULT_TIME_TO_KILL_SEC, TimeUnit.SECONDS)) {
scheduler.shutdownNow(); // Cancel currently executing tasks
// Wait a while for tasks to respond to being cancelled
if (!scheduler.awaitTermination(DEFAULT_TIME_TO_KILL_SEC, TimeUnit.SECONDS)) {
LOG.info("Executor thread pool did not terminate.");
}
}
} catch (InterruptedException ie) {
// (Re-)Cancel if current thread also interrupted
scheduler.shutdownNow();
// Preserve interrupt status
Thread.currentThread().interrupt();
}
scheduler = null;
}
}
/** Shuts down the manager. */
public void shutDown() {
stopAutoRefresh();
// clear the cache
for (NGramCache cache : trendsCacheList) {
cache.clear();
}
if (contentService != null) {
contentService.close();
}
if (metadataService != null) {
metadataService.close();
}
}
private TrendsThriftDataServiceManager(
ServiceIdentifier serviceIdentifier,
int numRetries,
Duration requestTimeout,
Duration refreshDelayDuration,
Duration reloadIntervalDuration,
List<NGramCache> trendsCacheList) {
this.numRetries = numRetries;
this.requestTimeout = requestTimeout;
this.refreshDelayDuration = refreshDelayDuration;
this.reloadIntervalDuration = reloadIntervalDuration;
this.serviceIdentifier = serviceIdentifier;
this.trendsCacheList = Preconditions.checkNotNull(trendsCacheList);
trendsUpdater = new TrendsUpdater();
metadataService = buildMetadataService();
metadataClient = buildMetadataClient(metadataService);
contentService = buildContentService();
contentClient = buildContentClient(contentService);
}
@VisibleForTesting
protected Service<ThriftClientRequest, byte[]> buildContentService() {
ClientBuilder<
ThriftClientRequest,
byte[], ClientConfig.Yes,
ClientConfig.Yes,
ClientConfig.Yes
>
builder = ClientBuilder.get()
.stack(ThriftMux.client())
.name("trends_thrift_data_service_manager_content")
.dest("")
.retries(numRetries)
.reportTo(DefaultStatsReceiver.get())
.tcpConnectTimeout(requestTimeout)
.requestTimeout(requestTimeout);
ClientBuilder mtlsBuilder =
new MtlsClientBuilder.MtlsClientBuilderSyntax<>(builder).mutualTls(serviceIdentifier);
return ClientBuilder.safeBuild(mtlsBuilder);
}
@VisibleForTesting
protected TrendingContentService.ServiceToClient buildContentClient(
Service<ThriftClientRequest, byte[]> service) {
return new TrendingContentService.ServiceToClient(service);
}
@VisibleForTesting
protected Service<ThriftClientRequest, byte[]> buildMetadataService() {
ClientBuilder<
ThriftClientRequest,
byte[],
ClientConfig.Yes,
ClientConfig.Yes,
ClientConfig.Yes
>
builder = ClientBuilder.get()
.stack(ThriftMux.client())
.name("trends_thrift_data_service_manager_metadata")
.dest("")
.retries(numRetries)
.reportTo(DefaultStatsReceiver.get())
.tcpConnectTimeout(requestTimeout)
.requestTimeout(requestTimeout);
ClientBuilder mtlsBuilder =
new MtlsClientBuilder.MtlsClientBuilderSyntax<>(builder).mutualTls(serviceIdentifier);
return ClientBuilder.safeBuild(mtlsBuilder);
}
@VisibleForTesting
protected TrendsMetadataService.ServiceToClient buildMetadataClient(
Service<ThriftClientRequest, byte[]> service) {
return new TrendsMetadataService.ServiceToClient(service);
}
/**
* Updater that fetches available woeids and corresponding trending terms.
*/
@VisibleForTesting
protected class TrendsUpdater implements Runnable {
@Override
public void run() {
populateCacheFromTrendsService();
}
private Future<BoxedUnit> populateCacheFromTrendsService() {
long startTime = System.currentTimeMillis();
AtomicLong numTrendsReceived = new AtomicLong(0);
return metadataClient.getAvailable().flatMap(locations -> {
if (locations == null) {
getAvailableFailureCounter.increment();
LOG.warn("Failed to get woeids from trends.");
return Future.value(BoxedUnit.UNIT);
}
getAvailableSuccessCounter.increment();
return populateCacheFromTrendLocations(locations, numTrendsReceived);
}).onFailure(throwable -> {
LOG.info("Update failed", throwable);
updateFailureCounter.increment();
return BoxedUnit.UNIT;
}).ensure(() -> {
logRefreshStatus(startTime, numTrendsReceived);
return BoxedUnit.UNIT;
});
}
private Future<BoxedUnit> populateCacheFromTrendLocations(
List<Location> locations,
AtomicLong numTrendsReceived) {
List<Future<TrendsPlusResponse>> trendsPlusFutures = locations.stream()
.map(location -> makeTrendsPlusRequest(location))
.collect(Collectors.toList());
Future<List<Try<TrendsPlusResponse>>> trendsPlusFuture =
Future.collectToTry(trendsPlusFutures);
return trendsPlusFuture.map(tryResponses -> {
populateCacheFromResponses(tryResponses, numTrendsReceived);
return BoxedUnit.UNIT;
});
}
private Future<TrendsPlusResponse> makeTrendsPlusRequest(Location location) {
TrendsPlusRequest request = new TrendsPlusRequest()
.setWoeid(location.getWoeid())
.setMaxTrends(MAX_TRENDS_PER_WOEID);
return contentClient.getTrendsPlus(request)
.onSuccess(response -> {
getTrendsSuccessCounter.increment();
return BoxedUnit.UNIT;
}).onFailure(throwable -> {
getTrendsFailureCounter.increment();
return BoxedUnit.UNIT;
});
}
private void populateCacheFromResponses(
List<Try<TrendsPlusResponse>> tryResponses,
AtomicLong numTrendsReceived) {
Set<String> trendStrings = Sets.newHashSet();
for (Try<TrendsPlusResponse> tryResponse : tryResponses) {
if (tryResponse.isThrow()) {
LOG.warn("Failed to fetch trends: " + tryResponse);
continue;
}
TrendsPlusResponse trendsPlusResponse = tryResponse.get();
numTrendsReceived.addAndGet(trendsPlusResponse.modules.size());
for (Module module : trendsPlusResponse.modules) {
trendStrings.add(module.getTrend().name);
}
}
for (NGramCache cache : trendsCacheList) {
cache.addAll(trendStrings);
}
}
}
private void logRefreshStatus(long startTime, AtomicLong numTrendsReceived) {
LOG.info(String.format("Refresh done in [%dms] :\nfetchSuccess[%d] fetchFailure[%d] "
+ "updateFailure[%d] num trends received [%d]",
System.currentTimeMillis() - startTime,
getTrendsSuccessCounter.get(),
getTrendsFailureCounter.get(),
updateFailureCounter.get(),
numTrendsReceived.get()));
}
}
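A lifecycle sketch; the `ServiceIdentifier` is assumed to come from the surrounding service, and the timeouts mirror the defaults used by `TweetTrendsExtractor` further below.

```java
import java.util.concurrent.TimeUnit;
import com.google.common.collect.ImmutableList;
import com.twitter.finagle.mtls.authentication.ServiceIdentifier;
import com.twitter.util.Duration;

public final class TrendsManagerSketch {
  static TrendsThriftDataServiceManager start(ServiceIdentifier id, NGramCache cache) {
    TrendsThriftDataServiceManager manager = TrendsThriftDataServiceManager.newInstance(
        id,
        2,                                           // numRetries
        Duration.apply(200, TimeUnit.MILLISECONDS),  // requestTimeout
        Duration.apply(0, TimeUnit.SECONDS),         // initTrendsCacheDelay
        Duration.apply(600, TimeUnit.SECONDS),       // reloadInterval
        ImmutableList.of(cache));
    manager.startAutoRefresh(); // begins periodic cache refresh
    return manager;
  }
}
```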

View File

@ -1,118 +0,0 @@
package com.twitter.search.common.relevance.classifiers;
import com.google.common.base.Preconditions;
import com.twitter.search.common.relevance.entities.TwitterMessage;
/**
* Interface to perform feature classification for a single
* @TwitterMessage object, or a group of them.
*
* Classification includes two steps: feature extraction, and
* quality evaluation. During feature extraction, any interesting
* feature that is deemed useful for subsequent quality analysis
* is extracted from the @TwitterMessage object. Quality evaluation
* is then done by a group of @TweetEvaluator objects associated
* with the classifier, by using the various features extracted in the
* previous step.
*
* Feature extraction and quality evaluation results are stored in
* @TweetFeatures field of the @TwitterMessage object, which is defined
* in src/main/thrift/classifier.thrift.
*/
public abstract class TweetClassifier {
/**
* A list of TweetQualityEvaluators which are invoked after
* feature extraction is done. If null, no quality evaluation
* is done.
*/
protected Iterable<TweetEvaluator> qualityEvaluators = null;
/**
* Passed in TwitterMessage is examined and any extractable
* features are saved in TweetFeatures field of TwitterMessage.
* Then TweetQualityEvaluators are applied to compute various
* quality values.
*
* @param tweet TwitterMessage to perform classification on.
*/
public void classifyTweet(final TwitterMessage tweet) {
Preconditions.checkNotNull(tweet);
// extract features
extractFeatures(tweet);
// compute quality
evaluate(tweet);
}
/**
* Classify a group of TwitterMessages and store features in their corresponding
* TweetFeatures fields.
*
* This default implementation just iterates through the tweets and classifies each
* individual tweet. Batching for better performance, if applicable, can be implemented by
* concrete subclasses.
*
* @param tweets TwitterMessages to perform classification on.
*/
public void classifyTweets(final Iterable<TwitterMessage> tweets) {
extractFeatures(tweets);
evaluate(tweets);
}
/**
* Use the specified list of TweetQualityEvaluators for this classifier.
*
* @param qualityEvaluators list of TweetQualityEvaluators to be used with this classifier.
*/
protected void setQualityEvaluators(final Iterable<TweetEvaluator> qualityEvaluators) {
Preconditions.checkNotNull(qualityEvaluators);
this.qualityEvaluators = qualityEvaluators;
}
/**
* Extract interesting features from a single TwitterMessage for classification.
*
* @param tweet TwitterMessage to extract interesting features for
*/
protected abstract void extractFeatures(final TwitterMessage tweet);
/**
* Extract interesting features from a list of TwitterMessages for classification.
* @param tweets list of TwitterMessages to extract interesting features for
*/
protected void extractFeatures(final Iterable<TwitterMessage> tweets) {
for (TwitterMessage tweet: tweets) {
extractFeatures(tweet);
}
}
/**
* Given a TwitterMessage which already has its features extracted,
* perform quality evaluation.
*
* @param tweet TwitterMessage to perform quality evaluation for
*/
protected void evaluate(final TwitterMessage tweet) {
if (qualityEvaluators == null) {
return;
}
for (TweetEvaluator evaluator : qualityEvaluators) {
evaluator.evaluate(tweet);
}
}
/**
* Given a list of TwitterMessages which already have their features extracted,
* perform quality evaluation.
*
* @param tweets list of TwitterMessages to perform quality evaluation for
*/
protected void evaluate(final Iterable<TwitterMessage> tweets) {
for (TwitterMessage tweet: tweets) {
evaluate(tweet);
}
}
}
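To make the extraction/evaluation split concrete, here is a minimal hypothetical subclass whose extraction step is a no-op and which delegates evaluation to a single evaluator; calling `classifyTweet(tweet)` on it runs the empty extraction step and then the evaluator.

```java
import com.google.common.collect.ImmutableList;
import com.twitter.search.common.relevance.entities.TwitterMessage;

public class NoOpTweetClassifier extends TweetClassifier {
  public NoOpTweetClassifier(TweetEvaluator evaluator) {
    setQualityEvaluators(ImmutableList.of(evaluator));
  }

  @Override
  protected void extractFeatures(TwitterMessage tweet) {
    // Nothing to extract in this sketch; a real classifier would populate
    // the tweet's TweetFeatures here before the evaluators run.
  }
}
```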

View File

@ -1,37 +0,0 @@
package com.twitter.search.common.relevance.classifiers;
import com.google.common.base.Preconditions;
import com.twitter.search.common.relevance.entities.TwitterMessage;
/**
* Interface to perform quality evaluation for a single @TwitterMessage
* object or a group of them.
*
*/
public abstract class TweetEvaluator {
/**
* The passed-in TwitterMessage is examined and the quality evaluation
* results are stored in its TweetFeatures field.
*
* @param tweet TwitterMessage to perform quality evaluation on.
*/
public abstract void evaluate(final TwitterMessage tweet);
/**
* Evaluates a group of TwitterMessages and stores the results in their corresponding
* TweetFeatures fields.
*
* This default implementation just iterates through the tweets and evaluates each
* individual tweet. Batching for better performance, if applicable, can be implemented by
* concrete subclasses.
*
* @param tweets TwitterMessages to perform quality evaluation on.
*/
public void evaluate(final Iterable<TwitterMessage> tweets) {
Preconditions.checkNotNull(tweets);
for (TwitterMessage tweet: tweets) {
evaluate(tweet);
}
}
}

View File

@ -1,260 +0,0 @@
package com.twitter.search.common.relevance.classifiers;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.atomic.AtomicReference;
import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import com.google.common.io.ByteSource;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.common.text.language.LocaleUtil;
import com.twitter.common.text.token.TokenizedCharSequence;
import com.twitter.common.text.token.attribute.TokenType;
import com.twitter.common.util.Clock;
import com.twitter.common_internal.text.pipeline.TwitterNgramGenerator;
import com.twitter.common_internal.text.topic.BlacklistedTopics;
import com.twitter.common_internal.text.topic.BlacklistedTopics.FilterMode;
import com.twitter.common_internal.text.version.PenguinVersion;
import com.twitter.search.common.metrics.RelevanceStats;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.metrics.SearchRateCounter;
import com.twitter.search.common.relevance.entities.TwitterMessage;
import com.twitter.search.common.relevance.features.TweetTextFeatures;
import com.twitter.search.common.relevance.features.TweetTextQuality;
import com.twitter.search.common.util.io.periodic.PeriodicFileLoader;
import com.twitter.search.common.util.text.NormalizerHelper;
import com.twitter.search.common.util.text.TokenizerHelper;
/**
* Determines if tweet text or username contains potentially offensive language.
*/
public class TweetOffensiveEvaluator extends TweetEvaluator {
private static final Logger LOG = LoggerFactory.getLogger(TweetOffensiveEvaluator.class);
private static final int MAX_OFFENSIVE_TERMS = 2;
private final File filterDirectory;
private static final File DEFAULT_FILTER_DIR = new File("");
private static final String ADULT_TOKEN_FILE_NAME = "adult_tokens.txt";
private static final String OFFENSIVE_TOPIC_FILE_NAME = "offensive_topics.txt";
private static final String OFFENSIVE_SUBSTRING_FILE_NAME = "offensive_substrings.txt";
private static final ThreadLocal<TwitterNgramGenerator> NGRAM_GENERATOR_HOLDER =
new ThreadLocal<TwitterNgramGenerator>() {
@Override
protected TwitterNgramGenerator initialValue() {
// It'll generate ngrams from TokenizedCharSequence, which contains tokenization results,
// so it doesn't matter which Penguin version to use here.
return new TwitterNgramGenerator.Builder(PenguinVersion.PENGUIN_6)
.setSize(1, MAX_OFFENSIVE_TERMS)
.build();
}
};
private final AtomicReference<BlacklistedTopics> offensiveTopics =
new AtomicReference<>();
private final AtomicReference<BlacklistedTopics> offensiveUsersTopics =
new AtomicReference<>();
private final AtomicReference<ByteSource> adultTokenFileContents = new AtomicReference<>();
private final AtomicReference<ByteSource> offensiveTokenFileContents = new AtomicReference<>();
private final AtomicReference<ByteSource> offensiveSubstringFileContents = new
AtomicReference<>();
private final SearchCounter sensitiveTextCounter =
RelevanceStats.exportLong("num_sensitive_text");
public TweetOffensiveEvaluator() {
this(DEFAULT_FILTER_DIR);
}
public TweetOffensiveEvaluator(
File filterDirectory
) {
this.filterDirectory = filterDirectory;
adultTokenFileContents.set(BlacklistedTopics.getResource(
BlacklistedTopics.DATA_PREFIX + ADULT_TOKEN_FILE_NAME));
offensiveTokenFileContents.set(BlacklistedTopics.getResource(
BlacklistedTopics.DATA_PREFIX + OFFENSIVE_TOPIC_FILE_NAME));
offensiveSubstringFileContents.set(BlacklistedTopics.getResource(
BlacklistedTopics.DATA_PREFIX + OFFENSIVE_SUBSTRING_FILE_NAME));
try {
rebuildBlacklistedTopics();
} catch (IOException e) {
throw new RuntimeException(e);
}
ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor(
new ThreadFactoryBuilder()
.setNameFormat("offensive-evaluator-blacklist-reloader")
.setDaemon(true)
.build());
initPeriodicFileLoader(adultTokenFileContents, ADULT_TOKEN_FILE_NAME, executor);
initPeriodicFileLoader(offensiveTokenFileContents, OFFENSIVE_TOPIC_FILE_NAME, executor);
initPeriodicFileLoader(offensiveSubstringFileContents, OFFENSIVE_SUBSTRING_FILE_NAME, executor);
}
private void initPeriodicFileLoader(
AtomicReference<ByteSource> byteSource,
String fileName,
ScheduledExecutorService executor) {
File file = new File(filterDirectory, fileName);
try {
PeriodicFileLoader loader = new PeriodicFileLoader(
"offensive-evaluator-" + fileName,
file.getPath(),
executor,
Clock.SYSTEM_CLOCK) {
@Override
protected void accept(InputStream stream) throws IOException {
byteSource.set(ByteSource.wrap(IOUtils.toByteArray(stream)));
rebuildBlacklistedTopics();
}
};
loader.init();
} catch (Exception e) {
// Not the end of the world if we couldn't load the file; we already loaded the resource.
LOG.error("Could not load offensive topic filter " + fileName + " from ConfigBus", e);
}
}
private void rebuildBlacklistedTopics() throws IOException {
offensiveTopics.set(new BlacklistedTopics.Builder(false)
.loadFilterFromSource(adultTokenFileContents.get(), FilterMode.EXACT)
.loadFilterFromSource(offensiveSubstringFileContents.get(), FilterMode.SUBSTRING)
.build());
offensiveUsersTopics.set(new BlacklistedTopics.Builder(false)
.loadFilterFromSource(offensiveTokenFileContents.get(), FilterMode.EXACT)
.loadFilterFromSource(offensiveSubstringFileContents.get(), FilterMode.SUBSTRING)
.build());
}
@Override
public void evaluate(final TwitterMessage tweet) {
BlacklistedTopics offensiveFilter = this.offensiveTopics.get();
BlacklistedTopics offensiveUsersFilter = this.offensiveUsersTopics.get();
if (offensiveFilter == null || offensiveUsersFilter == null) {
return;
}
if (tweet.isSensitiveContent()) {
sensitiveTextCounter.increment();
}
// Check for user name.
Preconditions.checkState(tweet.getFromUserScreenName().isPresent(),
"Missing from-user screen name");
for (PenguinVersion penguinVersion : tweet.getSupportedPenguinVersions()) {
TweetTextQuality textQuality = tweet.getTweetTextQuality(penguinVersion);
if (tweet.isSensitiveContent()) {
textQuality.addBoolQuality(TweetTextQuality.BooleanQualityType.SENSITIVE);
}
// Check if username has an offensive term
if (isUserNameOffensive(
tweet.getFromUserScreenName().get(), offensiveUsersFilter, penguinVersion)) {
SearchRateCounter offensiveUserCounter = RelevanceStats.exportRate(
"num_offensive_user_" + penguinVersion.name().toLowerCase());
offensiveUserCounter.increment();
textQuality.addBoolQuality(TweetTextQuality.BooleanQualityType.OFFENSIVE_USER);
}
// Check if tweet has an offensive term
if (isTweetOffensive(tweet, offensiveFilter, penguinVersion)) {
SearchRateCounter offensiveTextCounter = RelevanceStats.exportRate(
"num_offensive_text_" + penguinVersion.name().toLowerCase());
offensiveTextCounter.increment();
textQuality.addBoolQuality(TweetTextQuality.BooleanQualityType.OFFENSIVE);
}
}
}
private boolean isUserNameOffensive(String userName,
BlacklistedTopics offensiveUsersFilter,
PenguinVersion penguinVersion) {
String normalizedUserName = NormalizerHelper.normalizeKeepCase(
userName, LocaleUtil.UNKNOWN, penguinVersion);
List<String> termsToCheck = new ArrayList<>(TokenizerHelper.getSubtokens(normalizedUserName));
termsToCheck.add(normalizedUserName.toLowerCase());
for (String userNameToken : termsToCheck) {
if (!StringUtils.isBlank(userNameToken) && offensiveUsersFilter.filter(userNameToken)) {
return true;
}
}
return false;
}
private boolean isTweetOffensive(final TwitterMessage tweet,
BlacklistedTopics offensiveFilter,
PenguinVersion penguinVersion) {
TweetTextFeatures textFeatures = tweet.getTweetTextFeatures(penguinVersion);
boolean tweetHasOffensiveTerm = false;
// Check for tweet text.
List<TokenizedCharSequence> ngrams =
NGRAM_GENERATOR_HOLDER.get().generateNgramsAsTokenizedCharSequence(
textFeatures.getTokenSequence(), tweet.getLocale());
for (TokenizedCharSequence ngram : ngrams) {
// skip URL ngram
if (!ngram.getTokensOf(TokenType.URL).isEmpty()) {
continue;
}
String ngramStr = ngram.toString();
if (!StringUtils.isBlank(ngramStr) && offensiveFilter.filter(ngramStr)) {
tweetHasOffensiveTerm = true;
break;
}
}
// Due to some strangeness in Penguin, we don't get ngrams for tokens around "\n-" or "-\n"
// in the original string, which made us miss some offensive words. Here we do another
// pass of checks using just the tokens generated by the tokenizer. (See SEARCHQUAL-8907)
if (!tweetHasOffensiveTerm) {
for (String ngramStr : textFeatures.getTokens()) {
// skip URLs
if (ngramStr.startsWith("http://") || ngramStr.startsWith("https://")) {
continue;
}
if (!StringUtils.isBlank(ngramStr) && offensiveFilter.filter(ngramStr)) {
tweetHasOffensiveTerm = true;
break;
}
}
}
if (!tweetHasOffensiveTerm) {
// check for resolved URLs
String resolvedUrlsText =
Joiner.on(" ").skipNulls().join(textFeatures.getResolvedUrlTokens());
List<String> ngramStrs = NGRAM_GENERATOR_HOLDER.get().generateNgramsAsString(
resolvedUrlsText, LocaleUtil.UNKNOWN);
for (String ngram : ngramStrs) {
if (!StringUtils.isBlank(ngram) && offensiveFilter.filter(ngram)) {
tweetHasOffensiveTerm = true;
break;
}
}
}
return tweetHasOffensiveTerm;
}
}
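A usage sketch; the filter directory is a hypothetical path, and when its files cannot be loaded the evaluator keeps the bundled resource versions.

```java
import java.io.File;
import com.twitter.search.common.relevance.entities.TwitterMessage;

public final class OffensiveEvalSketch {
  static void check(TwitterMessage tweet) {
    TweetOffensiveEvaluator evaluator =
        new TweetOffensiveEvaluator(new File("/path/to/filters")); // hypothetical path
    // Sets the OFFENSIVE / OFFENSIVE_USER bits on the tweet's TweetTextQuality.
    evaluator.evaluate(tweet);
  }
}
```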

View File

@ -1,105 +0,0 @@
package com.twitter.search.common.relevance.classifiers;
import java.io.IOException;
import java.util.Set;
import com.google.common.base.Preconditions;
import com.twitter.common.text.transformer.RegexTransformer;
import com.twitter.common.text.transformer.RtRemovalTransformer;
import com.twitter.common.text.transformer.Transformer;
import com.twitter.common.text.transformer.TransformerChain;
import com.twitter.common_internal.text.duplicate.RandomSubstringExtractor;
import com.twitter.common_internal.text.duplicate.SignatureGenerator;
import com.twitter.common_internal.text.version.PenguinVersion;
import com.twitter.search.common.relevance.entities.TwitterMessage;
import com.twitter.search.common.relevance.features.TweetIntegerShingleSignature;
import com.twitter.search.common.relevance.features.TweetTextFeatures;
import com.twitter.search.common.util.text.NormalizerHelper;
import com.twitter.twittertext.Regex;
/**
* Given a tweet text, extract useful text features.
*/
public class TweetQualityFeatureExtractor {
private static final Transformer STATUS_TEXT_CLEANER =
TransformerChain.of(
// remove @reply as defined in twitter-text
new RegexTransformer.Builder()
.setRegexPattern(Regex.VALID_REPLY)
.setReplaceString("")
.setTriggeringChar('@')
.build(),
// remove the old style retweet, eg RT: @mention or via @mention
new RtRemovalTransformer()
);
// for signature generation
private static final int MIN_NUM_FEATURES = 2;
private final SignatureGenerator signatureGenerator = new SignatureGenerator(
new RandomSubstringExtractor(
TweetIntegerShingleSignature.NUM_SHINGLES, // number of signatures
MIN_NUM_FEATURES, // each signature is generated by taking this number of features/tokens
// from text
false, // do not consider full tweet text as a feature
false)); // do not do early termination
/**
* Given TwitterMessage, extract all interesting tweet text features and store in
* the returned TweetTextFeatures object.
*
* @param tweet TwitterMessage to extract features from
*/
public void extractTweetTextFeatures(final TwitterMessage tweet) {
Preconditions.checkNotNull(tweet);
for (PenguinVersion penguinVersion : tweet.getSupportedPenguinVersions()) {
// Get basic features.
TweetTextFeatures textFeatures = tweet.getTweetTextFeatures(penguinVersion);
extractCharLength(textFeatures);
// The signature hashes on text with resolved URLs. We aggressively remove RT tags, which
// account for more than 50% of near-dups, and also remove @mentions.
// We use resolved URLs for the signature since they are what matters.
CharSequence strippedText = tweet.getTextReplacedWithResolvedURLs();
strippedText = strippedText == null ? "" : strippedText;
strippedText = STATUS_TEXT_CLEANER.transform(strippedText);
// Generate the signature.
// will lower case, use penguin
String normalizedSignatureText =
NormalizerHelper.normalize(strippedText, tweet.getLocale(), penguinVersion);
if (normalizedSignatureText != null && !normalizedSignatureText.isEmpty()) {
Set<byte[]> rawSignature =
signatureGenerator.generateSignatureByteArray(normalizedSignatureText);
textFeatures.setSignature((new TweetIntegerShingleSignature(rawSignature)).serialize());
}
}
}
/**
* Computes the number of letters in the stripped tweet text, and also records the
* number of capitalized letters.
*
* @param textFeatures TweetTextFeatures object to store the letter and caps counts.
*/
private static void extractCharLength(final TweetTextFeatures textFeatures) {
Preconditions.checkNotNull(textFeatures);
int length = 0;
int caps = 0;
String strippedText = textFeatures.getNormalizedStrippedText();
if (strippedText != null && !strippedText.isEmpty()) {
for (char c : strippedText.toCharArray()) {
if (Character.isLetter(c)) {
length++;
if (Character.isUpperCase(c)) {
caps++;
}
}
}
}
textFeatures.setLength(length);
textFeatures.setCaps(caps);
}
}
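A one-line usage sketch; the tweet instance is assumed to come from the caller.

```java
import com.twitter.search.common.relevance.entities.TwitterMessage;

public final class QualityFeatureSketch {
  static void extract(TwitterMessage tweet) {
    // Populates length, caps, and the near-duplicate signature for each
    // supported Penguin version of the tweet.
    new TweetQualityFeatureExtractor().extractTweetTextFeatures(tweet);
  }
}
```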

View File

@ -1,67 +0,0 @@
package com.twitter.search.common.relevance.classifiers;
import java.util.List;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.twitter.common_internal.text.version.PenguinVersion;
import com.twitter.finagle.mtls.authentication.ServiceIdentifier;
import com.twitter.search.common.relevance.config.TweetProcessingConfig;
import com.twitter.search.common.relevance.entities.TwitterMessage;
/**
* Classifier that focuses on tweet text features and their corresponding
* quality.
*/
public class TweetTextClassifier extends TweetClassifier {
private TweetQualityFeatureExtractor featureExtractor = new TweetQualityFeatureExtractor();
private TweetTrendsExtractor trendsExtractor = null;
/**
* Constructor. Requires a list of TweetQualityEvaluator objects.
* @param evaluators list of TweetQualityEvaluator objects responsible for quality evaluation.
* @param serviceIdentifier The identifier of the calling service.
* @param supportedPenguinVersions A list of supported penguin versions.
*/
public TweetTextClassifier(
final Iterable<TweetEvaluator> evaluators,
ServiceIdentifier serviceIdentifier,
List<PenguinVersion> supportedPenguinVersions) {
Preconditions.checkNotNull(evaluators);
setQualityEvaluators(evaluators);
TweetProcessingConfig.init();
if (TweetProcessingConfig.getBool("extract_trends", false)) {
trendsExtractor = new TweetTrendsExtractor(serviceIdentifier, supportedPenguinVersions);
}
}
/**
* Extract text features for the specified TwitterMessage.
*
* @param tweet TwitterMessage to extract features from.
*/
@Override
protected void extractFeatures(TwitterMessage tweet) {
extractFeatures(Lists.newArrayList(tweet));
}
/**
* Extract text features for the specified list of TwitterMessages.
*
* @param tweets list of TwitterMessages to extract interesting features for
*/
@Override
protected void extractFeatures(Iterable<TwitterMessage> tweets) {
Preconditions.checkNotNull(tweets);
for (TwitterMessage tweet : tweets) {
featureExtractor.extractTweetTextFeatures(tweet);
}
// Optionally try to annotate trends for all the tweets.
if (TweetProcessingConfig.getBool("extract_trends", false) && trendsExtractor != null) {
trendsExtractor.extractTrends(tweets);
}
}
}
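A wiring sketch combining the pieces above; the `ServiceIdentifier` and the tweet are assumed to come from the caller.

```java
import java.util.List;
import com.google.common.collect.ImmutableList;
import com.twitter.common_internal.text.version.PenguinVersion;
import com.twitter.finagle.mtls.authentication.ServiceIdentifier;
import com.twitter.search.common.relevance.entities.TwitterMessage;

public final class TextClassifierSketch {
  static void classify(ServiceIdentifier id, TwitterMessage tweet) {
    List<PenguinVersion> versions = ImmutableList.of(PenguinVersion.PENGUIN_6);
    TweetTextClassifier classifier = new TweetTextClassifier(
        ImmutableList.of(new TweetTextEvaluator(), new TweetOffensiveEvaluator()),
        id,
        versions);
    classifier.classifyTweet(tweet); // extract features, then run the evaluators
  }
}
```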

View File

@ -1,54 +0,0 @@
package com.twitter.search.common.relevance.classifiers;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;
import com.twitter.common_internal.text.version.PenguinVersion;
import com.twitter.search.common.relevance.entities.TwitterMessage;
import com.twitter.search.common.relevance.features.TweetTextFeatures;
import com.twitter.search.common.relevance.features.TweetTextQuality;
/**
* Calculates entropy of tweet text based on tokens.
*/
public class TweetTextEvaluator extends TweetEvaluator {
@Override
public void evaluate(final TwitterMessage tweet) {
for (PenguinVersion penguinVersion : tweet.getSupportedPenguinVersions()) {
TweetTextFeatures textFeatures = tweet.getTweetTextFeatures(penguinVersion);
TweetTextQuality textQuality = tweet.getTweetTextQuality(penguinVersion);
double readability = 0;
int numKeptWords = textFeatures.getStrippedTokensSize();
for (String token : textFeatures.getStrippedTokens()) {
readability += token.length();
}
if (numKeptWords > 0) {
readability = readability * Math.log(numKeptWords) / numKeptWords;
}
textQuality.setReadability(readability);
textQuality.setEntropy(entropy(textFeatures.getStrippedTokens()));
textQuality.setShout(textFeatures.getCaps() / Math.max(textFeatures.getLength(), 1.0d));
}
}
private static double entropy(List<String> tokens) {
Map<String, Long> tokenCounts =
tokens.stream().collect(Collectors.groupingBy(Function.identity(), Collectors.counting()));
int numItems = tokens.size();
double entropy = 0;
for (long count : tokenCounts.values()) {
double prob = (double) count / numItems;
entropy -= prob * log2(prob);
}
return entropy;
}
private static double log2(double n) {
return Math.log(n) / Math.log(2);
}
}
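As a concrete check of the entropy computation: for tokens {the, cat, the, mat} the probabilities are {0.5, 0.25, 0.25}, giving H = -(0.5 log2 0.5 + 2 × 0.25 log2 0.25) = 1.5 bits. The same computation, inlined:

```java
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;

public final class EntropyDemo {
  public static void main(String[] args) {
    List<String> tokens = Arrays.asList("the", "cat", "the", "mat");
    Map<String, Long> counts = tokens.stream()
        .collect(Collectors.groupingBy(Function.identity(), Collectors.counting()));
    double entropy = 0;
    for (long c : counts.values()) {
      double p = (double) c / tokens.size();
      entropy -= p * (Math.log(p) / Math.log(2));
    }
    System.out.println(entropy); // 1.5
  }
}
```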

View File

@ -1,165 +0,0 @@
package com.twitter.search.common.relevance.classifiers;
import java.util.List;
import java.util.concurrent.TimeUnit;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.common_internal.text.version.PenguinVersion;
import com.twitter.finagle.mtls.authentication.ServiceIdentifier;
import com.twitter.search.common.metrics.RelevanceStats;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.relevance.NGramCache;
import com.twitter.search.common.relevance.TrendsThriftDataServiceManager;
import com.twitter.search.common.relevance.config.TweetProcessingConfig;
import com.twitter.search.common.relevance.entities.TwitterMessage;
import com.twitter.search.common.relevance.features.TweetTextFeatures;
import com.twitter.util.Duration;
/**
* Determines if tweets contains trending terms.
* Sets corresponding bits and fields to TweetTextFeatures.
*/
public class TweetTrendsExtractor {
// The amount of time before filling the trends cache for the first time.
private static final long INIT_TRENDS_CACHE_DELAY = 0;
private static final Logger LOG = LoggerFactory.getLogger(TweetTrendsExtractor.class.getName());
private static final int LOGGING_INTERVAL = 100000;
// Singleton trends data service. This is the default service used unless a different
// instance is injected in the constructor.
private static volatile TrendsThriftDataServiceManager trendsDataServiceSingleton;
// trends cache used for extracting trends from tweets
private static volatile ImmutableMap<PenguinVersion, NGramCache> trendsCaches;
private static synchronized void initTrendsDataServiceInstance(
ServiceIdentifier serviceIdentifier,
List<PenguinVersion> supportedPenguinVersions) {
if (trendsDataServiceSingleton == null) {
TweetProcessingConfig.init();
if (trendsCaches == null) {
ImmutableMap.Builder<PenguinVersion, NGramCache> trendsCachesBuilder =
ImmutableMap.builder();
for (PenguinVersion penguinVersion : supportedPenguinVersions) {
NGramCache cache = NGramCache.builder()
.maxCacheSize(
TweetProcessingConfig.getInt("trends_extractor_num_trends_to_cache", 5000))
.penguinVersion(penguinVersion)
.build();
trendsCachesBuilder.put(penguinVersion, cache);
}
trendsCaches = trendsCachesBuilder.build();
}
long rawTimeout = TweetProcessingConfig.getLong("trends_extractor_timeout_msec", 200);
long rawInterval =
TweetProcessingConfig.getLong("trends_extractor_reload_interval_sec", 600L);
trendsDataServiceSingleton =
TrendsThriftDataServiceManager.newInstance(
serviceIdentifier,
TweetProcessingConfig.getInt("trends_extractor_retry", 2),
Duration.apply(rawTimeout, TimeUnit.MILLISECONDS),
Duration.apply(INIT_TRENDS_CACHE_DELAY, TimeUnit.SECONDS),
Duration.apply(rawInterval, TimeUnit.SECONDS),
trendsCaches.values().asList()
);
trendsDataServiceSingleton.startAutoRefresh();
LOG.info("Started trend extractor.");
}
}
public TweetTrendsExtractor(
ServiceIdentifier serviceIdentifier,
List<PenguinVersion> supportedPenguinVersions) {
initTrendsDataServiceInstance(serviceIdentifier, supportedPenguinVersions);
}
/**
* Extract trending terms from the specified tweet.
* @param tweet the specified tweet
*/
public void extractTrends(TwitterMessage tweet) {
extractTrends(ImmutableList.of(tweet));
}
/**
* Extract trending terms from the specified list of tweets.
* @param tweets a list of tweets
*/
public void extractTrends(Iterable<TwitterMessage> tweets) {
Preconditions.checkNotNull(tweets);
for (TwitterMessage tweet : tweets) {
for (PenguinVersion penguinVersion : tweet.getSupportedPenguinVersions()) {
NGramCache trendsCache = trendsCaches.get(penguinVersion);
if (trendsCache == null) {
LOG.info("Trends cache for Penguin version " + penguinVersion + " is null.");
continue;
} else if (trendsCache.numTrendingTerms() == 0) {
LOG.info("Trends cache for Penguin version " + penguinVersion + " is empty.");
continue;
}
List<String> trendsInTweet = trendsCache.extractTrendsFrom(
tweet.getTokenizedCharSequence(penguinVersion), tweet.getLocale());
TweetTextFeatures textFeatures = tweet.getTweetTextFeatures(penguinVersion);
if (textFeatures == null || textFeatures.getTokens() == null) {
continue;
}
textFeatures.getTrendingTerms().addAll(trendsInTweet);
updateTrendsStats(
tweet,
textFeatures,
penguinVersion,
RelevanceStats.exportLong(
"trends_extractor_has_trends_" + penguinVersion.name().toLowerCase()),
RelevanceStats.exportLong(
"trends_extractor_no_trends_" + penguinVersion.name().toLowerCase()),
RelevanceStats.exportLong(
"trends_extractor_too_many_trends_" + penguinVersion.name().toLowerCase()));
}
}
}
private void updateTrendsStats(TwitterMessage tweet,
TweetTextFeatures textFeatures,
PenguinVersion penguinVersion,
SearchCounter hasTrendsCounterToUpdate,
SearchCounter noTrendsCounterToUpdate,
SearchCounter tooManyTrendsCounterToUpdate) {
int numTrendingTerms = textFeatures.getTrendingTerms().size();
if (numTrendingTerms == 0) {
noTrendsCounterToUpdate.increment();
} else {
if (numTrendingTerms > 1) {
tooManyTrendsCounterToUpdate.increment();
}
hasTrendsCounterToUpdate.increment();
}
long counter = noTrendsCounterToUpdate.get();
if (counter % LOGGING_INTERVAL == 0) {
long hasTrends = hasTrendsCounterToUpdate.get();
long noTrends = noTrendsCounterToUpdate.get();
long tooManyTrends = tooManyTrendsCounterToUpdate.get();
double ratio = 100.0d * hasTrends / (hasTrends + noTrends + 1);
double tooManyTrendsRatio = 100.0d * tooManyTrends / (hasTrends + 1);
LOG.info(String.format(
"Has trends %d, no trends %d, ratio %.2f, too many trends %.2f,"
+ " sample tweet id [%d] matching terms [%s] penguin version [%s]",
hasTrends, noTrends, ratio, tooManyTrendsRatio, tweet.getId(),
textFeatures.getTrendingTerms(), penguinVersion));
}
}
}
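A usage sketch, with the identifier and tweets assumed; the first construction boots the shared trends service singleton and starts its refresh loop.

```java
import com.google.common.collect.ImmutableList;
import com.twitter.common_internal.text.version.PenguinVersion;
import com.twitter.finagle.mtls.authentication.ServiceIdentifier;
import com.twitter.search.common.relevance.entities.TwitterMessage;

public final class TrendsExtractorSketch {
  static void annotate(ServiceIdentifier id, Iterable<TwitterMessage> tweets) {
    TweetTrendsExtractor extractor =
        new TweetTrendsExtractor(id, ImmutableList.of(PenguinVersion.PENGUIN_6));
    extractor.extractTrends(tweets); // fills TweetTextFeatures.getTrendingTerms()
  }
}
```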

View File

@ -1,114 +0,0 @@
package com.twitter.search.common.relevance.config;
import java.io.InputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.search.common.config.ConfigFile;
/**
* Config file for relevance computation.
*/
public final class TweetProcessingConfig {
private static final Logger LOG = LoggerFactory.getLogger(TweetProcessingConfig.class);
private static final String SCORER_CONFIG_DIR = "common/relevance/config";
public static final String DEFAULT_CONFIG_FILE = "relevance.yml";
private static ConfigFile relevanceConfig = null;
private TweetProcessingConfig() {
}
/** Initializes this instance from the given config file. */
public static void init(String configFile) {
if (relevanceConfig == null) {
synchronized (TweetProcessingConfig.class) {
if (relevanceConfig == null) {
String file = configFile == null ? DEFAULT_CONFIG_FILE : configFile;
relevanceConfig = new ConfigFile(SCORER_CONFIG_DIR, file);
}
}
}
}
/** Initializes this instance from the given input stream. */
public static void init(InputStream inputStream, String configType) {
if (relevanceConfig == null) {
synchronized (TweetProcessingConfig.class) {
if (relevanceConfig == null) {
relevanceConfig = new ConfigFile(inputStream, configType);
}
}
}
}
/** Initializes this instance. */
public static void init() {
init(null);
}
/**
* Returns the value of the given property as a double value.
*
* @param property The property.
* @param defaultValue The default value to return if the property is not present in the config.
*/
public static double getDouble(String property, double defaultValue) {
return relevanceConfig.getDouble(property, defaultValue);
}
/**
* Returns the value of the given property as a string value.
*
* @param property The property.
* @param defaultValue The default value to return if the property is not present in the config.
*/
public static String getString(String property, String defaultValue) {
return relevanceConfig.getString(property, defaultValue);
}
/**
* Returns the value of the given property as an integer value.
*
* @param property The property.
* @param defaultValue The default value to return if the property is not present in the config.
*/
public static int getInt(String property, int defaultValue) {
return relevanceConfig.getInt(property, defaultValue);
}
/**
* Returns the value of the given property as a long value.
*
* @param property The property.
* @param defaultValue The default value to return if the property is not present in the config.
*/
public static long getLong(String property, long defaultValue) {
return relevanceConfig.getLong(property, defaultValue);
}
/**
* Returns the value of the given property as a boolean value.
*
* @param property The property.
* @param defaultValue The default value to return if the property is not present in the config.
*/
public static boolean getBool(String property, boolean defaultValue) {
return relevanceConfig.getBool(property, defaultValue);
}
/**
* Returns the value of the given property as a string.
*
* @param property The property.
* @throws ConfigurationException If the given property is not found in the config.
*/
public static String getString(String property) {
try {
return relevanceConfig.getString(property);
} catch (ConfigurationException e) {
LOG.error("Fatal error: could not get config string " + property, e);
throw new RuntimeException(e);
}
}
}
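
A minimal usage sketch for the config holder above, assuming the search-common jars are on the classpath; the property names `min_text_score` and `use_trends` are hypothetical and only illustrate the typed getters with defaults:

```java
import com.twitter.search.common.relevance.config.TweetProcessingConfig;

public class TweetProcessingConfigExample {
  public static void main(String[] args) {
    // Loads common/relevance/config/relevance.yml once; later init() calls are no-ops.
    TweetProcessingConfig.init();

    // Typed getters fall back to the supplied default when the key is absent.
    // "min_text_score" and "use_trends" are hypothetical property names.
    double minScore = TweetProcessingConfig.getDouble("min_text_score", 0.0);
    boolean useTrends = TweetProcessingConfig.getBool("use_trends", false);
    System.out.println("min_text_score=" + minScore + ", use_trends=" + useTrends);
  }
}
```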

View File

@ -1,201 +0,0 @@
package com.twitter.search.common.relevance.entities;
import java.util.List;
import java.util.Optional;
import com.google.common.annotations.VisibleForTesting;
import com.twitter.search.common.indexing.thriftjava.ThriftGeoLocationSource;
import com.twitter.search.common.indexing.thriftjava.ThriftGeoTags;
import com.twitter.tweetypie.thriftjava.GeoCoordinates;
import com.twitter.tweetypie.thriftjava.Place;
import geo.google.datamodel.GeoAddressAccuracy;
/**
* A GeoObject, extending a GeoCoordinate to include radius and accuracy
*/
public class GeoObject {
public static final int INT_FIELD_NOT_PRESENT = -1;
public static final double DOUBLE_FIELD_NOT_PRESENT = -1.0;
private double latitude = DOUBLE_FIELD_NOT_PRESENT;
private double longitude = DOUBLE_FIELD_NOT_PRESENT;
private double radius = DOUBLE_FIELD_NOT_PRESENT;
private final ThriftGeoLocationSource source;
// Valid range is 0-9, with 0 being unknown and 9 being most accurate.
// If no accuracy information is available, this should be set to INT_FIELD_NOT_PRESENT.
private int accuracy = 0;
/** Creates a new GeoObject instance. */
public GeoObject(double lat, double lon, ThriftGeoLocationSource source) {
this(lat, lon, 0, source);
}
/** Creates a new GeoObject instance. */
public GeoObject(double lat, double lon, int acc, ThriftGeoLocationSource source) {
latitude = lat;
longitude = lon;
accuracy = acc;
this.source = source;
}
/** Creates a new GeoObject instance. */
public GeoObject(ThriftGeoLocationSource source) {
this.source = source;
}
/**
* Tries to create a {@code GeoObject} instance from a given TweetyPie {@code Place} struct based
* on its bounding box coordinates.
*
* @param place
* @return {@code Optional} instance with {@code GeoObject} if bounding box coordinates are
* available, or an empty {@code Optional}.
*/
public static Optional<GeoObject> fromPlace(Place place) {
// Can't use place.centroid: from the sample of data, centroid seems to always be null
// (as of May 17 2016).
if (place.isSetBounding_box() && place.getBounding_boxSize() > 0) {
int pointsCount = place.getBounding_boxSize();
if (pointsCount == 1) {
GeoCoordinates point = place.getBounding_box().get(0);
return Optional.of(createForIngester(point.getLatitude(), point.getLongitude()));
} else {
double sumLatitude = 0.0;
double sumLongitude = 0.0;
List<GeoCoordinates> box = place.getBounding_box();
// Drop the last point if it's the same as the first point.
// The same logic is present in several other classes dealing with places.
// See e.g. birdherd/src/main/scala/com/twitter/birdherd/tweetypie/TweetyPiePlace.scala
if (box.get(pointsCount - 1).equals(box.get(0))) {
pointsCount--;
}
for (int i = 0; i < pointsCount; i++) {
GeoCoordinates coords = box.get(i);
sumLatitude += coords.getLatitude();
sumLongitude += coords.getLongitude();
}
double averageLatitude = sumLatitude / pointsCount;
double averageLongitude = sumLongitude / pointsCount;
return Optional.of(GeoObject.createForIngester(averageLatitude, averageLongitude));
}
}
return Optional.empty();
}
public void setRadius(double radius) {
this.radius = radius;
}
public Double getRadius() {
return radius;
}
public void setLatitude(double latitude) {
this.latitude = latitude;
}
public Double getLatitude() {
return latitude;
}
public void setLongitude(double longitude) {
this.longitude = longitude;
}
public Double getLongitude() {
return longitude;
}
public int getAccuracy() {
return accuracy;
}
public void setAccuracy(int accuracy) {
this.accuracy = accuracy;
}
public ThriftGeoLocationSource getSource() {
return source;
}
/** Converts this GeoObject instance to a ThriftGeoTags instance. */
public ThriftGeoTags toThriftGeoTags(long twitterMessageId) {
ThriftGeoTags geoTags = new ThriftGeoTags();
geoTags.setStatusId(twitterMessageId);
geoTags.setLatitude(getLatitude());
geoTags.setLongitude(getLongitude());
geoTags.setAccuracy(accuracy);
geoTags.setGeoLocationSource(source);
return geoTags;
}
private static final double COORDS_EQUALITY_THRESHOLD = 1e-7;
/**
* Performs an approximate comparison between the two GeoObject instances.
*
* @deprecated This code is not performant and should not be used in
* production code. Use only for tests. See SEARCH-5148.
*/
@Deprecated
@VisibleForTesting
public static boolean approxEquals(GeoObject a, GeoObject b) {
if (a == null && b == null) {
return true;
}
if ((a == null && b != null) || (a != null && b == null)) {
return false;
}
if (a.accuracy != b.accuracy) {
return false;
}
if (Math.abs(a.latitude - b.latitude) > COORDS_EQUALITY_THRESHOLD) {
return false;
}
if (Math.abs(a.longitude - b.longitude) > COORDS_EQUALITY_THRESHOLD) {
return false;
}
if (Double.compare(a.radius, b.radius) != 0) {
return false;
}
if (a.source != b.source) {
return false;
}
return true;
}
@Override
public String toString() {
return "GeoObject{"
+ "latitude=" + latitude
+ ", longitude=" + longitude
+ ", radius=" + radius
+ ", source=" + source
+ ", accuracy=" + accuracy
+ '}';
}
/**
* Convenience factory method for ingester purposes.
*/
public static GeoObject createForIngester(double latitude, double longitude) {
return new GeoObject(
latitude,
longitude,
// store with highest level of accuracy: POINT_LEVEL
GeoAddressAccuracy.POINT_LEVEL.getCode(),
ThriftGeoLocationSource.GEOTAG);
}
}
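
A self-contained sketch of the centroid averaging that `fromPlace` performs, using plain lat/lon pairs instead of the TweetyPie `GeoCoordinates` thrift type; the closed bounding box repeats its first point, which is dropped before averaging:

```java
public class BoundingBoxCentroidExample {
  public static void main(String[] args) {
    // A closed bounding box: the last point repeats the first and is dropped.
    double[][] box = {
        {37.0, -122.5}, {37.0, -122.3}, {37.2, -122.3}, {37.2, -122.5}, {37.0, -122.5}};
    int pointsCount = box.length;
    if (box[pointsCount - 1][0] == box[0][0] && box[pointsCount - 1][1] == box[0][1]) {
      pointsCount--; // same de-duplication as GeoObject.fromPlace
    }
    double sumLat = 0.0;
    double sumLon = 0.0;
    for (int i = 0; i < pointsCount; i++) {
      sumLat += box[i][0];
      sumLon += box[i][1];
    }
    // Prints 37.1, -122.4: the average of the four distinct corners.
    System.out.println(sumLat / pointsCount + ", " + sumLon / pointsCount);
  }
}
```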

View File

@ -1,122 +0,0 @@
package com.twitter.search.common.relevance.entities;
import java.util.Locale;
import com.google.common.base.Preconditions;
import org.apache.commons.lang.StringUtils;
import com.twitter.common_internal.text.version.PenguinVersion;
import com.twitter.search.common.indexing.thriftjava.PotentialLocation;
import com.twitter.search.common.util.text.LanguageIdentifierHelper;
import com.twitter.search.common.util.text.NormalizerHelper;
import com.twitter.search.common.util.text.TokenizerHelper;
/**
* An immutable tuple to wrap a country code, region and locality. Based on the PotentialLocation
* struct in status.thrift.
*/
public class PotentialLocationObject {
private final String countryCode;
private final String region;
private final String locality;
/**
* Creates a new PotentialLocationObject instance.
*
* @param countryCode The country code.
* @param region The region.
* @param locality The locality.
*/
public PotentialLocationObject(String countryCode, String region, String locality) {
this.countryCode = countryCode;
this.region = region;
this.locality = locality;
}
public String getCountryCode() {
return countryCode;
}
public String getRegion() {
return region;
}
public String getLocality() {
return locality;
}
/**
* Converts this PotentialLocationObject instance to a PotentialLocation thrift struct.
*
* @param penguinVersion The penguin version to use for normalization and tokenization.
*/
public PotentialLocation toThriftPotentialLocation(PenguinVersion penguinVersion) {
Preconditions.checkNotNull(penguinVersion);
String normalizedCountryCode = null;
if (countryCode != null) {
Locale countryCodeLocale = LanguageIdentifierHelper.identifyLanguage(countryCode);
normalizedCountryCode =
NormalizerHelper.normalize(countryCode, countryCodeLocale, penguinVersion);
}
String tokenizedRegion = null;
if (region != null) {
Locale regionLocale = LanguageIdentifierHelper.identifyLanguage(region);
String normalizedRegion = NormalizerHelper.normalize(region, regionLocale, penguinVersion);
tokenizedRegion = StringUtils.join(
TokenizerHelper.tokenizeQuery(normalizedRegion, regionLocale, penguinVersion), " ");
}
String tokenizedLocality = null;
if (locality != null) {
Locale localityLocale = LanguageIdentifierHelper.identifyLanguage(locality);
String normalizedLocality =
NormalizerHelper.normalize(locality, localityLocale, penguinVersion);
tokenizedLocality =
StringUtils.join(TokenizerHelper.tokenizeQuery(
normalizedLocality, localityLocale, penguinVersion), " ");
}
return new PotentialLocation()
.setCountryCode(normalizedCountryCode)
.setRegion(tokenizedRegion)
.setLocality(tokenizedLocality);
}
@Override
public int hashCode() {
return ((countryCode == null) ? 0 : countryCode.hashCode())
+ 13 * ((region == null) ? 0 : region.hashCode())
+ 19 * ((locality == null) ? 0 : locality.hashCode());
}
@Override
public boolean equals(Object obj) {
if (!(obj instanceof PotentialLocationObject)) {
return false;
}
PotentialLocationObject entry = (PotentialLocationObject) obj;
return (countryCode == null
? entry.countryCode == null
: countryCode.equals(entry.countryCode))
&& (region == null
? entry.region == null
: region.equals(entry.region))
&& (locality == null
? entry.locality == null
: locality.equals(entry.locality));
}
@Override
public String toString() {
return new StringBuilder("PotentialLocationObject {")
.append("countryCode=").append(countryCode)
.append(", region=").append(region)
.append(", locality=").append(locality)
.append("}")
.toString();
}
}
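
A brief usage sketch, assuming the internal text libraries are available and that `PenguinVersion.PENGUIN_6` is a valid tokenizer version (an assumption; the enum is not shown here):

```java
import com.twitter.common_internal.text.version.PenguinVersion;
import com.twitter.search.common.indexing.thriftjava.PotentialLocation;
import com.twitter.search.common.relevance.entities.PotentialLocationObject;

public class PotentialLocationExample {
  public static void main(String[] args) {
    PotentialLocationObject loc =
        new PotentialLocationObject("US", "California", "San Francisco");
    // Normalizes the country code and tokenizes region/locality for indexing.
    PotentialLocation thrift = loc.toThriftPotentialLocation(PenguinVersion.PENGUIN_6);
    System.out.println(thrift);
  }
}
```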

View File

@ -1,231 +0,0 @@
package com.twitter.search.common.relevance.entities;
import java.util.Optional;
import javax.annotation.Nonnull;
import com.google.common.base.Preconditions;
import org.apache.commons.lang3.builder.EqualsBuilder;
import org.apache.commons.lang3.builder.HashCodeBuilder;
import org.apache.lucene.analysis.TokenStream;
import com.twitter.search.common.util.text.TokenizerHelper;
// Represents from-user, to-user, mentions and audioSpace admins in TwitterMessage.
public final class TwitterMessageUser {
@Nonnull private final Optional<String> screenName; // a.k.a. user handle or username
@Nonnull private final Optional<String> displayName;
@Nonnull private Optional<TokenStream> tokenizedScreenName;
@Nonnull private final Optional<Long> id; // twitter ID
public static final class Builder {
@Nonnull private Optional<String> screenName = Optional.empty();
@Nonnull private Optional<String> displayName = Optional.empty();
@Nonnull private Optional<TokenStream> tokenizedScreenName = Optional.empty();
@Nonnull private Optional<Long> id = Optional.empty();
public Builder() {
}
/**
* Initializes the Builder based on an existing TwitterMessageUser.
*/
public Builder(TwitterMessageUser user) {
this.screenName = user.screenName;
this.displayName = user.displayName;
this.tokenizedScreenName = user.tokenizedScreenName;
this.id = user.id;
}
/**
* Initializes the Builder's screen name (the handle/the name following the "@") and
* tokenizes it.
*/
public Builder withScreenName(Optional<String> newScreenName) {
this.screenName = newScreenName;
if (newScreenName.isPresent()) {
this.tokenizedScreenName = Optional.of(
TokenizerHelper.getNormalizedCamelcaseTokenStream(newScreenName.get()));
}
return this;
}
/**
* Initializes the Builder's display name.
*/
public Builder withDisplayName(Optional<String> newDisplayName) {
this.displayName = newDisplayName;
return this;
}
public Builder withId(Optional<Long> newId) {
this.id = newId;
return this;
}
public TwitterMessageUser build() {
return new TwitterMessageUser(
screenName, displayName, tokenizedScreenName, id);
}
}
/** Creates a TwitterMessageUser instance with the given screen name. */
public static TwitterMessageUser createWithScreenName(@Nonnull String screenName) {
Preconditions.checkNotNull(screenName, "Don't set a null screen name");
return new Builder()
.withScreenName(Optional.of(screenName))
.build();
}
/** Creates a TwitterMessageUser instance with the given display name. */
public static TwitterMessageUser createWithDisplayName(@Nonnull String displayName) {
Preconditions.checkNotNull(displayName, "Don't set a null display name");
return new Builder()
.withDisplayName(Optional.of(displayName))
.build();
}
/** Creates a TwitterMessageUser instance with the given ID. */
public static TwitterMessageUser createWithId(long id) {
Preconditions.checkArgument(id >= 0, "Don't set a negative user ID");
return new Builder()
.withId(Optional.of(id))
.build();
}
/** Creates a TwitterMessageUser instance with the given parameters. */
public static TwitterMessageUser createWithNamesAndId(
@Nonnull String screenName,
@Nonnull String displayName,
long id) {
Preconditions.checkNotNull(screenName, "Use another method instead of passing null name");
Preconditions.checkNotNull(displayName, "Use another method instead of passing null name");
Preconditions.checkArgument(id >= 0, "Use another method instead of passing negative ID");
return new Builder()
.withScreenName(Optional.of(screenName))
.withDisplayName(Optional.of(displayName))
.withId(Optional.of(id))
.build();
}
/** Creates a TwitterMessageUser instance with the given parameters. */
public static TwitterMessageUser createWithNames(
@Nonnull String screenName,
@Nonnull String displayName) {
Preconditions.checkNotNull(screenName, "Use another method instead of passing null name");
Preconditions.checkNotNull(displayName, "Use another method instead of passing null name");
return new Builder()
.withScreenName(Optional.of(screenName))
.withDisplayName(Optional.of(displayName))
.build();
}
/** Creates a TwitterMessageUser instance with the given parameters. */
public static TwitterMessageUser createWithOptionalNamesAndId(
@Nonnull Optional<String> optScreenName,
@Nonnull Optional<String> optDisplayName,
@Nonnull Optional<Long> optId) {
Preconditions.checkNotNull(optScreenName, "Pass Optional.absent() instead of null");
Preconditions.checkNotNull(optDisplayName, "Pass Optional.absent() instead of null");
Preconditions.checkNotNull(optId, "Pass Optional.absent() instead of null");
return new Builder()
.withScreenName(optScreenName)
.withDisplayName(optDisplayName)
.withId(optId)
.build();
}
private TwitterMessageUser(
@Nonnull Optional<String> screenName,
@Nonnull Optional<String> displayName,
@Nonnull Optional<TokenStream> tokenizedScreenName,
@Nonnull Optional<Long> id) {
this.screenName = screenName;
this.displayName = displayName;
this.tokenizedScreenName = tokenizedScreenName;
this.id = id;
}
/** Creates a copy of this TwitterMessageUser instance, with the given screen name. */
public TwitterMessageUser copyWithScreenName(@Nonnull String newScreenName) {
Preconditions.checkNotNull(newScreenName, "Don't set a null screen name");
return new Builder(this)
.withScreenName(Optional.of(newScreenName))
.build();
}
/** Creates a copy of this TwitterMessageUser instance, with the given display name. */
public TwitterMessageUser copyWithDisplayName(@Nonnull String newDisplayName) {
Preconditions.checkNotNull(newDisplayName, "Don't set a null display name");
return new Builder(this)
.withDisplayName(Optional.of(newDisplayName))
.build();
}
/** Creates a copy of this TwitterMessageUser instance, with the given ID. */
public TwitterMessageUser copyWithId(long newId) {
Preconditions.checkArgument(newId >= 0, "Don't set a negative user ID");
return new Builder(this)
.withId(Optional.of(newId))
.build();
}
public Optional<String> getScreenName() {
return screenName;
}
public Optional<String> getDisplayName() {
return displayName;
}
public Optional<TokenStream> getTokenizedScreenName() {
return tokenizedScreenName;
}
public Optional<Long> getId() {
return id;
}
@Override
public String toString() {
return "[" + screenName + ", " + displayName + ", " + id + "]";
}
/**
* Compares this TwitterMessageUser instance to the given object.
*
* @deprecated Only the screen name and display name are compared.
*/
@Deprecated
@Override
public boolean equals(Object o) {
if (o == null) {
return false;
}
if (o == this) {
return true;
}
if (o.getClass() != getClass()) {
return false;
}
TwitterMessageUser other = (TwitterMessageUser) o;
return new EqualsBuilder()
.append(screenName, other.screenName)
.append(displayName, other.displayName)
.isEquals();
}
/**
* Returns a hash code for this TwitterMessageUser instance.
*
* @deprecated Uses reflection-based hashing; avoid in performance-sensitive code.
*/
@Deprecated
@Override
public int hashCode() {
return HashCodeBuilder.reflectionHashCode(this);
}
}
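
A short sketch of the factory and copy API above; the names and ID are illustrative only:

```java
import com.twitter.search.common.relevance.entities.TwitterMessageUser;

public class TwitterMessageUserExample {
  public static void main(String[] args) {
    // The screen name is camel-case tokenized by the builder at construction time.
    TwitterMessageUser user =
        TwitterMessageUser.createWithNamesAndId("SearchFan", "Search Fan", 12L);

    // Copies go through the Builder and reuse the existing Optional fields.
    TwitterMessageUser renamed = user.copyWithDisplayName("Earlybird Fan");
    System.out.println(user + " -> " + renamed);
  }
}
```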

View File

@ -1,444 +0,0 @@
package com.twitter.search.common.relevance.entities;
import java.text.Normalizer;
import java.util.Map;
import java.util.NavigableMap;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentMap;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.common.text.transformer.HTMLTagRemovalTransformer;
import com.twitter.common_internal.text.extractor.EmojiExtractor;
import com.twitter.search.common.metrics.SearchRateCounter;
import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser;
public final class TwitterMessageUtil {
private static final Logger LOG = LoggerFactory.getLogger(TwitterMessageUtil.class);
private TwitterMessageUtil() {
}
@VisibleForTesting
static final ConcurrentMap<Field, Counters> COUNTERS_MAP = Maps.newConcurrentMap();
// We truncate the location string because we used to use a MySQL table to store the geocoding
// information. In the MySQL table, the location string was a fixed width of 30 characters.
// We have migrated to Manhattan, and the location string is no longer limited to 30 characters.
// However, in order to correctly look up location geocodes from Manhattan, we still need to
// truncate the location just like we did before.
private static final int MAX_LOCATION_LEN = 30;
// Note: we strip tags to index source, as typically source contains <a href=...> tags.
// Sometimes we get a source where stripping fails, as the URL in the tag was
// excessively long. We drop these sources, as there is little reason to index them.
private static final int MAX_SOURCE_LEN = 64;
private static HTMLTagRemovalTransformer tagRemovalTransformer = new HTMLTagRemovalTransformer();
private static final String STAT_PREFIX = "twitter_message_";
public enum Field {
FROM_USER_DISPLAY_NAME,
NORMALIZED_LOCATION,
ORIG_LOCATION,
ORIG_SOURCE,
SHARED_USER_DISPLAY_NAME,
SOURCE,
TEXT,
TO_USER_SCREEN_NAME;
public String getNameForStats() {
return name().toLowerCase();
}
}
@VisibleForTesting
static class Counters {
private final SearchRateCounter truncatedCounter;
private final SearchRateCounter tweetsWithStrippedSupplementaryCharsCounter;
private final SearchRateCounter strippedSupplementaryCharsCounter;
private final SearchRateCounter nonStrippedEmojiCharsCounter;
private final SearchRateCounter emojisAtTruncateBoundaryCounter;
Counters(Field field) {
String fieldNameForStats = field.getNameForStats();
truncatedCounter = SearchRateCounter.export(
STAT_PREFIX + "truncated_" + fieldNameForStats);
tweetsWithStrippedSupplementaryCharsCounter = SearchRateCounter.export(
STAT_PREFIX + "tweets_with_stripped_supplementary_chars_" + fieldNameForStats);
strippedSupplementaryCharsCounter = SearchRateCounter.export(
STAT_PREFIX + "stripped_supplementary_chars_" + fieldNameForStats);
nonStrippedEmojiCharsCounter = SearchRateCounter.export(
STAT_PREFIX + "non_stripped_emoji_chars_" + fieldNameForStats);
emojisAtTruncateBoundaryCounter = SearchRateCounter.export(
STAT_PREFIX + "emojis_at_truncate_boundary_" + fieldNameForStats);
}
SearchRateCounter getTruncatedCounter() {
return truncatedCounter;
}
SearchRateCounter getTweetsWithStrippedSupplementaryCharsCounter() {
return tweetsWithStrippedSupplementaryCharsCounter;
}
SearchRateCounter getStrippedSupplementaryCharsCounter() {
return strippedSupplementaryCharsCounter;
}
SearchRateCounter getNonStrippedEmojiCharsCounter() {
return nonStrippedEmojiCharsCounter;
}
SearchRateCounter getEmojisAtTruncateBoundaryCounter() {
return emojisAtTruncateBoundaryCounter;
}
}
static {
for (Field field : Field.values()) {
COUNTERS_MAP.put(field, new Counters(field));
}
}
// Note: the monorail enforces a limit of 15 characters for screen names,
// but some users with up to 20 character names were grandfathered-in. To allow
// those users to be searchable, support up to 20 chars.
private static final int MAX_SCREEN_NAME_LEN = 20;
// Note: we expect the current limit to be 10K. Also, all supplementary unicode characters (with
// the exception of emojis, maybe) will be removed and not counted as total length. Added alert
// for text truncation rate as well. SEARCH-9512
private static final int MAX_TWEET_TEXT_LEN = 10000;
@VisibleForTesting
static final SearchRateCounter FILTERED_NO_STATUS_ID =
SearchRateCounter.export(STAT_PREFIX + "filtered_no_status_id");
@VisibleForTesting
static final SearchRateCounter FILTERED_NO_FROM_USER =
SearchRateCounter.export(STAT_PREFIX + "filtered_no_from_user");
@VisibleForTesting
static final SearchRateCounter FILTERED_LONG_SCREEN_NAME =
SearchRateCounter.export(STAT_PREFIX + "filtered_long_screen_name");
@VisibleForTesting
static final SearchRateCounter FILTERED_NO_TEXT =
SearchRateCounter.export(STAT_PREFIX + "filtered_no_text");
@VisibleForTesting
static final SearchRateCounter FILTERED_NO_DATE =
SearchRateCounter.export(STAT_PREFIX + "filtered_no_date");
@VisibleForTesting
static final SearchRateCounter NULLCAST_TWEET =
SearchRateCounter.export(STAT_PREFIX + "filter_nullcast_tweet");
@VisibleForTesting
static final SearchRateCounter NULLCAST_TWEET_ACCEPTED =
SearchRateCounter.export(STAT_PREFIX + "nullcast_tweet_accepted");
@VisibleForTesting
static final SearchRateCounter INCONSISTENT_TWEET_ID_AND_CREATED_AT =
SearchRateCounter.export(STAT_PREFIX + "inconsistent_tweet_id_and_created_at_ms");
/** Strips tags and supplementary characters from the given source; messageId is only logged. */
private static String stripSource(String source, Long messageId) {
if (source == null) {
return null;
}
// Always strip emojis from sources: they don't really make sense in this field.
String strippedSource = stripSupplementaryChars(
tagRemovalTransformer.transform(source).toString(), Field.SOURCE, true);
if (strippedSource.length() > MAX_SOURCE_LEN) {
LOG.warn("Message "
+ messageId
+ " contains stripped source that exceeds MAX_SOURCE_LEN. Removing: "
+ strippedSource);
COUNTERS_MAP.get(Field.SOURCE).getTruncatedCounter().increment();
return null;
}
return strippedSource;
}
/**
* Strips supplementary characters from the given location and truncates it.
*/
private static String stripAndTruncateLocation(String location) {
// Always strip emojis from locations: they don't really make sense in this field.
String strippedLocation = stripSupplementaryChars(location, Field.NORMALIZED_LOCATION, true);
return truncateString(strippedLocation, MAX_LOCATION_LEN, Field.NORMALIZED_LOCATION, true);
}
/**
* Sets the origSource and strippedSource fields on a TwitterMessage.
*/
public static void setSourceOnMessage(TwitterMessage message, String modifiedDeviceSource) {
// Always strip emojis from sources: they don't really make sense in this field.
message.setOrigSource(stripSupplementaryChars(modifiedDeviceSource, Field.ORIG_SOURCE, true));
message.setStrippedSource(stripSource(modifiedDeviceSource, message.getId()));
}
/**
* Sets the origLocation to the stripped location, and sets
* the truncatedNormalizedLocation to the truncated and normalized location.
*/
public static void setAndTruncateLocationOnMessage(
TwitterMessage message,
String newOrigLocation) {
// Always strip emojis from locations: they don't really make sense in this field.
message.setOrigLocation(stripSupplementaryChars(newOrigLocation, Field.ORIG_LOCATION, true));
// Locations in the new locations table require additional normalization. It can also change
// the length of the string, so we must do this before truncation.
if (newOrigLocation != null) {
String normalized =
Normalizer.normalize(newOrigLocation, Normalizer.Form.NFKC).toLowerCase().trim();
message.setTruncatedNormalizedLocation(stripAndTruncateLocation(normalized));
} else {
message.setTruncatedNormalizedLocation(null);
}
}
/**
* Validates the given TwitterMessage.
*
* @param message The message to validate.
* @param stripEmojisForFields The set of fields for which emojis should be stripped.
* @param acceptNullcastMessage Determines if this message should be accepted, if it's a nullcast
* message.
* @return {@code true} if the given message is valid; {@code false} otherwise.
*/
public static boolean validateTwitterMessage(
TwitterMessage message,
Set<Field> stripEmojisForFields,
boolean acceptNullcastMessage) {
if (message.getNullcast()) {
NULLCAST_TWEET.increment();
if (!acceptNullcastMessage) {
LOG.info("Dropping nullcasted message " + message.getId());
return false;
}
NULLCAST_TWEET_ACCEPTED.increment();
}
if (!message.getFromUserScreenName().isPresent()
|| StringUtils.isBlank(message.getFromUserScreenName().get())) {
LOG.error("Message " + message.getId() + " contains no from user. Skipping.");
FILTERED_NO_FROM_USER.increment();
return false;
}
String fromUserScreenName = message.getFromUserScreenName().get();
if (fromUserScreenName.length() > MAX_SCREEN_NAME_LEN) {
LOG.warn("Message " + message.getId() + " has a user screen name longer than "
+ MAX_SCREEN_NAME_LEN + " characters: " + message.getFromUserScreenName()
+ ". Skipping.");
FILTERED_LONG_SCREEN_NAME.increment();
return false;
}
// Remove supplementary characters and truncate these text fields.
if (message.getFromUserDisplayName().isPresent()) {
message.setFromUserDisplayName(stripSupplementaryChars(
message.getFromUserDisplayName().get(),
Field.FROM_USER_DISPLAY_NAME,
stripEmojisForFields.contains(Field.FROM_USER_DISPLAY_NAME)));
}
if (message.getToUserScreenName().isPresent()) {
String strippedToUserScreenName = stripSupplementaryChars(
message.getToUserLowercasedScreenName().get(),
Field.TO_USER_SCREEN_NAME,
stripEmojisForFields.contains(Field.TO_USER_SCREEN_NAME));
message.setToUserScreenName(
truncateString(
strippedToUserScreenName,
MAX_SCREEN_NAME_LEN,
Field.TO_USER_SCREEN_NAME,
stripEmojisForFields.contains(Field.TO_USER_SCREEN_NAME)));
}
String strippedText = stripSupplementaryChars(
message.getText(),
Field.TEXT,
stripEmojisForFields.contains(Field.TEXT));
message.setText(truncateString(
strippedText,
MAX_TWEET_TEXT_LEN,
Field.TEXT,
stripEmojisForFields.contains(Field.TEXT)));
if (StringUtils.isBlank(message.getText())) {
FILTERED_NO_TEXT.increment();
return false;
}
if (message.getDate() == null) {
LOG.error("Message " + message.getId() + " contains no date. Skipping.");
FILTERED_NO_DATE.increment();
return false;
}
if (message.isRetweet()) {
return validateRetweetMessage(message.getRetweetMessage(), stripEmojisForFields);
}
// Track if both the snowflake ID and created at timestamp are consistent.
if (!SnowflakeIdParser.isTweetIDAndCreatedAtConsistent(message.getId(), message.getDate())) {
LOG.error("Found inconsistent tweet ID and created at timestamp: [messageID="
+ message.getId() + "], [messageDate=" + message.getDate() + "].");
INCONSISTENT_TWEET_ID_AND_CREATED_AT.increment();
}
return true;
}
private static boolean validateRetweetMessage(
TwitterRetweetMessage message, Set<Field> stripEmojisForFields) {
if (message.getSharedId() == null || message.getRetweetId() == null) {
LOG.error("Retweet Message contains a null twitter id. Skipping.");
FILTERED_NO_STATUS_ID.increment();
return false;
}
if (message.getSharedDate() == null) {
LOG.error("Retweet Message " + message.getRetweetId() + " contains no date. Skipping.");
return false;
}
// Remove supplementary characters from these text fields.
message.setSharedUserDisplayName(stripSupplementaryChars(
message.getSharedUserDisplayName(),
Field.SHARED_USER_DISPLAY_NAME,
stripEmojisForFields.contains(Field.SHARED_USER_DISPLAY_NAME)));
return true;
}
/**
* Strips non-indexable chars from the text.
*
* Returns the resulting string, which may be the same object as the text argument when
* no stripping or truncation is necessary.
*
* Non-indexed characters are "supplementary unicode" characters that are not emojis. Note that
* supplementary unicode characters can still seem worth indexing, as many characters
* in CJK languages are supplementary. However, indexing them would make the size of our index
* explode (~186k supplementary characters exist), so it's not feasible.
*
* @param text The text to strip
* @param field The field this text is from
* @param stripSupplementaryEmojis Whether or not to strip supplementary emojis. Note that this
* parameter name isn't 100% accurate. This parameter is meant to replicate behavior prior to
* adding support for *not* stripping supplementary emojis. The prior behavior would turn an
* emoji such as a keycap "1\uFE0F\u20E3" (http://www.iemoji.com/view/emoji/295/symbols/keycap-1)
* into just '1'. So the keycap emoji is not completely stripped, only the portion after the '1'.
*
*/
@VisibleForTesting
public static String stripSupplementaryChars(
String text,
Field field,
boolean stripSupplementaryEmojis) {
if (text == null || text.isEmpty()) {
return text;
}
// Initialize an empty map so that if we choose to strip emojis,
// no emoji positions will be found and we don't need a null
// check before checking if an emoji is at a certain spot.
NavigableMap<Integer, Integer> emojiPositions = new TreeMap<>();
if (!stripSupplementaryEmojis) {
emojiPositions = EmojiExtractor.getEmojiPositions(text);
}
StringBuilder strippedTextBuilder = new StringBuilder();
int sequenceStart = 0;
int i = 0;
while (i < text.length()) {
if (Character.isSupplementaryCodePoint(text.codePointAt(i))) {
// Check if this supplementary character is an emoji
if (!emojiPositions.containsKey(i)) {
// It's not an emoji, or we want to strip emojis, so strip it
// text[i] and text[i + 1] are part of a supplementary code point.
strippedTextBuilder.append(text.substring(sequenceStart, i));
sequenceStart = i + 2; // skip 2 chars
i = sequenceStart;
COUNTERS_MAP.get(field).getStrippedSupplementaryCharsCounter().increment();
} else {
// It's an emoji, keep it
i += emojiPositions.get(i);
COUNTERS_MAP.get(field).getNonStrippedEmojiCharsCounter().increment();
}
} else {
++i;
}
}
if (sequenceStart < text.length()) {
strippedTextBuilder.append(text.substring(sequenceStart));
}
String strippedText = strippedTextBuilder.toString();
if (strippedText.length() < text.length()) {
COUNTERS_MAP.get(field).getTweetsWithStrippedSupplementaryCharsCounter().increment();
}
return strippedText;
}
/**
* Truncates the given string to the given length.
*
* Note that we are truncating based on the # of UTF-16 characters a given emoji takes up.
* So if a single emoji takes up 4 UTF-16 characters, that counts as 4 for the truncation,
* not just 1.
*
* @param text The text to truncate
* @param maxLength The maximum length of the string after truncation
* @param field The field from which this string comes
* @param splitEmojisAtMaxLength If true, don't worry about emojis and just truncate at maxLength,
* potentially splitting them. If false, truncate before the emoji if truncating at maxLength
* would cause the emoji to be split.
*/
@VisibleForTesting
static String truncateString(
String text,
int maxLength,
Field field,
boolean splitEmojisAtMaxLength) {
Preconditions.checkArgument(maxLength > 0);
if ((text == null) || (text.length() <= maxLength)) {
return text;
}
int truncatePoint = maxLength;
NavigableMap<Integer, Integer> emojiPositions;
// If we want to preserve emojis, we should not truncate in the middle of one.
if (!splitEmojisAtMaxLength) {
emojiPositions = EmojiExtractor.getEmojiPositions(text);
// Get the last emoji before maxlength.
Map.Entry<Integer, Integer> lastEmojiBeforeMaxLengthEntry =
emojiPositions.lowerEntry(maxLength);
if (lastEmojiBeforeMaxLengthEntry != null) {
int lowerEmojiEnd = lastEmojiBeforeMaxLengthEntry.getKey()
+ lastEmojiBeforeMaxLengthEntry.getValue();
// If the last emoji would be truncated, truncate before the last emoji.
if (lowerEmojiEnd > truncatePoint) {
truncatePoint = lastEmojiBeforeMaxLengthEntry.getKey();
COUNTERS_MAP.get(field).getEmojisAtTruncateBoundaryCounter().increment();
}
}
}
COUNTERS_MAP.get(field).getTruncatedCounter().increment();
return text.substring(0, truncatePoint);
}
}
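
A self-contained sketch of the emoji-boundary rule in `truncateString`, with a hard-coded emoji position map standing in for the internal `EmojiExtractor`:

```java
import java.util.Map;
import java.util.NavigableMap;
import java.util.TreeMap;

public class EmojiTruncateExample {
  static String truncate(String text, int maxLength, NavigableMap<Integer, Integer> emojiPositions) {
    if (text.length() <= maxLength) {
      return text;
    }
    int truncatePoint = maxLength;
    // Find the last emoji starting before maxLength; cut before it if it would be split.
    Map.Entry<Integer, Integer> last = emojiPositions.lowerEntry(maxLength);
    if (last != null && last.getKey() + last.getValue() > truncatePoint) {
      truncatePoint = last.getKey();
    }
    return text.substring(0, truncatePoint);
  }

  public static void main(String[] args) {
    String text = "hi \uD83D\uDC4D there"; // thumbs-up emoji: 2 UTF-16 chars at index 3
    NavigableMap<Integer, Integer> emojis = new TreeMap<>();
    emojis.put(3, 2); // start index -> length in UTF-16 chars
    // maxLength 4 would split the surrogate pair, so we cut at index 3 instead.
    System.out.println(truncate(text, 4, emojis)); // prints "hi "
  }
}
```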

View File

@ -1,41 +0,0 @@
package com.twitter.search.common.relevance.entities;
import org.apache.commons.lang3.builder.EqualsBuilder;
import org.apache.commons.lang3.builder.HashCodeBuilder;
import org.apache.commons.lang3.builder.ToStringBuilder;
/**
* The object for quoted message
*/
public class TwitterQuotedMessage {
private final long quotedStatusId;
private final long quotedUserId;
public TwitterQuotedMessage(long quotedStatusId, long quotedUserId) {
this.quotedStatusId = quotedStatusId;
this.quotedUserId = quotedUserId;
}
public long getQuotedStatusId() {
return quotedStatusId;
}
public long getQuotedUserId() {
return quotedUserId;
}
@Override
public boolean equals(Object o) {
return EqualsBuilder.reflectionEquals(this, o);
}
@Override
public int hashCode() {
return HashCodeBuilder.reflectionHashCode(this);
}
@Override
public String toString() {
return ToStringBuilder.reflectionToString(this);
}
}

View File

@ -1,80 +0,0 @@
package com.twitter.search.common.relevance.entities;
import java.util.Date;
import org.apache.commons.lang3.builder.EqualsBuilder;
import org.apache.commons.lang3.builder.HashCodeBuilder;
import org.apache.commons.lang3.builder.ToStringBuilder;
public class TwitterRetweetMessage {
// based on original tweet
private Long sharedId;
// TwitterMessageUtil checks them
private String sharedUserDisplayName;
private Long sharedUserTwitterId = TwitterMessage.LONG_FIELD_NOT_PRESENT;
private Date sharedDate = null;
// based on retweet
private Long retweetId;
public Long getRetweetId() {
return retweetId;
}
public void setRetweetId(Long retweetId) {
this.retweetId = retweetId;
}
public Long getSharedId() {
return sharedId;
}
public void setSharedId(Long sharedId) {
this.sharedId = sharedId;
}
public String getSharedUserDisplayName() {
return sharedUserDisplayName;
}
public void setSharedUserDisplayName(String sharedUserDisplayName) {
this.sharedUserDisplayName = sharedUserDisplayName;
}
public Long getSharedUserTwitterId() {
return sharedUserTwitterId;
}
public boolean hasSharedUserTwitterId() {
return sharedUserTwitterId != TwitterMessage.LONG_FIELD_NOT_PRESENT;
}
public void setSharedUserTwitterId(Long sharedUserTwitterId) {
this.sharedUserTwitterId = sharedUserTwitterId;
}
public Date getSharedDate() {
return sharedDate;
}
public void setSharedDate(Date sharedDate) {
this.sharedDate = sharedDate;
}
@Override
public boolean equals(Object o) {
return EqualsBuilder.reflectionEquals(this, o);
}
@Override
public int hashCode() {
return HashCodeBuilder.reflectionHashCode(this);
}
@Override
public String toString() {
return ToStringBuilder.reflectionToString(this);
}
}

View File

@ -1,88 +0,0 @@
package com.twitter.search.common.relevance.features;
import java.util.concurrent.TimeUnit;
import com.google.common.base.Preconditions;
/**
* Utility to compute an age decay multiplier based on a sigmoid function.
*/
public class AgeDecay {
public static final double SLOPE_COEFF = 4.0;
public static final double LN_HALF = Math.log(0.5);
public final double halflife;
public final double maxBoost;
public final double base;
public final double slope;
/** Creates a new AgeDecay instance. */
public AgeDecay(double base, double maxBoost, double halflife, double slope) {
this.maxBoost = maxBoost;
this.base = base;
this.halflife = halflife;
this.slope = slope;
}
/** Creates a new AgeDecay instance. */
public AgeDecay(double base, double halflife, double slope) {
this(base, 1.0, halflife, slope);
}
/**
* Compute the age decay, using the provided halflife.
*
* @param tweetAge The tweet age.
* @param unit The unit of the tweetAge parameter.
*/
public double getAgeDecayMultiplier(long tweetAge, TimeUnit unit) {
return getAgeDecayMultiplier(TimeUnit.SECONDS.convert(tweetAge, unit));
}
/**
* Compute the age decay, assuming the halflife in the constructor is in minutes.
* @param ageInSeconds the age in seconds
*/
public double getAgeDecayMultiplier(long ageInSeconds) {
long minutesSinceTweet = TimeUnit.MINUTES.convert(ageInSeconds, TimeUnit.SECONDS);
return compute(minutesSinceTweet);
}
/**
* Computes the age decay given an age. The age has to be in the same unit as the halflife
* with which this object was constructed.
*/
public double compute(double age) {
return compute(base, maxBoost, halflife, slope, age);
}
/**
* Compute the age decay given all parameters. Use this if you don't need to reuse an AgeDecay
* object.
*/
public static double compute(
double base, double maxBoost, double halflife, double slope, double age) {
return base + ((maxBoost - base) / (1 + Math.exp(slope * (age - halflife))));
}
public static double compute(
double base, double maxBoost, double halflife, double age) {
Preconditions.checkArgument(halflife != 0);
return compute(base, maxBoost, halflife, SLOPE_COEFF / halflife, age);
}
/**
* Another nicer exponential decay function. Returns a value in (0, 1]
*/
public static double computeExponential(double halflife, double exp, double age) {
return Math.exp(LN_HALF * Math.pow(age, exp) / Math.pow(halflife, exp));
}
/**
* Exponential decay with remapping of the value from (0,1] to (min,max]
*/
public static double computeExponential(double halflife, double exp, double age,
double minBoost, double maxBoost) {
double decay = computeExponential(halflife, exp, age); // in (0, 1]
return (maxBoost - minBoost) * decay + minBoost;
}
}
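
A worked check of the sigmoid above: at `age == halflife` the exponent is zero, `exp(0) == 1`, and the multiplier lands exactly halfway between `base` and `maxBoost`. A minimal, self-contained sketch with the formula inlined from `AgeDecay.compute`:

```java
public class AgeDecayExample {
  public static void main(String[] args) {
    double base = 0.1;
    double maxBoost = 1.0;
    double halflife = 360.0;       // minutes (illustrative value)
    double slope = 4.0 / halflife; // AgeDecay.SLOPE_COEFF / halflife

    // At age == halflife: 0.1 + (1.0 - 0.1) / (1 + exp(0)) == 0.55.
    System.out.println(decay(base, maxBoost, halflife, slope, halflife));      // 0.55
    // A fresh tweet is close to maxBoost; a very old one decays toward base.
    System.out.println(decay(base, maxBoost, halflife, slope, 0.0));           // ~0.98
    System.out.println(decay(base, maxBoost, halflife, slope, 10 * halflife)); // ~0.10
  }

  // Same formula as AgeDecay.compute(base, maxBoost, halflife, slope, age).
  static double decay(double base, double maxBoost, double halflife, double slope, double age) {
    return base + ((maxBoost - base) / (1 + Math.exp(slope * (age - halflife))));
  }
}
```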

View File

@ -1,24 +0,0 @@
# Java library for tweet features and utilities.
java_library(
sources = ["*.java"],
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
"3rdparty/jvm/com/google/guava",
"3rdparty/jvm/com/google/inject:guice",
"3rdparty/jvm/com/twitter/elephantbird:core",
"3rdparty/jvm/org/apache/hadoop:hadoop-client-default",
"3rdparty/jvm/org/apache/thrift:libthrift",
"src/java/com/twitter/common/base",
"src/java/com/twitter/common/text/token",
"src/java/com/twitter/search/common/encoding/features",
"src/java/com/twitter/search/common/features",
"src/java/com/twitter/search/common/metrics",
"src/java/com/twitter/search/common/schema/base",
"src/java/com/twitter/search/common/schema/earlybird",
"src/java/com/twitter/search/common/util/lang",
"src/thrift/com/twitter/search/common:constants-java",
"src/thrift/com/twitter/search/common:features-java",
"src/thrift/com/twitter/search/common:schema-java",
],
)

View File

@ -1,232 +0,0 @@
package com.twitter.search.common.relevance.features;
import java.io.IOException;
import java.util.Map;
import java.util.function.Function;
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.NumericDocValues;
import com.twitter.search.common.features.thrift.ThriftSearchResultFeatures;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.schema.base.FeatureConfiguration;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
import com.twitter.search.common.schema.thriftjava.ThriftCSFType;
import com.twitter.search.common.schema.thriftjava.ThriftFeatureNormalizationType;
public class EarlybirdDocumentFeatures {
private static final Map<Integer, SearchCounter> FEATURE_CONFIG_IS_NULL_MAP = Maps.newHashMap();
private static final Map<Integer, SearchCounter> FEATURE_OUTPUT_TYPE_IS_NULL_MAP =
Maps.newHashMap();
private static final Map<Integer, SearchCounter> NO_SCHEMA_FIELD_FOR_FEATURE_MAP =
Maps.newHashMap();
private static final String FEATURE_CONFIG_IS_NULL_COUNTER_PATTERN =
"null_feature_config_for_feature_id_%d";
private static final String FEATURE_OUTPUT_TYPE_IS_NULL_COUNTER_PATTERN =
"null_output_type_for_feature_id_%d";
private static final String NO_SCHEMA_FIELD_FOR_FEATURE_COUNTER_PATTERN =
"no_schema_field_for_feature_id_%d";
private static final SearchCounter UNKNOWN_FEATURE_OUTPUT_TYPE_COUNTER =
SearchCounter.export("unknown_feature_output_type");
private final Map<String, NumericDocValues> numericDocValues = Maps.newHashMap();
private final LeafReader leafReader;
private int docId = -1;
/**
* Creates a new EarlybirdDocumentFeatures instance that will return feature values based on the
* NumericDocValues stored in the given LeafReader for the given document.
*/
public EarlybirdDocumentFeatures(LeafReader leafReader) {
this.leafReader = Preconditions.checkNotNull(leafReader);
}
/**
* Advances this instance to the given doc ID. The new doc ID must be greater than or equal to the
* current doc ID stored in this instance.
*/
public void advance(int target) {
Preconditions.checkArgument(
target >= 0,
"Target (%s) cannot be negative.",
target);
Preconditions.checkArgument(
target >= docId,
"Target (%s) smaller than current doc ID (%s).",
target,
docId);
Preconditions.checkArgument(
target < leafReader.maxDoc(),
"Target (%s) cannot be greater than or equal to the max doc ID (%s).",
target,
leafReader.maxDoc());
docId = target;
}
/**
* Returns the feature value for the given field.
*/
public long getFeatureValue(EarlybirdFieldConstant field) throws IOException {
// The index might not have a NumericDocValues instance for this feature.
// This might happen if we dynamically update the feature schema, for example.
//
// Cache the NumericDocValues instances for all accessed features, even if they're null.
String fieldName = field.getFieldName();
NumericDocValues docValues;
if (numericDocValues.containsKey(fieldName)) {
docValues = numericDocValues.get(fieldName);
} else {
docValues = leafReader.getNumericDocValues(fieldName);
numericDocValues.put(fieldName, docValues);
}
return docValues != null && docValues.advanceExact(docId) ? docValues.longValue() : 0L;
}
/**
* Determines if the given flag is set.
*/
public boolean isFlagSet(EarlybirdFieldConstant field) throws IOException {
return getFeatureValue(field) != 0;
}
/**
* Returns the unnormalized value for the given field.
*/
public double getUnnormalizedFeatureValue(EarlybirdFieldConstant field) throws IOException {
long featureValue = getFeatureValue(field);
ThriftFeatureNormalizationType normalizationType = field.getFeatureNormalizationType();
if (normalizationType == null) {
normalizationType = ThriftFeatureNormalizationType.NONE;
}
switch (normalizationType) {
case NONE:
return featureValue;
case LEGACY_BYTE_NORMALIZER:
return MutableFeatureNormalizers.BYTE_NORMALIZER.unnormLowerBound((byte) featureValue);
case LEGACY_BYTE_NORMALIZER_WITH_LOG2:
return MutableFeatureNormalizers.BYTE_NORMALIZER.unnormAndLog2((byte) featureValue);
case SMART_INTEGER_NORMALIZER:
return MutableFeatureNormalizers.SMART_INTEGER_NORMALIZER.unnormUpperBound(
(byte) featureValue);
case PREDICTION_SCORE_NORMALIZER:
return IntNormalizers.PREDICTION_SCORE_NORMALIZER.denormalize((int) featureValue);
default:
throw new IllegalArgumentException(
"Unsupported normalization type " + normalizationType + " for feature "
+ field.getFieldName());
}
}
/**
* Creates a ThriftSearchResultFeatures instance populated with values for all available features
* that have a non-zero value set.
*/
public ThriftSearchResultFeatures getSearchResultFeatures(ImmutableSchemaInterface schema)
throws IOException {
return getSearchResultFeatures(schema, (featureId) -> true);
}
/**
* Creates a ThriftSearchResultFeatures instance populated with values for all available features
* that have a non-zero value set.
*
* @param schema The schema.
* @param shouldCollectFeatureId A predicate that determines which features should be collected.
*/
public ThriftSearchResultFeatures getSearchResultFeatures(
ImmutableSchemaInterface schema,
Function<Integer, Boolean> shouldCollectFeatureId) throws IOException {
Map<Integer, Boolean> boolValues = Maps.newHashMap();
Map<Integer, Double> doubleValues = Maps.newHashMap();
Map<Integer, Integer> intValues = Maps.newHashMap();
Map<Integer, Long> longValues = Maps.newHashMap();
Map<Integer, FeatureConfiguration> idToFeatureConfigMap = schema.getFeatureIdToFeatureConfig();
for (int featureId : schema.getSearchFeatureSchema().getEntries().keySet()) {
if (!shouldCollectFeatureId.apply(featureId)) {
continue;
}
FeatureConfiguration featureConfig = idToFeatureConfigMap.get(featureId);
if (featureConfig == null) {
FEATURE_CONFIG_IS_NULL_MAP.computeIfAbsent(
featureId,
(fId) -> SearchCounter.export(
String.format(FEATURE_CONFIG_IS_NULL_COUNTER_PATTERN, fId))).increment();
continue;
}
ThriftCSFType outputType = featureConfig.getOutputType();
if (outputType == null) {
FEATURE_OUTPUT_TYPE_IS_NULL_MAP.computeIfAbsent(
featureId,
(fId) -> SearchCounter.export(
String.format(FEATURE_OUTPUT_TYPE_IS_NULL_COUNTER_PATTERN, fId))).increment();
continue;
}
if (!EarlybirdFieldConstants.hasFieldConstant(featureId)) {
// Should only happen for features that were dynamically added to the schema.
NO_SCHEMA_FIELD_FOR_FEATURE_MAP.computeIfAbsent(
featureId,
(fId) -> SearchCounter.export(
String.format(NO_SCHEMA_FIELD_FOR_FEATURE_COUNTER_PATTERN, fId))).increment();
continue;
}
EarlybirdFieldConstant field = EarlybirdFieldConstants.getFieldConstant(featureId);
switch (outputType) {
case BOOLEAN:
if (isFlagSet(field)) {
boolValues.put(featureId, true);
}
break;
case BYTE:
// It's unclear why we don't add this feature to a separate byteValues map...
byte byteFeatureValue = (byte) getFeatureValue(field);
if (byteFeatureValue != 0) {
intValues.put(featureId, (int) byteFeatureValue);
}
break;
case INT:
int intFeatureValue = (int) getFeatureValue(field);
if (intFeatureValue != 0) {
intValues.put(featureId, intFeatureValue);
}
break;
case LONG:
long longFeatureValue = getFeatureValue(field);
if (longFeatureValue != 0) {
longValues.put(featureId, longFeatureValue);
}
break;
case FLOAT:
// It's unclear why we don't add this feature to a separate floatValues map...
float floatFeatureValue = (float) getFeatureValue(field);
if (floatFeatureValue != 0) {
doubleValues.put(featureId, (double) floatFeatureValue);
}
break;
case DOUBLE:
double doubleFeatureValue = getUnnormalizedFeatureValue(field);
if (doubleFeatureValue != 0) {
doubleValues.put(featureId, doubleFeatureValue);
}
break;
default:
UNKNOWN_FEATURE_OUTPUT_TYPE_COUNTER.increment();
}
}
return new ThriftSearchResultFeatures()
.setBoolValues(boolValues)
.setIntValues(intValues)
.setLongValues(longValues)
.setDoubleValues(doubleValues);
}
}
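
A usage sketch, assuming `leafReader` comes from an Earlybird index segment whose NumericDocValues carry the feature columns:

```java
import java.io.IOException;
import org.apache.lucene.index.LeafReader;
import com.twitter.search.common.relevance.features.EarlybirdDocumentFeatures;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;

public class DocumentFeaturesExample {
  /** Prints engagement counters for one document; doc IDs must be visited in increasing order. */
  static void printEngagement(LeafReader leafReader, int docId) throws IOException {
    EarlybirdDocumentFeatures features = new EarlybirdDocumentFeatures(leafReader);
    features.advance(docId);
    long retweets = features.getFeatureValue(EarlybirdFieldConstant.RETWEET_COUNT);
    long replies = features.getFeatureValue(EarlybirdFieldConstant.REPLY_COUNT);
    System.out.println("retweets=" + retweets + " replies=" + replies);
  }
}
```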

View File

@ -1,75 +0,0 @@
package com.twitter.search.common.relevance.features;
import java.util.Map;
import com.google.common.collect.Maps;
import com.twitter.search.common.encoding.features.IntegerEncodedFeatures;
import com.twitter.search.common.schema.base.FeatureConfiguration;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.schema.earlybird.EarlybirdEncodedFeatures;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
/**
* FeatureSink is used to write features based on feature configuration or feature name. After
* all features are written, the class can return the base fields' integer array values.
*
* This class is not thread-safe.
*/
public class FeatureSink {
private ImmutableSchemaInterface schema;
private final Map<String, IntegerEncodedFeatures> encodedFeatureMap;
/** Creates a new FeatureSink instance. */
public FeatureSink(ImmutableSchemaInterface schema) {
this.schema = schema;
this.encodedFeatureMap = Maps.newHashMap();
}
private IntegerEncodedFeatures getFeatures(String baseFieldName) {
IntegerEncodedFeatures features = encodedFeatureMap.get(baseFieldName);
if (features == null) {
features = EarlybirdEncodedFeatures.newEncodedTweetFeatures(schema, baseFieldName);
encodedFeatureMap.put(baseFieldName, features);
}
return features;
}
/** Sets the given numeric value for the field. */
public FeatureSink setNumericValue(EarlybirdFieldConstant field, int value) {
return setNumericValue(field.getFieldName(), value);
}
/** Sets the given numeric value for the feature with the given name. */
public FeatureSink setNumericValue(String featureName, int value) {
final FeatureConfiguration featureConfig = schema.getFeatureConfigurationByName(featureName);
if (featureConfig != null) {
getFeatures(featureConfig.getBaseField()).setFeatureValue(featureConfig, value);
}
return this;
}
/** Sets the given boolean value for the given field. */
public FeatureSink setBooleanValue(EarlybirdFieldConstant field, boolean value) {
return setBooleanValue(field.getFieldName(), value);
}
/** Sets the given boolean value for the feature with the given name. */
public FeatureSink setBooleanValue(String featureName, boolean value) {
final FeatureConfiguration featureConfig = schema.getFeatureConfigurationByName(featureName);
if (featureConfig != null) {
getFeatures(featureConfig.getBaseField()).setFlagValue(featureConfig, value);
}
return this;
}
/** Returns the features for the given base field. */
public IntegerEncodedFeatures getFeaturesForBaseField(EarlybirdFieldConstant baseField) {
return getFeaturesForBaseField(baseField.getFieldName());
}
/** Returns the features for the given base field. */
public IntegerEncodedFeatures getFeaturesForBaseField(String baseFieldName) {
return encodedFeatureMap.get(baseFieldName);
}
}
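
A sketch of the write path, assuming a populated `ImmutableSchemaInterface` is at hand; `IS_RETWEET_FLAG` and the base field name `"encoded_tweet_features"` are assumptions for illustration:

```java
import com.twitter.search.common.encoding.features.IntegerEncodedFeatures;
import com.twitter.search.common.relevance.features.FeatureSink;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;

public class FeatureSinkExample {
  static void writeFeatures(ImmutableSchemaInterface schema) {
    FeatureSink sink = new FeatureSink(schema);
    // Unknown feature names are silently ignored, so chained writes never throw.
    sink.setNumericValue(EarlybirdFieldConstant.RETWEET_COUNT, 42)
        .setBooleanValue(EarlybirdFieldConstant.IS_RETWEET_FLAG, true); // assumed flag constant
    // "encoded_tweet_features" is a hypothetical base field name.
    IntegerEncodedFeatures packed = sink.getFeaturesForBaseField("encoded_tweet_features");
    System.out.println(packed);
  }
}
```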

View File

@ -1,39 +0,0 @@
package com.twitter.search.common.relevance.features;
import java.util.concurrent.TimeUnit;
import com.twitter.search.common.encoding.features.ByteNormalizer;
import com.twitter.search.common.encoding.features.IntNormalizer;
import com.twitter.search.common.encoding.features.PredictionScoreNormalizer;
/**
* Int value normalizers used to push feature values into earlybird db. For the
* 8-bit feature types, this class wraps the
* com.twitter.search.common.relevance.features.MutableFeatureNormalizers
*/
public final class IntNormalizers {
private IntNormalizers() {
}
public static final IntNormalizer LEGACY_NORMALIZER =
val -> ByteNormalizer.unsignedByteToInt(
MutableFeatureNormalizers.BYTE_NORMALIZER.normalize(val));
public static final IntNormalizer SMART_INTEGER_NORMALIZER =
val -> ByteNormalizer.unsignedByteToInt(
MutableFeatureNormalizers.SMART_INTEGER_NORMALIZER.normalize(val));
// The PARUS_SCORE feature is deprecated and is never set in our indexes. However, some models do
// not work properly with "missing" features, so for now we still need this normalizer to set the
// PARUS_SCORE feature to 0.
public static final IntNormalizer PARUS_SCORE_NORMALIZER = val -> 0;
public static final IntNormalizer BOOLEAN_NORMALIZER =
val -> val == 0 ? 0 : 1;
public static final IntNormalizer TIMESTAMP_SEC_TO_HR_NORMALIZER =
val -> (int) TimeUnit.SECONDS.toHours((long) val);
public static final PredictionScoreNormalizer PREDICTION_SCORE_NORMALIZER =
new PredictionScoreNormalizer(3);
}
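
A small sketch of the lambda normalizers above, assuming `IntNormalizer` exposes a single-argument `normalize` method, as the lambda definitions imply:

```java
import com.twitter.search.common.relevance.features.IntNormalizers;

public class IntNormalizersExample {
  public static void main(String[] args) {
    // 7200 seconds -> 2 hours.
    System.out.println(IntNormalizers.TIMESTAMP_SEC_TO_HR_NORMALIZER.normalize(7200));
    // Any non-zero value collapses to 1.
    System.out.println(IntNormalizers.BOOLEAN_NORMALIZER.normalize(17));
  }
}
```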

View File

@ -1,23 +0,0 @@
package com.twitter.search.common.relevance.features;
import com.twitter.search.common.encoding.features.ByteNormalizer;
import com.twitter.search.common.encoding.features.SingleBytePositiveFloatNormalizer;
import com.twitter.search.common.encoding.features.SmartIntegerNormalizer;
/**
* Byte value normalizers used to push feature values into earlybird db.
*/
public abstract class MutableFeatureNormalizers {
// The max value we support in SMART_INTEGER_NORMALIZER below, this should be enough for all kinds
// of engagements we see on Twitter, anything larger than this would be represented as the same
// value (255, if using a byte).
private static final int MAX_COUNTER_VALUE_SUPPORTED = 50000000;
// Avoid using this normalizer for processing any new data; always use SmartIntegerNormalizer
// below.
public static final SingleBytePositiveFloatNormalizer BYTE_NORMALIZER =
new SingleBytePositiveFloatNormalizer();
public static final ByteNormalizer SMART_INTEGER_NORMALIZER =
new SmartIntegerNormalizer(MAX_COUNTER_VALUE_SUPPORTED, 8);
}

View File

@ -1,9 +0,0 @@
package com.twitter.search.common.relevance.features;
/**
* An enum to hold different types of query-specific features (these are not indexed in Earlybird)
*/
public enum QueryFeatureType {
SOCIAL_ENGAGEMENTS,
CLICKS
}

View File

@ -1,30 +0,0 @@
package com.twitter.search.common.relevance.features;
/**
* Defines relevance related constants that are used at both ingestion time and
* earlybird scoring time.
*/
public final class RelevanceSignalConstants {
// user reputation
public static final byte UNSET_REPUTATION_SENTINEL = Byte.MIN_VALUE;
public static final byte MAX_REPUTATION = 100;
public static final byte MIN_REPUTATION = 0;
// below overall CDF of ~10%, default value for new users,
// given as a goodwill value in case it is unset
public static final byte GOODWILL_REPUTATION = 17;
// text score
public static final byte UNSET_TEXT_SCORE_SENTINEL = Byte.MIN_VALUE;
// roughly at overall CDF of ~10%, given as a goodwill value in case it is unset
public static final byte GOODWILL_TEXT_SCORE = 19;
private RelevanceSignalConstants() {
}
// check whether the specified user rep value is valid
public static boolean isValidUserReputation(int userRep) {
return userRep != UNSET_REPUTATION_SENTINEL
&& userRep >= MIN_REPUTATION
&& userRep < MAX_REPUTATION;
}
}

View File

@ -1,24 +0,0 @@
package com.twitter.search.common.relevance.features;
import com.google.common.base.Preconditions;
/**
* Scoring utilities
*/
public final class ScoringUtils {
private ScoringUtils() { }
/**
* Normalizes a positive value of arbitrary range to [0.0, 1.0], with a slop.
* @param value the value to normalize.
* @param halfval a reference value that will be normalized to 0.5
* @param exp an exponential parameter (must be in (0, 1]) to control the converging speed:
* the smaller the value, the faster it reaches halfval but the slower it reaches the maximum.
* @return a normalized value
*/
public static float normalize(float value, double halfval, double exp) {
Preconditions.checkArgument(exp > 0.0 && exp <= 1.0);
return (float) (Math.pow(value, exp) / (Math.pow(value, exp) + Math.pow(halfval, exp)));
}
}
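
A quick check of the normalizer above: when `value == halfval` the numerator equals half the denominator, so the result is exactly 0.5 for any valid `exp`. A self-contained sketch with the formula inlined from `ScoringUtils.normalize`:

```java
public class ScoringUtilsExample {
  // Same formula as ScoringUtils.normalize, inlined so this runs standalone.
  static float normalize(float value, double halfval, double exp) {
    return (float) (Math.pow(value, exp) / (Math.pow(value, exp) + Math.pow(halfval, exp)));
  }

  public static void main(String[] args) {
    System.out.println(normalize(100f, 100.0, 0.5));   // 0.5: value == halfval
    System.out.println(normalize(10000f, 100.0, 0.5)); // ~0.909: converging toward 1.0
  }
}
```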

View File

@ -1,79 +0,0 @@
package com.twitter.search.common.relevance.features;
import java.util.Map;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Maps;
import com.twitter.common.base.Function;
/**
* Class to keep the String-to-Double term weights of a term vector.
* It can calculate magnitude, dot product, and cosine similarity.
*/
public class TermVector {
private static final double MIN_MAGNITUDE = 0.00001;
private final double magnitude;
private final ImmutableMap<String, Double> termWeights;
/** Creates a new TermVector instance. */
public TermVector(Map<String, Double> termWeights) {
this.termWeights = ImmutableMap.copyOf(termWeights);
double sum = 0.0;
for (Map.Entry<String, Double> entry : termWeights.entrySet()) {
double value = entry.getValue();
sum += value * value;
}
magnitude = Math.sqrt(sum);
}
public ImmutableMap<String, Double> getTermWeights() {
return termWeights;
}
public double getMagnitude() {
return magnitude;
}
/**
* Normalizes this term vector to unit magnitude.
* @return the unit-normalized TermVector with magnitude equal to 1,
* or null if the magnitude is very small
*/
public TermVector getUnitNormalized() {
if (magnitude < MIN_MAGNITUDE) {
return null;
}
return new TermVector(
Maps.transformValues(termWeights, (Function<Double, Double>) weight -> weight / magnitude));
}
/**
* Calculate the dot product with another term vector
* @param other the other term vector
* @return the dot product of the two vectors
*/
public double getDotProduct(TermVector other) {
double sum = 0.0;
for (Map.Entry<String, Double> entry : termWeights.entrySet()) {
Double value2 = other.termWeights.get(entry.getKey());
if (value2 != null) {
sum += entry.getValue() * value2;
}
}
return sum;
}
/**
* Calculates the cosine similarity with another term vector
* @param other the other term vector
* @return the cosine similarity.
* if either has very small magnitude, it returns 0 (dotProduct close to 0)
*/
public double getCosineSimilarity(TermVector other) {
if (magnitude < MIN_MAGNITUDE || other.magnitude < MIN_MAGNITUDE) {
return 0;
}
return getDotProduct(other) / (magnitude * other.magnitude);
}
}
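
A worked example of the vector math above: with `a = {earlybird: 1, search: 2}` and `b = {search: 2, lucene: 1}`, both magnitudes are `sqrt(5)`, the dot product is 4, and the cosine similarity is `4 / 5 = 0.8`:

```java
import com.google.common.collect.ImmutableMap;
import com.twitter.search.common.relevance.features.TermVector;

public class TermVectorExample {
  public static void main(String[] args) {
    TermVector a = new TermVector(ImmutableMap.of("earlybird", 1.0, "search", 2.0));
    TermVector b = new TermVector(ImmutableMap.of("search", 2.0, "lucene", 1.0));
    System.out.println(a.getDotProduct(b));       // 4.0
    System.out.println(a.getCosineSimilarity(b)); // 0.8
  }
}
```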

View File

@ -1,57 +0,0 @@
package com.twitter.search.common.relevance.features;
import com.twitter.search.common.encoding.features.EncodedFeatures;
/**
* Holds engagement features for a particular tweet and encodes them as a single int.
* The features are: retweet count, favorite count, itweet score, reply count.
*/
public class TweetEngagementFeatures extends EncodedFeatures {
private static final int RETWEET_COUNT_BIT_SHIFT = 0;
private static final long RETWEET_COUNT_INVERSE_BIT_MASK = 0xffffff00L;
private static final int ITWEET_SCORE_BIT_SHIFT = 8;
private static final long ITWEET_SCORE_INVERSE_BIT_MASK = 0xffff00ffL;
private static final int FAV_COUNT_BIT_SHIFT = 16;
private static final long FAV_COUNT_INVERSE_BIT_MASK = 0xff00ffffL;
private static final int REPLY_COUNT_BIT_SHIFT = 24;
private static final long REPLY_COUNT_INVERSE_BIT_MASK = 0x00ffffffL;
public TweetEngagementFeatures setRetweetCount(byte count) {
setByteIfGreater(count, RETWEET_COUNT_BIT_SHIFT, RETWEET_COUNT_INVERSE_BIT_MASK);
return this;
}
public int getRetweetCount() {
return getByte(RETWEET_COUNT_BIT_SHIFT);
}
public TweetEngagementFeatures setITweetScore(byte score) {
setByteIfGreater(score, ITWEET_SCORE_BIT_SHIFT, ITWEET_SCORE_INVERSE_BIT_MASK);
return this;
}
public int getITweetScore() {
return getByte(ITWEET_SCORE_BIT_SHIFT);
}
public TweetEngagementFeatures setFavCount(byte count) {
setByteIfGreater(count, FAV_COUNT_BIT_SHIFT, FAV_COUNT_INVERSE_BIT_MASK);
return this;
}
public int getFavCount() {
return getByte(FAV_COUNT_BIT_SHIFT);
}
public TweetEngagementFeatures setReplyCount(byte count) {
setByteIfGreater(count, REPLY_COUNT_BIT_SHIFT, REPLY_COUNT_INVERSE_BIT_MASK);
return this;
}
public int getReplyCount() {
return getByte(REPLY_COUNT_BIT_SHIFT);
}
}
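
The setters pack four one-byte counters into a single int, one byte per bit-shifted slot. Below is a sketch of the equivalent raw bit arithmetic for the fav-count byte (shift 16); the setByteIfGreater/getByte helpers live in the EncodedFeatures base class, so their exact semantics are assumed here:

    int encoded = 0;
    // Write a fav count of 7 into bits 16..23 (the inverse mask clears that byte first).
    encoded = (int) ((encoded & 0xff00ffffL) | ((7L & 0xffL) << 16));
    // Read it back.
    int favCount = (encoded >>> 16) & 0xff;  // 7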

View File

@ -1,291 +0,0 @@
package com.twitter.search.common.relevance.features;
import java.util.Map;
import java.util.Set;
import javax.annotation.Nullable;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.twitter.search.common.encoding.features.IntNormalizer;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants;
import static com.twitter.search.common.relevance.features.IntNormalizers.BOOLEAN_NORMALIZER;
import static com.twitter.search.common.relevance.features.IntNormalizers.LEGACY_NORMALIZER;
import static com.twitter.search.common.relevance.features.IntNormalizers.PARUS_SCORE_NORMALIZER;
import static com.twitter.search.common.relevance.features.IntNormalizers.SMART_INTEGER_NORMALIZER;
import static com.twitter.search.common.relevance.features.IntNormalizers.TIMESTAMP_SEC_TO_HR_NORMALIZER;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
/**
* An enum representing all dynamic/realtime feature types we can update in the Signal Ingester.
* It provides information about their normalization and their corresponding Earlybird feature
* fields, and provides utilities for both the producer (Signal Ingester) and consumer (Earlybird) sides.
*
*/
public enum TweetFeatureType {
RETWEET (true, 0, LEGACY_NORMALIZER,
EarlybirdFieldConstant.RETWEET_COUNT),
REPLY (true, 1, LEGACY_NORMALIZER,
EarlybirdFieldConstant.REPLY_COUNT),
FAVORITE (true, 4, LEGACY_NORMALIZER,
EarlybirdFieldConstant.FAVORITE_COUNT),
PARUS_SCORE (false, 3, PARUS_SCORE_NORMALIZER,
EarlybirdFieldConstant.PARUS_SCORE),
EMBEDS_IMP_COUNT (true, 10, LEGACY_NORMALIZER,
EarlybirdFieldConstant.EMBEDS_IMPRESSION_COUNT),
EMBEDS_URL_COUNT (true, 11, LEGACY_NORMALIZER,
EarlybirdFieldConstant.EMBEDS_URL_COUNT),
VIDEO_VIEW (false, 12, LEGACY_NORMALIZER,
EarlybirdFieldConstant.VIDEO_VIEW_COUNT),
// v2 engagement counters, they will eventually replace v1 counters above
RETWEET_V2 (true, 13, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.RETWEET_COUNT_V2),
REPLY_V2 (true, 14, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.REPLY_COUNT_V2),
FAVORITE_V2 (true, 15, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.FAVORITE_COUNT_V2),
EMBEDS_IMP_COUNT_V2 (true, 16, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.EMBEDS_IMPRESSION_COUNT_V2),
EMBEDS_URL_COUNT_V2 (true, 17, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.EMBEDS_URL_COUNT_V2),
VIDEO_VIEW_V2 (false, 18, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.VIDEO_VIEW_COUNT_V2),
// other new items
QUOTE (true, 19, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.QUOTE_COUNT),
// weighted engagement counters
WEIGHTED_RETWEET (true, 20, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.WEIGHTED_RETWEET_COUNT),
WEIGHTED_REPLY (true, 21, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.WEIGHTED_REPLY_COUNT),
WEIGHTED_FAVORITE (true, 22, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.WEIGHTED_FAVORITE_COUNT),
WEIGHTED_QUOTE (true, 23, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.WEIGHTED_QUOTE_COUNT),
// tweet-level safety labels
LABEL_ABUSIVE (false, 24, BOOLEAN_NORMALIZER,
EarlybirdFieldConstant.LABEL_ABUSIVE_FLAG),
LABEL_ABUSIVE_HI_RCL (false, 25, BOOLEAN_NORMALIZER,
EarlybirdFieldConstant.LABEL_ABUSIVE_HI_RCL_FLAG),
LABEL_DUP_CONTENT (false, 26, BOOLEAN_NORMALIZER,
EarlybirdFieldConstant.LABEL_DUP_CONTENT_FLAG),
LABEL_NSFW_HI_PRC (false, 27, BOOLEAN_NORMALIZER,
EarlybirdFieldConstant.LABEL_NSFW_HI_PRC_FLAG),
LABEL_NSFW_HI_RCL (false, 28, BOOLEAN_NORMALIZER,
EarlybirdFieldConstant.LABEL_NSFW_HI_RCL_FLAG),
LABEL_SPAM (false, 29, BOOLEAN_NORMALIZER,
EarlybirdFieldConstant.LABEL_SPAM_FLAG),
LABEL_SPAM_HI_RCL (false, 30, BOOLEAN_NORMALIZER,
EarlybirdFieldConstant.LABEL_SPAM_HI_RCL_FLAG),
PERISCOPE_EXISTS (false, 32, BOOLEAN_NORMALIZER,
EarlybirdFieldConstant.PERISCOPE_EXISTS),
PERISCOPE_HAS_BEEN_FEATURED (false, 33, BOOLEAN_NORMALIZER,
EarlybirdFieldConstant.PERISCOPE_HAS_BEEN_FEATURED),
PERISCOPE_IS_CURRENTLY_FEATURED (false, 34, BOOLEAN_NORMALIZER,
EarlybirdFieldConstant.PERISCOPE_IS_CURRENTLY_FEATURED),
PERISCOPE_IS_FROM_QUALITY_SOURCE(false, 35, BOOLEAN_NORMALIZER,
EarlybirdFieldConstant.PERISCOPE_IS_FROM_QUALITY_SOURCE),
PERISCOPE_IS_LIVE (false, 36, BOOLEAN_NORMALIZER,
EarlybirdFieldConstant.PERISCOPE_IS_LIVE),
// decayed engagement counters
DECAYED_RETWEET (true, 37, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.DECAYED_RETWEET_COUNT),
DECAYED_REPLY (true, 38, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.DECAYED_REPLY_COUNT),
DECAYED_FAVORITE (true, 39, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.DECAYED_FAVORITE_COUNT),
DECAYED_QUOTE (true, 40, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.DECAYED_QUOTE_COUNT),
// timestamp of last engagement types
LAST_RETWEET_SINCE_CREATION_HR (false, 41, TIMESTAMP_SEC_TO_HR_NORMALIZER,
EarlybirdFieldConstant.LAST_RETWEET_SINCE_CREATION_HRS),
LAST_REPLY_SINCE_CREATION_HR (false, 42, TIMESTAMP_SEC_TO_HR_NORMALIZER,
EarlybirdFieldConstant.LAST_REPLY_SINCE_CREATION_HRS),
LAST_FAVORITE_SINCE_CREATION_HR (false, 43, TIMESTAMP_SEC_TO_HR_NORMALIZER,
EarlybirdFieldConstant.LAST_FAVORITE_SINCE_CREATION_HRS),
LAST_QUOTE_SINCE_CREATION_HR (false, 44, TIMESTAMP_SEC_TO_HR_NORMALIZER,
EarlybirdFieldConstant.LAST_QUOTE_SINCE_CREATION_HRS),
// fake engagement counters
FAKE_RETWEET (true, 45, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.FAKE_RETWEET_COUNT),
FAKE_REPLY (true, 46, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.FAKE_REPLY_COUNT),
FAKE_FAVORITE (true, 47, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.FAKE_FAVORITE_COUNT),
FAKE_QUOTE (true, 48, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.FAKE_QUOTE_COUNT),
// blink engagement counters
BLINK_RETWEET (true, 49, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.BLINK_RETWEET_COUNT),
BLINK_REPLY (true, 50, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.BLINK_REPLY_COUNT),
BLINK_FAVORITE (true, 51, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.BLINK_FAVORITE_COUNT),
BLINK_QUOTE (true, 52, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.BLINK_QUOTE_COUNT),
/* semicolon in a single line to avoid polluting git blame */;
private static final Map<TweetFeatureType, TweetFeatureType> V2_COUNTER_MAP =
ImmutableMap.<TweetFeatureType, TweetFeatureType>builder()
.put(RETWEET, RETWEET_V2)
.put(REPLY, REPLY_V2)
.put(FAVORITE, FAVORITE_V2)
.put(EMBEDS_IMP_COUNT, EMBEDS_IMP_COUNT_V2)
.put(EMBEDS_URL_COUNT, EMBEDS_URL_COUNT_V2)
.put(VIDEO_VIEW, VIDEO_VIEW_V2)
.build();
private static final Map<TweetFeatureType, TweetFeatureType> WEIGHTED_COUNTER_MAP =
ImmutableMap.<TweetFeatureType, TweetFeatureType>builder()
.put(RETWEET, WEIGHTED_RETWEET)
.put(REPLY, WEIGHTED_REPLY)
.put(FAVORITE, WEIGHTED_FAVORITE)
.put(QUOTE, WEIGHTED_QUOTE)
.build();
private static final Map<TweetFeatureType, TweetFeatureType> DECAYED_COUNTER_MAP =
ImmutableMap.<TweetFeatureType, TweetFeatureType>builder()
.put(RETWEET, DECAYED_RETWEET)
.put(REPLY, DECAYED_REPLY)
.put(FAVORITE, DECAYED_FAVORITE)
.put(QUOTE, DECAYED_QUOTE)
.build();
private static final Map<TweetFeatureType, TweetFeatureType> DECAYED_COUNTER_TO_ELAPSED_TIME =
ImmutableMap.<TweetFeatureType, TweetFeatureType>builder()
.put(DECAYED_RETWEET, LAST_RETWEET_SINCE_CREATION_HR)
.put(DECAYED_REPLY, LAST_REPLY_SINCE_CREATION_HR)
.put(DECAYED_FAVORITE, LAST_FAVORITE_SINCE_CREATION_HR)
.put(DECAYED_QUOTE, LAST_QUOTE_SINCE_CREATION_HR)
.build();
private static final Set<TweetFeatureType> DECAYED_FEATURES =
ImmutableSet.of(DECAYED_RETWEET, DECAYED_REPLY, DECAYED_FAVORITE, DECAYED_QUOTE);
private static final Set<TweetFeatureType> FAKE_ENGAGEMENT_FEATURES =
ImmutableSet.of(FAKE_RETWEET, FAKE_REPLY, FAKE_FAVORITE, FAKE_QUOTE);
private static final Set<TweetFeatureType> BLINK_ENGAGEMENT_FEATURES =
ImmutableSet.of(BLINK_RETWEET, BLINK_REPLY, BLINK_FAVORITE, BLINK_QUOTE);
@Nullable
public TweetFeatureType getV2Type() {
return V2_COUNTER_MAP.get(this);
}
@Nullable
public static TweetFeatureType getWeightedType(TweetFeatureType type) {
return WEIGHTED_COUNTER_MAP.get(type);
}
@Nullable
public static TweetFeatureType getDecayedType(TweetFeatureType type) {
return DECAYED_COUNTER_MAP.get(type);
}
// Whether this feature is updated incrementally or set to a direct value.
private final boolean incremental;
// This normalizer is used to (1) normalize the output value in DLIndexEventOutputBolt,
// and (2) check whether a value change is significant enough to emit.
private final IntNormalizer normalizer;
// Value used when composing the cache key. It has to be unique and assigned in increasing order.
private final int typeInt;
private final EarlybirdFieldConstants.EarlybirdFieldConstant earlybirdField;
private final IncrementChecker incrementChecker;
/**
* Constructs an enum value for a feature type. The earlybirdField may be null if it is not
* ready yet; such values can exist as placeholders, but they cannot be output.
* The normalizer is null for timestamp features that do not require normalization.
*/
TweetFeatureType(boolean incremental,
int typeInt,
IntNormalizer normalizer,
@Nullable EarlybirdFieldConstant earlybirdField) {
this.incremental = incremental;
this.typeInt = typeInt;
this.normalizer = normalizer;
this.earlybirdField = earlybirdField;
this.incrementChecker = new IncrementChecker(this);
}
public boolean isIncremental() {
return incremental;
}
public IntNormalizer getNormalizer() {
return normalizer;
}
public int getTypeInt() {
return typeInt;
}
public int normalize(double value) {
return normalizer.normalize(value);
}
public IncrementChecker getIncrementChecker() {
return incrementChecker;
}
public EarlybirdFieldConstant getEarlybirdField() {
return Preconditions.checkNotNull(earlybirdField);
}
public boolean hasEarlybirdField() {
return earlybirdField != null;
}
public boolean isDecayed() {
return DECAYED_FEATURES.contains(this);
}
@Nullable
public TweetFeatureType getElapsedTimeFeatureType() {
return DECAYED_COUNTER_TO_ELAPSED_TIME.get(this);
}
public boolean isFakeEngagement() {
return FAKE_ENGAGEMENT_FEATURES.contains(this);
}
public boolean isBlinkEngagement() {
return BLINK_ENGAGEMENT_FEATURES.contains(this);
}
/**
* Checks whether an increment is eligible to be emitted
*/
public static class IncrementChecker {
private final IntNormalizer normalizer;
public IncrementChecker(IntNormalizer normalizer) {
this.normalizer = normalizer;
}
IncrementChecker(TweetFeatureType type) {
this(type.getNormalizer());
}
/**
* Checks whether a value change is eligible for output, i.e. whether it changes the normalized value
*/
public boolean eligibleForEmit(int oldValue, int newValue) {
return normalizer.normalize(oldValue) != normalizer.normalize(newValue);
}
}
}
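
A sketch of how the IncrementChecker suppresses insignificant updates: an increment is emitted only if it changes the normalized value, so consecutive raw counts that land in the same normalization bucket are dropped. The concrete bucket boundaries depend on the normalizer, so the outcomes below are assumptions:

    TweetFeatureType type = TweetFeatureType.RETWEET;
    TweetFeatureType.IncrementChecker checker = type.getIncrementChecker();
    // A small bump may stay inside one bucket of the (coarse) legacy normalizer...
    boolean emitSmall = checker.eligibleForEmit(100, 101);  // likely false
    // ...while a large jump crosses a bucket boundary and is emitted.
    boolean emitLarge = checker.eligibleForEmit(100, 200);  // likely true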

View File

@ -1,19 +0,0 @@
package com.twitter.search.common.relevance.features;
public class TweetFeatures {
private final TweetTextQuality tweetTextQuality = new TweetTextQuality();
private final TweetTextFeatures tweetTextFeatures = new TweetTextFeatures();
private final TweetUserFeatures tweetUserFeatures = new TweetUserFeatures();
public TweetTextFeatures getTweetTextFeatures() {
return tweetTextFeatures;
}
public TweetTextQuality getTweetTextQuality() {
return tweetTextQuality;
}
public TweetUserFeatures getTweetUserFeatures() {
return tweetUserFeatures;
}
}

View File

@ -1,201 +0,0 @@
package com.twitter.search.common.relevance.features;
import java.nio.ByteBuffer;
import java.util.Arrays;
import com.google.common.base.Preconditions;
/**
* A TweetIntegerShingleSignature object consists of 4 bytes, each representing the signature of
* a status text sample. The signature bytes are sorted in ascending order and compacted to an
* integer in big endian for serialization.
*
* Fuzzy matching of two TweetIntegerShingleSignature objects succeeds when the number of
* matching bytes between the two is at least 3 (the default minimum).
*/
public class TweetIntegerShingleSignature {
public static final int NUM_SHINGLES = Integer.SIZE / Byte.SIZE;
public static final int DEFAULT_NO_SIGNATURE = 0;
public static final TweetIntegerShingleSignature NO_SIGNATURE_HANDLE =
deserialize(DEFAULT_NO_SIGNATURE);
public static final int DEFAULT_MIN_SHINGLES_MATCH = 3;
private final int minShinglesMatch;
private final byte[] shingles;
private final int signature; // redundant information, for easier comparison.
/**
* Construct from a byte array.
*/
public TweetIntegerShingleSignature(byte[] shingles, int minShinglesMatch) {
Preconditions.checkArgument(shingles.length == NUM_SHINGLES);
this.shingles = shingles;
// sort to byte's natural ascending order
Arrays.sort(this.shingles);
this.minShinglesMatch = minShinglesMatch;
this.signature = serializeInternal(shingles);
}
/**
* Construct from a byte array.
*/
public TweetIntegerShingleSignature(byte[] shingles) {
this(shingles, DEFAULT_MIN_SHINGLES_MATCH);
}
/**
* Construct from a serialized integer signature.
*/
public TweetIntegerShingleSignature(int signature, int minShinglesMatch) {
this.shingles = deserializeInternal(signature);
// sort to byte's natural ascending order
Arrays.sort(this.shingles);
this.minShinglesMatch = minShinglesMatch;
// Now store the sorted shingles in the signature field; this may differ from the value passed in.
this.signature = serializeInternal(shingles);
}
/**
* Construct from a serialized integer signature.
*/
public TweetIntegerShingleSignature(int signature) {
this(signature, DEFAULT_MIN_SHINGLES_MATCH);
}
/**
* Used by the ingester to generate a signature.
* Raw signatures come as one byte array per sample, and there may be more or fewer samples
* than NUM_SHINGLES; only the first byte of each of the first NUM_SHINGLES samples is kept.
*
* @param rawSignature the per-sample raw signature byte arrays
*/
public TweetIntegerShingleSignature(Iterable<byte[]> rawSignature) {
byte[] condensedSignature = new byte[NUM_SHINGLES];
int i = 0;
for (byte[] signatureItem : rawSignature) {
condensedSignature[i++] = signatureItem[0];
if (i == NUM_SHINGLES) {
break;
}
}
this.shingles = condensedSignature;
Arrays.sort(this.shingles);
this.minShinglesMatch = DEFAULT_MIN_SHINGLES_MATCH;
this.signature = serializeInternal(shingles);
}
/**
* When used in a hashtable for dup detection, the first byte of the (sorted) signature serves
* as a fast first pass for the majority case of no fuzzy match. For top queries, this
* optimization loses only about 4% of all fuzzy matches.
*
* @return most significant byte of this signature as its hashcode.
*/
@Override
public int hashCode() {
return shingles[0] & 0xFF;
}
/**
* Perform fuzzy matching between two TweetIntegerShingleSignature objects.
*
* @param other TweetIntegerShingleSignature object to perform fuzzy match against
* @return true if at least minMatch number of bytes match
*/
@Override
public boolean equals(Object other) {
if (this == other) {
return true;
}
if (other == null) {
return false;
}
if (getClass() != other.getClass()) {
return false;
}
final TweetIntegerShingleSignature otherSignatureInteger = (TweetIntegerShingleSignature) other;
int otherSignature = otherSignatureInteger.serialize();
if (signature == otherSignature) {
// Both serialized signatures are the same.
return true;
} else if (signature != DEFAULT_NO_SIGNATURE && otherSignature != DEFAULT_NO_SIGNATURE) {
// Neither is NO_SIGNATURE, need to compare shingles.
byte[] otherShingles = otherSignatureInteger.getShingles();
int numberMatchesNeeded = minShinglesMatch;
// expect bytes are in ascending sorted order
int i = 0;
int j = 0;
while (((numberMatchesNeeded <= (NUM_SHINGLES - i)) // early termination for i
|| (numberMatchesNeeded <= (NUM_SHINGLES - j))) // early termination for j
&& (i < NUM_SHINGLES) && (j < NUM_SHINGLES)) {
if (shingles[i] == otherShingles[j]) {
if (shingles[i] != 0) { // we only consider two shingles equal if they are non zero
numberMatchesNeeded--;
if (numberMatchesNeeded == 0) {
return true;
}
}
i++;
j++;
} else if (shingles[i] < otherShingles[j]) {
i++;
} else if (shingles[i] > otherShingles[j]) {
j++;
}
}
}
// Either exactly one side is NO_SIGNATURE, or not enough shingles matched.
return false;
}
/**
* Returns the sorted array of signature bytes.
*/
public byte[] getShingles() {
return shingles;
}
/**
* Serialize 4 sorted signature bytes into an integer in big endian order.
*
* @return compacted int signature
*/
private static int serializeInternal(byte[] shingles) {
ByteBuffer byteBuffer = ByteBuffer.allocate(NUM_SHINGLES);
byteBuffer.put(shingles, 0, NUM_SHINGLES);
return byteBuffer.getInt(0);
}
/**
* Deserialize an integer into a 4-byte array.
* @param signature The signature integer.
* @return A byte array with 4 elements.
*/
private static byte[] deserializeInternal(int signature) {
return ByteBuffer.allocate(NUM_SHINGLES).putInt(signature).array();
}
public int serialize() {
return signature;
}
public static boolean isFuzzyMatch(int signature1, int signature2) {
return TweetIntegerShingleSignature.deserialize(signature1).equals(
TweetIntegerShingleSignature.deserialize(signature2));
}
public static TweetIntegerShingleSignature deserialize(int signature) {
return new TweetIntegerShingleSignature(signature);
}
public static TweetIntegerShingleSignature deserialize(int signature, int minMatchSingles) {
return new TweetIntegerShingleSignature(signature, minMatchSingles);
}
@Override
public String toString() {
return String.format("%d %d %d %d", shingles[0], shingles[1], shingles[2], shingles[3]);
}
}
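
A sketch of the fuzzy-match contract above: two signatures are equal when at least minShinglesMatch (default 3) of their four non-zero bytes match. The byte values are illustrative:

    // Three of four (sorted) bytes match, so these fuzzy-match.
    TweetIntegerShingleSignature s1 =
        new TweetIntegerShingleSignature(new byte[] {11, 22, 33, 44});
    TweetIntegerShingleSignature s2 =
        new TweetIntegerShingleSignature(new byte[] {11, 22, 33, 99});
    boolean match = s1.equals(s2);  // true
    // The same check on the packed int form:
    boolean sameViaInts =
        TweetIntegerShingleSignature.isFuzzyMatch(s1.serialize(), s2.serialize());  // true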

View File

@ -1,15 +0,0 @@
package com.twitter.search.common.relevance.features;
public final class TweetSignatureUtil {
private TweetSignatureUtil() {
}
/** Converts the signature in args[0] to a TweetIntegerShingleSignature. */
public static void main(String[] args) throws Exception {
if (args.length < 1) {
throw new RuntimeException("Please provide signature value.");
}
int signature = Integer.parseInt(args[0]);
System.out.println(TweetIntegerShingleSignature.deserialize(signature).toString());
}
}

View File

@ -1,225 +0,0 @@
package com.twitter.search.common.relevance.features;
import java.util.Collection;
import java.util.List;
import java.util.Set;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Sets;
import com.twitter.common.text.token.TokenizedCharSequence;
public class TweetTextFeatures {
// Basic Features, always extracted.
// normalized, lower cased tweet text, w/o resolved urls
private String normalizedText;
// tokens from normalizedText, w/o resolved urls, lower cased.
private List<String> tokens;
// tokens from resolved urls, lower cased.
private List<String> resolvedUrlsTokens;
// tokens in the form of a TokenizedCharSeq, NOT LOWER CASED
private TokenizedCharSequence tokenSequence;
// the strippedTokens below, joined with a single space
private String normalizedStrippedText;
// normalized, original case tokens, without @mention, #hashtag or urls.
private List<String> strippedTokens;
// all hash tags, without "#", lower cased
private Set<String> hashtags = Sets.newHashSet();
// all mentions, without "@", lower cased
private Set<String> mentions = Sets.newHashSet();
// whether this tweet has a question mark that's not in url.
private boolean hasQuestionMark = false;
private boolean hasPositiveSmiley = false;
private boolean hasNegativeSmiley = false;
// normalized, original case smileys
private List<String> smileys;
// lower cased, normalized stock names, without "$"
private List<String> stocks;
// Extra features for text quality evaluation only.
private int signature = TweetIntegerShingleSignature.DEFAULT_NO_SIGNATURE;
private Set<String> trendingTerms = Sets.newHashSet();
private int length;
private int caps;
public String getNormalizedText() {
return normalizedText;
}
public void setNormalizedText(String normalizedText) {
this.normalizedText = normalizedText;
}
public List<String> getTokens() {
return tokens;
}
public int getTokensSize() {
return tokens == null ? 0 : tokens.size();
}
public void setTokens(List<String> tokens) {
this.tokens = tokens;
}
public List<String> getResolvedUrlTokens() {
return resolvedUrlsTokens;
}
public int getResolvedUrlTokensSize() {
return resolvedUrlsTokens == null ? 0 : resolvedUrlsTokens.size();
}
public void setResolvedUrlTokens(List<String> tokensResolvedUrls) {
this.resolvedUrlsTokens = tokensResolvedUrls;
}
public TokenizedCharSequence getTokenSequence() {
return tokenSequence;
}
public void setTokenSequence(TokenizedCharSequence tokenSequence) {
this.tokenSequence = tokenSequence;
}
public String getNormalizedStrippedText() {
return normalizedStrippedText;
}
public void setNormalizedStrippedText(String normalizedStrippedText) {
this.normalizedStrippedText = normalizedStrippedText;
}
public List<String> getStrippedTokens() {
return strippedTokens;
}
public int getStrippedTokensSize() {
return strippedTokens == null ? 0 : strippedTokens.size();
}
public void setStrippedTokens(List<String> strippedTokens) {
this.strippedTokens = strippedTokens;
}
public Set<String> getHashtags() {
return hashtags;
}
public int getHashtagsSize() {
return hashtags.size();
}
public void setHashtags(Collection<String> hashtags) {
this.hashtags = Sets.newHashSet(hashtags);
}
public Set<String> getMentions() {
return mentions;
}
public int getMentionsSize() {
return mentions.size();
}
public void setMentions(Collection<String> mentions) {
this.mentions = Sets.newHashSet(mentions);
}
public boolean hasQuestionMark() {
return hasQuestionMark;
}
public void setHasQuestionMark(boolean hasQuestionMark) {
this.hasQuestionMark = hasQuestionMark;
}
public boolean hasPositiveSmiley() {
return hasPositiveSmiley;
}
public void setHasPositiveSmiley(boolean hasPositiveSmiley) {
this.hasPositiveSmiley = hasPositiveSmiley;
}
public boolean hasNegativeSmiley() {
return hasNegativeSmiley;
}
public void setHasNegativeSmiley(boolean hasNegativeSmiley) {
this.hasNegativeSmiley = hasNegativeSmiley;
}
public List<String> getSmileys() {
return smileys;
}
public int getSmileysSize() {
return smileys == null ? 0 : smileys.size();
}
public void setSmileys(List<String> smileys) {
this.smileys = smileys;
}
public List<String> getStocks() {
return stocks;
}
public int getStocksSize() {
return stocks == null ? 0 : stocks.size();
}
public void setStocks(List<String> stocks) {
this.stocks = stocks;
}
public int getSignature() {
return signature;
}
public void setSignature(int signature) {
this.signature = signature;
}
/** Returns the trending terms. */
public Set<String> getTrendingTerms() {
return trendingTerms;
}
public int getTrendingTermsSize() {
return trendingTerms.size();
}
@VisibleForTesting
public void setTrendingTerms(Set<String> trendingTerms) {
this.trendingTerms = trendingTerms;
}
public int getLength() {
return length;
}
public void setLength(int length) {
this.length = length;
}
public int getCaps() {
return caps;
}
public void setCaps(int caps) {
this.caps = caps;
}
}

View File

@ -1,69 +0,0 @@
package com.twitter.search.common.relevance.features;
import java.util.Set;
import com.google.common.collect.Sets;
public class TweetTextQuality {
public enum BooleanQualityType {
OFFENSIVE, // tweet text is offensive
OFFENSIVE_USER, // user name is offensive
HASHTAG_NAME_MATCH, // hashtag matches username
SENSITIVE, // tweet is marked as sensitive when it comes in
}
public static final double ENTROPY_NOT_SET = Double.MIN_VALUE;
public static final byte UNSET_TEXT_SCORE = -128;
private double readability;
private double shout;
private double entropy = ENTROPY_NOT_SET;
private final Set<BooleanQualityType> boolQualities = Sets.newHashSet();
private byte textScore = UNSET_TEXT_SCORE;
public double getReadability() {
return readability;
}
public void setReadability(double readability) {
this.readability = readability;
}
public double getShout() {
return shout;
}
public void setShout(double shout) {
this.shout = shout;
}
public double getEntropy() {
return entropy;
}
public void setEntropy(double entropy) {
this.entropy = entropy;
}
public void addBoolQuality(BooleanQualityType type) {
boolQualities.add(type);
}
public boolean hasBoolQuality(BooleanQualityType type) {
return boolQualities.contains(type);
}
public Set<BooleanQualityType> getBoolQualities() {
return boolQualities;
}
public byte getTextScore() {
return textScore;
}
public void setTextScore(byte textScore) {
this.textScore = textScore;
}
}

View File

@ -1,114 +0,0 @@
package com.twitter.search.common.relevance.features;
import java.util.Map;
public class TweetUserFeatures {
private String lang;
private double langConfidence;
private int followers;
private int following;
private int reputation;
private int tweets;
private int retweets;
private int retweeted;
private Map<String, Double> knownForTopics;
private boolean isSpam;
private boolean isNsfw;
private boolean isBot;
public String getLang() {
return lang;
}
public void setLang(String lang) {
this.lang = lang;
}
public double getLangConfidence() {
return langConfidence;
}
public void setLangConfidence(double langConfidence) {
this.langConfidence = langConfidence;
}
public int getFollowers() {
return followers;
}
public void setFollowers(int followers) {
this.followers = followers;
}
public int getFollowing() {
return following;
}
public void setFollowing(int following) {
this.following = following;
}
public int getReputation() {
return reputation;
}
public void setReputation(int reputation) {
this.reputation = reputation;
}
public int getTweets() {
return tweets;
}
public void setTweets(int tweets) {
this.tweets = tweets;
}
public int getRetweets() {
return retweets;
}
public void setRetweets(int retweets) {
this.retweets = retweets;
}
public int getRetweeted() {
return retweeted;
}
public void setRetweeted(int retweeted) {
this.retweeted = retweeted;
}
public Map<String, Double> getKnownForTopics() {
return knownForTopics;
}
public void setKnownForTopics(Map<String, Double> knownForTopics) {
this.knownForTopics = knownForTopics;
}
public boolean isSpam() {
return isSpam;
}
public void setSpam(boolean spam) {
isSpam = spam;
}
public boolean isNsfw() {
return isNsfw;
}
public void setNsfw(boolean nsfw) {
isNsfw = nsfw;
}
public boolean isBot() {
return isBot;
}
public void setBot(boolean bot) {
isBot = bot;
}
}

View File

@ -1,65 +0,0 @@
package com.twitter.search.common.relevance.scorers;
import com.twitter.search.common.relevance.classifiers.TweetClassifier;
import com.twitter.search.common.relevance.entities.TwitterMessage;
/**
* Interface to compute feature scores for a single @TwitterMessage
* object, or a group of them, after they have been processed by
* feature classifiers.
*
* Scorers are intentionally kept separate from Classifiers, since they
* may be run at different stages and with different batching strategies.
* Convenience methods are provided to run classification and scoring
* in one call.
*/
public abstract class TweetScorer {
/**
* Compute and store feature score in TwitterMessage based on its
* TweetFeatures.
*
* @param tweet the tweet message to score; the computed score is stored on the message.
*/
public abstract void scoreTweet(final TwitterMessage tweet);
/**
* Score a group of TwitterMessages based on their corresponding TweetFeatures
* and store feature scores in TwitterMessages.
*
* This default implementation just iterates through the map and scores each
* individual tweet. Batching for better performance, if applicable, can be implemented by
* concrete subclasses.
*
* @param tweets TwitterMessages to score.
*/
public void scoreTweets(Iterable<TwitterMessage> tweets) {
for (TwitterMessage tweet: tweets) {
scoreTweet(tweet);
}
}
/**
* Convenience method.
* Classify tweet using the specified list of classifiers, then compute score.
*
* @param classifier list of classifiers to use for classification.
* @param tweet tweet to classify and score
*/
public void classifyAndScoreTweet(TweetClassifier classifier, TwitterMessage tweet) {
classifier.classifyTweet(tweet);
scoreTweet(tweet);
}
/**
* Convenience method.
* Classify tweets using the specified list of classifiers, then compute score.
*
* @param classifier classifier to use for classification.
* @param tweets tweets to classify and score
*/
public void classifyAndScoreTweets(TweetClassifier classifier, Iterable<TwitterMessage> tweets) {
for (TwitterMessage tweet: tweets) {
classifyAndScoreTweet(classifier, tweet);
}
}
}
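
A minimal concrete scorer sketch showing the intended subclassing pattern (the scoring logic here is hypothetical; only scoreTweet must be implemented, and the batch methods fall back to iteration):

    public class LengthOnlyTweetScorer extends TweetScorer {
      @Override
      public void scoreTweet(TwitterMessage tweet) {
        // Hypothetical scoring: clamp the text length into [0, 100].
        int score = Math.min(100, tweet.getText().length());
        // A real scorer would store the score on the message's feature objects,
        // e.g. via TweetTextQuality.setTextScore(...) per Penguin version.
      }
    }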

View File

@ -1,242 +0,0 @@
package com.twitter.search.common.relevance.scorers;
import java.util.Map;
import java.util.concurrent.ConcurrentMap;
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.common_internal.text.version.PenguinVersion;
import com.twitter.search.common.metrics.RelevanceStats;
import com.twitter.search.common.metrics.SearchRateCounter;
import com.twitter.search.common.relevance.config.TweetProcessingConfig;
import com.twitter.search.common.relevance.entities.TwitterMessage;
import com.twitter.search.common.relevance.features.TweetFeatures;
import com.twitter.search.common.relevance.features.TweetTextFeatures;
import com.twitter.search.common.relevance.features.TweetTextQuality;
/**
* Compute a text score for TwitterMessage based on its offensiveness,
* shoutness, length, readability and hashtag properties extracted from
* tweet text.
* <p/>
* Formula:
* text_score = offensive_text_damping * offensive_username_damping *
* Sigma(feature_score_weight * feature_score)
* <p/>
* The scored features are: length, readability, shout, entropy, and links.
*/
public class TweetTextScorer extends TweetScorer {
private static final Logger LOG = LoggerFactory.getLogger(TweetTextScorer.class);
private static final double DEFAULT_OFFENSIVE_TERM_DAMPING = 0.2d;
private static final double DEFAULT_OFFENSIVE_NAME_DAMPING = 0.2d;
// Sigma of all weights = 1.0d
private static final double DEFAULT_LENGTH_WEIGHT = 0.5d;
private static final double DEFAULT_READABILITY_WEIGHT = 0.1d;
private static final double DEFAULT_SHOUT_WEIGHT = 0.1d;
private static final double DEFAULT_ENTROPY_WEIGHT = 0.25d;
private static final double DEFAULT_LINK_WEIGHT = 0.05d;
private static final double DEFAULT_NO_DAMPING = 1.0d;
// Sigmoid alpha values for normalization
private static final double DEFAULT_READABILITY_ALPHA = 0.05d;
private static final double DEFAULT_ENTROPY_ALPHA = 0.5d;
private static final double DEFAULT_LENGTH_ALPHA = 0.03d;
private static final ConcurrentMap<String, SearchRateCounter> RATE_COUNTERS =
Maps.newConcurrentMap();
private static final ConcurrentMap<PenguinVersion, Map<Integer, SearchRateCounter>>
SCORE_HISTOGRAMS = Maps.newConcurrentMap();
private double offensiveTermDamping = DEFAULT_OFFENSIVE_TERM_DAMPING;
private double offensiveNameDamping = DEFAULT_OFFENSIVE_NAME_DAMPING;
private double lengthWeight = DEFAULT_LENGTH_WEIGHT;
private double readabilityWeight = DEFAULT_READABILITY_WEIGHT;
private double shoutWeight = DEFAULT_SHOUT_WEIGHT;
private double entropyWeight = DEFAULT_ENTROPY_WEIGHT;
private double linkWeight = DEFAULT_LINK_WEIGHT;
private double readabilityAlpha = DEFAULT_READABILITY_ALPHA;
private double entropyAlpha = DEFAULT_ENTROPY_ALPHA;
private double lengthAlpha = DEFAULT_LENGTH_ALPHA;
/** Configure from a config file, validate the configuration. */
public TweetTextScorer(String configFile) {
TweetProcessingConfig.init(configFile);
// get dampings
checkWeightRange(offensiveTermDamping = TweetProcessingConfig
.getDouble("offensive_term_damping", DEFAULT_OFFENSIVE_TERM_DAMPING));
checkWeightRange(offensiveNameDamping = TweetProcessingConfig
.getDouble("offensive_name_damping", DEFAULT_OFFENSIVE_NAME_DAMPING));
// get weights
checkWeightRange(lengthWeight = TweetProcessingConfig
.getDouble("length_weight", DEFAULT_LENGTH_WEIGHT));
checkWeightRange(readabilityWeight = TweetProcessingConfig
.getDouble("readability_weight", DEFAULT_READABILITY_WEIGHT));
checkWeightRange(shoutWeight = TweetProcessingConfig
.getDouble("shout_weight", DEFAULT_SHOUT_WEIGHT));
checkWeightRange(entropyWeight = TweetProcessingConfig
.getDouble("entropy_weight", DEFAULT_ENTROPY_WEIGHT));
checkWeightRange(linkWeight = TweetProcessingConfig
.getDouble("link_weight", DEFAULT_LINK_WEIGHT));
// check sigma of weights
Preconditions.checkArgument(
lengthWeight + readabilityWeight + shoutWeight + entropyWeight + linkWeight == 1.0d);
readabilityAlpha = TweetProcessingConfig
.getDouble("readability_alpha", DEFAULT_READABILITY_ALPHA);
entropyAlpha = TweetProcessingConfig.getDouble("entropy_alpha", DEFAULT_ENTROPY_ALPHA);
lengthAlpha = TweetProcessingConfig.getDouble("length_alpha", DEFAULT_LENGTH_ALPHA);
}
/** Creates a new TweetTextScorer instance. */
public TweetTextScorer() {
}
/** Scores the given tweet. */
public void scoreTweet(final TwitterMessage tweet) {
Preconditions.checkNotNull(tweet);
for (PenguinVersion penguinVersion : tweet.getSupportedPenguinVersions()) {
TweetFeatures features = Preconditions.checkNotNull(tweet.getTweetFeatures(penguinVersion));
TweetTextFeatures textFeatures = Preconditions.checkNotNull(features.getTweetTextFeatures());
TweetTextQuality textQuality = Preconditions.checkNotNull(features.getTweetTextQuality());
boolean isOffensiveText = textQuality.hasBoolQuality(
TweetTextQuality.BooleanQualityType.OFFENSIVE);
boolean isOffensiveScreenName = textQuality.hasBoolQuality(
TweetTextQuality.BooleanQualityType.OFFENSIVE_USER);
double shoutScore = DEFAULT_NO_DAMPING - textQuality.getShout();
double lengthScore = normalize(textFeatures.getLength(), lengthAlpha);
double readabilityScore = normalize(textQuality.getReadability(), readabilityAlpha);
double entropyScore = normalize(textQuality.getEntropy(), entropyAlpha);
double score = (isOffensiveText ? offensiveTermDamping : DEFAULT_NO_DAMPING)
* (isOffensiveScreenName ? offensiveNameDamping : DEFAULT_NO_DAMPING)
* (lengthWeight * lengthScore
+ readabilityWeight * readabilityScore
+ shoutWeight * shoutScore
+ entropyWeight * entropyScore
+ linkWeight * (tweet.getExpandedUrlMapSize() > 0 ? 1 : 0));
// scale to [0, 100] byte
textQuality.setTextScore((byte) (score * 100));
updateStats(
isOffensiveText,
isOffensiveScreenName,
textFeatures,
score,
getRateCounterStat("num_offensive_text_", penguinVersion),
getRateCounterStat("num_offensive_user_", penguinVersion),
getRateCounterStat("num_no_trends_", penguinVersion),
getRateCounterStat("num_has_trends_", penguinVersion),
getRateCounterStat("num_too_many_trends_", penguinVersion),
getRateCounterStat("num_scored_tweets_", penguinVersion),
getScoreHistogram(penguinVersion));
if (LOG.isDebugEnabled()) {
LOG.debug(String.format(
"Tweet length [%.2f] weighted length [%.2f], readability [%.2f] "
+ "weighted readability [%.2f], shout [%.2f] weighted shout [%.2f], "
+ "entropy [%.2f], weighted entropy [%.2f], "
+ "score [%.2f], text [%s], penguin version [%s]",
lengthScore,
lengthWeight * lengthScore,
readabilityScore,
readabilityWeight * readabilityScore,
shoutScore,
shoutWeight * shoutScore,
entropyScore,
entropyWeight * entropyScore,
score,
tweet.getText(),
penguinVersion));
}
}
}
private void updateStats(boolean isOffensiveText,
boolean isOffensiveScreenName,
TweetTextFeatures textFeatures,
double score,
SearchRateCounter offensiveTextCounter,
SearchRateCounter offensiveUserNameCounter,
SearchRateCounter noTrendsCounter,
SearchRateCounter hasTrendsCounter,
SearchRateCounter tooManyTrendsHashtagsCounter,
SearchRateCounter scoredTweets,
Map<Integer, SearchRateCounter> scoreHistogram) {
// set stats
if (isOffensiveText) {
offensiveTextCounter.increment();
}
if (isOffensiveScreenName) {
offensiveUserNameCounter.increment();
}
if (textFeatures.getTrendingTermsSize() == 0) {
noTrendsCounter.increment();
} else {
hasTrendsCounter.increment();
}
if (TwitterMessage.hasMultipleHashtagsOrTrends(textFeatures)) {
tooManyTrendsHashtagsCounter.increment();
}
scoredTweets.increment();
int bucket = (int) Math.floor(score * 10) * 10;
scoreHistogram.get(bucket).increment();
}
// normalize the passed in value to smoothed [0, 1.0d] range
private static double normalize(double value, double alpha) {
return 2 * (1.0d / (1.0d + Math.exp(-(alpha * value))) - 0.5);
}
// Make sure weight values are within the range of [0.0, 1.0]
private void checkWeightRange(double value) {
Preconditions.checkArgument(value >= 0.0d && value <= 1.0d);
}
private Map<Integer, SearchRateCounter> getScoreHistogram(PenguinVersion penguinVersion) {
Map<Integer, SearchRateCounter> scoreHistogram = SCORE_HISTOGRAMS.get(penguinVersion);
if (scoreHistogram == null) {
scoreHistogram = Maps.newHashMap();
String statsName = "num_text_score_%d_%s";
for (int i = 0; i <= 100; i += 10) {
scoreHistogram.put(i, RelevanceStats.exportRate(
String.format(statsName, i, penguinVersion.name().toLowerCase())));
}
scoreHistogram = SCORE_HISTOGRAMS.putIfAbsent(penguinVersion, scoreHistogram);
if (scoreHistogram == null) {
scoreHistogram = SCORE_HISTOGRAMS.get(penguinVersion);
}
}
return scoreHistogram;
}
private SearchRateCounter getRateCounterStat(String statPrefix, PenguinVersion penguinVersion) {
String statName = statPrefix + penguinVersion.name().toLowerCase();
SearchRateCounter rateCounter = RATE_COUNTERS.get(statName);
if (rateCounter == null) {
// Only one RateCounter instance is created for each stat name. So we don't need to worry
// that another thread might've created this instance in the meantime: we can just create/get
// it, and store it in the map.
rateCounter = RelevanceStats.exportRate(statName);
RATE_COUNTERS.put(statName, rateCounter);
}
return rateCounter;
}
}
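
A worked instance of the formula in the class javadoc, using the default weights and illustrative raw feature values (no offensive damping applied):

    // normalize(v, alpha) = 2 * (1 / (1 + e^(-alpha * v)) - 0.5)
    double lengthScore      = 2 * (1.0 / (1.0 + Math.exp(-(0.03 * 100))) - 0.5); // ≈ 0.905
    double readabilityScore = 2 * (1.0 / (1.0 + Math.exp(-(0.05 * 40))) - 0.5);  // ≈ 0.762
    double shoutScore       = 1.0 - 0.1;                                         // shout = 0.1
    double entropyScore     = 2 * (1.0 / (1.0 + Math.exp(-(0.5 * 3.5))) - 0.5);  // ≈ 0.704
    double score = 1.0 * 1.0  // no offensive text/name damping
        * (0.5 * lengthScore + 0.1 * readabilityScore + 0.1 * shoutScore
            + 0.25 * entropyScore + 0.05 * 1 /* tweet has links */);
    // ≈ 0.453 + 0.076 + 0.090 + 0.176 + 0.050 ≈ 0.845 -> stored as text score byte 84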

View File

@ -1,41 +0,0 @@
package com.twitter.search.common.relevance.text;
import java.util.regex.Matcher;
import com.twitter.search.common.relevance.entities.TwitterMessage;
import com.twitter.search.common.util.text.regex.Regex;
public final class LocationUtils {
private LocationUtils() {
}
/**
* Extract lat/lon information from a twitter message.
* @param message The twitter message.
* @return A two-element double array for the lat/lon information.
*/
public static double[] extractLatLon(TwitterMessage message) {
// first look in text for L:, then fall back to profile
Matcher loc = Regex.LAT_LON_LOC_PATTERN.matcher(message.getText());
if (loc.find() || message.getOrigLocation() != null
&& (loc = Regex.LAT_LON_PATTERN.matcher(message.getOrigLocation())).find()) {
final double lat = Double.parseDouble(loc.group(2));
final double lon = Double.parseDouble(loc.group(3));
if (Math.abs(lat) > 90.0) {
throw new NumberFormatException("Latitude cannot exceed +-90 degrees: " + lat);
}
if (Math.abs(lon) > 180.0) {
throw new NumberFormatException("Longitude cannot exceed +-180 degrees: " + lon);
}
// Reject these common "bogus" regions.
if ((lat == 0 && lon == 0) || lat == -1 || lon == -1) {
return null;
}
return new double[]{lat, lon};
}
return null;
}
}
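
A usage sketch (the message object is assumed to be a populated TwitterMessage whose text or profile location carries a coordinate the patterns match):

    double[] latLon = LocationUtils.extractLatLon(message);
    if (latLon != null) {
      double lat = latLon[0];  // validated to [-90, 90]
      double lon = latLon[1];  // validated to [-180, 180]
    }
    // Returns null for bogus regions such as (0, 0), lat == -1, or lon == -1;
    // throws NumberFormatException for out-of-range coordinates.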

View File

@ -1,190 +0,0 @@
package com.twitter.search.common.relevance.text;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import com.google.common.base.Joiner;
import com.google.common.collect.Sets;
import com.twitter.common.text.util.CharSequenceUtils;
import com.twitter.common_internal.text.version.PenguinVersion;
import com.twitter.search.common.indexing.thriftjava.ThriftExpandedUrl;
import com.twitter.search.common.relevance.entities.TwitterMessage;
import com.twitter.search.common.relevance.features.TweetTextFeatures;
import com.twitter.search.common.util.text.NormalizerHelper;
import com.twitter.search.common.util.text.Smileys;
import com.twitter.search.common.util.text.TokenizerHelper;
import com.twitter.search.common.util.text.TokenizerResult;
/**
* A parser to extract very basic information from a tweet.
*/
public class TweetParser {
private static final boolean DO_NOT_REMOVE_WWW = false;
/** Parses the given TwitterMessage. */
public void parseTweet(TwitterMessage message) {
parseTweet(message, false, true);
}
/** Parses the given TwitterMessage. */
public void parseTweet(TwitterMessage message,
boolean useEntitiesFromTweetText,
boolean parseUrls) {
for (PenguinVersion penguinVersion : message.getSupportedPenguinVersions()) {
parseTweet(message, useEntitiesFromTweetText, parseUrls, penguinVersion);
}
}
/** Parses the given TwitterMessage. */
public void parseTweet(TwitterMessage message,
boolean useEntitiesFromTweetText,
boolean parseUrls,
PenguinVersion penguinVersion) {
TweetTextFeatures textFeatures = message.getTweetTextFeatures(penguinVersion);
String rawText = message.getText();
Locale locale = message.getLocale();
// don't lower case first.
String normalizedText = NormalizerHelper.normalizeKeepCase(rawText, locale, penguinVersion);
String lowercasedNormalizedText =
CharSequenceUtils.toLowerCase(normalizedText, locale).toString();
textFeatures.setNormalizedText(lowercasedNormalizedText);
TokenizerResult result = TokenizerHelper.tokenizeTweet(normalizedText, locale, penguinVersion);
List<String> tokens = new ArrayList<>(result.tokens);
textFeatures.setTokens(tokens);
textFeatures.setTokenSequence(result.tokenSequence);
if (parseUrls) {
parseUrls(message, textFeatures);
}
textFeatures.setStrippedTokens(result.strippedDownTokens);
textFeatures.setNormalizedStrippedText(Joiner.on(" ").skipNulls()
.join(result.strippedDownTokens));
// Sanity checks, make sure there is no null token list.
if (textFeatures.getTokens() == null) {
textFeatures.setTokens(Collections.<String>emptyList());
}
if (textFeatures.getResolvedUrlTokens() == null) {
textFeatures.setResolvedUrlTokens(Collections.<String>emptyList());
}
if (textFeatures.getStrippedTokens() == null) {
textFeatures.setStrippedTokens(Collections.<String>emptyList());
}
setHashtagsAndMentions(message, textFeatures, penguinVersion);
textFeatures.setStocks(sanitizeTokenizerResults(result.stocks, '$'));
textFeatures.setHasQuestionMark(findQuestionMark(textFeatures));
// Set smiley polarities.
textFeatures.setSmileys(result.smileys);
for (String smiley : textFeatures.getSmileys()) {
if (Smileys.isValidSmiley(smiley)) {
boolean polarity = Smileys.getPolarity(smiley);
if (polarity) {
textFeatures.setHasPositiveSmiley(true);
} else {
textFeatures.setHasNegativeSmiley(true);
}
}
}
message.setTokenizedCharSequence(penguinVersion, result.rawSequence);
if (useEntitiesFromTweetText) {
takeEntities(message, textFeatures, result, penguinVersion);
}
}
/** Parse the URLs in the given TwitterMessage. */
public void parseUrls(TwitterMessage message) {
for (PenguinVersion penguinVersion : message.getSupportedPenguinVersions()) {
parseUrls(message, message.getTweetTextFeatures(penguinVersion));
}
}
/** Parse the URLs in the given TwitterMessage. */
public void parseUrls(TwitterMessage message, TweetTextFeatures textFeatures) {
if (message.getExpandedUrlMap() != null) {
Set<String> urlsToTokenize = Sets.newLinkedHashSet();
for (ThriftExpandedUrl url : message.getExpandedUrlMap().values()) {
if (url.isSetExpandedUrl()) {
urlsToTokenize.add(url.getExpandedUrl());
}
if (url.isSetCanonicalLastHopUrl()) {
urlsToTokenize.add(url.getCanonicalLastHopUrl());
}
}
TokenizerResult resolvedUrlResult =
TokenizerHelper.tokenizeUrls(urlsToTokenize, message.getLocale(), DO_NOT_REMOVE_WWW);
List<String> urlTokens = new ArrayList<>(resolvedUrlResult.tokens);
textFeatures.setResolvedUrlTokens(urlTokens);
}
}
private void takeEntities(TwitterMessage message,
TweetTextFeatures textFeatures,
TokenizerResult result,
PenguinVersion penguinVersion) {
if (message.getHashtags().isEmpty()) {
// Add hashtags from JSON entities to the TwitterMessage if it doesn't already
// have them; this happens when we do offline indexing.
for (String hashtag : sanitizeTokenizerResults(result.hashtags, '#')) {
message.addHashtag(hashtag);
}
}
if (message.getMentions().isEmpty()) {
// Add mentions from JSON entities to the TwitterMessage if it doesn't already
// have them; this happens when we do offline indexing.
for (String mention : sanitizeTokenizerResults(result.mentions, '@')) {
message.addMention(mention);
}
}
setHashtagsAndMentions(message, textFeatures, penguinVersion);
}
private void setHashtagsAndMentions(TwitterMessage message,
TweetTextFeatures textFeatures,
PenguinVersion penguinVersion) {
textFeatures.setHashtags(message.getNormalizedHashtags(penguinVersion));
textFeatures.setMentions(message.getLowercasedMentions());
}
// The strings in the mentions, hashtags and stocks lists in TokenizerResult should already have
// the leading characters ('@', '#' and '$') stripped. So in most cases, this sanitization is not
// needed. However, sometimes Penguin tokenizes hashtags, cashtags and mentions incorrectly
// (for example, when using the Korean tokenizer for tokens like ~@mention or ?#hashtag -- see
// SEARCHQUAL-11924 for more details). So we're doing this extra sanitization here to try to work
// around these tokenization issues.
private List<String> sanitizeTokenizerResults(List<String> tokens, char tokenSymbol) {
List<String> sanitizedTokens = new ArrayList<String>();
for (String token : tokens) {
int indexOfTokenSymbol = token.indexOf(tokenSymbol);
if (indexOfTokenSymbol < 0) {
sanitizedTokens.add(token);
} else {
String sanitizedToken = token.substring(indexOfTokenSymbol + 1);
if (!sanitizedToken.isEmpty()) {
sanitizedTokens.add(sanitizedToken);
}
}
}
return sanitizedTokens;
}
/** Determines if the normalized text of the given features contain a question mark. */
public static boolean findQuestionMark(TweetTextFeatures textFeatures) {
// t.co links don't contain '?', so it's not necessary to subtract '?' occurrences in URLs:
// the tweet text always contains the t.co form even if the display URL is different,
// since all links on Twitter are now wrapped in t.co.
return textFeatures.getNormalizedText().contains("?");
}
}
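
A typical parse-then-read sketch (the message instance and the choice of Penguin version are assumptions):

    TweetParser parser = new TweetParser();
    // Entities from tweet text off, URL parsing on, for all supported Penguin versions.
    parser.parseTweet(message, false, true);
    TweetTextFeatures features =
        message.getTweetTextFeatures(penguinVersion);  // some supported version
    List<String> tokens = features.getTokens();        // never null after parsing
    boolean hasQuestion = features.hasQuestionMark();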

View File

@ -1,39 +0,0 @@
package com.twitter.search.common.relevance.text;
public class VisibleTokenRatioNormalizer {
private static final int NORMALIZE_TO_BITS = 4;
private final int normalizeToSize;
/**
* Creates a normalizer whose output fits in normalizeToBits bits,
* i.e. values in [0, 2^normalizeToBits - 1].
*/
public VisibleTokenRatioNormalizer(int normalizeToBits) {
int size = 2 << (normalizeToBits - 1);
// Say size is 16: multiplying a ratio in [0.0, 1.0] by 16 yields the values 0..16,
// which is 17 distinct ints rather than 16, so we subtract 1 here to stay within
// normalizeToBits bits.
this.normalizeToSize = size - 1;
}
/**
* Normalizes a ratio in [0.0, 1.0] to a reversed bucket index: 1.0 maps to bucket 0
* and 0.0 maps to normalizeToSize.
*/
public int normalize(double percent) {
if (percent > 1 || percent < 0) {
throw new IllegalArgumentException("percent should be between 0 and 1 (inclusive)");
}
int bucket = (int) (percent * normalizeToSize);
return normalizeToSize - bucket;
}
public double denormalize(int reverseBucket) {
int bucket = normalizeToSize - reverseBucket;
return bucket / (double) normalizeToSize;
}
public static VisibleTokenRatioNormalizer createInstance() {
return new VisibleTokenRatioNormalizer(NORMALIZE_TO_BITS);
}
}
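
A worked round trip with the default 4-bit instance (normalizeToSize = 15); note the bucketing is reversed, so a fully visible ratio maps to bucket 0:

    VisibleTokenRatioNormalizer n = VisibleTokenRatioNormalizer.createInstance();
    int b1 = n.normalize(1.0);    // (int)(1.0 * 15) = 15 -> 15 - 15 = 0
    int b2 = n.normalize(0.0);    // 15 - 0 = 15
    int b3 = n.normalize(0.5);    // (int)(0.5 * 15) = 7 -> 15 - 7 = 8
    double r = n.denormalize(b3); // (15 - 8) / 15.0 ≈ 0.467 (precision lost to bucketing)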

View File

@ -1,142 +0,0 @@
package com.twitter.search.common.schema;
import java.io.Reader;
import java.text.ParseException;
import java.util.Map;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.fa.PersianCharFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.Version;
import com.twitter.search.common.schema.thriftjava.ThriftAnalyzer;
import com.twitter.search.common.schema.thriftjava.ThriftClassInstantiater;
import com.twitter.search.common.schema.thriftjava.ThriftCustomAnalyzer;
public class AnalyzerFactory {
private static final Logger LOG = LoggerFactory.getLogger(AnalyzerFactory.class);
private static final String MATCH_VERSION_ARG_NAME = "matchVersion";
private static final String STANDARD_ANALYZER = "StandardAnalyzer";
private static final String WHITESPACE_ANALYZER = "WhitespaceAnalyzer";
private static final String SEARCH_WHITESPACE_ANALYZER = "SearchWhitespaceAnalyzer";
private static final String HTML_STRIP_CHAR_FILTER = "HTMLStripCharFilter";
private static final String PERSIAN_CHAR_FILTER = "PersianCharFilter";
/**
* Return a Lucene Analyzer based on the given ThriftAnalyzer.
*/
public Analyzer getAnalyzer(ThriftAnalyzer analyzer) {
if (analyzer.isSetAnalyzer()) {
return resolveAnalyzerClass(analyzer.getAnalyzer());
} else if (analyzer.isSetCustomAnalyzer()) {
return buildCustomAnalyzer(analyzer.getCustomAnalyzer());
}
return new SearchWhitespaceAnalyzer();
}
private Analyzer resolveAnalyzerClass(ThriftClassInstantiater classDef) {
Map<String, String> params = classDef.getParams();
Version matchVersion = Version.LUCENE_8_5_2;
String matchVersionName = getArg(params, MATCH_VERSION_ARG_NAME);
if (matchVersionName != null) {
try {
matchVersion = Version.parse(matchVersionName);
} catch (ParseException e) {
// ignore and use default version
LOG.warn("Unable to parse match version: " + matchVersionName
+ ". Will use default version of 8.5.2.");
}
}
if (classDef.getClassName().equals(STANDARD_ANALYZER)) {
String stopwords = getArg(params, "stopwords");
if (stopwords != null) {
CharArraySet stopwordSet = new CharArraySet(
Lists.newLinkedList(Splitter.on(",").split(stopwords)),
false);
return new StandardAnalyzer(stopwordSet);
} else {
return new StandardAnalyzer();
}
} else if (classDef.getClassName().equals(WHITESPACE_ANALYZER)) {
return new WhitespaceAnalyzer();
} else if (classDef.getClassName().equals(SEARCH_WHITESPACE_ANALYZER)) {
return new SearchWhitespaceAnalyzer();
}
return null;
}
private Analyzer buildCustomAnalyzer(final ThriftCustomAnalyzer customAnalyzer) {
return new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer tokenizer = resolveTokenizerClass(customAnalyzer.getTokenizer());
TokenStream filter = tokenizer;
if (customAnalyzer.isSetFilters()) {
for (ThriftClassInstantiater filterClass : customAnalyzer.getFilters()) {
filter = resolveTokenFilterClass(filterClass, filter);
}
}
return new TokenStreamComponents(tokenizer, filter);
}
};
}
private Tokenizer resolveTokenizerClass(ThriftClassInstantiater classDef) {
return null;
}
private TokenStream resolveTokenFilterClass(ThriftClassInstantiater classDef, TokenStream input) {
return null;
}
private CharFilter resolveCharFilterClass(ThriftClassInstantiater classDef, Reader input) {
if (classDef.getClassName().equals(HTML_STRIP_CHAR_FILTER)) {
String escapedTags = getArg(classDef.getParams(), "escapedTags");
if (escapedTags != null) {
return new HTMLStripCharFilter(input, Sets.newHashSet(Splitter.on(",").split(escapedTags)));
} else {
return new HTMLStripCharFilter(input);
}
} else if (classDef.getClassName().equals(PERSIAN_CHAR_FILTER)) {
return new PersianCharFilter(input);
}
throw new ClassNotSupportedException("CharFilter", classDef);
}
private String getArg(Map<String, String> args, String arg) {
if (args == null) {
return null;
}
return args.get(arg);
}
public final class ClassNotSupportedException extends RuntimeException {
private ClassNotSupportedException(String type, ThriftClassInstantiater classDef) {
super(type + " class with name " + classDef.getClassName() + " currently not supported.");
}
}
}
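
A construction sketch, assuming the standard Thrift-generated setters on the schema structs:

    ThriftClassInstantiater classDef = new ThriftClassInstantiater();
    classDef.setClassName("StandardAnalyzer");
    classDef.setParams(ImmutableMap.of("stopwords", "the,a,an"));  // comma-separated
    ThriftAnalyzer thriftAnalyzer = new ThriftAnalyzer();
    thriftAnalyzer.setAnalyzer(classDef);
    Analyzer analyzer = new AnalyzerFactory().getAnalyzer(thriftAnalyzer);
    // With neither analyzer nor customAnalyzer set, getAnalyzer falls back to
    // SearchWhitespaceAnalyzer.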

View File

@ -1,34 +0,0 @@
# Library for schema builder and related analysis utilities.
java_library(
sources = ["*.java"],
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
"3rdparty/jvm/com/google/guava",
"3rdparty/jvm/com/google/inject:guice",
"3rdparty/jvm/org/apache/hadoop:hadoop-client-default",
"3rdparty/jvm/org/apache/lucene:lucene-analyzers-common",
"3rdparty/jvm/org/apache/lucene:lucene-analyzers-smartcn",
"3rdparty/jvm/org/apache/lucene:lucene-core",
"3rdparty/jvm/org/apache/lucene:lucene-facet",
"3rdparty/jvm/org/apache/thrift:libthrift",
"3rdparty/jvm/org/apache/zookeeper:zookeeper-client",
"3rdparty/jvm/org/slf4j:slf4j-api",
"src/java/com/twitter/common/base",
"src/java/com/twitter/common/collections",
"src/java/com/twitter/common/text/token",
"src/java/com/twitter/common/text/util:token-util",
"src/java/com/twitter/search/common/encoding/docvalues",
"src/java/com/twitter/search/common/features",
"src/java/com/twitter/search/common/metrics",
"src/java/com/twitter/search/common/schema/base",
"src/java/com/twitter/search/common/util/analysis",
"src/java/com/twitter/search/common/util/io",
"src/java/com/twitter/search/common/util/io:record-reader-api",
"src/java/com/twitter/search/common/util/spatial",
"src/java/com/twitter/search/common/util/text",
"src/java/com/twitter/search/common/util/thrift:thrift-utils",
"src/thrift/com/twitter/search/common:features-java",
"src/thrift/com/twitter/search/common:schema-java",
],
)

View File

@ -1,214 +0,0 @@
package com.twitter.search.common.schema;
import java.util.Collection;
import java.util.Map;
import java.util.concurrent.atomic.AtomicReference;
import javax.annotation.Nullable;
import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;
import com.google.common.collect.ImmutableCollection;
import com.google.common.collect.ImmutableMap;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.index.FieldInfos;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.search.common.features.thrift.ThriftSearchFeatureSchema;
import com.twitter.search.common.schema.base.FeatureConfiguration;
import com.twitter.search.common.schema.base.FieldWeightDefault;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.schema.base.Schema;
import com.twitter.search.common.schema.thriftjava.ThriftAnalyzer;
import com.twitter.search.common.schema.thriftjava.ThriftCSFType;
import com.twitter.search.common.schema.thriftjava.ThriftFieldConfiguration;
/**
* A schema implementation that allows minor version increments at run time.
*/
public class DynamicSchema implements Schema {
private static final Logger LOG = LoggerFactory.getLogger(DynamicSchema.class);
private final AtomicReference<ImmutableSchema> schema;
public DynamicSchema(ImmutableSchema schema) {
this.schema = new AtomicReference<>(schema);
}
public ImmutableSchemaInterface getSchemaSnapshot() {
return schema.get();
}
/**
* Update the schema reference inside this DynamicSchema.
*/
public synchronized void updateSchema(ImmutableSchema newSchema) throws SchemaUpdateException {
ImmutableSchema oldSchema = schema.get();
if (newSchema.getMajorVersionNumber() != oldSchema.getMajorVersionNumber()) {
throw new SchemaUpdateException("Dynamic major version update is not supported.");
} else {
if (newSchema.getMinorVersionNumber() <= oldSchema.getMinorVersionNumber()) {
throw new SchemaUpdateException("Dynamic backward minor version update is not supported.");
} else {
LOG.info("DynamicSchema accepted update. Old version is {}.{}; new version is {}.{}",
oldSchema.getMajorVersionNumber(),
oldSchema.getMinorVersionNumber(),
newSchema.getMajorVersionNumber(),
newSchema.getMinorVersionNumber());
schema.set(newSchema);
}
}
}
public static class SchemaUpdateException extends Exception {
public SchemaUpdateException(String message) {
super(message);
}
}
// The below are all methods in the Schema interface delegated to the underlying ImmutableSchema.
// The below is generated by IntelliJ, and reviewers can stop reviewing this file here.
// If you are adding logic into this class, please do so above this line.
@Override
public FieldInfos getLuceneFieldInfos(
Predicate<String> acceptedFields) {
return schema.get().getLuceneFieldInfos(acceptedFields);
}
@Override
public FacetsConfig getFacetsConfig() {
return schema.get().getFacetsConfig();
}
@Override
public Analyzer getDefaultAnalyzer(
ThriftAnalyzer override) {
return schema.get().getDefaultAnalyzer(override);
}
@Override
public ImmutableCollection<FieldInfo> getFieldInfos() {
return schema.get().getFieldInfos();
}
@Override
public boolean hasField(int fieldConfigId) {
return schema.get().hasField(fieldConfigId);
}
@Override
public boolean hasField(String fieldName) {
return schema.get().hasField(fieldName);
}
@Override
@Nullable
public FieldInfo getFieldInfo(int fieldConfigId) {
return schema.get().getFieldInfo(fieldConfigId);
}
@Override
@Nullable
public FieldInfo getFieldInfo(String fieldName) {
return schema.get().getFieldInfo(fieldName);
}
@Override
public String getFieldName(int fieldConfigId) {
return schema.get().getFieldName(fieldConfigId);
}
@Override
public FieldInfo getFieldInfo(int fieldConfigId,
ThriftFieldConfiguration override) {
return schema.get().getFieldInfo(fieldConfigId, override);
}
@Override
public int getNumFacetFields() {
return schema.get().getNumFacetFields();
}
@Override
public FieldInfo getFacetFieldByFacetName(
String facetName) {
return schema.get().getFacetFieldByFacetName(facetName);
}
@Override
public FieldInfo getFacetFieldByFieldName(
String fieldName) {
return schema.get().getFacetFieldByFieldName(fieldName);
}
@Override
public Collection<FieldInfo> getFacetFields() {
return schema.get().getFacetFields();
}
@Override
public Collection<FieldInfo> getCsfFacetFields() {
return schema.get().getCsfFacetFields();
}
@Override
public String getVersionDescription() {
return schema.get().getVersionDescription();
}
@Override
public int getMajorVersionNumber() {
return schema.get().getMajorVersionNumber();
}
@Override
public int getMinorVersionNumber() {
return schema.get().getMinorVersionNumber();
}
@Override
public boolean isVersionOfficial() {
return schema.get().isVersionOfficial();
}
@Override
public Map<String, FieldWeightDefault> getFieldWeightMap() {
return schema.get().getFieldWeightMap();
}
@Override
public FeatureConfiguration getFeatureConfigurationByName(
String featureName) {
return schema.get().getFeatureConfigurationByName(featureName);
}
@Override
public FeatureConfiguration getFeatureConfigurationById(int featureFieldId) {
return Preconditions.checkNotNull(schema.get().getFeatureConfigurationById(featureFieldId));
}
@Override
@Nullable
public ThriftCSFType getCSFFieldType(
String fieldName) {
return schema.get().getCSFFieldType(fieldName);
}
@Override
public ThriftSearchFeatureSchema getSearchFeatureSchema() {
return schema.get().getSearchFeatureSchema();
}
@Override
public ImmutableMap<Integer, FeatureConfiguration> getFeatureIdToFeatureConfig() {
return schema.get().getFeatureIdToFeatureConfig();
}
@Override
public ImmutableMap<String, FeatureConfiguration> getFeatureNameToFeatureConfig() {
return schema.get().getFeatureNameToFeatureConfig();
}
}
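
A minimal usage sketch for the class above. The SchemaRefresher wrapper below is hypothetical (not part of this package); only the DynamicSchema and ImmutableSchema APIs are taken from the code as shown:

import com.twitter.search.common.schema.DynamicSchema;
import com.twitter.search.common.schema.ImmutableSchema;

final class SchemaRefresher {
  /** Tries to swap in a new schema; readers keep whatever snapshot they already fetched. */
  static void refresh(DynamicSchema dynamicSchema, ImmutableSchema newSchema) {
    try {
      // Accepted only when the major version matches and the minor version increases.
      dynamicSchema.updateSchema(newSchema);
    } catch (DynamicSchema.SchemaUpdateException e) {
      // Rejected: the major version changed, or the minor version did not move forward.
    }
  }
}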

View File

@ -1,904 +0,0 @@
package com.twitter.search.common.schema;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicLong;
import javax.annotation.Nullable;
import javax.annotation.concurrent.Immutable;
import javax.annotation.concurrent.ThreadSafe;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;
import com.google.common.collect.ImmutableCollection;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.ImmutableSortedMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexOptions;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.common.collections.Pair;
import com.twitter.common.text.util.TokenStreamSerializer;
import com.twitter.search.common.features.ExternalTweetFeature;
import com.twitter.search.common.features.SearchResultFeature;
import com.twitter.search.common.features.thrift.ThriftSearchFeatureSchema;
import com.twitter.search.common.features.thrift.ThriftSearchFeatureSchemaEntry;
import com.twitter.search.common.features.thrift.ThriftSearchFeatureSchemaSpecifier;
import com.twitter.search.common.features.thrift.ThriftSearchFeatureType;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.metrics.SearchLongGauge;
import com.twitter.search.common.schema.base.EarlybirdFieldType;
import com.twitter.search.common.schema.base.FeatureConfiguration;
import com.twitter.search.common.schema.base.FieldWeightDefault;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.schema.base.IndexedNumericFieldSettings;
import com.twitter.search.common.schema.thriftjava.ThriftAnalyzer;
import com.twitter.search.common.schema.thriftjava.ThriftCSFFieldSettings;
import com.twitter.search.common.schema.thriftjava.ThriftCSFType;
import com.twitter.search.common.schema.thriftjava.ThriftCSFViewSettings;
import com.twitter.search.common.schema.thriftjava.ThriftFacetFieldSettings;
import com.twitter.search.common.schema.thriftjava.ThriftFieldConfiguration;
import com.twitter.search.common.schema.thriftjava.ThriftFieldSettings;
import com.twitter.search.common.schema.thriftjava.ThriftIndexedFieldSettings;
import com.twitter.search.common.schema.thriftjava.ThriftSchema;
import com.twitter.search.common.schema.thriftjava.ThriftSearchFieldSettings;
import com.twitter.search.common.schema.thriftjava.ThriftTokenStreamSerializer;
/**
* A schema instance that does not change at run time.
*/
@Immutable @ThreadSafe
public class ImmutableSchema implements ImmutableSchemaInterface {
private static final Logger LOG = LoggerFactory.getLogger(ImmutableSchema.class);
private static final ImmutableSet<ThriftCSFType> CAN_FACET_ON_CSF_TYPES =
ImmutableSet.<ThriftCSFType>builder()
.add(ThriftCSFType.BYTE)
.add(ThriftCSFType.INT)
.add(ThriftCSFType.LONG)
.build();
private static final SearchCounter FEATURES_EXISTED_IN_OLD_SCHEMA =
SearchCounter.export("features_existed_in_old_schema");
// Currently our index uses 4 bits to store the facet field id.
public static final int MAX_FACET_FIELD_ID = 15;
public static final String HF_TERM_PAIRS_FIELD = "hf_term_pairs";
public static final String HF_PHRASE_PAIRS_FIELD = "hf_phrase_pairs";
private final ImmutableMap<Integer, FieldInfo> fieldSettingsMapById;
private final ImmutableMap<String, FieldInfo> fieldSettingsMapByName;
private final ImmutableMap<String, FeatureConfiguration> featureConfigMapByName;
private final ImmutableMap<Integer, FeatureConfiguration> featureConfigMapById;
@Nullable
private final ThriftAnalyzer defaultAnalyzer;
private final AnalyzerFactory analyzerFactory;
private final ImmutableMap<String, FieldWeightDefault> fieldWeightMap;
private final Map<String, FieldInfo> facetNameToFieldMap = Maps.newHashMap();
private final int numFacetFields;
private final ImmutableSet<FieldInfo> csfFacetFields;
// This is the search result feature schema - it has the definition for all the column stride
// view fields.
private final ThriftSearchFeatureSchema searchFeatureSchema;
private final int majorVersionNumber;
private final int minorVersionNumber;
private final String versionDesc;
private final boolean isVersionOfficial;
/**
* Construct a Schema instance with the given ThriftSchema and AnalyzerFactory.
*/
public ImmutableSchema(ThriftSchema thriftSchema,
AnalyzerFactory analyzerFactory,
String featureSchemaVersionPrefix) throws SchemaValidationException {
Pair<Integer, String> versionPair = parseVersionString(thriftSchema.getVersion());
this.majorVersionNumber = thriftSchema.getMajorVersionNumber();
this.minorVersionNumber = thriftSchema.getMinorVersionNumber();
this.versionDesc = versionPair.getSecond();
this.isVersionOfficial = thriftSchema.isVersionIsOfficial();
this.analyzerFactory = analyzerFactory;
Map<Integer, FieldInfo> tmpMap = Maps.newLinkedHashMap();
Set<FieldInfo> tmpSet = Sets.newHashSet();
if (thriftSchema.isSetDefaultAnalyzer()) {
this.defaultAnalyzer = thriftSchema.getDefaultAnalyzer().deepCopy();
} else {
this.defaultAnalyzer = null;
}
Map<Integer, ThriftFieldConfiguration> configs = thriftSchema.getFieldConfigs();
// Collect all the CSF Views, so that we can later verify that they are appropriately
// configured once we've processed all the other field settings.
Map<Integer, ThriftFieldConfiguration> csfViewFields = Maps.newHashMap();
boolean requiresHfPairFields = false;
boolean hasHfTermPairField = false;
boolean hasHfPhrasePairField = false;
int numFacets = 0;
for (Map.Entry<Integer, ThriftFieldConfiguration> entry : configs.entrySet()) {
int fieldId = entry.getKey();
if (tmpMap.containsKey(fieldId)) {
throw new SchemaValidationException("Duplicate field id " + fieldId);
}
ThriftFieldConfiguration config = entry.getValue();
FieldInfo fieldInfo = parseThriftFieldSettings(fieldId, config, csfViewFields);
validate(fieldInfo);
if (fieldInfo.getFieldType().isFacetField()) {
if (numFacets > MAX_FACET_FIELD_ID) {
throw new SchemaValidationException(
"Maximum supported facet field ID is: " + MAX_FACET_FIELD_ID);
}
numFacets++;
facetNameToFieldMap.put(fieldInfo.getFieldType().getFacetName(), fieldInfo);
if (fieldInfo.getFieldType().isUseCSFForFacetCounting()) {
tmpSet.add(fieldInfo);
}
}
tmpMap.put(fieldId, fieldInfo);
if (fieldInfo.getFieldType().isIndexHFTermPairs()) {
requiresHfPairFields = true;
}
if (fieldInfo.getName().equals(HF_TERM_PAIRS_FIELD)) {
hasHfTermPairField = true;
}
if (fieldInfo.getName().equals(HF_PHRASE_PAIRS_FIELD)) {
hasHfPhrasePairField = true;
}
}
this.numFacetFields = numFacets;
this.csfFacetFields = ImmutableSet.copyOf(tmpSet);
// If any field requires high frequency term/phrase pair fields, make sure they exist
if (requiresHfPairFields) {
if (!hasHfTermPairField || !hasHfPhrasePairField) {
throw new SchemaValidationException(
"High frequency term/phrase pair fields do not exist in the schema.");
}
}
this.fieldSettingsMapById = ImmutableMap.copyOf(tmpMap);
Pair<ImmutableMap<String, FeatureConfiguration>, ImmutableMap<Integer, FeatureConfiguration>>
featureConfigMapPair = buildFeatureMaps(csfViewFields);
this.featureConfigMapByName = featureConfigMapPair.getFirst();
this.featureConfigMapById = featureConfigMapPair.getSecond();
for (ThriftFieldConfiguration csfViewField : csfViewFields.values()) {
SchemaBuilder.verifyCSFViewSettings(configs, csfViewField);
}
ImmutableMap.Builder<String, FieldInfo> builder = ImmutableMap.builder();
for (FieldInfo info : fieldSettingsMapById.values()) {
info.getFieldType().freeze();
builder.put(info.getName(), info);
}
this.fieldSettingsMapByName = builder.build();
ImmutableMap.Builder<String, FieldWeightDefault> fieldWeightMapBuilder = ImmutableMap.builder();
for (FieldInfo fi : getFieldInfos()) {
// CSF fields are not searchable. All other fields are.
if (fi.getFieldType().isIndexedField()) {
fieldWeightMapBuilder.put(
fi.getName(),
new FieldWeightDefault(
fi.getFieldType().isTextSearchableByDefault(),
fi.getFieldType().getTextSearchableFieldWeight()));
}
}
this.fieldWeightMap = fieldWeightMapBuilder.build();
    // Create features with extra Earlybird derived fields; the extra fields won't change the
    // version, but they do change the checksum.
this.searchFeatureSchema = createSearchResultFeatureSchema(
featureSchemaVersionPrefix, fieldSettingsMapByName, featureConfigMapByName);
}
/**
* Add a set of features to a schema if they don't exist yet, and update the schema checksum.
   * If there is a conflict, a RuntimeException will be thrown.
   * The old map won't be touched; a new map will be returned with the old and new data combined.
*/
public static Map<Integer, ThriftSearchFeatureSchemaEntry> appendToFeatureSchema(
Map<Integer, ThriftSearchFeatureSchemaEntry> oldEntryMap,
Set<? extends SearchResultFeature> features) throws SchemaValidationException {
if (oldEntryMap == null) {
throw new SchemaValidationException(
"Cannot append features to schema, the entryMap is null");
}
// make a copy of the existing map
ImmutableMap.Builder<Integer, ThriftSearchFeatureSchemaEntry> builder =
ImmutableSortedMap.<Integer, ThriftSearchFeatureSchemaEntry>naturalOrder()
.putAll(oldEntryMap);
for (SearchResultFeature feature : features) {
if (oldEntryMap.containsKey(feature.getId())) {
FEATURES_EXISTED_IN_OLD_SCHEMA.increment();
} else {
builder.put(feature.getId(), new ThriftSearchFeatureSchemaEntry()
.setFeatureName(feature.getName())
.setFeatureType(feature.getType()));
}
}
return builder.build();
}
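  // Illustrative behavior (hypothetical id): if a feature with id 42 already exists in
  // oldEntryMap, appending another feature with id 42 only increments
  // FEATURES_EXISTED_IN_OLD_SCHEMA; the existing entry is kept and the new definition
  // is not added.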
/**
* Append external features to create a new schema.
* @param oldSchema The old schema to build on top of
   * @param features a set of features to be appended to the schema
   * @param versionSuffix the version suffix; if non-null, it will be appended to the
   * original schema's version.
* @return A new schema object with the appended fields
* @throws SchemaValidationException thrown when the checksum cannot be computed
*/
public static ThriftSearchFeatureSchema appendToCreateNewFeatureSchema(
ThriftSearchFeatureSchema oldSchema,
Set<ExternalTweetFeature> features,
@Nullable String versionSuffix) throws SchemaValidationException {
ThriftSearchFeatureSchema newSchema = new ThriftSearchFeatureSchema();
// copy over all the entries plus the new ones
newSchema.setEntries(appendToFeatureSchema(oldSchema.getEntries(), features));
ThriftSearchFeatureSchemaSpecifier spec = new ThriftSearchFeatureSchemaSpecifier();
    // the version is either inherited directly or extended with a suffix
Preconditions.checkArgument(versionSuffix == null || !versionSuffix.isEmpty());
spec.setVersion(versionSuffix == null
? oldSchema.getSchemaSpecifier().getVersion()
: oldSchema.getSchemaSpecifier().getVersion() + versionSuffix);
spec.setChecksum(getChecksum(newSchema.getEntries()));
newSchema.setSchemaSpecifier(spec);
return newSchema;
}
@Override
public FieldInfos getLuceneFieldInfos(Predicate<String> acceptedFields) {
List<org.apache.lucene.index.FieldInfo> acceptedFieldInfos = Lists.newArrayList();
for (FieldInfo fi : getFieldInfos()) {
if (acceptedFields == null || acceptedFields.apply(fi.getName())) {
acceptedFieldInfos.add(convert(fi.getName(), fi.getFieldId(), fi.getFieldType()));
}
}
return new FieldInfos(acceptedFieldInfos.toArray(
new org.apache.lucene.index.FieldInfo[acceptedFieldInfos.size()]));
}
private FieldInfo parseThriftFieldSettings(int fieldId, ThriftFieldConfiguration fieldConfig,
Map<Integer, ThriftFieldConfiguration> csfViewFields)
throws SchemaValidationException {
FieldInfo fieldInfo
= new FieldInfo(fieldId, fieldConfig.getFieldName(), new EarlybirdFieldType());
ThriftFieldSettings fieldSettings = fieldConfig.getSettings();
boolean settingFound = false;
if (fieldSettings.isSetIndexedFieldSettings()) {
if (fieldSettings.isSetCsfFieldSettings() || fieldSettings.isSetCsfViewSettings()) {
throw new SchemaValidationException("ThriftFieldSettings: Only one of "
+ "'indexedFieldSettings', 'csfFieldSettings', 'csfViewSettings' can be set.");
}
applyIndexedFieldSettings(fieldInfo, fieldSettings.getIndexedFieldSettings());
settingFound = true;
}
if (fieldSettings.isSetCsfFieldSettings()) {
if (fieldSettings.isSetIndexedFieldSettings() || fieldSettings.isSetCsfViewSettings()) {
throw new SchemaValidationException("ThriftFieldSettings: Only one of "
+ "'indexedFieldSettings', 'csfFieldSettings', 'csfViewSettings' can be set.");
}
applyCsfFieldSettings(fieldInfo, fieldSettings.getCsfFieldSettings());
settingFound = true;
}
if (fieldSettings.isSetFacetFieldSettings()) {
if (!fieldSettings.isSetIndexedFieldSettings() && !(fieldSettings.isSetCsfFieldSettings()
&& fieldSettings.getFacetFieldSettings().isUseCSFForFacetCounting()
&& CAN_FACET_ON_CSF_TYPES.contains(fieldSettings.getCsfFieldSettings().getCsfType()))) {
throw new SchemaValidationException("ThriftFieldSettings: 'facetFieldSettings' can only be "
+ "used in combination with 'indexedFieldSettings' or with 'csfFieldSettings' "
+ "where 'isUseCSFForFacetCounting' was set to true and ThriftCSFType is a type that "
+ "can be faceted on.");
}
applyFacetFieldSettings(fieldInfo, fieldSettings.getFacetFieldSettings());
settingFound = true;
}
if (fieldSettings.isSetCsfViewSettings()) {
if (fieldSettings.isSetIndexedFieldSettings() || fieldSettings.isSetCsfFieldSettings()) {
throw new SchemaValidationException("ThriftFieldSettings: Only one of "
+ "'indexedFieldSettings', 'csfFieldSettings', 'csfViewSettings' can be set.");
}
      // Add this field now, but apply the settings later, to make sure the base field was
      // added properly first.
csfViewFields.put(fieldId, fieldConfig);
settingFound = true;
}
if (!settingFound) {
      throw new SchemaValidationException("ThriftFieldSettings: One of 'indexedFieldSettings', "
          + "'csfFieldSettings', 'facetFieldSettings' or 'csfViewSettings' must be set.");
}
// search field settings are optional
if (fieldSettings.isSetSearchFieldSettings()) {
if (!fieldSettings.isSetIndexedFieldSettings()) {
throw new SchemaValidationException(
"ThriftFieldSettings: 'searchFieldSettings' can only be "
+ "used in combination with 'indexedFieldSettings'");
}
applySearchFieldSettings(fieldInfo, fieldSettings.getSearchFieldSettings());
}
return fieldInfo;
}
private void applyCsfFieldSettings(FieldInfo fieldInfo, ThriftCSFFieldSettings settings)
throws SchemaValidationException {
// csfType is required - no need to check if it's set
fieldInfo.getFieldType().setDocValuesType(DocValuesType.NUMERIC);
fieldInfo.getFieldType().setCsfType(settings.getCsfType());
if (settings.isVariableLength()) {
fieldInfo.getFieldType().setDocValuesType(DocValuesType.BINARY);
fieldInfo.getFieldType().setCsfVariableLength();
} else {
if (settings.isSetFixedLengthSettings()) {
fieldInfo.getFieldType().setCsfFixedLengthSettings(
settings.getFixedLengthSettings().getNumValuesPerDoc(),
settings.getFixedLengthSettings().isUpdateable());
if (settings.getFixedLengthSettings().getNumValuesPerDoc() > 1) {
fieldInfo.getFieldType().setDocValuesType(DocValuesType.BINARY);
}
} else {
throw new SchemaValidationException(
"ThriftCSFFieldSettings: Either variableLength should be set to 'true', "
+ "or fixedLengthSettings should be set.");
}
}
fieldInfo.getFieldType().setCsfLoadIntoRam(settings.isLoadIntoRAM());
if (settings.isSetDefaultValue()) {
fieldInfo.getFieldType().setCsfDefaultValue(settings.getDefaultValue());
}
}
private void applyCsfViewFieldSettings(FieldInfo fieldInfo, FieldInfo baseField,
ThriftCSFViewSettings settings)
throws SchemaValidationException {
// csfType is required - no need to check if it's set
fieldInfo.getFieldType().setDocValuesType(DocValuesType.NUMERIC);
fieldInfo.getFieldType().setCsfType(settings.getCsfType());
fieldInfo.getFieldType().setCsfFixedLengthSettings(1 /* numValuesPerDoc*/,
false /* updateable*/);
fieldInfo.getFieldType().setCsfViewSettings(fieldInfo.getName(), settings, baseField);
}
private void applyFacetFieldSettings(FieldInfo fieldInfo, ThriftFacetFieldSettings settings) {
if (settings.isSetFacetName()) {
fieldInfo.getFieldType().setFacetName(settings.getFacetName());
} else {
// fall back to field name if no facet name is explicitly provided
fieldInfo.getFieldType().setFacetName(fieldInfo.getName());
}
fieldInfo.getFieldType().setStoreFacetSkiplist(settings.isStoreSkiplist());
fieldInfo.getFieldType().setStoreFacetOffensiveCounters(settings.isStoreOffensiveCounters());
fieldInfo.getFieldType().setUseCSFForFacetCounting(settings.isUseCSFForFacetCounting());
}
private void applyIndexedFieldSettings(FieldInfo fieldInfo, ThriftIndexedFieldSettings settings)
throws SchemaValidationException {
fieldInfo.getFieldType().setIndexedField(true);
fieldInfo.getFieldType().setStored(settings.isStored());
fieldInfo.getFieldType().setTokenized(settings.isTokenized());
fieldInfo.getFieldType().setStoreTermVectors(settings.isStoreTermVectors());
fieldInfo.getFieldType().setStoreTermVectorOffsets(settings.isStoreTermVectorOffsets());
fieldInfo.getFieldType().setStoreTermVectorPositions(settings.isStoreTermVectorPositions());
fieldInfo.getFieldType().setStoreTermVectorPayloads(settings.isStoreTermVectorPayloads());
fieldInfo.getFieldType().setOmitNorms(settings.isOmitNorms());
fieldInfo.getFieldType().setIndexHFTermPairs(settings.isIndexHighFreqTermPairs());
fieldInfo.getFieldType().setUseTweetSpecificNormalization(
settings.deprecated_performTweetSpecificNormalizations);
if (settings.isSetIndexOptions()) {
switch (settings.getIndexOptions()) {
case DOCS_ONLY :
fieldInfo.getFieldType().setIndexOptions(IndexOptions.DOCS);
break;
case DOCS_AND_FREQS :
fieldInfo.getFieldType().setIndexOptions(IndexOptions.DOCS_AND_FREQS);
break;
case DOCS_AND_FREQS_AND_POSITIONS :
fieldInfo.getFieldType().setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
break;
case DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS :
fieldInfo.getFieldType().setIndexOptions(
IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
break;
default:
throw new SchemaValidationException("Unknown value for IndexOptions: "
+ settings.getIndexOptions());
}
} else if (settings.isIndexed()) {
// default for backward-compatibility
fieldInfo.getFieldType().setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
}
fieldInfo.getFieldType().setStorePerPositionPayloads(settings.isStorePerPositionPayloads());
fieldInfo.getFieldType().setDefaultPayloadLength(
settings.getDefaultPerPositionPayloadLength());
fieldInfo.getFieldType().setBecomesImmutable(!settings.isSupportOutOfOrderAppends());
fieldInfo.getFieldType().setSupportOrderedTerms(settings.isSupportOrderedTerms());
fieldInfo.getFieldType().setSupportTermTextLookup(settings.isSupportTermTextLookup());
if (settings.isSetNumericFieldSettings()) {
fieldInfo.getFieldType().setNumericFieldSettings(
new IndexedNumericFieldSettings(settings.getNumericFieldSettings()));
}
if (settings.isSetTokenStreamSerializer()) {
fieldInfo.getFieldType().setTokenStreamSerializerBuilder(
buildTokenStreamSerializerProvider(settings.getTokenStreamSerializer()));
}
}
private void applySearchFieldSettings(FieldInfo fieldInfo, ThriftSearchFieldSettings settings)
throws SchemaValidationException {
fieldInfo.getFieldType().setTextSearchableFieldWeight(
(float) settings.getTextSearchableFieldWeight());
fieldInfo.getFieldType().setTextSearchableByDefault(settings.isTextDefaultSearchable());
}
private void validate(FieldInfo fieldInfo) throws SchemaValidationException {
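    // No per-field validation is currently performed.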
}
private TokenStreamSerializer.Builder buildTokenStreamSerializerProvider(
final ThriftTokenStreamSerializer settings) {
TokenStreamSerializer.Builder builder = TokenStreamSerializer.builder();
for (String serializerName : settings.getAttributeSerializerClassNames()) {
try {
builder.add((TokenStreamSerializer.AttributeSerializer) Class.forName(serializerName)
.newInstance());
      } catch (InstantiationException | IllegalAccessException | ClassNotFoundException e) {
        // Chain the cause so reflection and class-loading failures stay debuggable.
        throw new RuntimeException(
            "Unable to instantiate AttributeSerializer for name " + serializerName, e);
      }
}
return builder;
}
@Override
public FacetsConfig getFacetsConfig() {
FacetsConfig facetsConfig = new FacetsConfig();
for (String facetName : facetNameToFieldMap.keySet()) {
      // Set multiValued = true by default: with SortedSetDocValues facets, multiValued
      // true vs. false makes no difference for the actual faceting, only for the
      // validation of the values.
facetsConfig.setMultiValued(facetName, true);
}
return facetsConfig;
}
@Override
public Analyzer getDefaultAnalyzer(ThriftAnalyzer override) {
if (override != null) {
return analyzerFactory.getAnalyzer(override);
}
if (defaultAnalyzer != null) {
return analyzerFactory.getAnalyzer(defaultAnalyzer);
}
return new SearchWhitespaceAnalyzer();
}
@Override
public ImmutableCollection<FieldInfo> getFieldInfos() {
return fieldSettingsMapById.values();
}
/**
* This is the preferred method to check whether a field configuration is in schema.
* One can also use getFieldInfo and do null checks, but should be careful about excessive
* warning logging resulting from looking up fields not in schema.
*/
@Override
public boolean hasField(int fieldConfigId) {
return fieldSettingsMapById.containsKey(fieldConfigId);
}
/**
* This is the preferred method to check whether a field configuration is in schema.
* One can also use getFieldInfo and do null checks, but should be careful about excessive
* warning logging resulting from looking up fields not in schema.
*/
@Override
public boolean hasField(String fieldName) {
return fieldSettingsMapByName.containsKey(fieldName);
}
/**
   * Get the FieldInfo for the given field id, or null if the field does not exist.
   * If the goal is to check whether a field is in the schema, use {@link #hasField(int)} instead,
   * since this lookup returns null without logging a warning.
*/
@Override
@Nullable
public FieldInfo getFieldInfo(int fieldConfigId) {
return getFieldInfo(fieldConfigId, null);
}
private org.apache.lucene.index.FieldInfo convert(String fieldName,
int index,
EarlybirdFieldType type) {
return new org.apache.lucene.index.FieldInfo(
fieldName, // String name
index, // int number
type.storeTermVectors(), // boolean storeTermVector
type.omitNorms(), // boolean omitNorms
type.isStorePerPositionPayloads(), // boolean storePayloads
type.indexOptions(), // IndexOptions indexOptions
type.docValuesType(), // DocValuesType docValues
-1, // long dvGen
Maps.<String, String>newHashMap(), // Map<String, String> attributes
0, // int pointDataDimensionCount
0, // int pointIndexDimensionCount
0, // int pointNumBytes
false); // boolean softDeletesField
}
/**
* Get FieldInfo for the given field name, or null if the field does not exist.
*/
@Override
@Nullable
public FieldInfo getFieldInfo(String fieldName) {
return fieldSettingsMapByName.get(fieldName);
}
@Override
public String getFieldName(int fieldConfigId) {
FieldInfo fieldInfo = fieldSettingsMapById.get(fieldConfigId);
return fieldInfo != null ? fieldInfo.getName() : null;
}
@Override
public FieldInfo getFieldInfo(int fieldConfigId, ThriftFieldConfiguration override) {
FieldInfo fieldInfo = fieldSettingsMapById.get(fieldConfigId);
if (fieldInfo == null) {
// This method is used to check the availability of fields by IDs,
// so no warning is logged here (would be too verbose otherwise).
return null;
}
if (override != null) {
try {
return merge(fieldConfigId, fieldInfo, override);
} catch (SchemaValidationException e) {
throw new RuntimeException(e);
}
}
return fieldInfo;
}
@Override
public int getNumFacetFields() {
return numFacetFields;
}
@Override
public FieldInfo getFacetFieldByFacetName(String facetName) {
return facetNameToFieldMap.get(facetName);
}
@Override
public FieldInfo getFacetFieldByFieldName(String fieldName) {
FieldInfo fieldInfo = getFieldInfo(fieldName);
return fieldInfo != null && fieldInfo.getFieldType().isFacetField() ? fieldInfo : null;
}
@Override
public Collection<FieldInfo> getFacetFields() {
return facetNameToFieldMap.values();
}
@Override
public Collection<FieldInfo> getCsfFacetFields() {
return csfFacetFields;
}
@Override
public String getVersionDescription() {
return versionDesc;
}
@Override
public int getMajorVersionNumber() {
return majorVersionNumber;
}
@Override
public int getMinorVersionNumber() {
return minorVersionNumber;
}
@Override
public boolean isVersionOfficial() {
return isVersionOfficial;
}
/**
* Parses a version string like "16: renamed field x into y" into a version number and
* a string description.
* @return a Pair of the version number and the description
*/
private static Pair<Integer, String> parseVersionString(String version)
throws SchemaValidationException {
Preconditions.checkNotNull(version, "Schema must have a version number and description.");
int colonIndex = version.indexOf(':');
if (colonIndex == -1) {
throw new SchemaValidationException("Malformed version string: " + version);
}
try {
int versionNumber = Integer.parseInt(version.substring(0, colonIndex));
String versionDesc = version.substring(colonIndex + 1);
return Pair.of(versionNumber, versionDesc);
} catch (Exception e) {
throw new SchemaValidationException("Malformed version string: " + version, e);
}
}
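  // For example (a hypothetical version string): parseVersionString("16: added field z")
  // returns Pair.of(16, " added field z"); everything after the first ':' is kept verbatim
  // as the description, including the leading space.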
@Override
public Map<String, FieldWeightDefault> getFieldWeightMap() {
return fieldWeightMap;
}
/**
   * Build the feature maps so that a feature's configuration can be looked up by name or by id.
   * @return a Pair of immutable maps: one keyed on feature name, one keyed on feature field id.
*/
private Pair<ImmutableMap<String, FeatureConfiguration>,
ImmutableMap<Integer, FeatureConfiguration>> buildFeatureMaps(
final Map<Integer, ThriftFieldConfiguration> csvViewFields)
throws SchemaValidationException {
final ImmutableMap.Builder<String, FeatureConfiguration> featureConfigMapByNameBuilder =
ImmutableMap.builder();
final ImmutableMap.Builder<Integer, FeatureConfiguration> featureConfigMapByIdBuilder =
ImmutableMap.builder();
for (final Map.Entry<Integer, ThriftFieldConfiguration> entry : csvViewFields.entrySet()) {
ThriftFieldSettings fieldSettings = entry.getValue().getSettings();
FieldInfo fieldInfo = getFieldInfo(entry.getKey());
FieldInfo baseFieldInfo =
getFieldInfo(fieldSettings.getCsfViewSettings().getBaseFieldConfigId());
if (baseFieldInfo == null) {
throw new SchemaValidationException("Base field (id="
+ fieldSettings.getCsfViewSettings().getBaseFieldConfigId() + ") not found.");
}
applyCsfViewFieldSettings(fieldInfo, baseFieldInfo, fieldSettings.getCsfViewSettings());
FeatureConfiguration featureConfig = fieldInfo.getFieldType()
.getCsfViewFeatureConfiguration();
if (featureConfig != null) {
featureConfigMapByNameBuilder.put(fieldInfo.getName(), featureConfig);
featureConfigMapByIdBuilder.put(fieldInfo.getFieldId(), featureConfig);
}
}
return Pair.of(featureConfigMapByNameBuilder.build(), featureConfigMapByIdBuilder.build());
}
@Override
public FeatureConfiguration getFeatureConfigurationByName(String featureName) {
return featureConfigMapByName.get(featureName);
}
@Override
public FeatureConfiguration getFeatureConfigurationById(int featureFieldId) {
return Preconditions.checkNotNull(featureConfigMapById.get(featureFieldId),
"Field ID: " + featureFieldId);
}
@Override
@Nullable
public ThriftCSFType getCSFFieldType(String fieldName) {
FieldInfo fieldInfo = getFieldInfo(fieldName);
if (fieldInfo == null) {
return null;
}
EarlybirdFieldType fieldType = fieldInfo.getFieldType();
if (fieldType.docValuesType() != org.apache.lucene.index.DocValuesType.NUMERIC) {
return null;
}
return fieldType.getCsfType();
}
@Override
public ImmutableSchemaInterface getSchemaSnapshot() {
return this;
}
private FieldInfo merge(int fieldConfigId,
FieldInfo fieldInfo,
ThriftFieldConfiguration overrideConfig)
throws SchemaValidationException {
throw new UnsupportedOperationException("Field override config not supported");
}
@Override
public ThriftSearchFeatureSchema getSearchFeatureSchema() {
return searchFeatureSchema;
}
@Override
public ImmutableMap<Integer, FeatureConfiguration> getFeatureIdToFeatureConfig() {
return featureConfigMapById;
}
@Override
public ImmutableMap<String, FeatureConfiguration> getFeatureNameToFeatureConfig() {
return featureConfigMapByName;
}
private ThriftSearchFeatureSchema createSearchResultFeatureSchema(
String featureSchemaVersionPrefix,
Map<String, FieldInfo> allFieldSettings,
Map<String, FeatureConfiguration> featureConfigurations) throws SchemaValidationException {
final ImmutableMap.Builder<Integer, ThriftSearchFeatureSchemaEntry> builder =
new ImmutableMap.Builder<>();
for (Map.Entry<String, FieldInfo> field : allFieldSettings.entrySet()) {
FeatureConfiguration featureConfig = featureConfigurations.get(field.getKey());
if (featureConfig == null) {
        // This is either a field unrelated to CSFs, or a plain CSF field (not a CSF view).
continue;
}
// This is a csfView field.
if (featureConfig.getOutputType() == null) {
LOG.info("Skip unused fieldschemas: {} for search feature schema.", field.getKey());
continue;
}
ThriftSearchFeatureType featureType = getResultFeatureType(featureConfig.getOutputType());
if (featureType != null) {
builder.put(
field.getValue().getFieldId(),
new ThriftSearchFeatureSchemaEntry(field.getKey(), featureType));
} else {
LOG.error("Invalid CSFType encountered for csf field: {}", field.getKey());
}
}
Map<Integer, ThriftSearchFeatureSchemaEntry> indexOnlySchemaEntries = builder.build();
    // Add Earlybird derived features; they are defined in ExternalTweetFeature and used in the
    // scoring function. Viewed from outside Earlybird, they are no different from the
    // auto-generated index-based features.
Map<Integer, ThriftSearchFeatureSchemaEntry> entriesWithEBFeatures =
appendToFeatureSchema(
indexOnlySchemaEntries, ExternalTweetFeature.EARLYBIRD_DERIVED_FEATURES);
// Add other features needed for tweet ranking from EarlybirdRankingDerivedFeature.
Map<Integer, ThriftSearchFeatureSchemaEntry> allSchemaEntries = appendToFeatureSchema(
entriesWithEBFeatures, ExternalTweetFeature.EARLYBIRD_RANKING_DERIVED_FEATURES);
long schemaEntriesChecksum = getChecksum(allSchemaEntries);
SearchLongGauge.export("feature_schema_checksum", new AtomicLong(schemaEntriesChecksum));
String schemaVersion = String.format(
"%s.%d.%d", featureSchemaVersionPrefix, majorVersionNumber, minorVersionNumber);
ThriftSearchFeatureSchemaSpecifier schemaSpecifier =
new ThriftSearchFeatureSchemaSpecifier(schemaVersion, schemaEntriesChecksum);
ThriftSearchFeatureSchema schema = new ThriftSearchFeatureSchema();
schema.setSchemaSpecifier(schemaSpecifier);
schema.setEntries(allSchemaEntries);
return schema;
}
// Serializes schemaEntries to a byte array, and computes a CRC32 checksum of the array.
// The serialization needs to be stable: if schemaEntries1.equals(schemaEntries2), we want
  // this method to produce the same checksum for schemaEntries1 and schemaEntries2, even if
// the checksums are computed in different JVMs, etc.
private static long getChecksum(Map<Integer, ThriftSearchFeatureSchemaEntry> schemaEntries)
throws SchemaValidationException {
SortedMap<Integer, ThriftSearchFeatureSchemaEntry> sortedSchemaEntries =
new TreeMap<Integer, ThriftSearchFeatureSchemaEntry>(schemaEntries);
CRC32OutputStream crc32OutputStream = new CRC32OutputStream();
ObjectOutputStream objectOutputStream = null;
try {
objectOutputStream = new ObjectOutputStream(crc32OutputStream);
for (Integer fieldId : sortedSchemaEntries.keySet()) {
objectOutputStream.writeObject(fieldId);
ThriftSearchFeatureSchemaEntry schemaEntry = sortedSchemaEntries.get(fieldId);
objectOutputStream.writeObject(schemaEntry.getFeatureName());
objectOutputStream.writeObject(schemaEntry.getFeatureType());
}
objectOutputStream.flush();
return crc32OutputStream.getValue();
} catch (IOException e) {
throw new SchemaValidationException("Could not serialize feature schema entries.", e);
} finally {
Preconditions.checkNotNull(objectOutputStream);
try {
objectOutputStream.close();
} catch (IOException e) {
throw new SchemaValidationException("Could not close ObjectOutputStream.", e);
}
}
}
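  // Note (illustrative): because the entries are first copied into a TreeMap keyed by
  // field id, two equal maps built in different insertion orders serialize identically
  // and therefore produce the same CRC32 value.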
/**
* Get the search feature type based on the csf type.
* @param csfType the column stride field type for the data
* @return the corresponding search feature type
*/
@VisibleForTesting
public static ThriftSearchFeatureType getResultFeatureType(ThriftCSFType csfType) {
switch (csfType) {
case INT:
case BYTE:
return ThriftSearchFeatureType.INT32_VALUE;
case BOOLEAN:
return ThriftSearchFeatureType.BOOLEAN_VALUE;
case FLOAT:
case DOUBLE:
return ThriftSearchFeatureType.DOUBLE_VALUE;
case LONG:
return ThriftSearchFeatureType.LONG_VALUE;
default:
return null;
}
}
}
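
To make the CSF-type mapping of getResultFeatureType above concrete, a small illustrative driver (the demo class is hypothetical; the method and enum values are from the code as shown):

import com.twitter.search.common.features.thrift.ThriftSearchFeatureType;
import com.twitter.search.common.schema.ImmutableSchema;
import com.twitter.search.common.schema.thriftjava.ThriftCSFType;

final class FeatureTypeMappingDemo {
  public static void main(String[] args) {
    // BYTE and INT both widen to INT32_VALUE; FLOAT and DOUBLE collapse to DOUBLE_VALUE.
    ThriftSearchFeatureType byteType = ImmutableSchema.getResultFeatureType(ThriftCSFType.BYTE);
    ThriftSearchFeatureType dblType = ImmutableSchema.getResultFeatureType(ThriftCSFType.DOUBLE);
    System.out.println(byteType); // INT32_VALUE
    System.out.println(dblType);  // DOUBLE_VALUE
  }
}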

View File

@ -1,44 +0,0 @@
package com.twitter.search.common.schema;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexOptions;
/**
* A Lucene numeric field, similar to the LegacyIntField, LegacyLongField, etc. Lucene classes that
* were removed in Lucene 7.0.0.
*/
public final class NumericField extends Field {
private static final FieldType NUMERIC_FIELD_TYPE = new FieldType();
static {
NUMERIC_FIELD_TYPE.setTokenized(true);
NUMERIC_FIELD_TYPE.setOmitNorms(true);
NUMERIC_FIELD_TYPE.setIndexOptions(IndexOptions.DOCS);
NUMERIC_FIELD_TYPE.freeze();
}
/**
* Creates a new integer field with the given name and value.
*/
public static NumericField newIntField(String fieldName, int value) {
NumericField field = new NumericField(fieldName);
field.fieldsData = Integer.valueOf(value);
return field;
}
/**
* Creates a new long field with the given name and value.
*/
public static NumericField newLongField(String fieldName, long value) {
NumericField field = new NumericField(fieldName);
field.fieldsData = Long.valueOf(value);
return field;
}
// We could replace the static methods with constructors, but I think that would make it much
// easier to accidentally use NumericField(String, int) instead of NumericField(String, long),
  // for example, leading to hard-to-debug errors.
private NumericField(String fieldName) {
super(fieldName, NUMERIC_FIELD_TYPE);
}
}
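
A short usage sketch for the factory methods above; the field names are illustrative, not taken from the Earlybird schema:

import com.twitter.search.common.schema.NumericField;
import org.apache.lucene.document.Document;

final class NumericFieldDemo {
  public static void main(String[] args) {
    Document doc = new Document();
    doc.add(NumericField.newIntField("like_count", 42));            // fieldsData boxed as Integer
    doc.add(NumericField.newLongField("tweet_id", 1234567890123L)); // fieldsData boxed as Long
  }
}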

View File

@ -1,693 +0,0 @@
package com.twitter.search.common.schema;
import java.util.Map;
import java.util.Set;
import javax.annotation.Nullable;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Sets;
import com.twitter.common.text.util.CharSequenceTermAttributeSerializer;
import com.twitter.common.text.util.PositionIncrementAttributeSerializer;
import com.twitter.common.text.util.TokenStreamSerializer;
import com.twitter.common.text.util.TokenTypeAttributeSerializer;
import com.twitter.search.common.schema.base.FeatureConfiguration;
import com.twitter.search.common.schema.base.FieldNameToIdMapping;
import com.twitter.search.common.schema.thriftjava.ThriftCSFFieldSettings;
import com.twitter.search.common.schema.thriftjava.ThriftCSFType;
import com.twitter.search.common.schema.thriftjava.ThriftCSFViewSettings;
import com.twitter.search.common.schema.thriftjava.ThriftFacetFieldSettings;
import com.twitter.search.common.schema.thriftjava.ThriftFeatureNormalizationType;
import com.twitter.search.common.schema.thriftjava.ThriftFeatureUpdateConstraint;
import com.twitter.search.common.schema.thriftjava.ThriftFieldConfiguration;
import com.twitter.search.common.schema.thriftjava.ThriftFieldSettings;
import com.twitter.search.common.schema.thriftjava.ThriftFixedLengthCSFSettings;
import com.twitter.search.common.schema.thriftjava.ThriftIndexOptions;
import com.twitter.search.common.schema.thriftjava.ThriftIndexedFieldSettings;
import com.twitter.search.common.schema.thriftjava.ThriftIndexedNumericFieldSettings;
import com.twitter.search.common.schema.thriftjava.ThriftNumericType;
import com.twitter.search.common.schema.thriftjava.ThriftSchema;
import com.twitter.search.common.schema.thriftjava.ThriftSearchFieldSettings;
import com.twitter.search.common.schema.thriftjava.ThriftTokenStreamSerializer;
import com.twitter.search.common.util.analysis.CharTermAttributeSerializer;
import com.twitter.search.common.util.analysis.IntTermAttributeSerializer;
import com.twitter.search.common.util.analysis.LongTermAttributeSerializer;
import com.twitter.search.common.util.analysis.PayloadAttributeSerializer;
public class SchemaBuilder {
public static final String CSF_VIEW_NAME_SEPARATOR = ".";
protected final ThriftSchema schema = new ThriftSchema();
protected final FieldNameToIdMapping idMapping;
protected final int tokenStreamSerializerVersion;
// As of now, we do not allow two fields to share the same field name.
// This set is used to perform this check.
private final Set<String> fieldNameSet = Sets.newHashSet();
/**
   * Construct a schema builder with the given FieldNameToIdMapping.
* A SchemaBuilder is used to build a ThriftSchema incrementally.
*/
public SchemaBuilder(FieldNameToIdMapping idMapping,
TokenStreamSerializer.Version tokenStreamSerializerVersion) {
this.idMapping = idMapping;
Preconditions.checkArgument(
tokenStreamSerializerVersion == TokenStreamSerializer.Version.VERSION_2);
this.tokenStreamSerializerVersion = tokenStreamSerializerVersion.ordinal();
}
/**
* Build ThriftSchema using settings accumulated so far.
*/
public final ThriftSchema build() {
return schema;
}
/**
* Uses fieldName also as facetName.
*/
public final SchemaBuilder withFacetConfigs(String fieldName,
boolean storeSkipList,
boolean storeOffensiveCounters,
boolean useCSFForFacetCounting) {
return withFacetConfigs(
fieldName,
fieldName,
storeSkipList,
storeOffensiveCounters,
useCSFForFacetCounting);
}
/**
* Add facet field configuration.
*/
public final SchemaBuilder withFacetConfigs(String fieldName,
String facetName,
boolean storeSkipList,
boolean storeOffensiveCounters,
boolean useCSFForFacetCounting) {
if (!shouldIncludeField(fieldName)) {
return this;
}
ThriftFacetFieldSettings facetSettings = new ThriftFacetFieldSettings();
    // In most cases the facet name is the same as the field name (see the overload above).
facetSettings.setFacetName(facetName);
facetSettings.setStoreSkiplist(storeSkipList);
facetSettings.setStoreOffensiveCounters(storeOffensiveCounters);
facetSettings.setUseCSFForFacetCounting(useCSFForFacetCounting);
int fieldId = idMapping.getFieldID(fieldName);
ThriftFieldConfiguration fieldConfiguration = schema.getFieldConfigs().get(fieldId);
Preconditions.checkNotNull(fieldConfiguration,
"In Earlybird, a facet field must be indexed. "
+ "No ThriftIndexedFieldSettings found for field " + fieldName);
fieldConfiguration.getSettings().setFacetFieldSettings(facetSettings);
return this;
}
/**
* Configure the given field ID to be used for partitioning.
*/
public final SchemaBuilder withPartitionFieldId(int partitionFieldId) {
schema.setPartitionFieldId(partitionFieldId);
return this;
}
/**
* Add a column stride field into schema.
*/
public final SchemaBuilder withColumnStrideField(String fieldName,
ThriftCSFType type,
int numValuesPerDoc,
boolean updatable,
boolean loadIntoRam) {
return withColumnStrideField(fieldName, type, numValuesPerDoc, updatable, loadIntoRam, null);
}
/**
* Add a column stride field into schema that is variable length.
*/
public final SchemaBuilder withBinaryColumnStrideField(String fieldName,
boolean loadIntoRam) {
if (!shouldIncludeField(fieldName)) {
return this;
}
ThriftCSFFieldSettings csfFieldSettings = new ThriftCSFFieldSettings();
csfFieldSettings.setCsfType(ThriftCSFType.BYTE)
.setVariableLength(true)
.setLoadIntoRAM(loadIntoRam);
ThriftFieldSettings fieldSettings =
new ThriftFieldSettings().setCsfFieldSettings(csfFieldSettings);
ThriftFieldConfiguration fieldConf =
new ThriftFieldConfiguration(fieldName).setSettings(fieldSettings);
putIntoFieldConfigs(idMapping.getFieldID(fieldName), fieldConf);
return this;
}
/**
* Add a column stride field into schema which has a default value.
*/
public final SchemaBuilder withColumnStrideField(String fieldName,
ThriftCSFType type,
int numValuesPerDoc,
boolean updatable,
boolean loadIntoRam,
Long defaultValue) {
if (!shouldIncludeField(fieldName)) {
return this;
}
ThriftCSFFieldSettings csfFieldSettings = new ThriftCSFFieldSettings();
csfFieldSettings.setCsfType(type)
.setVariableLength(false)
.setFixedLengthSettings(
new ThriftFixedLengthCSFSettings()
.setNumValuesPerDoc(numValuesPerDoc)
.setUpdateable(updatable))
.setLoadIntoRAM(loadIntoRam);
if (defaultValue != null) {
csfFieldSettings.setDefaultValue(defaultValue);
}
ThriftFieldSettings fieldSettings =
new ThriftFieldSettings().setCsfFieldSettings(csfFieldSettings);
ThriftFieldConfiguration fieldConf =
new ThriftFieldConfiguration(fieldName).setSettings(fieldSettings);
putIntoFieldConfigs(idMapping.getFieldID(fieldName), fieldConf);
return this;
}
/**
* Add a CSF view into schema. A view is a portion of another CSF.
*/
public final SchemaBuilder withColumnStrideFieldView(
String fieldName,
ThriftCSFType csfType,
ThriftCSFType outputCSFType,
String baseFieldName,
int valueIndex,
int bitStartPosition,
int bitLength,
ThriftFeatureNormalizationType featureNormalizationType,
@Nullable Set<ThriftFeatureUpdateConstraint> constraints) {
if (!shouldIncludeField(fieldName)) {
return this;
}
int baseFieldConfigID = idMapping.getFieldID(baseFieldName);
ThriftCSFViewSettings csfViewSettings = new ThriftCSFViewSettings()
.setBaseFieldConfigId(baseFieldConfigID)
.setCsfType(csfType)
.setValueIndex(valueIndex)
.setBitStartPosition(bitStartPosition)
.setBitLength(bitLength);
if (outputCSFType != null) {
csfViewSettings.setOutputCSFType(outputCSFType);
}
if (featureNormalizationType != ThriftFeatureNormalizationType.NONE) {
csfViewSettings.setNormalizationType(featureNormalizationType);
}
if (constraints != null) {
csfViewSettings.setFeatureUpdateConstraints(constraints);
}
ThriftFieldSettings fieldSettings = new ThriftFieldSettings()
.setCsfViewSettings(csfViewSettings);
ThriftFieldConfiguration fieldConf = new ThriftFieldConfiguration(fieldName)
.setSettings(fieldSettings);
Map<Integer, ThriftFieldConfiguration> fieldConfigs = schema.getFieldConfigs();
verifyCSFViewSettings(fieldConfigs, fieldConf);
putIntoFieldConfigs(idMapping.getFieldID(fieldName), fieldConf);
return this;
}
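  // Hypothetical usage: a view over bits 0-7 of the first int of a base CSF field named
  // "encoded_features". Note that verifyCSFViewSettings requires the view name to start
  // with the base field name plus CSF_VIEW_NAME_SEPARATOR:
  //   builder.withColumnStrideFieldView("encoded_features.like_count",
  //       ThriftCSFType.INT, ThriftCSFType.INT, "encoded_features",
  //       0 /* valueIndex */, 0 /* bitStartPosition */, 8 /* bitLength */,
  //       ThriftFeatureNormalizationType.NONE, null /* constraints */);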
/**
* Sanity checks for CSF view settings.
*/
public static void verifyCSFViewSettings(Map<Integer, ThriftFieldConfiguration> fieldConfigs,
ThriftFieldConfiguration fieldConf) {
Preconditions.checkNotNull(fieldConf.getSettings());
Preconditions.checkNotNull(fieldConf.getSettings().getCsfViewSettings());
ThriftCSFViewSettings csfViewSettings = fieldConf.getSettings().getCsfViewSettings();
if (fieldConfigs != null) {
ThriftFieldConfiguration baseFieldConfig = fieldConfigs.get(
csfViewSettings.getBaseFieldConfigId());
if (baseFieldConfig != null) {
String baseFieldName = baseFieldConfig.getFieldName();
String expectedViewNamePrefix = baseFieldName + CSF_VIEW_NAME_SEPARATOR;
if (fieldConf.getFieldName().startsWith(expectedViewNamePrefix)) {
ThriftFieldSettings baseFieldSettings = baseFieldConfig.getSettings();
ThriftCSFFieldSettings baseFieldCSFSettings = baseFieldSettings.getCsfFieldSettings();
if (baseFieldCSFSettings != null) {
if (!baseFieldCSFSettings.isVariableLength()
&& baseFieldCSFSettings.getFixedLengthSettings() != null) {
ThriftCSFType baseCSFType = baseFieldCSFSettings.getCsfType();
switch (baseCSFType) {
case BYTE:
checkCSFViewPositions(baseFieldCSFSettings, 8, csfViewSettings);
break;
case INT:
checkCSFViewPositions(baseFieldCSFSettings, 32, csfViewSettings);
break;
default:
throw new IllegalStateException("Base field: " + baseFieldName
+ " is of a non-supported CSFType: " + baseCSFType);
}
} else {
throw new IllegalStateException("Base field: " + baseFieldName
+ " must be a fixed-length CSF field");
}
} else {
throw new IllegalStateException("Base field: " + baseFieldName + " is not a CSF field");
}
} else {
throw new IllegalStateException("View field name for baseFieldConfigID: "
+ csfViewSettings.getBaseFieldConfigId() + " must start with: '"
+ expectedViewNamePrefix + "'");
}
} else {
throw new IllegalStateException("Can't add a view, no field defined for base fieldID: "
+ csfViewSettings.getBaseFieldConfigId());
}
} else {
throw new IllegalStateException("Can't add a view, no field configs defined.");
}
}
private static void checkCSFViewPositions(ThriftCSFFieldSettings baseFieldCSFSettings,
int bitsPerValue,
ThriftCSFViewSettings csfViewSettings) {
ThriftFixedLengthCSFSettings fixedLengthCSFSettings =
baseFieldCSFSettings.getFixedLengthSettings();
Preconditions.checkNotNull(fixedLengthCSFSettings);
int numValues = fixedLengthCSFSettings.getNumValuesPerDoc();
    Preconditions.checkState(csfViewSettings.getValueIndex() >= 0,
        "value index must be non-negative: " + csfViewSettings.getValueIndex());
Preconditions.checkState(csfViewSettings.getValueIndex() < numValues, "value index "
+ csfViewSettings.getValueIndex() + " must be less than numValues: " + numValues);
    Preconditions.checkState(csfViewSettings.getBitStartPosition() >= 0,
        "bitStartPosition must be non-negative: " + csfViewSettings.getBitStartPosition());
Preconditions.checkState(csfViewSettings.getBitStartPosition() < bitsPerValue,
"bitStartPosition " + csfViewSettings.getBitStartPosition()
+ " must be less than bitsPerValue " + bitsPerValue);
Preconditions.checkState(csfViewSettings.getBitLength() >= 1,
"bitLength must be positive: " + csfViewSettings.getBitLength());
Preconditions.checkState(
csfViewSettings.getBitStartPosition() + csfViewSettings.getBitLength() <= bitsPerValue,
String.format("bitStartPosition (%d) + bitLength (%d) must be less than bitsPerValue (%d)",
csfViewSettings.getBitStartPosition(), csfViewSettings.getBitLength(), bitsPerValue));
}
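  // Example with hypothetical values: for an INT base field (bitsPerValue = 32) with
  // numValuesPerDoc = 2, a view with valueIndex = 1, bitStartPosition = 24, bitLength = 8
  // passes (24 + 8 <= 32), while bitStartPosition = 28 with bitLength = 8 fails,
  // because 28 + 8 > 32.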
// No position; no freq; not pretokenized; not tokenized.
/**
   * Norms are disabled by default, as with Lucene's StringField or int/long fields.
*/
public final SchemaBuilder withIndexedNotTokenizedField(String fieldName) {
return withIndexedNotTokenizedField(fieldName, false);
}
/**
* Add an indexed but not tokenized field. This is similar to Lucene's StringField.
*/
public final SchemaBuilder withIndexedNotTokenizedField(String fieldName,
boolean supportOutOfOrderAppends) {
return withIndexedNotTokenizedField(fieldName, supportOutOfOrderAppends, true);
}
private final SchemaBuilder withIndexedNotTokenizedField(String fieldName,
boolean supportOutOfOrderAppends,
boolean omitNorms) {
if (!shouldIncludeField(fieldName)) {
return this;
}
ThriftFieldSettings settings = getNoPositionNoFreqSettings(supportOutOfOrderAppends);
settings.getIndexedFieldSettings().setOmitNorms(omitNorms);
ThriftFieldConfiguration config = new ThriftFieldConfiguration(fieldName)
.setSettings(settings);
putIntoFieldConfigs(idMapping.getFieldID(fieldName), config);
return this;
}
/** Makes the given field searchable by default, with the given weight. */
public final SchemaBuilder withSearchFieldByDefault(
String fieldName, float textSearchableFieldWeight) {
if (!shouldIncludeField(fieldName)) {
return this;
}
ThriftFieldSettings settings =
schema.getFieldConfigs().get(idMapping.getFieldID(fieldName)).getSettings();
settings.setSearchFieldSettings(
new ThriftSearchFieldSettings()
.setTextSearchableFieldWeight(textSearchableFieldWeight)
.setTextDefaultSearchable(true));
return this;
}
/**
* Similar to Lucene's TextField. The string is analyzed using the default/override analyzer.
   * @param fieldName the name of the text field
   * @param addHfPairIfHfFieldsArePresent Add hfPair fields if they exist in the schema.
   * For certain text fields, adding hfPair fields is usually preferred, but they may
   * not exist in the schema, in which case the hfPair fields will not be added.
*/
public final SchemaBuilder withTextField(String fieldName,
boolean addHfPairIfHfFieldsArePresent) {
if (!shouldIncludeField(fieldName)) {
return this;
}
ThriftFieldConfiguration config = new ThriftFieldConfiguration(fieldName).setSettings(
getDefaultSettings(ThriftIndexOptions.DOCS_AND_FREQS_AND_POSITIONS));
if (addHfPairIfHfFieldsArePresent) {
// Add hfPair fields only if they exist in the schema for the cluster
boolean hfPair = shouldIncludeField(ImmutableSchema.HF_TERM_PAIRS_FIELD)
&& shouldIncludeField(ImmutableSchema.HF_PHRASE_PAIRS_FIELD);
config.getSettings().getIndexedFieldSettings().setIndexHighFreqTermPairs(hfPair);
}
config.getSettings().getIndexedFieldSettings().setTokenized(true);
putIntoFieldConfigs(idMapping.getFieldID(fieldName), config);
return this;
}
/**
   * Marks the given field as having per-position payloads.
*/
public final SchemaBuilder withPerPositionPayload(String fieldName, int defaultPayloadLength) {
if (!shouldIncludeField(fieldName)) {
return this;
}
ThriftFieldSettings settings =
schema.getFieldConfigs().get(idMapping.getFieldID(fieldName)).getSettings();
settings.getIndexedFieldSettings().setStorePerPositionPayloads(true);
settings.getIndexedFieldSettings().setDefaultPerPositionPayloadLength(defaultPayloadLength);
return this;
}
/**
* Add field into schema that is pre-tokenized and does not have position.
* E.g. hashtags / stocks / card_domain
*/
public final SchemaBuilder withPretokenizedNoPositionField(String fieldName) {
if (!shouldIncludeField(fieldName)) {
return this;
}
ThriftFieldConfiguration config = new ThriftFieldConfiguration(fieldName)
.setSettings(getPretokenizedNoPositionFieldSetting());
// Add hfPair fields only if they exist in the schema for the cluster
boolean hfPair = shouldIncludeField(ImmutableSchema.HF_TERM_PAIRS_FIELD)
&& shouldIncludeField(ImmutableSchema.HF_PHRASE_PAIRS_FIELD);
config.getSettings().getIndexedFieldSettings().setIndexHighFreqTermPairs(hfPair);
putIntoFieldConfigs(idMapping.getFieldID(fieldName), config);
return this;
}
/**
   * Mark the field as having an ordered term dictionary.
   * In Lucene, the term dictionary is sorted; in Earlybird, term dictionary order is not
   * guaranteed unless this is turned on.
*/
public final SchemaBuilder withOrderedTerms(String fieldName) {
if (!shouldIncludeField(fieldName)) {
return this;
}
ThriftFieldSettings settings =
schema.getFieldConfigs().get(idMapping.getFieldID(fieldName)).getSettings();
settings.getIndexedFieldSettings().setSupportOrderedTerms(true);
return this;
}
/**
* Support lookup of term text by term id in the term dictionary.
*/
public final SchemaBuilder withTermTextLookup(String fieldName) {
if (!shouldIncludeField(fieldName)) {
return this;
}
ThriftFieldSettings settings =
schema.getFieldConfigs().get(idMapping.getFieldID(fieldName)).getSettings();
settings.getIndexedFieldSettings().setSupportTermTextLookup(true);
return this;
}
/**
* Add a text field that is pre-tokenized, so not analyzed again in the index (e.g. Earlybird).
*
* Note that the token streams MUST be created using the attributes defined in
* {@link com.twitter.search.common.util.text.TweetTokenStreamSerializer}.
*/
public final SchemaBuilder withPretokenizedTextField(
String fieldName,
boolean addHfPairIfHfFieldsArePresent) {
if (!shouldIncludeField(fieldName)) {
return this;
}
ThriftFieldConfiguration config = new ThriftFieldConfiguration(fieldName)
.setSettings(getDefaultPretokenizedSettings(
ThriftIndexOptions.DOCS_AND_FREQS_AND_POSITIONS));
putIntoFieldConfigs(idMapping.getFieldID(fieldName), config);
    if (addHfPairIfHfFieldsArePresent) {
      // Add hfPair fields only if they exist in the schema for the cluster
boolean hfPair = shouldIncludeField(ImmutableSchema.HF_TERM_PAIRS_FIELD)
&& shouldIncludeField(ImmutableSchema.HF_PHRASE_PAIRS_FIELD);
config.getSettings().getIndexedFieldSettings().setIndexHighFreqTermPairs(hfPair);
}
return this;
}
/**
* Add a feature configuration
*/
public final SchemaBuilder withFeatureConfiguration(String baseFieldName, String viewName,
FeatureConfiguration featureConfiguration) {
return withColumnStrideFieldView(
viewName,
// Defaulting all encoded tweet features to int since the underlying encoded tweet features
// are ints.
ThriftCSFType.INT,
featureConfiguration.getOutputType(),
baseFieldName,
featureConfiguration.getValueIndex(),
featureConfiguration.getBitStartPosition(),
featureConfiguration.getBitLength(),
featureConfiguration.getFeatureNormalizationType(),
featureConfiguration.getUpdateConstraints()
);
}
/**
* Add a long field in schema. This field uses LongTermAttribute.
*/
private SchemaBuilder addLongTermField(String fieldName, boolean useSortableEncoding) {
if (!shouldIncludeField(fieldName)) {
return this;
}
ThriftFieldSettings longTermSettings = getEarlybirdNumericFieldSettings();
ThriftTokenStreamSerializer tokenStreamSerializer =
new ThriftTokenStreamSerializer(tokenStreamSerializerVersion);
tokenStreamSerializer.setAttributeSerializerClassNames(
ImmutableList.<String>of(LongTermAttributeSerializer.class.getName()));
longTermSettings.getIndexedFieldSettings().setTokenStreamSerializer(tokenStreamSerializer);
ThriftIndexedNumericFieldSettings numericFieldSettings =
new ThriftIndexedNumericFieldSettings(true);
numericFieldSettings.setNumericType(ThriftNumericType.LONG);
numericFieldSettings.setUseSortableEncoding(useSortableEncoding);
longTermSettings.getIndexedFieldSettings().setNumericFieldSettings(numericFieldSettings);
putIntoFieldConfigs(idMapping.getFieldID(fieldName),
new ThriftFieldConfiguration(fieldName).setSettings(longTermSettings));
return this;
}
public final SchemaBuilder withSortableLongTermField(String fieldName) {
return addLongTermField(fieldName, true);
}
public final SchemaBuilder withLongTermField(String fieldName) {
return addLongTermField(fieldName, false);
}
/**
* Add an int field in schema. This field uses IntTermAttribute.
*/
public final SchemaBuilder withIntTermField(String fieldName) {
if (!shouldIncludeField(fieldName)) {
return this;
}
ThriftFieldSettings intTermSettings = getEarlybirdNumericFieldSettings();
ThriftTokenStreamSerializer attributeSerializer =
new ThriftTokenStreamSerializer(tokenStreamSerializerVersion);
attributeSerializer.setAttributeSerializerClassNames(
ImmutableList.<String>of(IntTermAttributeSerializer.class.getName()));
intTermSettings.getIndexedFieldSettings().setTokenStreamSerializer(attributeSerializer);
ThriftIndexedNumericFieldSettings numericFieldSettings =
new ThriftIndexedNumericFieldSettings(true);
numericFieldSettings.setNumericType(ThriftNumericType.INT);
intTermSettings.getIndexedFieldSettings().setNumericFieldSettings(numericFieldSettings);
putIntoFieldConfigs(idMapping.getFieldID(fieldName),
new ThriftFieldConfiguration(fieldName).setSettings(intTermSettings));
return this;
}
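// Usage sketch (hypothetical field names): ID-style numeric fields indexed as single
// IntTermAttribute/LongTermAttribute tokens rather than as Lucene numeric fields:
//
//   schemaBuilder
//       .withLongTermField("from_user_id")
//       .withSortableLongTermField("quoted_tweet_id")
//       .withIntTermField("card_type");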
/**
* Timeline and ExpertSearch use
* {@link com.twitter.search.common.util.analysis.PayloadWeightedTokenizer} to store weighted
* values.
*
* E.g. the PRODUCED_LANGUAGES and CONSUMED_LANGUAGES fields contain not a single value,
* but a list of values, each with an associated weight.
*
* This method adds an indexed field that uses
* {@link com.twitter.search.common.util.analysis.PayloadWeightedTokenizer}.
*/
public final SchemaBuilder withCharTermPayloadWeightedField(String fieldName) {
ThriftFieldConfiguration config = new ThriftFieldConfiguration(fieldName)
.setSettings(getPayloadWeightedSettings(ThriftIndexOptions.DOCS_AND_FREQS_AND_POSITIONS));
putIntoFieldConfigs(idMapping.getFieldID(fieldName), config);
return this;
}
/**
* Set the version and description of this schema.
*/
public final SchemaBuilder withSchemaVersion(
int majorVersionNumber,
int minorVersionNumber,
String versionDesc,
boolean isOfficial) {
schema.setMajorVersionNumber(majorVersionNumber);
schema.setMinorVersionNumber(minorVersionNumber);
schema.setVersion(majorVersionNumber + ":" + versionDesc);
schema.setVersionIsOfficial(isOfficial);
return this;
}
public final SchemaBuilder withSchemaVersion(
int majorVersionNumber,
String versionDesc,
boolean isOfficial) {
return withSchemaVersion(majorVersionNumber, 0, versionDesc, isOfficial);
}
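// Usage sketch (hypothetical version numbers and description):
//
//   schemaBuilder.withSchemaVersion(31, 2, "add card_type field", true);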
protected void putIntoFieldConfigs(int id, ThriftFieldConfiguration config) {
if (schema.getFieldConfigs() != null && schema.getFieldConfigs().containsKey(id)) {
throw new IllegalStateException("Already have a ThriftFieldConfiguration for field id " + id);
}
if (fieldNameSet.contains(config.getFieldName())) {
throw new IllegalStateException("Already have a ThriftFieldConfiguration for field "
+ config.getFieldName());
}
fieldNameSet.add(config.getFieldName());
schema.putToFieldConfigs(id, config);
}
// Default field settings. Most field settings are similar to this.
protected ThriftFieldSettings getDefaultSettings(ThriftIndexOptions indexOption) {
return getDefaultSettings(indexOption, false);
}
protected ThriftFieldSettings getDefaultSettings(ThriftIndexOptions indexOption,
boolean supportOutOfOrderAppends) {
ThriftFieldSettings fieldSettings = new ThriftFieldSettings();
ThriftIndexedFieldSettings indexedFieldSettings = new ThriftIndexedFieldSettings();
indexedFieldSettings
.setIndexed(true)
.setStored(false)
.setTokenized(false)
.setStoreTermVectors(false)
.setStoreTermVectorOffsets(false)
.setStoreTermVectorPayloads(false)
.setStoreTermVectorPositions(false)
.setSupportOutOfOrderAppends(supportOutOfOrderAppends)
.setIndexOptions(indexOption)
.setOmitNorms(true); // All Earlybird fields omit norms.
fieldSettings.setIndexedFieldSettings(indexedFieldSettings);
return fieldSettings;
}
/**
* Default field settings for fields that are pretokenized
*
* The fields that use these settings will need to be tokenized using a serializer with the
* attributes defined in {@link com.twitter.search.common.util.text.TweetTokenStreamSerializer}.
*/
protected final ThriftFieldSettings getDefaultPretokenizedSettings(
ThriftIndexOptions indexOption) {
ThriftFieldSettings fieldSettings = getDefaultSettings(indexOption);
fieldSettings.getIndexedFieldSettings().setTokenized(true);
ThriftTokenStreamSerializer attributeSerializer =
new ThriftTokenStreamSerializer(tokenStreamSerializerVersion);
attributeSerializer.setAttributeSerializerClassNames(
ImmutableList.<String>of(
CharSequenceTermAttributeSerializer.class.getName(),
PositionIncrementAttributeSerializer.class.getName(),
TokenTypeAttributeSerializer.class.getName()));
fieldSettings.getIndexedFieldSettings().setTokenStreamSerializer(attributeSerializer);
return fieldSettings;
}
protected final ThriftFieldSettings getPretokenizedNoPositionFieldSetting() {
return getDefaultPretokenizedSettings(ThriftIndexOptions.DOCS_AND_FREQS);
}
protected final ThriftFieldSettings getNoPositionNoFreqSettings() {
return getNoPositionNoFreqSettings(false);
}
protected final ThriftFieldSettings getNoPositionNoFreqSettings(
boolean supportOutOfOrderAppends) {
return getDefaultSettings(ThriftIndexOptions.DOCS_ONLY, supportOutOfOrderAppends);
}
protected final ThriftFieldSettings getEarlybirdNumericFieldSettings() {
// Supposedly numeric fields are not tokenized.
// However, Earlybird uses SingleTokenTokenStream to handle int/long fields.
// So we need to set tokenized to true for these fields.
ThriftFieldSettings settings = getNoPositionNoFreqSettings();
settings.getIndexedFieldSettings().setTokenized(true);
return settings;
}
private ThriftFieldSettings getPayloadWeightedSettings(ThriftIndexOptions indexOption) {
ThriftFieldSettings fieldSettings = getDefaultSettings(indexOption);
fieldSettings.getIndexedFieldSettings().setTokenized(true);
ThriftTokenStreamSerializer attributeSerializer =
new ThriftTokenStreamSerializer(tokenStreamSerializerVersion);
attributeSerializer.setAttributeSerializerClassNames(
ImmutableList.<String>of(CharTermAttributeSerializer.class.getName(),
PositionIncrementAttributeSerializer.class.getName(),
PayloadAttributeSerializer.class.getName()));
fieldSettings.getIndexedFieldSettings().setTokenStreamSerializer(attributeSerializer);
return fieldSettings;
}
protected boolean shouldIncludeField(String fieldName) {
return true;
}
}

View File

@ -1,433 +0,0 @@
package com.twitter.search.common.schema;
import java.io.IOException;
import java.io.StringReader;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Sets;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField;
import org.apache.lucene.util.BytesRef;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.common.text.token.TwitterTokenStream;
import com.twitter.search.common.schema.base.EarlybirdFieldType;
import com.twitter.search.common.schema.base.IndexedNumericFieldSettings;
import com.twitter.search.common.schema.base.Schema;
import com.twitter.search.common.schema.thriftjava.ThriftDocument;
import com.twitter.search.common.schema.thriftjava.ThriftField;
import com.twitter.search.common.schema.thriftjava.ThriftFieldData;
import com.twitter.search.common.schema.thriftjava.ThriftGeoCoordinate;
import com.twitter.search.common.util.analysis.IntTermAttribute;
import com.twitter.search.common.util.analysis.LongTermAttribute;
import com.twitter.search.common.util.analysis.SortableLongTermAttribute;
import com.twitter.search.common.util.spatial.GeoUtil;
import com.twitter.search.common.util.text.HighFrequencyTermPairs;
import com.twitter.search.common.util.text.OmitNormTextField;
import com.twitter.search.common.util.text.SingleTokenStream;
/**
* A document factory that converts {@link ThriftDocument} into Lucene {@link Document}s
* using the provided {@link com.twitter.search.common.schema.base.Schema}.
*/
public class SchemaDocumentFactory {
private static final Logger LOG = LoggerFactory.getLogger(SchemaDocumentFactory.class);
private final Schema schema;
private final ImmutableList<TokenStreamRewriter> tokenStreamRewriters;
/**
* Creates a SchemaDocumentFactory with a schema and the tokenStreamRewriters.
*
* @param tokenStreamRewriters a list of token stream rewriters, which will be applied in order.
*/
public SchemaDocumentFactory(
Schema schema,
List<TokenStreamRewriter> tokenStreamRewriters) {
this.schema = schema;
this.tokenStreamRewriters = ImmutableList.copyOf(tokenStreamRewriters);
}
/**
* Creates a SchemaDocumentFactory with no tokenStreamRewriters.
*/
public SchemaDocumentFactory(Schema schema) {
this(schema, Collections.emptyList());
}
public final Document newDocument(ThriftDocument document) throws IOException {
return innerNewDocument(document);
}
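// Minimal usage sketch (schema and thriftDoc are assumed to exist elsewhere):
//
//   SchemaDocumentFactory factory = new SchemaDocumentFactory(schema);
//   Document luceneDoc = factory.newDocument(thriftDoc);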
/**
* Create a Lucene document from the ThriftDocument.
*/
@VisibleForTesting
public Document innerNewDocument(ThriftDocument document) throws IOException {
Document luceneDocument = new Document();
Set<String> hfTerms = Sets.newHashSet();
Set<String> hfPhrases = Sets.newHashSet();
Analyzer defaultAnalyzer = schema.getDefaultAnalyzer(document.getDefaultAnalyzerOverride());
for (ThriftField field : document.getFields()) {
boolean successful = false;
try {
addLuceneFields(field, defaultAnalyzer, luceneDocument, hfTerms, hfPhrases);
successful = true;
} finally {
if (!successful) {
LOG.warn("Unexpected exception while trying to add field. Field ID: "
+ field.getFieldConfigId() + " Field Name: "
+ schema.getFieldName(field.getFieldConfigId()));
}
}
}
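// For hfTerms {a, b, c} the nested loop below emits the pairs (a,b), (a,c) and (b,c);
// the compareTo check keeps exactly one copy of each unordered pair.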
for (String token : hfTerms) {
for (String token2 : hfTerms) {
if (token.compareTo(token2) < 0) {
luceneDocument.add(new Field(ImmutableSchema.HF_TERM_PAIRS_FIELD,
HighFrequencyTermPairs.createPair(token, token2),
OmitNormTextField.TYPE_NOT_STORED));
}
}
}
for (String phrase : hfPhrases) {
// Tokens in the phrase set are not terms and have already been processed with
// HighFrequencyTermPairs.createPhrasePair.
luceneDocument.add(new Field(ImmutableSchema.HF_PHRASE_PAIRS_FIELD, phrase,
OmitNormTextField.TYPE_NOT_STORED));
}
return schema.getFacetsConfig().build(luceneDocument);
}
private void addLuceneFields(ThriftField field, Analyzer analyzer, Document doc,
Set<String> hfTerms, Set<String> hfPhrases) throws IOException {
Schema.FieldInfo fieldInfo =
schema.getFieldInfo(field.getFieldConfigId(), field.getFieldConfigOverride());
if (fieldInfo == null) {
// field not defined in schema - skip it
return;
}
ThriftFieldData fieldData = field.getFieldData();
if (fieldInfo.getFieldType().getCsfType() != null) {
addCSFField(doc, fieldInfo, fieldData);
return;
}
// Checking which data type is set is not sufficient here. We also need to check schema to
// see what type the field is configured to be. See SEARCH-5173 for more details.
// The problem is that Pig, while converting Tuples to Thrift, sets all primitive type
// fields to 0. (i.e. the isSet calls will return true).
IndexedNumericFieldSettings numericSettings =
fieldInfo.getFieldType().getNumericFieldSettings();
if (fieldData.isSetTokenStreamValue()) {
addTokenField(doc, hfTerms, hfPhrases, fieldInfo, fieldData);
} else if (fieldData.isSetStringValue()) {
addStringField(analyzer, doc, hfTerms, hfPhrases, fieldInfo, fieldData);
} else if (fieldData.isSetBytesValue()) {
addBytesField(doc, fieldInfo, fieldData);
} else if (fieldData.isSetGeoCoordinate()) {
addGeoField(doc, fieldInfo, fieldData);
} else if (numericSettings != null) {
// handle numeric fields.
switch (numericSettings.getNumericType()) {
case INT:
Preconditions.checkState(fieldData.isSetIntValue(),
"Int field does not have int value set. Field name: %s", fieldInfo.getName());
addIntField(doc, fieldInfo, fieldData);
break;
case LONG:
Preconditions.checkState(fieldData.isSetLongValue(),
"Long field does not have long value set. Field name: %s", fieldInfo.getName());
addLongField(doc, fieldInfo, fieldData);
break;
case FLOAT:
Preconditions.checkState(fieldData.isSetFloatValue(),
"Float field does not have float value set. Field name: %s ", fieldInfo.getName());
addFloatField();
break;
case DOUBLE:
Preconditions.checkState(fieldData.isSetDoubleValue(),
"Double field does not have double value set. Field name: %s", fieldInfo.getName());
addDoubleField();
break;
default:
throw new UnsupportedOperationException("Earlybird does not know how to handle field "
+ field.getFieldConfigId() + " " + field);
}
} else {
throw new UnsupportedOperationException("Earlybird does not know how to handle field "
+ field.getFieldConfigId() + " " + field);
}
}
private void addCSFField(Document doc, Schema.FieldInfo fieldInfo, ThriftFieldData fieldData) {
if (fieldInfo.getFieldType().getCsfFixedLengthNumValuesPerDoc() > 1) {
// As an optimization, TBinaryProtocol stores a byte array field as a part of a larger byte
// array field. Must call fieldData.getBytesValue(). fieldData.bytesValue.array() will
// return extraneous data. See: SEARCH-3996
doc.add(new Field(fieldInfo.getName(), fieldData.getBytesValue(), fieldInfo.getFieldType()));
} else {
doc.add(new CSFField(fieldInfo.getName(), fieldInfo.getFieldType(), fieldData));
}
}
private void addTokenField(
Document doc,
Set<String> hfTerms,
Set<String> hfPhrases,
Schema.FieldInfo fieldInfo,
ThriftFieldData fieldData) throws IOException {
TwitterTokenStream twitterTokenStream
= fieldInfo.getFieldType().getTokenStreamSerializer().deserialize(
fieldData.getTokenStreamValue(), fieldData.getStringValue());
try {
for (TokenStreamRewriter rewriter : tokenStreamRewriters) {
twitterTokenStream = rewriter.rewrite(fieldInfo, twitterTokenStream);
}
expandStream(doc, fieldInfo, twitterTokenStream, hfTerms, hfPhrases);
doc.add(new Field(fieldInfo.getName(), twitterTokenStream, fieldInfo.getFieldType()));
} finally {
twitterTokenStream.close();
}
}
private void addStringField(Analyzer analyzer, Document doc, Set<String> hfTerms,
Set<String> hfPhrases, Schema.FieldInfo fieldInfo,
ThriftFieldData fieldData) {
doc.add(new Field(fieldInfo.getName(), fieldData.getStringValue(), fieldInfo.getFieldType()));
if (fieldInfo.getFieldType().tokenized()) {
try {
TokenStream tokenStream = analyzer.tokenStream(fieldInfo.getName(),
new StringReader(fieldData.getStringValue()));
try {
expandStream(
doc,
fieldInfo,
tokenStream,
hfTerms,
hfPhrases);
} finally {
tokenStream.close();
}
} catch (IOException e) {
LOG.error("IOException expanding token stream", e);
}
} else {
addFacetField(doc, fieldInfo, fieldData.getStringValue());
}
}
private void addBytesField(Document doc, Schema.FieldInfo fieldInfo, ThriftFieldData fieldData) {
doc.add(new Field(fieldInfo.getName(), fieldData.getBytesValue(), fieldInfo.getFieldType()));
}
private void addIntField(Document doc, Schema.FieldInfo fieldInfo,
ThriftFieldData fieldData) {
int value = fieldData.getIntValue();
addFacetField(doc, fieldInfo, String.valueOf(value));
if (fieldInfo.getFieldType().getNumericFieldSettings() == null) {
// No NumericFieldSettings. Even though the data is numeric, this field is not
// really a numerical field. Just add as a string.
doc.add(new Field(fieldInfo.getName(), String.valueOf(value), fieldInfo.getFieldType()));
} else if (fieldInfo.getFieldType().getNumericFieldSettings().isUseTwitterFormat()) {
addIntTermAttributeField(value, fieldInfo, doc);
} else {
// Use lucene style numerical fields
doc.add(NumericField.newIntField(fieldInfo.getName(), value));
}
}
private void addIntTermAttributeField(int value,
Schema.FieldInfo fieldInfo,
Document doc) {
SingleTokenStream singleToken = new SingleTokenStream();
IntTermAttribute termAtt = singleToken.addAttribute(IntTermAttribute.class);
termAtt.setTerm(value);
doc.add(new Field(fieldInfo.getName(), singleToken, fieldInfo.getFieldType()));
}
private void addLongField(Document doc, Schema.FieldInfo fieldInfo,
ThriftFieldData fieldData) {
long value = fieldData.getLongValue();
addFacetField(doc, fieldInfo, String.valueOf(value));
if (fieldInfo.getFieldType().getNumericFieldSettings() == null) {
// No NumericFieldSettings. Even though the data is numeric, this field is not
// really a numerical field. Just add as a string.
doc.add(new Field(fieldInfo.getName(), String.valueOf(value), fieldInfo.getFieldType()));
} else if (fieldInfo.getFieldType().getNumericFieldSettings().isUseTwitterFormat()) {
// Twitter style numerical field: use LongTermAttribute
addLongTermAttributeField(value, fieldInfo, doc);
} else {
// Use lucene style numerical fields
doc.add(NumericField.newLongField(fieldInfo.getName(), value));
}
}
private void addLongTermAttributeField(long value,
Schema.FieldInfo fieldInfo,
Document doc) {
SingleTokenStream singleToken = new SingleTokenStream();
boolean useSortableEncoding =
fieldInfo.getFieldType().getNumericFieldSettings().isUseSortableEncoding();
if (useSortableEncoding) {
SortableLongTermAttribute termAtt = singleToken.addAttribute(SortableLongTermAttribute.class);
termAtt.setTerm(value);
} else {
LongTermAttribute termAtt = singleToken.addAttribute(LongTermAttribute.class);
termAtt.setTerm(value);
}
doc.add(new Field(fieldInfo.getName(), singleToken, fieldInfo.getFieldType()));
}
private void addFloatField() {
throw new UnsupportedOperationException("Earlybird does not support float values yet.");
}
private void addDoubleField() {
throw new UnsupportedOperationException("Earlybird does not support double values yet.");
}
private void addGeoField(Document doc, Schema.FieldInfo fieldInfo, ThriftFieldData fieldData) {
ThriftGeoCoordinate coord = fieldData.getGeoCoordinate();
if (GeoUtil.validateGeoCoordinates(coord.getLat(), coord.getLon())) {
GeoUtil.fillGeoFields(doc, fieldInfo.getName(),
coord.getLat(), coord.getLon(), coord.getAccuracy());
}
}
private void addFacetField(Document doc, Schema.FieldInfo fieldInfo, String value) {
Preconditions.checkArgument(doc != null);
Preconditions.checkArgument(fieldInfo != null);
Preconditions.checkArgument(value != null);
if (fieldInfo.getFieldType().getFacetName() != null) {
doc.add(new SortedSetDocValuesFacetField(fieldInfo.getFieldType().getFacetName(), value));
}
}
private String getTerm(TermToBytesRefAttribute attr) {
if (attr instanceof CharTermAttribute) {
return ((CharTermAttribute) attr).toString();
} else if (attr instanceof IntTermAttribute) {
return String.valueOf(((IntTermAttribute) attr).getTerm());
} else if (attr instanceof LongTermAttribute) {
return String.valueOf(((LongTermAttribute) attr).getTerm());
} else {
return attr.getBytesRef().utf8ToString();
}
}
/**
* Expand the TwitterTokenStream and populate high-frequency terms, phrases and/or facet category paths.
*/
private void expandStream(
Document doc,
Schema.FieldInfo fieldInfo,
TokenStream stream,
Set<String> hfTerms,
Set<String> hfPhrases) throws IOException {
// Checkstyle does not allow assignment to parameters.
Set<String> facetHfTerms = hfTerms;
Set<String> facetHfPhrases = hfPhrases;
if (!(HighFrequencyTermPairs.INDEX_HF_TERM_PAIRS
&& fieldInfo.getFieldType().isIndexHFTermPairs())) {
// high-frequency terms and phrases are not needed
if (fieldInfo.getFieldType().getFacetName() == null) {
// Facets are not needed either; simply return, since otherwise we would do nothing
return;
}
facetHfTerms = null;
facetHfPhrases = null;
}
final TermToBytesRefAttribute attr = stream.getAttribute(TermToBytesRefAttribute.class);
stream.reset();
String lastHFTerm = null;
while (stream.incrementToken()) {
String term = getTerm(attr);
if (fieldInfo.getFieldType().getFacetName() != null) {
addFacetField(doc, fieldInfo, term);
}
if (HighFrequencyTermPairs.HF_TERM_SET.contains(term)) {
if (facetHfTerms != null) {
facetHfTerms.add(term);
}
if (lastHFTerm != null) {
if (facetHfPhrases != null) {
facetHfPhrases.add(HighFrequencyTermPairs.createPhrasePair(lastHFTerm, term));
}
}
lastHFTerm = term;
} else {
lastHFTerm = null;
}
}
}
public static final class CSFField extends Field {
/**
* Create a CSFField with the given fieldType, containing the given field data.
*/
public CSFField(String name, EarlybirdFieldType fieldType, ThriftFieldData data) {
super(name, fieldType);
if (fieldType.isCsfVariableLength()) {
fieldsData = new BytesRef(data.getBytesValue());
} else {
switch (fieldType.getCsfType()) {
case BYTE:
fieldsData = Long.valueOf(data.getByteValue());
break;
case INT:
fieldsData = Long.valueOf(data.getIntValue());
break;
case LONG:
fieldsData = Long.valueOf(data.getLongValue());
break;
case FLOAT:
fieldsData = Long.valueOf(Float.floatToRawIntBits((float) data.getFloatValue()));
break;
case DOUBLE:
fieldsData = Long.valueOf(Double.doubleToRawLongBits(data.getDoubleValue()));
break;
default:
throw new IllegalArgumentException("Unknown csf type: " + fieldType.getCsfType());
}
}
}
}
public interface TokenStreamRewriter {
/**
* Rewrite the token stream.
*/
TwitterTokenStream rewrite(Schema.FieldInfo fieldInfo, TwitterTokenStream stream);
}
}

View File

@ -1,102 +0,0 @@
package com.twitter.search.common.schema;
import com.google.common.base.Preconditions;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.util.BytesRef;
import com.twitter.search.common.schema.base.EarlybirdFieldType;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.schema.base.IndexedNumericFieldSettings;
import com.twitter.search.common.schema.base.Schema;
import com.twitter.search.common.schema.thriftjava.ThriftCSFType;
import com.twitter.search.common.schema.thriftjava.ThriftNumericType;
import com.twitter.search.common.util.analysis.IntTermAttributeImpl;
import com.twitter.search.common.util.analysis.LongTermAttributeImpl;
import com.twitter.search.common.util.analysis.SortableLongTermAttributeImpl;
public final class SchemaUtil {
private SchemaUtil() {
}
/**
* Get a fixed CSF field's number of values per doc.
* @param schema the Schema for the index
* @param fieldId the field id of the CSF field - the field must be of binary integer type and
* in fixed size
* @return the number of values per doc
*/
public static int getCSFFieldFixedLength(ImmutableSchemaInterface schema, int fieldId) {
final Schema.FieldInfo fieldInfo = Preconditions.checkNotNull(schema.getFieldInfo(fieldId));
return getCSFFieldFixedLength(fieldInfo);
}
/**
* Get a fixed CSF field's number of values per doc.
* @param schema the Schema for the index
* @param fieldName the field name of the CSF field - the field must be of binary integer type
* and in fixed size
* @return the number of values per doc
*/
public static int getCSFFieldFixedLength(ImmutableSchemaInterface schema, String fieldName) {
final Schema.FieldInfo fieldInfo = Preconditions.checkNotNull(schema.getFieldInfo(fieldName));
return getCSFFieldFixedLength(fieldInfo);
}
/**
* Get a fixed CSF field's number of values per doc.
* @param fieldInfo the field info of the CSF field - the field must be of binary integer type
* and in fixed size
* @return the number of values per doc
*/
public static int getCSFFieldFixedLength(Schema.FieldInfo fieldInfo) {
final EarlybirdFieldType fieldType = fieldInfo.getFieldType();
Preconditions.checkState(fieldType.docValuesType() == DocValuesType.BINARY
&& fieldType.getCsfType() == ThriftCSFType.INT);
return fieldType.getCsfFixedLengthNumValuesPerDoc();
}
/** Converts the given value to a BytesRef instance, according to the type of the given field. */
public static BytesRef toBytesRef(Schema.FieldInfo fieldInfo, String value) {
EarlybirdFieldType fieldType = fieldInfo.getFieldType();
Preconditions.checkArgument(fieldType.indexOptions() != IndexOptions.NONE);
IndexedNumericFieldSettings numericSetting = fieldType.getNumericFieldSettings();
if (numericSetting != null) {
if (!numericSetting.isUseTwitterFormat()) {
throw new UnsupportedOperationException(
"Numeric field not using Twitter format: cannot drill down.");
}
ThriftNumericType numericType = numericSetting.getNumericType();
switch (numericType) {
case INT:
try {
return IntTermAttributeImpl.copyIntoNewBytesRef(Integer.parseInt(value));
} catch (NumberFormatException e) {
throw new UnsupportedOperationException(
String.format("Cannot parse value for int field %s: %s",
fieldInfo.getName(), value),
e);
}
case LONG:
try {
return numericSetting.isUseSortableEncoding()
? SortableLongTermAttributeImpl.copyIntoNewBytesRef(Long.parseLong(value))
: LongTermAttributeImpl.copyIntoNewBytesRef(Long.parseLong(value));
} catch (NumberFormatException e) {
throw new UnsupportedOperationException(
String.format("Cannot parse value for long field %s: %s",
fieldInfo.getName(), value),
e);
}
default:
throw new UnsupportedOperationException(
String.format("Unsupported numeric type for field %s: %s",
fieldInfo.getName(), numericType));
}
}
return new BytesRef(value);
}
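// Usage sketch (hypothetical field name): produce the BytesRef the index stores for a
// Twitter-format long field, e.g. when building a drill-down term query:
//
//   BytesRef term = SchemaUtil.toBytesRef(schema.getFieldInfo("from_user_id"), "12345");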
}

View File

@ -1,27 +0,0 @@
package com.twitter.search.common.schema;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/**
* The majority of the code is copied from Lucene 3.1 analysis.core.WhitespaceAnalyzer. The only
new code is the getPositionIncrementGap() method.
*/
public final class SearchWhitespaceAnalyzer extends Analyzer {
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
return new TokenStreamComponents(new WhitespaceTokenizer());
}
/**
* Make sure that phrase queries do not match across 2 instances of the text field.
*
* See the Javadoc for Analyzer.getPositionIncrementGap() for a good explanation of how this
* method works.
*/
@Override
public int getPositionIncrementGap(String fieldName) {
// Hard-code "text" here, because we can't depend on EarlybirdFieldConstants.
return "text".equals(fieldName) ? 1 : super.getPositionIncrementGap(fieldName);
}
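// Worked example (field type assumed): with two instances of the "text" field,
//   doc.add(new Field("text", "hello world", type));
//   doc.add(new Field("text", "hello again", type));
// the gap of 1 leaves a position hole between "world" and the second "hello", so the
// phrase query "world hello" no longer matches across the instance boundary.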
}

View File

@ -1,228 +0,0 @@
package com.twitter.search.common.schema;
import java.io.IOException;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.annotation.Nullable;
import com.twitter.common.text.util.PositionIncrementAttributeSerializer;
import com.twitter.common.text.util.TokenStreamSerializer;
import com.twitter.search.common.schema.base.FieldNameToIdMapping;
import com.twitter.search.common.schema.thriftjava.ThriftDocument;
import com.twitter.search.common.schema.thriftjava.ThriftField;
import com.twitter.search.common.schema.thriftjava.ThriftFieldData;
import com.twitter.search.common.schema.thriftjava.ThriftGeoCoordinate;
import com.twitter.search.common.util.analysis.CharTermAttributeSerializer;
import com.twitter.search.common.util.analysis.LongTermAttributeSerializer;
import com.twitter.search.common.util.analysis.LongTermsTokenStream;
import com.twitter.search.common.util.analysis.PayloadAttributeSerializer;
import com.twitter.search.common.util.analysis.PayloadWeightedTokenizer;
import com.twitter.search.common.util.spatial.GeoUtil;
/**
* Builder class for building ThriftDocuments.
*/
public class ThriftDocumentBuilder {
private static final Logger LOG = Logger.getLogger(ThriftDocumentBuilder.class.getName());
protected final ThriftDocument doc = new ThriftDocument();
protected final FieldNameToIdMapping idMapping;
private static final ThreadLocal<TokenStreamSerializer> PAYLOAD_WEIGHTED_SERIALIZER_PER_THREAD =
new ThreadLocal<TokenStreamSerializer>() {
@Override
protected TokenStreamSerializer initialValue() {
return TokenStreamSerializer.builder()
.add(new CharTermAttributeSerializer())
.add(new PositionIncrementAttributeSerializer())
.add(new PayloadAttributeSerializer())
.build();
}
};
private static final ThreadLocal<TokenStreamSerializer> LONG_TERM_SERIALIZER_PER_THREAD =
new ThreadLocal<TokenStreamSerializer>() {
@Override
protected TokenStreamSerializer initialValue() {
return TokenStreamSerializer.builder()
.add(new LongTermAttributeSerializer())
.build();
}
};
public ThriftDocumentBuilder(FieldNameToIdMapping idMapping) {
this.idMapping = idMapping;
}
protected void prepareToBuild() {
// Left empty; subclasses can override this.
}
public ThriftDocument build() {
prepareToBuild();
return doc;
}
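// Usage sketch (hypothetical idMapping and field names): the with* methods return the
// builder, so a document is typically assembled fluently:
//
//   ThriftDocument thriftDoc = new ThriftDocumentBuilder(idMapping)
//       .withLongField("tweet_id", 1234L)
//       .withStringField("lang", "en")
//       .build();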
/**
* Add a long field. This is indexed as a
* {@link com.twitter.search.common.util.analysis.LongTermAttribute}
*/
public final ThriftDocumentBuilder withLongField(String fieldName, long value) {
ThriftFieldData fieldData = new ThriftFieldData().setLongValue(value);
ThriftField field = new ThriftField()
.setFieldConfigId(idMapping.getFieldID(fieldName)).setFieldData(fieldData);
doc.addToFields(field);
return this;
}
/**
* Add an int field. This is indexed as a
* {@link com.twitter.search.common.util.analysis.IntTermAttribute}
*/
public final ThriftDocumentBuilder withIntField(String fieldName, int value) {
ThriftFieldData fieldData = new ThriftFieldData().setIntValue(value);
ThriftField field = new ThriftField()
.setFieldConfigId(idMapping.getFieldID(fieldName)).setFieldData(fieldData);
doc.addToFields(field);
return this;
}
/**
* Add a field whose value is a single byte.
*/
public final ThriftDocumentBuilder withByteField(String fieldName, byte value) {
ThriftFieldData fieldData = new ThriftFieldData().setByteValue(value);
ThriftField field = new ThriftField()
.setFieldConfigId(idMapping.getFieldID(fieldName)).setFieldData(fieldData);
doc.addToFields(field);
return this;
}
/**
* Add a field whose value is a byte array.
*/
public final ThriftDocumentBuilder withBytesField(String fieldName, byte[] value) {
ThriftFieldData fieldData = new ThriftFieldData().setBytesValue(value);
ThriftField field = new ThriftField()
.setFieldConfigId(idMapping.getFieldID(fieldName)).setFieldData(fieldData);
doc.addToFields(field);
return this;
}
/**
* Add a field whose value is a float.
*/
public final ThriftDocumentBuilder withFloatField(String fieldName, float value) {
ThriftFieldData fieldData = new ThriftFieldData().setFloatValue(value);
ThriftField field = new ThriftField()
.setFieldConfigId(idMapping.getFieldID(fieldName)).setFieldData(fieldData);
doc.addToFields(field);
return this;
}
/**
* Add a field whose value is a Lucene TokenStream.
* The Lucene TokenStream is serialized using Twitter's
* {@link com.twitter.common.text.util.TokenStreamSerializer}
*/
public final ThriftDocumentBuilder withTokenStreamField(String fieldName,
@Nullable String tokenStreamText,
byte[] tokenStream) {
if (tokenStream == null) {
return this;
}
ThriftFieldData fieldData = new ThriftFieldData()
.setStringValue(tokenStreamText).setTokenStreamValue(tokenStream);
ThriftField field = new ThriftField()
.setFieldConfigId(idMapping.getFieldID(fieldName)).setFieldData(fieldData);
doc.addToFields(field);
return this;
}
/**
* Add a field whose value is a String.
* @param fieldName Name of the field where the string will be added.
* @param text This string is indexed as is (not analyzed).
*/
public final ThriftDocumentBuilder withStringField(String fieldName, String text) {
if (text == null || text.isEmpty()) {
return this;
}
ThriftFieldData fieldData = new ThriftFieldData().setStringValue(text);
ThriftField field = new ThriftField()
.setFieldConfigId(idMapping.getFieldID(fieldName)).setFieldData(fieldData);
doc.addToFields(field);
return this;
}
/**
* Add a field whose value is a geo coordinate.
* Earlybird will process the coordinates into geo hashes before indexing.
*/
public final ThriftDocumentBuilder withGeoField(String fieldName,
double lat, double lon, int acc) {
if (!GeoUtil.validateGeoCoordinates(lat, lon)) {
// If the geo coordinates are invalid, don't add any field.
return this;
}
ThriftGeoCoordinate coord = new ThriftGeoCoordinate();
coord.setLat(lat);
coord.setLon(lon);
coord.setAccuracy(acc);
ThriftFieldData fieldData = new ThriftFieldData().setGeoCoordinate(coord);
ThriftField field = new ThriftField()
.setFieldConfigId(idMapping.getFieldID(fieldName)).setFieldData(fieldData);
doc.addToFields(field);
return this;
}
/**
* Add a list of weighted tokens. The weights are stored inside payloads.
* See {@link com.twitter.search.common.util.analysis.PayloadWeightedTokenizer} for more details.
*/
public final ThriftDocumentBuilder withPayloadWeightTokenStreamField(String fieldName,
String tokens) {
byte[] serialized;
try {
PayloadWeightedTokenizer tokenizer = new PayloadWeightedTokenizer(tokens);
serialized = PAYLOAD_WEIGHTED_SERIALIZER_PER_THREAD.get().serialize(tokenizer);
tokenizer.close();
} catch (IOException e) {
LOG.log(Level.WARNING,
"Failed to add PayloadWeightedTokenizer field. Bad token weight list: " + tokens, e);
return this;
} catch (NumberFormatException e) {
LOG.log(Level.WARNING,
"Failed to add PayloadWeightedTokenizer field. Cannot parse token weight: " + tokens, e);
return this;
}
withTokenStreamField(fieldName, tokens, serialized);
return this;
}
/**
* Add a field whose value is a list of longs.
* Each long is encoded into a LongTermAttribute.
* The field will contain a LongTermTokenStream.
*/
public final ThriftDocumentBuilder withLongIDsField(String fieldName,
List<Long> longList) throws IOException {
if (longList == null || longList.isEmpty()) {
return this;
}
LongTermsTokenStream stream = new LongTermsTokenStream(longList);
stream.reset();
byte[] serializedStream = LONG_TERM_SERIALIZER_PER_THREAD.get().serialize(stream);
ThriftFieldData fieldData = new ThriftFieldData().setTokenStreamValue(serializedStream);
ThriftField field = new ThriftField()
.setFieldConfigId(idMapping.getFieldID(fieldName)).setFieldData(fieldData);
doc.addToFields(field);
return this;
}
}

View File

@ -1,25 +0,0 @@
# Library for Schema.java and other utilities with minimal dependencies.
java_library(
name = "base",
sources = ["*.java"],
platform = "java8",
provides = artifact(
org = "com.twitter.search.common",
name = "schema-base",
repo = artifactory,
),
tags = ["bazel-compatible"],
dependencies = [
"3rdparty/jvm/com/google/code/findbugs:jsr305",
"3rdparty/jvm/com/google/guava",
"3rdparty/jvm/commons-lang",
"3rdparty/jvm/org/apache/lucene:lucene-analyzers-common",
"3rdparty/jvm/org/apache/lucene:lucene-core",
"3rdparty/jvm/org/apache/lucene:lucene-facet",
"3rdparty/jvm/org/apache/thrift:libthrift",
"src/java/com/twitter/common/base",
"src/java/com/twitter/common/text/util:token-util",
"src/thrift/com/twitter/search/common:features-java",
"src/thrift/com/twitter/search/common:schema-java",
],
)

View File

@ -1,374 +0,0 @@
package com.twitter.search.common.schema.base;
import javax.annotation.Nullable;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.IndexOptions;
import com.twitter.common.text.util.TokenStreamSerializer;
import com.twitter.search.common.schema.thriftjava.ThriftCSFType;
import com.twitter.search.common.schema.thriftjava.ThriftCSFViewSettings;
import com.twitter.search.common.schema.thriftjava.ThriftFeatureUpdateConstraint;
/**
* An extension of Lucene's {@link FieldType} that contains additional Earlybird-specific settings.
* Lucene IndexingChains can downcast the FieldType object to access these additional settings.
*/
public class EarlybirdFieldType extends FieldType {
public static final EarlybirdFieldType LONG_CSF_FIELD_TYPE = new EarlybirdFieldType();
public static final EarlybirdFieldType INT_CSF_FIELD_TYPE = new EarlybirdFieldType();
public static final EarlybirdFieldType BYTE_CSF_FIELD_TYPE = new EarlybirdFieldType();
static {
LONG_CSF_FIELD_TYPE.setCsfType(ThriftCSFType.LONG);
LONG_CSF_FIELD_TYPE.setDocValuesType(DocValuesType.NUMERIC);
LONG_CSF_FIELD_TYPE.setCsfLoadIntoRam(true);
LONG_CSF_FIELD_TYPE.freeze();
INT_CSF_FIELD_TYPE.setCsfType(ThriftCSFType.INT);
INT_CSF_FIELD_TYPE.setDocValuesType(DocValuesType.NUMERIC);
INT_CSF_FIELD_TYPE.setCsfLoadIntoRam(true);
INT_CSF_FIELD_TYPE.freeze();
BYTE_CSF_FIELD_TYPE.setCsfType(ThriftCSFType.BYTE);
BYTE_CSF_FIELD_TYPE.setDocValuesType(DocValuesType.NUMERIC);
BYTE_CSF_FIELD_TYPE.setCsfLoadIntoRam(true);
BYTE_CSF_FIELD_TYPE.freeze();
}
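// Usage sketch (hypothetical field name): the frozen constants above can back simple
// numeric doc-values fields directly, e.g. via the CSFField wrapper defined earlier in
// this diff (fieldData is assumed to carry a long value):
//
//   doc.add(new SchemaDocumentFactory.CSFField(
//       "created_at_csf", EarlybirdFieldType.LONG_CSF_FIELD_TYPE, fieldData));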
private boolean storePerPositionPayloads;
private int defaultPayloadLength;
// This is true for fields that become immutable after optimization
private boolean becomesImmutable = true;
private boolean supportOrderedTerms;
private boolean supportTermTextLookup;
private boolean indexHFTermPairs;
/**
* This flag turns on tweet specific normalizations.
* This turns on the following two token processors:
* {@link com.twitter.search.common.util.text.splitter.HashtagMentionPunctuationSplitter}
* {@link com.twitter.search.common.util.text.filter.NormalizedTokenFilter}
*
* HashtagMentionPunctuationSplitter would break a mention or hashtag like @ab_cd or #ab_cd into
* tokens {ab, cd}.
* NormalizedTokenFilter strips out the # @ $ from the tokens.
*
*
* @deprecated we should remove this flag. It is confusing to have Earlybird apply additional
* tokenization on top of what ingester produced.
*/
@Deprecated
private boolean useTweetSpecificNormalization;
@Nullable
private TokenStreamSerializer.Builder tokenStreamSerializerProvider = null;
// csf type settings
private ThriftCSFType csfType;
private boolean csfVariableLength;
private int csfFixedLengthNumValuesPerDoc;
private boolean csfFixedLengthUpdateable;
private boolean csfLoadIntoRam;
private boolean csfDefaultValueSet;
private long csfDefaultValue;
// True if this is a CSF field which is a view on top of a different CSF field
private boolean csfViewField;
// If this field is a csf view, this is the ID of the CSF field backing the view
private int csfViewBaseFieldId;
private FeatureConfiguration csfViewFeatureConfiguration;
// facet field settings
private String facetName;
private boolean storeFacetSkiplist;
private boolean storeFacetOffensiveCounters;
private boolean useCSFForFacetCounting;
// Determines if this field is indexed
private boolean indexedField = false;
// search field settings
// whether a field should be searched by default
private boolean textSearchableByDefault = false;
private float textSearchableFieldWeight = 1.0f;
// For indexed numerical fields
private IndexedNumericFieldSettings numericFieldSettings = null;
public boolean isStorePerPositionPayloads() {
return storePerPositionPayloads;
}
public void setStorePerPositionPayloads(boolean storePerPositionPayloads) {
checkIfFrozen();
this.storePerPositionPayloads = storePerPositionPayloads;
}
public int getDefaultPayloadLength() {
return defaultPayloadLength;
}
public void setDefaultPayloadLength(int defaultPayloadLength) {
checkIfFrozen();
this.defaultPayloadLength = defaultPayloadLength;
}
public boolean becomesImmutable() {
return becomesImmutable;
}
public void setBecomesImmutable(boolean becomesImmutable) {
checkIfFrozen();
this.becomesImmutable = becomesImmutable;
}
public boolean isSupportOrderedTerms() {
return supportOrderedTerms;
}
public void setSupportOrderedTerms(boolean supportOrderedTerms) {
checkIfFrozen();
this.supportOrderedTerms = supportOrderedTerms;
}
public boolean isSupportTermTextLookup() {
return supportTermTextLookup;
}
public void setSupportTermTextLookup(boolean supportTermTextLookup) {
this.supportTermTextLookup = supportTermTextLookup;
}
@Nullable
public TokenStreamSerializer getTokenStreamSerializer() {
return tokenStreamSerializerProvider == null ? null : tokenStreamSerializerProvider.safeBuild();
}
public void setTokenStreamSerializerBuilder(TokenStreamSerializer.Builder provider) {
checkIfFrozen();
this.tokenStreamSerializerProvider = provider;
}
public ThriftCSFType getCsfType() {
return csfType;
}
public void setCsfType(ThriftCSFType csfType) {
checkIfFrozen();
this.csfType = csfType;
}
public boolean isCsfVariableLength() {
return csfVariableLength;
}
public int getCsfFixedLengthNumValuesPerDoc() {
return csfFixedLengthNumValuesPerDoc;
}
public void setCsfVariableLength() {
checkIfFrozen();
this.csfVariableLength = true;
}
/**
* Make the field a fixed length CSF, with the given length.
*/
public void setCsfFixedLengthSettings(int csfFixedLengthNumValuesPerDocument,
boolean isCsfFixedLengthUpdateable) {
checkIfFrozen();
this.csfVariableLength = false;
this.csfFixedLengthNumValuesPerDoc = csfFixedLengthNumValuesPerDocument;
this.csfFixedLengthUpdateable = isCsfFixedLengthUpdateable;
}
public boolean isCsfFixedLengthUpdateable() {
return csfFixedLengthUpdateable;
}
public boolean isCsfLoadIntoRam() {
return csfLoadIntoRam;
}
public void setCsfLoadIntoRam(boolean csfLoadIntoRam) {
checkIfFrozen();
this.csfLoadIntoRam = csfLoadIntoRam;
}
public void setCsfDefaultValue(long defaultValue) {
checkIfFrozen();
this.csfDefaultValue = defaultValue;
this.csfDefaultValueSet = true;
}
public long getCsfDefaultValue() {
return csfDefaultValue;
}
public boolean isCsfDefaultValueSet() {
return csfDefaultValueSet;
}
public String getFacetName() {
return facetName;
}
public void setFacetName(String facetName) {
checkIfFrozen();
this.facetName = facetName;
}
public boolean isStoreFacetSkiplist() {
return storeFacetSkiplist;
}
public void setStoreFacetSkiplist(boolean storeFacetSkiplist) {
checkIfFrozen();
this.storeFacetSkiplist = storeFacetSkiplist;
}
public boolean isStoreFacetOffensiveCounters() {
return storeFacetOffensiveCounters;
}
public void setStoreFacetOffensiveCounters(boolean storeFacetOffensiveCounters) {
checkIfFrozen();
this.storeFacetOffensiveCounters = storeFacetOffensiveCounters;
}
public boolean isUseCSFForFacetCounting() {
return useCSFForFacetCounting;
}
public void setUseCSFForFacetCounting(boolean useCSFForFacetCounting) {
checkIfFrozen();
this.useCSFForFacetCounting = useCSFForFacetCounting;
}
public boolean isFacetField() {
return facetName != null && !StringUtils.isEmpty(facetName);
}
public boolean isIndexHFTermPairs() {
return indexHFTermPairs;
}
public void setIndexHFTermPairs(boolean indexHFTermPairs) {
checkIfFrozen();
this.indexHFTermPairs = indexHFTermPairs;
}
public boolean acceptPretokenizedField() {
return tokenStreamSerializerProvider != null;
}
/**
* Test whether this field uses additional Twitter-specific tokenization.
* @deprecated should avoid doing additional tokenizations on top of what ingester produced.
*/
@Deprecated
public boolean useTweetSpecificNormalization() {
return useTweetSpecificNormalization;
}
/**
* Set whether this field uses additional Twitter-specific tokenization.
* @deprecated should avoid doing additional tokenizations on top of what ingester produced.
*/
@Deprecated
public void setUseTweetSpecificNormalization(boolean useTweetSpecificNormalization) {
checkIfFrozen();
this.useTweetSpecificNormalization = useTweetSpecificNormalization;
}
public boolean isIndexedField() {
return indexedField;
}
public void setIndexedField(boolean indexedField) {
this.indexedField = indexedField;
}
public boolean isTextSearchableByDefault() {
return textSearchableByDefault;
}
public void setTextSearchableByDefault(boolean textSearchableByDefault) {
checkIfFrozen();
this.textSearchableByDefault = textSearchableByDefault;
}
public float getTextSearchableFieldWeight() {
return textSearchableFieldWeight;
}
public void setTextSearchableFieldWeight(float textSearchableFieldWeight) {
checkIfFrozen();
this.textSearchableFieldWeight = textSearchableFieldWeight;
}
/**
* Convenience method to find out if this field stores positions. {@link #indexOptions()} can also
* be used to determine the index options for this field.
*/
public final boolean hasPositions() {
return indexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS
|| indexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
}
public boolean isCsfViewField() {
return csfViewField;
}
public int getCsfViewBaseFieldId() {
return csfViewBaseFieldId;
}
public FeatureConfiguration getCsfViewFeatureConfiguration() {
return csfViewFeatureConfiguration;
}
/**
* Set the CSF view settings. A CSF view is a portion of an another CSF.
*/
public void setCsfViewSettings(String fieldName,
ThriftCSFViewSettings csfViewSettings,
Schema.FieldInfo baseField) {
checkIfFrozen();
this.csfViewField = true;
this.csfViewBaseFieldId = csfViewSettings.getBaseFieldConfigId();
FeatureConfiguration.Builder builder = FeatureConfiguration.builder()
.withName(fieldName)
.withType(csfViewSettings.csfType)
.withBitRange(csfViewSettings.getValueIndex(),
csfViewSettings.getBitStartPosition(),
csfViewSettings.getBitLength())
.withBaseField(baseField.getName());
if (csfViewSettings.isSetOutputCSFType()) {
builder.withOutputType(csfViewSettings.getOutputCSFType());
}
if (csfViewSettings.isSetNormalizationType()) {
builder.withFeatureNormalizationType(csfViewSettings.getNormalizationType());
}
if (csfViewSettings.isSetFeatureUpdateConstraints()) {
for (ThriftFeatureUpdateConstraint c : csfViewSettings.getFeatureUpdateConstraints()) {
builder.withFeatureUpdateConstraint(c);
}
}
this.csfViewFeatureConfiguration = builder.build();
}
public IndexedNumericFieldSettings getNumericFieldSettings() {
return numericFieldSettings;
}
public void setNumericFieldSettings(IndexedNumericFieldSettings numericFieldSettings) {
checkIfFrozen();
this.numericFieldSettings = numericFieldSettings;
}
}

View File

@ -1,316 +0,0 @@
package com.twitter.search.common.schema.base;
import java.util.Set;
import javax.annotation.Nullable;
import com.google.common.base.Preconditions;
import com.google.common.collect.Sets;
import com.twitter.common.base.MorePreconditions;
import com.twitter.search.common.schema.thriftjava.ThriftCSFType;
import com.twitter.search.common.schema.thriftjava.ThriftFeatureNormalizationType;
import com.twitter.search.common.schema.thriftjava.ThriftFeatureUpdateConstraint;
// FeatureConfiguration is defined for all the column stride view fields.
public final class FeatureConfiguration {
private final String name;
private final int intIndex;
// Start position in the given int (0-31)
private final int bitStartPos;
// Length in bits of the feature
private final int bitLength;
// precomputed for reuse
private final int bitMask;
private final int inverseBitMask;
private final int maxValue;
private final ThriftCSFType type;
// This is the client-seen feature type; if it is null, this field is unused.
@Nullable
private final ThriftCSFType outputType;
private final String baseField;
private final Set<FeatureConstraint> featureUpdateConstraints;
private final ThriftFeatureNormalizationType featureNormalizationType;
/**
* Creates a new FeatureConfiguration with a base field.
*
* @param intIndex which integer is the feature in (0 based).
* @param bitStartPos at which bit does the feature start (0-31).
* @param bitLength length in bits of the feature
* @param baseField the CSF this feature is stored within.
*/
private FeatureConfiguration(
String name,
ThriftCSFType type,
ThriftCSFType outputType,
int intIndex,
int bitStartPos,
int bitLength,
String baseField,
Set<FeatureConstraint> featureUpdateConstraints,
ThriftFeatureNormalizationType featureNormalizationType) {
Preconditions.checkState(bitStartPos + bitLength <= Integer.SIZE,
"Feature must not cross int boundary.");
this.name = MorePreconditions.checkNotBlank(name);
this.type = Preconditions.checkNotNull(type);
this.outputType = outputType;
this.intIndex = intIndex;
this.bitStartPos = bitStartPos;
this.bitLength = bitLength;
// Technically, int-sized features can use all 32 bits to store a positive value greater than
// Integer.MAX_VALUE. But in practice, we will convert the values of those features to Java ints
// on the read side, so the max value for those features will still be Integer.MAX_VALUE.
this.maxValue = (1 << Math.min(bitLength, Integer.SIZE - 1)) - 1;
this.bitMask = (int) (((1L << bitLength) - 1) << bitStartPos);
this.inverseBitMask = ~bitMask;
this.baseField = baseField;
this.featureUpdateConstraints = featureUpdateConstraints;
this.featureNormalizationType = Preconditions.checkNotNull(featureNormalizationType);
}
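// Packing example for the fields computed above: bitStartPos=4 and bitLength=8 give
// bitMask 0x00000FF0, inverseBitMask 0xFFFFF00F and maxValue 255, i.e. the feature
// occupies bits 4-11 of the int at intIndex.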
public String getName() {
return name;
}
public int getMaxValue() {
return maxValue;
}
@Override
public String toString() {
return new StringBuilder().append(name)
.append(" (").append(intIndex).append(", ")
.append(bitStartPos).append(", ")
.append(bitLength).append(") ").toString();
}
public int getValueIndex() {
return intIndex;
}
public int getBitStartPosition() {
return bitStartPos;
}
public int getBitLength() {
return bitLength;
}
public int getBitMask() {
return bitMask;
}
public int getInverseBitMask() {
return inverseBitMask;
}
public String getBaseField() {
return baseField;
}
public ThriftCSFType getType() {
return type;
}
@Nullable
public ThriftCSFType getOutputType() {
return outputType;
}
public ThriftFeatureNormalizationType getFeatureNormalizationType() {
return featureNormalizationType;
}
/**
* Returns the update constraint for the feature.
*/
public Set<ThriftFeatureUpdateConstraint> getUpdateConstraints() {
if (featureUpdateConstraints == null) {
return null;
}
Set<ThriftFeatureUpdateConstraint> constraintSet = Sets.newHashSet();
for (FeatureConstraint constraint : featureUpdateConstraints) {
constraintSet.add(constraint.getType());
}
return constraintSet;
}
/**
* Returns true if the given update satisfies all feature update constraints.
*/
public boolean validateFeatureUpdate(final Number oldValue, final Number newValue) {
if (featureUpdateConstraints != null) {
for (FeatureConstraint constraint : featureUpdateConstraints) {
if (!constraint.apply(oldValue, newValue)) {
return false;
}
}
}
return true;
}
@Override
public int hashCode() {
return (name == null ? 0 : name.hashCode())
+ intIndex * 7
+ bitStartPos * 13
+ bitLength * 23
+ bitMask * 31
+ inverseBitMask * 43
+ maxValue * 53
+ (type == null ? 0 : type.hashCode()) * 61
+ (outputType == null ? 0 : outputType.hashCode()) * 71
+ (baseField == null ? 0 : baseField.hashCode()) * 83
+ (featureUpdateConstraints == null ? 0 : featureUpdateConstraints.hashCode()) * 87
+ (featureNormalizationType == null ? 0 : featureNormalizationType.hashCode()) * 97;
}
@Override
public boolean equals(Object obj) {
if (!(obj instanceof FeatureConfiguration)) {
return false;
}
FeatureConfiguration featureConfiguration = FeatureConfiguration.class.cast(obj);
return (name == null
? featureConfiguration.name == null : name.equals(featureConfiguration.name))
&& (intIndex == featureConfiguration.intIndex)
&& (bitStartPos == featureConfiguration.bitStartPos)
&& (bitLength == featureConfiguration.bitLength)
&& (bitMask == featureConfiguration.bitMask)
&& (inverseBitMask == featureConfiguration.inverseBitMask)
&& (maxValue == featureConfiguration.maxValue)
&& (type == featureConfiguration.type)
&& (outputType == featureConfiguration.outputType)
&& (baseField == null
? featureConfiguration.baseField == null
: baseField.equals(featureConfiguration.baseField))
&& (featureUpdateConstraints == null
? featureConfiguration.featureUpdateConstraints == null
: featureUpdateConstraints.equals(featureConfiguration.featureUpdateConstraints))
&& (featureNormalizationType == null
? featureConfiguration.featureNormalizationType == null
: featureNormalizationType.equals(featureConfiguration.featureNormalizationType));
}
private interface FeatureConstraint {
boolean apply(Number oldValue, Number newValue);
ThriftFeatureUpdateConstraint getType();
}
public static Builder builder() {
return new Builder();
}
public static final class Builder {
private String name;
private ThriftCSFType type;
private ThriftCSFType outputType;
private int intIndex;
// Start position in the given int (0-31)
private int bitStartPos;
// Length in bits of the feature
private int bitLength;
private String baseField;
private Set<FeatureConstraint> featureUpdateConstraints;
private ThriftFeatureNormalizationType featureNormalizationType =
ThriftFeatureNormalizationType.NONE;
public FeatureConfiguration build() {
return new FeatureConfiguration(name, type, outputType, intIndex, bitStartPos, bitLength,
baseField, featureUpdateConstraints, featureNormalizationType);
}
public Builder withName(String n) {
this.name = n;
return this;
}
public Builder withType(ThriftCSFType featureType) {
this.type = featureType;
return this;
}
public Builder withOutputType(ThriftCSFType featureFeatureType) {
this.outputType = featureFeatureType;
return this;
}
public Builder withFeatureNormalizationType(
ThriftFeatureNormalizationType normalizationType) {
this.featureNormalizationType = Preconditions.checkNotNull(normalizationType);
return this;
}
/**
* Sets the bit range at the given intIndex, startPos and length.
*/
public Builder withBitRange(int index, int startPos, int length) {
this.intIndex = index;
this.bitStartPos = startPos;
this.bitLength = length;
return this;
}
public Builder withBaseField(String baseFieldName) {
this.baseField = baseFieldName;
return this;
}
/**
* Adds a feature update constraint.
*/
public Builder withFeatureUpdateConstraint(final ThriftFeatureUpdateConstraint constraint) {
if (featureUpdateConstraints == null) {
featureUpdateConstraints = Sets.newHashSet();
}
switch (constraint) {
case IMMUTABLE:
featureUpdateConstraints.add(new FeatureConstraint() {
@Override public boolean apply(Number oldValue, Number newValue) {
return false;
}
@Override public ThriftFeatureUpdateConstraint getType() {
return ThriftFeatureUpdateConstraint.IMMUTABLE;
}
});
break;
case INC_ONLY:
featureUpdateConstraints.add(new FeatureConstraint() {
@Override public boolean apply(Number oldValue, Number newValue) {
return newValue.intValue() > oldValue.intValue();
}
@Override public ThriftFeatureUpdateConstraint getType() {
return ThriftFeatureUpdateConstraint.INC_ONLY;
}
});
break;
case POSITIVE:
featureUpdateConstraints.add(new FeatureConstraint() {
@Override public boolean apply(Number oldValue, Number newValue) {
return newValue.intValue() >= 0;
}
@Override public ThriftFeatureUpdateConstraint getType() {
return ThriftFeatureUpdateConstraint.POSITIVE;
}
});
break;
default:
}
return this;
}
private Builder() {
}
}
}

Some files were not shown because too many files have changed in this diff.