the-algorithm/science/search/ingester/config/pipeline-ingester.realtime.xml

241 lines
8.8 KiB
XML
Raw Normal View History

2023-03-31 19:53:53 +02:00
<?xml version="1.0" encoding="UTF-8"?>
<!-- Ingesters process tweet create events from TweetyPie and write them to a queue for Earlybird
to index. -->
<pipeline>
<!-- Pipeline-wide setup: the validator, a processed-object event counter, and the "kafka"
     driver factory that main-line stages select through driverFactoryId="kafka". -->
<property propName="validator"
          className="org.apache.commons.pipeline.validation.SimplePipelineValidator"/>
<listener className="org.apache.commons.pipeline.listener.ObjectProcessedEventCounter"/>
<!-- The queueFactory property below configures a non-fair ArrayBlockingQueue factory
     with capacity 1000. -->
<driverFactory className="org.apache.commons.pipeline.driver.DedicatedThreadStageDriverFactory"
               id="kafka">
  <property propName="queueFactory"
            className="org.apache.commons.pipeline.util.BlockingQueueFactory$ArrayBlockingQueueFactory"
            capacity="1000"
            fair="false"/>
</driverFactory>
<!-- Source stage: reads tweets from the thrift kafka queue; the reader loops forever.
     Configured with maxPollRecords=1, pollTimeoutMs=1000 and partitioned=false.
     The cluster path, client id, topic name, consumer group id and decider key are blank
     in this file. -->
<stage className="com.twitter.search.ingester.pipeline.twitter.kafka.KafkaRawRecordConsumerStage"
       kafkaClusterPath=""
       kafkaClientId=""
       kafkaTopicName=""
       kafkaConsumerGroupId=""
       maxPollRecords="1"
       pollTimeoutMs="1000"
       partitioned="false"
       deciderKey=""
       driverFactoryId="kafka"/>
<!-- Turn the raw bytes into TweetData. -->
<stage className="com.twitter.search.ingester.pipeline.twitter.TweetEventDeserializerStage"
       driverFactoryId="kafka"/>
<!-- Keep only events with the safety type served by this cluster (safetyType="PUBLIC").
     tweetCreateLatencyLogThresholdMillis is set to 5000. -->
<stage className="com.twitter.search.ingester.pipeline.twitter.FilterEventsBySafetyTypeStage"
       tweetCreateLatencyLogThresholdMillis="5000"
       safetyType="PUBLIC"
       driverFactoryId="kafka"/>
<!-- Parse events into TwitterMessage. The two branch-name attributes match the keys of the
     branch pipelines declared in this file: kafka_retweet_and_reply for tweet create events
     and kafka_update_events_delete for tweet delete events. -->
<stage className="com.twitter.search.ingester.pipeline.twitter.ThriftTweetParserStage"
       tweetCreateEventBranchNames="kafka_retweet_and_reply"
       tweetDeleteEventBranchNames="kafka_update_events_delete"
       driverFactoryId="kafka"/>
<!-- Branch keyed kafka_update_events_delete (the name listed in ThriftTweetParserStage's
     tweetDeleteEventBranchNames). Its only stage is DeleteUpdateEventsKafkaProducerStage. -->
<branch>
  <pipeline key="kafka_update_events_delete">
    <property propName="validator"
              className="org.apache.commons.pipeline.validation.SimplePipelineValidator"/>
    <listener className="org.apache.commons.pipeline.listener.ObjectProcessedEventCounter"/>
    <driverFactory className="org.apache.commons.pipeline.driver.DedicatedThreadStageDriverFactory"
                   id="kafka_update_events_delete">
      <!-- Intent: allow more deletes to queue up than other stages do, so that deletes
           never slow down the incoming tweets.
           NOTE(review): capacity below is 1000, the same value as every other driver factory
           visible in this file, so the "more than other stages" intent is not reflected in
           the number. Confirm whether this value or this comment is stale. -->
      <property propName="queueFactory"
                className="org.apache.commons.pipeline.util.BlockingQueueFactory$ArrayBlockingQueueFactory"
                capacity="1000"
                fair="false"/>
    </driverFactory>
    <!-- Kafka cluster path, client id and topic name are blank in this file. -->
    <stage className="com.twitter.search.ingester.pipeline.twitter.kafka.DeleteUpdateEventsKafkaProducerStage"
           kafkaClusterPath=""
           kafkaClientId=""
           kafkaTopicName=""
           driverFactoryId="kafka_update_events_delete"/>
  </pipeline>
</branch>
<!-- Branch keyed kafka_retweet_and_reply: handles retweets and replies to tweets. -->
<branch>
  <pipeline key="kafka_retweet_and_reply">
    <property propName="validator"
              className="org.apache.commons.pipeline.validation.SimplePipelineValidator"/>
    <listener className="org.apache.commons.pipeline.listener.ObjectProcessedEventCounter"/>
    <driverFactory className="org.apache.commons.pipeline.driver.DedicatedThreadStageDriverFactory"
                   id="kafka_retweet_and_reply">
      <property propName="queueFactory"
                className="org.apache.commons.pipeline.util.BlockingQueueFactory$ArrayBlockingQueueFactory"
                capacity="1000"
                fair="false"/>
    </driverFactory>
    <!-- A "reply" arriving here is one of two things: a tweet directed at someone via an
         @mention, or a tweet that directly replies to another tweet. This stage filters
         retweets and direct replies to other tweets into the retweet_and_reply pipeline. -->
    <stage className="com.twitter.search.ingester.pipeline.twitter.FilterRetweetsAndRepliesStage"
           driverFactoryId="kafka_retweet_and_reply"/>
    <stage className="com.twitter.search.ingester.pipeline.twitter.ConvertToThriftVersionedEventsStage"
           driverFactoryId="kafka_retweet_and_reply"/>
    <!-- Kafka cluster path, client id and topic name are blank in this file. -->
    <stage className="com.twitter.search.ingester.pipeline.twitter.kafka.RetweetAndReplyUpdateEventsKafkaProducerStage"
           kafkaClusterPath=""
           kafkaClientId=""
           kafkaTopicName=""
           driverFactoryId="kafka_retweet_and_reply"/>
  </pipeline>
</branch>
<!-- Drop messages that are not formatted correctly. -->
<stage className="com.twitter.search.ingester.pipeline.twitter.FilterTwitterMessageStage"
       driverFactoryId="kafka"/>
<!-- When a tweet contains space URLs, retrieve the space ids from those URLs. -->
<stage className="com.twitter.search.ingester.pipeline.twitter.RetrieveSpaceIdsStage"
       driverFactoryId="kafka"/>
<!-- Look up the user reputation scores for each message. -->
<stage className="com.twitter.search.ingester.pipeline.twitter.LookupUserPropertiesBatchedStage"
       driverFactoryId="kafka"/>
<!-- Extract the text features of the message. -->
<stage className="com.twitter.search.ingester.pipeline.twitter.TextFeatureExtractionWorkersStage"
       driverFactoryId="kafka"/>
<!-- Compute the text quality score of the message. -->
<stage className="com.twitter.search.ingester.pipeline.twitter.TextQualityEvaluationWorkerStage"
       driverFactoryId="kafka"/>
<!-- Pull lat/lon pairs out of the text and geocode them. -->
<stage className="com.twitter.search.ingester.pipeline.twitter.SingleTweetExtractAndGeocodeLatLonStage"
       driverFactoryId="kafka"/>
<!-- Add coded locations to the message. -->
<stage className="com.twitter.search.ingester.pipeline.twitter.PopulateCodedLocationsBatchedStage"
       driverFactoryId="kafka"/>
<!-- Parse the TwitterMessages into ThriftStatuses. thriftVersionedEventsBranchName matches
     the key of the kafka_base_tweets branch pipeline declared in this file. -->
<stage className="com.twitter.search.ingester.pipeline.twitter.ConvertMessageToThriftStage"
       thriftVersionedEventsBranchName="kafka_base_tweets"
       driverFactoryId="kafka"/>
<!-- Branch for tweets, keyed kafka_base_tweets (the name given in
     ConvertMessageToThriftStage's thriftVersionedEventsBranchName). -->
<branch>
  <pipeline key="kafka_base_tweets">
    <property propName="validator"
              className="org.apache.commons.pipeline.validation.SimplePipelineValidator"/>
    <listener className="org.apache.commons.pipeline.listener.ObjectProcessedEventCounter"/>
    <driverFactory className="org.apache.commons.pipeline.driver.DedicatedThreadStageDriverFactory"
                   id="kafka_base_tweets">
      <property propName="queueFactory"
                className="org.apache.commons.pipeline.util.BlockingQueueFactory$ArrayBlockingQueueFactory"
                capacity="1000"
                fair="false"/>
    </driverFactory>
    <!-- Producer stage with kafkaTopicName set to search_ingester_indexing_events_realtime_prod;
         the cluster path and client id are blank in this file. -->
    <stage className="com.twitter.search.ingester.pipeline.twitter.kafka.TweetThriftVersionedEventsKafkaProducerStage"
           kafkaClusterPath=""
           kafkaClientId=""
           kafkaTopicName="search_ingester_indexing_events_realtime_prod"
           driverFactoryId="kafka_base_tweets"/>
  </pipeline>
</branch>
<!-- Resolve compressed URLs via Pink (pinkClientId="INGESTER").
     batchedStageBatchSize is 10 and tweetMaxAgeToResolve is 10000.
     NOTE(review): the unit of tweetMaxAgeToResolve is not stated in this file: confirm. -->
<stage className="com.twitter.search.ingester.pipeline.twitter.ResolveCompressedUrlsBatchedStage"
       pinkClientId="INGESTER"
       batchedStageBatchSize="10"
       tweetMaxAgeToResolve="10000"
       driverFactoryId="kafka"/>
<!-- Retrieve card information. Configured with tweetypieClientId="ingester.prod" and
     internalBatchSize=50. -->
<stage className="com.twitter.search.ingester.pipeline.twitter.RetrieveCardBatchedStage"
       tweetypieClientId="ingester.prod"
       internalBatchSize="50"
       driverFactoryId="kafka"/>
<!-- Retrieve the named entities for the tweet. -->
<stage className="com.twitter.search.ingester.pipeline.twitter.RetrieveNamedEntitiesSingleTweetStage"
       driverFactoryId="kafka"/>
<!-- When a tweet contains space URLs, retrieve the space admins and the space title. -->
<stage className="com.twitter.search.ingester.pipeline.twitter.RetrieveSpaceAdminsAndTitleStage"
       driverFactoryId="kafka"/>
<!-- Extract text features of the message.
     NOTE(review): TextFeatureExtractionWorkersStage earlier in this pipeline carries the same
     description. Judging by the class name alone, this pass presumably covers URL-related
     text features: confirm and make the two descriptions distinct. -->
<stage className="com.twitter.search.ingester.pipeline.twitter.TextUrlsFeatureExtractionStage"
       driverFactoryId="kafka"/>
<!-- Compute the signature of the tweet. -->
<stage className="com.twitter.search.ingester.pipeline.twitter.ComputeTweetSignatureStage"
       driverFactoryId="kafka"/>
<!-- Parse the TwitterMessages into ThriftStatuses.
     NOTE(review): ConvertMessageToThriftStage earlier in this pipeline carries the same
     description. How the "delayed" variant named by this class differs is not visible in
     this file: confirm. -->
<stage className="com.twitter.search.ingester.pipeline.twitter.ConvertDelayedMessageToThriftStage"
       driverFactoryId="kafka"/>
<!-- Last stage of the main pipeline: the same Kafka producer class that the kafka_base_tweets
     branch uses, here configured with stageName="UpdateEvents". The cluster path, client id
     and topic name are blank in this file. -->
<stage className="com.twitter.search.ingester.pipeline.twitter.kafka.TweetThriftVersionedEventsKafkaProducerStage"
       kafkaClusterPath=""
       stageName="UpdateEvents"
       kafkaClientId=""
       kafkaTopicName=""
       driverFactoryId="kafka"/>
</pipeline>