<?xml version="1.0" encoding="UTF-8"?>


<!-- Ingesters process tweet create events from TweetyPie and write them to a queue for Earlybird
to index. -->
<pipeline>
  <!-- Top-level pipeline setup: a SimplePipelineValidator, an ObjectProcessedEventCounter listener,
       and the driver factory with id "kafka" that the top-level stages reference through
       driverFactoryId="kafka". -->
  <property
      propName="validator"
      className="org.apache.commons.pipeline.validation.SimplePipelineValidator"/>
  <listener
      className="org.apache.commons.pipeline.listener.ObjectProcessedEventCounter"/>
  <driverFactory
      className="org.apache.commons.pipeline.driver.DedicatedThreadStageDriverFactory"
      id="kafka">

    <!-- Queue factory for this driver factory: ArrayBlockingQueueFactory with capacity="1000" and
         fair="false". -->
    <property
        propName="queueFactory"
        className="org.apache.commons.pipeline.util.BlockingQueueFactory$ArrayBlockingQueueFactory"
        capacity="1000"
        fair="false"/>
  </driverFactory>

  <!-- Read tweets from the thrift kafka queue. The reader loops forever.
       The connection attributes (kafkaClusterPath, kafkaClientId, kafkaTopicName,
       kafkaConsumerGroupId) and deciderKey are blank in this file.
       NOTE(review): presumably they are supplied per deployment; confirm.
       NOTE(review): maxPollRecords="1" and pollTimeoutMs="1000" presumably mean one record per
       poll with a 1000 ms poll timeout; confirm against the stage implementation. -->
  <stage
      className="com.twitter.search.ingester.pipeline.twitter.kafka.KafkaRawRecordConsumerStage"
      kafkaClusterPath=""
      kafkaClientId=""
      kafkaTopicName=""
      kafkaConsumerGroupId=""
      maxPollRecords="1"
      pollTimeoutMs="1000"
      partitioned="false"
      deciderKey=""
      driverFactoryId="kafka"/>

  <!-- Deserialize the raw bytes read by the consumer stage into TweetData. -->
  <stage
      className="com.twitter.search.ingester.pipeline.twitter.TweetEventDeserializerStage"
      driverFactoryId="kafka"/>

  <!-- Keep only the events whose safety type is the one for this cluster (safetyType="PUBLIC").
       NOTE(review): tweetCreateLatencyLogThresholdMillis="5000" presumably is the tweet create
       latency, in milliseconds, above which the stage logs; confirm. -->
  <stage
      className="com.twitter.search.ingester.pipeline.twitter.FilterEventsBySafetyTypeStage"
      tweetCreateLatencyLogThresholdMillis="5000"
      safetyType="PUBLIC"
      driverFactoryId="kafka"/>

  <!-- Parse to TwitterMessage.
       tweetCreateEventBranchNames and tweetDeleteEventBranchNames name the branch pipelines
       (by their key) configured for create events and delete events respectively; both branches
       are declared directly after this stage. -->
  <stage
      className="com.twitter.search.ingester.pipeline.twitter.ThriftTweetParserStage"
      tweetCreateEventBranchNames="kafka_retweet_and_reply"
      tweetDeleteEventBranchNames="kafka_update_events_delete"
      driverFactoryId="kafka"/>

  <!-- Branch for tweet delete events. Its key matches tweetDeleteEventBranchNames on
       ThriftTweetParserStage. -->
  <branch>
    <pipeline key="kafka_update_events_delete">
      <property
          propName="validator"
          className="org.apache.commons.pipeline.validation.SimplePipelineValidator"/>
      <listener
          className="org.apache.commons.pipeline.listener.ObjectProcessedEventCounter"/>
      <driverFactory
          className="org.apache.commons.pipeline.driver.DedicatedThreadStageDriverFactory"
          id="kafka_update_events_delete">

        <!-- We are willing to queue more deletes than other stages,
             to make sure we don't slow down the incoming tweets.
             NOTE(review): capacity is 1000, the same as every other driver factory in this file,
             so this queue is not actually larger; confirm the intended value. -->
        <property
            propName="queueFactory"
            className="org.apache.commons.pipeline.util.BlockingQueueFactory$ArrayBlockingQueueFactory"
            capacity="1000"
            fair="false"/>
      </driverFactory>

      <!-- Kafka producer stage for the delete update events. kafkaClusterPath, kafkaClientId and
           kafkaTopicName are blank in this file. -->
      <stage
          className="com.twitter.search.ingester.pipeline.twitter.kafka.DeleteUpdateEventsKafkaProducerStage"
          kafkaClusterPath=""
          kafkaClientId=""
          kafkaTopicName=""
          driverFactoryId="kafka_update_events_delete"/>
    </pipeline>
  </branch>

  <!-- Processes retweets and replies to tweets. The branch key matches
       tweetCreateEventBranchNames on ThriftTweetParserStage. -->
  <branch>
    <pipeline key="kafka_retweet_and_reply">
      <property
          propName="validator"
          className="org.apache.commons.pipeline.validation.SimplePipelineValidator"/>
      <listener
          className="org.apache.commons.pipeline.listener.ObjectProcessedEventCounter"/>
      <driverFactory
          className="org.apache.commons.pipeline.driver.DedicatedThreadStageDriverFactory"
          id="kafka_retweet_and_reply">

        <property
            propName="queueFactory"
            className="org.apache.commons.pipeline.util.BlockingQueueFactory$ArrayBlockingQueueFactory"
            capacity="1000"
            fair="false"/>
      </driverFactory>

      <!-- A reply arriving at this stage is either a tweet directed at someone with an @mention,
           or a direct reply to another tweet. This stage keeps only retweets and direct replies
           to other tweets for the rest of this retweet_and_reply pipeline. -->
      <stage
          className="com.twitter.search.ingester.pipeline.twitter.FilterRetweetsAndRepliesStage"
          driverFactoryId="kafka_retweet_and_reply"/>

      <!-- NOTE(review): judging by the class name, converts the remaining messages to
           ThriftVersionedEvents for the producer stage that follows; confirm. -->
      <stage
          className="com.twitter.search.ingester.pipeline.twitter.ConvertToThriftVersionedEventsStage"
          driverFactoryId="kafka_retweet_and_reply"/>

      <!-- Kafka producer stage for the retweet and reply update events. kafkaClusterPath,
           kafkaClientId and kafkaTopicName are blank in this file. -->
      <stage
          className="com.twitter.search.ingester.pipeline.twitter.kafka.RetweetAndReplyUpdateEventsKafkaProducerStage"
          kafkaClusterPath=""
          kafkaClientId=""
          kafkaTopicName=""
          driverFactoryId="kafka_retweet_and_reply"/>
    </pipeline>
  </branch>

  <!-- Filter out messages that are not formatted correctly. -->
  <stage
      className="com.twitter.search.ingester.pipeline.twitter.FilterTwitterMessageStage"
      driverFactoryId="kafka"/>

  <!-- Retrieve space ids from the space URLs, if the tweet has any space URLs. -->
  <stage
      className="com.twitter.search.ingester.pipeline.twitter.RetrieveSpaceIdsStage"
      driverFactoryId="kafka"/>


  <!-- Look up user reputation scores for each message.
       NOTE(review): the class name (LookupUserProperties...) suggests that user properties other
       than reputation may be looked up as well; confirm and widen this comment if so. -->
  <stage
      className="com.twitter.search.ingester.pipeline.twitter.LookupUserPropertiesBatchedStage"
      driverFactoryId="kafka"/>

  <!-- Extract text features of the message. -->
  <stage
      className="com.twitter.search.ingester.pipeline.twitter.TextFeatureExtractionWorkersStage"
      driverFactoryId="kafka"/>

  <!-- Compute the text quality score of the message. -->
  <stage
      className="com.twitter.search.ingester.pipeline.twitter.TextQualityEvaluationWorkerStage"
      driverFactoryId="kafka"/>

  <!-- Extract lat/lon pairs from the text and geocode them. -->
  <stage
      className="com.twitter.search.ingester.pipeline.twitter.SingleTweetExtractAndGeocodeLatLonStage"
      driverFactoryId="kafka"/>

  <!-- Add coded locations. -->
  <stage
      className="com.twitter.search.ingester.pipeline.twitter.PopulateCodedLocationsBatchedStage"
      driverFactoryId="kafka"/>

  <!-- Convert the TwitterMessages into ThriftStatuses.
       thriftVersionedEventsBranchName names (by key) the kafka_base_tweets branch pipeline that is
       declared directly after this stage. -->
  <stage
      className="com.twitter.search.ingester.pipeline.twitter.ConvertMessageToThriftStage"
      thriftVersionedEventsBranchName="kafka_base_tweets"
      driverFactoryId="kafka"/>

  <!-- Branch for tweets. Its key matches thriftVersionedEventsBranchName on
       ConvertMessageToThriftStage. -->
  <branch>
    <pipeline key="kafka_base_tweets">
      <property
          propName="validator"
          className="org.apache.commons.pipeline.validation.SimplePipelineValidator"/>
      <listener
          className="org.apache.commons.pipeline.listener.ObjectProcessedEventCounter"/>
      <driverFactory
          className="org.apache.commons.pipeline.driver.DedicatedThreadStageDriverFactory"
          id="kafka_base_tweets">

        <property
            propName="queueFactory"
            className="org.apache.commons.pipeline.util.BlockingQueueFactory$ArrayBlockingQueueFactory"
            capacity="1000"
            fair="false"/>
      </driverFactory>

      <!-- Kafka producer stage for the tweet ThriftVersionedEvents. This is the only producer in
           this file with a non-blank topic name; kafkaClusterPath and kafkaClientId are blank. -->
      <stage
          className="com.twitter.search.ingester.pipeline.twitter.kafka.TweetThriftVersionedEventsKafkaProducerStage"
          kafkaClusterPath=""
          kafkaClientId=""
          kafkaTopicName="search_ingester_indexing_events_realtime_prod"
          driverFactoryId="kafka_base_tweets"/>
    </pipeline>
  </branch>

  <!-- Resolve compressed URLs via Pink, using pinkClientId="INGESTER".
       NOTE(review): batchedStageBatchSize="10" presumably is the number of items per batch, and
       the unit of tweetMaxAgeToResolve="10000" is not visible here (presumably milliseconds);
       confirm both against the stage implementation. -->
  <stage
      className="com.twitter.search.ingester.pipeline.twitter.ResolveCompressedUrlsBatchedStage"
      pinkClientId="INGESTER"
      batchedStageBatchSize="10"
      tweetMaxAgeToResolve="10000"
      driverFactoryId="kafka"/>

  <!-- Retrieve card information, using tweetypieClientId="ingester.prod".
       NOTE(review): internalBatchSize="50" presumably is the batch size of the card lookups;
       confirm. -->
  <stage
      className="com.twitter.search.ingester.pipeline.twitter.RetrieveCardBatchedStage"
      tweetypieClientId="ingester.prod"
      internalBatchSize="50"
      driverFactoryId="kafka"/>

  <!-- Retrieve named entities. -->
  <stage
      className="com.twitter.search.ingester.pipeline.twitter.RetrieveNamedEntitiesSingleTweetStage"
      driverFactoryId="kafka"/>

  <!-- Retrieve the space admins and title for a tweet, if the tweet has any space URLs. -->
  <stage
      className="com.twitter.search.ingester.pipeline.twitter.RetrieveSpaceAdminsAndTitleStage"
      driverFactoryId="kafka"/>

  <!-- Extract text features of the message.
       NOTE(review): this description is word for word the one used for
       TextFeatureExtractionWorkersStage earlier in this pipeline. The class name
       (TextUrlsFeatureExtraction...) suggests this pass covers URL-derived text; confirm and
       make the comment specific. -->
  <stage
      className="com.twitter.search.ingester.pipeline.twitter.TextUrlsFeatureExtractionStage"
      driverFactoryId="kafka"/>

  <!-- Compute the tweet signature. -->
  <stage
      className="com.twitter.search.ingester.pipeline.twitter.ComputeTweetSignatureStage"
      driverFactoryId="kafka"/>

  <!-- Convert the TwitterMessages into ThriftStatuses.
       NOTE(review): this description is the same as the one on ConvertMessageToThriftStage. The
       class name (ConvertDelayedMessage...) suggests this is the conversion for the fields filled
       in by the later stages (URLs, cards, named entities, spaces, signature); confirm. -->
  <stage
      className="com.twitter.search.ingester.pipeline.twitter.ConvertDelayedMessageToThriftStage"
      driverFactoryId="kafka"/>

  <!-- Last stage of the top-level pipeline: a Kafka producer of the same class as the one in the
       kafka_base_tweets branch, configured here with stageName="UpdateEvents".
       kafkaClusterPath, kafkaClientId and kafkaTopicName are blank in this file.
       NOTE(review): presumably stageName distinguishes this instance from the base tweets
       producer (e.g. in stats or logs); confirm. -->
  <stage
      className="com.twitter.search.ingester.pipeline.twitter.kafka.TweetThriftVersionedEventsKafkaProducerStage"
      kafkaClusterPath=""
      stageName="UpdateEvents"
      kafkaClientId=""
      kafkaTopicName=""
      driverFactoryId="kafka"/>
</pipeline>