mirror of
https://github.com/twitter/the-algorithm.git
synced 2024-06-01 08:48:46 +02:00
ef4c5eb65e
Please note we have force-pushed a new initial commit in order to remove some publicly-available Twitter user information. Note that this process may be required in the future.
72 lines
2.8 KiB
Scala
72 lines
2.8 KiB
Scala
package com.twitter.interaction_graph.scio.agg_address_book
|
|
|
|
import com.spotify.scio.ScioContext
|
|
import com.spotify.scio.values.SCollection
|
|
import com.twitter.addressbook.matches.thriftscala.UserMatchesRecord
|
|
import com.twitter.beam.io.dal.DAL
|
|
import com.twitter.beam.io.dal.DAL.DiskFormat
|
|
import com.twitter.beam.io.dal.DAL.PathLayout
|
|
import com.twitter.beam.io.dal.DAL.WriteOptions
|
|
import com.twitter.beam.job.ServiceIdentifierOptions
|
|
import com.twitter.scio_internal.job.ScioBeamJob
|
|
import com.twitter.statebird.v2.thriftscala.Environment
|
|
import com.twitter.interaction_graph.thriftscala.Edge
|
|
import com.twitter.interaction_graph.thriftscala.Vertex
|
|
import java.time.Instant
|
|
import org.joda.time.Interval
|
|
|
|
/**
 * Scio/Beam job that reads address-book user-match records, converts them into
 * interaction-graph [[Vertex]] and [[Edge]] records via
 * `InteractionGraphAddressBookUtil.process`, and writes both collections as daily
 * Parquet snapshots through DAL.
 */
object InteractionGraphAddressBookJob extends ScioBeamJob[InteractionGraphAddressBookOption] {

  /**
   * Wires up the pipeline: source read -> vertex/edge derivation -> two DAL snapshot writes.
   *
   * @param scioContext     context the pipeline is built on; exposed implicitly to the helpers below
   * @param pipelineOptions job options supplying the date interval, output path, shard count and
   *                        the optional DAL write-environment override
   */
  override protected def configurePipeline(
    scioContext: ScioContext,
    pipelineOptions: InteractionGraphAddressBookOption
  ): Unit = {
    @transient
    implicit lazy val sc: ScioContext = scioContext
    implicit lazy val dateInterval: Interval = pipelineOptions.interval
    implicit lazy val addressBookCounters: InteractionGraphAddressBookCountersTrait =
      InteractionGraphAddressBookCounters

    val interactionGraphAddressBookSource = InteractionGraphAddressBookSource(pipelineOptions)

    // The read window starts 3 days before the job interval's start and keeps the same end.
    // NOTE(review): presumably a look-back to tolerate late-arriving upstream data - confirm.
    val readInterval: Interval = dateInterval.withStart(dateInterval.getStart.minusDays(3))
    val addressBook: SCollection[UserMatchesRecord] =
      interactionGraphAddressBookSource.readSimpleUserMatches(readInterval)

    val (vertex, edges) = InteractionGraphAddressBookUtil.process(addressBook)

    // Environment the job itself runs under, taken from the service-identifier options.
    val dalEnvironment: String = pipelineOptions
      .as(classOf[ServiceIdentifierOptions])
      .getEnvironment()

    // An explicitly configured DAL write environment wins; an unset (null) option falls back
    // to the job's own environment. Option(...) maps null to None.
    val dalWriteEnvironment: String =
      Option(pipelineOptions.getDALWriteEnvironment).getOrElse(dalEnvironment)

    // Both snapshots are stamped with the end of the job interval.
    val snapshotTime: Instant = Instant.ofEpochMilli(dateInterval.getEndMillis)
    val outputRoot = pipelineOptions.getOutputPath

    // Vertex output uses 1/16th of the configured shard count, rounded up.
    // NOTE(review): the factor 16 is not explained in this file - confirm the rationale.
    val edgeShards = pipelineOptions.getNumberOfShards
    val vertexShards = (edgeShards / 16.0).ceil.toInt

    vertex.saveAsCustomOutput(
      "Write Vertex Records",
      DAL.writeSnapshot[Vertex](
        InteractionGraphAggAddressBookVertexSnapshotScalaDataset,
        PathLayout.DailyPath(outputRoot + "/address_book_vertex_daily"),
        snapshotTime,
        DiskFormat.Parquet,
        Environment.valueOf(dalWriteEnvironment),
        writeOption = WriteOptions(numOfShards = Some(vertexShards))
      )
    )

    edges.saveAsCustomOutput(
      "Write Edge Records",
      DAL.writeSnapshot[Edge](
        InteractionGraphAggAddressBookEdgeSnapshotScalaDataset,
        PathLayout.DailyPath(outputRoot + "/address_book_edge_daily"),
        snapshotTime,
        DiskFormat.Parquet,
        Environment.valueOf(dalWriteEnvironment),
        writeOption = WriteOptions(numOfShards = Some(edgeShards))
      )
    )
  }
}
|