// the-algorithm/navi/navi/proto/tensorflow/core/protobuf/coordination_service.proto
syntax = "proto3";
package tensorflow;
import "tensorflow/compiler/xla/pjrt/distributed/protocol.proto";
import "tensorflow/core/framework/device_attributes.proto";
option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf/for_core_protos_go_proto";

// Represents a remote worker task, specified by job name and task id.
message CoordinatedTask {
  string job_name = 1;
  int32 task_id = 2;
}
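
// For illustration only, the first task of a hypothetical "worker" job would
// be addressed as (textproto):
//   job_name: "worker"
//   task_id: 0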

// Status payload for all coordination service errors.
// Note: an empty proto may be set if the error is triggered by the task's own
// agent calls (i.e. not propagated by the service from another remote task).
message CoordinationServiceError {
  // Removed fields which used to specify the error origin.
  reserved 1, 2;
  // If true, the error was reported by the user via the agent API (and is not
  // an internal service error).
  bool is_reported_error = 3;
  // Denotes which task hit the error. If unset, the error originated from the
  // same task that is processing this error.
  CoordinatedTask source_task = 4;
}
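
// For illustration only, an error reported via the agent API on a
// hypothetical /job:worker/task:1 could carry the payload (textproto):
//   is_reported_error: true
//   source_task { job_name: "worker" task_id: 1 }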

// Represents device information from different runtimes.
message TfDeviceList {
  repeated DeviceAttributes devices = 1;
}

message XlaDeviceList {
  xla.GlobalTopologyProto devices = 1;
}

message CoordinationServiceDeviceInfo {
  oneof type {
    TfDeviceList tf = 1;
    XlaDeviceList xla = 2;
  }
}
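
// For illustration only, a TF-runtime device report for a single GPU might
// look like the following (textproto; `name` is a field of DeviceAttributes
// in device_attributes.proto, and the device shown is hypothetical):
//   tf {
//     devices { name: "/job:worker/replica:0/task:0/device:GPU:0" }
//   }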

// Request and response messages for registering a worker to the cluster
// leader. The worker's role is identified by `source_task`, and `incarnation`
// uniquely identifies a worker process. The leader responds with its own
// `incarnation` to identify the leader process.
message RegisterWorkerRequest {
  // Removed fields which used to specify the task.
  reserved 1, 2;
  fixed64 incarnation = 3;
  // Moved the field `local_device_attributes` from this request message to
  // WaitForAllTasksRequest defined below.
  reserved 4;
  CoordinatedTask source_task = 5;
}

message RegisterWorkerResponse {
  fixed64 leader_incarnation = 1;
}
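
// For illustration only, a hypothetical registration exchange (textproto;
// the incarnation values are assumed to be random per-process-start numbers):
//   RegisterWorkerRequest:
//     incarnation: 6977913289919973377
//     source_task { job_name: "worker" task_id: 0 }
//   RegisterWorkerResponse:
//     leader_incarnation: 1261738984783939587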

// Request and response messages for sending heartbeats.
message HeartbeatRequest {
  // Removed fields which used to specify the remote task.
  reserved 1, 2;
  fixed64 incarnation = 3;
  CoordinatedTask source_task = 4;
}

message HeartbeatResponse {
  fixed64 leader_incarnation = 1;
  // If there are failures in the cluster, use additional metadata in the
  // response to broadcast the error code and message to other workers.
}
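
// For illustration only, a heartbeat from the hypothetical worker registered
// above would repeat the same incarnation (textproto):
//   incarnation: 6977913289919973377
//   source_task { job_name: "worker" task_id: 0 }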

// Request and response messages for waiting for all tasks.
message WaitForAllTasksRequest {
  // Removed fields which used to specify the remote task.
  reserved 1, 2;
  // Removed field that specifically used TF device info.
  reserved 3;
  // All local device attributes on the request sender.
  CoordinationServiceDeviceInfo local_device_info = 4;
  CoordinatedTask source_task = 5;
}

message WaitForAllTasksResponse {
  fixed64 leader_incarnation = 1;
  // Removed field that specifically used TF device info.
  reserved 2;
  // All devices in the cluster.
  CoordinationServiceDeviceInfo cluster_device_info = 3;
}
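
// For illustration only, a hypothetical request announcing one local GPU
// (textproto):
//   local_device_info {
//     tf { devices { name: "/job:worker/replica:0/task:0/device:GPU:0" } }
//   }
//   source_task { job_name: "worker" task_id: 0 }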

// Request and response messages for reporting errors to the task.
message ReportErrorToAgentRequest {
  int32 error_code = 1;
  string error_message = 2;
  // Removed fields that are now embedded in the payload.
  reserved 3, 4;
  CoordinationServiceError error_payload = 5;
}

message ReportErrorToAgentResponse {}
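
// For illustration only, the service propagating an INTERNAL error (canonical
// gRPC code 13) from a hypothetical failed peer might send (textproto):
//   error_code: 13
//   error_message: "/job:worker/task:1 heartbeat timeout"
//   error_payload { source_task { job_name: "worker" task_id: 1 } }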

// Request and response messages for reporting errors to the service instance.
message ReportErrorToServiceRequest {
  int32 error_code = 1;
  string error_message = 2;
  // Removed fields which used to specify the error origin.
  reserved 3, 4;
  CoordinatedTask error_origin = 5;
}

message ReportErrorToServiceResponse {}
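
// For illustration only, a task reporting its own UNAVAILABLE error (canonical
// gRPC code 14) might send (textproto; the message text is hypothetical):
//   error_code: 14
//   error_message: "lost connection to parameter server"
//   error_origin { job_name: "worker" task_id: 0 }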

// Message for a configuration key-value pair.
// Keys are structured like a Unix file system, with multiple levels of
// directory names separated by slash ('/') characters.
message KeyValueEntry {
  string key = 1;
  bytes value = 2;
}
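
// For illustration only, a hypothetical entry using the Unix-path key
// structure (textproto):
//   key: "/job:worker/task:0/rpc_address"
//   value: "10.0.0.1:2222"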

// Request and response messages for inserting configuration key-value data.
message InsertKeyValueRequest {
  KeyValueEntry kv = 1;
}

message InsertKeyValueResponse {}

// Request and response messages for getting configuration key-value data.
message GetKeyValueRequest {
  string key = 1;
}

message GetKeyValueResponse {
  KeyValueEntry kv = 1;
}

// Request and response messages for deleting configuration key-value data.
// When `is_directory` is true, recursively delete all key-values under `key`.
message DeleteKeyValueRequest {
  string key = 1;
  bool is_directory = 2;
}

message DeleteKeyValueResponse {}
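
// For illustration only, this hypothetical request would recursively remove
// every key-value stored under the "/job:worker" prefix, such as the
// "/job:worker/task:0/rpc_address" entry above (textproto):
//   key: "/job:worker"
//   is_directory: true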

// Request and response messages for generic sync barriers.
message BarrierRequest {
  string barrier_id = 1;
  int64 barrier_timeout_in_ms = 2;
  // Denotes the list of tasks that will wait at the barrier. If unspecified,
  // it implies that the entire cluster is participating in the barrier.
  repeated CoordinatedTask tasks = 3;
  // Task that is making the request.
  CoordinatedTask source_task = 4;
}

message BarrierResponse {}
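
// For illustration only, a hypothetical barrier over two worker tasks with a
// 30-second timeout (textproto):
//   barrier_id: "init_barrier"
//   barrier_timeout_in_ms: 30000
//   tasks { job_name: "worker" task_id: 0 }
//   tasks { job_name: "worker" task_id: 1 }
//   source_task { job_name: "worker" task_id: 0 }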

// Request and response messages for cancelling generic sync barriers.
message CancelBarrierRequest {
  string barrier_id = 1;
  // Task that is making the request.
  CoordinatedTask source_task = 2;
}

message CancelBarrierResponse {}
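
// For illustration only, cancelling the hypothetical barrier above
// (textproto):
//   barrier_id: "init_barrier"
//   source_task { job_name: "worker" task_id: 1 }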

// Coordination Service defines a TensorFlow service that controls and
// coordinates distributed execution in a cluster of multiple workers.
//
// The service keeps track of the cluster configuration and the state of
// cluster members or the leader, depending on the role of the current worker.
// The distributed runtime leverages this service to coordinate and perform
// cluster initialization, check the health of workers, and propagate error
// messages to the cluster.
service CoordinationService {
  // Register a task to the coordination service so that the service starts to
  // track the liveness of the task. The RPC blocks and returns only when the
  // task registers with the service successfully, or an error happens during
  // registration.
  rpc RegisterWorker(RegisterWorkerRequest) returns (RegisterWorkerResponse);

  // Heartbeat message from a task to the coordination service. A heartbeat is
  // sent by a task to refresh its timestamp on the leader so that the
  // timestamp does not become stale. The RPC responds immediately after
  // refreshing the timestamp on the leader.
  rpc Heartbeat(HeartbeatRequest) returns (HeartbeatResponse);

  // Wait for all tasks in the cluster to be up and running. The RPC is
  // answered only once all workers are registered, or when an error occurs.
  rpc WaitForAllTasks(WaitForAllTasksRequest) returns (WaitForAllTasksResponse);

  // Report an error to the task. The RPC permanently sets the receiving
  // instance of the coordination service agent to an error state.
  // TODO(b/195990880): Consider splitting this into a different RPC service.
  rpc ReportErrorToAgent(ReportErrorToAgentRequest)
      returns (ReportErrorToAgentResponse);

  // Report a task error to the coordination service. The RPC sets the
  // service-side task state to error and propagates the error to the other
  // tasks in the cluster.
  rpc ReportErrorToService(ReportErrorToServiceRequest)
      returns (ReportErrorToServiceResponse);

  // Insert a configuration key-value that will be accessible to all cluster
  // workers. The key can be formatted as a Unix file path with hierarchy. The
  // coordination service key-value store should only be used for cluster
  // configuration data.
  rpc InsertKeyValue(InsertKeyValueRequest) returns (InsertKeyValueResponse);

  // Get a configuration key-value. The request blocks until the key-value
  // data becomes available (i.e., is set by a worker in the cluster).
  rpc GetKeyValue(GetKeyValueRequest) returns (GetKeyValueResponse);

  // Delete a configuration key-value. If `is_directory` is set in the
  // request, recursively clean up all key-values under the path specified by
  // `key`.
  rpc DeleteKeyValue(DeleteKeyValueRequest) returns (DeleteKeyValueResponse);

  // Blocks until all (or a subset of) tasks are at the barrier, or until the
  // barrier fails.
  //
  // `barrier_id` should be unique across barriers. Once the barrier has
  // passed or failed, subsequent calls will not block, and immediately
  // respond with the previous response.
  //
  // The first Barrier() call received by the service for a particular barrier
  // id is special in that it determines the barrier deadline based on the
  // timeout duration. For example, if the first call for "init_barrier"
  // arrives at server time T with barrier_timeout_in_ms set to 30000, the
  // barrier fails for every participant at T + 30s unless all participating
  // tasks arrive before then.
  // However, if subsequent calls by different agents specify a different set
  // of `tasks` for the same `barrier_id`, the barrier fails instantly.
  //
  // If no tasks are specified (the default), the barrier blocks for all
  // connected tasks.
  //
  // Possible service errors:
  // - DeadlineExceeded: Timed out waiting for the specified tasks at the
  //     barrier. The deadline is the server timestamp at which the first
  //     Barrier() call was received, plus the timeout duration.
  // - Cancelled: One of the tasks called CancelBarrier().
  // - Aborted: Service is shutting down.
  // - Internal: Any participating task is in an ERROR state.
  // - InvalidArgument: (1) Conflicting sets of tasks were specified by
  //     different agents for the same barrier, (2) one of the participating
  //     tasks is not in the cluster, or (3) the task making the request is
  //     not included in the list of participating tasks.
  rpc Barrier(BarrierRequest) returns (BarrierResponse);

  // Aborts the barrier if it is ongoing.
  // Current and future Barrier() calls with the same id will return a
  // CANCELLED error status.
  // Possible service errors:
  // - FailedPrecondition: The barrier has already been passed.
  // - NotFound: No barrier with the specified id was found.
  rpc CancelBarrier(CancelBarrierRequest) returns (CancelBarrierResponse);
}