// the-algorithm/navi/navi/proto/tensorflow/core/protobuf/coordination_service.proto
syntax = "proto3";
package tensorflow;
import "tensorflow/compiler/xla/pjrt/distributed/protocol.proto";
import "tensorflow/core/framework/device_attributes.proto";
option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf/for_core_protos_go_proto";

// Represents a remote worker task, specified by job name and task id.
message CoordinatedTask {
  string job_name = 1;
  int32 task_id = 2;
}
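
// For illustration only, the first task of a hypothetical "worker" job would
// be addressed as (textproto):
//   job_name: "worker"
//   task_id: 0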

// Status payload for all coordination service errors.
// Note: an empty proto may be set if the error is triggered by the task's own
// agent calls (i.e. not propagated by the service from another remote task).
message CoordinationServiceError {
  // Removed fields which used to specify the error origin.
  reserved 1, 2;
  // If true, the error was reported by the user via the agent API (and is not
  // an internal service error).
  bool is_reported_error = 3;
  // Denotes which task hit the error. If unset, the error originated from the
  // same task that is processing this error.
  CoordinatedTask source_task = 4;
}
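
// For illustration only, an error reported via the agent API on a
// hypothetical /job:worker/task:1 could carry the payload (textproto):
//   is_reported_error: true
//   source_task { job_name: "worker" task_id: 1 }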

// Represents device information from different runtimes.
message TfDeviceList {
  repeated DeviceAttributes devices = 1;
}

message XlaDeviceList {
  xla.GlobalTopologyProto devices = 1;
}

message CoordinationServiceDeviceInfo {
  oneof type {
    TfDeviceList tf = 1;
    XlaDeviceList xla = 2;
  }
}
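
// For illustration only, a TF-runtime device report for a single GPU might
// look like the following (textproto; `name` is a field of DeviceAttributes
// in device_attributes.proto, and the device shown is hypothetical):
//   tf {
//     devices { name: "/job:worker/replica:0/task:0/device:GPU:0" }
//   }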

// Request and response messages for registering a worker to the cluster
// leader. The worker's role is identified by `source_task`, and `incarnation`
// uniquely identifies a worker process. The leader responds with its own
// `incarnation` to identify the leader process.
message RegisterWorkerRequest {
  // Removed fields which used to specify the task.
  reserved 1, 2;
  fixed64 incarnation = 3;
  // Moved the field `local_device_attributes` from this request message to
  // WaitForAllTasksRequest defined below.
  reserved 4;
  CoordinatedTask source_task = 5;
}

message RegisterWorkerResponse {
  fixed64 leader_incarnation = 1;
}
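
// For illustration only, a hypothetical registration exchange (textproto;
// the incarnation values are assumed to be random per-process-start numbers):
//   RegisterWorkerRequest:
//     incarnation: 6977913289919973377
//     source_task { job_name: "worker" task_id: 0 }
//   RegisterWorkerResponse:
//     leader_incarnation: 1261738984783939587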

// Request and response messages for sending heartbeats.
message HeartbeatRequest {
  // Removed fields which used to specify the remote task.
  reserved 1, 2;
  fixed64 incarnation = 3;
  CoordinatedTask source_task = 4;
}

message HeartbeatResponse {
  fixed64 leader_incarnation = 1;
  // If there are failures in the cluster, use additional metadata in the
  // response to broadcast the error code and message to other workers.
}
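
// For illustration only, a heartbeat from the hypothetical worker registered
// above would repeat the same incarnation (textproto):
//   incarnation: 6977913289919973377
//   source_task { job_name: "worker" task_id: 0 }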

// Request and response messages for waiting for all tasks.
message WaitForAllTasksRequest {
  // Removed fields which used to specify the remote task.
  reserved 1, 2;
  // Removed field that specifically used TF device info.
  reserved 3;
  // All local device attributes on the request sender.
  CoordinationServiceDeviceInfo local_device_info = 4;
  CoordinatedTask source_task = 5;
}

message WaitForAllTasksResponse {
  fixed64 leader_incarnation = 1;
  // Removed field that specifically used TF device info.
  reserved 2;
  // All devices in the cluster.
  CoordinationServiceDeviceInfo cluster_device_info = 3;
}
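
// For illustration only, a hypothetical request announcing one local GPU
// (textproto):
//   local_device_info {
//     tf { devices { name: "/job:worker/replica:0/task:0/device:GPU:0" } }
//   }
//   source_task { job_name: "worker" task_id: 0 }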

// Request and response messages for reporting errors to the task.
message ReportErrorToAgentRequest {
  int32 error_code = 1;
  string error_message = 2;
  // Removed fields that are now embedded in the payload.
  reserved 3, 4;
  CoordinationServiceError error_payload = 5;
}

message ReportErrorToAgentResponse {}
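
// For illustration only, the service propagating an INTERNAL error (canonical
// gRPC code 13) from a hypothetical failed peer might send (textproto):
//   error_code: 13
//   error_message: "/job:worker/task:1 heartbeat timeout"
//   error_payload { source_task { job_name: "worker" task_id: 1 } }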

// Request and response messages for reporting errors to the service instance.
message ReportErrorToServiceRequest {
  int32 error_code = 1;
  string error_message = 2;
  // Removed fields which used to specify the error origin.
  reserved 3, 4;
  CoordinatedTask error_origin = 5;
}

message ReportErrorToServiceResponse {}
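
// For illustration only, a task reporting its own UNAVAILABLE error (canonical
// gRPC code 14) might send (textproto; the message text is hypothetical):
//   error_code: 14
//   error_message: "lost connection to parameter server"
//   error_origin { job_name: "worker" task_id: 0 }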

// Message for a configuration key-value pair.
// Keys are structured like a Unix file system, with multiple levels of
// directory names separated by slash ('/') characters.
message KeyValueEntry {
  string key = 1;
  bytes value = 2;
}
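
// For illustration only, a hypothetical entry using the Unix-path key
// structure (textproto):
//   key: "/job:worker/task:0/rpc_address"
//   value: "10.0.0.1:2222"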

// Request and response messages for inserting configuration key-value data.
message InsertKeyValueRequest {
  KeyValueEntry kv = 1;
}

message InsertKeyValueResponse {}

// Request and response messages for getting configuration key-value data.
message GetKeyValueRequest {
  string key = 1;
}

message GetKeyValueResponse {
  KeyValueEntry kv = 1;
}

// Request and response messages for deleting configuration key-value data.
// When `is_directory` is true, recursively delete all key-values under `key`.
message DeleteKeyValueRequest {
  string key = 1;
  bool is_directory = 2;
}

message DeleteKeyValueResponse {}
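
// For illustration only, this hypothetical request would recursively remove
// every key-value stored under the "/job:worker" prefix, such as the
// "/job:worker/task:0/rpc_address" entry above (textproto):
//   key: "/job:worker"
//   is_directory: true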

// Request and response messages for generic sync barriers.
message BarrierRequest {
  string barrier_id = 1;
  int64 barrier_timeout_in_ms = 2;
  // Denotes the list of tasks that will wait at the barrier. If unspecified,
  // it implies that the entire cluster is participating in the barrier.
  repeated CoordinatedTask tasks = 3;
  // Task that is making the request.
  CoordinatedTask source_task = 4;
}

message BarrierResponse {}
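
// For illustration only, a hypothetical barrier over two worker tasks with a
// 30-second timeout (textproto):
//   barrier_id: "init_barrier"
//   barrier_timeout_in_ms: 30000
//   tasks { job_name: "worker" task_id: 0 }
//   tasks { job_name: "worker" task_id: 1 }
//   source_task { job_name: "worker" task_id: 0 }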

// Request and response messages for cancelling generic sync barriers.
message CancelBarrierRequest {
  string barrier_id = 1;
  // Task that is making the request.
  CoordinatedTask source_task = 2;
}

message CancelBarrierResponse {}
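
// For illustration only, cancelling the hypothetical barrier above
// (textproto):
//   barrier_id: "init_barrier"
//   source_task { job_name: "worker" task_id: 1 }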

// Coordination Service defines a TensorFlow service that controls and
// coordinates distributed execution in a cluster of multiple workers.
//
// The service keeps track of the cluster configuration and the state of
// cluster members or the leader, depending on the role of the current worker.
// The distributed runtime leverages this service to coordinate and perform
// cluster initialization, check the health of workers, and propagate error
// messages to the cluster.
service CoordinationService {
  // Register a task to the coordination service so that the service starts to
  // track the liveness of the task. The RPC blocks and returns only when the
  // task registers with the service successfully, or an error happens during
  // registration.
  rpc RegisterWorker(RegisterWorkerRequest) returns (RegisterWorkerResponse);

  // Heartbeat message from a task to the coordination service. A heartbeat is
  // sent by a task to refresh its timestamp on the leader so that the
  // timestamp does not become stale. The RPC responds immediately after
  // refreshing the timestamp on the leader.
  rpc Heartbeat(HeartbeatRequest) returns (HeartbeatResponse);

  // Wait for all tasks in the cluster to be up and running. The RPC is
  // answered only once all workers are registered, or when an error occurs.
  rpc WaitForAllTasks(WaitForAllTasksRequest) returns (WaitForAllTasksResponse);

  // Report an error to the task. The RPC permanently sets the receiving
  // instance of the coordination service agent to an error state.
  // TODO(b/195990880): Consider splitting this into a different RPC service.
  rpc ReportErrorToAgent(ReportErrorToAgentRequest)
      returns (ReportErrorToAgentResponse);

  // Report a task error to the coordination service. The RPC sets the
  // service-side task state to error and propagates the error to the other
  // tasks in the cluster.
  rpc ReportErrorToService(ReportErrorToServiceRequest)
      returns (ReportErrorToServiceResponse);

  // Insert a configuration key-value that will be accessible to all cluster
  // workers. The key can be formatted as a Unix file path with hierarchy. The
  // coordination service key-value store should only be used for cluster
  // configuration data.
  rpc InsertKeyValue(InsertKeyValueRequest) returns (InsertKeyValueResponse);

  // Get a configuration key-value. The request blocks until the key-value
  // data becomes available (i.e., is set by a worker in the cluster).
  rpc GetKeyValue(GetKeyValueRequest) returns (GetKeyValueResponse);

  // Delete a configuration key-value. If `is_directory` is set in the
  // request, recursively clean up all key-values under the path specified by
  // `key`.
  rpc DeleteKeyValue(DeleteKeyValueRequest) returns (DeleteKeyValueResponse);

  // Blocks until all (or a subset of) tasks are at the barrier, or until the
  // barrier fails.
  //
  // `barrier_id` should be unique across barriers. Once the barrier has
  // passed or failed, subsequent calls will not block, and immediately
  // respond with the previous response.
  //
  // The first Barrier() call received by the service for a particular barrier
  // id is special in that it determines the barrier deadline based on the
  // timeout duration. For example, if the first call for "init_barrier"
  // arrives at server time T with barrier_timeout_in_ms set to 30000, the
  // barrier fails for every participant at T + 30s unless all participating
  // tasks arrive before then.
  // However, if subsequent calls by different agents specify a different set
  // of `tasks` for the same `barrier_id`, the barrier fails instantly.
  //
  // If no tasks are specified (the default), the barrier blocks for all
  // connected tasks.
  //
  // Possible service errors:
  // - DeadlineExceeded: Timed out waiting for the specified tasks at the
  //     barrier. The deadline is the server timestamp at which the first
  //     Barrier() call was received, plus the timeout duration.
  // - Cancelled: One of the tasks called CancelBarrier().
  // - Aborted: Service is shutting down.
  // - Internal: Any participating task is in an ERROR state.
  // - InvalidArgument: (1) Conflicting sets of tasks were specified by
  //     different agents for the same barrier, (2) one of the participating
  //     tasks is not in the cluster, or (3) the task making the request is
  //     not included in the list of participating tasks.
  rpc Barrier(BarrierRequest) returns (BarrierResponse);

  // Aborts the barrier if it is ongoing.
  // Current and future Barrier() calls with the same id will return a
  // CANCELLED error status.
  // Possible service errors:
  // - FailedPrecondition: The barrier has already been passed.
  // - NotFound: No barrier with the specified id was found.
  rpc CancelBarrier(CancelBarrierRequest) returns (CancelBarrierResponse);
}