the-algorithm/navi/navi/proto/tensorflow/core/protobuf/eager_service.proto

syntax = "proto3";
package tensorflow.eager;
import "tensorflow/core/framework/attr_value.proto";
import "tensorflow/core/framework/device_attributes.proto";
import "tensorflow/core/framework/function.proto";
import "tensorflow/core/framework/tensor.proto";
import "tensorflow/core/framework/tensor_shape.proto";
import "tensorflow/core/framework/versions.proto";
import "tensorflow/core/protobuf/remote_tensor_handle.proto";
import "tensorflow/core/protobuf/tensorflow_server.proto";
option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf/for_core_protos_go_proto";
// A proto representation of an eager operation.
message Operation {
// A unique identifier for the operation. Set by the client so that the client
// can uniquely identify the outputs of the scheduled operation.
//
// In the initial implementation, sending duplicate IDs has undefined
// behaviour, but additional constraints may be placed upon this in the
// future.
int64 id = 1;
string name = 2;
message Input {
oneof item {
RemoteTensorHandle remote_handle = 1;
TensorProto tensor = 2;
}
}
repeated Input op_inputs = 10;
// Control Operation IDs that will be respected when ops are re-ordered by
// async execution. If async execution (+ op re-ordering) is not enabled, this
// should have no effect.
repeated int64 control_op_ids = 4;
map<string, AttrValue> attrs = 5;
string device = 6;
// Indicates whether the op is a component of a multi-device function.
bool is_component_function = 7;
// Set when is_component_function is true. It is initially generated when we
// create a FunctionLibraryRuntime::Options (negative value) and is used to
// create a Rendezvous for function execution. All components of a
// multi-device function should use the same step id so that they can
// communicate through Send/Recv ops.
int64 func_step_id = 8;
// Indicates whether the op is a function.
bool is_function = 9;
reserved 3;
}
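// Illustrative sketch (not part of the schema): a textproto form of the
// Operation message above, running a MatMul on one remote input and one
// inlined tensor. The ids, device name, and attr values are hypothetical, and
// the remote_handle fields assume RemoteTensorHandle identifies a tensor by an
// <op id, output num> pair (see remote_tensor_handle.proto and SendTensorOp
// below).
//
//   id: 42
//   name: "MatMul"
//   op_inputs { remote_handle { op_id: 7 output_num: 0 } }
//   op_inputs {
//     tensor {
//       dtype: DT_FLOAT
//       tensor_shape { dim { size: 2 } dim { size: 2 } }
//       float_val: [1, 0, 0, 1]
//     }
//   }
//   attrs { key: "T" value { type: DT_FLOAT } }
//   device: "/job:worker/replica:0/task:1/device:CPU:0"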
message QueueItem {
// The remote executor should be able to handle either executing ops directly,
// or releasing any unused tensor handles, since the tensor lifetime is
// maintained by the client.
oneof item {
RemoteTensorHandle handle_to_decref = 1;
Operation operation = 2;
SendTensorOp send_tensor = 3;
// Takes a FunctionDef and makes it enqueueable on the remote worker.
RegisterFunctionOp register_function = 4;
CleanupFunctionOp cleanup_function = 5;
// A remote executor is created to execute ops/functions enqueued
// asynchronously in a streaming call. A request with this item type waits for
// pending nodes to finish on the remote executor and reports their status.
SyncRemoteExecutorForStream sync_remote_executor_for_stream = 6;
SendPackedHandleOp send_packed_handle = 7;
}
}
message QueueResponse {
// `shape` and `tensor` cannot be set in the same response.
// Shapes of output tensors for creating remote TensorHandles.
repeated TensorShapeProto shape = 1;
// Optional. If set, represents the output devices of a function.
repeated string device = 3;
// Output tensors of a remote function. Set when Operation.id is invalid.
repeated TensorProto tensor = 2;
}
message CreateContextRequest {
// Identifies the full cluster, and this particular worker's position within.
ServerDef server_def = 1;
// Whether the ops on the worker should be executed synchronously or
// asynchronously. By default, ops are executed synchronously.
bool async = 2;
// Number of seconds to keep the context alive. If more than keep_alive_secs
// seconds have passed since the context was last communicated with, it will
// be garbage collected.
int64 keep_alive_secs = 3;
// This is the version for all the ops that will be enqueued by the client.
VersionDef version_def = 4;
// Device attributes in the cluster
repeated DeviceAttributes cluster_device_attributes = 6;
// The ID of the created context. This is usually a randomly generated number
// that will be used to identify the context in future requests to the
// service. Contexts are not persisted through server restarts.
// This ID will be used for all future communications as well. It is essential
// that both ends use this ID for selecting a rendezvous to get everything to
// match.
fixed64 context_id = 7;
// The view ID of the context.
fixed64 context_view_id = 8;
// For a multi device function, if false, eagerly copy all remote inputs to
// the default function device; if true, lazily copy remote inputs to their
// target devices after function instantiation to avoid redundant copies.
bool lazy_copy_remote_function_inputs = 9;
reserved 5;
}
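// Illustrative sketch (not part of the schema): a minimal textproto form of
// the CreateContextRequest message above. The context_id is a hypothetical
// client-chosen value; the server_def and version_def fields assume the
// layouts in tensorflow_server.proto and versions.proto.
//
//   server_def { job_name: "worker" task_index: 0 }
//   async: true
//   keep_alive_secs: 600
//   version_def { producer: 27 }
//   context_id: 11111111
//   context_view_id: 0
//   lazy_copy_remote_function_inputs: true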
message CreateContextResponse {
// List of devices that are locally accessible to the worker.
repeated DeviceAttributes device_attributes = 2;
reserved 1;
}
message UpdateContextRequest {
// Identifies the full cluster, and this particular worker's position within.
ServerDef server_def = 1;
// Device attributes in the cluster.
// If this field is empty, it indicates that this is a simple update request
// that only increments the cluster view ID and does not require changes to
// the workers it connects to.
repeated DeviceAttributes cluster_device_attributes = 2;
// The ID of the context to be updated. A context with the specified ID must
// already exist on the recipient server of this request.
fixed64 context_id = 3;
// The view ID of the context, which should be contiguously incremented when
// updating the same context.
fixed64 context_view_id = 4;
}
message UpdateContextResponse {
// List of devices that are locally accessible to the worker.
repeated DeviceAttributes device_attributes = 1;
}
message EnqueueRequest {
fixed64 context_id = 1;
repeated QueueItem queue = 3;
}
message EnqueueResponse {
// A single operation response for every item in the request.
repeated QueueResponse queue_response = 1;
}
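// Illustrative sketch (not part of the schema): a hypothetical EnqueueRequest
// that schedules one operation and releases one client-owned handle, together
// with the kind of EnqueueResponse a worker might return (one QueueResponse
// per QueueItem; here the first carries only output shapes and the second is
// empty).
//
//   request:
//     context_id: 11111111
//     queue { operation { id: 42 name: "MatMul" } }
//     queue { handle_to_decref { op_id: 7 output_num: 0 } }
//
//   response:
//     queue_response { shape { dim { size: 2 } dim { size: 2 } } }
//     queue_response { }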
message WaitQueueDoneRequest {
fixed64 context_id = 1;
// Ids to wait on. If empty, wait on everything currently pending.
repeated int64 op_id = 2;
}
message WaitQueueDoneResponse {
// TODO(nareshmodi): Consider adding NodeExecStats here to be able to
// propagate some stats.
}
message RunComponentFunctionRequest {
fixed64 context_id = 1;
Operation operation = 2;
// The output indices of its parent function.
repeated int32 output_num = 3;
}
message RunComponentFunctionResponse {
repeated TensorShapeProto shape = 1;
repeated TensorProto tensor = 2;
}
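// Illustrative sketch (not part of the schema): a hypothetical
// RunComponentFunctionRequest. The operation is marked as a component of a
// multi-device function, shares a negative func_step_id with its sibling
// components, and output_num lists the indices of the parent function's
// outputs that this component produces.
//
//   context_id: 11111111
//   operation {
//     id: 43
//     name: "my_fn_partition_1"
//     is_component_function: true
//     is_function: true
//     func_step_id: -97
//   }
//   output_num: 0
//   output_num: 1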
message KeepAliveRequest {
fixed64 context_id = 1;
}
message KeepAliveResponse {
// If the requested context_id is on the remote host, set the context view ID.
fixed64 context_view_id = 1;
}
message CloseContextRequest {
fixed64 context_id = 1;
fixed64 context_view_id = 2;
}
message CloseContextResponse {}
message RegisterFunctionOp {
FunctionDef function_def = 1;
// If true, it means that function_def is produced by graph partitioning during
// multi-device function instantiation.
bool is_component_function = 2;
// All necessary FunctionDefs and GradientDefs to expand `function_def`.
// When is_component_function is true, `function_def` could be a nested
// function, since some nodes in its parent's function body could be
// replaced with a new function by the graph optimization passes. No need to
// add FunctionDefs here to the function cache in EagerContext since they
// won't be executed as KernelAndDevices.
FunctionDefLibrary library = 3;
}
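// Illustrative sketch (not part of the schema): a minimal textproto form of
// the RegisterFunctionOp message above. The function name is hypothetical and
// the nested fields assume the FunctionDef/OpDef layout in function.proto.
//
//   function_def { signature { name: "my_fn" } }
//   is_component_function: false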
// Cleans up the step state of a multi-device function (e.g. tensors buffered by
// a `Send` op but not picked up by its corresponding `Recv` op).
message CleanupFunctionOp {
int64 step_id = 1;
}
message SyncRemoteExecutorForStream {}
message SendTensorOp {
// All remote tensors are identified by <Op ID, Output num>. To mimic this
// situation when directly sending tensors, we include an "artificial" op ID
// (which would have corresponded to the _Recv op when not using SendTensor).
int64 op_id = 1;
// The index within the repeated field is the output number that will help
// uniquely identify (along with the above op_id) the particular tensor.
repeated TensorProto tensors = 2;
// The device on which the tensors should be resident.
string device_name = 3;
}
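// Illustrative sketch (not part of the schema): a hypothetical SendTensorOp
// carrying two tensors under one artificial op id. Their indices in `tensors`
// become the output numbers, so the first tensor is later addressable as
// <op_id: 100, output_num: 0> and the second as <op_id: 100, output_num: 1>.
//
//   op_id: 100
//   tensors { dtype: DT_INT32 int_val: 1 }
//   tensors { dtype: DT_INT32 int_val: 2 }
//   device_name: "/job:worker/replica:0/task:0/device:CPU:0"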
// Send a packed TensorHandle to a remote worker.
message SendPackedHandleOp {
// Op id of the remote packed TensorHandle.
int64 op_id = 1;
message LocalTensorHandle {
TensorProto tensor = 1;
// Device where the tensor is produced.
string device = 2;
}
message Handle {
oneof item {
LocalTensorHandle local_handle = 1;
RemoteTensorHandle remote_handle = 2;
}
}
repeated Handle handles = 2;
string device_name = 3;
}
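// Illustrative sketch (not part of the schema): a hypothetical
// SendPackedHandleOp that packs one locally produced tensor with one handle
// that already lives on another worker.
//
//   op_id: 101
//   handles {
//     local_handle {
//       tensor { dtype: DT_FLOAT float_val: 3.5 }
//       device: "/job:worker/replica:0/task:0/device:CPU:0"
//     }
//   }
//   handles { remote_handle { op_id: 7 output_num: 0 } }
//   device_name: "/job:worker/replica:0/task:0/device:CPU:0"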
////////////////////////////////////////////////////////////////////////////////
//
// Eager Service defines a TensorFlow service that executes operations eagerly
// on a set of local devices, on behalf of a remote Eager executor.
//
// The service implementation keeps track of the various clients and devices it
// has access to, and allows the client to enqueue ops on any devices that it
// is able to access and to schedule data transfers from/to any of the peers.
//
// A client can create multiple contexts in order to execute operations
// independently, but cannot share data between those contexts.
//
// NOTE: Even though contexts generated by clients should be independent, the
// lower level tensorflow execution engine is not, so they might share some data
// (e.g. a Device's ResourceMgr).
//
////////////////////////////////////////////////////////////////////////////////
service EagerService {
// This initializes the worker, informing it about the other workers in the
// cluster and exchanging authentication tokens which will be used in all
// other RPCs to detect whether the worker has restarted.
rpc CreateContext(CreateContextRequest) returns (CreateContextResponse);
// This updates the eager context on an existing worker when updating the set
// of servers in a distributed eager cluster.
rpc UpdateContext(UpdateContextRequest) returns (UpdateContextResponse);
// This takes a list of Execute and DeleteTensorHandle operations and enqueues
// them (in async mode) or executes them (in sync mode) on the remote server.
// All outputs of ops that were not explicitly deleted with DeleteTensorHandle
// entries are assumed to be alive and usable by future calls to Enqueue.
rpc Enqueue(EnqueueRequest) returns (EnqueueResponse);
// A streaming version of Enqueue.
// The current server implementation sends one response per received request.
// The benefit of using the streaming version is that subsequent requests can
// be sent without waiting for a response to the previous request; that waiting
// is required with the regular Enqueue call, because gRPC does not guarantee
// to preserve request order across separate calls.
rpc StreamingEnqueue(stream EnqueueRequest) returns (stream EnqueueResponse);
// Takes a set of op IDs and waits until those ops are done. Returns any error
// in the stream so far.
rpc WaitQueueDone(WaitQueueDoneRequest) returns (WaitQueueDoneResponse);
// This takes an Eager operation and executes it in async mode on the remote
// server. Unlike EnqueueRequest, ops/functions sent through this type of
// request are allowed to execute in parallel, and no ordering is preserved by
// the RPC stream or the executor.
// This request type should only be used for executing component functions.
// Ordering of component functions should be enforced by their corresponding
// main functions. The runtime ensures the following invariants for component
// functions (CFs) and their main functions (MFs):
// (1) MF1 -> MF2 ==> CF1 -> CF2 ("->" indicates order of execution);
// (2) MF1 || MF2 ==> CF1 || CF2 ("||" indicates possible parallel execution);
// (3) For CF1 and CF2 that come from the same MF, CF1 || CF2.
// For executing ops/main functions, use Enqueue or StreamingEnqueue instead
// for correct ordering.
rpc RunComponentFunction(RunComponentFunctionRequest)
returns (RunComponentFunctionResponse);
// Contexts are always created with a deadline; if no RPCs for a context arrive
// within that deadline, the context is garbage collected. KeepAlive calls can
// be used to delay this. KeepAlive can also be used to validate the existence
// of a context ID on a remote eager worker: if the context exists on the
// remote worker, the same ID and the current context view ID are returned.
// This is useful for checking whether the remote worker (potentially with the
// same task name and hostname / port) has been replaced with a new process.
rpc KeepAlive(KeepAliveRequest) returns (KeepAliveResponse);
// Closes the context. No calls to other methods using the existing context ID
// are valid after this.
rpc CloseContext(CloseContextRequest) returns (CloseContextResponse);
}
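// A typical client lifecycle against this service (a sketch, not a normative
// contract), expressed as the order of RPCs a client might issue:
//
//   CreateContext(context_id: C)                      # once per remote worker
//   Enqueue / StreamingEnqueue(context_id: C, ...)    # run ops, register/clean up functions
//   WaitQueueDone(context_id: C, op_id: [...])        # synchronize on pending ops
//   KeepAlive(context_id: C)                          # periodically, to delay garbage collection
//   CloseContext(context_id: C, context_view_id: V)   # on shutdown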