#ifndef TENSORFLOW_COMMON_RUNTIME_EXECUTOR_H_
#define TENSORFLOW_COMMON_RUNTIME_EXECUTOR_H_
#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/framework/rendezvous.h"
#include "tensorflow/core/framework/session_state.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/lib/core/notification.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"
namespace tensorflow {
class StepStatsCollector;
// Executor runs a graph computation.
// Example:
// Graph* graph = ...;
// ... construct graph ...
// Executor* executor;
// TF_CHECK_OK(NewSimpleExecutor(my_device, graph, &executor));
// Rendezvous* rendezvous = NewNaiveRendezvous();
// TF_CHECK_OK(rendezvous->Send("input", some_input_tensor));
// TF_CHECK_OK(executor->Run({ExecutorOpts, rendezvous, nullptr}));
// TF_CHECK_OK(rendezvous->Recv("output", &output_tensor));
// ... ...
//
// Multiple threads can call Executor::Run concurrently.
class Executor {
public:
virtual ~Executor() {}
// RunAsync() executes the graph computation. "done" is run when the graph computation completes.
// If any error happens during the computation, "done" is run and the error is passed to "done".
//
// RunAsync() is given a few arguments in Args. The caller must ensure objects passed in Args
// (rendezvous, stats_collector, etc.) are alive at least until done is invoked.
// All pointers to the argument objects can be nullptr.
//
// "step_id" is a process-wide unique identifier for the step being run. Executors on different
// devices may receive the same step_id in the case that a step runs Ops on more than one device.
// The step_id is used for tracking resource usage of a given step.
// "step_id" 是正在运行的步骤的流程范围的唯一标识符。在步骤在多个设备上运行 Ops 的情况下,
// 不同设备上的执行程序可能会收到相同的 step_id。 step_id 用于跟踪给定步骤的资源使用情况。
//
// RunAsync() uses the given "rendezvous", if not null, as the mechanism to communicate
// inputs and outputs of the underlying graph computation.
//
// RunAsync() calls "stats_collector", if not null, to keep track of stats.
// This allows us to collect statistics and traces on demand.
//
// RunAsync() is provided a "call_frame", if the executor is used for executing a function,
// is used to pass arguments and return values between the caller and the callee.
// 如果 executor 用于执行一个函数,RunAsync() 被提供了一个 "call_frame"
// 用于传递参数并在调用者和被调用者之间返回值。
//
// RunAsync() uses "cancellation_manager", if not nullptr, to register callbacks that
// should be called if the graph computation is cancelled.
// Note that the callbacks merely unblock any long-running computation,
// and a cancelled step will terminate by returning/calling the DoneCallback as usual.
// 如果不为空,RunAsync() 使用 "cancellation_manager"注册回调,当图形计算被取消应该调用。
// 请注意,回调只是解除阻止任何长时间运行的计算,一般取消的步骤将通过返回/调用 DoneCallback 来终止。
//
// RunAsync() dispatches closures to "runner".
// Typically, "runner" is backed up by a bounded threadpool.
// RunAsync() 将闭包(closures)分派到 "runner"。通常,"runner" 由有界线程池备份。
struct Args {
int64 step_id = 0;
Rendezvous* rendezvous = nullptr;
StepStatsCollector* stats_collector = nullptr;
FunctionCallFrame* call_frame = nullptr;
CancellationManager* cancellation_manager = nullptr;
SessionState* session_state = nullptr;
TensorStore* tensor_store = nullptr;
ScopedStepContainer* step_container = nullptr;
// If true, calls Sync() on the device.
bool sync_on_finish = false;
typedef std::function<void()> Closure;
typedef std::function<void(Closure)> Runner;
Runner runner = nullptr;
// A callback that is invoked each time a node has finished executing.
typedef std::function<Status(const string& node_name, const int output_slot,
const Tensor* tensor, const bool is_ref,
OpKernelContext* ctx)>
NodeOutputsCallback;
NodeOutputsCallback node_outputs_cb = nullptr;
};
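// A minimal sketch of populating Args. It assumes a caller-owned
// tensorflow thread::ThreadPool* pool and Rendezvous* rendezvous, which are
// not part of this header; other setups are equally valid:
//
//   Executor::Args args;
//   args.step_id = 1;
//   args.rendezvous = rendezvous;
//   args.runner = [pool](Executor::Args::Closure c) {
//     pool->Schedule(std::move(c));
//   };
//   executor->RunAsync(args, [](const Status& s) { TF_CHECK_OK(s); });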
typedef std::function<void(const Status&)> DoneCallback;
virtual void RunAsync(const Args& args, DoneCallback done) = 0;
// Synchronous wrapper for RunAsync().
Status Run(const Args& args) {
Status ret;
Notification n;
RunAsync(args, [&ret, &n](const Status& s) {
ret = s;
n.Notify();
});
n.WaitForNotification();
return ret;
}
};
// Creates an Executor that computes the given "graph".
//
// If successful, returns the constructed executor in "*executor". The
// caller keeps the ownership of "device". The returned executor takes
// the ownership of "graph". Otherwise, returns an error status.
//
// "params" provides a set of context for the executor. We expect that
// different contexts would provide different implementations.
struct LocalExecutorParams {
Device* device;
// The library runtime support.
FunctionLibraryRuntime* function_library = nullptr;
// create_kernel returns an instance of op kernel based on NodeDef.
// delete_kernel is called for every kernel used by the executor
// when the executor is deleted.
std::function<Status(const NodeDef&, OpKernel**)> create_kernel;
std::function<void(OpKernel*)> delete_kernel;
Executor::Args::NodeOutputsCallback node_outputs_cb;
};
::tensorflow::Status NewLocalExecutor(const LocalExecutorParams& params,
const Graph* graph, Executor** executor);
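// Example wiring of LocalExecutorParams and NewLocalExecutor (a sketch only;
// "device", "runtime", "graph_def_version" and "graph" are assumed to be
// provided by the caller). The kernel callbacks simply forward to the
// CreateNonCachedKernel/DeleteNonCachedKernel helpers declared further below:
//
//   LocalExecutorParams params;
//   params.device = device;
//   params.function_library = runtime;
//   params.create_kernel = [device, runtime, graph_def_version](
//       const NodeDef& ndef, OpKernel** kernel) {
//     return CreateNonCachedKernel(device, runtime, ndef, graph_def_version,
//                                  kernel);
//   };
//   params.delete_kernel = [](OpKernel* kernel) {
//     DeleteNonCachedKernel(kernel);
//   };
//   Executor* executor = nullptr;
//   TF_CHECK_OK(NewLocalExecutor(params, graph, &executor));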
// A class to help run multiple executors in parallel and wait until all of them are complete.
//
// ExecutorBarrier deletes itself after the function returned by Get() is called.
class ExecutorBarrier {
public:
typedef std::function<void(const Status&)> StatusCallback;
// Create an ExecutorBarrier for 'num' different executors.
//
// 'r' is the shared Rendezvous object that is used to communicate state.
// If any of the executors experiences an error, the rendezvous object will be aborted exactly once.
//
// 'done' is called after the last executor completes, and ExecutorBarrier is deleted.
//
ExecutorBarrier(int num, Rendezvous* r, StatusCallback done)
: rendez_(r), done_cb_(done), pending_(num) {}
~ExecutorBarrier() {}
// Returns a closure that Executors must call when they are done computing,
// passing the status of their execution as an argument.
StatusCallback Get() {
return std::bind(&ExecutorBarrier::WhenDone, this, std::placeholders::_1);
}
private:
Rendezvous* rendez_ = nullptr;
StatusCallback done_cb_ = nullptr;
mutable mutex mu_;
int pending_ GUARDED_BY(mu_) = 0;
Status status_ GUARDED_BY(mu_);
void WhenDone(const Status& s) {
bool error = false;
Rendezvous* error_rendez = nullptr;
StatusCallback done = nullptr;
Status status;
{
mutex_lock l(mu_);
// If we are the first error encountered, mark the status appropriately and later
// trigger an abort of the Rendezvous object by this thread only.
if (status_.ok() && !s.ok()) {
error = true;
error_rendez = rendez_;
error_rendez->Ref();
status_ = s;
}
// If this is the last call to WhenDone, call the final callback below.
if (--pending_ == 0) {
CHECK(done_cb_ != nullptr);
done = done_cb_;
done_cb_ = nullptr;
}
status = status_;
}
if (error) {
error_rendez->StartAbort(status);
error_rendez->Unref();
}
if (done != nullptr) {
delete this;
done(status);
}
}
TF_DISALLOW_COPY_AND_ASSIGN(ExecutorBarrier);
};
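// A sketch of running several executors in parallel behind one barrier.
// "executors", "rendezvous" and the final callback "done" are assumed to be
// supplied by the caller:
//
//   ExecutorBarrier* barrier =
//       new ExecutorBarrier(executors.size(), rendezvous, done);
//   for (Executor* e : executors) {
//     Executor::Args args;
//     args.rendezvous = rendezvous;
//     e->RunAsync(args, barrier->Get());
//   }
//   // No explicit delete: the barrier deletes itself after the last
//   // executor invokes the callback returned by Get().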
// A few helpers to facilitate creating/deleting kernels.
// Creates a kernel based on "ndef" on device "device". The kernel can access the functions
// in the "flib". The caller takes ownership of returned "*kernel".
Status CreateNonCachedKernel(Device* device, FunctionLibraryRuntime* flib,
const NodeDef& ndef, int graph_def_version, OpKernel** kernel);
// Deletes "kernel" returned by CreateKernel.
void DeleteNonCachedKernel(OpKernel* kernel);
} // end namespace tensorflow
#endif // TENSORFLOW_COMMON_RUNTIME_EXECUTOR_H_
#include "tensorflow/core/common_runtime/executor.h"
#include <atomic>
#include <deque>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "tensorflow/core/common_runtime/costmodel_manager.h"
#include "tensorflow/core/common_runtime/pending_counts.h"
#include "tensorflow/core/common_runtime/step_stats_collector.h"
#include "tensorflow/core/framework/allocation_description.pb.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/cancellation.h"
#include "tensorflow/core/framework/control_flow.h"
#include "tensorflow/core/framework/device_attributes.pb.h"
#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/framework/log_memory.h"
#include "tensorflow/core/framework/node_def_util.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/op_segment.h"
#include "tensorflow/core/framework/step_stats.pb.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_reference.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/framework/types.pb.h"
#include "tensorflow/core/graph/edgeset.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/notification.h"
#include "tensorflow/core/lib/core/stringpiece.h"
#include "tensorflow/core/lib/gtl/flatmap.h"
#include "tensorflow/core/lib/gtl/flatset.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h"
#include "tensorflow/core/lib/gtl/manual_constructor.h"
#include "tensorflow/core/lib/gtl/stl_util.h"
#include "tensorflow/core/lib/hash/hash.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/thread_annotations.h"
#include "tensorflow/core/platform/tracing.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/tensor_slice_reader_cache.h"
namespace tensorflow {
namespace {
// 1-D, 0 element tensor.
static const Tensor* const kEmptyTensor = new Tensor;
bool IsInitializationOp(const Node* node) {
return node->op_def().allows_uninitialized_input();
}
// Sets the timeline_label field of *node_stats, using data from *node.
// Returns true iff the node is a transfer node.
// TODO(tucker): merge with the DetailText function in session.cc in a common location.
bool SetTimelineLabel(const Node* node, NodeExecStats* node_stats) {
bool is_transfer_node = false;
string memory;
for (auto& all : node_stats->memory()) {
int64 tot = all.total_bytes();
if (tot >= 0.1 * 1048576.0) {
int64 peak = all.peak_bytes();
if (peak > 0) {
memory =
strings::StrCat(memory, "[", all.allocator_name(),
strings::Printf(" %.1fMB %.1fMB] ", tot / 1048576.0, peak / 1048576.0));
} else {
memory = strings::StrCat(memory, "[", all.allocator_name(),
strings::Printf(" %.1fMB] ", tot / 1048576.0));
}
}
}
const NodeDef& def = node->def();
string text = "";
if (IsSend(node)) {
string tensor_name;
TF_CHECK_OK(GetNodeAttr(def, "tensor_name", &tensor_name));
string recv_device;
TF_CHECK_OK(GetNodeAttr(def, "recv_device", &recv_device));
text = strings::StrCat(memory, def.name(), " = ", def.op(), "(",
tensor_name, " @", recv_device);
is_transfer_node = true;
} else if (IsRecv(node)) {
string tensor_name;
TF_CHECK_OK(GetNodeAttr(def, "tensor_name", &tensor_name));
string send_device;
TF_CHECK_OK(GetNodeAttr(def, "send_device", &send_device));
text = strings::StrCat(memory, def.name(), " = ", def.op(), "(",
tensor_name, " @", send_device);
is_transfer_node = true;
} else {
text = strings::StrCat(
memory, def.name(), " = ", def.op(), "(",
str_util::Join(
std::vector<StringPiece>(def.input().begin(), def.input().end()),
", "),
")");
}
node_stats->set_timeline_label(text);
return is_transfer_node;
}
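// For illustration only (the exact text depends on the node and its stats):
// an ordinary node's label produced above looks like "add = Add(x, y)",
// while a transfer node's label is prefixed with memory usage and the peer
// device, e.g. "[cpu 1.0MB] y/_1 = _Recv(edge_17_y @/job:worker/.../cpu:0".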
// Helper routines for collecting step stats.
namespace nodestats {
inline int64 NowInUsec() { return Env::Default()->NowMicros(); }
void SetScheduled(NodeExecStats* nt, int64 t) { nt->set_scheduled_micros(t); }
void SetAllStart(NodeExecStats* nt) { nt->set_all_start_micros(NowInUsec()); }
void SetOpStart(NodeExecStats* nt) {
DCHECK_NE(nt->all_start_micros(), 0);
nt->set_op_start_rel_micros(NowInUsec() - nt->all_start_micros());
}
void SetOpEnd(NodeExecStats* nt) {
DCHECK_NE(nt->all_start_micros(), 0);
nt->set_op_end_rel_micros(NowInUsec() - nt->all_start_micros());
}
void SetAllEnd(NodeExecStats* nt) {
DCHECK_NE(nt->all_start_micros(), 0);
nt->set_all_end_rel_micros(NowInUsec() - nt->all_start_micros());
}
void SetOutput(NodeExecStats* nt, int slot, const Tensor* v) {
DCHECK(v);
NodeOutput* no = nt->add_output();
no->set_slot(slot);
v->FillDescription(no->mutable_tensor_description());
}
void SetMemory(NodeExecStats* nt, OpKernelContext* ctx) {
for (const auto& allocator_pair : ctx->wrapped_allocators()) {
AllocatorMemoryUsed* memory = nt->add_memory();
// retrieving the sizes from the wrapped allocator removes the executor's reference to it,
// so allocator_pair.second must not be dereferenced again after this statement
auto sizes = allocator_pair.second->GetSizesAndUnRef();
memory->set_allocator_name(allocator_pair.first->Name());
memory->set_total_bytes(std::get<0>(sizes));
if (allocator_pair.first->TracksAllocationSizes()) {
memory->set_peak_bytes(std::get<1>(sizes));
memory->set_live_bytes(std::get<2>(sizes));
}
}
auto* ms = nt->mutable_memory_stats();
ms->set_host_temp_memory_size(ctx->host_temp_memory_size());
ms->set_device_temp_memory_size(ctx->device_temp_memory_size());
for (const auto& alloc_id : ctx->host_persistent_alloc_ids()) {
ms->mutable_host_persistent_tensor_alloc_ids()->Add(alloc_id);
}
for (const auto& alloc_id : ctx->device_persistent_alloc_ids()) {
ms->mutable_device_persistent_tensor_alloc_ids()->Add(alloc_id);
}
ms->set_host_persistent_memory_size(ctx->host_persistent_memory_allocated());
ms->set_device_persistent_memory_size(
ctx->device_persistent_memory_allocated());
}
void SetReferencedTensors(NodeExecStats* nt,
const TensorReferenceVector& tensors) {
// be careful not to increment the reference count on any tensor
// while recording the information
for (size_t i = 0; i < tensors.size(); ++i) {
AllocationDescription* description = nt->add_referenced_tensor();
tensors.at(i).FillDescription(description);
}
}
} // namespace nodestats
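// The helpers above are typically invoked in this order while executing a
// single node (a sketch, assuming stats collection is enabled and "stats" is
// the NodeExecStats being filled in):
//
//   nodestats::SetScheduled(stats, scheduled_usec);
//   nodestats::SetAllStart(stats);
//   nodestats::SetOpStart(stats);
//   ... run the kernel ...
//   nodestats::SetOpEnd(stats);
//   nodestats::SetOutput(stats, slot, output_tensor);  // once per output
//   nodestats::SetMemory(stats, ctx);
//   nodestats::SetAllEnd(stats);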
class ExecutorImpl;
class GraphView;
struct EdgeInfo {
int dst_id;
int output_slot : 31;
// true if this is the last info for output_slot in the EdgeInfo list.
bool is_last : 1;
int input_slot;
};
struct NodeItem {
NodeItem() {}
// A graph node.
const Node* node = nullptr;
// The kernel for this node.
OpKernel* kernel = nullptr;
bool kernel_is_expensive : 1; // True iff kernel->IsExpensive()
bool kernel_is_async : 1; // True iff kernel->AsAsync() != nullptr
bool is_merge : 1; // True iff IsMerge(node)
bool is_enter : 1; // True iff IsEnter(node)
bool is_exit : 1; // True iff IsExit(node)
bool is_control_trigger : 1; // True iff IsControlTrigger(node)
bool is_sink : 1; // True iff IsSink(node)
// True iff IsEnter(node) || IsExit(node) || IsNextIteration(node)
bool is_enter_exit_or_next_iter : 1;
// Cached values of node->num_inputs() and node->num_outputs(), to avoid levels of indirection.
int num_inputs;
int num_outputs;
// ExecutorImpl::tensors_[input_start] is the 1st positional input for this node.
int input_start = 0;
// Number of output edges.
int num_output_edges;
PendingCounts::Handle pending_id;
const EdgeInfo* output_edge_list() const { return output_edge_base(); }
// ith output edge.
const EdgeInfo& output_edge(int i) const {
DCHECK_GE(i, 0);
DCHECK_LT(i, num_output_edges);
return output_edge_base()[i];
}
DataType input_type(int i) const {
DCHECK_LT(i, num_inputs);
return static_cast<DataType>(input_type_base()[i]);
}
DataType output_type(int i) const {
DCHECK_LT(i, num_outputs);
return static_cast<DataType>(output_type_base()[i]);
}
// Return array of per-output allocator attributes.
const AllocatorAttributes* output_attrs() const { return output_attr_base(); }
private:
friend class GraphView;
// Variable length section starts immediately after *this
// (uint8 is enough for DataType).
// EdgeInfo out_edges[num_out_edges];
// AllocatorAttributes output_attr[num_outputs];
// uint8 input_type[num_inputs];
// uint8 output_type[num_outputs];
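// For example (ignoring alignment, which the actual allocation must respect),
// a node with 3 output edges, 2 outputs and 4 inputs would need roughly
//   sizeof(NodeItem) + 3 * sizeof(EdgeInfo) +
//   2 * sizeof(AllocatorAttributes) + 4 * sizeof(uint8) + 2 * sizeof(uint8)
// bytes in total.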
// Return pointer to variable length section.
char* var() const {
return const_cast<char*>(reinterpret_cast<const char*>(this) + sizeof(NodeItem));
}
EdgeInfo* output_edge_base() const {
return reinterpret_cast<EdgeInfo*>(var());
}
AllocatorAttributes* output_attr_base() const {
return reinterpret_cast<AllocatorAttributes*>(var() + sizeof(EdgeInfo) * num_output_edges);
}
uint8* input_type_base() const {
return reinterpret_cast<uint8*>(var() +
sizeof(EdgeInfo) * num_output_edges +
sizeof(AllocatorAttributes) * num_outputs);
}
uint8* output_type_base() const {
return reinterpret_cast<uint8*>(
var() + sizeof(EdgeInfo) * num_output_edges +
sizeof(AllocatorAttributes) * num_outputs + sizeof(uint8) * num_inputs);
}
TF_DISALLOW_COPY_AND_ASSIGN(NodeItem);
};
typedef gtl::InlinedVector<TensorValue, 4> TensorValueVec;
typedef gtl::InlinedVector<DeviceContext*, 4> DeviceContextVec;
typedef gtl::InlinedVector<AllocatorAttributes, 4> AllocatorAttributeVec;
// Immutable view of a Graph organized for efficient execution.
class GraphView {
public:
GraphView() : space_(nullptr) {}
~GraphView();
void Initialize(const Graph* g);
Status SetAllocAttrs(const Graph* g, const Device* device);
NodeItem* node(int id) const {
DCHECK_GE(id, 0);
DCHECK_LT(id, num_nodes_);
uint32 offset = node_offsets_[id];
return ((offset == kuint32max)
? nullptr
: reinterpret_cast<NodeItem*>(space_ + node_offsets_[id]));
}
private:
char* InitializeNode(char* ptr, const Node* n);
size_t NodeItemBytes(const Node* n);
int32 num_nodes_ = 0;
uint32* node_offsets_ = nullptr; // array of size "graph_.num_node_ids()"
// node_offsets_[id] holds the byte offset for node w/ "id" in space_
char* space_; // NodeItem objects are allocated here
TF_DISALLOW_COPY_AND_ASSIGN(GraphView);
};
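// Typical use (a sketch): the owner calls Initialize() once with the graph it
// will execute, then looks up per-node metadata by node id:
//
//   GraphView gview;
//   gview.Initialize(graph);
//   NodeItem* item = gview.node(node->id());  // nullptr if no item for id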
class ExecutorImpl : public Executor {
public:
ExecutorImpl(const LocalExecutorParams& p, const Graph* g)
: params_(p), graph_(g), gview_() {
CHECK(p.create_kernel != nullptr);
CHECK(p.delete_kernel != nullptr);
}
~ExecutorImpl() override {
for (int i = 0; i < graph_->num_node_ids(); i++) {
NodeItem* item = gview_.node(i);
if (item != nullptr) {
params_.delete_kernel(item->kernel);
}
}
for (auto fiter : frame_info_) {
delete fiter.second;
}
delete graph_;
}
Status Initialize();
// Process all Nodes in the current graph, attempting to infer the
// memory allocation attributes to be used wherever they may allocate
// a tensor buffer.
Status SetAllocAttrs();
void RunAsync(const Args& args, DoneCallback done) override;
private:
friend class ExecutorState;
struct ControlFlowInfo {
gtl::FlatSet<string, HashStr> unique_frame_names;
std::vector<string> frame_names;
};
struct FrameInfo {
FrameInfo()
: input_count(0),
total_inputs(0),
pending_counts(nullptr),
nodes(nullptr) {}
// The total number of inputs to a frame.
int input_count;
// The total number of input tensors of a frame.
// == sum(nodes[*].num_inputs()) where nodes are the nodes in the frame.
int total_inputs;
// Used to determine the next place to allocate space in the
// pending_counts data structure we'll eventually construct
PendingCounts::Layout pending_counts_layout;
// Each frame has its own PendingCounts only for the nodes