

#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/framework/rendezvous.h"
#include "tensorflow/core/framework/session_state.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/lib/core/notification.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"

namespace tensorflow {

class StepStatsCollector;

// Executor runs a graph computation.
//
// Example:
//   Graph* graph = ...;
//      ... construct graph ...
//   Executor* executor;
//   TF_CHECK_OK(NewSimpleExecutor(my_device, graph, &executor));
//   Rendezvous* rendezvous = NewNaiveRendezvous();
//   TF_CHECK_OK(rendezvous->Send("input", some_input_tensor));
//   TF_CHECK_OK(executor->Run({ExecutorOpts, rendezvous, nullptr}));
//   TF_CHECK_OK(rendezvous->Recv("output", &output_tensor));
//   ... ...
// Multiple threads can call Executor::Run concurrently.
class Executor {
  virtual ~Executor() {}

  // RunAsync() executes the graph computation. "done" is run when the graph computation completes.
  // If any error happens during the computation, "done" is run and the error is passed to "done".
  // RunAsync() 执行图形计算。 "done" 在图形计算完成时运行。 
  // 如果在计算过程中出现任何错误,运行"done",并将错误传递给"done"。
  // RunAsync() is given a few arguments in Args. The caller must ensure objects passed in Args 
  // (rendezvous, stats_collector, etc.) are alive at least until done is invoked.
  // All pointers to the argument objects can be nullptr.
  //  在 Args 中给出了 RunAsync() 的一些参数 。
  // 调用者必须确保在 Args(rendezvous,stats_collector 等)中传递的对象至少在调用完成之前是活着的。
  // 参数对象的所有指针都可以为 nullptr。
  // "step_id" is a process-wide unique identifier for the step being run. Executors on different 
  // devices may receive the same step_id in the case that a step runs Ops on more than one device. 
  // The step_id is used for tracking resource usage of a given step.
  // "step_id" 是正在运行的步骤的流程范围的唯一标识符。在步骤在多个设备上运行 Ops 的情况下,
  // 不同设备上的执行程序可能会收到相同的 step_id。 step_id 用于跟踪给定步骤的资源使用情况。
  // RunAsync() uses the given "rendezvous", if not null, as the mechanism to communicate
  // inputs and outputs of the underlying graph computation.
  // 如果不为空,RunAsync() 使用给定的 "rendezvous" 作为通信底层图形计算的输入和输出的机制。
  // RunAsync() calls "stats_collector", if not null, to keep track of stats. 
  // This allows us to collect statistics and traces on demand.
  // 如果不为空,RunAsync() 调用 "stats_collector" 来跟踪统计信息。
  // 这使我们能够根据需要收集统计数据和痕迹。
  // RunAsync() is provided a "call_frame", if the executor is used for executing a function,
  // is used to pass arguments and return values between the caller and the callee.
  // 如果 executor 用于执行一个函数,RunAsync() 被提供了一个 "call_frame" 
  // 用于传递参数并在调用者和被调用者之间返回值。
  // RunAsync() uses "cancellation_manager", if not nullptr, to register callbacks that 
  // should be called if the graph computation is cancelled. 
  // Note that the callbacks merely unblock any long-running computation,
  // and a cancelled step will terminate by returning/calling the DoneCallback as usual.
  // 如果不为空,RunAsync() 使用 "cancellation_manager"注册回调,当图形计算被取消应该调用。
  // 请注意,回调只是解除阻止任何长时间运行的计算,一般取消的步骤将通过返回/调用 DoneCallback 来终止。
  // RunAsync() dispatches closures to "runner". 
  // Typically, "runner" is backed up by a bounded threadpool.
  // RunAsync() 将闭包(closures)分派到 "runner"。通常,"runner" 由有界线程池备份。
  struct Args {
    int64 step_id = 0;
    Rendezvous* rendezvous = nullptr;
    StepStatsCollector* stats_collector = nullptr;
    FunctionCallFrame* call_frame = nullptr;
    CancellationManager* cancellation_manager = nullptr;
    SessionState* session_state = nullptr;
    TensorStore* tensor_store = nullptr;
    ScopedStepContainer* step_container = nullptr;

    // If true, calls Sync() on the device.
    bool sync_on_finish = false;

    typedef std::function<void()> Closure;
    typedef std::function<void(Closure)> Runner;
    Runner runner = nullptr;

    // A callback that is invoked each time a node has finished executing.
    typedef std::function<Status(const string& node_name, const int output_slot,
                                 const Tensor* tensor, const bool is_ref,
                                 OpKernelContext* ctx)>
    NodeOutputsCallback node_outputs_cb = nullptr;
  typedef std::function<void(const Status&)> DoneCallback;
  virtual void RunAsync(const Args& args, DoneCallback done) = 0;

  // Synchronous wrapper for RunAsync().
  Status Run(const Args& args) {
    Status ret;
    Notification n;
    RunAsync(args, [&ret, &n](const Status& s) {
      ret = s;
    return ret;

// "params" for NewLocalExecutor(): provides a set of context for the
// executor. We expect that different context would provide different
// implementations.
struct LocalExecutorParams {
  // Device the executor runs on. The caller keeps the ownership of it.
  Device* device = nullptr;

  // The library runtime support.
  FunctionLibraryRuntime* function_library = nullptr;

  // create_kernel returns an instance of op kernel based on NodeDef.
  // delete_kernel is called for every kernel used by the executor
  // when the executor is deleted.
  std::function<Status(const NodeDef&, OpKernel**)> create_kernel;
  std::function<void(OpKernel*)> delete_kernel;

  // Same callback type as Executor::Args::node_outputs_cb.
  Executor::Args::NodeOutputsCallback node_outputs_cb;
};

// Creates an Executor that computes the given "graph".
//
// If successful, returns the constructed executor in "*executor". The
// caller keeps the ownership of "device". The returned executor takes
// the ownership of "graph". Otherwise, returns an error status.
::tensorflow::Status NewLocalExecutor(const LocalExecutorParams& params,
                                      const Graph* graph, Executor** executor);

// A class to help run multiple executors in parallel and wait until all of them are complete.
// ExecutorBarrier deletes itself after the function returned by Get() is called.
class ExecutorBarrier {
  typedef std::function<void(const Status&)> StatusCallback;

  // Create an ExecutorBarrier for 'num' different executors.
  // 'r' is the shared Rendezvous object that is used to communicate state. 
  // If any of the executors experiences an error, the rendezvous object will be aborted exactly once.
  // 'done' is called after the last executor completes, and ExecutorBarrier is deleted.
  ExecutorBarrier(int num, Rendezvous* r, StatusCallback done)
      : rendez_(r), done_cb_(done), pending_(num) {}

  ~ExecutorBarrier() {}

  // Returns a closure that Executors must call when they are done computing,
  // passing the status of their execution as an argument.
  StatusCallback Get() {
    return std::bind(&ExecutorBarrier::WhenDone, this, std::placeholders::_1);

  Rendezvous* rendez_ = nullptr;
  StatusCallback done_cb_ = nullptr;

  mutable mutex mu_;
  int pending_ GUARDED_BY(mu_) = 0;
  Status status_ GUARDED_BY(mu_);

  void WhenDone(const Status& s) {
    bool error = false;
    Rendezvous* error_rendez = nullptr;
    StatusCallback done = nullptr;
    Status status;
      mutex_lock l(mu_);
      // If we are the first error encountered, mark the status appropriately and later
      // trigger an abort of the Rendezvous object by this thread only.
      if (status_.ok() && !s.ok()) {
        error = true;
        error_rendez = rendez_;
        status_ = s;

      // If this is the last call to WhenDone, call the final callback below.
      if (--pending_ == 0) {
        CHECK(done_cb_ != nullptr);
        done = done_cb_;
        done_cb_ = nullptr;

      status = status_;

    if (error) {
    if (done != nullptr) {
      delete this;


// A few helpers to facilitate create/delete kernels.

// Creates a kernel based on "ndef" on device "device". The kernel can access
// the functions in the "flib". The caller takes ownership of returned
// "*kernel" and releases it with DeleteNonCachedKernel().
// "graph_def_version": presumably the version of the GraphDef that "ndef"
// belongs to -- TODO confirm against the implementation.
Status CreateNonCachedKernel(Device* device, FunctionLibraryRuntime* flib,
                          const NodeDef& ndef, int graph_def_version, OpKernel** kernel);

// Deletes "kernel" returned by CreateNonCachedKernel.
void DeleteNonCachedKernel(OpKernel* kernel);

}  // end namespace tensorflow


#include "tensorflow/core/common_runtime/executor.h"

#include <atomic>
#include <deque>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "tensorflow/core/common_runtime/costmodel_manager.h"
#include "tensorflow/core/common_runtime/pending_counts.h"
#include "tensorflow/core/common_runtime/step_stats_collector.h"
#include "tensorflow/core/framework/allocation_description.pb.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/cancellation.h"
#include "tensorflow/core/framework/control_flow.h"
#include "tensorflow/core/framework/device_attributes.pb.h"
#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/framework/log_memory.h"
#include "tensorflow/core/framework/node_def_util.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/op_segment.h"
#include "tensorflow/core/framework/step_stats.pb.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_reference.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/framework/types.pb.h"
#include "tensorflow/core/graph/edgeset.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/notification.h"
#include "tensorflow/core/lib/core/stringpiece.h"
#include "tensorflow/core/lib/gtl/flatmap.h"
#include "tensorflow/core/lib/gtl/flatset.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h"
#include "tensorflow/core/lib/gtl/manual_constructor.h"
#include "tensorflow/core/lib/gtl/stl_util.h"
#include "tensorflow/core/lib/hash/hash.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/thread_annotations.h"
#include "tensorflow/core/platform/tracing.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/tensor_slice_reader_cache.h"

namespace tensorflow {
namespace {

// Shared default-constructed Tensor (1-D, 0 elements). It is allocated once
// at static-initialization time; no matching delete appears in the visible
// code.
static const Tensor* const kEmptyTensor = new Tensor();

// Returns true iff the op definition of "node" declares that it allows
// uninitialized inputs.
bool IsInitializationOp(const Node* node) {
  return node->op_def().allows_uninitialized_input();
}

// Sets the timeline_label field of *node_stats, using data from *node.
// Returns true iff the node is a transfer node.
// TODO(tucker): merge with the DetailText function in session.cc in a common location.
bool SetTimelineLabel(const Node* node, NodeExecStats* node_stats) {
  bool is_transfer_node = false;
  string memory;
  for (auto& all : node_stats->memory()) {
    int64 tot = all.total_bytes();
    if (tot >= 0.1 * 1048576.0) {
      int64 peak = all.peak_bytes();
      if (peak > 0) {
        memory =
            strings::StrCat(memory, "[", all.allocator_name(),
                     strings::Printf(" %.1fMB %.1fMB] ", tot / 1048576.0, peak / 1048576.0));
      } else {
        memory = strings::StrCat(memory, "[", all.allocator_name(),
                     strings::Printf(" %.1fMB] ", tot / 1048576.0));
  const NodeDef& def = node->def();
  string text = "";
  if (IsSend(node)) {
    string tensor_name;
    TF_CHECK_OK(GetNodeAttr(def, "tensor_name", &tensor_name));
    string recv_device;
    TF_CHECK_OK(GetNodeAttr(def, "recv_device", &recv_device));
    text = strings::StrCat(memory, def.name(), " = ", def.op(), "(",
                           tensor_name, " @", recv_device);
    is_transfer_node = true;
  } else if (IsRecv(node)) {
    string tensor_name;
    TF_CHECK_OK(GetNodeAttr(def, "tensor_name", &tensor_name));
    string send_device;
    TF_CHECK_OK(GetNodeAttr(def, "send_device", &send_device));
    text = strings::StrCat(memory, def.name(), " = ", def.op(), "(",
                           tensor_name, " @", send_device);
    is_transfer_node = true;
  } else {
    text = strings::StrCat(
        memory, def.name(), " = ", def.op(), "(",
            std::vector<StringPiece>(def.input().begin(), def.input().end()),
            ", "),
  return is_transfer_node;

// Helper routines for collecting step stats.
namespace nodestats {
// Current time in microseconds as reported by the default Env.
inline int64 NowInUsec() {
  return Env::Default()->NowMicros();
}

// Stores "t" in the scheduled_micros field of *nt.
void SetScheduled(NodeExecStats* nt, int64 t) {
  nt->set_scheduled_micros(t);
}

// Stamps all_start_micros of *nt with the current time. The other Set*
// helpers that record relative times measure from this value.
void SetAllStart(NodeExecStats* nt) {
  const int64 now_usec = NowInUsec();
  nt->set_all_start_micros(now_usec);
}

void SetOpStart(NodeExecStats* nt) {
  DCHECK_NE(nt->all_start_micros(), 0);
  nt->set_op_start_rel_micros(NowInUsec() - nt->all_start_micros());

void SetOpEnd(NodeExecStats* nt) {
  DCHECK_NE(nt->all_start_micros(), 0);
  nt->set_op_end_rel_micros(NowInUsec() - nt->all_start_micros());

void SetAllEnd(NodeExecStats* nt) {
  DCHECK_NE(nt->all_start_micros(), 0);
  nt->set_all_end_rel_micros(NowInUsec() - nt->all_start_micros());

// Appends a new output entry to *nt.
// NOTE(review): the rest of this body (including its closing brace) is not
// visible in this view; "slot" and "v" are unused in the lines shown, so the
// code that fills in the new entry is presumably missing -- confirm.
void SetOutput(NodeExecStats* nt, int slot, const Tensor* v) {
  NodeOutput* no = nt->add_output();

// Adds one memory entry to *nt for every wrapped allocator of "ctx", then
// obtains the memory_stats message and walks the host and device persistent
// allocation ids of "ctx".
// NOTE(review): the bodies of the conditional and of both alloc_id loops are
// not visible in this view, nor are the closing braces; what is written into
// "memory" and "ms" cannot be confirmed from here.
void SetMemory(NodeExecStats* nt, OpKernelContext* ctx) {
  for (const auto& allocator_pair : ctx->wrapped_allocators()) {
    AllocatorMemoryUsed* memory = nt->add_memory();
    // Retrieving the sizes from the wrapped allocator removes the executor's
    // reference to it, so allocator_pair.second must not be dereferenced
    // again after this statement.
    auto sizes = allocator_pair.second->GetSizesAndUnRef();
    if (allocator_pair.first->TracksAllocationSizes()) {
  auto* ms = nt->mutable_memory_stats();
  for (const auto& alloc_id : ctx->host_persistent_alloc_ids()) {
  for (const auto& alloc_id : ctx->device_persistent_alloc_ids()) {

// Appends one referenced_tensor entry to *nt for each element of "tensors".
// NOTE(review): the code that fills in "description" and the closing braces
// are not visible in this view.
void SetReferencedTensors(NodeExecStats* nt,
                          const TensorReferenceVector& tensors) {
  // Be careful not to increment the reference count on any tensor
  // while recording the information.
  for (size_t i = 0; i < tensors.size(); ++i) {
    AllocationDescription* description = nt->add_referenced_tensor();

}  // namespace nodestats

class ExecutorImpl;
class GraphView;

// Compact description of one output edge of a node. NodeItem stores an
// array of these directly after itself (see NodeItem::output_edge_base()).
struct EdgeInfo {
  // Presumably the id of the node this edge points to -- TODO confirm.
  int dst_id;
  // Output slot of the producing node; shares one word with is_last.
  int output_slot : 31;
  // true if this is the last info for output_slot in the EdgeInfo list.
  bool is_last : 1;
  // Presumably the input slot on the destination node -- TODO confirm.
  int input_slot;
};

struct NodeItem {
  NodeItem() {}

  // A graph node.
  const Node* node = nullptr;

  // The kernel for this node.
  OpKernel* kernel = nullptr;

  bool kernel_is_expensive : 1;  // True iff kernel->IsExpensive()
  bool kernel_is_async : 1;      // True iff kernel->AsAsync() != nullptr
  bool is_merge : 1;             // True iff IsMerge(node)
  bool is_enter : 1;             // True iff IsEnter(node)
  bool is_exit : 1;              // True iff IsExit(node)
  bool is_control_trigger : 1;   // True iff IsControlTrigger(node)
  bool is_sink : 1;              // True iff IsSink(node)
  // True iff IsEnter(node) || IsExit(node) || IsNextIteration(node)
  bool is_enter_exit_or_next_iter : 1;

  // Cached values of node->num_inputs() and node->num_outputs(), to avoid levels of indirection.
  int num_inputs;
  int num_outputs;

  // ExecutorImpl::tensors_[input_start] is the 1st positional input for this node.
  int input_start = 0;

  // Number of output edges.
  int num_output_edges;

  PendingCounts::Handle pending_id;

  const EdgeInfo* output_edge_list() const { return output_edge_base(); }

  // ith output edge.
  const EdgeInfo& output_edge(int i) const {
    DCHECK_GE(i, 0);
    DCHECK_LT(i, num_output_edges);
    return output_edge_base()[i];

  DataType input_type(int i) const {
    DCHECK_LT(i, num_inputs);
    return static_cast<DataType>(input_type_base()[i]);
  DataType output_type(int i) const {
    DCHECK_LT(i, num_outputs);
    return static_cast<DataType>(output_type_base()[i]);

  // Return array of per-output allocator attributes.
  const AllocatorAttributes* output_attrs() const { return output_attr_base(); }

  friend class GraphView;

  // Variable length section starts immediately after *this
  // (uint8 is enough for DataType).
  //   EdgeInfo            out_edges[num_out_edges];
  //   AllocatorAttributes output_attr[num_outputs];
  //   uint8               input_type[num_inputs];
  //   uint8               output_type[num_outputs];

  // Return pointer to variable length section.
  char* var() const {
    return const_cast<char*>(reinterpret_cast<const char*>(this) + sizeof(NodeItem));           

  EdgeInfo* output_edge_base() const {
    return reinterpret_cast<EdgeInfo*>(var());
  AllocatorAttributes* output_attr_base() const {
    return reinterpret_cast<AllocatorAttributes*>(var() + sizeof(EdgeInfo) * num_output_edges);
  uint8* input_type_base() const {
    return reinterpret_cast<uint8*>(var() +
                                    sizeof(EdgeInfo) * num_output_edges +
                                    sizeof(AllocatorAttributes) * num_outputs);
  uint8* output_type_base() const {
    return reinterpret_cast<uint8*>(
        var() + sizeof(EdgeInfo) * num_output_edges +
        sizeof(AllocatorAttributes) * num_outputs + sizeof(uint8) * num_inputs);


// Vector aliases used by the executor; each is a gtl::InlinedVector
// instantiated with 4 as its inline-size template argument.
using TensorValueVec = gtl::InlinedVector<TensorValue, 4>;
using DeviceContextVec = gtl::InlinedVector<DeviceContext*, 4>;
using AllocatorAttributeVec = gtl::InlinedVector<AllocatorAttributes, 4>;

// Immutable view of a Graph organized for efficient execution.
class GraphView {
  GraphView() : space_(nullptr) {}

  void Initialize(const Graph* g);
  Status SetAllocAttrs(const Graph* g, const Device* device);

  NodeItem* node(int id) const {
    DCHECK_GE(id, 0);
    DCHECK_LT(id, num_nodes_);
    uint32 offset = node_offsets_[id];
    return ((offset == kuint32max)
                ? nullptr
                : reinterpret_cast<NodeItem*>(space_ + node_offsets_[id]));

  char* InitializeNode(char* ptr, const Node* n);
  size_t NodeItemBytes(const Node* n);

  int32 num_nodes_ = 0;
  uint32* node_offsets_ = nullptr;  // array of size "graph_.num_node_ids()"
  // node_offsets_[id] holds the byte offset for node w/ "id" in space_

  char* space_;  // NodeItem objects are allocated here


class ExecutorImpl : public Executor {
  ExecutorImpl(const LocalExecutorParams& p, const Graph* g)
      : params_(p), graph_(g), gview_() {
    CHECK(p.create_kernel != nullptr);
    CHECK(p.delete_kernel != nullptr);

  ~ExecutorImpl() override {
    for (int i = 0; i < graph_->num_node_ids(); i++) {
      NodeItem* item = gview_.node(i);
      if (item != nullptr) {
    for (auto fiter : frame_info_) {
      delete fiter.second;
    delete graph_;

  Status Initialize();

  // Process all Nodes in the current graph, attempting to infer the
  // memory allocation attributes to be used wherever they may allocate
  // a tensor buffer.
  Status SetAllocAttrs();

  void RunAsync(const Args& args, DoneCallback done) override;

  friend class ExecutorState;

  struct ControlFlowInfo {
    gtl::FlatSet<string, HashStr> unique_frame_names;
    std::vector<string> frame_names;

  struct FrameInfo {
        : input_count(0),
          nodes(nullptr) {}

    // The total number of inputs to a frame.
    int input_count;

    // The total number of input tensors of a frame.
    // == sum(nodes[*].num_inputs()) where nodes are the nodes in the frame.
    int total_inputs;

    // Used to determine the next place to allocate space in the
    // pending_counts data structure we'll eventually construct.
    PendingCounts::Layout pending_counts_layout;

    // Each frame has its own PendingCounts only for the nodes
