mxnet代码理解——MXAPIPredictor结构体

最新推荐文章于 2020-10-27 14:30:03 发布

永恒_一瞬

最新推荐文章于 2020-10-27 14:30:03 发布

阅读量706

点赞数

分类专栏：开源架构深度学习文章标签： mxnet 深度学习架构

本文链接：https://blog.csdn.net/yangjf91/article/details/84135401

版权

深度学习同时被 2 个专栏收录

20 篇文章

订阅专栏

开源架构

15 篇文章

订阅专栏

MXAPIPredictor 结构体在 c_predict_api.cc 中定义：

// Predictor interface: the state held by one predictor instance.
struct MXAPIPredictor {
  // Output arrays.
  std::vector<NDArray> out_arrays;
  // Argument arrays.
  std::vector<NDArray> arg_arrays;
  // Auxiliary arrays.
  std::vector<NDArray> aux_arrays;
  // Output shapes.
  std::vector<TShape> out_shapes;
  // uint32_t buffer for output shapes.
  std::vector<uint32_t> out_shapes_buffer;
  // Maps a string key to a size_t; presumably the position of the named
  // argument in arg_arrays -- confirm against the code that fills it.
  std::unordered_map<std::string, size_t> key2arg;
  // Executor, exclusively owned through the unique_ptr.
  std::unique_ptr<Executor> exec;
  // Symbol.
  nnvm::Symbol sym;
  // Context.
  Context ctx;
};

其中包含输入数据 arg_arrays、aux_arrays，输出数据及其形状 out_arrays、out_shapes、out_shapes_buffer，参数名到参数下标的映射 key2arg，执行器 exec，NNVM 的符号（即网络的计算图）sym，以及设备上下文 ctx。其中执行器所属的 Executor 类为：

// Abstract executor interface. Instances are obtained from the static
// Bind / SimpleBind factories or from Reshape; every pure-virtual member
// must be supplied by the concrete executor.
class Executor {
 public:
  virtual ~Executor() {}

  // Runs the forward computation to get the result.
  virtual void Forward(bool is_train) = 0;

  // Issues the operation specified by step. step_left is an output pointer
  // (presumably the number of remaining steps -- confirm in the implementation).
  virtual void PartialForward(bool is_train, int step, int *step_left) = 0;

  // Runs the backward computation from head_grads; the NDArrays specified by
  // grad_in_args_store will be updated accordingly.
  virtual void Backward(const std::vector<NDArray> &head_grads, bool is_train = true) = 0;

  // Prints the execution plan info to the output stream. The base version
  // does nothing.
  virtual void Print(std::ostream &os) const {}

  // Returns the array of outputs in the executor.
  virtual const std::vector<NDArray> &outputs() const = 0;

  // Returns the input argument map of the executor.
  virtual const std::unordered_map<std::string, NDArray>& in_arg_map() const = 0;

  // Returns the input argument gradient map.
  virtual const std::unordered_map<std::string, NDArray>& arg_grad_map() const = 0;

  // Returns the aux state map.
  virtual const std::unordered_map<std::string, NDArray>& aux_state_map() const = 0;

  // Returns a new executor with the same symbol and shared memory, but
  // different input/output shapes. The result is a raw pointer; ownership is
  // not expressed in the signature (presumably the caller's -- confirm).
  virtual Executor* Reshape(const bool partial_shaping,
                            const bool allow_up_sizing,
                            const Context& default_ctx,
                            const std::map<std::string, Context>& ctx_map,
                            const std::unordered_map<std::string, TShape>&
                              provided_arg_shapes,
                            std::vector<NDArray>* in_args,
                            std::vector<NDArray>* arg_grads,
                            std::vector<NDArray>* aux_states) = 0;

  // Creates an operator by binding the symbol with a context and arguments.
  // If the user does not want to compute the gradient of the i-th argument,
  // grad_req_type[i] can be kNullOp.
  // shared_exec is optional and defaults to nullptr (spelled nullptr rather
  // than NULL to match SimpleBind below).
  static Executor *Bind(nnvm::Symbol symbol,
                        const Context& default_ctx,
                        const std::map<std::string, Context>& group2ctx,
                        const std::vector<NDArray> &in_args,
                        const std::vector<NDArray> &arg_grad_store,
                        const std::vector<OpReqType> &grad_req_type,
                        const std::vector<NDArray> &aux_states,
                        Executor* shared_exec = nullptr);

  // Only some of the necessary arrays need to be provided; the other arrays
  // can be inferred automatically. in_args / arg_grads / aux_states are
  // output pointers.
  static Executor* SimpleBind(nnvm::Symbol symbol,
                              const Context& default_ctx,
                              const std::map<std::string, Context>& group2ctx,
                              const std::vector<Context>& in_arg_ctxes,
                              const std::vector<Context>& arg_grad_ctxes,
                              const std::vector<Context>& aux_state_ctxes,
                              const std::unordered_map<std::string, TShape>& arg_shape_map,
                              const std::unordered_map<std::string, int>& arg_dtype_map,
                              const std::unordered_map<std::string, int>& arg_stype_map,
                              const std::vector<OpReqType>& grad_req_types,
                              const std::unordered_set<std::string>& param_names,
                              std::vector<NDArray>* in_args,
                              std::vector<NDArray>* arg_grads,
                              std::vector<NDArray>* aux_states,
                              std::unordered_map<std::string, NDArray>*
                                shared_data_arrays = nullptr,
                              Executor* shared_exec = nullptr);

  // User-defined monitor callback type.
  typedef std::function<void(const char*, void*)> MonitorCallback;

  // Installs a callback to be notified on completion of an operation. The
  // base version ignores the callback.
  virtual void SetMonitorCallback(const MonitorCallback& callback) {}
};  // class executor

sym是nnvm的Symbol类，定义为：

class NNVM_DLL Symbol {
 public:
  enum ListAttrOption {kRecursive = 0, kShallow = 1};//option passed to ListAttr
  enum ListInputOption {kAll = 0, kReadOnlyArgs = 1, kAuxiliaryStates = 2};//option passed to ListInputNames
  std::vector<NodeEntry> outputs; //output entries contained in the symbol
  Symbol Copy() const;//A deep copy of this symbol
  void Print(std::ostream &os) const; // Print the symbol info to output stream.
  Symbol operator[] (size_t index) const;//Get the index-th element from the returned tuple.
  std::vector<NodePtr> ListInputs(ListInputOption option) const;// List the input variable nodes
  std::vector<std::string> ListInputNames(ListInputOption option) //List the input namesconst;
  std::vector<std::string> ListOutputNames() const;//List the names of outputs for this symbol.
   //Compose the symbol with arguments, this changes the current symbol.
  void Compose(const array_view<const Symbol*>& args,
               const std::unordered_map<std::string, const Symbol*>& kwargs,
               const std::string& name);
	//equivalent to Copy then Compose.
  Symbol operator () (const array_view<const Symbol*>& args,
                      const std::unordered_map<std::string, const Symbol*>& kwargs,
                      const std::string& name) const;
  void AddControlDeps(const Symbol& src);	//Add control flow dependencies to the operators in symbols.
  Symbol GetInternals() const;	//A new symbol whose output contains all the outputs of the symbols including input variables and intermediate outputs.
  Symbol GetChildren() const;  	//Get the direct inputs of the head node(s) of this symbol.
  void SetAttrs(const std::vector<std::pair<std::string, std::string> >& attrs);  //Set additional attributes to current node.  	
  bool GetAttr(const std::string& key, std::string* out) const;//Get attributes from the symbol.  	
  std::unordered_map<std::string, std::string> ListAttrs(ListAttrOption option) const;//Get attribute dictionary from the symbol.  	
  std::vector<std::tuple<std::string, std::string, std::string> > ListAttrsRecursive() const;//Get attribute dictionary from the symbol and all children.  
  static Symbol CreateFunctor(const Op* op, std::unordered_map<std::string, std::string> attrs); //Create symbolic functor(AtomicSymbol) by given operator and attributes.
  static Symbol CreateFunctor(const NodeAttrs& attrs); //Create symbolic functor(AtomicSymbol) by given node attributes.
  static Symbol CreateVariable(const std::string& name); //Create symbol node representing variable.
  static Symbol CreateGroup(const std::vector<Symbol>& symbols);//Create equivalence of symbol by grouping the symbols together.
};

ctx是base.h中定义的Context结构体：

// Context: a (device type, device id) pair naming where data lives or
// where an op runs.
struct Context {
  // Type of device.
  enum DeviceType {
    kCPU = cpu::kDevMask,
    kGPU = gpu::kDevMask,
    kCPUPinned = 3,
    kCPUShared = 5,
  };

  // The device type we run the op on.
  DeviceType dev_type;
  // Device id we are going to run it on.
  int32_t dev_id;

  // Default constructor: CPU, device 0.
  Context() : dev_type(kCPU), dev_id(0) {}

  // Returns the device mask: kCPU for the pinned and shared CPU variants,
  // otherwise dev_type itself.
  inline DeviceType dev_mask() const {
    const bool cpu_variant = (dev_type == kCPUPinned) || (dev_type == kCPUShared);
    return cpu_variant ? kCPU : dev_type;
  }

  // Returns dev_id for kGPU and kCPUPinned, 0 otherwise.
  inline int real_dev_id() const {
    const bool carries_id = (dev_type == kCPUPinned) || (dev_type == kGPU);
    return carries_id ? dev_id : 0;
  }

  // Ordering, used to enable Context as a std::map key (declared only here).
  inline bool operator<(const Context &b) const;

  // Two contexts are equal when both device type and device id match.
  inline bool operator==(const Context &b) const {
    if (dev_type != b.dev_type) {
      return false;
    }
    return dev_id == b.dev_id;
  }

  // Negation of operator==.
  inline bool operator!=(const Context &b) const {
    return !(*this == b);
  }

  // Saves the content into a binary stream: dev_type first, then dev_id.
  inline void Save(dmlc::Stream *strm) const {
    strm->Write(&dev_type, sizeof(dev_type));
    strm->Write(&dev_id, sizeof(dev_id));
  }

  // Loads the content from a binary stream in the order Save wrote it.
  // Returns false as soon as a read yields fewer bytes than requested; in
  // that case dev_id is not read after a short dev_type read.
  inline bool Load(dmlc::Stream *strm) {
    const bool type_ok = strm->Read(&dev_type, sizeof(dev_type)) == sizeof(dev_type);
    if (!type_ok) {
      return false;
    }
    return strm->Read(&dev_id, sizeof(int32_t)) == sizeof(int32_t);
  }

  // The maximal device type.
  static const int32_t kMaxDevType = 6;
  // The maximal device index.
  static const int32_t kMaxDevID = 16;

  // Creates a new context.
  inline static Context Create(DeviceType dev_type, int32_t dev_id = -1);
  // Returns a CPU context.
  inline static Context CPU(int32_t dev_id = 0);
  // Creates a GPU context; -1 for the current GPU.
  inline static Context GPU(int32_t dev_id = -1);
  // Creates a pinned CPU context.
  inline static Context CPUPinned(int32_t dev_id = -1);
  // Creates a CPU shared memory context.
  inline static Context CPUShared(int32_t dev_id = 0);
  // Creates a context from a string of the format [cpu|gpu|cpu_pinned].
  inline static Context FromString(const std::string& str);
  // Gets the number of GPUs available.
  inline static int32_t GetGPUCount();
  // Gets the free and total available memory on a GPU.
  inline static void GetGPUMemoryInformation(int dev, uint64_t *free, uint64_t *total);
};

out_shapes 是用 vector 存储的 TShape 数据，TShape 是 nnvm 中定义的表示张量形状的类；为了便于通过 C 接口向外返回形状数据，另用 vector<uint32_t> 类型的 out_shapes_buffer 保存实际的形状数值。TShape 类的定义为：

// Shape type: a Tuple<dim_t> extended with element-count helpers and, when
// MSHADOW_XINLINE is defined, conversions to and comparisons with
// mshadow::Shape<dim>.
class TShape : public Tuple<dim_t> {
 public:
  // Default constructor.
  TShape() = default;

  // Constructs a shape with ndim dimensions, all set to 1.
  inline TShape(uint32_t ndim) {
    this->SetDim(ndim);
    std::fill_n(begin(), ndim, 1);
  }

  // Copy constructor from a tuple.
  inline TShape(const Tuple<dim_t>& s) {
    this->assign(s.begin(), s.end());
  }

  // Constructor from an initializer list.
  inline TShape(std::initializer_list<dim_t> init) {
    this->assign(init.begin(), init.end());
  }

  // Move constructor from a tuple; implemented as a swap with s.
  inline TShape(Tuple<dim_t>&& s) {
    this->swap(s);
  }

  // Constructs the shape from the content of an iterator range.
  template<typename RandomAccessIterator>
  inline TShape(RandomAccessIterator begin, RandomAccessIterator end) {
    this->assign(begin, end);
  }

  // Copy assignment from a tuple.
  inline TShape& operator=(const Tuple<dim_t>& src) {
    this->assign(src.begin(), src.end());
    return *this;
  }

  // Move assignment from a tuple: build a temporary from src, then swap.
  inline TShape& operator=(Tuple<dim_t>&& src) {
    TShape moved(std::move(src));
    moved.swap(*this);
    return *this;
  }

  // Total number of elements in the shape (product of all dimensions;
  // 1 when there are no dimensions).
  inline size_t Size() const {
    dim_t total = 1;
    const dim_t* const last = end();
    const dim_t* cur = begin();
    while (cur != last) {
      total *= *cur;
      ++cur;
    }
    return total;
  }

  // Product of the dimensions in [dimstart, dimend). No bounds checking is
  // performed on the two indices.
  inline size_t ProdShape(int dimstart, int dimend) const {
    const dim_t* dims = this->data();
    dim_t product = 1;
    int axis = dimstart;
    while (axis < dimend) {
      product *= dims[axis];
      ++axis;
    }
    return product;
  }

  // Begin pointer to the content of the tuple (read-only).
  inline const dim_t *data() const {
    return begin();
  }

  // Begin pointer to the content of the tuple (mutable).
  inline dim_t *data() {
    return begin();
  }

#ifdef MSHADOW_XINLINE
  // Construction from an mshadow::Shape (copies dim entries of s.shape_).
  template<int dim>
  inline TShape(const mshadow::Shape<dim> &s) {
    this->assign(s.shape_, s.shape_ + dim);
  }

  // Construction from an rvalue mshadow::Shape; also copies the entries.
  template<int dim>
  inline TShape(mshadow::Shape<dim> &&s) {
    this->assign(s.shape_, s.shape_ + dim);
  }

  // Assignment from an mshadow::Shape.
  template<int dim>
  inline TShape &operator=(const mshadow::Shape<dim> &shape) {
    this->assign(shape.shape_, shape.shape_ + dim);
    return *this;
  }

  // Converts to an mshadow::Shape<dim>; CHECKs that dim equals ndim().
  template<int dim>
  inline mshadow::Shape<dim> get() const {
    CHECK_EQ(dim, static_cast<int>(ndim()))
        << "dimension do not match target dimension " << dim << " vs " << ndim();
    const dim_t *dims = this->data();
    mshadow::Shape<dim> result;
    int axis = 0;
    while (axis < dim) {
      result[axis] = dims[axis];
      ++axis;
    }
    return result;
  }

  // Flattens the leading dimensions into the first one and keeps the last
  // dimension as the second. Returns Shape2(0, 0) when ndim() is 0.
  inline mshadow::Shape<2> FlatTo2D(void) const {
    mshadow::Shape<2> flat;
    if (ndim() == 0) {
      return mshadow::Shape2(0, 0);
    }
    const dim_t *dims = this->data();
    flat.shape_[1] = dims[ndim() - 1];
    // Product of every dimension except the last.
    dim_t leading = 1;
    for (size_t axis = 0; axis + 1 < ndim(); ++axis) {
      leading *= dims[axis];
    }
    flat.shape_[0] = leading;
    return flat;
  }

  // Flattens the shape into three parts:
  // [0, axis_begin), [axis_begin, axis_end], (axis_end, ndim).
  // CHECKs axis_end >= axis_begin; returns Shape3(0, 0, 0) when ndim() is 0.
  inline mshadow::Shape<3> FlatTo3D(size_t axis_begin, size_t axis_end) const {
    CHECK(axis_end >= axis_begin);
    mshadow::Shape<3> flat;
    if (ndim() == 0) {
      return mshadow::Shape3(0, 0, 0);
    }
    const dim_t *dims = this->data();
    for (int part = 0; part < 3; ++part) {
      flat.shape_[part] = 1;
    }
    size_t axis = 0;
    for (; axis < axis_begin; ++axis) {
      flat.shape_[0] *= dims[axis];
    }
    for (axis = axis_begin; axis <= axis_end; ++axis) {
      flat.shape_[1] *= dims[axis];
    }
    for (axis = axis_end + 1; axis < ndim(); ++axis) {
      flat.shape_[2] *= dims[axis];
    }
    return flat;
  }

  // Flattens the axes before and after the given axis, giving a 3D shape.
  inline mshadow::Shape<3> FlatTo3D(size_t axis) const {
    return FlatTo3D(axis, axis);
  }

  // Equal when the dimension counts match and every entry is equal.
  inline bool operator==(const TShape &s) const {
    return ndim() == s.ndim() && std::equal(begin(), end(), s.begin());
  }

  // Negation of operator==(const TShape&).
  inline bool operator!=(const TShape &s) const {
    return !(*this == s);
  }

  // Whether this shape equals an mshadow::Shape<dim>. Reads the base class
  // storage directly: the stack buffer when dim <= kStackCache, else the heap
  // buffer.
  template<int dim>
  inline bool operator==(const mshadow::Shape<dim> &s) const {
    if (ndim_ != dim) {
      return false;
    }
    const dim_t *dims = dim <= kStackCache ? data_stack_ : data_heap_;
    for (size_t axis = 0; axis < dim; ++axis) {
      if (dims[axis] != s.shape_[axis]) {
        return false;
      }
    }
    return true;
  }

  // Whether this shape differs from an mshadow::Shape<dim>.
  template<int dim>
  inline bool operator!=(const mshadow::Shape<dim> &s) const {
    return !(*this == s);
  }
#endif
};

out_arrays、arg_arrays、aux_arrays都是vector存储的NDArray数据，NDArray的定义为：

class NDArray {
 public:
  // Default constructor; takes no arguments and has an empty body.
  NDArray() {}
  // Constructs a new dynamic NDArray. shape, ctx, delay_alloc and dtype are
  // forwarded to a freshly created Chunk; the storage type is fixed to
  // kDefaultStorage.
  NDArray(const TShape &shape, Context ctx, bool delay_alloc = false,
          int dtype = mshadow::default_type_flag)
      : ptr_(std::make_shared<Chunk>(shape, ctx, delay_alloc, dtype)),
        shape_(shape),
        dtype_(dtype),
        storage_type_(kDefaultStorage),
        entry_({nullptr, 0, 0}) {
  }
  // Constructor for an NDArray with an explicit storage type (declared only
  // here). Note that delay_alloc defaults to true in this overload.
  NDArray(const NDArrayStorageType stype,
          const TShape &shape,
          Context ctx,
          bool delay_alloc = true,
          int dtype = mshadow::default_type_flag,
          std::vector<int> aux_types = {},
          std::vector<TShape> aux_shapes = {},
          TShape storage_shape = TShape(mshadow::Shape1(0)));
  // Constructs a static NDArray that shares data with a TBlob; the caller
  // must keep that memory region available throughout the NDArray's life.
  // Shape and dtype are taken from the TBlob.
  NDArray(const TBlob &data, int dev_id)
      : ptr_(std::make_shared<Chunk>(data, dev_id)),
        shape_(data.shape_),
        dtype_(data.type_flag_),
        storage_type_(kDefaultStorage),
        entry_({nullptr, 0, 0}) {
  }
  // Constructs a static NDArray that shares data with a TBlob and carries a
  // custom deleter. When the last owner of the Chunk goes away, the deleter
  // (captured by copy) runs first and the Chunk object is deleted afterwards.
  NDArray(const TBlob &data, int dev_id, const std::function<void()>& deleter)
      : ptr_(new Chunk(data, dev_id),
             [deleter](Chunk *chunk) {
               deleter();     // call custom deleter
               delete chunk;  // delete Chunk object
             }),
        shape_(data.shape_),
        dtype_(data.type_flag_),
        storage_type_(kDefaultStorage),
        entry_({nullptr, 0, 0}) {
  }

  // Creates an NDArray from shared memory; shared_pid, shared_id, shape and
  // dtype are all forwarded to the Chunk constructor.
  NDArray(int shared_pid, int shared_id, const TShape& shape, int dtype)
      : ptr_(std::make_shared<Chunk>(shared_pid, shared_id, shape, dtype)),
        shape_(shape),
        dtype_(dtype),
        storage_type_(kDefaultStorage),
        entry_({nullptr, 0, 0}) {
  }

  // Constructs a static NDArray of non-default storage that shares data with
  // TBlobs. dtype comes from data; the storage type is the given stype.
  NDArray(const NDArrayStorageType stype, const TShape &shape,
          const TBlob &data, const std::vector<TBlob> &aux_data, int dev_id)
      : ptr_(std::make_shared<Chunk>(stype, data, aux_data, dev_id)),
        shape_(shape),
        dtype_(data.type_flag_),
        storage_type_(stype),
        entry_({nullptr, 0, 0}) {
  }

  // Indicates whether the array was created by reshape or slice.
  inline bool IsView() const {
    // A view only exists on default storage. An array that reuses memory may
    // have a shape different from the storage shape, yet it is not
    // considered a view.
    if (storage_type() != kDefaultStorage || reuse_) {
      return false;
    }
    if (byte_offset_ > 0) {
      return true;
    }
    return shape() != ptr_->storage_shape;
  }

  // Checks whether the two arrays are the same array: same chunk pointer,
  // shape, byte offset and dtype, compared in that order.
  inline bool IsSame(const NDArray& other) const {
    if (!(ptr_ == other.ptr_)) return false;
    if (!(shape_ == other.shape_)) return false;
    if (!(byte_offset_ == other.byte_offset_)) return false;
    return dtype_ == other.dtype_;
  }
    
  // Returns the shape of the current NDArray.
  inline const TShape& shape() const {
    return shape_;
  }
  // Returns the shape of the underlying chunk which stores the NDArray
  // data/value. CHECKs that the chunk exists and that the storage type is
  // not kDefaultStorage.
  inline const TShape &storage_shape() const {
    CHECK(ptr_ != nullptr);
    CHECK_NE(storage_type(), kDefaultStorage)
        << "storage_shape() is not intended for kDefaultStorage.";
    return ptr_->storage_shape;
  }
  // Gets the shape of aux_data(index); CHECKs for non-default storage.
  inline const TShape& aux_shape(size_t index) const {
    CHECK_NE(storage_type(), kDefaultStorage) << "...";
    return ptr_->aux_shapes[index];
  }
  // Returns the shapes of all aux data; CHECKs for non-default storage.
  const std::vector<TShape>& aux_shapes() const {
    CHECK_NE(storage_type(), kDefaultStorage) << "...";
    return ptr_->aux_shapes;
  }
  // Returns the dtypes of all aux data; CHECKs for non-default storage.
  const std::vector<int>& aux_types() const {
    CHECK_NE(storage_type(), kDefaultStorage) << "...";
    return ptr_->aux_types;
  }
  // Resets the exact size of aux data `index` once its shape is known, by
  // forwarding to the chunk; CHECKs for non-default storage.
  inline void set_aux_shape(size_t index, const TShape& shape) const {
    CHECK_NE(storage_type(), kDefaultStorage) << "...";
    ptr_->set_aux_shape(index, shape);
  }

  // Returns the data TBlob. For default storage CheckAndAlloc() is called
  // first; tblob_ is then refreshed through SetTBlob() before being returned.
  inline const TBlob& data() const {
    if (storage_type() == kDefaultStorage) {
      CheckAndAlloc();
    }
    SetTBlob();
    return tblob_;
  }
  // Returns the gradient ndarray (declared only here).
  NDArray grad() const;
  // Returns the i-th aux TBlob. The blob wraps the chunk's i-th aux handle
  // pointer using the aux shape, the handle's device mask and the aux type.
  // CHECKs that the storage type is row-sparse or CSR.
  inline TBlob aux_data(size_t i) const {
    auto stype = storage_type();
    TBlob blob;
    auto blob_shape = aux_shape(i);
    auto type_flag = aux_type(i);
    MSHADOW_TYPE_SWITCH(type_flag, DType, {
      auto typed_ptr = static_cast<DType*>(ptr_->aux_handles[i].dptr);
      CHECK(stype == kRowSparseStorage || stype == kCSRStorage)
            << "Unexpected storage type: " << stype;
      blob = TBlob(typed_ptr, blob_shape, ptr_->aux_handles[i].ctx.dev_mask(), type_flag);
    });
    return blob;
  }
  
  // Returns the context of the NDArray, read from the chunk's storage
  // handle; CHECKs that the array is initialized.
  inline Context ctx() const {
    CHECK(!is_none());
    return ptr_->shandle.ctx;
  }
  // Returns the data type of the NDArray.
  inline int dtype() const {
    return dtype_;
  }
  // Returns the type of the i-th aux data; CHECKs that the array is
  // initialized.
  inline int aux_type(size_t i) const {
    CHECK(!is_none());
    return ptr_->aux_types[i];
  }
  // Returns the storage type of the NDArray.
  inline NDArrayStorageType storage_type() const {
    return storage_type_;
  }
  // Returns whether this ndarray is not initialized (no chunk attached).
  inline bool is_none() const {
    return ptr_.get() == nullptr;
  }
  // Returns the updated grad state in entry_ (declared only here).
  bool fresh_out_grad() const;
  // Setter taking the new state (declared only here); presumably writes the
  // flag that fresh_out_grad() reads -- confirm in the definition.
  void set_fresh_out_grad(bool state) const;
  // Reports whether a sparse ndarray's aux_data and storage are initialized.
  // Returns false for an uninitialized array; CHECKs for non-default storage.
  // For row-sparse and CSR storage, the first dimension of the index aux
  // shape must equal the first dimension of the storage shape, and the result
  // is whether that aux shape is non-empty.
  inline bool storage_initialized() const {
    if (is_none()) {
      return false;
    }
    auto stype = storage_type();
    CHECK_NE(stype, kDefaultStorage) << "...";
    if (stype == kRowSparseStorage) {
      CHECK_EQ(aux_shape(rowsparse::kIdx)[0], storage_shape()[0])
               << "inconsistent storage shape " << storage_shape()
               << " vs. aux shape " << aux_shape(rowsparse::kIdx);
      return aux_shape(rowsparse::kIdx).Size() != 0;
    }
    if (stype == kCSRStorage) {
      CHECK_EQ(aux_shape(csr::kIdx)[0], storage_shape()[0])
               << "inconsistent storage shape " << storage_shape()
               << " vs. aux shape " << aux_shape(csr::kIdx);
      return aux_shape(csr::kIdx).Size() != 0;
    }
    // Any other storage type is reported as a fatal error.
    LOG(FATAL) << "Unknown storage type";
    return true;
  }
  // Gets the storage handle. CHECKs that the array is initialized and uses
  // default storage, and calls CheckAndAlloc() before the handle is copied
  // out.
  inline Storage::Handle storage_handle() const {
    CHECK(!is_none());
    CHECK_EQ(storage_type(), kDefaultStorage);
    CheckAndAlloc();
    return ptr_->shandle;
  }
   // Waits on this array's engine variable so that a read can be performed
  // once writes have finished. Does nothing for an uninitialized array.
  inline void WaitToRead() const {
    if (!is_none()) {
      Engine::Get()->WaitForVar(ptr_->var);
    }
  }
  // Waits until read/write operations have finished so a write can be
  // performed. Does nothing for an uninitialized array. Otherwise pushes an
  // async function that only signals completion, passing this array's
  // variable in the last list (presumably the mutable variables -- confirm
  // against Engine::PushAsync), and then waits on that variable.
  inline void WaitToWrite() const {
    if (is_none()) {
      return;
    }
    Engine::Get()->PushAsync(
        [](RunContext, Engine::CallbackOnComplete on_complete) {
          on_complete();
        },
        Context{}, {}, {ptr_->var});
    Engine::Get()->WaitForVar(ptr_->var);
  }
  // Returns the engine variable associated with the ndarray.
  inline Engine::VarHandle var() const {
    return ptr_->var;
  }
  // Returns the byte offset of the ndarray within its chunk.
  inline size_t byte_offset() const {
    return byte_offset_;
  }
  // Returns the version of the NDArray's engine variable.
  inline size_t version() const {
    return var()->version();
  }
  // Saves the content into a binary stream.
  void Save(dmlc::Stream *strm) const;
  // Loads ndarrays written before sparse ndarrays were supported.
  bool LegacyLoad(dmlc::Stream *strm, const uint32_t magic);
  // Loads the content from a binary stream.
  bool Load(dmlc::Stream *strm);
  // Sets all the elements in the ndarray to scalar.
  NDArray &operator=(real_t scalar);
  // Elementwise add into the current space; this mutates the current NDArray.
  NDArray &operator+=(const NDArray &src);
  // Elementwise add of a scalar into the current space.
  NDArray &operator+=(const real_t &src);
  // Elementwise subtraction from the current ndarray.
  NDArray &operator-=(const NDArray &src);
  // Elementwise subtraction of a scalar from the current ndarray.
  NDArray &operator-=(const real_t &src);
  // Elementwise multiplication into the current ndarray.
  NDArray &operator*=(const NDArray &src);
  // Elementwise multiplication of the current ndarray by a scalar.
  NDArray &operator*=(const real_t &src);
  // Elementwise division of the current ndarray.
  NDArray &operator/=(const NDArray &src);
  // Elementwise division of the current ndarray by a scalar.
  NDArray &operator/=(const real_t &src);
  // Returns a new copy of this NDArray for the given context.
  NDArray Copy(Context ctx) const;
  // Does a synchronous copy from a contiguous CPU memory region; will call
  // WaitToWrite.
  void SyncCopyFromCPU(const void *data, size_t size) const;
  // Copies from src.data()/aux_data(i) to this->data()/aux_data(j).
  void SyncCopyFromNDArray(const NDArray &src, int i = -1, int j = -1);
  // Does a synchronous copy to a contiguous CPU memory region; will call
  // WaitToRead.
  void SyncCopyToCPU(void *data, size_t size) const;
  // Checks whether the NDArray format is valid.
  void SyncCheckFormat(const bool full_check) const;
  // Slices an NDArray.
  NDArray Slice(index_t begin, index_t end) const;
  // Slices an NDArray; supports recording with autograd.
  NDArray SliceWithRecord(index_t begin, index_t end);
  // Indexes an NDArray.
  NDArray At(index_t idx) const;
  // Indexing counterpart of At; by analogy with SliceWithRecord it presumably
  // supports autograd recording -- confirm in the definition.
  NDArray AtWithRecord(index_t idx);
  // Generates a deep copy of aux_data(i).
  NDArray aux_ndarray(size_t i) const;
  // Generates a deep copy of data().
  NDArray data_ndarray() const;
  // Creates an NDArray that shares memory with the current one. The new
  // array must not need more memory than the existing storage handle holds
  // (CHECK_GE on the byte size); default storage only, and not on a view.
  inline NDArray AsArray(const TShape &shape, int dtype) const {
    CHECK_EQ(storage_type(), kDefaultStorage) << "...";
    CHECK_GE(ptr_->shandle.size, shape.Size() * mshadow::mshadow_sizeof(dtype)) << "...";
    // We can't reuse memory in a view.
    CHECK(!IsView());
    // Copy this array, then override shape/dtype and mark the copy as
    // reusing memory.
    NDArray alias(*this);
    alias.shape_ = shape;
    alias.dtype_ = dtype;
    alias.reuse_ = true;
    return alias;
  }
  // Creates a reference view of the NDArray as a DLManagedTensor.
  DLManagedTensor* ToDLPack() const;
  // Creates an NDArray using memory allocated by an external deep learning
  // framework that is DLPack compatible.
  static NDArray FromDLPack(const DLManagedTensor* tensor);
  //Update ndarray chunk storage handles using existing ndarray storage handles
  inline void SparseUpdateChunk(const NDArray &arr) const 
  {
    CHECK(shape_ == arr.shape_) << "ndarray shape is different from the target";
    CHECK(dtype_ == arr.dtype_) << "ndarray dtype is different from the target";
    auto stype = arr.storage_type();
    CHECK(stype == kCSRStorage || stype == kRowSparseStorage) << "...";
    // swap shandles between src and dst
    Storage::Handle shandle_dst = arr.ptr_->shandle;
    arr.ptr_->shandle = ptr_->shandle;
    ptr_->shandle = shandle_dst;
    ptr_->storage_shape = arr.ptr_->storage_shape;
    ptr_->storage_type = arr.ptr_->storage_type;
    ptr_->ctx = arr.ptr_->ctx;
    // swap aux_handles between src and dst
    size_t aux_idx = 0;
    CHECK(ptr_->aux_handles.size() == arr.ptr_->aux_handles.size()) << "...";
    for (auto &aux_handle : arr.ptr_->aux_handles) 
    {
      Storage::Handle aux_dst = ptr_->aux_handles[aux_idx];
      ptr_->aux_handles[aux_idx] = aux_handle;
      aux_handle = aux_dst;
      aux_idx++;
    }
    ptr_->aux_types = arr.ptr_->aux_types;
    ptr_->aux_shapes = arr.ptr_->aux_shapes;
  }
  // Gets a reshaped NDArray.
  NDArray Reshape(const TShape &shape) const;
  // Gets a reshaped NDArray; supports autograd recording.
  NDArray ReshapeWithRecord(const TShape &shape);
  // Returns a copy of this NDArray without autograd history: the copy's
  // entry_ is reset to an empty node entry.
  NDArray Detach() const {
    NDArray detached = *this;
    detached.entry_ = nnvm::NodeEntry{nullptr, 0, 0};
    return detached;
  }
  // Returns an nnvm::Symbol (declared only here); judging by the name it is
  // the symbol recorded by autograd for this array -- confirm in the
  // definition.
  nnvm::Symbol get_autograd_symbol() const;
  // Allocates the space if allocation was delayed, by forwarding to the
  // chunk. CHECKs for default storage.
  inline void CheckAndAlloc() const {
    CHECK_EQ(storage_type(), kDefaultStorage);
    ptr_->CheckAndAlloc();
  }
  // Allocates the space if the allocation has been delayed or the requested
  // size is bigger than the available one. Sets shape_ to the new shape and
  // asks the chunk for shape.Size() * sizeof(dtype_) bytes. CHECKs for
  // default storage and an initialized array.
  void ReshapeAndAlloc(const TShape& shape) {
    CHECK_EQ(storage_type(), kDefaultStorage);
    CHECK(!is_none());
    shape_ = shape;
    const auto required_bytes = shape.Size() * mshadow::mshadow_sizeof(dtype_);
    ptr_->CheckAndAlloc(required_bytes);
  }
  // Allocates memory for non-default storage: forwards this array's shape,
  // the given aux shapes and the dtype to the chunk.
  inline void CheckAndAlloc(const std::vector<TShape> &aux_shapes) const {
    CHECK_NE(storage_type(), kDefaultStorage) << "...";
    ptr_->CheckAndAlloc(shape_, aux_shapes, dtype_);
  }
  // Non-default storage only: forwards the storage shape and dtype to the
  // chunk's CheckAndAllocData.
  inline void CheckAndAllocData(const TShape &storage_shape) const {
    CHECK_NE(storage_type(), kDefaultStorage) << "...";
    ptr_->CheckAndAllocData(storage_shape, dtype_);
  }
  // Non-default storage only: forwards the aux index and aux shape to the
  // chunk's CheckAndAllocAuxData.
  inline void CheckAndAllocAuxData(size_t i, const TShape &aux_shape) const {
    CHECK_NE(storage_type(), kDefaultStorage) << "...";
    ptr_->CheckAndAllocAuxData(i, aux_shape);
  }
#if MXNET_USE_MKLDNN == 1
  // Creates an NDArray from mkldnn memory.
  explicit NDArray(const mkldnn::memory *mkldnn_mem, bool static_data = true);

  // Tests whether the data is stored in one of the special MKLDNN formats.
  bool IsMKLDNNData() const {
    return ptr_->IsMKLDNN();
  }

  // Tests whether the data is stored in one of the default MXNet formats.
  bool IsDefaultData() const {
    return ptr_->IsDefault();
  }

  /* There is a shared pointer that holds the memory either in the NDArray or
  in the MKLDNN stream. As long as these functions are called inside an
  operator, the returned memory is always valid. */
  // Returns mkldnn::memory with the default primitive_desc.
  const mkldnn::memory *GetMKLDNNData() const;
  // Returns mkldnn::memory with the given primitive_desc.
  const mkldnn::memory *GetMKLDNNData(const mkldnn::memory::primitive_desc &desc) const;
  // Returns mkldnn::memory with the given primitive_desc, having the same
  // physical layout as the given primitive_desc.
  const mkldnn::memory *GetMKLDNNDataReorder(const mkldnn::memory::primitive_desc &desc) const;

  // Copies data from mkldnn memory.
  void CopyFrom(const mkldnn::memory &mem);
  // Allocates memory for the array and creates mkldnn memory with the
  // specified format.
  mkldnn::memory *CreateMKLDNNData(const mkldnn::memory::primitive_desc &desc);

  // The two functions below change the layout of this NDArray, but that
  // happens after all accesses to the array are complete.
  void Reorder2DefaultAsync();
  void MKLDNNDataReorderAsync(const mkldnn::memory::primitive_desc &desc);

  // Creates a new NDArray with the reordered data.
  NDArray Reorder2Default() const;
  // Declared only here; name suggests it drops the MKLDNN-format data --
  // confirm in the definition.
  void InvalidateMKLDNNData();
  // Reshapes an array; only valid inside the current invocation of this
  // operator.
  NDArray MKLDNNDataReshape(const TShape &shape) const;
  // Fixes an mkldnn memory descriptor mismatch from the NDArray.
  void UpdateMKLDNNMemDesc();
#endif
  // Saves a list of ndarrays, together with their names, into the stream.
  static void Save(dmlc::Stream* fo,
                   const std::vector<NDArray>& data,
                   const std::vector<std::string>& names);
  // Loads a list of ndarrays from the stream; the arrays and their keys are
  // written through the two output pointers.
  static void Load(dmlc::Stream* fi,
                   std::vector<NDArray>* data,
                   std::vector<std::string>* keys);
 private:
  friend class Imperative;
  struct Chunk //the real data chunk that backs NDArray
  {
    // Storage handle from the storage engine; for non-default storage it
    // stores the actual values in the NDArray.
    Storage::Handle shandle;
    // Stores the aux data (such as indices) if needed by non-default storage.
    std::vector<Storage::Handle> aux_handles;
#if MXNET_USE_MKLDNN == 1
    // Created when data is stored in MKLDNN format.
    std::shared_ptr<MKLDNNMemory> mkl_mem_;
#endif
    // Variable from the engine.
    Engine::VarHandle var;
    // Constructed from static data: true means the data does not come from
    // Storage and does not need to be freed.
    bool static_data;
    // Whether data allocation is delayed.
    bool delay_alloc;
    // The type of the storage.
    NDArrayStorageType storage_type = kDefaultStorage;
    // Types of the aux data.
    std::vector<int> aux_types;
    // Context of the data.
    Context ctx;
    // The shape of the chunk data.
    TShape storage_shape;
    // The shapes of the aux data.
    std::vector<TShape> aux_shapes;
    // Default constructor: marks the chunk as static data with no delayed
    // allocation; all remaining members are left default-initialized.
    Chunk()
        : static_data(true),
          delay_alloc(false) {
    }
    // Constructs a new chunk for `shape` elements of type `dtype` on `ctx_`.
    // The storage handle is sized here; the memory is obtained immediately
    // when `delay_alloc_` is false and left for a later CheckAndAlloc() call
    // otherwise.
    Chunk(TShape shape, Context ctx_, bool delay_alloc_, int dtype)
        : static_data(false),
          delay_alloc(true),  // has to be set for CheckAndAlloc() below to allocate
          ctx(ctx_) {
      const auto num_elements = shape.Size();
      storage_shape = shape;
      var = Engine::Get()->NewVariable();
      shandle.size = num_elements * mshadow::mshadow_sizeof(dtype);
      shandle.ctx = ctx_;
      if (delay_alloc_) {
        return;  // caller requested delayed allocation
      }
      this->CheckAndAlloc();
    }
    // Wraps the buffer already held by `data`: the chunk is flagged as static
    // data and the storage handle simply points at the blob's memory, so
    // nothing is allocated here. `dev_id` is only consulted when the blob is
    // not on the CPU.
    Chunk(const TBlob &data, int dev_id)
        : static_data(true),
          delay_alloc(false) {
      CHECK(storage_type == kDefaultStorage);
      var = Engine::Get()->NewVariable();
      // Derive the context from the blob's device mask.
      const bool on_cpu = (data.dev_mask() == cpu::kDevMask);
      if (on_cpu) {
        ctx = Context::CPU();
      } else {
        CHECK_EQ(data.dev_mask(), gpu::kDevMask);
        ctx = Context::GPU(dev_id);
      }
      // Describe the wrapped buffer in the storage handle.
      storage_shape = data.shape_;
      shandle.dptr = data.dptr_;
      shandle.size = data.shape_.Size() * mshadow::mshadow_sizeof(data.type_flag_);
      shandle.ctx = ctx;
    }
    // Creates a chunk in the CPU-shared context. The storage handle carries
    // `shared_pid` and `shared_id` and is passed to Storage::Alloc right away
    // (no delayed allocation).
    Chunk(int shared_pid, int shared_id, const TShape& shape, int dtype)
        : static_data(false),
          delay_alloc(false) {
      var = Engine::Get()->NewVariable();
      ctx = Context::CPUShared(0);
      // Fill in the handle completely before handing it to the storage engine.
      shandle.ctx = ctx;
      shandle.shared_pid = shared_pid;
      shandle.shared_id = shared_id;
      shandle.size = shape.Size() * mshadow::mshadow_sizeof(dtype);
      Storage::Get()->Alloc(&shandle);
      storage_shape = shape;
    }
    // Constructor for a non-default storage chunk.
    // Aux storage is set up for every entry of `aux_shapes_`; the data storage
    // is allocated here as well unless `delay_alloc_` is set.
    Chunk(NDArrayStorageType storage_type_,
          const TShape &storage_shape_,
          Context ctx_,
          bool delay_alloc_,
          int dtype,
          const std::vector<int> &aux_types_,
          const std::vector<TShape> &aux_shapes_)
        : static_data(false),
          delay_alloc(delay_alloc_),
          storage_type(storage_type_),
          aux_types(aux_types_),
          ctx(ctx_),
          storage_shape(storage_shape_),
          aux_shapes(aux_shapes_) {
      shandle.ctx = ctx;
      var = Engine::Get()->NewVariable();
      // Visit every aux slot so that aux_handles always reflects the correct
      // number of aux arrays.
      const size_t num_aux = aux_shapes.size();
      for (size_t idx = 0; idx != num_aux; ++idx) {
        CheckAndAllocAuxData(idx, aux_shapes[idx]);
        // Needed when aux_shapes[idx].Size() is 0: the handle is then not
        // updated by the call above and still holds only default values.
        aux_handles[idx].ctx = ctx;
      }
      if (delay_alloc) {
        return;  // data storage is allocated later
      }
      CheckAndAllocData(storage_shape, dtype);
    }
    Chunk(const NDArrayStorageType storage_type_, const TBlob &data, const std::vector<TBlob> &aux_data, int dev_id)
        : static_data(true), delay_alloc(false), storage_type(storage_type_) 
    {
      using namespace mshadow;
      CHECK_NE(storage_type, kDefaultStorage);
      // init var
      var = Engine::Get()->NewVariable();
      // init ctx
      if (data.dev_mask() == cpu::kDevMask) {ctx = Context::CPU();} 
      else {CHECK_EQ(data.dev_mask(), gpu::kDevMask);ctx = Context::GPU(dev_id);}
      // init shandle
      shandle.ctx = ctx;
      shandle.dptr = data.dptr_;
      shandle.size = data.shape_.Size() * mshadow_sizeof(data.type_flag_);
      storage_shape = data.shape_;
      // init aux handles
      for (const auto &aux : aux_data) 
      {
        Storage::Handle aux_handle;
        aux_handle.ctx = ctx;
        aux_handle.dptr = aux.dptr_;
        aux_handle.size = aux.shape_.Size() * mshadow_sizeof(aux.type_flag_);
        aux_handles.push_back(aux_handle);
        aux_types.emplace_back(aux.type_flag_);
        aux_shapes.emplace_back(aux.shape_);
      }
    }

    // Records `shape` as the shape of the i-th aux array and, when that aux
    // array is the index array of a row-sparse or CSR chunk, copies its first
    // dimension into storage_shape[0]. storage_shape is left alone while it
    // has no dimensions.
    inline void set_aux_shape(const size_t i, const TShape& shape) {
      aux_shapes[i] = shape;
      if (!(storage_shape.ndim() > 0)) {
        return;
      }
      const bool is_row_sparse_idx =
          (storage_type == kRowSparseStorage) && (i == rowsparse::kIdx);
      const bool is_csr_idx =
          (storage_type == kCSRStorage) && (i == csr::kIdx);
      if (is_row_sparse_idx || is_csr_idx) {
        storage_shape[0] = shape[0];
      }
    }
    // Carries out the delayed allocation if it is still pending: allocates
    // shandle.size bytes on shandle.ctx, clears the MKLDNN memory (when built
    // with MKLDNN) and resets delay_alloc. Does nothing otherwise.
    inline void CheckAndAlloc(void) {
      if (!delay_alloc) {
        return;  // nothing pending
      }
      shandle = Storage::Get()->Alloc(shandle.size, shandle.ctx);
#if MXNET_USE_MKLDNN == 1
      mkl_mem_ = nullptr;
#endif
      delay_alloc = false;
    }
    // Ensures a dense (default storage) chunk owns at least `dbytes` bytes.
    // The request is first raised to the current handle size, so the storage
    // is never shrunk. A pending delayed allocation is carried out; an
    // existing buffer that is too small is freed and replaced.
    void CheckAndAlloc(uint64_t dbytes) {
      CHECK_EQ(kDefaultStorage, storage_type) << "...";
      const uint64_t wanted = std::max(dbytes, static_cast<uint64_t>(shandle.size));
      if (!delay_alloc) {
        if (!(shandle.size < wanted)) {
          return;  // current storage is already large enough
        }
        // Release the undersized buffer (if any) before allocating again.
        if (shandle.size > 0) {
          Storage::Get()->Free(shandle);
        }
      }
      shandle = Storage::Get()->Alloc(wanted, shandle.ctx);
#if MXNET_USE_MKLDNN == 1
      mkl_mem_ = nullptr;
#endif
      // Already false on the re-allocation path; clears the pending flag on
      // the delayed-allocation path.
      delay_alloc = false;
    }

    inline void CheckAndAlloc(const TShape &shape, const std::vector<TShape> &aux_shapes, int dtype) 
    {
      // calculate size, perform allocation
      if (kRowSparseStorage == storage_type) {
        // For row sparse, aux_shape indicates the number of rows to allocate
        auto aux_shape = aux_shapes[rowsparse::kIdx];
        CheckAndAllocAuxData(rowsparse::kIdx, aux_shape);
        TShape storage_shape(shape);
        storage_shape[0] = aux_shape[0];
        CheckAndAllocData(storage_shape, dtype);
      } else if (kCSRStorage == storage_type) {
        CheckAndAllocAuxData(csr::kIndPtr, aux_shapes[csr::kIndPtr]);
        CheckAndAllocAuxData(csr::kIdx, aux_shapes[csr::kIdx]);
        CheckAndAllocData(aux_shapes[csr::kIdx], dtype);
      } else {
        LOG(FATAL) << "Storage type " << storage_type << " not implemented for CheckAndAlloc";
      }
    }
    // Creates the storage handle for the data from `shape` and `dtype`,
    // assuming ctx is already set. The storage shape is updated as well.
    // If data is already allocated, the existing storage is reused when
    // possible; otherwise the current storage is freed and new storage is
    // allocated.
    void CheckAndAllocData(const TShape &shape, int dtype);

#if MXNET_USE_MKLDNN == 1
    // Makes the MKL memory reference the data in the default storage, or
    // creates memory for MKLDNN.
    void SetMKLMem(const TShape &shape, int dtype);
    // If the data is stored in MKLDNN layout, reorders the data in mkl_mem_
    // and saves the result in shandle.
    void Reorder2Default();
    // Reorders the data to the layout specified by `desc`.
    void MKLDNNDataReorder(const mkldnn::memory::primitive_desc &desc);
    // NOTE(review): undocumented in this listing; presumably reports whether
    // the data is currently held in MKLDNN layout - confirm.
    bool IsMKLDNN() const;
    // NOTE(review): undocumented in this listing; presumably reports whether
    // the data is currently held in the default layout - confirm.
    bool IsDefault() const;
#endif

    // create storage handle for aux data based on shape this function assumes ctx, aux shapes and aux types are set aux shape is also updated if aux data is already allocated, try reuse the storage. Otherwise, free the current one and allocate new storage
    inline void CheckAndAllocAuxData(size_t i, const TShape &shape) 
    {
      CHECK_EQ(shape.ndim(), 1) << "...";
      CHECK_NE(storage_type, kUndefinedStorage) << "...";
      CHECK_NE(storage_type, kDefaultStorage) << "...";
      if (aux_handles.size() <= i) {aux_handles.resize(i + 1);}
      size_t aux_bytes = shape.Size() * mshadow::mshadow_sizeof(aux_types[i]);
      if (aux_handles[i].size < aux_bytes) 
      {
        // free storage if necessary and alloc again
        if (aux_handles[i].size > 0) Storage::Get()->Free(aux_handles[i]);
        // init aux storage
        aux_handles[i] = Storage::Get()->Alloc(aux_bytes, ctx);
      }      
      set_aux_shape(i, shape);// init shape
    }
    // Destructor. Only declared here; its definition is not part of this listing.
    ~Chunk();
  };  // struct Chunk
  // NOTE(review): undocumented in this listing; presumably refreshes the
  // cached tblob_ below (it is const while tblob_ is mutable) - confirm.
  void SetTBlob() const;
  // Internal data of the NDArray.
  std::shared_ptr<Chunk> ptr_{nullptr};
  // Shape of the current NDArray.
  TShape shape_;
  // Byte offset into the chunk.
  size_t byte_offset_ = 0;
  // Type of the data; initialized to -1.
  int dtype_ = -1;
  // Whether the NDArray uses the memory of another NDArray.
  bool reuse_ = false;
  // Storage type of the data; initialized to kUndefinedStorage.
  NDArrayStorageType storage_type_ = kUndefinedStorage;
  // Node entry for autograd.
  nnvm::NodeEntry entry_;
  // Internal TBlob; mutable so that const members can update it.
  mutable TBlob tblob_;
};  // class NDArray