tensorflow之device

最新推荐文章于 2022-10-18 12:50:01 发布

wyg_031113

最新推荐文章于 2022-10-18 12:50:01 发布

阅读量845

点赞数

分类专栏： tensorflow 文章标签： tensorflow 人工智能 python

本文链接：https://blog.csdn.net/wyg_031113/article/details/124523813

版权

tensorflow 专栏收录该内容

22 篇文章 2 订阅

订阅专栏

这篇博客详细介绍了TensorFlow中设备的管理，包括设备属性、设备上下文和设备基类。设备在TensorFlow中用于执行计算，如GPU、CPU和TPU。设备属性定义了设备的特性，如内存容量和局部性。设备上下文管理数据在设备间的拷贝和内存分配，而设备基类提供了计算、资源管理和数据传输的基本功能。此外，还讨论了同步和异步计算以及设备的生命周期管理。

摘要由CSDN通过智能技术生成

tensorflow支持多种设备，如GPU,CPU, TPU等待。设备其实就是计算设备。用来执行计算的。在tensorflow中，计算被封装到OP中. OP的实现Kernel实现具体计算。计算还需要数据，因此需要能把数据复制到设备中，然后进行计算。

设备属性

通过proto定义了设备的属性：tensorflow/core/framework/device_attributes.proto


message InterconnectLink {
  int32 device_id = 1;
  string type = 2;
  int32 strength = 3;
}

message LocalLinks {
  repeated InterconnectLink link = 1;
}

message DeviceLocality {
  // Optional bus locality of device.  Default value of 0 means
  // no specific locality.  Specific localities are indexed from 1.
  int32 bus_id = 1;

  // Optional NUMA locality of device.
  int32 numa_node = 2;

  // Optional local interconnect links to other devices.
  LocalLinks links = 3;
}

message DeviceAttributes {
  // Fully specified name of the device within a cluster.
  string name = 1;

  // String representation of device_type.
  string device_type = 2;

  // Memory capacity of device in bytes.
  int64 memory_limit = 4;

  // Platform-specific data about device that may be useful
  // for supporting efficient data transfers.
  DeviceLocality locality = 5;

  // A device is assigned a global unique number each time it is
  // initialized. "incarnation" should never be 0.
  fixed64 incarnation = 6;

  // String representation of the physical device that this device maps to.
  string physical_device_desc = 7;

  // A physical device ID for use in XLA DeviceAssignments, unique across
  // clients in a multi-client setup. Set to -1 if unavailable, non-negative
  // otherwise.
  int64 xla_global_id = 8;
}

设备上下文：DeviceContext

提供stream_executor
提供CPU到非CPU(GPU)设备的Copy数据
提供本设备内部数据拷贝
提供设备内存分配Allocator


// A class that devices can subclass to pass around
// Device-specific context to OpKernels.
class DeviceContext : public core::RefCounted {
 public:
  ~DeviceContext() override {}
  virtual stream_executor::Stream* stream() const { return nullptr; }
  virtual void MaintainLifetimeOnStream(const Tensor* t,
                                        stream_executor::Stream* stream) const {
  }

  // "cpu_tensor" is a tensor on a CPU. Copies "cpu_tensor" into
  // "device_tensor" which is on a non-CPU device "device". "device_tensor"
  // must be allocated to be of the same size as "cpu_tensor".
  virtual void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device,
                                     Tensor* device_tensor, StatusCallback done,
                                     bool sync_dst_compute = true) const {
    done(errors::Internal("Unrecognized device type in CPU-to-device Copy"));
  }

  // Same as CopyCPUTensorToDevice, but in a synchronous way.
  Status CopyCPUTensorToDeviceSync(const Tensor* cpu_tensor, Device* device,
                                   Tensor* device_tensor) const;

  // Copies a tensor in this device.
  virtual void CopyTensorInSameDevice(const Tensor* input_tensor,
                                      Device* device, Tensor* output_tensor,
                                      StatusCallback done) const {
    done(errors::Unimplemented("Copy in same device not implemented."));
  }

  // "device_tensor" is a tensor on a non-CPU device.  Copies
  // device_tensor into "cpu_tensor".  "cpu_tensor" must be allocated
  // to be of the same size as "device_tensor".
  virtual void CopyDeviceTensorToCPU(const Tensor* device_tensor,
                                     StringPiece tensor_name, Device* device,
                                     Tensor* cpu_tensor, StatusCallback done) {
    done(errors::Internal("Unrecognized device type in device-to-CPU Copy"));
  }

  // Same as `CopyDeviceTensorToCPU`, but blocks until the copy is done.
  Status CopyDeviceTensorToCPUSync(const Tensor* device_tensor,
                                   StringPiece tensor_name, Device* device,
                                   Tensor* cpu_tensor);

  // If possible, wait for all events on *stream to complete then execute func.
  // A non-OK Status is returned otherwise.  The stream argument should be the
  // one provided by AcceleratorDeviceInfo.  This function is not applicable to
  // devices that don't provide such a value.
  virtual Status ThenExecute(Device* device, stream_executor::Stream* stream,
                             std::function<void()> func) {
    return errors::Internal("ThenExecute not supported by device");
  }

  // check if device is a pluggable device
  virtual bool IsPluggableDevice() { return false; }

  // Returns the pinned host memory allocator for the device.
  virtual Allocator* host_memory_allocator() const { return nullptr; }
};

设备基类

设备依赖env
设备中有cpu线程池, 也有其他设备上的线程池，eigen tensorflow线程池
设置Allocator以分配内存给Tensor
从Proto反序化出一个tensor到设备上
维护加速设备信息：GPU/TPU


class DeviceBase {
 public:
  explicit DeviceBase(Env* env) : env_(env) {}
  virtual ~DeviceBase();

  Env* env() const { return env_; }

  struct CpuWorkerThreads {
    int num_threads = 0;
    thread::ThreadPool* workers = nullptr;
  };

  // Does not take ownership.
  void set_tensorflow_cpu_worker_threads(CpuWorkerThreads* t) {
    cpu_worker_threads_ = t;
  }

  virtual const CpuWorkerThreads* tensorflow_cpu_worker_threads() const {
    CHECK(cpu_worker_threads_ != nullptr);
    return cpu_worker_threads_;
  }

  // "stream" is used in special circumstances (such as the
  // constructors of Ops) where there is no available OpKernelContext.
  // "default_context" is used by OpKernelContext whenever a device does not
  // supply a DeviceContext for an op in TryGetDeviceContext() (e.g. when only
  // using a single stream.)
  // "event_mgr" is used to delay deallocation of temporary GPU buffers.
  // TODO(pbar) Work out how to move this out of DeviceBase.
  struct AcceleratorDeviceInfo {
    // Make sure all the defaults are NULL, so we can spot missing assignments.
    stream_executor::Stream* stream = nullptr;
    DeviceContext* default_context = nullptr;
    EventMgr* event_mgr = nullptr;
    int gpu_id = -1;
  };

  // Does not take ownership.
  void set_tensorflow_accelerator_device_info(
      AcceleratorDeviceInfo* device_info) {
    accelerator_device_info_ = device_info;
  }

  virtual const AcceleratorDeviceInfo* tensorflow_accelerator_device_info()
      const {
    return accelerator_device_info_;
  }

  // The preferred thread pool for this device. If it is nullptr, the system
  // automatically assigns a thread pool for execution.
  virtual thread::ThreadPool* tensorflow_device_thread_pool() {
    return device_thread_pool_;
  }

  // Does not take ownership.
  void set_eigen_cpu_device(Eigen::ThreadPoolDevice* d);

  // Return the Allocator implementation to use based on the allocator
  // attributes requested.  See allocator.h for more details.
  virtual Allocator* GetAllocator(AllocatorAttributes /*attr*/) {
    LOG(FATAL) << "GetAllocator() is not implemented.";
    return nullptr;
  }

  // This method is provided for backwards compatibility, and will be removed
  // in a future release.
  ABSL_DEPRECATED("Use `this->GetAllocator()` or `this->GetScopedAllocator()`.")
  Allocator* GetStepAllocator(AllocatorAttributes attr, ResourceMgr*) {
    return GetAllocator(attr);
  }

  // Return an Allocator prepared for use in particular places by graph
  // optimization
  virtual Allocator* GetScopedAllocator(AllocatorAttributes attr,
                                        int64_t step_id) {
    LOG(FATAL) << "Device does not implement GetScopedAllocator()";
    return nullptr;
  }

  virtual ScopedAllocatorMgr* GetScopedAllocatorMgr() const { return nullptr; }

  virtual bool has_eigen_cpu_device() const {
    return !eigen_cpu_devices_.empty();
  }

  virtual const Eigen::ThreadPoolDevice* eigen_cpu_device();

  // Caller owns the return value. The OpKernelContext calls this even
  // for devices that do not implement an eigen_gpu_device. Overridden
  // by GPU devices to return a derived type.
  virtual PerOpGpuDevice* MakeGpuDevice() { return nullptr; }

  virtual DeviceBase* UnderlyingDevice() { return this; }
  virtual const DeviceBase* UnderlyingDevice() const { return this; }

  // This is overridden by GPU devices to reinitialize the derived
  // type returned by MakeGpuDevice.
  virtual Status ReinitializeGpuDevice(OpKernelContext* /*context*/,
                                       PerOpGpuDevice* /*device*/,
                                       DeviceContext* /*dc*/,
                                       Allocator* /*allocator*/) {
    return Status::OK();
  }

  // Unimplemented by default
  virtual const DeviceAttributes& attributes() const;
  virtual int NumaNode() const { return attributes().locality().numa_node(); }
  virtual const std::string& name() const;
  virtual const DeviceNameUtils::ParsedName& parsed_name() const;

  // Updates `attributes()`, indicating the XLA global ID associated with this
  // device. This ID is unique across clients in a multi-client setup. For TPUs
  // this does not happen until the TPU system has been initialized.
  //
  // Implemented in Device.
  virtual void set_xla_global_id(int64_t id) {}

  // Materializes the given TensorProto into 'tensor' stored in Device
  // memory.  Most devices will want to override this.
  //
  // TODO(vrv): We should be able to put this function into
  // OpKernelContext and handle the copies from device memory via send
  // and receive nodes, instead of requiring that each device handle
  // the copies here as well as in copy ops.
  virtual Status MakeTensorFromProto(const TensorProto& tensor_proto,
                                     const AllocatorAttributes alloc_attrs,
                                     Tensor* tensor) {
    return errors::Internal("Device does not implement MakeTensorFromProto()");
  }

  // Some devices (i.e. GPUs) may free device memory prior to its actual use
  // being completed on the assumption that subsequent allocations can only be
  // used serially with respect to pending uses.  If this function returns a
  // non-zero value it is the value of a device-specific counter such that any
  // device memory tagged with an earlier freed-at count is really unencumbered
  // by pending uses.  For this to be useful the device memory allocator must
  // be tagging deallocated memory chunks using the same counter.
  virtual uint64 SafeAllocFrontier(uint64 old_value) { return 0; }

  // Copies `input_tensor` to `output_tensor`, where both tensors are on this
  // device. This function assumes that `output_tensor` has already been
  // allocated with a buffer that is large enough to hold `input_tensor`'s data.
  // Calls `done` from a device-specific thread after copy is finished, which
  // may be the same as calling thread.
  //
  // NOTE(ayushd): This function is for TensorFlow internal use only.  Deep copy
  // is discouraged and should not be used in OpKernels.
  virtual void CopyTensorInSameDevice(const Tensor* input_tensor,
                                      Tensor* output_tensor,
                                      const DeviceContext* device_context,
                                      StatusCallback done) {
    done(errors::Internal("Device ", name(), " does not implement ",
                          "CopyTensorInSameDevice"));
  }

 protected:
  // Does not take ownership.
  void set_tensorflow_device_thread_pool(thread::ThreadPool* thread_pool) {
    device_thread_pool_ = thread_pool;
  }

 private:
  Env* const env_;
  CpuWorkerThreads* cpu_worker_threads_ = nullptr;
  // Set by GPUs as well as by TPU devices.
  AcceleratorDeviceInfo* accelerator_device_info_ = nullptr;
  thread::ThreadPool* device_thread_pool_ = nullptr;
  std::vector<Eigen::ThreadPoolDevice*> eigen_cpu_devices_;
};

// Methods to create and check for Symbolic execution devices.
// Such devices are mostly used for TF-XLA bridge. TF should not treat these as
// normal devices.
void AddSymbolicExecutionDevice(absl::string_view device_name);
bool IsSymbolicExecutionDevice(absl::string_view device_name);

设备

设备名
设备类型：从设备属性中获得
同步异步计算：最核心的功能，设备就是用来执行op kernel的
持有ResourceMgr
持有Env
Sync: 计算任务在设备上排队，调用此函数等执行完


class Device : public DeviceBase {
 public:
  // Callback type that takes a Status and returns void.
  typedef std::function<void(const Status&)> DoneCallback;

  Device(Env* env, const DeviceAttributes& device_attributes);
  ~Device() override;

  // Full name of this device (see top comment).
  const std::string& name() const override { return device_attributes_.name(); }

  // Parsed name of this device
  const DeviceNameUtils::ParsedName& parsed_name() const {
    return parsed_name_;
  }

  // Describes what kind of device this is.  This is intended to be
  // human-readable and not computer-parsed, except that two devices
  // with the same device_type() are expected to perform similarly
  // (both from a computation and communication perspective).
  const std::string& device_type() const {
    return device_attributes_.device_type();
  }

  // Returns an aggregation of device attributes.
  const DeviceAttributes& attributes() const override {
    return device_attributes_;
  }

  // Performs the actual compute function.
  //
  // Subclasses may override this function if they wish to perform
  // some initialization before each compute.
  //同步计算，会阻塞
  virtual void Compute(OpKernel* op_kernel, OpKernelContext* context) {
    op_kernel->Compute(context);
  }

  // Asynchronous kernel's compute.
  //异步计算，在执行完成后调用done来通知调用者已经完成
  virtual void ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context,
                            AsyncOpKernel::DoneCallback done) {
    op_kernel->ComputeAsync(context, std::move(done));
  }

  // Blocks until all operations queued on the device at the time of
  // the call have completed.  Returns any error pending on the device
  // at completion.
  virtual Status Sync() = 0;

  // Calls the given callback when all operations queued on the device at the
  // time of the call have completed. The callback is passed any error pending
  // on the device at completion.
  // TODO(b/112409994): Consolidate these two APIs, removing the synchronous
  // version.
  virtual void Sync(const DoneCallback& done);

  // On session completion, the executor may call Device::Sync() depending on
  // flag settings. Override this to return false for devices that don't allow
  // such calls. Instead, these devices must use other mechanisms (such as
  // num_deferred_ops) to ensure the device has finished processing necessary
  // work at session completion. In addition, for these devices, RefreshStatus
  // must be called at session completion to retrieve execution result status.
  //
  // Devices that override this function must also implement RefreshStatus.
  virtual bool AllowsSyncOnCompletion() const { return true; }

  // This is used in conjunction with AllowsSyncOnCompletion to allow the
  // executor to get execution result status at session completion.
  //
  // For supported devices, this call returns the underlying device stream's
  // current status in a non-blocking way, without using blocking calls such as
  // Stream::BlockHostUntilDone or Device::Sync. When applicable, the device
  // status is also updated with the retrieved stream status.
  virtual Status RefreshStatus() {
    return errors::Unimplemented(
        "RefreshStatus is not supported on this device.");
  }

  // Optionally modify the device's GraphDef before execution.
  //
  // This method should be considered experimental and is supplied to enable
  // prototyping of TensorFlow device implementations that need to modify
  // the GraphDef before execution.
  //
  // 'graph' supplies the partition of the graph assigned to this
  // device.
  virtual Status MaybeRewriteGraph(std::unique_ptr<Graph>* /*graph*/) {
    return Status::OK();
  }

  // Sets `out_context` a new DeviceContext* for executing a graph, or nullptr
  // if the device does not support contexts. Returns an error status if any
  // error occurred while trying to create a context, otherwise OK.
  //
  // The caller takes ownership of one reference on the output DeviceContext*,
  // and should call Unref().
  virtual Status TryGetDeviceContext(DeviceContext** out_context) {
    *out_context = nullptr;
    return Status::OK();
  }

  // Returns the op segment of this device.  The caller can reuse op
  // kernels registered for the same session running on this device.
  OpSegment* op_segment() { return &op_seg_; }

  // Returns the resource manager associated w/ this device.
  virtual ResourceMgr* resource_manager() { return rmgr_; }

  // Summarizes the status of this Device, for debugging.
  std::string DebugString() const { return device_attributes_.DebugString(); }

  // Assembles the parameter components into a complete DeviceAttributes value.
  static DeviceAttributes BuildDeviceAttributes(
      const std::string& name, DeviceType device, Bytes memory_limit,
      const DeviceLocality& locality, const std::string& physical_device_desc);

  static DeviceAttributes BuildDeviceAttributes(
      const std::string& name, DeviceType device, Bytes memory_limit,
      const DeviceLocality& locality) {
    // Pass in an empty string as physical device name.
    return BuildDeviceAttributes(name, device, memory_limit, locality, "");
  }

  // Updates `attributes()`, indicating the XLA global ID associated with this
  // device. This ID is unique across clients in a multi-client setup. For TPUs
  // this does not happen until the TPU system has been initialized.
  void set_xla_global_id(int64_t id) override {
    device_attributes_.set_xla_global_id(id);
  }

  // Clears the resource manager associated with this device.
  void ClearResourceMgr() { rmgr_->Clear(); }

  virtual bool IsLocal() const { return true; }

  // Informs if this Device can be used as a caller in RemoteCall operation.
  virtual bool IsRemoteCallAllowed() const;

 protected:
  void DeleteResourceMgr() {
    delete rmgr_;
    rmgr_ = nullptr;
  }

 private:
  DeviceAttributes device_attributes_;
  DeviceNameUtils::ParsedName parsed_name_;

  // op_seg_ maps session handle and op name to OpKernel objects.
  OpSegment op_seg_;

  // Resources associated w/ this device. E.g., shared variables, etc.
  ResourceMgr* rmgr_ = nullptr;

  TF_DISALLOW_COPY_AND_ASSIGN(Device);
};