A note up front: the content below was finished at the end of 2019; I am only now posting the notes to CSDN.
The NCCL- and distributed-training-related configuration needs to be commented out.
libcudart.patch
diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py
index 4591702..07e1268 100644
--- a/torch/cuda/__init__.py
+++ b/torch/cuda/__init__.py
@@ -59,7 +59,7 @@ def _load_cudart():
     if platform.system() == 'Windows':
         lib = find_cuda_windows_lib()
     else:
-        lib = ctypes.cdll.LoadLibrary(None)
+        lib = ctypes.cdll.LoadLibrary("libcudart.so")
     if hasattr(lib, 'cudaGetErrorName'):
         return lib
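The stock code resolves the CUDA runtime through the process-global handle (LoadLibrary(None), i.e. dlopen(NULL)); on the PX2 that lookup evidently does not find the runtime, hence loading libcudart.so by name. A quick way to sanity-check the patched lookup, assuming libcudart.so is resolvable through the loader path (e.g. via LD_LIBRARY_PATH):

import ctypes

# Load the CUDA runtime by name, as the patched _load_cudart does;
# this raises OSError if the dynamic loader cannot find libcudart.so.
lib = ctypes.cdll.LoadLibrary("libcudart.so")
print(hasattr(lib, "cudaGetErrorName"))  # True when the runtime exports are visible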
remove_nccl.patch
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 159b153..6f7423d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -95,7 +95,7 @@ option(USE_LMDB "Use LMDB" ON)
 option(USE_METAL "Use Metal for iOS build" ON)
 option(USE_MOBILE_OPENGL "Use OpenGL for mobile code" ON)
 option(USE_NATIVE_ARCH "Use -march=native" OFF)
-option(USE_NCCL "Use NCCL" ON)
+option(USE_NCCL "Use NCCL" OFF)
 option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF)
 option(USE_NNAPI "Use NNAPI" OFF)
 option(USE_NNPACK "Use NNPACK" ON)
@@ -119,7 +119,7 @@ option(USE_TENSORRT "Using Nvidia TensorRT library" OFF)
 option(USE_ZMQ "Use ZMQ" OFF)
 option(USE_ZSTD "Use ZSTD" OFF)
 option(USE_MKLDNN "Use MKLDNN" OFF)
-option(USE_DISTRIBUTED "Use distributed" ON)
+option(USE_DISTRIBUTED "Use distributed" OFF)
 cmake_dependent_option(
     USE_MPI "Use MPI for Caffe2. Only available if USE_DISTRIBUTED is on." ON
     "USE_DISTRIBUTED" OFF)
diff --git a/tools/setup_helpers/dist_check.py b/tools/setup_helpers/dist_check.py
index 8859fe1..5d2ed1c 100644
--- a/tools/setup_helpers/dist_check.py
+++ b/tools/setup_helpers/dist_check.py
@@ -6,7 +6,7 @@ from .env import IS_CONDA, IS_LINUX, IS_WINDOWS, CONDA_DIR, check_env_flag, check_negative_env_flag
 from .cuda import USE_CUDA
 
 # On ROCm, RCCL development isn't complete. https://github.com/ROCmSoftwarePlatform/rccl
-USE_DISTRIBUTED = not check_negative_env_flag("USE_DISTRIBUTED") and not IS_WINDOWS and not check_env_flag("USE_ROCM")
+USE_DISTRIBUTED = False
 USE_GLOO_IBVERBS = False
 
 IB_DEVINFO_CMD = "ibv_devinfo"
diff --git a/tools/setup_helpers/nccl.py b/tools/setup_helpers/nccl.py
index c1cc886..576f74e 100644
--- a/tools/setup_helpers/nccl.py
+++ b/tools/setup_helpers/nccl.py
@@ -9,7 +9,7 @@ from .env import IS_WINDOWS, IS_DARWIN, IS_CONDA, CONDA_DIR, check_negative_env_flag
 from .cuda import USE_CUDA, CUDA_HOME
 
-USE_NCCL = USE_CUDA and not IS_DARWIN and not IS_WINDOWS
+USE_NCCL = False
 USE_SYSTEM_NCCL = False
 NCCL_LIB_DIR = None
 NCCL_SYSTEM_LIB = None
diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt
index 9c4a018..6849cb1 100644
--- a/torch/CMakeLists.txt
+++ b/torch/CMakeLists.txt
@@ -690,7 +690,7 @@ if (BUILD_PYTHON)
     list(APPEND TORCH_PYTHON_SRCS ${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp)
     list(APPEND TORCH_PYTHON_LINK_LIBRARIES c10d)
     list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D)
-    if (USE_CUDA)
+    if (USE_CUDA AND USE_NCCL)
       list(APPEND TORCH_PYTHON_SRCS ${TORCH_SRC_DIR}/csrc/distributed/c10d/ddp.cpp)
     endif()
   endif()
diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp
index 2b42e1e..11a866d 100644
--- a/torch/csrc/distributed/c10d/init.cpp
+++ b/torch/csrc/distributed/c10d/init.cpp
@@ -435,7 +435,7 @@ They are used in specifying strategies for reduction collectives, e.g.,
           &::c10d::ProcessGroup::Work::wait,
           py::call_guard<py::gil_scoped_release>());
 
-#ifdef USE_CUDA
+#if defined(USE_CUDA) && defined(USE_TORCH)
   module.def(
       "_dist_bucket_tensors",
       &::c10d::bucketTensors,
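After rebuilding with these changes, CUDA itself should still work while the c10d/NCCL bindings are gone. A minimal smoke test (a sketch; the expected values assume the configuration above took effect):

import torch
import torch.distributed as dist

print(torch.cuda.is_available())  # expected: True on the PX2 iGPU
print(dist.is_available())        # expected: False once USE_DISTRIBUTED is OFF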
The iGPU reports insufficient resources: the iGPU has only half as many registers as a dGPU, so the number of CUDA threads per block has to be reduced:
RuntimeError: cuda runtime error (7) : too many resources requested for launch at /opt/zhangmm/docker/pytorch_1.1.0/aten/src/THCUNN/generic/SpatialUpSamplingBilinear.cu:67
Reference: https://github.com/pytorch/pytorch/issues/8103#issuecomment-424343705
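A minimal way to reproduce this error path (bilinear upsampling is what goes through SpatialUpSamplingBilinear.cu); the same call should run cleanly once the patches below are applied:

import torch
import torch.nn.functional as F

x = torch.randn(1, 3, 64, 64, device="cuda")
# Bilinear upsampling is the op that exceeded the launch resource limit above.
y = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
print(y.shape)  # torch.Size([1, 3, 128, 128])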
PyTorch on PX2
diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index 70a7d05b6..48bf1173e 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -24,6 +24,8 @@ void initCUDAContextVectors() {
 void initDeviceProperty(DeviceIndex device_index) {
   cudaDeviceProp device_prop;
   AT_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, device_index));
+  // patch for "too many resources requested for launch"
+  device_prop.maxThreadsPerBlock = device_prop.maxThreadsPerBlock / 2;
   device_properties[device_index] = device_prop;
 }
diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h
index e535f4d83..ac057c504 100644
--- a/aten/src/ATen/cuda/detail/KernelUtils.h
+++ b/aten/src/ATen/cuda/detail/KernelUtils.h
@@ -12,7 +12,10 @@ namespace at { namespace cuda { namespace detail {
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
 
 // Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
 
 // CUDA: number of blocks for threads.
 inline int GET_BLOCKS(const int N)
diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h
index 9e3ed7d85..08fcb4532 100644
--- a/aten/src/THCUNN/common.h
+++ b/aten/src/THCUNN/common.h
@@ -9,7 +9,10 @@
"Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
// Use 1024 threads per block, which requires cuda sm_2x or above
-const int CUDA_NUM_THREADS = 1024;
+//const int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+const int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int N)
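Both hunks leave the GET_BLOCKS helper itself unchanged, so halving CUDA_NUM_THREADS simply doubles the grid size for the same element count. A quick check of that arithmetic, with a Python stand-in for the C++ helper:

def get_blocks(n, cuda_num_threads):
    # Python stand-in for the C++ GET_BLOCKS: ceil(n / cuda_num_threads)
    return (n + cuda_num_threads - 1) // cuda_num_threads

n = 1 << 20
print(get_blocks(n, 1024))  # 1024 blocks with the old 1024-thread default
print(get_blocks(n, 512))   # 2048 blocks with the patched 512-thread value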