The loss comes out fine in the first training epoch, but in the second epoch a device / cuDNN / CUDA error occurs

[ERROR] KERNEL(5124,7f9d16bf9240,python):2022-09-17-15:58:13.292.697 [mindspore/ccsrc/plugin/device/gpu/kernel/nn/flatten_gpu_kernel.h:44] Launch] cudaMemcpyAsync error in FlattenFwdGpuKernelMod::Launch, error code is 700 

[ERROR] DEVICE(5124,7f9d16bf9240,python):2022-09-17-15:58:13.292.710 [mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_device_context.cc:540] LaunchKernel] Launch kernel failed, kernel full name: Gradients/Default/gradAdd/Reshape-op10082 

[CRITICAL] KERNEL(5124,7f9d16bf9240,python):2022-09-17-15:58:13.293.145 [mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv2d_grad_input_gpu_kernel.h:117] Launch] cuDNN Error: ConvolutionBackwardData failed | Error Number: 8 CUDNN_STATUS_EXECUTION_FAILED 

The function call stack: 

In file /home/luoxuewei/miniconda3/lib/python3.9/site-packages/mindspore/ops/_grad/grad_nn_ops.py(65)/        dx = input_grad(dout, w, x_shape)/ 

 
[CRITICAL] KERNEL(5124,7f9d16bf9240,python):2022-09-17-15:58:13.293.288 [mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv2d_grad_input_gpu_kernel.h:117] Launch] cuDNN Error: ConvolutionBackwardData failed | Error Number: 8 CUDNN_STATUS_EXECUTION_FAILED 

The function call stack: 

In file /home/luoxuewei/miniconda3/lib/python3.9/site-packages/mindspore/ops/_grad/grad_nn_ops.py(65)/        dx = input_grad(dout, w, x_shape)/ 

 
[CRITICAL] KERNEL(5124,7f9d16bf9240,python):2022-09-17-15:58:13.293.592 [mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv2d_grad_filter_gpu_kernel.h:118] Launch] cuDNN Error: ConvolutionBackwardFilter failed | Error Number: 8 CUDNN_STATUS_EXECUTION_FAILED 

The function call stack: 

In file /home/luoxuewei/miniconda3/lib/python3.9/site-packages/mindspore/ops/_grad/grad_nn_ops.py(67)/        dw = filter_grad(dout, x, w_shape)/ 

 
[ERROR] DEVICE(5124,7f9d16bf9240,python):2022-09-17-15:58:13.293.700 [mindspore/ccsrc/plugin/device/gpu/hal/device/cuda_driver.cc:167] SyncStream] cudaStreamSynchronize failed, ret[700], an illegal memory access was encountered 

Traceback (most recent call last): 

  File "/home/luoxuewei/shelei/PFST-LSTM-source-4567_x2ms/experiment/CIKM/dec_PFST_ConvLSTM_dataloader_Gan_SA_mindspore.py", line 526, in  

    model.train() 

  File "/home/luoxuewei/shelei/PFST-LSTM-source-4567_x2ms/experiment/CIKM/dec_PFST_ConvLSTM_dataloader_Gan_SA_mindspore.py", line 249, in train 

    output_G = trainer1(in_frame_dat, group_truth) 

  File "/home/luoxuewei/miniconda3/lib/python3.9/site-packages/mindspore/nn/cell.py", line 601, in __call__ 

    raise err 

  File "/home/luoxuewei/miniconda3/lib/python3.9/site-packages/mindspore/nn/cell.py", line 597, in __call__ 

    output = self._run_construct(cast_inputs, kwargs) 

  File "/home/luoxuewei/miniconda3/lib/python3.9/site-packages/mindspore/nn/cell.py", line 416, in _run_construct 

    output = self.construct(*cast_inputs, **kwargs) 

  File "/home/luoxuewei/miniconda3/lib/python3.9/site-packages/mindspore/nn/wrap/cell_wrapper.py", line 375, in construct 

    grads = self.grad(self.network, self.weights)(*inputs, sens) 

  File "/home/luoxuewei/miniconda3/lib/python3.9/site-packages/mindspore/ops/composite/base.py", line 399, in after_grad 

    return grad_(fn, weights)(*args, **kwargs) 

  File "/home/luoxuewei/miniconda3/lib/python3.9/site-packages/mindspore/common/api.py", line 93, in wrapper 

    results = fn(*arg, **kwargs) 

  File "/home/luoxuewei/miniconda3/lib/python3.9/site-packages/mindspore/ops/composite/base.py", line 391, in after_grad 

    out = _pynative_executor(fn, grad_.sens_param, *args, **kwargs) 

  File "/home/luoxuewei/miniconda3/lib/python3.9/site-packages/mindspore/common/api.py", line 951, in __call__ 

    return self._executor(sens_param, obj, args) 

RuntimeError: mindspore/ccsrc/plugin/device/gpu/kernel/nn/conv2d_grad_filter_gpu_kernel.h:118 Launch] cuDNN Error: ConvolutionBackwardFilter failed | Error Number: 8 CUDNN_STATUS_EXECUTION_FAILED 

The function call stack: 

In file /home/luoxuewei/miniconda3/lib/python3.9/site-packages/mindspore/ops/_grad/grad_nn_ops.py(67)/        dw = filter_grad(dout, x, w_shape)/ 

 
[ERROR] DEVICE(5124,7f9d16bf9240,python):2022-09-17-15:58:13.818.362 [mindspore/ccsrc/plugin/device/gpu/hal/device/cuda_driver.cc:167] SyncStream] cudaStreamSynchronize failed, ret[700], an illegal memory access was encountered 

[ERROR] ME(5124,7f9d16bf9240,python):2022-09-17-15:58:13.818.390 [mindspore/ccsrc/runtime/hardware/device_context_manager.cc:81] WaitTaskFinishOnDevice] SyncStream failed 

[ERROR] DEVICE(5124,7f9d16bf9240,python):2022-09-17-15:58:13.829.692 [mindspore/ccsrc/plugin/device/gpu/hal/device/cuda_driver.cc:158] DestroyStream] cudaStreamDestroy failed, ret[700], an illegal memory access was encountered 

[ERROR] DEVICE(5124,7f9d16bf9240,python):2022-09-17-15:58:13.829.710 [mindspore/ccsrc/plugin/device/gpu/hal/device/gpu_device_manager.cc:61] ReleaseDevice] Op Error: Failed to destroy CUDA stream. | Error Number: 0 

[ERROR] DEVICE(5124,7f9d16bf9240,python):2022-09-17-15:58:13.829.724 [mindspore/ccsrc/plugin/device/gpu/hal/device/cuda_driver.cc:158] DestroyStream] cudaStreamDestroy failed, ret[700], an illegal memory access was encountered 

[ERROR] DEVICE(5124,7f9d16bf9240,python):2022-09-17-15:58:13.829.733 [mindspore/ccsrc/plugin/device/gpu/hal/device/gpu_device_manager.cc:61] ReleaseDevice] Op Error: Failed to destroy CUDA stream. | Error Number: 0 

[ERROR] DEVICE(5124,7f9d16bf9240,python):2022-09-17-15:58:13.830.140 [mindspore/ccsrc/plugin/device/gpu/hal/device/gpu_device_manager.cc:67] ReleaseDevice] cuDNN Error: Failed to destroy cuDNN handle | Error Number: 4 CUDNN_STATUS_INTERNAL_ERROR 

[ERROR] DEVICE(5124,7f9d16bf9240,python):2022-09-17-15:58:13.831.354 [mindspore/ccsrc/plugin/device/gpu/hal/device/cuda_driver.cc:48] FreeDeviceMem] cudaFree failed, ret[700], an illegal memory access was encountered 

[CRITICAL] PRE_ACT(5124,7f9d16bf9240,python):2022-09-17-15:58:13.831.371 [mindspore/ccsrc/common/mem_reuse/mem_dynamic_allocator.cc:428] operator()] Free device memory[0x7f992e000000] error. 

Error in atexit._run_exitfuncs: 

RuntimeError: mindspore/ccsrc/common/mem_reuse/mem_dynamic_allocator.cc:428 operator()] Free device memory[0x7f992e000000] error. 

****************************************************Answer*****************************************************

It looks like an operator has run into a memory-related problem. Could you refer to the following post and help pinpoint which operator it is?

https://bbs.huaweicloud.com/forum/thread-169762-1-1.html

Alternatively, if that is unclear, you can also set the following two environment variables and send us the resulting log:

```bash
export CUDA_LAUNCH_BLOCKING=1
export GLOG_v=1
```

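If editing the launch command is inconvenient, the same two variables can also be set at the very top of the training script, before MindSpore is imported. This is a minimal sketch and not part of the original reply:

```python
# Minimal sketch: set the debugging variables before importing mindspore.
import os

# Run CUDA kernels synchronously so the failing operator is reported at its own
# launch site instead of at a later cudaMemcpyAsync / cudaStreamSynchronize call.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# Raise MindSpore's glog level to INFO (0 = DEBUG, 1 = INFO, 2 = WARNING, 3 = ERROR).
os.environ["GLOG_v"] = "1"

import mindspore  # import only after the environment variables are in place
```
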
OK, here is a simple code example that sets the global model for the second round of federated learning to the global model produced by the first round:

```python
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from torch import nn, optim


# Model definition
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(784, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 10)
        self.dropout = nn.Dropout(p=0.2)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = x.view(x.shape[0], -1)
        x = self.dropout(self.relu(self.fc1(x)))
        x = self.dropout(self.relu(self.fc2(x)))
        x = self.fc3(x)
        return x


# Federated-learning server
class FederatedServer:
    def __init__(self, num_clients, train_data, test_data, lr=0.01, batch_size=64, epochs=10):
        self.num_clients = num_clients
        self.train_data = train_data    # list of per-client datasets
        self.test_data = test_data      # shared test DataLoader
        self.lr = lr
        self.batch_size = batch_size
        self.epochs = epochs
        self.clients = []
        self.server_model = Model()
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = optim.SGD(self.server_model.parameters(), lr=self.lr)

        # Initialize the clients
        for i in range(num_clients):
            data_loader = DataLoader(train_data[i], batch_size=self.batch_size, shuffle=True)
            client_model = Model()
            client_optimizer = optim.SGD(client_model.parameters(), lr=self.lr)
            self.clients.append({'data_loader': data_loader,
                                 'model': client_model,
                                 'optimizer': client_optimizer})

    # Train one client's local model
    def train_client_model(self, client):
        client['model'].train()
        for epoch in range(self.epochs):
            for images, labels in client['data_loader']:
                client['optimizer'].zero_grad()
                output = client['model'](images)
                loss = self.criterion(output, labels)
                loss.backward()
                client['optimizer'].step()

    # Aggregate the client models into the server model by simple parameter averaging
    def aggregate_client_models(self):
        with torch.no_grad():
            for param in self.server_model.parameters():
                param.data.zero_()
            for client in self.clients:
                for param, client_param in zip(self.server_model.parameters(),
                                               client['model'].parameters()):
                    param.data += client_param.data / self.num_clients

    # Evaluate the aggregated model on the test set
    def evaluate_model(self):
        self.server_model.eval()
        test_loss = 0
        test_accuracy = 0
        with torch.no_grad():
            for images, labels in self.test_data:
                output = self.server_model(images)
                test_loss += self.criterion(output, labels)
                ps = torch.exp(output)
                top_p, top_class = ps.topk(1, dim=1)
                equals = top_class == labels.view(*top_class.shape)
                test_accuracy += torch.mean(equals.type(torch.FloatTensor))
        return test_loss / len(self.test_data), test_accuracy / len(self.test_data)

    # Run federated training; optionally start from a previous round's global model
    def train(self, global_model=None):
        self.global_model = global_model
        if self.global_model is not None:
            self.server_model.load_state_dict(self.global_model.state_dict())
        for epoch in range(self.epochs):
            for client in self.clients:
                self.train_client_model(client)
            self.aggregate_client_models()
            test_loss, test_accuracy = self.evaluate_model()
            print(f"Epoch {epoch+1}/{self.epochs}, "
                  f"Test Loss: {test_loss:.3f}, Test Accuracy: {test_accuracy:.3f}")
        # Return the updated global model
        return self.server_model
```

This is the same federated-learning server as before, except that a `global_model` parameter has been added. In the `train` method, if `global_model` is not `None`, its parameters are loaded into the server's model, so the global model of the second round of federated learning starts from the global model produced by the first round.
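
For completeness, a minimal usage sketch of the two-round flow described above; `client_datasets` (a list of per-client datasets) and `test_loader` (a test `DataLoader`) are hypothetical placeholders, not objects from the original post:

```python
# Hypothetical driver: run two federated rounds with the FederatedServer defined above.
# client_datasets and test_loader are placeholders; build them from your own data split.
server = FederatedServer(num_clients=len(client_datasets),
                         train_data=client_datasets,
                         test_data=test_loader)

round1_model = server.train()                            # round 1: start from a fresh global model
round2_model = server.train(global_model=round1_model)   # round 2: resume from round 1's global model
```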