To函数
功能: 转换数据的类型或所在设备 (dtype / device)
1.tensor.to()
2.module.to()
例子:
torch.cuda
多GPU分发并行机制
把数据等分,给不同的GPU运行
torch.nn.DataParallel
# ============================ 手动选择gpu
# Restrict CUDA to a hand-picked set of physical GPUs. CUDA_VISIBLE_DEVICES
# must be set before the CUDA context is initialised; setdefault keeps any
# value already exported in the environment.
gpu_list = [2, 3]
gpu_list_str = ','.join(str(idx) for idx in gpu_list)
os.environ.setdefault("CUDA_VISIBLE_DEVICES", gpu_list_str)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# ============================ 依内存情况自动选择主gpu
def get_gpu_memory():
    """Return the free memory (MiB) of each visible NVIDIA GPU.

    Mimics the shell pipeline
    ``nvidia-smi -q -d Memory | grep -A4 GPU | grep Free`` but parses the
    output in-process instead of round-tripping through a temporary file
    (the original left ``tmp.txt`` behind on failure and raced with
    concurrent runs).

    Returns:
        list[int] | bool: free memory per GPU in nvidia-smi order, or
        ``False`` when running on Windows or ``nvidia-smi`` cannot be run.
    """
    import platform
    if platform.system() == 'Windows':
        print("显存计算功能暂不支持windows操作系统")
        return False
    import subprocess
    try:
        result = subprocess.run(
            ['nvidia-smi', '-q', '-d', 'Memory'],
            capture_output=True, text=True, check=True,
        )
    except (OSError, subprocess.SubprocessError):
        # nvidia-smi missing or failing: behave like "no info available"
        # instead of crashing (the original raised FileNotFoundError here).
        return False
    memory_gpu = []
    window = 0  # emulates grep's -A4: how many lines remain in a "GPU" window
    for line in result.stdout.splitlines():
        if 'GPU' in line:
            window = 4
            continue
        if window:
            window -= 1
            if 'Free' in line:
                # e.g. "    Free : 12345 MiB" -> token at index 2 is the number
                memory_gpu.append(int(line.split()[2]))
    return memory_gpu
gpu_memory = get_gpu_memory()
# BUG FIX: the original guarded this section with `if not gpu_memory:`, which
# ran the ranking exactly when NO memory info was available (and skipped it
# when there was) — the condition was inverted.
if gpu_memory:
    print("\ngpu free memory: {}".format(gpu_memory))
    # GPU indices sorted by free memory, largest first, so the device with
    # the most headroom becomes cuda:0 (plain stdlib; numpy not required).
    gpu_list = sorted(range(len(gpu_memory)), key=gpu_memory.__getitem__, reverse=True)
    gpu_list_str = ','.join(map(str, gpu_list))
    os.environ.setdefault("CUDA_VISIBLE_DEVICES", gpu_list_str)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
常见报错
1.原因: 模型在gpu上训练保存, 在无gpu的环境加载时需用map_location把张量映射到cpu上
path_state_dict = "./model_in_gpu_0.pkl"
# map_location remaps every stored tensor onto the CPU while deserializing,
# so a checkpoint saved from a CUDA run can be opened on a CPU-only machine.
state_dict_load = torch.load(path_state_dict, map_location=torch.device("cpu"))
print("state_dict_load:\n{}".format(state_dict_load))
2.多GPU(DataParallel)训练出来的模型, 需要更改各层的名字, 因为键名前面多了一个module.前缀
from collections import OrderedDict

# nn.DataParallel stores the wrapped net under an attribute named `module`,
# so every key in its state_dict carries a leading "module." that the keys
# of a plain (non-parallel) network lack. Strip that prefix where present.
new_state_dict = OrderedDict(
    (key[7:] if key.startswith('module.') else key, value)
    for key, value in state_dict_load.items()
)
print("new_state_dict:\n{}".format(new_state_dict))

net.load_state_dict(new_state_dict)
让网络加载new_state_dict即可