pytorch 函数理解
DDP 的使用
# -*- coding: utf-8 -*-
# @Author: solicucu
# @Email: lihj85@mail2.sysu.edu.cn
import torch
import torch.nn as nn
import argparse
import os
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
class Network(nn.Module):
    """Minimal single-convolution model used to demonstrate DDP wrapping.

    Args:
        in_planes: Number of channels of the input tensor.
        out_planes: Number of channels produced by the convolution.
    """

    def __init__(self, in_planes=3, out_planes=64):
        super().__init__()
        # Single 3x3 convolution with stride 2 and padding 1.
        self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=2, padding=1)

    def forward(self, x):
        """Apply the convolution to ``x`` and return the result."""
        return self.conv(x)
def main():
    """Run one DistributedDataParallel forward pass, one process per GPU.

    Meant to be started by a distributed launcher, for example::

        python -m torch.distributed.launch --nproc_per_node 2 ddp.py --device_ids "0,1"

    Command-line arguments:
        --local_rank / --local-rank: Index of the GPU this process should use.
            Supplied by the launcher; when absent, the ``LOCAL_RANK``
            environment variable is used instead.
        --device_ids: Value written to ``CUDA_VISIBLE_DEVICES``.
    """
    parser = argparse.ArgumentParser()
    # Accept both spellings: older launchers pass --local_rank, newer ones
    # pass --local-rank. Both are stored in args.local_rank.
    parser.add_argument("--local_rank", "--local-rank", default=-1, type=int)
    parser.add_argument("--device_ids", default="0", type=str)
    args = parser.parse_args()

    local_rank = args.local_rank
    if local_rank < 0:
        # No rank on the command line: fall back to the LOCAL_RANK
        # environment variable, keeping -1 when it is not set either.
        local_rank = int(os.environ.get("LOCAL_RANK", local_rank))
    device_ids = args.device_ids

    # A simpler alternative is to set the variable when starting the program:
    # CUDA_VISIBLE_DEVICES="4,5,6,7" python -m torch.distributed.launch --nproc_per_node 4 ddp.py
    # NOTE: this assignment overwrites any CUDA_VISIBLE_DEVICES value that was
    # already present in the environment.
    os.environ["CUDA_VISIBLE_DEVICES"] = device_ids

    print("local_rank:", local_rank)
    # When started through dist.launch, a local_rank argument is passed to the
    # script automatically, e.g. with two processes:
    #   local_rank: 0
    #   local_rank: 1

    # Bind this process to its GPU.
    torch.cuda.set_device(local_rank)

    # Initialization only succeeds when the script was started by the launcher.
    dist.init_process_group(backend="nccl")
    try:
        isinit = dist.is_initialized()
        print("is_init", isinit)

        # Total number of participating processes (one per GPU here).
        world_size = dist.get_world_size()
        print("world_size", world_size)

        device = torch.device("cuda", local_rank)
        model = Network().to(device)

        # Wrap the model so it becomes a DDP model.
        model = DDP(model, device_ids=[local_rank], output_device=local_rank)

        x = torch.randn(64, 3, 20, 20).to(device)
        res = model(x)

        if dist.get_rank() == 0:
            # Print from the main process only.
            print(res.size())
    finally:
        # Release the process group even when the forward pass fails.
        dist.destroy_process_group()
# Run the DDP demo only when this file is executed as a script.
if __name__ == "__main__":
    main()
# https://zhuanlan.zhihu.com/p/178402798
"""run.sh
python -m torch.distributed.launch --nproc_per_node 2 ddp.py \
--device_ids "0,1"
"""
torch.Tensor.scatter_(dim, index, src/value)
参数:
dim: 指定一个维度
index: 索引列表
src: 一个源tensor
value: single value
作用:
tensor本身,后面称self, 根据指定的维度dim 和索引 index 来用src里面指定的值替换self的值。
如果是value,统一替换成value值
替换方式:
example: for 3-D tensor
self[index[i][j][k]][j][k] = src[i][j][k] # if dim == 0
# 假如指定了dim = 0,那么表示采用index的值,替换self dim=0 的索引,然后把跟index索引 的src的值赋值过去。
self[i][index[i][j][k]][k] = src[i][j][k] # if dim == 1
self[i][j][index[i][j][k]] = src[i][j][k] # if dim == 2
显然对self,index, src 的 size 要有一定的要求
官网的要求是:
self, index and src (if it is a Tensor) should have same number of dimensions.
It is also required that index.size(d) <= src.size(d) for all dimensions d, and that index.size(d) <= self.size(d) for all dimensions d != dim
意思是他们三个要有相同的维度数,要么都是2-d,3-d,4-d 。。。
index的每个维度大小要小于等于 src 对应维度的大小,这很显然,因为src要根据index的索引进行取值
除了指定的维度dim,index 其他维度的大小都要小于等于self的,假定self为size 为 [n1, n2, n3],
这也很显然,self在dim的数目不影响src,假定dim 是1, index[i][j][k], 这里 j 可以比n2大,但是,有一点官方没说的是,index的所有值,都必须小于n2.
其实,下面做两个测试,除了指定的dim的大小可以不一样,其他维度的大小都必须一样
例如 self 的size 是[n1, n2] dim = 1
那么index 的size 必须为[n1, k] , k 没有限定,如果大于n2,那么就有重复的其实没有什么意义,但是index[i][j] 要小于n2
如果index size 为[k1, k], k1 < n1 ,那是不行的。
def test_scatter_():
    """Demonstrate ``Tensor.scatter_`` with a scalar value and a source tensor.

    Runs three scatter calls along ``dim=1`` and prints each result.
    Returns nothing.
    """
    # Case 1: scatter a scalar value.
    # The index has to be a rectangular tensor; a ragged list such as
    # [[2, 3], [3]] is not allowed.
    z = torch.zeros(2, 4).scatter_(1, torch.tensor([[2, 3], [3, 1]]), 1.23)
    print(z)
    # Walk-through: v = index[0][0] = 2 and dim is 1, so the first element
    # written is self[0][v] = self[0][2] = 1.23. Printed result:
    # tensor([[0.0000, 0.0000, 1.2300, 1.2300],
    #         [0.0000, 1.2300, 0.0000, 1.2300]])

    # Case 2: turn class ids into one-hot rows.
    logits = torch.randn(4, 5)
    target = torch.randint(high=5, size=[4])
    # scatter_ needs index to have as many dimensions as self: [4] -> [4, 1].
    target = target.unsqueeze(1)
    print(target)
    tensor = torch.zeros(logits.size())
    scatter = tensor.scatter_(dim=1, index=target, value=1)
    print(scatter)
    # Example run (target is random, so the values differ between runs):
    # tensor([[3],
    #         [3],
    #         [3],
    #         [0]])
    # index has size [4, 1] and self has size [4, 5]; with dim = 1 the
    # index width 1 < 5 is fine.
    # tensor([[0., 0., 0., 1., 0.],
    #         [0., 0., 0., 1., 0.],
    #         [0., 0., 0., 1., 0.],
    #         [1., 0., 0., 0., 0.]])
    # Walk-through: v = index[0][0] = 3, so self[0][v] = self[0][3] = 1.
    # In other words, scatter_ iterates over index and modifies self.

    # Case 3: scatter values taken from a source tensor.
    source = torch.tensor([[1, 2], [3, 4]], dtype=torch.float32)
    index = torch.tensor([[1, 0], [1, 0]], dtype=torch.int64)
    # The index entries map to self coordinates [[0,1],[0,0],[1,1],[1,0]].
    # Here src and index are 2 x 2 while self is 2 x 4; only the size along
    # the scattered dim differs.
    tensor = torch.zeros(2, 4)
    scatter = tensor.scatter_(1, index, source)
    print(scatter)
    # tensor([[2., 1., 0., 0.],
    #         [4., 3., 0., 0.]])
    # Walk-through: v = index[1][1] = 0,
    # so self[1][v] = self[1][0] = src[1][1] = 4.
F.cross_entropy = - sum_i (i == label) * log(p_i) = - log(p_label)
the input is original scores before softmax
target is the class id that specifies which log-likelihood term is used in the computation
F.log_softmax = log( softmax(x))
x is a vector, softmax(x)_i = exp(x_i) / sum_j exp(x_j) (computes the probability of each class)
F.nll_loss = - sum_i (i == label) * log(p_i) = - log(p_label)
log(p_i) is obtained from log_softmax
torch.nn.functional.cross_entropy(input, target, weight=None, size_average=None, ignore_index=-100, reduce=None, reduction='mean')
input is a tensor with shape [N, C] C is the number of class
target is a tensor with shape [N] where each value is in the range [0, C-1]
softmax is a function compute the probability
torch.nn.NLLLoss(weight=None, size_average=None, ignore_index=-100, reduce=None, reduction='mean')
l_n = -w_{y_n} * x_{n, y_n}, here x_{n, y_n} is the log-probability of the label class y_n for sample n
tensor.normal_(mean=0, std=1, generator=None) → Tensor
这是tensor自带的函数,可以根据指定的均值和标准差初始化权重
tensor.fill_(value)
指定特定值填充
torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, padding_mode='zeros')
in_channels : 输入通道数
out_channels: 输出通道数
kernel_size : 卷积核的尺寸,可以是一个数(那么h和w一样),也可以是一个二元元组
stride:卷积的步长
padding :边界填充的宽度,h,w 同时填充
dilation: 卷积核放大系数,注意此处卷积核原始尺寸为k,那么 放大后变为s = d * (k-1) + 1
groups: 这个就是把输入通道分成多少组来独立卷积,然后再拼接
bias:如果为true,在输出会加一个可学习的偏置值
padding_mode: padding 的填充方式('zeros'、'reflect'、'replicate' 或 'circular'),默认 'zeros' 即用 0 填充
这个计算方式跟没有dilation一样,因为卷积核s = d x(k- 1)+ 1
卷积核的形式如下:
附属一些卷积的可视化:
torch.nn.BatchNorm2d(num_features, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
简单来说,如果batch-size = n
那么一个batch 有 n 个feature map,batchnorm2d的意义就是对于一个batch,按通道进行规范化:对每个通道,在 (N, H, W) 上(即该通道在 n 个样本、所有空间位置上的全部取值)统计均值和方差,再用它们对该通道做规范化。
不同的是,这里的规范化可以伴随着训练的变化而变化,同时有两个可学习的参数(γ 和 β)
num_features : tensor 的通道数
eps : 用于防止分母为0,保证数据的稳定性
momentum: 用来计算移动平均值和方差的动量,e.g.计算公式: new_avg = (1-momentum)* last_avg + momentum * cur_avg
affine: 如果为true 那么γ 和β 就是可以训练的参数,否则分别为1 和 0
track_running_stats: 就是当前模块是否记录batch的均值和方差的滑动估计值。为true时,训练阶段用当前batch的统计量并更新滑动估计值,eval阶段采用滑动估计值;为false时,训练和eval阶段都采用当前batch的值计算。