1.Bridge with NumPy
- Tensors 与 NumPy 数组指向同一块内存 (Tensors on the CPU and NumPy arrays can share their underlying memory locations, and changing one will change the other)
1.1 Tensor To Numpy Array
# Create a CPU tensor and obtain a NumPy array from it via .numpy().
t = torch.ones(5)
n = t.numpy()

# Modify the tensor in place; printing both shows the array changed as well.
t.add_(1)
print(f"t: {t}", f"n: {n}", sep="\n")
1.2 Numpy Array To Tensor
# Start from a NumPy array and wrap it as a tensor with torch.from_numpy().
n = np.ones(5)
t = torch.from_numpy(n)

# Update the array in place; printing both shows the tensor changed as well.
n += 1
print(f"t: {t}", f"n: {n}", sep="\n")
2 Usage in Pytorch —— An example
import torch, torchvision
# Pretrained ResNet-18 plus one random input batch and a random target.
model = torchvision.models.resnet18(pretrained=True)
data = torch.rand(1, 3, 64, 64)
labels = torch.rand(1, 1000)

# Forward pass.
prediction = model(data)

# Reduce the prediction error to a single scalar that can be back-propagated.
loss = torch.sum(prediction - labels)

# Backward pass: autograd computes the gradients and stores them in the
# .grad attribute of the respective tensors.
loss.backward()

# Register all model parameters with the optimizer.
optim = torch.optim.SGD(params=model.parameters(), lr=1e-2, momentum=0.9)

# Gradient-descent step: each parameter is adjusted by the gradient in .grad.
optim.step()
# ---- At this point, everything needed to train a NN has been shown.
3 Differentiate in autograd
import torch
# Only tensors of floating point dtype can require gradients,
# so the literals must be [2., 3.] rather than [2, 3].
a = torch.tensor([2.0, 3.0], requires_grad=True)
b = torch.tensor([6.0, 4.0], requires_grad=True)

# Q is a vector, so backward() is given an explicit gradient of the same shape.
Q = 3 * a.pow(3) - b.pow(2)
external_grad = torch.tensor([1.0, 1.0])
Q.backward(gradient=external_grad)

# Compare against the analytic derivatives dQ/da = 9a^2 and dQ/db = -2b.
print(9 * a.pow(2) == a.grad)
print(-2 * b == b.grad)
4 Frozen params
from torch import nn, optim
import torchvision
# Load a pretrained ResNet-18.  (The keyword was previously misspelled as
# `pretrianed`, which the model constructor rejects as an unexpected argument.)
model = torchvision.models.resnet18(pretrained=True)

# Freeze all the params in the network.
for param in model.parameters():
    param.requires_grad = False

# Replace the classifier (the last linear layer) of the resnet model
# with a new linear layer that acts as the classifier.
model.fc = nn.Linear(512, 10)

# The only params that compute gradients are the weights and bias of model.fc,
# so only the classifier is handed to the optimizer.
optimizer = optim.SGD(model.fc.parameters(), lr=1e-2, momentum=0.9)
5 Finetuning torchvision model
- In this document we will perform two types of transfer learning: finetuning and feature extraction. In finetuning, we start with a pretrained model and update all of the model’s parameters for our new task, in essence retraining the whole model. In feature extraction, we start with a pretrained model and only update the final layer weights from which we derive predictions. It is called feature extraction because we use the pretrained CNN as a fixed feature-extractor, and only change the output layer.
- If feature_extract = False, the model is finetuned and all model parameters are updated. If feature_extract = True, only the last layer parameters are updated, the others remain fixed.
6 What is state_dict?
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
# Let’s take a look at the state_dict from the simple model used in the Training a classifier tutorial.
class TheModelClass(nn.Module):
    """Small CNN: two conv + max-pool stages followed by three linear layers.

    The first linear layer expects 16 * 5 * 5 features per sample, which is
    what the two conv/pool stages produce for e.g. 3x32x32 inputs.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        # nn.Linear(in_features, out_features, bias=True)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Return the 10 raw class scores for each sample in ``x``."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # Collapse channel and spatial dims into one feature vector per sample.
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)
# Instantiate the model and an SGD optimizer over its parameters.
model = TheModelClass()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# Each model state_dict entry maps a name to a tensor; print name and size.
print("Model's state_dict:")
for param_tensor, tensor in model.state_dict().items():
    print(param_tensor, "\t", tensor.size())

# The optimizer's state_dict is printed entry by entry as well.
print("Optimizer's state_dict:")
for var_name, value in optimizer.state_dict().items():
    print(var_name, "\t", value)
Model’s state_dict:
conv1.weight torch.Size([6, 3, 5, 5])
conv1.bias torch.Size([6])
conv2.weight torch.Size([16, 6, 5, 5])
conv2.bias torch.Size([16])
fc1.weight torch.Size([120, 400])
fc1.bias torch.Size([120])
fc2.weight torch.Size([84, 120])
fc2.bias torch.Size([84])
fc3.weight torch.Size([10, 84])
fc3.bias torch.Size([10])
Optimizer’s state_dict:
state {}
param_groups [{‘lr’: 0.001, ‘momentum’: 0.9, ‘dampening’: 0, ‘weight_decay’: 0, ‘nesterov’: False, ‘params’: [140257928747520, 140257928747600, 140257928747680, 140257928747760, 140257928747840, 140257928747920, 140257928748000, 140257928748080, 140257928748160, 140257928748240]}]
7 What object does datasets.ImageFolder() return ?
# One ImageFolder per split, rooted at <data_dir>/<split> and using that
# split's transform pipeline.
image_datasets = {
    split: datasets.ImageFolder(
        root=os.path.join(data_dir, split),
        transform=data_transforms[split],
    )
    for split in ("train", "val")
}
8 What object does torch.utils.data.DataLoader() return ?
It represents a Python iterable over a dataset, with support for
- map-style and iterable-style datasets,
- customizing data loading order,
- automatic batching,
- single- and multi-process data loading,
- automatic memory pinning.
- num_workers: To avoid blocking computation code with data loading, PyTorch provides an easy switch to perform multi-process data loading by simply setting the argument num_workers to a positive integer.
# One shuffled DataLoader per split, each using 4 worker processes.
dataloaders_dict = {
    split: torch.utils.data.DataLoader(
        dataset=image_datasets[split],
        batch_size=batch_size,
        shuffle=True,
        num_workers=4,
    )
    for split in ("train", "val")
}
{‘train’: <torch.utils.data.dataloader.DataLoader object at 0x7fc5d3aa5350>, ‘val’: <torch.utils.data.dataloader.DataLoader object at 0x7fc5d3aa54d0>}
9 What is the difference between model.named_parameters() , model.parameters() and model.state_dict()?
- model.parameters() : weights matrix , bias vector , and requires_grad
- model.named_parameters() : name ,weights matrix , bias vector , and requires_grad
- model.state_dict() : name ,weights matrix , bias vector , and buffers (batchnorm’s running_mean)
10 The Five key steps during training
- ①求输出output:
output = model(images)
- ②求损失函数
loss = criterion(output, labels)
- ③清空上一步累积的梯度 (reset the gradients accumulated from the previous step before back-propagation)
optimizer.zero_grad()
- ④损失函数向后梯度传播backward,获得梯度
loss.backward()
- ⑤向更优解走一步
optimizer.step()
11 FINETUNING TORCHVISION MODELS
from __future__ import print_function
from __future__ import division
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy
import sys, os
# Dataset location: <directory of this file>/data/hymenoptera_data
base_dir = os.path.dirname(os.path.realpath(__file__))
data_dir = f"{base_dir}/data/hymenoptera_data"

# Run configuration.
model_name = "squeezenet"   # architecture key passed to initialize_model
num_classes = 2             # number of output classes
batch_size = 8              # samples per batch
num_epochs = 15             # training epochs
# True: update only the reshaped final layer; False: finetune the whole model.
feature_extract = True
def train_model(model, dataloaders, criterion, optimizer, num_epochs=25,
                is_inception=False):
    """Train ``model`` and restore the weights of its best validation epoch.

    Every epoch runs a "train" phase followed by a "val" phase.  Gradients are
    only tracked (and the optimizer only stepped) during the "train" phase.

    Args:
        model: network to train; it is modified in place.  Batches are moved
            to the device of the model's first parameter (CPU if it has none).
        dataloaders: mapping with "train" and "val" keys.  Each value must be
            iterable, yielding ``(inputs, labels)`` batches, and expose a
            ``dataset`` attribute that supports ``len()``.
        criterion: callable ``criterion(outputs, labels)`` returning the loss.
        optimizer: optimizer driving the parameter updates.
        num_epochs: number of epochs to run.
        is_inception: if true, the training-mode forward pass is expected to
            return ``(outputs, aux_outputs)`` and the loss becomes
            ``loss(outputs) + 0.4 * loss(aux_outputs)``.

    Returns:
        ``(model, val_acc_history)``: the model with the best validation
        weights loaded, and the per-epoch validation accuracies.
    """
    since = time.time()

    # Use the model's own device instead of relying on a module-level global.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    val_acc_history = []

    # Deep copy so later training steps cannot mutate the saved snapshot.
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and a validation phase.
        for phase in ['train', 'val']:
            is_train = phase == "train"
            if is_train:
                model.train()
            else:
                model.eval()

            running_loss = 0.0
            running_corrects = 0

            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # Clear gradients left over from the previous batch.
                optimizer.zero_grad()

                # Forward pass; history is tracked only while training.
                with torch.set_grad_enabled(is_train):
                    if is_inception and is_train:
                        # In train mode inception also returns an auxiliary
                        # output; its loss is added with weight 0.4.
                        outputs, aux_outputs = model(inputs)
                        loss1 = criterion(outputs, labels)
                        loss2 = criterion(aux_outputs, labels)
                        loss = loss1 + 0.4 * loss2
                    else:
                        outputs = model(inputs)
                        loss = criterion(outputs, labels)

                    # Index of the largest score along dim 1 is the prediction.
                    _, preds = torch.max(outputs, 1)

                    # Backward + optimize only in the training phase.
                    if is_train:
                        loss.backward()
                        optimizer.step()

                # Statistics: weight the batch loss by the batch size.
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels)

            dataset_size = len(dataloaders[phase].dataset)
            epoch_loss = running_loss / dataset_size
            epoch_acc = running_corrects.double() / dataset_size

            print("{} loss : {:.4f} Acc : {:.4f}".format(phase, epoch_loss, epoch_acc))

            if phase == "val":
                # Snapshot the weights whenever validation accuracy improves.
                if epoch_acc > best_acc:
                    best_acc = epoch_acc
                    best_model_wts = copy.deepcopy(model.state_dict())
                val_acc_history.append(epoch_acc)

        print()

    time_elapsed = time.time() - since
    print("Training complete in {:.0f}m {:.0f}s".format(time_elapsed // 60, time_elapsed % 60))
    print("Best val Acc: {:.4f}".format(best_acc))

    # Load the best weights seen during validation back into the model.
    model.load_state_dict(best_model_wts)
    return model, val_acc_history
def set_parameter_requires_grad(model, feature_extracting):
    """Freeze all parameters of ``model`` when feature extracting.

    By default, when we load a pretrained model all of the parameters have
    ``requires_grad=True``.  When feature extracting we only want gradients
    for the newly initialized layer, so every existing parameter is switched
    to ``requires_grad=False``.  If ``feature_extracting`` is falsy the model
    is left untouched.
    """
    if not feature_extracting:
        return
    for param in model.parameters():
        param.requires_grad = False
def initialize_model(model_name, num_classes, feature_extract, use_pretrained=True):
    """Build a torchvision model whose output layer has ``num_classes`` outputs.

    The last layer is replaced so that it keeps the same number of inputs as
    before but produces ``num_classes`` outputs.  Existing parameters are
    frozen first when ``feature_extract`` is true; the replacement layer is
    created afterwards and is therefore trainable.

    Args:
        model_name: one of "resnet", "alexnet", "vgg", "squeezenet",
            "densenet" or "inception".
        num_classes: number of outputs of the replaced final layer.
        feature_extract: passed to ``set_parameter_requires_grad``.
        use_pretrained: forwarded as ``pretrained=`` to the torchvision
            constructor.

    Returns:
        ``(model_ft, input_size)``: the model and the input size recorded for
        it (299 for inception, 224 otherwise).

    Raises:
        ValueError: if ``model_name`` is not one of the supported names.
    """
    model_ft = None
    input_size = 0

    if model_name == "resnet":
        model_ft = models.resnet18(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        num_ftrs = model_ft.fc.in_features
        model_ft.fc = nn.Linear(num_ftrs, num_classes)
        input_size = 224

    elif model_name == "alexnet":
        model_ft = models.alexnet(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        num_ftrs = model_ft.classifier[6].in_features
        model_ft.classifier[6] = nn.Linear(num_ftrs, num_classes)
        input_size = 224

    elif model_name == "vgg":
        model_ft = models.vgg11(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        num_ftrs = model_ft.classifier[6].in_features
        model_ft.classifier[6] = nn.Linear(num_ftrs, num_classes)
        input_size = 224

    elif model_name == "squeezenet":
        model_ft = models.squeezenet1_0(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        # The classifier's second module is replaced by a 1x1 convolution
        # with num_classes output channels.
        model_ft.classifier[1] = nn.Conv2d(512, num_classes, kernel_size=(1, 1), stride=(1, 1))
        model_ft.num_classes = num_classes
        input_size = 224

    elif model_name == "densenet":
        model_ft = models.densenet121(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        num_ftrs = model_ft.classifier.in_features
        model_ft.classifier = nn.Linear(num_ftrs, num_classes)
        input_size = 224

    elif model_name == "inception":
        model_ft = models.inception_v3(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        # Handle the auxiliary net.
        aux_ftrs = model_ft.AuxLogits.fc.in_features
        model_ft.AuxLogits.fc = nn.Linear(aux_ftrs, num_classes)
        # Handle the primary net.
        num_ftrs = model_ft.fc.in_features
        model_ft.fc = nn.Linear(num_ftrs, num_classes)
        input_size = 299

    else:
        # Raise instead of calling exit(): a helper should not terminate the
        # interpreter, and exit() reported success (status 0) on this error.
        raise ValueError("Invalid model name: {!r}".format(model_name))

    return model_ft, input_size
# Build the model for this run together with the input size recorded for it.
model_ft, input_size = initialize_model(model_name, num_classes, feature_extract, use_pretrained=True)

# Data augmentation and normalization for training; just normalization for
# validation.  The transform pipelines are kept in a dict keyed by phase.
data_transforms = {
    "train": transforms.Compose([
        transforms.RandomResizedCrop(input_size),
        transforms.RandomHorizontalFlip(),
        # Convert a PIL Image or numpy.ndarray to tensor.
        transforms.ToTensor(),
        # transforms.Normalize(mean, std, inplace=False)
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]),
    "val": transforms.Compose([
        transforms.Resize(input_size),
        transforms.CenterCrop(input_size),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]),
}

print("Initializing Datasets and Dataloaders...")

# Create training and validation datasets.
# datasets.ImageFolder(root, transform=None, ...): takes the dataset root
# directory and the transform applied to the images it serves.
image_datasets = {
    x: datasets.ImageFolder(os.path.join(data_dir, x), data_transforms[x])
    for x in ["train", "val"]
}

# Create training and validation dataloaders.
# To avoid blocking computation code with data loading, PyTorch provides an
# easy switch to perform multi-process data loading by simply setting the
# argument num_workers to a positive integer.
dataloaders_dict = {
    x: torch.utils.data.DataLoader(
        image_datasets[x], batch_size=batch_size, shuffle=True, num_workers=4)
    for x in ["train", "val"]
}

# Train on the first GPU when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_ft = model_ft.to(device)

# Create the optimizer.  The names of all parameters that still require
# gradients are printed.  When feature extracting only those parameters are
# optimized; when finetuning every parameter is handed to the optimizer.
print("Params to learn:")
trainable = [(name, param) for name, param in model_ft.named_parameters()
             if param.requires_grad]
for name, _ in trainable:
    print("\t", name)
if feature_extract:
    params_to_update = [param for _, param in trainable]
else:
    params_to_update = model_ft.parameters()

optimizer_ft = optim.SGD(params_to_update, lr=0.001, momentum=0.9)

# Set up the loss function.
criterion = nn.CrossEntropyLoss()

# Train and evaluate.
model_ft, hist = train_model(model_ft, dataloaders_dict, criterion, optimizer_ft,
                             num_epochs=num_epochs, is_inception=(model_name == "inception"))

# Comparison with a model trained from scratch: the same architecture is
# initialized without pretrained weights and with every parameter trainable.
scratch_model, _ = initialize_model(model_name, num_classes, feature_extract=False, use_pretrained=False)
scratch_model = scratch_model.to(device)
scratch_optimizer = optim.SGD(scratch_model.parameters(), lr=0.001, momentum=0.9)
scratch_criterion = nn.CrossEntropyLoss()
_, scratch_hist = train_model(scratch_model, dataloaders_dict, scratch_criterion,
                              scratch_optimizer, num_epochs=num_epochs,
                              is_inception=(model_name == "inception"))

# Plot validation accuracy vs. number of training epochs for the transfer
# learning run and for the model trained from scratch.
ohist = [h.cpu().numpy() for h in hist]
shist = [h.cpu().numpy() for h in scratch_hist]

plt.title("Validation Accuracy vs. Number of Training Epochs")
plt.xlabel("Training Epochs")
plt.ylabel("Validation Accuracy")
plt.plot(range(1, num_epochs + 1), ohist, label="Pretrained")
plt.plot(range(1, num_epochs + 1), shist, label="Scratch")
# Set the y-limits of the current axes.
plt.ylim((0, 1.))
plt.legend()
plt.show()
12 Numpy warm-up
Fit the sine function with a third-order polynomial (Taylor-expansion style) using plain NumPy
import numpy as np
import math
# Sample sin(x) at 2000 evenly spaced points on [-pi, pi].
x = np.linspace(-math.pi, math.pi, 2000)
y = np.sin(x)

# Random initial coefficients of y = a + b*x + c*x^2 + d*x^3.
a = np.random.randn()
b = np.random.randn()
c = np.random.randn()
d = np.random.randn()

learning_rate = 1e-6
for t in range(2000):
    # Forward pass: evaluate the polynomial and the summed squared error.
    y_pred = a + b * x + c * x ** 2 + d * x ** 3
    loss = np.square(y_pred - y).sum()
    if t % 100 == 99:
        # t + 1 iterations have run so far.
        print("Loss:(1-{}):{}".format(t + 1, loss))

    # Backward pass: gradients of the loss w.r.t. each coefficient.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_a = grad_y_pred.sum()
    grad_b = (grad_y_pred * x).sum()
    grad_c = (grad_y_pred * x ** 2).sum()
    grad_d = (grad_y_pred * x ** 3).sum()

    # Gradient-descent update.
    a -= learning_rate * grad_a
    b -= learning_rate * grad_b
    c -= learning_rate * grad_c
    d -= learning_rate * grad_d

print("Result: y = {} + {}x + {}x^2 +{}x^3".format(a, b, c, d))
Use Tensor Autograd
import numpy as np
import math
import torch
dtype = torch.float
device = torch.device("cpu")

# device (torch.device, optional) - the desired device of the returned tensor.
x = torch.linspace(-math.pi, math.pi, 2000, dtype=dtype, device=device)
y = torch.sin(x)

# Coefficients of y = a + b*x + c*x^2 + d*x^3.  requires_grad=True makes
# autograd track them; without it a.grad etc. would stay None.
a = torch.randn((), device=device, dtype=dtype, requires_grad=True)
b = torch.randn((), device=device, dtype=dtype, requires_grad=True)
c = torch.randn((), device=device, dtype=dtype, requires_grad=True)
d = torch.randn((), device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
loop = 1
for t in range(loop):
    y_pred = a + b * x + c * x ** 2 + d * x ** 3

    # Keep the loss as a tensor so backward() can be called on it;
    # .item() (tensor(217819.4688).item() = 217819.4688) is only for printing.
    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 99:
        print("Loss:(1-{}):{}".format(t + 1, loss.item()))

    # Use autograd to compute the backward pass.  After this call a.grad,
    # b.grad, c.grad and d.grad hold the gradient of the loss with respect
    # to a, b, c and d, replacing the hand-written gradient formulas.
    loss.backward()

    # The update itself must not be tracked by autograd.
    with torch.no_grad():
        a -= learning_rate * a.grad
        b -= learning_rate * b.grad
        c -= learning_rate * c.grad
        d -= learning_rate * d.grad

        # Reset the gradients; backward() would otherwise accumulate into
        # them on the next iteration.
        a.grad = None
        b.grad = None
        c.grad = None
        d.grad = None
torch.randn() nn.Conv2d() and torch.flatten()
- torch.randn(4, 1, 5, 5): input.size() = 4 x 1 x 5 x 5
- nn.Conv2d(1, 32, 5, 1, 1): nn.Conv2d(in_channels=1, out_channels=32, kernel_size=(5,5), stride=(1,1), padding=(1,1))
# input.size() = 32*1*5*5
input = torch.randn(32, 1, 5, 5)

# A Sequential holding a single convolution; appending nn.Flatten() after it
# would flatten each sample's output.
m = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=32, kernel_size=5, stride=1, padding=1),
)

output = m(input)
print(output)
print(output.size())
Flatten():Flattens a contiguous range of dims into a tensor.
start_dim – first dim to flatten (default = 1).
end_dim – last dim to flatten (default = -1).
- 无flatten():torch.Size([32, 32, 3, 3])
- 有flatten():torch.Size([32, 288])
nn.Linear():
# nn.Linear(in_features = 3, out_features = 1)
# Here xx.size = (2000, 3); after the linear map x W^T + b, output.size = (2000, 1)
nn.Linear(3, 1)
14 linear_layer.weight, linear_layer.weight[0], linear_layer.weight[0][0]
- linear_layer.weight:
print("linear_layer.weight: {}".format(linear_layer.weight))
linear_layer.weight: Parameter containing:
tensor([[ 0.3482, -0.0626, -0.0416]], requires_grad=True)
- linear_layer.weight[0]:
print("linear_layer.weight[0]: {}".format(linear_layer.weight[0]))
linear_layer.weight[0]: tensor([ 0.3482, -0.0626, -0.0416], grad_fn=<SelectBackward>)
- linear_layer.weight[0][0]:
print("linear_layer.weight[0][0]: {}".format(linear_layer.weight[0][0]))
linear_layer.weight[0][0]: 0.34817636013031006
15 Learning Pytorch with Example
import torch
import torch.optim as optim
import math
import torch.nn as nn
num = 10
x = torch.linspace(-math.pi, math.pi, num)
y = torch.sin(x)

# Exponents of the polynomial features x, x^2, x^3.
p = torch.tensor([1, 2, 3])

# x.unsqueeze(-1) has shape (num, 1); raising it to p (shape (3,))
# broadcasts to a (num, 3) feature matrix.
xx = x.unsqueeze(-1).pow(p)

model = torch.nn.Sequential(
    # nn.Linear(in_features=3, out_features=1): maps (num, 3) to (num, 1).
    nn.Linear(3, 1),
    # Flatten dims 0..1 so the output becomes a 1D tensor of shape (num,),
    # matching `y`.  The default nn.Flatten() starts at dim 1 and would leave
    # (num, 1), which MSELoss would broadcast against `y`.
    nn.Flatten(0, 1),
)

loss_fn = nn.MSELoss(reduction="sum")
learning_rate = 1e-6
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

for t in range(2000):
    y_pred = model(xx)
    loss = loss_fn(y_pred, y)
    if t % 100 == 99:
        print(t, loss.item())

    # Clear old gradients, compute new ones, then let the optimizer apply
    # the update (equivalent to param -= learning_rate * param.grad).
    model.zero_grad()
    loss.backward()
    optimizer.step()

# You can access the first layer of `model` like accessing
# the first item of a list.
linear_layer = model[0]
print("Result: y = {} + {}x + {}x^2 + {}x^3".format(
    linear_layer.bias.item(),
    linear_layer.weight[0, 0].item(),
    linear_layer.weight[0, 1].item(),
    linear_layer.weight[0, 2].item()))
16 Define a class
- Learn the mechanism of inlined function
import os
import numpy as np
import torch
from PIL import Image
class PennFudanDataset(object):
    """Dataset of images with per-instance segmentation masks.

    Images are read from ``<root>/PNGImages`` and masks from
    ``<root>/PedMasks``; both listings are sorted and paired by position.
    """

    def __init__(self, root, transforms):
        """Record ``root`` and ``transforms`` and list the image/mask files.

        ``transforms`` may be ``None``; otherwise it is called as
        ``transforms(img, target)`` and must return the transformed pair.
        """
        self.root = root
        self.transforms = transforms
        # os.listdir() returns the names of the entries in the directory;
        # sorting both listings keeps images and masks aligned by index.
        self.imgs = sorted(os.listdir(os.path.join(root, "PNGImages")))
        self.masks = sorted(os.listdir(os.path.join(root, "PedMasks")))

    def __getitem__(self, idx):
        """Return ``(img, target)`` for the sample at position ``idx``.

        ``target`` is a dict with the keys "boxes", "labels", "masks",
        "image_id", "area" and "iscrowd".
        """
        img_path = os.path.join(self.root, "PNGImages", self.imgs[idx])
        mask_path = os.path.join(self.root, "PedMasks", self.masks[idx])
        img = Image.open(img_path).convert("RGB")
        # The mask is not converted to RGB because each color corresponds
        # to a different instance, with 0 being background.
        mask = np.array(Image.open(mask_path))

        # Instances are encoded as different colors; the first id is the
        # background, so remove it.
        obj_ids = np.unique(mask)[1:]

        # Split the color-encoded mask into a set of binary masks.
        masks = mask == obj_ids[:, None, None]

        # Bounding box [xmin, ymin, xmax, ymax] of each binary mask.
        num_objs = len(obj_ids)
        boxes = []
        for i in range(num_objs):
            pos = np.where(masks[i])
            xmin = np.min(pos[1])
            xmax = np.max(pos[1])
            ymin = np.min(pos[0])
            ymax = np.max(pos[0])
            boxes.append([xmin, ymin, xmax, ymax])

        # Convert everything into torch tensors.  reshape(-1, 4) keeps the
        # boxes two-dimensional even when the mask contains no instances.
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        # There is only one class.
        labels = torch.ones((num_objs,), dtype=torch.int64)
        masks = torch.as_tensor(masks, dtype=torch.uint8)

        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        # Suppose all instances are not crowd.
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)

        target = {
            "boxes": boxes,
            "labels": labels,
            "masks": masks,
            "image_id": image_id,
            "area": area,
            "iscrowd": iscrowd,
        }

        # The attribute set in __init__ is `transforms`; reading
        # `self.transform` raised AttributeError.
        if self.transforms is not None:
            img, target = self.transforms(img, target)

        return img, target

    def __len__(self):
        """Return the number of images."""
        return len(self.imgs)