# pytorch学习笔记（四）

## 2.正文

### 2.1 Tensor

#### 2.1.1 热身：numpy

import numpy as np

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialize weights.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y.
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)

    # Compute and print loss (sum of squared errors).
    loss = np.square(y_pred - y).sum()
    if t % 100 == 99:
        print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0  # ReLU gradient: zero where the pre-activation was negative
    grad_w1 = x.T.dot(grad_h)

    # Update weights with plain gradient descent.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

##### 隐藏层

99 611.8403334325828
199 5.780260334791743
299 0.09678974435224459
399 0.0019321130866979581
499 4.126089452091746e-05

#### 2.1.2 PyTorch：Tensor

import torch

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0")  # Uncomment this to run on GPU.

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialize weights.
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y.
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)

    # Compute and print loss (sum of squared errors).
    loss = (y_pred - y).pow(2).sum().item()
    if t % 100 == 99:
        print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0  # ReLU gradient: zero where the pre-activation was negative
    grad_w1 = x.t().mm(grad_h)

    # Update weights with plain gradient descent.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

99 688.8875122070312
199 4.103602886199951
299 0.04172804579138756
399 0.0007906379760242999
499 8.704190258868039e-05

import torch

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0")  # Uncomment this to run on GPU.

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialize weights. requires_grad=True tells autograd to
# record operations on these tensors so gradients can be computed.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute and print loss using operations on Tensors.
    # loss is a scalar Tensor; loss.item() gets the Python number held in it.
    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 99:
        print(t, loss.item())

    # Use autograd to compute the backward pass. This computes the gradient
    # of loss with respect to all Tensors with requires_grad=True,
    # i.e. it fills in w1.grad and w2.grad.
    loss.backward()

    # Update the weights inside torch.no_grad(): the weights have
    # requires_grad=True but we don't want autograd to track the update.
    # (An alternative is to operate on weight.data and weight.grad.data —
    # tensor.data shares storage but doesn't track history —
    # or simply use torch.optim.SGD.)
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating the weights,
        # because .backward() accumulates into .grad.
        w1.grad.zero_()
        w2.grad.zero_()

99 468.9629821777344
199 2.9594504833221436
299 0.023482277989387512
399 0.0004086267144884914
499 5.1561615691753104e-05

import torch

class MyReLU(torch.autograd.Function):
    """Custom autograd Function implementing ReLU.

    Subclassing torch.autograd.Function and defining the static
    forward and backward methods lets a hand-written gradient plug
    into the autograd engine.
    """

    @staticmethod
    def forward(ctx, input):
        # Save the input so backward() can mask the gradient.
        ctx.save_for_backward(input)
        return input.clamp(min=0)

    @staticmethod
    def backward(ctx, grad_output):
        # ReLU gradient: pass grad_output through where input >= 0,
        # zero it elsewhere.
        input, = ctx.saved_tensors
        grad_input = grad_output.clone()
        grad_input[input < 0] = 0
        return grad_input

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0")  # Uncomment this to run on GPU.

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialize weights; autograd tracks them via requires_grad=True.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Apply our custom Function via the .apply method.
    relu = MyReLU.apply

    # Forward pass: compute predicted y using the custom ReLU.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute and print loss; loss.item() gets the scalar value.
    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 99:
        print(t, loss.item())

    # Use autograd to compute gradients of loss w.r.t. w1 and w2;
    # this calls MyReLU.backward for the custom op.
    loss.backward()

    # Update weights inside no_grad() so the update is not tracked,
    # then zero the gradients because .backward() accumulates.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        w1.grad.zero_()
        w2.grad.zero_()

99 664.2792358398438
199 3.2187328338623047
299 0.023685619235038757
399 0.00038831226993352175
499 4.969811925548129e-05

### 2.3 nn.Module

#### 2.3.1 nn

nn中定义了一系列可以近似等同于神经网络层的modules，我们来看看用nn来完成two-layer network：

import torch

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Use the nn package to define our model as a sequence of layers.
# nn.Sequential is a Module which contains other Modules and applies
# them in sequence to produce its output. Each Linear Module computes
# output from input using a linear function and holds internal Tensors
# for its weight and bias.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

# The nn package also contains definitions of popular loss functions;
# here we use Mean Squared Error (summed) as our loss function.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
for t in range(500):
    # Forward pass: compute predicted y.
    y_pred = model(x)

    # Compute and print loss.
    loss = loss_fn(y_pred, y)
    if t % 100 == 99:
        print(t, loss.item())

    # Zero the gradients before running the backward pass,
    # because .backward() accumulates into .grad.
    model.zero_grad()

    # This call computes gradients for all learnable parameters in the model.
    loss.backward()

    # Update each parameter with gradient descent; no_grad() keeps the
    # in-place update out of the autograd graph.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad

99 2.496163845062256
199 0.06094813346862793
299 0.003522129962220788
399 0.0002878477971535176
499 2.720016345847398e-05

#### 2.3.2 optim

import torch

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Use the nn package to define our model as a sequence of layers.
# nn.Sequential is a Module which contains other Modules and applies
# them in sequence to produce its output. Each Linear Module computes
# output from input using a linear function and holds internal Tensors
# for its weight and bias.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

# The nn package also contains definitions of popular loss functions;
# here we use Mean Squared Error (summed) as our loss function.
loss_fn = torch.nn.MSELoss(reduction='sum')

# Use the optim package to define an Optimizer that will update the
# model's weights for us. Here we use Adam; optim contains many other
# optimization algorithms.
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for t in range(500):
    # Forward pass: compute predicted y.
    y_pred = model(x)

    # Compute and print loss.
    loss = loss_fn(y_pred, y)
    if t % 100 == 99:
        print(t, loss.item())

    # Before the backward pass, use the optimizer object to zero all of
    # the gradients for the variables it will update. This is because by
    # default, gradients are accumulated in buffers whenever .backward()
    # is called.
    optimizer.zero_grad()

    # This call computes gradients for all learnable parameters in the model.
    loss.backward()

    # Calling the step function on an Optimizer makes an update to its
    # parameters.
    optimizer.step()

99 51.58766174316406
199 0.7978752851486206
299 0.0029272770043462515
399 9.20035017770715e-06
499 1.124239989991338e-08


#### 2.3.3 Custom nn Modules

import torch

class TwoLayerNet(torch.nn.Module):
    """A two-layer fully-connected network with a ReLU nonlinearity.

    Custom Modules subclass nn.Module, create their sub-Modules in
    __init__, and define the computation in forward().
    """

    def __init__(self, D_in, H, D_out):
        super(TwoLayerNet, self).__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        # clamp(min=0) is ReLU applied to the hidden pre-activation.
        h_relu = self.linear1(x).clamp(min=0)
        y_pred = self.linear2(h_relu)
        return y_pred

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = TwoLayerNet(D_in, H, D_out)

# Mean Squared Error (summed) as the loss function.
criterion = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass: compute predicted y.
    y_pred = model(x)

    # Compute and print loss.
    loss = criterion(y_pred, y)
    if t % 100 == 99:
        print(t, loss.item())

    # Zero all gradients before the backward pass: gradients are
    # accumulated in buffers whenever .backward() is called.
    optimizer.zero_grad()

    # Compute gradients for all learnable parameters in the model.
    loss.backward()

    # Calling step() on the Optimizer updates the parameters.
    optimizer.step()

#### 2.3.4 Control Flow + Weight Sharing

import torch
import random

class TwoLayerNet(torch.nn.Module):
    """A network with dynamic control flow and weight sharing.

    Each forward pass reuses the same middle layer a random number of
    times (0-3); because the graph is rebuilt on every pass, ordinary
    Python control flow works and the shared weights get gradients from
    every reuse.
    """

    def __init__(self, D_in, H, D_out):
        super(TwoLayerNet, self).__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x):
        h_relu = self.input_linear(x).clamp(min=0)
        # Reuse the SAME middle_linear Module 0-3 times: weight sharing.
        for _ in range(random.randint(0, 3)):
            h_relu = self.middle_linear(h_relu).clamp(min=0)
        y_pred = self.output_linear(h_relu)
        return y_pred

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = TwoLayerNet(D_in, H, D_out)

# Mean Squared Error (summed) as the loss function.
criterion = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass: compute predicted y (random depth each iteration).
    y_pred = model(x)

    # Compute and print loss.
    loss = criterion(y_pred, y)
    if t % 100 == 99:
        print(t, loss.item())

    # Zero all gradients before the backward pass: gradients are
    # accumulated in buffers whenever .backward() is called.
    optimizer.zero_grad()

    # Compute gradients for all learnable parameters in the model.
    loss.backward()

    # Calling step() on the Optimizer updates the parameters.
    optimizer.step()

## 3.小结

©️2019 CSDN 皮肤主题: 大白 设计师: CSDN官方博客