Karpathy makemore -- 4

1 Introduction

This part modularizes the network built in the previous parts and then implements a new, WaveNet-style network architecture.

2 Using torch-style modules

2.1 Modularization

Increase the input context length to 8 characters and refactor the earlier code into modules.

import torch

# Near copy-paste of the layers we have developed in Part 3

# -----------------------------------------------------------------------------------------------

class Linear:
  
  def __init__(self, fan_in, fan_out, bias=True):
    self.weight = torch.randn((fan_in, fan_out)) / fan_in**0.5 # note: kaiming init
    self.bias = torch.zeros(fan_out) if bias else None
  
  def __call__(self, x):
    self.out = x @ self.weight
    if self.bias is not None:
      self.out += self.bias
    return self.out
  
  def parameters(self):
    return [self.weight] + ([] if self.bias is None else [self.bias])

# -----------------------------------------------------------------------------------------------
class BatchNorm1d:
  
  def __init__(self, dim, eps=1e-5, momentum=0.1):
    self.eps = eps
    self.momentum = momentum
    self.training = True
    # parameters (trained with backprop)
    self.gamma = torch.ones(dim)
    self.beta = torch.zeros(dim)
    # buffers (trained with a running 'momentum update')
    self.running_mean = torch.zeros(dim)
    self.running_var = torch.ones(dim)
  
  def __call__(self, x):
    # calculate the forward pass
    if self.training:
      if x.ndim == 2:
        dim = 0
      elif x.ndim == 3:
        dim = (0,1)
      xmean = x.mean(dim, keepdim=True) # batch mean
      xvar = x.var(dim, keepdim=True) # batch variance
    else:
      xmean = self.running_mean
      xvar = self.running_var
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
    self.out = self.gamma * xhat + self.beta
    # update the buffers
    if self.training:
      with torch.no_grad():
        self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean
        self.running_var = (1 - self.momentum) * self.running_var + self.momentum * xvar
    return self.out
  
  def parameters(self):
    return [self.gamma, self.beta]

# -----------------------------------------------------------------------------------------------
class Tanh:
  def __call__(self, x):
    self.out = torch.tanh(x)
    return self.out
  def parameters(self):
    return []

# -----------------------------------------------------------------------------------------------
class Embedding:
  
  def __init__(self, num_embeddings, embedding_dim):
    self.weight = torch.randn((num_embeddings, embedding_dim))
    
  def __call__(self, IX):
    self.out = self.weight[IX]
    return self.out
  
  def parameters(self):
    return [self.weight]

# -----------------------------------------------------------------------------------------------
class Flatten:
  def __call__(self, x):
    self.out = x.view(x.shape[0], -1)
    return self.out
  def parameters(self):
    return []

# -----------------------------------------------------------------------------------------------
class FlattenConsecutive:
  
  def __init__(self, n):
    self.n = n
    
  def __call__(self, x):
    B, T, C = x.shape
    x = x.view(B, T//self.n, C*self.n)
    if x.shape[1] == 1:
      x = x.squeeze(1)
    self.out = x
    return self.out
  
  def parameters(self):
    return []

# -----------------------------------------------------------------------------------------------
class Sequential:
  
  def __init__(self, layers):
    self.layers = layers
  
  def __call__(self, x):
    for layer in self.layers:
      x = layer(x)
    self.out = x
    return self.out
  
  def parameters(self):
    # get parameters of all layers and stretch them out into one list
    return [p for layer in self.layers for p in layer.parameters()]
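
These classes deliberately mirror the torch.nn API: every module is callable and exposes a parameters() list, which is what lets Sequential compose them. A quick sanity check of the API (illustrative snippet, not part of the original listing):

lin = Linear(4, 3)
act = Tanh()
out = act(lin(torch.randn(5, 4)))
print(out.shape) # torch.Size([5, 3])
print(sum(p.nelement() for p in lin.parameters())) # 4*3 weights + 3 biases = 15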

Define the network architecture:

block_size = 8
n_emb = 10
n_batch = 32
n_hidden = 200
g = torch.Generator().manual_seed(2147483647)
model = Sequential([
    Embedding(vocab_size, n_emb),
    Flatten(),
    Linear(n_emb * block_size, n_hidden, bias=False),
    BatchNorm1d(n_hidden),
    Tanh(),
    Linear(n_hidden, vocab_size),
])
with torch.no_grad():
    model.layers[-1].weight *= 0.1 # make the last layer less confident at init

print(sum(p.nelement() for p in model.parameters())) # total number of parameters
for p in model.parameters():
    p.requires_grad = True

for layer in model.layers:
    if isinstance(layer, BatchNorm1d):
        layer.training = True
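
For reference, assuming vocab_size = 27 as in the makemore names dataset (26 letters plus the '.' token), the parameter count printed above works out to 27*10 + 80*200 + 2*200 + 200*27 + 27 = 22097.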

Train the model:

import torch.nn.functional as F
max_iter = 200000
lossi = []
ud = []
for i in range(max_iter):
    # minibatch construct
    ix = torch.randint(0, Xtr.shape[0], (n_batch,), generator=g)
    Xb, Yb = Xtr[ix], Ytr[ix]
    # forward pass
    logits = model(Xb)
    loss = F.cross_entropy(logits, Yb)
    # backward pass
    for p in model.parameters():
        p.grad = None
    loss.backward()
    # update: plain SGD with a simple step decay of the learning rate
    lr = 0.1 if i < 100000 else 0.01
    for p in model.parameters():
        p.data -= lr * p.grad.data

    # track stats
    lossi.append(loss.item())
    with torch.no_grad():
        # log10 of the update-to-data ratio for every parameter
        ud.append([((-lr * p.grad).std() / p.data.std()).log10().item() for p in model.parameters()])
    if i % 1000 == 0:
        print(f"Iteration: {i}/{max_iter}, Loss: {loss.item()}")

Plot the loss curve. Averaging the per-step losses in windows of 1000 iterations smooths out the minibatch noise:

import matplotlib.pyplot as plt
plt.plot(torch.tensor(lossi).view(-1, 1000).mean(dim=1, keepdim=False))

[Figure: training loss, averaged over windows of 1000 iterations]
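
The update-to-data ratios collected in ud can be visualized the same way. A minimal sketch (my addition, not part of the original listing); as a rule of thumb, values around -3 on the log10 scale indicate reasonably sized updates:

plt.figure(figsize=(20, 4))
legends = []
for i, p in enumerate(model.parameters()):
    if p.ndim == 2: # only the weight matrices
        plt.plot([ud[j][i] for j in range(len(ud))])
        legends.append('param %d' % i)
plt.plot([0, len(ud)], [-3, -3], 'k') # rough rule-of-thumb target
plt.legend(legends)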
Compare the training and validation loss:

@torch.no_grad()
def batch_infer(split):
    X, Y = {
        'train' : (Xtr, Ytr),
        'val' : (Xdev, Ydev),
        'test' : (Xte, Yte),
    }[split]
    logits = model(X)
    loss = F.cross_entropy(logits, Y)
    print(f'{split}, loss is: {loss}')

# put BatchNorm layers into evaluation mode before inference
for layer in model.layers:
    if isinstance(layer, BatchNorm1d):
        layer.training = False
batch_infer('train')
batch_infer('val')

train, loss is: 1.926148533821106
val, loss is: 2.028862237930298

The network is now overfitting a little.
Let's sample from the model:

for _ in range(20):
    context = [0] * block_size # start with all '.' (token 0) as context
    ch = []
    while True:
        X = torch.tensor([context])
        logits = model(X)
        probs = torch.softmax(logits, dim=-1).squeeze(0)
        ix = torch.multinomial(probs, num_samples=1).item() # sample the next character
        context = context[1:] + [ix] # slide the context window
        ch.append(itos[ix])
        if ix == 0: # token 0 ('.') terminates the word
            break
    print(''.join(ch))

quab.
nomawa.
brenne.
sevanille.
razlyn.
zile.
audaina.
zaralynn.
dawsyn.
wyle.
yalikbi.
zuria.
endrame.
mesty.
nooap.
dangele.
ellania.
bako.
memaisee.
zailan.

2.2 Adding the WaveNet structure

[Figure: WaveNet's tree of dilated causal convolutions [1]]
The figure shows that at every level, two neighboring points are mapped through the same parameter matrix C, so the context is fused progressively in pairs rather than all at once.
First, recall how torch represents this as a matrix multiplication:
[Figure: batched matrix multiplication over the last dimension]
Then apply it to our problem:
[Figure: grouping consecutive pairs of character embeddings]
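
To convince yourself of the batched matrix multiplication (illustrative snippet, not from the original post): torch treats all leading dimensions as batch dimensions and multiplies only over the last one, so a (B, T, C) tensor passes through a Linear layer with only its last dimension changed.

x = torch.randn(4, 5, 80) # (B, T, C)
w = torch.randn(80, 200)
b = torch.randn(200)
print((x @ w + b).shape) # torch.Size([4, 5, 200])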

In code, this pairwise grouping is the FlattenConsecutive layer:

class FlattenConsecutive:
  
  def __init__(self, n):
    self.n = n # number of consecutive time steps to fuse
    
  def __call__(self, x):
    B, T, C = x.shape
    x = x.view(B, T//self.n, C*self.n) # fuse groups of n consecutive embeddings
    if x.shape[1] == 1:
      x = x.squeeze(1) # drop the time dimension once it collapses to 1
    self.out = x
    return self.out
  
  def parameters(self):
    return [] # no trainable parameters
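
A quick shape check (illustrative): with n=2 the layer fuses every two consecutive time steps into one, doubling the channel dimension.

e = torch.randn(4, 8, 10) # (B=4, T=8, C=10), e.g. embeddings of 8 characters
print(FlattenConsecutive(2)(e).shape) # torch.Size([4, 4, 20])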

Define the new full network:

block_size = 8
n_emb = 24
n_batch = 32
n_hidden = 128
g = torch.Generator().manual_seed(2147483647)
model = Sequential([
    Embedding(vocab_size, n_emb),
    FlattenConsecutive(2), Linear(n_emb * 2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    FlattenConsecutive(2), Linear(n_hidden * 2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    FlattenConsecutive(2), Linear(n_hidden * 2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    Linear(n_hidden, vocab_size)
])
with torch.no_grad():
    model.layers[-1].weight *= 0.1 # make the last layer less confident at init

print(sum(p.nelement() for p in model.parameters())) # total number of parameters
for p in model.parameters():
    p.requires_grad = True

for layer in model.layers:
    if isinstance(layer, BatchNorm1d):
        layer.training = True
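
Assuming vocab_size = 27, the parameter count printed above works out to 76579. It is also instructive to trace the shapes flowing through the hierarchy on a small dummy batch (illustrative snippet):

ix = torch.randint(0, Xtr.shape[0], (4,))
logits = model(Xtr[ix])
for layer in model.layers:
    print(layer.__class__.__name__, ':', tuple(layer.out.shape))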

One subtlety to watch here: BatchNorm1d now receives three-dimensional activations of shape (B, T, C), so the statistics must be computed over every dimension except the last, i.e. over both the batch and the time dimension, giving one mean and one variance per channel. This is what the x.ndim == 3 branch handles:

class BatchNorm1d:
  # __init__, the running buffers, and parameters() are unchanged from the
  # listing in section 2.1; the relevant part is the dimension handling in __call__:

  def __call__(self, x):
    # calculate the forward pass
    if self.training:
      if x.ndim == 2:
        dim = 0     # input (B, C): statistics over the batch dimension
      elif x.ndim == 3:
        dim = (0,1) # input (B, T, C): statistics over batch and time
      xmean = x.mean(dim, keepdim=True) # batch mean
      xvar = x.var(dim, keepdim=True) # batch variance
    else:
      xmean = self.running_mean
      xvar = self.running_var
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
    self.out = self.gamma * xhat + self.beta
    # update the running buffers exactly as before
    if self.training:
      with torch.no_grad():
        self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean
        self.running_var = (1 - self.momentum) * self.running_var + self.momentum * xvar
    return self.out
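
A quick check of the 3D behavior (illustrative): each of the C channels is normalized across all B*T values.

bn = BatchNorm1d(68)
x = torch.randn(32, 4, 68)
y = bn(x)
print(y.mean((0, 1)).abs().max()) # ~0: per-channel mean over batch and time
print(y.std((0, 1)).mean()) # ~1: per-channel std over batch and time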

Everything else is identical to the earlier network. The final results: the validation loss improves from 2.029 to 1.987.
[Figure: training loss curve for the hierarchical network]

train, loss is: 1.7904815673828125
val, loss is: 1.9868937730789185

sabris.
lilly.
pryce.
antwling.
lakelyn.
dayre.
theora.
hunna.
michael.
amillia.
zivy.
zuri.
florby.
jairael.
aiyank.
anahit.
madelynn.
briani.
payzleigh.
sola.

2.3 Convolution

So far we have only computed the part corresponding to the black path in the figure: one tree for one output position. Running the full structure, reusing intermediate nodes across all output positions, is exactly a convolutional neural network.
[Figure: the full WaveNet convolution structure, with the black path marking the single tree we computed]
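
For intuition, here is a minimal sketch (my own, not from the original post) of the same tree expressed with dilated 1D convolutions, as in WaveNet [1]: kernel_size=2 fuses neighboring positions, and doubling the dilation at each layer reproduces the FlattenConsecutive(2) hierarchy while reusing intermediate results across all positions of a longer input.

import torch
import torch.nn as nn

conv_stack = nn.Sequential(
    nn.Conv1d(24, 128, kernel_size=2, dilation=1, bias=False),
    nn.BatchNorm1d(128), nn.Tanh(),
    nn.Conv1d(128, 128, kernel_size=2, dilation=2, bias=False),
    nn.BatchNorm1d(128), nn.Tanh(),
    nn.Conv1d(128, 128, kernel_size=2, dilation=4, bias=False),
    nn.BatchNorm1d(128), nn.Tanh(),
)
x = torch.randn(32, 24, 8) # (batch, channels=n_emb, time=block_size)
print(conv_stack(x).shape) # torch.Size([32, 128, 1])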

References

[1] A. van den Oord et al., "WaveNet: A Generative Model for Raw Audio", DeepMind, 2016. https://arxiv.org/abs/1609.03499
