一、文本预处理
step1-读入文本
import re
import collections
# read time_machine
def read_time_machine(path='/home/yuzhu/input/timemachine/timemachine.txt'):
    """Read the Time Machine corpus and normalise it line by line.

    Each line is stripped, lower-cased, and every run of characters other
    than ``a``-``z`` is collapsed into a single space.

    Args:
        path: Location of the text file. Defaults to the path the notebook
            originally hard-coded, so zero-argument calls keep working.

    Returns:
        list[str]: One normalised string per line of the file; blank lines
        become empty strings.
    """
    non_letters = re.compile(r'[^a-z]+')
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(path, 'r', encoding='utf-8') as f:
        return [non_letters.sub(' ', line.strip().lower()) for line in f]
# Load the normalised corpus and report how many lines it contains.
lines = read_time_machine()
print('# sentences %d' % len(lines))
step2-分词
# split sentences into word or char tokens
def tokenize(sentences, token='word'):
    """Split every sentence into word or character tokens.

    Args:
        sentences: Iterable of strings.
        token: ``'word'`` splits on single spaces (so an empty sentence
            yields ``['']`` and consecutive spaces yield empty tokens);
            ``'char'`` splits into individual characters.

    Returns:
        list[list[str]]: One token list per input sentence.

    Raises:
        ValueError: If ``token`` is neither ``'word'`` nor ``'char'``.
    """
    if token == 'word':
        return [sentence.split(' ') for sentence in sentences]
    if token == 'char':
        return [list(sentence) for sentence in sentences]
    # Fail loudly: previously this only printed a message and returned None,
    # which surfaced later as a confusing error far from the cause.
    raise ValueError('unknown token type: %r' % (token,))
# Word-level tokenisation (the default); show the first two token lists.
tokens = tokenize(lines)
print(tokens[0:2])
step3-建立字典
# build dictionary
class Vocab(object):
    """Bidirectional mapping between tokens and integer indices.

    Attributes set by ``__init__``:
        token_freqs: ``(token, count)`` pairs in the order the counter
            returned by ``count_corpus`` yields them.
        idx_to_token: list mapping index -> token; reserved tokens first.
        token_to_idx: dict mapping token -> index.
        unk: index returned for tokens missing from the vocabulary.
        pad, bos, eos: only present when ``use_special_tokens`` is true.
    """

    def __init__(self, tokens, min_freq=0, use_special_tokens=False):
        """Build the vocabulary.

        Args:
            tokens: Nested token lists, passed straight to ``count_corpus``.
            min_freq: Tokens whose count is below this are left out.
            use_special_tokens: Reserve indices 0-3 for padding,
                begin-of-sentence, end-of-sentence and unknown.
        """
        counter = count_corpus(tokens)
        self.token_freqs = list(counter.items())
        if use_special_tokens:
            # padding, begin of sentence, end of sentence, unknown
            self.pad, self.bos, self.eos, self.unk = (0, 1, 2, 3)
            # The four reserved tokens must be distinct strings; they were
            # previously four identical '' entries, which collided in
            # token_to_idx and made pad/bos/eos impossible to look up.
            reserved = ['<pad>', '<bos>', '<eos>', '<unk>']
        else:
            self.unk = 0
            # Kept as '' so indices produced by the default configuration
            # stay the same: the empty-string token doubles as "unknown".
            reserved = ['']
        taken = set(reserved)
        self.idx_to_token = reserved + [
            token for token, freq in self.token_freqs
            if freq >= min_freq and token not in taken
        ]
        self.token_to_idx = {
            token: idx for idx, token in enumerate(self.idx_to_token)
        }

    def __len__(self):
        """Return the number of entries, reserved tokens included."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Map a token, or a (nested) list/tuple of tokens, to indices.

        Tokens that are not in the vocabulary map to ``self.unk``.
        """
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """Map an index, or a list/tuple of indices, back to tokens."""
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]
step4-将词转换为索引
def count_corpus(sentences):
    """Count how often each token occurs across all sentences.

    Args:
        sentences: Iterable of token sequences (e.g. a list of word lists).

    Returns:
        collections.Counter: token -> number of occurrences.
    """
    # Feed the counter from a generator; no need to build the flat list.
    return collections.Counter(tk for st in sentences for tk in st)
# Example: build a vocabulary from the word tokens and show its first ten
# (token, index) pairs.
vocab = Vocab(tokens)
print(list(vocab.token_to_idx.items())[0:10])
Results:
# sentences 3583
[[''], ['the', 'project', 'gutenberg', 'ebook', 'of', 'the', 'time', 'machine', 'by', 'h', 'g', 'wells']]
[('', 0), ('the', 1), ('project', 2), ('gutenberg', 3), ('ebook', 4), ('of', 5), ('time', 6), ('machine', 7), ('by', 8), ('h', 9)]
利用现有的工具进行分词
两个比较常用的工具包括:spaCy和NLTK
二、语言模型和数据集
1-语言模型
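一段自然语言文本可以看作是一个离散时间序列,给定一个长度为$T$的词的序列$w_1, w_2, \ldots, w_T$,语言模型的目标就是评估该序列是否合理,即计算该序列的概率:
$$P(w_1, w_2, \ldots, w_T)$$
假设序列中的每个词都是依次生成的:
那么
$$P(w_1, w_2, \ldots, w_T) = \prod_{t=1}^{T} P(w_t \mid w_1, \ldots, w_{t-1})$$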
语言模型的参数就是词的概率以及给定前几个词的情况下的条件概率
2-n元语法
序列长度增加,计算储存多个词共同出现的概率的复杂度会呈指数级增加,n元语法通过马尔科夫假设简化模型
马尔科夫假设是指一个词的出现只与前面的n个词相关,即n阶马尔科夫链(Markov chain of order n),如果n=1,那么有 $P(w_3 \mid w_1, w_2) = P(w_3 \mid w_2)$
基于n-1阶马尔科夫链,我们可以将语言模型改写为:
$$P(w_1, w_2, \ldots, w_T) = \prod_{t=1}^{T} P(w_t \mid w_{t-(n-1)}, \ldots, w_{t-1})$$
n元语法(n-grams)就是基于n-1阶马尔科夫链的概率语言模型
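当n分别等于1、2和3时,我们将其分别称为一元语法(unigram)、二元语法(bigram)、三元语法(trigram),例如长度为4的序列$w_1, w_2, w_3, w_4$在一元语法、二元语法以及三元语法中的概率分别为:
$$P(w_1, w_2, w_3, w_4) = P(w_1) P(w_2) P(w_3) P(w_4)$$
$$P(w_1, w_2, w_3, w_4) = P(w_1) P(w_2 \mid w_1) P(w_3 \mid w_2) P(w_4 \mid w_3)$$
$$P(w_1, w_2, w_3, w_4) = P(w_1) P(w_2 \mid w_1) P(w_3 \mid w_1, w_2) P(w_4 \mid w_2, w_3)$$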
3-语言模型数据集
读取数据集:
# Read the whole lyrics file. The encoding is given explicitly because the
# corpus is Chinese text and the platform default is not always UTF-8.
# NOTE(review): this path differs from the one used in load_data_jay_lyrics
# ('jaychou_lyrics4703/jaychou_lyrics.txt') - confirm which one is correct.
with open('/home/yuzhu/input/jayzhou_lyrics/jayzhou_lyrics.txt', encoding='utf-8') as f:
    corpus_chars = f.read()
# Turn line breaks into spaces, then keep only the first 10000 characters.
corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
corpus_chars = corpus_chars[: 10000]
建立字符索引:
# De-duplicate the characters to obtain the index -> character mapping.
# NOTE: a set has no defined order, so the indices can differ between runs.
idx_to_char = list(set(corpus_chars))
# Character -> index mapping (the inverse of idx_to_char).
char_to_idx = {char : i for i, char in enumerate(idx_to_char)}
vocab_size = len(char_to_idx)
# Convert every character of the corpus to its index, giving one index sequence.
corpus_indices = [char_to_idx[char] for char in corpus_chars]
# First 20 indices, kept as a small sample.
sample = corpus_indices[: 20]
定义数据加载函数:
def load_data_jay_lyrics(path='/home/yuzhu/input/jaychou_lyrics4703/jaychou_lyrics.txt',
                         max_chars=10000):
    """Load the lyrics corpus and index it character by character.

    Args:
        path: Location of the lyrics text file. Defaults to the path that
            was previously hard-coded.
        max_chars: Only the first ``max_chars`` characters are kept
            (default 10000, the previous fixed value).

    Returns:
        tuple: ``(corpus_indices, char_to_idx, idx_to_char, vocab_size)``
        where ``corpus_indices`` is the corpus as a list of int indices,
        ``char_to_idx`` maps character -> index, ``idx_to_char`` is the
        list mapping index -> character, and ``vocab_size`` is the number
        of distinct characters.
    """
    # Read exactly once: a second f.read() would return '' at end-of-file
    # (or fail on the closed handle) and throw the corpus away.
    with open(path, encoding='utf-8') as f:
        corpus_chars = f.read()
    corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
    corpus_chars = corpus_chars[0:max_chars]
    # Sorted so the character indices are the same on every run; plain
    # set iteration order for strings varies between interpreter sessions.
    idx_to_char = sorted(set(corpus_chars))
    char_to_idx = {char: i for i, char in enumerate(idx_to_char)}
    vocab_size = len(char_to_idx)
    corpus_indices = [char_to_idx[char] for char in corpus_chars]
    return corpus_indices, char_to_idx, idx_to_char, vocab_size
4-数据采样
时序数据采样
在训练中我们需要每次随机读取小批量的样本和标签,时序数据的一个样本通常包含连续的字符,也就是说如果序列的长度为T,时间步数为n,那么一共就有T-n个合法的样本,但是这些样本有大量的重合,因此通常采用更加高效的采样方式
M1-随机采样
Program:
import torch
import random
def data_iter_random(corpus_indices, batch_size, num_steps, device=None):
    """Yield mini-batches of randomly ordered, non-overlapping subsequences.

    The corpus is cut into consecutive windows of ``num_steps`` items, the
    window start offsets are shuffled, and the windows are served
    ``batch_size`` at a time. Each label window is its input window
    shifted right by one position.

    Args:
        corpus_indices: Sequence of int indices.
        batch_size: Number of windows per batch. The last batch is shorter
            when the number of windows is not a multiple of ``batch_size``.
        num_steps: Length of every window.
        device: Target ``torch.device``; when ``None``, CUDA is used if
            available, otherwise the CPU.

    Yields:
        tuple[torch.Tensor, torch.Tensor]: ``(x, y)``, each with one row
        per window and ``num_steps`` columns.
    """
    # Subtract 1 because the label of the last input position needs one
    # more element after it.
    num_examples = (len(corpus_indices) - 1) // num_steps
    example_indices = [i * num_steps for i in range(num_examples)]
    random.shuffle(example_indices)

    def _data(i):
        # Window of length num_steps starting at offset i.
        return corpus_indices[i: i + num_steps]

    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    for i in range(0, num_examples, batch_size):
        # Take the next batch_size shuffled start offsets.
        batch_indices = example_indices[i:i + batch_size]
        x = [_data(j) for j in batch_indices]
        y = [_data(j + 1) for j in batch_indices]
        yield torch.tensor(x, device=device), torch.tensor(y, device=device)
# Demo on the toy sequence 0..29: every y is its x shifted by one position.
my_seq = list(range(30))
for x, y in data_iter_random(my_seq, batch_size=2, num_steps=6):
    print('x: ', x, '\ny:', y, '\n')
Results:
x: tensor([[ 0, 1, 2, 3, 4, 5],
[ 6, 7, 8, 9, 10, 11]])
y: tensor([[ 1, 2, 3, 4, 5, 6],
[ 7, 8, 9, 10, 11, 12]])
x: tensor([[12, 13, 14, 15, 16, 17],
[18, 19, 20, 21, 22, 23]])
y: tensor([[13, 14, 15, 16, 17, 18],
[19, 20, 21, 22, 23, 24]])
```
M2-相邻采样
#### 三、循环神经网络
##### 1-循环神经网络
基于当前的输入和过去的输入序列,预测序列的下一个字符。循环神经网络引入一个隐藏变量H,认为Ht记录了当前字符为止的序列信息,利用Ht对序列的下一个字符进行预测
![在这里插入图片描述](https://img-blog.csdnimg.cn/20200214154400734.png?x-oss-process=image#pic_center/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2JrYnZ1c2h1,size_16,color_FFFFFF,t_70)
##### 2-循环神经网络的构造
![在这里插入图片描述](https://img-blog.csdnimg.cn/20200214154433312.png?x-oss-process=image#pic_center/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2JrYnZ1c2h1,size_16,color_FFFFFF,t_70)
##### 3-循环神经网络的实现
PROGRAM:
· 读入数据
```python
import torch
import torch.nn as nn
import time
import math
import sys
sys.path.append("/home/kesci/input")
import d2l_jay9460 as d2l
# Load the indexed lyrics corpus through the helper module imported as d2l.
(corpus_indices, char_to_idx, idx_to_char, vocab_size) = d2l.load_data_jay_lyrics()
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
· one-hot向量——用来将字符表示成向量
def one_hot(x, n_class, dtype=torch.float32):
    """One-hot encode a 1-D tensor of class indices.

    Args:
        x: Tensor with ``n`` integer-valued entries.
        n_class: Number of classes, i.e. the width of the result.
        dtype: dtype of the returned tensor.

    Returns:
        torch.Tensor: Shape ``(n, n_class)`` on ``x``'s device, with
        ``result[i, x[i]] == 1`` and zeros elsewhere.
    """
    result = torch.zeros(x.shape[0], n_class, dtype=dtype, device=x.device)
    # reshape (rather than view) also accepts non-contiguous inputs.
    result.scatter_(1, x.long().reshape(-1, 1), 1)
    return result
# Demo: encode indices 0 and 2, then print the encoding, its shape, and the
# per-row sums.
x = torch.tensor([0, 2])
x_one_hot = one_hot(x, vocab_size)
print(x_one_hot)
print(x_one_hot.shape)
print(x_one_hot.sum(axis=1))
每次采样的小批量的形状是(batch_size, time_step),下面函数将小批量转换成矩阵,矩阵的个数等于时间的步数
def to_onehot(X, n_class):
    """Turn a 2-D batch of indices into a list of one-hot matrices.

    Args:
        X: 2-D tensor; each column ``X[:, i]`` is encoded separately.
        n_class: Number of classes passed on to ``one_hot``.

    Returns:
        list[torch.Tensor]: ``X.shape[1]`` matrices, one per column of
        ``X``, each of shape ``(X.shape[0], n_class)``.
    """
    return [one_hot(X[:, i], n_class) for i in range(X.shape[1])]
# Demo: a 2 x 5 batch of indices becomes 5 one-hot matrices (one per column).
X = torch.arange(10).view(2, 5)
inputs = to_onehot(X, vocab_size)
print(len(inputs), inputs[0].shape)
· 初始化模型参数
# Input and output widths both equal the vocabulary size; 256 hidden units.
num_inputs, num_hiddens, num_outputs = vocab_size, 256, vocab_size
# num_inputs: d
# num_hiddens: h, the number of hidden units is a hyperparameter
# num_outputs: q
def get_params():
    """Create the trainable parameters of the from-scratch RNN.

    Reads the module-level ``num_inputs``, ``num_hiddens``, ``num_outputs``
    and ``device``.

    Returns:
        tuple: ``(W_xh, W_hh, b_h, W_hq, b_q)`` as ``torch.nn.Parameter``
        objects. Weight matrices are drawn from a normal distribution with
        mean 0 and standard deviation 0.01; biases start at zero.
    """
    def _one(shape):
        # Allocate on the target device, then fill in place with N(0, 0.01).
        param = torch.zeros(shape, device=device, dtype=torch.float32)
        nn.init.normal_(param, 0, 0.01)
        return torch.nn.Parameter(param)

    # Hidden-layer parameters
    W_xh = _one((num_inputs, num_hiddens))
    W_hh = _one((num_hiddens, num_hiddens))
    b_h = torch.nn.Parameter(torch.zeros(num_hiddens, device=device))
    # Output-layer parameters
    W_hq = _one((num_hiddens, num_outputs))
    b_q = torch.nn.Parameter(torch.zeros(num_outputs, device=device))
    return (W_xh, W_hh, b_h, W_hq, b_q)
· 定义模型——rnn使用循环的方式依次完成循环神经网络每个时间步的计算
def rnn(inputs, state, params):
    """Run the RNN over a sequence, one time step per element of ``inputs``.

    Args:
        inputs: Iterable of per-time-step input matrices; each is
            multiplied by ``W_xh``.
        state: 1-tuple ``(H,)`` holding the initial hidden state.
        params: ``(W_xh, W_hh, b_h, W_hq, b_q)``.

    Returns:
        tuple: ``(outputs, (H,))`` where ``outputs`` is a list with one
        output matrix per time step and ``H`` is the hidden state after
        the last step.
    """
    W_xh, W_hh, b_h, W_hq, b_q = params
    H, = state
    outputs = []
    for X in inputs:
        # New hidden state from the current input and the previous state.
        H = torch.tanh(torch.matmul(X, W_xh) + torch.matmul(H, W_hh) + b_h)
        # Output-layer projection for this time step.
        Y = torch.matmul(H, W_hq) + b_q
        outputs.append(Y)
    return outputs, (H,)
函数`init_rnn_state`初始化隐藏变量,这里的返回值是一个元组
def init_rnn_state(batch_size, num_hiddens, device):
    """Return the initial hidden state as a 1-tuple.

    Args:
        batch_size: Number of rows of the state tensor.
        num_hiddens: Number of columns of the state tensor.
        device: ``torch.device`` to allocate on.

    Returns:
        tuple[torch.Tensor]: A single all-zero tensor of shape
        ``(batch_size, num_hiddens)`` wrapped in a tuple.
    """
    return (torch.zeros((batch_size, num_hiddens), device=device), )
· 裁减梯度
循环神经网络中容易出现梯度的衰减或者是梯度的爆炸,这会导致网络几乎无法训练,裁减梯度用来解决梯度爆炸的问题
def grad_clipping(params, theta, device):
    """Rescale gradients in place so their global L2 norm is at most ``theta``.

    Args:
        params: Iterable of tensors whose ``.grad`` should be clipped.
            Parameters without a gradient are skipped.
        theta: Maximum allowed global L2 norm.
        device: ``torch.device`` on which the norm is accumulated.

    Returns:
        None. Gradients are modified in place, and only when the global
        norm exceeds ``theta``.
    """
    # Skip parameters that received no gradient; dereferencing
    # ``None.data`` would raise AttributeError.
    grads = [param.grad for param in params if param.grad is not None]
    if not grads:
        return
    norm = torch.tensor([0.0], device=device)
    for grad in grads:
        norm += (grad.data ** 2).sum()
    norm = norm.sqrt().item()
    if norm > theta:
        scale = theta / norm
        for grad in grads:
            grad.data *= scale
· 定义预测函数
def predict_rnn(prefix, num_chars, rnn, params, init_rnn_state,
                num_hiddens, vocab_size, device, idx_to_char, char_to_idx):
    """Generate ``num_chars`` characters that continue ``prefix``.

    Args:
        prefix: Non-empty seed string; every character must be a key of
            ``char_to_idx``.
        num_chars: Number of characters to generate after the prefix.
        rnn: Callable ``rnn(inputs, state, params) -> (outputs, state)``.
        params: Model parameters handed to ``rnn``.
        init_rnn_state: Callable ``(batch_size, num_hiddens, device)``
            returning the initial state.
        num_hiddens: Hidden size passed to ``init_rnn_state``.
        vocab_size: Width of the one-hot input encoding.
        device: ``torch.device`` used for the input tensors.
        idx_to_char: Mapping index -> character.
        char_to_idx: Mapping character -> index.

    Returns:
        str: The prefix followed by the ``num_chars`` predicted characters.
    """
    state = init_rnn_state(1, num_hiddens, device)
    # output holds the indices of the prefix plus the predicted characters.
    output = [char_to_idx[prefix[0]]]
    # Pure inference: no autograd graph is needed while sampling.
    with torch.no_grad():
        for t in range(num_chars + len(prefix) - 1):
            # Feed the previous step's character as the current input.
            X = to_onehot(torch.tensor([[output[-1]]], device=device), vocab_size)
            # Compute the output and update the hidden state.
            (Y, state) = rnn(X, state, params)
            # The next input is either the next prefix character or the
            # current best prediction.
            if t < len(prefix) - 1:
                output.append(char_to_idx[prefix[t + 1]])
            else:
                output.append(Y[0].argmax(dim=1).item())
    return ''.join([idx_to_char[i] for i in output])
· 困惑度
使用困惑度(perplexity)来评价语言模型的好坏,困惑度是对交叉熵损失函数作指数计算后得到的值
· 定义模型训练函数
def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, device, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    """Train the from-scratch RNN and periodically print sample predictions.

    Args:
        rnn: Callable ``rnn(inputs, state, params) -> (outputs, state)``.
        get_params: Zero-argument callable returning the parameter tuple.
        init_rnn_state: Callable ``(batch_size, num_hiddens, device)``
            returning the initial state tuple.
        num_hiddens: Hidden size.
        vocab_size: Width of the one-hot encoding.
        device: ``torch.device`` used for data and state.
        corpus_indices: Corpus as a sequence of int indices.
        idx_to_char, char_to_idx: Index <-> character mappings used when
            printing predictions.
        is_random_iter: True selects ``d2l.data_iter_random``, False
            selects ``d2l.data_iter_consecutive``.
        num_epochs: Number of passes over the corpus.
        num_steps: Time steps per example.
        lr: Learning rate passed to ``d2l.sgd``.
        clipping_theta: Gradient-norm threshold for ``grad_clipping``.
        batch_size: Examples per mini-batch.
        pred_period: Print perplexity and samples every this many epochs.
        pred_len: Number of characters generated per sample.
        prefixes: Seed strings for the samples.

    Returns:
        None. Progress is printed.
    """
    if is_random_iter:
        data_iter_fn = d2l.data_iter_random
    else:
        data_iter_fn = d2l.data_iter_consecutive
    params = get_params()
    loss = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        if not is_random_iter:
            # Consecutive sampling: initialise the hidden state once per epoch.
            state = init_rnn_state(batch_size, num_hiddens, device)
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, device)
        for X, Y in data_iter:
            if is_random_iter:
                # Random sampling: fresh hidden state for every mini-batch.
                # Sized from the batch itself so that a shorter final batch
                # does not cause a shape mismatch inside rnn.
                state = init_rnn_state(X.shape[0], num_hiddens, device)
            else:
                # Otherwise detach the state from the previous batch's graph.
                for s in state:
                    s.detach_()
            # inputs: num_steps matrices of shape (batch_size, vocab_size)
            inputs = to_onehot(X, vocab_size)
            # outputs: num_steps matrices, one per time step
            (outputs, state) = rnn(inputs, state, params)
            # After concatenation: (num_steps * batch_size, vocab_size)
            outputs = torch.cat(outputs, dim=0)
            # Y is (batch_size, num_steps); transposing before flattening
            # gives a (num_steps * batch_size,) vector whose entries line up
            # with the rows of outputs.
            y = torch.flatten(Y.T)
            # Mean cross-entropy over all positions in the batch.
            l = loss(outputs, y.long())

            # Reset gradients accumulated by the previous step.
            for param in params:
                if param.grad is not None:
                    param.grad.data.zero_()
            l.backward()
            grad_clipping(params, clipping_theta, device)  # clip gradients
            # The loss is already a mean, so no further averaging is needed
            # (hence the batch-size argument of 1).
            d2l.sgd(params, lr, 1)
            l_sum += l.item() * y.shape[0]
            n += y.shape[0]

        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn(prefix, pred_len, rnn, params, init_rnn_state,
                                        num_hiddens, vocab_size, device, idx_to_char, char_to_idx))