7.1 Advanced Optimization Algorithms
If the direction of each update depends only on the current position of the variable, problems can arise. With noisy gradients we have to choose the learning rate and the batch size carefully in order to control the gradient variance and the convergence behavior.
In second-order optimization we left-multiply the gradient vector by the inverse (or pseudo-inverse) of the Hessian matrix, i.e. $\Delta x = H^{-1} g$. This amounts to mapping $H$ to the identity matrix, whose spectrum is perfectly uniform; in other words, the Hessian of the equivalent objective we actually optimize is a well-conditioned identity matrix.
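For intuition, consider a positive-definite quadratic objective: a single step preconditioned by the inverse Hessian lands exactly on the minimizer, no matter how ill-conditioned $H$ is:

$$f(x) = \tfrac{1}{2} x^\top H x - b^\top x, \qquad g = \nabla f(x) = Hx - b,$$

$$x - H^{-1} g = x - H^{-1}(Hx - b) = H^{-1} b = \arg\min_x f(x).$$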
For a given learning rate, gradient descent moves the variable much farther in the vertical direction than in the horizontal one. We therefore need a fairly small learning rate to keep the variable from overshooting the optimum in the vertical direction, but that in turn slows its horizontal progress toward the optimum.
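One standard way around this trade-off, treated in this chapter as the momentum method, is to smooth the update with an exponentially weighted average of past gradients: the oscillating vertical components largely cancel while the consistent horizontal component accumulates. A minimal sketch in the same interface style as the optimizers below; the hyperparameter keys 'lr' and 'momentum' are illustrative choices:

def sgd_momentum(params, states, hyperparams):
    # v keeps an exponentially weighted average of (lr-scaled) gradients;
    # parameters move along v instead of along the raw gradient.
    for p, v in zip(params, states):
        v.data = hyperparams['momentum'] * v.data + hyperparams['lr'] * p.grad.data
        p.data -= v.data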
The AdaGrad Algorithm
import math
import random
import time
import numpy as np
import torch

def get_data_ch7():
    # Load the NASA airfoil self-noise dataset and standardize each column.
    data = np.genfromtxt('/home/kesci/input/airfoil4755/airfoil_self_noise.dat', delimiter='\t')
    data = (data - data.mean(axis=0)) / data.std(axis=0)
    return torch.tensor(data[:1500, :-1], dtype=torch.float32), \
           torch.tensor(data[:1500, -1], dtype=torch.float32)

features, labels = get_data_ch7()
def init_adagrad_states():
    # One accumulator of squared gradients per parameter (weight matrix and bias).
    s_w = torch.zeros((features.shape[1], 1), dtype=torch.float32)
    s_b = torch.zeros(1, dtype=torch.float32)
    return (s_w, s_b)

def adagrad(params, states, hyperparams):
    eps = 1e-6
    for p, s in zip(params, states):
        s.data += (p.grad.data**2)  # accumulate squared gradients
        p.data -= hyperparams['lr'] * p.grad.data / torch.sqrt(s + eps)  # per-coordinate learning rate
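In equation form, the code above maintains

$$s_t = s_{t-1} + g_t \odot g_t, \qquad x_t = x_{t-1} - \frac{\eta}{\sqrt{s_t + \epsilon}} \odot g_t,$$

so coordinates that have seen large gradients in the past get smaller effective learning rates.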
The RMSProp Algorithm
def init_rmsprop_states():
    s_w = torch.zeros((features.shape[1], 1), dtype=torch.float32)
    s_b = torch.zeros(1, dtype=torch.float32)
    return (s_w, s_b)

def rmsprop(params, states, hyperparams):
    gamma, eps = hyperparams['beta'], 1e-6  # 'beta' plays the role of RMSProp's decay rate gamma
    for p, s in zip(params, states):
        s.data = gamma * s.data + (1 - gamma) * (p.grad.data)**2  # exponentially weighted average of squared gradients
        p.data -= hyperparams['lr'] * p.grad.data / torch.sqrt(s + eps)
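The corresponding update is

$$s_t = \gamma s_{t-1} + (1-\gamma)\, g_t \odot g_t, \qquad x_t = x_{t-1} - \frac{\eta}{\sqrt{s_t + \epsilon}} \odot g_t.$$

Unlike AdaGrad, the accumulator is an exponentially weighted average, so the effective learning rate no longer shrinks monotonically over time.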
The AdaDelta Algorithm
def init_adadelta_states():
    s_w, s_b = torch.zeros((features.shape[1], 1), dtype=torch.float32), torch.zeros(1, dtype=torch.float32)
    delta_w, delta_b = torch.zeros((features.shape[1], 1), dtype=torch.float32), torch.zeros(1, dtype=torch.float32)
    return ((s_w, delta_w), (s_b, delta_b))

def adadelta(params, states, hyperparams):
    rho, eps = hyperparams['rho'], 1e-5
    for p, (s, delta) in zip(params, states):
        s[:] = rho * s + (1 - rho) * (p.grad.data**2)             # running average of squared gradients
        g = p.grad.data * torch.sqrt((delta + eps) / (s + eps))   # rescaled update; note there is no explicit learning rate
        p.data -= g
        delta[:] = rho * delta + (1 - rho) * g * g                # running average of squared updates
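Written out, AdaDelta replaces the learning rate with a second running average over the updates themselves:

$$s_t = \rho s_{t-1} + (1-\rho)\, g_t \odot g_t, \qquad g_t' = \sqrt{\frac{\Delta x_{t-1} + \epsilon}{s_t + \epsilon}} \odot g_t,$$

$$x_t = x_{t-1} - g_t', \qquad \Delta x_t = \rho\, \Delta x_{t-1} + (1-\rho)\, g_t' \odot g_t'.$$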
The Adam Algorithm
def init_adam_states():
    v_w, v_b = torch.zeros((features.shape[1], 1), dtype=torch.float32), torch.zeros(1, dtype=torch.float32)
    s_w, s_b = torch.zeros((features.shape[1], 1), dtype=torch.float32), torch.zeros(1, dtype=torch.float32)
    return ((v_w, s_w), (v_b, s_b))

def adam(params, states, hyperparams):
    beta1, beta2, eps = 0.9, 0.999, 1e-6
    for p, (v, s) in zip(params, states):
        v[:] = beta1 * v + (1 - beta1) * p.grad.data        # first moment (momentum)
        s[:] = beta2 * s + (1 - beta2) * p.grad.data**2     # second moment
        v_bias_corr = v / (1 - beta1 ** hyperparams['t'])   # bias correction
        s_bias_corr = s / (1 - beta2 ** hyperparams['t'])
        p.data -= hyperparams['lr'] * v_bias_corr / (torch.sqrt(s_bias_corr) + eps)
    hyperparams['t'] += 1  # advance the time step once per optimization step, not per parameter
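Adam combines momentum with RMSProp-style scaling and corrects the bias of both moving averages:

$$v_t = \beta_1 v_{t-1} + (1-\beta_1) g_t, \qquad s_t = \beta_2 s_{t-1} + (1-\beta_2)\, g_t \odot g_t,$$

$$\hat v_t = \frac{v_t}{1-\beta_1^t}, \qquad \hat s_t = \frac{s_t}{1-\beta_2^t}, \qquad x_t = x_{t-1} - \frac{\eta\, \hat v_t}{\sqrt{\hat s_t} + \epsilon}.$$

Note that hyperparams['t'] must start at 1 rather than 0, otherwise the bias-correction denominators $1-\beta^t$ are zero. Each of these update functions is meant to be plugged, together with its state initializer and a hyperparameter dictionary (e.g. {'lr': 0.01, 't': 1} for Adam), into the chapter's generic mini-batch training routine.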
7.2 word2vec
Word embedding basics. The Word2Vec tool represents each word as a fixed-length vector and pretrains these vectors on a corpus so that they capture similarity and analogy relations between words, thereby encoding some semantic information. In short, Word2Vec learns from a corpus how to map discrete words to vectors in a continuous space while preserving their semantic similarity.
Skip-gram model: assumes each context word is generated from the center word.
CBOW (continuous bag-of-words) model: assumes the center word is generated from its context words.
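For the skip-gram model used in the rest of this section, the conditional probability of a context word $w_o$ given a center word $w_c$ is modeled as a softmax over inner products of their vectors; this is the quantity p(w_o|w_c) referenced in the skip_gram function further below:

$$P(w_o \mid w_c) = \frac{\exp(\mathbf{u}_o^\top \mathbf{v}_c)}{\sum_{i \in \mathcal{V}} \exp(\mathbf{u}_i^\top \mathbf{v}_c)},$$

where $\mathbf{v}$ are the center-word vectors and $\mathbf{u}$ are the context-word vectors.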
Subsampling
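The discard function below drops a word $w_i$ from the dataset with probability

$$P(w_i) = \max\!\left(1 - \sqrt{\frac{t}{f(w_i)}},\; 0\right),$$

where $f(w_i)$ is the word's relative frequency in the corpus and the threshold is $t = 10^{-4}$; very frequent words are therefore discarded with high probability (the max with 0 is implicit, since a negative right-hand side can never exceed a uniform random draw).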
def discard(idx):
    '''
    @params:
        idx: index of the word
    @return: True/False, whether to discard this word
    '''
    # counter, idx_to_token and num_tokens come from the preprocessing steps earlier in the notebook.
    return random.uniform(0, 1) < 1 - math.sqrt(
        1e-4 / counter[idx_to_token[idx]] * num_tokens)
subsampled_dataset = [[tk for tk in st if not discard(tk)] for st in dataset]
print('# tokens: %d' % sum([len(st) for st in subsampled_dataset]))
def compare_counts(token):
return '# %s: before=%d, after=%d' % (token, sum(
[st.count(token_to_idx[token]) for st in dataset]), sum(
[st.count(token_to_idx[token]) for st in subsampled_dataset]))
Extracting Center Words and Context Words
def get_centers_and_contexts(dataset, max_window_size):
    '''
    @params:
        dataset: a collection of sentences, each a list of words already converted to integer indices
        max_window_size: maximum size of the context window
    @return:
        centers: list of center words
        contexts: list of context windows, one per center word; each window is a list of context words
    '''
    centers, contexts = [], []
    for st in dataset:
        if len(st) < 2:  # a sentence needs at least 2 words to form a "center word - context word" pair
            continue
        centers += st
        for center_i in range(len(st)):
            window_size = random.randint(1, max_window_size)  # sample the context window size at random
            indices = list(range(max(0, center_i - window_size),
                                 min(len(st), center_i + 1 + window_size)))
            indices.remove(center_i)  # exclude the center word from its own context
            contexts.append([st[idx] for idx in indices])
    return centers, contexts
all_centers, all_contexts = get_centers_and_contexts(subsampled_dataset, 5)
tiny_dataset = [list(range(7)), list(range(7, 10))]
print('dataset', tiny_dataset)
for center, context in zip(*get_centers_and_contexts(tiny_dataset, 2)):
print('center', center, 'has contexts', context)
def skip_gram(center, contexts_and_negatives, embed_v, embed_u):
    '''
    @params:
        center: center-word indices, an integer tensor of shape (n, 1)
        contexts_and_negatives: context-word and noise-word indices, an integer tensor of shape (n, m)
        embed_v: embedding layer for center words
        embed_u: embedding layer for context words
    @return:
        pred: inner products of the center word with its context (or noise) words,
              later used to compute the probability p(w_o|w_c)
    '''
    v = embed_v(center)                  # shape (n, 1, d)
    u = embed_u(contexts_and_negatives)  # shape (n, m, d)
    pred = torch.bmm(v, u.permute(0, 2, 1))  # bmm((n, 1, d), (n, d, m)) => shape (n, 1, m)
    return pred
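The training loop below indexes into a model net whose first element is the center-word embedding (embed_v) and whose second is the context-word embedding (embed_u). A minimal sketch of such a model, assuming the vocabulary list idx_to_token from preprocessing and a hypothetical embedding dimension of 100:

import torch.nn as nn

embed_size = 100  # hypothetical embedding dimension
net = nn.Sequential(
    nn.Embedding(num_embeddings=len(idx_to_token), embedding_dim=embed_size),  # embed_v: center words
    nn.Embedding(num_embeddings=len(idx_to_token), embedding_dim=embed_size),  # embed_u: context/noise words
)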
Training the Model
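The loop also relies on a data_iter of padded minibatches and on a loss(pred, label, mask) function, both built earlier in the notebook. A minimal sketch of such a masked binary cross-entropy loss, assuming this three-argument interface (padded positions with mask == 0 contribute nothing, and each example is averaged over its real entries):

def loss(inputs, targets, mask):
    # Binary cross-entropy with logits, masked so padded positions are ignored.
    inputs, targets, mask = inputs.float(), targets.float(), mask.float()
    res = torch.nn.functional.binary_cross_entropy_with_logits(
        inputs, targets, reduction='none', weight=mask)
    return res.sum(dim=1) / mask.sum(dim=1)  # per-example mean over unpadded entries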
def train(net, lr, num_epochs):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("train on", device)
    net = net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    for epoch in range(num_epochs):
        start, l_sum, n = time.time(), 0.0, 0
        for batch in data_iter:
            center, context_negative, mask, label = [d.to(device) for d in batch]
            pred = skip_gram(center, context_negative, net[0], net[1])
            l = loss(pred.view(label.shape), label, mask).mean()  # average loss over the batch
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            l_sum += l.cpu().item()
            n += 1
        print('epoch %d, loss %.2f, time %.2fs'
              % (epoch + 1, l_sum / n, time.time() - start))
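With these pieces in place, training can be launched with a call such as the following; the learning rate and epoch count are illustrative values:

train(net, 0.01, 5)  # illustrative hyperparameters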
7.3 Advanced Word Embeddings
The GloVe Model
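GloVe (Global Vectors) fits word vectors to global co-occurrence counts rather than to individual center-context windows. The code that follows works with a pretrained embedding object glove exposing vectors, stoi, and itos attributes. One way to obtain such an object is through torchtext's pretrained-vector loader; the vector name and dimension below are assumptions, not values from these notes:

import torchtext.vocab as vocab

# Hypothetical loading call; adjust name/dim (and a cache directory, if needed) to what is available locally.
glove = vocab.GloVe(name='6B', dim=50)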
Finding Synonyms
def knn(W, x, k):
    '''
    @params:
        W: matrix containing all the vectors
        x: the query vector
        k: number of neighbors to return
    @outputs:
        topk: indices of the k vectors with the largest cosine similarity
        [...]: the corresponding cosine similarities
    '''
    cos = torch.matmul(W, x.view((-1,))) / (
        (torch.sum(W * W, dim=1) + 1e-9).sqrt() * torch.sum(x * x).sqrt())
    _, topk = torch.topk(cos, k=k)
    topk = topk.cpu().numpy()
    return topk, [cos[i].item() for i in topk]
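knn ranks candidates by cosine similarity,

$$\cos(\mathbf{x}, \mathbf{y}) = \frac{\mathbf{x}^\top \mathbf{y}}{\lVert\mathbf{x}\rVert\,\lVert\mathbf{y}\rVert},$$

where the small constant $10^{-9}$ in the denominator only guards against division by zero.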
def get_similar_tokens(query_token, k, embed):
    '''
    @params:
        query_token: the query word
        k: number of similar words to return
        embed: pretrained word vectors
    '''
    topk, cos = knn(embed.vectors,
                    embed.vectors[embed.stoi[query_token]], k+1)
    for i, c in zip(topk[1:], cos[1:]):  # skip the query word itself
        print('cosine sim=%.3f: %s' % (c, (embed.itos[i])))
get_similar_tokens('chip', 3, glove)
Finding Analogies
def get_analogy(token_a, token_b, token_c, embed):
    '''
    @params:
        token_a: word a
        token_b: word b
        token_c: word c
        embed: pretrained word vectors
    @outputs:
        res: the analogy word d
    '''
    vecs = [embed.vectors[embed.stoi[t]]
            for t in [token_a, token_b, token_c]]
    x = vecs[1] - vecs[0] + vecs[2]
    topk, cos = knn(embed.vectors, x, 1)
    res = embed.itos[topk[0]]
    return res
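For the analogy "a is to b as c is to d", the function searches for the word d whose vector has the largest cosine similarity to

$$\text{vec}(b) - \text{vec}(a) + \text{vec}(c),$$

which is what the call below queries for man : woman :: son : ?.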
get_analogy('man', 'woman', 'son', glove)