What are the Sigmoid function and its derivative?
Function: $\sigma(x) = \frac{1}{1 + e^{-x}}$
Derivative: $\sigma'(x) = \sigma(x)\left(1 - \sigma(x)\right)$
What are the Tanh function and its derivative?
Function: $\tanh(x) = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$
Derivative: $\tanh'(x) = 1 - \tanh^{2}(x)$
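Both derivative identities can be verified numerically with a central finite difference; a minimal NumPy sketch (the grid and tolerance are illustrative choices):

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

x = np.linspace(-5, 5, 101)
eps = 1e-6
# Check sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x))
num_grad = (sigmoid(x + eps) - sigmoid(x - eps)) / (2 * eps)
assert np.allclose(num_grad, sigmoid(x) * (1 - sigmoid(x)), atol=1e-6)
# Check tanh'(x) = 1 - tanh(x) ** 2
num_grad = (np.tanh(x + eps) - np.tanh(x - eps)) / (2 * eps)
assert np.allclose(num_grad, 1 - np.tanh(x) ** 2, atol=1e-6)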
What is the formula for the cross-entropy loss function?
Binary classification:
$L = -\frac{1}{N}\sum_{i=1}^{N}\left[y_i \log(p_i) + (1 - y_i)\log(1 - p_i)\right]$
where:
$y_i$: the label of sample $i$; 1 for the positive class, 0 for the negative class.
$p_i$: the predicted probability that sample $i$ belongs to the positive class.
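The binary formula can be checked against PyTorch's built-in loss; a minimal sketch with illustrative sample values:

import torch
import torch.nn.functional as F

y = torch.tensor([1.0, 0.0, 1.0, 0.0])  # labels y_i
p = torch.tensor([0.9, 0.2, 0.6, 0.4])  # predicted probabilities p_i
manual = -(y * torch.log(p) + (1 - y) * torch.log(1 - p)).mean()
builtin = F.binary_cross_entropy(p, y)
print(manual.item(), builtin.item())  # the two values match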
Multi-class classification:
$L = -\frac{1}{N}\sum_{i=1}^{N}\sum_{c=1}^{M} y_{ic}\log(p_{ic})$
where:
$M$: the number of classes.
$y_{ic}$: an indicator that is 1 if the true class of sample $i$ equals $c$, and 0 otherwise.
$p_{ic}$: the predicted probability that sample $i$ belongs to class $c$.
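Likewise, the multi-class formula can be checked against F.nll_loss applied to log-probabilities (sample values again illustrative):

import torch
import torch.nn.functional as F

probs = torch.tensor([[0.7, 0.2, 0.1],
                      [0.1, 0.8, 0.1]])  # p_ic, each row sums to 1
labels = torch.tensor([0, 1])            # true class of each sample
one_hot = F.one_hot(labels, num_classes=3).float()  # y_ic
manual = -(one_hot * torch.log(probs)).sum(dim=1).mean()
builtin = F.nll_loss(torch.log(probs), labels)
print(manual.item(), builtin.item())  # the two values match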
What are the formulas for the Attention scoring functions?
1 Concat: $score(s_t, h_i) = v_a^{\top}\tanh\left(W_a\,[s_t; h_i]\right)$
2 Add: $score(s_t, h_i) = v_a^{\top}\tanh\left(W_1 s_t + W_2 h_i\right)$
3 Dot Product: $score(s_t, h_i) = s_t^{\top} h_i$
4 Scaled-dot Product: $score(q, k) = \frac{q^{\top} k}{\sqrt{d_k}}$
5 Content-Based: $score(s_t, h_i) = \cos(s_t, h_i)$
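The five scoring functions can be written out in a few lines of PyTorch; in this sketch the dimension d and the parameters W_cat, W1, W2, v are illustrative assumptions, not part of the original notes:

import torch
import torch.nn as nn

d = 8
s, h = torch.randn(d), torch.randn(d)  # decoder state s_t, encoder state h_i
W_cat = nn.Linear(2 * d, d, bias=False)
W1, W2 = nn.Linear(d, d, bias=False), nn.Linear(d, d, bias=False)
v = torch.randn(d)

concat_score = v @ torch.tanh(W_cat(torch.cat([s, h])))  # 1 Concat
add_score = v @ torch.tanh(W1(s) + W2(h))                # 2 Add
dot_score = s @ h                                        # 3 Dot Product
scaled_dot_score = (s @ h) / d ** 0.5                    # 4 Scaled-dot Product
cosine_score = torch.cosine_similarity(s, h, dim=0)      # 5 Content-Based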
Multi-head Self-attention implementation.
# -*- coding:utf-8 -*-
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiHeadAttention(nn.Module):
    def __init__(self, query_dim, key_dim, num_units, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.query_dim = query_dim
        self.key_dim = key_dim
        self.num_units = num_units
        self.num_heads = num_heads
        self.W_query = nn.Linear(in_features=query_dim, out_features=num_units, bias=False)
        self.W_key = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
        self.W_value = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)

    def forward(self, query, key, mask=None):
        # [B, T_q, num_units]
        querys = self.W_query(query)
        # [B, T_k, num_units]
        keys = self.W_key(key)
        # [B, T_k, num_units]
        values = self.W_value(key)
        split_size = self.num_units // self.num_heads
        # [h, B, T_q, num_units // h]
        querys = torch.stack(torch.split(querys, split_size, dim=2), dim=0)
        # [h, B, T_k, num_units // h]
        keys = torch.stack(torch.split(keys, split_size, dim=2), dim=0)
        # [h, B, T_k, num_units // h]
        values = torch.stack(torch.split(values, split_size, dim=2), dim=0)
        # [h, B, T_q, T_k]
        scores = torch.matmul(querys, keys.transpose(2, 3))
        scores = scores / (self.key_dim ** 0.5)
        if mask is not None:
            # [B, T_k] -> [h, B, T_q, T_k]
            mask = mask.unsqueeze(1).unsqueeze(0).repeat(self.num_heads, 1, querys.shape[2], 1)
            scores = scores.masked_fill(mask, float('-inf'))
        scores = F.softmax(scores, dim=3)
        # [h, B, T_q, num_units // h]
        out = torch.matmul(scores, values)
        # [B, T_q, num_units]
        out = torch.cat(torch.split(out, 1, dim=0), dim=3).squeeze(0)
        return out, scores

## Instantiate the module
attention = MultiHeadAttention(64, 64, 32, 4)
## Inputs
query = torch.randn(8, 6, 64)
key = torch.randn(8, 6, 64)
## Padding mask: True marks positions that must not receive attention
mask = torch.tensor([[False, False, False, False, True, True],
                     [False, False, False, True, True, True],
                     [False, False, False, False, True, True],
                     [False, False, False, True, True, True],
                     [False, False, False, False, True, True],
                     [False, False, False, True, True, True],
                     [False, False, False, False, True, True],
                     [False, False, False, True, True, True]])
## Outputs
out, scores = attention(query, key, mask)
print('out:', out.shape)  ## torch.Size([8, 6, 32])
print('scores:', scores.shape)  ## torch.Size([4, 8, 6, 6])
Beam Search implementation.
import torch
import torch.nn.functional as F

def decoder(src_embedding, tar):
    # Stand-in decoder: returns a random distribution over a 1000-word vocabulary
    return F.softmax(torch.randn([1, 1, 1000]), dim=-1)

max_len = 20
n = 3  # beam width
src_embedding = torch.randn([1, 1, 1000])
dec_ids = torch.tensor([-1])  # start-of-sequence id
res = []
# First step: expand the start token into the top-n hypotheses
outs = decoder(src_embedding, dec_ids)
top_v, top_i = outs.topk(n)
for i in range(n):
    # Each hypothesis is [token ids, accumulated log-probability]
    res.append([[top_i[0][0][i].item()], torch.log(top_v[0][0][i]).item()])
for _ in range(max_len - 1):
    cand = []
    for r in res:
        # Feed this hypothesis's full prefix to the decoder
        dec_ids = torch.tensor([-1] + r[0])
        outs = decoder(src_embedding, dec_ids)
        top_v, top_i = outs.topk(n)
        for i in range(n):
            cand.append([r[0] + [top_i[0][0][i].item()],
                         r[1] + torch.log(top_v[0][0][i]).item()])
    # Keep the n hypotheses with the highest accumulated log-probability
    res = sorted(cand, key=lambda x: x[1], reverse=True)[:n]
print(res)
What is the formula for L1 regularization?
$L = L_0 + \lambda \sum_{w} |w|$, where $L_0$ is the original (unregularized) loss.
What is the formula for L2 regularization?
$L = L_0 + \frac{\lambda}{2} \sum_{w} w^{2}$
Why does adding L2 regularization decay the weights (weight decay)?
Taking the gradient of the regularized loss gives $\frac{\partial L}{\partial w} = \frac{\partial L_0}{\partial w} + \lambda w$, so a gradient-descent step becomes $w \leftarrow (1 - \eta\lambda)\,w - \eta\frac{\partial L_0}{\partial w}$. Without L2 regularization, the coefficient in front of $w$ in this update is 1; with it, the coefficient is $1 - \eta\lambda$. Since $\eta$ and $\lambda$ are both positive, $1 - \eta\lambda < 1$, so every update shrinks $w$. This is where the name weight decay comes from.
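The equivalence is easy to check numerically: one SGD step with weight_decay=$\lambda$ matches the manual update $(1 - \eta\lambda)\,w - \eta\frac{\partial L_0}{\partial w}$. A minimal sketch; the toy loss is an arbitrary assumption:

import torch

eta, lam = 0.1, 0.01
w = torch.tensor([2.0, -3.0], requires_grad=True)
loss = (w ** 2).sum()  # toy unregularized loss L0
loss.backward()
# Manual update: fold the L2 gradient lambda * w into the step
manual = (1 - eta * lam) * w.detach() - eta * w.grad
# The same step via the optimizer's weight_decay option
opt = torch.optim.SGD([w], lr=eta, weight_decay=lam)
opt.step()
print(torch.allclose(w.detach(), manual))  # True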
What is the formula for Dropout?
Adding noise to $x$ to obtain $x'$, we want the expectation to stay unchanged: $E[x'] = x$.
Dropout perturbs each element as follows:
$x_i' = \begin{cases} 0 & \text{with probability } p \\ \frac{x_i}{1-p} & \text{otherwise} \end{cases}$
Dropout implementation.
import torch

def dropout_layer(X, dropout):
    assert 0 <= dropout <= 1
    # In this case, every element is dropped
    if dropout == 1:
        return torch.zeros_like(X)
    # In this case, every element is kept
    if dropout == 0:
        return X
    mask = (torch.rand(X.shape) > dropout).float()
    # Scale the survivors by 1 / (1 - p) so that E[x'] = x
    return mask * X / (1.0 - dropout)
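A quick check with the function above: thanks to the $1/(1-p)$ rescaling, the per-element mean over many trials stays close to the input (the trial count here is arbitrary):

X = torch.ones(10)
samples = torch.stack([dropout_layer(X, 0.5) for _ in range(10000)])
print(samples.mean(dim=0))  # every entry is close to 1.0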
Kmeans implementation.
import numpy as np
import matplotlib.pyplot as plt

class Kmeans:
    def __init__(self, k, times):
        self.k = k          # number of clusters
        self.times = times  # number of iterations

    def fit(self, X):
        X = np.asarray(X, dtype=float)
        # Initialize centers with k distinct points sampled from X
        self.cluster_centers_ = X[np.random.choice(len(X), size=self.k, replace=False)]
        self.labels_ = np.zeros(len(X))
        for t in range(self.times):
            # Assignment step: each point goes to its nearest center
            for index, x in enumerate(X):
                dis = np.sqrt(np.sum((x - self.cluster_centers_) ** 2, axis=1))
                self.labels_[index] = dis.argmin()
            # Update step: move each center to the mean of its points
            for i in range(self.k):
                self.cluster_centers_[i] = np.mean(X[self.labels_ == i], axis=0)

    def predict(self, X):
        X = np.asarray(X)
        results = np.zeros(len(X))
        for index, x in enumerate(X):
            dis = np.sqrt(np.sum((x - self.cluster_centers_) ** 2, axis=1))
            results[index] = dis.argmin()
        return results

kmeans = Kmeans(3, 200)
train = np.random.randint(0, 100, size=(500, 2))
test = np.random.randint(0, 100, size=(100, 2))
kmeans.fit(train)
preds = kmeans.predict(test)
for i in range(3):
    plt.scatter(test[preds == i][:, 0], test[preds == i][:, 1], label=f'cluster {i}')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], marker='+', s=300)
plt.legend()
plt.show()