1、
# Masked normalisation along axis 2: only positions kept by the mask share
# the probability mass of each row.
pos_mask_matrix = pos_mask_matrix.float().clamp(0, 1)  # force mask values into [0, 1]
exp_u_pos = u_pos.exp()
delta_exp_u_pos = exp_u_pos * pos_mask_matrix  # masked-out positions become 0
row_totals = delta_exp_u_pos.sum(dim=2)
# Repeat every row total along a new axis 2 so it lines up with delta_exp_u_pos.
sum_delta_exp_u_pos = torch.stack([row_totals for _ in range(delta_exp_u_pos.shape[2])], dim=2)
p_pos = delta_exp_u_pos / (sum_delta_exp_u_pos + 1e-10)  # epsilon guards all-masked rows
import torch
# Toy walk-through of the masked normalisation on two identical 3x3 samples.
_scores = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
exp_u_pos = torch.tensor([_scores, _scores])
_mask = [[1.0, 0.0, 1.0], [0.0, 1.0, 1.0], [1.0, 1.0, 1.0]]
pos_mask_matrix = torch.tensor([_mask, _mask])
print(exp_u_pos.shape)
print(pos_mask_matrix.shape)

# Keep only the scores of the relevant n-grams; every other slot becomes 0. -> [2, 3, 3]
delta_exp_u_pos = exp_u_pos * pos_mask_matrix
_last_axis_len = delta_exp_u_pos.shape[2]
_row_sums = torch.sum(delta_exp_u_pos, 2)  # sum of the kept scores per row -> [2, 3]
sum_delta_exp_u_pos = torch.stack([_row_sums] * _last_axis_len, 2)  # -> [2, 3, 3]
# Divide every row by its own sum to obtain the attention weights.
attention = delta_exp_u_pos / (sum_delta_exp_u_pos + 1e-10)

print("delta_exp_u_pos:\n", delta_exp_u_pos)
print("torch.sum(delta_exp_u_pos):\n", _row_sums)
print("res:\n", [_row_sums] * _last_axis_len)
print(sum_delta_exp_u_pos.shape)
print("sum_delta_exp_u_pos:\n", sum_delta_exp_u_pos)
print("attention :\n", attention)
class MultiChannelAttention(nn.Module):
    """Attend from every character to its matched n-grams, one channel at a time.

    Each channel holds its own n-gram sequence. For every character the
    n-gram embeddings are averaged with masked-softmax weights, each channel
    is scaled by a learned softmax weight, and the channels are concatenated
    along the feature axis.
    """

    def __init__(self, ngram_size, hidden_size, cat_num):
        """
        Args:
            ngram_size: size of the n-gram vocabulary (index 0 is padding).
            hidden_size: embedding width; must match the last dimension of
                the ``hidden_state`` passed to ``forward``.
            cat_num: number of channels that own a learnable weight.
        """
        super().__init__()
        self.word_embedding = nn.Embedding(ngram_size, hidden_size, padding_idx=0)
        self.channel_weight = nn.Embedding(cat_num, 1)
        self.temper = hidden_size ** 0.5  # scaled dot-product temperature

    def forward(self, word_seq, hidden_state, char_word_mask_matrix, channel_ids):
        """
        Args:
            word_seq: n-gram ids, (batch_size, channel, word_seq_len).
            hidden_state: character states,
                (batch_size, character_seq_len, hidden_size).
            char_word_mask_matrix: character/n-gram matching matrix,
                (batch_size, channel, character_seq_len, word_seq_len);
                values are clamped into [0, 1] and 0 means "not related".
            channel_ids: channel indices, (batch_size, channel).

        Returns:
            Tensor of shape
            (batch_size, character_seq_len, channel * hidden_size).
        """
        batch_size = hidden_state.shape[0]

        # (batch_size, channel, word_seq_len, hidden_size)
        embedding = self.word_embedding(word_seq)

        # Scaled dot-product scores. unsqueeze(1) lets matmul broadcast the
        # character states over the channel axis instead of copying them.
        # (batch_size, channel, character_seq_len, word_seq_len)
        u = torch.matmul(hidden_state.unsqueeze(1), embedding.transpose(-1, -2)) / self.temper

        # Non-zero entries mark the n-grams related to the current character.
        mask = torch.clamp(char_word_mask_matrix, 0, 1).to(u.dtype)

        # Masked softmax, computed stably: drop masked scores *before*
        # exponentiating and shift by the per-row maximum, so that a large
        # score can no longer overflow to inf (and inf * 0 to NaN).
        masked_u = u.masked_fill(mask == 0, float("-inf"))
        row_max = masked_u.max(dim=-1, keepdim=True).values
        # Rows with no related n-gram have max == -inf; shift those by 0.
        row_max = row_max.masked_fill(row_max == float("-inf"), 0.0)
        delta_exp_u = torch.exp(masked_u - row_max) * mask
        # keepdim=True broadcasts the row sums; the epsilon keeps fully
        # masked rows at exactly zero instead of dividing by zero.
        attention = delta_exp_u / (delta_exp_u.sum(dim=-1, keepdim=True) + 1e-10)

        # Weighted sum of n-gram embeddings per character.
        # (batch_size, channel, character_seq_len, hidden_size)
        character_attention = torch.matmul(attention, embedding)

        # One scalar per channel, normalised across the channel axis.
        channel_w = torch.softmax(self.channel_weight(channel_ids), dim=1)
        channel_w = channel_w.reshape(batch_size, -1, 1, 1)
        character_attention = character_attention * channel_w  # broadcast over chars/features

        # (batch_size, character_seq_len, channel, hidden_size) -> concatenate channels
        character_attention = character_attention.permute(0, 2, 1, 3)
        return character_attention.flatten(start_dim=2)
# Note
torch.stack([h1,h2], dim=k) # joins h1 and h2 along a NEW dimension inserted at index k
torch.stack([h1]*channel, dim=k) # repeats h1 `channel` times along a new dimension inserted at index k
2、pytorch中的交叉熵计算
https://blog.csdn.net/qq_28418387/article/details/95918829
import torch
import torch.nn.functional as F
# Token-level cross entropy computed only on positions kept by the attention mask.
slot_loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100)

# hidden state
slot_logits = torch.tensor([
    [[0.1, 0.2, 0.3], [0.5, 0.6, 0.4], [0.9, 0.8, 0.7]],
    [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]],
])
attention_mask = torch.tensor([[1, 0, 0], [1, 1, 0]])   # [2, 3]
slot_labels_ids = torch.tensor([[1, 0, 0], [2, 1, 0]])  # [2, 3]
# There are 3 labels, so slot_labels_ids may only contain 0, 1, 2 or -100.
num_slot_labels = 3

active_loss = attention_mask.reshape(-1).eq(1)
flat_logits = slot_logits.reshape(-1, num_slot_labels)
# flat_logits has one row per token (6 rows here); the mask keeps rows 0, 3 and 4:
#   [0.1, 0.2, 0.3], [0.1, 0.2, 0.3], [0.4, 0.5, 0.6]
active_logits = flat_logits[active_loss]
flat_labels = slot_labels_ids.reshape(-1)
active_labels = flat_labels[active_loss]  # real label
slot_loss = slot_loss_fct(active_logits, active_labels)

print(active_logits)
# To inspect the per-class log-probabilities: F.log_softmax(active_logits, dim=1)
print(active_labels)
print(slot_loss)
3、
.join(): 连接字符串数组
4、fastnlp 词表的前两位是什么?
# Show which tokens occupy the first six ids of the fastNLP 'chars' vocabulary.
vocab = databundle.get_vocab('chars')
for token_id in range(6):
    print(vocab.to_word(token_id))
'''
result:
0--><pad>
1--><unk>
2--,
3--的
4--0
5--。
'''
5、广播的计算方法
import torch
import torch.nn as nn
# Broadcasting demo: a per-sample scale of shape [2, 1, 1] multiplies a [2, 2, 3] tensor.
a = torch.tensor([[[0.1]], [[0.2]]])
_rows = [[1, 2, 3], [4, 5, 6]]
b = torch.tensor([_rows, _rows])
print(a.size())
print(b.size())
# `a` is stretched along its two size-1 axes to match `b`.
res = b * a
print(res)
torch.Size([2, 1, 1])
torch.Size([2, 2, 3])
'''
tensor([[[0.1000, 0.2000, 0.3000],
[0.4000, 0.5000, 0.6000]],
[[0.2000, 0.4000, 0.6000],
[0.8000, 1.0000, 1.2000]]])
'''
6、
激活函数作用:
1、完成数据的非线性变换,解决线性模型的表达、分类能力不足的问题;
(改变之前数据的线性关系,如果网络中全部是线性变换,则多层网络可以通过矩阵变换,直接转换成一层神经网络。所以激活函数的存在,使得神经网络的“多层”有了实际的意义,使网络更加强大,增加网络的能力,使它可以学习复杂的事物,复杂的数据,以及表示输入输出之间非线性的复杂的任意函数映射。)
2、执行数据的归一化,将输入数据映射到某个范围内,再往下传递,这样做的好处是可以限制数据的扩张,防止数据过大导致的溢出风险。
在少量层结构中, 我们可以尝试很多种不同的激励函数. 在卷积神经网络 Convolutional neural networks 的卷积层中, 推荐的激励函数是 relu. 在循环神经网络中 recurrent neural networks, 推荐的是 tanh 或者是 relu
7、句子中的token被切分时,该怎么获得有效的token
sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
if valid_ids is not None:
    # Keep only the positions flagged with valid_ids == 1 and pack them to
    # the left of a zero tensor, so each sample keeps the shape of
    # sequence_output and unused trailing slots stay zero.
    batch_size, max_len, feat_dim = sequence_output.shape
    valid_output = torch.zeros(batch_size, max_len, feat_dim, dtype=sequence_output.dtype, device=input_ids.device)
    for i in range(batch_size):
        temp = sequence_output[i][valid_ids[i] == 1]  # (num_valid, feat_dim)
        valid_output[i][:temp.size(0)] = temp
    # valid_output only exists on this branch, so dropout belongs inside the `if`.
    sequence_output = self.dropout(valid_output)
...
8、在网络中固定部分参数进行训练
参考链接
https://blog.csdn.net/special_hang/article/details/89676432
class RESNET_attention(nn.Module):
    """Backbone with frozen weights plus freshly added, trainable layers.

    Only the constructor is shown in these notes. It demonstrates how to
    freeze part of a network: parameters that exist when the freezing loop
    runs stop receiving gradients, while layers created afterwards keep the
    default ``requires_grad=True``.
    """

    def __init__(self, model, pretrained):
        """
        Args:
            model: callable that builds the backbone, invoked as
                ``model(pretrained)``.
            pretrained: forwarded unchanged to ``model``.
        """
        # The original spelled the class name wrong in super(...), which
        # raised NameError as soon as the class was instantiated.
        super().__init__()
        self.resnet = model(pretrained)
        # At this point the backbone is the only registered sub-module, so
        # this freezes exactly the backbone parameters.
        for p in self.parameters():
            p.requires_grad = False
        # Everything below is created after the freeze and stays trainable.
        self.f = nn.Conv2d(2048, 512, 1)
        self.g = nn.Conv2d(2048, 512, 1)
        self.h = nn.Conv2d(2048, 2048, 1)
        self.softmax = nn.Softmax(-1)
        self.gamma = nn.Parameter(torch.FloatTensor([0.0]))
        self.avgpool = nn.AvgPool2d(7, stride=1)
        # Replacing the head swaps in a new, trainable classifier.
        self.resnet.fc = nn.Linear(2048, 10)
# Pass the optimizer only the parameters that still require gradients; frozen ones are filtered out
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.0001, betas=(0.9, 0.999), eps=1e-08, weight_decay=1e-5)
9、运行应用程序后,使用以下命令来清除缓存
'''
释放缓存分配器当前持有的所有未占用的缓存内存,以便这些内存可以在其他GPU应用程序中使用;
并且可以使用nvidia-smi查看
'''
torch.cuda.empty_cache()
9、DataLoader中num_workers的作用
https://www.cnblogs.com/hesse-summer/p/11343870.html
https://stackoverflow.com/questions/53998282/how-does-the-number-of-workers-parameter-in-pytorch-dataloader-actually-work
注:将数据转移到GPU上并不是DataLoader的工作(job)。
10、transformers中的bertTokenizer
from transformers import BertTokenizer
# tokenize / convert_tokens_to_ids / encode are instance methods, so a
# tokenizer object has to be created first; calling them on the class fails.
# NOTE(review): "bert-base-chinese" is a placeholder checkpoint - replace it
# with the vocabulary / model actually used.
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
token = tokenizer.tokenize(text)  # split the text into tokens
token_ids = tokenizer.convert_tokens_to_ids(token)  # tokens -> ids, without [CLS] and [SEP]
# ---------------------------------------
token_ids = tokenizer.encode(text)  # tokenize and convert to ids in one step, with [CLS] and [SEP]
11、将json字符串美观地写入文件中
# The context manager closes the file even when json.dump raises.
# indent=4 pretty-prints; ensure_ascii=False keeps non-ASCII text readable.
with open(tgt, 'w', encoding='utf-8') as g:
    json.dump(triples, fp=g, indent=4, ensure_ascii=False)
11、输入文本为"小黑 在 清华",do_basic_tokenize=False很关键!
BERTTokenizer(vocab_path, do_lower_case=False, do_basic_tokenize=False)
12、outputs, (hn, cn) = self.lstm(input)
outputs表示每个字符的隐藏层向量,hn表示最后一个字符的隐藏层向量。
'''
如,当输入batch_size=1时,input=[30, 1, 100], 即[seq_len, batch_size, hidden]
那么当模型为Bi-LSTM时,
outputs=[30, 1, 200] # 维度200表示前向100,拼接后向100
hn=[1, 200]
cn=[1, 200]
并且
outputs[-1, 0, 0:100] = hn[0, 0:100]
outputs[ 0, 0, 100:200] = hn[0, 100:200]
因为前向最后一个字符是从左到右计算的,而后向最后一个字符是从右往左计算的。
'''
13、计算词向量时用到的“负采样”方法是指:用与“正样本”相同的上下文词,再在字典中随机选择一个单词。
例如:
给定一句话“这是去上学的班车”,则对这句话进行正采样,得到上下文“上”和目标词“学”,则这两个字就是正样本。
负样本的采样需要选定同样的“上”,然后在训练的字典中任意取另一个字,“梦”,这一对就构成负样本。
14、在指定路径下创建虚拟环境
'''
conda create -p ~/.conda/envs/env_name python=3.7
'''
15、在[0, 10, 1.1, 1.1, 1.1, 1.1, 1.1]中,位置[1]由于权重较大,所以被采样的次数较多。
import torch
from torch.utils.data.sampler import WeightedRandomSampler
# Per-index sampling weights: index 0 has weight 0, index 1 has weight 10,
# and every remaining index has weight 1.1.
weights = torch.Tensor([0, 10] + [1.1] * 5)
wei_sampler = WeightedRandomSampler(weights=weights, num_samples=10, replacement=True)
drawn = list(wei_sampler)
print(drawn)
# [1, 1, 1, 4, 1, 4, 4, 1, 1, 6]
16、key-value memory network的核心要素:
(1)对key、value都是用nn.Embedding()进行随机初始化;
(2)key一般是字符;
(3)使用字符的hidden与字符的key相乘,然后加入value,即可得到最后的结果。
17、PRGC三元组模型中,如何得到Global Correspondence
import torch
torch.manual_seed(1)
sequence_output = torch.tensor([
    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
    [[5, 2, 1], [8, 9, 3], [3, 7, 6]],
])
batch, seq_len, hidden = sequence_output.shape
print(sequence_output)
# Build a seq_len x seq_len grid of token pairs per sample:
# sub_extend[b, i, j] is token i, obj_extend[b, i, j] is token j.
sub_extend = sequence_output[:, :, None, :].expand(batch, seq_len, seq_len, hidden)
obj_extend = sequence_output[:, None, :, :].expand(batch, seq_len, seq_len, hidden)
print(sub_extend)
print(obj_extend)
# Concatenate the two views on the feature axis -> [batch, seq_len, seq_len, 2 * hidden]
corres_pred = torch.cat((sub_extend, obj_extend), dim=-1)
print(corres_pred)
18、torch.where(a, b, c)
import torch
rel_threshold = 0.5
rel_pred = torch.tensor([[0.1, 0.2, 0.6, 0.7], [0.8, 0.9, 0.1, 0.2]])
# torch.where(a, b, c): take the element from b where a holds, otherwise from c.
above_threshold = rel_pred > rel_threshold
rel_pred_onehot = torch.where(above_threshold, torch.ones_like(rel_pred), torch.zeros_like(rel_pred))
# Row and column indices of the entries set to 1:
#   tensor([0, 0, 1, 1]) tensor([2, 3, 0, 1])
bs_idxs, pred_rels = rel_pred_onehot.nonzero(as_tuple=True)
print(rel_pred_onehot)
print(bs_idxs, pred_rels)

condition = torch.tensor([True, True, False, False, True])
candidates = torch.tensor([1, 2, 3, 4, 5])
fallback = torch.tensor(-100)  # 0-dim tensor, broadcast to every False position
res = torch.where(condition, candidates, fallback)
print(res)