Representing word vectors with BERT via mean, first-subtoken, and last-subtoken pooling (1)

1. Word preprocessing.

import torch
from typing import List

from torch.nn.utils.rnn import pad_sequence
from transformers import AutoTokenizer


def encoder_texts(texts: List[List[str]], tokenizer):
    # Longest word (in characters) across all sentences; every word is padded to this fix_len.
    fix_len = max([max([len(word) for word in text]) for text in texts])

    matrix = []
    for text in texts:
        vector = []

        text = [tokenizer.cls_token, *text, tokenizer.sep_token]
        input_ids = tokenizer.batch_encode_plus(
            text,
            add_special_tokens=False,
        )['input_ids']

        for _input_ids in input_ids:
            # Guard against words that tokenize to nothing, e.g. texts = [['\ue5f1\ue5f1\ue5f1\ue5f1']]
            _input_ids = _input_ids or [tokenizer.unk_token_id]
            vector.append(_input_ids + (fix_len - len(_input_ids)) * [tokenizer.pad_token_id])
        matrix.append(torch.tensor(vector, dtype=torch.long))
    return pad_sequence(matrix, batch_first=True)


if __name__ == '__main__':
    texts = [
        ['我', '爱中国'],
        ['我', '爱', '中国']
    ]
    tokenizer = AutoTokenizer.from_pretrained('ckiplab/albert-tiny-chinese')
    print(encoder_texts(texts, tokenizer))

output:

tensor([[[ 101,    0,    0],
         [2769,    0,    0],
         [4263,  704, 1744],
         [   0,    0,    0]],

        [[ 101,    0,    0],
         [2769,    0,    0],
         [4263,    0,    0],
         [ 704, 1744,    0]]])
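
The matrix above is what the title's three pooling strategies operate on. As a minimal sketch (not part of the original code), the snippet below shows what first, last, and mean pooling over subword pieces look like; the hidden states are random stand-ins for real BERT outputs and hidden_size=8 is an arbitrary assumption, so only the shape handling matters.

import torch

# Subword-id matrix of the first sentence from step 1: [batch_size=1, seq_len=4, fix_len=3].
subwords = torch.tensor([[[ 101,    0,    0],
                          [2769,    0,    0],
                          [4263,  704, 1744],
                          [   0,    0,    0]]])
mask = subwords.ne(0)                                  # pad_token_id is assumed to be 0 here
hidden = torch.randn(1, 4, 3, 8)                       # stand-in piece embeddings [batch, seq_len, fix_len, hidden]
lens = mask.sum(-1).clamp(min=1)                       # number of pieces per word (all-pad rows clamped to 1)

first = hidden[:, :, 0]                                # first subtoken of each word
index = (lens - 1).unsqueeze(-1).unsqueeze(-1).expand(-1, -1, 1, 8)
last = hidden.gather(2, index).squeeze(2)              # last subtoken of each word
mean = (hidden * mask.unsqueeze(-1)).sum(2) / lens.unsqueeze(-1)   # mean over the real pieces
print(first.shape, last.shape, mean.shape)             # each is [1, 4, 8]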

2. Transformer layer.

# -*- coding: utf-8 -*-

import torch
import torch.nn as nn
from supar.modules.scalar_mix import ScalarMix
from supar.utils.fn import pad

# pad is equivalent to pad_sequence.

# from torch.nn.utils.rnn import pad_sequence

class TransformerEmbedding(nn.Module):
    r"""
    A module that directly utilizes the pretrained models in `transformers`_ to produce BERT representations.
    While mainly tailored to provide input preparation and post-processing for the BERT model,
    it is also compatible with other pretrained language models like XLNet, RoBERTa and ELECTRA, etc.

    Args:
        model (str):
            Path or name of the pretrained models registered in `transformers`_, e.g., ``'bert-base-cased'``.
        n_layers (int):
            The number of BERT layers to use. If 0, uses all layers.
        n_out (int):
            The requested size of the embeddings. If 0, uses the size of the pretrained embedding model. Default: 0.
        stride (int):
            A sequence longer than max length will be split into several small pieces
            with a window size of ``stride``. Default: 256.
        pooling (str):
            Pooling way to get from token piece embeddings to token embedding.
            ``first``: take the first subtoken. ``last``: take the last subtoken. ``mean``: take a mean over all.
            Default: ``mean``.
        pad_index (int):
            The index of the padding token in the BERT vocabulary. Default: 0.
        dropout (float):
            The dropout ratio of BERT layers. Default: 0. This value will be passed into the :class:`ScalarMix` layer.
        requires_grad (bool):
            If ``True``, the model parameters will be updated together with the downstream task. Default: ``True``.

    .. _transformers:
        https://github.com/huggingface/transformers
    """

    def __init__(self, model, n_layers=4, n_out=0, stride=256, pooling='mean', pad_index=0, dropout=0, requires_grad=True):
        super().__init__()

        from transformers import AutoConfig, AutoModel, AutoTokenizer
        self.bert = AutoModel.from_pretrained(model, config=AutoConfig.from_pretrained(model, output_hidden_states=True))
        self.bert = self.bert.requires_grad_(requires_grad)

        self.model = model
        self.n_layers = n_layers or self.bert.config.num_hidden_layers
        self.hidden_size = self.bert.config.hidden_size
        self.n_out = n_out or self.hidden_size
        self.stride = stride
        self.pooling = pooling
        self.pad_index = pad_index
        self.dropout = dropout
        self.requires_grad = requires_grad
        self.max_len = int(max(0, self.bert.config.max_position_embeddings) or 1e12) - 2

        self.tokenizer = AutoTokenizer.from_pretrained(model)

        self.scalar_mix = ScalarMix(self.n_layers, dropout)
        self.projection = nn.Linear(self.hidden_size, self.n_out, False) if self.hidden_size != n_out else nn.Identity()

    def __repr__(self):
        s = f"{self.model}, n_layers={self.n_layers}, n_out={self.n_out}, "
        s += f"stride={self.stride}, pooling={self.pooling}, pad_index={self.pad_index}"
        if self.dropout > 0:
            s += f", dropout={self.dropout}"
        if self.requires_grad:
            s += f", requires_grad={self.requires_grad}"

        return f"{self.__class__.__name__}({s})"

    def forward(self, subwords):
        r"""
        Args:
            subwords (~torch.Tensor): ``[batch_size, seq_len, fix_len]``.

        Returns:
            ~torch.Tensor:
                BERT embeddings of shape ``[batch_size, seq_len, n_out]``.
        """

        mask = subwords.ne(self.pad_index)
        lens = mask.sum((1, 2))
        # [batch_size, n_subwords]
        subwords = pad(subwords[mask].split(lens.tolist()), self.pad_index, padding_side=self.tokenizer.padding_side)
        bert_mask = pad(mask[mask].split(lens.tolist()), 0, padding_side=self.tokenizer.padding_side)

        # return the hidden states of all layers
        bert = self.bert(subwords[:, :self.max_len], attention_mask=bert_mask[:, :self.max_len].float())[-1]
        # [n_layers, batch_size, max_len, hidden_size]
        bert = bert[-self.n_layers:]
        # [batch_size, max_len, hidden_size]
        bert = self.scalar_mix(bert)
        # [batch_size, n_subwords, hidden_size]
        for i in range(self.stride, (subwords.shape[1]-self.max_len+self.stride-1)//self.stride*self.stride+1, self.stride):
            part = self.bert(subwords[:, i:i+self.max_len], attention_mask=bert_mask[:, i:i+self.max_len].float())[-1]
            bert = torch.cat((bert, self.scalar_mix(part[-self.n_layers:])[:, self.max_len-self.stride:]), 1)

        # [batch_size, seq_len]
        bert_lens = mask.sum(-1)
        bert_lens = bert_lens.masked_fill_(bert_lens.eq(0), 1)
        # [batch_size, seq_len, fix_len, hidden_size]
        embed = bert.new_zeros(*mask.shape, self.hidden_size).masked_scatter_(mask.unsqueeze(-1), bert[bert_mask])
        # [batch_size, seq_len, hidden_size]
        if self.pooling == 'first':
            embed = embed[:, :, 0]
        elif self.pooling == 'last':
            embed = embed.gather(2, (bert_lens-1).unsqueeze(-1).repeat(1, 1, self.hidden_size).unsqueeze(2)).squeeze(2)
        else:
            embed = embed.sum(2) / bert_lens.unsqueeze(-1)
        embed = self.projection(embed)

        return embed

At this point we have word-level output; checking embed.shape is enough to confirm it. These word embeddings can then be concatenated with other inputs to build a multi-input model, as sketched below.
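
Putting the two steps together, a minimal usage sketch could look like the following. It assumes supar is installed (for the ScalarMix and pad imports above), reuses encoder_texts from step 1 and TransformerEmbedding from step 2, and simply reuses the 'ckiplab/albert-tiny-chinese' model name from the earlier example.

from transformers import AutoTokenizer

if __name__ == '__main__':
    texts = [
        ['我', '爱中国'],
        ['我', '爱', '中国']
    ]
    tokenizer = AutoTokenizer.from_pretrained('ckiplab/albert-tiny-chinese')
    # [batch_size, seq_len, fix_len] subword-id matrix from step 1
    subwords = encoder_texts(texts, tokenizer)

    embedder = TransformerEmbedding('ckiplab/albert-tiny-chinese',
                                    n_layers=4,
                                    pooling='mean',
                                    pad_index=tokenizer.pad_token_id)
    # [batch_size, seq_len, n_out]: one vector per word (plus the [CLS]/[SEP] positions)
    embed = embedder(subwords)
    print(embed.shape)

Switching pooling to 'first' or 'last' selects the first or last subtoken of each word instead of averaging over all of its pieces.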
