I. Paper Reading
II. Code Walkthrough
1. Processing the input data
class LEBertProcessor(Processor):
    def __init__(self, args, tokenizer):
By default, the author loads only the first 10,000 words (not characters) of the pretrained word embeddings to build the trie and to update its maximum depth. The method below returns word_embed_dict (a word-to-vector dict), word_list (the first 10,000 words), and word_embed_dim (the embedding dimension).
@classmethod
def load_word_embedding(cls, word_embed_path, max_scan_num):
    """
    todo: many entries are single characters; consider whether to drop them
    Load the first max_scan_num word vectors and return the word list
    :return:
    """
    logger.info('loading word embedding from pretrain')
    word_embed_dict = dict()
    word_list = list()
    with open(word_embed_path, 'r', encoding='utf8') as f:
        for idx, line in tqdm(enumerate(f)):
            # only scan the first max_scan_num word vectors
            if idx > max_scan_num:
                break
            items = line.strip().split()
            if idx == 0:
                # the first line of the file is a header: "<num_words> <embedding_dim>"
                assert len(items) == 2
                num_embed, word_embed_dim = items
                num_embed, word_embed_dim = int(num_embed), int(word_embed_dim)
            else:
                assert len(items) == word_embed_dim + 1
                word = items[0]
                embedding = np.empty([1, word_embed_dim])
                embedding[:] = items[1:]
                word_embed_dict[word] = embedding
                word_list.append(word)
    logger.info('word_embed_dim:{}'.format(word_embed_dim))
    logger.info('size of word_embed_dict:{}'.format(len(word_embed_dict)))
    logger.info('size of word_list:{}'.format(len(word_list)))
    return word_embed_dict, word_list, word_embed_dim
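The asserts above imply an embedding file whose first line is a "<num_words> <dim>" header, followed by one word and its vector per line. As a minimal sketch (the words and numbers below are made up for illustration), such a file could be prepared and loaded like this:

# Toy embedding file in the layout load_word_embedding expects (hypothetical content).
sample = """3 4
中国 0.1 0.2 0.3 0.4
重庆 0.5 0.6 0.7 0.8
药店 0.9 1.0 1.1 1.2
"""
with open('toy_word_embedding.txt', 'w', encoding='utf8') as f:
    f.write(sample)

# word_embed_dict, word_list, word_embed_dim = LEBertProcessor.load_word_embedding(
#     'toy_word_embedding.txt', max_scan_num=10000)
# word_embed_dim would be 4 and word_list would be ['中国', '重庆', '药店']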
Building the trie: max_depth is the length of the longest inserted word.
import collections


class TrieNode:
    def __init__(self):
        self.children = collections.defaultdict(TrieNode)
        self.is_word = False


class Trie:
    """
    In fact, this Trie is a letter tree.
    root is a fake node; its only function is to mark the beginning of a word, like <bow>.
    The first layer holds every possible first letter of a word; for example, for '中国'
    the first letter is '中'.
    The second layer holds every possible second letter, and so on.
    """
    def __init__(self, use_single=True):
        self.root = TrieNode()
        self.max_depth = 0
        if use_single:
            self.min_len = 0
        else:
            self.min_len = 1

    def insert(self, word):
        current = self.root
        deep = 0
        for letter in word:
            current = current.children[letter]
            deep += 1
        current.is_word = True
        if deep > self.max_depth:
            self.max_depth = deep

    def search(self, word):
        current = self.root
        for letter in word:
            current = current.children.get(letter)
            if current is None:
                return False
        return current.is_word

    def enumerateMatch(self, chars, space=""):
        """
        Args:
            chars: the character sequence to match (a mutable list; characters are removed from its end)
        Return:
            the matched words; if multi-character words exist, single-character words are filtered out
        """
        matched = []
        while len(chars) > self.min_len:
            if self.search(chars):
                matched.insert(0, space.join(chars[:]))  # shorter words always come first
            del chars[-1]
        if len(matched) > 1 and len(matched[0]) == 1:  # filter single-character words
            matched = matched[1:]
        return matched
@classmethod
def build_trie_tree(cls, word_list, save_path):
    """
    # todo: consider not adding single characters to the trie
    Build the trie
    :return:
    """
    logger.info('building trie tree')
    trie_tree = Trie()
    for word in word_list:
        trie_tree.insert(word)
    write_pickle(trie_tree, save_path)
    return trie_tree
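A quick usage sketch of the Trie above, with a handful of made-up lexicon words rather than the 10,000-word embedding vocabulary:

# Minimal usage sketch of the Trie defined above; the inserted words are toy examples.
trie = Trie()
for w in ['中', '中国', '中国人']:
    trie.insert(w)

print(trie.max_depth)                         # 3, the length of the longest inserted word
print(trie.search('中国'))                     # True
# enumerateMatch expects a mutable character list, since it deletes from the end.
print(trie.enumerateMatch(list('中国人民')))    # ['中国', '中国人'] -- the single-character match '中' is filtered out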
Getting the input data:
def get_input_data(self, file):
    lines = load_lines(file)
    features = []
    cls_token_id = self.tokenizer.cls_token_id
    sep_token_id = self.tokenizer.sep_token_id
    pad_token_id = self.tokenizer.pad_token_id
    o_label_id = self.label_vocab.convert_token_to_id('O')
    pad_label_id = self.label_vocab.convert_token_to_id('[PAD]')
    for line in tqdm(lines):
        data = json.loads(line)
        text = data['text']
        labels = data['label']
        char_index2words = self.get_char2words(text)
        # add [CLS] at the beginning and [SEP] at the end
        input_ids = [cls_token_id] + self.tokenizer.convert_tokens_to_ids(text) + [sep_token_id]
        label_ids = [o_label_id] + self.label_vocab.convert_tokens_to_ids(labels) + [o_label_id]
        word_ids_list = []
        word_pad_id = self.word_vocab.convert_token_to_id('[PAD]')
        # for each character, keep at most max_word_num (default 3) matched words,
        # convert them to word ids, and pad the missing slots
        for words in char_index2words:
            words = words[:self.max_word_num]
            word_ids = self.word_vocab.convert_tokens_to_ids(words)
            word_pad_num = self.max_word_num - len(words)
            word_ids = word_ids + [word_pad_id] * word_pad_num
            word_ids_list.append(word_ids)
        # pad the positions of [CLS] and [SEP]
        word_ids_list = [[word_pad_id] * self.max_word_num] + word_ids_list + [[word_pad_id] * self.max_word_num]
        # truncate
        if len(input_ids) > self.max_seq_len:
            input_ids = input_ids[: self.max_seq_len]
            label_ids = label_ids[: self.max_seq_len]
            word_ids_list = word_ids_list[: self.max_seq_len]
        input_mask = [1] * len(input_ids)
        token_type_ids = [0] * len(input_ids)
        assert len(input_ids) == len(label_ids) == len(word_ids_list)
        # padding up to max_seq_len
        padding_length = self.max_seq_len - len(input_ids)
        input_ids += [pad_token_id] * padding_length
        input_mask += [0] * padding_length
        token_type_ids += [0] * padding_length
        label_ids += [pad_label_id] * padding_length
        word_ids_list += [[word_pad_id] * self.max_word_num] * padding_length

        text = ''.join(text)
        input_ids = torch.LongTensor(input_ids)
        label_ids = torch.LongTensor(label_ids)
        input_mask = torch.LongTensor(input_mask)
        token_type_ids = torch.LongTensor(token_type_ids)
        word_ids = torch.LongTensor(word_ids_list)
        word_mask = (word_ids != word_pad_id).long()  # 1 for real words, 0 for padded slots
        feature = {
            'text': text, 'input_ids': input_ids, 'attention_mask': input_mask, 'token_type_ids': token_type_ids,
            'word_ids': word_ids, 'word_mask': word_mask, 'label_ids': label_ids
        }
        features.append(feature)
    return features
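For intuition, here is a toy re-run of just the per-character padding step, with hypothetical word ids and max_word_num = 3 (not repo code). Every example ends up with a word_ids tensor of shape [max_seq_len, max_word_num]:

# Toy illustration of the per-character word-id padding in get_input_data,
# assuming max_word_num = 3 and a hypothetical '[PAD]' word id of 0.
max_word_num = 3
word_pad_id = 0
char_index2word_ids = [[17, 42], [17], []]  # made-up ids for three characters
word_ids_list = []
for word_ids in char_index2word_ids:
    word_ids = word_ids[:max_word_num]
    word_ids_list.append(word_ids + [word_pad_id] * (max_word_num - len(word_ids)))
print(word_ids_list)  # [[17, 42, 0], [17, 0, 0], [0, 0, 0]]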
Getting the word list for each character. max_depth is the length of the longest word in the trie built from the first 10,000 words, and it bounds the match window.
def get_char2words(self, text):
    """
    Get, for each character, the list of lexicon words that cover it
    :param text:
    :return:
    """
    text_len = len(text)
    char_index2words = [[] for _ in range(text_len)]
    for idx in range(text_len):
        sub_sent = text[idx:idx + self.trie_tree.max_depth]  # speed up by only looking ahead max_depth characters
        words = self.trie_tree.enumerateMatch(sub_sent)  # find all lexicon words starting at text[idx]
        for word in words:
            start_pos = idx
            end_pos = idx + len(word)
            for i in range(start_pos, end_pos):
                char_index2words[i].append(word)
    # todo truncation
    # for i, words in enumerate(char_index2words):
    #     char_index2words[i] = char_index2words[i][:self.max_word_num]
    return char_index2words
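Putting the trie and this matching loop together, a self-contained sketch of the mapping (toy lexicon words; the repo builds the trie from the embedding vocabulary):

# End-to-end sketch of the character-to-words mapping, reusing the Trie class above.
trie = Trie()
for w in ['重庆', '重庆人', '药店']:  # toy lexicon
    trie.insert(w)

text = list('重庆人和药店')
char_index2words = [[] for _ in range(len(text))]
for idx in range(len(text)):
    sub_sent = text[idx:idx + trie.max_depth]
    for word in trie.enumerateMatch(sub_sent):
        for i in range(idx, idx + len(word)):
            char_index2words[i].append(word)

print(char_index2words)
# [['重庆', '重庆人'], ['重庆', '重庆人'], ['重庆人'], [], ['药店'], ['药店']]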
2. Word embedding construction (WordEmbeddingAdapter)
import torch
import torch.nn.functional as F
from torch import nn


class WordEmbeddingAdapter(nn.Module):

    def __init__(self, config):
        super(WordEmbeddingAdapter, self).__init__()
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.tanh = nn.Tanh()
        self.linear1 = nn.Linear(config.word_embed_dim, config.hidden_size)
        self.linear2 = nn.Linear(config.hidden_size, config.hidden_size)

        attn_W = torch.zeros(config.hidden_size, config.hidden_size)
        self.attn_W = nn.Parameter(attn_W)
        self.attn_W.data.normal_(mean=0.0, std=config.initializer_range)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, layer_output, word_embeddings, word_mask):
        """
        :param layer_output: output of a BERT layer, [b_size, len_input, d_model]
        :param word_embeddings: word vectors matched to each character, [b_size, len_input, num_word, d_word]
        :param word_mask: attention mask over each character's matched words, [b_size, len_input, num_word]
        """
        # transform
        # to keep the word vectors and character vectors in the same dimension, the word vectors
        # of each char-words pair go through a non-linear transformation into the model dimension
        word_outputs = self.linear1(word_embeddings)
        word_outputs = self.tanh(word_outputs)
        word_outputs = self.linear2(word_outputs)
        word_outputs = self.dropout(word_outputs)  # word_outputs: [b_size, len_input, num_word, d_model]

        # compute attention weights between each character vector and its matched word vectors
        # via a bilinear map, then take the weighted sum
        # layer_output.unsqueeze(2): [b_size, len_input, 1, d_model]
        scores = torch.matmul(layer_output.unsqueeze(2), self.attn_W)  # [b_size, len_input, 1, d_model]
        scores = torch.matmul(scores, torch.transpose(word_outputs, 2, 3))  # [b_size, len_input, 1, num_word]
        scores = scores.squeeze(2)  # [b_size, len_input, num_word]
        scores = scores.masked_fill(word_mask == 0, -1e9)  # set the attention of padded word slots to a very small number (word_mask is 1 for real words)
        scores = F.softmax(scores, dim=-1)  # [b_size, len_input, num_word]
        attn = scores.unsqueeze(-1)  # [b_size, len_input, num_word, 1]

        weighted_word_embedding = torch.sum(word_outputs * attn, dim=2)  # [N, L, D]  weighted sum of each character's matched word vectors
        layer_output = layer_output + weighted_word_embedding

        layer_output = self.dropout(layer_output)
        layer_output = self.layer_norm(layer_output)
        return layer_output
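As a sanity check of the shapes, here is a minimal sketch that runs the adapter on random tensors under a toy config (the hyperparameter values are illustrative, not the ones used with the pretrained BERT in the repo). The bilinear score for a character vector h_c and a transformed word vector x_w is essentially h_c · attn_W · x_w.

# Shape-check sketch for WordEmbeddingAdapter, using a made-up config object.
import torch
from types import SimpleNamespace

config = SimpleNamespace(hidden_dropout_prob=0.1, word_embed_dim=200,
                         hidden_size=768, initializer_range=0.02, layer_norm_eps=1e-12)
adapter = WordEmbeddingAdapter(config)

b_size, len_input, num_word = 2, 8, 3
layer_output = torch.randn(b_size, len_input, config.hidden_size)
word_embeddings = torch.randn(b_size, len_input, num_word, config.word_embed_dim)
word_mask = torch.ones(b_size, len_input, num_word, dtype=torch.long)  # 1 = real word, 0 = padded slot

out = adapter(layer_output, word_embeddings, word_mask)
print(out.shape)  # torch.Size([2, 8, 768])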