Deep contextualized word representations
论文:Deep contextualized word representations
作者:Matthew E. Peters, Mark Neumann, Mohit Iyyer, Matt Gardner, Christopher Clark, Kenton Lee, Luke Zettlemoyer
时间:2018
此论文介绍了一种新的深度上下文的词向量表示,这个模型 不仅可以表达词语使用的复杂特征,还可以表达一词多义。 此论文中词向量是一个深度双向语言模型(biLM)的内部状态的学习函数,它是在一个大型文本语料库上预训练的。论文表明,这些表示可以很容易地添加到现有的模型中,并显著改善六个具有挑战性的NLP问题的现状,包括问题回答、文本蕴涵和情感分析。
一、完整代码
大都是用pytorch实现的,这里使用tensorflow实现
# 完整代码在这里
import tensorflow as tf
import numpy as np
corpus = [
    'i do"t think you will win',
    'you are so bad',
    'you are unbelievable',
    'its taste is not good',
    'good',
    'so funny',
    'i"m so glad to hear that'
]

# Character-level vocabulary: letters, digits, punctuation, plus the special
# begin-of-word / end-of-word / pad-of-word markers.
vocabulary_char = list('abcdefghijklmnopqrstuvwxyz0123456789,;.!?:’"/\|_@#$%ˆ&*˜‘+-=<>()[]{}') + ['<bow>','<eow>','<pow>']
vocabulary_char = dict(zip(vocabulary_char, range(len(vocabulary_char))))

# Word-level vocabulary from the corpus, plus begin/end/pad sentence markers.
vocabulary_word = set()
for sentence in corpus:
    vocabulary_word = vocabulary_word.union(set(sentence.split()))
vocabulary_word = list(vocabulary_word) + ['<bos>','<eos>','<pos>']
vocabulary_word = dict(zip(vocabulary_word, range(len(vocabulary_word))))

def data_process(corpus, max_words_num, max_character_num, out='int'):
    """Convert raw sentences to padded word-level and character-level arrays.

    Args:
        corpus: list of raw sentence strings.
        max_words_num: maximum number of words kept per sentence.
        max_character_num: maximum number of characters kept per word.
        out: 'int' maps tokens to vocabulary ids; anything else keeps strings.

    Returns:
        (sentences, words): arrays of shape (N, max_words_num + 2) and
        (N, max_words_num + 2, max_character_num + 2); the +2 slots hold the
        <bos>/<eos> (or <bow>/<eow>) markers.
    """
    sentences = []
    words = []
    for item in corpus:
        item = item.split()[:max_words_num]
        # BUGFIX: padding length was hard-coded to 10; use max_words_num so the
        # function works for any requested sentence length.
        sentences.append(['<bos>'] + item + ['<eos>'] + ['<pos>'] * (max_words_num - len(item)))
    for item in sentences:
        word_list = []
        for characters in item:
            if characters in ['<bos>', '<eos>', '<pos>']:
                # Sentence-level markers carry no characters: all-pad rows.
                word_list.append(['<pow>'] * (max_character_num + 2))
            else:
                characters = list(characters)[:max_character_num]
                word_list.append(['<bow>'] + characters + ['<eow>'] + ['<pow>'] * (max_character_num - len(characters)))
        words.append(word_list)
    if out == 'int':
        for i, items in enumerate(sentences):
            for j, item in enumerate(items):
                sentences[i][j] = vocabulary_word[item]
        for i, items in enumerate(words):
            for j, item in enumerate(items):
                for k, char in enumerate(item):
                    words[i][j][k] = vocabulary_char[char]
    return np.array(sentences), np.array(words)

sentence, words = data_process(corpus, 10, 6)
# sentence.shape == (7, 12); words.shape == (7, 12, 8)

# Targets for the biLM: the word ids shifted left (next-word prediction) and
# shifted right (previous-word prediction), each padded with the <pos> id.
# BUGFIX: use vocabulary_word['<pos>'] and len(sentence) instead of the
# hard-coded magic constants 23 and 7.
pos_id = vocabulary_word['<pos>']
pad_col = np.full((len(sentence), 1), pos_id)
y_true = np.c_[sentence[:, 1:], pad_col, sentence[:, :-1], pad_col]
class Character_layer(tf.keras.layers.Layer):
    """Character-level CNN encoder.

    Pipeline: char embedding -> several Conv1D branches of different widths ->
    max pooling + ReLU -> concat -> highway layers -> linear projection back
    to a (batch, seq_len, output_dim) word representation.
    """

    def __init__(self, input_dim, embedding_dim, output_dim, filters_list, kernel_size_list, highway_num):
        """
        input_dim: size of the character vocabulary.
        embedding_dim: character embedding width.
        output_dim: final projection width.
        filters_list: output channels of each Conv1D branch.
        kernel_size_list: kernel size of each Conv1D branch.
        highway_num: number of highway layers.
        """
        super(Character_layer, self).__init__()
        self.embedding = tf.keras.layers.Embedding(input_dim, embedding_dim)
        self.list_conv1 = [
            tf.keras.layers.Conv1D(n_filters, width, padding='same')
            for n_filters, width in zip(filters_list, kernel_size_list)
        ]
        # Concatenated CNN feature size; each highway Dense emits 2*dim
        # (transform half + gate half).
        self.dim = sum(filters_list)
        self.list_highway = [tf.keras.layers.Dense(self.dim * 2) for _ in range(highway_num)]
        self.projection = tf.keras.layers.Dense(output_dim)

    def build(self, input_shape):
        pass

    def call(self, inputs):
        # inputs: (batch, seq_len, token_len) of character ids.
        batch_size, seq_len, token_len = inputs.shape
        flat = tf.reshape(inputs, shape=(batch_size * seq_len, token_len))
        embedded = self.embedding(flat)
        # NOTE(review): after this transpose the Conv1D slides over the
        # embedding axis with characters as channels — reproduced as in the
        # original; confirm this orientation is intended.
        embedded = tf.transpose(embedded, [0, 2, 1])
        pooled = []
        for conv in self.list_conv1:
            feature = conv(embedded)
            feature = tf.reduce_max(feature, axis=1)  # max pooling per branch
            pooled.append(tf.keras.activations.relu(feature))
        outputs = tf.concat(pooled, axis=-1)
        # Highway: out = relu(transform) * gate + in * (1 - gate).
        for dense in self.list_highway:
            both_halves = dense(outputs)
            transform = tf.keras.activations.relu(both_halves[:, :self.dim])
            gate = tf.keras.activations.sigmoid(both_halves[:, self.dim:])
            outputs = transform * gate + outputs * (1 - gate)
        outputs = self.projection(outputs)
        return tf.reshape(outputs, shape=(batch_size, seq_len, -1))
class Elmo(tf.keras.models.Model):
    """ELMo-style biLM: char-CNN word embedding, stacked forward/backward
    LSTMs, and per-direction softmax projections over the word vocabulary."""

    def __init__(self, lstm_num):
        super().__init__()
        self.embedding = Character_layer(len(vocabulary_char), 200, 50, [12, 24, 36], [2, 3, 4], 2)
        self.forward_lstm = []
        self.backward_lstm = []
        # lstm_num stacked layers plus one extra top layer per direction
        # (matches the original construction: loop + one trailing append).
        for i in range(lstm_num + 1):
            self.forward_lstm.append(tf.keras.layers.LSTM(50, return_sequences=True, go_backwards=False))
            self.backward_lstm.append(tf.keras.layers.LSTM(50, return_sequences=True, go_backwards=True))
        self.forward_projection = tf.keras.layers.Dense(len(vocabulary_word), activation='softmax')
        self.backward_projection = tf.keras.layers.Dense(len(vocabulary_word), activation='softmax')

    def get_result(self, inputs):
        """Return next-word and previous-word softmax predictions,
        concatenated along the sequence axis to match y_true's layout."""
        inputs = self.embedding(inputs)
        outputs1 = inputs
        outputs2 = inputs
        # BUGFIX: the original ran forward_lstm[0]/backward_lstm[0] twice
        # (once before the loop and again inside it); apply each layer once.
        for lstm in self.forward_lstm:
            outputs1 = lstm(outputs1)
        for lstm in self.backward_lstm:
            outputs2 = lstm(outputs2)
        outputs1 = self.forward_projection(outputs1)
        # BUGFIX: the backward stream was projected with forward_projection,
        # leaving backward_projection completely unused.
        outputs2 = self.backward_projection(outputs2)
        # NOTE(review): go_backwards=True LSTMs emit the sequence in reversed
        # order and it is never flipped back, while y_true's right-shifted
        # half assumes original order — consider tf.reverse before projecting.
        return tf.concat([outputs1, outputs2], axis=1)
model = Elmo(5)
# One forward pass so every layer builds its weights before training starts.
result = model.get_result(words)

loss = tf.keras.losses.SparseCategoricalCrossentropy()
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)

def train_one_epoch(i):
    """Run one gradient step over the whole corpus and log the loss."""
    with tf.GradientTape() as tape:
        predictions = model.get_result(words)
        step_loss = loss(y_true=y_true, y_pred=predictions)
    gradients = tape.gradient(step_loss, model.trainable_variables)
    print("Step: {}, Initial Loss: {}".format(i, step_loss.numpy()))
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

for i in range(100):
    train_one_epoch(i)
二、论文解读
传统的词向量表示是固定的,不会因为上下文语句不同而产生变化,但是在日常生活中,一词多义是非常常见的,例如“apple”这个词,既可以表示苹果,也可以表示公司;这篇论文解决了单词的一词多义问题,即词的表示可以随上下文的不同而发生改变;
ELMo具有深层次的网络结构,其使用了多层的LSTM,这比仅仅使用单层LSTM层显著提高了性能;同时由于LSTM是循环神经网络,其可以捕捉单词意义的上下文;大量的实验表明,ELMo表示在实践中工作得非常好,它可以很容易地添加到现有的模型中,用于6个不同的和具有挑战性的语言理解问题,包括文本蕴涵、问题回答和情感分析。
2.1 模型结构
这一部分讲解参考了ELMo解读(论文 + PyTorch源码),并添加了一些内容;
ELMo 预训练模型结构如下:
![](https://img-blog.csdnimg.cn/1468cfd8bd124019aa02fe881da7aafd.png)
2.1.1 Character Encode Layer
Character Encode Layer结构如下所示:
![](https://img-blog.csdnimg.cn/8a3a4862954b49b19353875dcc81d3d6.png)
首先对Input Sentence进行分析;举个例子,假设我们有一句话"i like this dog very much",解析后得到结果如下:
![](https://img-blog.csdnimg.cn/78037a38f75a4519a281c18b31395351.png)
其做法就是固定句子长度为W:num_words
,固定单词长度为C:max_chars_per_token
;
- 对句子:在句子开头和结尾分别加上
<bos>
和<eos>
,不足长度在后面添加<pos>
; - 对单词:在单词开头和结尾分别加上
<bow>
和<eow>
,不足长度在后面添加<pow>
然后对句子和单词分别建立词表:vocabulary_words
和vocabulary_chars
,将setence和word都转化为数字形式;
以上便是数据准备部分;
然后接下来各个层的介绍我直接粘贴ELMo解读(论文 + PyTorch源码):
![](https://img-blog.csdnimg.cn/d8eafeee8970418a86d3e82b3c046046.png)
2.1.2 N-BiLSTM Layer
LSTM是一个循环网络结构;这里使用的是双向网络循环;原文太详细,这里还是粘贴一下ELMo解读(论文 + PyTorch源码):
![](https://img-blog.csdnimg.cn/69e7789abdc74c6f840028b8b4bf3b27.png)
在这篇论文中,分别训练了多个正向的LSTM和多个负向的LSTM;把最后的输出合并再进行投影就可以训练了;模型如下所示:
![](https://img-blog.csdnimg.cn/1468cfd8bd124019aa02fe881da7aafd.png)
可以看到这里的outputs分别是word的right shift和left shift的合并;训练完毕后,我们就可以利用这些参数进行词表示了;
2.1.3 词表示
![](https://img-blog.csdnimg.cn/19b31950639e43ad912241a0592a12f1.png)
从图中可以看出,ELMo词表示是由 input sentence embedding 以及多个 隐藏层(BiLSTM) 表示的,其中输出表示层最后的维度应该是 $D$,而每一层的BiLSTM由于由两个LSTM表示,再加上每一个LSTM都线性投影成 $D$,则每一层最后一维度都是 $2D$,到最后应该有 $D + L \times 2D$;为了方便计算,这里我们对最后一层采取复制的方式,最后得到总维度应该是 $(L+1, B, W, 2D)$;模型介绍完毕!
三、过程实现
3.1 导包和数据整备
代码如下:
import tensorflow as tf
import numpy as np
corpus = [
    'i do"t think you will win',
    'you are so bad',
    'you are unbelievable',
    'its taste is not good',
    'good',
    'so funny',
    'i"m so glad to hear that'
]

# Character-level vocabulary: letters, digits, punctuation, plus the special
# begin-of-word / end-of-word / pad-of-word markers.
vocabulary_char = list('abcdefghijklmnopqrstuvwxyz0123456789,;.!?:’"/\|_@#$%ˆ&*˜‘+-=<>()[]{}') + ['<bow>','<eow>','<pow>']
vocabulary_char = dict(zip(vocabulary_char, range(len(vocabulary_char))))

# Word-level vocabulary from the corpus, plus begin/end/pad sentence markers.
vocabulary_word = set()
for sentence in corpus:
    vocabulary_word = vocabulary_word.union(set(sentence.split()))
vocabulary_word = list(vocabulary_word) + ['<bos>','<eos>','<pos>']
vocabulary_word = dict(zip(vocabulary_word, range(len(vocabulary_word))))

def data_process(corpus, max_words_num, max_character_num, out='int'):
    """Convert raw sentences to padded word-level and character-level arrays.

    Args:
        corpus: list of raw sentence strings.
        max_words_num: maximum number of words kept per sentence.
        max_character_num: maximum number of characters kept per word.
        out: 'int' maps tokens to vocabulary ids; anything else keeps strings.

    Returns:
        (sentences, words): arrays of shape (N, max_words_num + 2) and
        (N, max_words_num + 2, max_character_num + 2); the +2 slots hold the
        <bos>/<eos> (or <bow>/<eow>) markers.
    """
    sentences = []
    words = []
    for item in corpus:
        item = item.split()[:max_words_num]
        # BUGFIX: padding length was hard-coded to 10; use max_words_num so the
        # function works for any requested sentence length.
        sentences.append(['<bos>'] + item + ['<eos>'] + ['<pos>'] * (max_words_num - len(item)))
    for item in sentences:
        word_list = []
        for characters in item:
            if characters in ['<bos>', '<eos>', '<pos>']:
                # Sentence-level markers carry no characters: all-pad rows.
                word_list.append(['<pow>'] * (max_character_num + 2))
            else:
                characters = list(characters)[:max_character_num]
                word_list.append(['<bow>'] + characters + ['<eow>'] + ['<pow>'] * (max_character_num - len(characters)))
        words.append(word_list)
    if out == 'int':
        for i, items in enumerate(sentences):
            for j, item in enumerate(items):
                sentences[i][j] = vocabulary_word[item]
        for i, items in enumerate(words):
            for j, item in enumerate(items):
                for k, char in enumerate(item):
                    words[i][j][k] = vocabulary_char[char]
    return np.array(sentences), np.array(words)

sentence, words = data_process(corpus, 10, 6)
# sentence.shape == (7, 12); words.shape == (7, 12, 8)

# Targets for the biLM: the word ids shifted left (next-word prediction) and
# shifted right (previous-word prediction), each padded with the <pos> id.
# BUGFIX: use vocabulary_word['<pos>'] and len(sentence) instead of the
# hard-coded magic constants 23 and 7.
pos_id = vocabulary_word['<pos>']
pad_col = np.full((len(sentence), 1), pos_id)
y_true = np.c_[sentence[:, 1:], pad_col, sentence[:, :-1], pad_col]
3.2 Character Encode Layer
代码如下:
class Character_layer(tf.keras.layers.Layer):
    """Character-level CNN encoder.

    Pipeline: char embedding -> several Conv1D branches of different widths ->
    max pooling + ReLU -> concat -> highway layers -> linear projection back
    to a (batch, seq_len, output_dim) word representation.
    """

    def __init__(self, input_dim, embedding_dim, output_dim, filters_list, kernel_size_list, highway_num):
        """
        input_dim: size of the character vocabulary.
        embedding_dim: character embedding width.
        output_dim: final projection width.
        filters_list: output channels of each Conv1D branch.
        kernel_size_list: kernel size of each Conv1D branch.
        highway_num: number of highway layers.
        """
        super(Character_layer, self).__init__()
        self.embedding = tf.keras.layers.Embedding(input_dim, embedding_dim)
        self.list_conv1 = [
            tf.keras.layers.Conv1D(n_filters, width, padding='same')
            for n_filters, width in zip(filters_list, kernel_size_list)
        ]
        # Concatenated CNN feature size; each highway Dense emits 2*dim
        # (transform half + gate half).
        self.dim = sum(filters_list)
        self.list_highway = [tf.keras.layers.Dense(self.dim * 2) for _ in range(highway_num)]
        self.projection = tf.keras.layers.Dense(output_dim)

    def build(self, input_shape):
        pass

    def call(self, inputs):
        # inputs: (batch, seq_len, token_len) of character ids.
        batch_size, seq_len, token_len = inputs.shape
        flat = tf.reshape(inputs, shape=(batch_size * seq_len, token_len))
        embedded = self.embedding(flat)
        # NOTE(review): after this transpose the Conv1D slides over the
        # embedding axis with characters as channels — reproduced as in the
        # original; confirm this orientation is intended.
        embedded = tf.transpose(embedded, [0, 2, 1])
        pooled = []
        for conv in self.list_conv1:
            feature = conv(embedded)
            feature = tf.reduce_max(feature, axis=1)  # max pooling per branch
            pooled.append(tf.keras.activations.relu(feature))
        outputs = tf.concat(pooled, axis=-1)
        # Highway: out = relu(transform) * gate + in * (1 - gate).
        for dense in self.list_highway:
            both_halves = dense(outputs)
            transform = tf.keras.activations.relu(both_halves[:, :self.dim])
            gate = tf.keras.activations.sigmoid(both_halves[:, self.dim:])
            outputs = transform * gate + outputs * (1 - gate)
        outputs = self.projection(outputs)
        return tf.reshape(outputs, shape=(batch_size, seq_len, -1))
3.3 N-BiLSTM Layer
代码如下:
class Elmo(tf.keras.models.Model):
    """ELMo-style biLM: char-CNN word embedding, stacked forward/backward
    LSTMs, and per-direction softmax projections over the word vocabulary."""

    def __init__(self, lstm_num):
        super().__init__()
        self.embedding = Character_layer(len(vocabulary_char), 200, 50, [12, 24, 36], [2, 3, 4], 2)
        self.forward_lstm = []
        self.backward_lstm = []
        # lstm_num stacked layers plus one extra top layer per direction
        # (matches the original construction: loop + one trailing append).
        for i in range(lstm_num + 1):
            self.forward_lstm.append(tf.keras.layers.LSTM(50, return_sequences=True, go_backwards=False))
            self.backward_lstm.append(tf.keras.layers.LSTM(50, return_sequences=True, go_backwards=True))
        self.forward_projection = tf.keras.layers.Dense(len(vocabulary_word), activation='softmax')
        self.backward_projection = tf.keras.layers.Dense(len(vocabulary_word), activation='softmax')

    def get_result(self, inputs):
        """Return next-word and previous-word softmax predictions,
        concatenated along the sequence axis to match y_true's layout."""
        inputs = self.embedding(inputs)
        outputs1 = inputs
        outputs2 = inputs
        # BUGFIX: the original ran forward_lstm[0]/backward_lstm[0] twice
        # (once before the loop and again inside it); apply each layer once.
        for lstm in self.forward_lstm:
            outputs1 = lstm(outputs1)
        for lstm in self.backward_lstm:
            outputs2 = lstm(outputs2)
        outputs1 = self.forward_projection(outputs1)
        # BUGFIX: the backward stream was projected with forward_projection,
        # leaving backward_projection completely unused.
        outputs2 = self.backward_projection(outputs2)
        # NOTE(review): go_backwards=True LSTMs emit the sequence in reversed
        # order and it is never flipped back, while y_true's right-shifted
        # half assumes original order — consider tf.reverse before projecting.
        return tf.concat([outputs1, outputs2], axis=1)
3.4 模型训练
代码如下:
model = Elmo(5)
# One forward pass so every layer builds its weights before training starts.
result = model.get_result(words)

loss = tf.keras.losses.SparseCategoricalCrossentropy()
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)

def train_one_epoch(i):
    """Run one gradient step over the whole corpus and log the loss."""
    with tf.GradientTape() as tape:
        predictions = model.get_result(words)
        step_loss = loss(y_true=y_true, y_pred=predictions)
    gradients = tape.gradient(step_loss, model.trainable_variables)
    print("Step: {}, Initial Loss: {}".format(i, step_loss.numpy()))
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

for i in range(100):
    train_one_epoch(i)
四、整体总结
内容参考:
- https://zhuanlan.zhihu.com/p/466841781
- https://blog.csdn.net/Magical_Bubble/article/details/89160032