Build an LSTM model whose input is a sequence of words, so that the model can take word order into account. Emojifier-V2 still uses pre-trained word embeddings to represent words, but feeds them into an LSTM, whose job is to predict the most appropriate emoji.
The model structure is as follows:
Embedding layer: for each word in the input sentence, look up its index in the word-to-index dictionary (word_to_index), zero-pad the index sequence to a fixed length, and pass it through the embedding layer, which outputs the 50-dimensional word vector for every word in the sentence.
The largest integer (i.e., word index) in the input should be no larger than the vocabulary size. This layer outputs an array of shape (batch size, max input length, dimension of word vectors).
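As a quick illustration of the index-and-embed step described above, here is a minimal sketch; the toy vocabulary and 4-dimensional embedding are made up purely to show the shapes.

import torch

# Toy vocabulary, with index 0 reserved for padding (illustrative only).
vocab = {"<pad>": 0, "i": 1, "love": 2, "you": 3}
embedding = torch.nn.Embedding(num_embeddings=len(vocab), embedding_dim=4, padding_idx=0)

max_len = 6
indices = torch.zeros(1, max_len, dtype=torch.long)  # zero-padded index sequence, batch of 1
for j, w in enumerate("i love you".split()):
    indices[0, j] = vocab[w]

vectors = embedding(indices)
print(vectors.shape)  # torch.Size([1, 6, 4]) -> (batch size, max input length, embedding dim)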
First, read the word-to-vector dictionaries.
# word_to_index: dictionary mapping a word to its index in the vocabulary (400,000 words; with the code below, valid indices run from 1 to 400,000, and index 0 is reserved for padding)
# index_to_word: dictionary mapping an index back to the corresponding word in the vocabulary
# word_to_vec_map: dictionary mapping a word to its GloVe vector representation
import numpy as np

def read_glove_vecs(glove_file):
    with open(glove_file, 'r', encoding='utf8') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()  # each line holds a word followed by its 50-dim vector
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
        i = 1
        words_to_index = {}
        index_to_words = {}
        for w in sorted(words):  # words sorted alphabetically
            words_to_index[w] = i
            index_to_words[i] = w
            i = i + 1
    return words_to_index, index_to_words, word_to_vec_map
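A quick usage check of read_glove_vecs (a sketch assuming the GloVe file sits at data/glove.6B.50d.txt, the same path used in the preprocessing code below):

words_to_index, index_to_words, word_to_vec_map = read_glove_vecs('data/glove.6B.50d.txt')
print(len(words_to_index))                     # 400000 words in the glove.6B vocabulary
print(word_to_vec_map['the'].shape)            # (50,) -> one 50-dimensional vector per word
print(index_to_words[words_to_index['the']])   # 'the' (the two dictionaries are inverses)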
Data preprocessing:
import torch
from torch.utils.data import Dataset, DataLoader
import emo_utils

class Sentence_Data(Dataset):
    def __init__(self, filename):
        super(Sentence_Data, self).__init__()
        self.max_len = 10
        data, label = emo_utils.read_csv(filename)
        self.label = torch.from_numpy(label)
        self.len = self.label.size()[0]
        words_to_index, index_to_words, word_to_vec_map = emo_utils.read_glove_vecs('data/glove.6B.50d.txt')
        self.embedding = self.pretrained_embedding_layer(word_to_vec_map, words_to_index)
        self.data = self.sentence_to_vec(data, words_to_index=words_to_index)

    def __getitem__(self, item):
        return self.data[item], self.label[item]

    def __len__(self):
        return self.len

    def pretrained_embedding_layer(self, word_to_vec_map, word_to_index):
        """
        Create the embedding layer and load the 50-dim GloVe vectors.
        :param word_to_vec_map: dictionary mapping a word to its GloVe vector
        :param word_to_index: dictionary mapping a word to its vocabulary index
        :return: a torch.nn.Embedding layer initialized with the GloVe vectors
        """
        vocab_len = len(word_to_index) + 1  # +1 because index 0 is reserved for padding
        embedding_size = word_to_vec_map["cucumber"].shape[0]  # 50
        # Initialize the embedding matrix
        embedding_matrix = np.zeros((vocab_len, embedding_size))  # 400001 x 50
        for word, index in word_to_index.items():  # each row is the 50-dim vector of one word
            embedding_matrix[index, :] = word_to_vec_map[word]
        embedding_matrix = torch.Tensor(embedding_matrix)
        # Define the embedding layer, with its weights set to the embedding matrix
        embedding_layer = torch.nn.Embedding.from_pretrained(embedding_matrix)
        return embedding_layer

    def sentence_to_vec(self, data, words_to_index):
        # Convert each sentence into a sequence of 50-dim word vectors
        vec_list = []
        for sentence in data:
            words_index = self.sentences_to_indices(sentence, words_to_index, self.max_len)
            words_index = torch.LongTensor(words_index)
            words_vec = self.embedding(words_index)
            vec_list.append(words_vec)
        return vec_list  # list of (max_len, word vector dim) tensors; the batch dimension is added later by the DataLoader

    def sentences_to_indices(self, x, words_to_index, max_len):
        """
        Convert a sentence into a zero-padded array of vocabulary indices.
        :param x: a single sentence string
        :param words_to_index: dictionary mapping a word to its index
        :param max_len: maximum sentence length
        :return: index array of shape (max_len,)
        """
        X_indices = np.zeros(max_len)
        sentences_words = x.lower().split()
        j = 0
        for w in sentences_words[:max_len]:  # truncate sentences longer than max_len
            X_indices[j] = words_to_index[w]
            j += 1
        return X_indices
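A quick sanity check of the dataset class (a sketch; it assumes train_emoji.csv and the GloVe file are at the paths used elsewhere in this post):

dataset = Sentence_Data('data/train_emoji.csv')
x0, y0 = dataset[0]
print(len(dataset))  # number of training sentences
print(x0.shape)      # torch.Size([10, 50]) -> (max_len, word vector dimension)
print(y0)            # integer emoji label in [0, 4]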
Model construction:
class LSTM_EMO(torch.nn.Module):
    def __init__(self, input_size, num_classes):
        super(LSTM_EMO, self).__init__()
        # hidden_size is the size of the LSTM hidden state and num_layers the number of stacked LSTM layers.
        # Note batch_first: our input has shape (batch_size, seq_length, embedding_dim),
        # while batch_first defaults to False, so it must be set explicitly.
        self.lstm = torch.nn.LSTM(input_size=input_size, hidden_size=128, num_layers=2, dropout=0.5, batch_first=True)
        self.dropout = torch.nn.Dropout(0.5)
        self.fc = torch.nn.Linear(128, num_classes)
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, x):
        # out has shape (batch_size, max_len, hidden_size): the output features h_t of the last LSTM layer
        # at every time step. h_n holds the hidden state at the final time step of each layer, with shape
        # (num_layers, batch_size, hidden_size); h_n[-1] is the second layer's output for the last word,
        # with shape (batch_size, hidden_size).
        out, (h_n, c_n) = self.lstm(x)
        out = self.dropout(h_n[-1])  # use the final hidden state of the top layer as the sentence representation
        linear_out = self.fc(out)
        return linear_out

    def predict(self, x):
        out, (h_n, c_n) = self.lstm(x)
        out = self.dropout(h_n[-1])
        linear_out = self.fc(out)
        y_pre = self.softmax(linear_out)
        return y_pre
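A quick shape check of the forward pass (a sketch using random input, with the batch size, max length, and embedding dimension assumed in this post):

model = LSTM_EMO(input_size=50, num_classes=5)
dummy = torch.randn(32, 10, 50)           # (batch_size, max_len, embedding dim)
logits = model(dummy)
print(logits.shape)                       # torch.Size([32, 5]) -> one score per emoji class
print(model.predict(dummy).sum(dim=1))    # each softmax row sums to 1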
Model parameters:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Initialize training hyperparameters
batch_size = 32
epoch_nums = 300
learning_rate = 0.001
costs = []
input_size = 50   # dimension of the input word vectors
num_classes = 5   # 5 emotion/emoji classes
# Load the training data
train_data_path = "/content/drive/MyDrive/Colab Notebooks/吴恩达L5HW1/Emojify/data/train_emoji.csv"
train_data = Sentence_Data(train_data_path)
train_data_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
# Initialize the model
m = LSTM_EMO(input_size=input_size, num_classes=num_classes)
m.to(device)
# Define the loss function and optimizer
loss_fn = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(m.parameters(), lr=learning_rate)
Model training:
# Start training
print("learning_rate=" + str(learning_rate))
for epoch in range(epoch_nums):
    cost = 0
    index = 0
    for data, label in train_data_loader:
        data, label = data.to(device), label.to(device)
        optimizer.zero_grad()
        y_pred = m(data)
        loss = loss_fn(y_pred, label.long())
        loss.backward()
        optimizer.step()
        cost = cost + loss.cpu().detach().numpy()
        index = index + 1
    if epoch % 50 == 0:
        costs.append(cost / index)  # average loss per batch in this epoch
        print("epoch=" + str(epoch) + ": " + "loss=" + str(cost / index))
learning_rate=0.001
epoch=0: loss=1.3239593505859375
epoch=50: loss=0.1451170692841212
epoch=100: loss=0.0035808665367464223
epoch=150: loss=0.0017829457065090537
epoch=200: loss=0.0008461652129578093
epoch=250: loss=0.00044136530777905136
The accuracy on the test set is not high; the cause still needs to be investigated.
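For reference, a minimal evaluation sketch for measuring test accuracy. It reuses the Sentence_Data class and the trained model m; the test_emoji.csv path is a hypothetical placeholder for wherever the test split is stored.

test_data = Sentence_Data("/content/drive/MyDrive/Colab Notebooks/吴恩达L5HW1/Emojify/data/test_emoji.csv")  # hypothetical path
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

m.eval()  # disable dropout for evaluation
correct, total = 0, 0
with torch.no_grad():
    for data, label in test_loader:
        data, label = data.to(device), label.to(device)
        pred = m.predict(data).argmax(dim=1)          # most probable emoji class per sentence
        correct += (pred == label.long()).sum().item()
        total += label.size(0)
print("test accuracy = " + str(correct / total))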