word2vec 1: PyTorch version
Back when we did text classification, the embedding, whether hand-built or PyTorch's built-in one, was still just random numbers. If we can train that matrix instead, it is clearly better. The word2vec we study here does exactly that: it trains an embedding matrix, so it can roughly be counted as a pretrained model.
Why only "roughly"? A true pretrained model can keep adjusting its vectors on your own data, which is called fine-tuning, but once this word-vector model finishes training, fine-tuning it further is difficult.
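To make the contrast concrete, here is a minimal sketch (w2v_matrix is a stand-in for a trained result, not anything from the listings below): a fresh nn.Embedding is random, while a trained word2vec matrix can be loaded into it as pretrained weights, frozen since further fine-tuning is hard.

import torch
import torch.nn as nn

# A freshly created nn.Embedding is just a random matrix: no semantics yet.
emb = nn.Embedding(num_embeddings=5000, embedding_dim=100)
print(emb.weight[0, :5])  # random values

# w2v_matrix stands in for a matrix trained by word2vec (shape: vocab x dim).
w2v_matrix = torch.randn(5000, 100)
# Load it as pretrained weights; freeze=True keeps it fixed in later training.
emb_pretrained = nn.Embedding.from_pretrained(w2v_matrix, freeze=True)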
The task: skip-gram style training, i.e. use the current (center) word to predict the words around it; a tiny illustration of the training pairs follows this paragraph.
Code approach: first read the data, tokenize it, and collect the tokens into a list. Then build the model; here we write it in PyTorch. The model is essentially two linear layers. We first build a vocabulary word_2_index, which fixes the sizes of those linear layers, i.e. the parameter shapes of the model.
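Before the full code, here is what the (center word, surrounding words) pairs look like; the sentence is made up, but the slicing is exactly what the training loop below does:

text = ["我们", "学习", "词", "向量", "模型"]  # a made-up tokenized line
n_gram = 2
for ni, now_word in enumerate(text):
    other_words = text[max(ni - n_gram, 0):ni] + text[ni + 1:ni + 1 + n_gram]
    print(now_word, "->", other_words)
# e.g. "词" -> ["我们", "学习", "向量", "模型"]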
import pandas as pd
import os
import jieba
import torch
import torch.nn as nn
from tqdm import tqdm
def read_data(path):
    # names=["text"] supplies the header explicitly, so the first
    # sentence of the file is not mistaken for a column title.
    text = pd.read_csv(path, encoding="gbk", names=["text"])["text"].tolist()
    # Stop words could be removed here for training; we skip that for now.
    result = []
    for t in text:
        tc = jieba.lcut(t)
        result.append(tc)
    return result
def build_word(train_text):
    word_2_index = {"UNK": 0}
    for text in train_text:
        for word in text:
            if word not in word_2_index:
                word_2_index[word] = len(word_2_index)
    return word_2_index
class Word2Vec(nn.Module):
    def __init__(self, word_size, embedding_num):
        super().__init__()
        self.w1 = nn.Linear(word_size, embedding_num)
        self.w2 = nn.Linear(embedding_num, word_size)
        self.log_softmax = nn.LogSoftmax(dim=-1)  # dim=-1: softmax over the last dimension
        self.loss_fun1 = nn.NLLLoss()
        # self.loss_fun2 = nn.CrossEntropyLoss()
        # CrossEntropyLoss already combines LogSoftmax and NLLLoss,
        # so those two would not need to be written out separately.

    def forward(self, x, label):
        h = self.w1(x)
        p = self.w2(h)
        p2 = self.log_softmax(p)
        loss = self.loss_fun1(p2, label)
        # loss2 = self.loss_fun2(p, label)
        return loss
def word_2_onehot(word):
    global word_2_index, device
    word_idx = word_2_index.get(word, 0)  # unknown words fall back to UNK (index 0)
    word_onehot = torch.zeros((1, len(word_2_index)), dtype=torch.float32, device=device)
    word_onehot[0, word_idx] = 1
    return word_onehot
if __name__ == "__main__":
    all_data = read_data(os.path.join("..", "data", "word2vec_data", "数学原始数据.csv"))
    word_2_index = build_word(all_data)

    epoch = 100
    batch_size = 10
    lr = 0.001
    embedding_num = 99
    n_gram = 2

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model = Word2Vec(len(word_2_index), embedding_num).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=lr)

    for e in range(epoch):
        for text in tqdm(all_data):
            for ni, now_word in enumerate(text):
                other_words = text[max(ni - n_gram, 0):ni] + text[ni + 1:ni + 1 + n_gram]
                now_word_onehot = word_2_onehot(now_word)
                for other_word in other_words:
                    # Note the label is a class index, not one-hot: NLLLoss and
                    # CrossEntropyLoss take indices directly and index into the
                    # log-probabilities, so no one-hot conversion is needed.
                    other_word_idx = torch.tensor(word_2_index.get(other_word, 0), device=device).reshape(-1)
                    # other_word_onehot = word_2_onehot(other_word)
                    loss = model.forward(now_word_onehot, other_word_idx)
                    loss.backward()
                    opt.step()
                    opt.zero_grad()
                    # print(f"current word: {now_word}, surrounding words: {other_words}")
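A quick sanity check of the two comments above, as a standalone sketch with random tensors: CrossEntropyLoss on raw logits equals NLLLoss applied to LogSoftmax output, and both take class indices rather than one-hot labels.

import torch
import torch.nn as nn

logits = torch.randn(4, 10)          # (batch, word_size) raw scores
labels = torch.tensor([1, 3, 5, 7])  # class indices, not one-hot

loss_a = nn.NLLLoss()(nn.LogSoftmax(dim=-1)(logits), labels)
loss_b = nn.CrossEntropyLoss()(logits, labels)
print(torch.allclose(loss_a, loss_b))  # True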
Running this, we find it is far too slow. The inner loop over surrounding words does one forward/backward pass per neighbor, so we change the training strategy: concatenate the surrounding words' indices into one label tensor, and repeat the input current word's one-hot to match.
word2vec 2: PyTorch version
As noted at the end of version 1, the per-neighbor loop is what makes training slow, so here the surrounding words are batched together and the current word's one-hot is repeated to match.
To make the code run faster still, we also add stop-word filtering this time.
Finally we save the trained model as word2vec.pt; since the listing below ends at the training loop, a sketch of the save step follows it.
import pandas as pd
import os
import jieba
import torch
import torch.nn as nn
from tqdm import tqdm
def read_data(path, num=None):
    stop_words = get_stop_word(os.path.join("..", "data", "word2vec_data", "stopwords.txt"))
    text = pd.read_csv(path, encoding="gbk", names=["text"])["text"].tolist()
    result = []
    for t in text:
        tc = jieba.lcut(t)
        tc = [i for i in tc if i not in stop_words]  # drop stop words
        result.append(tc)
    if num is None:
        return result
    return result[:num]  # optionally keep only the first num lines
def get_stop_word(path):
    with open(path, "r", encoding="utf-8") as f:
        return f.read().split("\n")
def build_word(train_text):
    word_2_index = {"UNK": 0}
    for text in train_text:
        for word in text:
            if word not in word_2_index:
                word_2_index[word] = len(word_2_index)
    return word_2_index
class Word2Vec(nn.Module):
    def __init__(self, word_size, embedding_num):
        super().__init__()
        self.w1 = nn.Linear(word_size, embedding_num)
        self.w2 = nn.Linear(embedding_num, word_size)
        # self.log_softmax = nn.LogSoftmax(dim=-1)
        # self.loss_fun1 = nn.NLLLoss()
        self.loss_fun2 = nn.CrossEntropyLoss()  # combines LogSoftmax and NLLLoss

    def forward(self, x, label):
        h = self.w1(x)
        p = self.w2(h)
        # p2 = self.log_softmax(p)
        loss = self.loss_fun2(p, label)
        return loss
def word_2_onehot(word):
    global word_2_index, device
    word_idx = word_2_index.get(word, 0)
    word_onehot = torch.zeros((1, len(word_2_index)), dtype=torch.float32, device=device)
    word_onehot[0, word_idx] = 1
    return word_onehot
if __name__ == "__main__":
    all_data = read_data(os.path.join("..", "data", "word2vec_data", "数学原始数据.csv"))
    word_2_index = build_word(all_data)

    epoch = 2
    batch_size = 10
    lr = 0.02
    embedding_num = 100
    n_gram = 5

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model = Word2Vec(len(word_2_index), embedding_num).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=lr)

    for e in range(epoch):
        for text in tqdm(all_data):
            for ni, now_word in enumerate(text):
                other_words = text[max(ni - n_gram, 0):ni] + text[ni + 1:ni + 1 + n_gram]
                if not other_words:
                    continue  # guard: a line with no surrounding words would give an empty batch
                now_word_onehot = word_2_onehot(now_word)
                # repeat the center word's one-hot so every surrounding word
                # is handled in a single batched forward pass
                now_word_onehot = now_word_onehot.repeat(len(other_words), 1)
                other_words_idx = torch.tensor([word_2_index.get(i, 0) for i in other_words],
                                               device=device, dtype=torch.int64)
                loss = model.forward(now_word_onehot, other_words_idx)
                loss.backward()
                opt.step()
                opt.zero_grad()
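The listing ends at the training loop, so here is a minimal sketch of the save step mentioned above. The file name word2vec.pt comes from the text; reading the embedding out of w1 is an assumption based on the model structure (for a one-hot input, the hidden vector is the corresponding column of w1.weight plus its bias, so w1.weight can serve as the embedding matrix), and the word "数学" looked up at the end is a hypothetical example.

# Save the trained parameters as word2vec.pt (as mentioned above).
torch.save(model.state_dict(), "word2vec.pt")

# Assumption: each column of w1.weight can be read as one word's vector,
# since a one-hot input selects exactly that column (plus the bias).
embedding_matrix = model.w1.weight.data.T             # (word_size, embedding_num)
vec = embedding_matrix[word_2_index.get("数学", 0)]   # hypothetical lookup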