1. Introduction
word2vec is widely used to extract embedding vectors. When the vocabulary is large, one-hot encoding represents each word as a [0,0,…,1,0,…,0] vector whose dimensionality equals the vocabulary size, which is far too high. Instead, we build a prediction model and, once it fits the data well, take its internal weights as the embedding vectors of the corresponding words.
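As a concrete illustration (the vocabulary size, embedding width, and word index below are made-up numbers, not from the text): a one-hot vector merely selects one row of the learned weight matrix, and that row is the dense embedding.

import numpy as np

V, d = 10000, 100             # hypothetical vocabulary size and embedding width
W = np.random.randn(V, d)     # stand-in for the learned input weight matrix
x = np.zeros(V)
x[42] = 1                     # one-hot vector for word index 42
embedding = x @ W             # picks out row 42 of W: the dense embedding
assert np.allclose(embedding, W[42])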
2. Model Structure
2.1 A single feature vector (one-hot vector)
For each one-hot feature vector of length V (i.e., the vocabulary contains V words), the mapping from input to output is shown in the figure below:
That is, the input is multiplied by the matrices W and W' in turn, producing an output vector of the same length as the input; the softmax of this output is taken as the prediction.
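A minimal sketch of this forward pass (function and variable names are illustrative): the one-hot input of length V is projected down to a hidden vector and back up to V scores, which are then normalized with softmax.

import numpy as np

def forward_one(x, W, W2):
    # x: (V,) one-hot input; W: (V, d); W2: (d, V)
    h = x @ W                   # hidden vector, length d
    u = h @ W2                  # scores over the vocabulary, length V
    e = np.exp(u - u.max())     # numerically stable softmax
    return e / e.sum()          # prediction, same length as the input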
2.2 CBOW structure
Assume each feature vector has length k and choose a window of size c/2. CBOW feeds in the feature vectors of the c/2 words before and the c/2 words after the center word, and predicts the center word from them. Note that, for the c context vectors to produce a single output, the hidden layer averages the c results of the first-layer projection, as in the sketch below.
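This averaging step can be sketched as follows (reusing the illustrative W and W2 from above): the c context one-hot vectors are projected and averaged before the output layer, so a single prediction comes out.

import numpy as np

def cbow_forward(context, W, W2):
    # context: (c, V), one one-hot row per context word
    h = (context @ W).mean(axis=0)   # average the c hidden vectors -> (d,)
    u = h @ W2                       # scores for the center word, length V
    e = np.exp(u - u.max())          # numerically stable softmax
    return e / e.sum()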
3. Implementation
3.1 PyTorch version
import torch
import torch.nn as nn
from collections import defaultdict
import numpy as np

class W2v(nn.Module):
    def __init__(self, data, eblen):
        super().__init__()
        '''
        self.words: frequency count of each word
        self.wordslist: deduplicated word list
        self.wordid: maps a word to its index, used for similar-word lookup
        self.ohmatrix: one-hot encoding of the vocabulary
        self.wordslen: vocabulary size
        '''
        self.words = defaultdict(int)
        self.wordslist = []
        self.wordid = {}
        self.wordslen = 0
        # collect the vocabulary
        for sentence in data:
            for word in sentence.split():
                self.words[word] += 1
        # build the word list and the word-to-index map
        self.wordslist = list(self.words.keys())
        for i, v in enumerate(self.wordslist):
            self.wordid[v] = i
        # build the one-hot matrix (identity over the vocabulary)
        self.wordslen = len(self.wordslist)
        self.ohmatrix = np.eye(self.wordslen)
        self.w1 = nn.Linear(self.wordslen, eblen, bias=False)
        self.w2 = nn.Linear(eblen, self.wordslen, bias=False)
        self.logsoftmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        # x: (batch, 2*window, wordslen)
        h = torch.mean(self.w1(x), dim=1)   # average the context projections
        v = self.w2(h)
        y = self.logsoftmax(v)
        return y

    def geteb(self, x):
        idx = self.wordid[x]
        return self.w1.weight.T[idx]

    def getsimilarword(self, x, num):
        if x not in self.wordid:
            print('invalid input!')
        else:
            x_eb = self.geteb(x).detach().numpy()
            w1 = self.w1.weight.T.detach().numpy()
            # cosine similarity between x and every embedding row
            simatrix = np.dot(w1, x_eb) / (np.linalg.norm(x_eb) * np.linalg.norm(w1, axis=-1))
            order = np.argsort(simatrix)[::-1][1:num + 1]   # skip the word itself
            simvalue = [self.wordslist[i] for i in order]
            similarity = simatrix[order]
            print(simvalue, '\n', similarity)

    def getdata(self, data, window):
        # build the CBOW training set
        train, label = [], []
        for sentence in data:
            words = sentence.split()
            for w in range(window, len(words) - window):
                tr = [self.ohmatrix[self.wordid[v]]
                      for v in words[w - window:w] + words[w + 1:w + window + 1]]
                train.append(tr)
                label.append(self.ohmatrix[self.wordid[words[w]]])
        self.trdata = torch.tensor(np.array(train), dtype=torch.float32)
        self.label = torch.tensor(np.array(label), dtype=torch.float32)
        return self.trdata, self.label
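The class above defines the network but no training loop. Here is one possible usage sketch (the corpus, window, embedding width, learning rate, and epoch count are made up for illustration). Since forward returns log-softmax and the labels are one-hot, the cross entropy is the negative sum of their elementwise product.

corpus = ['we like deep learning', 'we like natural language processing']
m = W2v(corpus, eblen=10)
x, y = m.getdata(corpus, window=1)          # x: (N, 2*window, V), y: (N, V)
opt = torch.optim.SGD(m.parameters(), lr=0.05)
for epoch in range(200):
    opt.zero_grad()
    logp = m(x)                             # log probabilities over the vocabulary
    loss = -(y * logp).sum(dim=1).mean()    # cross entropy with one-hot labels
    loss.backward()
    opt.step()
m.getsimilarword('like', 2)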
3.2 NumPy version
import numpy as np
from tqdm import tqdm
from collections import defaultdict

class Word2Vec():
    '''
    NumPy implementation using CBOW.
    eb_len: length of the hidden vector h
    window: window size; the input at position i uses the window one-hot
            vectors on each side of i
    data: training corpus
    self.words: dictionary counting each word's frequency
    self.wordlen: vocabulary size after deduplication
    self.wordsList: deduplicated word list
    self.wordid: dictionary mapping a word to its index in the list
    w1: embedding matrix
    w2: hidden-to-output matrix
    '''
    def __init__(self, eb_len, window, data):
        self.eblen = eb_len
        self.window = window
        self.words = defaultdict(int)
        self.wordlen = 0
        self.wordsList = []
        self.wordid = {}
        self.train_data = []
        self.label = []
        self.progress_data(data)

    def train(self, epoch, lr, batch):
        loss = 0
        for p in tqdm(range(epoch)):
            for d in tqdm(range(batch, len(self.train_data), batch)):
                trdata = np.array(self.train_data[d - batch:d])
                y_true = np.array(self.label[d - batch:d]).reshape(batch, -1)
                h, v, y = self.forward(trdata)
                e = np.subtract(y, y_true)   # (batch, wordlen): softmax error
                self.back(e, h, lr, trdata)
                # cross-entropy loss: log-sum-exp of the scores minus the
                # score of the true word, averaged over the batch
                loss += 1 / batch * np.sum(np.log(np.sum(np.exp(v), axis=-1)))
                loss -= 1 / batch * np.sum([v[o][np.argmax(y_true[o], axis=-1)]
                                            for o in range(len(v))])
            print('epoch: ', p, 'loss: ', loss)
            loss = 0

    def back(self, e, h, lr, x):
        # gradient step on both weight matrices; gradients are summed over
        # the batch, so the learning rate absorbs the scale
        self.w1 = self.w1 - lr * np.mean(np.dot(x.T, np.dot(e, self.w2.T)), axis=1)
        self.w2 = self.w2 - lr * np.dot(h.T, e)

    def getSimilarword(self, x, num):
        '''
        num: how many top matches to return
        x: query word (str)
        simatrix: cosine similarity of x against every embedding
        simvalue: most similar words
        similarity: their similarity scores
        '''
        if x not in self.words:
            print('invalid value!')
        else:
            x_order = self.wordid[x]
            x_eb = self.w1[x_order]
            simatrix = np.dot(self.w1, x_eb) / (np.linalg.norm(x_eb) * np.linalg.norm(self.w1, axis=-1))
            order = np.argsort(simatrix)[::-1][1:num + 1]   # skip the word itself
            simvalue = [self.wordsList[i] for i in order]
            similarity = simatrix[order]
            print(simvalue, '\n', similarity)

    def forward(self, x):
        # x: (n, 2*window, wordlen); the hidden layer averages the context
        h = np.dot(x, self.w1)          # (n, 2*window, eblen)
        h = np.mean(h, axis=1)          # (n, eblen)
        v = np.dot(h, self.w2)          # (n, wordlen)
        ex = np.exp(v - np.max(v, axis=-1).reshape(-1, 1))   # stable softmax
        y = np.true_divide(ex, np.sum(ex, axis=-1).reshape(-1, 1))
        return h, v, y

    # data preprocessing
    def progress_data(self, data):
        for sentence in data:
            for word in sentence.split():
                self.words[word] += 1
        print('original dict has been built')
        self.wordlen = len(self.words.keys())
        self.wordsList = list(self.words.keys())
        self.w1 = np.random.uniform(-1, 1, (self.wordlen, self.eblen))
        self.w2 = np.random.uniform(-1, 1, (self.eblen, self.wordlen))
        # build the word-to-index dictionary
        self.wordid = {}
        for i, word in enumerate(self.words):
            self.wordid[word] = i
        print('word ids built!')
        # one-hot encoding
        onehot = np.eye(self.wordlen)
        print('one-hot encoding finished')
        # build the training set
        train, label = [], []
        for sentence in tqdm(data):
            # convert the sentence to one-hot vectors
            sentence = [onehot[self.wordid[i]] for i in sentence.split()]
            cur = self.window * 2
            while cur < len(sentence):
                # context: window words on each side of the center word
                train.append(sentence[cur - 2 * self.window:cur - self.window]
                             + sentence[cur - self.window + 1:cur + 1])
                # label: the center word at position cur - window
                label.append(sentence[cur - self.window:cur - self.window + 1])
                cur += 1
        self.train_data = train
        self.label = label

    def savemodel(self):
        np.savez_compressed('model.npz', w1=self.w1, w2=self.w2)

    def loadmodel(self, path):
        weights = np.load(path)
        self.w1 = weights['w1']
        self.w2 = weights['w2']
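A corresponding usage sketch for the NumPy class (again, the corpus and hyperparameters are made up for illustration). Note that train steps through the samples in fixed-size batches, so a trailing partial batch is skipped.

corpus = ['we like deep learning', 'we like natural language processing']
model = Word2Vec(eb_len=10, window=1, data=corpus)
model.train(epoch=100, lr=0.05, batch=2)   # samples not filling a batch are skipped
model.getSimilarword('like', 2)
model.savemodel()                          # writes model.npz to the working directory
model.loadmodel('model.npz')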