Implementing word2vec in Python (without a framework)

Two blog posts were used as references:
background knowledge
source-code analysis

import numpy as np
import math

wordHash = {}    # word -> frequency while reading, then word -> leaf index after the tree is built
wordNum = 0      # vocabulary size (number of distinct words)
window = 2       # context window size on each side of the centre word
words = []       # the corpus as a flat list of tokens
vecSize = 100    # dimensionality of the word vectors
u = 0.1          # learning rate
t = 500          # number of training epochs

# read the corpus, count word frequencies and collect the token sequence
def read_file():
    global wordNum, wordHash, words
    with open("test.txt", encoding="utf-8") as f:
        sentences = f.readlines()
    for sentence in sentences:
        tokens = sentence.split()          # split on whitespace and drop the trailing newline
        words += tokens                    # keep the full token sequence for training
        for word in tokens:
            wordHash[word] = wordHash.get(word, 0) + 1
    wordNum = len(wordHash)                # vocabulary size, used everywhere below



# build a Huffman tree over the vocabulary (for hierarchical softmax)
def buildHFMTree():
    global wordHash
    # sort the words by frequency, most frequent first
    vocab = sorted(wordHash.items(), key=lambda item: item[1], reverse=True)
    length = len(vocab) * 2
    weight = [None] * length       # node weights: leaves first, then internal nodes
    parent = [None] * length       # parent index of each node
    pos = [None] * length          # 0/1 code bit of each node relative to its parent
    for i in range(length):
        if i < length // 2:
            wordHash[vocab[i][0]] = i      # reuse wordHash as word -> leaf index
            weight[i] = vocab[i][1]
        else:
            weight[i] = float("inf")       # sentinel for internal nodes that are not built yet
    # two-pointer Huffman construction:
    # lp walks the leaves from least to most frequent, rp walks the internal nodes in creation order
    lp = len(vocab) - 1
    rp = lp + 1
    addp = lp + 1                          # index where the next internal node is created
    while True:
        if lp < 0:
            # all leaves consumed: keep merging internal nodes until only the root remains
            if rp + 1 == addp:
                break
            weight[addp] = weight[rp] + weight[rp + 1]
            pos[rp] = 0
            pos[rp + 1] = 1
            parent[rp] = addp
            parent[rp + 1] = addp
            addp += 1
            rp += 2
            continue
        # pick the two smallest unmerged nodes among the leaves (lp, lp-1) and internal nodes (rp, rp+1)
        if weight[lp] < weight[rp]:
            if lp - 1 >= 0 and weight[lp - 1] < weight[rp]:
                small = lp
                large = lp - 1
                lp = lp - 2
            else:
                small = lp
                large = rp
                lp -= 1
                rp += 1
        else:
            if weight[rp + 1] > weight[lp]:
                small = rp
                large = lp
                lp -= 1
                rp += 1
            else:
                small = rp
                large = rp + 1
                rp += 2
        # merge the two chosen nodes into a new internal node at addp
        weight[addp] = weight[small] + weight[large]
        pos[small] = 0
        pos[large] = 1
        parent[small] = addp
        parent[large] = addp
        addp += 1
    return pos, parent

def sigmoid(n):
    return 1.0 / (1.0 + np.exp(-n))

# walk from a word's leaf up to the root and collect its Huffman code bits
def getHFMCode(word, pos, parent):
    global wordHash
    i = wordHash[word]
    code = []
    while parent[i] is not None:
        code.append(pos[i])
        i = parent[i]
    # note: the bits are collected leaf-to-root, i.e. in reverse of the usual root-to-leaf order
    print("Huffman code of '" + word + "': " + str(code))
    return code
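For a quick sanity check of the tree builder, a tiny run with hypothetical toy counts can be used. Run it instead of the read_file/train part of the script, since it overwrites the wordHash and wordNum globals:

# standalone sanity check with made-up counts (not from the corpus below)
wordHash = {"a": 5, "b": 3, "c": 2, "d": 1}
wordNum = len(wordHash)
pos, parent = buildHFMTree()
for w in ["a", "b", "c", "d"]:
    getHFMCode(w, pos, parent)    # the more frequent the word, the shorter its code

With these counts the code lengths come out as 1, 2, 3 and 3 bits, which is what a Huffman tree over the frequencies 5, 3, 2, 1 should give.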

# accumulate the hierarchical-softmax gradients for one centre word
def updateParam(word, pos, parent, ansVec, projVec, paramVec):
    global wordHash
    i = wordHash[word]
    ll = 0                                            # log-likelihood of this word
    paramChange = np.zeros((vecSize, wordNum - 1))    # accumulated updates for the node parameters
    projChange = np.zeros((vecSize))                  # accumulated update for the projection vector
    while parent[i] is not None:
        d = pos[i]                                    # code bit at this node
        n = ansVec[parent[i] - wordNum]               # theta_j . x_w for internal node j
        try:
            ll += (1 - d) * math.log(sigmoid(n)) + d * math.log(1 - sigmoid(n))
        except ValueError:
            ll += 0                                   # log(0): skip this node's contribution
        m = 1 - d - sigmoid(n)                        # common factor of both gradients
        gradParam = m * projVec                       # gradient w.r.t. the node parameter theta_j
        gradProj = m * paramVec[:, parent[i] - wordNum]   # gradient w.r.t. the projection vector
        paramChange[:, parent[i] - wordNum] += gradParam * u
        projChange += gradProj * u
        i = parent[i]
    return projChange, paramChange, ll
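For reference, this is the standard hierarchical-softmax objective. Writing x_w for the averaged context (projection) vector, theta_j for the parameter vector of internal node j on the word's path and d_j for the code bit at that node, updateParam accumulates

    ll = Σ_j [ (1 − d_j) · log σ(θ_jᵀ x_w) + d_j · log(1 − σ(θ_jᵀ x_w)) ]

whose gradients are ∂ll/∂θ_j = (1 − d_j − σ(θ_jᵀ x_w)) · x_w and ∂ll/∂x_w = (1 − d_j − σ(θ_jᵀ x_w)) · θ_j. The common factor 1 − d_j − σ(θ_jᵀ x_w) is exactly the variable m in the code, and both gradients are scaled by the learning rate u before being accumulated.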


# initialise the word vectors with small random values, as in the original word2vec
def initVec():
    global wordNum, vecSize
    wordVec = (np.random.random((wordNum, vecSize)) - 0.5) / vecSize
    paramVec = np.zeros((vecSize, wordNum - 1))    # one column per internal tree node
    return wordVec, paramVec

# CBOW training loop: predict each centre word from the average of its context word vectors
def train(wordVec, paramVec, pos, parent):
    for k in range(t + 1):
        paramChange = np.zeros((vecSize, wordNum - 1))
        wordChange = np.zeros((wordNum, vecSize))
        for i in range(len(words)):
            # average the vectors of the words inside the window around position i
            projVec = np.zeros(vecSize)
            n = 0
            for j in range(i - window, i + window + 1):
                if j < 0 or j >= len(words) or j == i:
                    continue
                projVec += wordVec[wordHash[words[j]]]
                n += 1
            if n == 0:
                continue
            projVec = projVec / n
            ansVec = projVec.dot(paramVec)    # dot products with every internal-node parameter
            projChange1, paramChange1, ll = updateParam(words[i], pos, parent, ansVec, projVec, paramVec)
            # propagate the projection-vector update back to every context word
            for j in range(i - window, i + window + 1):
                if j < 0 or j >= len(words) or j == i:
                    continue
                wordChange[wordHash[words[j]]] += projChange1
            paramChange += paramChange1
            if k % 100 == 0:
                print("epoch " + str(k) + ": log-likelihood of word '" + words[i] + "' is " + str(ll))
        # batch update after each pass over the corpus
        wordVec += wordChange
        paramVec += paramChange

print("开始读取单词")
read_file()
print("读取单词结束")
print("开始构建哈弗曼树")
pos,parent = buildHFMTree()
print("构建完成")
#getHFMCode('interpretations',pos,parent)
print("开始初始化单词向量")
wordVec,paramVec = initVec()
print("单词向量初始完成")
print("准备训练参数")
train(wordVec,paramVec,pos,parent)
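Once training finishes, the rows of wordVec can be queried directly. A minimal sketch of looking up a word's nearest neighbours by cosine similarity (the helper most_similar is my own addition, not part of the original script):

def most_similar(word, topn=5):
    # cosine similarity between the query word's vector and every row of wordVec
    v = wordVec[wordHash[word]]
    sims = wordVec.dot(v) / (np.linalg.norm(wordVec, axis=1) * np.linalg.norm(v) + 1e-8)
    ranked = sorted(wordHash.items(), key=lambda item: -sims[item[1]])
    return [(w, float(sims[i])) for w, i in ranked if w != word][:topn]

print(most_similar("translation"))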

Corpus:

In the near future the translation history will only be viewable when you log in to your account and it will be centrally managed in my activity record. This upgrade will clear the previous history so if you want the system to record certain translations for future review please be sure to save the translation results

Running results:
(screenshots of the training output)
Shortcomings
Compared with the original word2vec source code, this implementation leaves out negative sampling, multithreading, and the precomputed exponential (sigmoid) table used to speed up training; a sketch of that last trick follows below.
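As a rough sketch of the exponential-table trick, the original C code precomputes sigmoid values on a fixed grid and clips large inputs; the constants below (table size 1000, clipping at ±6) follow its EXP_TABLE_SIZE and MAX_EXP defaults, and the function name fast_sigmoid is my own:

EXP_TABLE_SIZE = 1000
MAX_EXP = 6
# precomputed sigmoid values on an evenly spaced grid over [-MAX_EXP, MAX_EXP]
expTable = 1.0 / (1.0 + np.exp(-np.linspace(-MAX_EXP, MAX_EXP, EXP_TABLE_SIZE)))

def fast_sigmoid(x):
    # saturate outside the table range
    if x >= MAX_EXP:
        return 1.0
    if x <= -MAX_EXP:
        return 0.0
    # map x from [-MAX_EXP, MAX_EXP] to a table index and look it up
    idx = int((x + MAX_EXP) * (EXP_TABLE_SIZE - 1) / (2 * MAX_EXP))
    return expTable[idx]

Swapping this lookup in for sigmoid trades a little accuracy for avoiding an exp call on every path node.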
