word2vec

word2vec 1: PyTorch version

In the earlier text-classification work, the embeddings, whether hand-rolled or taken from PyTorch's built-in nn.Embedding, were still just random numbers. If we can actually train such a model, the results will obviously be better. The word2vec we study here does exactly that: it trains an embedding matrix, so it can loosely be counted as a pretrained model.

Then why say it is not really a pretrained model? Because a true pretrained model can keep adjusting its word vectors on your own data, which is called fine-tuning, whereas once this word-vector model has been trained, fine-tuning it further is difficult.

Building the task: given the current word, predict the words in a small window around it (a skip-gram style setup).

Code outline: first read the data and put the tokenized sentences into a list. Then build the model; here we write it in PyTorch, and it is essentially two linear layers. We also build a vocabulary word_2_index first, which fixes the size of the tensors passed into the model, i.e., the sizes of the linear layers.
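To make the task concrete, here is a minimal sketch (with made-up tokens) of the training pairs that the loop in the full code produces for a window of n_gram = 2: each (current word, surrounding word) pair is one training example, with the current word as input and the surrounding word as the label.

# Minimal sketch with made-up tokens; the pairing logic matches the loop below.
text = ["函数", "在", "闭", "区间", "上", "连续"]
n_gram = 2
for ni, now_word in enumerate(text):
    # surrounding words: up to n_gram words on each side of the current word
    other_words = text[max(ni - n_gram, 0):ni] + text[ni + 1:ni + 1 + n_gram]
    for other_word in other_words:
        print(now_word, "->", other_word)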

import pandas as pd
import os
import jieba
import torch
import torch.nn as nn
from tqdm import tqdm

def read_data(path):
    text = pd.read_csv(path,encoding="gbk",names=["text"])["text"].tolist()
    # names=["text"] supplies the column name explicitly, so the first line of
    # the file is not treated as a header row.
    # Stop words could also be removed here; in this first version we keep them.
    result = []
    for t in text:
        tc = jieba.lcut(t)
        result.append(tc)
    return result


def build_word(train_text):
    word_2_index = {"UNK":0}
    for text in train_text:
        for word in text:
            if word not in word_2_index:
                word_2_index[word] = len(word_2_index)
    return word_2_index


class Word2Vec(nn.Module):
    def __init__(self,word_size,embedding_num):
        super().__init__()
        self.w1 = nn.Linear(word_size,embedding_num)
        self.w2 = nn.Linear(embedding_num,word_size)

        self.log_softmax = nn.LogSoftmax(dim=-1)  # dim=-1: softmax over the last dimension (the vocabulary)
        self.loss_fun1 = nn.NLLLoss()

        # self.loss_fun2 = nn.CrossEntropyLoss()
        # CrossEntropyLoss already combines log-softmax and NLLLoss, so with it
        # neither of them would need to be written out separately.

    def forward(self,x,label):
        h = self.w1.forward(x)
        p = self.w2(h)

        p2 = self.log_softmax(p)
        loss = self.loss_fun1(p2,label)
        # loss2 = self.loss_fun2(p,label)

        return loss


def word_2_onehot(word):
    global word_2_index,device
    word_idx = word_2_index.get(word,0)
    word_onehot = torch.zeros((1,len(word_2_index)),dtype=torch.float32,device=device)
    word_onehot[0,word_idx] = 1
    return word_onehot

if __name__ == "__main__":
    all_data = read_data(os.path.join("..", "data", "word2vec_data", "数学原始数据.csv"))
    word_2_index = build_word(all_data)


    epoch = 100
    batch_size = 10
    lr = 0.001
    embedding_num = 99
    n_gram = 2

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model = Word2Vec(len(word_2_index),embedding_num).to(device)
    opt = torch.optim.Adam(model.parameters(),lr=lr)

    for e in range(epoch):
        for text in tqdm(all_data):
            for ni,now_word in enumerate(text):
                other_words = text[max(ni-n_gram,0):ni] + text[ni+1:ni+1+n_gram]

                now_word_onehot = word_2_onehot(now_word)
                for other_word in other_words:
                    other_word_idx = torch.tensor(word_2_index.get(other_word,0),device=device).reshape(-1)  # default 0 = UNK
                    # other_word_onehot = word_2_onehot(other_word)
                    loss = model.forward(now_word_onehot,other_word_idx)
                    loss.backward()
                    opt.step()
                    opt.zero_grad()
# Note that other_word_idx is a plain class index, not a one-hot vector: NLLLoss
# (and CrossEntropyLoss) take the target as an index and pick out the corresponding
# log-probability directly, so the label never needs to be converted to one-hot.

                # print(f"current word: {now_word}, surrounding words: {other_words}")

# This version turns out to be far too slow: the inner loop over the surrounding
# words runs one forward/backward pass per pair. In the next version we concatenate
# the surrounding words into a batch and repeat the current word's one-hot input to match.
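Before moving on, a quick check of the comment in the model above: CrossEntropyLoss is numerically the same as LogSoftmax followed by NLLLoss, and both take plain class indices as targets. A small self-contained check with made-up shapes:

import torch
import torch.nn as nn

logits = torch.randn(4, 7)            # 4 samples, a "vocabulary" of 7 words
target = torch.tensor([3, 0, 6, 2])   # class indices, not one-hot vectors

nll = nn.NLLLoss()(nn.LogSoftmax(dim=-1)(logits), target)
ce = nn.CrossEntropyLoss()(logits, target)
print(torch.allclose(nll, ce))        # True: the two formulations agree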

word2vec 2: PyTorch version

The first version is far too slow, so we change the training strategy: the inner loop over the surrounding words is the bottleneck, so we concatenate the surrounding words of each current word into a single batch and duplicate the current word's one-hot input accordingly.

To make the code faster still, we also filter out stop words.
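A small sketch of the shapes involved in this batched update (vocabulary size, indices, and window size are made-up numbers): the current word's one-hot is repeated once per surrounding word, and a single CrossEntropyLoss call averages over all the pairs, so there is only one backward/step per current word instead of one per pair.

import torch
import torch.nn as nn

V, K = 1000, 4                        # hypothetical vocabulary size and number of surrounding words
now_word_onehot = torch.zeros(1, V)
now_word_onehot[0, 42] = 1            # made-up index of the current word
batch = now_word_onehot.repeat(K, 1)  # shape (K, V): one copy per surrounding word

w1 = nn.Linear(V, 100)
w2 = nn.Linear(100, V)
logits = w2(w1(batch))                # shape (K, V)

labels = torch.tensor([7, 13, 99, 512])       # indices of the K surrounding words
loss = nn.CrossEntropyLoss()(logits, labels)  # one averaged loss, one backward pass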

Finally, we save the trained word2vec.pt (see the short saving sketch after the code below).

import pandas as pd
import os
import jieba
import torch
import torch.nn as nn
from tqdm import tqdm

def read_data(path,num=None):
    stop_words = get_stop_word(os.path.join("..", "data", "word2vec_data", "stopwords.txt"))
    text = pd.read_csv(path,encoding="gbk",names=["text"])["text"].tolist()
    result = []
    for t in text:
        tc = jieba.lcut(t)
        tc = [i for i in tc if i not in stop_words]
        result.append(tc)

    if num is None:
        return result
    else:
        return result[:num]

def get_stop_word(path):
    with open(path,"r",encoding="utf-8") as f:
        return f.read().split("\n")


def build_word(train_text):
    word_2_index = {"UNK":0}
    for text in train_text:
        for word in text:
            if word not in word_2_index:
                word_2_index[word] = len(word_2_index)
    return word_2_index


class Word2Vec(nn.Module):
    def __init__(self,word_size,embedding_num):
        super().__init__()
        self.w1 = nn.Linear(word_size,embedding_num)
        self.w2 = nn.Linear(embedding_num,word_size)

        # self.log_softmax = nn.LogSoftmax(dim=-1)
        # self.loss_fun1 = nn.NLLLoss()

        self.loss_fun2 = nn.CrossEntropyLoss()

    def forward(self,x,label):
        h = self.w1.forward(x)
        p = self.w2(h)

        # p2 = self.log_softmax(p)
        loss = self.loss_fun2(p,label)

        return loss


def word_2_onehot(word):
    global word_2_index,device
    word_idx = word_2_index.get(word,0)
    word_onehot = torch.zeros((1,len(word_2_index)),dtype=torch.float32,device=device)
    word_onehot[0,word_idx] = 1
    return word_onehot

if __name__ == "__main__":
    all_data = read_data(os.path.join("..", "data", "word2vec_data", "数学原始数据.csv"))
    word_2_index = build_word(all_data)


    epoch = 2
    batch_size = 10
    lr = 0.02
    embedding_num = 100
    n_gram = 5

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model = Word2Vec(len(word_2_index),embedding_num).to(device)
    opt = torch.optim.Adam(model.parameters(),lr=lr)

    for e in range(epoch):
        for text in tqdm(all_data):
            for ni,now_word in enumerate(text):
                other_words = text[max(ni-n_gram,0):ni] + text[ni+1:ni+1+n_gram]

                now_word_onehot = word_2_onehot(now_word)
                now_word_onehot = now_word_onehot.repeat(len(other_words),1)

                other_words_idx = torch.tensor([word_2_index.get(i,0) for i in other_words],device=device,dtype=torch.int64)  # default 0 = UNK
                loss = model.forward(now_word_onehot,other_words_idx)
                loss.backward()

                opt.step()
                opt.zero_grad()
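The code above never actually writes word2vec.pt, so to match the note before the code, one way to finish is sketched below as a continuation of the __main__ block. The file name follows the note above; treating column i of w1.weight as the embedding of word i is the standard reading of this architecture, since a one-hot input simply selects that column.

    # --- continuation of the __main__ block, after the training loop (a sketch) ---
    torch.save(model.state_dict(), "word2vec.pt")  # persist the trained weights

    # w1 maps a one-hot vector to its hidden representation, so (ignoring the bias)
    # column i of w1.weight is the embedding of word i.
    embedding_matrix = model.w1.weight.data.T  # shape: (len(word_2_index), embedding_num)
    print(embedding_matrix.shape)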


   