无网络环境下配置并运行 word2vec复现.py

最新推荐文章于 2025-05-09 23:54:54 发布

风筝超冷

最新推荐文章于 2025-05-09 23:54:54 发布

阅读量311

点赞数 4

文章标签： python linux 开发语言

本文链接：https://blog.csdn.net/qq_60245590/article/details/147775525

版权

需运行文件

# -*- coding: utf-8 -*-
import torch
import pandas as pd
import jieba
import torch
import torch.nn as nn
from tqdm import tqdm
from torch.utils.data import DataLoader,Dataset
from transformers import AutoTokenizer,AutoModel

def get_stop_word():
    """Load the Baidu stop-word list from disk.

    Reads ``../data/baidu_stopwords.txt`` (relative to the current working
    directory) as UTF-8 and splits the text on newline characters.

    Returns:
        list[str]: one entry per line of the file. If the file ends with a
        newline, the last entry is an empty string.
    """
    stop_word_path = "../data/baidu_stopwords.txt"
    with open(stop_word_path, encoding="utf-8") as stop_file:
        raw_text = stop_file.read()
    return raw_text.split("\n")

def read_data(n=3):
    """Tokenise the corpus and drop filtered and rare words.

    Reads ``../data/数学原始数据.csv`` (GBK encoded, loaded as a single column
    named ``data``), segments every row with ``jieba.posseg`` and keeps a
    word only when its part-of-speech tag is not in the exclusion set and
    the word is not in the module-level ``stop_words`` collection. Words
    whose total count over the whole corpus is below ``n`` are then removed.

    Args:
        n: minimum corpus-wide frequency a word needs in order to be kept.

    Returns:
        list[list[str]]: one list of surviving words per input row, in the
        original row and word order. Rows may end up empty.

    Raises:
        NameError: if the module-level ``stop_words`` has not been defined
            before this function is called.
    """
    import jieba.posseg as psg

    all_data = pd.read_csv("../data/数学原始数据.csv", names=["data"], encoding="gbk")
    all_data = all_data["data"].tolist()

    # Part-of-speech tags whose words are discarded.
    # NOTE(review): presumably punctuation/non-words, conjunctions, numerals,
    # adverbs, particles and pronouns in jieba's tag set - confirm.
    no_t = {"x", "c", "m", "d", "uj", "r", ""}

    # Built once: `stop_words` is a list in the calling script, so testing
    # every token against it directly would be a linear scan per token.
    stop_set = set(stop_words)

    result = []
    word_fre = {}
    for data in all_data:
        new_word = []
        for word, t in psg.lcut(data):
            if t in no_t or word in stop_set:
                continue
            word_fre[word] = word_fre.get(word, 0) + 1
            new_word.append(word)
        result.append(new_word)

    # Second pass: frequencies are only final once every row has been seen.
    return [[word for word in words if word_fre[word] >= n] for words in result]



def build_data(all_data, window=None):
    """Build (center word, context word) training pairs.

    For every word in every sentence, pairs it with each word that lies at
    most ``window`` positions to its left or right inside the same sentence.
    Left-hand neighbours are emitted before right-hand neighbours.

    Args:
        all_data: iterable of sentences, each a list of words.
        window: number of neighbours taken on each side. When ``None``
            (the default) the module-level ``n_gram`` value is used, which
            is what existing callers rely on.

    Returns:
        list[tuple]: ``(center, context)`` pairs in sentence order. Empty
        input, or sentences with a single word, contribute no pairs.
    """
    if window is None:
        window = n_gram

    result = []
    for data in all_data:
        for ni, now_word in enumerate(data):
            # max(..., 0) stops a negative start from wrapping around to the
            # end of the sentence.
            left = data[max(ni - window, 0):ni]
            right = data[ni + 1:ni + 1 + window]
            result.extend((now_word, o) for o in left + right)
    return result


class MyDataset(Dataset):
    """Dataset over (word, word) pairs that yields token ids.

    Each item of ``all_data`` is indexed at positions 0 and 1; both entries
    are passed through the module-level ``tokenizer`` and the first id of
    each resulting ``input_ids`` list is returned.
    """

    def __init__(self, all_data):
        # Sequence of pairs; stored as-is, no copy is made.
        self.all_data = all_data

    def __len__(self):
        return len(self.all_data)

    def __getitem__(self, index):
        pair = self.all_data[index]
        # Only the first id is kept for each word.
        # NOTE(review): this presumably relies on every word mapping to a
        # single token - confirm against how the tokenizer is prepared.
        return tuple(
            tokenizer(word)["input_ids"][0] for word in (pair[0], pair[1])
        )

class Model(nn.Module):
    """Two-layer word2vec-style model seeded from a pretrained embedding.

    ``linear1`` maps a one-hot vector over the vocabulary to an embedding
    and ``linear2`` maps that embedding back to vocabulary logits. The
    columns of ``linear1.weight`` are initialised from the token embedding
    matrix of the pretrained model in ``../model/Qwen2.5-0.5B-Instruct``.

    Construction reads the module-level ``corpus_len`` (vocabulary size)
    and ``emb_dim`` (embedding width); ``emb_dim`` must equal the width of
    the pretrained embedding matrix or the copy fails.
    """

    def __init__(self):
        super().__init__()
        # Kept as an attribute as in the original; it is not used in forward.
        self.base_model = AutoModel.from_pretrained("../model/Qwen2.5-0.5B-Instruct")
        self.linear1 = nn.Linear(corpus_len, emb_dim)

        # Copy as many pretrained rows as both matrices have. A fixed column
        # count raises a shape mismatch when corpus_len is smaller than the
        # number of rows in the pretrained matrix.
        pretrained = self.base_model.embed_tokens.weight.data
        n_copy = min(pretrained.shape[0], corpus_len)
        with torch.no_grad():
            self.linear1.weight[:, :n_copy].copy_(pretrained[:n_copy].T)

        self.linear2 = nn.Linear(emb_dim, corpus_len)
        # Only the weight is frozen; linear2.bias stays trainable.
        # NOTE(review): freezing the randomly initialised output projection
        # rather than the pretrained input embedding looks unusual - confirm
        # this is intended.
        self.linear2.weight.requires_grad = False
        self.loss_fun = nn.CrossEntropyLoss()

    def forward(self, batch_w1_index, batch_w2_index):
        """Return the cross-entropy loss of predicting word 2 from word 1.

        Args:
            batch_w1_index: vocabulary indices of the input words, one per
                sample (tensor or sequence of ints).
            batch_w2_index: tensor of target vocabulary indices, one per
                sample.

        Returns:
            Scalar loss tensor.
        """
        weight = self.linear1.weight
        indices = torch.as_tensor(batch_w1_index, dtype=torch.long, device=weight.device)

        # One row per sample with a single 1 at that sample's own index.
        # Assigning the whole index batch into every row would make all rows
        # the same multi-hot vector.
        word1_onehot = nn.functional.one_hot(
            indices, num_classes=self.linear1.in_features
        ).to(weight.dtype)

        h = self.linear1(word1_onehot)
        predict = self.linear2(h)

        return self.loss_fun(predict, batch_w2_index)


def add_word(all_data, tok=None):
    """Register every corpus word that is not a single token.

    Collects the distinct words of ``all_data`` and, for each one whose
    ``input_ids`` does not have length 1, calls ``add_tokens`` on the
    tokenizer, which is modified in place.

    Words are visited in sorted order. Iterating a plain ``set`` of strings
    follows hash order, which changes between interpreter runs, so the
    order in which tokens get added would not be reproducible.

    Args:
        all_data: iterable of sentences, each an iterable of words.
        tok: tokenizer to extend. When ``None`` (the default) the
            module-level ``tokenizer`` is used, as existing callers expect.

    Returns:
        None.
    """
    if tok is None:
        tok = tokenizer

    vocabulary = sorted({word for words in all_data for word in words})

    for word in vocabulary:
        if len(tok(word)["input_ids"]) != 1:
            tok.add_tokens(word)



if __name__ == "__main__":
    # Training script. The names assigned here are module-level globals
    # that the functions and classes above read directly: stop_words,
    # n_gram, tokenizer, corpus_len and emb_dim.

    n_gram = 1          # context words taken on each side of the center word
    batch_size = 100
    epoch = 10
    emb_dim = 896       # must equal the pretrained embedding width copied in Model
    lr = 0.01

    grad_acc = 1        # optimizer steps once every `grad_acc` batches

    stop_words = get_stop_word()
    stop_words = stop_words + ["。","，","（","）"]
    all_data = read_data()
    rel_words = build_data(all_data)

    tokenizer = AutoTokenizer.from_pretrained("../model/Qwen2.5-0.5B-Instruct")
    add_word(all_data)

    # Measured after add_word so that newly added tokens are counted.
    corpus_len = len(tokenizer.get_vocab())

    train_dataset = MyDataset(rel_words)

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)

    model = Model()
    opt = torch.optim.Adam(model.parameters(), lr=lr)

    for e in range(epoch):
        loss = None  # stays None if the loader yields no batch
        # Wrap the loader itself so tqdm can read its length.
        progress = tqdm(train_dataloader, desc=f"epoch {e + 1}/{epoch}")
        for batch_idx, (batch_w1_index, batch_w2_index) in enumerate(progress, start=1):
            # Call the module, not .forward(), so nn.Module hooks run.
            loss = model(batch_w1_index, batch_w2_index)
            loss.backward()

            if batch_idx % grad_acc == 0:
                opt.step()
                opt.zero_grad()

        # Loss of the last batch of the epoch.
        print(loss)

创建和激活虚拟环境（可选）

python3 -m venv word2vec_offline
source word2vec_offline/bin/activate

安装依赖

pip install torch pandas jieba tqdm transformers

1. 下载依赖的离线安装包

在有网络的机器上，执行：

mkdir offline_pkgs
pip download torch pandas jieba tqdm transformers -d offline_pkgs

这样会把所有依赖包（包括依赖的依赖）下载到 offline_pkgs 文件夹。注意：联网机器的操作系统、CPU 架构和 Python 版本需要与无网络的机器一致，否则下载到的安装包可能无法在目标机器上安装。

2. 拷贝依赖和项目文件到无网络环境

拷贝 offline_pkgs 文件夹到无网络环境

拷贝你的 word2vec复现.py 以及所需的 ../data/、../model/ 文件夹

3. 在无网络环境下新建虚拟环境

python3 -m venv venv
source venv/bin/activate

4. 离线安装依赖

在 offline_pkgs 文件夹所在的目录下（不要进入该文件夹，否则 --find-links=offline_pkgs 会找不到路径），执行：

pip install --no-index --find-links=offline_pkgs torch pandas jieba tqdm transformers

如果有依赖报错，先安装报错的依赖，再装主包。

5. 检查依赖安装

pip list

确认 torch、pandas、jieba、tqdm、transformers 都已安装。

6. 运行你的代码

确保你在虚拟环境中，且数据和模型路径正确：

python word2vec复现.py