需运行文件
# -*- coding: utf-8 -*-
import torch
import pandas as pd
import jieba
import torch
import torch.nn as nn
from tqdm import tqdm
from torch.utils.data import DataLoader,Dataset
from transformers import AutoTokenizer,AutoModel
def get_stop_word(path="../data/baidu_stopwords.txt"):
    """Load the stop-word list from a UTF-8 text file, one entry per line.

    Args:
        path: Location of the stop-word file. Defaults to the Baidu
            stop-word list used by this script.

    Returns:
        A list of stop words in file order. Surrounding whitespace is
        stripped from every line and blank lines are skipped, so a trailing
        newline no longer produces an empty-string entry.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(path, encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        return [word for word in stripped_lines if word]
def read_data(n=3, path="../data/数学原始数据.csv", encoding="gbk"):
    """Read the raw corpus, segment it with jieba and drop unwanted words.

    Each CSV row is treated as one text. A token is kept when its
    part-of-speech tag is not in the skip list, it is not a stop word, and
    it occurs at least ``n`` times across the whole corpus (counted after
    the first two filters).

    Relies on the module-level ``stop_words`` collection being defined
    before the call.

    Args:
        n: Minimum corpus frequency a word needs in order to be kept.
        path: CSV file holding one text per row in its single column.
        encoding: Text encoding of the CSV file.

    Returns:
        A list with one entry per text; each entry is the list of kept
        words in their original order (possibly empty).
    """
    from collections import Counter

    import jieba.posseg as psg

    frame = pd.read_csv(path, names=["data"], encoding=encoding)
    # Rows that parse as NaN cannot be segmented, so they are skipped.
    texts = frame["data"].dropna().astype(str).tolist()

    # POS tags to discard. NOTE(review): in jieba's tag set these look like
    # punctuation/non-words, conjunctions, numerals, adverbs, the particle
    # "uj" and pronouns - confirm against the jieba documentation.
    skipped_tags = frozenset(["x", "c", "m", "d", "uj", "r", ""])
    # A set makes the per-token membership test O(1) instead of scanning a list.
    blocked = set(stop_words)

    sentences = [
        [
            word
            for word, tag in psg.lcut(text)
            if tag not in skipped_tags and word not in blocked
        ]
        for text in texts
    ]

    frequency = Counter(word for sentence in sentences for word in sentence)
    return [
        [word for word in sentence if frequency[word] >= n]
        for sentence in sentences
    ]
def build_data(all_data, window=None):
    """Build (centre word, context word) training pairs.

    For every word in every sentence, each neighbour at most ``window``
    positions to its left or right produces one pair.

    Args:
        all_data: List of sentences, each a list of words.
        window: Number of neighbours taken on each side. When omitted, the
            module-level ``n_gram`` value is used, as before.

    Returns:
        A list of ``(centre, context)`` tuples in sentence order; left
        neighbours come before right neighbours for each centre word.
    """
    if window is None:
        window = n_gram

    pairs = []
    for sentence in all_data:
        for pos, centre in enumerate(sentence):
            left = sentence[max(pos - window, 0):pos]
            right = sentence[pos + 1:pos + 1 + window]
            pairs.extend((centre, neighbour) for neighbour in left + right)
    return pairs
class MyDataset(Dataset):
    """Dataset of (centre word, context word) pairs mapped to token ids.

    Args:
        all_data: Sequence of ``(word1, word2)`` string pairs.
        tokenizer: Callable that maps a string to a mapping with an
            ``"input_ids"`` list. When omitted, the module-level
            ``tokenizer`` is looked up at item-access time, as before.
    """

    def __init__(self, all_data, tokenizer=None):
        self.all_data = all_data
        # Keeping the tokenizer on the instance removes the reliance on a
        # global that only exists when the file runs as a script.
        self._tokenizer = tokenizer

    def __len__(self):
        return len(self.all_data)

    def __getitem__(self, index):
        """Return the first token id of each word in pair ``index``."""
        tok = self._tokenizer if self._tokenizer is not None else tokenizer
        first_word, second_word = self.all_data[index][0], self.all_data[index][1]
        # Only the first id is used; this assumes every word encodes to a
        # single token - TODO confirm the vocabulary was extended beforehand.
        word1_idx = tok(first_word)["input_ids"][0]
        word2_idx = tok(second_word)["input_ids"][0]
        return word1_idx, word2_idx
class Model(nn.Module):
    """Two-layer skip-gram style network initialised from Qwen embeddings.

    ``linear1`` maps a centre-word id to a hidden vector and ``linear2``
    maps that vector to one score per vocabulary entry. The sizes come from
    the module-level ``corpus_len`` and ``emb_dim`` values, which must be
    defined before the model is constructed.
    """

    def __init__(self):
        super().__init__()
        self.base_model = AutoModel.from_pretrained("../model/Qwen2.5-0.5B-Instruct")
        self.linear1 = nn.Linear(corpus_len, emb_dim)

        # Seed linear1 with the pretrained input embeddings. Only the rows
        # both tables have are copied, so the copy works whether the
        # extended vocabulary is smaller or larger than the pretrained
        # embedding matrix; remaining columns keep their random init.
        pretrained = self.base_model.embed_tokens.weight.detach()
        shared = min(corpus_len, pretrained.shape[0])
        with torch.no_grad():
            self.linear1.weight[:, :shared].copy_(pretrained[:shared].T)

        self.linear2 = nn.Linear(emb_dim, corpus_len)
        # The output projection weight is frozen (its bias stays trainable).
        # NOTE(review): confirm that freezing linear2 rather than linear1
        # is intended.
        self.linear2.weight.requires_grad = False
        self.loss_fun = nn.CrossEntropyLoss()

    def forward(self, batch_w1_index, batch_w2_index):
        """Return the cross-entropy loss of predicting word2 from word1.

        Args:
            batch_w1_index: 1-D batch of centre-word ids.
            batch_w2_index: 1-D batch of context-word ids (the targets).

        Returns:
            A scalar loss tensor.
        """
        weight = self.linear1.weight
        centre = torch.as_tensor(batch_w1_index, dtype=torch.long, device=weight.device)
        target = torch.as_tensor(batch_w2_index, dtype=torch.long, device=weight.device)

        # Equivalent to linear1(one_hot(centre)): a one-hot row selects one
        # column of the weight matrix. Each sample uses only its own id,
        # and no dense batch x vocabulary tensor is allocated.
        h = weight.t()[centre] + self.linear1.bias
        predict = self.linear2(h)
        return self.loss_fun(predict, target)
def add_word(all_data, tokenizer=None):
    """Extend the tokenizer so that every corpus word is a single token.

    Args:
        all_data: List of sentences, each a list of words.
        tokenizer: Object that is callable on a string (returning a mapping
            with ``"input_ids"``) and has ``add_tokens``. When omitted, the
            module-level ``tokenizer`` is used, as before.

    Returns:
        The number of tokens added, as reported by ``add_tokens``.
    """
    tok = tokenizer if tokenizer is not None else globals()["tokenizer"]

    # Sorting makes the ids given to new tokens reproducible; iterating a
    # set of strings directly changes order from one run to the next.
    vocabulary = sorted({word for sentence in all_data for word in sentence})

    total_added = 0
    while True:
        # Re-check every word on each pass: a newly added token can split a
        # word that previously encoded to a single id.
        pending = [
            word for word in vocabulary
            if len(tok(word)["input_ids"]) != 1
        ]
        if not pending:
            break
        # One batched call instead of one call per word.
        added = tok.add_tokens(pending)
        if not added:
            # Nothing new was registered; stop rather than loop forever.
            break
        total_added += added
    return total_added
if __name__ == "__main__":
    # Script entry point: build the corpus, extend the tokenizer, then train.
    # The names assigned here are module globals that the functions and
    # classes in this file read (stop_words, n_gram, tokenizer, corpus_len,
    # emb_dim), so they must keep these names.

    # NOTE(review): `aaa` is not read anywhere in the visible code.
    aaa = 10
    n_gram = 1
    batch_size = 100
    epoch = 10
    emb_dim = 896
    lr = 0.01
    grad_acc = 1

    stop_words = get_stop_word()
    stop_words = stop_words + ["。", ",", "(", ")"]

    all_data = read_data()
    rel_words = build_data(all_data)

    tokenizer = AutoTokenizer.from_pretrained("../model/Qwen2.5-0.5B-Instruct")
    add_word(all_data)
    # Vocabulary size is measured after the corpus words have been added.
    corpus_len = len(tokenizer.get_vocab())

    train_dataset = MyDataset(rel_words)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)

    model = Model()
    opt = torch.optim.Adam(model.parameters(), lr=lr)

    for e in range(epoch):
        # Wrapping the loader (not the enumerate object) lets tqdm show the
        # total number of batches.
        progress = tqdm(train_dataloader)
        for batch_idx, (batch_w1_index, batch_w2_index) in enumerate(progress, start=1):
            # Call the module rather than .forward() so hooks are honoured.
            loss = model(batch_w1_index, batch_w2_index)
            # Scale so accumulated gradients average over grad_acc batches.
            (loss / grad_acc).backward()
            if batch_idx % grad_acc == 0:
                opt.step()
                opt.zero_grad()

        # Apply gradients left over when the batch count is not a multiple
        # of grad_acc, so they do not leak into the next epoch.
        if len(train_dataloader) % grad_acc != 0:
            opt.step()
            opt.zero_grad()

        print(loss)
创建和激活虚拟环境(可选)
python3 -m venv word2vec_offline
source word2vec_offline/bin/activate
安装依赖
pip install torch pandas jieba tqdm transformers
1. 下载依赖的离线安装包
在有网络的机器上,执行:
mkdir offline_pkgs
pip download torch pandas jieba tqdm transformers -d offline_pkgs
这样会把所有依赖包(包括依赖的依赖)下载到 offline_pkgs 文件夹。注意:联网机器与无网络机器的操作系统、CPU 架构和 Python 版本应保持一致,否则下载的安装包可能无法在离线环境中安装。
2. 拷贝依赖和项目文件到无网络环境
- 拷贝 offline_pkgs 文件夹到无网络环境
- 拷贝你的 word2vec复现.py 以及所需的 ../data/、../model/ 文件夹
3. 在无网络环境下新建虚拟环境
python3 -m venv venv
source venv/bin/activate
4. 离线安装依赖
在 offline_pkgs 文件夹的上一级目录(即包含该文件夹的目录)下执行:
pip install --no-index --find-links=offline_pkgs torch pandas jieba tqdm transformers
如果有依赖报错,先安装报错的依赖,再装主包。
5. 检查依赖安装
pip list
确认 torch、pandas、jieba、tqdm、transformers 都已安装。
6. 运行你的代码
确保你在虚拟环境中,且数据和模型路径正确:
python word2vec复现.py