pytorch-task4

import os
import csv
import jieba
import warnings
import numpy as np
import pandas as pd
from tqdm import tqdm
from gensim.models import Word2Vec
import torch
import torch.nn as nn
from torch.optim import Adam

warnings.filterwarnings("ignore")

class_num = 10
batch_size = 256
maxlen = 500
word2vec_size = 100

train_dir = './data/cnews/cnews.train.txt'
valid_dir = './data/cnews/cnews.val.txt'
test_dir = './data/cnews/cnews.test.txt'
word2vec_dir = './word2vec/word2vec.hdf5'
userdict_dir = './dict/userdict.txt'
stopword_dir = './dict/stopword.txt'

def cut_word(x, stop_word):
    # tokenize with jieba, dropping stop words and single-character tokens
    words = []
    for word in jieba.cut(x):
        if word not in stop_word and len(word) != 1:
            words.append(word)
    return words
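
# Illustrative only: a toy call to cut_word (hypothetical output; the actual
# tokens depend on jieba's dictionary and the stop-word list loaded below)
print(cut_word('今天天气不错', stop_word=[]))  # e.g. ['今天', '天气', '不错']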


def get_word_vocab(content):
    # collect the distinct words across all tokenized sentences
    word_vocab = []
    for sentence in content:
        word_vocab.extend(set(sentence))
    return list(set(word_vocab))


def get_x(content, word_index):
    # note: unused below; DataLoader.pad_sequences performs the same padding
    X = np.zeros((len(content), maxlen), dtype=np.int64)  # np.array((...)) was a bug
    for i in range(len(content)):
        for j in range(min(len(content[i]), maxlen)):
            X[i][j] = word_index[content[i][j]]
    return X


def get_label_vector(label):
    # map each distinct label to a class index; CrossEntropyLoss expects
    # integer class targets, not the one-hot columns pd.get_dummies produces
    return {lab: i for i, lab in enumerate(sorted(set(label)))}
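
# Quick illustrative check of the label mapping (toy labels; order follows sorted(set(...))):
print(get_label_vector(['体育', '财经', '体育']))  # {'体育': 0, '财经': 1}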


print('read data')
# note: the smaller cnews test/val files are read as train/test here,
# presumably to keep runtime manageable; swap in cnews.train.txt for the full set
data = pd.read_csv(test_dir, delimiter='\t', index_col=None, names=['label', 'content'])
test = pd.read_csv(valid_dir, delimiter='\t', index_col=None, names=['label', 'content'])

print(data.shape)
print(test.shape)

print('cut word')
jieba.load_userdict(userdict_dir)
stop_word = pd.read_csv(stopword_dir, quoting=csv.QUOTE_NONE, index_col=None, names=['word'])['word'].tolist()
data['content'] = data['content'].apply(lambda x: cut_word(x, stop_word))
test['content'] = test['content'].apply(lambda x: cut_word(x, stop_word))
content = pd.concat([data['content'], test['content']], axis=0, ignore_index=True)

print('word vocab')
word_vocab = get_word_vocab(content)
word_index = dict(zip(word_vocab, range(1, len(word_vocab) + 1)))
index_word = dict(zip(list(word_index.values()), list(word_index.keys())))
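
# sanity check: indices start at 1, so 0 is free to act as the padding id
# and the embedding matrix below gets len(word_index) + 1 rows
assert 0 not in index_word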

print('word2vec')
if not os.path.exists(word2vec_dir):
    # gensim >= 4 renamed size/iter to vector_size/epochs
    model = Word2Vec(content.tolist(), vector_size=word2vec_size, seed=2019, min_count=6, window=10, epochs=8, workers=8)
    model.save(word2vec_dir)
else:
    model = Word2Vec.load(word2vec_dir)

embedding_matrix = np.zeros((len(word_index) + 1, word2vec_size))
for word, i in word_index.items():
    # vectors live on model.wv; `word in model` was removed in gensim 4
    if word in model.wv:
        embedding_matrix[i] = model.wv[word]
print(embedding_matrix.shape)
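
# Optional sanity check: how much of the vocabulary has a pretrained vector
# (words below min_count=6 keep their all-zero row in embedding_matrix)
covered = sum(1 for w in word_index if w in model.wv)
print('word2vec coverage: %.2f%%' % (100.0 * covered / len(word_index)))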

print('label_vector')
label_vector = get_label_vector(data['label'])

class DataLoader(object):
    # note: a small hand-rolled batcher, not torch.utils.data.DataLoader
    def __init__(self, data, config):
        self.data = data
        self.batch_size = config['batch_size']
        self.maxlen = config['maxlen']
        self.word_index = config['word_index']
        self.label_vector = config['label_vector']

    def pad_sequences(self, content):
        # right-pad (or truncate) each token list to self.maxlen word indices
        sequences = np.zeros((len(content), self.maxlen), dtype=np.int64)
        for i in range(len(content)):
            for j in range(min(len(content[i]), self.maxlen)):
                sequences[i][j] = self.word_index[content[i][j]]
        return sequences

    def train_batch_data(self, is_shuffle=True):
        if is_shuffle:
            self.data = self.data.sample(frac=1).reset_index(drop=True)

        length = len(self.data) // self.batch_size
        if self.batch_size * length < len(self.data):
            length += 1

        for i in tqdm(range(length)):
            # iloc clamps the upper bound, so the last (partial) batch is safe
            batch = self.data.iloc[self.batch_size * i:self.batch_size * (i + 1)]
            sequences = self.pad_sequences(batch['content'].tolist())
            # map only this batch's labels to class indices
            label = batch['label'].map(self.label_vector).tolist()
            yield torch.LongTensor(sequences), torch.LongTensor(label)

    def test_batch_data(self):
        length = len(self.data) // self.batch_size
        if self.batch_size * length < len(self.data):
            length += 1

        for i in tqdm(range(length)):
            batch = self.data.iloc[self.batch_size * i:self.batch_size * (i + 1)]
            sequences = self.pad_sequences(batch['content'].tolist())
            yield torch.LongTensor(sequences)
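
# A minimal sketch of how this loader is consumed (illustrative; runnable
# once `config` is built further down):
# loader = DataLoader(data, config)
# sequences, label = next(loader.train_batch_data())
# print(sequences.shape, label.shape)  # torch.Size([256, 500]) torch.Size([256])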

class TextCNN(nn.Module):
    # a stack of Conv2d/MaxPool blocks over the (maxlen x word2vec_size)
    # embedding "image", rather than the classic multi-kernel-width TextCNN
    def __init__(self, config):
        super(TextCNN, self).__init__()
        self.class_num = config['class_num']
        self.embedding_matrix = config['embedding_matrix']

        self.embedding = nn.Embedding(self.embedding_matrix.shape[0], self.embedding_matrix.shape[1], _weight=self.embedding_matrix)
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=16, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2))
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(2))
        self.conv3 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(2))
        self.conv4 = nn.Sequential(
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(2))
        # four MaxPool2d(2) stages floor 500x100 down to 31x6, so the
        # flattened feature size is 128 * 31 * 6 = 23808
        self.out = nn.Linear(128 * (maxlen // 16) * (word2vec_size // 16), self.class_num)

    def forward(self, x):
        x = self.embedding(x)
        x = x.view(x.size(0), 1, maxlen, word2vec_size)
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = x.view(x.size(0), -1)  # flatten (batch, channels, h, w) to (batch, channels*h*w)
        output = self.out(x)
        return output
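
# Shape check (illustrative; runnable once `config` is built further down).
# Each MaxPool2d(2) floors the spatial size: 500x100 -> 250x50 -> 125x25
# -> 62x12 -> 31x6, which is where 128 * 31 * 6 comes from.
# net = TextCNN(config)
# dummy = torch.randint(1, len(word_index), (2, maxlen))
# print(net(dummy).shape)  # torch.Size([2, 10])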

class Model(object):
    def __init__(self, train_loader, valid_loader, test_loader, config):
        self.train_loader = train_loader
        self.valid_loader = valid_loader
        self.test_loader = test_loader
        self.model = TextCNN(config=config)
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = Adam(self.model.parameters(), lr=config['learning_rate'])
        self.best_acc = 0.0

    def verification(self):
        # evaluate on the validation split and checkpoint the best model
        self.model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for sequences, label in self.valid_loader.train_batch_data(is_shuffle=False):
                out = self.model(sequences)
                correct += (out.argmax(dim=1) == label).sum().item()
                total += label.size(0)
        acc = correct / total
        print('valid accuracy: %.4f' % acc)
        if acc > self.best_acc:
            self.best_acc = acc
            os.makedirs('./wnd/model', exist_ok=True)
            torch.save(self.model, './wnd/model/model.pkl')
        self.model.train()

    def fit(self, epoch):
        for i in range(epoch):
            for sequences, label in self.train_loader.train_batch_data():
                out = self.model(sequences)        # forward pass
                self.optimizer.zero_grad()         # zero the gradients
                loss = self.criterion(out, label)  # cross-entropy on class indices
                loss.backward()                    # backpropagate
                self.optimizer.step()              # update the parameters
            print('epoch %d, last batch loss: %.4f' % (i, loss.item()))
            self.verification()

    def restore(self):
        self.model = torch.load('./wnd/model/model.pkl')

    def predict(self):
        # write the predicted class index for each test row
        self.model.eval()
        res = []
        with torch.no_grad():
            for sequences in self.test_loader.test_batch_data():
                out = self.model(sequences)
                res.extend(out.argmax(dim=1).tolist())

        res = pd.DataFrame(res, columns=['pred'])
        res.to_csv('./nn_res.csv', header=None, index=None, sep=',')

config = dict()
config['batch_size'] = batch_size
config['maxlen'] = maxlen
config['word_index'] = word_index
config['label_vector'] = label_vector
config['class_num'] = class_num
config['learning_rate'] = 1e-3
config['embedding_matrix'] = torch.Tensor(embedding_matrix)

data = data.sample(frac=1).reset_index(drop=True)
train = data.head(8000)
valid = data.tail(2000)

print('data', data.shape)
print('train', train.shape)
print('valid', valid.shape)
print('test', test.shape)

train_loader = DataLoader(train, config)
valid_loader = DataLoader(valid, config)
test_loader = DataLoader(test, config)
model = Model(train_loader, valid_loader, test_loader, config)
model.fit(2)
# model = Model(train_loader, valid_loader, test_loader, config)
# model.restore()
model.predict()
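
# Since predict() writes class indices, a short sketch for mapping them back
# to label names with the label_vector built above:
index_label = {i: lab for lab, i in label_vector.items()}
pred = pd.read_csv('./nn_res.csv', header=None, names=['pred'])
print(pred['pred'].map(index_label).value_counts())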

 
