Training data
If you need the training data, you can download it directly here: https://download.csdn.net/download/zhangdongren/19143082
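The loader below expects two comma-separated columns, Sentence and Label, with a header row (skip_header=True). Judging by the test code, label 0 means a negative review, 2 a positive one, and anything else neutral. A hypothetical first few rows (the real contents are in the download above):

Sentence,Label
这个手机质量很好,2
不推荐买,0
一般般吧,1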
Training code
# Imports
import torch
import torch.nn as nn
# Data utilities (legacy torchtext API)
from torchtext.legacy import data
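# NOTE: torchtext.legacy only exists in torchtext 0.9.x-0.11.x; later releases
# removed the module, so an older torchtext must be installed to run this script.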
import torch.optim as optim
import numpy as np
# Chinese word-segmentation tool
import jieba
import re

BATCH_SIZE = 2
# Fix the random seed so runs are reproducible
SEED = 2019
torch.manual_seed(SEED)
# Make cuDNN deterministic as well
torch.backends.cudnn.deterministic = True
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def x_tokenize(x):
    # Strip everything that is not a Chinese character, then segment with jieba
    str1 = re.sub(r'[^\u4e00-\u9fa5]', '', x)
    return jieba.lcut(str1)
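# Added example (the exact segmentation depends on jieba's version/dictionary):
#   x_tokenize('不推荐买!123')  ->  ['不', '推荐', '买']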
# fix_length pads/truncates every sentence to a fixed length (32 is an arbitrary
# cap); batch_first gives [batch, sent_len] tensors to match the LSTM below
TEXT = data.Field(sequential=True, tokenize=x_tokenize, fix_length=32, use_vocab=True, batch_first=True)
LABEL = data.Field(sequential=False, use_vocab=False)
# Read the CSV files with torchtext.data.TabularDataset.splits
train, dev, test = data.TabularDataset.splits(path='dataset', train='csv_train.csv', validation='csv_dev.csv',
                                              test='csv_test.csv', format='csv', skip_header=True,
                                              csv_reader_params={'delimiter': ','},
                                              fields=[('Sentence', TEXT), ('Label', LABEL)])
# Build the vocabulary: each token is mapped to an integer id so sentences can be fed to the model.
TEXT.build_vocab(train)
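# Added sanity check (not in the original): inspect the vocabulary.
print(len(TEXT.vocab))           # vocabulary size
print(TEXT.vocab.itos[:10])      # first tokens; index 0 is '<unk>', index 1 is '<pad>'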
# Build BucketIterator iterators
train_iter, dev_iter, test_iter = data.BucketIterator.splits((train, dev, test), batch_size=BATCH_SIZE,
                                                             shuffle=True, sort=False,
                                                             sort_within_batch=False, repeat=False)
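# Added sanity check (not in the original): peek at one batch.
sample_batch = next(iter(train_iter))
print(sample_batch.Sentence.shape)  # torch.Size([2, 32]) -> [batch_size, fix_length]
print(sample_batch.Label)           # integer class ids, shape [batch_size]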
#****************************************************
class TextRNN(nn.Module):
    # Define all layers
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # LSTM layer
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                            bidirectional=bidirectional, dropout=dropout, batch_first=True)
        # Fully connected layer; a bidirectional LSTM concatenates the forward
        # and backward hidden states, hence hidden_dim * 2
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, text):
        # text = [batch size, sent_len]
        embedded = self.embedding(text)
        # embedded = [batch size, sent_len, emb dim]
        out, (hidden, cell) = self.lstm(embedded)
        # Use the output at the last time step of the sentence
        out = self.fc(out[:, -1, :])
        return out
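# Added shape check (a sketch with made-up sizes, not part of the original):
# _m = TextRNN(vocab_size=100, embedding_dim=8, hidden_dim=4,
#              output_dim=3, n_layers=2, bidirectional=True, dropout=0.2)
# _x = torch.randint(0, 100, (2, 5))   # [batch, sent_len]
# print(_m(_x).shape)                  # torch.Size([2, 3]) -> [batch, output_dim]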
#****************************************************
# Hyperparameters
size_of_vocab = len(TEXT.vocab)
embedding_dim = 300
num_hidden = 128
num_layers = 2
num_output = 3
dropout = 0.2
# Instantiate the model
model = TextRNN(size_of_vocab, embedding_dim, num_hidden, num_output, num_layers, bidirectional=True, dropout=dropout)
# To resume training from a saved checkpoint, uncomment:
#model.load_state_dict(torch.load('dataset/myjd.pt'))
#****************************************************
# Optimizer and loss
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
# Move to the GPU if CUDA is available
model = model.to(device)
criterion = criterion.to(device)
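# CrossEntropyLoss applies log-softmax internally, so the model outputs raw
# logits of shape [batch size, num classes] and labels are plain class ids.
# Added illustration (not in the original):
# _logits = torch.randn(2, 3).to(device)     # [batch, num_classes]
# _target = torch.tensor([0, 2]).to(device)  # integer class ids
# print(criterion(_logits, _target))         # a scalar loss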
#****************************************************
# Train the model
for epoch in range(100):
    batch_loss = []
    for batch in train_iter:
        # Zero the gradients before every batch
        optimizer.zero_grad()
        text = batch.Sentence.to(device)
        label = batch.Label.to(device)
        # Raw logits, shape [batch size, num classes]
        predictions = model(text)
        # Compute the loss
        loss = criterion(predictions, label)
        batch_loss.append(loss.item())
        # Backpropagate and compute the gradients
        loss.backward()
        # Update the weights
        optimizer.step()
    avg_loss = np.array(batch_loss).mean()
    print("==epoch===>%s===avg_loss===>%s" % (epoch, avg_loss))
torch.save(model.state_dict(), 'dataset/myjd.pt')
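# Added sketch (not in the original): a minimal accuracy check on the dev set
# built above, reusing the same field names.
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for batch in dev_iter:
        logits = model(batch.Sentence.to(device))
        preds = torch.argmax(logits, dim=1)
        correct += (preds == batch.Label.to(device)).sum().item()
        total += batch.Label.size(0)
print('dev accuracy: %.3f' % (correct / total))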
Test code
rnn = TextRNN(size_of_vocab, embedding_dim, num_hidden, num_output, num_layers, bidirectional=True, dropout=dropout)
rnn.load_state_dict(torch.load('dataset/myjd.pt', map_location=device))
rnn = rnn.to(device)
rnn.eval()

def predict():
    sent1 = '不推荐买'  # "Would not recommend buying" -- should come out negative
    demo = [data.Example.fromlist(data=[sent1, 0], fields=[('Sentence', TEXT), ('Label', LABEL)])]
    demo_iter = data.BucketIterator(dataset=data.Dataset(demo, fields=[('Sentence', TEXT), ('Label', LABEL)]),
                                    batch_size=BATCH_SIZE, shuffle=True,
                                    sort_key=lambda x: len(x.Sentence), sort_within_batch=False, repeat=False)
    with torch.no_grad():
        for batch in demo_iter:
            # batch_first=True, so the tensor is already [batch, sent_len]
            text = batch.Sentence.to(device)
            out = rnn(text)
            pred = torch.argmax(out, dim=1).item()
            if pred == 0:
                print('negative review')
            elif pred == 2:
                print('positive review')
            else:
                print('neutral review')

predict()