# JD (jd.com) review dataset: two columns — 'text' (review body) and
# 'target' (rating score). DM for the dataset.
# --- 1. Data preprocessing ---
# NOTE(fix): in the original paste these imports were fused into the
# comment lines and therefore never executed; restored as real imports.
import numpy as np
import pandas as pd
from collections import Counter  # word/char counting
import os
import requests
# Ratings are 1-5 (five classes); shift by -1 so labels lie in [0, 4],
# matching the 0-based class indices expected by CrossEntropyLoss.
def get_label(label):
    """Map a 1-5 star rating to a 0-4 class label.

    Args:
        label: integer rating in [1, 5].
    Returns:
        The rating minus one, in [0, 4].
    """
    return label - 1
# Map scores 1-5 onto class labels 0-4.
data["target"] = data['target'].apply(get_label)
# Deduplicate on the review text, keeping the first occurrence of each.
data = data.drop_duplicates('text', keep='first')
# Rebuild a clean 0-based index; drop=True discards the old index column.
data = data.reset_index(drop=True)
import re
def clear_character(sentence):
    """Strip a raw review down to Chinese text only.

    Removes ASCII letters/digits, then emoji and any character that is not
    whitespace / a digit / a (full-width) colon / a CJK ideograph, then
    residual punctuation, and finally all whitespace.

    Args:
        sentence: raw review string.
    Returns:
        The cleaned string with no whitespace left.
    """
    pattern1 = '[a-zA-Z0-9]'
    # Keep only whitespace, digits, ASCII/full-width colons and CJK chars.
    pattern2 = re.compile(u'[^\s1234567890::' + '\u4e00-\u9fa5]+')
    pattern3 = '[’!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+'
    line1 = re.sub(pattern1, '', sentence)   # drop English letters and digits
    line2 = re.sub(pattern2, '', line1)      # drop emoji and other symbols
    line3 = re.sub(pattern3, '', line2)      # drop leftover colons/punctuation
    new_sentence = ''.join(line3.split())    # drop all remaining whitespace
    return new_sentence
# Apply the clear_character cleaner to every review in the 'text' column.
data["text"] = data['text'].apply(clear_character)
data.head()
# jieba performs Chinese word segmentation on the cleaned text.
import jieba
from tqdm import tqdm
def comment_cut(content):
    """Tokenize one comment with jieba after stripping surrounding whitespace.

    Args:
        content: cleaned review string.
    Returns:
        List of segmented tokens.
    """
    seg = list(jieba.cut(content.strip()))
    return seg
# Register a tqdm progress bar so .progress_apply reports progress.
tqdm.pandas(desc='apply')
data['text'] = data['text'].progress_apply(comment_cut)
data.head()

# Load the downloaded stop-word list (one word per line).
# NOTE(fix): stored as a set — membership tests inside rm_stop_word are
# O(1) instead of O(n) per word; the filtered result is identical.
with open("D:\\shujuji\\1\\stopwords.json", "r", encoding='utf-8') as f:
    stopWords = set(f.read().split("\n"))

def rm_stop_word(wordList):
    """Drop stop words from a tokenized comment (list of tokens)."""
    filtered_words = [word for word in wordList if word not in stopWords]
    return filtered_words
# .progress_apply behaves exactly like .apply; the tqdm wrapper only adds
# a progress bar.
data['text'] = data['text'].progress_apply(rm_stop_word)
data.head()
# Next: remove low-frequency words (count < 10) from every comment.
from collections import Counter
# Count word frequencies over the whole corpus.
# NOTE(fix): the original did `for j in row: list_set.extend(j)`, which
# extends with each *word* and therefore splits every word into single
# characters — the low-frequency filter then compared whole words against
# a character counter. Extend with the token list itself instead.
list_set = []
for i in range(len(data)):
    list_set.extend(data.iloc[i]['text'])
words_count = Counter(list_set)
min_threshold = 10
# Words appearing fewer than min_threshold times.
my_dict = {k: v for k, v in words_count.items() if v < min_threshold}
filteredA = Counter(my_dict)

def rm_low_frequence_word(wordList):
    """Drop low-frequency words and '\t' tokens; return the survivors as a
    space-separated string (each kept word is followed by a space, matching
    the original output format)."""
    outstr = ''
    for word in wordList:
        if word not in filteredA and word != '\t':
            outstr += word + " "
    return outstr
# .progress_apply behaves exactly like .apply; tqdm only adds the bar.
data['text'] = data['text'].progress_apply(rm_low_frequence_word)
data.head()
# --- 2. Model building ---
import collections
import os
import random
import time
from tqdm import tqdm
import numpy as np
import torch
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data
import torch.nn.functional as F
import matplotlib.pyplot as plt
import seaborn as sns
# os.environ["CUDA_VISIBLE_DEVICES"] = "6"  # alternative GPU selection
# Run on GPU 6 when CUDA is available, otherwise fall back to CPU.
device = torch.device("cuda:6" if torch.cuda.is_available() else "cpu")
# Turn each processed comment (a space-joined string) back into a token list.
word_list = [str(s).split() for s in data["text"]]
print(word_list)

# Train word2vec: window 3, 256-dim vectors, keep every word (min_count=1).
from gensim.models.word2vec import Word2Vec
import time
start = time.time()
# NOTE(review): `iter`/`size` are gensim<4 parameter names; gensim>=4
# renamed them to `epochs`/`vector_size` — confirm the installed version.
model_w2v = Word2Vec(word_list, window=3, iter=5, size=256, min_count=1)
print('完成')
end = time.time()
print('花费时间:', end - start)
print(model_w2v)

# Split into train/valid/test: 64% / 20% / 16% of the data.
from sklearn.model_selection import train_test_split
Temp_trin, valid_data = train_test_split(data, test_size=0.2, random_state=42)
train_data, test_data = train_test_split(Temp_trin, test_size=0.2, random_state=42)
train_data.to_csv("D:/shujuji/1/2/train_data.csv", index=False, header=True, encoding="utf-8")
valid_data.to_csv("D:/shujuji/1/2/valid_data.csv", index=False, header=True, encoding="utf-8")
test_data.to_csv("D:/shujuji/1/2/test_data.csv", index=False, header=True, encoding="utf-8")

# --- torchtext data pipeline ---
import torch
import torchtext
from torchtext.legacy import data
from torchtext.legacy.data import Field
from torchtext.legacy.data import TabularDataset
torch.backends.cudnn.deterministic = True

# Whitespace tokenizer: comments were already segmented and space-joined.
tokenize = lambda x: x.split()
TEXT = data.Field(sequential=True, tokenize=tokenize)
# Labels are already integers in [0, 4]; no vocab needed.
LABEL = data.Field(sequential=False, dtype=torch.long, use_vocab=False)
fields = [('text', TEXT), ('label', LABEL)]

class DataFrameDataset(data.Dataset):
    """Wrap a pandas DataFrame (columns 'text'/'target') as a torchtext Dataset."""

    def __init__(self, df, fields, is_test=False, **kwargs):
        examples = []
        for i, row in df.iterrows():
            # Test rows carry no label.
            label = row.target if not is_test else None
            text = row.text
            examples.append(data.Example.fromlist([text, label], fields))
        super().__init__(examples, fields, **kwargs)

    @staticmethod
    def sort_key(ex):
        # Bucket examples by comment length.
        return len(ex.text)

    @classmethod
    def splits(cls, fields, train_df, val_df=None, test_df=None, **kwargs):
        """Build (train, val, test) datasets from the given DataFrames,
        omitting any split whose DataFrame is None."""
        train_data, val_data, test_data = (None, None, None)
        data_field = fields
        if train_df is not None:
            train_data = cls(train_df.copy(), data_field, **kwargs)
        if val_df is not None:
            val_data = cls(val_df.copy(), data_field, **kwargs)
        if test_df is not None:
            test_data = cls(test_df.copy(), data_field, True, **kwargs)
        return tuple(d for d in (train_data, val_data, test_data) if d is not None)

print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')
train_df, val_df, test_df = DataFrameDataset.splits(
    fields, train_df=train_data, val_df=valid_data, test_df=test_data)

# Build the vocabulary from the training split only.
TEXT.build_vocab(train_df)
print(vars(train_df.examples[0]))
print(vars(test_df.examples[0]))
# stoi is ordered by descending frequency; indices 0/1 default to <unk>/<pad>.
print(TEXT.vocab.stoi)
# Most common words in the training vocabulary.
print(TEXT.vocab.freqs.most_common(40))
print(TEXT.vocab.itos[:10])

# Build a (vocab_size, 256) embedding matrix aligned with the torchtext
# vocab; rows for words missing from word2vec stay zero.
import numpy as np
# NOTE(review): index2word/syn0 are gensim<4 attribute names (gensim>=4
# uses index_to_key/vectors) — confirm the installed version.
embedding_dic = dict(zip(model_w2v.wv.index2word, model_w2v.wv.syn0))
embedding_matrix = np.zeros((len(TEXT.vocab), 256))
for w, i in TEXT.vocab.stoi.items():
    embedding_vec = embedding_dic.get(w)
    if embedding_vec is not None:
        embedding_matrix[i] = embedding_vec
print(embedding_matrix.shape)

# Batching: BucketIterator packs similar-length examples together so that
# padding per batch is minimal.
from torchtext.legacy.data import Iterator, BucketIterator
train_batch_size = 64
val_batch_size = 64
test_batch_size = 64

# Train/validation iterators: bucket by comment length to minimise padding.
train_iterator, valid_iterator = BucketIterator.splits(
    (train_df, val_df),
    batch_sizes=(train_batch_size, val_batch_size),
    device=device,
    sort_key=lambda x: len(x.text),
    sort_within_batch=False,
    repeat=False)

# Plain (unsorted) iterator for the test set.
test_iterator = Iterator(
    test_df,
    batch_size=test_batch_size,
    device=device,
    sort=False,
    sort_within_batch=False,
    repeat=False)

# --- BiLSTM + attention model ---
import torch.nn as nn
import torch.nn.functional as F
# NOTE(fix): math.sqrt is used in attention_net but `math` was only
# imported just before the training loop; import it here so the class is
# self-contained.
import math

class LSTMmodel(nn.Module):
    """2-layer bidirectional LSTM with dot-product self-attention for
    5-class sentiment classification."""

    def __init__(self, embedding_size, hidden_size, output_size):
        super(LSTMmodel, self).__init__()
        # NOTE(review): vocab size and the 256-dim embedding come from the
        # module-level TEXT field / word2vec setup — confirm they match
        # embedding_size at the call site.
        self.embedding = nn.Embedding(len(TEXT.vocab), 256)
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers=2,
                            bidirectional=True, dropout=0.5)
        # Bidirectional output is 2*hidden_size wide.
        self.fc = nn.Linear(hidden_size * 2, output_size)
        self.dropout = nn.Dropout(0.5)

    def attention_net(self, x, query, mask=None):
        """Scaled dot-product attention.

        Args:
            x: LSTM outputs, [batch, seq_len, hidden*2].
            query: attended query, same shape as x.
            mask: unused (kept for interface compatibility).
        Returns:
            (context [batch, hidden*2], attention weights [batch, seq, seq]).
        """
        d_k = query.size(-1)  # query dimension, used for scaling
        # scores: [batch, seq_len, seq_len]
        scores = torch.matmul(query, x.transpose(1, 2)) / math.sqrt(d_k)
        # Normalise scores over the last dimension.
        alpha_n = F.softmax(scores, dim=-1)
        # [b,s,s] @ [b,s,2h] -> [b,s,2h]; sum over seq -> [b,2h]
        context = torch.matmul(alpha_n, x).sum(1)
        return context, alpha_n

    def forward(self, text):
        embedded = self.embedding(text)
        # hidden: (num_layers*num_directions, batch, hidden_size)
        output, (hidden, c) = self.lstm(embedded)
        # Last layer's forward/backward final states (computed but unused;
        # kept from the original — the commented path below would use it).
        h = torch.cat((hidden[-1, :, :], hidden[-2, :, :]), dim=1)
        output = output.permute(1, 0, 2)  # [batch, seq_len, hidden*2]
        query = self.dropout(output)
        # Attention over the LSTM outputs.
        attn_output, alpha_n = self.attention_net(output, query)
        output = self.fc(attn_output)
        # output = self.fc(h)  # alternative: classify from final states
        return output
def train(model, iterator, optimizer, criterion):
    """Run one training epoch.

    Args:
        model: the network; called as model(batch.text).
        iterator: yields batches with .text and .label attributes.
        optimizer: torch optimizer over model's parameters.
        criterion: loss taking (predictions, labels).
    Returns:
        (summed batch losses / sample count, accuracy over all samples).
    """
    epoch_loss = 0
    epoch_acc = 0
    total_len = 0
    count = 0
    # Training mode: enables dropout / batch-norm updates; must be
    # distinguished from eval mode used at test time.
    model.train()
    for batch in iterator:
        optimizer.zero_grad()  # clear gradients so they don't accumulate
        predictions = model(batch.text)
        loss = criterion(predictions, batch.label)
        epoch_loss += loss.item()
        loss.backward()   # backpropagate
        optimizer.step()  # gradient descent step
        # Correct predictions in this batch, accumulated over the epoch.
        epoch_acc += ((predictions.argmax(axis=1)) == batch.label).sum().item()
        total_len += len(batch.label)  # total samples seen this epoch
        count += 1
    print(f'训练了{count}个batch')
    return epoch_loss / total_len, epoch_acc / total_len
def evaluate(model, iterator, criterion):
    """Evaluate the model without gradient updates.

    Args:
        model: the network; called as model(batch.text).
        iterator: yields batches with .text and .label attributes.
        criterion: loss taking (predictions, labels).
    Returns:
        (summed batch losses / sample count, accuracy over all samples).
    """
    epoch_loss = 0
    epoch_acc = 0
    total_len = 0
    count = 0
    model.eval()  # eval mode: freezes dropout and similar layers
    with torch.no_grad():  # no backprop / gradient descent during eval
        for batch in iterator:
            predictions = model(batch.text)
            loss = criterion(predictions, batch.label)
            epoch_loss += loss.item()
            epoch_acc += ((predictions.argmax(axis=1)) == batch.label).sum().item()
            total_len += len(batch.label)
            count += 1
    model.train()  # restore training mode for the caller
    print(f'验证了{count}个batch')
    return epoch_loss / total_len, epoch_acc / total_len
# --- Hyperparameters ---
EMBEDDING_SIZE = 256
HIDDEN_SIZE = 128
OUTPUT_SIZE = 5  # five rating classes

# Instantiate the model on the chosen device.
model = LSTMmodel(embedding_size=EMBEDDING_SIZE,
                  hidden_size=HIDDEN_SIZE,
                  output_size=OUTPUT_SIZE).to(device)

# Initialise the embedding layer from the pretrained word2vec matrix.
# (The original sliced the returned tensor with [2:10] — a no-op on an
# unused expression; removed.)
model.embedding.weight.data.copy_(torch.from_numpy(embedding_matrix))

def count_parameters(model):
    """Count trainable parameters of the model."""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

import torch.optim as optim
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()  # cross-entropy for multi-class labels
model = model.to(device)       # move to GPU if available
criterion = criterion.to(device)
import time
def epoch_time(start_time, end_time):
    """Split an elapsed interval into whole (minutes, seconds).

    Args:
        start_time: epoch start timestamp (seconds).
        end_time: epoch end timestamp (seconds).
    Returns:
        (elapsed_mins, elapsed_secs) as ints.
    """
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs
# --- Training loop ---
import math
N_EPOCHS = 10
best_valid_loss = float('inf')  # anything beats infinity on epoch 1

for epoch in tqdm(range(N_EPOCHS), desc='Processing'):
    start_time = time.time()
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    # Checkpoint whenever validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'Best-Checkpoint.pt')
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')