# Dataset: a corpus of hotel reviews.
# Goal: use a pretrained model to produce a feature representation of the
# input text, followed by a custom network that outputs the result.
# Code:
import torch
import torch.nn as nn
from tensorflow.keras.preprocessing import sequence
import pandas as pd
from sklearn.utils import shuffle
from functools import reduce
# Load a pretrained Chinese BERT (model weights + matching tokenizer) via
# torch.hub; it is used below to turn raw text into feature tensors.
# NOTE: this performs a network download on first run.
source='huggingface/pytorch-transformers'
model_name='bert-base-chinese'
model=torch.hub.load(source,'model',model_name)
tokenizer=torch.hub.load(source,'tokenizer',model_name)
cutlen=32  # fixed sequence length: inputs are truncated/padded to 32 tokens
def get_bert_encode(text):
    """Encode `text` with the pretrained BERT model.

    The text is clipped to `cutlen` characters, tokenized, and padded (or
    truncated) to exactly `cutlen` token ids, then run through BERT with
    gradients disabled.

    :param text: input string (a review).
    :return: tensor of shape (cutlen, hidden_size) — the last hidden states
             for the single input sequence.
    """
    # tokenizer.encode adds [CLS]/[SEP]; strip them to keep content tokens only
    token_ids = tokenizer.encode(text[:cutlen])[1:-1]
    # Pad/truncate to exactly cutlen ids. This replicates Keras
    # pad_sequences' defaults (pre-padding with 0, pre-truncation) without
    # pulling in TensorFlow just for padding.
    if len(token_ids) >= cutlen:
        token_ids = token_ids[-cutlen:]
    else:
        token_ids = [0] * (cutlen - len(token_ids)) + token_ids
    token_tensor = torch.LongTensor([token_ids])
    with torch.no_grad():
        # NOTE(review): tuple-unpacking assumes the old transformers API that
        # returns (last_hidden_state, pooler_output); newer versions return a
        # ModelOutput object — confirm against the installed version.
        encoder_layers, _ = model(token_tensor)
    # Drop the batch dimension: (1, cutlen, hidden) -> (cutlen, hidden)
    return encoder_layers[0]
# 定义模型
class Net_model(nn.Module):
    """Classification head on top of BERT features.

    Flattens a (batch, char_size, embedding_size) feature tensor and maps it
    through a single linear layer to 2 class logits.
    """

    def __init__(self, char_size=32, embedding_size=768):
        """
        :param char_size: number of token positions per input sequence.
        :param embedding_size: BERT hidden size per token.
        """
        super().__init__()
        self.char_size = char_size
        self.embedding_size = embedding_size
        in_features = char_size * embedding_size
        self.fc1 = nn.Linear(in_features, 2)

    def forward(self, x):
        """Flatten the per-token features and return raw class logits."""
        flat = x.view(-1, self.char_size * self.embedding_size)
        logits = self.fc1(flat)
        return logits
# 构造训练数据验证数据批次生成器
def data_loader(train_data_path,valid_data_path,batch_size):
train_data=pd.read_csv(train_data_path,header=None,sep='\t').drop([0])
valid_data=pd.read_csv(valid_data_path,header=None,sep='\t').drop([0])
if len(valid_data)<batch_size:
raise(" batch size is not match")
def __loader_generator(data):
for batch in range(0,len(data),batch_size):
batch_encoder=[]
batch_labels=[]
for item in shuffle(data.values.tolist())[batch:batch+batch_size]:
encoders=get_bert_encode(item[0])
batch_encoder.append(encoders)
batch_labels.append([int(item[1])])
encoder=reduce(lambda x,y:torch.cat((x,y),dim=0),batch_encoder)
label=torch.tensor(reduce(lambda x,y:x+y,batch_labels))
yield (encoder,label)
return __loader_generator(train_data),__loader_generator(valid_data),len(train_data),len(valid_data)
# 定义训练函数
# Training loop for one epoch.
def train(train_data):
    """Run one pass over `train_data`, updating the global `net`.

    Relies on module-level `net`, `optimizer`, and `criterion`.

    :param train_data: iterable of (features, labels) batches.
    :return: (total_loss, total_correct) accumulated over all batches.
    """
    total_loss = 0.0
    total_acc = 0
    # Fixed: the loop variable previously shadowed the `train_data` parameter.
    for inputs, label in train_data:
        optimizer.zero_grad()
        result_output = net(inputs)
        loss = criterion(result_output, label)
        loss.backward()
        optimizer.step()
        # .item() detaches the scalar — accumulating the raw tensor kept every
        # batch's autograd graph alive and grew memory over the epoch.
        total_loss += loss.item()
        total_acc += (result_output.argmax(1) == label).sum().item()
    return total_loss, total_acc
# 定义验证函数
# Validation loop for one epoch.
def valid(valid_data):
    """Evaluate the global `net` over `valid_data` without updating weights.

    Relies on module-level `net` and `criterion`.

    :param valid_data: iterable of (features, labels) batches.
    :return: (total_valid_loss, total_valid_correct) over all batches.
    """
    total_valid_loss = 0.0
    total_valid_acc = 0
    for valid_datas, label in valid_data:
        with torch.no_grad():
            # Fixed: the original passed `valid_data` (the iterator) to the
            # network instead of the current batch `valid_datas`.
            result_output = net(valid_datas)
            loss = criterion(result_output, label)
            # .item(): accumulate plain floats, not tensors
            total_valid_loss += loss.item()
            total_valid_acc += (result_output.argmax(1) == label).sum().item()
    return total_valid_loss, total_valid_acc
if __name__ == '__main__':
    # Train the classifier head, then save its weights.
    train_data_path = '/root/data/cn_data/SST-2/train.tsv'
    valid_data_path = '/root/data/cn_data/SST-2/dev.tsv'
    net = Net_model()
    epochs = 6
    batch_size = 32
    # Fixed: nn.CrossEntropyloss is an AttributeError — the class is
    # nn.CrossEntropyLoss.
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(net.parameters(), lr=0.05, momentum=0.9)
    # Fixed: the loop referenced an undefined name `epoches`.
    for epoch in range(epochs):
        # Generators are exhausted after one pass, so rebuild them each epoch.
        train_data, valid_data, train_len, valid_len = data_loader(
            train_data_path, valid_data_path, batch_size)
        train_loss, train_acc = train(train_data)
        valid_loss, valid_acc = valid(valid_data)
        # Per-sample averages: total loss is summed per batch, hence the
        # batch_size scaling.
        train_average_loss = train_loss * batch_size / train_len
        valid_average_loss = valid_loss * batch_size / valid_len
        train_average_acc = train_acc / train_len
        valid_average_acc = valid_acc / valid_len
        print("Train Loss:", train_average_loss, "|", "Train Acc:", train_average_acc)
        print("Valid Loss:", valid_average_loss, "|", "Valid Acc:", valid_average_acc)
    # Save the trained model weights.
    torch.save(net.state_dict(), './net_model_model.pkl')
#模型预测
if __name__ == '__main__':
    # Model prediction on a single example.
    text = '酒店环境不错,设备也不错'
    # Load the trained model weights. (Fixed: a bare prose line here was a
    # SyntaxError, and the closing parenthesis was missing.)
    net.load_state_dict(torch.load('./net_model_model.pkl'))
    print("输入文本为:", text)
    with torch.no_grad():
        predict = net(get_bert_encode(text))
    # Fixed: missing closing parenthesis on the print call.
    print("预测的值为", torch.argmax(predict).item())
# Sample run (training/validation loss and accuracy, then prediction):
#   训练集验证集损失及正确率
#   预测结果
#   输入文本为: 酒店环境不错,设备也不错.
#   预测的值为: 1