Data source: Weibo comments (the weibo_senti_100k dataset), downloaded from the web.
Format:
Each row is a label followed by the comment text (label 1 = positive, 0 = negative).
Data preprocessing:
None was done here. Ideally, retweeted/quoted content should be stripped so that only the author's own text remains; a sketch of that step follows.
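For illustration, a minimal sketch of that missing step. The helper name strip_retweets is my own, not part of the original pipeline, and it assumes the common Weibo convention of prefixing quoted content with "//@username:":

import re

def strip_retweets(text: str) -> str:
    # Weibo marks quoted/retweeted content with "//@username:".
    # Keep only the text before the first such marker.
    return re.split(r"//@[^:：]+[:：]", text)[0].strip()

# strip_retweets("今天心情超好!//@某人: 转发微博") -> "今天心情超好!"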
- Build balanced parallel corpora (1:1 positive/negative) of several sizes:
import numpy as np
import pandas as pd
from tqdm import tqdm

def get_data():
    pd_all = pd.read_csv("weibo_senti_100k.csv")
    print('Number of comments (total): %d' % pd_all.shape[0])
    print('Number of comments (positive): %d' % pd_all[pd_all.label == 1].shape[0])
    print('Number of comments (negative): %d' % pd_all[pd_all.label == 0].shape[0])
    positive_data = np.array(pd_all[pd_all.label == 1]["review"])
    negative_data = np.array(pd_all[pd_all.label == 0]["review"])
    # Build corpora of 10000, 20000, 50000 and 80000 examples respectively
    data_cnt_list = [10000, 20000, 50000, 80000]
    for i in range(len(data_cnt_list)):
        example_data_list = []; example_label_list = []
        for j in tqdm(range(0, data_cnt_list[i], 2), ncols = 80,
                      desc = "weibo_snti_%d_parrel" % data_cnt_list[i]):
            # Sample positive and negative examples at a 1:1 ratio
            example_data_list.append(np.random.choice(positive_data))
            example_data_list.append(np.random.choice(negative_data))
            example_label_list.append(1); example_label_list.append(0)
        # Shuffle the examples
        d_index = np.arange(data_cnt_list[i])
        np.random.shuffle(d_index)
        example_data_list = np.array(example_data_list)[d_index]
        example_label_list = np.array(example_label_list)[d_index]
        data_ = pd.DataFrame({"data": example_data_list,
                              "label": example_label_list})
        data_.to_csv("weibo_snti_%d_parrel.csv" % data_cnt_list[i])
- Split the corpus using train_test_split from sklearn.model_selection:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

def read_data(cnt, args):
    df = pd.read_csv("weibo_snti_%d_parrel.csv" % cnt)
    data = df["data"]; label = df["label"]
    # train_test_split defaults to a 75% / 25% split
    train_inputs, test_inputs, train_label, test_label = \
        train_test_split(data, label)
    train_inputs = train_inputs.tolist(); train_label = train_label.tolist()
    test_inputs = test_inputs.tolist(); test_label = test_label.tolist()
- Build the datasets and data loaders with the BertDataSet from the previous article (continuing read_data):
    train_dataset = BertDataSet(train_inputs, train_label, "../bert-base-chinese",
                                seq_len = args.seq_len, encoding = args.encoding)
    test_dataset = BertDataSet(test_inputs, test_label, "../bert-base-chinese",
                               seq_len = args.seq_len, encoding = args.encoding)
    train_data_loader = DataLoader(train_dataset, batch_size = args.batch_size,
                                   num_workers = 4)
    test_data_loader = DataLoader(test_dataset, batch_size = args.batch_size,
                                  num_workers = 4)
    return train_data_loader, test_data_loader
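For reference, read_data would be called like this (a usage sketch; args is whatever namespace carries seq_len, encoding and batch_size, e.g. the argparse sketch after the training code below):

train_data_loader, test_data_loader = read_data(10000, args)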
BertDataSet also needs some changes:
from torch.utils.data import Dataset
import torch
import transformers as tfs

class BertDataSet(Dataset):
    """Character-level dataset for BERT sentiment classification."""
    def __init__(self, dataset, labelset, tokenizer_name, seq_len,
                 encoding = "utf-8"):
        super(BertDataSet, self).__init__()
        self.seq_len = seq_len
        self.encoding = encoding
        self.dataset = dataset
        self.labelset = labelset
        self.tokenizer = tfs.BertTokenizer.from_pretrained(tokenizer_name)
        assert len(self.dataset) == len(self.labelset)
        self.data_len = len(self.labelset)
        self.cls_id = self.tokenizer.convert_tokens_to_ids(self.tokenizer.cls_token)
        self.pad_id = self.tokenizer.convert_tokens_to_ids(self.tokenizer.pad_token)
        # bert-base-chinese has no EOS token, so "[EOS]" maps to [UNK];
        # it merely marks the end of the text here
        self.tokenizer.eos_token = "[EOS]"
        self.eos_id = self.tokenizer.convert_tokens_to_ids(self.tokenizer.eos_token)

    def __len__(self):
        return self.data_len

    def __getitem__(self, item):
        assert item < self.data_len
        label = self.labelset[item]
        data = self.dataset[item]
        input_ids, mask_ids = self.get_input_id(data)
        # [CLS] + text + [EOS], truncated to seq_len
        bert_input = ([self.cls_id] + input_ids + [self.eos_id])[:self.seq_len]
        # The attention mask is 1 for every real token, including [EOS]
        bert_mask = ([1] + mask_ids + [1])[:self.seq_len]
        # Pad the input with [PAD] ids and the mask with 0
        bert_input.extend([self.pad_id] * (self.seq_len - len(bert_input)))
        bert_mask.extend([0] * (self.seq_len - len(bert_mask)))
        output = {
            "bert_input": bert_input,
            "bert_mask": bert_mask,
            "label": label
        }
        return {key: torch.tensor(value) for key, value in output.items()}

    def get_input_id(self, data):
        # Tokenize character by character, which is sufficient for Chinese
        # with bert-base-chinese's character-level vocabulary
        id_list = []; mask_list = []
        for word in data:
            id_list.append(self.tokenizer.convert_tokens_to_ids(word))
            mask_list.append(1)
        return id_list, mask_list
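A quick sanity check of what __getitem__ produces (a sketch; the sample sentence is arbitrary and the tokenizer files are assumed to live at ../bert-base-chinese):

ds = BertDataSet(["今天天气不错"], [1], "../bert-base-chinese", seq_len = 32)
sample = ds[0]
print(sample["bert_input"].shape, sample["bert_mask"].shape, sample["label"])
# torch.Size([32]) torch.Size([32]) tensor(1)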
- Define the model:
import torch
import torch.nn as nn
import transformers as tfs

class BertSentiClassificationModel(nn.Module):
    def __init__(self):
        super(BertSentiClassificationModel, self).__init__()
        self.bert = tfs.BertModel.from_pretrained("../bert-base-chinese")
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(768, 2)  # binary classification
        self.softmax = nn.LogSoftmax(dim = -1)

    def forward(self, batch):
        # The DataLoader already yields tensors, so no conversion is needed
        input_ids = batch['bert_input']
        attention_mask = batch['bert_mask']
        bert_output = self.bert(input_ids, attention_mask = attention_mask)
        bert_cls = bert_output[0][:, 0, :]  # hidden state of the [CLS] token
        output = self.softmax(self.linear(self.dropout(bert_cls)))
        return output
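A quick shape check with a dummy batch (a sketch; batch size 4 and sequence length 32 are arbitrary):

model = BertSentiClassificationModel()
dummy = {
    "bert_input": torch.randint(0, 21128, (4, 32)),  # 21128 = bert-base-chinese vocab size
    "bert_mask": torch.ones(4, 32, dtype = torch.long),
}
print(model(dummy).shape)  # torch.Size([4, 2]): log-probabilities per class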
- Define the trainer:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from model.model import BertSentiClassificationModel
from torch.optim import Adam
from .optim_schedule import ScheduledOptim
import tqdm

class BertSentiTrainer(object):
    """Trainer for the BERT sentiment classifier."""
    def __init__(self, bert: BertSentiClassificationModel, vocab_size: int,
                 train_dataloader: DataLoader, test_dataloader: DataLoader,
                 with_cuda: bool, log_freq: int = 50, lr: float = 1e-4,
                 betas=(0.9, 0.999), weight_decay: float = 0.01, warmup_steps=10000):
        super(BertSentiTrainer, self).__init__()
        cuda_condition = torch.cuda.is_available() and with_cuda
        self.device = torch.device("cuda" if cuda_condition else "cpu")
        self.bert = bert
        self.bert.to(self.device)
        self.train_data = train_dataloader
        self.test_data = test_dataloader
        # The model already applies LogSoftmax, so use NLLLoss here;
        # CrossEntropyLoss would apply log-softmax a second time
        self.criterion = nn.NLLLoss()
        self.optim = Adam(self.bert.parameters(), lr=lr, betas=betas,
                          weight_decay=weight_decay)
        # 512 is the d_model fed to the schedule (BERT-base's hidden size
        # is actually 768; this only scales the base learning rate)
        self.optim_schedule = ScheduledOptim(self.optim, 512,
                                             n_warmup_steps=warmup_steps)
        self.log_freq = log_freq

    def train(self, epoch):
        self.iteration(epoch, self.train_data)

    def test(self, epoch):
        self.iteration(epoch, self.test_data, train = False)

    def iteration(self, epoch, data_loader, train = True):
        str_code = "train" if train else "test"
        data_iter = tqdm.tqdm(enumerate(data_loader),
                              desc="EP_%s:%d" % (str_code, epoch),
                              total=len(data_loader),
                              bar_format="{l_bar}{r_bar}", ncols = 80)
        avg_loss = 0.0
        total_correct = 0
        total_element = 0
        # Iterate over data_iter (not data_loader) so the progress bar advances
        for i, data in data_iter:
            data = {key: value.to(self.device) for key, value in data.items()}
            log_probs = self.bert(data)
            loss = self.criterion(log_probs, data["label"])
            if train:
                self.optim_schedule.zero_grad()
                loss.backward()
                self.optim_schedule.step_and_update_lr()
            correct = log_probs.argmax(dim = -1).eq(data["label"]).sum().item()
            avg_loss += loss.item()
            total_correct += correct
            total_element += len(data["label"])
            post_fix = {
                "epoch": epoch,
                "iter": i,
                "avg_loss": avg_loss / (i + 1),
                "avg_acc": total_correct / total_element * 100,
                "loss": loss.item()
            }
            if i % self.log_freq == 0:
                data_iter.write(str(post_fix))
        print("EP%d_%s, avg_loss=" % (epoch, str_code), avg_loss / len(data_loader),
              "total_acc=", total_correct * 100.0 / total_element)

    def save(self, epoch, file_path = "output/bert_senti.model"):
        output_path = file_path + ".ep%d" % epoch
        torch.save(self.bert.cpu(), output_path)
        self.bert.to(self.device)
        print("EP:%d Model Saved on:" % epoch, output_path)
        return output_path
- Training script:
bert = BertSentiClassificationModel()
trainer = BertSentiTrainer(bert, args.vocab_size, train_data_loader,
                           test_data_loader, True, log_freq = args.freq)
for epoch in range(args.epochs):
    trainer.train(epoch)
    trainer.save(epoch)
    if test_data_loader is not None:
        trainer.test(epoch)
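The args namespace used throughout is never defined above; here is a plausible argparse sketch covering the fields the code touches (all defaults are assumptions of mine, not values from the original):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--seq_len", type = int, default = 100)       # max token length
parser.add_argument("--encoding", type = str, default = "utf-8")
parser.add_argument("--batch_size", type = int, default = 32)
parser.add_argument("--vocab_size", type = int, default = 21128)  # bert-base-chinese vocab
parser.add_argument("--freq", type = int, default = 50)           # logging frequency
parser.add_argument("--epochs", type = int, default = 1)
args = parser.parse_args()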
6. Experimental results
A single epoch of training already reaches about 92% accuracy, even with no preprocessing at all.
Possible improvements:
1. Preprocess the data (e.g. strip retweeted content, as noted above).
2. Train the embedding layer from scratch.
3. Pre-train BERT yourself with an LM objective (time-consuming and data-hungry).
Addendum:
Optimizer code:
'''A wrapper class for optimizer'''
import numpy as np

class ScheduledOptim():
    '''A simple wrapper class for learning rate scheduling'''
    def __init__(self, optimizer, d_model, n_warmup_steps):
        self._optimizer = optimizer
        self.n_warmup_steps = n_warmup_steps
        self.n_current_steps = 0
        self.init_lr = np.power(d_model, -0.5)

    def step_and_update_lr(self):
        "Step with the inner optimizer"
        self._update_learning_rate()
        self._optimizer.step()

    def zero_grad(self):
        "Zero out the gradients by the inner optimizer"
        self._optimizer.zero_grad()

    def _get_lr_scale(self):
        return np.min([
            np.power(self.n_current_steps, -0.5),
            np.power(self.n_warmup_steps, -1.5) * self.n_current_steps])

    def _update_learning_rate(self):
        ''' Learning rate scheduling per step '''
        self.n_current_steps += 1
        lr = self.init_lr * self._get_lr_scale()
        for param_group in self._optimizer.param_groups:
            param_group['lr'] = lr
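This is the warmup schedule from "Attention Is All You Need" (the Noam schedule). In formula form:

$$\mathrm{lr}(s) = d_{\text{model}}^{-0.5} \cdot \min\left(s^{-0.5},\; s \cdot n_{\text{warmup}}^{-1.5}\right)$$

The learning rate grows linearly over the first n_warmup_steps steps, then decays as the inverse square root of the step count. Passing d_model = 512 instead of BERT-base's 768 (as the trainer above does) only changes the constant factor in front.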