语料链接:https://pan.baidu.com/s/1YxGGYmeByuAlRdAVov_ZLg
提取码:tzao
neg.txt和pos.txt各5000条酒店评论,每条评论一行。
1. 导包和设定超参数
import numpy as np
import random
import torch
import matplotlib.pylab as plt
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
# --- Hyperparameters --------------------------------------------------------
SEED = 123
BATCH_SIZE = 16
learning_rate = 2e-5
weight_decay = 1e-2
epsilon = 1e-8

# Seed Python's, NumPy's and PyTorch's generators (in that order) with the
# same value so that repeated runs draw the same random numbers.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(SEED)
2. 数据预处理
2.1 读取文件
def readFile(filename):
    """Read a UTF-8 text file and return its lines.

    Args:
        filename: Path of the file to read.

    Returns:
        A list with one string per line. Each string keeps its trailing
        newline character, exactly as ``file.readlines()`` produces it.
    """
    with open(filename, encoding='utf-8') as f:
        return f.readlines()
pos_text = readFile('./hotel/pos.txt')
neg_text = readFile('./hotel/neg.txt')
sentences = pos_text + neg_text  # positives first, then negatives

# Labels follow the same order as `sentences`: 1.0 = positive, 0.0 = negative.
pos_targets = np.ones(len(pos_text))    # (5000,) for the provided corpus
neg_targets = np.zeros(len(neg_text))   # (5000,) for the provided corpus
targets = np.concatenate((pos_targets, neg_targets), axis=0).reshape(-1, 1)  # column vector, (10000, 1)
total_targets = torch.tensor(targets)
2.2 BertTokenizer进行编码,将每一句转成数字
model_name = 'bert-base-chinese'
cache_dir = './sample_data/'
tokenizer = BertTokenizer.from_pretrained(model_name, cache_dir=cache_dir)

# Demo on one positive review: raw text, its tokens, its ids, and the ids
# mapped back to tokens (which shows the special tokens added by `encode`).
print(pos_text[2])
print(tokenizer.tokenize(pos_text[2]))
_demo_ids = tokenizer.encode(pos_text[2])
print(_demo_ids)
print(tokenizer.convert_ids_to_tokens(_demo_ids))
不错,下次还考虑入住。交通也方便,在餐厅吃的也不错。
['不', '错', ',', '下', '次', '还', '考', '虑', '入', '住', '。', '交', '通', '也', '方', '便', ',', '在', '餐', '厅', '吃', '的', '也', '不', '错', '。']
[101, 679, 7231, 8024, 678, 3613, 6820, 5440, 5991, 1057, 857, 511, 769, 6858, 738, 3175, 912, 8024, 1762, 7623, 1324, 1391, 4638, 738, 679, 7231, 511, 102]
['[CLS]', '不', '错', ',', '下', '次', '还', '考', '虑', '入', '住', '。', '交', '通', '也', '方', '便', ',', '在', '餐', '厅', '吃', '的', '也', '不', '错', '。', '[SEP]']
为了使每一句的长度相等,稍作处理;
# Convert one sentence to a fixed-length id list: text longer than
# `limit_size` characters is truncated, shorter results are padded; with the
# leading and trailing special ids the total length is limit_size + 2
# (128 by default).
def convert_text_to_token(tokenizer, sentence, limit_size=126):
    """Encode ``sentence`` into exactly ``limit_size + 2`` token ids.

    The sentence is first cut to its first ``limit_size`` characters, then
    passed to ``tokenizer.encode``. The result is right-padded with 0 (the
    PAD id this script relies on) up to ``limit_size + 2`` entries.

    Args:
        tokenizer: Object exposing ``encode(text)`` that returns a sequence
            of int ids. It is expected to add the leading/trailing special
            ids itself (two ids in total for the BERT tokenizer used here).
        sentence: Raw text to encode.
        limit_size: Maximum number of characters of ``sentence`` to keep.

    Returns:
        A new list of ``limit_size + 2`` ints.
    """
    target_len = limit_size + 2
    tokens = list(tokenizer.encode(sentence[:limit_size]))

    # The character-level cut only bounds the id count when the tokenizer
    # emits at most one id per character. If it emitted more, trim the middle
    # and keep the final (closing) id so every row has the same length.
    if len(tokens) > target_len:
        tokens = tokens[:target_len - 1] + tokens[-1:]

    # Pad with id 0; a non-positive count yields an empty list, so no guard
    # is needed for already-full sequences.
    tokens.extend([0] * (target_len - len(tokens)))
    return tokens
# Encode every review into an equal-length id list, then stack the lists into
# a single 2-D tensor (one row per review).
input_ids = [convert_text_to_token(tokenizer, sen) for sen in sentences]
input_tokens = torch.tensor(input_ids)
print(input_tokens.shape) # torch.Size([10000, 128])
2.3 attention_masks, 在一个文本中,如果是PAD符号则是0,否则就是1
# Build the attention masks
def attention_masks(input_ids):
    """Return one attention mask per id sequence.

    Args:
        input_ids: Iterable of id sequences (e.g. the padded id lists built
            for every sentence).

    Returns:
        A list of lists of floats with the same layout as ``input_ids``:
        ``1.0`` where the id is greater than 0 and ``0.0`` where it is 0,
        i.e. at the positions padded with id 0.
    """
    return [[float(token_id > 0) for token_id in seq] for seq in input_ids]
# Compute the mask for every encoded review and stack the masks into a
# tensor with the same layout as the id tensor.
atten_masks = attention_masks(input_ids)
attention_tokens = torch.tensor(atten_masks)
print(attention_tokens.shape) # torch.Size([10000, 128])
- 构造 input_ids 和 atten_masks 的目的,和前面一节中提到的 `.encode_plus` 函数返回的 input_ids 和 attention_mask 一样。
- token_type_ids(上文所说的 input_type_ids)和本次任务无关,它针对的是每个训练样本包含两个句子的任务(如问答任务)。
2.4 划分训练集和测试集
- 两个划分函数的参数 random_state 和 test_size 值要一致,才能使得 train_inputs 和 train_masks一一对应。
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples as the test set. random_state is fixed so that
# the separate split of the attention masks, which must use the same
# random_state and test_size, selects exactly the same rows.
train_inputs, test_inputs, train_labels, test_labels = train_test_split(input_tokens, total_targets,
random_state=666, test_size=0.2)
train_masks, test_masks, _, _ = train_test_split(attention_token