本文主要是针对入门级别的Bert使用,包括英文文本分类和中文文本分类。
这部分主要使用Bert进行情感分析,属于中文文本分类,同样使用BertForSequenceClassification
数据集中包括三个情感分类等级[-1,0,1]
流程和第一部分一致,主要修改地方是在Bert的config文件中将类别设置成3,并将数据集中的[-1,0,1],变化成[0,1,2]的形式,bert的预训练模型使用
bert-base-chinese(注:中文任务应使用 bert-base-chinese;bert-base-uncased 是英文模型,并不存在 bert-base-uncased-cn)
这个数据集包括两列:['label', 'txt']
首先读入数据:
# Read the train/dev splits; both are tab-separated (.tsv) files
# with columns ['label', 'txt'].
df = pd.read_csv(os.path.join(data_path, "train.tsv"), sep='\t')
df_dev = pd.read_csv(os.path.join(data_path, "dev.tsv"), sep='\t')
# Peek at the first rows of each split.
print("train:", df.head())
print("dev:", df_dev.head())
提取句子并进行处理
#提取语句并处理
# Wrap every training sentence with BERT's special tokens.
sentencses = ['[CLS] ' + sent + ' [SEP]' for sent in df.txt.values]
labels = df.label.values
# Remap the raw labels {-1, 0, 1} onto contiguous class ids:
# 0 = neutral, 1 = positive, everything else (i.e. -1) -> 2 = negative.
# (Plain comprehension instead of the original map(lambda, [x for x in ...])
# which wrapped an already-iterable array in a redundant list comprehension.)
labels = [0 if x == 0 else 1 if x == 1 else 2 for x in labels]
print("train label:", labels[100:110])
print("第一句话:", sentencses[0])
# Load the WordPiece tokenizer matching the pre-trained checkpoint.
tokenizer = BertTokenizer.from_pretrained(bert_pre_tokenizer, do_lower_case=True)
tokenized_sents = [tokenizer.tokenize(sent) for sent in sentencses]
print("tokenized的第一句话:", tokenized_sents[0])
定义Bert的输入格式
- input_ids:处理后的句子(加上 [CLS]/[SEP] 并转成词表 id)
- token_type_ids:单句任务全为 0([0,0,0,…]),使用默认值即可,不需要显式传入
- attention_mask:区分真实 token 与 padding 的掩码
# Maximum sequence length fed to BERT.
MAX_LEN = 80
# --- training set ---
# Convert the WordPiece tokens to vocabulary ids (word -> idx).
input_ids = [tokenizer.convert_tokens_to_ids(sent) for sent in tokenized_sents]
print("转化后的第一个句子:", input_ids[0])
# Pad/truncate every sequence to exactly MAX_LEN (80) — the original comment
# said 128, which contradicted MAX_LEN above. Longer sequences are cut at
# the tail, shorter ones are zero-padded at the tail.
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
print("Padding 第一个句子:", input_ids[0])
# Attention mask: 1.0 for real tokens (id > 0), 0.0 for padding.
attention_masks = [[float(i > 0) for i in seq] for seq in input_ids]
print("第一个attention mask:", attention_masks[0])
#验证集部分
#构建验证集
# --- validation set: identical preprocessing to the training set ---
dev_sentencses = ['[CLS] ' + sent + ' [SEP]' for sent in df_dev.txt.values]
dev_labels = df_dev.label.values
print("dev_label:", dev_labels[100:110])
# Remap {-1, 0, 1} -> {2, 0, 1} exactly as for the training labels
# (comprehension replaces the redundant map(lambda, [x for x in ...])).
dev_labels = [0 if x == 0 else 1 if x == 1 else 2 for x in dev_labels]
dev_tokenized_sents = [tokenizer.tokenize(sent) for sent in dev_sentencses]
dev_input_ids = [tokenizer.convert_tokens_to_ids(sent) for sent in dev_tokenized_sents]
dev_input_ids = pad_sequences(dev_input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
# 1.0 for real tokens, 0.0 for padding.
dev_attention_masks = [[float(i > 0) for i in seq] for seq in dev_input_ids]
构建训练集和验证集的dataloader
# Turn every preprocessed array into a tensor.
train_inputs = torch.tensor(input_ids)
train_labels = torch.tensor(labels)
train_masks = torch.tensor(attention_masks)
validation_inputs = torch.tensor(dev_input_ids)
validation_labels = torch.tensor(dev_labels)
validation_masks = torch.tensor(dev_attention_masks)

batch_size = 32

# Training batches are drawn in random order; validation keeps file order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)
装载预训练模型,这里是bert的中文模型
#装载预训练bert模型
# Build the model from the pre-trained checkpoint; the config carries
# num_labels=3 for the three sentiment classes.
modelConfig = BertConfig.from_pretrained(bert_config)
model = BertForSequenceClassification.from_pretrained(bert_pre_model, config=modelConfig)
# Move the model to GPU, then echo its architecture.
model.cuda()
print(model)
定义优化器
# Split parameters into two groups: bias / LayerNorm (gamma, beta) terms
# get no weight decay, everything else decays at 0.01.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']

decayed = [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)]
undecayed = [p for n, p in param_optimizer if any(nd in n for nd in no_decay)]
optimizer_grouped_parameters = [
    {'params': decayed, 'weight_decay_rate': 0.01},
    {'params': undecayed, 'weight_decay_rate': 0.0},
]

# BertAdam with linear warmup over the first 10% of steps.
optimizer = BertAdam(optimizer_grouped_parameters, lr=2e-5, warmup=.1)
训练部分
def flat_accuracy(preds, labels):
    """Return the fraction of rows in `preds` whose argmax equals `labels`."""
    predictions = np.argmax(preds, axis=1).flatten()
    truth = labels.flatten()
    # Mean of the boolean match vector == matches / total.
    return (predictions == truth).mean()
# Per-step training losses, kept for plotting later.
train_loss_set = []
epochs = 4
for _ in trange(epochs, desc="Epoch"):
    # ---- training phase ----
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for batch in train_dataloader:
        # Move the whole batch to the GPU and unpack it.
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
        optimizer.zero_grad()
        # With labels supplied, the model returns (loss, logits, ...).
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        train_loss_set.append(loss.item())
        loss.backward()
        optimizer.step()
        # Running statistics for the epoch-average loss.
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
    print("Train loss: {}".format(tr_loss / nb_tr_steps))

    # ---- validation phase ----
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    for batch in validation_dataloader:
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
        with torch.no_grad():
            # No labels passed, so element 0 of the output is the logits.
            logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)[0]
        eval_accuracy += flat_accuracy(logits.detach().cpu().numpy(),
                                       b_labels.to('cpu').numpy())
        nb_eval_steps += 1
    print("Validation Accuracy: {}".format(eval_accuracy / nb_eval_steps))
测试一下效果
# ---- quick sanity check on a single sentence ----
# NOTE: the original bound the wrapped sentence to the name `str`,
# shadowing the builtin; renamed here.
raw_text = "真的好吗?"
wrapped_text = '[CLS] ' + raw_text + ' [SEP]'
text_tokens = tokenizer.tokenize(wrapped_text)
print(text_tokens)
# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
text_input_ids = [tokenizer.convert_tokens_to_ids(text_tokens)]
text_input_ids = pad_sequences(text_input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
print(text_input_ids)
# Attention mask for the single padded sequence.
text_mask = [[float(i > 0) for i in text_input_ids[0]]]
text_label = [0]  # placeholder label, unused by the forward pass
text_input_ids = torch.tensor(text_input_ids).cuda()
text_mask = torch.tensor(text_mask).cuda()
text_label = torch.tensor(text_label).cuda()
print("size:", text_input_ids.size(), text_mask.size(), text_label.size())
# Predicted class id: 0 = neutral, 1 = positive, 2 = negative.
logits_str = model(text_input_ids, token_type_ids=None, attention_mask=text_mask)[0]
print(np.argmax(logits_str.detach().cpu().numpy(), axis=1))
结果是个中性,再看看夸张点的
这里是负面的