【Reference: Malicious Webpage Detection with PaddleNLP (5): Identifying Malicious Webpage Content with BERT - 飞桨AI Studio】
My own code 【Reference: Identifying Malicious Webpage Content with BERT - 飞桨AI Studio】
The code has been updated to follow a PyTorch-style training loop; this post records it.
import os
import sys
import codecs  # encoding conversion
import chardet  # third-party encoding detection module
import shutil
import re
import time
import numpy as np
import pandas as pd
import jieba
from tqdm import tqdm, trange
from bs4 import BeautifulSoup
from functools import partial
import paddle
import paddlenlp as ppnlp
from paddlenlp.data import Stack, Pad, Tuple
import paddle.nn.functional as F
import paddle.nn as nn
from visualdl import LogWriter
print(paddle.__version__)
Dataset
import numpy as np
class SelfDefinedDataset(paddle.io.Dataset):
    def __init__(self, data, tokenizer):
        super(SelfDefinedDataset, self).__init__()
        self.data = data
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        text = self.data[idx][0]
        label = self.data[idx][1]
        # tokenizer.encode tokenizes the text, maps tokens to IDs and adds the special tokens
        encoded_inputs = self.tokenizer.encode(text=text,
                                               max_seq_len=512,          # default max_seq_len=512; longer text is truncated
                                               pad_to_max_seq_len=True)  # pad to max_seq_len
        input_ids = encoded_inputs["input_ids"]
        # note: in earlier PaddleNLP versions, token_type_ids was called segment_ids
        segment_ids = encoded_inputs["token_type_ids"]
        # best to return everything as numpy arrays so the DataLoader can batch them
        return np.array(input_ids), np.array(segment_ids), np.array(label, dtype=np.int64)

    def __len__(self):
        return len(self.data)

    def get_labels(self):
        return ["0", "1"]
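The tokenizer.encode call in __getitem__ tokenizes the text, maps tokens to IDs and adds the [CLS]/[SEP] special tokens in one step. A minimal, hedged sketch to inspect its output (inspect_tokenizer is just a throwaway name for this check; with PaddleNLP 2.x the returned dict is expected to use the key token_type_ids):
inspect_tokenizer = ppnlp.transformers.BertTokenizer.from_pretrained("bert-base-chinese")
sample = inspect_tokenizer.encode(text="澳门百家乐 真人博彩", max_seq_len=16, pad_to_max_seq_len=True)
print(list(sample.keys()))       # expected: ['input_ids', 'token_type_ids']
print(len(sample["input_ids"]))  # 16: truncated/padded to max_seq_len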
# Read a file with one "text<TAB>label" pair per line into a list of [text, label] lists
def txt_to_list(file_name):
    res_list = []
    with open(file_name, encoding='utf-8') as f:
        for line in f:
            res_list.append(line.strip().split('\t'))
    return res_list
trainlst = txt_to_list('webtrain.txt')
devlst = txt_to_list('webdev.txt')
testlst = txt_to_list('webtest.txt')
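txt_to_list assumes each line of webtrain.txt / webdev.txt / webtest.txt holds one tab-separated pair of webpage text and label, with "0" for normal pages and "1" for malicious pages (matching get_labels and label_map below). A hedged illustration with a hypothetical file name:
demo_lines = [
    "登入 簡 錯誤 用戶帳號 密碼 記住我的戶口號碼\t0",  # hypothetical normal-page text
    "娱乐城 百家乐 澳门赌场 真人百家乐博彩网\t1",      # hypothetical malicious-page text
]
with open("webdemo.txt", "w", encoding="utf-8") as f:  # webdemo.txt is a made-up file name
    f.write("\n".join(demo_lines))
print(txt_to_list("webdemo.txt"))  # [['登入 ...', '0'], ['娱乐城 ...', '1']]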
# Use ppnlp.transformers.BertTokenizer for preprocessing: the tokenizer turns raw input text into the input format the model accepts.
tokenizer = ppnlp.transformers.BertTokenizer.from_pretrained("bert-base-chinese")
train_ds = SelfDefinedDataset(trainlst,tokenizer)
dev_ds = SelfDefinedDataset(devlst,tokenizer)
test_ds = SelfDefinedDataset(testlst,tokenizer)
# Quick check
# next(iter(train_ds))
# labels
label_list = train_ds.get_labels()
print(label_list)
print("Number of training samples: {}".format(len(train_ds)))
print("Number of validation samples: {}".format(len(dev_ds)))
print("Number of test samples: {}".format(len(test_ds)))
DataLoader
If you run out of GPU memory, reduce batch_size or max_seq_len.
from paddle.io import DataLoader

batch_size = 16
train_loader = DataLoader(dataset=train_ds,
                          batch_size=batch_size,
                          shuffle=True)
dev_loader = DataLoader(dataset=dev_ds,
                        batch_size=batch_size,
                        shuffle=False)  # no need to shuffle for evaluation
test_loader = DataLoader(dataset=test_ds,
                         batch_size=batch_size,
                         shuffle=False)  # no need to shuffle for evaluation
# next(iter(train_loader))  # quick check
# The DataLoader automatically converts the numpy arrays to Tensors
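Because __getitem__ already pads every sample to max_seq_len, the default collation above works as-is. A hedged alternative sketch using the Stack/Pad/Tuple helpers imported at the top: drop pad_to_max_seq_len=True in the Dataset and pad each batch to its own longest sample instead, which saves memory on short pages (left commented out so it does not change the pipeline above):
batchify_fn = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=tokenizer.pad_token_id),        # pad input_ids per batch
    Pad(axis=0, pad_val=tokenizer.pad_token_type_id),   # pad token_type_ids per batch
    Stack(dtype="int64"),                                # stack labels
): fn(samples)
# train_loader = DataLoader(dataset=train_ds, batch_size=batch_size,
#                           shuffle=True, collate_fn=batchify_fn)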
Model
# Load BertForSequenceClassification, the fine-tuning network for text classification: it adds a fully connected classification layer on top of the BERT model.
# Malicious webpage detection here is a binary classification task, so num_classes is set to 2.
model = ppnlp.transformers.BertForSequenceClassification.from_pretrained("bert-base-chinese", num_classes=2)
# 【Reference: [modeling — PaddleNLP docs](https://paddlenlp.readthedocs.io/zh/latest/source/paddlenlp.transformers.bert.modeling.html)】
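A quick, hedged sanity check that the classification head produces num_classes=2 logits per sample; this just runs one forward pass on a batch from train_loader:
input_ids, segment_ids, labels = next(iter(train_loader))
logits = model(input_ids, segment_ids)  # positional args: input_ids, token_type_ids
print(logits.shape)                     # expected: [batch_size, 2]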
Training
This part of the code is unchanged.
# Training hyperparameters
# learning rate
learning_rate = 1e-4
# number of training epochs
epochs = 20
# warmup proportion for the learning rate schedule
warmup_proption = 0.1
# weight decay coefficient
weight_decay = 0.01
num_training_steps = len(train_loader) * epochs
num_warmup_steps = int(warmup_proption * num_training_steps)
def get_lr_factor(current_step):
    if current_step < num_warmup_steps:
        return float(current_step) / float(max(1, num_warmup_steps))
    else:
        return max(0.0,
                   float(num_training_steps - current_step) /
                   float(max(1, num_training_steps - num_warmup_steps)))
# learning rate scheduler
lr_scheduler = paddle.optimizer.lr.LambdaDecay(learning_rate, lr_lambda=get_lr_factor)
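get_lr_factor implements linear warmup over the first num_warmup_steps steps followed by linear decay to zero. PaddleNLP ships an equivalent built-in scheduler; a hedged alternative that should be interchangeable with the LambdaDecay setup above:
from paddlenlp.transformers import LinearDecayWithWarmup
# lr_scheduler = LinearDecayWithWarmup(learning_rate, num_training_steps, warmup_proption)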
# optimizer: AdamW, with weight decay applied to all parameters except biases and LayerNorm weights
optimizer = paddle.optimizer.AdamW(
    learning_rate=lr_scheduler,
    parameters=model.parameters(),
    weight_decay=weight_decay,
    apply_decay_param_fun=lambda x: x in [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ])
# loss function
criterion = paddle.nn.loss.CrossEntropyLoss()
# evaluation metric
metric = paddle.metric.Accuracy()

# evaluation function; it returns loss and accuracy so VisualDL can record them
@paddle.no_grad()  # no gradients are needed during evaluation
def evaluate(model, criterion, metric, data_loader):
    model.eval()
    metric.reset()
    losses = []
    for batch in data_loader:
        input_ids, segment_ids, labels = batch
        logits = model(input_ids, segment_ids)
        loss = criterion(logits, labels)
        losses.append(loss.numpy())
        correct = metric.compute(logits, labels)
        metric.update(correct)
    accu = metric.accumulate()
    print("eval loss: %.5f, accu: %.5f" % (np.mean(losses), accu))
    model.train()
    metric.reset()
    return np.mean(losses), accu
from tqdm import tqdm

# Start training
global_step = 0
with LogWriter(logdir="./log") as writer:
    for epoch in range(1, epochs + 1):
        for step, batch in enumerate(tqdm(train_loader), start=1):  # iterate over the training batches
            input_ids, segment_ids, labels = batch
            logits = model(input_ids, segment_ids)
            loss = criterion(logits, labels)  # compute the loss
            probs = F.softmax(logits, axis=1)
            correct = metric.compute(probs, labels)
            metric.update(correct)
            acc = metric.accumulate()

            global_step += 1
            if global_step % 1 == 0:  # logs every step; raise the interval to reduce output
                print("global step %d, epoch: %d, batch: %d, loss: %.5f, acc: %.5f"
                      % (global_step, epoch, step, loss, acc))
                # record the training process
                writer.add_scalar(tag="train/loss", step=global_step, value=loss)
                writer.add_scalar(tag="train/acc", step=global_step, value=acc)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()  # called clear_gradients() in older Paddle versions
        eval_loss, eval_acc = evaluate(model, criterion, metric, dev_loader)
        # record the evaluation results
        writer.add_scalar(tag="eval/loss", step=epoch, value=eval_loss)
        writer.add_scalar(tag="eval/acc", step=epoch, value=eval_acc)
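test_loader is built above but never used; once training finishes, the same evaluate helper can report held-out test performance, and the scalars written to ./log can be viewed by launching VisualDL with "visualdl --logdir ./log". A minimal sketch:
test_loss, test_acc = evaluate(model, criterion, metric, test_loader)  # final test-set evaluation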
Prediction
# Predict a single piece of text
def predict(model, text, tokenizer, label_map):
    encoded_inputs = tokenizer.encode(text=text,
                                      max_seq_len=200,          # default max_seq_len=512; longer text is truncated
                                      pad_to_max_seq_len=True)  # pad automatically
    input_ids = encoded_inputs["input_ids"]
    # note: in earlier PaddleNLP versions, token_type_ids was called segment_ids
    segment_ids = encoded_inputs["token_type_ids"]

    model.eval()
    input_ids = paddle.to_tensor([input_ids])      # add a batch dimension (batch_size=1)
    segment_ids = paddle.to_tensor([segment_ids])
    logits = model(input_ids, segment_ids)
    # print(logits)
    idx = paddle.argmax(logits, axis=1).tolist()   # convert to a Python list (no item() method)
    labels = label_map[idx[0]]
    return labels
data = ['娱乐城 百家乐 澳门百家乐 金宝博 澳门赌场 汉庭网络 真人百家乐博彩网皇冠网全讯网 皇冠现金网赌博网站 官方网站皇冠足球比分', '登入 简 錯誤 錯誤 用戶帳號 密碼 記住我的戶口號碼登入登入重要告示 私隱政策桌面版本']
label_map = {0: 'normal webpage', 1: 'malicious webpage'}
for text in data:
    print('Input text: {}\nPredicted label: {}'.format(text, predict(model, text, tokenizer, label_map)))