【PaddleNLP】Malicious Web Page Identification (5): Identifying Malicious Web Page Content with BERT

【Reference: 使用PaddleNLP进行恶意网页识别(五):用BERT识别恶意网页内容 - 飞桨AI Studio】

My own code 【Reference: 用BERT识别恶意网页内容 - 飞桨AI Studio】

The code has been updated to follow a PyTorch-style training workflow; this post records the changes.

import os
import sys
import codecs # encoding conversion
import chardet # third-party encoding detection module
import shutil
import re
import time
import numpy as np
import pandas as pd
import jieba
from tqdm import tqdm, trange
from bs4 import BeautifulSoup
from functools import partial
import paddle
import paddlenlp as ppnlp
from paddlenlp.data import Stack, Pad, Tuple
import paddle.nn.functional as F
import paddle.nn as nn
from visualdl import LogWriter

print(paddle.__version__)

Dataset

import numpy as np

class SelfDefinedDataset(paddle.io.Dataset):
    def __init__(self, data, tokenizer):
        super(SelfDefinedDataset, self).__init__()
        self.data = data
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        text = self.data[idx][0]
        label = self.data[idx][1]
        # tokenizer.encode tokenizes the text, maps tokens to IDs, and adds the special tokens
        encoded_inputs = self.tokenizer.encode(text=text,
                                               max_seq_len=512,  # default max_seq_len=512; longer inputs are truncated automatically
                                               pad_to_max_seq_len=True)  # pad to max_seq_len
        input_ids = encoded_inputs["input_ids"]
        # note: in earlier PaddleNLP versions, token_type_ids was called segment_ids
        segment_ids = encoded_inputs["token_type_ids"]

        return np.array(input_ids), np.array(segment_ids), np.array(label, dtype=np.int64)  # return numpy arrays so DataLoader can batch them

    def __len__(self):
        return len(self.data)
        
    def get_labels(self):
        return ["0", "1"]

# Read a tab-separated data file into a list of [text, label] pairs
def txt_to_list(file_name):
    res_list = []
    with open(file_name) as f:  # close the file handle when done
        for line in f:
            res_list.append(line.strip().split('\t'))
    return res_list

trainlst = txt_to_list('webtrain.txt')
devlst = txt_to_list('webdev.txt')
testlst = txt_to_list('webtest.txt')
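
Each line of these files is assumed to be "<page text>\t<label>", with label 0 for normal and 1 for malicious pages, so each list entry is a [text, label] pair. A quick peek to confirm the format:

# Illustrative check; the exact output depends on your data files
print(trainlst[0][1], trainlst[0][0][:50])  # label, then the first 50 characters of the page text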

# Use ppnlp.transformers.BertTokenizer for preprocessing: the tokenizer converts raw input text into the input format the model expects.
tokenizer = ppnlp.transformers.BertTokenizer.from_pretrained("bert-base-chinese")
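
To see what the tokenizer produces, a minimal check (the sample string here is made up):

# Inspect the encoded output for a short hypothetical string
sample = tokenizer.encode(text="澳门赌场", max_seq_len=8, pad_to_max_seq_len=True)
print(sample["input_ids"])       # token IDs, padded/truncated to max_seq_len
print(sample["token_type_ids"])  # all zeros for a single-sentence input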


train_ds = SelfDefinedDataset(trainlst,tokenizer)
dev_ds = SelfDefinedDataset(devlst,tokenizer)
test_ds = SelfDefinedDataset(testlst,tokenizer)

# quick check
# next(iter(train_ds))

# label names
label_list = train_ds.get_labels()
print(label_list)

print("训练集样本个数:{}".format(len(train_ds)))
print("验证集样本个数:{}".format(len(dev_ds)))
print("测试集样本个数:{}".format(len(test_ds)))

DataLoader

If you run out of GPU memory, reduce batch_size or max_seq_len.

from paddle.io import DataLoader


batch_size = 16
train_loader = DataLoader(dataset=train_ds,
                          batch_size=batch_size,
                          shuffle=True)
dev_loader = DataLoader(dataset=dev_ds,
                        batch_size=batch_size,
                        shuffle=False)  # evaluation data does not need shuffling
test_loader = DataLoader(dataset=test_ds,
                         batch_size=batch_size,
                         shuffle=False)  # evaluation data does not need shuffling

# next(iter(train_loader))  # quick check
# DataLoader automatically converts the numpy arrays into Tensors
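
A quick shape check on one batch (assuming batch_size=16 and max_seq_len=512 as above):

input_ids, segment_ids, labels = next(iter(train_loader))
print(input_ids.shape, segment_ids.shape, labels.shape)  # expected: [16, 512] [16, 512] [16]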

Model

# Load BertForSequenceClassification, the fine-tuning network for text classification:
# it adds a fully connected classification layer on top of the pretrained BERT model.
# Malicious web page identification is a binary classification task, so num_classes=2.
model = ppnlp.transformers.BertForSequenceClassification.from_pretrained("bert-base-chinese", num_classes=2)
# 【Reference: [modeling — PaddleNLP docs](https://paddlenlp.readthedocs.io/zh/latest/source/paddlenlp.transformers.bert.modeling.html)】
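
As a rough sanity check on model size (a sketch; the ~102M figure is an estimate for bert-base-chinese):

# Count trainable parameters
n_params = sum(int(np.prod(p.shape)) for p in model.parameters())
print("trainable parameters: %.1fM" % (n_params / 1e6))  # roughly 102M for bert-base-chinese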

Training

This part of the code is unchanged from the original notebook.

# Training hyperparameters

# learning rate
learning_rate = 1e-4
# number of training epochs
epochs = 20
# proportion of steps used for learning-rate warmup
warmup_proportion = 0.1
# weight decay coefficient
weight_decay = 0.01

num_training_steps = len(train_loader) * epochs
num_warmup_steps = int(warmup_proportion * num_training_steps)

# lr factor: rises linearly from 0 to 1 during warmup, then decays linearly to 0
def get_lr_factor(current_step):
    if current_step < num_warmup_steps:
        return float(current_step) / float(max(1, num_warmup_steps))
    else:
        return max(0.0,
                    float(num_training_steps - current_step) /
                    float(max(1, num_training_steps - num_warmup_steps)))
# learning-rate scheduler: linear warmup followed by linear decay
lr_scheduler = paddle.optimizer.lr.LambdaDecay(learning_rate, lr_lambda=get_lr_factor)
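
A quick check of the schedule at a few milestones, using the step counts computed above: the factor should climb from 0 to 1 across the warmup steps and then decay back to 0 by the final step.

for s in [0, num_warmup_steps // 2, num_warmup_steps, num_training_steps]:
    print(s, round(get_lr_factor(s), 4))  # expect roughly 0.0, 0.5, 1.0, 0.0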

# Optimizer: AdamW; bias and norm parameters are excluded from weight decay below
optimizer = paddle.optimizer.AdamW(
    learning_rate=lr_scheduler,
    parameters=model.parameters(),
    weight_decay=weight_decay,
    apply_decay_param_fun=lambda x: x in [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ])
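
To verify which parameters were excluded from weight decay, you can list the bias/norm parameters, e.g.:

# Sketch: the parameters that apply_decay_param_fun filters out
no_decay = [n for n, p in model.named_parameters()
            if any(nd in n for nd in ["bias", "norm"])]
print(len(no_decay), no_decay[:3])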

# loss function
criterion = paddle.nn.CrossEntropyLoss()
# evaluation metric
metric = paddle.metric.Accuracy()

# evaluation loop; returns loss and accuracy so that VisualDL can log them
def evaluate(model, criterion, metric, data_loader):
    model.eval()
    metric.reset()
    losses = []
    for batch in data_loader:
        input_ids, segment_ids, labels = batch
        logits = model(input_ids, segment_ids)
        loss = criterion(logits, labels)
        losses.append(loss.numpy())
        correct = metric.compute(logits, labels)
        metric.update(correct)
        accu = metric.accumulate()
    print("eval loss: %.5f, accu: %.5f" % (np.mean(losses), accu))
    model.train()
    metric.reset()
    return np.mean(losses), accu
# Training loop
global_step = 0
with LogWriter(logdir="./log") as writer:
    for epoch in range(1, epochs + 1):
        for step, batch in enumerate(tqdm(train_loader), start=1):  # iterate over training batches
            input_ids, segment_ids, labels = batch
            logits = model(input_ids, segment_ids)
            loss = criterion(logits, labels)  # compute loss

            probs = F.softmax(logits, axis=1)
            correct = metric.compute(probs, labels)
            metric.update(correct)
            acc = metric.accumulate()

            global_step += 1
            if global_step % 1 == 0:  # log every step; raise the modulus to log less often
                print("global step %d, epoch: %d, batch: %d, loss: %.5f, acc: %.5f" % (global_step, epoch, step, float(loss), acc))
                # log the training curves
                writer.add_scalar(tag="train/loss", step=global_step, value=float(loss))
                writer.add_scalar(tag="train/acc", step=global_step, value=acc)

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()  # clear_gradients() is the legacy name
            
        eval_loss, eval_acc = evaluate(model, criterion, metric, dev_loader)
        # log the evaluation curves
        writer.add_scalar(tag="eval/loss", step=epoch, value=eval_loss)
        writer.add_scalar(tag="eval/acc", step=epoch, value=eval_acc)

Prediction

# Predict a single example
def predict(model, text, tokenizer, label_map):
    encoded_inputs = tokenizer.encode(text=text,
                                      max_seq_len=200,  # default max_seq_len=512; longer inputs are truncated automatically
                                      pad_to_max_seq_len=True)  # automatic padding
    input_ids = encoded_inputs["input_ids"]
    # note: in earlier PaddleNLP versions, token_type_ids was called segment_ids
    segment_ids = encoded_inputs["token_type_ids"]

    model.eval()
    input_ids = paddle.to_tensor([input_ids])  # add a batch dimension (batch_size=1)
    segment_ids = paddle.to_tensor([segment_ids])
    logits = model(input_ids, segment_ids)
    # print(logits)
    idx = paddle.argmax(logits, axis=1).tolist()  # convert to a Python list (rather than calling .item())
    labels = label_map[idx[0]]
    return labels
# Sample page texts (kept in Chinese, since the model is bert-base-chinese)
data = ['娱乐城 百家乐 澳门百家乐 金宝博 澳门赌场 汉庭网络 真人百家乐博彩网皇冠网全讯网 皇冠现金网赌博网站 官方网站皇冠足球比分', '登入 简 錯誤 錯誤 用戶帳號 密碼 記住我的戶口號碼登入登入重要告示 私隱政策桌面版本']
label_map = {0: 'normal page', 1: 'malicious page'}

for text in data:
    print('Input text: {}\nPredicted label: {}'.format(text, predict(model, text, tokenizer, label_map)))
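
Note that test_loader is built earlier but never consumed; the final test-set score can be obtained with the evaluate helper defined above:

# Accuracy on the held-out test set
test_loss, test_acc = evaluate(model, criterion, metric, test_loader)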