PaddlePaddle基本用法详解（四）、PaddlePaddle训练文本分类模型

最新推荐文章于 2025-04-01 22:17:19 发布

郭庆汝

最新推荐文章于 2025-04-01 22:17:19 发布

阅读量2.2k

点赞数

文章标签： paddlepaddle 文本分类

本文链接：https://blog.csdn.net/guoqingru0311/article/details/124289969

版权

paddlepaddle 同时被 2 个专栏收录

4 篇文章

订阅专栏

文本分类

1 篇文章

订阅专栏

本文详细介绍了如何使用PaddlePaddle训练文本分类模型，包括数据预处理、字典生成、模型搭建、训练过程及模型保存。通过创建字典文件、对文本进行编码、构建CNN网络模型并进行训练，最终得到一个中文新闻分类模型。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

PaddlePaddle基本用法详解（四）、PaddlePaddle训练文本分类模型

在这里插入图片描述

1、原始数据：

2、转换得到的字典文件格式：

3、文本数据预处理后得到的训练集数据格式：

4、目录结构如下所示：

5、数据预处理以及模型训练代码：01_txt_pre.py

# 中文资讯分类示例
# 任务：根据样本，训练模型，将新的文本划分到正确的类别
'''
数据来源：从网站上爬取56821条中文新闻摘要
数据内容：包含10类(国际、文化、娱乐、体育、财经、汽车、教育、科技、房产、证券)
'''

############################## 第一部分：数据预处理 ##############################
import os
from multiprocessing import cpu_count
import numpy as np
import paddle
import paddle.fluid as fluid
import matplotlib.pyplot as plt
# Shared dataset locations
data_root = "./data/news_classify/"  # directory that holds every dataset file
data_file = "news_classify_data.txt"  # raw samples
test_file = "test_list.txt"  # encoded test split
train_file = "train_list.txt"  # encoded training split
dict_file = "dict_txt.txt"  # character dictionary

# Full paths; data_root already ends with "/", so joining adds no separator.
data_file_path = os.path.join(data_root, data_file)  # raw samples
test_file_path = os.path.join(data_root, test_file)  # test split
train_file_path = os.path.join(data_root, train_file)  # training split
dict_file_path = os.path.join(data_root, dict_file)  # dictionary


# 生成字典文件
def create_dict():
    """Build the character dictionary and write it to ``dict_file_path``.

    Every line of ``data_file_path`` is split on ``"_!_"`` and the last field
    is taken as the headline. Each distinct character found in the headlines
    gets an integer id, and ``"<unk>"`` is appended as the fallback entry for
    characters that are not in the dictionary.

    Ids are assigned to the sorted characters starting at 0, so that

    * the same corpus always produces the same encoding, and
    * ids are contiguous: every id, including the one for ``"<unk>"``, is
      smaller than the number of entries, which is the size the embedding
      table is created with.

    The dictionary is stored as the ``str()`` of a ``dict`` on a single line.
    """
    with open(data_file_path, "r", encoding="utf-8") as f:
        # The headline is the last "_!_"-separated field; drop the line break.
        titles = [line.split("_!_")[-1].replace("\n", "") for line in f]

    # A set removes duplicates; sorting makes the id assignment reproducible.
    charset = sorted({ch for title in titles for ch in title})

    dict_txt = {ch: idx for idx, ch in enumerate(charset)}
    dict_txt["<unk>"] = len(dict_txt)  # fallback id for unknown characters

    with open(dict_file_path, "w", encoding="utf-8") as f:
        f.write(str(dict_txt))

    print("****" * 5, "生成字典结束，文件保存在：", dict_file_path, "****" * 5)


# 对一行标题进行数字编码
def line_encoding(title, dict_txt, label):
    """
    Encode one headline as comma-separated ids followed by its label.

    :param title: headline text, encoded character by character
    :param dict_txt: mapping from character to id; the "<unk>" entry is used
        for any character that is not a key of the mapping
    :param label: label text written after a tab
    :return: one line of the form "id,id,...<TAB>label" ending with a line break
    """
    codes = [
        str(dict_txt[ch] if ch in dict_txt else dict_txt["<unk>"])
        for ch in title
    ]
    return ",".join(codes) + "\t" + label + "\n"


# 将原始样本中的所有的行进行编码
# 将编码后的新样本数据存入测试集、训练集中
def create_data_list():
    """Encode every raw sample and split the result into test and training files.

    Reads the dictionary from ``dict_file_path`` and the raw samples from
    ``data_file_path``. Each sample line is split on ``"_!_"``; field 1 is the
    category label and field 3 is the headline. The headline is encoded with
    ``line_encoding`` and written to ``test_file_path`` for every tenth valid
    sample (indices 0, 10, 20, ...) and to ``train_file_path`` otherwise.

    Both output files are overwritten. Lines with fewer than four fields
    (for example a blank trailing line) are skipped and reported instead of
    aborting the run.
    """
    with open(dict_file_path, "r", encoding="utf-8") as f:
        # The dictionary file holds a single line: the str() of a dict.
        # NOTE(review): eval() executes whatever that line contains; this is
        # only acceptable because the file is produced by this script.
        # Consider ast.literal_eval if the file can come from elsewhere.
        dict_txt = eval(f.readlines()[0])

    with open(data_file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    test_count = 0
    train_count = 0
    skipped = 0
    # Mode "w" clears earlier output; each file is opened once for the whole
    # pass instead of once per sample.
    with open(test_file_path, "w", encoding="utf-8") as test_f, \
            open(train_file_path, "w", encoding="utf-8") as train_f:
        for line in lines:
            words = line.replace("\n", "").split("_!_")
            if len(words) < 4:
                skipped += 1
                continue
            label = words[1]  # category
            title = words[3]  # headline
            new_line = line_encoding(title, dict_txt, label)
            # Every tenth valid sample goes to the test set.
            if (test_count + train_count) % 10 == 0:
                test_f.write(new_line)
                test_count += 1
            else:
                train_f.write(new_line)
                train_count += 1

    total = test_count + train_count
    print("****" * 5, "生成训练集与测试集完成", "****" * 5)
    if skipped:
        print("跳过字段不足的行：%d" % skipped)
    print("样本总数为%s,其中测试集%s，训练集%s" % (str(total), str(test_count), str(train_count)))


create_dict()  # build the character dictionary file
# Encode every line of the raw samples and store the encoded
# samples in the test and training list files.
create_data_list()


############################## 第二部分：模型的搭建、训练、保存 ##############################

# 读取字典文件，获取字典长度
def get_dict_len(dict_path):
    """Return the vocabulary size required by the dictionary stored at ``dict_path``.

    The file holds the ``str()`` of a ``dict`` mapping characters to integer
    ids on its first line. The result is used as the first dimension of the
    embedding table, so it must be larger than every id in the dictionary.
    The entry count alone is too small when ids do not start at 0, so the
    function returns the larger of the entry count and ``max id + 1``.

    :param dict_path: path of the dictionary file
    :return: vocabulary size as an int (0 for an empty dictionary)
    """
    with open(dict_path, "r", encoding="utf-8") as f:
        # NOTE(review): eval() executes whatever the line contains; only use
        # this on dictionary files generated by this project.
        dict_txt = eval(f.readlines()[0])
    largest_id = max((int(code) for code in dict_txt.values()), default=-1)
    return max(len(dict_txt), largest_id + 1)


# 定义data_mepper,将reader读取到的数据进行二次处理
# 将传入的字符串转换为整形
def data_mapper(sample):
    """Convert one ``(ids_text, label_text)`` pair into ``(list of int, int)``.

    ``ids_text`` is a comma-separated string of ids; ``label_text`` is the
    label as text.
    """
    ids_text, label_text = sample
    return list(map(int, ids_text.split(","))), int(label_text)


# 定义训练集的reader
def train_reader(train_file_path):
    """Create the reader for the training set.

    The inner generator loads all lines of ``train_file_path``, shuffles them
    with ``np.random.shuffle`` and yields each line as an
    ``(ids_text, label_text)`` tuple split on the tab character. The generator
    is wrapped with ``paddle.reader.xmap_readers`` so that ``data_mapper``
    converts every tuple.

    :param train_file_path: path of the encoded training file
    :return: the reader returned by ``paddle.reader.xmap_readers``
    """
    def shuffled_samples():
        with open(train_file_path, "r") as src:
            rows = src.readlines()
            np.random.shuffle(rows)  # new sample order on every pass

            for row in rows:
                ids_text, label_text = row.split("\t")
                yield ids_text, label_text

    worker_count = cpu_count()
    buffer_size = 1024
    return paddle.reader.xmap_readers(data_mapper,
                                      shuffled_samples,
                                      worker_count,
                                      buffer_size)


# 定义测试集的reader
def test_reader(train_file_path):
    """Create the reader for the test set.

    The inner generator yields each line of the given file, in file order, as
    an ``(ids_text, label_text)`` tuple split on the tab character. The
    generator is wrapped with ``paddle.reader.xmap_readers`` so that
    ``data_mapper`` converts every tuple.

    :param train_file_path: path of the encoded file to read (the name is
        kept for compatibility; this script passes the test file here)
    :return: the reader returned by ``paddle.reader.xmap_readers``
    """
    def ordered_samples():
        with open(train_file_path, "r") as src:
            rows = src.readlines()

            for row in rows:
                ids_text, label_text = row.split("\t")
                yield ids_text, label_text

    worker_count = cpu_count()
    buffer_size = 1024
    return paddle.reader.xmap_readers(data_mapper,
                                      ordered_samples,
                                      worker_count,
                                      buffer_size)


# 模型组建
def CNN_net(data, dict_dim, class_dim=10, emb_dim=128, hid_dim=128, hid_dim2=98):
    """
    Assemble the text classification network.

    :param data: input variable holding the encoded text
    :param dict_dim: dictionary size (first dimension of the embedding table)
    :param class_dim: number of classes
    :param emb_dim: length of each embedding vector
    :param hid_dim: number of filters of the first convolution branch
    :param hid_dim2: number of filters of the second convolution branch
    :return: output of the softmax fully connected layer
    """
    # Embedding layer: maps every id to a vector of length emb_dim.
    embedded = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])

    # Two parallel sequence convolution + pooling branches over the same
    # embedding, created in this order:
    #   (hid_dim filters, filter size 3), then (hid_dim2 filters, filter size 4).
    branches = [
        fluid.nets.sequence_conv_pool(input=embedded,
                                      num_filters=filter_count,
                                      filter_size=window,
                                      act="tanh",
                                      pool_type="sqrt")
        for filter_count, window in ((hid_dim, 3), (hid_dim2, 4))
    ]

    # Both branch outputs feed one fully connected softmax layer.
    return fluid.layers.fc(input=branches, size=class_dim, act="softmax")


# Input variables of the network
# Encoded text
words = fluid.layers.data(name="words",
                          shape=[1],
                          dtype="int64",
                          lod_level=1  # LoD level 1 (variable-length text)
                          )
print(type(words))
label = fluid.layers.data(name="label",
                          shape=[1],
                          dtype="int64")

# Dictionary size
# The embedding layer needs it as the first dimension of its table
dict_dim = get_dict_len(dict_file_path)  # read from the dictionary file

# Pass the input through the network to get the prediction
model = CNN_net(words, dict_dim)

# Loss function
cost = fluid.layers.cross_entropy(input=model,
                                  label=label)
# Mean of the loss
avg_cost = fluid.layers.mean(cost)

# Accuracy (prediction compared with the true label)
acc = fluid.layers.accuracy(input=model,
                            label=label)
# Clone the program for evaluation.
# The clone is taken with for_test=True and before optimizer.minimize() is
# called, so keep this statement above the optimizer lines
# (presumably so the evaluation program carries no update ops - confirm
# against the Paddle Program.clone documentation).
test_program = fluid.default_main_program().clone(for_test=True)
# Optimizer
optimizer = fluid.optimizer.AdagradOptimizer(learning_rate=0.002)  # Adagrad optimizer
optimizer.minimize(avg_cost)

# Executor
# Use GPU 0 when this Paddle build has CUDA support; otherwise fall back to
# the CPU so the script also runs on CPU-only installations.
if fluid.core.is_compiled_with_cuda():
    place = fluid.CUDAPlace(0)
else:
    place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())  # initialise the parameters

# Data
## Batched reader over the training set
tr_reader = train_reader(train_file_path)
batch_train_reader = paddle.batch(reader=tr_reader,
                                  batch_size=128)

## Batched reader over the test set
ts_reader = test_reader(test_file_path)
batch_test_reader = paddle.batch(reader=ts_reader,
                                 batch_size=128)

# Feeder that converts a batch into the feed for `words` and `label`
feeder = fluid.DataFeeder(place=place,
                          feed_list=[words, label])


cost_train = []      # training loss of every batch
acc_train = []       # training accuracy of every batch
time = 0             # running batch counter across all passes
batches = []         # counter value recorded for every batch

cost_avg_test = []   # mean test loss of every pass
acc_avg_test = []    # mean test accuracy of every pass
epoch = 100
# Training
for pass_id in range(epoch):
    for batch_id, data in enumerate(batch_train_reader()):
        train_cost, train_acc = exe.run(program=fluid.default_main_program(),
                                        feed=feeder.feed(data),
                                        fetch_list=[avg_cost, acc])
        # Record the curves that are plotted after training.
        cost_train.append(train_cost[0])
        acc_train.append(train_acc[0])
        batches.append(time)
        time += 1

        # Progress report every 100 batches
        if batch_id % 100 == 0:
            print("pass_id:%d，batch_id：%d，cost：%f，acc：%f" % (pass_id, batch_id,
                                                           train_cost[0], train_acc[0]))

    # Evaluate on the whole test set at the end of every pass.
    test_cost_list = []
    test_acc_list = []

    for batch_id, data in enumerate(batch_test_reader()):
        test_cost, test_acc = exe.run(program=test_program,
                                      feed=feeder.feed(data),
                                      fetch_list=[avg_cost, acc])
        test_cost_list.append(test_cost[0])
        test_acc_list.append(test_acc[0])

    # Average the per-batch results of this pass.
    avg_test_cost = sum(test_cost_list) / len(test_cost_list)
    avg_test_acc = sum(test_acc_list) / len(test_acc_list)
    print("test_cost：%f，test_acc：%f" % (avg_test_cost, avg_test_acc))

    cost_avg_test.append(avg_test_cost)
    acc_avg_test.append(avg_test_acc)

# Save the model
model_save_dir = "./model/news_classify/"  # target directory
# makedirs also creates the missing parent "./model" (os.mkdir would fail
# there) and does nothing when the directory already exists.
os.makedirs(model_save_dir, exist_ok=True)
fluid.io.save_inference_model(model_save_dir,
                              feeded_var_names=["words"],  # inputs the saved model is fed with
                              target_vars=[model],  # outputs the saved model produces
                              executor=exe  # executor used for saving
                              )
print("模型保存完毕")

def _plot_curves(window, x_label, y_label, xs, cost_values, acc_values,
                 cost_legend, acc_legend, out_file):
    """Draw a loss curve (red) and an accuracy curve (green), then save and show the figure.

    :param window: figure name, also used as the title
    :param x_label: label of the x axis
    :param y_label: label of the y axis
    :param xs: x values shared by both curves
    :param cost_values: loss values
    :param acc_values: accuracy values
    :param cost_legend: legend text of the loss curve
    :param acc_legend: legend text of the accuracy curve
    :param out_file: file the figure is saved to
    """
    plt.figure(window, facecolor="lightgray")
    plt.title(window, fontsize=24)
    plt.xlabel(x_label, fontsize=20)
    plt.ylabel(y_label, fontsize=20)
    plt.plot(xs, cost_values, color="red", label=cost_legend)
    plt.plot(xs, acc_values, color="green", label=acc_legend)
    plt.legend()
    plt.grid()
    plt.savefig(out_file)
    plt.show()


# Training curves: one point per training batch
_plot_curves("training", "iter", "cost_train/acc_train",
             batches, cost_train, acc_train,
             "Training Cost", "Training Acc", "train.png")

# Test curves: one point per pass (x label spelled "epoch")
_plot_curves("testing", "epoch", "cost_test/acc_test",
             range(epoch), cost_avg_test, acc_avg_test,
             "Testing Cost", "Testing Acc", "test.png")

6、模型测试代码：02_txt_test.py

import os
from multiprocessing import cpu_count
import numpy as np
import paddle
import paddle.fluid as fluid
import matplotlib.pyplot as plt
# Shared dataset locations
data_root = "./data/news_classify/"  # directory that holds every dataset file
data_file = "news_classify_data.txt"  # raw samples
test_file = "test_list.txt"  # encoded test split
train_file = "train_list.txt"  # encoded training split
dict_file = "dict_txt.txt"  # character dictionary

# Full paths; data_root already ends with "/", so joining adds no separator.
data_file_path = os.path.join(data_root, data_file)  # raw samples
test_file_path = os.path.join(data_root, test_file)  # test split
train_file_path = os.path.join(data_root, train_file)  # training split
dict_file_path = os.path.join(data_root, dict_file)  # dictionary


############################## Part 3: loading the model and predicting ##############################
model_save_dir = "./model/news_classify/"  # directory of the saved model


def get_data(sentenct):
    """Encode the text to classify as an int64 array of character ids.

    The dictionary is read from ``dict_file_path`` on every call. Characters
    that are not keys of the dictionary are encoded with the "<unk>" entry.

    :param sentenct: text to encode, processed character by character
    :return: 1-D numpy array of dtype int64
    """
    with open(dict_file_path, "r", encoding="utf-8") as f:
        dict_txt = eval(f.readlines()[0])

    codes = [
        int(dict_txt[ch if ch in dict_txt else "<unk>"])
        for ch in sentenct
    ]
    # The ids must be int64; the original author notes that other integer
    # widths make the later steps fail.
    return np.array(codes, dtype="int64")


# Create the executor (inference runs on the CPU)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())    # initialise all variables

# Load the saved model: the inference program, the names of the variables
# to feed, and the variables to fetch
infer_program, feed_var_names, target_var = fluid.io.load_inference_model(model_save_dir, exe)

# Headlines to classify; each one is encoded with get_data, in this order.
sentences = [
    "在获得诺贝尔文学奖7年之后，莫言15日晚间在山西汾阳贾家庄如是说",
    "综合'今日美国'、《世界日报》等当地媒体报道，芝加哥河滨警察局表示",
    "中国队无缘2020年世界杯",
    "中国人民银行今日发布通知，降低准备金率，预计释放4000亿流动性",
    "10月20日,第六届世界互联网大会正式开幕",
    "同一户型，为什么高层比低层要贵那么多？",
    "揭秘A股周涨5%资金动向：追捧2类股，抛售600亿香饽饽",
    "宋慧乔陷入感染危机，前夫宋仲基不戴口罩露面，身处国外神态轻松",
    "此盆栽花很好养，花美似牡丹，三季开花，南北都能养，很值得栽培",  # belongs to none of the classes
]
texts = [get_data(sentence) for sentence in sentences]

# Length of every encoded sentence, wrapped in one outer list
sentence_lengths = list(map(len, texts))
base_shape = [sentence_lengths]
# Pack the variable-length sentences into one LoD tensor
tensor_words = fluid.create_lod_tensor(texts, base_shape, place)
# Run the prediction
result = exe.run(program=infer_program,  # loaded inference program
                 feed={feed_var_names[0]: tensor_words},  # data to classify
                 fetch_list=target_var)

# Class names indexed by predicted label id.
# NOTE(review): this order must match the label ids used in the training
# data; it differs from the category order listed in the training script's
# header comment - confirm against the dataset.
names = ["文化", "娱乐", "体育", "财经", "房产", "汽车", "教育", "科技", "国际", "证券"]

# Sort every row of class probabilities once; the last column of the sorted
# indices is the class with the highest probability for that sentence.
probabilities = result[0]
best_labels = np.argsort(probabilities)[:, -1]
for i, lab in enumerate(best_labels):
    print("预测结果：%d, 名称:%s, 概率:%f" % (lab, names[lab], probabilities[i][lab]))

在这里插入图片描述