近期,在新冠肺炎疫情防控的关键期,网上各种有关疫情防控的谣言接连不断,这些谣言操纵了舆论感情,误导了公众判断,更影响了社会稳定。本项目基于循环神经网络(RNN)的谣言检测模型实现了微博谣言检测。
本实践使用 Paddle Fluid API 编程并搭建一个循环神经网络(Recurrent Neural Network,RNN),进行谣言检测。主要分为五个步骤:
1.数据准备
2.模型配置
3.模型训练
(1)定义网络
(2)定义损失函数
(3)定义优化方法
4.模型评估
5.模型预测
数据集介绍:
本次实践所使用的数据是从新浪微博不实信息举报平台抓取的中文谣言数据,数据集中共包含1538条谣言和1849条非谣言。如下图所示,每条数据均为json格式,其中text字段代表微博原文的文字内容。
数据处理及模型:
# Step1、数据准备
# (1)解压数据,读取并解析数据,生成all_data.txt
# (2)生成数据字典,即dict.txt
# (3)生成数据列表,并进行训练集与验证集的划分,train_list.txt 、eval_list.txt
# (4)定义训练数据集提供器train_reader和验证数据集提供器eval_reader
# Unzip the raw dataset: extract Chinese_Rumor_Dataset-master.zip into the data directory.
# Raw strings avoid invalid/ambiguous backslash escapes in Windows paths
# (e.g. a bare "\t" would otherwise become a tab character).
src_path = r"D:\PycharmProjects2020\tensor1\yaoyanjianche\data\Chinese_Rumor_Dataset-master.zip"
target_path = r"D:\PycharmProjects2020\tensor1\yaoyanjianche\data\Chinese_Rumor_Dataset-master"
if not os.path.isdir(target_path):  # only extract when the target directory does not exist yet
    # Context manager guarantees the archive handle is closed even if extraction fails.
    with zipfile.ZipFile(src_path, 'r') as archive:
        archive.extractall(path=target_path)
# Locate the rumor / non-rumor repost listings and the directory that holds
# the original microblog JSON files, all under the CED_Dataset root.
_ced_root = target_path + "/Chinese_Rumor_Dataset-master/CED_Dataset"
rumor_class_dirs = os.listdir(_ced_root + "/rumor-repost/")
non_rumor_class_dirs = os.listdir(_ced_root + "/non-rumor-repost/")
original_microblog = _ced_root + "/original-microblog/"
# Label convention: "0" marks a rumor sample, "1" marks a non-rumor sample.
rumor_label = "0"
non_rumor_label = "1"
# Running totals and accumulators for the parsed samples.
rumor_num = 0
non_rumor_num = 0
all_rumor_list = []
all_non_rumor_list = []
# Parse the rumor samples: each file under original-microblog is a JSON blob
# whose "text" field holds the original Weibo post.
for rumor_class_dir in rumor_class_dirs:
    # Skip macOS metadata files (.DS_Store, ._.DS_Store, and any other
    # dot-prefixed entry). These are not UTF-8 JSON; the original code only
    # excluded two hard-coded names and hit
    # "UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb0" on the rest.
    if rumor_class_dir.startswith('.'):
        continue
    with open(original_microblog + rumor_class_dir, 'r', encoding='UTF-8') as f:
        rumor_dict = json.loads(f.read())
    # Store each sample as "<label>\t<text>\n" so the label can be split off later.
    all_rumor_list.append(rumor_label + "\t" + rumor_dict["text"] + "\n")
    rumor_num += 1
# Parse the non-rumor samples: same JSON layout, tagged with label "1".
for non_rumor_class_dir in non_rumor_class_dirs:
    # Skip macOS metadata files (.DS_Store / ._* and any other dot-prefixed
    # entry) — they are not UTF-8 JSON and would break json.loads.
    if non_rumor_class_dir.startswith('.'):
        continue
    with open(original_microblog + non_rumor_class_dir, 'r', encoding='UTF-8') as f2:
        non_rumor_dict = json.loads(f2.read())
    all_non_rumor_list.append(non_rumor_label + "\t" + non_rumor_dict["text"] + "\n")
    non_rumor_num += 1
print("谣言数据总量为:" + str(rumor_num))
print("非谣言数据总量为:" + str(non_rumor_num))
# Shuffle the combined samples and write them to all_data.txt.
# Raw string avoids backslash-escape pitfalls in the Windows path.
data_list_path = r"D:\PycharmProjects2020\tensor1\yaoyanjianche\data"
all_data_path = data_list_path + "\\all_data.txt"
all_data_list = all_rumor_list + all_non_rumor_list
random.shuffle(all_data_list)
# Mode 'w' already truncates on open, so a single open + writelines replaces
# the original clear-then-append double open (and its redundant seek/truncate).
with open(all_data_path, 'w', encoding='UTF-8') as f:
    f.writelines(all_data_list)
# 导入必要的包
import os
from multiprocessing import cpu_count
import numpy as np
import shutil
import paddle
import paddle.fluid as fluid
from PIL import Image
import matplotlib.pyplot as plt
# 生成数据字典
def create_dict(data_path, dict_path):
dict_set = set()
# 读取全部数据
with open(data_path, 'r', encoding='utf-8') as f:
lines = f.readlines()#读取文件的所有的行,保存在列表中
# 把数据生成一个集合
for line in lines:
content = line.split('\t')[-1].replace('\n', '')#[]括号 里面,表示取值 0是从左到右第一个。-1,从右到左第一个。 既,取标签右边的文字把标签去掉,replace是把'\n'替换成''
for s in content:
dict_set