基于天涯论坛的“新冠疫情”舆情分析
完整数据和代码链接:https://download.csdn.net/download/weixin_43906500/14935218
1.天涯论坛数据架构
天涯论坛主页主要分为推荐板块、推荐贴文、搜索板块、其他链接四个部分。对于新冠疫情的相关帖子主要利用搜索板块进行数据获取。
搜索板块实现关键词检索,可检索出75页发帖,每页发帖文章为10篇
根据贴文数据,可以提取发帖人、发帖时间、点击量、回复量以及贴文内容信息
根据帖子下方评论可以提取评论信息,以进行后续的文本分析
用户主页含有用户基本信息,如用户昵称、等级、关注量、粉丝量、天涯分、注册日期等信息
2.舆情分析技术实现
2.1数据获取
使用 Python 爬虫获取数据
from pyquery import PyQuery as pq
import requests
from urllib.parse import quote
from time import sleep
import json
page = 75  # number of search-result pages to crawl (the site serves at most 75)
key_word = '时政'  # search keyword; NOTE(review): article discusses COVID-19 — confirm this keyword is intended
def prase_all_page(urls):
    """
    Parse every search-result page and collect thread URLs.

    Note: the original docstring claimed threads without comments are
    filtered out, but no such filter was ever implemented; the dead
    `reverse` lookup has been removed. The misspelled name 'prase' is
    kept so existing callers keep working.

    :param urls: iterable of search-result page URLs
    :return: list of thread URLs (content_urls)
    """
    content_urls = []
    for url in urls:
        sleep(1)  # throttle so we do not hammer the server
        print('正在抓取:', url)
        doc = pq(requests.get(url=url, timeout=30).text)
        # drop the trailing utility <li> node that carries no search hit
        doc('.searchListOne li:last-child').remove()
        for li in doc('.searchListOne li').items():
            content_url = li('a:first-child').attr('href')
            content_urls.append(content_url)
    return content_urls
def prase_all_content(urls):
    """
    Fetch each thread URL and extract its metadata, body and comments.

    Only threads whose reply timestamp year is "2020" are kept. The
    collected records are dumped to '<key_word>.json'. The misspelled
    name 'prase' is kept so existing callers keep working.

    :param urls: iterable of thread URLs
    :return: None (writes the JSON file as a side effect)
    """
    dic = []
    for i, url in enumerate(urls):
        print(i)
        try:
            dic1 = {}
            print('正在解析:', url)
            doc = pq(requests.get(url=url, timeout=30).text)
            title = doc('.atl-head .atl-title').text()
            main_id = doc('.atl-head .atl-menu').attr('_host')
            replytime = doc('.atl-head .atl-menu').attr('js_replytime')
            print(replytime[0:4])
            # BUG FIX: the original compared the 4-char year slice against
            # the 5-char literal "2020 " (trailing space), which can never
            # match, so every single thread was skipped.
            if replytime[0:4] != "2020":
                continue
            print(replytime)
            replycount = doc('.atl-head .atl-menu').attr('js_replycount')
            clickcount = doc('.atl-head .atl-menu').attr('js_clickcount')
            article = next(doc('.bbs-content').items()).text()
            dic1["title"] = str(title)
            dic1["main_id"] = main_id
            dic1["time"] = replytime
            dic1["replycount"] = replycount
            dic1["clickcount"] = clickcount
            dic1["article"] = article
            comments_replys = []
            # divs after the banner ad (index > 1) are the comment list
            comments = doc('.atl-main div:gt(1)').items()
            for comment in comments:
                dic3 = {}
                dic4 = {}
                dic5 = {}
                host_id = comment.attr('_hostid')
                comment_text = comment('.bbs-content').text()
                replys = comment('.item-reply-view li').items()  # replies to this comment
                if replys is not None:
                    for reply in replys:
                        rid = reply.attr('_rid')
                        rtext = reply('.ir-content').text()
                        # keep only replies written by third parties
                        # (neither the thread author nor the comment author)
                        if rid and rid != main_id and rid != host_id:
                            dic5[host_id] = rtext
                if host_id:
                    # strip the quoted text Tianya prepends, delimited by a
                    # dash ruler; +29 skips the ruler plus its newline
                    k = comment_text.rfind("----------------------------")
                    if k != -1:
                        comment_text = comment_text[k + 29:]
                    dic4[host_id] = comment_text
                dic3['comment'] = dic4
                dic3['reply'] = dic5
                comments_replys.append(dic3)
            dic1["comments_replys"] = comments_replys
            dic.append(dic1)
        except Exception as e:
            # best-effort crawl: log and move on instead of the original
            # bare except, which also swallowed KeyboardInterrupt
            print('解析失败:', url, e)
            continue
    print(json.dumps(dic, ensure_ascii=False, indent=4))
    # context manager closes the output file (the original leaked the handle)
    with open(key_word + ".json", 'w', encoding='utf-8') as f:
        json.dump(dic, f, ensure_ascii=False, indent=4)
def run(key, page):
    """
    Crawl Tianya search results for *key* across *page* result pages.

    :param key: search keyword
    :param page: number of search-result pages to fetch
    :return: None
    """
    template = 'http://search.tianya.cn/bbs?q={}&pn={}'
    start_urls = [template.format(quote(key), p) for p in range(1, page + 1)]
    content_urls = prase_all_page(start_urls)
    prase_all_content(content_urls)


if __name__ == '__main__':
    run(key_word, page)
结果如下:
2.2趋势分析
通过对发帖时间的统计,获取每个月发帖数量;通过对每月发帖文章进行统计,获取每月发帖关键词;而后进行趋势分析
import json
from collections import Counter
from pyecharts.charts import Bar
import jieba
from pyecharts import options as opts
#去除停用词
def get_stopwords():
    """
    Load the stop-word list from stopword.txt.

    :return: list of stripped stop words, with '\\n' and ' ' appended as
             extra entries to also filter whitespace tokens
    """
    # context manager guarantees the file handle is closed
    # (the original opened the file and never closed it)
    with open("stopword.txt", 'r', encoding="utf-8") as f:
        stopwords = [line.strip() for line in f]
    stopwords.extend(['\n', ' '])
    return stopwords
def get_article_count_plus():
    """
    Count posts per month and pick one representative keyword per month.

    Reads data.json (produced by the crawler), tallies post counts by
    month (YYYY-MM), segments every article with jieba, and for each month
    chooses its most frequent non-stop-word not already used by an
    earlier month.

    :return: (col, data) where col is ['MM(keyword)', ...] and data is the
             matching list of monthly post counts, sorted by month
    """
    with open("data.json", 'r', encoding='utf-8') as load_f:
        load_dict = json.load(load_f)
    stopwords = get_stopwords()
    months = []          # one YYYY-MM entry per post, for frequency counting
    words_by_month = {}  # YYYY-MM -> all segmented words of that month
    for post in load_dict:
        month = post['time'][0:7]
        months.append(month)
        seg_list = jieba.lcut(post['article'])
        # BUG FIX: the original set dic_word[month] = [] on first sight of a
        # month, silently discarding that article's words; setdefault keeps
        # every article's words including the month's first one
        words_by_month.setdefault(month, []).extend(seg_list)
    counts = sorted(Counter(months).items(), key=lambda kv: kv[0])
    key_word_used = []
    key_words = []
    for month, _count in counts:
        # renamed from the original's `list`, which shadowed the builtin
        filtered = [w for w in words_by_month[month] if w not in stopwords]
        for word, _freq in Counter(filtered).most_common(12):
            if word not in key_word_used:
                key_words.append(word)
                key_word_used.append(word)
                break
        else:
            # all top-12 words were already used by earlier months: fall
            # back to an empty label instead of an IndexError below
            key_words.append('')
    columns = [m for m, _ in counts]
    data = [c for _, c in counts]
    col = []
    for i, column in enumerate(columns):
        dash = column.find('-')
        col.append(column[dash + 1:] + '(' + key_words[i] + ')')
    print(col)
    print(data)
    return col, data
if __name__ == "__main__":
col,data = get_article_count_plus()
c = (
Bar()
.add_xaxis(col)
.add_yaxis("发帖量", data)
.set_global_opts(title_opts=opts.TitleOpts(title="发帖量及关键词统计", subtitle="柱状图"))
)
c.render("article_conut_plus.html")
可视化结果如下
2.3词云绘制
通过对所有文章使用 jieba 分词、去除停用词并进行词频统计,而后利用 Python 的 pyecharts 库进行词云绘制
import jieba
import json
from wordcloud import WordCloud
from collections import Counter
from pyecharts import options as opts
from pyecharts.charts import Page, WordCloud
from pyecharts.globals import SymbolType
#去除停用词
stopwords = [line.strip() for line i