1. Problem Description
- To study how promptly and how widely self-media and official media spread the news story 《武汉快递小哥汪勇的先进事迹》, this article crawls Weibo posts on the topic, compares the two channels, and derives the basic pattern of how the news propagates over time.
2. Data Acquisition
- The data is crawled, extracted, and organized mainly with Python's requests library and lxml's etree module. Part of the crawling code is attached below:
# Crawl the search-result pages
import time

import requests
from lxml import etree

def start_crawl(base_url, month, number):
    for i in range(2, month):            # iterate over months
        for j in range(1, number):       # iterate over result pages within a month
            headers = request_header()   # helper returning request headers (see sketch below)
            url = base_url.format(i, i + 1, j)
            time.sleep(0.5)              # throttle requests to avoid being blocked
            response = requests.get(url=url, headers=headers)
            res = etree.HTML(response.text)
            yield res
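- The request_header() helper and the base_url search template are used above but never shown in the original post. The following is only a rough sketch of what they might look like; the User-Agent pool, the cookie placeholder, and the exact Weibo search-URL pattern are assumptions rather than the author's actual values.

import random

# Hypothetical header factory: rotate a small pool of User-Agent strings and attach a
# logged-in cookie (Weibo search pages normally require a valid cookie).
def request_header():
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0 Safari/537.36",
    ]
    return {"User-Agent": random.choice(user_agents), "Cookie": "<your weibo cookie>"}

# Assumed shape of the search URL: keyword search limited to one month ({0}..{1}), result page {2}.
base_url = ("https://s.weibo.com/weibo?q=汪勇"
            "&timescope=custom:2020-{0:02d}-01:2020-{1:02d}-01&page={2}")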
# Fetch and clean the data
def data_clean(base_url, month, number):
    res = start_crawl(base_url, month, number)
    data = []
    for html in res:
        div_list = html.xpath('//div[@class="content"]')
        for div in div_list:
            info = []
            name = div.xpath('./div[@class="info"]//a[@class="name"]/text()')
            content = div.xpath('./p[@class="txt"]//text()')
            pub_time = div.xpath('./p[@class="from"]/a[1]/text()')   # renamed from "time" to avoid shadowing the time module
            name = name[0] if len(name) > 0 else None
            pub_time = pub_time[0].strip() if len(pub_time) > 0 else None
            content = ''.join([i.strip() for i in content])
            info.append(name)
            info.append(pub_time)
            info.append(content)
            # keep only rows where both the account name and the post time were found
            if info[0] is not None and info[1] is not None:
                data.append(info)
    return data
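- A minimal driver sketch (not in the original post) showing how the cleaned rows could be turned into the DataFrame used in the analysis below; the column names 媒体名 / 发布时间 / 文章 and the file name wy.xlsx match the ones referenced later, while the month and page limits are just example values.

import pandas as pd

rows = data_clean(base_url, month=7, number=50)                    # e.g. months 2..6, up to 49 pages each
data = pd.DataFrame(rows, columns=["媒体名", "发布时间", "文章"])   # account name, post time, post text
data.to_excel("wy.xlsx")                                           # the file that section 4 reads back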
- A screenshot from inspecting the page elements in the browser is shown here:
- Part of the resulting data table is shown below:
3. Organizing and Visualizing the Data
- Plot the overall popularity trend for both media types combined
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']     # display Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False

# "data" is the DataFrame built from the crawl (columns: 媒体名, 发布时间, 文章)
data = data.loc[data['发布时间'].str.contains('今天') == False, :]   # drop rows timestamped "今天", which have no month part
data.index = np.arange(len(data))
# keep only the month part of the timestamp, e.g. "02月"
data["发布时间"] = data["发布时间"].apply(lambda x: x[:3])
# group by month and count the posts in each month
time_count = data.groupby(by="发布时间")["文章"].count()
# line chart
plt.plot(time_count.index, time_count.values, "-g")
plt.title("随时间的热度走向图")
plt.xlabel("时间(月份)")
plt.ylabel("发布量(个)")
plt.show()
# bar chart of the same counts
plt.bar(x=time_count.index, height=time_count.values, align='center', color="g")
plt.title("随时间的热度走向图")
plt.xlabel("时间(月份)")
plt.ylabel("发布量(个)")
plt.show()
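- The same per-month counts could also be obtained in one step with value_counts, which is essentially equivalent to the groupby/count above (same column name assumed):

# equivalent one-liner: posts per month, sorted by the month label
time_count_alt = data["发布时间"].value_counts().sort_index()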
- Plot the share of official media vs. self-media
# Manually curated list of official-media account names
OMN = ["青岛文明网","880山西交通广播","楚天交通广播","央视影音","抚州南丰发布",\
"珠江商报","云南日报","国家邮政局","中国青年报","武汉晚报","长江日报",\
"新都资讯","成都少先队","成都发布","潍坊政法","杭锦发布","四川党的建设杂志",\
"法治日报","中国舆论场","苏州发布","新浪辽宁","山西经济广播","山西共青团",\
"北京朝阳","江西共青团","包头新闻网","鄂尔多斯发布","羊城晚报","保定发布",\
"陇南礼县发布","中国残联","人民日报全国党媒平台","四川文明网","内蒙古团委",\
"武汉广播电视台","湖北卫视","潍坊市人民检察院","湖北省妇联","湖北日报",\
"人民日报","江西卫视根据地","陕西新闻广播","南京晨报","今晚报","广东共青团",\
"南昌日报","河北综合广播","陕西都市快报","楚天交通广播","陕西新广","青海网",\
"山东卫视","渭南日报社","国家应急广播","央视新闻","罗湖共青团","内蒙古团委",\
"三秦青年","共青团青岛市委","中国共青团杂志","重庆共青团","延安青年","共青团中央",\
"西藏共青团","大同共青团","中国青年报","黑龙江晨报","国家邮政局","四川共青团",\
"合肥日报","陕西日报","南京晨报","中国新闻网","江苏共青团","共青团南川区","惠州共青团",\
"人民法院报","天津日报","青海共青团","广西卫视","贺州共青团","共青团包头市委员会",\
"西藏共青团","武汉发布","中国网","山西政法","云南网","吉林人民广播电台"
]
# collect the account names that belong to official media
OM = []
for name in data["媒体名"]:
    if name in OMN:
        OM.append(name)
OM = pd.Series(OM)
# collect the account names that belong to self-media
PM = []
for name in data["媒体名"]:
    if name not in OMN:
        PM.append(name)
PM = pd.Series(PM)
plt.figure(figsize=(10,8))
x = (OM.size,PM.size)
labels = ["官媒","自媒体"]
plt.pie(x=x,labels=labels,colors=['red','c'],autopct='%.1f%%',pctdistance=0.5,labeldistance=1.2,radius=1.2,explode=[0,0.1],\
wedgeprops={'linewidth':1.5,'edgecolor':'green'},textprops={'fontsize':10,'color':'black'})
plt.title("官媒和自媒体占比图")
plt.show()
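- The loops above just partition the account names; an equivalent and more compact way to get the two slice sizes for the pie chart (same column name and the same OMN list) would be:

# boolean mask: True where the account name is in the official-media list
is_official = data["媒体名"].isin(OMN)
x = (is_official.sum(), (~is_official).sum())   # (official-media count, self-media count)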
- Plot the popularity trend of official media and self-media separately
# split the monthly data into official-media rows and self-media rows
OM_data = data.loc[data['媒体名'].isin(OM)]
PM_data = data.loc[~data['媒体名'].isin(OM)]
# group each subset by month and count the posts
om_plot = OM_data.groupby(by="发布时间")["文章"].count()
pm_plot = PM_data.groupby(by="发布时间")["文章"].count()
# plot both curves on one axis
plt.figure(figsize=(10, 8))
ax = plt.subplot(111)
ax.plot(om_plot.index, om_plot.values, linewidth=3, label="官媒")
ax.plot(pm_plot.index, pm_plot.values, linewidth=3, label="自媒体")
plt.title("官媒和自媒体随时间的热度走向图")
plt.xlabel("时间(月份)")
plt.ylabel("发布量(个)")
plt.legend(loc="best")
plt.show()
- Plot how the news spreads in its early period, February to March
# data_: a fresh copy of the crawled data whose 发布时间 still contains the full date
# (e.g. re-read from wy.xlsx as in section 4), so we can work at daily granularity
# keep only the "月日" part of the timestamp, e.g. "02月23日"
data_["发布时间"] = data_["发布时间"].apply(lambda x: x[:6])
# keep only February and March
data_1 = data_.loc[data_["发布时间"].str.contains("02月")]
data_2 = data_.loc[data_["发布时间"].str.contains("03月")]
data_ = pd.concat([data_1, data_2], axis=0)
data_.index = np.arange(len(data_))
# split into official-media and self-media rows
OM_data_ = data_.loc[data_['媒体名'].isin(OM)]
PM_data_ = data_.loc[~data_['媒体名'].isin(OM)]
# group each subset by day and count the posts
om_plot_ = OM_data_.groupby(by="发布时间")["文章"].count()
pm_plot_ = PM_data_.groupby(by="发布时间")["文章"].count()
# plot the two subsets side by side
plt.figure(figsize=(15,5))
ax = plt.subplot(121)
ax.plot(om_plot_.index,om_plot_.values,linewidth=3,label="官媒")
plt.title("2月~3月官媒随时间的热度走向图")
plt.xlabel("时间(天数)")
plt.ylabel("发布量(个)")
plt.legend(loc="best")
ax_2 = plt.subplot(122)
ax_2.plot(pm_plot_.index,pm_plot_.values,linewidth=3,label="自媒体")
plt.title("2月~3月自媒体随时间的热度走向图")
plt.xlabel("时间(天数)")
plt.ylabel("发布量(个)")
plt.xticks([])
plt.legend(loc="best")
plt.show()
4. Word Cloud
- Partial code:
import pandas as pd

# reload the crawled data and drop the rows timestamped "今天"
data_ = pd.read_excel('C:/Users/雷神/Desktop/wy.xlsx', sheet_name=0, index_col=0)
data_ = data_.loc[data_['发布时间'].str.contains('今天') == False, :]
# concatenate all post texts into one comma-separated string
article = data_["文章"]
text = []
for a in article:
    text.append(a)
text = ",".join(str(i) for i in text)
import jieba

# segment the text into words with jieba (full mode), separated by "/"
def chinese_jieba(text):
    wordlist_jieba = jieba.cut(text, cut_all=True)
    space_wordlist = '/'.join(wordlist_jieba)
    return space_wordlist
# build and display the word cloud
import numpy as np
import PIL.Image
import matplotlib.pyplot as plt
from wordcloud import WordCloud

with open("F:/wangyong/article.txt", encoding="utf-8") as f:
    text = f.read()
text = chinese_jieba(text)
# use an image as the mask that shapes the cloud
image = PIL.Image.open('F:/wangyong/bg2.jpg')
MASK = np.array(image)
wordcloud = WordCloud(font_path="C:\\Windows\\Fonts\\simkai.ttf",
                      background_color="white", width=800,
                      repeat=False, mask=MASK,
                      height=500, max_words=180, min_font_size=8).generate(text)
# img = wordcloud.to_image()
# img.show()
wordcloud.to_file('F:/wangyong/wordcloud.png')
plt.figure(figsize=(10, 8), dpi=100)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
- The generated word-cloud image:
Conclusion
- From the plots we can see:
- Self-media clearly spreads the news through more channels and more widely than official media, so we can tentatively say that the self-media era has arrived.
- In its early stage the news spreads extremely fast; in the middle stage the volume of posts saturates and then drops sharply; later on, a smaller secondary peak can appear if the story takes a new turn.