Problem Description
A news headline conveys the gist of the story, and a great deal can be mined from headlines alone. By analysing the news published on the university news site we can extract hot words from the headlines, design and implement an algorithm that builds a word cloud from those hot words, and finally count each hot word's frequency with the word-cloud model, giving a quick picture of what the university has been focusing on during the period.
Code Implementation
Implementation Approach
1. Import the required packages
import requests
import jieba
import wordcloud
import matplotlib.pyplot as plt
from os import path
from lxml import etree
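requests, lxml, jieba, wordcloud and matplotlib are all third-party packages; if any of them is missing it can be installed with pip, e.g. pip install requests lxml jieba wordcloud matplotlib.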
2. Disguise the request as a normal browser user
headers = {
    'Host': 'www.haue.edu.cn',
    'Referer': 'http://www.haue.edu.cn/info/1034/1502.htm',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36 Edg/91.0.864.48'
}
3. Send the request and get the response
def getjson(url):
    response = requests.get(url, headers=headers, timeout=3)
    if response.status_code == 200:
        return response.content.decode("utf-8")
    else:
        print("Fail to get page")
        return None
Test:
response = getjson('http://www.haue.edu.cn/xwdt/xxxw.htm')
response
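Note that getjson only checks the HTTP status code; a timeout or connection failure will raise an exception from requests rather than return a non-200 response. A minimal sketch of a more defensive variant (same signature, just wrapped in try/except):

def getjson(url):
    try:
        response = requests.get(url, headers=headers, timeout=3)
    except requests.RequestException as e:
        # Covers timeouts, DNS failures, refused connections, etc.
        print("Request failed:", e)
        return None
    if response.status_code == 200:
        return response.content.decode("utf-8")
    print("Fail to get page")
    return None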
4. Locate the news items with a selector
Open the browser developer tools with F12 and inspect the page structure; the information we need lives in the li tags.
It can be seen that the target elements sit under the XPath '/html/body/div[3]/div/div[2]/div[2]/ul/li'.
response = getjson('http://www.haue.edu.cn/xwdt/xxxw.htm')
tree = etree.HTML(response)
li_list = tree.xpath('/html/body/div[3]/div/div[2]/div[2]/ul/li')
li_list
info_list = []
for li in li_list:
    title = li.xpath('./a/text()')[0]  # text of the a tag (the news title)
    url = li.xpath('./a/@href')[0]     # href attribute of the a tag
    item = {}
    item['title'] = title
    item['url'] = 'http://www.haue.edu.cn/' + url[3:]  # strip the leading '../' from the relative link
    info_list.append(item)
info_list
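The href values on this page are relative links, and the slice url[3:] assumes each one starts with '../'. If you prefer not to rely on that, the standard library's urllib.parse.urljoin resolves relative links against the page URL without slicing; a small sketch (page_url and full_url are just illustrative names):

from urllib.parse import urljoin

page_url = 'http://www.haue.edu.cn/xwdt/xxxw.htm'
for li in li_list:
    href = li.xpath('./a/@href')[0]
    # urljoin handles '../', absolute paths and full URLs uniformly
    full_url = urljoin(page_url, href)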
5. Filter out stop words and segment the text
article_dict = {}
# Join all news titles into one string
content = ''.join(item['title'] for item in info_list)
# Segment the text with jieba
article_jieba = jieba.lcut(content)
# Load the stop-word list
stop_words = set(line.strip() for line in open('stopwords.txt', encoding='utf-8'))
# Keep only the useful words and count their occurrences
for data in article_jieba:
    if len(data) == 1 or data in stop_words:
        continue
    if data not in article_dict:
        article_dict[data] = 1
    else:
        article_dict[data] += 1
article_list = list(article_dict.items())
# Concatenate the distinct words into the string fed to the word cloud
cloud_data = ''
for word in article_list:
    cloud_data += word[0] + ' '
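stopwords.txt is not shown in the article; the loading code assumes a plain UTF-8 text file with one stop word per line. Any public Chinese stop-word list in that format will do, or you can start from a tiny hand-made file; a minimal sketch with a few common Chinese function words (extend as needed):

# Write a small starter stop-word file, one word per line
with open('stopwords.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(['我们', '进行', '通过', '以及', '对于']))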
Complete Code
import requests
import jieba
import wordcloud
import matplotlib.pyplot as plt
from os import path
from lxml import etree

# Pretend to be a normal browser user
headers = {
    'Host': 'www.haue.edu.cn',
    'Referer': 'http://www.haue.edu.cn/info/1034/1502.htm',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36 Edg/91.0.864.48'
}
def getjson(url):
    # Send the request and get the response
    response = requests.get(url, headers=headers, timeout=3)
    # Parse the response
    if response.status_code == 200:
        return response.content.decode("utf-8")
    else:
        print("Fail to get page")
        return None
def getPageInfo():
    info_list = []
    for i in range(0, 157):
        # The first page has a different URL from the numbered pages
        if i == 0:
            url = 'http://www.haue.edu.cn/xwdt/xxxw.htm'
        else:
            url = 'http://www.haue.edu.cn/xwdt/xxxw/' + str(i) + '.htm'
        response = getjson(url)
        if response is None:
            continue
        tree = etree.HTML(response)
        # Locate the news items with the XPath selector
        li_list = tree.xpath('/html/body/div[3]/div/div[2]/div[2]/ul/li')
        for li in li_list:
            # Extract the news title
            title = li.xpath('./a/text()')
            if title:
                info_list.append(title[0])
    return info_list
def createWordCloud(info_list):
    article_dict = {}
    # Join the news titles into one string
    content = ''.join(info_list)
    # Segment the text with jieba
    article_jieba = jieba.lcut(content)
    # Load the stop-word list
    stop_words = set(line.strip() for line in open('stopwords.txt', encoding='utf-8'))
    # Keep only the useful words and count their occurrences
    for data in article_jieba:
        if len(data) == 1 or data in stop_words:
            continue
        if data not in article_dict:
            article_dict[data] = 1
        else:
            article_dict[data] += 1
    article_list = list(article_dict.items())
    cloud_data = ''
    for word in article_list:
        cloud_data += word[0] + ' '
    background_image = plt.imread('bg.jpg')
    w = wordcloud.WordCloud(
        # Background colour
        background_color='white',
        # Shape mask taken from the background image
        mask=background_image,
        # Required for Chinese text; without a Chinese font the words render as empty boxes
        font_path="simhei.ttf",
        # Maximum number of words to display
        max_words=2000,
        # Random state, i.e. how many colour schemes are generated
        random_state=30,
        # Maximum font size
        max_font_size=200,
    )
    w.generate(cloud_data)
    # Display the word cloud
    plt.imshow(w)
    # Hide the x and y axes
    plt.axis('off')
    plt.show()
    # Directory the script lives in
    d = path.dirname(__file__)
    w.to_file(path.join(d, "词云.jpg"))
if __name__ == '__main__':
    createWordCloud(getPageInfo())
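One caveat about the final step: cloud_data contains each distinct word exactly once, so w.generate(cloud_data) gives every word the same weight and the cloud does not reflect the frequencies counted in article_dict. If you want word size to track frequency, as the problem description suggests, WordCloud.generate_from_frequencies can be fed the counts directly; a minimal sketch of what would replace the w.generate(cloud_data) line:

# Use the word counts in article_dict instead of the space-joined cloud_data string
w.generate_from_frequencies(article_dict)
plt.imshow(w)
plt.axis('off')
plt.show()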