利用python爬取网页做词云分析——2019年政府工作报告

最新推荐文章于 2024-08-21 18:17:45 发布

广西小表哥

最新推荐文章于 2024-08-21 18:17:45 发布

阅读量5.6k

点赞数 4

分类专栏： Python爬虫　文章标签： python 爬虫 词云

本文链接：https://blog.csdn.net/weixin_44562468/article/details/88880807

版权

Python爬虫专栏收录该内容

1 篇文章 0 订阅

订阅专栏

直接上代码

#!/usr/bin/python
# -*- coding:utf-8 -*-


import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
from wordcloud import WordCloud
from bs4 import BeautifulSoup
import jieba
import requests


# Fetch a web page and return the text of its paragraphs
def extract_text(url, timeout=10):
    """Download *url* and return the text of every ``<p>`` element.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely.

    Returns:
        The text of each ``<p>`` element in document order, each followed
        by a newline; an empty string when the page has no ``<p>``.

    Raises:
        requests.RequestException: On a network failure, a timeout, or an
            HTTP error status (4xx/5xx).
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of analysing the body of an error page.
    response.raise_for_status()
    # The raw bytes are handed over so the parser can work out the page
    # encoding on its own.
    soup = BeautifulSoup(response.content, "lxml")
    return ''.join(p.get_text() + '\n' for p in soup.find_all('p'))

# Word-frequency analysis
def word_frequency(text, top_n=35):
    """Print the most frequent words of *text*, one ``word count`` per line.

    Args:
        text: The text to segment with jieba.
        top_n: How many of the most common words to print (default 35).
    """
    from collections import Counter

    # Full-mode segmentation (cut_all=True); tokens shorter than two
    # characters are left out of the count.
    counts = Counter(
        word for word in jieba.cut(text, cut_all=True) if len(word) >= 2
    )
    for word, freq in counts.most_common(top_n):
        print(word, freq)

# Fetch the 2019 report and print its word frequencies
url_2019 = 'http://news.sina.com.cn/c/xl/2019-03-05/doc-ihsxncvf9915493.shtml'
text_2019 = extract_text(url_2019)
word_frequency(text_2019)

# Word cloud
words = jieba.lcut(text_2019, cut_all=True)
exclude_words = ["我们", "提高", "国家"]
# Build a new list instead of calling words.remove() inside a loop over
# words: removing while iterating skips the element that follows each
# removal, so adjacent excluded words were left in.
words = [word for word in words if word not in exclude_words]
cuted = ' '.join(words)
# Font file passed to WordCloud; presumably a CJK-capable font so the
# Chinese words can be rendered -- expected in the working directory.
path = 'SIMHEI.TTF'
# Close the image file once its pixels have been copied into the array.
with Image.open(r'ML.png') as mask_image:
    abel_mask = np.array(mask_image)
# NOTE(review): the wordcloud docs say width/height are ignored when a mask
# is given (the mask's shape is used) -- kept here as in the original.
wc = WordCloud(
    font_path=path,
    background_color='black',
    mask=abel_mask,
    max_words=30,
    width=800,
    height=400,
    margin=2,
    max_font_size=250,
    min_font_size=40,
).generate(cuted)

# Plot
plt.figure(dpi=300)  # the dpi setting scales the rendered figure up or down
plt.imshow(wc)
plt.axis('off')
plt.show()