下面直接给出完整代码:抓取网页正文,用 jieba 统计词频,并生成词云。
#!/usr/bin/python
# -*- coding:utf-8 -*-
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
from wordcloud import WordCloud
from bs4 import BeautifulSoup
import jieba
import requests
# Extract the body text of a web page
def extract_text(url, timeout=10):
    """Download a page and return the text of all its ``<p>`` elements.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get``.

    Returns:
        The text of every ``<p>`` element in document order, each followed by
        a newline. An empty string if the page contains no ``<p>`` elements.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server replies with an HTTP error status.
    """
    # Without an explicit timeout the call can block forever on a server
    # that never answers.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx rather than analysing an error page as content.
    response.raise_for_status()
    # Hand the raw bytes (not response.text) to the parser, as before.
    soup = BeautifulSoup(response.content, "lxml")
    # Single join instead of repeated string concatenation in a loop.
    return ''.join(p.get_text() + '\n' for p in soup.find_all('p'))
# Word-frequency analysis
def word_frequency(text, top_n=35, min_length=2):
    """Print the most frequent words in *text*, one ``word count`` pair per line.

    The text is segmented with ``jieba.cut`` in full mode (``cut_all=True``),
    tokens shorter than *min_length* characters are dropped, and the
    remaining tokens are counted.

    Args:
        text: The text to analyse.
        top_n: Number of most common words to print (default 35).
        min_length: Minimum length of a token for it to be counted
            (default 2).

    Returns:
        None. The result is written to standard output.
    """
    from collections import Counter

    counts = Counter(
        token
        for token in jieba.cut(text, cut_all=True)
        if len(token) >= min_length
    )
    for token, freq in counts.most_common(top_n):
        print(token, freq)
# Generate the word-frequency report
url_2019 = 'http://news.sina.com.cn/c/xl/2019-03-05/doc-ihsxncvf9915493.shtml'
text_2019 = extract_text(url_2019)
word_frequency(text_2019)

# Word-cloud analysis
exclude_words = ["我们", "提高", "国家"]
# Build a filtered list instead of calling words.remove() while looping over
# words: removing during iteration makes the loop skip the element that
# follows each removal, so some excluded words would survive.
words = [
    word
    for word in jieba.lcut(text_2019, cut_all=True)
    if word not in exclude_words
]
# Join the tokens with spaces to form the text handed to WordCloud.
cuted = ' '.join(words)
# Font file used to render the words (presumably SimHei, for Chinese glyphs;
# verify the file exists next to the script).
path = 'SIMHEI.TTF'
# Load the mask inside a context manager so the image file is closed as soon
# as its pixels have been copied into the array.
with Image.open(r'ML.png') as mask_image:
    abel_mask = np.array(mask_image)
# NOTE(review): per the wordcloud docs, width/height are ignored when a mask
# is supplied and the mask's shape is used instead; confirm before relying
# on the 800x400 values.
wc = WordCloud(
    font_path=path,
    background_color='black',
    mask=abel_mask,
    max_words=30,
    width=800,
    height=400,
    margin=2,
    max_font_size=250,
    min_font_size=40,
).generate(cuted)

# Plot
plt.figure(dpi=300)  # the dpi setting enlarges or shrinks the rendered image
plt.imshow(wc)
plt.axis('off')
plt.show()