DrissionPage这个比原来的一些爬虫框架简单多了可以点这里学习
from DrissionPage import WebPage
from DataRecorder import Recorder
page = WebPage()
rc = Recorder('book3.txt')
page.get('https://so.gushiwen.cn/guwen/bookv_46653FD803893E4F493C373A509792D3.aspx')
page.wait.load_complete()
while True:
for p in page.eles('x://*[@class="contson"]/p'):
rc.add_data(p.text)
btn = page('下一章', timeout=2)
if btn:
btn.click()
page.wait.load_start()
# 没有则退出程序
else:
break
rc.record()
这个爬取的是西游记
import jieba
from wordcloud import WordCloud
import numpy as np
from PIL import Image
from matplotlib import colors
f = open(r'book3.txt', "r", encoding="utf-8")
text = f.read()
f.close()
words_list_jieba = jieba.lcut(text)
def findifhave(demo, stop):
for ret in stop:
if (demo == ret):
return 'T'
stop = ['\n',',','。','?','!','“',':',',']
with open("stop.txt", 'r', encoding='utf-8') as f1:
for line in f1:
stop.append(line.replace("\n", ""))
f1.close()
dict = {}
for key in words_list_jieba:
dict[key] = dict.get(key, 0) + 1
for demo in list(dict.keys()):
if ('T' == findifhave(demo, stop)):
del dict[demo]
dict1 = sorted(dict.items(), key=lambda d: d[1], reverse=True)
print(dict1)
background_image = np.array(Image.open('xL3BNEe63N.png'))
colormaps = colors.ListedColormap(['#871A84', '#BC0F6A', '#BC0F60', '#CC5F6A', '#AC1F4A'])
wordcloud = WordCloud(font_path='ShiShiRuYi-XiaoYaCiLi-2.ttf', # 字体
prefer_horizontal=0.99,
background_color='white', # 背景色
max_words=200, # 显示单词数
max_font_size=100, # 最大字号
stopwords=stop, # 过滤噪声词
mask=background_image, # 背景轮廓
colormap=colormaps, # 使用自定义颜色
collocations=False
).fit_words(dict)
image = wordcloud.to_image()
image.show() # 展示图片
wordcloud.to_file('词云图.png') # 保存图片
这个是制作结巴图
这个是效果图