爬取《水浒传》全文，并形成词云-CSDN博客

本文链接：https://blog.csdn.net/2301_80145736/article/details/144473957

内容：任选书本网页爬取对应内容，提取词频做成词云。

这里找到的是水浒传的全文，爬取并进行分频形成词云。

库的引用部分：

import time
import jieba
from matplotlib import pyplot as plt
from matplotlib.image import imread  
from wordcloud import wordcloud
import bs4
import requests

随机找到一个可以爬取得网页：(代码提供对应注释，其中文件名字可以自行更改替换)

#爬虫部分
#循环遍历爬取所需内容的每一页
for i in range(1247,1367):
    #
    url = （）根据具体网页设置循环遍历的条件和网页地址，这里不展示原爬虫地址
    #捕获异常
    try:
        #打印要爬取的网页
        print(url)
        #构造一个向服务器请求资源的Request对象
        r=requests.get(url)
        #判断返回的Response类型状态是不是200。如果是200，他将表示返回的内容是正确的，如果不是200，他就会产生一个HttpError的异常。
        r.raise_for_status()
        # print(r.status_code)
        #修改读取代码的格式
        r.encoding='utf-8'
        #设置放置读取到的信息的文件
        path= '评论第一页.html'
        #写入读取的内容
        with open(path, 'a+', encoding='utf-8') as file:
            file.write(r.text)
        #时间间隔为0
        time.sleep(0)
        #设置摘取的内容放入path2文件中
        path2 = '摘录.txt'
        #将上方读到的信息放入汤中
        soup = bs4.BeautifulSoup(r.text, 'html.parser')
        #摘取汤中h5标签中的内容
        h5 = soup.find('h5', class_="py-3 lh-base text-center")
        #打印每章的标题
        print(h5.text)
        #将摘取的h5标签的内容写入path2文件
        with open(path2, 'a+', encoding='utf-8') as file:
            file.write(str(h5.text))
        #摘取汤中类名为“grap”的div标签的内容
        div = soup.find('div', class_="grap")
        #设置一个列表存放大量内容
        comment = []
        #寻找div标签下所有标签的内容，依次添加到提前设定的列表中
        for di in div.find_all():
            #获取内容中是string的部分
            text = di.string
            if text != "\n":
                comment.append(text)
        #遍历列表内容并写入文件path2
        with open(path2, 'a+', encoding='utf-8') as file:
            for n in range(0, len(comment)):
                file.write(str(comment[n]))
            # file.write("\n")
    except Exception as ex:
        print("第{}页采集出错，出错原因：{}".format(i-1246,ex))

词云部分：（每句代码请看注释）这里对分频的内容进行了设置，将停用词设为只能出现的词，输出水浒传中108好汉的出现频次。

注：停用词表（这里为仅参考此表）需要提前自定义

def getText(filepath):
    f = open(filepath,"r",encoding='utf-8')
    text = f.read()#把读入的内容放进text
    print(text)
    f.close() #关闭文件
    return text	#返回读出的文本数据
#停用词
def stopwordslist(filepath):
    #移除字符串开头和结尾的空格和换行符
    stopwords = [line.strip() for line in open(filepath,'r',encoding='utf-8').readlines()]
    return stopwords
def wordFreg(filepath,text,topn):
#使用jieba库中的lcut()对文本分词,返回一个列表
    words = jieba.lcut(text.strip())
    print(words)
#创建一个新字典来存放词频
    counts = {}
#打开停用词的文件，并以列表形式放入列表stopwords()中
    usewords=stopwordslist('ueswords.txt')
#去掉长度为1的词
    for word in words:
        #长度为1，则不列入字典
        if len(word) ==1:
            continue
        #把同译的词归为一类
        elif word in usewords:
        #键存在则对键对应的值+1，键不存在则创建键，并把键对应的值设0，并+1
            counts[word] = counts.get(word,0) +1
#按照出现次数从大到小排序
    #把字典count中的每一项都放入列表list中
    items = list(counts.items())
#   key = lambda x:x[1]表示根据每个元组的第二个元素进行排序，reverse='True'意味着从大到小排序
    items.sort(key = lambda x:x[1],reverse=True)
#以写入形式打开名为filepath去掉后四位+_词频.txt的文件
    f = open(filepath[:-4]+'_词频.txt',"w+",encoding='gbk')
#取出前topn个出现字数最多的词
    for i in range(topn):
        word, count = items[i]
        #写入词频文件中
        f.writelines("{}\t{}\n".format(word,count))
    #关闭文件
    f.close()
    return counts
#调用函数getText
text=getText("摘录.txt")
#调用函数wordFreg
count=wordFreg("摘录.txt", text, 100)
# 读入形状图片
bg_pic = imread('tiger.jpg')
#打开词频文件
f = open("摘录_词频.txt",'r',encoding='gbk')
#指根据词频文件生成词云
textf=f.read()
#调用本机使用的字体；背景颜色为白色；宽度1000，高度1000；词云中字体的最大字号；设定mask蒙版；图案到整个词云图片边的距离
wcloud = wordcloud.WordCloud(font_path="r'C://windows//Font//simhei.ttf",background_color = "white",width=1000,max_words=500,mask=bg_pic,height=1000,margin=2).generate(textf)
#生成名为"水浒传cloud_star.png"的文件
wcloud.to_file("水浒传cloud_star.png")
f.close()
#显示词云图片
plt.imshow(wcloud)
# 关闭坐标轴
plt.axis('off')
#展示词云图片
plt.show()

词云图设为老虎的轮廓，最终词云图为如下：