Crawling the novel Romance of the Three Kingdoms (《三国演义》) with Python, counting word frequencies, and generating a word cloud
Notes:
- The regular expression that extracts each chapter's body has to match across multiple lines, so DOTALL mode must be enabled with (?s), which lets . also match newline characters; see the short demo below.
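A minimal standalone sketch of the difference (the sample HTML string is made up for illustration, it is not the site's real markup):
import re
html = '<div id="htmlContent">line one\r\nline two</div>'
# Without (?s), '.' does not match '\n', so the body is not found
print(re.findall(r'<div.*?id="htmlContent">(.*?)</div>', html))      # []
# With (?s) (DOTALL), '.' also matches '\n', so the whole body is captured
print(re.findall(r'(?s)<div.*?id="htmlContent">(.*?)</div>', html))  # ['line one\r\nline two']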
Practice source code:
import re
import requests
import jieba
import jieba.analyse
import codecs
from collections import Counter
import wordcloud
import matplotlib.pyplot as plt
# URL of the novel's index page to crawl
url = 'http://www.janpn.com/book/sanguoyanyi2.html'
def get_content(url):
    # Fetch the raw bytes first, then decode them
    txt = requests.get(url).content.decode('utf-8')
    # print(txt)
    book_title = re.compile(r'<h3 class="bookTitle">(.+)</h3>')
    # findall() returns a list, so take book_title.findall(txt)[0] for the title itself
    # print(book_title.findall(txt))
    # Earlier attempts, kept commented out:
    # book_chapters_re = re.compile(r'<li><a href="(.+\.html)">([第|回].+)</a></li>')
    # book_chapters_re = re.compile(r'<ul class="panel-body panel-chapterlist"><li><a href="(.+)">(.*)</a></li></ul>')
    # Important: use non-greedy matching so the href stops at each chapter's own ".html"
    book_chapters_re = re.compile(r'<li><a href="(.*?\.html)".*?>(.+?)</a></li>')
    book_chapters = book_chapters_re.findall(txt)
    # Enable DOTALL mode: a chapter body spans many lines and paragraphs
    book_content_re = re.compile(r'(?s)<div.*?id="htmlContent">(.*?)</div>')
    # Strip \r\n from the body
    m3 = re.compile(r'\r\n')
    # Strip &nbsp; from the body
    m4 = re.compile(r'&nbsp;')
    # Strip <br /> tags from the body
    m5 = re.compile(r'<br />')
    print(book_chapters)
    # Write as gbk so it matches the gbk reads further below
    with open('三国演义.txt', 'a', encoding='gbk') as f:
        for i in book_chapters:
            print([i[0], i[1]])
            print(i[0])
            i_url = i[0]
            print("Downloading ---> %s" % i[1])
            # Fetch each chapter page as bytes, then decode
            content_html = requests.get(i_url).content.decode('utf-8')
            # Extract the chapter body
            content = book_content_re.findall(content_html)[0]
            print(content)
            content = m3.sub('', content)
            content = m4.sub('', content)
            content = m5.sub('', content)
            print(content)
            f.write('\n' + i[1] + '\n')
            f.write(content)
# =================================================
# Build the stopword list
def stopwordlist():
    stopwords = [line.strip() for line in open('../结巴分词/hit_stopwords.txt', encoding='UTF-8').readlines()]
    return stopwords
# Segment a sentence with jieba and remove stopwords
def seg_depart(sentence):
    print('Segmenting...')
    sentence_depart = jieba.cut(sentence.strip())
    # Build the stopword list
    stopwords = stopwordlist()
    # Collect the result in outstr
    outstr = ''
    # Drop stopwords
    for word in sentence_depart:
        if word not in stopwords:
            if word != '\t':
                outstr += word
                outstr += ' '
    return outstr
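# (Sketch, not part of the original script) a quick way to sanity-check seg_depart on a single
# sentence, assuming the stopword file referenced in stopwordlist() is in place:
# print(seg_depart('滚滚长江东逝水，浪花淘尽英雄。'))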
# Read the novel file and generate the word cloud
filepath = '三国演义.txt'
def create_word_cloud(filepath):
    # Read the file content
    content = codecs.open(filepath, 'r', 'gbk').read()
    # Segment and remove stopwords
    content = seg_depart(content)
    # Segment again with jieba and join with spaces for WordCloud
    wordlist = jieba.cut(content)
    wl = ' '.join(wordlist)
    print(wl)
    # Configure the word cloud
    wc = wordcloud.WordCloud(
        # Background colour
        background_color='white',
        # Maximum number of words to show
        max_words=100,
        # Font path (needed to render Chinese characters)
        font_path=r'C:\Windows\Fonts\msyh.ttc',
        # Canvas size
        height=1200,
        width=1600,
        # Maximum font size
        max_font_size=300,
        # Random seed for the colour layout
        random_state=50
    )
    # Generate the word cloud
    myword = wc.generate(wl)
    # Display it
    plt.imshow(myword)
    plt.axis('off')
    plt.show()
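# (Sketch, not part of the original script) WordCloud can also write the rendered image straight
# to disk via its to_file() method; inside create_word_cloud, after wc.generate(wl), something
# like the following would do it (the filename is just an example):
# wc.to_file('三国演义_词云.png')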
# =================================================
# Run the crawler
# get_content(url)
# Generate the word cloud
create_word_cloud(filepath)
# ===================================================
# Read the file and count word frequencies
def count_from_file(filepath, top_limit=0):
    with codecs.open(filepath, 'r', 'gbk') as f:
        content = f.read()
    # Collapse runs of whitespace into a single space
    content = re.sub(r'\s+', r' ', content)
    # Replace runs of '.' with a space
    content = re.sub(r'\.+', r' ', content)
    # Segment and remove stopwords
    content = seg_depart(content)
    return count_from_str(content, top_limit)
def count_from_str(content, top_limit=0):
    if top_limit <= 0:
        top_limit = 100
    # Extract the text's keywords (TF-IDF based)
    tags = jieba.analyse.extract_tags(content, topK=100)
    print("Keywords:")
    print(tags)
    # Count how often each keyword occurs
    words = jieba.cut(content)
    counter = Counter()
    for word in words:
        if word in tags:
            counter[word] += 1
    return counter.most_common(top_limit)
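# (Sketch, not part of the original script) extract_tags can also return each keyword's TF-IDF
# weight directly when called with withWeight=True; inside count_from_str that would look like:
# for tag, weight in jieba.analyse.extract_tags(content, topK=top_limit, withWeight=True):
#     print(tag, weight)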
# =====================================
print("打印词频统计")
# 打印词频统计
result = count_from_file(filepath)
print(result)
def test(url):
    # Enable DOTALL mode: the body contains newlines, so '.' must be able to match them
    book_content_re = re.compile(r'(?s)<div.*?id="htmlContent">(.*?)</div>')
    content_html = requests.get(url).content.decode('gbk')
    print(content_html)
    content = book_content_re.findall(content_html)
    print(content)
# test("http://www.janpn.com/book/171/171182/35225767.html")