注意:此代码仅用于学习!!!
要求:
任意找一本图书,抓取它某一页的短评,解析页面并抽取短评文字后输出,同时保存到某个 .txt 文件中;再抽取其中的评分并计算其总分;同时下载并保存短评用户的头像图片。
测试地址(可用于测试): https://book.douban.com/subject/2567698/comments/
import requests as req
import re
import bs4
import os
import jieba
from wordcloud import WordCloud
def getHTMLText(url):
    """Request *url* and report whether the page could be fetched.

    The HTTP status code is printed between two separator lines as soon as
    a response arrives.

    Args:
        url: Address of the page to request.

    Returns:
        '成功读取网页源代码' when the request completes with a non-error
        status, '产生异常' when the request fails or the server answers with
        an HTTP error status.
    """
    h = {'User-Agent': 'Mozilla/5.0(Windows NT 10.0; Win64; x64)'
                       'AppleWebKit/537.36 (KHTML, like Gecko)'
                       'Chrome/79.0.3945.130 Safari/537.36 OPR/66.0.3515.115'}
    try:
        # A timeout keeps the script from hanging forever on a dead server.
        r = req.get(url, headers=h, timeout=10)
        print('--------------------------')
        print(r.status_code)
        print('--------------------------')
        r.raise_for_status()
    except req.RequestException:
        # Only network/HTTP/URL failures are reported as a status string;
        # Ctrl-C and genuine programming errors still propagate.
        return '产生异常'
    return '成功读取网页源代码'
# Ask the user which page to scrape, then print the fetch status followed by
# a separator line.
url = input('请输入网页链接地址:')
fetch_status = getHTMLText(url)
print(fetch_status, '--------------------------', sep='\n')
# Scrape the short-comment text: every <span class="short"> element is
# printed and collected, one comment per line.
h = {'User-Agent': 'Mozilla/5.0(Windows NT 10.0; Win64; x64)'
                   'AppleWebKit/537.36 (KHTML, like Gecko)'
                   'Chrome/79.0.3945.130 Safari/537.36 OPR/66.0.3515.115'}
r = req.get(url, headers=h, timeout=10)
soup = bs4.BeautifulSoup(r.text, 'html.parser')
pattern = soup.find_all('span', 'short')
ls = []
for e in pattern:
    print(e.text)
    ls.append(e.text + '\n')
# Write with an explicit encoding: the platform default differs between
# systems and may be unable to represent some characters (e.g. emoji).
with open('pinglun.txt', 'w', encoding='utf-8') as fp:
    fp.writelines(ls)

# Word cloud: read the comments back with the same encoding they were
# written in, segment them with jieba and render the result to an image.
with open('pinglun.txt', 'r', encoding='utf-8') as f:
    t = f.read()
excludes = {"的", "和", "是", "我", "在", "都", "很"}
ls = jieba.lcut(t)
txt = ' '.join(ls)
# Drop the filler words listed in `excludes`; split() also discards the
# whitespace-only tokens produced by the segmentation.
txt_clean = ' '.join([word for word in txt.split() if word not in excludes])
print('--------------------------')
if txt_clean:
    w = WordCloud(font_path='simhei.ttf',
                  width=1000, height=700, background_color='white',
                  )
    w.generate(txt_clean)
    w.to_file('pinglun.png')
    print("成功生成词云图片!")
else:
    # WordCloud.generate() raises ValueError when given no words at all,
    # which happens whenever no comment could be scraped from the page.
    print("没有抓取到可用于生成词云的评论文字。")
print('--------------------------')
# Extract the ratings from the response `r` fetched earlier in the script.
# Stop when the page was not retrieved successfully.
if r.status_code != 200:
    print("无法访问网页。")
    # `exit()` is an interactive helper injected by the `site` module and is
    # not meant for programs; raising SystemExit ends the script the same way.
    raise SystemExit
content = r.text
print('这一页大家的评分如下:')
print('--------------------------')
# The pattern captures whatever sits between "allstar" and "0 rating" in the
# class attribute of each rating <span>.
pattern = r"<span class=\"user-stars allstar(.*?)0 rating\""
matches = re.findall(pattern, content)
# Print every captured rating, stripped of surrounding whitespace.
scores = [match.strip() for match in matches]
for score in scores:
    print(score)
# Sum the ratings as floats (the total stays the integer 0 when none matched).
total = sum(float(score) for score in scores)
print('总分为:')
print(total)
print('--------------------------')
# Download every image referenced by an <img> tag on the page into `path`.
path = 'pic'
# exist_ok avoids the separate existence check (and the race it implies).
os.makedirs(path, exist_ok=True)
h = {'User-Agent': 'Mozilla/5.0(Windows NT 10.0; Win64; x64)'
                   'AppleWebKit/537.36 (KHTML, like Gecko)'
                   'Chrome/79.0.3945.130 Safari/537.36 OPR/66.0.3515.115'}
r = req.get(url, headers=h, timeout=10)
soup = bs4.BeautifulSoup(r.text, 'html.parser')
print('--------------------------')
print('正在查找/下载网页中的图片')
print('--------------------------')
# Collect the image addresses, skipping <img> tags without a src attribute:
# requesting a missing address would raise instead of downloading anything.
ls1 = [e.get('src') for e in soup.find_all('img') if e.get('src')]
# Files are numbered 001.jpg, 002.jpg, ... in page order.
for n, addr in enumerate(ls1, start=1):
    photo = req.get(addr, timeout=10)
    # os.path.join picks the right separator on every platform; the original
    # hard-coded backslash only produced a path inside `pic` on Windows.
    with open(os.path.join(path, '{0:03}.jpg'.format(n)), 'wb') as fp:
        fp.write(photo.content)
print('--------------------------')
print('图片下载完成')
print('--------------------------')
下面是运行结果的截图啦~
生成的一些文件:
爬取的评论文件:
生成的词云图片: