# “迷你爬虫编程小练习”进阶:抽取豆瓣某本书的前 50 条短评内容并计算评分(star)的平均值。提示:有的评论中并不包含评分。
import re,time
import requests
from bs4 import BeautifulSoup
# Crawl the short comments of one Douban book, print the first MAX_COMMENTS of
# them with a running number, then print the mean rating of those comments.
# Comments without a rating are printed but do not take part in the mean.

# Listing URL; the 1-based page number is appended to it.
url = 'https://book.douban.com/subject/1456692/comments/hot?p='
MAX_COMMENTS = 50             # how many comments to print and average
PAGE_DELAY_SECONDS = 5        # pause between page requests (robots.txt)
REQUEST_TIMEOUT_SECONDS = 10  # do not hang forever on a stalled connection

# The rating is encoded as a CSS class such as "allstar40" on the
# <span class="user-stars allstarNN rating"> element.
STAR_CLASS = re.compile(r'allstar(\d+)')


def comment_rating(short_span):
    """Return the rating attached to one comment, or None if it has none.

    *short_span* is the ``<span class="short">`` tag holding the comment
    text.  The rating lives in a sibling branch of the same comment entry,
    so the search scope is the largest ancestor that still contains only
    this one comment; that keeps a neighbour's rating from being picked up
    without depending on the exact tag names of the page layout.

    The returned integer is the number from the ``allstarNN`` class
    (``allstar40`` -> ``40``).
    """
    scope = short_span
    for ancestor in short_span.parents:
        if len(ancestor.find_all('span', 'short')) > 1:
            break  # this ancestor already spans several comments
        scope = ancestor

    for tag in scope.find_all('span', 'user-stars'):
        classes = tag.get('class') or []
        if isinstance(classes, str):  # parsers that keep class as one string
            classes = classes.split()
        for css_class in classes:
            match = STAR_CLASS.fullmatch(css_class)
            if match:
                return int(match.group(1))
    return None


def parse_comments(html):
    """Return ``[(text, rating_or_None), ...]`` for one page, in page order."""
    soup = BeautifulSoup(html, 'lxml')
    return [
        (span.get_text(), comment_rating(span))
        for span in soup.find_all('span', 'short')
    ]


def iter_comments(limit):
    """Yield up to *limit* ``(text, rating_or_None)`` pairs, page by page.

    Stops early when a request fails (the error is printed) or when a page
    contains no comments, so a short or blocked listing cannot cause an
    endless request loop.
    """
    yielded = 0
    page = 1
    while yielded < limit:
        try:
            response = requests.get(
                url + str(page), timeout=REQUEST_TIMEOUT_SECONDS
            )
            response.raise_for_status()
        except requests.RequestException as err:
            print(err)
            return

        page_comments = parse_comments(response.text)
        if not page_comments:
            return  # ran out of comments before reaching the limit

        for entry in page_comments[:limit - yielded]:
            yield entry
            yielded += 1

        if yielded < limit:
            time.sleep(PAGE_DELAY_SECONDS)  # only wait if another page follows
        page += 1


def mean_star(ratings):
    """Return the arithmetic mean of *ratings*, ignoring ``None`` entries.

    Returns ``None`` when there is no rating at all, instead of dividing
    by zero.  The result is on the same scale as the inputs.
    """
    rated = [rating for rating in ratings if rating is not None]
    if not rated:
        return None
    return sum(rated) / len(rated)


def main():
    """Print the numbered comments, then the mean rating if any was found."""
    ratings = []
    for number, (text, rating) in enumerate(iter_comments(MAX_COMMENTS), 1):
        print(number, text)
        ratings.append(rating)

    average = mean_star(ratings)
    if average is not None:
        print(average)


if __name__ == '__main__':
    main()