-
要求:抓取前50条短评的内容,并计算这些短评评分的平均值(部分短评不包含评分,不计入平均值)。
-
热评格式:
-
评分格式:
-
代码
# -*- coding: utf-8 -*-
'''
‘theLittlePrinces’
@author: LU
'''
import requests,re,time
from bs4 import BeautifulSoup
# Hot-comment listing for the book; the 1-based page number is appended.
BASE_URL = 'https://book.douban.com/subject/1084336/comments/hot?p='
MAX_COMMENTS = 50     # number of short comments to print and rate
PAGE_DELAY = 5        # seconds to wait between page requests
REQUEST_TIMEOUT = 10  # seconds before a stalled request is abandoned
# The rating is encoded in a CSS class such as "allstar50".
STAR_CLASS = re.compile(r'allstar(\d+)')


def stars_from_classes(classes):
    """Return the rating encoded in a CSS class list, or None if absent.

    ``classes`` may be a list of class names, a whitespace-separated string,
    or None. The first class of the form ``allstar<digits>`` supplies the
    value, e.g. ``['user-stars', 'allstar50', 'rating']`` -> ``50``.
    """
    if isinstance(classes, str):
        classes = classes.split()
    for name in classes or ():
        found = STAR_CLASS.fullmatch(name)
        if found:
            return int(found.group(1))
    return None


def average_rating(ratings):
    """Return the floor average of ``ratings``, or None when it is empty."""
    if not ratings:
        return None
    return sum(ratings) // len(ratings)


def rating_of(comment):
    """Return the rating that belongs to one ``span.short`` tag, or None.

    Walks up the ancestors of the comment text until one of them also holds a
    ``span.user-stars`` element. If an ancestor that holds several comments is
    reached first, the walk has left this comment's own container, which
    means the comment carries no rating.
    """
    for ancestor in comment.parents:
        if len(ancestor.find_all('span', 'short')) > 1:
            return None
        badge = ancestor.find('span', 'user-stars')
        if badge is not None:
            return stars_from_classes(badge.get('class'))
    return None


def main():
    """Print the first MAX_COMMENTS hot comments and their average rating.

    Comments are numbered from 1 and printed as they are fetched. The average
    (floor division over the rated comments only) is printed once
    MAX_COMMENTS comments have been shown; it is skipped when the run stopped
    early or when none of the comments carried a rating.
    """
    ratings = []
    shown = 0
    page = 1
    while shown < MAX_COMMENTS:
        try:
            response = requests.get(BASE_URL + str(page),
                                    timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as err:
            print(err)
            break
        soup = BeautifulSoup(response.text, 'lxml')
        # Comment text lives in <span class="short"> elements.
        comments = soup.find_all('span', 'short')
        if not comments:
            # No comments on this page (blocked or past the last page):
            # stop instead of requesting forever.
            break
        # Take only as many comments as are still needed, so ratings of
        # surplus comments never enter the average.
        for comment in comments[:MAX_COMMENTS - shown]:
            shown += 1
            print(shown, comment.get_text())
            stars = rating_of(comment)
            if stars is not None:
                ratings.append(stars)
        page += 1
        if shown < MAX_COMMENTS:
            time.sleep(PAGE_DELAY)
    average = average_rating(ratings)
    if shown >= MAX_COMMENTS and average is not None:
        print(average)


if __name__ == '__main__':
    main()
- 运行结果
链接:https://pan.baidu.com/s/1xcnggj3uc1UWpM6Kbk7A6A
提取码:dowt