# Python - scrape Douban short comments
import requests
from bs4 import BeautifulSoup
import re
import time
# Path of the text file where the Douban book scores are saved.
source_douban_score = r'e:/test/txt/douban_book_score.txt'
def get_book_name(soup):
    """Return the book title taken from the page's ``a.nbg`` anchor.

    Looks up the first ``<a>`` element with CSS class ``nbg`` and returns the
    value of its ``title`` attribute.

    Args:
        soup: Parsed page object exposing ``find`` (a BeautifulSoup document
            when called from ``get_book``).

    Returns:
        The ``title`` attribute of the matched anchor.

    Raises:
        ValueError: If the page contains no ``<a class="nbg">`` element.
    """
    anchor = soup.find('a', class_='nbg')
    if anchor is None:
        # Fail with an explicit message instead of an opaque
        # "'NoneType' object is not subscriptable" TypeError.
        raise ValueError('no <a class="nbg"> element found in page')
    return anchor['title']
def get_comment(soup):
    """Print the text of every ``<span class="short">`` element on the page.

    Each matched element's ``.string`` is printed on its own line; when that
    attribute is ``None`` the literal text ``None`` is printed.

    Args:
        soup: Parsed page object exposing ``find_all`` (presumably a
            BeautifulSoup document, as in ``get_book``).

    Returns:
        None. The comments are only written to standard output.
    """
    for short in soup.find_all('span', 'short'):
        print(short.string)
# Captures the digits that follow "allstar" in a rating class attribute such
# as "user-stars allstar40 rating". Matching digits only (instead of a greedy
# ".*") keeps two ratings on the same line from being merged into one capture.
_STAR_RATING_RE = re.compile(r'user-stars allstar(\d+) rating')


def get_book_score(markup):
    """Return the floor-averaged star value found in the page markup.

    Every occurrence of ``user-stars allstar<digits> rating`` contributes its
    numeric part; the result is the integer (floor) average of those values.

    Args:
        markup: Raw HTML text of the page.

    Returns:
        The integer average of all matched star values, or 0 when the markup
        contains no rating.
    """
    stars = [int(star) for star in _STAR_RATING_RE.findall(markup)]
    if not stars:
        return 0
    return sum(stars) // len(stars)
# HTTP request headers passed to requests.get() by get_book().
# NOTE: the Cookie value is a captured browser session; obtain your own and
# replace it before running.
headers = {
    'Cookie': 'll="118159"; bid=Tlj9LZXK6qY; __utma=30149280.1907998096.1644908381.1644908381.1644908381.1; __utmc=30149280; __utmz=30149280.1644908381.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utma=81379588.181811108.1644908384.1644908384.1644908384.1; __utmc=81379588; __utmz=81379588.1644908384.1.1.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; gr_user_id=4cc23846-8110-487a-9875-d2c22a01ffc5; gr_cs1_beee101c-a4d0-4c07-acf1-64c7195fdd7b=user_id%3A0; ap_v=0,6.0; _pk_ref.100001.3ac3=%5B%22%22%2C%22%22%2C1644908384%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_ses.100001.3ac3=*; _vwo_uuid_v2=DDF9B30D066068B64AEED05150ED2CC21|59ca475fd37ba868d8b940b30cb8051c; __gads=ID=30286cce13d6257e-222fedbd9fd00006:T=1644908408:RT=1644908408:S=ALNI_Mbxvt-k0lb2MC6tAcI60A5qPobCCw; viewed="35680544"; __utmb=30149280.4.10.1644908381; __utmb=81379588.3.10.1644908384; __yadk_uid=OU6iPv3WVtSb7pup23vqweBrH4Gj0jdG; _pk_id.100001.3ac3=5c3c19525f813f77.1644908384.1.1644908470.1644908384.; _ga=GA1.1.1475140875.1644908471; refer_url=https://read.douban.com/category/105; _ga_RXNMP372GL=GS1.1.1644908471.1.1.1644909370.60',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
    'Host': 'book.douban.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
}
def get_book(book_id):
    """Fetch a Douban book page and append its title and score to a file.

    Downloads ``https://book.douban.com/subject/<book_id>/`` with the
    module-level ``headers``, extracts the title via ``get_book_name`` and the
    averaged score via ``get_book_score``, and, when the score is positive,
    appends ``"<title> <score>"`` as one line to ``source_douban_score`` and
    prints it.

    Args:
        book_id: Douban subject identifier substituted into the page URL.

    Returns:
        None. Progress and failures are reported with ``print``; any exception
        raised while fetching, parsing or writing is caught and printed
        rather than propagated.
    """
    douban_url = 'https://book.douban.com/subject/{}/'.format(book_id)
    print('获取书号:', book_id, douban_url)
    try:
        print()
        # A timeout keeps a stalled connection from blocking the script forever.
        r = requests.get(douban_url, headers=headers, timeout=10)
        if r.status_code != 200:
            print('获取失败,', r.status_code)
            return
        # Short pause after a successful fetch (presumably to throttle requests).
        time.sleep(.6)
        markup = r.text
        soup = BeautifulSoup(markup, 'lxml')
        book_name = get_book_name(soup)
        score = get_book_score(markup)
        if score > 0:
            content = book_name + ' ' + str(score)
            # Explicit UTF-8 so non-ASCII titles do not depend on the platform
            # default encoding; the newline keeps each appended record on its
            # own line instead of running into the previous one.
            with open(source_douban_score, 'a', encoding='utf-8') as f:
                f.write(content + '\n')
            print(content)
    except Exception as err:
        # Script-level boundary: report the failure and carry on.
        print('获取错误', err)
# Script entry: fetch https://book.douban.com/subject/35680544/
get_book('35680544')