# TODO: add a proxy IP pool (not implemented yet).
import requests
from bs4 import BeautifulSoup
import re #正则规范信息
import csv #保存至.csv
import random
import time #生成随机秒数,反爬
# Scrape book listings from Douban tag pages and store them in a CSV file.
#
# Flow: fetch the tag index, then for every tag fetch three listing pages
# (start offsets 0, 20, 40), parse each listed book and append the rows to
# the CSV as soon as the page has been processed.

# Tag index page; every tag found here is crawled.
aurl = 'https://book.douban.com/tag/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
}

CSV_PATH = r'fileName.csv'
FIELD_NAMES = ['书名', '作者', '上市时间', '价格', '书籍评分', '评分人数']
PAGE_STARTS = range(0, 41, 20)  # three pages per tag
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled connection hangs forever
MAX_DELAY = 3  # upper bound (seconds) of the random pause between requests

# NOTE(review): this selector contains `tbody`; html.parser only matches it if
# the served markup really includes that element -- confirm against the page.
TAG_SELECTOR = ('#content > div > div.article > div:nth-child(2) > div > '
                'table > tbody > tr > td > a')

# A price must contain a decimal point; this keeps a trailing date such as
# "2010-5" from being taken for a price when the price field is absent.
PRICE_RE = re.compile(r'\d+\.\d*')
COUNT_RE = re.compile(r'\d+')


def page_params(start):
    """Return the query parameters for the listing page beginning at *start*."""
    # The key must be the string 'type'; the bare name `type` is the builtin
    # class and would be sent as a garbage parameter name.
    return {'start': start, 'type': 'T'}


def parse_book(name_text, detail_text, score_text, comment_text):
    """Build one CSV row from the raw text of a single listing entry.

    Args:
        name_text: text of the title link; all whitespace is removed.
        detail_text: slash-separated publication line. The first field is
            used as the author, the second-to-last as the publication date
            and the last as the price.
        score_text: rating text, stored unchanged.
        comment_text: text containing the number of raters; the first run
            of digits is used.

    Returns:
        A dict keyed by FIELD_NAMES, or None when the publication line has
        fewer than two fields, or no price or rater count can be extracted.
    """
    detail = detail_text.split('/')
    if len(detail) < 2:
        return None
    price = PRICE_RE.search(detail[-1].strip())
    count = COUNT_RE.search(comment_text.strip())
    if price is None or count is None:
        return None
    return {
        '书名': ''.join(name_text.split()),
        '作者': detail[0].strip(),
        '上市时间': detail[-2].strip(),
        '价格': price.group(),
        '书籍评分': score_text,
        '评分人数': count.group(),
    }


def scrape_page(tag, start):
    """Download one listing page of *tag* and return its parsed rows.

    Fields are looked up inside each list item, so an entry that lacks a
    rating cannot shift the ratings of the entries that follow it.

    Raises:
        requests.RequestException: if the HTTP request fails or times out.
    """
    url = 'https://book.douban.com/tag/%s' % tag
    response = requests.get(url, headers=headers, params=page_params(start),
                            timeout=REQUEST_TIMEOUT)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    rows = []
    for item in soup.select('#subject_list > ul > li'):
        name = item.select_one('div.info > h2 > a')
        detail = item.select_one('div.info > div.pub')
        score = item.select_one('div.info > div.star.clearfix > span.rating_nums')
        comment = item.select_one('div.info > div.star.clearfix > span.pl')
        if name is None or detail is None or comment is None:
            continue
        row = parse_book(
            name.get_text(),
            detail.get_text(),
            score.get_text() if score is not None else '',  # unrated entry
            comment.get_text(),
        )
        title = ''.join(name.get_text().split())
        if row is None:
            print('Skipped malformed entry:', title)
            continue
        rows.append(row)
        print(title)
    return rows


def main():
    """Crawl every tag on the index page and write all books to CSV_PATH."""
    res = requests.get(aurl, headers=headers, timeout=REQUEST_TIMEOUT)
    res.encoding = 'utf-8'
    index = BeautifulSoup(res.text, 'html.parser')
    tags = [link.get_text().strip() for link in index.select(TAG_SELECTOR)]

    # The file is opened once: reopening it in 'w' mode for every page would
    # discard everything but the last page. newline='' is required by the csv
    # module; errors='ignore' drops characters the default encoding cannot
    # represent instead of aborting the crawl.
    with open(CSV_PATH, 'w', newline='', errors='ignore') as csvfile:
        writer = csv.DictWriter(csvfile, FIELD_NAMES)
        writer.writeheader()
        for tag in tags:
            for start in PAGE_STARTS:
                try:
                    books = scrape_page(tag, start)
                except requests.RequestException as e:
                    # Best effort: a failed page is skipped, the crawl goes on.
                    print('RequestError:', e)
                    books = []
                writer.writerows(books)
                csvfile.flush()  # persist each page as soon as it is done
                # Random pause between HTTP requests to avoid being blocked.
                time.sleep(random.random() * MAX_DELAY)


if __name__ == '__main__':
    main()