分析
每页地址变化（只有 URL 末尾的页码不同）
http://bang.dangdang.com/books/bestsellers/01.00.00.00.00.00-24hours-0-0-1-1
http://bang.dangdang.com/books/bestsellers/01.00.00.00.00.00-24hours-0-0-1-2
使用开发者工具测试可知：书名、评论数等信息都在网页源代码的 ul 列表下
代码
import requests
import parsel
import csv
import time
# Output CSV, opened in append mode so rows from earlier runs are kept.
# newline='' lets the csv module control line endings itself.
# The handle is kept at module level because onepage() writes through
# csv_writer.
f = open('畅销书排行.csv', mode='a', encoding='utf-8', newline='')
csv_writer = csv.DictWriter(f, fieldnames=[
    '标题',
    '评论',
    '推荐',
    '作者',
    '出版日期',
    '出版社',
    '原价',
    '售价',
    '电子书价格',
    '详情页',
])
# An append-mode stream starts at end-of-file, so offset 0 means the file is
# new or empty. Only then write the header; otherwise every re-run would
# insert another header row in the middle of the data.
if f.tell() == 0:
    csv_writer.writeheader()
def onepage(url):
    """Scrape one ranking page and append its entries to the CSV.

    Downloads ``url``, selects every ``ul.bang_list li`` element, pulls the
    fields below out of each one with CSS selectors, prints the resulting
    row and writes it through the module-level ``csv_writer``.

    A selector that matches nothing yields ``None`` for that field.

    Args:
        url: Address of one page of the bestseller ranking.

    Raises:
        requests.exceptions.RequestException: If the request fails or does
            not complete within the timeout.
    """
    headers = {
        'User-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3861.400 QQBrowser/10.7.4313.400'}
    # Without a timeout requests waits indefinitely on a stalled connection.
    resp = requests.get(url=url, headers=headers, timeout=10)
    selector = parsel.Selector(resp.text)
    for li in selector.css('ul.bang_list li'):
        # NOTE(review): the author / publisher selectors rely on the position
        # of the <div> children inside each <li> — confirm against the live
        # page markup if these columns come out empty.
        row = {
            '标题': li.css('.name a::text').get(),
            '评论': li.css('.star a::text').get(),
            '推荐': li.css('.tuijian::text').get(),
            '作者': li.css('div:nth-child(5) a::attr(title)').get(),
            '出版日期': li.css('div:nth-child(6) span::text').get(),
            '出版社': li.css('div:nth-child(6) a::text').get(),
            '原价': li.css('.price .price_r::text').get(),
            '售价': li.css('.price .price_n::text').get(),
            # Class selector needs the leading dot; the bare 'price_e' was a
            # tag-name selector, unlike every other price selector here.
            '电子书价格': li.css('.price_e span::text').get(),
            '详情页': li.css('.name a::attr(href)').get(),
        }
        print(row)
        csv_writer.writerow(row)
# Crawl ranking pages 1 through 25, sleeping one second before each request.
try:
    for page in range(1, 26):
        print(f'正在爬取第{page}页')
        time.sleep(1)
        url = f'http://bang.dangdang.com/books/bestsellers/01.00.00.00.00.00-24hours-0-0-1-{page}'
        onepage(url)
finally:
    # Close the CSV handle so buffered rows are flushed to disk even when a
    # page raises part-way through the crawl.
    f.close()