# ===========写给入门小白看的========================
# ==================直接上代码=========================
# ************************************************************************************
# 豆瓣读书 新书速递
import requests
from parsel import Selector
import re
# from time import sleep
def main():
    """Crawl the Douban "latest books" listing and export every book found.

    Each listing page in the hard-coded page range is downloaded, the
    detail-page link and the title text of every entry are read from it, and
    both are handed to ``get_all`` together with the request headers.

    Exceptions raised by ``requests`` (connection errors, timeouts) are not
    caught here and abort the run.
    """
    # The headers never change, so build them once instead of once per page.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'
    }
    # The listing usually has 13, 14 or 15 pages; open the site first to see
    # how many there are and adjust the upper bound (currently pages 1-2).
    for page in range(1, 3):
        # sleep(2)
        url = f'https://book.douban.com/latest?subcat=%E5%85%A8%E9%83%A8&p={page}'
        # Without a timeout a stalled connection would block forever.
        resp = requests.get(url=url, headers=headers, timeout=10)
        selec = Selector(resp.text)
        # Read the link and the title from the same <a> node so the two values
        # can never get out of step (two parallel getall() lists could).
        for anchor in selec.xpath('//li/div[2]/h2/a'):
            href = anchor.xpath('@href').get()
            if not href:
                # An anchor without a link has no detail page to fetch.
                continue
            m_200a = anchor.xpath('text()').get()
            get_all(href, m_200a, headers)
def _joined_matches(pattern, html):
    """Return every capture of ``pattern`` in ``html`` joined into one string, or None if nothing matches."""
    found = re.findall(pattern, html, re.S)
    return ''.join(found) if found else None


def _csv_row(fields):
    """Format ``fields`` as a single CSV record without the trailing newline."""
    import csv
    import io

    buffer = io.StringIO()
    # str() keeps the historical 'None' placeholder for missing values, while
    # csv.writer quotes any field containing a comma, quote or line break.
    csv.writer(buffer, lineterminator='\n').writerow([str(value) for value in fields])
    # Drop the single terminator added by writerow; ``save`` appends its own.
    return buffer.getvalue()[:-1]


def get_all(url, m_200a, headers):
    """Download one book detail page, extract its fields and save them as a CSV record.

    The record columns are, in order: ISBN, binding, price, title, authors,
    translators, publisher, publication date, page count, series and original
    title. A field that is not found on the page is written as ``None``.

    Args:
        url: Address of the book's detail page.
        m_200a: Title of the book, as read from the listing page.
        headers: HTTP headers sent with the request.

    Exceptions raised by ``requests`` (connection errors, timeouts) propagate
    to the caller.
    """
    # Without a timeout a stalled connection would block forever.
    r = requests.get(url=url, headers=headers, timeout=10).text
    s = Selector(r)

    # Authors: the links inside the first <span> of the info box.
    marc_200f = s.xpath('//*[@id="info"]/span[1]/a/text()').getall()
    m_200f = '/'.join(marc_200f) if marc_200f else None

    # Translators: the second query also matches the first span, so each
    # author normally occurs exactly twice in the combined list; dropping the
    # names that occur exactly twice leaves the remaining contributors.
    # NOTE(review): presumably those are the translators - confirm against the page markup.
    marc_200f_g = s.xpath('//*[@id="info"]/span/a/text()').getall()
    marc_200fg = [*marc_200f, *marc_200f_g]
    marc_200g = [name for name in marc_200fg if marc_200fg.count(name) != 2]
    m_200g = '/'.join(marc_200g) if marc_200g else None

    # Publisher.
    m_210c = _joined_matches('<span class="pl">出版社:</span>.*?">(.*?)</a>', r)

    # Original title; commas are replaced with spaces, as before.
    m_500a = _joined_matches('<span class="pl">原作名:</span> (.*?)<br/>', r)
    if m_500a is not None:
        m_500a = m_500a.replace(',', ' ')

    # Publication date, with '-' separators turned into '.'.
    m_210d = _joined_matches('<span class="pl">出版年:</span> (.*?)<br/>', r)
    if m_210d is not None:
        m_210d = m_210d.replace('-', '.')

    # Page count.
    m_215a = _joined_matches('<span class="pl">页数:</span> (.*?)<br/>', r)

    # Price, without the currency character.
    m_010d = _joined_matches('<span class="pl">定价:</span> (.*?)<br/>', r)
    if m_010d is not None:
        m_010d = m_010d.replace('元', '')

    # Binding.
    m_010b = _joined_matches('<span class="pl">装帧:</span> (.*?)<br/>', r)

    # Series.
    m_225a = _joined_matches('<span class="pl">丛书:.*?">(.*?)</a>', r)

    # ISBN.
    m_010a = _joined_matches('<span class="pl">ISBN:</span> (.*?)<br/>', r)

    # Build the record with real CSV quoting so that a comma or quote inside a
    # title, name or price cannot shift the columns.
    marc = _csv_row([
        m_010a, m_010b, m_010d, m_200a, m_200f, m_200g,
        m_210c, m_210d, m_215a, m_225a, m_500a,
    ])
    print(marc)
    save(marc)
def save(marc):
    """Append one record line to ``BOOK.csv`` in the current working directory.

    Args:
        marc: The record text without a trailing newline; a single ``\\n`` is
            appended here.
    """
    # 'utf-8-sig' writes a byte-order mark when the file is first created
    # (presumably so spreadsheet programs detect UTF-8); newline='' stops the
    # '\n' from being translated to the platform line ending.
    with open('BOOK.csv', 'a', encoding='utf-8-sig', newline='') as out:
        out.write('{}\n'.format(marc))
if __name__ == '__main__':
    # Run the crawl only when the file is executed as a script, then print a
    # completion message (the text reads "download complete").
    main()
    print('=====下载完成======')
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++