"""50个线程爬取豆瓣读书新书速递 — scrape Douban Books' "new book express" listings with 50 threads."""
import re
import threading
from collections import Counter
from concurrent.futures import ThreadPoolExecutor

import requests
from fake_useragent import UserAgent
from parsel import Selector
# Serialize file appends: 50 worker threads write to NEWBOOK.csv concurrently
# and interleaved partial lines must be prevented.
_WRITE_LOCK = threading.Lock()


def _label_value(html: str, pattern: str) -> str:
    """Return the first regex capture for a labelled field in *html*, or ''."""
    found = re.findall(pattern, html, re.S)
    return ''.join(found) if found else ''


def _parse_book_page(html: str, title: str) -> str:
    """Parse one Douban book-detail page into a '|'-separated MARC-style record.

    Args:
        html: Raw HTML of the book's detail page.
        title: Book title taken from the listing page.

    Returns:
        'ISBN|binding|CNY price|title|authors|translators|publisher|year|pages页|series|original title'
    """
    s = Selector(html)
    # 作者 (authors): the first <span> inside #info carries the author links.
    authors = s.xpath('//*[@id="info"]/span[1]/a/text()').getall()
    m_200f = '/'.join(authors) if authors else ''
    # 译者 (translators): author names appear in BOTH xpath result sets below,
    # so anything whose combined count is exactly 2 is an author and is dropped.
    # Counter replaces the original O(n^2) list.count() scan.
    all_span_links = s.xpath('//*[@id="info"]/span/a/text()').getall()
    combined = [*authors, *all_span_links]
    counts = Counter(combined)
    translators = [name for name in combined if counts[name] != 2]
    m_200g = '/'.join(translators) if translators else ''
    # 出版社 (publisher): the stripped text node immediately after the '出版社:' label.
    raw_texts = s.xpath('//*[@id="info"]//text()').getall()
    info_texts = [t.strip() for t in raw_texts if t.strip() != ""]
    m_210c = info_texts[info_texts.index('出版社:') + 1] if '出版社:' in info_texts else ''
    # 原作名 (original title); commas are flattened to spaces so '|' record stays 1 line per field.
    m_500a = _label_value(html, r'<span class="pl">原作名:</span> (.*?)<br/>').replace(',', ' ')
    # 出版年 (year): '2021-5-1' -> '2021.5' (first '-' becomes '.', rest truncated).
    year_raw = _label_value(html, r'<span class="pl">出版年:</span> (.*?)<br/>')
    m_210d = year_raw.replace('-', '.', 1).split('-')[0] if year_raw else ''
    # 页数 (pages).
    m_215a = _label_value(html, r'<span class="pl">页数:</span> (.*?)<br/>')
    # 定价 (price): keep only digits/dots; guard against an EMPTY capture, which
    # would make float('') raise ValueError in the original code.
    price_raw = _label_value(html, r'<span class="pl">定价:</span> ([\d.]*).*?<br/>')
    m_010d = '%.2f' % float(price_raw) if price_raw else ''
    # 装帧 (binding); '平装' (paperback) is the default and is omitted from the record.
    m_010b = _label_value(html, r'<span class="pl">装帧:</span> (.*?)<br/>').replace('平装', '')
    # 丛书 (series): the text of the series link.
    m_225a = _label_value(html, r'<span class="pl">丛书:.*?">(.*?)</a>')
    # ISBN.
    m_010a = _label_value(html, r'<span class="pl">ISBN:</span> (.*?)<br/>')
    return f'{m_010a}|{m_010b}|CNY{m_010d}|{title}|{m_200f}|{m_200g}|{m_210c}|{m_210d}|{m_215a}页|{m_225a}|{m_500a}'


def get_book_info(url, title, headers):
    """Fetch one book's detail page, parse it, and append the record to NEWBOOK.csv.

    Args:
        url: Detail-page URL of the book.
        title: Book title from the listing page.
        headers: HTTP headers (User-Agent etc.) to send with the request.
    """
    # timeout prevents a stalled connection from hanging a worker thread forever.
    html = requests.get(url=url, headers=headers, timeout=30).text
    marc = _parse_book_page(html, title)
    print(marc)
    with _WRITE_LOCK:
        with open('NEWBOOK.csv', 'a', encoding='utf-8-sig', newline='') as f:
            f.write(f'{marc}\n')
def main(page):
    """Scrape one page of Douban's new-book listing and process every book on it.

    Args:
        page: 1-based page number of the listing feed.
    """
    base_url = f'https://book.douban.com/latest?subcat=%E5%85%A8%E9%83%A8&p={page}'
    # Random UA per request; gzip transfer keeps responses small (requests
    # transparently decompresses them).
    headers = {'User-Agent': UserAgent().random, 'Accept-Encoding': 'Gzip'}
    # timeout prevents a stalled listing request from hanging the worker thread.
    r = requests.get(url=base_url, headers=headers, proxies=None, timeout=30).text
    s = Selector(r)
    urls = s.xpath('//li/div[2]/h2/a/@href').getall()
    titles = s.xpath('//li/div[2]/h2/a/text()').getall()
    # zip stops at the shorter list, avoiding the IndexError the original
    # range(len(urls)) loop would raise if the two xpath results diverge.
    for url, title in zip(urls, titles):
        get_book_info(url, title, headers)
if __name__ == '__main__':
    # 50 worker threads: the job is I/O-bound, so threads overlap well under the GIL.
    with ThreadPoolExecutor(50) as pool:
        # Keep the futures: submit() silently swallows worker exceptions unless
        # result() is called on each one.
        futures = [pool.submit(main, page=page) for page in range(1, 10)]  # pages to scrape
        for future in futures:
            try:
                future.result()
            except Exception as exc:
                # Report the failed page instead of losing the traceback entirely.
                print(f'page scrape failed: {exc!r}')
    print('全部下载完成')