import requests
from lxml import etree
from multiprocessing import Pool
from re import findall
import json
import os
from threading import Thread
# Queue for communication between threads
from queue import Queue
# Queue for communication between processes (this import shadows the one above)
from multiprocessing import Queue
# time and randint are used by the download demos at the bottom of the file
import time
from random import randint


def get_index_page():
    """
    Fetch the detail-page URL of every chapter.
    :return: the URLs of all chapters
    """
    url = 'https://www.sisimanhua.com/manhua/feichaizhongshengzhiwoyaodangdalao/'
    response = requests.get(url)
    if response.status_code == 200:
        html = etree.HTML(response.text)
        all_section_url = html.xpath('//ul[@id="chapter-list-1"]/li/a/@href')
        return ['https://www.sisimanhua.com' + url for url in all_section_url]
    else:
        print('Failed to fetch the index page!', response)
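
# For reference, the XPath above targets markup shaped roughly like this
# (a hypothetical sketch inferred from the selector, not copied from the site):
# <ul id="chapter-list-1">
#     <li><a href="/manhua/.../0001.html">Chapter 1</a></li>
#     ...
# </ul>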


def get_all_details_data(url):
    response = requests.get(url)
    if response.status_code != 200:
        print(f'Request for {url} failed!')
        return
    # Extract the address of every image in this chapter
    re_str = r'(?s)<script>.+?var siteUrl = "(.+?)".+?chapterImages = \[(.+?)\]'
    # print(response.text)
    result = findall(re_str, response.text)
    if not result:
        print(f'No image list found on {url}!')
        return
    image_urls = json.loads(f'[{result[0][1]}]')
    image_urls = ['https://images.sisimanhua.com' + x for x in image_urls]
    return download_save_image2(image_urls)
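
# What the regular expression captures, shown on a hypothetical page source
# (the snippet below is illustrative, not copied from the real site):
#     <script>var siteUrl = "https://www.sisimanhua.com";
#     var chapterImages = ["/comic/1/1.webp","/comic/1/2.webp"]</script>
# findall returns one (siteUrl, list-body) tuple per match, so result[0][1]
# is the quoted file list, which becomes valid JSON again once re-wrapped
# in brackets: json.loads('["/comic/1/1.webp","/comic/1/2.webp"]').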


# 2. Download all of a chapter's images with multiple threads
def download_save_one_image(url):
    section = url.split('/')[-2]
    response = requests.get(url)
    if response.status_code != 200:
        print('Image download failed!')
        return
    # Create the chapter folder if it does not exist yet
    if not os.path.exists(f'./files/{section}'):
        os.makedirs(f'./files/{section}')
    file_name = url.split('/')[-1].replace('webp', 'jpg')
    with open(f'files/{section}/{file_name}', 'wb') as f:
        f.write(response.content)
    print('Image downloaded!')


def download_save_image2(image_urls):
    for url in image_urls:
        t = Thread(target=download_save_one_image, args=(url,))
        t.start()


# 1. Download all of a chapter's images in a single thread
def download_save_image(image_urls):
    section = image_urls[0].split('/')[-2]
    # Download the images one by one
    for url in image_urls:
        response = requests.get(url)
        if response.status_code != 200:
            print('Image download failed!')
            continue
        # Create the chapter folder if it does not exist yet
        if not os.path.exists(f'./files/{section}'):
            os.makedirs(f'./files/{section}')
        file_name = url.split('/')[-1].replace('webp', 'jpg')
        with open(f'files/{section}/{file_name}', 'wb') as f:
            f.write(response.content)
        print('Image downloaded!')
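

# download_save_image2 (above) starts its threads and returns right away, so
# the process-pool worker can report completion before every file is written.
# A sketch of a joining variant that blocks until all downloads finish;
# download_save_image3 is an added name, not part of the original code:
def download_save_image3(image_urls):
    threads = [Thread(target=download_save_one_image, args=(url,)) for url in image_urls]
    for t in threads:
        t.start()
    for t in threads:
        t.join()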


if __name__ == '__main__':
    urls = get_index_page()
    pool = Pool(5)
    # pool.map returns a list holding each task's return value
    result = pool.map(get_all_details_data, urls)
    pool.close()
    print('=================done===================')
    # result is a list of None here, because download_save_image2 returns nothing
    print(result)
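
# The same dispatch, sketched with Pool as a context manager (equivalent to
# the block above; the pool is terminated automatically on exit):
# if __name__ == '__main__':
#     with Pool(5) as pool:
#         result = pool.map(get_all_details_data, get_index_page())
#     print(result)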


q = Queue()


def download(url: str, q):
    print(f'downloading {url}')
    time.sleep(randint(1, 3))
    result = 'data' + url.split('.')[-2]
    q.put(result)


def download2(url: str):
    print(f'downloading {url}')
    time.sleep(randint(1, 3))
    result = 'data' + url.split('.')[-2]
    return result


if __name__ == '__main__':
    # 1. Multiple processes
    # ps = []
    # for x in range(10):
    #     p = Process(target=download, args=(f'https://www.{x}.com', q))
    #     p.start()
    #     ps.append(p)
    #
    # for p in ps:
    #     p.join()
    #
    # q.put('end')
    # while True:
    #     data = q.get()
    #     if data == 'end':
    #         break
    #     print(data)

    # 2. Process pool
    pool = Pool(4)
    result = pool.map(download2, [f'https://www.{x}.com' for x in range(10)])
    print(result)
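

# A runnable sketch of the commented-out Process + Queue pattern above: each
# worker pushes its result into a multiprocessing Queue and the parent drains
# it until a sentinel arrives. _queue_demo is an added helper, not part of
# the original code.
def _queue_demo():
    from multiprocessing import Process
    result_q = Queue()
    ps = []
    for x in range(10):
        p = Process(target=download, args=(f'https://www.{x}.com', result_q))
        p.start()
        ps.append(p)
    for p in ps:
        p.join()
    result_q.put('end')  # sentinel: no more results
    while True:
        data = result_q.get()
        if data == 'end':
            break
        print(data)


if __name__ == '__main__':
    _queue_demo()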