import requests
from lxml import etree
import os
from concurrent.futures import ThreadPoolExecutor
def getting_page(url):
    """Download every 4K image linked from one netbian list page.

    Fetches *url*, parses the thumbnail grid, and saves each full image
    into the local ``plmm/`` directory, named after the thumbnail's alt
    text (plus a ``.jpg`` suffix).

    :param url: list-page URL, e.g.
        ``https://pic.netbian.com/4kmeinv/index_2.html``
    :raises requests.RequestException: on network failure or timeout.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"
    }
    # timeout so a stalled connection cannot hang a worker thread forever
    resp = requests.get(url, headers=headers, timeout=30)
    # The site serves GBK but omits it from the Content-Type header, so
    # requests falls back to ISO-8859-1 and mojibakes the Chinese alt
    # texts. Declaring the real encoding once replaces the old
    # encode('iso-8859-1').decode('gbk') round-trip per file name.
    resp.encoding = "gbk"
    # Parse the page and select the <li> items of the thumbnail grid.
    html = etree.HTML(resp.text)
    lis = html.xpath("/html/body/div[2]/div/div[3]/ul/li")
    # Guarantee the output folder exists even when this function is
    # called on its own, not just via the script entry point.
    os.makedirs("plmm", exist_ok=True)
    for li in lis:
        img_src = "https://pic.netbian.com" + li.xpath("./a/img/@src")[0]
        img_name = li.xpath('./a/img/@alt')[0] + '.jpg'
        # Persist the raw image bytes to disk.
        img_data = requests.get(img_src, headers=headers, timeout=30).content
        img_path = os.path.join("plmm", img_name)
        with open(img_path, "wb") as f:
            f.write(img_data)
        print(img_name, "下载完成!")
# Create the folder the images are written into. This runs at import
# time so the directory exists before any worker thread starts writing.
# makedirs(exist_ok=True) replaces the racy exists()+mkdir() pair: it is
# a single call and does not fail if another process created the folder
# between the check and the mkdir.
os.makedirs('./plmm', exist_ok=True)
if __name__ == '__main__':
    # Fan the list-page downloads out across a pool of 40 worker
    # threads; leaving the `with` block waits for every submitted task.
    with ThreadPoolExecutor(40) as pool:
        for page in range(1, 50):  # scrape list pages 1 through 49
            page_url = f"https://pic.netbian.com/4kmeinv/index_{page}.html"
            pool.submit(getting_page, page_url)
    print("全部下载完成")
# Multithreaded image-download crawler.
# First published 2023-07-30 00:50:23.