python批量下载模库网图片
步骤:
- 获取页数
- 获取列表页
- 获取图片链接和名字相关字典
- 创建存放图片的文件夹
- 下载图片
代码
import requests
from lxml import etree
import urllib3
import time
import socket
import random
import ssl
import os
# HTTP request headers for www.mcool.com.  The cookie and user-agent were
# captured from a real browser session so the site serves full pages to the
# scraper.  NOTE(review): the hard-coded cookie will eventually expire —
# refresh it from a browser if requests start failing.  "authority" and
# "path" look like HTTP/2 pseudo-headers; requests sends them as ordinary
# header fields.
headers = {
"authority": "www.mcool.com",
"cookie": "8I05_7875_saltkey=XyvccP22; 8I05_7875_lastvisit=1608950583; 8I05_7875_atarget=1; 8I05_7875_visitedfid=60; Hm_lvt_c8d405f8732109572fa8b064fd4a17bf=1608954184; UM_distinctid=1769d2482b56ed-029b4324d41cfd-c791e37-1fa400-1769d2482b676e; 8I05_7875_st_p=0%7C1608954220%7C03599a46a7b75c11739633e43b5eb549; 8I05_7875_viewid=tid_11065; 8I05_7875_sid=TAR7sH; CNZZDATA1278841804=1565357761-1608951168-https%253A%252F%252Fwww.baidu.com%252F%7C1608958818; 8I05_7875_lastact=1608963606%09forum.php%09forumdisplay; 8I05_7875_st_t=0%7C1608963606%7Cfd1be307c6ac5a8f534ec5021852222b; 8I05_7875_forum_lastvisit=D_60_1608963606; Hm_lpvt_c8d405f8732109572fa8b064fd4a17bf=1608963607",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.66",
"path": "/beijing",
}
# Disable certificate verification globally and silence the
# InsecureRequestWarning that verify=False requests would otherwise emit.
ssl._create_default_https_context = ssl._create_unverified_context
urllib3.disable_warnings()
def get_page_num(url):
    """Return the total number of listing pages for *url*.

    Fetches the page and reads the pager's "last page" link, whose text
    holds the final page number (leading dots are stripped before parsing).
    """
    response = requests.get(url, headers=headers, verify=False, timeout=100)
    tree = etree.HTML(response.text)
    last_link_text = tree.xpath('//div[@class="pg"]//a[@class="last"]//text()')[0]
    return int(last_link_text.replace(".", "").strip())
def get_page_url_list(start_num, end_num):
    """Build listing-page URLs for pages start_num+1 through end_num.

    Parameters:
        start_num: zero-based index of the first page (0 -> page "1").
        end_num: exclusive upper bound; the last URL is for page end_num.

    Returns:
        list of URLs like "https://www.mcool.com/beijing/3".

    Bug fix: the original ignored start_num entirely and always produced
    pages 1..(end_num - start_num).  The range now actually begins at
    start_num, so get_page_url_list(2, 5) yields pages 3, 4, 5.  The call
    used in this script, get_page_url_list(0, 3), behaves exactly as before.
    """
    basic = "https://www.mcool.com/beijing/"
    return [basic + str(i + 1) for i in range(start_num, end_num)]
def get_image_dict(page_url_list):
    """Scrape every listing page and map image name -> image URL.

    Parameters:
        page_url_list: iterable of listing-page URLs to fetch.

    Returns:
        dict mapping each image's <img alt> text to its @src URL.
        NOTE(review): duplicate alt texts collapse to one entry because of
        the final dict(zip(...)) — later pages win.
    """
    image_url_list = []
    image_name_list = []
    exception_page_url_list = []  # pages that failed to download or parse
    exception_question = []       # the exception raised for each failed page
    for i in page_url_list:
        try:
            res = requests.get(i, headers=headers, verify=False, timeout=100)
            res.close()
            socket.setdefaulttimeout(30)
            # Random pause between pages to avoid hammering the server.
            time.sleep(random.randint(3, 10))
            # Debug aid: dump the most recently fetched page to 1.html
            # (overwritten on every iteration).
            with open("1.html", 'wb') as f:
                f.write(res.content)
            html = etree.HTML(res.text)
            # Lazily-loaded images carry their URL in @src and their
            # display name in @alt on this site.
            image_url = html.xpath('//img[@class="lazy"]/@src')
            image_name = html.xpath('//img[@class="lazy"]/@alt')
            image_url_list.append(image_url)
            image_name_list.append(image_name)
        except Exception as e:
            # Record the failing page and its error, then keep going.
            exception_page_url_list.append(i)
            exception_question.append(e)
            print(e)
            print(i)
            continue
    # Flatten the per-page result lists into single flat lists.
    image_url_list = [i for k in image_url_list for i in k]
    image_name_list = [i for k in image_name_list for i in k]
    image_dict = dict(zip(image_name_list, image_url_list))
    return image_dict
def get_image_file_name_list(keyword, image_name_list):
    """Return the full destination path for every image name.

    Ensures the directory ./<keyword>/ exists under the current working
    directory, then maps each name to "<cwd>/<keyword>/<name>.jpg".

    Parameters:
        keyword: name of the folder that will hold the images.
        image_name_list: iterable of image base names (no extension).

    Returns:
        list of absolute file paths, one per input name, in order.

    Fixes vs. the original:
    - os.path.join instead of hand-built "\\" separators, so the paths
      are correct on non-Windows platforms too;
    - os.makedirs(..., exist_ok=True) replaces the exists()/mkdir pair,
      removing the check-then-create race and the mismatch between the
      path that was checked (cwd\\keyword\\) and the one created (keyword).
    """
    target_dir = os.path.join(os.getcwd(), keyword)
    os.makedirs(target_dir, exist_ok=True)
    return [os.path.join(target_dir, name + ".jpg") for name in image_name_list]
def download_image(image_url_list):
    """Download every image URL to its matching local file path.

    Relies on the module-level globals image_file_name_list (destination
    paths, same order as image_url_list) and image_name_list (used only
    for error reporting) — TODO(review): consider passing these as
    parameters instead of globals.

    Parameters:
        image_url_list: list of image URLs, aligned with the globals above.

    Returns:
        dict mapping the names of images that failed to download to their
        URLs, so the caller can retry them.

    Fix vs. the original: iterate with enumerate() instead of calling
    image_url_list.index(i) per item, which was O(n) each time and
    resolved duplicate URLs to the first occurrence, writing the wrong
    destination file.
    """
    exception_image_url_list = []
    exception_image_name_list = []
    for idx, url in enumerate(image_url_list):
        try:
            res = requests.get(url, headers=headers, verify=False, timeout=100)
            # Pause between downloads to stay polite to the server.
            time.sleep(random.randint(1, 5))
            res.close()
            socket.setdefaulttimeout(30)
            with open(image_file_name_list[idx], 'wb') as f:
                f.write(res.content)
        except Exception as e:
            # Remember the failure and carry on with the next image.
            exception_image_url_list.append(url)
            exception_image_name_list.append(image_name_list[idx])
            print(e)
            continue
    print(exception_image_url_list)
    print(exception_image_name_list)
    exception_image_dict = dict(zip(exception_image_name_list, exception_image_url_list))
    return exception_image_dict
# --- Script driver: scrape listing pages 1-3 and download their images ---
url = "https://www.mcool.com/beijing"  # NOTE(review): unused — get_page_num(url) is never called
keyword = "picture"  # output folder name under the current directory
page_url_list = get_page_url_list(0, 3)
# Fetch each listing page and collect {image name: image URL}.
image_dict = get_image_dict(page_url_list)
print(image_dict)
image_name_list = list(image_dict.keys())
image_url_list = list(image_dict.values())
# Build destination paths (this also creates the folder), then download.
image_file_name_list = get_image_file_name_list(keyword, image_name_list)
download_image(image_url_list)