python批量下载模库网图片

步骤与完整代码

python批量下载模库网图片

步骤:

  1. 获取页数
  2. 获取列表页
  3. 获取图片链接和名字相关字典
  4. 创建存放图片的文件夹
  5. 下载图片

代码

import requests
from lxml import etree
import urllib3
import time
import socket
import random
import ssl
import os

# Request headers for www.mcool.com.
# NOTE(review): the cookie is a captured session value and will expire;
# when it does, requests are presumably served as an anonymous visitor —
# refresh it from a browser session if scraping stops working.
headers = {
    "authority": "www.mcool.com",
    "cookie": "8I05_7875_saltkey=XyvccP22; 8I05_7875_lastvisit=1608950583; 8I05_7875_atarget=1; 8I05_7875_visitedfid=60; Hm_lvt_c8d405f8732109572fa8b064fd4a17bf=1608954184; UM_distinctid=1769d2482b56ed-029b4324d41cfd-c791e37-1fa400-1769d2482b676e; 8I05_7875_st_p=0%7C1608954220%7C03599a46a7b75c11739633e43b5eb549; 8I05_7875_viewid=tid_11065; 8I05_7875_sid=TAR7sH; CNZZDATA1278841804=1565357761-1608951168-https%253A%252F%252Fwww.baidu.com%252F%7C1608958818; 8I05_7875_lastact=1608963606%09forum.php%09forumdisplay; 8I05_7875_st_t=0%7C1608963606%7Cfd1be307c6ac5a8f534ec5021852222b; 8I05_7875_forum_lastvisit=D_60_1608963606; Hm_lpvt_c8d405f8732109572fa8b064fd4a17bf=1608963607",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.66",
    "path": "/beijing",
}
# Disable TLS certificate verification process-wide and silence the
# InsecureRequestWarning that urllib3 would otherwise print for every
# verify=False request below.
ssl._create_default_https_context = ssl._create_unverified_context
urllib3.disable_warnings()


# Get the total number of listing pages.
def get_page_num(url):
    """Fetch the listing page at *url* and return the total page count.

    The count is taken from the "last page" link of the pagination bar;
    its text may contain literal dots, which are stripped before the
    int conversion.
    """
    response = requests.get(url, headers=headers, verify=False, timeout=100)
    tree = etree.HTML(response.text)
    last_link = tree.xpath('//div[@class="pg"]//a[@class="last"]//text()')[0]
    return int(last_link.replace(".", "").strip())


# Build the listing-page URLs.
def get_page_url_list(start_num, end_num):
    """Return the listing-page URLs for pages ``start_num + 1``..``end_num``.

    Bug fix: the original looped over ``range(end_num - start_num)`` and
    appended ``i + 1``, so a non-zero ``start_num`` only changed how MANY
    pages were produced, never WHICH pages — ``(2, 5)`` yielded pages
    1-3 instead of 3-5.  Iterating ``range(start_num, end_num)`` honours
    the start page; the call actually used below, ``(0, 3)``, still
    yields pages 1, 2, 3 exactly as before.
    """
    basic = "https://www.mcool.com/beijing/"
    return [basic + str(i + 1) for i in range(start_num, end_num)]


# Collect image URLs and names from every listing page.
def get_image_dict(page_url_list):
    """Fetch each listing page and return a ``{image name: image url}`` dict.

    Pages that fail to download are recorded and skipped so one bad page
    cannot abort the whole run (best-effort scraping, as in the original);
    the exception and the offending URL are printed.

    Fixes over the original:
      * removed a leftover debug ``open("1.html", "wb")`` that clobbered
        the same file on every iteration;
      * ``socket.setdefaulttimeout`` is a process-wide default, so it is
        now set once instead of once per page;
      * results are extended into flat lists directly instead of being
        appended as nested lists and flattened afterwards;
      * dropped a redundant trailing ``continue``.

    Note: duplicate image names collapse to a single dict entry (last
    occurrence wins), same as the original ``dict(zip(...))``.
    """
    image_url_list = []
    image_name_list = []
    exception_page_url_list = []  # pages that raised, kept for inspection
    exception_question = []       # the matching exceptions
    socket.setdefaulttimeout(30)
    for page_url in page_url_list:
        try:
            res = requests.get(page_url, headers=headers, verify=False, timeout=100)
            res.close()
            # Randomised pause so we do not hammer the server.
            time.sleep(random.randint(3, 10))
            html = etree.HTML(res.text)
            # The thumbnails are lazy-loaded <img class="lazy"> tags whose
            # src/alt carry the image URL and display name.
            image_url_list.extend(html.xpath('//img[@class="lazy"]/@src'))
            image_name_list.extend(html.xpath('//img[@class="lazy"]/@alt'))
        except Exception as e:  # best-effort: log and move to the next page
            exception_page_url_list.append(page_url)
            exception_question.append(e)
            print(e)
            print(page_url)
    # zip truncates to the shorter list if the src/alt counts ever differ.
    return dict(zip(image_name_list, image_url_list))


# Create the folder that will hold the images and build the target paths.
def get_image_file_name_list(keyword, image_name_list):
    """Ensure ``./<keyword>/`` exists and return a ``.jpg`` target path for
    every image name, parallel to *image_name_list*.

    Fixes over the original:
      * paths are built with ``os.path.join`` instead of hard-coded
        Windows ``"\\"`` separators, so they are correct on any OS;
      * ``os.makedirs(..., exist_ok=True)`` replaces the racy
        ``os.path.exists`` check followed by ``os.mkdir(keyword)``
        (which also created the folder by its bare name rather than
        the ``target_dir`` actually used for the paths).
    """
    target_dir = os.path.join(os.getcwd(), keyword)
    os.makedirs(target_dir, exist_ok=True)
    return [os.path.join(target_dir, name + '.jpg') for name in image_name_list]


# Download every image to its local file.
def download_image(image_url_list, file_name_list=None, name_list=None):
    """Download each URL in *image_url_list* to its matching local path.

    Args:
        image_url_list: image URLs to fetch.
        file_name_list: target file paths, parallel to *image_url_list*.
            Defaults to the module-level ``image_file_name_list`` built by
            the script below (keeps the original call site working).
        name_list: display names, parallel to *image_url_list*.  Defaults
            to the module-level ``image_name_list``.

    Returns:
        ``{name: url}`` of the downloads that failed, so the caller can
        retry them.  Failures are printed, never raised (best-effort).

    Fixes over the original:
      * iterates with ``enumerate`` instead of ``image_url_list.index(i)``,
        which was O(n) per item and mapped every duplicate URL to the
        FIRST occurrence's file name;
      * the parallel lists are now injectable parameters rather than hard
        module-global references (backward-compatible defaults);
      * ``socket.setdefaulttimeout`` set once, redundant ``continue`` dropped.
    """
    if file_name_list is None:
        file_name_list = image_file_name_list
    if name_list is None:
        name_list = image_name_list
    exception_image_url_list = []
    exception_image_name_list = []
    socket.setdefaulttimeout(30)
    for idx, url in enumerate(image_url_list):
        try:
            res = requests.get(url, headers=headers, verify=False, timeout=100)
            # Randomised pause so we do not hammer the server.
            time.sleep(random.randint(1, 5))
            res.close()
            with open(file_name_list[idx], 'wb') as f:
                f.write(res.content)
        except Exception as e:  # best-effort: record the failure and continue
            exception_image_url_list.append(url)
            exception_image_name_list.append(name_list[idx])
            print(e)
    print(exception_image_url_list)
    print(exception_image_name_list)
    return dict(zip(exception_image_name_list, exception_image_url_list))


# ---- Script entry: scrape listing pages 1-3 and download the images ----
url = "https://www.mcool.com/beijing"
keyword = "picture"  # name of the local folder the images are saved into
# num = get_page_num(url)
# print(num)
# print(type(num))
page_url_list = get_page_url_list(0, 3)
# print(page_url_list)
# Fetch every listing page and build the {name: url} mapping (network I/O).
image_dict = get_image_dict(page_url_list)
print(image_dict)
image_name_list = list(image_dict.keys())
image_url_list = list(image_dict.values())
# Create ./picture/ and compute one .jpg path per image name.
image_file_name_list = get_image_file_name_list(keyword, image_name_list)
# download_image reads the three module-level lists built just above.
download_image(image_url_list)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值