Python打包多线程下载（windows）

江湖不当人

已于 2023-09-04 11:07:37 修改

阅读量767

点赞数

文章标签： python 爬虫 windows

于 2023-08-16 16:04:02 首次发布

本文链接：https://blog.csdn.net/Big_Data_Legend/article/details/132320818

版权

目标网站：https://www.igdcc.com/4Kmeinv

目标产出：抓取第一页中的所有图片。

网站分析：无任何反爬策略，适合新手熟悉相关包的使用。

导入所需的包（标准库以及第三方库 requests、lxml），并设置最大并发线程数

import os
import sys
import random
import time
import requests
import threading
from lxml import etree

# Parent of the directory that contains this source file (left as an
# un-normalized "<dir>/.." path).
real_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..')
# Upper bound on the number of downloads allowed to run at the same time.
max_connections = 10
# Semaphore with max_connections slots, shared by the download threads;
# being bounded, it raises ValueError if released more often than acquired.
pool_sema = threading.BoundedSemaphore(max_connections)

抓取网站第一页数据

def get_text(url):
    """Fetch ``url`` and return the response body as text.

    Makes up to three attempts, each with a 10-second timeout. An attempt
    counts as failed when the request raises or the status code is not 200.

    Args:
        url: Address of the page to download.

    Returns:
        The response text, or ``None`` when every attempt failed (a failure
        message is printed in that case).
    """
    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            resp = requests.get(url, timeout=10)
        except requests.RequestException:
            # Network-level failure (timeout, connection error, ...): retry.
            resp = None
        if resp is not None and resp.status_code == 200:
            return resp.text
        # Pause only when another attempt follows; sleeping after the final
        # failure would merely delay the error report.
        if attempt < max_attempts - 1:
            time.sleep(5)
    print('抓取失败：', url)
    return None

获取图片的url

def get_image_urls(text):
    """Extract image URLs from the HTML of a listing page.

    Looks at every ``<a>`` directly inside the
    ``div.mt15.clearfix.pic-auto.pic-list`` container and takes the first
    ``data-original`` attribute found at or below that anchor.

    Args:
        text: HTML source of the page; may be ``None`` or empty.

    Returns:
        A list of URL strings in document order. The list is empty when
        ``text`` is empty or nothing matches, so callers can always iterate
        or take ``len()`` of the result.
    """
    if not text:
        return []
    root = etree.HTML(text)
    if root is None:
        # NOTE(review): the parser appears to yield no root for some
        # degenerate documents; treat that the same as "no images".
        return []
    anchors = root.xpath('//*/div[@class="mt15 clearfix pic-auto pic-list"]/a')
    ret = []
    for anchor in anchors:
        urls = anchor.xpath('.//@data-original')
        if urls:
            ret.append(urls[0])
    return ret

保存图片（使用信号量限制同时下载的线程数）

def get_content(url):
    """Fetch ``url`` and return the raw response body.

    Makes up to three attempts, each with a 10-second timeout. An attempt
    counts as failed when the request raises or the status code is not 200.

    Args:
        url: Address of the resource (e.g. an image) to download.

    Returns:
        The response body as ``bytes``, or ``None`` when every attempt
        failed (a failure message is printed in that case).
    """
    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            resp = requests.get(url, timeout=10)
        except requests.RequestException:
            # Network-level failure (timeout, connection error, ...): retry.
            resp = None
        if resp is not None and resp.status_code == 200:
            return resp.content
        # Pause only when another attempt follows; sleeping after the final
        # failure would merely hold the caller up for nothing.
        if attempt < max_attempts - 1:
            time.sleep(5)
    # Report the failure instead of giving up silently.
    print('抓取失败：', url)
    return None



def save_image(output_file, image_url):
    """Download one image and write it to disk, throttled by ``pool_sema``.

    Args:
        output_file: Path of the file to create or overwrite.
        image_url: Address of the image to download.

    Returns:
        None. Nothing is written when the download fails or is empty.
    """
    # The ``with`` block guarantees the semaphore slot is handed back on
    # every exit path (failed download, I/O error, normal completion).
    # A bare acquire()/release() pair leaked the slot on the early return,
    # which could leave the remaining threads blocked forever.
    with pool_sema:
        content = get_content(image_url)
        if not content:
            return
        with open(output_file, 'wb') as fw:
            fw.write(content)
        # Random sub-second pause while still holding the slot, to space
        # out requests to the server.
        time.sleep(random.random())

主函数

def start():
    """Download every image on the first listing page into a ``data`` folder.

    The folder is created next to the script named by ``sys.argv[0]``.
    Images are saved as ``0.png``, ``1.png``, ... in page order, one thread
    per image; the function returns once all threads have finished.
    """
    url = 'https://www.igdcc.com/4Kmeinv/'
    # NOTE(review): sys.argv[0] rather than __file__ is presumably chosen so
    # the folder ends up beside the packaged executable -- confirm.
    output_dir = os.path.join(os.path.dirname(sys.argv[0]), 'data')
    if not os.path.exists(output_dir):
        # Create the directory directly instead of shelling out to `mkdir`,
        # which broke on paths containing spaces and never checked the result.
        os.makedirs(output_dir, exist_ok=True)
        print('数据目录创建成功：data')
    print('只会抓取第一页哦>_<')
    text = get_text(url)
    # get_image_urls may hand back None when the page could not be fetched.
    image_urls = get_image_urls(text) or []
    if not image_urls:
        print('未获取到任何图片地址')
        return
    print('开始抓取图片~')
    thread_list = [
        threading.Thread(
            target=save_image,
            args=(os.path.join(output_dir, str(index) + '.png'), image_url),
        )
        for index, image_url in enumerate(image_urls)
    ]
    for th in thread_list:
        th.start()
    for th in thread_list:
        th.join()