# -*-coding:utf-8-*-
import os
import shutil
import sys
import threading

import lxml.html
import requests
# Shared accumulator of scraped image URLs: filled by the get_url_* helpers,
# then split round-robin across download threads in the __main__ block.
list_href = []
class Download(object):
    """Download a list of image URLs into an output directory.

    Each instance is intended to run on its own thread via ``downJpgList``
    (the __main__ block creates one instance per worker thread).
    """
    # Class-level counter; not used by the code visible here.
    current_num = 0

    def __init__(self, output, hf_list):
        """
        :param output: directory the images are written into
        :param hf_list: list of image URLs to fetch
        """
        self.output = output
        self.hf_list = hf_list
        self._value_lock = threading.Lock()

    def downJpgList(self):
        """Fetch every URL in ``self.hf_list`` and save it under ``self.output``."""
        for jpg_url in self.hf_list:
            print(jpg_url)
            res = requests.get(jpg_url)
            # BUG FIX: the original wrote into the module-global `output`
            # instead of self.output, silently coupling the class to the
            # script body and ignoring the constructor argument.
            path = os.path.join(self.output, os.path.basename(jpg_url))
            # `with` guarantees the file handle is closed even if a chunk
            # read raises mid-download.
            with open(path, 'wb') as image_file:
                for chunk in res.iter_content(100000):
                    image_file.write(chunk)
def get_url_download(url_page, current_num, total_num):
    """Collect image URLs from pages 2..total_num of a paginated article.

    Page N is derived from ``url_page`` by inserting ``_N`` before the file
    extension (e.g. ``foo.html`` -> ``foo_2.html``).  All scraped ``src``
    attributes are appended to the module-level ``list_href``.

    :param url_page: URL of the first page of the article
    :param current_num: page number that has already been fetched (normally 1)
    :param total_num: total number of pages reported by the site
    """
    global list_href
    base, ext = url_page.rsplit('.', maxsplit=1)
    # Hoisted out of the loop: the template is constant across iterations.
    template = base + '_%s.' + ext
    # BUG FIX: the original condition was `<=`, which incremented past the
    # last page and requested the non-existent page total_num + 1.
    while current_num < total_num:
        current_num += 1
        s_content = requests.get(template % current_num)
        tree_html = lxml.html.fromstring(s_content.text)
        href = [img.get('src') for img in tree_html.cssselect('.articleBody a img')]
        list_href.extend(href)
def get_url_first_download(url_page):
    """Scrape image URLs from the first article page into ``list_href``.

    :param url_page: URL of the first page of the article
    """
    global list_href
    response = requests.get(url_page)
    document = lxml.html.fromstring(response.text)
    image_nodes = document.cssselect('.articleBody a img')
    list_href.extend(node.get('src') for node in image_nodes)
def rand_generate(length=12):
    """Return a random alphanumeric name of ``length`` distinct characters.

    Characters are drawn without replacement from [a-zA-Z0-9], so
    ``length`` must be <= 62.

    :param length: number of characters to generate (default 12, matching
        the original behaviour)
    :return: the generated string
    """
    import random
    import string
    # string constants replace the hand-built chr() ranges of the original
    alphabet = string.ascii_lowercase + string.ascii_uppercase + string.digits
    # random.sample picks WITHOUT replacement, so every character is unique
    return ''.join(random.sample(alphabet, length))
def view_bar(num=1, sum=100, bar_word=':'):
    """Render a crude textual progress bar on stdout (fd 1).

    :param num: current progress count; also the bar length in characters
    :param sum: total count progress is measured against
        (NOTE: shadows the builtin ``sum``; name kept for interface
        compatibility with existing keyword callers)
    :param bar_word: character repeated to draw the bar
    """
    rate_num = int(float(num) / float(sum) * 100)
    # Build the whole line first: one os.write syscall instead of one per
    # bar character as in the original.
    line = '\r%{}:'.format(rate_num) + bar_word * num
    # gbk encoding preserved from the original (Windows/Chinese console)
    os.write(1, bytes(line, 'gbk'))
    sys.stdout.flush()
if __name__ == '__main__':
    import sys, getopt, random
    # If no output name was supplied, fall back to a random file name.
    outputfile = rand_generate()
    # BUG FIX: getopt long-option names must NOT include the leading "--";
    # the original ["--name=", "--output="] could never match --name/--output.
    opts, args = getopt.getopt(sys.argv[1:], "hn:o:", ["name=", "output="])
    for opt, arg in opts:
        if opt == '-h':
            print('test.py -n <name> -o <outputfile> url_page')
            sys.exit()
        elif opt in ("-n", "--name"):
            model_name = arg
        elif opt in ("-o", "--output"):
            outputfile = arg
    url_page = args[0]
    print('outputfile ', outputfile, 'url_page ', url_page)
    # Raw string so the Windows-path backslashes are unambiguously literal.
    output = os.path.join(r'I:\chuan\down', outputfile)
    # Start from a clean output directory.
    if os.path.exists(output):
        shutil.rmtree(output)
    os.mkdir(output)
    ss = requests.get(url_page)
    ss.encoding = 'utf-8'
    tree = lxml.html.fromstring(ss.text)
    # The pagination widget text looks like "current/total".
    num = tree.cssselect('.pages > span')[0].text_content()
    nums = str(num).split("/")
    current_num = int(nums[0])
    total_num = int(nums[1])
    get_url_first_download(url_page)
    get_url_download(url_page, current_num, total_num)
    # Fan the collected URLs out over 5 worker threads (round-robin split).
    downloadThreads = []
    for i in range(5):
        download = Download(output, list_href[i::5])
        downloadThread = threading.Thread(target=download.downJpgList)
        downloadThreads.append(downloadThread)
        downloadThread.start()
    for downloadThread in downloadThreads:
        downloadThread.join()
    print('ok')
# 多线程爬取kx1d图片
# 本文介绍了一个使用Python实现的多线程图片下载器。该下载器能够从指定网页抓取图片链接,并通过多线程方式高效下载这些图片到本地指定目录。文中详细展示了如何解析网页内容获取图片URL,以及如何利用线程池来加速下载过程。
# 摘要由CSDN通过智能技术生成