import os
import sys
import time
from concurrent.futures import ThreadPoolExecutor
import requests
from lxml import etree
import threading
def get_page_content(url):
    """Fetch the HTML content of the given URL.

    Returns the response body as text on HTTP 200, or None on any
    non-200 status or network error (timeout, connection failure).
    """
    try:
        # timeout prevents a hung connection from blocking a worker thread forever
        response = requests.get(url, timeout=10)
    except requests.RequestException as e:
        print(f"Failed to retrieve content from {url}")
        return None
    if response.status_code == 200:
        return response.text
    else:
        print(f"Failed to retrieve content from {url}")
        return None
def parse_image_urls(html_content):
    """Parse a search-result page and build the detail-page URL list.

    Extracts every href under the listing container and prefixes it
    with the site root to form an absolute URL.
    """
    doc = etree.HTML(html_content)
    hrefs = doc.xpath('//div[contains(@class,"list")]//ul/li//a/@href')
    return [f'http://www.netbian.com/{href}' for href in hrefs]
def get_image_url_from_detail_page(detail_url):
    """Fetch a wallpaper detail page and extract the full-size image URL.

    Returns the image ``src`` attribute, or None when the page could not
    be fetched or no image exists at the expected xpath location.
    """
    html_content = get_page_content(detail_url)
    if not html_content:
        return None
    tree = etree.HTML(html_content)
    # The xpath may match nothing (layout change, ad page); indexing [0]
    # unguarded would raise IndexError inside the thread pool.
    matches = tree.xpath('/html/body/div[2]/div[2]/div[3]/div/p/a/img/@src')
    return matches[0] if matches else None
def get_image_urls(search_url, max_workers=10):
    """Collect full-size image URLs from one search-result page, in parallel.

    Fetches the listing page, then resolves each detail page concurrently
    with a thread pool (I/O-bound, so threads overlap the network waits).

    Returns a list of image URLs; an empty list when the listing page
    could not be fetched (instead of None, so callers can aggregate
    results without a None check).
    """
    start_time = time.time()
    html_content = get_page_content(search_url)
    if not html_content:
        return []
    image_url_list = parse_image_urls(html_content)
    print("正在爬取,请等待..........")
    # Fan out the detail-page requests across the pool
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = executor.map(get_image_url_from_detail_page, image_url_list)
        end_time = time.time()
        print('用时' + str(end_time - start_time))
        # Drop detail pages that yielded no image URL
        return [url for url in results if url]
def download_image(url, output_folder, image_name):
    """Download one image to ``output_folder/image_name`` (thread worker).

    Streams the response to disk in 1 KiB chunks. All errors are reported
    to stdout rather than raised, so a failed download never kills its
    thread or the whole run.
    """
    image_path = os.path.join(output_folder, image_name)
    try:
        # `with` closes the streamed response, releasing the connection
        # back to the pool (the original leaked it).
        with requests.get(url, stream=True, timeout=10) as response:
            if response.status_code == 200:
                with open(image_path, 'wb') as file:
                    for chunk in response.iter_content(1024):
                        file.write(chunk)
                print(f'{image_name} has been saved.')
            else:
                print(f'Failed to retrieve image from {url}')
    except Exception as e:
        print(f'Error downloading {url}: {e}')
def main():
    """Crawl 10 search-result pages and download every wallpaper found.

    Bug fixed: the original flattened the per-page URL lists with
    ``np.array(...).flatten()``, which on a ragged list-of-lists yields
    an object array of lists (or raises in modern NumPy) — the download
    loop then received lists, not URL strings. It also crashed when a
    page fetch returned None. Plain ``extend`` handles both, with no
    numpy dependency.
    """
    image_urls = []
    for page in range(10):
        page_urls = get_image_urls(
            f'http://www.netbian.com/e/search/result/index.php?page={page}&searchid=108')
        if page_urls:  # skip pages that failed to fetch
            image_urls.extend(page_urls)
    print(image_urls)
    # Folder where downloaded images are stored
    output_folder = './images'
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    # One download thread per image (I/O-bound, so threads overlap waits)
    threads = []
    for index, url in enumerate(image_urls, start=1):
        # Zero-padded sequential file names: 001.jpg, 002.jpg, ...
        image_name = f'{str(index).zfill(3)}.jpg'
        thread = threading.Thread(target=download_image, args=(url, output_folder, image_name))
        thread.start()
        threads.append(thread)
    # Wait for every download to finish before exiting
    for thread in threads:
        thread.join()
    print("All photos have been stored. Exiting program.")
    sys.exit()
if __name__ == "__main__":
main()
# 爬虫多线程爬取图片 (multi-threaded image crawler)
# NOTE: leftover article metadata from the page this code was copied from —
# "最新推荐文章于 2024-06-30 23:42:19 发布" — kept as a comment so the file parses.