# Multithreaded image crawler (爬虫多线程爬取图片)

import os
import sys
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin

import requests
from lxml import etree


def get_page_content(url):
    """Fetch the HTML content of the given URL.

    Args:
        url: The page URL to download.

    Returns:
        The response body as text on HTTP 200, otherwise None.
    """
    try:
        # A timeout prevents one hung connection from stalling a worker
        # thread forever (this is called from a ThreadPoolExecutor).
        response = requests.get(url, timeout=10)
    except requests.RequestException as e:
        # Network errors previously propagated and killed the calling thread.
        print(f"Failed to retrieve content from {url}: {e}")
        return None
    if response.status_code == 200:
        return response.text
    print(f"Failed to retrieve content from {url}")
    return None


def parse_image_urls(html_content):
    """Parse a search-result page into absolute detail-page URLs.

    Args:
        html_content: HTML text of a netbian search-result page.

    Returns:
        A list of absolute URLs for the per-image detail pages.
    """
    tree = etree.HTML(html_content)
    href_list = tree.xpath('//div[contains(@class,"list")]//ul/li//a/@href')
    # urljoin correctly joins hrefs that start with "/"; the original
    # f-string concatenation produced "http://www.netbian.com//desk/...".
    return [urljoin('http://www.netbian.com/', href) for href in href_list]

def get_image_url_from_detail_page(detail_url):
    """Extract the full-size image URL from an image detail page.

    Args:
        detail_url: URL of the image detail page.

    Returns:
        The image's src URL, or None if the page could not be fetched
        or the expected <img> element is missing.
    """
    html_content = get_page_content(detail_url)
    if not html_content:
        return None
    tree = etree.HTML(html_content)
    # Guard the lookup: the original indexed [0] unconditionally and
    # raised IndexError whenever the page layout did not match the xpath.
    matches = tree.xpath('/html/body/div[2]/div[2]/div[3]/div/p/a/img/@src')
    return matches[0] if matches else None

def get_image_urls(search_url, max_workers=10):
    """Collect all full-size image URLs from one search-result page.

    Detail pages are fetched concurrently with a thread pool, since the
    work is I/O-bound.

    Args:
        search_url: URL of the search-result page to crawl.
        max_workers: Maximum number of concurrent fetch threads.

    Returns:
        A list of image URLs; empty if the search page could not be
        fetched (the original returned None here, which forced every
        caller to special-case it).
    """
    start_time = time.time()
    html_content = get_page_content(search_url)
    if not html_content:
        return []
    detail_urls = parse_image_urls(html_content)
    print("正在爬取,请等待..........")
    # Fan the detail-page fetches out over a thread pool; exiting the
    # `with` block waits for all submitted tasks to finish.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = executor.map(get_image_url_from_detail_page, detail_urls)
    end_time = time.time()
    print('用时' + str(end_time - start_time))
    # Drop None results from detail pages that failed or had no image.
    return [url for url in results if url]




def download_image(url, output_folder, image_name):
    """Download a single image and save it to disk (thread worker).

    Args:
        url: Direct URL of the image.
        output_folder: Directory the image is written into.
        image_name: File name to save the image under.
    """
    image_path = os.path.join(output_folder, image_name)
    try:
        # Use the response as a context manager so the streamed connection
        # is closed deterministically; the original leaked it until GC.
        with requests.get(url, stream=True, timeout=10) as response:
            if response.status_code == 200:
                with open(image_path, 'wb') as file:
                    for chunk in response.iter_content(1024):
                        file.write(chunk)
                print(f'{image_name} has been saved.')
            else:
                print(f'Failed to retrieve image from {url}')
    except (requests.RequestException, OSError) as e:
        # Narrowed from a bare `except Exception` to the errors this
        # block can actually raise (network failures, disk failures).
        print(f'Error downloading {url}: {e}')


def main():
    """Crawl 10 search-result pages, then download every image found."""
    # Collect and flatten the per-page URL lists explicitly. The original
    # appended each page's list (or None on failure) and then called
    # np.array(...).flatten(), which does NOT flatten a ragged list of
    # lists — it builds a 1-D object array of lists — and crashed on the
    # None entries downstream.
    image_urls = []
    for page in range(10):
        page_urls = get_image_urls(
            f'http://www.netbian.com/e/search/result/index.php?page={page}&searchid=108'
        )
        if page_urls:
            image_urls.extend(page_urls)
    print(image_urls)

    # Ensure the output folder exists (exist_ok avoids the
    # check-then-create race of the original).
    output_folder = './images'
    os.makedirs(output_folder, exist_ok=True)

    # One download thread per image; downloads are I/O-bound.
    threads = []
    for index, url in enumerate(image_urls, start=1):
        # Zero-padded sequential file names: 001.jpg, 002.jpg, ...
        image_name = f'{str(index).zfill(3)}.jpg'
        thread = threading.Thread(target=download_image, args=(url, output_folder, image_name))
        thread.start()
        threads.append(thread)

    # Wait for every download to finish before exiting.
    for thread in threads:
        thread.join()

    print("All photos have been stored. Exiting program.")
    sys.exit()


if __name__ == "__main__":
    main()

  • 2
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

挽风起苍岚

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值