import os
import sys
import time
from concurrent.futures import ThreadPoolExecutor
import requests
from lxml import etree
import threading
def get_page_content(url):
    """Fetch the HTML content of the given URL.

    Returns the response body as text on HTTP 200, or None on any
    non-200 status or network error (timeout, connection failure).
    """
    try:
        # timeout prevents a hung connection from blocking a worker thread forever
        response = requests.get(url, timeout=10)
    except requests.RequestException as e:
        print(f"Failed to retrieve content from {url}")
        return None
    if response.status_code == 200:
        return response.text
    else:
        print(f"Failed to retrieve content from {url}")
        return None
def parse_image_urls(html_content):
    """Parse a search-result page and build the detail-page URL list.

    Extracts every href under the listing container and prefixes it
    with the site root to form an absolute URL.
    """
    doc = etree.HTML(html_content)
    hrefs = doc.xpath('//div[contains(@class,"list")]//ul/li//a/@href')
    return [f'http://www.netbian.com/{href}' for href in hrefs]
def get_image_url_from_detail_page(detail_url):
    """Fetch a wallpaper detail page and extract the full-size image URL.

    Returns the image ``src`` attribute, or None when the page could not
    be fetched or no image exists at the expected xpath location.
    """
    html_content = get_page_content(detail_url)
    if not html_content:
        return None
    tree = etree.HTML(html_content)
    # The xpath may match nothing (layout change, ad page); indexing [0]
    # unguarded would raise IndexError inside the thread pool.
    matches = tree.xpath('/html/body/div[2]/div[2]/div[3]/div/p/a/img/@src')
    return matches[0] if matches else None
def get_image_urls(search_url, max_workers=10):
    """Collect full-size image URLs from one search-result page, in parallel.

    Fetches the listing page, then resolves each detail page concurrently
    with a thread pool (I/O-bound, so threads overlap the network waits).

    Returns a list of image URLs; an empty list when the listing page
    could not be fetched (instead of None, so callers can aggregate
    results without a None check).
    """
    start_time = time.time()
    html_content = get_page_content(search_url)
    if not html_content:
        return []
    image_url_list = parse_image_urls(html_content)
    print("正在爬取,请等待..........")
    # Fan out the detail-page requests across the pool
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = executor.map(get_image_url_from_detail_page, image_url_list)
        end_time = time.time()
        print('用时' + str(end_time - start_time))
        # Drop detail pages that yielded no image URL
        return [url for url in results if url]
def download_image(url, output_folder, image_name):
    """Download one image to ``output_folder/image_name`` (thread worker).

    Streams the response to disk in 1 KiB chunks. All errors are reported
    to stdout rather than raised, so a failed download never kills its
    thread or the whole run.
    """
    image_path = os.path.join(output_folder, image_name)
    try:
        # `with` closes the streamed response, releasing the connection
        # back to the pool (the original leaked it).
        with requests.get(url, stream=True, timeout=10) as response:
            if response.status_code == 200:
                with open(image_path, 'wb') as file:
                    for chunk in response.iter_content(1024):
                        file.write(chunk)
                print(f'{image_name} has been saved.')
            else:
                print(f'Failed to retrieve image from {url}')
    except Exception as e:
        print(f'Error downloading {url}: {e}')
def main():
    """Crawl 10 search-result pages and download every wallpaper found.

    Bug fixed: the original flattened the per-page URL lists with
    ``np.array(...).flatten()``, which on a ragged list-of-lists yields
    an object array of lists (or raises in modern NumPy) — the download
    loop then received lists, not URL strings. It also crashed when a
    page fetch returned None. Plain ``extend`` handles both, with no
    numpy dependency.
    """
    image_urls = []
    for page in range(10):
        page_urls = get_image_urls(
            f'http://www.netbian.com/e/search/result/index.php?page={page}&searchid=108')
        if page_urls:  # skip pages that failed to fetch
            image_urls.extend(page_urls)
    print(image_urls)
    # Folder where downloaded images are stored
    output_folder = './images'
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    # One download thread per image (I/O-bound, so threads overlap waits)
    threads = []
    for index, url in enumerate(image_urls, start=1):
        # Zero-padded sequential file names: 001.jpg, 002.jpg, ...
        image_name = f'{str(index).zfill(3)}.jpg'
        thread = threading.Thread(target=download_image, args=(url, output_folder, image_name))
        thread.start()
        threads.append(thread)
    # Wait for every download to finish before exiting
    for thread in threads:
        thread.join()
    print("All photos have been stored. Exiting program.")
    sys.exit()
if __name__ == "__main__":
main()
# 爬虫多线程爬取图片 (multi-threaded image crawler)
# NOTE: leftover article metadata from the page this code was copied from —
# "最新推荐文章于 2024-06-30 23:42:19 发布" — kept as a comment so the file parses.