# 第1关:多线程、多进程爬虫 (Level 1: multi-threaded / multi-process crawler)
import requests
from lxml import etree
import time
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
from multiprocessing import Pool
import os
import threading
import psutil
# Request-header spoofing: present a desktop Chrome User-Agent so the
# image server treats our crawler like an ordinary browser.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",
}
def downimg(img_src):
    """Download one image from the local image server and save it as
    ``step1/images/<name>.jpg``, printing progress plus the thread/process
    IDs doing the work.

    Parameters
    ----------
    img_src : str
        Image path scraped from the index page (e.g. ``/imgs/1.jpg``).
    """
    start_time = time.time()
    # File name = last path segment with its extension stripped.
    name = img_src.split('/')[-1].split('.')[0]
    img_url = "http://127.0.0.1:8080" + img_src
    # Send the spoofed User-Agent and bound the wait: without a timeout a
    # dead server would hang this worker thread forever.
    img = requests.get(img_url, headers=header, timeout=10)
    dir_path = 'step1/images'
    # exist_ok avoids the check-then-create race of the exists()/makedirs()
    # pair, which can raise FileExistsError when several download threads
    # hit this line at the same time.
    os.makedirs(dir_path, exist_ok=True)
    img_path = dir_path + '/' + name + '.jpg'  # final storage path of the image
    print(img_url, name + '.jpg', '开始下载。。。')
    # current_thread() replaces the deprecated currentThread() alias.
    thread = threading.current_thread()
    process = psutil.Process(os.getpid())
    print("线程ID:%s, 进程ID:%s"
          % (thread.ident, process.pid))
    #********** Begin *********#
    # Save the image bytes to disk.
    with open(img_path, 'wb') as file:
        file.write(img.content)
    #********** End *********#
    # Report the per-image elapsed time (previously computed but never used).
    finisTime = time.time() - start_time
    print(name + '.jpg', 'done in %.2fs' % finisTime)
def parsePage():
    """Fetch the image index page, extract every image ``src``, and download
    them concurrently — one thread per image — then wait for all of them.
    """
    url = "http://127.0.0.1:8080/imgs/"
    # timeout keeps the crawler from blocking forever on an unreachable server.
    response = requests.get(url=url, headers=header, timeout=10)
    html_content = response.text
    #********** Begin *********#
    # Parse the listing page and pull the src attribute of every thumbnail.
    html = etree.HTML(html_content)
    item_list = html.xpath("//div[@class='box']/div/a/img/@src")
    print(item_list)
    s_time = time.time()
    #********** End *********#
    # Sequential (non-threaded) variant, kept for comparison:
    # for item in item_list:
    #     downimg(item)
    #********** Begin *********#
    # Threaded variant: spawn one worker per image, start them all, then
    # join so we only return once every download has finished.
    threads = [threading.Thread(target=downimg, args=(item,))
               for item in item_list]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    #********** End *********#
    # Report the total wall-clock time the threaded downloads took
    # (s_time was previously started but never read).
    print('total: %.2fs' % (time.time() - s_time))
# Script entry point: crawl the index page and download all listed images.
if __name__ == '__main__':
    parsePage()