头歌:多线程、多进程爬虫

step1/web/index.html文件下,将所有alt=""填入step1/images

step1/student.py文件源码

import requests
from lxml import etree
import time
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
from multiprocessing import Pool
import os
import threading
import psutil
# URL伪装
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",
}
def downimg(img_src):
    start_time = time.time()
    
    name = img_src.split('/')[-1].split('.')[0]
    img_url = "http://127.0.0.1:8080" + img_src
    img = requests.get(img_url)
    dir_path = 'step1/images'
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    img_path = dir_path + '/' + name + '.jpg'  # 图片的最终存储路径
    print(img_url, name + '.jpg', '开始下载。。。')
    thread = threading.currentThread()
    process = psutil.Process(os.getpid())
    print("线程ID:%s, 进程ID:%s"
          % (thread.ident, process.pid))
    #********** Begin *********#
    """保存图片"""
    with open(img_path, 'wb')as file:
        file.write(img.content)
    #********** End *********#
    finisTime = time.time() - start_time
    print(name + ".jpg 用时为:" + str(finisTime) + " second")
def parsePage():
    url = "http://127.0.0.1:8080/imgs/"
    response = requests.get(url=url, headers=header)
    html_content = response.text
    #********** Begin *********#
    """解析网页"""
    html = etree.HTML(html_content)
    item_list = html.xpath("//div[@class='box']/div/a/img/@src")
    print(item_list)
    s_time = time.time()
    
    #********** End *********#
    """非线程操作"""
    # for item in item_list:
    #     downimg(item)
    #********** Begin *********#
    """线程操作方式"""
    thread = []
    for item in item_list:
        
        thread.append(threading.Thread(target=downimg, args=(item, )))
    for t in thread:
        t.start()
    for t in thread:
        t.join()
    #********** End *********#
    print('总耗时: %s' % (time.time() - s_time))

希望可以给到帮助

  • 3
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值