多线程网络爬虫:生产者-消费者模式下载图片

多线程网络爬虫

多线程

直接上代码

# stdlib: process/queue, thread pool, filesystem
import os
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Process, Queue

# third-party: HTML parsing and HTTP
from lxml import etree
import requests

# Producer: scrape image URLs from the topic pages and feed them into the queue.
def get_img_src(q):
    """Crawl topic pages 1-9 of veer.com/topic/11751, extract every image's
    source URL and put it on *q* for the consumer process.

    After ALL pages have been processed, a single empty-string sentinel is
    put on the queue to signal end-of-stream to the consumer.
    """
    for page in range(1, 10):
        url = f"https://www.veer.com/topic/11751/?page={page}"
        rsp = requests.get(url)
        tree = etree.HTML(rsp.text)
        # Links to the individual image detail pages.
        href_list = tree.xpath('//div/article/section/section/a/@href')
        for href in href_list:
            rsp2 = requests.get(href)
            detail_tree = etree.HTML(rsp2.text)
            # xpath returns a list (possibly empty); join collapses it to one string.
            image = detail_tree.xpath('//div/main/section[1]/section[1]/section/figure[1]/div/img/@src')
            image_update = "https:" + "".join(image)
            q.put(image_update)  # hand the URL to the consumer
            print(f'{image_update},塞进队列')
    # BUG FIX: the sentinel used to be pushed once per page (inside the loop)
    # and as ' ' (a single space), which the consumer's `== ""` check never
    # matched. Push one empty-string sentinel exactly once, after all pages.
    q.put("")


def download(url):
    """Download *url* and save it under ./img/ using the last path segment
    of the URL as the file name."""
    print("---------------start------------" + url)

    name = url.split("/")[-1]
    resp = requests.get(url)
    # BUG FIX: create the target directory on first use instead of crashing
    # with FileNotFoundError when ./img does not exist yet.
    os.makedirs('./img', exist_ok=True)
    # 'wb' is sufficient — we only write, never read back ('wb+' was unnecessary).
    with open('./img/' + name, mode='wb') as f:
        f.write(resp.content)

    print("---------------end------------" + url)


# Consumer: drain the queue and download each URL through a thread pool.
def download_img(q):
    """Pull image URLs off *q* and download them concurrently.

    Terminates when the producer's end-of-stream sentinel (an empty or
    whitespace-only string) is received.
    """
    # The thread pool lets slow HTTP responses overlap instead of serializing.
    with ThreadPoolExecutor(30) as t:
        while True:
            src = q.get()  # blocks until an item is available
            # BUG FIX: the producer's sentinel was ' ' (a single space) while
            # this loop compared against '' and therefore never terminated.
            # Treat any empty/whitespace-only item as the end-of-stream marker.
            if not src.strip():
                break
            t.submit(download, src)


# Entry point: one producer process feeds URLs, one consumer process downloads.
if __name__ == '__main__':
    q = Queue()
    p1 = Process(target=get_img_src, args=(q,))
    p2 = Process(target=download_img, args=(q,))
    p1.start()
    p2.start()
    # BUG FIX: join both workers so the script only exits after the crawl
    # and all downloads have finished (previously the children were orphaned
    # from the main process's point of view).
    p1.join()
    p2.join()
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值