多线程网络爬虫
本文用「多进程 + 线程池」实现一个图片爬虫:一个进程作为生产者抓取图片地址放入队列,另一个进程作为消费者从队列取地址并用线程池并发下载。下面直接上代码:
import os
# 进程 队列
from multiprocessing import Process, Queue
# 线程池
from concurrent.futures import ThreadPoolExecutor

import requests
from lxml import etree
# 获取文件下载路径
def get_img_src(q):
# url = "https://www.veer.com/topic/11751/"
for i in range (1, 10):
url = f"https://www.veer.com/topic/11751/?page={i}"
rsp = requests.get (url)
tree = etree.HTML (rsp.text)
href_list = tree.xpath ('//div/article/section/section/a/@href')
# print(href_list)
for href in href_list:
rsp2 = requests.get (href)
tree = etree.HTML (rsp2.text)
image = tree.xpath ('//div/main/section[1]/section[1]/section/figure[1]/div/img/@src')
# print(image)
image_update = "https:" + "".join (image)
# print (image_update)
q.put (image_update) # 向队列添加数据
print (f'{image_update},塞进队列')
# 传入为空 结束
q.put (' ')
def download(url):
print ("---------------start------------" + url)
name = url.split ("/")[-1]
resp = requests.get (url)
with open ('./img/' + name, mode='wb+') as f:
f.write (resp.content)
print ("---------------end------------" + url)
# 下载图片--给个线程池
def download_img(q):
# 做一个线程池做缓冲
with ThreadPoolExecutor (30) as t:
while 1:
src = q.get () # 从队列中获取数据,如果没有数据就会阻塞
if src == "":
break
t.submit (download, src)
if __name__ == '__main__':
q = Queue ()
p1 = Process (target=get_img_src, args=(q,))
p2 = Process (target=download_img, args=(q,))
p1.start ()
p2.start ()
# 如果有想要代码和了解更多的可以加 Eternity-taiko-66