For a static page like a Bilibili article (专栏), the idea behind scraping the images is simple:
- Request the page and get the HTML
- Parse the HTML and extract the image URLs
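Stripped of the coroutine machinery, those two steps boil down to something like this minimal synchronous sketch (using requests instead of aiohttp; the cv number and the data-src attribute are the same ones the full code below relies on):

```python
import requests
from lxml.html import fromstring

# step 1: request the page and get the HTML
html = requests.get('https://www.bilibili.com/read/cv114514',
                    headers={'User-Agent': 'Mozilla/5.0'}).text
# step 2: parse the HTML and extract the image URLs
doc = fromstring(html)
print(doc.xpath('//title/text()')[0])  # article title
print(doc.xpath('//img/@data-src'))    # protocol-relative image URLs
```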
As for coroutines, I recommend reading the official Python documentation and then writing a few crawlers yourself; you will pick up the fixed pattern fairly quickly.
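That fixed pattern is roughly: write async def coroutines, open a session, schedule tasks, and drive everything with asyncio.run(). A bare-bones version might look like this (the URL is just a placeholder):

```python
import asyncio
from aiohttp import ClientSession

async def fetch(session, url):
    async with session.get(url) as resp:
        return await resp.text()

async def main():
    urls = ['https://www.bilibili.com/read/cv114514']
    async with ClientSession() as session:
        # one task per URL, awaited together
        pages = await asyncio.gather(*(fetch(session, u) for u in urls))
        print([len(p) for p in pages])

asyncio.run(main())
```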
Straight to the code:
BiliArticleImgCrawler.py
```python
import re
import asyncio
from lxml.html import fromstring

from crawler import Crawler

HOST = 'https://www.bilibili.com/read/{}'


class BiliArticleImgCrawler(Crawler):
    def __init__(self, cvs, config=None):
        '''Takes a list of cv numbers and saves each article's image URLs in batch.'''
        urls = []
        # map() would work just as well
        for cv in cvs:
            if not cv.startswith('cv'):
                cv = 'cv' + cv
            urls.append(HOST.format(cv))
        super().__init__(urls, config=config)

    async def onconnect(self, resp):
        title, img_urls = self.parse_url(await resp.text())
        article_url = str(resp.url)
        cv = article_url[article_url.rfind('/') + 1:]  # extract the cv number
        self.save_to_textfile(title, img_urls, cv)

    def save_to_textfile(self, title, urls, cv):
        '''Save the image URLs to a text file.'''
        # replace characters that are illegal in file names
        output = re.sub(r'[\\/:*?"<>|]', '_', title).replace(' ', '') + '_' + cv + '.txt'
        with open(output, 'w', encoding='utf-8') as file:
            for url in urls:
                uri = 'https:' + url  # data-src values are protocol-relative
                print(uri)
                print(uri, file=file)

    def parse_url(self, html):
        '''Extract the article title and the image URLs from the HTML.'''
        selector = fromstring(html)
        title = selector.xpath('//title/text()')[0]
        url_list = selector.xpath('//img/@data-src')
        return (title, url_list)


async def main():
    cvs = ['114514']
    crawler = BiliArticleImgCrawler(cvs)
    await crawler.run()


if __name__ == '__main__':
    asyncio.run(main())
```
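A quick note on the output file name: save_to_textfile replaces characters that are illegal in Windows file names with underscores and strips spaces, so a hypothetical title turns out like this:

```python
>>> import re
>>> re.sub(r'[\\/:*?"<>|]', '_', 'FAQ: 2020/04 roundup').replace(' ', '')
'FAQ_2020_04roundup'
```

The cv number and the .txt suffix are then appended, giving something like FAQ_2020_04roundup_cv114514.txt.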
config.py
```python
# default request headers
DEFAULT_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36 Edg/81.0.416.53',
}


class Config(object):
    def __init__(self, headers=DEFAULT_HEADERS, connect_num=1,
                 download_num=1, timeout=60, delay=1):
        '''Client request settings.'''
        self.headers = headers            # request headers
        self.connect_num = connect_num    # number of concurrent connections
        self.download_num = download_num  # number of concurrent downloads
        self.timeout = timeout            # request timeout (seconds)
        self.delay = delay                # delay between requests (seconds)
```
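For example, to crawl a larger batch more politely you could pass a custom Config when constructing the crawler (the values here are purely illustrative):

```python
from config import Config

# 4 concurrent connections, a 30 s timeout, and a 2 s delay between requests
config = Config(connect_num=4, timeout=30, delay=2)
crawler = BiliArticleImgCrawler(['114514', '114515'], config=config)
```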
crawler.py
```python
import asyncio

from aiohttp import ClientSession, ClientTimeout

from config import Config


class Crawler(object):
    def __init__(self, base_urls, cqsize=0, dqsize=200, config=None, session=None):
        '''Crawler base class; override the connect/download callbacks.

        base_urls  a list of URLs used to seed the connect queue
        cqsize     max size of the connect queue, unbounded by default
        dqsize     max size of the download queue, 200 URLs by default
        config     client configuration; defaults are used if omitted
        session    an existing session to reuse
        '''
        # fall back to the default configuration
        self.config = config if config else Config()
        self.connect_queue = asyncio.Queue(maxsize=cqsize)
        self.download_queue = asyncio.Queue(maxsize=dqsize)
        self.base_urls = base_urls
        self.base_url_num = len(base_urls)
        self.session = session
        # seed the connect queue
        for url in base_urls:
            self.connect_queue.put_nowait(url)

    async def onconnect(self, resp, *args):
        '''Connect callback; receives the response from session.get,
        extra arguments can be passed in via run().'''
        pass

    async def ondownload(self, resp, *args):
        '''Download callback; receives the response from session.get,
        extra arguments can be passed in via run().'''
        pass

    async def clear_queue(self, queue):
        '''Drain a queue; useful when a response turns out to be invalid.'''
        while not queue.empty():
            await queue.get()
            queue.task_done()

    async def connect(self, session, args):
        '''Producer side: pulls URLs off the connect queue; onconnect may feed the download queue.'''
        while True:
            try:
                url = await self.connect_queue.get()
                async with session.get(url, headers=self.config.headers) as resp:
                    if args:
                        await self.onconnect(resp, *args)
                    else:
                        await self.onconnect(resp)
            except asyncio.CancelledError:
                break
            except Exception as e:
                print('ConnectError:', e)
            else:
                if self.base_url_num > 1:  # only throttle when there is more than one URL
                    await asyncio.sleep(self.config.delay)
            self.connect_queue.task_done()

    async def download(self, session, args):
        '''Consumer side: pulls URLs off the download queue.'''
        while True:
            try:
                url = await self.download_queue.get()
                async with session.get(url, headers=self.config.headers) as resp:
                    if args:
                        await self.ondownload(resp, *args)
                    else:
                        await self.ondownload(resp)
            except asyncio.CancelledError:
                break
            except Exception as e:
                print('DownloadError:', e)
            else:
                await asyncio.sleep(self.config.delay)
            self.download_queue.task_done()

    async def cancel_tasks(self, tasks, return_exceptions=False):
        for task in tasks:
            task.cancel()
        await asyncio.gather(*tasks, return_exceptions=return_exceptions)

    async def create_tasks(self, session, connect_cb_args, download_cb_args):
        '''Spawn the producer and consumer tasks.'''
        connect_tasks = []
        download_tasks = []
        # don't spawn more connect workers than there are URLs
        corou_num = min(len(self.base_urls), self.config.connect_num)
        for _ in range(corou_num):
            connect_tasks.append(asyncio.create_task(self.connect(session, connect_cb_args)))
            await asyncio.sleep(0)  # yield so the task can start
        for _ in range(self.config.download_num):
            download_tasks.append(asyncio.create_task(self.download(session, download_cb_args)))
            await asyncio.sleep(0)
        await self.connect_queue.join()
        await self.cancel_tasks(connect_tasks)
        await self.download_queue.join()
        await self.cancel_tasks(download_tasks)

    async def run(self, connect_cb_args=None, download_cb_args=None):
        '''Start the crawler; optionally pass tuples of callback arguments.'''
        if self.session:
            await self.create_tasks(self.session, connect_cb_args, download_cb_args)
        else:
            timeout = ClientTimeout(total=self.config.timeout)
            async with ClientSession(timeout=timeout) as session:
                await self.create_tasks(session, connect_cb_args, download_cb_args)
```
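The subclass above only uses the connect side of the base class. As a sketch of how the idle download queue could be wired up (this class is not from the original code, and the file naming is deliberately naive), onconnect can push image URLs into download_queue and ondownload can write the response bytes to disk:

```python
import asyncio
from lxml.html import fromstring
from crawler import Crawler


class BiliArticleImgDownloader(Crawler):
    async def onconnect(self, resp):
        doc = fromstring(await resp.text())
        for url in doc.xpath('//img/@data-src'):
            # producer side: queue each image URL for the download workers
            await self.download_queue.put('https:' + url)

    async def ondownload(self, resp):
        # consumer side: name the file after the last path segment, e.g. xxx.jpg
        name = str(resp.url).rsplit('/', 1)[-1]
        with open(name, 'wb') as file:
            file.write(await resp.read())


async def main():
    crawler = BiliArticleImgDownloader(['https://www.bilibili.com/read/cv114514'])
    await crawler.run()


if __name__ == '__main__':
    asyncio.run(main())
```

Because create_tasks joins connect_queue before download_queue, all queued images are fetched before run() returns.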