For a static page like a Bilibili article (专栏), the idea behind scraping the images is simple:
- Request the page and get the HTML
- Parse the HTML and extract the image URLs
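Stripped of the coroutine machinery, those two steps boil down to something like this minimal synchronous sketch (using requests instead of aiohttp; the cv number and the data-src attribute are the same ones the full code below relies on):

```python
import requests
from lxml.html import fromstring

# step 1: request the page and get the HTML
html = requests.get('https://www.bilibili.com/read/cv114514',
                    headers={'User-Agent': 'Mozilla/5.0'}).text
# step 2: parse the HTML and extract the image URLs
doc = fromstring(html)
print(doc.xpath('//title/text()')[0])  # article title
print(doc.xpath('//img/@data-src'))    # protocol-relative image URLs
```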
As for coroutines, I recommend reading the official Python documentation and then writing a few crawlers yourself; you will pick up the fixed pattern fairly quickly.
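That fixed pattern is roughly: write async def coroutines, open a session, schedule tasks, and drive everything with asyncio.run(). A bare-bones version might look like this (the URL is just a placeholder):

```python
import asyncio
from aiohttp import ClientSession

async def fetch(session, url):
    async with session.get(url) as resp:
        return await resp.text()

async def main():
    urls = ['https://www.bilibili.com/read/cv114514']
    async with ClientSession() as session:
        # one task per URL, awaited together
        pages = await asyncio.gather(*(fetch(session, u) for u in urls))
        print([len(p) for p in pages])

asyncio.run(main())
```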
Straight to the code:
BiliArticleImgCrawler.py
```python
import re
import asyncio
from lxml.html import fromstring

from crawler import Crawler

HOST = 'https://www.bilibili.com/read/{}'


class BiliArticleImgCrawler(Crawler):
    def __init__(self, cvs, config=None):
        '''Takes a list of cv numbers and saves each article's image URLs in batch.'''
        urls = []
        # map() would work just as well
        for cv in cvs:
            if not cv.startswith('cv'):
                cv = 'cv' + cv
            urls.append(HOST.format(cv))
        super().__init__(urls, config=config)

    async def onconnect(self, resp):
        title, img_urls = self.parse_url(await resp.text())
        article_url = str(resp.url)
        cv = article_url[article_url.rfind('/') + 1:]  # extract the cv number
        self.save_to_textfile(title, img_urls, cv)

    def save_to_textfile(self, title, urls, cv):
        '''Save the image URLs to a text file.'''
        # replace characters that are illegal in file names
        output = re.sub(r'[\\/:*?"<>|]', '_', title).replace(' ', '') + '_' + cv + '.txt'
        with open(output, 'w', encoding='utf-8') as file:
            for url in urls:
                uri = 'https:' + url  # data-src values are protocol-relative
                print(uri)
                print(uri, file=file)

    def parse_url(self, html):
        '''Extract the article title and the image URLs from the HTML.'''
        selector = fromstring(html)
        title = selector.xpath('//title/text()')[0]
        url_list = selector.xpath('//img/@data-src')
        return (title, url_list)


async def main():
    cvs = ['114514']
    crawler = BiliArticleImgCrawler(cvs)
    await crawler.run()


if __name__ == '__main__':
    asyncio.run(main())
```
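A quick note on the output file name: save_to_textfile replaces characters that are illegal in Windows file names with underscores and strips spaces, so a hypothetical title turns out like this:

```python
>>> import re
>>> re.sub(r'[\\/:*?"<>|]', '_', 'FAQ: 2020/04 roundup').replace(' ', '')
'FAQ_2020_04roundup'
```

The cv number and the .txt suffix are then appended, giving something like FAQ_2020_04roundup_cv114514.txt.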
config.py
```python
# default request headers
DEFAULT_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36 Edg/81.0.416.53',
}


class Config(object):
    def __init__(self, headers=DEFAULT_HEADERS, connect_num=1,
                 download_num=1, timeout=60, delay=1):
        '''Client request settings.'''
        self.headers = headers            # request headers
        self.connect_num = connect_num    # number of concurrent connections
        self.download_num = download_num  # number of concurrent downloads
        self.timeout = timeout            # request timeout (seconds)
        self.delay = delay                # delay between requests (seconds)
```
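For example, to crawl a larger batch more politely you could pass a custom Config when constructing the crawler (the values here are purely illustrative):

```python
from config import Config

# 4 concurrent connections, a 30 s timeout, and a 2 s delay between requests
config = Config(connect_num=4, timeout=30, delay=2)
crawler = BiliArticleImgCrawler(['114514', '114515'], config=config)
```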
crawler.py
```python
import asyncio

from aiohttp import ClientSession, ClientTimeout

from config import Config


class Crawler(object):
    def __init__(self, base_urls, cqsize=0, dqsize=200, config=None, session=None):
        '''Crawler base class; override the connect/download callbacks.

        base_urls  a list of URLs used to seed the connect queue
        cqsize     max size of the connect queue, unbounded by default
        dqsize     max size of the download queue, 200 URLs by default
        config     client configuration; defaults are used if omitted
        session    an existing session to reuse
        '''
        # fall back to the default configuration
        self.config = config if config else Config()
        self.connect_queue = asyncio.Queue(maxsize=cqsize)
        self.download_queue = asyncio.Queue(maxsize=dqsize)
        self.base_urls = base_urls
        self.base_url_num = len(base_urls)
        self.session = session
        # seed the connect queue
        for url in base_urls:
            self.connect_queue.put_nowait(url)

    async def onconnect(self, resp, *args):
        '''Connect callback; receives the response from session.get,
        extra arguments can be passed in via run().'''
        pass

    async def ondownload(self, resp, *args):
        '''Download callback; receives the response from session.get,
        extra arguments can be passed in via run().'''
        pass

    async def clear_queue(self, queue):
        '''Drain a queue; useful when a response turns out to be invalid.'''
        while not queue.empty():
            await queue.get()
            queue.task_done()

    async def connect(self, session, args):
        '''Producer side: pulls URLs off the connect queue; onconnect may feed the download queue.'''
        while True:
            try:
                url = await self.connect_queue.get()
                async with session.get(url, headers=self.config.headers) as resp:
                    if args:
                        await self.onconnect(resp, *args)
                    else:
                        await self.onconnect(resp)
            except asyncio.CancelledError:
                break
            except Exception as e:
                print('ConnectError:', e)
            else:
                if self.base_url_num > 1:  # only throttle when there is more than one URL
                    await asyncio.sleep(self.config.delay)
            self.connect_queue.task_done()

    async def download(self, session, args):
        '''Consumer side: pulls URLs off the download queue.'''
        while True:
            try:
                url = await self.download_queue.get()
                async with session.get(url, headers=self.config.headers) as resp:
                    if args:
                        await self.ondownload(resp, *args)
                    else:
                        await self.ondownload(resp)
            except asyncio.CancelledError:
                break
            except Exception as e:
                print('DownloadError:', e)
            else:
                await asyncio.sleep(self.config.delay)
            self.download_queue.task_done()

    async def cancel_tasks(self, tasks, return_exceptions=False):
        for task in tasks:
            task.cancel()
        await asyncio.gather(*tasks, return_exceptions=return_exceptions)

    async def create_tasks(self, session, connect_cb_args, download_cb_args):
        '''Spawn the producer and consumer tasks.'''
        connect_tasks = []
        download_tasks = []
        # don't spawn more connect workers than there are URLs
        corou_num = min(len(self.base_urls), self.config.connect_num)
        for _ in range(corou_num):
            connect_tasks.append(asyncio.create_task(self.connect(session, connect_cb_args)))
            await asyncio.sleep(0)  # yield so the task can start
        for _ in range(self.config.download_num):
            download_tasks.append(asyncio.create_task(self.download(session, download_cb_args)))
            await asyncio.sleep(0)
        await self.connect_queue.join()
        await self.cancel_tasks(connect_tasks)
        await self.download_queue.join()
        await self.cancel_tasks(download_tasks)

    async def run(self, connect_cb_args=None, download_cb_args=None):
        '''Start the crawler; optionally pass tuples of callback arguments.'''
        if self.session:
            await self.create_tasks(self.session, connect_cb_args, download_cb_args)
        else:
            timeout = ClientTimeout(total=self.config.timeout)
            async with ClientSession(timeout=timeout) as session:
                await self.create_tasks(session, connect_cb_args, download_cb_args)
```
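The subclass above only uses the connect side of the base class. As a sketch of how the idle download queue could be wired up (this class is not from the original code, and the file naming is deliberately naive), onconnect can push image URLs into download_queue and ondownload can write the response bytes to disk:

```python
import asyncio
from lxml.html import fromstring
from crawler import Crawler


class BiliArticleImgDownloader(Crawler):
    async def onconnect(self, resp):
        doc = fromstring(await resp.text())
        for url in doc.xpath('//img/@data-src'):
            # producer side: queue each image URL for the download workers
            await self.download_queue.put('https:' + url)

    async def ondownload(self, resp):
        # consumer side: name the file after the last path segment, e.g. xxx.jpg
        name = str(resp.url).rsplit('/', 1)[-1]
        with open(name, 'wb') as file:
            file.write(await resp.read())


async def main():
    crawler = BiliArticleImgDownloader(['https://www.bilibili.com/read/cv114514'])
    await crawler.run()


if __name__ == '__main__':
    asyncio.run(main())
```

Because create_tasks joins connect_queue before download_queue, all queued images are fetched before run() returns.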