Crawling CSDN with Scrapy and saving articles as PDF files
- First, configure a downloader middleware; here pyppeteer is used to render the dynamic pages and hand the final HTML back to Scrapy.
```python
class PyppeteerMiddleware:
    """Downloader middleware that renders pages with pyppeteer.
    (Class name assumed; register it in DOWNLOADER_MIDDLEWARES.)"""

    async def _get_page_html(self, request):
        # Launch a fresh headless browser per request (simple but slow;
        # reusing one browser instance would be faster).
        browser = await launch(headless=True)
        url = request.url
        page = await browser.newPage()
        await page.goto(url)
        await asyncio.sleep(3)  # crude wait for the dynamic content to load
        html = await page.content()
        await browser.close()
        if html:
            return HtmlResponse(url=url, body=html, encoding='utf-8',
                                request=request)
        else:
            breakpoint()  # no HTML came back: drop into pdb and inspect

    async def process_request(self, request, spider):
        # Only render requests explicitly flagged in meta; returning an
        # HtmlResponse here short-circuits Scrapy's own downloader.
        if request.meta.get('special_handling', False):
            return await self._get_page_html(request)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        # Hook for pulling settings off the crawler if needed later.
        return cls(*args, **kwargs)
```
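For the coroutine `process_request` to be awaited, Scrapy must run on the asyncio Twisted reactor, and the middleware has to be enabled. A minimal `settings.py` sketch, assuming the project module is called `csdn_pdf` (the module path and the priority value are placeholders):

```python
# settings.py; adjust the module path to your project layout
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'

DOWNLOADER_MIDDLEWARES = {
    'csdn_pdf.middlewares.PyppeteerMiddleware': 543,  # priority is arbitrary
}
```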
Next, find the URL of CSDN's search API.
The first request goes to that URL and reads the total number of result pages for the search from the JSON it returns.
```python
class CsdnSpider(scrapy.Spider):
    name = 'csdn'  # spider name assumed; not shown in the original excerpt

    def start_requests(self):
        value = input('Enter a search term: ')
        start_url = f'https://so.csdn.net/api/v3/search?q={value}&t=blog&p=1&s=0&tm=0&lv=-1&ft=0&l=&u=&ct=-1&pnt=-1&ry=-1&ss=-1&dct=-1&vco=-1&cc=-1&sc=-1&akt=-1&art=-1&ca=-1&prs=&pre=&ecc=-1&ebc=-1&ia=1&dId=&cl=-1&scl=-1&tcl=-1&platform=pc&ab_test_code_overlap=&ab_test_random_code='
        yield scrapy.Request(start_url, callback=self.second_requests,
                             dont_filter=True,
                             meta={'start_url': start_url, 'value': value,
                                   'special_handling': True})

    def second_requests(self, response):
        # The browser renders the raw JSON inside a <pre> tag.
        datas = response.xpath('/html/body/pre/text()').extract_first()
        data = json.loads(datas)
        print('{}: {} pages in total. Enter how many pages to download:'.format(
            response.meta['value'], int(data['total_page'])))
        while True:
            try:
                num = int(input())
                break
            except ValueError:
                print('Invalid input, please try again')
        for i in range(1, num + 1):
            # Rewrite the page parameter p=1 to walk through the result pages.
            url = response.meta['start_url'].replace('p=1', 'p={}'.format(i))
            yield scrapy.Request(url, callback=self.detail_url,
                                 meta={'special_handling': True})
```
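Before wiring the URL into the spider, it can be sanity-checked in a standalone pyppeteer session. A quick probe sketch; the trimmed query string is an assumption (CSDN may insist on the full parameter list), and it reads only the `total_page` key seen above:

```python
import asyncio
import json
from pyppeteer import launch

async def probe(url):
    # Open the API URL in a headless browser and return the parsed JSON body.
    browser = await launch(headless=True)
    page = await browser.newPage()
    resp = await page.goto(url)
    body = await resp.text()
    await browser.close()
    return json.loads(body)

data = asyncio.run(probe(
    'https://so.csdn.net/api/v3/search?q=scrapy&t=blog&p=1&platform=pc'))
print(data['total_page'])
```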
Next, pull the URL of each article out of the results. `breakpoint()` serves as a breakpoint here: if `datas` is `None`, the program stops at that point so it can be debugged interactively.
```python
    def detail_url(self, response):
        data = json.loads(response.xpath('/html/body/pre/text()').extract_first())
        datas = data.get('result_vos')
        while True:
            if datas:
                for item in datas:
                    # Strip the <em> highlight tags CSDN wraps around matches.
                    title = re.sub('<em>|</em>', '', item['title']).strip()
                    yield scrapy.Request(item['url'], callback=self.parse,
                                         dont_filter=True, meta={'title': title})
                break
            else:
                # Pause in pdb; after fixing or inspecting `datas`, continue
                # and the loop re-checks it.
                breakpoint()
```
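When the spider stops there, the standard pdb prompt appears; a few commands cover most of the debugging (the session below is illustrative):

```
(Pdb) p data             # inspect the parsed JSON
(Pdb) p response.text    # look at the raw body if the JSON seems wrong
(Pdb) datas = data.get('result_vos') or []   # patch the value by hand
(Pdb) c                  # continue; the while loop re-checks datas
```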
Finally, extract the article's HTML from the page and convert it to PDF with the HTML class imported from weasyprint.
```python
    def parse(self, response):
        title = response.meta['title']
        page = response.xpath('//*[@id="content_views"]').extract_first()
        while True:
            if page:
                html = ('<!DOCTYPE html><html lang="en"><head>'
                        '<meta charset="UTF-8"><title>{0}</title></head>'
                        '<body><p><h1>{0}</h1><hr><br></p>{1}</body></html>'
                        ).format(title, page)
                # Strip characters that are illegal in Windows file names.
                file_name = re.sub(r'[\\/:*?"<>|‘]', '', title).replace(' ', '')
                file_path = f'E:/LiuChuang_reptile/CSDN/Documents/{file_name}.pdf'
                HTML(string=html).write_pdf(file_path, **self.options)
                print('{} downloaded\nlink: {}'.format(file_name, response.url))
                break
            else:
                breakpoint()  # article body missing: stop and inspect
```
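`self.options` is used above but never defined in the excerpt. A minimal sketch of what it could hold; `presentational_hints` and `zoom` are real `write_pdf` keyword arguments, while the values chosen here are assumptions:

```python
    # Class attribute on the spider; placeholder values, tune as needed.
    options = {
        'presentational_hints': True,  # honor HTML presentation attributes
        'zoom': 1.0,                   # PDF zoom factor
    }
```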
Modules used by the code:
```python
import asyncio
import json
import re

import scrapy
from pyppeteer import launch
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
from scrapy.http import HtmlResponse
from weasyprint import HTML
```
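With the middleware enabled and the spider in place, the crawl is started the usual way (the spider name `csdn` is the assumed one from above):

```
scrapy crawl csdn
```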
If you spot any problems, please point them out; in the end it's your call 😀. If you'd like the complete code, leave a comment or send a private message!