from DrissionPage import SessionPage
import os
import pdfkit
import logging
from multiprocessing import Pool
# Configure the root logger: timestamped DEBUG-level output to stderr.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.DEBUG, format=_LOG_FORMAT)
logger521 = logging.getLogger()
def get_pdf(_name, _url):
    """Render the web page at ``_url`` into ``./{_name}.pdf``.

    Uses pdfkit (a wkhtmltopdf wrapper) to fetch and convert the page,
    logging before and after the conversion.

    Parameters
    ----------
    _name : str
        Base name for the output file; ``.pdf`` is appended.
    _url : str
        Address of the page to download and convert.
    """
    # Lazy %-style logging args: the message is only formatted when emitted.
    logger521.info('开始将%s写入%s.pdf', _url, _name)
    pdfkit.from_url(_url, f'./{_name}.pdf')
    logger521.info('成功写入%s.pdf', _name)
if __name__ == '__main__':
    # Collect base names of PDFs already saved under the current directory,
    # so previously converted pages are skipped. A set gives O(1) lookups in
    # the filter loop below.
    saved_names = set()
    for root, dirs, files in os.walk('.', topdown=False):
        for name in files:
            if name.endswith(".pdf"):
                # Strip only the trailing ".pdf" — str.replace would also
                # remove interior occurrences (e.g. "a.pdf.pdf" -> "a").
                saved_names.add(name[:-len(".pdf")])
    # Create the session page object and fetch the index page.
    print("开始爬取链接!")
    page = SessionPage()
    page.get('https://xxx.xxx/')
    # All navigation <a> elements on the page.
    links = page.eles('.md-nav__link')
    # NOTE(review): placeholder exclusion substrings — fill in real patterns.
    excluded_patterns = ('xxx', 'xxx', 'xxx')
    link_list = []
    for link in links:
        # Skip entries without an href, already-saved pages, and titles
        # matching any exclusion pattern.
        if (link.link is None
                or link.text in saved_names
                or any(pattern in link.text for pattern in excluded_patterns)):
            continue
        link_list.append((link.text, link.link))
        # Echo each link that will be converted.
        print(link.text, link.link)
    print("开始写入文件!")
    # Sequential conversion; the `multiprocessing.Pool` import suggests a
    # parallel version was planned — TODO confirm before parallelizing.
    for _name, _url in link_list:
        get_pdf(_name, _url)
    print("全部完成!")
# Source article: 【Python】以PDF格式批量下载网页包含的超链接网页
# (Batch-download the pages linked from a web page as PDFs; published 2024-10-13 19:04:04)