微信文章链接:https://mp.weixin.qq.com/s/imfnV7q8WY8gwDLYBir_pw
"""Generate a PDF file from a web page URL."""
# Extra HTTP headers as (name, value) pairs; here a desktop Chrome User-Agent.
headers = [
    (
        'User-Agent',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    ),
]

# Conversion options handed to pdfkit by url_to_pdf(). Keys are wkhtmltopdf
# option names without the leading "--"; an empty value marks a bare flag.
options = {
    # Send the headers defined above with the page request.
    'custom-header': headers,
    # Bare flag (single empty value).
    'debug-javascript': [''],
    # Delay before rendering so page scripts can finish
    # (milliseconds per the wkhtmltopdf docs).
    'javascript-delay': 10000,
    # Bare flag (empty value).
    'no-stop-slow-scripts': "",
    # Handler name used when a media file fails to load.
    'load-media-error-handling': 'abort',
}
def url_to_pdf(url, fpath,
               wkhtmltopdf_path=r'E:\Software\wkhtmltopdf\bin\wkhtmltopdf.exe'):
    """Render the web page at ``url`` into a PDF file written to ``fpath``.

    The module-level ``options`` dict is passed to pdfkit as the conversion
    options.

    Args:
        url: Address of the page to convert.
        fpath: Path of the PDF file to create.
        wkhtmltopdf_path: Absolute path of the wkhtmltopdf executable. Passing
            it explicitly is needed when wkhtmltopdf was just installed and
            the computer has not been restarted yet. Defaults to the install
            location used by the original snippet.
    """
    # Imported locally: the file-level ``import pdfkit`` only appears further
    # down, after this function has already been called.
    import pdfkit

    config = pdfkit.configuration(wkhtmltopdf=wkhtmltopdf_path)
    # Generate the PDF file; fpath is the output file path.
    pdfkit.from_url(url, fpath, configuration=config, options=options)
    print('完成:', fpath)
# ``url`` was never defined; use the WeChat article linked at the top of this
# file as the page to convert.
url = 'https://mp.weixin.qq.com/s/imfnV7q8WY8gwDLYBir_pw'
url_to_pdf(url, 'out_file.pdf')
生成PDF文件:生成的PDF中图片没有显示出来。
原因:html代码中图片链接的属性是 data-src 而不是 src,wkhtmltopdf 未能识别出图片链接。
此外,图片的url还要使用绝对路径。
解决办法:使用requests库获取html源码,将 data-src 替换为 src。
import pdfkit
import requests
from bs4 import BeautifulSoup
# WeChat article to convert.
url = 'https://mp.weixin.qq.com/s/imfnV7q8WY8gwDLYBir_pw'

# Fetch the page source. The timeout keeps a stalled connection from hanging
# the script, and an HTTP error status is reported right away instead of
# surfacing later as a missing article body.
res = requests.get(url, timeout=30)
res.raise_for_status()

# Rename data-src to src so the image links are recognised. The returned
# article body is sometimes hidden, so the hiding style is removed as well.
html = res.text.replace("data-src", "src").replace('style="visibility: hidden;"', "")

# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
soup = BeautifulSoup(html, 'html.parser')

# Keep only the article body (drops JavaScript and the rest of the page).
content = soup.select_one('div#img-content')
if content is None:
    raise ValueError('div#img-content not found in page: %s' % url)

# Optional: override the font. The font file is referenced as a file:// URL
# with forward slashes because a backslash starts an escape sequence in CSS.
font = '''
<style type="text/css">
@font-face{font-family: "微软雅黑";src:url("file:///C:/Windows/Fonts/msyh.ttc");}
</style>
<style type = "text/css">
p { font-family: '微软雅黑', cursive; }
</style>
'''
html = font + str(content)

# Conversion options. Other options can be added in the same way, e.g. the
# page margins ('margin-top': '0.75in', likewise right/bottom/left), or
# 'custom-header', 'debug-javascript', 'javascript-delay',
# 'no-stop-slow-scripts' and 'load-media-error-handling'.
options = {
    'page-size': 'A4',
    'encoding': "UTF-8",
}

# Absolute path of the wkhtmltopdf executable, passed to pdfkit explicitly.
path_wkthmltopdf = r'E:\Software\wkhtmltopdf\bin\wkhtmltopdf.exe'
config = pdfkit.configuration(wkhtmltopdf=path_wkthmltopdf)
pdfkit.from_string(html, 'outfile5.pdf', configuration=config, options=options)
最终得到与网页内容一致的PDF文件。