首先将wkhtmltopdf软件安装到python工作目录,下面提供已安装好的版本,可以直接解压到工作目录使用
链接:https://pan.baidu.com/s/1zKHw5S4sNehnPAgeNwwK9g
提取码:ax45
下面是python代码,代码是在某乎一位大佬的基础上进行改进的(之前的用不了了):
# coding=utf-8
import io
import os
import re
import ssl
import time
import urllib
import urllib.request

import pdfkit
from bs4 import BeautifulSoup
from PyPDF2 import PdfFileReader, PdfFileWriter
# test12
# Minimal standalone page each extracted chapter body is wrapped in,
# so wkhtmltopdf renders it as a self-contained UTF-8 document.
html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
</head>
<body>
{content}
</body>
</html>
"""
# Location of the wkhtmltopdf executable (unpacked into the working
# directory).  os.path.join replaces the original hand-built Windows
# path, whose "\w" was an invalid escape sequence.
path_wk = os.path.join(os.getcwd(), 'wkhtmltopdf', 'bin', 'wkhtmltopdf.exe')
config = pdfkit.configuration(wkhtmltopdf=path_wk)
# ----------------------------------------------------------------------
def parse_url_to_html(url, name):
    """
    Fetch one tutorial page and save its main content as a local HTML file.

    :param url: page URL to fetch and parse
    :param name: file name the extracted HTML is written to
    :return: ``name`` on success, ``None`` on failure
    """
    try:
        # Header names must not contain spaces — http.client rejects
        # them with "Invalid header name" (the original dict used
        # 'Accept - Encoding' etc.).  Accept-Encoding is deliberately
        # omitted: urllib does not decompress responses, so a gzip
        # body would break the utf-8 decode below.
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
            'Accept': 'text/html, application/xhtml+xml, application/xml',
            'Accept-Language': 'zh-CN, zh',
        }
        # Skip certificate verification so the script works without a
        # local CA bundle.
        context = ssl._create_unverified_context()
        req = urllib.request.Request(url, headers=header)
        web_page = urllib.request.urlopen(req, context=context)
        data = web_page.read().decode('utf-8')
        soup = BeautifulSoup(data, "html.parser")
        # Main article body
        body = soup.find_all(class_="x-wiki-content")[0]
        # Page title, re-inserted centered at the top of the body
        title = soup.find('h4').get_text()
        center_tag = soup.new_tag("center")
        title_tag = soup.new_tag('h1')
        title_tag.string = title
        center_tag.insert(1, title_tag)
        body.insert(1, center_tag)
        html = str(body)
        # Rewrite relative <img src> paths to absolute URLs so
        # wkhtmltopdf can download the images.
        pattern = "(<img .*?src=\")(.*?)(\")"

        def func(m):
            # group(2) is the src value itself.  The original tested
            # group(3) — the closing quote — which never starts with
            # "http", so absolute URLs were wrongly prefixed too.
            if not m.group(2).startswith("http"):
                return m.group(1) + "http://www.liaoxuefeng.com" + m.group(2) + m.group(3)
            return m.group(0)

        html = re.compile(pattern).sub(func, html)
        html = html_template.format(content=html)
        with open(name, 'wb') as f:
            f.write(html.encode("utf-8"))
        return name
    except Exception as e:
        # Best-effort: report and keep going, but at least show why.
        print("解析错误!", e)
        return None
# ----------------------------------------------------------------------
def get_url_list(now_url):
    """
    Collect the chapter URLs from a tutorial's side navigation menu.

    :param now_url: URL of the tutorial index page
    :return: list of absolute chapter URLs, in menu order
    """
    # Header names must not contain spaces (http.client rejects them);
    # Accept-Encoding is omitted because urllib does not decompress
    # and the body is decoded as plain utf-8 below.
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Accept': 'text/html, application/xhtml+xml, application/xml',
        'Accept-Language': 'zh-CN, zh',
    }
    context = ssl._create_unverified_context()
    req = urllib.request.Request(now_url, headers=header)
    web_page = urllib.request.urlopen(req, context=context)
    data = web_page.read().decode('utf-8')
    soup = BeautifulSoup(data, "html.parser")
    # The second "uk-nav uk-nav-side" list holds the chapter menu —
    # NOTE(review): depends on the site's current layout; verify if
    # scraping starts failing.
    menu_tag = soup.find_all(class_="uk-nav uk-nav-side")[1]
    urls = ["http://www.liaoxuefeng.com" + div.a.get('href')
            for div in menu_tag.find_all("div")]
    print(urls)
    return urls
# ----------------------------------------------------------------------
def save_pdf(htmls, file_name):
    """
    Convert an HTML file (or list of files) to a PDF via wkhtmltopdf.

    :param htmls: html file name (or list of names) to convert
    :param file_name: output pdf file name
    :return: None
    """
    options = {
        'page-size': 'Letter',
        'margin-top': '0.75in',
        'margin-right': '0.75in',
        'margin-bottom': '0.75in',
        'margin-left': '0.75in',
        'encoding': "UTF-8",
        'custom-header': [
            ('Accept-Encoding', 'gzip')
        ],
        'cookie': [
            ('cookie-name1', 'cookie-value1'),
            ('cookie-name2', 'cookie-value2'),
        ],
        'outline-depth': 10,
    }
    try:
        pdfkit.from_file(htmls, file_name, options=options, configuration=config)
    except Exception as e:
        # wkhtmltopdf exits with "ContentNotFoundError" when external
        # resources referenced by CSS (fonts, images, iframes) fail to
        # load, even though the PDF is still produced.  Keep the
        # best-effort behaviour, but log instead of a bare "except:"
        # that would also swallow KeyboardInterrupt/SystemExit.
        print("wkhtmltopdf 转换警告:", e)
# ----------------------------------------------------------------------
def main():
    """
    Download every tutorial listed in ``dic``, convert each chapter to
    a PDF and merge the chapters into one PDF per tutorial.  Temporary
    per-chapter html/pdf files are removed afterwards.
    """
    dic = {
        'java': 'https://www.liaoxuefeng.com/wiki/1252599548343744',
        'python': 'https://www.liaoxuefeng.com/wiki/1016959663602400',
        'js': 'https://www.liaoxuefeng.com/wiki/1022910821149312',
        'blockchain': 'https://www.liaoxuefeng.com/wiki/1207298049439968',
        'sql': 'https://www.liaoxuefeng.com/wiki/1177760294764384',
        'git': 'https://www.liaoxuefeng.com/wiki/896043488029600'
    }
    for name, url in dic.items():
        print(u"开始输出{}文件!".format(name))
        start = time.time()
        file_name = u"liaoxuefeng" + u"_{}_".format(name) + u"tutorial"
        urls = get_url_list(url)
        # Fetch every chapter into a numbered local html file.
        # (chapter_url: the outer loop variable "url" was being
        # shadowed here in the original.)
        for index, chapter_url in enumerate(urls):
            parse_url_to_html(chapter_url, str(index) + ".html")
        htmls = []
        pdfs = []
        for i in range(len(urls)):
            html = str(i) + '.html'
            pdf = file_name + str(i) + '.pdf'
            htmls.append(html)
            pdfs.append(pdf)
            save_pdf(html, pdf)
            print(u"转换完成第" + str(i) + u'个html')
        # Merge the per-chapter PDFs.  PdfFileReader reads page objects
        # lazily, so every source must stay readable until write() runs;
        # loading each file into an in-memory BytesIO avoids "seek of
        # closed file" errors without leaking file handles.
        pdf_output = PdfFileWriter()
        for pdf in pdfs:
            with open(pdf, 'rb') as fh:
                reader = PdfFileReader(io.BytesIO(fh.read()))
            for page_index in range(reader.getNumPages()):
                pdf_output.addPage(reader.getPage(page_index))
        with open(u"廖雪峰" + u"_{}_".format(name) + u"all.pdf", "wb") as output:
            pdf_output.write(output)
        print(u"输出{}PDF成功!".format(name))
        # Clean up the temporary files.
        for html in htmls:
            os.remove(html)
        print(u"删除临时html文件")
        for pdf in pdfs:
            os.remove(pdf)
        print(u"删除临时pdf文件")
        total_time = time.time() - start
        print(u"总共耗时:%f 秒" % total_time)
        print('-' * 50)
# ----------------------------------------------------------------------
def changeDir(dir_name):
    """
    Switch the working directory to *dir_name*, creating it if needed.

    :param dir_name: target directory path
    """
    # makedirs(..., exist_ok=True) also creates missing parent
    # directories and avoids the check-then-create race of the
    # original exists()/mkdir() pair.
    os.makedirs(dir_name, exist_ok=True)
    os.chdir(dir_name)
# ----------------------------------------------------------------------
if __name__ == '__main__':
    # All output (temporary html/pdf files and the merged PDFs) is
    # written under an "al_pdf" directory below the current one.
    output_dir = os.getcwd() + u'/al_pdf'
    changeDir(output_dir)
    main()
最后建立的工作目录如图:
建立完毕,直接运行,开始刷刷地输出,最后得到pdf文件:
附上最新的pdf文件:
链接:https://pan.baidu.com/s/1veei17qXoWaOCQKXV2wZTQ
提取码:rerf