以下是将 C语言中文网的“23 种设计模式”网页转成 PDF 的示例,唯一的问题是图片无法显示:
import requests
from lxml import etree
from pdfkit import from_string
import re
# Download every page linked from the index page's <dd><a> entries and render
# each one to "<link text>.pdf" with pdfkit.
# Known limitation: <img> src attributes are passed through unchanged here, so
# images may not show up in the generated PDFs.
r1 = r"//dd/a/text()"  # text of every <dd><a> link; used as the PDF name
r2 = r"//dd/a/@href"  # href of every <dd><a> link; appended to the site root
r3 = r'<div id="arc-body">.*?</div>'  # used only to check the article body exists
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36",
}
timeout = 10  # seconds; without it requests.get may wait forever

html = requests.get("http://c.biancheng.net/view/1317.html", headers=headers, timeout=timeout).content
parse_tree = etree.HTML(html)
res1 = parse_tree.xpath(r1)
res2 = parse_tree.xpath(r2)
for name, url in zip(res1, res2):
    print(name, url)
    html = requests.get("http://c.biancheng.net" + url, headers=headers, timeout=timeout).content.decode("utf-8")
    # A page without the article body is skipped instead of crashing on None.
    if re.search(r3, html, flags=re.S) is None:
        print("skipped (no arc-body found): {}".format(url))
        continue
    # The original read Match.string, which is the whole input, so the full
    # page (not just the matched div) is what gets cleaned and rendered.
    res3 = html
    # Strip ads
    res4 = re.sub(r'(<ul id="ad-page-top-left".*?)<div id="arc-body">', "", res3, flags=re.S)
    res5 = re.sub(r'pre-next-page clearfix.*?"1.4.8";', "", res4, flags=re.S)
    # test.html is overwritten on every iteration, so it only ever holds the
    # most recently processed page. UTF-8 is explicit because the platform
    # default encoding may not be able to represent the page text.
    with open("test.html", "w", encoding="utf-8") as f:
        f.write(res5)
    # Link text may contain characters that are not legal in file names.
    pdf_name = re.sub(r'[\\/:*?"<>|]', "_", name) + ".pdf"
    try:
        from_string(res5, pdf_name)
    except Exception as e:
        # Best effort: report the failure and carry on with the next page.
        print("failed to render {}: {}".format(pdf_name, e))
因此这里改为先把所有页面整合成一个 HTML 文件,然后再手动在浏览器中将其转换为 PDF:
import requests
from lxml import etree
from pdfkit import from_url
import re
import os
# Download every page linked from the index page's <dd><a> entries and merge
# them into one HTML file: a list of anchor links first, then each page under
# a numbered <h1>. The result is meant to be converted to PDF by hand in a
# browser.
r1 = r"//dd/a/text()"  # text of every <dd><a> link; used as the chapter title
r2 = r"//dd/a/@href"  # href of every <dd><a> link; appended to the site root
r3 = r'<div id="arc-body">.*?</div>'  # used only to check the article body exists
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36",
}
timeout = 10  # seconds; without it requests.get may wait forever


def _absolute_img(match):
    """Return the matched <img> tag with a site-relative src made absolute."""
    alt, src = match.group(1), match.group(2)
    # A src that already has a scheme ("http:", "data:", ...) or is
    # protocol-relative ("//host/...") would be corrupted by the prefix.
    if re.match(r'(?:[a-z][a-z0-9+.-]*:|//)', src, flags=re.I):
        return match.group(0)
    return '<img alt="{}" src="http://c.biancheng.net{}">'.format(alt, src)


html = requests.get("http://c.biancheng.net/view/1317.html", headers=headers, timeout=timeout).content
parse_tree = etree.HTML(html)
res1 = parse_tree.xpath(r1)
res2 = parse_tree.xpath(r2)

# The pieces are collected in memory and written once at the end. The original
# appended to scratch files, so leftovers from an interrupted run (or a second
# run) duplicated content, and cleaning them up needed a Unix-only "rm".
index_parts = []
body_parts = []
count_num = 1
for name, url in zip(res1, res2):
    print(name, url)
    html = requests.get("http://c.biancheng.net" + url, headers=headers, timeout=timeout).content.decode("utf-8")
    # A page without the article body is skipped instead of crashing on None.
    if re.search(r3, html, flags=re.S) is None:
        print("skipped (no arc-body found): {}".format(url))
        continue
    # The original read Match.string, which is the whole input, so the full
    # page (not just the matched div) is what gets cleaned and merged.
    res3 = html
    # Strip ads
    res4 = re.sub(r'(<ul id="ad-page-top-left".*?)<div id="arc-body">', "", res3, flags=re.S)
    res5 = re.sub(r'pre-next-page clearfix.*?"1.4.8";', "", res4, flags=re.S)
    # Rewrite image src attributes to absolute URLs
    res6 = re.sub(r'<img alt="(.*?)" src="(.*?)">', _absolute_img, res5, flags=re.S)
    body_parts.append("<h1 id='{}'>{}.{}</h1><br>{}".format(count_num, count_num, name, res6))
    index_parts.append("<a href='#{}'>{}.{}</a><br>".format(count_num, count_num, name))
    count_num += 1

# "w" so a re-run replaces the file instead of appending a second copy.
# UTF-8 is explicit because the platform default encoding may not be able to
# represent the page text.
with open('23种设计模式.html', "w", encoding="utf-8") as f:
    f.write("".join(index_parts))
    f.write("".join(body_parts))