# 双线程_网站上的C++教程转换成 PDF 电子书 (two-thread downloader: website C++ tutorial -> PDF e-book)
# 把网站上的C++教程转换成 PDF 电子书, https://www.runoob.com/python3/python3-tutorial.html
# coding=utf-8
import os,re,time,logging,threading
import pdfkit # 另需 wkhtmltopdf.org 下载安装,再将其执行文件路径加入到系统环境 $PATH 中,并允许其联网
from time import sleep
from bs4 import BeautifulSoup
from selenium.webdriver import Firefox #须将驱动文件放到当前目录,Firefox需geckodriver.exe,Chrome需chromedriver.exe
from selenium.webdriver.firefox.options import Options
foptions = Options()
foptions.add_argument('-headless') #使用无界面浏览器
html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
</head>
<body>
{content}
</body>
</html>
"""
# 1_下面输入网址
url0='https://www.runoob.com/cplusplus/cpp-tutorial.html'
def parse_url_to_html(url, name): # 解析URL,返回HTML内容
try:
browser = Firefox( options=foptions) # executable_path='./geckodriver',
browser.get(url)
sleep(5)
soup = BeautifulSoup(browser.page_source,'html.parser')
# 2_下面输入正文中要截取位置的特征码
body = soup.find_all('div', class_='article-intro') # class_ ,
print('网页上符合特征码的位置有:',len(body),' 个')
html = str(body[0])
# body中的img标签的src相对路径的改成绝对路径
pattern = "(<img .*?src=\")(.*?)(\")"
def func(m):
if not m.group(3).startswith("http"):
# 3_此处分析图片地址结构,完善为准确网址
rtn = m.group(1) + "https:" + m.group(2) + m.group(3)
# rtn = m.group(1) + "https://" + url.split('/')[2] + m.group(2) + m.group(3) #与url主址同用这句
# rtn = "<p></p>" # 若图片不能下载,可用此句取消图片,避免 pdfkit 显示错误
return rtn
else:
return m.group(1)+m.group(2)+m.group(3)
html = re.compile(pattern).sub(func, html)
# 3.1_增加或替换为换行回车代码'<br/>',修改替换部分内容,文本中分号"需加\,其它不用加。
html=html.replace('<h3>实例</h3>', '<p>实例</p>')
html=html.replace('<h2 class="tutheader">实例</h2>', '<p>实例</p>')
html=html.replace('<h2 class="example">实例</h2>', '<p>实例</p>') #
# ss=re.search(r'<h1>C 标准库 - <span class="color_h1"><.*?></span></h1>', html) # 用正则表达式搜索,配合下句进行替换,提升开头h4为一级目录
# if ss: html = html.replace(ss.group(), ss.group().replace('h1','h2'))
html=html.replace('https:https:', 'https:')
html = html.replace('https:/wp-content', 'https://www.runoob.com/wp-content')
# 3.2_修正示例排版格式错误
pattern_1 = r"(<span class=\"hl-code\">)(.*?)(</span>)"
def func_1(m_1):
x=m_1.group(2)
if '\n' in x:
x=x.replace(' ',' ')
x=x.replace('\n','<br/>')
thwz=m_1.group(1) + x + m_1.group(3)
else:
thwz=x
return thwz
html = re.compile(pattern_1,re.DOTALL).sub(func_1, html)
html = html_template.format(content=html)
html = html.encode("utf-8")
browser.close()
with open(name, 'wb') as f:
f.write(html)
return name
except:
logging.error("解析错误", exc_info=True)
def get_url_list(): # 获取所有URL目录列表
browser = Firefox( options=foptions) # executable_path='./geckodriver',
browser.get(url0)
sleep(3)
soup = BeautifulSoup(browser.page_source,'html.parser')
# 4_下面输入目录区域位置的特征码
menu_tag = soup.find_all('div', class_='design')[0] # ,class_="_2rhmJa"
urls = [] # 此处应检查首页是否重复,若重复,改为 urls = []
# 5_下面输入各子目录位置的特征码
for li in menu_tag.find_all('a'):
href0=li.get('href')
if href0[0]=='/':
url = "https://www.runoob.com" + href0
else:
url = "https://www.runoob.com/python3/" + li.get('href')
urls.append(url)
print('目录地址清单:',urls,'地址数量:',len(urls))
browser.close()
return urls
def save_pdf(htmls, file_name): # 把所有html文件保存到pdf文件
options = {
'page-size': 'A4',
'margin-top': '0.5in',
'margin-right': '0.5in',
'margin-bottom': '0.5in',
'margin-left': '0.7in',
'encoding': "UTF-8",
'custom-header': [('Accept-Encoding', 'gzip')],
'cookie': [('cookie-name1', 'cookie-value1'),('cookie-name2','cookie-value2'),],
'outline-depth': 10,
}
pdfkit.from_file(htmls, file_name, options=options)
def dxchxz(urls0,startno):
# print(urls0,startno) # 各线程内部数据相对其它线程是独立的
[parse_url_to_html(url, str(startno+index) + ".html") for index, url in enumerate(urls0)]
def main():
start = time.time()
urls = get_url_list()
# 6_下面输入要保存的pdf文件名
file_name = u"C++教程runoob_OK.pdf"
# 7_输入调试码,0 为调试检查网页地址是否正确,1 为正常完整运行
tsm = 1
if tsm==1 :
# 先删除原有html
htmls=[x for x in os.listdir('.') if os.path.isfile(x) and re.match(r'\d+\.html',os.path.basename(x))]
for html in htmls:
os.remove(html)
#建立双线程下载
t1 = threading.Thread(target=dxchxz, args=(urls[:int(len(urls)/2)],0))
sleep(1.5)
t2 = threading.Thread(target=dxchxz, args=(urls[int(len(urls)/2):],int(len(urls)/2)))
t1.start()
t2.start()
t1.join()
t2.join()
# 转pdf并删除html
htmls=[x for x in os.listdir('.') if os.path.isfile(x) and re.match(r'\d+\.html',os.path.basename(x))]
htmls = [str(x)+".html" for x in range(len(htmls))]
save_pdf(htmls, file_name)
for html in htmls:
os.remove(html)
pass
total_time = time.time() - start
print(u"总共耗时:%f 秒" % total_time)
if __name__ == '__main__':
main()