脚本涉及知识点
- HTTP 请求
- 使用 Python 的 XPath 解析 HTML 树
- 元组
- 本地文件打开和写入
- 字符串拼接
脚本实验文档地址
Spring for Apache Kafka:https://docs.spring.io/spring-kafka/docs/2.8.10/reference/html/
python 脚本代码
from urllib.request import urlopen
from lxml import etree
# Lookup tuples consulted by the HTML-to-Markdown converter below.

# Heading tags 'h1' .. 'h10'; the digits after the 'h' give the number of
# '#' characters emitted for the Markdown heading.
h_tags = tuple("h" + str(level) for level in range(1, 11))

# Tags rendered as code (fenced when a 'data-lang' attribute is present,
# inline otherwise).
code_tags = ("code",)

# Tags that are preceded by a blank line in the output.
newline_tags = ("br", "div")

# Element ids that introduce the preamble section.
preamble_ids = ("preamble",)

# Tags rendered as Markdown links.
link_tags = ("a",)

# Tags rendered as Markdown list items.
list_tags = ("li",)
def _append_markdown(element, parts):
    """Append the Markdown fragments for ``element`` and its subtree to ``parts``.

    ``parts`` is a list of strings that is mutated in place; the caller joins
    it once at the end, which avoids re-copying the growing document for
    every fragment.
    """
    tag = element.tag
    attrib = element.attrib
    tail = ''

    # The order of these checks matters: e.g. a <div id="preamble"> must hit
    # the preamble branch before the generic newline-tag branch.
    if tag in h_tags:
        # 'h3' -> '### '
        parts.append('#' * int(tag[1:]) + ' ')
        tail = '\n'
    elif tag in code_tags:
        data_lang = attrib.get('data-lang')
        if data_lang:
            # Code with a language becomes a fenced block.
            parts.append('```' + data_lang + '\n')
            tail = '\n```'
        else:
            # Code without a language becomes inline code.
            parts.append('`')
            tail = '`'
    elif attrib.get('id') in preamble_ids:
        parts.append('## 序\n\n')
    elif attrib.get('class') == 'details':
        parts.append('\n')
        tail = '\n'
    elif tag in link_tags:
        link_url = attrib.get('href')
        # Links without a target, and 'anchor' links, contribute only their
        # children (no Markdown link syntax).
        if link_url and attrib.get('class') != 'anchor':
            parts.append('[')
            tail = '](' + link_url + ')'
    elif tag in list_tags:
        parts.append('- ')
        tail = '\n'
    elif tag in newline_tags:
        parts.append('\n\n')

    # node() yields child elements and text nodes in document order.
    for sub in element.xpath('node()'):
        if isinstance(sub, etree._Element):
            _append_markdown(sub, parts)
        elif isinstance(sub, etree._ElementUnicodeResult):
            text = str(sub)
            # Text nodes consisting of exactly one newline are dropped.
            if text != '\n':
                parts.append(text)

    parts.append(tail)


def writ_element(element, store):
    """Render an lxml element subtree as Markdown and append it to ``store``.

    Args:
        element: The ``etree._Element`` to convert; its children are
            converted recursively.
        store: The Markdown text accumulated so far.

    Returns:
        ``store`` followed by the Markdown generated for ``element``.

    Raises:
        Exception: If ``element`` is not an ``etree._Element``.
    """
    if not isinstance(element, etree._Element):
        raise Exception("位置元素:" + str(type(element)))
    parts = []
    _append_markdown(element, parts)
    return store + ''.join(parts)
# Download and parse the reference page first, so that a failed request or
# an unexpected page layout cannot leave a truncated output file behind.
# The ``with`` statement also closes the HTTP response.
with urlopen("https://docs.spring.io/spring-kafka/docs/2.8.10/reference/html/") as html_data:
    html_ele = etree.HTML(html_data.read())

# Convert the page header and the main content separately, then concatenate.
header_ele = html_ele.xpath("/html/body/div[@id='header']")[0]
header_md = writ_element(header_ele, "")
content_ele = html_ele.xpath("/html/body/div[@id='content']")[0]
content_md = writ_element(content_ele, "")

# The context manager closes the file; no explicit close() is needed.
with open("../temp/Spring for Apache Kafka.md", 'w', encoding="utf-8") as md_file:
    md_file.write(header_md + content_md)
效果展示
原文档
对比格式化后的 markdown 格式文档