Python脚本：快速将 Spring 在线官方文档格式化为 Markdown 格式文档

背风衣人

已于 2022-11-01 17:56:51 修改

阅读量372

点赞数

分类专栏： Python 文章标签： python 爬虫

于 2022-11-01 17:50:00 首次发布

本文链接：https://blog.csdn.net/lijiewen2017/article/details/127638140

版权

Python 专栏收录该内容

1 篇文章 0 订阅

订阅专栏

章节目录

脚本涉及知识点
脚本实验文档地址
python 脚本代码
效果展示
- 原文档
- 对比格式化后的 markdown 格式文档

脚本涉及知识点

HTTP 请求
Python XPath 解析 HTML 树
元组
本地文件打开和写入
字符串拼接

脚本实验文档地址

Spring for Apache Kafka

python 脚本代码

# Spring 文档 markdown 格式化脚本
from urllib.request import urlopen
from lxml import etree

# Heading tags: 'h1' through 'h10'
h_tags = tuple("h" + str(level) for level in range(1, 11))
# Tags rendered as code (inline or fenced)
code_tags = ("code",)
# Tags rendered as a paragraph break
newline_tags = ("br", "div")
# Element ids that mark the preamble section
preamble_ids = ("preamble",)
# Hyperlink tags
link_tags = ("a",)
# List item tags
list_tags = ("li",)


def writ_element(element, store):
    """Append the Markdown rendering of ``element`` to ``store`` and return it.

    The element's subtree is walked recursively. Depending on its tag, ``id``
    or ``class`` attribute, a Markdown prefix is emitted before the element's
    children and a matching suffix after them.

    Args:
        element: The lxml element to convert.
        store: Markdown text accumulated so far.

    Returns:
        ``store`` followed by the Markdown produced for ``element``.

    Raises:
        TypeError: If a child node is neither an element nor a text node.
    """
    # Fragments are collected in a list and joined once; concatenating onto
    # ``store`` at every node re-copies the whole output each time.
    parts = [store]
    _append_markdown(element, parts)
    return "".join(parts)


def _append_markdown(element, parts):
    """Append the Markdown fragments for ``element`` and its subtree to ``parts``."""
    # Suffix emitted after all children of this element have been written.
    tail = ''
    if isinstance(element, etree._Element):
        if element.tag in h_tags:
            # Heading: 'hN' becomes N '#' characters followed by a space.
            parts.append('#' * int(element.tag[1:]) + ' ')
            tail = '\n'
        elif element.tag in code_tags:
            data_lang = element.attrib.get('data-lang')
            if data_lang:
                # A data-lang attribute marks a fenced code block.
                parts.append('```' + data_lang + '\n')
                tail = '\n```'
            else:
                # Otherwise treat it as inline code.
                parts.append('`')
                tail = '`'
        elif element.attrib.get('id') in preamble_ids:
            # The preamble container gets its own heading.
            parts.append('## 序\n\n')
        elif element.attrib.get('class') == 'details':
            # Surround 'details' containers with line breaks.
            parts.append('\n')
            tail = '\n'
        elif element.tag in link_tags:
            link_url = element.attrib.get('href')
            # Links without an href and 'anchor' links are written as plain content.
            if link_url and element.attrib.get('class') != 'anchor':
                parts.append('[')
                tail = '](' + link_url + ')'
        elif element.tag in list_tags:
            # List item.
            parts.append('- ')
            tail = '\n'
        elif element.tag in newline_tags:
            # Newline tags are checked last: with a higher priority they would
            # pre-empt the more specific id/class handling above.
            parts.append('\n\n')

    for sub in element.xpath("node()"):
        if isinstance(sub, etree._Element):
            # Recurse into child elements.
            _append_markdown(sub, parts)
        elif isinstance(sub, etree._ElementUnicodeResult):
            text = str(sub)
            # Skip text nodes that consist of exactly one newline.
            if text != "\n":
                parts.append(text)
        else:
            # Report the offending child node (not its parent) so the message
            # identifies what could not be converted.
            raise TypeError("未知元素：" + str(type(sub)))
    parts.append(tail)


# Download and convert the page before touching the output file, so that a
# failed request or a missing page section does not leave an empty or
# truncated Markdown file behind. The response is closed by the with block.
with urlopen("https://docs.spring.io/spring-kafka/docs/2.8.10/reference/html/") as html_data:
    html_ele = etree.HTML(html_data.read())

# Title and document description
header_ele = html_ele.xpath("/html/body/div[@id='header']")[0]
header_md = writ_element(header_ele, "")

# Main content
content_ele = html_ele.xpath("/html/body/div[@id='content']")[0]
content_md = writ_element(content_ele, "")

# Write the combined text in one call. encoding= must be given explicitly:
# without it the write failed with "UnicodeEncodeError: 'gbk' codec can't
# encode character '\xa9'". The with block closes the file on exit.
with open("../temp/Spring for Apache Kafka.md", 'w', encoding="utf-8") as md_file:
    md_file.write(header_md + content_md)