Python转换md文件至pdf

最新推荐文章于 2024-06-25 09:33:11 发布

尘落雨冷

最新推荐文章于 2024-06-25 09:33:11 发布

阅读量1.3k

点赞数 2

文章标签： python

本文链接：https://blog.csdn.net/qq1787991631/article/details/129878598

版权

基础的转换

转换原理：

1、使用markdown库将md转换为html

2、使用pdfkit将html转换为pdf（需要单独安装wkhtmltopdf）

实现代码

# pip3 install markdown
# pip3 install pdfkit

import codecs
import markdown
import pdfkit

# Load the Markdown source as UTF-8 text.
with codecs.open("test.md", "r", encoding="utf-8") as md_file:
    md_content = md_file.read()

# Render the Markdown to an HTML fragment (no extensions enabled here).
html_content = markdown.markdown(md_content)

# A charset <meta> tag goes ahead of the fragment so that Chinese text is not
# garbled in the intermediate HTML file.
charset_meta = '<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>'
with codecs.open("test.html", "w", encoding="utf-8") as html_file:
    html_file.write(charset_meta + html_content)

# Convert the intermediate HTML file to PDF (pdfkit relies on wkhtmltopdf).
pdfkit.from_file("test.html", "test.pdf")

存在问题

以上代码生成的pdf不支持代码高亮、表格等一些特殊格式的显示，可以通过相关渲染库和扩展插件解决

md特有格式不支持解决

引入如下python库

python-markdown-math
markdown_checklist
pymdown-extensions

from pymdownx import superfences
import markdown

# Extensions that ship with Python-Markdown.
builtin_extensions = [
    'toc',  # table of contents (`[TOC]` marker by default)
    # abbreviations, attribute lists, definition lists, fenced code blocks,
    # footnotes, Markdown inside HTML, tables
    'extra',
]

# Extensions provided by python-markdown-math, markdown_checklist and
# pymdown-extensions.
third_party_extensions = [
    'mdx_math',  # math formulas for KaTeX: $E=mc^2$ and $$E=mc^2$$
    'markdown_checklist.extension',  # checklists: - [ ] and - [x]
    'pymdownx.magiclink',  # turn bare URLs into hyperlinks automatically
    'pymdownx.caret',  # caret-based markup such as superscript
    'pymdownx.superfences',  # nestable fenced blocks, custom fences for diagrams
    'pymdownx.betterem',  # improved emphasis handling (bold and italic)
    'pymdownx.mark',  # highlighted ("marked") text
    'pymdownx.highlight',  # syntax highlighting for code
    'pymdownx.tasklist',  # task lists
    'pymdownx.tilde',  # strikethrough
]

# Full extension list: built-in ones first, then the third-party ones.
extensions = builtin_extensions + third_party_extensions

# Custom fence that emits ```mermaid blocks as <div class="mermaid"> so that
# flowcharts and similar diagrams can be rendered.
mermaid_fence = {
    'name': 'mermaid',
    'class': 'mermaid',
    'format': superfences.fence_div_format,
}

# Per-extension options, keyed by extension name.
extension_configs = {
    # Allow a single `$` as an inline math delimiter.
    'mdx_math': {'enable_dollar_delimiter': True},
    'pymdownx.superfences': {"custom_fences": [mermaid_fence]},
    'pymdownx.highlight': {
        'linenums': True,  # show line numbers
        'linenums_style': 'pymdownx-inline',  # keep line numbers separate from the code
    },
    # Render task-list checkboxes as clickable.
    'pymdownx.tasklist': {'clickable_checkbox': True},
}

# This is the line that changes compared with the basic conversion: the
# extension list and its configuration are now passed to the converter.
html_content = markdown.markdown(
    md_content,
    extensions=extensions,
    extension_configs=extension_configs,
)

css渲染解决

经过扩展插件处理后，生成的html元素会带有class属性，但转换得到的html和pdf本身并不包含这些class对应的样式，需要手动引入CSS进行渲染。

需要安装pygments，然后生成相应的样式

pip3 install pygments
# If you need the code-highlighting styles, run the following command.
# It writes the "default" Pygments style as CSS rules scoped to the
# .highlight selector into pygments.css.
pygmentize -f html -a .highlight -S default > pygments.css

# Charset declaration placed inside <head> alongside the stylesheet.
charset_meta = '<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>'

with codecs.open("test.html", "w", encoding="utf-8") as f, open("pygments.css", "r") as g:
    # Inline the contents of the CSS file in a <style> element of the HTML head.
    head_lines = ["<head>", "<style>", g.read(), "</style>", charset_meta, "</head>", ""]
    f.write("\n".join(head_lines))
    # The rendered Markdown body follows the head.
    f.write(html_content)

图片大小导致pdf文字异常

当源文件里面有图片时，生成的pdf版面会随着图片尺寸变化，特别难看，可以通过修改html中图片的样式参数实现图片自动伸缩

import codecs
from bs4 import BeautifulSoup

# Post-process the generated HTML so that images scale to the page width.
with codecs.open("test.html", "r", encoding="utf-8") as f:
    # Parse the file that was actually written to disk (it contains the <head>
    # with the inlined stylesheet and charset tag). Parsing the bare
    # `html_content` fragment instead would drop that head from the output.
    soup = BeautifulSoup(f, features="lxml")
    image_content = soup.find_all("img")
    for i in image_content:
        # Cap each image at the container width; any existing inline style on
        # the tag is replaced.
        i["style"] = "max-width:100%; overflow:hidden;"
    with codecs.open("test_final.html", "w", encoding="utf-8") as g:
        g.write(soup.prettify())

完整流程

import codecs
import markdown
import pdfkit
from pymdownx import superfences
from bs4 import BeautifulSoup

# Extensions that ship with Python-Markdown.
builtin_extensions = [
    'toc',  # table of contents (`[TOC]` marker by default)
    # abbreviations, attribute lists, definition lists, fenced code blocks,
    # footnotes, Markdown inside HTML, tables
    'extra',
]

# Extensions provided by python-markdown-math, markdown_checklist and
# pymdown-extensions.
third_party_extensions = [
    'mdx_math',  # math formulas for KaTeX: $E=mc^2$ and $$E=mc^2$$
    'markdown_checklist.extension',  # checklists: - [ ] and - [x]
    'pymdownx.magiclink',  # turn bare URLs into hyperlinks automatically
    'pymdownx.caret',  # caret-based markup such as superscript
    'pymdownx.superfences',  # nestable fenced blocks, custom fences for diagrams
    'pymdownx.betterem',  # improved emphasis handling (bold and italic)
    'pymdownx.mark',  # highlighted ("marked") text
    'pymdownx.highlight',  # syntax highlighting for code
    'pymdownx.tasklist',  # task lists
    'pymdownx.tilde',  # strikethrough
]

# Full extension list: built-in ones first, then the third-party ones.
extensions = builtin_extensions + third_party_extensions

# Custom fence that emits ```mermaid blocks as <div class="mermaid"> so that
# flowcharts and similar diagrams can be rendered.
mermaid_fence = {
    'name': 'mermaid',
    'class': 'mermaid',
    'format': superfences.fence_div_format,
}

# Per-extension options, keyed by extension name.
extension_configs = {
    # Allow a single `$` as an inline math delimiter.
    'mdx_math': {'enable_dollar_delimiter': True},
    'pymdownx.superfences': {"custom_fences": [mermaid_fence]},
    'pymdownx.highlight': {
        'linenums': True,  # show line numbers
        'linenums_style': 'pymdownx-inline',  # keep line numbers separate from the code
    },
    # Render task-list checkboxes as clickable.
    'pymdownx.tasklist': {'clickable_checkbox': True},
}
# Load the Markdown source as UTF-8 text.
with codecs.open("test.md", "r", encoding="utf-8") as md_file:
    md_content = md_file.read()

# Render the Markdown with the configured extensions.
html_content = markdown.markdown(
    md_content,
    extensions=extensions,
    extension_configs=extension_configs,
)

# Charset declaration that keeps Chinese text from being garbled.
charset_meta = '<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>'

with codecs.open("test.html", "w", encoding="utf-8") as html_file, \
        open("pygments.css", "r") as css_file:
    # The head carries the inlined stylesheet followed by the charset tag.
    head_lines = ["<head>", "<style>", css_file.read(), "</style>", charset_meta, "</head>", ""]
    html_file.write("\n".join(head_lines))
    html_file.write(html_content)

# Post-process the generated HTML so that images scale to the page width.
with codecs.open("test.html", "r", encoding="utf-8") as html_in:
    soup = BeautifulSoup(html_in, features="lxml")

# Give every <img> an inline style that caps it at the container width.
for img_tag in soup.find_all("img"):
    img_tag["style"] = "max-width:100%; overflow:hidden;"

# Save the adjusted document under a separate name.
with codecs.open("test_final.html", "w", encoding="utf-8") as html_out:
    html_out.write(soup.prettify())

# Produce the PDF from the post-processed HTML.
pdfkit.from_file("test_final.html", "test.pdf")