#PDF2HTML|文档转换#PDF转HTML

最新推荐文章于 2024-08-04 07:05:35 发布

向日葵花籽儿

最新推荐文章于 2024-08-04 07:05:35 发布

阅读量414

点赞数 8

分类专栏：数据分析　文章标签：pdf、html

本文链接：https://blog.csdn.net/weixin_45312236/article/details/135959312

版权

数据分析专栏收录该内容

8 篇文章 0 订阅

订阅专栏

PDF转HTML
废话不多说，直接上代码

import os
from tqdm import tqdm
import html
import fitz

def is_contain_chinese(text):
    """Report whether ``text`` holds at least one Chinese character.

    A character counts as Chinese when it lies in the range
    U+4E00..U+9FFF (inclusive on both ends). An empty ``text`` yields
    ``False``.
    """
    return any('\u4e00' <= ch <= '\u9fff' for ch in text)
def pdf2html(input_path, html_path):
    """Convert a PDF into two HTML files written inside ``html_path``.

    ``html.html`` receives the HTML of every page. ``chinese_html.html``
    receives only the pages whose HTML does not contain the substring
    ``image`` and does contain at least one Chinese character (as decided
    by ``is_contain_chinese``).

    Args:
        input_path: Path of the PDF to open with ``fitz`` (PyMuPDF).
        html_path: Directory the two output files are written to. It is
            created when it does not exist yet.
    """
    # Both output files share the same page skeleton.
    header = (
        '<!DOCTYPE html><html lang="en"><head><meta charset="UTF-8">'
        '<title>Title</title></head>'
        '<body style="display: flex;justify-content: center;'
        'flex-direction: column;background: #0e0e0e;align-items: center;">'
    )
    footer = "</body></html>"

    all_pages = []
    chinese_pages = []

    # The context manager closes the document even if a page fails to render.
    with fitz.open(input_path) as doc:
        print(doc)
        for page in tqdm(doc):
            # Render each page once and reuse the result for both outputs.
            page_html = page.get_text('html')
            all_pages.append(page_html)

            unescaped = html.unescape(page_html)
            # Skip pages whose markup mentions "image" -- presumably pages
            # with embedded pictures; the check is a plain substring test.
            if 'image' in unescaped:
                continue
            if is_contain_chinese(unescaped):
                chinese_pages.append(unescaped)

    print("开始输出html文件", input_path)

    # Make sure the target directory exists before opening files in it.
    # An empty path means "current directory" and needs no creation.
    if html_path:
        os.makedirs(html_path, exist_ok=True)

    # The full document is unescaped once, after assembly, so character
    # references in the page HTML become literal characters.
    html_content = html.unescape(header + "".join(all_pages) + footer)
    chinese_html_content = header + "".join(chinese_pages) + footer

    with open(os.path.join(html_path, "html.html"), 'w', encoding='utf8', newline="") as fp:
        fp.write(html_content)

    with open(os.path.join(html_path, "chinese_html.html"), 'w', encoding='utf8', newline="") as fp:
        fp.write(chinese_html_content)


# NOTE: the original line was a bare ``input_path =`` with no value, which is
# a syntax error. The path below is only a placeholder -- replace it with the
# PDF that should be converted.
input_path = r"\input.pdf"
# Directory that receives html.html and chinese_html.html.
html_path = r"\output"
pdf2html(input_path, html_path)