#PDF2HTML|文档转换#PDF转HTML

PDF转HTML
废话不多说,直接上代码

import os
from tqdm import tqdm
import html
import fitz

def is_contain_chinese(text):
    """Report whether *text* holds a character in the range U+4E00..U+9FFF.

    Each element of *text* is compared against the two range endpoints;
    the scan stops at the first match. An empty *text* yields ``False``.
    """
    return any('\u4e00' <= symbol <= '\u9fff' for symbol in text)
def pdf2html(input_path, html_path):
    """Convert a PDF into two HTML files written under *html_path*.

    ``html.html`` receives the HTML of every page. ``chinese_html.html``
    receives only the pages whose unescaped HTML does not contain the
    substring ``image`` and for which ``is_contain_chinese`` returns True.

    Args:
        input_path: Path of the PDF, passed to ``fitz.open``.
        html_path: Directory that receives the two output files. It is
            created when it does not exist yet.
    """
    page_header = (
        '<!DOCTYPE html><html lang="en"><head><meta charset="UTF-8">'
        '<title>Title</title></head>'
        '<body style="display: flex;justify-content: center;'
        'flex-direction: column;background: #0e0e0e;align-items: center;">'
    )
    page_footer = "</body></html>"

    # Collect fragments in lists and join once instead of growing two
    # strings with ``+=`` for every page.
    all_pages = []
    chinese_pages = []

    doc = fitz.open(input_path)
    try:
        print(doc)
        for page in tqdm(doc):
            # Extract the page HTML once and reuse it for both outputs.
            raw_page = page.get_text('html')
            all_pages.append(raw_page)

            # NOTE(review): html.unescape also decodes markup entities such
            # as &lt; and &amp;, which may alter the page markup - confirm
            # this is intended. Kept as in the original.
            decoded_page = html.unescape(raw_page)
            if 'image' in decoded_page:
                continue
            if is_contain_chinese(decoded_page):
                chinese_pages.append(decoded_page)
    finally:
        # Release the document even if extraction fails part-way through.
        doc.close()

    print("开始输出html文件", input_path)

    # The complete document is unescaped as one string, exactly as before.
    html_content = html.unescape(
        page_header + "".join(all_pages) + page_footer
    )
    chinese_html_content = page_header + "".join(chinese_pages) + page_footer

    # An empty html_path means "current directory"; makedirs("") would raise.
    if html_path:
        os.makedirs(html_path, exist_ok=True)

    with open(os.path.join(html_path, "html.html"), 'w', encoding='utf8', newline="") as fp:
        fp.write(html_content)

    with open(os.path.join(html_path, "chinese_html.html"), 'w', encoding='utf8', newline="") as fp:
        fp.write(chinese_html_content)


if __name__ == "__main__":
    # The original assignment had no right-hand side (a syntax error), so
    # the value below is only a placeholder.
    input_path = r"input.pdf"  # TODO: set to the PDF that should be converted
    html_path = r"\output"
    pdf2html(input_path, html_path)
  • 8
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值