python pdf转markdown

最新推荐文章于 2024-08-11 01:31:16 发布

吴越南蛮

最新推荐文章于 2024-08-11 01:31:16 发布

阅读量2k

点赞数 1

分类专栏： Python

本文链接：https://blog.csdn.net/weixin_43069875/article/details/111241350

版权

Python 专栏收录该内容

20 篇文章 0 订阅

订阅专栏

该博客介绍了一种方法，通过Python的pdfplumber库读取wxPython in Action这本书的PDF文件，将每一页转换为JSON，并依据字符位置进行归类。之后，通过检查字体判断代码段，并将其转换为Markdown格式。最后，将所有JSON文件合并成一个Markdown文件，方便阅读和检索代码示例。

摘要由CSDN通过智能技术生成

《wxPython in Action》比较系统地介绍了 wxPython 的用法。我一直在看它的中文翻译版 PDF，有时想运行书里的 demo，但从 PDF 中复制代码很费劲，所以写了一个自动化脚本，把全书文本转成 Markdown 格式。

读取pdf文件，将每一页转为json文件。

import pdfplumber
import json

# Source PDF to convert; a relative path, so it is resolved against the
# current working directory.
path = 'wxpython in action.pdf'

def groupByKey(arr, key='y0', yBalance=3.19):
    """Split a list of dicts into runs of consecutive items with close values.

    Items are scanned in order. An item joins the current run when
    ``float(item[key])`` lies within ``yBalance`` (inclusive) of the value of
    the run's *first* item; otherwise it starts a new run.

    Args:
        arr: List of dicts that each contain ``key``.
        key: Name of the numeric field to compare (default ``'y0'``).
        yBalance: Allowed absolute deviation from the run's first value.

    Returns:
        A list of runs (lists of the original dicts) in input order, or an
        empty list when ``arr`` is empty.
    """
    results = []
    current = []
    anchor = None
    for item in arr:
        value = float(item[key])
        # Per the original author's note, y0 is a character's distance from
        # the bottom of the page and characters on one text line can be
        # slightly offset, hence the tolerance; without it a single line
        # would be split into several.
        if current and anchor - yBalance <= value <= anchor + yBalance:
            current.append(item)
        else:
            if current:
                results.append(current)
            current = [item]
            anchor = value
    if current:
        results.append(current)
    return results

def checkIsCode(arr):
    """Tell whether a line of characters is code, judged by font name.

    Args:
        arr: Iterable of character dicts, each with a ``'fontname'`` entry.

    Returns:
        True if at least one character uses one of the two bold-italic Times
        fonts listed below, which the original author identifies as the
        blue italic code font of the PDF; False otherwise.
    """
    codeFonts = ('JJDCVA+Times-BoldItalic', 'DCQJKI+Times-BoldItalic')
    # One matching character is enough: the original note warns that spaces
    # use a different font than the surrounding text.
    return any(ch['fontname'] in codeFonts for ch in arr)

def handleSinglePage(page, index):
    """Convert one PDF page into a JSON file of text lines.

    The page's characters are grouped into lines with ``groupByKey`` on their
    ``'y0'`` value. Each line becomes ``{'text': ..., 'isCode': ...}`` and the
    resulting list is written to ``../jsonFiles/<index>.json``.

    Args:
        page: Object exposing a ``chars`` list of character dicts with
            ``'y0'``, ``'text'`` and ``'fontname'`` entries (a pdfplumber
            page in this script).
        index: Value used as the output file's base name.
    """
    # Turn the flat character list into a list of lines.
    lines = groupByKey(page.chars, 'y0')
    resultList = []
    for lineChars in lines:
        # Replace non-breaking spaces with ordinary spaces.
        text = ''.join(ch['text'] for ch in lineChars).replace('\xa0', ' ')
        resultList.append({'text': text, 'isCode': checkIsCode(lineChars)})
    # A context manager closes (and flushes) the file deterministically
    # instead of leaving the handle to the garbage collector.
    with open('../jsonFiles/' + str(index) + '.json', 'w', encoding='utf-8') as f:
        json.dump(resultList, f, ensure_ascii=False, indent=4)

with pdfplumber.open(path) as pdf:
    # Skip the first 18 pages. Counting starts at 18 so that every JSON file
    # is named after the page's zero-based position in the PDF.
    for pageNo, page in enumerate(pdf.pages[18:], start=18):
        handleSinglePage(page, pageNo)
        print('page {} completed!'.format(pageNo))

生成的 JSON 文件格式如下。
每一个对象表示一行文本：text 是该行的文本内容，isCode 为 true 表示该行是代码。

[
    {
        "text": "Part1 wxPython入门",
        "isCode": false
    },
    {
        "text": "1. 欢迎来到wxPython",
        "isCode": false
    },
    {
        "text": "下面是一个例子，它创建了一个有一个文本框的窗口用来显示鼠标的位",
        "isCode": false
    },
    {
        "text": "置。",
        "isCode": false
    },
    {
        "text": "#!/bin/env python ",
        "isCode": true
    },
    {
        "text": "import wx ",
        "isCode": true
    }
]

处理json文件，合成为一个markdown文件

from os.path import join
import json

# Root directory of the JSON files, one file per page index.
basePath = '../jsonFiles'

def groupByKey(arr, key):
    """Split a list of dicts into runs of consecutive items with equal values.

    Args:
        arr: List of dicts that each contain ``key``.
        key: Name of the field whose value decides the grouping.

    Returns:
        A list of runs (lists of the original dicts) in input order. A new
        run starts whenever ``item[key]`` differs from the value of the
        current run's first item. Returns an empty list when ``arr`` is
        empty.
    """
    results = []
    current = []
    anchor = None
    for item in arr:
        # The emptiness check comes first so the initial ``None`` anchor is
        # never compared with a real value.
        if current and item[key] == anchor:
            current.append(item)
        else:
            if current:
                results.append(current)
            current = [item]
            anchor = item[key]
    if current:
        results.append(current)
    return results

def generateMdText(content):
    """Build the Markdown text for one page.

    Args:
        content: JSON text holding a list of objects with ``'text'`` and
            ``'isCode'`` entries.

    Returns:
        The page as a Markdown string: consecutive code lines rendered via
        ``joinCode``, consecutive text lines via ``joinText``, followed by a
        dashed separator line that marks the end of the page.
    """
    pageLines = json.loads(content)
    pieces = []
    for run in groupByKey(pageLines, 'isCode'):
        # Every entry of a run shares the same flag, so the first one decides
        # how the whole run is rendered.
        render = joinCode if run[0]['isCode'] == True else joinText
        pieces.append(render(run))
    # Markdown separator inserted at the end of every page.
    pieces.append('\n---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n')
    return ''.join(pieces)

def joinText(arr):
    """Concatenate the ``'text'`` of every entry, one entry per line.

    Args:
        arr: Iterable of dicts with a string ``'text'`` entry.

    Returns:
        All texts in order, each followed by a newline; ``''`` for an empty
        input.
    """
    return ''.join(entry['text'] + '\n' for entry in arr)

def joinCode(arr):
    """Render a run of code lines as a fenced Markdown Python block.

    Args:
        arr: Iterable of dicts with a string ``'text'`` entry.

    Returns:
        The lines joined by ``joinText``, with typographic quotes replaced by
        ASCII quotes, wrapped between a ``python`` code-fence opener and a
        closing fence.
    """
    # Typographic quotes are not valid Python string delimiters. All four
    # variants are mapped, including the opening single quote that was
    # previously left untouched.
    quoteFix = str.maketrans({'“': '"', '”': '"', '‘': "'", '’': "'"})
    return '```python\n' + joinText(arr).translate(quoteFix) + '```\n'

# Page indices 18 to 564 are merged; range() excludes its stop value.
# The output is opened once, in write mode, so that re-running the script
# regenerates the Markdown file instead of appending a second copy of every
# page to it.
with open('../md/wxpython in action.md', 'w', encoding='utf-8') as out:
    for i in range(18, 565):
        p = join(basePath, str(i) + '.json')
        with open(p, 'r', encoding='utf-8') as f:
            # Read the page's JSON and append its Markdown rendering.
            out.write(generateMdText(f.read()))