Python---解析各种文件汇总

我姓曹，谢谢

已于 2024-07-17 10:39:46 修改

阅读量1.7k

点赞数 1

文章标签： python xml

于 2022-06-07 10:42:23 首次发布

本文链接：https://blog.csdn.net/weixin_57999977/article/details/125160615

版权

本文详细介绍了如何使用Python的pdfminer3k库处理PDF，minidom和ElementTree库操作XML，以及xlrd/xlwt库处理Excel。包括PDF内容提取、XML节点增删改查和Excel读写技巧。

摘要由CSDN通过智能技术生成

三、XML（第二种ElementTree）

一、PDF

1、安装pdfminer3k

pip install pdfminer3k

读取pdf文件内容

import sys
import importlib
importlib.reload(sys)

from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed

def readPDF(path, toPath):
    """Extract the text of a PDF with pdfminer3k and append it to a text file.

    The text of every horizontal text box on every page is printed to stdout
    and appended to ``toPath``, followed by a newline.

    Args:
        path: Path of the PDF file to read.
        toPath: Path of the text file to append to (created if missing,
            written as UTF-8).

    Raises:
        PDFTextExtractionNotAllowed: If the document does not allow text
            extraction.
    """
    # The parser is built on this file handle, so the PDF must stay open
    # until every page has been processed; closing it earlier risks reads
    # from a closed file.
    with open(path, "rb") as pdf_stream:
        parser = PDFParser(pdf_stream)
        pdfFile = PDFDocument()
        # Parser and document have to know about each other before initialize()
        parser.set_document(pdfFile)
        pdfFile.set_parser(parser)
        pdfFile.initialize()
        if not pdfFile.is_extractable:
            raise PDFTextExtractionNotAllowed

        manager = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(manager, laparams=laparams)
        interpreter = PDFPageInterpreter(manager, device)

        # Open the output once (append mode, as before) instead of once per
        # text box; UTF-8 is explicit so non-ASCII text does not depend on
        # the platform's default encoding.
        with open(toPath, "a", encoding="utf-8") as out:
            for page in pdfFile.get_pages():
                interpreter.process_page(page)
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        text = x.get_text()
                        print(text)
                        out.write(text + "\n")

# Example run: read 123.pdf and append its text to 2.txt
path, toPath = '123.pdf', '2.txt'
readPDF(path, toPath)

2、转码失败，缺少字体对应的包和转码包

WARNING:pdfminer.converter:undefined: <PDFCIDFont: basefont='SimSun', cidcoding='Adobe-GB1'>, 7716

参考链接：Python 3.4 compatibility, and nosetests by goulu · Pull Request #71 · euske/pdfminer · GitHub
https://github.com/euske/pdfminer/pull/71/commits/2103e5875ef04cfaf424b25d2fd0dc9535a90714

进链接下载下面的四个包，放在pdfminer/cmap 文件夹下

3、使用PyMuPDF库

PyMuPDF（导入名为 fitz，安装：pip install PyMuPDF）是一个用于处理PDF文件的Python库，既可以提取PDF中的文本，也可以将PDF文件转换为图像，并进一步将这些图像插入到Word文档中。下面的示例演示的是逐页提取文本；注意它写入的是纯文本内容，并不会生成真正的 .docx 格式文件，如需生成 Word 文档请参考下一节的 pdf2docx。

import fitz

# Input PDF and output file.
# NOTE(review): the output below is written as plain text, so despite the
# ".docx" name it is not a real Word document; use a dedicated converter
# when an actual .docx file is required.
pdf_document = "input.pdf"
output_word = "output.docx"

# Open the PDF as a context manager so it is closed even if extraction
# fails, and join the text of every page in one pass.
with fitz.open(pdf_document) as doc:
    text = "".join(page.get_text() for page in doc)

# Write the extracted text to the output file; UTF-8 is explicit so that
# non-ASCII text does not depend on the platform's default encoding.
with open(output_word, "w", encoding="utf-8") as f:
    f.write(text)

4、pdf2docx

from pdf2docx import parse

# Source PDF and the Word file to produce
pdf_document, output_word = "input.pdf", "output.docx"

# Convert the PDF to Word
parse(pdf_document, output_word)

二、XML（第一种minidom）

1、创建xml文件

# coding:utf-8
# import sys   #python2(防止乱码)
# reload(sys)
# sys.setdefaultencoding('utf8')

# import sys    #python3(防止乱码)
# import imp
# imp.reload(sys)

import os
import xml.dom.minidom
from xml.dom.minidom import parse

# Create an empty document in memory
doc = xml.dom.minidom.Document()

# Create the root element
root = doc.createElement('root')

# Set an attribute on the root element
root.setAttribute('type', 'test这是根目录属性')

# Attach the root element to the document
doc.appendChild(root)

# Create a child element
first = doc.createElement('first')
# Add a comment node inside it
first.appendChild(doc.createComment('desc这是注释'))

# Set an attribute on the child element
first.setAttribute('type', 'attr这是属性')

# Nested child elements, each holding a text node
child = doc.createElement('child')
child.setAttribute('type','这是第一个子节点的属性')
child.appendChild(doc.createTextNode('这是一个子节点'))
child1 = doc.createElement('child1')
child1.appendChild(doc.createTextNode('这是第二个子节点'))

# Attach the nested children to the <first> element
first.appendChild(child)
first.appendChild(child1)
# Attach <first> to the root element
root.appendChild(first)
# Print the XML of the root element
print(root.toxml())

# Write the document to disk. The file is opened as UTF-8 to match the
# encoding passed to writexml, and `with` guarantees it is closed even if
# writing fails.
with open(r'test.xml', 'w', encoding='utf-8') as fp:
    doc.writexml(fp, indent='', addindent='\t', newl='\n', encoding='utf-8')

2、增加 xml文件节点

# Add a new <second> element to test.xml
xml_file = r'test.xml'

# Parse the file and take its root element
domTree = parse(xml_file)
root = domTree.documentElement

# Create the new nodes from the parsed document itself, so they belong to
# the tree they are inserted into (rather than to a separate empty Document).
second = domTree.createElement('second')
# Attribute of the new element
second.setAttribute('type', '新节点属性')
# Child of the new element, holding a text node
child3 = domTree.createElement('child3')
child3.appendChild(domTree.createTextNode('这点新增节点的字节的内容'))

second.appendChild(child3)

first1 = root.getElementsByTagName('first')[0]

# parent.insertBefore(new_node, existing_child) puts new_node before that child
root.insertBefore(second, first1)
# appendChild would instead add the new element as the last child:
# root.appendChild(second)

print(root.toxml())

with open(xml_file, 'w', encoding='utf-8') as fh:
    domTree.writexml(fh, indent='', addindent='\t', newl='', encoding='utf-8')

3、读取xml文件

xml_file = r'test.xml'

# Parse the file; documentElement is the root element
domTree = parse(xml_file)
root = domTree.documentElement

# Show the XML of the root element, then one of its attributes
print(root.toxml())
root_type = root.getAttribute('type')
print('根节点的的type属性为：', root_type)

# All <first> elements below the root
first = root.getElementsByTagName('first')
print(first)

# Attribute of the first match
first_type = first[0].getAttribute('type')
print('节点属性', first_type)

# Text content can be read through either .data or .nodeValue.
# Variant 1: walk down level by level (<first> -> <child> -> text node)
first_element = root.getElementsByTagName('first')[0]
nested_child = first_element.getElementsByTagName('child')[0]
print('节点内容', nested_child.childNodes[0].data)

# Variant 2: search for <child> directly from the root
any_child = root.getElementsByTagName('child')[0]
print('节点内容', any_child.childNodes[0].nodeValue)

4、修改xml文件节点

# Look up the <child> elements
child = root.getElementsByTagName('child')
# Update an attribute of the first match
child[0].setAttribute('type','123')
# Update its text content. The data of a DOM text node is a string, so the
# value is stored as '1234' rather than the int 1234 (the written XML is
# the same).
child[0].childNodes[0].data = '1234'
# Write the modified tree back to the file
with open(xml_file, 'w', encoding='utf-8') as fh:
    domTree.writexml(fh, indent='', addindent='\t', newl='', encoding='utf-8')

5、删除节点

# removeAttribute() deletes the named attribute from an element;
# removeChild() removes the given child node from its parent.
first = root.getElementsByTagName('first')
target = first[0]
target.removeAttribute('type')
root.removeChild(target)

# Write the modified tree back to the file
with open(xml_file, 'w', encoding='utf-8') as fh:
    domTree.writexml(fh, indent='', addindent='\t', newl='', encoding='utf-8')

三、XML（第二种ElementTree）

1、创建xml文件

# coding:utf-8
from xml.etree import ElementTree as ET

# Root element of the document
root = ET.Element('root')

# <datetime_object type="arrte"> is created directly as a child of the root
datetime_object = ET.SubElement(root, 'datetime_object', {'type': 'arrte'})

# <datetime> is created directly as a child of <datetime_object>
datetime = ET.SubElement(
    datetime_object,
    'datetime',
    {'type': 'arrt', 'type1': 'arrt1'},
)

# Wrap the root in an ElementTree so it can be written to a file.
# short_empty_elements=True writes an element without content in the short
# form, e.g. <datetime type='arrt' />
tree = ET.ElementTree(root)
tree.write('test.xml', encoding='utf-8', short_empty_elements=True)

2、增加 xml文件节点

# 1. Parse the XML file; this returns an ElementTree object
tree = ET.parse(r'test.xml')
# 2. Take the root element
root = tree.getroot()

# Values for the new node (example data; replace with real values)
node_id = 1
data = {'comment': 'example comment'}

# Create the new node: ET.Element(tag, attrib={'name': 'value'}).
# The element is kept in a variable so it can actually be appended below.
new_datetime = ET.Element(
    'datetime',
    attrib={'id': str(node_id), 'comment': data['comment']},
)

# Find the parent container; create <datetimes> when the file does not
# have one yet, because find() returns None for a missing element.
datetime_object = root.find('datetime_object')
datetimes = datetime_object.find("datetimes")
if datetimes is None:
    datetimes = ET.SubElement(datetime_object, 'datetimes')

# Append the new node to the container
datetimes.append(new_datetime)
# Write the tree back to the file it was parsed from
tree.write(r'test.xml', encoding='utf-8')

3、读取xml文件

# Walk down level by level to collect every <datetime> node
datetimes_parent = root.find('datetime_object').find("datetimes")
datetime = datetimes_parent.findall("datetime")
for date in datetime:
    # Tag name, text content and attribute dict of each node, in that order
    for label, value in (
        ("tag:", date.tag),
        ("text:", date.text),
        ("attrib:", date.attrib),
    ):
        print(label, value)

4、修改xml文件节点

# Bulk update: rewrite every <datetime> node whose type is 'attr'
new_values = {'type': 'attr', 'type1': 'attr1'}
datetimes = root.find('datetime_object').find("datetimes")
for node in datetimes.findall("datetime"):
    # .get() returns None for a node without a 'type' attribute, so such
    # nodes are skipped instead of raising KeyError.
    if node.get('type') != 'attr':
        continue
    # Only attributes that have a replacement value are rewritten
    for key in list(node.keys()):
        if key in new_values:
            node.set(key, str(new_values[key]))
    # The text is set once per node
    node.text = '1243'
tree.write(r'test.xml')

# Single update: change only the first <datetime> node, then persist it
datetime = datetimes.findall("datetime")
if datetime:
    datetime[0].set('type', '新值')
    datetime[0].text = '新值'
    tree.write(r'test.xml')

5、删除节点

# Take the <datetime_object> element that contains the <datetime> nodes
datetime = root.find('datetime_object')
# Remove every direct <datetime> child whose type matches
for node in datetime.findall("datetime"):
    if node.attrib['type'] != 'atttttttttttt':
        continue
    datetime.remove(node)
# Delete a single attribute from the element itself
del datetime.attrib['type']
tree.write(r'test.xml')

四、Excel

1、读取用xlrd

import xlrd

# Open the workbook.
# NOTE(review): xlrd 2.0 and later only read the legacy .xls format;
# confirm the installed xlrd version can open .xlsx files.
workbook = xlrd.open_workbook(r'data.xlsx')

# Names of all sheets
sheet_name = workbook.sheet_names()

# A sheet can be fetched by index (starting at 0) or by name
sheet = workbook.sheet_by_index(0)
sheet1 = workbook.sheet_by_name(sheet_name[0])

# Value of the cell at (row, column)
value = sheet.cell_value(1, 2)

# Whole row 0 and whole column 0 as lists (indices start at 0)
rows, cols = sheet.row_values(0), sheet.col_values(0)
print(rows, cols)

# Sheet name, number of rows, number of columns
print(sheet.name, sheet.nrows, sheet.ncols)

2、写入用xlwt

import xlwt
import time
import os

# Create a new workbook with one worksheet
workbook = xlwt.Workbook()
worksheet = workbook.add_sheet('Sheet1')

# Column headers; each header is also the attribute name read from the
# objects below, in the same column order.
headers = [
    'point_addr',
    'point_content',
    'calc_type',
    'calc_base',
    'point_unit',
    'parent_addr',
    'bit_start',
    'bit_length',
    'del_flag',
    'device_type',
]

# Row 0 holds the header line
for col, header in enumerate(headers):
    worksheet.write(0, col, header)

# One row per object, starting at row 1; column N holds the attribute
# named by headers[N].
for row, obj in enumerate(_point, start=1):
    for col, header in enumerate(headers):
        worksheet.write(row, col, getattr(obj, header))

# File name is the current Unix timestamp in seconds plus the .xls suffix
file_name = f"{int(time.time())}.xls"
# Save the workbook as an xls file under MEDIA_ROOT
workbook.save(os.path.join(MEDIA_ROOT, file_name))