docx转xlsx

最新推荐文章于 2024-08-15 01:55:01 发布

weixin_38871988

最新推荐文章于 2024-08-15 01:55:01 发布

阅读量351

点赞数

文章标签： python pandas 开发语言

本文链接：https://blog.csdn.net/weixin_38871988/article/details/128241629

版权

from docx import Document
from openpyxl import Workbook

def docx2xlsx(path):
    """Copy every table of a .docx file into its own sheet of a new .xlsx file.

    Each table in the document becomes a worksheet named ``sheet1``,
    ``sheet2``, ... and each table row becomes one worksheet row holding
    the text of its cells. The workbook is written next to the input file
    as ``<input name without extension>_out1.xlsx``.

    Args:
        path: Filesystem path of the .docx document to read.

    Returns:
        None. The result is written to disk.
    """
    import os

    document = Document(path)
    wb = Workbook()
    tables = document.tables
    if tables:
        # Drop the default sheet only when real sheets will replace it;
        # a workbook without any sheet cannot be saved.
        wb.remove(wb.worksheets[0])
    for index, table in enumerate(tables, start=1):
        ws = wb.create_sheet('sheet{}'.format(index))
        for row in table.rows:
            ws.append([cell.text for cell in row.cells])
    # splitext strips whatever extension is present instead of assuming
    # that the last five characters are ".docx".
    wb.save(os.path.splitext(path)[0] + '_out1.xlsx')

if __name__ == "__main__":
    # When run as a script, convert the sample document.
    docx2xlsx(path="./data/test_data.docx")

# First install the dependency: pip install python-docx
# If the source file is in .doc format, convert it to .docx first
from docx import Document
import pandas as pd
import re
path = "./data/test_data.docx"
docx = Document(path)
table_s = docx.tables  # list of Table objects found in the document


def _first_group(pattern, text):
    """Return the stripped first capture group of pattern in text, or '' if absent."""
    match = re.search(pattern, text)
    return match.group(1).strip() if match else ""


list_ = []  # one dict per table, collected for the DataFrame below
for table in table_s:
    dict_ = {}
    # Table indices start at (row 0, column 0).
    # Rows 0-4 each hold two label/value pairs: columns (0, 1) and (2, 3).
    for i in range(0, 5):
        dict_[table.cell(i, 0).text] = table.cell(i, 1).text
        dict_[table.cell(i, 2).text] = table.cell(i, 3).text

    # Row 5: per the original author's note, the address spans the whole row,
    # so it contributes a single label/value pair.
    dict_[table.cell(5, 0).text] = table.cell(5, 1).text

    # Row 6: the first five characters of cell (6, 0) are used as the label
    # and the remainder as the complaint content.
    complaint_text = table.cell(6, 0).text
    dict_[complaint_text[:5]] = complaint_text[5:]

    # The recorder name and the record time are extracted from cell (6, 1).
    # A missing line yields '' instead of crashing on a None match.
    record_text = table.cell(6, 1).text
    dict_["记录人"] = _first_group(r"记 录 人：(.*)", record_text)
    dict_["记录时间"] = _first_group(r"记录时间：(.*)", record_text)

    # Rows 7-11: each cell is "<label>：\n<content>". partition keeps the
    # entire content even if the separator occurs again, and gives an empty
    # content (rather than an IndexError) when the separator is missing.
    for i in range(7, 12):
        label, _sep, content = table.cell(i, 0).text.partition('：\n')
        dict_[label] = content
    list_.append(dict_)
print(list_)

# Convert the collected rows to a DataFrame and save it as .xlsx.
df = pd.DataFrame(list_)
df.to_excel(path[:-5] + '_out2.xlsx', index=True, merge_cells=True)

doc转docx
可能会报错：该方法依赖 Windows 系统和已安装的 Microsoft Word（通过 pywin32 调用）；传入相对路径时 Word 可能找不到文件，建议使用绝对路径。

from win32com import client
# Convert a .doc file to .docx
def doc2docx(fn):
    """Convert a .doc file to .docx by driving Word through COM automation.

    The converted file is saved next to the source under the same name with
    an ``x`` appended (``report.doc`` -> ``report.docx``).

    Args:
        fn: Path of the .doc file to convert. Relative paths are resolved
            against the current working directory of this Python process.

    Returns:
        None.
    """
    import os

    # Word does not resolve relative paths against Python's working
    # directory, so always hand it an absolute path.
    src = os.path.abspath(fn)
    # NOTE: client.DispatchEx("Word.Application") could be used instead to
    # start an independent Word process.
    word = client.Dispatch("Word.Application")  # start the Word application
    try:
        doc = word.Documents.Open(src)  # open the source document
        try:
            # File format 12 (or 16) saves the document as .docx.
            doc.SaveAs("{}x".format(src), 12)
        finally:
            doc.Close()  # close the document even if saving failed
    finally:
        # Always quit Word so a failure does not leave the process running.
        word.Quit()
    return
# Example invocation: convert the original .doc sample to .docx.
path = './test_data_ori.doc'
doc2docx(fn=path)