Word表格信息提取汇总至Excel

最新推荐文章于 2024-02-07 07:51:33 发布

编程瞎学者

最新推荐文章于 2024-02-07 07:51:33 发布

阅读量998

点赞数

文章标签： python microsoft Powered by 金山文档

本文链接：https://blog.csdn.net/qaq_39/article/details/129158866

版权

因为本人在办公过程中，接到了收集几十个Word申请表、还需要自己把里面的信息一条条复制粘贴到Excel表格里的离谱工作，所以有了本篇代码。

代码注释非常详细，而且安装好库就能用，所以就少写点啦

需要的库不多：pandas，python-docx，pywin32（pandas写入xlsx文件时还需要openpyxl）。使用pip或者conda可以直接安装。

当然，再好用的东西也得你自己告诉他你要什么是不是？

所以: 你的word在哪儿：files_path

给一张空表参考一下：Ffile

你想让它汇总什么：need_keys

你的逆天命名方式总有一些相似的地方吧：name_keys

它应该会持续更新，我也在代码里面留下了一些我忘记了是做什么用的变量，它们不会影响使用，但最好不要动它们。

# -*-coding:utf-8-*-
import os
import pandas as pd
from docx import Document
from win32com import client as wc


# Root folder passed to read_table as file_path; it is walked recursively for Word files.
files_path  = r'C:\Users\Desktop\code\work'
# Blank copy of the form (format file); its non-empty cell texts become the attribute names.
Ffile = r'C:\Users\Desktop\code\表.docx'
# Attributes to keep in the result, separated by the full-width comma '，'.
need_keys = '姓 名，性别，手 机，备注'
# File-name keyword(s), separated by '，', for read_table's name_keys argument ('' means no filtering).
name_keys = '申请表'

""" 本代码建立在python-docx包的合并单元格
    实际储存格式为子单元格存有相同数值、
    且dict不允许出现相同key的基础上。
    当这一数据结构改变时，本代码需要更新，
    否则将不再具有使用价值。
    暂时只能处理每个word中只有一个表格的情况（只读取第一个表格）。
    所需要素：
            1.未填信息的word空表（格式文件）
            2.需要汇总的信息，以中文逗号分隔
            如'姓名，性别，联系方式'
            3.文件夹所在路径
            4.文件名关键字
    代码原理如下：
    将原本的word空表（即格式文件）读入字典，在字典中建立列表
    遍历列表，填充字典
    删除不需要的信息
    2023.02.22更新，用列表替换原本的字符串，
    解决了如：属性（是否大帅比）值（是）引起的读取错误
"""
class read_table:
    """Collect the values filled into Word forms and write them to one Excel sheet.

    Every Word file is expected to hold the same form as ``format_file``; only
    the first table of each document is read.  The non-empty cell texts of the
    blank format file are taken as attribute names, every other cell text found
    in a filled-in form is treated as a value of a neighbouring attribute.

    The scanner relies on python-docx reporting a merged cell once per grid
    position with identical text.
    """

    def __init__(self, file_path, keys, format_file, name_keys):
        """Store the configuration.

        Args:
            file_path: Root directory that is walked recursively for Word files;
                ``result.xlsx`` is written into it.
            keys: Wanted attributes, joined with the full-width comma '，'.
            format_file: Path of the blank form that defines the attribute names.
            name_keys: File-name keywords joined with '，'; '' accepts every file.
        """
        self.file_path = file_path
        self.keys = keys
        self.format_file = format_file
        self.name_keys = name_keys

    def build_list(self, name_keys):
        """Return the paths of the .docx files to read, converting wanted .doc files.

        May be called on an instance (``obj.build_list(keys)``) or, as older
        callers do, unbound with the directory in place of ``self``
        (``read_table.build_list(path, keys)``).

        A file is wanted when ``name_keys`` is empty or its file name contains
        one of the '，'-separated keywords.  Wanted ``.doc`` files are saved as
        ``.docx`` through Word and the original ``.doc`` is deleted; unwanted
        files are left untouched.  Word is started only if a conversion is
        actually needed and is always shut down again.

        Args:
            name_keys: File-name keywords joined with '，', or '' for no filter.

        Returns:
            List of ``.docx`` paths (``os.path.join(root, name)`` form).
        """
        root_dir = getattr(self, 'file_path', self)  # str/path when called unbound
        keywords = name_keys.split('，') if name_keys else []
        file_list = []
        word = None  # Word COM application, created on first .doc file
        try:
            for root, _dirs, files in os.walk(root_dir):
                for name in files:
                    if name.startswith('~$'):  # Word lock/temporary file
                        continue
                    extension = name.split('.')[-1]
                    if extension not in ('doc', 'docx'):
                        continue
                    if keywords and not any(name.find(key) != -1 for key in keywords):
                        continue
                    path = os.path.join(root, name)
                    if extension == 'doc':
                        if word is None:
                            word = wc.Dispatch("Word.Application")
                        # Word resolves relative paths against its own working
                        # directory, so hand it absolute paths.
                        source = os.path.abspath(path)
                        document = word.Documents.Open(source)
                        try:
                            # 12 is Word's save-format code for .docx
                            document.SaveAs("{}x".format(source), 12)
                        finally:
                            document.Close()
                        os.remove(path)  # only reached when the save succeeded
                        path = "{}x".format(path)
                    file_list.append(path)
        finally:
            if word is not None:
                word.Quit()
        print('*' * 20)
        print('程序初始化完成！')
        print('*' * 20)
        return file_list

    @staticmethod
    def _scan_table(rows, check_str, main_dict):
        """Walk one table and append the values found to ``main_dict``.

        ``check_str`` lists the attribute names; a cell whose text is in it is
        an attribute, any other cell is a value.  The traversal is a small
        state machine driven by ``d`` (downward search) and ``k`` (rightward
        search); the loop conditions are only a safety net, progress is
        controlled by the ``continue``/``break`` statements.
        """
        i = 0  # row index
        j = 0  # column index
        d = 0  # vertical-search control: non-zero while searching downward, used to step i back
        k = 0  # horizontal-search control: 1 while searching to the right
        key = ''  # attribute currently being filled
        while i < len(rows):
            if d == 0:  # restart at the first column unless a downward search is in progress
                j = 0
            while j < len(rows[i].cells):
                if d == 0 and k != 1:  # normal traversal
                    if not rows[i].cells[j].text in check_str or rows[i].cells[j].text == key:
                        j += 1  # values and repeated attribute cells never become the anchor cell
                        continue
                    else:  # the text of this cell becomes the current attribute
                        key = rows[i].cells[j].text
                    if j == len(rows[i].cells) - 1:  # last cell of the row needs special care
                        # NOTE(review): this compares cell objects rather than their
                        # text -- confirm python-docx makes that comparison meaningful.
                        if rows[i].cells[j - 1] == rows[i].cells[j]:
                            break  # same as the previous cell: next row
                        elif not rows[i].cells[j].text in check_str:
                            main_dict[key].append(rows[i].cells[j].text)
                            i += 1  # the cell is a value: record it, next row
                            break
                        else:
                            d = 1  # the cell is a new attribute: search downward
                            continue
                    if rows[i + 1].cells[j].text in check_str:  # an attribute sits below the attribute
                        if key == rows[i + 1].cells[j].text:  # and it is the same one
                            if rows[i].cells[j + 1].text in check_str:
                                j += 1  # the right neighbour is an attribute too: next cell
                                continue
                            elif rows[i].cells[j + 1].text == rows[i + 1].cells[j + 1].text:
                                main_dict[key].append(rows[i].cells[j + 1].text)
                                j += 1  # right and lower-right hold the same value: record, next attribute
                                key = ''
                                continue
                        else:  # a different attribute below
                            if rows[i].cells[j + 1].text in check_str:
                                key = ''  # right neighbour is an attribute: next cell, next attribute
                                j += 1
                                continue
                            else:
                                main_dict[key].append(rows[i].cells[j + 1].text)
                                j += 1  # right neighbour is a value: record, start rightward search
                                k = 1
                                continue
                    else:  # a value sits below the attribute: record, search downward
                        main_dict[key].append(rows[i + 1].cells[j].text)
                        i += 1
                        d += 1
                        continue
                elif k == 1:  # rightward search in progress
                    if rows[i].cells[j].text == rows[i].cells[j + 1].text or (
                            rows[i].cells[j + 1].text in check_str):
                        key = ''  # right neighbour repeats the value or is an attribute:
                        k = 0  # stop the rightward search, move on to the next attribute
                        j += 1
                        continue
                    else:  # record the value and keep searching to the right
                        main_dict[key].append(rows[i].cells[j + 1].text)
                        j += 1
                        continue
                else:  # downward search in progress
                    if not rows[i + 1].cells[j].text in check_str:  # another value below
                        if j != len(rows[i + 1].cells) - 1:  # not the last column
                            if rows[i + 1].cells[j].text != rows[i + 1].cells[j + 1].text:
                                main_dict[key].append(rows[i + 1].cells[j].text)
                                i += 1  # value below differs from the lower-right one: record, go on down
                                d += 1
                                continue
                            else:  # stop searching downward, return to the start row, next cell
                                i = i - d
                                j += 1
                                d = 0
                                continue
                        else:  # last column: nothing to the right, compare with the left neighbour
                            if rows[i + 1].cells[j - 1].text != rows[i + 1].cells[j].text:
                                main_dict[key].append(rows[i + 1].cells[j].text)
                                i += 1
                                d += 1
                                continue
                            else:
                                i = i - d + 1
                                d = 0
                                j += 1
                    else:  # an attribute below the value: stop searching downward
                        if j == len(rows[i].cells) - 1:
                            i = i - d + 1
                        else:
                            i = i - d
                        j += 1
                        d = 0
                        continue
            i += 1  # row finished, go to the next one
            if i == len(rows) - 1:  # the last row has no row below to compare with
                j = 0
                if not rows[i].cells[0].text in check_str or \
                        rows[i].cells[0].text == rows[i - 1].cells[0].text:
                    # row starts with a value or repeats the cell above: stop scanning
                    break
                else:
                    while j < len(rows[i].cells) - 1:
                        if rows[i].cells[j].text in check_str:  # attribute cell
                            if not rows[i].cells[j + 1].text in check_str:  # value to its right: record
                                main_dict[rows[i].cells[j].text].append(rows[i].cells[j + 1].text)
                            j += 1
                        else:
                            j += 1
                    break

    @staticmethod
    def _merge_nested_keys(main_dict, nested_keys):
        """Move values of an attribute into the attribute whose name contains it.

        Every pair of attribute names is compared once.  When one name is a
        substring of the other, the shorter one is remembered in
        ``nested_keys`` and all its non-empty values are appended to the
        longer one; only the '' placeholders stay behind.
        """
        names = list(main_dict)
        for i in range(len(names) - 1):
            for j in range(i + 1, len(names)):
                if names[i].find(names[j]) != -1:
                    parent, child = names[i], names[j]
                elif names[j].find(names[i]) != -1:
                    parent, child = names[j], names[i]
                else:
                    continue
                if child not in nested_keys:
                    nested_keys.append(child)
                # Build new lists instead of removing while iterating, which
                # would skip every second value.
                main_dict[parent].extend(v for v in main_dict[child] if v != '')
                main_dict[child][:] = [v for v in main_dict[child] if v == '']

    @staticmethod
    def _drop_unrequested(main_dict, wanted, nested_keys):
        """Delete from ``main_dict`` every attribute that was not asked for.

        An attribute is requested when its name occurs in ``wanted``.  An
        unrequested attribute is still kept when it contains a requested
        nested key (its values were merged into it); in that case the emptied
        nested key is dropped instead.  Deletions tolerate keys that are
        already gone.
        """
        for table_key in list(main_dict):
            if wanted.find(table_key) != -1:
                continue
            requested_children = [
                child for child in nested_keys
                if child != '' and table_key.find(child) != -1 and wanted.find(child) != -1
            ]
            if requested_children:
                main_dict.pop(requested_children[0], None)
            else:
                main_dict.pop(table_key, None)

    def read(self):
        """Read every selected Word file and write ``result.xlsx`` into ``file_path``.

        Files whose document contains no table are skipped with a message.
        After each file all value lists are padded with '' to the same length
        so that the columns stay rectangular.
        """
        file_list = self.build_list(self.name_keys)
        main_dict = {}  # attribute name -> list of collected values
        for row in Document(self.format_file).tables[0].rows:
            for cell in row.cells:
                if cell.text != '':
                    main_dict[cell.text] = []
        check_str = list(main_dict)  # attribute names used to tell attributes from values
        nested_keys = []  # attribute names contained in another attribute name
        for each_file in file_list:
            print('开始读取文件：')
            print(each_file)
            tables = Document(each_file).tables
            if not tables:
                print('文件中没有表格，已跳过：' + each_file)
                continue
            self._scan_table(tables[0].rows, check_str, main_dict)
            self._merge_nested_keys(main_dict, nested_keys)
            list_length = max((len(values) for values in main_dict.values()), default=0)
            for values in main_dict.values():  # pad the shorter lists with ''
                values.extend([''] * (list_length - len(values)))
        print('*' * 10)
        print('文件读取完毕，开始写入结果！')
        print('*' * 10)
        self._drop_unrequested(main_dict, self.keys, nested_keys)
        df = pd.DataFrame(main_dict)
        result_path = os.path.join(self.file_path, 'result.xlsx')
        df.to_excel(result_path)
        print('结果文件已保存至：' + result_path)


if __name__ == "__main__":
    # Hand over the configured file-name keywords instead of a hard-coded '',
    # so that the name_keys setting at the top of the script takes effect
    # (set name_keys = '' there to read every Word file).
    T = read_table(files_path, need_keys, Ffile, name_keys)
    T.read()

既然你看到这儿了，说明你可能要参考或者要用这个东西，有什么问题或者建议记得告诉我哈，

邮箱在这儿：mrlonely0@163.com

高情商：我的python还有非常大的进步空间

低情商：我啥都不会