Python-docx-MjWord-CSDN博客

本文链接：https://blog.csdn.net/2401_82889064/article/details/136048771
Python操作Word文档工具类：
获取Word文档中的段落
获取Word文档中的表格
获取表格中的单元格
向Word文档中的段落里写入数据（使用占位符 ${name}）
向Word文档中的表格里写入数据（使用占位符 ${name}）
向表格中的单元格里写入数据
向Word文档里插入图片
插入表格行、删除表格行、合并单元格
提取Word文档中的表格并保存至Excel文件
Word文档转换成PDF文件
根据模板读取Word文档里的数据
"""
@Title: MjWord工具类
@Time: 2024/2/8
@Author: Michael Jie
"""

import os

import win32com.client as win32
from docx import Document
from docx.shared import Cm
from docx.table import Table, _Cell
from docx.text.paragraph import Paragraph

from office.MjUtil import MjUtil
from office.MjExcel import MjExcel


class MjWord:
    # 构造函数
    def __init__(self,
                 path: str):
        """
        :param path: Word文件路径
        """
        if not os.path.isfile(path):
            raise Exception("非文件路径！")
        post = path.lower().split(".")[-1]
        if post != "docx":
            raise Exception("非docx文件路径！")
        self.path = path  # 文件路径
        self.docx = Document(path)  # 文档对象

    # 获取所有段落
    def get_paragraphs(self,
                       empty: bool = False) -> list[Paragraph]:
        """
        :param empty: 是否剔除空白段落，默认不剔除
        :return: list[Paragraph]
        """
        paragraphs = self.docx.paragraphs
        if not empty:
            return paragraphs
        # 剔除空白段落
        paragraphs_without_none = []
        for paragraph in paragraphs:
            if MjUtil.is_none(paragraph.text):
                continue
            paragraphs_without_none.append(paragraph)
        return paragraphs_without_none

    # 获取特定段落
    def get_paragraph(self,
                      inx: int | str,
                      empty: bool = False) -> Paragraph:
        """
        :param inx: 下标/字符串索引
        :param empty: 是否剔除空白段落，默认不剔除
        :return: Paragraph
        """
        paragraphs = self.get_paragraphs(empty)
        if type(inx).__name__ == "int":
            return paragraphs[inx]
        # 使用字符串索引
        for paragraph in paragraphs:
            if inx in paragraph.text:
                return paragraph
        return paragraphs[0]

    # 获取所有表格
    def get_tables(self) -> list[Table]:
        return self.docx.tables

    # 获取特定表格
    def get_table(self,
                  inx: int) -> Table:
        """
        :param inx: 下标索引
        :return: Table
        """
        return self.get_tables()[inx]

    # 获取单元格
    def get_cell(self,
                 table: Table | int,
                 index: tuple) -> _Cell:
        """
        :param table: 表格/下标索引
        :param index: 单元格索引，(0, 0)
        :return: _Cell
        """
        if type(table).__name__ == "int":
            table = self.get_table(table)
        return table.cell(index[0], index[1])

    # 向段落里写入数据，占位符${name}
    def write_data_to_paragraph(self,
                                ph: Paragraph | int | str,
                                data: dict):
        """
        Fill the ``${name}`` placeholders of one paragraph.

        The paragraph text is passed through ``MjUtil.write_data`` together
        with ``data`` and the result is assigned back to the paragraph.

        :param ph: a paragraph object, or a position / substring that is
            resolved with ``get_paragraph``
        :param data: values for the placeholders
        :return:
        """
        # isinstance also covers int/str subclasses, which a type-name
        # comparison would wrongly treat as a paragraph object.
        if isinstance(ph, (int, str)):
            ph = self.get_paragraph(ph)
        # NOTE(review): python-docx documents that assigning ``text``
        # replaces the paragraph's runs with a single run, so run-level
        # formatting is presumably lost here - confirm this is acceptable.
        ph.text = MjUtil.write_data(ph.text, data)

    # 向表格里写入数据，占位符${name}
    def write_data_to_table(self,
                            table: Table | int,
                            data: dict):
        """
        :param table: 表格/下标索引
        :param data: 数据
        :return:
        """
        if type(table).__name__ == "int":
            table = self.get_table(table)
        rows = table.rows
        for row in rows:
            cells = row.cells
            for cell in cells:
                text = cell.text
                cell.text = MjUtil.write_data(text, data)

    # 向表格里写入多条数据，占位符${name}
    def write_data_list_to_table(self,
                                 table: Table | int,
                                 data_list: list,
                                 inx: int = 1):
        """
        :param table: 表格/下标索引
        :param data_list: 数据
        :param inx: 占位符行索引
        :return:
        """
        if type(table).__name__ == "int":
            table = self.get_table(table)
        # 模板行
        head_row = table.rows[inx]
        head_row_cells = head_row.cells
        # 循环写入数据
        for data in data_list:
            new_cells = table.add_row().cells
            for index in range(len(new_cells)):
                text = head_row_cells[index].text
                new_cells[index].text = MjUtil.write_data(text, data)
        # 删除模板行
        self.remove_table_row(table, inx)

    # 向单元格里写入数据
    def write_data_to_cell(self,
                           table: Table | int,
                           index: tuple,
                           text: int | float | str):
        """
        :param table: 表格/下标索引
        :param index: 单元格索引，(0, 0)
        :param text: 数据
        :return:
        """
        cell = self.get_cell(table, index)
        cell.text = str(text)

    # 向文档里插入图片
    def write_picture_to_paragraph(self,
                                   ph: Paragraph | int | str,
                                   img_path: str,
                                   img_shape: tuple):
        """
        :param ph: 段落/下标/字符串索引
        :param img_path: 图片路径
        :param img_shape: 图片形状
        :return:
        """
        if not os.path.isfile(img_path):
            raise Exception("非文件路径！")
        if type(ph).__name__ == "int" or type(ph).__name__ == "str":
            ph = self.get_paragraph(ph)
        run = ph.add_run()
        run.add_picture(img_path, width=Cm(img_shape[0]), height=Cm(img_shape[1]))

    # 向单元格里插入图片
    def write_picture_to_cell(self,
                              table: Table | int,
                              inx: tuple,
                              img_path: str,
                              img_shape: tuple):
        """
        :param table: 表格/下标索引
        :param inx: 单元格索引
        :param img_path: 图片路径
        :param img_shape: 图片形状
        :return:
        """
        cell = self.get_cell(table, inx)
        ph = cell.paragraphs[0]
        self.write_picture_to_paragraph(ph, img_path, img_shape)

    # 插入表格行
    def add_table_row(self,
                      table: Table | int,
                      inx: int):
        """
        :param table: 表格/下标索引
        :param inx: 行索引
        :return:
        """
        if type(table).__name__ == "int":
            table = self.get_table(table)
        # 在末尾插入新行
        table.add_row()
        rows = table.rows
        # 更改新行位置
        rows[inx]._element.getparent().insert(
            rows[inx]._element.getparent().index(rows[inx]._element),
            rows[-1]._element
        )
        # 删除原索引行
        # rows[-1]._element.getparent().remove(rows[-1]._element)

    # 删除表格行
    def remove_table_row(self,
                         table: Table | int,
                         inx: int):
        """
        :param table: 表格/下标索引
        :param inx: 行索引
        :return:
        """
        if type(table).__name__ == "int":
            table = self.get_table(table)
        # table._tbl中从第三个数据开始才是真实的数据行
        table._tbl.remove(table._tbl[inx + 2])

    # 合并单元格
    def merge_table_cells(self,
                          table: Table | int,
                          inx: tuple):
        """
        :param table: 表格/下标索引
        :param inx: 索引，(0, 0, 1, 1)
        :return:
        """
        start = self.get_cell(table, (inx[0], inx[1]))
        end = self.get_cell(table, (inx[2], inx[3]))
        start.merge(end)

    # 提取Word表格，并保存至Excel
    def write_table_to_excel(self,
                             table: Table | int,
                             in_path: str,
                             out_path: str = None,
                             inx: int | str = None):
        """
        Copy the cell texts of a Word table into an Excel worksheet.

        One worksheet row is appended per table row, holding the text of
        each of that row's cells in order.

        :param table: a table object, or the position of the table in the
            document
        :param in_path: Excel file path, handed to ``MjExcel``
        :param out_path: path the workbook is saved to, handed to
            ``MjExcel.save``
        :param inx: worksheet index or name, handed to
            ``MjExcel.set_cur_sheet``
        :return:
        """
        # isinstance lets int subclasses (e.g. IntEnum members) select a
        # table too; a type-name comparison would reject them.
        if isinstance(table, int):
            table = self.get_table(table)
        excel = MjExcel(in_path)
        sheet = excel.set_cur_sheet(inx)
        for row in table.rows:
            sheet.append([cell.text for cell in row.cells])
        excel.save(out_path)

    # 保存
    def save(self,
             path: str = None):
        """
        :param path: 保存路径
        :return:
        """
        if path is None or path == "":
            self.docx.save(self.path)
        else:
            # 另存为
            self.docx.save(path)

    # Word转换PDF
    @staticmethod
    def convert_to_pdf(in_path: str,
                       out_path: str = None):
        """
        :param in_path: Word文档路径
        :param out_path: PDF路径
        :return:
        """
        # 校验Word文档路径
        in_strs = in_path.lower().split(".")
        if in_strs[-1] != "docx":
            raise Exception("非docx文件路径！")
        # 校验PDF文件路径，若为空则在Word文档同级路径下创建
        if MjUtil.is_none(out_path):
            out_path = in_strs[0] + ".pdf"
        out_post = out_path.lower().split(".")[-1]
        if out_post != "pdf":
            raise Exception("非pdf文件路径！")

        # 创建Word应用程序实例
        word_app = win32.gencache.EnsureDispatch("Word.Application")
        # 设置应用程序可见性
        word_app.Visible = False
        try:
            # 打开Word文档
            docx = word_app.Documents.Open(in_path)
            # 保存为PDF
            docx.SaveAs(out_path, FileFormat=17)
            docx.Close()
        finally:
            # 关闭Word应用程序
            word_app.Quit()

    # 读取段落里的数据
    @staticmethod
    def read_paragraph(temp: Paragraph,
                       text: Paragraph,
                       data: dict = None) -> dict:
        """
        Extract placeholder values from a paragraph using a template.

        The texts of both paragraphs are handed to ``MjUtil.read_data`` and
        its result is returned unchanged.

        :param temp: template paragraph
        :param text: paragraph that holds the actual data
        :param data: initial data, passed through to ``MjUtil.read_data``
        :return: dict
        """
        template_text = temp.text
        actual_text = text.text
        return MjUtil.read_data(template_text, actual_text, data)

    # 读取表格里的数据
    @staticmethod
    def read_table(temp: Table,
                   text: Table,
                   data: dict = None) -> dict:
        """
        :param temp: 模板表格
        :param text: 数据表格
        :param data: 初始数据
        :return: dict
        """
        temp_rows = temp.rows
        for i, temp_row in enumerate(temp_rows):
            temp_cells = temp_row.cells
            for j, temp_cell in enumerate(temp_cells):
                text_cell = text.cell(i, j)
                MjUtil.read_data(temp_cell.text, text_cell.text, data)
        return data

    # 读取表格里的多条数据
    @staticmethod
    def read_table_list(temp: Table,
                        text: Table,
                        inx: int = 1) -> list:
        """
        :param temp: 模板表格
        :param text: 数据表格
        :param inx: 占位符行索引
        :return: list
        """
        temp_row = temp.rows[inx]
        data_list = []
        text_rows = text.rows
        for i in range(inx, len(text_rows)):
            # 一行对应一条数据
            data = {}
            text_cells = text_rows[i].cells
            for j, text_cell in enumerate(text_cells):
                MjUtil.read_data(temp_row.cells[j].text, text_cell.text, data)
            data_list.append(data)
        return data_list