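# Utility class for working with Word documents in Python:
# - Get the paragraphs of a Word document
# - Get the tables of a Word document
# - Get a cell of a table
# - Write data into the paragraphs of a Word document, placeholder ${name}
# - Write data into the tables of a Word document, placeholder ${name}
# - Write data into a table cell
# - Insert pictures into a Word document
# - Insert table rows, delete table rows, merge cells
# - Extract a table from a Word document and save it to an Excel file
# - Convert a Word document to a PDF file
# - Read the data of a Word document according to a template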
"""
@Title: MjWord工具类
@Time: 2024/2/8
@Author: Michael Jie
"""
import os
import win32com.client as win32
from docx import Document
from docx.shared import Cm
from docx.table import Table, _Cell
from docx.text.paragraph import Paragraph
from office.MjUtil import MjUtil
from office.MjExcel import MjExcel
class MjWord:
    """Helper around a python-docx ``Document`` for common Word tasks.

    Covers paragraph / table / cell lookup, filling ``${name}`` placeholders,
    picture insertion, inserting / removing table rows, merging cells,
    exporting a table to Excel, converting a document to PDF through Word COM
    automation, and reading data back by comparing a document with a template.
    """

    # Value passed as ``FileFormat`` to Word's ``SaveAs`` to write a PDF.
    _WD_FORMAT_PDF = 17

    def __init__(self,
                 path: str):
        """Open an existing ``.docx`` file.

        :param path: path of the Word file
        :raises FileNotFoundError: if ``path`` is not an existing file
        :raises ValueError: if ``path`` does not have a ``.docx`` extension
        """
        if not os.path.isfile(path):
            raise FileNotFoundError("非文件路径!")
        # splitext only looks at the final component, so a file that is
        # literally named "docx" (no extension) is rejected as well.
        if os.path.splitext(path)[1].lower() != ".docx":
            raise ValueError("非docx文件路径!")
        self.path = path  # file path, default target of save()
        self.docx = Document(path)  # document object

    def _resolve_table(self,
                       table: Table | int) -> Table:
        """Return ``table`` itself, or the table at that index for an int."""
        if isinstance(table, int):
            return self.get_table(table)
        return table

    def _resolve_paragraph(self,
                           ph: Paragraph | int | str) -> Paragraph:
        """Return ``ph`` itself, or look it up when an int / str is given."""
        if isinstance(ph, (int, str)):
            return self.get_paragraph(ph)
        return ph

    def get_paragraphs(self,
                       empty: bool = False) -> list[Paragraph]:
        """Return the paragraphs of the document.

        :param empty: when True, paragraphs whose text ``MjUtil.is_none``
            reports as empty are left out; defaults to False (keep all)
        :return: list[Paragraph]
        """
        paragraphs = self.docx.paragraphs
        if not empty:
            return paragraphs
        return [p for p in paragraphs if not MjUtil.is_none(p.text)]

    def get_paragraph(self,
                      inx: int | str,
                      empty: bool = False) -> Paragraph:
        """Return one paragraph, selected by position or by contained text.

        :param inx: list index, or a string that the paragraph text contains
        :param empty: whether blank paragraphs are dropped before the lookup
        :return: Paragraph
        :raises IndexError: if the index is out of range, or the document has
            no paragraph at all
        """
        paragraphs = self.get_paragraphs(empty)
        if isinstance(inx, int):
            return paragraphs[inx]
        for paragraph in paragraphs:
            if inx in paragraph.text:
                return paragraph
        # No paragraph contains the text: the first paragraph is the fallback.
        return paragraphs[0]

    def get_tables(self) -> list[Table]:
        """Return all tables of the document."""
        return self.docx.tables

    def get_table(self,
                  inx: int) -> Table:
        """Return the table at position ``inx``.

        :param inx: list index
        :return: Table
        """
        return self.get_tables()[inx]

    def get_cell(self,
                 table: Table | int,
                 index: tuple) -> _Cell:
        """Return a single cell of a table.

        :param table: table object or table index
        :param index: cell position as (row, column), e.g. (0, 0)
        :return: _Cell
        """
        row_idx, col_idx = index[0], index[1]
        return self._resolve_table(table).cell(row_idx, col_idx)

    def write_data_to_paragraph(self,
                                ph: Paragraph | int | str,
                                data: dict):
        """Fill the ``${name}`` placeholders of a paragraph.

        The paragraph text is replaced by the result of
        ``MjUtil.write_data(text, data)``.

        :param ph: paragraph object, paragraph index or contained text
        :param data: values for the placeholders
        :return:
        """
        ph = self._resolve_paragraph(ph)
        ph.text = MjUtil.write_data(ph.text, data)

    def write_data_to_table(self,
                            table: Table | int,
                            data: dict):
        """Fill the ``${name}`` placeholders of every cell of a table.

        :param table: table object or table index
        :param data: values for the placeholders
        :return:
        """
        for row in self._resolve_table(table).rows:
            for cell in row.cells:
                cell.text = MjUtil.write_data(cell.text, data)

    def write_data_list_to_table(self,
                                 table: Table | int,
                                 data_list: list,
                                 inx: int = 1):
        """Append one row per record, rendered from a template row.

        The row at ``inx`` holds the placeholders. For every record a new row
        is appended at the end of the table and filled from the template
        texts; afterwards the template row itself is removed.

        :param table: table object or table index
        :param data_list: records, one per row to create
        :param inx: index of the row holding the placeholders
        :return:
        """
        table = self._resolve_table(table)
        template_row = table.rows[inx]
        # The template texts are identical for every record: read them once.
        templates = [cell.text for cell in template_row.cells]
        for data in data_list:
            new_cells = table.add_row().cells
            for position, cell in enumerate(new_cells):
                cell.text = MjUtil.write_data(templates[position], data)
        # Remove the captured row element rather than re-resolving the index:
        # rows were appended meanwhile, so a negative index would now point
        # at a different row.
        table._tbl.remove(template_row._tr)

    def write_data_to_cell(self,
                           table: Table | int,
                           index: tuple,
                           text: int | float | str):
        """Set the text of a single cell.

        :param table: table object or table index
        :param index: cell position as (row, column), e.g. (0, 0)
        :param text: value to write, converted with ``str``
        :return:
        """
        self.get_cell(table, index).text = str(text)

    def write_picture_to_paragraph(self,
                                   ph: Paragraph | int | str,
                                   img_path: str,
                                   img_shape: tuple):
        """Append a picture to a paragraph, in a new run.

        :param ph: paragraph object, paragraph index or contained text
        :param img_path: path of the image file
        :param img_shape: (width, height) of the picture in centimetres
        :return:
        :raises FileNotFoundError: if ``img_path`` is not an existing file
        """
        if not os.path.isfile(img_path):
            raise FileNotFoundError("非文件路径!")
        ph = self._resolve_paragraph(ph)
        width, height = Cm(img_shape[0]), Cm(img_shape[1])
        ph.add_run().add_picture(img_path, width=width, height=height)

    def write_picture_to_cell(self,
                              table: Table | int,
                              inx: tuple,
                              img_path: str,
                              img_shape: tuple):
        """Append a picture to the first paragraph of a cell.

        :param table: table object or table index
        :param inx: cell position as (row, column)
        :param img_path: path of the image file
        :param img_shape: (width, height) of the picture in centimetres
        :return:
        """
        first_paragraph = self.get_cell(table, inx).paragraphs[0]
        self.write_picture_to_paragraph(first_paragraph, img_path, img_shape)

    def add_table_row(self,
                      table: Table | int,
                      inx: int):
        """Insert a new row in front of the row currently at ``inx``.

        python-docx can only append rows, so the row is added at the end and
        its XML element is then moved to the requested position.

        :param table: table object or table index
        :param inx: row index; it is resolved after the append, so the new
            row takes part in the indexing
        :return:
        """
        table = self._resolve_table(table)
        new_tr = table.add_row()._tr
        anchor_tr = table.rows[inx]._tr
        parent = anchor_tr.getparent()
        parent.insert(parent.index(anchor_tr), new_tr)

    def remove_table_row(self,
                         table: Table | int,
                         inx: int):
        """Delete the row at ``inx`` from a table.

        The row element is looked up through ``table.rows`` instead of a
        fixed offset into the table element, so negative indexes and tables
        with extra leading child elements remove the intended row.

        :param table: table object or table index
        :param inx: row index
        :return:
        :raises IndexError: if ``inx`` is out of range
        """
        table = self._resolve_table(table)
        table._tbl.remove(table.rows[inx]._tr)

    def merge_table_cells(self,
                          table: Table | int,
                          inx: tuple):
        """Merge the rectangular range spanned by two corner cells.

        :param table: table object or table index
        :param inx: (start row, start column, end row, end column),
            e.g. (0, 0, 1, 1)
        :return:
        """
        start = self.get_cell(table, (inx[0], inx[1]))
        end = self.get_cell(table, (inx[2], inx[3]))
        start.merge(end)

    def write_table_to_excel(self,
                             table: Table | int,
                             in_path: str,
                             out_path: str | None = None,
                             inx: int | str | None = None):
        """Copy the cell texts of a Word table into an Excel sheet.

        Every table row is appended as one sheet row.

        :param table: table object or table index
        :param in_path: Excel file path handed to ``MjExcel``
        :param out_path: path the Excel file is saved to
        :param inx: worksheet index / name handed to ``set_cur_sheet``
        :return:
        """
        table = self._resolve_table(table)
        excel = MjExcel(in_path)
        sheet = excel.set_cur_sheet(inx)
        for row in table.rows:
            sheet.append([cell.text for cell in row.cells])
        excel.save(out_path)

    def save(self,
             path: str | None = None):
        """Save the document.

        :param path: target path; when None or empty the document is written
            back to the path it was opened from
        :return:
        """
        self.docx.save(path or self.path)

    @staticmethod
    def convert_to_pdf(in_path: str,
                       out_path: str | None = None):
        """Convert a ``.docx`` file to PDF through Word COM automation.

        :param in_path: path of the Word document
        :param out_path: path of the PDF; when ``MjUtil.is_none`` reports it
            as empty, the PDF is placed next to the Word document with the
            same base name
        :return:
        :raises ValueError: if ``in_path`` is not a ``.docx`` path or
            ``out_path`` is not a ``.pdf`` path
        """
        base, extension = os.path.splitext(in_path)
        if extension.lower() != ".docx":
            raise ValueError("非docx文件路径!")
        if MjUtil.is_none(out_path):
            # splitext keeps directory names containing dots, the original
            # letter case and any inner dots of the file name intact.
            out_path = base + ".pdf"
        if os.path.splitext(out_path)[1].lower() != ".pdf":
            raise ValueError("非pdf文件路径!")
        # Word does not share Python's working directory, so relative paths
        # are made absolute before they are handed over.
        in_path = os.path.abspath(in_path)
        out_path = os.path.abspath(out_path)
        word_app = win32.gencache.EnsureDispatch("Word.Application")
        try:
            word_app.Visible = False
            document = word_app.Documents.Open(in_path)
            try:
                document.SaveAs(out_path, FileFormat=MjWord._WD_FORMAT_PDF)
            finally:
                # Close the document even when the export fails.
                document.Close()
        finally:
            word_app.Quit()

    @staticmethod
    def read_paragraph(temp: Paragraph,
                       text: Paragraph,
                       data: dict | None = None) -> dict:
        """Read the data of a paragraph by comparing it with its template.

        :param temp: template paragraph (contains the placeholders)
        :param text: paragraph holding the actual data
        :param data: initial data, handed to ``MjUtil.read_data``
        :return: the result of ``MjUtil.read_data``
        """
        return MjUtil.read_data(temp.text, text.text, data)

    @staticmethod
    def read_table(temp: Table,
                   text: Table,
                   data: dict | None = None) -> dict:
        """Read the data of a table by comparing it cell by cell with a template.

        :param temp: template table (contains the placeholders)
        :param text: table holding the actual data
        :param data: initial data; a new dict is created when omitted
        :return: dict
        """
        # Without this the function returned None whenever ``data`` was left
        # out. Relies on MjUtil.read_data filling the dict it is given, which
        # is how read_table_list uses it as well.
        if data is None:
            data = {}
        for i, temp_row in enumerate(temp.rows):
            for j, temp_cell in enumerate(temp_row.cells):
                MjUtil.read_data(temp_cell.text, text.cell(i, j).text, data)
        return data

    @staticmethod
    def read_table_list(temp: Table,
                        text: Table,
                        inx: int = 1) -> list:
        """Read one record per table row, starting at the placeholder row.

        :param temp: template table
        :param text: table holding the actual data
        :param inx: index of the placeholder row in the template; data rows
            are read from this index to the end of ``text``
        :return: list with one dict per data row
        """
        # Template texts and the row list do not change inside the loop.
        templates = [cell.text for cell in temp.rows[inx].cells]
        text_rows = list(text.rows)
        data_list = []
        for i in range(inx, len(text_rows)):
            # One row corresponds to one record.
            data = {}
            for j, text_cell in enumerate(text_rows[i].cells):
                MjUtil.read_data(templates[j], text_cell.text, data)
            data_list.append(data)
        return data_list