import logging,time,re
import numpy as np
from collections import defaultdict
from itertools import zip_longest
from kotei_omp.data import DocumentBlockObject
from kotei_omc.comparers.picture_comparer import PictureComparer, GraphicComparer
from kotei_omc.comparers.base_comparer import BaseComparer
from kotei_omc.comparers.plugins import register_plugin
from kotei_omc.data.diff import DiffItem
from kotei_omp.data import TextObject, GraphicObject, PictureObject, StyleObject, RunObject
from kotei_omp.data.table import CellObject, RowObject, TableObject
from kotei_omc.settings import settings
from kotei_omc.utils.type_checker import is_instance_of
from kotei_omc.middlewares.table_middlewares import CustomTableStrategyMiddleware
logger = logging.getLogger("req_diff")
@register_plugin("table")
class TableComparer(BaseComparer):
def get_block_resource(self, block, belong_to='block'):
return self.do_get_block_resource(block, belong_to, 'tables', TableObject)
def compare(self, block_name, base, target, belong_to=None):
t0 = time.time()
# 表格匹配
logger.info(f'start match table, block_name: {block_name}, base_num: {len(base)}, target_num: {len(target)}')
match_func = CustomTableStrategyMiddleware(self._path_base).match
if settings.MATCH_WITH_CHAPTER:
tb_delete_list, tb_add_list, old_new_tb_matched = self.do_match_with_chapter(base, target,match_func)
else:
tb_delete_list, tb_add_list, old_new_tb_matched = self.do_match_normal(base, target,match_func)
logger.info('finish match table')
# 表格新增删除
ls_tb_delete, ls_tb_add = self.process_delete_add_diff(block_name, 'table', tb_delete_list, tb_add_list, belong_to=belong_to)
# 表格差分
ls_tb_update = []
for old_table, new_table in old_new_tb_matched:
# 要求废止特殊处理
old_table, new_table = self.pre_process_require(old_table, new_table)
# 表格位置差分
if not old_table.is_same_pos(new_table):
ls_tb_update.append(DiffItem('update', 'table', sub_type='table', block_name=block_name,
old=old_table, new=new_table, belong_to=belong_to,diff_point='coordinate_desc'))
# 对匹配的每个表格进行对比
part_delete, part_add, part_update = self.compare_table(block_name, old_table, new_table,belong_to=belong_to)
ls_tb_delete.extend(self.row_del_add_after(part_delete,category='delete'))
ls_tb_add.extend(self.row_del_add_after(part_add,category='add'))
ls_tb_update.extend(self.cell_update_after(part_update))
t1 = time.time()
logger.info(f'Time Cost:table diff {block_name} {t1 - t0}')
return {'add': ls_tb_add, 'delete': ls_tb_delete, 'update': ls_tb_update}
@staticmethod
def copy_table_attrs(to_table, from_table):
for attr_name in ('layout', 'style', 'border', 'coordinate', 'data_id'):
setattr(to_table, attr_name, getattr(from_table, attr_name))
@staticmethod
def fill_visual_merged_cells(table):
num_rows = len(table.rows)
if num_rows == 0:
return
num_cols = max([len(row.cells) for row in table.rows])
if num_cols == 0:
return
# 判断是否有边界
def is_bordered(side):
return side.border_style is not None
for col in range(num_cols):
row_ptr = 0
while row_ptr < num_rows:
cell = table.rows[row_ptr].cells[col]
top_border_exists = is_bordered(cell.border.border_top)
if row_ptr == 0 or top_border_exists:
start_row = row_ptr
end_row = start_row
while end_row < num_rows:
current_cell = table.rows[end_row].cells[col]
bottom_border_exists = is_bordered(current_cell.border.border_bottom)
# import ipdb;ipdb.set_trace()
if bottom_border_exists or end_row == num_rows - 1:
break
else:
end_row += 1
block_text = None
block_content = None
for r in range(start_row, end_row + 1):
val = table.rows[r].cells[col].text
if val is not None and str(val).strip() != "":
block_text = val
block_content = table.rows[r].cells[col].content
break
if block_text is not None:
merged_ranges = [start_row, col, end_row, col]
for r in range(start_row, end_row + 1):
val = table.rows[r].cells[col].text
if val is None or str(val).strip() == "":
table.rows[r].cells[col].content = block_content
table.rows[r].cells[col].text = block_text
# 添加 merged_ranges 属性
if not table.rows[r].cells[col].merged_ranges:
table.rows[r].cells[col].merged_ranges = merged_ranges
row_ptr = end_row + 1
else:
row_ptr += 1
def compare_table(self, block_name, old_table, new_table, belong_to):
logger.info(f"start compare table, old_data_id: {old_table.data_id}, new_data_id: {new_table.data_id}")
# 使表格内列数一致
self.align_table_col(old_table, new_table)
# 表格中存在大量视觉上merge但是实际未合并的空格,需要将空格赋值为正确的文本,防止影响相似度匹配
self.fill_visual_merged_cells(old_table)
self.fill_visual_merged_cells(new_table)
if old_table.head_type == new_table.head_type == 'horizontal':
old_col_table, new_col_table = self.transpose_table(old_table, new_table)
else:
if old_table.head_type == 'vertical':
new_table.head_list = old_table.head_list
new_table.head_type = 'vertical'
elif new_table.head_type == 'vertical':
old_table.head_list = new_table.head_list
old_table.head_type = 'vertical'
old_col_table, new_col_table = old_table, new_table
# 列匹配
del_cols, add_cols = old_col_table.rows, new_col_table.rows
col_matched = CustomTableStrategyMiddleware(self._path_base).match_row(del_cols, add_cols,is_col=True,
head_indexes=[old_table.head_list,new_table.head_list])
if col_matched:
matched_old_cols, matched_new_cols = list(zip(*list(col_matched)))
del_cols = [old_col for old_col in old_col_table.rows if old_col not in matched_old_cols]
add_cols = [new_col for new_col in new_col_table.rows if new_col not in matched_new_cols]
sub_type = 'col' if old_table.head_type == 'horizontal' else 'row'
ls_col_delete, ls_col_add = self.process_delete_add_diff(block_name, sub_type, del_cols, add_cols,
belong_to=belong_to, head_type=old_table.head_type)
# 根据matched的列组合新的表,得到列一致的两个表
if col_matched:
old_col_indexes,new_col_indexes =[],[]
for old_col, new_col in col_matched:
old_col_indexes.append(old_col_table.rows.index(old_col))
new_col_indexes.append(new_col_table.rows.index(new_col))
old_equal_col_table = self.choice_cols(old_table, old_col_indexes)
new_equal_col_table = self.choice_cols(new_table, new_col_indexes)
else:
return ls_col_delete, ls_col_add, []
# 行匹配
del_rows, add_rows = old_equal_col_table.rows, new_equal_col_table.rows
row_matched = CustomTableStrategyMiddleware(self._path_base).match_row(del_rows, add_rows, is_col=False)
if row_matched:
matched_old_rows, matched_new_rows = list(zip(*list(row_matched)))
del_rows_indexes = [idx for idx, old_row in enumerate(old_equal_col_table.rows) if old_row not in matched_old_rows]
add_rows_indexes = [idx for idx, new_row in enumerate(new_equal_col_table.rows) if new_row not in matched_new_rows]
# 使用没有重组前的表,横表头直接处理,竖表头需要转置
if old_table.head_type == new_table.head_type == 'horizontal':
del_rows = [old_table.rows[idx] for idx in del_rows_indexes]
add_rows = [new_table.rows[idx] for idx in add_rows_indexes]
else:
old_transpose_table = self.choice_cols(old_table, list(range(len(old_table.rows))))
new_transpose_table = self.choice_cols(new_table, list(range(len(new_table.rows))))
del_rows = [old_transpose_table.rows[idx] for idx in del_rows_indexes]
add_rows = [new_transpose_table.rows[idx] for idx in add_rows_indexes]
sub_type = 'row' if old_table.head_type == 'horizontal' else 'col'
ls_row_delete, ls_row_add = self.process_delete_add_diff(block_name, sub_type, del_rows, add_rows,
belong_to=belong_to, head_type=old_table.head_type)
# 根据matched的行组合新的表,得到行一致的两个表
if row_matched:
old_equal_row_table, new_equal_row_table = TableObject(), TableObject()
old_equal_row_table.rows = list(matched_old_rows)
old_equal_row_table.head_type = old_table.head_type
self.copy_table_attrs(old_equal_row_table, old_table)
new_equal_row_table.rows = list(matched_new_rows)
new_equal_row_table.head_type = new_table.head_type
self.copy_table_attrs(new_equal_row_table, new_table)
# 查找行变更、列变更、单元格变更
ls_row_update, ls_col_update, ls_cell_update = self.compare_ordered_tables(block_name,old_equal_row_table,
new_equal_row_table,belong_to=belong_to)
else:
ls_row_update, ls_col_update, ls_cell_update = [], [], []
part_delete = ls_row_delete + ls_col_delete
part_add = ls_row_add + ls_col_add
part_update = ls_row_update + ls_col_update + ls_cell_update
logger.info(f"finish compare table, old_data_id: {old_table.data_id}, new_data_id: {new_table.data_id}")
return part_delete, part_add, part_update
def transpose_table(self, old_table, new_table):
"""
将表格进行转置操作,即将行转换为列,列转换为行。
Args:
old_table (TableObject): 原始表格对象
new_table (TableObject): 目标表格对象
Returns:
tuple: 返回转置后的两个表格对象 (old_col_table, new_col_table)
"""
# 创建新的表格对象用于存储转置后的数据
old_col_table, new_col_table = TableObject(), TableObject()
# 对原始表格的行进行转置操作
old_col_table.rows = self.transpose_table_rows(old_table.rows)
# 根据原始表格的表头类型,设置转置后的表头类型
old_col_table.head_type = 'vertical' if old_table.head_type == 'horizontal' else 'horizontal'
# 复制原始表格的属性到转置后的表格
self.copy_table_attrs(old_col_table, old_table)
# 对目标表格的行进行转置操作
new_col_table.rows = self.transpose_table_rows(new_table.rows)
# 根据目标表格的表头类型,设置转置后的表头类型
new_col_table.head_type = 'vertical' if new_table.head_type == 'horizontal' else 'horizontal'
# 复制目标表格的属性到转置后的表格
self.copy_table_attrs(new_col_table, new_table)
# 返回转置后的两个表格对象
return old_col_table, new_col_table
def compare_ordered_tables(self, block_name, old_table_obj, new_table_obj, belong_to):
row_updates, col_updates, cell_updates = [], [], []
# 获取新旧行数据
old_rows = getattr(old_table_obj, 'rows', [])
new_rows = getattr(new_table_obj, 'rows', [])
old_cells_list = [row.cells for row in old_rows]
new_cells_list = [row.cells for row in new_rows]
# 获取内容用于对比
old_content_cells_list = self.get_cell_content_list(old_cells_list, settings.DIFF_ATTR)
new_content_cells_list = self.get_cell_content_list(new_cells_list, settings.DIFF_ATTR)
# 删除完全一样的匹配
for row_index in range(len(old_content_cells_list) - 1, -1, -1):
# 如果新旧行内容相同,则删除该行
# 之后可以在这里增加原子操作逻辑,避免删除不同步
if old_content_cells_list[row_index] == new_content_cells_list[row_index]:
old_content_cells_list.pop(row_index)
new_content_cells_list.pop(row_index)
old_cells_list.pop(row_index)
new_cells_list.pop(row_index)
#原子一致性检查
flag = False
if len(old_content_cells_list) ==len(new_content_cells_list)==len(old_cells_list) == len(new_cells_list):
flag = True
if not flag:
logger.warning(f"{block_name} old_table_obj: {old_table_obj}, new_table_obj: {new_table_obj}; delete operator is not atomic; all the cells list will involved in finding differences computation")
if not old_content_cells_list:
return [], [], []
# 查找差异
diff_type, row_diffs, col_diffs, cell_diffs, cell_diff_points, cell_diff_values, \
row_diff_idx, col_diff_idx, graphic_diff, picture_diff = self.find_differences(
old_content_cells_list, new_content_cells_list, old_cells_list, new_cells_list)
# 抽取单元格内的图形图像差分
for item in graphic_diff + picture_diff:
if item:
cell_updates.extend(item)
# 处理单元格差分
for idx, (cell_diff_idx, diff_point, diff_value) in enumerate(
zip(cell_diffs, cell_diff_points, cell_diff_values)):
try:
# old = self.get_element_by_index(old_cells_list, cell_diff_idx)
# new = self.get_element_by_index(new_cells_list, cell_diff_idx)
old, new = old_cells_list, new_cells_list
for cell_idx in cell_diff_idx:
old = old[cell_idx]
new = new[cell_idx]
except IndexError:
continue # 忽略非法索引
cell_diff_obj = DiffItem(
'update', 'table', 'cell',
block_name=block_name,
old=old,
new=new,
belong_to=belong_to,
diff_point=diff_point,
diff_values=diff_value
)
cell_updates.append(cell_diff_obj)
# 处理行差分
# if diff_type == 'row':
# for row_idx, row_diff_col_idx in zip(row_diffs, row_diff_idx):
# try:
# old_row = [old_cells_list[row_idx][cell_idx] for cell_idx in row_diff_col_idx]
# new_row = [new_cells_list[row_idx][cell_idx] for cell_idx in row_diff_col_idx]
# except IndexError:
# continue
#
# row_diff_item = DiffItem(
# 'update', 'table', 'row',
# block_name=block_name,
# old=self.merge_cells_to_row(old_row),
# new=self.merge_cells_to_row(new_row),
# belong_to=belong_to)
# row_updates.append(row_diff_item)
# 处理列差分
# elif diff_type == 'col':
# for col_idx, col_diff_col_idx in zip(col_diffs, col_diff_idx):
# try:
# old_col = [old_cells_list[cell_idx][col_idx] for cell_idx in col_diff_col_idx]
# new_col = [new_cells_list[cell_idx][col_idx] for cell_idx in col_diff_col_idx]
# except IndexError:
# continue
#
# col_diff_item = DiffItem(
# 'update', 'table', 'col',
# block_name=block_name,
# old=self.merge_cells_to_row(old_col),
# new=self.merge_cells_to_row(new_col),
# belong_to=belong_to
# )
# col_updates.append(col_diff_item)
return row_updates, col_updates, cell_updates
def choice_cols(self, table_obj, col_indexes):
if table_obj.head_type == 'horizontal':
rows = []
for row_obj in table_obj.rows:
cells = []
for cel_idx in col_indexes:
cells.append(row_obj.cells[cel_idx])
rows.append(cells)
else:
rows = [[] for _ in range(len(table_obj.rows[0].cells))]
for cel_idx in col_indexes:
for idx, cell in enumerate(table_obj.rows[cel_idx].cells):
rows[idx].append(cell)
res_table_obj = TableObject()
for cell_list in rows:
row_obj = RowObject()
if cell_list:
row_obj.cells = cell_list
row_obj.coordinate = cell_list[0].coordinate
# 对cell_obj的layout.parent_ref进行判断,有值在进行赋值
if cell_list[0].layout.parent_ref:
row_obj.layout = cell_list[0].layout.parent_ref.layout
row_obj.style = cell_list[0].layout.parent_ref.style
row_obj.border = cell_list[0].layout.parent_ref.border
row_obj.row_index = cell_list[0].row_index
row_obj.data_id = cell_list[0].data_id
# res_table_obj.rows.append(self.merge_cells_to_row(cell_list))
res_table_obj.rows.append(row_obj)
self.copy_table_attrs(res_table_obj, table_obj)
return res_table_obj
@staticmethod
def process_delete_add_diff(block_name, sub_type, delete_tables, add_tables, belong_to, head_type=None):
def process_graphic_objects(action, cell_list):
"""
辅助函数:处理单元格中的图形对象和图片对象。
action: 操作类型('delete' 或 'add')
cells_list: 单元格列表
"""
diff_items = []
all_merged_ranges = []
for cell_obj in cell_list:
if cell_obj.merged_ranges:
# 合并单元格只处理一次
if cell_obj.merged_ranges not in all_merged_ranges:
all_merged_ranges.append(cell_obj.merged_ranges)
else:
continue
for item_obj in cell_obj.content:
if is_instance_of(item_obj, GraphicObject) or is_instance_of(item_obj, PictureObject): # 检查是否是图形或图片对象
diff_items.append(
DiffItem(action, item_obj._type, sub_type=item_obj._type, block_name=block_name,
old=item_obj if action == 'delete' else None,
new=None if action == 'delete' else item_obj,
belong_to=belong_to)
)
return diff_items
# filter_duplicate_cells 过滤在一行或者一列中因合并单元格引起的重复
# 相关代码暂时先不启用,可以在后续使用者启用查看是否会引起漏差分的问题在决定是否启用
# 如果在解析端可以处理合并单元格,则不需要过滤,避免冗余处理而降低效率
def filter_duplicate_cells(item,sub_type):
"""
根据text和merged_ranges过滤掉cells_list中的合并单元格
Args:
item: RowObject or TableObject
"""
if sub_type != 'table':
seen_contents = defaultdict(list)
for i in range(len(item.cells) - 1, -1, -1):
cell = item.cells[i]
cell_merged_ranges = cell.merged_ranges
if not cell_merged_ranges:
continue
cell_text = cell.text
if cell_merged_ranges == seen_contents[cell_text]:
del item.cells[i]
continue
seen_contents[cell_text] = cell_merged_ranges
else:
for row in item.rows:
seen_contents = defaultdict(list)
for i in range(len(row.cells) - 1, -1, -1):
cell = row.cells[i]
cell_merged_ranges = cell.merged_ranges
if not cell_merged_ranges:
continue
cell_text = cell.text
if cell_merged_ranges == seen_contents[cell_text]:
del row.cells[i]
continue
seen_contents[cell_text] = cell_merged_ranges
return item
ls_tb_add, ls_tb_delete = [], []
for tb_base_item in delete_tables:
# 过滤(行、列)合并单元格
tb_base_item = filter_duplicate_cells(tb_base_item,sub_type)
diff_obj = DiffItem('delete', 'table', sub_type=sub_type, block_name=block_name,
old=tb_base_item, new=None, belong_to=belong_to)
setattr(diff_obj, 'head_type', head_type)
ls_tb_delete.append(diff_obj)
# 如果是表格、行或列,处理单元格中的内容
if sub_type in ('row', 'col', 'table'):
cells_list = tb_base_item.cells if sub_type != 'table' else [
cell for row in tb_base_item.rows for cell in row.cells]
ls_tb_delete.extend(process_graphic_objects('delete', cells_list))
for tb_target_item in add_tables:
# 过滤(行、列)合并单元格
tb_target_item = filter_duplicate_cells(tb_target_item,sub_type)
diff_obj = DiffItem('add', 'table', sub_type=sub_type, block_name=block_name,
old=None, new=tb_target_item, belong_to=belong_to)
setattr(diff_obj, 'head_type', head_type)
ls_tb_add.append(diff_obj)
# 如果是表格、行或列,处理单元格中的内容
if sub_type in ('row', 'col', 'table'):
cells_list = tb_target_item.cells if sub_type != 'table' else [
cell for row in tb_target_item.rows for cell in row.cells]
ls_tb_add.extend(process_graphic_objects('add', cells_list))
return ls_tb_delete, ls_tb_add
def transpose_table_rows(self, rows):
"""
将表格的行进行转置操作,即将行转换为列,列转换为行。
Args:
rows (list): 原始表格的行列表,每个元素是一个RowObject对象
Returns:
list: 返回转置后的行列表,每个元素是一个RowObject对象
"""
# 创建新的行对象列表,数量等于原始表格的最大列数
max_cell_count = 0
for row in rows:
if len(row.cells) > max_cell_count:
max_cell_count = len(row.cells)
t_rows = [RowObject() for _ in range(max_cell_count)]
# 遍历原始表格的每一行
for row in rows:
# 遍历每一行的单元格
for idx, cell in enumerate(row.cells):
# 将单元格添加到转置后的对应行中
t_rows[idx].cells.append(cell)
# 为转置后的每一行设置属性
for row in t_rows:
# 设置行的坐标为第一个单元格的坐标
row.coordinate = row.cells[0].coordinate
# 设置行的数据ID为第一个单元格的数据ID
row.data_id = row.cells[0].data_id
# 设置行的布局为第一个单元格的布局
row.layout = row.cells[0].layout
# 如果第一个单元格有列索引,则设置行的列索引
if isinstance(row.cells[0].col_index, int):
row.col_index = row.cells[0].col_index
# 如果第一个单元格有行索引,则设置行的行索引
if isinstance(row.cells[0].row_index, int):
row.row_index = row.cells[0].row_index
# 返回转置后的行列表
return t_rows
def find_differences(self, array1: list, array2: list, old_items, new_items, diff_mode='normal'):
if isinstance(array1, list):
array1 = np.array(array1)
if isinstance(array2, list):
array2 = np.array(array2)
# 确保两个ndarray的shape相同
if array1.shape != array2.shape:
raise ValueError("两个ndarray的shape必须相同")
diff_type = diff_mode
if diff_type == 'normal':
# 计算行差异数
row_diff_count = np.sum(~np.all(array1 == array2, axis=1))
# 计算列差异数
col_diff_count = np.sum(~np.all(array1 == array2, axis=0))
# 根据差异数选择差异类型
diff_type = 'col' if col_diff_count < row_diff_count else 'row'
# 找出所有行和列的差异项
row_diffs, col_diffs, cell_diffs, cell_diff_points, cell_diff_values, row_diff_idx, col_diff_idx, graphic_diffs, picture_diffs = [
[] for _ in range(9)]
if diff_type == 'row':
for i in range(array1.shape[0]):
if not np.all(array1[i] == array2[i]):
# if np.sum(array1[i] != array2[i]) < settings.T_MERGE_MULTI_CELL_UPDATE_TO_ROW_UPDATE_MIN_CELL_COUNT:
# 如果该行只有一个数值不一致,则将这个差异项改为单元格的差异项
for j in range(array1.shape[1]):
if array1[i, j] == array2[i, j]:
continue
diff_point, diff_values, graphic_diff, picture_diff = self.get_cell_not_equal_attrs(
old_items[i][j], new_items[i][j], settings.CELL_COMPARE_ATTRS)
if diff_point:
cell_diffs.append((i, j))
cell_diff_points.append(' '.join(diff_point))
cell_diff_values.append(diff_values)
if graphic_diff:
graphic_diffs.append(graphic_diff)
if picture_diff:
picture_diffs.append(picture_diff)
# else:
# row_diffs.append(i)
# row_diff_idx.append(np.where(array1[i] != array2[i])[0].tolist())
else:
for j in range(array1.shape[1]):
if not np.all(array1[:, j] == array2[:, j]):
# if np.sum(array1[:, j] != array2[:,j]) < settings.T_MERGE_MULTI_CELL_UPDATE_TO_ROW_UPDATE_MIN_CELL_COUNT:
# 如果该列只有一个数值不一致,则将这个差异项改为单元格的差异项
for i in range(array1.shape[0]):
if array1[i, j] == array2[i, j]:
continue
diff_point, diff_values, graphic_diff, picture_diff = self.get_cell_not_equal_attrs(
old_items[i][j], new_items[i][j], settings.CELL_COMPARE_ATTRS)
if diff_point:
cell_diffs.append((i, j))
cell_diff_points.append(' '.join(diff_point))
cell_diff_values.append(diff_values)
if graphic_diff:
graphic_diffs.append(graphic_diff)
if picture_diff:
picture_diffs.append(picture_diff)
# else:
# col_diffs.append(j)
# col_diff_idx.append(np.where(array1[:, j] != array2[:, j])[0].tolist())
# 返回所有差异类型对应的单元格索引
return diff_type, row_diffs, col_diffs, cell_diffs, cell_diff_points, cell_diff_values, row_diff_idx, col_diff_idx, graphic_diffs, picture_diffs
@staticmethod
def get_cell_chars(cell_obj):
chars = []
for text_obj in cell_obj.content:
if not is_instance_of(text_obj, TextObject):
continue
chars.extend(text_obj.get_chars())
return chars
def _compare_cell_diff(self, base, target, data_type, block_name=''):
"""
对比单元格图像的方法
"""
result = []
if data_type == 'graphic':
cp_obj = GraphicComparer(self._base_block_mapping, self._target_block_mapping, self._path_base,
self._path_target)
else:
cp_obj = PictureComparer(self._base_block_mapping, self._target_block_mapping, self._path_base,
self._path_target)
item_result = cp_obj.compare(block_name, base, target, 'cell', True)
result.extend(item_result['add'])
result.extend(item_result['delete'])
result.extend(item_result['update'])
return result
@staticmethod
def _get_graphic_picture_obj(old_cell, new_cell):
base_graphic = []
target_graphic = []
base_picture = []
target_picture = []
for base_item in old_cell.content:
if is_instance_of(base_item, GraphicObject):
base_graphic.append(base_item)
elif is_instance_of(base_item, PictureObject):
base_picture.append(base_item)
for new_item in new_cell.content:
if is_instance_of(new_item, GraphicObject):
target_graphic.append(new_item)
elif is_instance_of(new_item, PictureObject):
target_picture.append(new_item)
return [(base_graphic, target_graphic), (base_picture, target_picture)]
def _get_cell_graphic_picture_diff(self, old_cell, new_cell):
"""
对比单元格图形图像的方法
"""
graphic_diff = []
picture_diff = []
block = old_cell
while not isinstance(block, DocumentBlockObject) and block and hasattr(block, 'layout'):
block = block.layout.parent_ref
block_name = block.name if block else ''
graphic_obj, picture_obj = self._get_graphic_picture_obj(old_cell, new_cell)
if graphic_obj[0] or graphic_obj[1]:
graphic_diff = self._compare_cell_diff(graphic_obj[0], graphic_obj[1], 'graphic', block_name)
if picture_obj[0] or picture_obj[1]:
picture_diff = self._compare_cell_diff(picture_obj[0], picture_obj[1], 'picture', block_name)
return graphic_diff, picture_diff
def get_cell_not_equal_attrs(self, old_cell, new_cell, compare_attrs):
diff_attrs = []
diff_values = []
if getattr(old_cell, 'auto_number', None) and getattr(new_cell, 'auto_number', None):
return [], [], [], []
if old_cell.text != new_cell.text:
diff_attrs.append('text')
diff_values.append((old_cell.text, new_cell.text))
else:
# 直接在对象上取值的属性
# direct_attr = ['style.background_color', 'border.border_top.border_style', 'style.background_color',
# 'border.border_bottom.border_style', 'border.border_left.border_style',
# 'border.border_right.border_style', 'style.background_style']
direct_attr = ['style.background_color', 'style.background_style']
attrs, values = self.get_not_equal_attrs(old_cell, new_cell, direct_attr)
diff_attrs.extend(attrs)
diff_values.extend(values)
for old_char, new_char in zip_longest(self.get_cell_chars(old_cell), self.get_cell_chars(new_cell),
fillvalue=None):
if old_char is None or new_char is None:
diff_attrs.append('text')
diff_values.append((str(old_char), str(new_char)))
else:
attrs, values = self.get_not_equal_attrs(old_char, new_char, compare_attrs)
diff_attrs.extend(attrs)
diff_values.extend(values)
# 单元格增加图形图像的比较
graphic_diff, picture_diff = self._get_cell_graphic_picture_diff(old_cell, new_cell)
unique_diff_attrs = list(set(diff_attrs))
unique_not_equal_values = [diff_values[diff_attrs.index(v)] for v in unique_diff_attrs]
return unique_diff_attrs, unique_not_equal_values, graphic_diff, picture_diff
def get_cell_content_list(self, cell_obj_lists, with_attr=False):
content_lists = []
processed_merged_ranges = set()
for cell_obj_list in cell_obj_lists:
row_content_list = []
for cell_obj in cell_obj_list:
# 检查是否是合并单元格且已经处理过
if hasattr(cell_obj, 'merged_ranges') and cell_obj.merged_ranges:
# 创建一个基于合并范围和内容的唯一键
merged_key = (tuple(cell_obj.merged_ranges), str(getattr(cell_obj, 'text', '')))
if merged_key in processed_merged_ranges:
# 如果已经处理过,设置为空字符串
row_content_list.append('') # 直接添加空字符串到结果中
continue
else:
# 如果是合并单元格但未处理过,标记为已处理
processed_merged_ranges.add(merged_key)
cell_contents = [f'text:{cell_obj.text}']
if with_attr:
attr_list = settings.CELL_COMPARE_ATTRS
for attr in attr_list:
if attr == "text":
continue
attr_value = self.get_nest_attr(cell_obj, attr)
if attr_value not in (None, ''):
cell_contents.append(f'{attr}:{str(attr_value)}')
row_content_list.append('🙉'.join(cell_contents))
content_lists.append(row_content_list)
return content_lists
def get_nest_attr(self, obj, nest_attr):
if is_instance_of(obj, CellObject):
result_attr = []
# 特殊处理单元格背景色
# if nest_attr in ('style.background_color', 'border.border_top.border_style',
# 'border.border_bottom.border_style', 'border.border_left.border_style',
# 'border.border_right.border_style', 'style.background_style'):
if nest_attr in ('style.background_color', 'style.background_style'):
return self.get_target_attr(obj, nest_attr)
if nest_attr == 'font_background_color':
nest_attr = 'style.background_color'
for item_obj in obj.content:
if is_instance_of(item_obj, GraphicObject) and nest_attr == 'graphic':
for item_attr in settings.PICTURE_COMPARE_ATTRS:
run_attr = self.get_target_attr(item_obj, item_attr)
if run_attr and str(run_attr) not in result_attr:
result_attr.append(str(run_attr))
graphic_text_obj = getattr(item_obj, 'text_obj', None)
if graphic_text_obj and graphic_text_obj.text:
text_attr_list = []
for text_attr in settings.TEXT_COMPARE_ATTRS:
for run_obj in graphic_text_obj.runs:
attr_val = self.get_target_attr(run_obj, text_attr)
if attr_val and str(attr_val) not in text_attr_list:
text_attr_list.append(str(attr_val))
if text_attr_list:
result_attr.extend(text_attr_list)
elif is_instance_of(item_obj, PictureObject) and nest_attr == 'picture':
for item_attr in settings.PICTURE_COMPARE_ATTRS:
run_attr = self.get_target_attr(item_obj, item_attr)
if run_attr and str(run_attr) not in result_attr:
result_attr.append(str(run_attr))
else:
if is_instance_of(item_obj, TextObject):
for run_obj in item_obj.runs:
run_attr = self.get_target_attr(run_obj, nest_attr)
if run_attr and str(run_attr) not in result_attr:
result_attr.append(str(run_attr))
elif is_instance_of(item_obj, RunObject):
run_attr = self.get_target_attr(item_obj, nest_attr)
if run_attr and str(run_attr) not in result_attr:
result_attr.append(str(run_attr))
return "".join(result_attr)
elif is_instance_of(obj, TextObject):
result_attr = []
for run_obj in obj.runs:
run_attr = self.get_target_attr(run_obj, nest_attr)
if run_attr and str(run_attr) not in result_attr:
result_attr.append(str(run_attr))
else:
return self.get_target_attr(obj, nest_attr)
@staticmethod
def get_target_attr(obj, nest_attr):
nest_attrs = nest_attr.split('.')
attr_str = nest_attrs.pop(0)
base_attr = getattr(obj, attr_str, None)
while base_attr and nest_attrs:
attr_str = nest_attrs.pop(0)
base_attr = getattr(base_attr, attr_str, None)
return base_attr
# @staticmethod
# def merge_cells_to_row(cell_list):
# row_obj = RowObject()
#
# if cell_list:
# row_obj.cells = cell_list
# row_obj.coordinate = cell_list[0].coordinate
# # 对cell_obj的layout.parent_ref进行判断,有值在进行赋值
# if cell_list[0].layout.parent_ref:
# row_obj.layout = cell_list[0].layout.parent_ref.layout
# row_obj.style = cell_list[0].layout.parent_ref.style
# row_obj.border = cell_list[0].layout.parent_ref.border
# row_obj.row_index = cell_list[0].row_index
# row_obj.data_id = cell_list[0].data_id
# return row_obj
@staticmethod
def align_table_col(base_table, target_table):
base_max_col_count = max([len(row.cells) for row in base_table.rows])
target_max_col_count = max([len(row.cells) for row in target_table.rows])
for base_row in base_table.rows:
if len(base_row.cells) != base_max_col_count:
# 匹配行的列数不一致,补齐缺失的cell
add_col_count = abs(len(base_row.cells) - base_max_col_count)
base_row.cells.extend([CellObject() for _ in range(add_col_count)])
for target_row in target_table.rows:
if len(target_row.cells) != target_max_col_count:
# 匹配行的列数不一致,补齐缺失的cell
add_col_count = abs(len(target_row.cells) - target_max_col_count)
target_row.cells.extend([CellObject() for _ in range(add_col_count)])
def cell_update_after(self, update_cells):
"""
单元格的变更后处理, 只有在content和merged_ranges都一样的情况下才过滤重复项
:return:
"""
if not update_cells:
return update_cells
result = []
custom_cells_merged_ranges = list()
seen_cells = defaultdict(list)
def normalize_content(cell):
"""标准化单元格内容用于比较"""
if not cell:
return ""
# 获取文本内容并标准化
content_text = str(cell.text) if hasattr(cell, 'text') else ""
# 标准化换行符
normalized = content_text.strip().replace('\r\n', '\n').replace('\r', '\n')
return normalized
def get_cell_key(item):
"""生成用于比较的键"""
old_cell = getattr(item, 'old', None)
new_cell = getattr(item, 'new', None)
# 获取内容键
old_content = normalize_content(old_cell)
new_content = normalize_content(new_cell)
content_key = f"{old_content}|{new_content}"
# 获取合并范围键
old_range = tuple(old_cell.merged_ranges) if old_cell and hasattr(old_cell,
'merged_ranges') and old_cell.merged_ranges else ()
new_range = tuple(new_cell.merged_ranges) if new_cell and hasattr(new_cell,
'merged_ranges') and new_cell.merged_ranges else ()
range_key = f"{old_range}|{new_range}"
return f"{content_key}||{range_key}"
def get_is_custom_cell(cell_obj):
for c_obj in cell_obj.get_heads():
if c_obj.text == settings.SPECIAL_CELL_CONTENT3:
return True
for item in update_cells:
# 如果不是单元格更新或者没有old/new对象,直接添加到结果中
if (item.type != 'update' or item.data_type != 'table' or item.sub_type != 'cell' or
(not item.old or not item.old.merged_ranges) and (not item.new or not item.new.merged_ranges)):
result.append(item)
continue
current_old_range = getattr(item.old, 'merged_ranges', []) if item.old else []
current_new_range = getattr(item.new, 'merged_ranges', []) if item.new else []
# 特殊定制的表格累加处理
if item.old and get_is_custom_cell(item.old):
if current_old_range not in custom_cells_merged_ranges:
custom_cells_merged_ranges.add(current_old_range)
result.append(item)
else:
existing_idx = custom_cells_merged_ranges.index(current_old_range)
result[existing_idx].old.text += item.old.text
result[existing_idx].old.content.extend(item.old.content)
elif item.new and get_is_custom_cell(item.new):
if current_new_range not in custom_cells_merged_ranges:
custom_cells_merged_ranges.append(current_new_range)
result.append(item)
else:
existing_idx = custom_cells_merged_ranges.index(current_new_range)
result[existing_idx].new.text += item.new.text
result[existing_idx].new.content.extend(item.new.content)
# 检查是否只有单侧有合并范围, 如果只有单侧有合并范围,则不视为重复
# elif len(current_old_range) <4 or len(current_new_range)<4:
# result.append(item)
else: # 处理普通单元格 - 进行去重检查
# 生成用于比较的键
cell_key = get_cell_key(item)
# 检查是否已经存在相同的键
is_duplicate = False
for existing_idx in seen_cells[cell_key]:
existing_item = result[existing_idx]
# 获取当前和已存在项目的合并范围
existing_old_range = getattr(existing_item.old, 'merged_ranges', []) if existing_item.old else []
existing_new_range = getattr(existing_item.new, 'merged_ranges', []) if existing_item.new else []
# 只有当merged_ranges完全相同时才认为是重复
if (current_old_range == existing_old_range and current_new_range == existing_new_range):
is_duplicate = True
break
if not is_duplicate:
seen_cells[cell_key].append(len(result))
result.append(item)
# 如果是重复项,则忽略(不添加到结果中)
return result
def row_del_add_after(self, part, category='add'):
"""
根据 category 参数处理新增或删除的行对象,判断行中的单元格是否有 merged_ranges 属性。
如果行中的任意一个单元格没有 merged_ranges 属性,则添加到结果列表中。
同时过滤具有相同 merged_ranges 的重复行对象,仅保留第一个出现的行。
注意:对于两列(行)中至少共享一个合并单元格,同时两列(行)内容完全相同,依然有可能会被误删除
解决方案:需要解析提供所有的单元格范围,之后综合计算整列(行)的范围进行判断,
若整列(行)都是因合并单元格而造成的冗余则进行过滤,否则(如只共享一(多)个合并单元格)则保留
:param part: 行对象列表
:param category: 操作类型,'add' 或 'delete'
:return: 处理后的结果列表
"""
# 使用列表来保存拼接后的列表
result = []
if not part:
return part
if category not in ('add', 'delete'):
return part
# 根据 category 决定处理新增还是删除的行对象
merged_rows = []
for row in part:
# 检查是否是行对象;(会有PictureObject和GraphicObject)如不是则直接加入结果中
if category == 'add' and not is_instance_of(row.new, RowObject):
result.append(row)
continue
if category == 'delete' and not is_instance_of(row.old, RowObject):
result.append(row)
continue
# 获取要检查的单元格列表
cells = row.new.cells if category == 'add' else row.old.cells
# 检查行中的每个单元格是否有 merged_ranges 属性
has_merged_ranges = any(hasattr(cell, 'merged_ranges') for cell in cells)
# 如果行中的任意一个单元格没有 merged_ranges 属性,则添加到结果中
if not has_merged_ranges:
result.append(row)
else:
merged_rows.append(row)
# 处理具有 merged_ranges 的行,过滤重复项
if merged_rows:
seen_contents = defaultdict(list)
def remove_timestamp(text):
return re.sub(r'\d{4}[-/]\d{2}[-/]\d{2}.*?(?=\t|\n|$)', '', text)
for index, row in enumerate(merged_rows):
# 获取当前行的内容
content = getattr(row, 'new_content' if category == 'add' else 'old_content', None)
if content:
#确保在处理可能包含非UTF-8编码字符的文本时不会出现解码错误
if isinstance(content, bytes):
content = content.decode('utf-8', errors='replace')
elif not isinstance(content, str):
content = str(content)
# 标准化 content
cleaned_content = remove_timestamp(content)
normalized_content = cleaned_content.strip().replace('\r\n', '\n').replace('\r', '\n')
seen_contents[normalized_content].append(index)
duplicates_row = {item: indices for item, indices in seen_contents.items() if len(indices) > 1}
removed_rows_indices = []
for _, indices in duplicates_row.items():
seen_merged_ranges = set()
for i in indices:
# 获取要检查的单元格列表
cells = merged_rows[i].new.cells if category == 'add' else merged_rows[i].old.cells
for cell in cells:
if cell.merged_ranges:
merged_range_tuple = tuple(cell.merged_ranges)
if merged_range_tuple not in seen_merged_ranges:
seen_merged_ranges.add(merged_range_tuple)
break
else:
removed_rows_indices.append(i)
break
# 添加未被移除的行到结果中
for index, row in enumerate(merged_rows):
if index not in removed_rows_indices:
result.append(row)
return result
def pre_process_require(self, old_table, new_table):
base_resources = [old_table]
target_resources = [new_table]
changes_dict = {} # 存储变更信息的字典
# 合并表格并编号(0=变更前,1=变更后)
for table_idx, table in enumerate(base_resources + target_resources):
col_list = table.get_col_list(col_name=settings.SPECIAL_COLUMN) #'要求廃止'
# 在循环外部初始化计数器
be_counter = 1
af_counter = 1
if col_list:
for row_index, cell in enumerate(col_list):
cell_text = getattr(cell, 'text', '')
if cell_text == settings.SPECIAL_CELL_CONTENT2: #'レ'
if table_idx < len(base_resources):
table_key = f"be_{be_counter:02d}"
be_counter += 1
else:
table_key = f"af_{af_counter:02d}"
af_counter += 1
# 获取列索引
col_index = cell.col_index
# 存储变更位置信息
changes_dict[table_key] = (row_index, col_index)
if any(changes_dict):
# 分离变更前和变更后的数据
be_changes = {k: v for k, v in changes_dict.items() if k.startswith('be_')}
af_changes = {k: v for k, v in changes_dict.items() if k.startswith('af_')}
# 处理变更前的数据
# 记录需要清理的key(分前后表)
be_clear_keys = []
af_clear_keys = []
if be_changes:
for table_key, (row_idx, col_idx) in be_changes.items():
next_col_idx = col_idx + 1
if next_col_idx < len(base_resources[0].rows[row_idx].cells):
if self._tbl_find_unique(base_resources[0], target_resources[0], row_idx, col_idx + 1):
be_clear_keys.append(table_key)
# 处理变更后的数据
if af_changes:
for table_key, (row_idx, col_idx) in af_changes.items():
next_col_idx = col_idx + 1
if next_col_idx < len(target_resources[0].rows[row_idx].cells):
if self._tbl_find_unique(target_resources[0], base_resources[0], row_idx, col_idx + 1):
af_clear_keys.append(table_key)
# 统一清理(分表操作)
for table_key in be_clear_keys:
row_idx, col_idx = changes_dict[table_key]
# 清理前表(base_resources)
for cell in base_resources[0].rows[row_idx].cells:
cell.text = ''
cell.content = []
cell.style = StyleObject()
for table_key in af_clear_keys:
row_idx, col_idx = changes_dict[table_key]
# 清理后表(target_resources)
for cell in target_resources[0].rows[row_idx].cells:
cell.text = ''
cell.content = []
cell.style = StyleObject()
return base_resources[0], target_resources[0]
@staticmethod
def _tbl_find_unique(base_tbl, target_tbl, row_idx, col_idx):
"""校验指定单元格是否满足唯一性条件:
1. 在 base_tbl 对应列中不存在相同值
2. 在 target_tbl 当前列中唯一(排除自己)
返回:是否需要清理(True/False)
"""
if not target_tbl or not base_tbl:
return False
target_rows = target_tbl.rows
if row_idx >= len(target_rows) or col_idx >= len(target_rows[row_idx].cells):
return False
compare_text = str(target_rows[row_idx].cells[col_idx].text)
# 条件1:检查 base_tbl 对应列是否存在相同值
if base_tbl.rows and col_idx < len(base_tbl.rows[0].cells):
for row in base_tbl.rows:
if col_idx < len(row.cells) and str(row.cells[col_idx].text) == compare_text:
return True # 需要清理
# 条件2:检查 target_tbl 当前列是否有重复(排除自己)
for i, row in enumerate(target_rows):
if i == row_idx:
continue
if col_idx < len(row.cells) and str(row.cells[col_idx].text) == compare_text:
return True # 需要清理
return False # 无需清理
详细讲解表格匹配的代码以及流程
最新发布