【python】diff-match-patch实现合同文本比对

Tom不秃

已于 2022-06-27 16:36:13 修改

阅读量2.1k

点赞数 2

分类专栏：笔记工具文章标签： python 文本比对 diff

于 2022-05-29 23:12:44 首次发布

本文链接：https://blog.csdn.net/da13122318861/article/details/125037031

版权

笔记同时被 2 个专栏收录

26 篇文章 0 订阅

订阅专栏

工具

5 篇文章 0 订阅

订阅专栏

在这里插入图片描述

1、将文件转成图片，OCR识别

把比对文件转为图片
OCR识别文字

使用合合旗下一站式OCR云端服务平台，对文字进行快速、精准的检测和识别。

2、google/diff-match-patch获取差异

google/diff-match-patch 库最初创建于 2006 年，用于支持 Google Docs，目前已提供 C++、C#、Dart、Java、JavaScript、Lua、Objective C 和 Python 等多种语言的实现。
Diff Match 和 Patch 库提供了强大的算法来执行同步纯文本所需的操作。

差异：
比较两个纯文本块并有效地返回差异列表。
匹配：
给定一个搜索字符串，在纯文本块中找到它的最佳模糊匹配。对准确性和位置进行加权。
修补：
将补丁列表应用于纯文本。即使底层文本与生成补丁时的文本不完全一致，也会尽最大努力应用补丁。

# Import the diff_match_patch class.
# NOTE(review): `core.base.diff_match_patch` looks like a project-local copy of
# the library module — confirm the import path for your own project layout.
from core.base.diff_match_patch import diff_match_patch
# Module-level instance; locDiffs reads its DIFF_EQUAL / DIFF_DELETE /
# DIFF_INSERT constants through this global name.
dmp = diff_match_patch()
# Build the list of patches that turn origin_text into contrast_text.
# NOTE(review): origin_text and contrast_text are not defined in this snippet;
# they must be assigned before this line runs.
patch = dmp.patch_make(origin_text,contrast_text)

3、获取每一个字符的坐标、页码等数据

def joinTextByChars(recognize_text, page):
    """Flatten one page of OCR output into a list of per-character records.

    Args:
        recognize_text: Mapping with a 'lines' list. Each line provides
            'text', 'char_polygons' (one flat 8-number polygon per
            character) and 'position' (the flat 8-number polygon of the
            whole line).
        page: Page number stored on every produced record.

    Returns:
        A list of dicts with the keys 'page', 'line' (1-based), 'char',
        'position' and 'line_position'. Polygons are stored as four (x, y)
        tuples (top-left, top-right, bottom-right, bottom-left). After the
        characters of each line a newline record is appended whose two
        polygon fields are None.
    """

    def to_corners(flat):
        # Pair up a flat [x0, y0, ..., x3, y3] sequence into four (x, y) points.
        return [(flat[k], flat[k + 1]) for k in range(0, 8, 2)]

    records = []
    for line_no, line in enumerate(recognize_text['lines'], start=1):
        for char_idx, char in enumerate(line['text']):
            polygon = line['char_polygons'][char_idx]
            line_box = line['position']
            records.append({
                'page': page,
                'line': line_no,
                'char': char,
                'position': to_corners(polygon),
                # Built anew for every character so records never share a list.
                'line_position': to_corners(line_box),
            })

        # Lines are joined with a newline record that has no geometry.
        records.append({
            'page': page,
            'line': line_no,
            'char': '\n',
            'position': None,
            'line_position': None,
        })
    return records

4、定位每个差异的起始位置

def _fillMissingSides(d, origin_index, contrast_index):
    """Add an empty placeholder for whichever side of a change is missing.

    A pure insertion has no "origin" entry and a pure deletion has no
    "contrast" entry. The placeholder is a zero-length span at the given
    index, tagged DIFF_EQUAL, so that side can still be located.

    Returns the same dict, now holding both "origin" and "contrast".
    """
    for side, index in (("origin", origin_index), ("contrast", contrast_index)):
        if side not in d:
            d[side] = {
                "diff_text": "",
                "diff_tag": dmp.DIFF_EQUAL,
                "text_se_index": [index, index],
            }
    return d


def locDiffs(patch, origin_text_chars, contrast_text_chars):
    """Turn a patch list into located origin/contrast change pairs.

    Args:
        patch: Iterable of patch objects exposing start1, start2, length1,
            length2 and diffs (a list of (tag, text) pairs), as returned by
            dmp.patch_make.
        origin_text_chars: Per-character records of the original text.
        contrast_text_chars: Per-character records of the compared text.

    Returns:
        A list of {'origin': ..., 'contrast': ...} dicts, one per change,
        where each side is the result of locDiffPositions for that side.

    Relies on the module-level ``dmp`` instance for the DIFF_* constants.
    """
    patch_diff = []
    # Running sum of (length1 - length2) over the patches already handled;
    # it is added to start1 to obtain indices into the original text.
    # NOTE(review): this presumably compensates for diff-match-patch's
    # "rolling context", where start1 of later patches assumes the earlier
    # patches were applied — confirm against the library documentation.
    patch_offset = 0
    for x in patch:
        origin_start = x.start1 + patch_offset
        contrast_start = x.start2
        tmp_offset_origin = 0
        tmp_offset_contrast = 0
        d = {}  # the change currently being assembled

        for m, n in x.diffs:
            if m == dmp.DIFF_EQUAL:
                # Equal text closes the pending change, if there is one.
                # Testing the dict itself (rather than a flag that is never
                # reset) guarantees an empty record is never emitted.
                if d:
                    _fillMissingSides(
                        d,
                        origin_start + tmp_offset_origin,
                        contrast_start + tmp_offset_contrast,
                    )
                    tmp_offset_origin += len(d["origin"]["diff_text"])
                    tmp_offset_contrast += len(d["contrast"]["diff_text"])
                    patch_diff.append(d)
                    d = {}

                # Equal text advances both sides by the same amount.
                tmp_offset_origin += len(n)
                tmp_offset_contrast += len(n)
            elif m == dmp.DIFF_DELETE:
                begin = origin_start + tmp_offset_origin
                d["origin"] = {
                    "diff_text": n,
                    "diff_tag": m,
                    "text_se_index": [begin, begin + len(n)],
                }
            elif m == dmp.DIFF_INSERT:
                begin = contrast_start + tmp_offset_contrast
                d["contrast"] = {
                    "diff_text": n,
                    "diff_tag": m,
                    "text_se_index": [begin, begin + len(n)],
                }

        # A patch may end on a change without trailing equal text.
        if d:
            patch_diff.append(
                _fillMissingSides(
                    d,
                    origin_start + tmp_offset_origin,
                    contrast_start + tmp_offset_contrast,
                )
            )
        patch_offset += x.length1 - x.length2

    return [
        {
            'origin': locDiffPositions(change['origin'], origin_text_chars),
            'contrast': locDiffPositions(change['contrast'], contrast_text_chars),
        }
        for change in patch_diff
    ]

5、获取每个diff差异的坐标块

def _unionQuads(first, second):
    """Return a new quad that covers both input quads.

    Each quad is a sequence of four (x, y) corners. The merge is done corner
    by corner: corner 0 takes the minimum x and y, corner 1 the maximum x and
    minimum y, corner 2 the maximum x and y, corner 3 the minimum x and
    maximum y. Neither input is modified.
    NOTE(review): this assumes the corner order top-left, top-right,
    bottom-right, bottom-left with y growing downwards — confirm against the
    OCR output.
    """
    return [
        (min(first[0][0], second[0][0]), min(first[0][1], second[0][1])),
        (max(first[1][0], second[1][0]), min(first[1][1], second[1][1])),
        (max(first[2][0], second[2][0]), max(first[2][1], second[2][1])),
        (min(first[3][0], second[3][0]), max(first[3][1], second[3][1])),
    ]


def locDiffPositions(diff, text_chars):
    """Attach page-level highlight boxes to one side of a change.

    Args:
        diff: Dict with 'diff_text', 'diff_tag' and 'text_se_index'
            ([start, end) indices into text_chars). It is deep-copied, never
            modified.
        text_chars: Per-character records with the keys 'page', 'line',
            'position' and 'line_position'. The records are only read; every
            box placed in the result is a fresh list.

    Returns:
        A copy of diff with two extra keys:
        'diff_positions': {page: [quad, ...]} with one quad per text line
            touched by the span on that page;
        'line_positions': {page: quad} covering the full lines touched on
            that page.
        If diff_text consists solely of newlines / soft hyphens, diff_text is
        reset to '' and both mappings stay empty.
    """
    rtn = copy.deepcopy(diff)
    rtn['diff_positions'] = {}
    rtn['line_positions'] = {}

    diff_text = rtn['diff_text']
    if diff_text != '' and diff_text.replace('\n', '').replace('\u00AD', '') == '':
        rtn['diff_text'] = ''
        return rtn

    se_index = rtn['text_se_index']
    # A zero-length span (placeholder side of an insert/delete) is widened to
    # one character so it can still be located. Only widen when a character
    # exists at that index; a placeholder at the very end of the text then
    # yields no boxes instead of raising IndexError.
    if se_index[0] == se_index[1] and se_index[0] < len(text_chars):
        se_index[1] = se_index[0] + 1

    line_now = None
    page_now = None
    diff_position = None  # box of the line segment currently being grown

    for i in range(se_index[0], se_index[1]):
        text_char = text_chars[i]

        # Newline / page-break (\u00AD) records carry no geometry.
        if text_char['position'] is None:
            continue

        page = text_char['page']
        if page != page_now:  # first located character, or a new page
            if page_now is not None:
                rtn['diff_positions'][page_now].append(diff_position)

            rtn['diff_positions'][page] = []
            # Copy, so later merging never writes into the caller's records.
            rtn['line_positions'][page] = list(text_char['line_position'])
            line_now = text_char['line']
            page_now = page
            diff_position = list(text_char['position'])
            continue

        if text_char['line'] != line_now:  # new line on the same page
            rtn['diff_positions'][page_now].append(diff_position)
            rtn['line_positions'][page_now] = _unionQuads(
                rtn['line_positions'][page_now], text_char['line_position']
            )
            line_now = text_char['line']
            diff_position = list(text_char['position'])
        else:  # same line: grow the current box
            diff_position = _unionQuads(diff_position, text_char['position'])

    if diff_position:
        rtn['diff_positions'][page_now].append(diff_position)

    return rtn

6、完整调用代码

# End-to-end usage for one pair of pages.
# NOTE(review): `origin_pages`, `contrast_pages`, `page` and `patch_diff` are
# not defined in this snippet — they must exist before it runs (presumably the
# OCR results, the current page number and a result list; confirm). `json`
# and `copy` must also be imported.
# Per-character records and the full text of the original document.
origin_text_chars = joinTextByChars(origin_pages['result'], page)
origin_text = origin_pages['whole_text']
# Per-character records and the full text of the document being compared.
contrast_text_chars = joinTextByChars(contrast_pages['result'], page)
contrast_text = contrast_pages['whole_text']

# Diff the two texts, then locate every change on both documents.
patch = dmp.patch_make(origin_text,contrast_text)
patch_diff.extend(locDiffs(patch, origin_text_chars, contrast_text_chars))
# NOTE(review): `diff_res` is not assigned anywhere in the visible code, while
# the located changes are collected in `patch_diff` — confirm which variable
# is meant to be printed.
print(json.dumps(diff_res))

Tom不秃

关注

2
点赞
踩
7

收藏

觉得还不错? 一键收藏
0
评论
【python】diff-match-patch实现合同文本比对

1、将文件转成图片，OCR识别把比对文件转为图片OCR识别文字使用合合旗下一站式OCR云端服务平台，对文字进行快速、精准的检测和识别。2、google/diff-match-patch获取差异google/diff-match-patch该库最初建于 2006 年，用于支持 Google Docs，现在可用于 C++、C#、Dart、Java、JavaScript、Lua、Objective C 和 Python。Diff Match 和 Patch 库提供了强大的算法来执行同步纯文本所需.
复制链接

扫一扫