1、将文件转成图片,OCR识别
- 把比对文件转为图片
- OCR识别文字
使用合合旗下一站式OCR云端服务平台,对文字进行快速、精准的检测和识别。
2、google/diff-match-patch获取差异
google/diff-match-patch 库最初创建于 2006 年,用于支持 Google Docs,现已提供 C++、C#、Dart、Java、JavaScript、Lua、Objective-C 和 Python 等多种语言的实现。
Diff Match 和 Patch 库提供了强大的算法来执行同步纯文本所需的操作。
- 差异:
比较两个纯文本块并有效地返回差异列表。
- 匹配:
给定一个搜索字符串,在纯文本块中找到它的最佳模糊匹配。对准确性和位置进行加权。
- 修补:
将补丁列表应用于纯文本。即使底层文本不匹配,也要尽最大努力应用补丁。
# Project-local copy of the diff-match-patch implementation.
from core.base.diff_match_patch import diff_match_patch
# Module-level instance; locDiffs reads its DIFF_EQUAL / DIFF_DELETE /
# DIFF_INSERT constants through this name.
dmp = diff_match_patch()
# Build the patch list describing how origin_text differs from contrast_text.
# NOTE(review): origin_text and contrast_text are not defined in this snippet;
# they are presumably the OCR "whole_text" strings -- confirm against caller.
patch = dmp.patch_make(origin_text,contrast_text)
3、获取每一个字符的坐标、页码等数据
def joinTextByChars(recognize_text, page):
    """Flatten one page of OCR output into a list of per-character records.

    Args:
        recognize_text: mapping with a 'lines' sequence; every line provides
            'text', 'char_polygons' (one flat 8-number polygon per character)
            and 'position' (flat 8-number polygon of the whole line).
        page: page identifier stored verbatim on every record.

    Returns:
        A list of dicts with the keys 'page', 'line' (1-based line number),
        'char', 'position' and 'line_position'. Both positions are lists of
        four (x, y) tuples (top-left, top-right, bottom-right, bottom-left).
        After the characters of each line a '\n' record is appended whose
        'position' and 'line_position' are None.
    """

    def as_corners(flat):
        # Pair up a flat [x0, y0, ..., x3, y3] polygon into four points.
        return [
            (flat[0], flat[1]),
            (flat[2], flat[3]),
            (flat[4], flat[5]),
            (flat[6], flat[7]),
        ]

    records = []
    for line_no, ocr_line in enumerate(recognize_text['lines'], start=1):
        for char_idx, ch in enumerate(ocr_line['text']):
            # Indexed lookup on purpose: a text/polygon length mismatch
            # raises instead of being silently truncated.
            polygon = ocr_line['char_polygons'][char_idx]
            line_polygon = ocr_line['position']
            records.append(
                {
                    'page': page,
                    'line': line_no,
                    'char': ch,
                    'position': as_corners(polygon),
                    'line_position': as_corners(line_polygon),
                }
            )
        # Lines are joined with '\n'; the separator has no geometry.
        records.append(
            {
                'page': page,
                'line': line_no,
                'char': '\n',
                'position': None,
                'line_position': None,
            }
        )
    return records
4、定位每个差异的起始位置
def locDiffs(patch, origin_text_chars, contrast_text_chars):
    """Turn a patch list into located origin/contrast difference pairs.

    Args:
        patch: iterable of patch objects exposing ``diffs`` (pairs of
            operation and text), ``start1``, ``start2``, ``length1`` and
            ``length2`` (the driver code passes the result of
            ``dmp.patch_make``).
        origin_text_chars: per-character records of the original text.
        contrast_text_chars: per-character records of the compared text.

    Returns:
        A list of ``{'origin': ..., 'contrast': ...}`` dicts; each side is
        the result of ``locDiffPositions`` for that side's diff descriptor.
    """

    def unchanged_side(index):
        # Zero-length placeholder for the side that has no change of its own.
        return {
            "diff_text": "",
            "diff_tag": dmp.DIFF_EQUAL,
            "text_se_index": [index, index],
        }

    collected = []
    # Running sum of (length1 - length2) over the patches already visited;
    # it is added to start1 to obtain indices into the original text.
    origin_shift = 0
    for hunk in patch:
        origin_used = 0
        contrast_used = 0
        pair = {}
        change_seen = False
        for op, text in hunk.diffs:
            origin_at = hunk.start1 + origin_shift + origin_used
            contrast_at = hunk.start2 + contrast_used
            if op == dmp.DIFF_EQUAL:
                if change_seen:
                    # An equal run closes the pending change: fill in the
                    # missing side, advance past the changed text, emit.
                    if "origin" not in pair:
                        pair["origin"] = unchanged_side(origin_at)
                    if "contrast" not in pair:
                        pair["contrast"] = unchanged_side(contrast_at)
                    origin_used += len(pair["origin"]["diff_text"])
                    contrast_used += len(pair["contrast"]["diff_text"])
                    collected.append(pair)
                    pair = {}
                origin_used += len(text)
                contrast_used += len(text)
            elif op == dmp.DIFF_DELETE:
                change_seen = True
                pair["origin"] = {
                    "diff_text": text,
                    "diff_tag": op,
                    "text_se_index": [origin_at, origin_at + len(text)],
                }
            elif op == dmp.DIFF_INSERT:
                change_seen = True
                pair["contrast"] = {
                    "diff_text": text,
                    "diff_tag": op,
                    "text_se_index": [contrast_at, contrast_at + len(text)],
                }
        if pair:
            # The patch ended on a change without a trailing equal run.
            if "origin" not in pair:
                pair["origin"] = unchanged_side(
                    hunk.start1 + origin_shift + origin_used
                )
            if "contrast" not in pair:
                pair["contrast"] = unchanged_side(hunk.start2 + contrast_used)
            collected.append(pair)
        origin_shift = origin_shift + hunk.length1 - hunk.length2

    return [
        {
            'origin': locDiffPositions(item['origin'], origin_text_chars),
            'contrast': locDiffPositions(item['contrast'], contrast_text_chars),
        }
        for item in collected
    ]
5、 获取每个diff差异的坐标块
def _union_quads(first, second):
    """Return a new quad (TL, TR, BR, BL) that encloses both input quads."""
    return [
        (min(first[0][0], second[0][0]), min(first[0][1], second[0][1])),
        (max(first[1][0], second[1][0]), min(first[1][1], second[1][1])),
        (max(first[2][0], second[2][0]), max(first[2][1], second[2][1])),
        (min(first[3][0], second[3][0]), max(first[3][1], second[3][1])),
    ]


def locDiffPositions(diff, text_chars):
    """Attach bounding boxes to one side of a difference.

    Args:
        diff: dict with 'diff_text', 'diff_tag' and 'text_se_index'
            (``[start, end)`` indices into ``text_chars``).
        text_chars: per-character records with the keys 'page', 'line',
            'position' and 'line_position'; separator records carry
            ``None`` positions and are skipped. The records are only read,
            never modified.

    Returns:
        A deep copy of ``diff`` extended with:
        * 'diff_positions': ``{page: [quad, ...]}``, one merged quad per
          text line touched by the diff;
        * 'line_positions': ``{page: quad}`` enclosing the touched lines.
        If 'diff_text' consists solely of '\n' / '\u00AD' characters it is
        reset to '' and both mappings stay empty. A zero-length range is
        widened by one character (also in the returned 'text_se_index') so
        the change can be anchored to a neighbouring character.
    """
    rtn = copy.deepcopy(diff)
    rtn['diff_positions'] = {}
    rtn['line_positions'] = {}

    text = rtn['diff_text']
    if text != '' and text.replace('\n', '').replace('\u00AD', '') == '':
        # Only line/page separators changed: nothing to highlight.
        rtn['diff_text'] = ''
        return rtn

    start, end = rtn['text_se_index'][0], rtn['text_se_index'][1]
    if start == end:
        end = end + 1
        rtn['text_se_index'][1] = end

    line_now = None
    page_now = None
    diff_position = None
    # Clamp the upper bound: an empty diff anchored at the very end of the
    # text would otherwise index one past the last character.
    for i in range(start, min(end, len(text_chars))):
        text_char = text_chars[i]
        if text_char['position'] is None:  # line / page separator
            continue
        page = text_char['page']
        if page != page_now:  # first character, or the diff crossed a page
            if page_now is not None:
                rtn['diff_positions'].setdefault(page_now, []).append(diff_position)
            rtn['diff_positions'][page] = []
            # Copy the quads so the caller's records are never aliased.
            rtn['line_positions'][page] = list(text_char['line_position'])
            line_now = text_char['line']
            page_now = page
            diff_position = list(text_char['position'])
            continue
        if text_char['line'] != line_now:  # diff continues on a new line
            rtn['diff_positions'][page_now].append(diff_position)
            rtn['line_positions'][page] = _union_quads(
                rtn['line_positions'][page], text_char['line_position']
            )
            line_now = text_char['line']
            diff_position = list(text_char['position'])
        # Grow the current line's box to include this character.
        diff_position = _union_quads(diff_position, text_char['position'])

    if diff_position is not None and len(diff_position) > 0:
        rtn['diff_positions'][page_now].append(diff_position)
    return rtn
6、完整调用代码
# Per-character records and full text of the original document.
# NOTE(review): `page`, `origin_pages` and `contrast_pages` are not defined in
# this snippet; presumably this runs once per page of OCR output -- confirm.
origin_text_chars = joinTextByChars(origin_pages['result'], page)
origin_text = origin_pages['whole_text']
# Same for the document being compared against.
contrast_text_chars = joinTextByChars(contrast_pages['result'], page)
contrast_text = contrast_pages['whole_text']
# Diff the two texts, then locate every difference on the page.
patch = dmp.patch_make(origin_text,contrast_text)
# NOTE(review): `patch_diff` must already exist as a list at this point.
patch_diff.extend(locDiffs(patch, origin_text_chars, contrast_text_chars))
# NOTE(review): `diff_res` is not assigned anywhere in the visible code; the
# located differences are accumulated in `patch_diff` -- verify which name
# is intended here.
print(json.dumps(diff_res))