最小编辑距离(Minimum Edit Distance)
参考:https://web.stanford.edu/class/cs124/lec/med.pdf
目标(objective):给定两个字符串 $s_{1}$、$s_{2}$ 及编辑操作,求将字符串 $s_{1}$ 转换为 $s_{2}$ 所需的最小操作代价。
编辑操作:
- 插入(insert),插入一个新字符,代价(cost)$o_{\text{i}}$;
- 删除(delete),删除一个字符,代价(cost)$o_{\text{d}}$;
- 替换(substitute),将一个字符替换为另一个字符,代价(cost)$o_{\text{s}}$。
1 递归(Recursion)
计算时间复杂度(time complexity):$\mathcal{O}(3^{n})$
import numpy as np
# Default operation costs used by edit_distance_recur when no explicit cost
# is passed in.
COST_INSERT = 1
COST_DELETE = 1
COST_REPLACE = 2


def edit_distance_recur(
    str1, str2, cost_insert=None, cost_delete=None, cost_replace=None
):
    """Return the minimum edit cost to turn ``str1`` into ``str2`` (plain recursion).

    The two strings are compared from right to left. When the last characters
    match, both are dropped at no cost; otherwise insertion, deletion and
    replacement are each tried recursively and the cheapest one is kept
    (ties are resolved in the order insert, delete, replace).

    No memoisation is done, so the running time grows exponentially with the
    input length; use this only on short strings.

    Args:
        str1: Source string.
        str2: Target string.
        cost_insert: Cost of inserting one character. ``None`` (the default)
            means the module-level ``COST_INSERT``.
        cost_delete: Cost of deleting one character. ``None`` (the default)
            means the module-level ``COST_DELETE``.
        cost_replace: Cost of replacing one character by a different one.
            ``None`` (the default) means the module-level ``COST_REPLACE``.

    Returns:
        A ``(distance, op_sequence)`` tuple. ``op_sequence`` is a dict with
        the keys ``"insert"`` (characters of ``str2`` that are inserted),
        ``"delete"`` (characters of ``str1`` that are removed) and
        ``"replace"`` (strings of the form ``"x with y"``).
    """
    # Resolve defaults at call time so that changing the module constants
    # still affects calls that do not pass explicit costs.
    if cost_insert is None:
        cost_insert = COST_INSERT
    if cost_delete is None:
        cost_delete = COST_DELETE
    if cost_replace is None:
        cost_replace = COST_REPLACE

    # Base cases: one side is exhausted, so the rest is pure insert/delete.
    if not str1:
        return len(str2) * cost_insert, {
            "insert": list(str2),
            "delete": [],
            "replace": [],
        }
    if not str2:
        return len(str1) * cost_delete, {
            "insert": [],
            "delete": list(str1),
            "replace": [],
        }

    costs = (cost_insert, cost_delete, cost_replace)

    # Matching last characters are consumed for free.
    if str1[-1] == str2[-1]:
        return edit_distance_recur(str1[:-1], str2[:-1], *costs)

    distance_insert, ops_insert = edit_distance_recur(str1, str2[:-1], *costs)
    distance_delete, ops_delete = edit_distance_recur(str1[:-1], str2, *costs)
    distance_replace, ops_replace = edit_distance_recur(
        str1[:-1], str2[:-1], *costs
    )

    # (total cost, operation name, ops of the sub-problem, entry to record)
    candidates = [
        (distance_insert + cost_insert, "insert", ops_insert, str2[-1]),
        (distance_delete + cost_delete, "delete", ops_delete, str1[-1]),
        (
            distance_replace + cost_replace,
            "replace",
            ops_replace,
            str1[-1] + " with " + str2[-1],
        ),
    ]
    # min() returns the first minimal item, which preserves the
    # insert > delete > replace preference on ties.
    distance, op, op_sequence, entry = min(candidates, key=lambda item: item[0])
    op_sequence[op].append(entry)
    return distance, op_sequence
# Demo: cost of turning "intention" into "execution" with the module-level
# costs (insert=1, delete=1, replace=2); the result is shown below.
edit_distance_recur("intention", "execution")
(8,
{'insert': ['x', 'e', 'c', 'u'],
'delete': ['i', 'n', 't', 'n'],
'replace': []})
2 动态规划(Dynamic Programming)
动态规划(Dynamic programming):将问题分解为子问题(subproblem)求解;构造计算表(a tabular computation)$D(m, n)$
计算时间复杂度为 $\mathcal{O}(mn)$;计算空间复杂度为 $\mathcal{O}(mn)$
自下而上(bottom-up)
def edit_distance_dp(str1, str2, cost_insert=1, cost_delete=1, cost_replace=1):
    """Return the minimum edit cost to turn ``str1`` into ``str2`` (bottom-up DP).

    ``distance_table[i, j]`` holds the cheapest cost of converting the first
    ``i`` characters of ``str1`` into the first ``j`` characters of ``str2``.

    Args:
        str1: Source string.
        str2: Target string.
        cost_insert: Cost of inserting one character.
        cost_delete: Cost of deleting one character.
        cost_replace: Cost of replacing one character by a different one.

    Returns:
        The minimum total cost as a NumPy float (the table is a float array).
    """
    len_str1 = len(str1)
    len_str2 = len(str2)

    # Boundary conditions: reaching an empty target takes i deletions and
    # building j characters from an empty source takes j insertions, so the
    # borders must be scaled by the actual operation costs.
    distance_table = np.zeros(shape=(len_str1 + 1, len_str2 + 1))
    distance_table[:, 0] = np.arange(len_str1 + 1) * cost_delete
    distance_table[0, :] = np.arange(len_str2 + 1) * cost_insert

    for idx1 in range(1, len_str1 + 1):
        for idx2 in range(1, len_str2 + 1):
            # Table index i corresponds to character i - 1 of the string.
            if str1[idx1 - 1] == str2[idx2 - 1]:
                substitution = 0  # characters already agree
            else:
                substitution = cost_replace
            distance_table[idx1, idx2] = min(
                distance_table[idx1, idx2 - 1] + cost_insert,
                distance_table[idx1 - 1, idx2] + cost_delete,
                distance_table[idx1 - 1, idx2 - 1] + substitution,
            )
    return distance_table[-1, -1]
# Demo: same word pair as the recursive example, with the replace cost set
# to 2 (insert and delete keep their default cost of 1).
edit_distance_dp("intention", "execution", cost_replace=2)
8.0
3 回溯(Backtrace)
回溯(Backtrace)计算时间、空间复杂度均为 $\mathcal{O}(m + n)$
从 $(0, 0)$ 到 $(M, N)$ 的任意非减路径均为一条可行对齐路径;最优路径由最优子路径组成。
def edit_distance_dp_backtrace(str1, str2, cost_insert=1, cost_delete=1, cost_replace=1):
    """Return the minimum edit cost from ``str1`` to ``str2`` and one optimal edit script.

    The distance table is filled bottom-up, then walked back from the last
    cell to the origin. At every cell the forward recurrence is evaluated
    again and the predecessor that attains the minimum is followed, so the
    reported operations always add up to the returned distance. Ties are
    resolved in the order diagonal (keep/replace), insert, delete.

    Args:
        str1: Source string.
        str2: Target string.
        cost_insert: Cost of inserting one character.
        cost_delete: Cost of deleting one character.
        cost_replace: Cost of replacing one character by a different one.

    Returns:
        A ``(distance, path)`` tuple. ``distance`` is a NumPy float. ``path``
        lists the operations in left-to-right order; each entry is one of
        ``"insert c"``, ``"delete c"``, ``"replace a with b"`` or
        ``"remaind"`` (characters already equal, nothing to do).
    """
    len_str1 = len(str1)
    len_str2 = len(str2)

    # Boundary conditions scaled by the actual costs: i deletions to reach an
    # empty target, j insertions to build the target from an empty source.
    distance_table = np.zeros(shape=(len_str1 + 1, len_str2 + 1))
    distance_table[:, 0] = np.arange(len_str1 + 1) * cost_delete
    distance_table[0, :] = np.arange(len_str2 + 1) * cost_insert

    for idx1 in range(1, len_str1 + 1):
        for idx2 in range(1, len_str2 + 1):
            # Table index i corresponds to character i - 1 of the string.
            substitution = 0 if str1[idx1 - 1] == str2[idx2 - 1] else cost_replace
            distance_table[idx1, idx2] = min(
                distance_table[idx1 - 1, idx2 - 1] + substitution,
                distance_table[idx1, idx2 - 1] + cost_insert,
                distance_table[idx1 - 1, idx2] + cost_delete,
            )

    # Backtrace: collected from the end of the strings, reversed at the end.
    path = []
    idx1 = len_str1
    idx2 = len_str2
    while idx1 > 0 or idx2 > 0:
        if idx1 == 0:
            # Source exhausted: the remaining target characters are inserted.
            step = "insert"
        elif idx2 == 0:
            # Target exhausted: the remaining source characters are deleted.
            step = "delete"
        else:
            same = str1[idx1 - 1] == str2[idx2 - 1]
            candidates = [
                distance_table[idx1 - 1, idx2 - 1] + (0 if same else cost_replace),
                distance_table[idx1, idx2 - 1] + cost_insert,
                distance_table[idx1 - 1, idx2] + cost_delete,
            ]
            # The smallest candidate equals the current cell by construction;
            # list.index returns the first minimum, i.e. diagonal wins ties.
            step = ("replace", "insert", "delete")[candidates.index(min(candidates))]

        if step == "insert":
            path.append("{} {}".format(step, str2[idx2 - 1]))
            idx2 -= 1
        elif step == "delete":
            path.append("{} {}".format(step, str1[idx1 - 1]))
            idx1 -= 1
        else:
            if str1[idx1 - 1] == str2[idx2 - 1]:
                # Spelling kept as-is: callers may match on this exact string.
                path.append("remaind")
            else:
                path.append(
                    "{} {} with {}".format(step, str1[idx1 - 1], str2[idx2 - 1])
                )
            idx1 -= 1
            idx2 -= 1

    path.reverse()
    return distance_table[-1, -1], path
# Demo: run the DP with backtrace on the same word pair (replace cost 2) and
# print both the distance and the recovered list of operations.
min_edit_distance, optimal_path = edit_distance_dp_backtrace("intention", "execution", cost_replace=2)
print("min edit distance: ", min_edit_distance)
print("optimal path: ", optimal_path)
min edit distance: 8.0
optimal path: ['insert e', 'replace i with x', 'replace n with e', 'replace t with c', 'delete e', 'replace n with u', 'remaind', 'remaind', 'remaind', 'remaind']