题目:
分析:
a:
- 设字符串x的长度为m、y的长度为n;下标i的取值范围为[0, 1, 2, ..., m],j的取值范围为[0, 1, 2, ..., n],分别表示x的前i个字符和y的前j个字符;
- 二维数组d[i, j]为距离矩阵,表示把x的前i个字符转换为y的前j个字符的最小代价;我们的目标是求x和y的编辑距离,即d[m, n];
- 二维数组option[i, j]用于存储每一步的操作;
- dict_cost为每种操作的代价,字典格式;
分析不同操作的条件和代价:
- 如果
1 <= i <= m
且1 <= j <= n
:
- copy
if x[i - 1] == y[j - 1]:
dis[0] = dict_cost['copy'] + d[i-1][j-1]
- replace:
个人认为这里x和y对应位置的字符不一样时,也是可以进行替换的
dis[1] = dict_cost['replace'] + d[i-1][j-1]
- delete
dis[2] = dict_cost['delete'] + d[i-1][j]
- insert
dis[3] = dict_cost['insert'] + d[i][j-1]
- twiddle
if i >= 2 and j >= 2 and x[i-1] == y[j-2] and x[i-2] == y[j-1]:
dis[4] = dict_cost['twiddle'] + d[i-2][j-2]
- kill
只能在j = n
时,才会执行kill操作;kill操作需要最后单独执行;
# kill的情况单独处理
kill_index = -1
for i in range(1, m):
temp = d[i][n] + dict_cost['kill']
if temp < d[m][n]:
kill_index = i
d[m][n] = temp
option[m][n] = 5
在上述几种操作(除kill以外)中取代价最小的操作,并记录这一步的操作:
d[i][j] = min(dis)
option[i][j] = dis.index(min(dis))
- 边界情况:
i = 0
且j = 0
d[0][0] = 0
option[0][0] = -1
i = 0
且j != 0
,此时x为空,只能执行插入操作
for j in range(1, n + 1):
d[0][j] = j * dict_cost['insert']
option[0][j] = 3
j = 0
且i != 0
,此时y为空,只能执行删除操作
for i in range(1, m + 1):
d[i][0] = i * dict_cost['delete']
option[i][0] = 2
完整的代码如下:
"""
算法导论(第三版) P232,15-5 编辑距离
"""
import numpy as np
def print_result(i, j, kill_index, option):
    """Print the edit operations that lead to cell ``(i, j)``, one per line.

    The operation sequence is rebuilt by walking ``option`` backwards from
    ``(i, j)`` until a cell holding ``-1`` is reached, and is then printed in
    forward order. The walk is iterative, so the length of the sequence is
    not limited by the interpreter's recursion depth.

    Args:
        i: Row index at which the backtrace starts.
        j: Column index at which the backtrace starts.
        kill_index: Row to jump back to when a cell records the kill
            operation.
        option: 2-D table indexable as ``option[i][j]`` holding operation
            codes: -1 stop, 0 copy, 1 replace, 2 delete, 3 insert,
            4 twiddle; any other value is treated as kill.

    Raises:
        ValueError: If the table would move the walk to a negative index or
            a kill entry does not jump to a strictly smaller row; either
            would otherwise make the walk loop forever.
    """
    # code -> (rows consumed, columns consumed, printed label)
    steps = {
        0: (1, 1, 'copy'),
        1: (1, 1, 'replace'),
        2: (1, 0, 'delete'),
        3: (0, 1, 'insert'),
        4: (2, 2, 'twiddle'),
    }

    trace = []
    while option[i][j] != -1:
        step = steps.get(option[i][j])
        if step is None:
            # Kill keeps the column and jumps straight back to kill_index.
            if not 0 <= kill_index < i:
                raise ValueError('kill entry does not point to an earlier row')
            i = kill_index
            trace.append('kill')
            continue
        row_step, col_step, label = step
        i -= row_step
        j -= col_step
        if i < 0 or j < 0:
            raise ValueError('option table steps outside the matrix')
        trace.append(label)

    # Operations were collected last-to-first; emit them in execution order.
    for label in reversed(trace):
        print(label)
def function(x, y, dict_cost):
    """Compute the minimum-cost edit sequence turning ``x`` into ``y`` and print it.

    Fills a ``(len(x) + 1) x (len(y) + 1)`` cost table ``d`` where ``d[i][j]``
    is the cheapest way to turn the first ``i`` characters of ``x`` into the
    first ``j`` characters of ``y`` using copy, replace, delete, insert and
    twiddle. The kill operation is applied afterwards as a final step. The
    total cost and the chosen operations are printed to stdout.

    Args:
        x: Source string.
        y: Target string.
        dict_cost: Mapping with the cost of each operation under the keys
            'copy', 'replace', 'delete', 'insert', 'twiddle' and 'kill'.
    """
    m, n = len(x), len(y)
    d = np.full((m + 1, n + 1), np.inf, dtype=float)  # cost table
    # Chosen operation per cell:
    # -1 none, 0 copy, 1 replace, 2 delete, 3 insert, 4 twiddle, 5 kill
    option = np.full((m + 1, n + 1), -1, dtype=int)

    # Boundaries: an empty x can only insert, an empty y can only delete.
    d[0][0] = 0
    option[0][0] = -1
    for j in range(1, n + 1):
        d[0][j] = j * dict_cost['insert']
        option[0][j] = 3
    for i in range(1, m + 1):
        d[i][0] = i * dict_cost['delete']
        option[i][0] = 2

    for i in range(1, m + 1):
        for j in range(1, n + 1):
            # Candidate costs in the order copy, replace, delete, insert, twiddle.
            dis = [np.inf] * 5
            if x[i - 1] == y[j - 1]:
                dis[0] = dict_cost['copy'] + d[i-1][j-1]
            # Replace is deliberately allowed even when the characters match.
            dis[1] = dict_cost['replace'] + d[i-1][j-1]
            dis[2] = dict_cost['delete'] + d[i-1][j]
            dis[3] = dict_cost['insert'] + d[i][j-1]
            if i >= 2 and j >= 2 and x[i-1] == y[j-2] and x[i-2] == y[j-1]:
                dis[4] = dict_cost['twiddle'] + d[i-2][j-2]
            d[i][j] = min(dis)
            option[i][j] = dis.index(min(dis))

    # Kill is only possible once all of y has been produced (j == n) and must
    # be the last operation. It may discard any non-empty remainder of x, so
    # every i in [0, m) is a candidate -- including i == 0, where nothing of
    # x has been consumed yet.
    kill_index = -1
    for i in range(m):
        temp = d[i][n] + dict_cost['kill']
        if temp < d[m][n]:
            kill_index = i
            d[m][n] = temp
            option[m][n] = 5

    # Report the result.
    print('Total cost:', d[m][n])
    print('Process:')
    print_result(m, n, kill_index, option)
if __name__ == '__main__':
    # Demo run: transform one word into the other with per-operation costs.
    x, y = 'algorithm', 'altruistic'
    dict_cost = {
        'copy': 1,
        'replace': 2,
        'delete': 3,
        'insert': 4,
        'twiddle': 5,
        'kill': 6,
    }
    function(x, y, dict_cost)
运行结果:
该算法的时间复杂度和空间复杂度均为O(mn)。
b:
可以将DNA对齐问题转化为编辑距离问题,对应的操作为:复制、替换、插入、删除;与a不同的是,此时每种操作对应的是得分,要求的是两个字符串对齐后的最大总得分(而不是最小代价):
- 复制:对齐后同一位置上x'[j] = y'[j],且都不是空格,得分为+1;
- 替换:对齐后同一位置上x'[j] != y'[j],且都不是空格,得分为-1;
- 插入:在x中插入空格,得分为-2;
- 删除:在y中插入空格,得分为-2;
递归打印结果:
def get_results(x, y, i, j, option):
    """Rebuild the two aligned sequences ending at cell ``(i, j)``.

    Walks ``option`` backwards from ``(i, j)`` until a cell holding ``-1`` is
    reached. The walk is iterative and appends in place, so long sequences
    neither hit the recursion limit nor pay for repeated list copies.

    Args:
        x: First sequence.
        y: Second sequence.
        i: Number of characters of ``x`` covered by the alignment.
        j: Number of characters of ``y`` covered by the alignment.
        option: 2-D table indexable as ``option[i][j]`` holding operation
            codes: -1 stop, 0 copy, 1 replace, 2 delete; any other value is
            treated as insert.

    Returns:
        A pair of equal-length lists ``(xx, yy)``: the characters of ``x`` and
        ``y`` in alignment order, with ``' '`` marking a gap.

    Raises:
        ValueError: If the table would move the walk to a negative index.
    """
    aligned_x, aligned_y = [], []
    while option[i][j] != -1:
        code = option[i][j]
        if code == 0 or code == 1:  # copy/replace: consume one char of each
            i -= 1
            j -= 1
            pair = (x[i], y[j]) if i >= 0 and j >= 0 else None
        elif code == 2:  # delete: char of x faces a gap in y
            i -= 1
            pair = (x[i], ' ') if i >= 0 else None
        else:  # insert: char of y faces a gap in x
            j -= 1
            pair = (' ', y[j]) if j >= 0 else None
        if pair is None:
            raise ValueError('option table steps outside the matrix')
        aligned_x.append(pair[0])
        aligned_y.append(pair[1])

    # Columns were collected last-to-first; restore left-to-right order.
    aligned_x.reverse()
    aligned_y.reverse()
    return aligned_x, aligned_y
运行结果:
完整代码:
"""
算法导论(第三版) P232,15-5 编辑距离
"""
import numpy as np
def get_results(x, y, i, j, option):
    """Rebuild the two aligned sequences ending at cell ``(i, j)``.

    Walks ``option`` backwards from ``(i, j)`` until a cell holding ``-1`` is
    reached. The walk is iterative and appends in place, so long sequences
    neither hit the recursion limit nor pay for repeated list copies.

    Args:
        x: First sequence.
        y: Second sequence.
        i: Number of characters of ``x`` covered by the alignment.
        j: Number of characters of ``y`` covered by the alignment.
        option: 2-D table indexable as ``option[i][j]`` holding operation
            codes: -1 stop, 0 copy, 1 replace, 2 delete; any other value is
            treated as insert.

    Returns:
        A pair of equal-length lists ``(xx, yy)``: the characters of ``x`` and
        ``y`` in alignment order, with ``' '`` marking a gap.

    Raises:
        ValueError: If the table would move the walk to a negative index.
    """
    aligned_x, aligned_y = [], []
    while option[i][j] != -1:
        code = option[i][j]
        if code == 0 or code == 1:  # copy/replace: consume one char of each
            i -= 1
            j -= 1
            pair = (x[i], y[j]) if i >= 0 and j >= 0 else None
        elif code == 2:  # delete: char of x faces a gap in y
            i -= 1
            pair = (x[i], ' ') if i >= 0 else None
        else:  # insert: char of y faces a gap in x
            j -= 1
            pair = (' ', y[j]) if j >= 0 else None
        if pair is None:
            raise ValueError('option table steps outside the matrix')
        aligned_x.append(pair[0])
        aligned_y.append(pair[1])

    # Columns were collected last-to-first; restore left-to-right order.
    aligned_x.reverse()
    aligned_y.reverse()
    return aligned_x, aligned_y
def function(x, y, dict_cost):
    """Align ``x`` and ``y`` for maximum total score and print the result.

    Fills a ``(len(x) + 1) x (len(y) + 1)`` score table, choosing for every
    cell the highest-scoring of copy, replace, delete and insert, then prints
    the total score followed by the two aligned sequences.

    Args:
        x: First sequence.
        y: Second sequence.
        dict_cost: Mapping with the score of each operation under the keys
            'copy', 'replace', 'delete' and 'insert'.
    """
    rows, cols = len(x) + 1, len(y) + 1
    d = np.full((rows, cols), -np.inf, dtype=float)  # score table
    # Chosen operation per cell: -1 none, 0 copy, 1 replace, 2 delete, 3 insert
    option = np.full((rows, cols), -1, dtype=int)

    # Boundaries: the first row is all inserts, the first column all deletes.
    d[0, 0] = 0
    option[0, 0] = -1
    d[0, 1:] = [col * dict_cost['insert'] for col in range(1, cols)]
    option[0, 1:] = 3
    d[1:, 0] = [row * dict_cost['delete'] for row in range(1, rows)]
    option[1:, 0] = 2

    for row in range(1, rows):
        for col in range(1, cols):
            diagonal = d[row - 1, col - 1]
            # Copy is only a candidate when the two characters match.
            if x[row - 1] == y[col - 1]:
                copy_score = dict_cost['copy'] + diagonal
            else:
                copy_score = -np.inf
            scores = [
                copy_score,
                dict_cost['replace'] + diagonal,
                dict_cost['delete'] + d[row - 1, col],
                dict_cost['insert'] + d[row, col - 1],
            ]
            best = max(scores)
            d[row, col] = best
            # On ties the first operation in the list wins.
            option[row, col] = scores.index(best)

    # Report the result.
    print('Total cost:', d[rows - 1][cols - 1])
    print('Process:')
    xx, yy = get_results(x, y, rows - 1, cols - 1, option)
    print(xx)
    print(yy)
if __name__ == '__main__':
    # Demo run: align two DNA sequences with per-operation scores.
    x, y = 'GATCGGCAT', 'CAATGTGAATC'
    dict_cost = {
        'copy': 1,
        'replace': -1,
        'delete': -2,
        'insert': -2,
    }
    function(x, y, dict_cost)