今天看了浏览器拼写检查的实现流程,发现其中最核心的部分就是计算两个字符串之间的最小编辑距离;同时又联想到之前用动态规划实现的公共子串等相关问题,所以在这里做一个总结:用同一段代码同时求解两个字符串之间的最小编辑距离、最长公共子序列(不要求连续)以及最长公共子串(要求连续)。注意:这里插入和删除的代价为 1,替换的代价记为 2(相当于一次删除加一次插入)。代码如下(Python):
def compute_distance(A, B, distance_dict, substr_len, sub_str, sub_continue_str, sub_continue_str_len):
    """Fill dynamic-programming tables comparing every prefix of A and B.

    Every table is a dict keyed by ``(i, j)``, meaning "the first ``i``
    items of A versus the first ``j`` items of B".  All cells with
    ``0 <= i <= len(A)`` and ``0 <= j <= len(B)`` are (re)written, so
    entries left over from an earlier call can never leak into the result.

    The tables are filled iteratively, row by row, rather than by
    recursion, so long inputs cannot hit Python's recursion limit and no
    prefix copies of A or B are made.

    Args:
        A: First sequence (indexable, e.g. a string).
        B: Second sequence (indexable, e.g. a string).
        distance_dict: Receives the edit distance of each prefix pair.
            Insertion and deletion cost 1; substitution costs 2.
        substr_len: Receives the longest-common-subsequence length.
        sub_str: Receives the longest common subsequence as a list.
        sub_continue_str: Receives the common contiguous run that ends
            exactly at the last item of both prefixes (empty list when
            the last items differ).
        sub_continue_str_len: Receives the length of that run.

    Returns:
        A 5-tuple ``(distance, lcs_length, lcs, run, run_length)`` read
        from cell ``(len(A), len(B))``.  The longest common contiguous
        substring overall is the maximum over *all* cells of
        ``sub_continue_str_len``, not just this final cell.
    """
    rows = len(A)
    cols = len(B)

    for i in range(rows + 1):
        for j in range(cols + 1):
            cell = (i, j)

            if i == 0 or j == 0:
                # One prefix is empty: the distance is the length of the
                # other prefix and nothing can be shared.
                distance_dict[cell] = j if i == 0 else i
                substr_len[cell] = 0
                sub_str[cell] = []
                sub_continue_str[cell] = []
                sub_continue_str_len[cell] = 0
                continue

            up = (i - 1, j)        # drop the last item of A's prefix
            left = (i, j - 1)      # drop the last item of B's prefix
            diag = (i - 1, j - 1)  # drop the last item of both

            if A[i - 1] == B[j - 1]:
                # Matching last items: free diagonal move, and both the
                # subsequence and the contiguous run grow by one item.
                distance_dict[cell] = min(
                    distance_dict[up] + 1,
                    distance_dict[left] + 1,
                    distance_dict[diag],
                )
                substr_len[cell] = substr_len[diag] + 1
                sub_str[cell] = sub_str[diag] + [A[i - 1]]
                sub_continue_str[cell] = sub_continue_str[diag] + [A[i - 1]]
                sub_continue_str_len[cell] = sub_continue_str_len[diag] + 1
            else:
                # Differing last items: a substitution costs 2, the
                # subsequence is inherited from the better neighbour
                # (the left one on ties), and the contiguous run resets.
                distance_dict[cell] = min(
                    distance_dict[up] + 1,
                    distance_dict[left] + 1,
                    distance_dict[diag] + 2,
                )
                substr_len[cell] = max(substr_len[up], substr_len[left])
                if substr_len[up] > substr_len[left]:
                    sub_str[cell] = sub_str[up]
                else:
                    sub_str[cell] = sub_str[left]
                sub_continue_str_len[cell] = 0
                sub_continue_str[cell] = []

    final = (rows, cols)
    return (
        distance_dict[final],
        substr_len[final],
        sub_str[final],
        sub_continue_str[final],
        sub_continue_str_len[final],
    )
def main():
    """Run the demo on two sample strings and print the three results.

    Prints the minimum edit distance, the longest common subsequence and
    the longest common contiguous substring of the sample inputs.
    """
    # Other sample pairs to try:
    #   'INTENaTION' / 'EXECUaTIONEE'
    #   'AGGCTATCACCTGACCTCCAGGCCGATGCCC' / 'TAGCTATCACGACCGCGGTCGATTTGCCCGAC'
    A = 'asdefats'
    B = 'werasdfaswer'

    distance_dict = {}         # edit distance per prefix pair
    substr_len = {}            # longest common subsequence length
    substr = {}                # longest common subsequence
    sub_continue_str = {}      # common contiguous run ending at each cell
    sub_continue_str_len = {}  # length of that run

    # Only the distance and the subsequence of the full strings are used
    # here; the contiguous result is looked up from the tables below.
    dis, _lcs_len, lcs, _run, _run_len = compute_distance(
        A, B, distance_dict, substr_len, substr,
        sub_continue_str, sub_continue_str_len)

    # ---------result-------------
    max_sub_str = lcs
    min_edit_distance = dis
    # The longest contiguous run may end anywhere, so scan every cell and
    # take the first one holding the maximum length.
    key = max(sub_continue_str_len, key=sub_continue_str_len.get)
    max_continue_sub_str = sub_continue_str[key]

    print('the min edit distance is : ', min_edit_distance)
    print('the max sub squence is: ', max_sub_str)
    print('the max continue sub squence is: ', max_continue_sub_str)


if __name__ == '__main__':
    main()
运行结果如下:
the min edit distance is : 8
the max sub squence is: ['a', 's', 'd', 'f', 'a', 's']
the max continue sub squence is: ['a', 's', 'd']