前几天用C++实现了求两个字符串的最短编辑距离的算法,现在用Python重新实现一遍,基本思路和C++版本是一致的。
1.时间复杂度分析
Python的实现方式同C++实现的理论时间复杂度是一样的,运用动态规划方法都是O(m*n)。从测试结果来看,不同规模的数据耗费的时间基本同问题的规模(即m*n)呈线性增长,非递归比递归过程快得多。不过,比较反常的是空间优化后比优化前时间耗费减少,而优化后计算过程中多了必要的取模运算,理论上应该比优化前增加才是,尚未找到原因。
字符串X长度 | 字符串Y长度 | 最短编辑距离 | 递归时间(ms) | 非递归时间(ms) | 非递归(空间优化)时间(ms) |
---|---|---|---|---|---|
20 | 30 | 26 | 7.5 | 5.4 | 3.1 |
200 | 300 | 245 | 361 | 209 | 140 |
500 | 1000 | 815 | 17224 | 1344 | 1115 |
2000 | 1000 | 1635 | 101482 | 5721 | 4675 |
2000 | 3000 | 2411 | 299539 | 14503 | 14271 |
2.源代码
import random
import datetime
import sys
# edit_1 recurses one frame per step towards the table origin, so its depth
# can reach about len(x) + len(y); raise the interpreter's recursion limit
# up front so long inputs do not fail with a recursion error.
# NOTE(review): a limit this high can let deep recursion exhaust the C stack
# and crash the interpreter outright -- confirm it is safe on the target
# platform.
sys.setrecursionlimit(204800)
def edit_1(x, y, edit, i, j):
    """Return the edit distance between x[:i] and y[:j] by memoized recursion.

    Four operations are allowed, each costing 1: insert, delete, substitute,
    and swapping two adjacent characters.

    Args:
        x: First sequence (a string or a list of characters).
        y: Second sequence.
        edit: Memo table of size (len(x) + 1) x (len(y) + 1), mutated in
            place. Row 0 and column 0 are read as the base cases and must be
            filled by the caller (edit[i][0] == i, edit[0][j] == j); every
            other cell must hold the sentinel 100000 until it is computed.
        i: Length of the prefix of x to consider.
        j: Length of the prefix of y to consider.

    Returns:
        The distance for the requested prefixes, or 0 when (i, j) lies
        outside the table.
    """
    if not (0 <= i <= len(x) and 0 <= j <= len(y)):
        return 0

    # Base row/column and cells that no longer hold the 100000 "not computed
    # yet" marker are answered straight from the table.
    if i == 0 or j == 0 or edit[i][j] != 100000:
        return edit[i][j]

    insert_cost = edit_1(x, y, edit, i, j - 1) + 1
    delete_cost = edit_1(x, y, edit, i - 1, j) + 1
    diagonal = edit_1(x, y, edit, i - 1, j - 1)

    if x[i - 1] == y[j - 1]:
        # Matching last characters: the diagonal step is free.
        best = min(insert_cost, delete_cost, diagonal)
    else:
        best = min(insert_cost, delete_cost, diagonal + 1)
        swapped = (
            i >= 2 and j >= 2
            and x[i - 2] == y[j - 1]
            and x[i - 1] == y[j - 2]
        )
        if swapped:
            # The last two characters are each other's mirror image, so one
            # adjacent swap on top of the (i-2, j-2) solution also works.
            best = min(best, edit_1(x, y, edit, i - 2, j - 2) + 1)

    edit[i][j] = best
    return best
def edit_length_2(x, y):
    """Return the edit distance between x and y using a full DP table.

    Four operations are allowed, each costing 1: insert, delete, substitute,
    and swapping two adjacent characters. The table has
    (len(x) + 1) * (len(y) + 1) cells and each is filled once.

    Args:
        x: First sequence (a string or a list of characters).
        y: Second sequence.

    Returns:
        The minimum number of operations turning x into y.
    """
    rows = len(x) + 1
    cols = len(y) + 1

    # edit[i][j] is the distance between x[:i] and y[:j]. Row 0 and column 0
    # are the "build from / reduce to the empty prefix" base cases.
    edit = [[-1] * cols for _ in range(rows)]
    for i in range(rows):
        edit[i][0] = i
    for j in range(cols):
        edit[0][j] = j

    for i in range(1, rows):
        prev = edit[i - 1]
        cur = edit[i]
        for j in range(1, cols):
            if x[i - 1] == y[j - 1]:
                # Matching characters: the diagonal step is free.
                cur[j] = min(cur[j - 1] + 1, prev[j] + 1, prev[j - 1])
                continue

            best = min(cur[j - 1], prev[j], prev[j - 1]) + 1
            swapped = (
                i >= 2 and j >= 2
                and x[i - 2] == y[j - 1]
                and x[i - 1] == y[j - 2]
            )
            if swapped:
                # One adjacent swap on top of the (i-2, j-2) solution.
                best = min(best, edit[i - 2][j - 2] + 1)
            cur[j] = best

    return edit[len(x)][len(y)]
def edit_length_3(x, y):
    """Return the edit distance between x and y using three rolling rows.

    Same recurrence as the full-table version (insert, delete, substitute
    and adjacent swap, each costing 1), but only the current row and the two
    rows above it are kept, so memory is proportional to len(y).

    Args:
        x: First sequence (a string or a list of characters).
        y: Second sequence.

    Returns:
        The minimum number of operations turning x into y.
    """
    cols = len(y) + 1

    # Logical row i lives in physical row i % 3; row 0 is the base case.
    edit = [[-1] * cols for _ in range(3)]
    for j in range(cols):
        edit[0][j] = j

    for i in range(1, len(x) + 1):
        # Resolve the three physical rows once per i instead of taking a
        # modulo on every cell access.
        cur = edit[i % 3]
        prev = edit[(i - 1) % 3]
        prev2 = edit[(i - 2) % 3]  # only read when i >= 2

        cur[0] = prev[0] + 1
        for j in range(1, cols):
            if x[i - 1] == y[j - 1]:
                # Matching characters: the diagonal step is free.
                cur[j] = min(cur[j - 1] + 1, prev[j] + 1, prev[j - 1])
                continue

            best = min(cur[j - 1], prev[j], prev[j - 1]) + 1
            swapped = (
                i >= 2 and j >= 2
                and x[i - 2] == y[j - 1]
                and x[i - 1] == y[j - 2]
            )
            if swapped:
                # One adjacent swap on top of the (i-2, j-2) solution.
                best = min(best, prev2[j - 2] + 1)
            cur[j] = best

    return edit[len(x) % 3][len(y)]
def test_1(x, y):
    """Run the memoized recursive solver on x and y and print the distance.

    Builds the memo table edit_1 expects: base cases in row 0 and column 0,
    and the sentinel 100000 ("not computed yet") everywhere else.

    Args:
        x: First sequence (a string or a list of characters).
        y: Second sequence.
    """
    rows = len(x) + 1
    cols = len(y) + 1
    edit = [[100000] * cols for _ in range(rows)]
    for i in range(rows):
        edit[i][0] = i
    for j in range(cols):
        edit[0][j] = j

    distance = edit_1(x, y, edit, len(x), len(y))
    # Single pre-formatted argument: parses on Python 2 and 3 alike and
    # produces the same text as the old `print "...", value` statement.
    print("Edit_Length_1:  %s" % (distance,))
def test_2(x, y):
    """Run the full-table solver on x and y and print the distance.

    Args:
        x: First sequence (a string or a list of characters).
        y: Second sequence.
    """
    distance = edit_length_2(x, y)
    # Single pre-formatted argument: parses on Python 2 and 3 alike and
    # produces the same text as the old `print "...", value` statement.
    print("Edit_Length_2:  %s" % (distance,))
def test_3(x, y):
    """Run the rolling-row solver on x and y and print the distance.

    Args:
        x: First sequence (a string or a list of characters).
        y: Second sequence.
    """
    distance = edit_length_3(x, y)
    # Single pre-formatted argument: parses on Python 2 and 3 alike and
    # produces the same text as the old `print "...", value` statement.
    print("Edit_Length_3:  %s" % (distance,))
def rand_str(length):
    """Return a list of `length` random lowercase ASCII letters.

    The result is a list of one-character strings, not a str; callers index
    it and take its len().

    Args:
        length: Number of characters to generate.

    Returns:
        A list with `length` entries drawn via random.choice from a-z.
    """
    alphabet = "abcdefghijklmnopqrstuvwxyz"
    return [random.choice(alphabet) for _ in range(length)]
def main():
    """Generate two random strings and time the three solvers on them.

    Prints both strings, then each solver's result as it runs, and finally
    the wall-clock time each solver took.
    """
    x = rand_str(20)
    y = rand_str(30)

    # The double spaces and the trailing space reproduce the layout of the
    # original comma-separated print statements.
    for label, chars in (("X", x), ("Y", y)):
        print("The String %s Length is :  %d  String is :  %s "
              % (label, len(chars), " ".join(chars)))

    # Time each solver separately; timings are reported only after all
    # three have printed their results.
    elapsed = []
    for solver in (test_1, test_2, test_3):
        started = datetime.datetime.now()
        solver(x, y)
        elapsed.append(datetime.datetime.now() - started)

    for number, spent in enumerate(elapsed, 1):
        print("Function %d spend  %s" % (number, spent))
# Run the benchmark only when executed as a script, so importing this module
# for its edit-distance functions does not trigger it.
if __name__ == "__main__":
    main()