Hunt-Szymanski算法 Python实现
该算法是 LCS(Longest Common Subsequence,最长公共子序列)问题的一个优化算法,由 Hunt 和 Szymanski 在 1977 年提出。网上找了很久,但是都没有一个很详细的解释和代码实现,理解的过程也是云里雾里,所以现在先把老师给的标准答案写在这,之后自己学懂了再慢慢解释。老师的代码是 Python 实现的。(之后如果有学弟学妹发现了这篇帖子,别给老板打小报告哈。)
以下是该算法的完整代码:
def similarity_Hunt_and_Szymanski(s1, s2):
    """Return the similarity between two strings.

    The similarity is the length of a longest common subsequence, i.e. the
    maximal number of characters that appear in the same order in the two
    strings.  Lists and tuples of symbols are accepted as well; symbols are
    compared with ``==``.

    Algorithm: [Hunt and Szymanski, 1977].  Only the equality points (pairs
    of positions holding the same symbol in the two strings) are processed,
    and each one costs a single binary search in a table holding at most
    min(|s1|, |s2|) thresholds.  The equality points themselves are found
    by scanning the shortest string once per symbol of the longest one.

    Args:
        s1: First string (or list/tuple of symbols).
        s2: Second string (or list/tuple of symbols).

    Returns:
        The length of a longest common subsequence of ``s1`` and ``s2``
        (0 when either of them is empty).

    >>> similarity_Hunt_and_Szymanski('','abcd')
    0
    >>> similarity_Hunt_and_Szymanski('abcd','abcd')
    4
    >>> similarity_Hunt_and_Szymanski('abcd','wxyz')
    0
    >>> similarity_Hunt_and_Szymanski('abcd','wxabyd')
    3
    """
    from bisect import bisect_left

    # Let s1 be the shortest string: the threshold table never holds more
    # than len(s1) entries, so this keeps every binary search short.
    if len(s1) > len(s2):
        s1, s2 = s2, s1

    # threshold[k] is the smallest position in s1 at which a common
    # subsequence of length k + 1 (with the part of s2 read so far) can end.
    # The table is strictly increasing, which is what makes bisect valid.
    threshold = []
    right_to_left = range(len(s1) - 1, -1, -1)

    for symbol in s2:
        # Visit the equality points of this symbol from right to left so
        # that an entry updated for this symbol is never extended by the
        # very same occurrence in s2.
        for j in right_to_left:
            if s1[j] != symbol:
                continue
            k = bisect_left(threshold, j)  # threshold[k-1] < j <= threshold[k]
            if k == len(threshold):
                # j lies beyond every threshold: a longer subsequence exists.
                threshold.append(j)
            else:
                # A subsequence of length k + 1 can now end earlier (or at
                # the same place, in which case this is a no-op).
                threshold[k] = j

    # One threshold per achievable length, so the table size is the answer.
    return len(threshold)
def list_of_indices(c, s):
    """Return the list of indices of the occurrences of c in s.

    Args:
        c: The item to look for.  When ``s`` is a string, ``c`` may be a
            substring of any length.
        s: The sequence to search.  Lists and tuples are searched with
            ``index``; anything else is searched with ``find`` (strings in
            practice).

    Returns:
        The 0-based start positions of every occurrence, in ascending
        order.  For strings, overlapping occurrences are all reported
        because the search resumes one position after each hit.  An empty
        list is returned when there is no occurrence or ``s`` is empty.

    >>> list_of_indices('a', 'banana')
    [1, 3, 5]
    >>> list_of_indices(2, [1, 2, 3, 2])
    [1, 3]
    >>> list_of_indices('z', 'abc')
    []
    """
    result = []
    uses_index = isinstance(s, (list, tuple))
    size = len(s)
    start = 0
    while start < size:
        if uses_index:
            # index() takes a start offset, so no slice copy is needed.
            try:
                found = s.index(c, start)
            except ValueError:
                break
        else:
            found = s.find(c, start)
            if found == -1:
                break
        result.append(found)
        # Resume right after the hit (not after the whole match).
        start = found + 1
    return result
def look_for_threshold_index(j, threshold, left=None, right=None):
    """Look for k such that threshold[k-1] < j <= threshold[k].

    Algorithm: dichotomy (binary) search on the index range
    ``[left, right]``; ``threshold`` is expected to be non-decreasing on
    that range.

    The index ``left`` itself is never tested: the result is the smallest
    index ``k`` with ``left < k <= right`` and ``j <= threshold[k]``, or
    ``right`` when no such index exists.  When ``left == right`` that
    single index is returned.

    Args:
        j: The value to locate.
        threshold: The table to search.
        left: Lower bound of the search range; defaults to 0.
        right: Upper bound of the search range (inclusive); defaults to
            ``len(threshold) - 1``.

    Returns:
        The index described above.

    Raises:
        ValueError: If ``left`` is greater than ``right`` (in particular
            when ``threshold`` is empty and the bounds are defaulted).

    >>> look_for_threshold_index(4,[4])
    0
    >>> look_for_threshold_index(4,[0, 1, 2, 3, 4, 5, 6, 7])
    4
    >>> look_for_threshold_index(5,[0, 2, 4, 6, 8, 10, 12, 14])
    3
    """
    # Default each bound independently so that giving only one of them works.
    if left is None:
        left = 0
    if right is None:
        right = len(threshold) - 1
    if left > right:
        raise ValueError('Value in left higher than right')

    # Narrow the range until the two bounds are adjacent (or equal).
    while right - left > 1:
        mid = (left + right) // 2  # integer midpoint, no float round-trip
        if j <= threshold[mid]:
            right = mid
        else:
            left = mid
    return right