lcs.py 最长公共子序列算法

感觉用来匹配相似文件比最短编辑距离更靠谱,最短编辑距离应该是用来纠错的


http://www.unixuser.org/~euske/python/
这个网站还有不少好脚本

http://www.unixuser.org/~euske/python/lcs.py

zuroc@frodo ~/dev/douban $ cat lcs.py
#!/usr/bin/env python
# find an LCS (Longest Common Subsequence).
# *public domain*

def find_lcs_len(s1, s2):
    """Return the length of the longest common subsequence of s1 and s2.

    Both arguments are indexable sequences (typically strings) whose
    elements are compared with ``==``.  If either sequence is empty the
    result is 0.

    The table is padded with a leading zero row and column, so boundary
    cells never need special-casing and no negative index is ever used.
    Only the previous row is kept, since each cell depends solely on its
    upper, left and upper-left neighbours.
    """
    # prev[j] holds the LCS length of the s1 prefix processed so far
    # against s2[:j]; index 0 is the zero padding for the empty prefix.
    prev = [0] * (len(s2) + 1)
    for item1 in s1:
        cur = [0]
        for j, item2 in enumerate(s2):
            if item1 == item2:
                # Extend the best subsequence ending before both items.
                cur.append(prev[j] + 1)
            else:
                # Carry over the better of "skip item1" / "skip item2".
                cur.append(max(prev[j + 1], cur[j]))
        prev = cur
    return prev[-1]

def find_lcs(s1, s2):
    """Return one longest common subsequence of s1 and s2 as a string.

    s1 and s2 are indexable sequences of strings (typically two ``str``
    values); the selected elements of s1 are concatenated with
    ``''.join``.  If either input is empty, or nothing is shared, the
    empty string is returned.

    When several subsequences share the maximal length, ties during the
    backward walk are resolved by stepping back in s1 unless the cell to
    the left (one step back in s2) is strictly larger.
    """
    rows, cols = len(s1), len(s2)
    # Length table padded with a zero row and column:
    # m[i][j] is the LCS length of s1[:i] and s2[:j].
    # The padding makes the boundary explicit, so no index ever goes
    # negative and wraps around to the far side of the table.
    m = [[0] * (cols + 1) for _ in range(rows + 1)]
    for i in range(1, rows + 1):
        for j in range(1, cols + 1):
            if s1[i - 1] == s2[j - 1]:
                m[i][j] = m[i - 1][j - 1] + 1
            else:
                m[i][j] = max(m[i - 1][j], m[i][j - 1])

    # Walk the table backwards from the bottom-right corner, collecting
    # matched elements; stop as soon as either prefix is exhausted.
    picked = []
    i, j = rows, cols
    while i > 0 and j > 0:
        if s1[i - 1] == s2[j - 1]:
            picked.append(s1[i - 1])
            i -= 1
            j -= 1
        elif m[i - 1][j] < m[i][j - 1]:
            j -= 1
        else:
            i -= 1
    # Elements were gathered last-to-first.
    picked.reverse()
    return ''.join(picked)

if __name__ == '__main__':
    # Small demo: print one LCS of a sample pair, then its length.
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(find_lcs('abcoisjf', 'axbaoeijf'))
    print(find_lcs_len('abcoisjf', 'axbaoeijf'))
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值