参考:算法导论中文版和课程讲义
定义声明:最长公共子串要求在原字符串中连续,而子序列只需要保持相对顺序一致即可,并不要求连续。
子序列定义
问题描述:
枚举法
1:描述一个最长公共子序列
2:一个递归解
3:计算LCS的长度
编写python3代码如下:
# coding=utf-8
"""
Created on 2019.2.14
@author: 刘祥
参考:
"""
import numpy as np
from numpy import *#导入numpy的库函数
class LCS(object):
def __init__(self, x, y):
self.x = x
self.y = y
self.len_x = len(self.x)
self.len_y = len(self.y)
self.c = np.zeros((self.len_x+1, self.len_y+1), dtype="i").tolist()
self.b = np.zeros((self.len_x+1, self.len_y+1), dtype="a").tolist()
self.lcs = ""
def lcs_length(self):
for i in range(1, self.len_x+1):
for j in range(1, self.len_y+1):
if self.x[i-1] == self.y[j-1]: #注意这里的下标得减1
self.c[i][j] = self.c[i-1][j-1]+1
self.b[i][j] = '↖'
elif self.c[i-1][j] >= self.c[i][j-1]:
self.c[i][j] = self.c[i-1][j]
self.b[i][j] = '↑'
else:
self.c[i][j] = self.c[i][j-1]
self.b[i][j] = '←'
print(mat(self.c))
print(mat(self.b))
def get_lcs(self):
reverse_lcs = ""
reverse_lcs2 = ''
i = self.len_x
j = self.len_y
while i > 0 and j > 0:
if self.b[i][j] == '↖':
i -= 1
j -= 1
reverse_lcs += self.x[i-1] #其实此时的self.x[i-1]=self.y[j-1]
reverse_lcs2 += self.y[j-1]
elif self.b[i][j] == '↑':
i -= 1
elif self.b[i][j] == '←':
j -= 1
# else:
# break
x = 'ACCGGTCGAGTGCGCGGAAGCCG'
y = 'GTCGTTCGGAATGCCGTT'
test = LCS(x,y)
test.lcs_length()
运行结果:
C:\Users\liuxiang15\Desktop\homework3>python longest_common_subsequence.py
[[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[ 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1]
[ 0 0 0 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2]
[ 0 0 0 1 1 1 1 2 2 2 2 2 2 2 2 3 3 3 3]
[ 0 1 1 1 2 2 2 2 3 3 3 3 3 3 3 3 4 4 4]
[ 0 1 1 1 2 2 2 2 3 4 4 4 4 4 4 4 4 4 4]
[ 0 1 2 2 2 3 3 3 3 4 4 4 5 5 5 5 5 5 5]
[ 0 1 2 3 3 3 3 4 4 4 4 4 5 5 6 6 6 6 6]
[ 0 1 2 3 4 4 4 4 5 5 5 5 5 6 6 6 7 7 7]
[ 0 1 2 3 4 4 4 4 5 5 6 6 6 6 6 6 7 7 7]
[ 0 1 2 3 4 4 4 4 5 6 6 6 6 7 7 7 7 7 7]
[ 0 1 2 3 4 5 5 5 5 6 6 6 7 7 7 7 7 8 8]
[ 0 1 2 3 4 5 5 5 6 6 6 6 7 8 8 8 8 8 8]
[ 0 1 2 3 4 5 5 6 6 6 6 6 7 8 9 9 9 9 9]
[ 0 1 2 3 4 5 5 6 7 7 7 7 7 8 9 9 10 10 10]
[ 0 1 2 3 4 5 5 6 7 7 7 7 7 8 9 10 10 10 10]
[ 0 1 2 3 4 5 5 6 7 8 8 8 8 8 9 10 11 11 11]
[ 0 1 2 3 4 5 5 6 7 8 8 8 8 9 9 10 11 11 11]
[ 0 1 2 3 4 5 5 6 7 8 9 9 9 9 9 10 11 11 11]
[ 0 1 2 3 4 5 5 6 7 8 9 10 10 10 10 10 11 11 11]
[ 0 1 2 3 4 5 5 6 7 8 9 10 10 11 11 11 11 11 11]
[ 0 1 2 3 4 5 5 6 7 8 9 10 10 11 12 12 12 12 12]
[ 0 1 2 3 4 5 5 6 7 8 9 10 10 11 12 13 13 13 13]
[ 0 1 2 3 4 5 5 6 7 8 9 10 10 11 12 13 14 14 14]]
[['' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '']
['' '↑' '↑' '↑' '↑' '↑' '↑' '↑' '↑' '↑' '↖' '↖' '←' '←' '←' '←' '←' '←'
'←']
['' '↑' '↑' '↖' '←' '←' '←' '↖' '←' '←' '↑' '↑' '↑' '↑' '↖' '↖' '←' '←'
'←']
['' '↑' '↑' '↖' '↑' '↑' '↑' '↖' '←' '←' '←' '←' '←' '←' '↖' '↖' '←' '←'
'←']
['' '↖' '←' '↑' '↖' '←' '←' '↑' '↖' '↖' '←' '←' '←' '↖' '←' '↑' '↖' '←'
'←']
['' '↖' '↑' '↑' '↖' '↑' '↑' '↑' '↖' '↖' '←' '←' '←' '↖' '←' '←' '↖' '↑'
'↑']
['' '↑' '↖' '←' '↑' '↖' '↖' '←' '↑' '↑' '↑' '↑' '↖' '←' '←' '←' '←' '↖'
'↖']
['' '↑' '↑' '↖' '←' '↑' '↑' '↖' '←' '↑' '↑' '↑' '↑' '↑' '↖' '↖' '←' '←'
'←']
['' '↖' '↑' '↑' '↖' '←' '←' '↑' '↖' '↖' '←' '←' '↑' '↖' '↑' '↑' '↖' '←'
'←']
['' '↑' '↑' '↑' '↑' '↑' '↑' '↑' '↑' '↑' '↖' '↖' '←' '↑' '↑' '↑' '↑' '↑'
'↑']
['' '↖' '↑' '↑' '↖' '↑' '↑' '↑' '↖' '↖' '↑' '↑' '↑' '↖' '←' '←' '↖' '↑'
'↑']
['' '↑' '↖' '↑' '↑' '↖' '↖' '←' '↑' '↑' '↑' '↑' '↖' '↑' '↑' '↑' '↑' '↖'
'↖']
['' '↖' '↑' '↑' '↖' '↑' '↑' '↑' '↖' '↖' '↑' '↑' '↑' '↖' '←' '←' '↖' '↑'
'↑']
['' '↑' '↑' '↖' '↑' '↑' '↑' '↖' '↑' '↑' '↑' '↑' '↑' '↑' '↖' '↖' '←' '←'
'←']
['' '↖' '↑' '↑' '↖' '↑' '↑' '↑' '↖' '↖' '←' '←' '↑' '↖' '↑' '↑' '↖' '←'
'←']
['' '↑' '↑' '↖' '↑' '↑' '↑' '↖' '↑' '↑' '↑' '↑' '↑' '↑' '↖' '↖' '↑' '↑'
'↑']
['' '↖' '↑' '↑' '↖' '↑' '↑' '↑' '↖' '↖' '←' '←' '←' '↖' '↑' '↑' '↖' '←'
'←']
['' '↖' '↑' '↑' '↖' '↑' '↑' '↑' '↖' '↖' '↑' '↑' '↑' '↖' '↑' '↑' '↖' '↑'
'↑']
['' '↑' '↑' '↑' '↑' '↑' '↑' '↑' '↑' '↑' '↖' '↖' '←' '↑' '↑' '↑' '↑' '↑'
'↑']
['' '↑' '↑' '↑' '↑' '↑' '↑' '↑' '↑' '↑' '↖' '↖' '←' '←' '←' '↑' '↑' '↑'
'↑']
['' '↖' '↑' '↑' '↖' '↑' '↑' '↑' '↖' '↖' '↑' '↑' '↑' '↖' '←' '←' '↖' '↑'
'↑']
['' '↑' '↑' '↖' '↑' '↑' '↑' '↖' '↑' '↑' '↑' '↑' '↑' '↑' '↖' '↖' '←' '←'
'←']
['' '↑' '↑' '↖' '↑' '↑' '↑' '↖' '↑' '↑' '↑' '↑' '↑' '↑' '↖' '↖' '←' '←'
'←']
['' '↖' '↑' '↑' '↖' '↑' '↑' '↑' '↖' '↖' '↑' '↑' '↑' '↖' '↑' '↑' '↖' '←'
'←']]
根据输出第一个矩阵的最右下元素为14我们可以知道LCS的长度是14
4.构建一个LCS
在构建最大公共子序列部分我犯了一个巨大错误!!!
我认为因为每当发现一个新的共同尾元素后,c[i][j] = c[i-1][j-1]+1
所以我们只需要沿着一条路径分别将加入x[i-1]和y[j-1]导致c[i][j]变化的x[i-1]或y[j-1]连接起来即可得到最大公共子序列(LCS).
于是我分别按照图中所示两条路径进行LCS的组装并输出
添加的类成员函数如下
def get_lcs_by_cy(self):
i = self.len_y
reverse_lcs = ""
while(i >= 0):
i -= 1
if self.c[self.len_x][i] < self.c[self.len_x][i+1]:
reverse_lcs += self.y[i]
lcs = reverse_lcs[::-1]
print("in get_lcs_by_cy lcs of x and y is: "+lcs)
def get_lcs_by_cx(self):
i = self.len_x
reverse_lcs = ""
while(i >= 0):
i -= 1
if self.c[i][self.len_y] < self.c[i+1][self.len_y]:
reverse_lcs += self.x[i]
lcs = reverse_lcs[::-1]
print("in get_lcs_by_cx lcs of x and y is: "+lcs)
类外添加代码:
test.get_lcs_by_cx()
test.get_lcs_by_cy()
运行结果如下:
in get_lcs_by_cx lcs of x and y is: ACCGTCGTCGGCCG
in get_lcs_by_cy lcs of x and y is: GTCGTCGGAAGCCG
事实上只有第2个lcsGTCGTCGGAAGCCG满足要求,
这种方法并不能保证计算的结果是lcs,比如在函数get_lcs_by_cy中,即使self.c[self.len_x][i] < self.c[self.len_x][i+1]
也不能保证self.y[i]能与之前匹配的字符串连接起来,比如y[i]可能在之前匹配字符串的前面。
正确方法1:结合self.b矩阵来倒推一个LCS。
添加类成员函数
def get_lcs_by_b(self, i, j):
if i == 0 or j == 0:
return
if self.b[i][j] == '↖':
self.get_lcs_by_b(i-1, j-1)
self.lcs += self.x[i-1]
elif self.b[i][j] == '↑':
self.get_lcs_by_b(i-1, j)
else:
self.get_lcs_by_b(i, j-1)
if i == self.len_x and j == self.len_y:
print("in get_lcs_by_b lcs of x and y is: "+self.lcs)
类外添加测试代码
test.get_lcs_by_b(len(x), len(y))
运行输出:
in get_lcs_by_b lcs of x and y is: GTCGTCGGAAGCCG
正确方法2:结合self.c矩阵和字符串x,y来倒推一个LCS。
添加类成员函数
def get_lcs_by_candxy(self, i, j):
if i == 0 or j == 0:
return
if self.x[i-1] == self.y[j-1]:
self.get_lcs_by_candxy(i-1, j-1)
self.lcs += self.x[i-1]
elif self.c[i-1][j] >= self.c[i][j-1]:
self.get_lcs_by_candxy(i-1, j)
else:
self.get_lcs_by_candxy(i, j-1)
if i == self.len_x and j == self.len_y:
print("in get_lcs_by_candxy lcs of x and y is: "+self.lcs)
类外测试函数
test.get_lcs_by_candxy(len(x), len(y))
运行结果:
in get_lcs_by_candxy lcs of x and y is: GTCGTCGGAAGCCG
这两种获得lcs的时间复杂度都是O(len(x)+len(y))
总结与思考:
如何存储所有的最大公共子序列(LCS)???
欢迎大家评论指正!!!