史密斯沃特曼算法
史密斯-沃特曼算法(Smith-Waterman algorithm)是一种进行局部序列比对(相对于全局比对)的算法,用于找出两个核苷酸序列或蛋白质序列之间的相似区域。该算法的目的不是进行全序列的比对,而是找出两个序列中具有高相似度的片段。 该算法由坦普尔·史密斯(Temple F. Smith)和迈克尔·沃特曼(Michael S. Waterman)于1981年提出。史密斯-沃特曼算法是尼德曼-翁施算法的一个变体,二者都是动态规划算法。这一算法的优势在于可以在给定的打分方法下找出两个序列的最优的局部比对(打分方法使用了置换矩阵和空位罚分)。该算法和尼德曼-翁施算法的主要区别在于该算法不存在负分(负分被替换为零),因此局部比对成为可能。回溯从分数最高的矩阵元素开始,直到遇到分数为零的元素停止 代码如下: 回溯过程自己想法写的,如有bug,欢迎指正!
@Slf4j
public class Demo2 {
private static int SPACE = - 2 ;
private static int MATCH = 9 ;
private static int DISMACH = - 6 ;
private final String original, target;
public static void main ( String [ ] args) {
String str1 = "TGACCGATGACCCCGGTTCAGGCTTCACCACAGTGTGGAACGCCTAGCATACTAGCATGCATCATAGCTAATGACCAGGTAGTCATGCACACGTGTACTGCCCGAGCGCGCCGCCGTCA" ;
String str2 = "TGACCGATGACCCCGGTTCAGGCTTCACCACAGTGTGGAACGCCTAGCATACTAGCATGCATCATAGCTAATGACCAGGTAGTCATGCACACGTGTACT" ;
smithWater ( str1. toUpperCase ( ) , str2. toUpperCase ( ) ) ;
}
public Demo2 ( String original, String target) {
this . original = "-" + original. toUpperCase ( ) ;
this . target = "-" + target. toUpperCase ( ) ;
}
private static void smithWater ( String str1, String str2) {
int len1 = str1. length ( ) ;
int len2 = str2. length ( ) ;
int [ ] [ ] dif = new int [ len1 + 1 ] [ len2 + 1 ] ;
for ( int a = 0 ; a <= len1; a++ ) {
dif[ a] [ 0 ] = 0 ;
}
for ( int a = 0 ; a <= len2; a++ ) {
dif[ 0 ] [ a] = 0 ;
}
for ( int i = 1 ; i <= len1; i++ ) {
for ( int j = 1 ; j <= len2; j++ ) {
if ( str1. charAt ( i - 1 ) == str2. charAt ( j - 1 ) ) {
dif[ i] [ j] = Math . max ( 0 , Math . max ( dif[ i - 1 ] [ j - 1 ] + MATCH , Math . max ( dif[ i - 1 ] [ j] + SPACE , dif[ i] [ j - 1 ] + SPACE ) ) ) ;
} else {
dif[ i] [ j] = Math . max ( 0 , Math . max ( dif[ i - 1 ] [ j - 1 ] + DISMACH , Math . max ( dif[ i - 1 ] [ j] + SPACE , dif[ i] [ j - 1 ] + SPACE ) ) ) ;
}
}
}
int longest = 0 ;
int iL = 0 , jL = 0 ;
for ( int i = 1 ; i <= len1; i++ ) {
for ( int j = 0 ; j <= len2; j++ ) {
if ( dif[ i] [ j] > longest) {
longest = dif[ i] [ j] ;
iL = i;
jL = j;
}
}
}
log. info ( "最高得分是:" + longest+ ",下标为:i=" + iL+ "j=" + jL) ;
int extent = iL;
int extentTrue = jL;
int [ ] [ ] path = new int [ len1 + 1 ] [ len2 + 1 ] ;
path[ iL] [ jL] = dif[ iL] [ jL] ;
while ( dif[ iL] [ jL] != 0 ) {
if ( Math . max ( dif[ iL - 1 ] [ jL - 1 ] , Math . max ( dif[ iL - 1 ] [ jL] , dif[ iL] [ jL - 1 ] ) ) == dif[ iL - 1 ] [ jL - 1 ] ) {
path[ iL - 1 ] [ jL - 1 ] = dif[ iL - 1 ] [ jL - 1 ] ;
iL = iL - 1 ;
jL = jL - 1 ;
if ( dif[ iL] [ jL] == 0 ) {
log. info ( "第一个匹配成功的下标1是:i=" + ( iL+ 1 ) + "j=" + ( jL+ 1 ) + "值是:" + dif[ iL+ 1 ] [ jL+ 1 ] ) ;
if ( iL == 0 && jL == 0 ) {
} else {
if ( iL+ 1 > jL+ 1 ) {
extentTrue = extentTrue- ( jL+ 1 ) ;
} else {
extent = extent- ( iL+ 1 ) ;
}
}
log. info ( "extent=" + extent+ ",extentTrue=" + extentTrue) ;
}
} else if ( Math . max ( dif[ iL - 1 ] [ jL - 1 ] , Math . max ( dif[ iL - 1 ] [ jL] , dif[ iL] [ jL - 1 ] ) ) == dif[ iL] [ jL - 1 ] ) {
path[ iL] [ jL - 1 ] = dif[ iL] [ jL - 1 ] ;
jL = jL - 1 ;
if ( dif[ iL] [ jL] == 0 ) {
log. info ( "第一个匹配成功的下标2是:i=" + iL+ "j=" + ( jL+ 1 ) + "值是:" + dif[ iL] [ jL+ 1 ] ) ;
if ( iL == 0 && jL == 0 ) {
} else {
if ( iL > jL+ 1 ) {
extentTrue = extentTrue- ( jL+ 1 ) ;
} else {
extent = extent- iL;
}
}
log. info ( "extent=" + extent+ ",extentTrue=" + extentTrue) ;
}
} else {
path[ iL - 1 ] [ jL] = dif[ iL - 1 ] [ jL] ;
iL = iL - 1 ;
if ( dif[ iL] [ jL] == 0 ) {
log. info ( "第一个匹配成功的下标3是:i=" + ( iL+ 1 ) + "j=" + jL+ "值是:" + dif[ iL+ 1 ] [ jL] ) ;
if ( iL == 0 && jL == 0 ) {
} else {
if ( iL+ 1 > jL) {
extentTrue = extentTrue- jL;
} else {
extent = extent- ( iL+ 1 ) ;
}
}
log. info ( "extent=" + extent+ ",extentTrue=" + extentTrue) ;
}
}
}
int n = 0 ;
for ( int i = 0 ; i < len1 + 1 ; i++ )
{
for ( int j = 0 ; j < len2+ 1 ; j++ ) {
if ( 0 != path[ i] [ j] ) n++ ;
}
}
log. info ( "匹配的长度n是:" + n) ;
if ( extent > extentTrue) {
log. info ( "匹配的长度是:" + n) ;
log. info ( "匹配成功的长度是:" + extentTrue) ;
float similarity = ( float ) extentTrue / n;
log. info ( "匹配率是:" + similarity) ;
} else {
log. info ( "匹配的长度是:" + n) ;
log. info ( "匹配成功的长度是:" + extent) ;
float similarity = ( float ) extent / n;
log. info ( "匹配率是:" + similarity) ;
}
}
}