MultipleSequenceBlast.java
多序列比对
在这里package com.itheima.read;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.function.BiConsumer;
public class MultipleSequenceBlast {
private Map<String, String> map = new HashMap<>();
//迭代器
private Iterator<String> mapIt = map.keySet().iterator();
public MultipleSequenceBlast() {
}
public MultipleSequenceBlast(Map<String, String> map) {
this.map = map;
}
/**
* 获取
*
* @return map
*/
public Map<String, String> getMap() {
return map;
}
/**
* 设置
*
* @param map
*/
public void setMap(String filePath) {
//读取文本到字符串data中
Path path = Paths.get(filePath);
List<String> lines = null;
try {
lines = Files.readAllLines(path);
} catch (IOException e) {
e.printStackTrace();
}
//文本中带>的为标题行,改行下面不带>的为内容行,把标题行处理完之后作为hashMap的key,其下面的内容行作为hashMap的value
//去除文本上游不合法的行
int start = lines.size();
for (int i = 0; i < lines.size(); i++) {
if (lines.get(i).contains(">")) {
start = i;
break;
}
}
//给map添加内容,
String temKey = "";
StringBuilder temValue = new StringBuilder();
for (int i = start; i < lines.size(); i++) {
if (lines.get(i).contains(">")) {
if (i != start) {//给map添加内容
//temValue转换成字符串,并清除\t,\n,空格,并和temKey一起添加到map中
map.put(temKey, temValue.toString().replaceAll("\t", "").replaceAll("\n", "").replaceAll(" ", ""));
}
//初始化temKey和temValue
temKey = lines.get(i).substring(lines.get(i).indexOf(">") + 1).strip();
temValue = new StringBuilder();
} else {
//把标题行下的内容添加到temValue中,
temValue.append(lines.get(i));
}
}
map.put(temKey, temValue.toString().replaceAll("\t", "").replaceAll("\n", "").replaceAll(" ", ""));
}
public void printAllMap() {
//打印map所有数据
Set<Map.Entry<String, String>> entries = map.entrySet();
for (Map.Entry<String, String> entry : entries) {
String key = entry.getKey();
String value = entry.getValue();
System.out.println(key);
System.out.println(value);
System.out.println("---------------------------------");
}
}
public void printMap(String key) {
System.out.println(map.get(key));
}
public void printMap() {
String s;
if (mapIt.hasNext()) {
s = mapIt.next();
} else {
mapIt = map.keySet().iterator();
s = mapIt.next();
}
System.out.println(s);
System.out.println(map.get(s));
System.out.println("-------------------");
}
private int score(char c1, char c2) {
//blast算法的打分函数
int score = 0;
if (c1 == c2) {
score += 2;
} else if (("" + c1 + c2).equals("AC") || ("" + c1 + c2).equals("GT") || ("" + c1 + c2).equals("CA") || ("" + c1 + c2).equals("TG")) {
score -= 5;
} else {
score -= 7;
}
return score;
}
private int getMax(int a, int b, int c) {
//求三个数的最大值
int max = a;
if (b > max) {
max = b;
}
if (c > max) {
max = c;
}
return max;
}
public ArrayList<Object> blast(String seq1, String seq2, int min_score) {
/*三个参数分别表示第一条序列,第二条序列,和满足要求的最小得分,当最小得分小于0时,表示不限制最小得分*/
//双序列比对算法,在序列两端插入和缺失不扣分
seq1 = seq1==null?"":seq1;
seq2 = seq2==null?"":seq2;
int l1 = seq1.length();
int l2 = seq2.length();
int GAP = -5;//-5 for any gap
/*
* #打分矩阵,矩阵中的每一个点储存着到这个点的最高得分
*储存着最优路径中上一个点到达这个点的方式,seq1在矩阵上面,seq2在矩阵左边,
*1表示通过对角线,
*2表示通过从做到右移动(即seq2在相应位置加一个空格),
*3表示从上到下移动(即在seq1的相应位置加一个空格)
* */
ArrayList<ArrayList<Integer>> scores = new ArrayList<>();
/*
*把两个序列插入不同空格的得分放入scores中,没个空格减5分
*scores[0]是一个列表,列表中储存着seq2插入不同空格的得分值,
*scores[1:l2+1]都是列表,列表中储存着
*seq1插入不同空格的得分值
*注意:seq1插入的空格数最多为len(seq2),seq2插入的空格最多为len(seq1)
*因为如果插入空格使seq1或seq2的长度超过len(seq1)+len(seq2),就会出现空格对空格的形式
* */
ArrayList<ArrayList<Integer>> point = new ArrayList<>();
ArrayList<Integer> line1;
ArrayList<Integer> line2;
for (int j = 0; j < l2 + 1; j++) {
if (j == 0) {
line1 = new ArrayList<>();
line2 = new ArrayList<>();
line1.add(Integer.valueOf(0));
line2.add(Integer.valueOf(0));
for (int i = 1; i < l1 + 1; i++) {
/*
*修改了blast打分算法line1.append(GAP*i)换成line1.append(0),
*即在序列起始位置添加空格不扣分
*/
line1.add(Integer.valueOf(GAP*i));
line2.add(Integer.valueOf(2));
}
} else {
line1 = new ArrayList<>();
line2 = new ArrayList<>();
/*
*修改了blast打分算法line1.append(GAP*j)换成line1.append(0),
*即在序列起始位置添加空格不扣分
*/
line1.add(Integer.valueOf(GAP*j));
line2.add(Integer.valueOf(3));
}
scores.add(line1);
point.add(line2);
}
/*fill the blank of scores and point
*计算出得分矩阵的值和point矩阵的值,在两端添加空格不罚分
* */
for (int j = 1; j < l2 + 1; j++) {
char letter2 = seq2.charAt(j - 1);
for (int i = 1; i < l1 + 1; i++) {
char letter1 = seq1.charAt(i - 1);
//对角线得分
int diagonalScore = score(letter1, letter2) + scores.get(j - 1).get(i - 1);
int leftScore;
leftScore = GAP + scores.get(j).get(i - 1);//l1加一个空格之后的得分,向右移动
int upScore;
upScore = GAP + scores.get(j - 1).get(i);//l2加一个空格之后的得分,向下移动
int maxScore = getMax(upScore, leftScore, diagonalScore);
scores.get(j).add(Integer.valueOf(maxScore));
if (scores.get(j).get(i) == diagonalScore) {
point.get(j).add(Integer.valueOf(1));
} else if (scores.get(j).get(i) == leftScore) {
point.get(j).add(Integer.valueOf(2));
} else {
point.get(j).add(Integer.valueOf(3));
}
}
}
//trace back
StringBuilder sb1 = new StringBuilder();
StringBuilder sb2 = new StringBuilder();
int i = l2;
int j = l1;
int finalScore;
if (scores.get(i).get(j) > min_score || min_score < 0) {
//最终得分
finalScore = scores.get(i).get(j);
} else {
//最终得分
finalScore = scores.get(i).get(j);
ArrayList<Object> res = new ArrayList<>();
res.add(null);
res.add(null);
res.add(Integer.valueOf(finalScore));
return res;
}
//获取最优路径,并把插入空格的序列输出
while (true) {
if (point.get(i).get(j) == 0) {
break;
} else if (point.get(i).get(j) == 1) {
sb1.append(seq1.charAt(j - 1));
sb2.append(seq2.charAt(i - 1));
i--;
j--;
} else if (point.get(i).get(j) == 2) {
sb1.append(seq1.charAt(j - 1));
sb2.append('.');
j--;
} else {
sb1.append('.');
sb2.append(seq2.charAt(i - 1));
i--;
}
}
String alignment1 = sb1.reverse().toString();
String alignment2 = sb2.reverse().toString();
ArrayList<Object> res = new ArrayList<>();
res.add(alignment1);
res.add(alignment2);
res.add(Integer.valueOf(finalScore));
//返回第一条片段,第二条片段,以及最终得分
return res;
}
public ArrayList<Object> seqMerge(String seq1, String seq2, int min_score) {
/*三个参数分别表示第一条序列,第二条序列,和满足要求的最小得分,当最小得分小于0时,表示不限制最小得分*/
//双序列比对算法,在序列两端插入和缺失不扣分
seq1 = seq1==null?"":seq1;
seq2 = seq2==null?"":seq2;
int l1 = seq1.length();
int l2 = seq2.length();
int GAP = -5;//-5 for any gap
/*
* #打分矩阵,矩阵中的每一个点储存着到这个点的最高得分
*储存着最优路径中上一个点到达这个点的方式,seq1在矩阵上面,seq2在矩阵左边,
*1表示通过对角线,
*2表示通过从做到右移动(即seq2在相应位置加一个空格),
*3表示从上到下移动(即在seq1的相应位置加一个空格)
* */
ArrayList<ArrayList<Integer>> scores = new ArrayList<>();
/*
*把两个序列插入不同空格的得分放入scores中,没个空格减5分
*scores[0]是一个列表,列表中储存着seq2插入不同空格的得分值,
*scores[1:l2+1]都是列表,列表中储存着
*seq1插入不同空格的得分值
*注意:seq1插入的空格数最多为len(seq2),seq2插入的空格最多为len(seq1)
*因为如果插入空格使seq1或seq2的长度超过len(seq1)+len(seq2),就会出现空格对空格的形式
* */
ArrayList<ArrayList<Integer>> point = new ArrayList<>();
ArrayList<Integer> line1;
ArrayList<Integer> line2;
for (int j = 0; j < l2 + 1; j++) {
if (j == 0) {
line1 = new ArrayList<>();
line2 = new ArrayList<>();
line1.add(Integer.valueOf(0));
line2.add(Integer.valueOf(0));
for (int i = 1; i < l1 + 1; i++) {
/*
*修改了blast打分算法line1.append(GAP*i)换成line1.append(0),
*即在序列起始位置添加空格不扣分
*/
line1.add(Integer.valueOf(0));
line2.add(Integer.valueOf(2));
}
} else {
line1 = new ArrayList<>();
line2 = new ArrayList<>();
/*
*修改了blast打分算法line1.append(GAP*j)换成line1.append(0),
*即在序列起始位置添加空格不扣分
*/
line1.add(Integer.valueOf(0));
line2.add(Integer.valueOf(3));
}
scores.add(line1);
point.add(line2);
}
/*fill the blank of scores and point
*计算出得分矩阵的值和point矩阵的值,在两端添加空格不罚分
* */
for (int j = 1; j < l2 + 1; j++) {
char letter2 = seq2.charAt(j - 1);
for (int i = 1; i < l1 + 1; i++) {
char letter1 = seq1.charAt(i - 1);
//对角线得分
int diagonalScore = score(letter1, letter2) + scores.get(j - 1).get(i - 1);
//如果在最后一行,向右移动不罚分,其他情况罚5分
int leftScore;
if (j == l2) {
leftScore = scores.get(j).get(i - 1);//l1加一个空格之后的得分,向右移动
} else {
leftScore = GAP + scores.get(j).get(i - 1);//l1加一个空格之后的得分,向右移动
}
//如果在最后一列,向下移动不罚分
int upScore;
if (i == l1) {
upScore = scores.get(j - 1).get(i);//l2加一个空格之后的得分,向下移动
} else {
upScore = GAP + scores.get(j - 1).get(i);//l2加一个空格之后的得分,向下移动
}
int maxScore = getMax(upScore, leftScore, diagonalScore);
scores.get(j).add(Integer.valueOf(maxScore));
if (scores.get(j).get(i) == diagonalScore) {
point.get(j).add(Integer.valueOf(1));
} else if (scores.get(j).get(i) == leftScore) {
point.get(j).add(Integer.valueOf(2));
} else {
point.get(j).add(Integer.valueOf(3));
}
}
}
//trace back
StringBuilder sb1 = new StringBuilder();
StringBuilder sb2 = new StringBuilder();
int i = l2;
int j = l1;
int finalScore;
if (scores.get(i).get(j) > min_score || min_score < 0) {
//最终得分
finalScore = scores.get(i).get(j);
} else {
//最终得分
finalScore = scores.get(i).get(j);
ArrayList<Object> res = new ArrayList<>();
res.add(null);
res.add(null);
res.add(Integer.valueOf(finalScore));
return res;
}
//获取最优路径,并把插入空格的序列输出
while (true) {
if (point.get(i).get(j) == 0) {
break;
} else if (point.get(i).get(j) == 1) {
sb1.append(seq1.charAt(j - 1));
sb2.append(seq2.charAt(i - 1));
i--;
j--;
} else if (point.get(i).get(j) == 2) {
sb1.append(seq1.charAt(j - 1));
sb2.append('.');
j--;
} else {
sb1.append('.');
sb2.append(seq2.charAt(i - 1));
i--;
}
}
String alignment1 = sb1.reverse().toString();
String alignment2 = sb2.reverse().toString();
ArrayList<Object> res = new ArrayList<>();
res.add(alignment1);
res.add(alignment2);
res.add(Integer.valueOf(finalScore));
//返回第一条片段,第二条片段,以及最终得分
return res;
}
private String[][] getStringList(String[] res0, String blastSeq1, String blastSeq2) {
//函数功能获取序列字符串的列表形式
int index = 0;
String[] res1 = new String[res0.length];
if (blastSeq1.charAt(0) == '.') {
res0[0] = ".";
res1[0] = String.valueOf(blastSeq2.charAt(0));
index++;
} else {
res1[1] = String.valueOf(blastSeq2.charAt(0));
index = index + 2;
}
for (int i = 1; i < blastSeq1.length(); i++) {
if (blastSeq1.charAt(i) == '.') {
if (res0[index - 1].equals(".")) {
res1[index-1] = res1[index-1] + blastSeq2.charAt(i);
} else {
res0[index] = ".";
res1[index] = String.valueOf(blastSeq2.charAt(i));
index++;
}
} else {
if (res0[index - 1].equals(".")) {
res1[index] = String.valueOf(blastSeq2.charAt(i));
index++;
} else {
index++;
res1[index] = String.valueOf(blastSeq2.charAt(i));
index++;
}
}
}
String[][] res = new String[2][];
res[0] = res0;
res[1] = res1;
//函数功能:获取
return res;
}
public String[] multipleSequenceBlast(String[] str,boolean isCallbackFunction1st) {
//参数1:表示序列的字符串数组,参数2:表示回调函数是不是第一次回调
Date startDate = new Date();
SimpleDateFormat dateFormat= new SimpleDateFormat("yyyy-MM-dd :hh:mm:ss");
//只有第一次调用多序列比对时才打印
if(isCallbackFunction1st) {
System.out.println("多序列比对开始时间: " + dateFormat.format(startDate));
}
//多序列比对算法,以str第一条序列为参考序列
String[][] res = new String[str.length][];
//生成参考序列模版,一个字符串数组,内容为["","字符串str[0]字符","","字符串str[0]字符",""]
//确定第一个不是空的字符串
int start=0;
for (int i = 0; i < str.length; i++) {
if(str[i]!=null&&!str[i].equals(".")){
start = i;
break;
}
}
//生成模版序列的字符串数组
res[start] = new String[2 * str[start].length() + 1];
for (int i = 0; i < str[start].length(); i++) {
res[start][i * 2 + 1] = String.valueOf(str[start].charAt(i));
}
//生成模版序列以上序列的字符串数组
for (int i = 0; i < start; i++) {
res[i] = new String[res[start].length];
}
//生成模版序列以下序列的字符串数组
for (int i = start+1; i < str.length; i++) {
ArrayList<Object> tem = blast(str[start], str[i], -1);
String seq1 = (String) tem.get(0);
String seq2 = (String) tem.get(1);
res[i] = getStringList(res[start], seq1, seq2)[1];
//当第一次调用多序列比对函数时,每比对完一条序列,打印一次时间信息
if(isCallbackFunction1st) {
System.out.println("比对完" + (i - start) + "条序列,已经用时" + (new Date().getTime() / 1000-startDate.getTime()/1000) + " s");
}
}
//递归调用自身
for (int i = 0; i < res[start].length; i++) {
if(res[start][i]!=null&&res[start][i].equals(".")){
String[] tem = new String[res.length];
for (int j = 0; j < tem.length; j++) {
tem[j] = res[j][i];
}
String[] sonAns = multipleSequenceBlast(tem,false);
for (int j = 0; j < res.length; j++) {
res[j][i] = sonAns[j];
}
}
}
//遍历res,并生成字符串数组ans
String[] ans = new String[str.length];
for (int i = start; i < res.length; i++) {
StringBuilder sb = new StringBuilder();
for (int j = 0; j < res[i].length; j++) {
if(res[i][j]!=null){
sb.append(res[i][j]);
}
}
ans[i] = sb.toString();
}
for (int i = 0; i < start; i++) {
StringBuilder sb = new StringBuilder();
for (int j = 0; j < ans[start].length(); j++) {
sb.append(".");
}
ans[i]= sb.toString();
}
Date endDate = new Date();//多序列比对结束时间
if (isCallbackFunction1st) {//只有第一次调用多序列比对时才打印
System.out.println("多序列比对结束时间: " + dateFormat.format(endDate));
System.out.println("多序列比对运行时间: " + (endDate.getTime() / 1000 - startDate.getTime() / 1000) + " s");
}
return ans;
}
public String[] mapToString(Map<String,String> map){
String[] res=new String[map.size()];
int index = 0;
Set<Map.Entry<String,String>> entries = map.entrySet();
for(Map.Entry<String,String> entry:entries){
String key = entry.getKey();
String value = entry.getValue();
res[index] = value;
index++;
}
// Arrays.sort(res, new Comparator<String[]>() {
// @Override
// public int compare(String[] o1, String[] o2) {
// int res = o1[0].length()-o2[0].length();//按长短从小到大排序
// res = res==0?o1[0].compareTo(o2[0]):res;
// return res;
// }
// });
return res;
}
}
插入代码片
使用该类
package com.itheima.read;
import jdk.swing.interop.SwingInterOpUtils;
import java.io.FileNotFoundException;
import java.io.PrintWriter;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Date;
public class Test {
public static void main(String[] args) throws FileNotFoundException {
MultipleSequenceBlast ms = new MultipleSequenceBlast();
ms.setMap("txtread\\src\\com\\itheima\\read\\05g.txt");
String[] s =ms.mapToString(ms.getMap());
System.out.println(s.length);
String[] res =ms.multipleSequenceBlast(s,true);
System.out.println(res.length);
ArrayList<Character>[] res1 = new ArrayList[res.length];
ArrayList<Integer> num1 = new ArrayList<>();
//提取SNP
for (int i = 0; i < res[0].length(); i++) {
for (int j = 0; j < res.length; j++) {
if(res[0].charAt(i)!=res[j].charAt(i)){
num1.add(i+1);
for (int k = 0; k < res1.length; k++) {
if(res1[k]==null){
res1[k] = new ArrayList<>();
}
res1[k].add(res[k].charAt(i));
}
break;
}
}
}
//打印数据到窗口以及文本中
PrintWriter out = new PrintWriter("txtread\\src\\com\\itheima\\read\\05G_OUT.txt");
for (int i = 0; i < num1.size(); i++) {
System.out.print(num1.get(i)+"\t");
out.print(num1.get(i)+"\t");
}
System.out.println();
out.println();
for (int i = 0; i < res1.length; i++) {
for (int j = 0; j < res1[i].size(); j++) {
System.out.print(res1[i].get(j)+"\t");
out.print(res1[i].get(j)+"\t");
}
System.out.println();
out.println();
}
out.close();
}
public static void printStringList(String[][] str){
for (int i = 0; i < str.length; i++) {
for (int j = 0; j < str[i].length; j++) {
System.out.print(str[i][j]+"\t");
}
System.out.println();
}
}
public static void printStringList(String[] str){
for (int i = 0; i < str.length; i++) {
for (int j = 0; j < str[i].length(); j++) {
System.out.print(str[i].charAt(j)+"\t");
}
System.out.println();
}
}
}
数据05g.txt格式为fasta格式,格式如下:
>1
GCATCATTCTTTCCGTGTTCACAGTTGTTTGATCGCATATTCTGAACAAGATATTCATTG
TTTGGTTGGCTGACAAATTTGACCTATAAAGAGATTTTATTTTTCTATTTCATTTCTCAC
ATTCATGACCGGACTTGAAAGGAAGATTTCAAAATCTTTTGGAGATCTAGTTTAAATTTT
>2
GCATCATTCTTTCCGTGTTCACAGTTGTTTGATCGCATATTCTGAACAAGATATTCATTG
TTTGGTTGGCTGACAAATTTGACCTATAAAGAGATTTTATTTTTCTATTTCATTTCTCAC
ATTCATGACCGGACTTGAAAGGAAGATTTCAAAATCTTTTGGAGATCTAGTTTAAATTTT
>3
GCATCATTCTTTCCGTGTTCACAGTTGTTTGATCGCATATTCTGAACAAGATATTCATTG
TTTGGTTGGCTGACAAATTTGACCTATAAAGAGATTTTATTTTTCTATTTCATTTCTCAC
ATTCATGACCGGACTTGAAAGGAAGATTTCAAAATCTTTTGGAGATCTAGTTTAAATTTT