基于Astral利用单拷贝同源基因构建物种树

该博客介绍了如何使用Python脚本在Linux环境下,通过proteinortho、muscle和Astral软件,自动化构建单拷贝同源蛋白的系统发育树。首先处理序列文本,去除空格,然后进行同源比对,提取单拷贝蛋白,整合序列,进行muscle比对,最后利用Astral构建物种树。整个过程包括序列预处理、比对、过滤、整合、比对、提取和建树等多个步骤。
摘要由CSDN通过智能技术生成

想介绍的都在之前的文章里了构建单拷贝同源蛋白系统发育树,一条命令提序列!
不同的是,之前是将得到的单拷贝同源基因比对后进行了串联,每个物种都得到一个很大的序列,然后进行建树;现在是使用并联的方法,是将每个单拷贝同源基因集比对后建树,然后再利用Astral构建了物种树,在之前的脚本上进行了功能扩充,输入的命令不变,只是最后直接多了一个*.gene.species.tree的树文件,树都建好了。

##构建单拷贝同源蛋白, python3
##在linux下运行,需要安装proteinortho与muscle,proteinortho安装地址:http://www.bioinf.uni-leipzig.de/Software/proteinortho/,muscle地址:http://www.drive5.com/muscle/。
##需要安装Astral,地址:https://github.com/smirarab/ASTRAL
##作者:刘宁华  山东大学青岛校区  1039438318@qq.com

#1. 处理序列文本
def chuli_seq(in_file_0):
	import os
	path = os.getcwd()
	path = path + "/" + in_file_0 + "/"
	files = os.listdir(path)
	file_path = []
	for file in files:
		file_path_1 = path + file
		file_path.append(file_path_1)
	for file in file_path:
		cmd = "sed -i 's/ /_/g' " + file
		os.system(cmd)
	return print("1. 序列预处理")


#2. 运行同源比对命令
def protein_ortho(file):
	import os
	cmd = "proteinortho6.pl -project=" + file + "_work -e=1e-5 -cov=50 -identity=50 -clean " + file + "/*.faa"
	os.system(cmd)
	return print("2. 完成同源比对")


#3. 比对结果处理,提取单拷贝蛋白结果
def guoLv_result(in_file_1, num):
	in_file_1 = open(in_file_1, 'r')
	out_file_1 = open("work_ortho", 'w')
	for line in in_file_1:
		if line.split('\t')[0] == line.split('\t')[1] == num:
			out_file_1.write(line)
	out_file_1.close()
	return print("3. 完成文件过滤")


#4. 整合全基因组蛋白序列为一个文件whole_protein
def get_whole_protein(file_name):
	import os
	cmd = "cd " + file_name +" ; cat ./*.faa >>  whole_protein ; mv whole_protein .. ; cd .."
	os.system(cmd)
	return print("4. 全基因组整合完成")


#5. fasta序列存为字典
def read_to_dict(in_file_2):
	seq_dict = {}
	ac = ''
	seq = ''
	for line in open(in_file_2):
		if line.startswith('>') and seq != '':
			seq_dict[ac] = seq
			seq = ''
		if line.startswith('>'):
			ac = line.strip()[1:]
		else:
			seq = seq + line.strip()
	seq_dict[ac] = seq
	return seq_dict


#6. 获取名称列表
def get_bacteria_name(in_file3):
	file = open(in_file3)
	line = file.readline()
	names = line.replace('.faa', '').split()[4:]
	return names


#7. 逐行读取记录,并将符合的序列写入新文件中
def get_single_seq(seq_dict):
	i = 1
	for line in open("work_ortho", 'r'):
		lines = line.strip().split('\t')[3:]
		out_file = open('%s.faa' % (i), 'w')
		for key in seq_dict:
			if key in lines:
				out_file.write('>' + key + '\n' + seq_dict[key] + '\n')
		out_file.close()
		i = i + 1
	return print("5. 完成蛋白序列提取")


#8. 获取单拷贝蛋白数量
def get_protein_num():
	import glob
	path_file = glob.glob(r'*.faa')
	protein_num = len(path_file)
	return protein_num


#9. muscle比对,并整合为一个文件 protein_align.fas
def get_muscle():
	import os
	cmd1 = "for file in ./*.faa; do muscle -in $file -out $file.fas1; done"
	cmd2 = "cat ./*.fas1 >>  protein_align.fas"
	os.system(cmd1)
	os.system(cmd2)
	return print("6. 完成序列比对")


#10. 提取同源蛋白序列
def get_result(names, seq_dict, protein_num):
	out_file_name = str(protein_num) + ".result.fasta"
	out_file = open(out_file_name, 'w')
	i = 3
	for name in names:
		seq_list = []
		seq_align = ''
		# 将蛋白序列读取为列表
		for lines in open("work_ortho", 'r'):
			lines = lines.strip().split('\t')[i]  # 索引第几列就是哪个蛋白序列
			seq_list.append(lines)
		for key in seq_dict:
			if key in seq_list:
				seq_align = seq_align + seq_dict[key]
		out_file.write('>' + name + '\n' + seq_align + '\n')
		i = i + 1
	out_file.close()
	return print("7. 生成目标文件 --> *.result.fasta")


#11,对比对好的单个单拷贝基因序列进行改名处理,先移入single_a_gene
def get_tree(in_file1, num2):
    import os
    cmd1 = "mkdir single_a_gene"
    cmd2 = "mv *.fas1 single_a_gene/"
    cmd3 = "mkdir changed_ID"
    os.system(cmd1)
    os.system(cmd2)
    os.system(cmd3)
    path = os.getcwd()
    #获取物种名与RASTid的对应
    path1 = path + "/" + in_file1
    name_id_dit = {}
    files1 = os.listdir(path1)
    for file1 in files1:
        path11 = path1 + "/" + file1
        file2 = file1.split(".")[0]
        with open(path11, "r") as infile:
            for line in infile:
                line1 = line.strip().split(".")[0:2]
                line2 = '.'.join(line1)
                name_id_dit[line2] = file2
                break
    #将比对好的单拷贝序列改名为物种名
    path2 = path + "/single_a_gene"
    files2 = os.listdir(path2)
    path3 = path + "/changed_ID"
    for file2 in files2:
        path22 = path2 + "/" + file2
        outfile = path3 + "/" + file2 + ".fasta"
        with open(outfile, "a") as outfile2:
            with open(path22, "r") as infile2:
                for line in infile2:
                    if line.startswith(">"):
                        line1 = line.strip().split(".")[0:2]
                        line2 = '.'.join(line1)
                        outfile2.write(">" + name_id_dit[line2] + "\n")
                    else:
                        outfile2.write(line)
    #fasttree建树
    cmd4 = "for file in changed_ID/*.fasta; do fasttree -boot 1000 $file > $file.tree; done"
    os.system(cmd4)
    cmd5 = "cat changed_ID/*.tree >> ./single_gene.tree"
    os.system(cmd5)
    cmd6 = "java -jar /home/LiuNingHua/lnh-software/Astral/astral.5.7.5.jar  -i single_gene.tree -o " + str(num2) + ".gene.species.tree"
    os.system(cmd6)
    return print("**. 生成物种树 --> *.gene.species.tree")


#12. 删除无用文件
def del_file():
	import os
	cmd1 = "rm -rf  *.faa  *.fas1  *.fas work_ortho whole_protein single_a_gene changed_ID"
	os.system(cmd1)
	return print("8. 文件清理,结束!")


###########################################################
if __name__=='__main__':
	import sys
	import os
	import time
	try:
		file_name = sys.argv[1]
		num = sys.argv[2]
		test = int(sys.argv[2])  #确保输入顺序正确
	except:
		print("请输入正确参数顺序!")
		print("usage: python3 protein_ortho.py <file_name> <bacteria_num>")
	else:
		chuli_seq(file_name)  #处理序列名中的空格
		in_file = file_name + "_work.proteinortho.tsv"
		protein_ortho(file_name)  #同源比对
		while True:
			if os.path.isfile(in_file):
				time.sleep(20)  #确保文件.proteinortho.tsv已完成写入
				guoLv_result(in_file, num)   # 获得work_ortho
				get_whole_protein(file_name)   # 集合全基因组,生成whole_protein
				whole_protein_dict = read_to_dict(in_file_2='whole_protein')   # 全基因组存入字典
				get_single_seq(whole_protein_dict)   # 提取出单蛋白序列
				protein_num = get_protein_num()  #获取单拷贝蛋白的数量
				get_muscle()   # 完成比对
				align_protein_dict = read_to_dict(in_file_2="protein_align.fas")  #提取后的蛋白序列存入字典
				bac_names = get_bacteria_name(in_file)  # 菌株名称
				get_result(bac_names, align_protein_dict, protein_num)  #完成序列提取
				get_tree(file_name, protein_num)  #Astral建树
				del_file()   # 删除中间文件
				break
  • 6
    点赞
  • 19
    收藏
    觉得还不错? 一键收藏
  • 2
    评论
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值