最近用python编写了统计scaffoldN50和contigN50等信息的脚本。
运行结果截图如下:
源代码如下:
#!/usr/bin/env python
#func: cal N50,N60,N70,N80,N90,final
import os,sys
import operator
import datetime
import re
from optparse import OptionParser
#cutoff=100
class Fasta(object):
def __init__(self,name,seq,length):
self.name=name
self.sequence=seq
self.length=length
def process_fasta(reader):
file=reader.readlines()
counter=1
items=[]
for line in file:
line=line.strip()
if line.startswith('>'):
if(counter > 1):
items.append(anistance)
counter +=1
name=line
seq=''
length=0
else:
seq += line
length=len(seq)
anistance=Fasta(name,seq,length)
items.append(anistance)
return items
def sort_len(lists):
dict1={}
for i in lists:
dict1[i.name]=i.length
dict1_sort=sorted(dict1.iteritems(),key=operator.itemgetter(1),reverse=True)
return dict1_sort
def cal_N50_N90(dict_sort):
total_len=0;tmp_len=0;counter=0;max_len=0;min_len=1000000;
N50_len=[]
N50_num=[]
#total_length,Max_length,Min_length
for t in dict_sort:
total_len += t[1]
if(t[1]>=max_len):
max_len=t[1]
if(t[1]<=min_len):
min_len=t[1]
#N50~N90
for j in range(0,len(N)):
for i in dict_sort:
tmp_len += i[1]
counter += 1
if (tmp_len >= 0.01*N[j]*total_len):
N50_len.append(i[1])
N50_num.append(counter)
tmp_len = 0
counter = 0
break
return (N50_len,N50_num,total_len,max_len,min_len)
def split_N(items):
'''
Split scaffold into contig
'''
ctg=[]
global cutoff
for t in items:
line=t.sequence
array=re.split('[N+]{1,}',line)
for i in array:
if (len(i) >= cutoff):
ctg.append(len(i))
return ctg
def contig_N50(ctg):
ctg.sort(reverse=True)
total_len=0;max_len=0;min_len=10000;
counter=0;tmp_len=0;
#cal Total_len,Max_len,Min_len
for t in ctg:
total_len += t
if(t >= max_len):
max_len=t
elif(t <= min_len):
min_len=t
else:
pass
#cal N50~N90
N50=[]
c50=[]
for i in range(0,len(N)):
for t in ctg:
tmp_len += t
counter += 1
if(tmp_len >= 0.01*N[i]*total_len):
N50.append(t)
c50.append(counter)
tmp_len=0
counter=0
break
return N50,c50,total_len,max_len,min_len
if __name__ == '__main__':
USAGE="python stat_N50_N90.py [-i <input_filename>][-l <cutoff>][-o <out_filename>]\nExample:\n\t\tpython /WORK/DENOVO/03.script/stat_N50_N90.py -i Fasta.fa -o result.xls"
optParser=OptionParser(USAGE)
optParser.add_option("-i",action="store",type="string",dest="fileName",help="The filename of the input\
#data format should be fasta style(.fa/.fasta)")
optParser.add_option("-l",action="store",type="int",dest="num",default="100",help="Cutoff of minium length\
for scaffold(length with N,default=100)")
optParser.add_option("-o",action="store",type="string",dest="outName",help="The output filename")
(options,args)=optParser.parse_args()
Infile=options.fileName
Outfile=options.outName
cutoff=options.num
print "Infile:%s\tOutfile:%s\tcutoff:%d" %(Infile,Outfile,cutoff)
start=datetime.datetime.now()
infile=open(Infile,'r')
outfile=open(Outfile,'w')
global N
N=[50,60,70,80,90]
scaff=process_fasta(infile)
new_line=sort_len(scaff)
(N50_length,N50_number,Total_length,Max_length,Min_length)=cal_N50_N90(new_line)
outfile.write("#Coverage of assemble result from file:%s\n" %Infile)
outfile.write("#Minium length cutoff is:%d\n" %cutoff)
outfile.write("#Data production:%s\n" %start)
outfile.write("Total_scaff_length: %d\tMax_length: %d\tMin_length: %d\n" %(Total_length,Max_length,Min_length))
for t in range(0,len(N)):
outfile.write("\033[34mN%d_len:%d\tN%d_num:%d\n\033[0m" %(N[t],N50_length[t],N[t],N50_number[t]))
ctg=split_N(scaff)
(N50,c50,total_len,max_len,min_len)=contig_N50(ctg)
outfile.write("Total_contig_len=%d\tMax_len=%d\tMin_len=%d\n" %(total_len,max_len,min_len))
for i in range(0,len(N)):
outfile.write("\033[36mN%d_len:%d\tN%d_num:%d\n\033[0m" %(N[i],N50[i],N[i],c50[i]))
end=datetime.datetime.now()
outfile.write("Run-time:"+"\033[32m%d\n\033[0m" %((end-start).seconds))
infile.close()
outfile.close()