用python统计scaffold的N50等信息

 

最近用python编写了统计scaffoldN50和contigN50等信息的脚本。

运行结果截图如下:


源代码如下:

#!/usr/bin/env python
#func: cal N50,N60,N70,N80,N90,final
import os,sys
import operator
import datetime
import re
from optparse import OptionParser


#cutoff=100
class Fasta(object):
    def __init__(self,name,seq,length):
        self.name=name
        self.sequence=seq
        self.length=length


def process_fasta(reader):
    file=reader.readlines()
    counter=1
    items=[]
    for line in file:
        line=line.strip()
        if line.startswith('>'):
            if(counter > 1):
                items.append(anistance)
            counter +=1
            name=line
            seq=''                    
            length=0
        else:
            seq += line
            length=len(seq)
            anistance=Fasta(name,seq,length)     
    items.append(anistance)
    return items


def sort_len(lists):
    dict1={}
    for i in lists:
        dict1[i.name]=i.length
    dict1_sort=sorted(dict1.iteritems(),key=operator.itemgetter(1),reverse=True)                
    return dict1_sort        
        
def cal_N50_N90(dict_sort):
    total_len=0;tmp_len=0;counter=0;max_len=0;min_len=1000000; 
    N50_len=[]
    N50_num=[]
    #total_length,Max_length,Min_length
    for t in dict_sort:
        total_len += t[1]
        if(t[1]>=max_len):
            max_len=t[1]
        if(t[1]<=min_len):
            min_len=t[1]            
    #N50~N90  
    for j in range(0,len(N)): 
        for i in dict_sort:
            tmp_len += i[1]
            counter += 1
            if (tmp_len >= 0.01*N[j]*total_len):
                N50_len.append(i[1])
                N50_num.append(counter)
                tmp_len = 0
                counter = 0
                break
    return (N50_len,N50_num,total_len,max_len,min_len)           




def split_N(items):
    '''
    Split scaffold into contig
    '''
    ctg=[]
    global cutoff
    for t in items:
        line=t.sequence
        array=re.split('[N+]{1,}',line)
        for i in array:
            if (len(i) >= cutoff):
                ctg.append(len(i))
    return ctg


def contig_N50(ctg):
    ctg.sort(reverse=True)
    total_len=0;max_len=0;min_len=10000;
    counter=0;tmp_len=0;
    #cal Total_len,Max_len,Min_len
    for t in ctg:
        total_len += t
        if(t >= max_len):
            max_len=t
        elif(t <= min_len):
            min_len=t
        else:
            pass
    #cal N50~N90
    N50=[]
    c50=[]
    for i in range(0,len(N)):
        for t in ctg:
            tmp_len += t
            counter += 1
            if(tmp_len >= 0.01*N[i]*total_len):
                N50.append(t)
                c50.append(counter)
                tmp_len=0
                counter=0
                break
    return N50,c50,total_len,max_len,min_len                                                            




if __name__ == '__main__':
    USAGE="python stat_N50_N90.py [-i <input_filename>][-l <cutoff>][-o <out_filename>]\nExample:\n\t\tpython /WORK/DENOVO/03.script/stat_N50_N90.py -i Fasta.fa -o result.xls"
    optParser=OptionParser(USAGE)
    optParser.add_option("-i",action="store",type="string",dest="fileName",help="The filename of the input\
    #data format should be fasta style(.fa/.fasta)")
    optParser.add_option("-l",action="store",type="int",dest="num",default="100",help="Cutoff of minium length\
    for scaffold(length with N,default=100)")
    optParser.add_option("-o",action="store",type="string",dest="outName",help="The output filename")
    (options,args)=optParser.parse_args()
    Infile=options.fileName
    Outfile=options.outName
    cutoff=options.num
    print "Infile:%s\tOutfile:%s\tcutoff:%d" %(Infile,Outfile,cutoff)




    start=datetime.datetime.now()
    infile=open(Infile,'r')
    outfile=open(Outfile,'w')    
    
    global N
    N=[50,60,70,80,90]
    scaff=process_fasta(infile)
    new_line=sort_len(scaff)
    (N50_length,N50_number,Total_length,Max_length,Min_length)=cal_N50_N90(new_line)
    outfile.write("#Coverage of assemble result from file:%s\n" %Infile)
    outfile.write("#Minium length cutoff is:%d\n" %cutoff)
    outfile.write("#Data production:%s\n" %start)
    outfile.write("Total_scaff_length: %d\tMax_length: %d\tMin_length: %d\n" %(Total_length,Max_length,Min_length))
    for t in range(0,len(N)):
        outfile.write("\033[34mN%d_len:%d\tN%d_num:%d\n\033[0m" %(N[t],N50_length[t],N[t],N50_number[t]))
    
    ctg=split_N(scaff)
    (N50,c50,total_len,max_len,min_len)=contig_N50(ctg)
    outfile.write("Total_contig_len=%d\tMax_len=%d\tMin_len=%d\n" %(total_len,max_len,min_len))
    for i in range(0,len(N)):
        outfile.write("\033[36mN%d_len:%d\tN%d_num:%d\n\033[0m" %(N[i],N50[i],N[i],c50[i]))
    end=datetime.datetime.now()
    outfile.write("Run-time:"+"\033[32m%d\n\033[0m" %((end-start).seconds))
    infile.close()
    outfile.close()     

  • 2
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值