#!/usr/bin/env python
#-*- encoding=UTF-8 -*-
import multiprocessing
import gzip
import sys
## this function cal the sequence length per file
def cal_length(f):
seq = []
reads = 0
tt = gzip.open(f).readlines()
tlen = len(tt)
plen = tlen/4
for i in range(0,plen):
start,end = (i*4,(i+1)*4)
pdna = tt[start:end][1]
reads += 1
seq.append(len(pdna))
return (f,reads,seq)
if __name__ == '__main__':
fasta_files = sys.argv[1]## fasta files input list
flist = [ f.strip() for f in open(fasta_files).readlines() if f.strip()] ## read the list file
pool = multiprocessing.Pool(10) ## multi processing cal the sequence length
results = pool.map(cal_length,flist) ## get the calulate results
lengths = [] ### the seqences list
pool.close() ## close the processing
pool.join()
for res in results: ## read results
#print ('fasta file',res[0],'reads number:',res[1])
for r in res[2]:
lengths.append(int(r))
## the N50 calculate
N50_pos = sum(lengths)/2.0
lengths.sort()
lengths.reverse()
ValueSum,N50 = 0,0
for value in lengths:
ValueSum += value
if N50_pos <= ValueSum:
N50 = value
break
print ("fasta files N50:",N50)
1.使用示例:
python n50_cal_batch.py f.list
f.list是fastq文件的绝对路径的列表。
本博主新开公众号, 希望大家能扫码关注一下,十分感谢大家。