import os,re,sys
import gzip
import os
import pandas as pd
import json
inputpath1=r'流程'
list1=[item for item in os.listdir(inputpath1) if item.endswith('.fq.gz')]
outputpath1=r'流程\result.xls'
for i in list1:
with gzip.open(inputpath1+'/'+i,'rb') as f1,open(outputpath1,'a') as f2:
n=0
seq_num=0
bp_num=0
GC_num=0
Q20=0
Q30=0
# dict_len={}
list_len=[]
for line in f1:
line=line.decode(encoding='utf-8').strip()
# if line.startswith('@'):
# seq_num +=1
if n % 4 == 0:
seq_num += 1
if n % 4 == 1:
bp_num += len(line)
GC_num += line.count('G')+line.count('C')+line.count('g')+line.count('c')
list_len.append(len(line))
# if len(line) in dict_len:
# dict_len[len(line)]
# print(GC_num)
if n % 4 == 3:
list_Q=list(line)
Q20 += len([item for item in list_Q if ord(item) - 33 >= 20])
Q30 += len([item for item in list_Q if ord(item) - 33 >= 30])
n += 1
value_counts = pd.value_counts(list_len)
dict_len=dict(zip(value_counts.index.tolist(),value_counts.values.tolist()))
f2.write('{}'.format(i+'\t'+str(seq_num)+'\t'+str(bp_num)+'\t'+str(round(GC_num/bp_num*100,2))+'\t'+str(round(Q20/bp_num*100,2))+'\t'+str(round(Q30/bp_num*100,2))+'\t'+str(max(list_len))+'\t'+str(min(list_len))+'\t'+json.dumps(dict_len)+'\n'))
# infile=file("test.fq.txt")
#
# def qual_stat(qstr):
# q20 = 0
# q30 = 0
# for q in qstr:
# qual = ord(q) - 33# ord 返回对应的ASCII数值
# if qual >= 30:
# q30 += 1
# q20 += 1
# elif qual >= 20:
# q20 += 1
# return q20, q30
#
# arr=[]
# for line in infile:
# line=line.strip()
# arr.append(line)
# j=0
# a,g,c,t,n,dna=0,0,0,0,0,0
# total_count = 0
# q20_count = 0
# q30_count = 0
#
# for i in range(len(arr)):
# if arr[i].startswith("@"):
# j+=1
# #dna+=arr[i+1].count("A")+arr[i+1].count("G")+arr[i+1].count("C")+arr[i+1].count("T")
# dna+=len(arr[i+1])
# a+=arr[i+1].count("A")
# g+=arr[i+1].count("G")
# c+=arr[i+1].count("C")
# t+=arr[i+1].count("T")
# n+=arr[i+1].count("N")
# #total_count += len(arr[i+3])
# q20, q30 = qual_stat(arr[i+3])
# #q20_count += q20
# q30_count += q30
# ap=float(a)*100/float(dna)
# gp=float(g)*100/float(dna)
# cp=float(c)*100/float(dna)
# tp=float(t)*100/float(dna)
# np=float(n)*100/float(dna)
# q30=float(q30_count)*100/float(dna)
# # print "Total Reads:",j
# # print "Total Bases:",dna
# # print "Q30 Bases:",q30_count
# # print "q30 percents:",("%.2f" % q30)+"%"
# # print "A:",a,("%.2f" % ap)+"%"
# # print "G:",g,("%.2f" % gp)+"%"
# # print "C:",c,("%.2f" % cp)+"%"
# # print "T:",t,("%.2f" % tp)+"%"
# # print "N:",n,("%.2f" % np)+"%"
python对fastq的序列数,碱基数,GC含量,Q20,Q30统计-20230316
最新推荐文章于 2024-08-08 14:28:23 发布