0.test
import pandas as pd
import os
import gzip
import click
df1=pd.read_csv(r'F:\20221128_设计1500物种+1500耐药基因\5476_Candida_albicans.fasta.json.blastn.1000.xls',sep='\t')
print(df1)
1.count_pathogen_seq
import os
import click
import re
import pandas as pd
@click.command()
# @click.option('--inputpath1','-ip1',require=True,type=str,help='anorv-crpm002-single-cap.r2.out.mapped.sam')
# @click.option('--reference','-r',require=True,type=str,help='accession_to_Ename.xls')
# @click.option('--outputpath1','-op1',require=True,type=str,help='count.xls')
@click.option('-ip1', '--inputpath1', help='inputpath1(anorv-crpm002-single-cap.r2.out.mapped.sam |...)', required=True)
@click.option('-r', '--reference', help='reference(accession_to_Ename.xls |...)', required=True)
@click.option('-op1', '--outputpath1', help='outputpath1(count.xls |...)', required=True)
def count_species(inputpath1,reference,outputpath1):
# referenece = r'C:\Users\Administrator\Desktop\20230327需要查看的\accession_to_Ename.xls'
# inputpath1 = r'C:\Users\Administrator\Desktop\20230327需要查看的\anorv-crpm002-single-cap.r2.out.mapped.sam'
# outputpath1 = r'C:\Users\Administrator\Desktop\20230327需要查看的\count.xls'
df_ref = pd.read_csv(reference, sep='\t')
dict_ref = dict(zip(df_ref['accession'], df_ref['Ename']))
list_accession = []
with open(inputpath1, 'r') as f2:
for line in f2:
line = line.strip('\n')
list1 = line.split('\t')
if sum([int(item.replace('M', '')) for item in re.findall('\d+M', list1[5])]) >= 145:
list_accession.append(dict_ref[list1[2]])
value = pd.value_counts(list_accession)
# print(value)
# print(value.index,value.values)
#
dict1 = dict(zip(value.index, value.values))
# print(dict1)
df1 = pd.DataFrame.from_dict(dict1, orient='index', columns=['count'])
df1 = df1.rename_axis('Ename').reset_index()
# print(df_ref)
# df1=pd.merge(df1,df_ref,how='left',on=['accession'])
# # df1=df1[[accession]]
print(df1)
df1.to_csv(outputpath1, sep='\t', index=None)
if __name__=='__main__':
count_species()
## python3 20230328_os_click_count_pathogen_seq.py -ip1 /data/lijing/test_result/anorv-crpm002-single-cap.r2.out.mapped.sam -r /data/lijing/download/accession_to_Ename.xls -op1 /data/lijing/test_result/anorv-crpm002-single-cap.r2.count.xls
2.merge_pathogen_seq
import os
import pandas as pd
import click
@click.command()
@click.option('-i','--inputpath',help='inputpath(dir|...)',required=True)
@click.option('-o','--outputpath',help='outputpath(merge.count.xls|...)',required=True)
def merge_count(inputpath,outputpath):
# inputpath=r'C:\Users\Administrator\Desktop\20230327需要查看的'
# outputpath=r'C:\Users\Administrator\Desktop\20230327需要查看的\merge.count.xls'
list_need=[item for item in os.listdir(inputpath) if item.endswith('.count.xls')]
df_add=pd.DataFrame(columns=['Ename'])
for i in list_need:
df1=pd.read_csv(inputpath+'/'+i,sep='\t')
df1=df1.rename(columns={'count':i.split('.')[0]+'_count'})
print(df1)
df_add=pd.merge(df_add,df1,how='outer',on=['Ename'])
df_add.fillna(0,inplace=True)
print(df_add)
df_add.to_csv(outputpath,sep='\t',index=None)
if __name__=='__main__':
merge_count()
## python3 /data/lijing/download/20230328_os_click_merge_pathogen_seq.py -i /data/lijing/test_result -o /data/lijing/test_result/merge.count.xls
3.统计序列
import gzip
inputpath=r'C:\Users\Administrator\Desktop\20230327需要查看的\GCF_009858895.2_ASM985889v3_genomic.fna.gz'
outputpath=r'C:\Users\Administrator\Desktop\20230327需要查看的\accession_to_Ename.xls'
with gzip.open(inputpath,'rb') as f1,open(outputpath,'w') as f2:
f2.write('%s' %('accession\tEname\n'))
for line in f1:
line=line.decode(encoding='utf-8').strip()
if line.startswith('>'):
f2.write('{}\t{}\n'.format(line.split(' ')[0][1:],'Severe acute respiratory syndrome coronavirus 2'))
import re
import pandas as pd
referenece=r'C:\Users\Administrator\Desktop\20230327需要查看的\accession_to_Ename.xls'
inputpath1=r'C:\Users\Administrator\Desktop\20230327需要查看的\anorv-crpm002-single-cap.r2.out.mapped.sam'
outputpath1=r'C:\Users\Administrator\Desktop\20230327需要查看的\count.xls'
df_ref=pd.read_csv(referenece,sep='\t')
dict_ref=dict(zip(df_ref['accession'],df_ref['Ename']))
list_accession=[]
with open(inputpath1,'r') as f2:
for line in f2:
line=line.strip('\n')
list1=line.split('\t')
if sum([int(item.replace('M',''))for item in re.findall('\d+M',list1[5])]) >= 145:
list_accession.append(dict_ref[list1[2]])
value=pd.value_counts(list_accession)
# print(value)
# print(value.index,value.values)
#
dict1=dict(zip(value.index,value.values))
# print(dict1)
df1=pd.DataFrame.from_dict(dict1,orient='index',columns=['count'])
df1=df1.rename_axis('Ename').reset_index()
# print(df_ref)
# df1=pd.merge(df1,df_ref,how='left',on=['accession'])
# # df1=df1[[accession]]
print(df1)
df1.to_csv(outputpath1,sep='\t',index=None)
4.count_seq_rpm
import os
import click
import re
import pandas as pd
import math
@click.command()
# @click.option('--inputpath1','-ip1',require=True,type=str,help='anorv-crpm002-single-cap.r2.out.mapped.sam')
# @click.option('--reference','-r',require=True,type=str,help='accession_to_Ename.xls')
# @click.option('--outputpath1','-op1',require=True,type=str,help='count.xls')
@click.option('-ip1', '--inputpath1', help='inputpath1(anorv-crpm002-single-cap.r2.out.mapped.sam |...)', required=True)
@click.option('-r', '--reference', help='reference(accession_to_Ename.xls |...)', required=True)
@click.option('-op1', '--outputpath1', help='outputpath1(count.xls |...)', required=True)
@click.option('-ip2', '--inputpath2', help='inputpath2(anorv-crpm002-single-cap.qc.xls |...)', required=True)
def count_species(inputpath1,reference,outputpath1,inputpath2):
# referenece = r'C:\Users\Administrator\Desktop\20230327需要查看的\accession_to_Ename.xls'
# inputpath1 = r'C:\Users\Administrator\Desktop\20230327需要查看的\anorv-crpm002-single-cap.r2.out.mapped.sam'
# outputpath1 = r'C:\Users\Administrator\Desktop\20230327需要查看的\count.xls'
df_ref = pd.read_csv(reference, sep='\t')
dict_ref = dict(zip(df_ref['accession'], df_ref['Ename']))
df_qc=pd.read_csv(inputpath2,sep='\t')
dict_qc={}
# df_qc['value']
for i in range(len(df_qc)):
dict_qc[df_qc.loc[i,'item']]=int(df_qc.loc[i,'value'])
list_accession = []
with open(inputpath1, 'r') as f2:
for line in f2:
line = line.strip('\n')
list1 = line.split('\t')
if sum([int(item.replace('M', '')) for item in re.findall('\d+M', list1[5])]) >= 145:
list_accession.append(dict_ref[list1[2]])
if len(list_accession)==0:
df1=pd.DataFrame(columns=['Ename','count','rpm'])
df1.to_csv(outputpath1, sep='\t', index=None)
else:
value = pd.value_counts(list_accession)
# print(value)
# print(value.index,value.values)
#
dict1 = dict(zip(value.index, value.values))
# print(dict1)
df1 = pd.DataFrame.from_dict(dict1, orient='index', columns=['count'])
df1 = df1.rename_axis('Ename').reset_index()
# print(df_ref)
# df1=pd.merge(df1,df_ref,how='left',on=['accession'])
# # df1=df1[[accession]]
df1['rpm']=df1['count'].apply(lambda x: math.ceil(x/dict_qc['clean_reads']*1000000))
print(df_qc)
print(dict_qc)
print(type(dict_qc['clean_reads']),df1)
df1.to_csv(outputpath1, sep='\t', index=None)
if __name__=='__main__':
count_species()
5.estimate_qc.
import json
import os
import click
@click.command()
@click.option('-i','--inputpath',help='inputpath(nanorv-crpm002-single-cap.json)',required=True)
@click.option('-o','--outputpath',help='outputpath(nanorv-crpm002-single-cap.qc.xls)',required=True)
def estimate_qc(inputpath,outputpath):
# inputpath=r'C:\Users\Administrator\Desktop\20230327需要查看的\nanorv-crpm002-single-cap.json'
file=open(inputpath)
dict1=json.load(file)
# file=json.loads(inputpath)
print(dict1)
print(dict1['summary'])
# outputpath=r'C:\Users\Administrator\Desktop\20230327需要查看的\nanorv-crpm002-single-cap.qc.xls'
with open(outputpath,'w') as f1:
f1.write('{}\t{}\n'.format('item','value'))
# for item in dict1['summary']:
# if item == 'before_filtering':
# if
f1.write('%s\t%s\n' %('raw_reads',str(dict1['summary']['before_filtering']['total_reads'])))
f1.write('%s\t%s\n' % ('raw_base', str(dict1['summary']['before_filtering']['total_bases'])))
f1.write('%s\t%s\n' %('clean_reads',str(dict1['summary']['after_filtering']['total_reads'])))
f1.write('%s\t%s\n' % ('clean_base', str(dict1['summary']['after_filtering']['total_bases'])))
f1.write('%s\t%s\n' %('q20(%)',str(round(dict1['summary']['after_filtering']['q20_rate']*100,2))))
f1.write('%s\t%s\n' % ('q30(%)', str(round(dict1['summary']['after_filtering']['q30_rate']*100,2))))
if __name__=='__main__':
estimate_qc()
6.merge_seq_rpm
import os
import pandas as pd
import click
@click.command()
@click.option('-i','--inputpath',help='inputpath(dir|...)',required=True)
@click.option('-o','--outputpath',help='outputpath(merge.count.xls|...)',required=True)
def merge_count(inputpath,outputpath):
# inputpath=r'C:\Users\Administrator\Desktop\20230327需要查看的'
# outputpath=r'C:\Users\Administrator\Desktop\20230327需要查看的\merge.count.xls'
list_need=[item for item in os.listdir(inputpath) if item.endswith('.count.rpm.xls')]
df_add=pd.DataFrame(columns=['Ename'])
for i in list_need:
df1=pd.read_csv(inputpath+'/'+i,sep='\t')
df1=df1.rename(columns={'count':i.split('.')[0]+'_count','rpm':i.split('.')[0]+'_rpm'})
print(df1)
df_add=pd.merge(df_add,df1,how='outer',on=['Ename'])
df_add.fillna(0,inplace=True)
print(df_add)
df_add.to_csv(outputpath,sep='\t',index=None)
if __name__=='__main__':
merge_count()
7.merge_qc
import pandas as pd
import os
import numpy as np
import click
@click.command()
@click.option('-i','--inputpath',help='inputpath(dir)|..',required=True)
@click.option('-o','--outputpath',help='outputpath(qc.merge.xls)|..',required=True)
def qc_merge(inputpath,outputpath):
# inputpath=r'C:\Users\Administrator\Desktop\20230327需要查看的'
list_need=[item for item in os.listdir(inputpath) if item.endswith('.qc.xls')]
# outputpath=r''
df_add=pd.DataFrame()
for i in list_need:
# with open()
df1=pd.read_csv(inputpath+'/'+i,sep='\t')
print(df1)
df2=df1.T
# print(df1.T)
df2=df2.rename_axis('id').reset_index(drop=True)
array=np.array(df2)
list_1=array.tolist()
print(list_1)
list_name=list_1[0]
df2.columns=list_name
df2.drop([0],inplace=True,axis=0)
df2=df2.reset_index(drop=True)
df2['sample_name']=i.split('.qc.xls',1)[0]
print(df2)
# df_add=df_add.append(df2)
df_add=pd.concat([df_add,df2],axis=0).reset_index(drop=True)
df_part=df_add.pop('sample_name')
df_add.insert(0,'sample_name',df_part)
print(df_add)
df_add.to_csv(outputpath,sep='\t',index=None)
if __name__=='__main__':
qc_merge()
8.generate_tpl
import docxtpl
from docxtpl import DocxTemplate
from docx.shared import Mm
from docxtpl import InlineImage
from datetime import datetime
from datetime import timedelta
### http://www.360doc.com/content/21/1130/10/77916720_1006517842.shtml
### 自动填入表格信息
template=r'C:\Users\Administrator\Desktop\20230327需要查看的\模板.docx'
inputpath=r''
outputpath=r'C:\Users\Administrator\Desktop\20230327需要查看的\自动报告.docx'
tpl=DocxTemplate(template)
date=datetime.today().strftime('%Y-%m-%d')
print(date)
# ## 自动填入图片
# context = {'myimage': InlineImage(tpl, 'templates/python_logo.png', width=Mm(20))}
#
# tpl.render(context)
# tpl.save(outputpath)