pathogen-next-generation-sequence,一般测序检测的流程和统计的事项

0.test

import pandas as pd
import os
import gzip
import click

df1=pd.read_csv(r'F:\20221128_设计1500物种+1500耐药基因\5476_Candida_albicans.fasta.json.blastn.1000.xls',sep='\t')
print(df1)

1.count_pathogen_seq

import os
import click
import re
import pandas as pd

@click.command()
# @click.option('--inputpath1','-ip1',require=True,type=str,help='anorv-crpm002-single-cap.r2.out.mapped.sam')
# @click.option('--reference','-r',require=True,type=str,help='accession_to_Ename.xls')
# @click.option('--outputpath1','-op1',require=True,type=str,help='count.xls')
@click.option('-ip1', '--inputpath1', help='inputpath1(anorv-crpm002-single-cap.r2.out.mapped.sam |...)', required=True)
@click.option('-r', '--reference', help='reference(accession_to_Ename.xls |...)', required=True)
@click.option('-op1', '--outputpath1', help='outputpath1(count.xls |...)', required=True)


def count_species(inputpath1,reference,outputpath1):
    # referenece = r'C:\Users\Administrator\Desktop\20230327需要查看的\accession_to_Ename.xls'
    # inputpath1 = r'C:\Users\Administrator\Desktop\20230327需要查看的\anorv-crpm002-single-cap.r2.out.mapped.sam'
    # outputpath1 = r'C:\Users\Administrator\Desktop\20230327需要查看的\count.xls'

    df_ref = pd.read_csv(reference, sep='\t')
    dict_ref = dict(zip(df_ref['accession'], df_ref['Ename']))

    list_accession = []
    with open(inputpath1, 'r') as f2:
        for line in f2:
            line = line.strip('\n')
            list1 = line.split('\t')
            if sum([int(item.replace('M', '')) for item in re.findall('\d+M', list1[5])]) >= 145:
                list_accession.append(dict_ref[list1[2]])

    value = pd.value_counts(list_accession)
    # print(value)
    # print(value.index,value.values)
    #
    dict1 = dict(zip(value.index, value.values))
    # print(dict1)
    df1 = pd.DataFrame.from_dict(dict1, orient='index', columns=['count'])
    df1 = df1.rename_axis('Ename').reset_index()
    # print(df_ref)
    # df1=pd.merge(df1,df_ref,how='left',on=['accession'])
    # # df1=df1[[accession]]
    print(df1)

    df1.to_csv(outputpath1, sep='\t', index=None)


if __name__=='__main__':
    count_species()

## python3 20230328_os_click_count_pathogen_seq.py -ip1 /data/lijing/test_result/anorv-crpm002-single-cap.r2.out.mapped.sam -r /data/lijing/download/accession_to_Ename.xls -op1 /data/lijing/test_result/anorv-crpm002-single-cap.r2.count.xls

2.merge_pathogen_seq

import os
import pandas as pd
import click

@click.command()
@click.option('-i','--inputpath',help='inputpath(dir|...)',required=True)
@click.option('-o','--outputpath',help='outputpath(merge.count.xls|...)',required=True)

def merge_count(inputpath,outputpath):
    # inputpath=r'C:\Users\Administrator\Desktop\20230327需要查看的'
    # outputpath=r'C:\Users\Administrator\Desktop\20230327需要查看的\merge.count.xls'
    list_need=[item for item in os.listdir(inputpath) if item.endswith('.count.xls')]

    df_add=pd.DataFrame(columns=['Ename'])
    for i in list_need:
        df1=pd.read_csv(inputpath+'/'+i,sep='\t')
        df1=df1.rename(columns={'count':i.split('.')[0]+'_count'})
        print(df1)
        df_add=pd.merge(df_add,df1,how='outer',on=['Ename'])

    df_add.fillna(0,inplace=True)
    print(df_add)

    df_add.to_csv(outputpath,sep='\t',index=None)

if __name__=='__main__':
    merge_count()

## python3 /data/lijing/download/20230328_os_click_merge_pathogen_seq.py -i /data/lijing/test_result -o /data/lijing/test_result/merge.count.xls

3.统计序列

import gzip

inputpath=r'C:\Users\Administrator\Desktop\20230327需要查看的\GCF_009858895.2_ASM985889v3_genomic.fna.gz'
outputpath=r'C:\Users\Administrator\Desktop\20230327需要查看的\accession_to_Ename.xls'
with gzip.open(inputpath,'rb') as f1,open(outputpath,'w') as f2:
    f2.write('%s' %('accession\tEname\n'))
    for line in f1:
        line=line.decode(encoding='utf-8').strip()
        if line.startswith('>'):
            f2.write('{}\t{}\n'.format(line.split(' ')[0][1:],'Severe acute respiratory syndrome coronavirus 2'))

import re
import pandas as pd

referenece=r'C:\Users\Administrator\Desktop\20230327需要查看的\accession_to_Ename.xls'
inputpath1=r'C:\Users\Administrator\Desktop\20230327需要查看的\anorv-crpm002-single-cap.r2.out.mapped.sam'
outputpath1=r'C:\Users\Administrator\Desktop\20230327需要查看的\count.xls'

df_ref=pd.read_csv(referenece,sep='\t')
dict_ref=dict(zip(df_ref['accession'],df_ref['Ename']))

list_accession=[]
with open(inputpath1,'r') as f2:
    for line in f2:
        line=line.strip('\n')
        list1=line.split('\t')
        if sum([int(item.replace('M',''))for item in re.findall('\d+M',list1[5])]) >= 145:
            list_accession.append(dict_ref[list1[2]])

value=pd.value_counts(list_accession)
# print(value)
# print(value.index,value.values)
#
dict1=dict(zip(value.index,value.values))
# print(dict1)
df1=pd.DataFrame.from_dict(dict1,orient='index',columns=['count'])
df1=df1.rename_axis('Ename').reset_index()
# print(df_ref)
# df1=pd.merge(df1,df_ref,how='left',on=['accession'])
# # df1=df1[[accession]]
print(df1)

df1.to_csv(outputpath1,sep='\t',index=None)

4.count_seq_rpm

import os
import click
import re
import pandas as pd
import math

@click.command()
# @click.option('--inputpath1','-ip1',require=True,type=str,help='anorv-crpm002-single-cap.r2.out.mapped.sam')
# @click.option('--reference','-r',require=True,type=str,help='accession_to_Ename.xls')
# @click.option('--outputpath1','-op1',require=True,type=str,help='count.xls')
@click.option('-ip1', '--inputpath1', help='inputpath1(anorv-crpm002-single-cap.r2.out.mapped.sam |...)', required=True)
@click.option('-r', '--reference', help='reference(accession_to_Ename.xls |...)', required=True)
@click.option('-op1', '--outputpath1', help='outputpath1(count.xls |...)', required=True)
@click.option('-ip2', '--inputpath2', help='inputpath2(anorv-crpm002-single-cap.qc.xls |...)', required=True)

def count_species(inputpath1,reference,outputpath1,inputpath2):
    # referenece = r'C:\Users\Administrator\Desktop\20230327需要查看的\accession_to_Ename.xls'
    # inputpath1 = r'C:\Users\Administrator\Desktop\20230327需要查看的\anorv-crpm002-single-cap.r2.out.mapped.sam'
    # outputpath1 = r'C:\Users\Administrator\Desktop\20230327需要查看的\count.xls'

    df_ref = pd.read_csv(reference, sep='\t')
    dict_ref = dict(zip(df_ref['accession'], df_ref['Ename']))
    df_qc=pd.read_csv(inputpath2,sep='\t')
    dict_qc={}
    # df_qc['value']
    for i in range(len(df_qc)):
        dict_qc[df_qc.loc[i,'item']]=int(df_qc.loc[i,'value'])

    list_accession = []
    with open(inputpath1, 'r') as f2:
        for line in f2:
            line = line.strip('\n')
            list1 = line.split('\t')
            if sum([int(item.replace('M', '')) for item in re.findall('\d+M', list1[5])]) >= 145:
                list_accession.append(dict_ref[list1[2]])

    if len(list_accession)==0:
        df1=pd.DataFrame(columns=['Ename','count','rpm'])
        df1.to_csv(outputpath1, sep='\t', index=None)
    else:
        value = pd.value_counts(list_accession)
        # print(value)
        # print(value.index,value.values)
        #
        dict1 = dict(zip(value.index, value.values))
        # print(dict1)
        df1 = pd.DataFrame.from_dict(dict1, orient='index', columns=['count'])
        df1 = df1.rename_axis('Ename').reset_index()
        # print(df_ref)
        # df1=pd.merge(df1,df_ref,how='left',on=['accession'])
        # # df1=df1[[accession]]

        df1['rpm']=df1['count'].apply(lambda x: math.ceil(x/dict_qc['clean_reads']*1000000))
        print(df_qc)
        print(dict_qc)
        print(type(dict_qc['clean_reads']),df1)
        df1.to_csv(outputpath1, sep='\t', index=None)


if __name__=='__main__':
    count_species()

5.estimate_qc.

import json
import os
import click

@click.command()
@click.option('-i','--inputpath',help='inputpath(nanorv-crpm002-single-cap.json)',required=True)
@click.option('-o','--outputpath',help='outputpath(nanorv-crpm002-single-cap.qc.xls)',required=True)

def estimate_qc(inputpath,outputpath):
    # inputpath=r'C:\Users\Administrator\Desktop\20230327需要查看的\nanorv-crpm002-single-cap.json'
    file=open(inputpath)
    dict1=json.load(file)
    # file=json.loads(inputpath)
    print(dict1)
    print(dict1['summary'])

    # outputpath=r'C:\Users\Administrator\Desktop\20230327需要查看的\nanorv-crpm002-single-cap.qc.xls'
    with open(outputpath,'w') as f1:
        f1.write('{}\t{}\n'.format('item','value'))
        # for item in dict1['summary']:
        #     if item == 'before_filtering':
        #         if
        f1.write('%s\t%s\n' %('raw_reads',str(dict1['summary']['before_filtering']['total_reads'])))
        f1.write('%s\t%s\n' % ('raw_base', str(dict1['summary']['before_filtering']['total_bases'])))

        f1.write('%s\t%s\n' %('clean_reads',str(dict1['summary']['after_filtering']['total_reads'])))
        f1.write('%s\t%s\n' % ('clean_base', str(dict1['summary']['after_filtering']['total_bases'])))

        f1.write('%s\t%s\n' %('q20(%)',str(round(dict1['summary']['after_filtering']['q20_rate']*100,2))))
        f1.write('%s\t%s\n' % ('q30(%)', str(round(dict1['summary']['after_filtering']['q30_rate']*100,2))))

if __name__=='__main__':
    estimate_qc()

6.merge_seq_rpm

import os
import pandas as pd
import click

@click.command()
@click.option('-i','--inputpath',help='inputpath(dir|...)',required=True)
@click.option('-o','--outputpath',help='outputpath(merge.count.xls|...)',required=True)

def merge_count(inputpath,outputpath):
    # inputpath=r'C:\Users\Administrator\Desktop\20230327需要查看的'
    # outputpath=r'C:\Users\Administrator\Desktop\20230327需要查看的\merge.count.xls'
    list_need=[item for item in os.listdir(inputpath) if item.endswith('.count.rpm.xls')]

    df_add=pd.DataFrame(columns=['Ename'])
    for i in list_need:
        df1=pd.read_csv(inputpath+'/'+i,sep='\t')
        df1=df1.rename(columns={'count':i.split('.')[0]+'_count','rpm':i.split('.')[0]+'_rpm'})
        print(df1)
        df_add=pd.merge(df_add,df1,how='outer',on=['Ename'])

    df_add.fillna(0,inplace=True)
    print(df_add)

    df_add.to_csv(outputpath,sep='\t',index=None)

if __name__=='__main__':
    merge_count()

7.merge_qc

import pandas as pd
import os
import numpy as np
import click

@click.command()
@click.option('-i','--inputpath',help='inputpath(dir)|..',required=True)
@click.option('-o','--outputpath',help='outputpath(qc.merge.xls)|..',required=True)

def qc_merge(inputpath,outputpath):
    # inputpath=r'C:\Users\Administrator\Desktop\20230327需要查看的'
    list_need=[item for item in os.listdir(inputpath) if item.endswith('.qc.xls')]
    # outputpath=r''

    df_add=pd.DataFrame()
    for i in list_need:
        # with open()
        df1=pd.read_csv(inputpath+'/'+i,sep='\t')
        print(df1)
        df2=df1.T
        # print(df1.T)

        df2=df2.rename_axis('id').reset_index(drop=True)

        array=np.array(df2)
        list_1=array.tolist()
        print(list_1)
        list_name=list_1[0]
        df2.columns=list_name
        df2.drop([0],inplace=True,axis=0)
        df2=df2.reset_index(drop=True)
        df2['sample_name']=i.split('.qc.xls',1)[0]
        print(df2)

        # df_add=df_add.append(df2)
        df_add=pd.concat([df_add,df2],axis=0).reset_index(drop=True)

    df_part=df_add.pop('sample_name')

    df_add.insert(0,'sample_name',df_part)
    print(df_add)

    df_add.to_csv(outputpath,sep='\t',index=None)

if __name__=='__main__':
    qc_merge()

8.generate_tpl

import docxtpl
from docxtpl import DocxTemplate
from docx.shared import Mm
from docxtpl import InlineImage
from datetime import datetime
from datetime import timedelta

### http://www.360doc.com/content/21/1130/10/77916720_1006517842.shtml
### 自动填入表格信息
template=r'C:\Users\Administrator\Desktop\20230327需要查看的\模板.docx'
inputpath=r''
outputpath=r'C:\Users\Administrator\Desktop\20230327需要查看的\自动报告.docx'

tpl=DocxTemplate(template)

date=datetime.today().strftime('%Y-%m-%d')
print(date)

# ## 自动填入图片
# context = {'myimage': InlineImage(tpl, 'templates/python_logo.png', width=Mm(20))}
#
# tpl.render(context)
# tpl.save(outputpath)


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值