例如:
1.统计raw_reads
import os
import gzip
inputpath = r'/data/lijing/Cust_16S_Capture'
outputpath=r'/data/lijing/Cust_16S_Capture/base.xls'
dict_rename = {'NL230601-2_S157_L001': '22-12373O', 'NL230601-6_S161_L001': '22-12373O',
'NL230608-1_S31_L001': '22-12373F', 'NL230608-2_S32_L001': '22-12373O',
'NL230608-3_S33_L001': '20-9263L', 'NL230608-4_S34_L001': '20-9263P',
'NL230608-5_S35_L001': '21-12164AB'}
list1 = [item for item in os.listdir(inputpath) if item.endswith('R1_001.fastq.gz')]
with open(outputpath,'w') as f2:
f2.write('name\tbase\n')
for i in list1:
# item.split('_R1_001.fastq.gz')[0]
with gzip.open(inputpath + '/' + i, 'rb') as f1:
base_num = 0
line_num = 0
for line in f1:
line_num += 1
if line_num % 4 == 2:
base_num += len(line)
f2.write('{}\t{}\n'.format(dict_rename[i.split('_R1_001.fastq.gz')[0]],str(base_num*2)))
3.统计种属
import os
import pandas as pd
import math
inputpath_mngs=r'C:\Users\Administrator\Desktop\自己测试捕获\20230619\第三张图2\宏基因组'
inputpath_mngs_base=r'C:\Users\Administrator\Desktop\自己测试捕获\20230619\第三张图2\宏基因组\base.xls'
inputpath_capture=r'C:\Users\Administrator\Desktop\自己测试捕获\20230619\第三张图2\捕获'
inputpath_capture_base=r'C:\Users\Administrator\Desktop\自己测试捕获\20230619\第三张图2\捕获\base.xls'
outputpath1=r'C:\Users\Administrator\Desktop\自己测试捕获\20230619\第三张图2\species_input'
outputpath2=r'C:\Users\Administrator\Desktop\自己测试捕获\20230619\第三张图2\genus_input'
df_mngs_base=pd.read_csv(inputpath_mngs_base,sep='\t')
dict_mngs_base=dict(zip(df_mngs_base['name'],df_mngs_base['base']))
df_capture_base=pd.read_csv(inputpath_capture_base,sep='\t')
dict_capture_base=dict(zip(df_capture_base['name'],df_capture_base['base']))
## mngs数据
list_mngs=[item for item in os.listdir(inputpath_mngs) if item.endswith('.classify.last.xls') ]
list_mngs_sample=list(set([item.split('.',1)[0] for item in list_mngs]))
print(list_mngs_sample)
# with open(outputpath1,'a') as f1:
for i in list_mngs_sample:
with open(outputpath1+'/'+i+'.mngs.input.xls','w') as f1:
f1.write('%s\t%s\t%s\t%s\n' %('x','y','value','group'))
for j in list_mngs:
if j.startswith(i):
df1=pd.read_csv(inputpath_mngs+'/'+j,sep='\t')
df1=df1[df1['G/S']=='species'].copy()
df1.sort_values(by=['count'],ascending=False,inplace=True)
# list_sort = df1['Ename'].tolist()
if len(df1) <= 30 :
dict_name_to_count=dict(zip(df1['Ename'],df1['count']))
else:
df1=df1.iloc[0:30,:]
dict_name_to_count = dict(zip(df1['Ename'], df1['count']))
# list_sort=df1['Ename'].tolist()[]
for item in dict_name_to_count:
f1.write('{}\t{}\t{}\t{}\n'.format(j.split('.')[1],item,dict_name_to_count[item]/dict_mngs_base[item],'mngs'))
# print(j)
## capture数据
list_capture=[item for item in os.listdir(inputpath_capture) if item.endswith('.classify.last.xls')]
list_capture_sample=list(set([item.split('.',1)[0] for item in list_capture]))
print(list_capture_sample)
# with open(outputpath1,'a') as f1
for i in list_capture_sample:
with open(outputpath1+'/'+i+'.capture.input.xls','w') as f1:
f1.write('%s\t%s\t%s\t%s\n' %('x','y','value','group'))
for j in list_capture:
if j.startswith(i):
df1=pd.read_csv(inputpath_capture+'/'+j,sep='\t')
df1=df1[df1['G/S']=='species'].copy()
df1.sort_values(by=['count'],ascending=False,inplace=True)
# list_sort = df1['Ename'].tolist()
if len(df1) <= 30 :
dict_name_to_count=dict(zip(df1['Ename'],df1['count']))
else:
df1=df1.iloc[0:30,:]
dict_name_to_count = dict(zip(df1['Ename'], df1['count']))
# list_sort=df1['Ename'].tolist()[]
for item in dict_name_to_count:
f1.write('{}\t{}\t{}\t{}\n'.format(j.split('.')[1],item,dict_name_to_count[item]/df_capture_base[item],'capture'))
list_merge=[item for item in os.listdir(outputpath1) if item.endswith('.xls')]
list_merge_sample=list(set([item.split('.',1)[0] for item in list_mngs]))
for i in list_merge_sample:
df_add=pd.DataFrame(columns=['x'])
for j in list_merge:
if j.startswith(i):
df2=pd.read_csv(outputpath1+'/'+j,sep='\t')
df_add=df_add.append(df2)
df_add['value']=df_add['value'].apply(lambda x:math.log2(x+1))
df_add.columns=['kmer','species','value','group']
df_add.to_csv(outputpath1+'/'+i+'.input.txt',sep='\t',index=None)
循环读文件
library("motifStack")
lf <- list.files('C:\\Users\\Administrator\\Desktop\\R_figure\\不同情况2\\',pattern = '.pcm$')
for (i in 1:length(lf)){
raw_path <- paste("C:\\Users\\Administrator\\Desktop\\R_figure\\不同情况2\\",lf[[i]],sep='')
pcm <- read.table(raw_path,header=F)
pcm <- pcm[,3:ncol(pcm)]
rownames(pcm) <- c("A","C","G","T")
motif <- new("pcm", mat=as.matrix(pcm), name=lf[[i]])
plot(motif)
#out_path <- paste("C:\\Users\\Administrator\\Desktop\\R_figure\\",lf[[1]],'.jpg',sep='')
jpeg(file=paste0("C:\\Users\\Administrator\\Desktop\\R_figure\\不同情况2\\",lf[[i]],'.jpg'),height = 400,width = 1000)
plot(motif,ic.scale=FALSE,ylab='probability',font='mono,Courier')
dev.off()
}
pcm <- read.table("C:\\Users\\Administrator\\Desktop\\R_figure\\不同情况2\\V69_a1a2_R.sum_mis_gap_1.mismatch_0.gap_1.pcm",header=F)
pcm <- pcm[,3:ncol(pcm)]
rownames(pcm) <- c("A","C","G","T")
motif <- new("pcm", mat=as.matrix(pcm), name="V69_a1a2_R.sum_mis_gap_1.mismatch_0.gap_1.pcm.jpg")
plot(motif)
plot(motif,ic.scale=FALSE,ylab='probability',font='mono,Courier')
jpeg(file=paste0("C:\\Users\\Administrator\\Desktop\\R_figure\\不同情况2\\V69_a1a2_R.sum_mis_gap_1.mismatch_0.gap_1.pcm.jpg"),height = 400,width = 2000)
plot(motif,ic.scale=FALSE,ylab='probability',font='mono,Courier')
dev.off()