https://blog.csdn.net/m0_53945548/article/details/135317476?spm=1001.2014.3001.5502
BWA-MEM2和Samtools安装见上文
(完整路径)/bwa-mem2 index contigs.fa
(完整路径)/bwa-mem2 mem -t 180 contigs.fa 1.fq.gz 2.fq.gz > aligen.sam
samtools view -Sb -@180 aligen.sam > aligen.bam
samtools sort -@180 aligen.bam -o aligen_sort.bam
samtools index -@ 180 aligen_sort.bam
samtools idxstats -@ 180 aligen_sort.bam > aligen_result.txt
sed -i "1 i GeneID\t\length\tmapped_read\tunmapped_read" aligen_result.txt
输出是以TAB分隔的,每行包括参考序列名称,序列长度,映射的读取段数,未映射的读取
#!/home/zhongpei/miniconda3/envs/py39/bin/python3.9
#########################################################
# TPM count
# written by PeiZhong in IFR of CAAS
import argparse
import pandas as pd
parser = argparse.ArgumentParser(description='TPM count')
parser.add_argument('OperaPath', help='Path that contain your samtools result file')
parser.add_argument('input_name', help='input_name')
parser.add_argument('output_name', help='output_name')
args = parser.parse_args()
OperaPath = args.OperaPath
input_name = args.input_name
output_name = args.output_name
if OperaPath.endswith("/"):
OperaPath = OperaPath
else:
OperaPath = OperaPath+"/"
target = OperaPath+input_name
# 'TPM count'
#############################################
def TPMcount(target):
table1 = pd.read_table(OperaPath+input_name, index_col=0)
table1 = table1[table1.index != "*"]
table1["Ng/Lg"] = table1['mapped_read'] / table1['length']
SUM_NjchuLj = table1["Ng/Lg"].sum()
table1["TPM"] = table1['Ng/Lg'] / SUM_NjchuLj * 1e6
result = table1[["Ng/Lg","TPM"]]
print(SUM_NjchuLj)
result.to_csv(OperaPath+output_name, index=True, sep="\t")
#############################################
TPMcount(target)
print(target)
合并很多个数据
#!/home/zhongpei/miniconda3/envs/py39/bin/python3.9
#########################################################
# TPM count
# written by PeiZhong in IFR of CAAS
import argparse
import os
import pandas as pd
parser = argparse.ArgumentParser(description='TPM count')
parser.add_argument('OperaPath', help='Path that contain your samtools result file')
parser.add_argument('file_maker', help='(sample)file_maker)')
args = parser.parse_args()
OperaPath = args.OperaPath
file_maker = args.file_maker
os.chdir(OperaPath)
files = os.listdir(OperaPath)
ls = []
for file in files:
if file_maker in file:
ls.append(file)
ls.sort()
print(ls)
all_data = pd.DataFrame()
for file_name in ls:
df = pd.read_csv(file_name, sep='\t')
df['file_name'] = file_name
all_data = pd.concat([all_data, df])
pivot_data = all_data.pivot(index='GeneID', columns='file_name', values='TPM')
pivot_data.to_csv('pivot_data.csv')