计算BAM文件中,单个位点的ATCG的read数量和coverage
import pandas as pd
import os
import pysam
import sys
# usage:
# python bam2coverage.py bamfile posfile output
#
# bamfile:
# posfile: column1, chrom; column2, position
# output:
#
# position is starting with 1
bamfile = sys.argv[1]
posfile = sys.argv[2]
outfile = sys.argv[3]
def bam2matrix_single_cpu(bam, position, outfile ):
tb_out = pd.DataFrame(columns=['chrom', 'pos', 'A', 'T', 'C', 'G', 'coverage'])
samfile = pysam.AlignmentFile(bam, "rb")
for idx, row in position.iterrows():
bases = {"A": 0, "G": 0, "C": 0, "T": 0}
chrom_ = row[0]
pos_a = row[1] - 1
pos_b = row[1]
for pileupcolumn in samfile.pileup(chrom_, pos_a, pos_b, truncate=True):
# 测序质量高于30的碱基
for pileupread in [al for al in pileupcolumn.pileups if al.alignment.mapq > 30]:
if not pileupread.is_del and not pileupread.is_refskip:
base = pileupread.alignment.query_sequence[pileupread.query_position]
try:
bases[base] += 1
except:
pass
cov = sum(list(bases.values()))
tb_out = tb_out.append({
'chrom': chrom_,
'pos': row[1],
'A': int(bases['A']),
'T': bases['T'],
'G': bases['G'],
'C': bases['C'],
'coverage': cov
}, ignore_index=True)
samfile.close()
tb_out.to_csv(outfile, index=False, sep='\t')
position = pd.read_table(posfile)
bam2matrix_single_cpu(bam=bamfile,
position=position,
outfile=outfile
)
例如想要知道如下几个位点的coverage情况,则建立tab分隔的文件如下(第一列为染色体号,第二列为坐标):
chrom pos
chr1 10000
chr2 20000
chr3 30000
执行脚本:
python myscript.py bamfile position.txt output.txt