# Compute per-sample GC content and read depth within BED intervals.
import pysam
import numpy as np
import pandas as pd
import math
import pyfaidx
def calGC(bamfile, bed):
    """Compute per-interval GC fraction and read count for a BAM over a BED file.

    Writes two files named after the sample id (first dot-separated token of
    the BAM basename):
      <sampleid>.GCcount.txt      one line per BED interval: the original BED
                                  line, the GC fraction, and the read count
      <sampleid>.all_GCcount.txt  sample id and the overall GC fraction

    Read positions are converted to 1-based inclusive coordinates and compared
    directly against the BED start/end values, matching the original code
    (assumes the BED columns are 1-based inclusive — TODO confirm).
    """
    sampleid = bamfile.strip().split("/")[-1].split(".")[0]
    allgc = 0
    allbase = 0
    # Context managers guarantee the BAM, BED and output handles are closed
    # (the original code leaked all three).
    with pysam.AlignmentFile(bamfile, 'rb') as sam, \
            open(bed, "r") as bedfile, \
            open("%s.GCcount.txt" % sampleid, "w") as out:
        for lines in bedfile:
            chrs, start, end = lines.strip().split("\t")
            start = int(start)
            end = int(end)
            base_num = 0
            GCnum = 0
            read_num = 0
            for read in sam.fetch(chrs, start, end):
                if read.is_duplicate or read.is_secondary:
                    continue
                read_num += 1
                seq = read.query_sequence
                # 1-based inclusive read coordinates.
                read_start = read.reference_start + 1
                # NOTE(review): query_length counts soft-clipped bases, so
                # read_end may extend past the aligned span for clipped reads
                # — kept from the original; confirm if clipping matters here.
                read_end = read_start + read.query_length - 1
                # Clamp the read span to the interval. This replaces the
                # original four-branch slicing, which (a) left seq_in_bed
                # unbound when a read only abutted the interval start and
                # (b) inconsistently dropped the base at `end` in the
                # right-overlap branches (`end - read_end - 1`).
                lo = max(start, read_start)
                hi = min(end, read_end)
                if hi < lo:
                    continue  # no overlap with the interval
                seq_in_bed = seq[lo - read_start:hi - read_start + 1]
                GCnum += seq_in_bed.count("G") + seq_in_bed.count("C")
                base_num += len(seq_in_bed)
            allgc += GCnum
            allbase += base_num
            GCcount = GCnum / base_num if base_num else 0
            out.write(lines.strip() + "\t" + str(GCcount) + "\t" + str(read_num) + "\n")
    # Guard against an empty BED / no overlapping reads (original raised
    # ZeroDivisionError here).
    gchan = allgc / allbase if allbase else 0
    with open("%s.all_GCcount.txt" % sampleid, "w") as out2:
        out2.write(sampleid + "\t" + str(gchan) + "\n")
    return
if __name__ == "__main__":
    # The original top-level call was `calGC(bamfile)` — an undefined name and
    # a missing second argument (NameError/TypeError at import time). Take the
    # BAM and BED paths from the command line instead.
    import sys
    calGC(sys.argv[1], sys.argv[2])
# NOTE: in pysam's fetch(), start and end must not be equal — no reads are
# returned when they are; if start and end differ by 1, the depth computed is
# that of the single position at `end`. By contrast, `samtools depth -r
# chr:start,end` reports depth for positions including both start and end,
# and start may equal end.
def region_depth_count(bamfile, bedfile, min_mapq=0, NULL_LOG2_COVERAGE=-20):
    """Compute per-interval read depth (and its log2) for a BAM over a BED file.

    Parameters
    ----------
    bamfile : str
        Path to an indexed BAM. The sample id is taken from the third-from-last
        path component (assumes a .../<sampleid>/<subdir>/<file>.bam layout —
        TODO confirm against the pipeline's directory structure).
    bedfile : str
        Tab-separated file with columns chrom, start, end, gene; start/end are
        passed straight to pysam.fetch (0-based half-open).
    min_mapq : int
        Reads with mapping quality below this threshold are ignored.
    NULL_LOG2_COVERAGE : float
        log2 value recorded for intervals with zero depth (log2(0) is undefined).

    Returns
    -------
    pandas.DataFrame
        One row per BED interval with columns chrom, start, end, log2depth,
        depth, read_num, sampleid. (The original computed `depth` but never
        stored or returned anything.)
    """
    def filter_read(read):
        """True if the given read should be counted towards coverage."""
        return not (read.is_duplicate
                    or read.is_secondary
                    or read.is_unmapped
                    or read.is_qcfail
                    or read.mapq < min_mapq)

    sampleid = bamfile.split("/")[-3]
    log2dict = {"chrom": [], "start": [], "end": [], "log2depth": [],
                "depth": [], "read_num": [], "sampleid": []}
    # Context managers close the BAM and BED handles (original leaked both).
    with pysam.AlignmentFile(bamfile, 'rb') as bam, open(bedfile, "r") as rdbed:
        for lines in rdbed:
            chrom, start, end, gene = lines.strip().split("\t")
            start, end = int(start), int(end)
            count = 0
            bases = 0
            for read in bam.fetch(reference=chrom, start=start, end=end):
                if filter_read(read):
                    count += 1
                    # Only count the bases aligned inside the region: trim the
                    # aligned length by any overhang on either side.
                    rlen = read.query_alignment_length
                    if read.pos < start:
                        rlen -= start - read.pos
                    if read.pos + read.query_alignment_length > end:
                        rlen -= read.pos + read.query_alignment_length - end
                    bases += rlen
            depth = bases / (end - start) if end > start else 0
            log2dict["chrom"].append(chrom)
            log2dict["start"].append(start)
            log2dict["end"].append(end)
            log2dict["depth"].append(depth)
            # log2 of zero coverage is undefined; use the sentinel instead.
            log2dict["log2depth"].append(
                math.log2(depth) if depth > 0 else NULL_LOG2_COVERAGE)
            log2dict["read_num"].append(count)
            log2dict["sampleid"].append(sampleid)
    return pd.DataFrame(log2dict)
# Compute GC content of the reference genome within BED intervals.
def calculate_gc(subseq):
    """Return the fraction of A/C/G/T bases in *subseq* that are G or C.

    Counting is case-insensitive. Bases other than A, C, G, T (e.g. N) are
    excluded from the denominator. Returns 0.0 when the sequence contains no
    A/C/G/T bases at all.
    """
    gc_total = sum(subseq.count(base) for base in "gcGC")
    at_total = sum(subseq.count(base) for base in "atAT")
    denominator = gc_total + at_total
    return gc_total / denominator if denominator else 0.0
def fasta_extract_info(fastafile, bedfile):
    """Build a DataFrame of reference GC content per BED interval.

    Parameters
    ----------
    fastafile : str
        Path to an indexed reference FASTA.
    bedfile : str
        Tab-separated file with columns chrom, start, end, gene.

    Returns
    -------
    pandas.DataFrame
        Columns chrom, start, end, GCnum (GC fraction from calculate_gc),
        bed_len, gene — one row per BED interval.

    Note: the sequence is fetched with a samtools-style region string, which
    pysam interprets as 1-based inclusive; bed_len = end - start + 1 matches
    that convention (assumes the BED columns are 1-based inclusive — TODO
    confirm, since standard BED is 0-based half-open).
    """
    fastainfo = {"chrom": [], "start": [], "end": [], "GCnum": [],
                 "bed_len": [], "gene": []}
    fasta = pysam.FastaFile(fastafile)
    try:
        # `with` closes the BED handle; the original leaked both it and
        # the FASTA handle.
        with open(bedfile, "r") as rdbed:
            for lines in rdbed:
                chrom, start, end, gene = lines.strip().split("\t")
                start, end = int(start), int(end)
                subseq = fasta.fetch(region="%s:%s-%s" % (chrom, start, end))
                fastainfo["chrom"].append(chrom)
                fastainfo["start"].append(start)
                fastainfo["end"].append(end)
                fastainfo["GCnum"].append(calculate_gc(subseq))
                fastainfo["bed_len"].append(end - start + 1)
                fastainfo["gene"].append(gene)
    finally:
        fasta.close()
    return pd.DataFrame(fastainfo)