# Compute per-sample GC content and read depth within BED intervals.
import pysam
import numpy as np
import pandas as pd
import math
import pyfaidx
def calGC(bamfile, bed):
    """Compute per-interval GC fraction and read count for a BAM over a BED file.

    Writes two files named after the sample id (first dot-separated token of
    the BAM basename):
      <sampleid>.GCcount.txt      one line per BED interval: the original BED
                                  line, the GC fraction, and the read count
      <sampleid>.all_GCcount.txt  sample id and the overall GC fraction

    Read positions are converted to 1-based inclusive coordinates and compared
    directly against the BED start/end values, matching the original code
    (assumes the BED columns are 1-based inclusive — TODO confirm).
    """
    sampleid = bamfile.strip().split("/")[-1].split(".")[0]
    allgc = 0
    allbase = 0
    # Context managers guarantee the BAM, BED and output handles are closed
    # (the original code leaked all three).
    with pysam.AlignmentFile(bamfile, 'rb') as sam, \
            open(bed, "r") as bedfile, \
            open("%s.GCcount.txt" % sampleid, "w") as out:
        for lines in bedfile:
            chrs, start, end = lines.strip().split("\t")
            start = int(start)
            end = int(end)
            base_num = 0
            GCnum = 0
            read_num = 0
            for read in sam.fetch(chrs, start, end):
                if read.is_duplicate or read.is_secondary:
                    continue
                read_num += 1
                seq = read.query_sequence
                # 1-based inclusive read coordinates.
                read_start = read.reference_start + 1
                # NOTE(review): query_length counts soft-clipped bases, so
                # read_end may extend past the aligned span for clipped reads
                # — kept from the original; confirm if clipping matters here.
                read_end = read_start + read.query_length - 1
                # Clamp the read span to the interval. This replaces the
                # original four-branch slicing, which (a) left seq_in_bed
                # unbound when a read only abutted the interval start and
                # (b) inconsistently dropped the base at `end` in the
                # right-overlap branches (`end - read_end - 1`).
                lo = max(start, read_start)
                hi = min(end, read_end)
                if hi < lo:
                    continue  # no overlap with the interval
                seq_in_bed = seq[lo - read_start:hi - read_start + 1]
                GCnum += seq_in_bed.count("G") + seq_in_bed.count("C")
                base_num += len(seq_in_bed)
            allgc += GCnum
            allbase += base_num
            GCcount = GCnum / base_num if base_num else 0
            out.write(lines.strip() + "\t" + str(GCcount) + "\t" + str(read_num) + "\n")
    # Guard against an empty BED / no overlapping reads (original raised
    # ZeroDivisionError here).
    gchan = allgc / allbase if allbase else 0
    with open("%s.all_GCcount.txt" % sampleid, "w") as out2:
        out2.write(sampleid + "\t" + str(gchan) + "\n")
    return
if __name__ == "__main__":
    # The original top-level call was `calGC(bamfile)` — an undefined name and
    # a missing second argument (NameError/TypeError at import time). Take the
    # BAM and BED paths from the command line instead.
    import sys
    calGC(sys.argv[1], sys.argv[2])
# NOTE: in pysam's fetch(), start and end must not be equal — no reads are
# returned when they are; if start and end differ by 1, the depth computed is
# that of the single position at `end`. By contrast, `samtools depth -r
# chr:start,end` reports depth for positions including both start and end,
# and start may equal end.
def region_depth_count(bamfile, bedfile, min_mapq=0, NULL_LOG2_COVERAGE=-20):
    """Compute per-interval read depth (and its log2) for a BAM over a BED file.

    Parameters
    ----------
    bamfile : str
        Path to an indexed BAM. The sample id is taken from the third-from-last
        path component (assumes a .../<sampleid>/<subdir>/<file>.bam layout —
        TODO confirm against the pipeline's directory structure).
    bedfile : str
        Tab-separated file with columns chrom, start, end, gene; start/end are
        passed straight to pysam.fetch (0-based half-open).
    min_mapq : int
        Reads with mapping quality below this threshold are ignored.
    NULL_LOG2_COVERAGE : float
        log2 value recorded for intervals with zero depth (log2(0) is undefined).

    Returns
    -------
    pandas.DataFrame
        One row per BED interval with columns chrom, start, end, log2depth,
        depth, read_num, sampleid. (The original computed `depth` but never
        stored or returned anything.)
    """
    def filter_read(read):
        """True if the given read should be counted towards coverage."""
        return not (read.is_duplicate
                    or read.is_secondary
                    or read.is_unmapped
                    or read.is_qcfail
                    or read.mapq < min_mapq)

    sampleid = bamfile.split("/")[-3]
    log2dict = {"chrom": [], "start": [], "end": [], "log2depth": [],
                "depth": [], "read_num": [], "sampleid": []}
    # Context managers close the BAM and BED handles (original leaked both).
    with pysam.AlignmentFile(bamfile, 'rb') as bam, open(bedfile, "r") as rdbed:
        for lines in rdbed:
            chrom, start, end, gene = lines.strip().split("\t")
            start, end = int(start), int(end)
            count = 0
            bases = 0
            for read in bam.fetch(reference=chrom, start=start, end=end):
                if filter_read(read):
                    count += 1
                    # Only count the bases aligned inside the region: trim the
                    # aligned length by any overhang on either side.
                    rlen = read.query_alignment_length
                    if read.pos < start:
                        rlen -= start - read.pos
                    if read.pos + read.query_alignment_length > end:
                        rlen -= read.pos + read.query_alignment_length - end
                    bases += rlen
            depth = bases / (end - start) if end > start else 0
            log2dict["chrom"].append(chrom)
            log2dict["start"].append(start)
            log2dict["end"].append(end)
            log2dict["depth"].append(depth)
            # log2 of zero coverage is undefined; use the sentinel instead.
            log2dict["log2depth"].append(
                math.log2(depth) if depth > 0 else NULL_LOG2_COVERAGE)
            log2dict["read_num"].append(count)
            log2dict["sampleid"].append(sampleid)
    return pd.DataFrame(log2dict)
# Compute GC content of the reference genome within BED intervals.
def calculate_gc(subseq):
    """Return the fraction of A/C/G/T bases in *subseq* that are G or C.

    Counting is case-insensitive. Bases other than A, C, G, T (e.g. N) are
    excluded from the denominator. Returns 0.0 when the sequence contains no
    A/C/G/T bases at all.
    """
    gc_total = sum(subseq.count(base) for base in "gcGC")
    at_total = sum(subseq.count(base) for base in "atAT")
    denominator = gc_total + at_total
    return gc_total / denominator if denominator else 0.0
def fasta_extract_info(fastafile, bedfile):
    """Build a DataFrame of reference GC content per BED interval.

    Parameters
    ----------
    fastafile : str
        Path to an indexed reference FASTA.
    bedfile : str
        Tab-separated file with columns chrom, start, end, gene.

    Returns
    -------
    pandas.DataFrame
        Columns chrom, start, end, GCnum (GC fraction from calculate_gc),
        bed_len, gene — one row per BED interval.

    Note: the sequence is fetched with a samtools-style region string, which
    pysam interprets as 1-based inclusive; bed_len = end - start + 1 matches
    that convention (assumes the BED columns are 1-based inclusive — TODO
    confirm, since standard BED is 0-based half-open).
    """
    fastainfo = {"chrom": [], "start": [], "end": [], "GCnum": [],
                 "bed_len": [], "gene": []}
    fasta = pysam.FastaFile(fastafile)
    try:
        # `with` closes the BED handle; the original leaked both it and
        # the FASTA handle.
        with open(bedfile, "r") as rdbed:
            for lines in rdbed:
                chrom, start, end, gene = lines.strip().split("\t")
                start, end = int(start), int(end)
                subseq = fasta.fetch(region="%s:%s-%s" % (chrom, start, end))
                fastainfo["chrom"].append(chrom)
                fastainfo["start"].append(start)
                fastainfo["end"].append(end)
                fastainfo["GCnum"].append(calculate_gc(subseq))
                fastainfo["bed_len"].append(end - start + 1)
                fastainfo["gene"].append(gene)
    finally:
        fasta.close()
    return pd.DataFrame(fastainfo)