计算bed区间gc含量,碱基深度等

计算样本 bed区间内gc ,depth

import pysam
import numpy as np
import pandas as pd
import math
import pyfaidx


def calGC(bamfile, bed):
	sampleid = bamfile.strip().split("/")[-1].split(".")[0]
	out = open("%s.GCcount.txt"%sampleid,"w")
	sam = pysam.AlignmentFile(bamfile, 'rb')
	bedfile = open(bed, "r")
	allgc = 0
	allbase = 0
	for lines in bedfile:
		chrs, start, end = lines.strip().split("\t")
		start = int(start)
		end = int(end)
		base_num = 0
		GCnum = 0
		read_num=0
		for read in sam.fetch(chrs,int(start),int(end)):
			if not read.is_duplicate and not read.is_secondary:
				read_num+=1
				seq = read.query_sequence
				seq_len = read.query_length
				read_start = read.reference_start +1
#				read_end = read.reference_end
				read_end = read_start + seq_len-1
				read_name = read.query_name
				if (read_start<= start) and ( start < read_end <= end):
					seq_in_bed = seq[start - read_start:]
				elif (read_start >= start) and (read_end <= end):
					seq_in_bed = seq
				elif ( start <= read_start <= end) and (read_end >= end):
					seq_in_bed = seq[:end-read_end-1]
				elif (read_start <= start) and (read_end >= end):
					seq_in_bed = seq[start-read_start:end-read_end-1]
				g_count = seq_in_bed.count("G")
				c_count = seq_in_bed.count("C")
				GCnum +=(g_count + c_count)
				base_num +=len(seq_in_bed)
				allgc +=GCnum
				allbase+=base_num
#		       	print(read_num,GCnum,base_num)
		if base_num==0:
			GCcount=0
		else:
			GCcount = GCnum / base_num
		out.write(lines.strip() +"\t"+str(GCcount)+"\t"+str(read_num)+"\n")
	gchan = allgc/allbase
	out2 = open("%s.all_GCcount.txt"%sampleid,"w")
	out2.write(sampleid +"\t"+str(gchan)+"\n")
#	return sampleid,dict1
	return

calGC(bamfile)

或(pysam中fetch中 start ,end 不能相等,若相等不能读出reads,如果strat与end差1,计算的为后面end的深度。而 samtools depth -r chr:start,end 给出的是包括start ,end的depth,start,end可以相等)

def region_depth_count(bamfile, bedfile, min_mapq=0,NULL_LOG2_COVERAGE=-20):

    def filter_read(read):
        """True if the given read should be counted towards coverage."""
        return not (read.is_duplicate
                    or read.is_secondary
                    or read.is_unmapped
                    or read.is_qcfail
                    or read.mapq < min_mapq)
    sampleid = bamfile.split("/")[-3]
    bamfile = pysam.AlignmentFile(bamfile, 'rb')
    rdbed=open(bedfile,"r")
    log2dict={"chrom":[], "start":[], "end":[], "log2depth":[], "depth":[],"read_num":[],"sampleid":[]}
    for lines in rdbed:
        chrom,start,end,gene = lines.strip().split("\t")
        start, end = int(start), int(end)
        count = 0
        bases = 0
        for read in bamfile.fetch(reference=chrom, start=start, end=end):
            if filter_read(read):
                count += 1
                # Only count the bases aligned to the region
                rlen = read.query_alignment_length
                if read.pos < start:
                    rlen -= start - read.pos
                if read.pos + read.query_alignment_length > end:
                    rlen -= read.pos + read.query_alignment_length - end
                bases += rlen
        depth = bases / (end - start) if end > start else 0

计算参考基因组 bed区间gc

def calculate_gc(subseq):
    """Calculate the GC content of a string."""
    cnt_at_lo = subseq.count('a') + subseq.count('t')
    cnt_at_up = subseq.count('A') + subseq.count('T')
    cnt_gc_lo = subseq.count('g') + subseq.count('c')
    cnt_gc_up = subseq.count('G') + subseq.count('C')
    tot = float(cnt_gc_up + cnt_gc_lo + cnt_at_up + cnt_at_lo)
    if not tot:
        return 0.0
    frac_gc = (cnt_gc_lo + cnt_gc_up) / tot
    return frac_gc


def fasta_extract_info(fastafile,bedfile):
    rdbed=open(bedfile,"r")
    fasta = pysam.FastaFile(fastafile)
    fastainfo={"chrom":[], "start":[], "end":[], "GCnum":[], "bed_len":[], "gene":[]}
    for lines in rdbed:
        chrom,start,end,gene = lines.strip().split("\t")
        start, end = int(start), int(end)
        bed_len = end-start+1 
        subseq = fasta.fetch(region="%s:%s-%s"%(chrom,int(start),int(end)))
        GCnum = calculate_gc(subseq)
        fastainfo["chrom"].append(chrom)
        fastainfo["start"].append(start)
        fastainfo["end"].append(end)
        fastainfo["GCnum"].append(GCnum)
        fastainfo["bed_len"].append(bed_len)
        fastainfo["gene"].append(gene)
    fasta_info_df = pd.DataFrame(fastainfo)
    return fasta_info_df
  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
在R语言中,计算DNA序列的GC含量并可视化可以按照以下步骤进行: 1. 将DNA序列保存为一个字符串变量 ```R DNA_seq <- "ATCGATCGATCGTACGCTAGCATCGATCGATCG" ``` 2. 统计G和C的数量 ```R G_count <- nchar(gsub("[^G]", "", DNA_seq)) C_count <- nchar(gsub("[^C]", "", DNA_seq)) ``` 这里使用了gsub函数,将DNA序列中非G字符和非C字符全部替换为空字符"",然后通过nchar函数计算替换后的字符串长度,即为G和C的数量。 3. 计算GC含量 ```R GC_content <- (G_count + C_count) / nchar(DNA_seq) * 100 ``` 这里使用了nchar函数计算DNA序列的总长度,然后按照GC含量的公式计算GC含量。 4. 可视化GC含量 ```R library(ggplot2) library(reshape2) # 创建数据框 GC_data <- data.frame(Base = c("G", "C", "A", "T"), Count = c(G_count, C_count, nchar(gsub("[^A]", "", DNA_seq)), nchar(gsub("[^T]", "", DNA_seq)))) # 计算GC含量 GC_content <- (G_count + C_count) / nchar(DNA_seq) * 100 # 创建绘图数据框 GC_plot_data <- data.frame(GC_Content = GC_content, Type = "GC Content") # 绘制堆叠条形图 GC_barplot <- ggplot(melt(GC_data), aes(x = Base, y = value, fill = variable)) + geom_bar(stat = "identity") + labs(x = "Base", y = "Count", fill = "Type", title = "Base Composition of DNA Sequence") + theme_minimal() # 添加GC含量 GC_barplot + geom_line(data = GC_plot_data, aes(x = 0, y = GC_Content), color = "red", size = 1) ``` 这里使用了ggplot2和reshape2包来绘制堆叠条形图和添加GC含量的折线。首先创建一个数据框GC_data,包含四种碱基的数量,然后使用melt函数将数据框转换为绘图数据框。接着创建一个绘图数据框GC_plot_data,包含GC含量和Type两个变量,用于绘制GC含量的折线。最后使用ggplot函数绘制堆叠条形图,并使用geom_line函数添加GC含量的折线。完整的R代码如下: ```R # 将DNA序列保存为一个字符串变量 DNA_seq <- "ATCGATCGATCGTACGCTAGCATCGATCGATCG" # 统计G和C的数量 G_count <- nchar(gsub("[^G]", "", DNA_seq)) C_count <- nchar(gsub("[^C]", "", DNA_seq)) # 创建数据框 GC_data <- data.frame(Base = c("G", "C", "A", "T"), Count = c(G_count, C_count, nchar(gsub("[^A]", "", DNA_seq)), nchar(gsub("[^T]", "", DNA_seq)))) # 计算GC含量 GC_content <- (G_count + C_count) / nchar(DNA_seq) * 100 # 创建绘图数据框 GC_plot_data <- data.frame(GC_Content = GC_content, Type = "GC Content") # 绘制堆叠条形图 GC_barplot <- ggplot(melt(GC_data), aes(x = Base, y = value, fill = variable)) + geom_bar(stat = "identity") + labs(x = "Base", y = "Count", fill = "Type", title = "Base Composition of DNA Sequence") + theme_minimal() # 添加GC含量 GC_barplot + geom_line(data = GC_plot_data, aes(x = 0, y = GC_Content), color = "red", size = 1) ``` 运行代码后,会生成一个堆叠条形图和一条红色的GC含量折线,用于展示DNA序列的碱基组成和GC含量

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

风风是超人

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值