一、原理介绍
GC depth分布图可以看出测序是否有明显的GC偏向。如果存在样品污染,通常能够从GC含量分析中呈现出来,出现独立的序列簇。异常的GCdepth可能的原因:深度一半可能与动物的性染色体和高杂合有关,其它情况多为污染,例如GC高可能是细菌污染。
下图为异常情况的图:
下图为正常情况图:
横坐标表示GC含量,纵坐标表示测 序深度,右方是contig覆盖深度分布,上方是GC含量分布,根据其GC分布 以及覆盖深度信息绘制散点图,其中红色的部分代表该散点图中点的 密度比较大的部分,用红色表示。
到了大家最关注的问题了,那就是怎么分析,在这里以二代测序数据为例,从illumina平台PE150 bp测序结果的clean data开始,三代测序数据只需要改变步长跟窗口参数,别的均为一致的(忽然意识到没说明怎么进行二代数据过滤,下个帖子再写吧)
二、实操
在第一步和第二部只是为了获取到bam文件,本文可能写的比较复杂了,若有更好的方法可将1、2两步忽略。
1、从测序结果的fq文件到fa文件转换
在本步骤中用到的软件为seqtk,安装方式如下:
conda install -c bioconda seqtk
安装后就可以使用了,将fq文件格式转为fa格式
seqtk seq -a Sample_R1.fq.gz > Sample_R1.fa
2、bam文件生成
首先安装bwa软件。
wget https://zenlayer.dl.sourceforge.net/project/bowtie-bio/bowtie2/2.5.2/bowtie2-2.5.2-linux-x86_64.zip
unzip bowtie2-2.5.2-linux-x86_64.zipcd bowtie2-2.5.2-linux-x86_64/
./bowtie2
安装完软件后就可以后续分析操作了。
用bwa软件将利用fq文件与fa文件生成bam文件。
bwa index Sample_R1.fa
bwa mem -t 4 Sample_R1.fa $fq1 $fq2 | samtools sort - -o aln_sort.bam
3、文件制定与参数设置
fa=Sample_R1.fa ## fa文件
bam=aln_sort.bam ## 比对结果文件
prefix=gcdepth ## 输出结果前缀
window=40 ## 窗口大小
step=10 ## 步长
测序量较少,读长短就把窗口大小参数与步长参数缩小,反正增大,三代的话,步长可设为100,窗口设为500。参数设置与自己序列特征有关,可以多试一下参数。
4、计算序列长度
seqtk comp $fa | awk '{print $1"\t"$2}' > $prefix.len
5、划分窗口生成bed文件
bedtools makewindows -w $window -s $step -g $prefix.len > $prefix.window.bed
6、按窗口提取序列并计算gc含量
seqtk subseq $genome $prefix.window.bed > $prefix.window.fasta
seqtk comp $prefix.window.fasta |awk '{print $1 "\t" ($4+$5)/($3+$4+$5+$6) }' > $prefix.window.gc
7、按窗口计算平均深度
bedtools coverage -b aln_sort.bam -a $prefix.window.bed -mean | \
awk '{print $1":"$2+1"-"$3"\t"$4}' > $prefix.window.depth
8、绘图
Rscript $scriptsdir/gc_depth_plot.r -i $prefix.window.gc -d $prefix.window.depth -p $prefix
其中r脚本代码如下:
parser <- ArgumentParser(description='plot gc content and gc depth')
parser$add_argument( "-i", "--gc.table", type="character",required=T, default=NULL,
help="input gc.table file [required]",
metavar="filepath")
parser$add_argument( "-d", "--depth.table", type="character",required=T, default=NULL,
help="input coverage depth.table file [required]",
metavar="filepath")
# parser$add_argument( "--mingc", type="double",required=F, default=0,
# help="min gc cut [default %(default)s]",
# metavar="mingc")
# parser$add_argument( "--maxgc", type="double",required=F, default=1,
# help="min gc cut [default %(default)s]",
# metavar="maxgc")
#
# parser$add_argument( "--mindp", type="integer",required=F, default=0,
# help="min depth cut [default %(default)s]",
# metavar="mindepth")
# parser$add_argument( "--maxdp", type="integer",required=F, default=500,
# help="min depth cut [default %(default)s]",
# metavar="maxdepth")
parser$add_argument( "--pt.size ", type="double",required=F, default=0.7,
help="Adjust point size for plotting [default %(default)s]",
metavar="pt.size")
parser$add_argument( "-H", "--height", type="double",required=F, default=5,
help="the height of pic, inches [default %(default)s]",
metavar="height")
parser$add_argument("-W", "--width", type="double",required=F, default=5,
help="the width of pic, inches [default %(default)s]",
metavar="width")
parser$add_argument( "-o", "--outdir", type="character",required=F, default=getwd(),
help="output file directory [default %(default)s]",
metavar="path")
parser$add_argument("-p", "--prefix", type="character",required=F, default="demo",
help="out file name prefix [default %(default)s]",
metavar="prefix")
opt <- parser$parse_args()
#####################################################
if( !file.exists(opt$outdir) ){
if( !dir.create(opt$outdir, showWarnings = FALSE, recursive = TRUE) ){
stop(paste("dir.create failed: outdir=",opt$outdir,sep=""))
}
}
##################
library(ggplot2)
library(tibble)
library(dplyr)
library(aplot)
gc <- read.table(opt$gc.table, header = F , row.names = 1)
dp <- read.table(opt$depth.table, header = F , row.names = 1 )
gc1 <- rownames_to_column(gc,var = "id")
dp1 <- rownames_to_column(dp,var = "id")
gcdep <- inner_join(gc1, dp1, by = "id")
colnames(gcdep) <- c("id", "GC", "Depth")
gcdep=as.data.frame(gcdep)
head(gcdep)
#GC 含量中位数(百分比)
gcdep$GC <- 100 * gcdep$GC
GC_median <- round(median(gcdep$GC), 2)
#测序深度中位数
depth_median <- round(median(gcdep$Depth), 2)
#为了避免二代测序的 duplication 所致的深度极高值,将高于测序深度中位数 3 倍的数值去除
gcdep <- subset(gcdep, Depth <= 3 * depth_median)
#depth 深度、GC 含量散点密度图
gc <- ggplot(gcdep, aes(GC, Depth)) +
geom_point(color = 'gray', alpha = 0.6, pch = 16, size = opt$pt.size) +
#geom_vline(xintercept = GC_median, color = 'red', lty = 2, lwd = 0.5) +
#geom_hline(yintercept = depth_median, color = 'red', lty = 2, lwd = 0.5) +
stat_density_2d(aes(fill = ..density.., alpha = ..density..), geom = 'tile', contour = FALSE, n = 500) +
scale_fill_gradientn(colors = c('transparent', 'yellow', 'red', 'red')) +
#labs(x = paste('GC % (Median :', GC_median, '%)'), y = paste('Depth (Median :', depth_median, 'X)')) +
labs(x = "GC Content(%)", y = "Sequencing Depth(X)") +
theme_bw()+theme(
panel.grid=element_blank(),
axis.text.x=element_text(colour="black"),
axis.text.y=element_text(colour="black"),
#panel.border=element_rect(colour = "black"),
legend.key = element_blank(),
panel.border= element_blank(),
legend.position = 'none',
#axis.ticks.length = unit(0, "pt"),
plot.margin = margin(t = 0, # Top margin
r = 0, # Right margin
b = 40, # Bottom margin
l = 20), # Left margin
axis.line.y.left=element_line(colour = "black"),
axis.line.y.right=element_blank(),
axis.line.x.top = element_blank(),
axis.line.x.bottom = element_line(colour = "black"),
legend.title = element_blank())+coord_cartesian(expand = FALSE)
#depth 深度频数直方图
depth_hist <- ggplot(gcdep, aes(Depth)) +
geom_histogram(binwidth = (max(gcdep$Depth) - min(gcdep$Depth))/100, fill = 'LightSkyBlue', color = 'gray40', size = 0.1) +
labs(x = '', y = 'Number of Contigs') +
coord_flip(expand = FALSE) +
#geom_vline(xintercept = depth_median, color = 'red', lty = 2, lwd = 0.5)+
theme_bw()+theme(
panel.grid=element_blank(),
axis.text.x=element_text(colour="black",angle=45, hjust=1),
#axis.text.y=element_text(colour="black"),
#panel.border=element_rect(colour = "black"),
panel.border= element_blank(),
legend.key = element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank(),
panel.spacing = unit(0, "cm"),
axis.ticks.length.y = unit(0, "pt"),
plot.margin = margin(t = 0, # Top margin
r = 0, # Right margin
b = 40, # Bottom margin
l = 0), # Left margin
axis.line.y.left=element_blank(),
axis.line.y.right=element_blank(),
axis.line.x.top = element_blank(),
axis.line.x.bottom = element_line(colour = "black"),
legend.title = element_blank())
#GC 含量频数直方图
GC_hist <- ggplot(gcdep, aes(GC)) +
geom_histogram(binwidth = (max(gcdep$GC) - min(gcdep$GC))/100, fill = 'LightSkyBlue', color = 'gray40', size = 0.1) +
labs(x = '', y = 'Number of Contigs') +
#geom_vline(xintercept = GC_median, color = 'red', lty = 2, lwd = 0.5)+
theme_bw()+theme(
panel.grid=element_blank(),
#axis.text.x=element_text(colour="black"),
#axis.text.y=element_text(colour="black"),
#panel.border=element_rect(colour = "black"),
panel.border= element_blank(),
legend.key = element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
panel.spacing = unit(0, "cm"),
axis.ticks.length.x = unit(0, "pt"),
plot.margin = margin(t = 0, # Top margin
r = 0, # Right margin
b = 0, # Bottom margin
l = 20), # Left margin
axis.line.x.top=element_blank(),
axis.line.x.bottom = element_blank(),
axis.line.y.right= element_blank(),
axis.line.y.left = element_line(colour = "black"),
legend.title = element_blank())+coord_cartesian(expand = FALSE)
png(filename=paste(opt$outdir,"/",opt$prefix,".png",sep=""), height=opt$height*300, width=opt$width*300, res=300, units="px")
p <- gc %>%
insert_top(GC_hist, height=.4) %>%
insert_right(depth_hist, width=.4)
p
dev.off()
pdf(file=paste(opt$outdir,"/",opt$prefix,".pdf",sep=""), height=opt$height, width=opt$width)
p <- gc %>%
insert_top(GC_hist, height=.4) %>%
insert_right(depth_hist, width=.4)
p
dev.off()
三、总结
若序列存在污染的话,最好根据nt库进行比对,剔除污染后再进行后续分析