叶绿体基因组深度绘图分析总流程
##B站视频讲解链接
准备数据
1 叶绿体基因组组装fasta文件
2 测序原始文件(fastq, fastq.gz)
本次测试数据来自于https://blog.csdn.net/salty_fish_xu/article/details/127469392?spm=1001.2014.3001.5502
序列调整
将序列调整至大单拷贝区第一个碱基作为序列的起始,该功能可以使用CPStools中的IR.py实现
经鉴定其四分体结构为
LSC:1-84846 IRb:84847-110900 SSC:110901-129191 IRa:129192-155245
获取深度文件
# 构建索引
bowtie2-build test.fasta ref # test.fasta 为组装结果
# 数据mapping
bowtie2 --very-sensitive-local -x ref -1 1.fq -2 2.fq -p 60 > map2ref.sam &
# 提取可以比对上的序列
samtools view -h -F 4 map2ref.sam -@ 20 > mapped.bam
# bam文件排序
samtools sort -o mapped_sort.bam -@ 20 mapped.bam
# 构建bam 文件索引
samtools index mapped_sort.bam -@ 20 mapped_sort.bam.bai
# 统计深度
samtools depth mapped_sort.bam > depth.txt
步长合并
import os
# 脚本只需要修改file_path 和 output_file
# file_path是上一步生成的深度文件depth.txt
# output_file是2000步长的合并文件
file_path = open(r'/Users/xuwenbo/Desktop/testttttt/depth2/depth.txt', 'r') # input file
output_file = r'/Users/xuwenbo/Desktop/testttttt/depth2/depth_2000.txt'
cont = file_path.readlines()
with open(output_file, 'w') as ff: # output file
for i in range(len(cont)):
if i % 2000 == 0:
start = i
end = min(i + 2000, len(cont)) # 确保不超过列表的长度
middle = start + (end - start) // 2 # 计算实际区间中点
all_sum = 0
for j in range(start, end):
all_sum += int(cont[j].split('\t')[2])
average_depth = round(all_sum / (end - start), 0)
ff.write(f"{middle}\t{average_depth}\n"
深度绘图
library(ggplot2)
library(data.table)
data <- read.table('/Users/xuwenbo/Desktop/testttttt/depth2/depth_2000.txt', sep = '\t')
data2 <- fread('/Users/xuwenbo/Desktop/testttttt/depth2/depth.txt', sep = '\t', drop = 1)
## 步长2000绘图
# geom_rect中四个xmin=1000, xmax=84846,需要根据自己的四分体长度进行修改
p1 <- ggplot(data, aes(V1, V2)) +
geom_bar(stat = 'identity', width = 800, fill = "lightblue") +
ylim(0,1200) + theme_classic() + xlab("Sequence length") +
ylab("Mean base depth")+
geom_rect(aes(xmin=1000, xmax=84846, ymin=0, ymax=0.5),
fill="lightblue", colour="blue", size=1.5) +
geom_rect(aes(xmin=84847, xmax=110900, ymin=0, ymax=0.5),
fill="lightgreen", colour="green", size=1.5) +
geom_rect(aes(xmin=110901, xmax=129191, ymin=0, ymax=0.5),
fill="lightpink", colour="red", size=1.5) +
geom_rect(aes(xmin=129192, xmax=154622, ymin=0, ymax=0.5),
fill="lightgray", colour="black", size=1.5)
p1
## 所有位点绘图
p2 <- ggplot(data2, aes(V2, V3)) +
geom_bar(stat = 'identity', fill = "lightblue") +
ylim(0,1200) + theme_classic() + xlab("Sequence length") +
ylab("Mean base depth")+
geom_rect(aes(xmin=1, xmax=84846, ymin=0, ymax=0.5),
fill="lightblue", colour="blue", size=1.5) +
geom_rect(aes(xmin=84847, xmax=110900, ymin=0, ymax=0.5),
fill="lightgreen", colour="green", size=1.5) +
geom_rect(aes(xmin=110901, xmax=129191, ymin=0, ymax=0.5),
fill="lightpink", colour="red", size=1.5) +
geom_rect(aes(xmin=129192, xmax=155245, ymin=0, ymax=0.5),
fill="lightgray", colour="black", size=1.5)
p2
# 合并两张图
library(cowplot)
combined_plot <- plot_grid(p1, p2, nrow = 2, align = "v", labels = c("A", "B"))
combined_plot