From 16S rRNA gene amplicon sequences to a feature table
序列是公司测序返回的数据,格式如下:
sample1_R1.fq.gz
sample1_R2.fq.gz
sample2_R1.fq.gz
sample2_R2.fq.gz
sample3_R1.fq.gz
sample3_R2.fq.gz
每个文件的格式如下(测试数据下载):
@测序的各种编号
正向引物+mysequence
+
测序质量
@测序的各种编号
反向引物+mysequence
+
测序质量
@测序的各种编号
反向引物+mysequence
+
测序质量
@测序的各种编号
正向引物+mysequence
+
测序质量
由于样本比较多,所以我用R生成的QIIME2 需要的manifest文件,并用Rmarkdown整理了分析的脚本
# Chunk defaults: show code but do not evaluate when knitting; run bash
# chunks through a login shell so conda environments are on the PATH.
knitr::opts_chunk$set(
  echo = TRUE,
  eval = FALSE,
  engine.opts = list(bash = "-l")
)
用R生成manifest文件
# Build the QIIME 2 manifest (sample-id / forward / reverse absolute paths)
# from paired fastq files named like "<id>_R1.fq.gz" / "<id>_R2.fq.gz".
library(stringr)
library(dplyr)
setwd("YOURPATH of sequences")
# Match only fastq files so stray files in the directory cannot
# desynchronise the R1/R2 pairing below. dir() returns sorted names,
# so R1/R2 of each sample are adjacent.
f <- dir("./", pattern = "\\.f(ast)?q(\\.gz)?$")
f1 <- f[seq(1, length(f), 2)]  # R1 files (odd positions)
f2 <- f[seq(2, length(f), 2)]  # R2 files (even positions)
# Split on "_" and keep the leading sample id; adapt to your naming scheme.
id1 <- sapply(str_split(f1, "_"), function(x) x[1])
# BUG FIX: the original split f1 again here, so the pairing check below
# could never detect a mismatched R1/R2 file.
id2 <- sapply(str_split(f2, "_"), function(x) x[1])
# Sanity check: must print 0, i.e. every R1 is paired with its R2.
sum(!(id1 == id2))
sampleid <- paste0("sample", id1)
path <- "YOURPATH"  # absolute directory prefix required by QIIME 2
manifest <- tibble(`sample-id` = sampleid,
  `forward-absolute-filepath` = paste0(path, f1),
  `reverse-absolute-filepath` = paste0(path, f2))
# QIIME 2 expects a tab-separated manifest with a header line.
write.table(manifest,
            "./16s-manifest.tsv",
            sep = "\t",
            quote = FALSE,
            col.names = TRUE,
            row.names = FALSE)
Vsearch pipeline for 16S amplicon sequences
Import manifest file
# The manifest was written with sep = "\t", so read it back as TSV.
# BUG FIX: read_table() splits on runs of whitespace and would mangle any
# file path containing a space; read_tsv() honours the tab delimiter.
manifest <- readr::read_tsv(
  "YOURPATH of manifest file",
  col_names = TRUE)
Merge paired ends and relabel them
不怎么会用shell编程,就用R脚本来完成了,对不同的序列进行拼接。
# For each sample, merge R1/R2 with vsearch and relabel every read with the
# sample id ("sampleX.") so all samples can later be pooled into one file
# and still be traced back to their sample of origin.
setwd("YOURPATH")
for (i in seq_len(nrow(manifest))) {
  # [[ ]] extracts the bare character value; manifest[i, 2] yields a
  # 1x1 tibble, which paste() only coerces correctly by accident.
  cmd <- paste0("vsearch --fastq_mergepairs ", manifest[[i, 2]],
                " --reverse ", manifest[[i, 3]],
                " --fastqout ", manifest[[i, 1]], ".fq",
                " --relabel ", manifest[[i, 1]], ".")
  print(cmd)
  system(cmd)
  # Optionally compress the merged file afterwards:
  # system(paste0("gzip ", manifest[[i, 1]], ".fq"))
}
Pool all merged sequences
# Collect all per-sample merged reads into one pooled fastq.
# -p makes re-runs safe when the directory already exists.
mkdir -p ./2_merged
mv *.fq ./2_merged
# BUG FIX: use ">" (truncate) instead of ">>" (append) so re-running the
# pipeline does not duplicate every read in all.fq.
cat ./2_merged/*.fq > ./all.fq
Trim primer sequence
合并后的序列格式如下:
-
GTGYCAGCMGCCGCGGTAAmysequenceAAACTCAAAKRAATTGACGGCCbarcode
-
GGCCGYCAATTYMTTTRAGTTTmysequenceTTACCGCGGCKGCTGRCACbarcode
# Trim primers from the pooled, mixed-orientation reads:
#   -g = 5' primers (either orientation), -a = 3' primers (their reverse
#   complements plus trailing barcode, removed along with the adapter).
# BUG FIX: read the pooled file all.fq (produced by the previous step) and
# write all_trimmed.fq (consumed by the quality-control step below); the
# original read a file no step produced and wrote one no step consumed.
# -n 2 lets cutadapt remove both the 5' and the 3' primer from each read;
# by default only one adapter is removed per read.
cutadapt -a TTACCGCGGCKGCTGRCAC \
-a AAACTCAAAKRAATTGACGGCC \
-g GGCCGYCAATTYMTTTRAGTTT \
-g GTGYCAGCMGCCGCGGTAA \
-n 2 \
-o all_trimmed.fq \
-j 10 \
all.fq
Quality control
# Quality filtering: discard merged reads whose expected error rate
# (sum of per-base error probabilities divided by read length) exceeds 1%.
# Output is FASTA since downstream steps no longer need quality scores.
vsearch --fastx_filter ./all_trimmed.fq \
--fastq_maxee_rate 0.01 \
--fastaout ./filtered.fa
Dereplication
# Collapse identical sequences into unique records, recording abundance in
# the header (--sizeout); singletons are dropped (--minuniquesize 2).
vsearch --derep_fulllength ./filtered.fa \
--sizeout --minuniquesize 2 \
--output ./uniques.fa
Unoise
# Denoise the unique sequences with the UNOISE3 algorithm to obtain zOTUs
# (exact sequence variants). NOTE(review): vsearch's --cluster_unoise has a
# default --minsize of 8, so low-abundance uniques are ignored — confirm
# this threshold is intended.
vsearch --cluster_unoise uniques.fa \
--centroids zotus.fa \
--uc uc_zotus.uc \
--log unoise_log.txt \
--threads 14
Chimeras detection
# De novo chimera removal (UCHIME3); survivors are relabelled zotu1,
# zotu2, ... with size annotations stripped (--xsize) and sequences kept
# on a single line (--fasta_width 0).
vsearch --uchime3_denovo zotus.fa \
--nonchimeras zotus_nochime.fa \
--fasta_width 0 \
--relabel zotu \
--xsize \
--log unchime3_log.txt
不知为何生成的zotus_nochime.fa
文件中有小写的字母(懂的留言一下哈),需要用如下代码处理一下:
# from Denoising 16S Sequence Reads using UNOISE3 via VSEARCH and QIIME2
# Uppercase all sequence lines (undoing soft-masking — the likely source of
# the lowercase letters mentioned above; see vsearch's --qmask option) and
# truncate each ">" header at the first space for clean zOTU ids.
awk 'BEGIN{FS=" "}{if(!/>/){print toupper($0)}else{print $1}}' \
zotus_nochime.fa > zotus_nochime_nomask.fa
这项代码的含义如下(from ChatGPT):
This command is using the awk programming language to process input line by line. Here’s a breakdown of the command:
awk
: This is the command itself, which invokes the awk programming language interpreter.'BEGIN{FS=" "}'
: This is an awk block that is executed before processing any input lines. In this case, it sets the field separator (FS) to a space character, meaning that each line will be split into fields based on spaces.{if(!/>/){print toupper($0)}else{print $1}}
: This is the main block of the awk command, which is executed for each input line. It contains an if-else statement.- if (!/>/): This condition checks if the input line does not contain the > character.
- print toupper($0): If the condition is true, it prints the entire input line ($0) in uppercase using the toupper() function.
- else: If the condition is false (i.e., the line contains >), it executes the following block.
- print $1: It prints the first field ($1) of the input line, i.e., it prints everything before the first space character.
So, this command takes input lines, converts them to uppercase if they don’t contain >, and prints only the first field (before the first space character) if they do contain >.
Build feature table
# Map the quality-filtered reads back onto the chimera-free zOTUs at 97%
# identity to build the zOTU-by-sample feature table; the sample of origin
# comes from the "sampleX." read labels added during merging.
vsearch --usearch_global filtered.fa \
--db zotus_nochime.fa \
--id 0.97 \
--otutabout unoise3_zotu_table.txt \
--biomout unoise3_zotu_table.biom \
--threads 14 \
--log otutab_log.txt
DADA2 in QIIME2
# Activate the QIIME 2 conda environment before running the commands below.
conda activate qiime2-2022.8
DADA2 pipeline
Import demultiplexed sequence into qiime2
# Import the demultiplexed paired-end reads listed in the TSV manifest
# written by the R script above (Phred+33 quality encoding assumed).
qiime tools import \
--type 'SampleData[PairedEndSequencesWithQuality]' \
--input-path ./16s-manifest.tsv \
--output-path 16s.qza \
--input-format PairedEndFastqManifestPhred33V2
# Remove primers. Both primers are supplied to both --p-front-f and
# --p-front-r because the reads are in mixed orientation (either mate may
# begin with the forward or the reverse primer — see the raw-file layout
# described at the top of this document).
qiime cutadapt trim-paired \
--i-demultiplexed-sequences 16s.qza \
--p-cores 10 \
--p-front-f GTGYCAGCMGCCGCGGTAA \
--p-front-f GGCCGYCAATTYMTTTRAGTTT \
--p-front-r GTGYCAGCMGCCGCGGTAA \
--p-front-r GGCCGYCAATTYMTTTRAGTTT \
--o-trimmed-sequences 16s-trimmed.qza \
--quiet
Generate the summary of trimmed sequences
# Visual quality summary of the primer-trimmed reads; used below to pick
# the DADA2 truncation lengths.
qiime demux summarize \
--i-data 16s-trimmed.qza \
--o-visualization 16s-trimmed-summarize.qzv
#View the result to determine the value of parameters
#in next command "qiime dada2 denoise-paired"
#--p-trunc-len-f and --p-trunc-len-r
#Q20 was chosen as the threshold to determine the truncated length of reads
#What is the minimum of your sequence length?
qiime tools view 16s-trimmed-summarize.qzv
DADA2 denoise
# DADA2 denoising of the primer-trimmed paired-end reads. Truncation
# lengths (215/215) were chosen from the quality profile above; reads
# shorter than the truncation length are discarded. --p-n-threads 0 uses
# all available cores.
# BUG FIX: the original "16s-trimmed.qza\" had no space before the
# line-continuation backslash, so the shell glued the file name onto the
# next option ("16s-trimmed.qza--p-trim-left-f").
qiime dada2 denoise-paired \
--i-demultiplexed-seqs 16s-trimmed.qza \
--p-trim-left-f 0 \
--p-trim-left-r 0 \
--p-trunc-len-f 215 \
--p-trunc-len-r 215 \
--p-max-ee-f 2 \
--p-max-ee-r 2 \
--o-table 16s-table.qza \
--o-representative-sequences 16s-rep-seqs.qza \
--o-denoising-stats 16s-denoising-stats.qza \
--p-n-threads 0
# Tabulate and inspect the per-sample read-tracking statistics.
qiime metadata tabulate \
--m-input-file 16s-denoising-stats.qza \
--o-visualization 16s-denoising-stats.qzv
qiime tools view 16s-denoising-stats.qzv
# NOTE(review): feature-table.biom is not produced by any step above; the
# table must first be exported from 16s-table.qza with "qiime tools
# export" — confirm before running this conversion.
biom convert -i feature-table.biom -o feature-table.txt --to-tsv
Taxonomy classification
#Extract reference reads
# Trim the reference sequences down to the region amplified by the primer
# pair used for sequencing; the length bounds filter spurious amplicons.
qiime feature-classifier extract-reads \
--i-sequences 2022.10.backbone.full-length.fna.qza \
--p-f-primer GTGYCAGCMGCCGCGGTAA \
--p-r-primer GGCCGYCAATTYMTTTRAGTTT \
--p-min-length 300 \
--p-max-length 500 \
--o-reads ref-seqs.qza
#Train the classifier
# Fit a naive Bayes classifier on the primer-trimmed reference reads and
# their taxonomy (2022.10 backbone).
qiime feature-classifier fit-classifier-naive-bayes \
--i-reference-reads ref-seqs.qza \
--i-reference-taxonomy 2022.10.backbone.tax.qza \
--o-classifier classifier.qza
# Assign taxonomy to the DADA2 representative sequences with the trained
# classifier, then tabulate the assignments for inspection.
qiime feature-classifier classify-sklearn \
--i-classifier classifier.qza \
--i-reads 16s-rep-seqs.qza \
--o-classification 16s-taxonomy.qza
qiime metadata tabulate \
--m-input-file 16s-taxonomy.qza \
--o-visualization 16s-taxonomy.qzv
#qiime tools view 16s-taxonomy.qzv
# Interactive per-sample taxonomy bar plots. The manifest doubles as a
# minimal sample-metadata file because its first column is "sample-id".
# BUG FIX: the original "16s-manifest.tsv\" had no space before the
# line-continuation backslash, gluing the file name onto the next option.
qiime taxa barplot \
--i-table 16s-table.qza \
--i-taxonomy 16s-taxonomy.qza \
--m-metadata-file 16s-manifest.tsv \
--o-visualization 16s-taxa-bar-plots.qzv
#qiime tools view 16s-taxa-bar-plots.qzv
Phylogeny construction
# Build a phylogeny from the representative sequences: MAFFT alignment,
# masking of hypervariable columns, FastTree, then midpoint rooting.
qiime phylogeny align-to-tree-mafft-fasttree \
--i-sequences 16s-rep-seqs.qza \
--o-alignment 16s-aligned-rep-seqs.qza \
--o-masked-alignment 16s-masked-aligned-rep-seqs.qza \
--o-tree 16s-unrooted-tree.qza \
--o-rooted-tree 16s-rooted-tree.qza
Deblur in QIIME2
# Activate the QIIME 2 conda environment before running the commands below.
conda activate qiime2-2022.8
导入序列到QIIME2
# Import the demultiplexed paired-end reads listed in the TSV manifest
# (same import as in the DADA2 section above).
qiime tools import \
--type 'SampleData[PairedEndSequencesWithQuality]' \
--input-path ./16s-manifest.tsv \
--output-path 16s.qza \
--input-format PairedEndFastqManifestPhred33V2
去掉引物即引物前面的barcode
# Remove primers; both primers go to both -f and -r because the reads are
# in mixed orientation (see the raw-file layout at the top).
qiime cutadapt trim-paired \
--i-demultiplexed-sequences 16s.qza \
--p-cores 10 \
--p-front-f GTGYCAGCMGCCGCGGTAA \
--p-front-f GGCCGYCAATTYMTTTRAGTTT \
--p-front-r GTGYCAGCMGCCGCGGTAA \
--p-front-r GGCCGYCAATTYMTTTRAGTTT \
--o-trimmed-sequences 16s-trimmed.qza \
--quiet
拼接
# Merge the primer-trimmed R1/R2 pairs (Deblur works on joined reads).
qiime vsearch join-pairs \
--i-demultiplexed-seqs 16s-trimmed.qza \
--o-joined-sequences demux-joined.qza \
--p-threads 8
质控
# Quality-filter the joined reads (minimum base quality Q20).
qiime quality-filter q-score \
--i-demux demux-joined.qza \
--p-min-quality 20 \
--o-filtered-sequences demux-joined-filtered.qza \
--o-filter-stats demux-joined-filter-stats.qza
deblur需要序列长度相同,确定裁剪的长度
# Inspect the length/quality distribution of the filtered joined reads to
# choose --p-trim-length for Deblur below.
qiime demux summarize \
--i-data demux-joined-filtered.qza \
--o-visualization demux-joined-filtered.qzv
qiime tools view demux-joined-filtered.qzv
deblur需要序列长度相同
# Deblur denoising. All reads are trimmed to exactly 380 bp; reads shorter
# than --p-trim-length are discarded, so pick a value no larger than the
# minimum joined-read length observed in the summary above.
qiime deblur denoise-16S \
--i-demultiplexed-seqs demux-joined-filtered.qza \
--p-trim-length 380 \
--p-sample-stats \
--o-representative-sequences rep-seqs.qza \
--o-table table.qza \
--o-stats deblur-stats.qza \
--p-jobs-to-start 10
# NOTE(review): feature-table.biom is not produced by the steps above; the
# table must first be exported from table.qza with "qiime tools export" —
# confirm before running this conversion.
biom convert -i feature-table.biom -o feature-table.txt --to-tsv