宏基因组数据批量拼接与分箱
一组宏基因组数据批量拼接与分箱的Linux脚本
前提:已经下载好样本文件
例
准备数据来源文件,例如叫_sequence.txt_
- 文件内容单行格式为 样本名:某序列第一部分:某序列第二部分
- 使用英文冒号(:)连接
SAMEA2619857:/vol1/fastq/ERR599/ERR599049/ERR599049_1.fastq.gz:/vol1/fastq/ERR599/ERR599049/ERR599049_2.fastq.gz
SAMEA2619857:/vol1/fastq/ERR599/ERR599134/ERR599134_1.fastq.gz:/vol1/fastq/ERR599/ERR599134/ERR599134_2.fastq.gz
SAMEA2619818:/vol1/fastq/ERR599/ERR599041/ERR599041_1.fastq.gz:/vol1/fastq/ERR599/ERR599041/ERR599041_2.fastq.gz
SAMEA2619818:/vol1/fastq/ERR599/ERR599116/ERR599116_1.fastq.gz:/vol1/fastq/ERR599/ERR599116/ERR599116_2.fastq.gz
SAMEA2619818:/vol1/fastq/ERR599/ERR599155/ERR599155_1.fastq.gz:/vol1/fastq/ERR599/ERR599155/ERR599155_2.fastq.gz
SAMEA2619766:/vol1/fastq/ERR102/071/ERR10235171/ERR10235171_1.fastq.gz:/vol1/fastq/ERR102/071/ERR10235171/ERR10235171_2.fastq.gz
SAMEA2619766:/vol1/fastq/ERR598/ERR598951/ERR598951_1.fastq.gz:/vol1/fastq/ERR598/ERR598951/ERR598951_2.fastq.gz
SAMEA2619766:/vol1/fastq/ERR599/ERR599043/ERR599043_1.fastq.gz:/vol1/fastq/ERR599/ERR599043/ERR599043_2.fastq.gz
SAMEA2619766:/vol1/fastq/ERR609/009/ERR6090679/ERR6090679_1.fastq.gz:/vol1/fastq/ERR609/009/ERR6090679/ERR6090679_2.fastq.gz
SAMEA2619766:/vol1/fastq/ERR609/000/ERR6090680/ERR6090680_1.fastq.gz:/vol1/fastq/ERR609/000/ERR6090680/ERR6090680_2.fastq.gz
拼装序列(batchmegahit.sh)
- 脚本按照sequence.txt文件中内容,逐行拼接
- 拼接前判断文件是否存在
- 拼接会自动创建好相应的输出目录
echo "$(date "+%Y-%m-%d %H:%M:%S") ========================================>>拼装序列 start<<========================================"
#根目录
basepath="/home/app/tempwork/"
#数据来源
FILENAME=$basepath"/sequence.txt"
#输入文件根目录
inBasepath=$basepath"/data"
#输出文件根目录
outBasepath=$basepath"/res"
for line in `cat $FILENAME`;
do
echo "$(date "+%Y-%m-%d %H:%M:%S")"
echo "$(date "+%Y-%m-%d %H:%M:%S") sourceline --> ${line}"
#按 : 号分割 序列第一第二部分
arr=(${line//:/ })
name_yangben=${arr[0]}
part_1=${arr[1]}
part_2=${arr[2]}
echo "$(date "+%Y-%m-%d %H:%M:%S") 样本名"$name_yangben",序列1:"$part_1"<><>序列2:"$part_2
#从第一部分找出序列名
templine=${part_1%/*}
name_xulie=${templine##*/}
echo "$(date "+%Y-%m-%d %H:%M:%S") 序列名 ${name_xulie}"
#找出序列名part
sequence_1=${part_1##*/}
sequence_2=${part_2##*/}
outpath=$outBasepath/$name_yangben/$name_xulie
if [ -f $outpath ];then
echo "$(date "+%Y-%m-%d %H:%M:%S") 样本已拼接无需再拼接 样本路径 --> $outpath"
continue
fi
path1=$inBasepath/$name_yangben/$sequence_1
if [ ! -f $path1 ];then
echo "$(date "+%Y-%m-%d %H:%M:%S") 文件不存在 --> $path1"
continue
fi
path2=$inBasepath/$name_yangben/$sequence_2
if [ ! -f $path2 ];then
echo "$(date "+%Y-%m-%d %H:%M:%S") 文件不存在 --> $path2"
continue
fi
yangbenPath=$outBasepath/$name_yangben/
if [ ! -d $yangbenPath ];then
echo "$(date "+%Y-%m-%d %H:%M:%S") 文件夹不存在 创建目录 --> $yangbenPath"
mkdir -p $yangbenPath
fi
echo "$(date "+%Y-%m-%d %H:%M:%S") exec command==> megahit --k-min 35 --k-max 95 --k-step 20 --min-contig-len 500 -1 $path1 -2 $path2 -o $outpath"
megahit --k-min 35 --k-max 95 --k-step 20 --min-contig-len 500 -1 $path1 -2 $path2 -o $outpath
echo "$(date "+%Y-%m-%d %H:%M:%S")"
done
echo "$(date "+%Y-%m-%d %H:%M:%S") ========================================>>拼装序列 end<<========================================"
exit 0
建立索引(batchindex.sh)
- 脚本按照sequence.txt文件中内容,逐行建立索引
- 要保证__拼装序列__执行完全成功
echo "$(date "+%Y-%m-%d %H:%M:%S") ========================================>>建立索引 start<<========================================"
#根目录
basepath="/home/app/tempwork/"
#数据来源
FILENAME=$basepath"/sequence.txt"
#拼接好的样本根目录
succ_root_path=$basepath"/res"
for line in $(cat $FILENAME); do
echo "$(date "+%Y-%m-%d %H:%M:%S")"
echo "$(date "+%Y-%m-%d %H:%M:%S") sourceline --> ${line}"
#按 : 号分割 序列第一第二部分
arr=(${line//:/ })
name_yangben=${arr[0]}
part_1=${arr[1]}
part_2=${arr[2]}
echo "$(date "+%Y-%m-%d %H:%M:%S") 样本名"$name_yangben",序列1:"$part_1"<><>序列2:"$part_2
#从第一部分找出序列名
templine=${part_1%/*}
name_xulie=${templine##*/}
echo "$(date "+%Y-%m-%d %H:%M:%S") 序列名 ${name_xulie}"
#找出序列名part
sequence_1=${part_1##*/}
sequence_2=${part_2##*/}
#拼接好的样本目录
sourcepath=$succ_root_path/$name_yangben/$name_xulie/"final.contigs.fa"
if [ ! -f $sourcepath ]; then
echo "$(date "+%Y-%m-%d %H:%M:%S") 未找到拼接好的样本 --> $sourcepath"
continue
fi
echo "$(date "+%Y-%m-%d %H:%M:%S") exec command==> bowtie2-build -f $sourcepath final --threads 16"
bowtie2-build -f $sourcepath final --threads 16
echo "$(date "+%Y-%m-%d %H:%M:%S")"
done
echo "$(date "+%Y-%m-%d %H:%M:%S") ========================================>>建立索引 end<<========================================"
exit 0
对比转换排序(batchconvert.sh)
- 脚本按照sequence.txt文件中内容,逐行对比转换排序
- 要保证__建立索引__执行完全成功
echo "$(date "+%Y-%m-%d %H:%M:%S") ========================================>>对比转换排序 start<<========================================"
#根目录
basepath="/home/app/tempwork/"
#数据来源
FILENAME=$basepath"/sequence.txt"
#输入文件根目录
inBasepath=$basepath"/data"
#输出文件根目录
outBasepath=$basepath"/res"
for line in $(cat $FILENAME); do
echo "$(date "+%Y-%m-%d %H:%M:%S")"
echo "$(date "+%Y-%m-%d %H:%M:%S") sourceline --> ${line}"
#按 : 号分割 序列第一第二部分
arr=(${line//:/ })
name_yangben=${arr[0]}
part_1=${arr[1]}
part_2=${arr[2]}
echo "$(date "+%Y-%m-%d %H:%M:%S") 样本名"$name_yangben",序列1:"$part_1"<><>序列2:"$part_2
#从第一部分找出序列名
templine=${part_1%/*}
name_xulie=${templine##*/}
echo "$(date "+%Y-%m-%d %H:%M:%S") 序列名 ${name_xulie}"
#找出序列名part
sequence_1=${part_1##*/}
sequence_2=${part_2##*/}
path1=$inBasepath/$name_yangben/$sequence_1
if [ ! -f $path1 ]; then
echo "$(date "+%Y-%m-%d %H:%M:%S") 文件不存在 --> $path1"
continue
fi
path2=$inBasepath/$name_yangben/$sequence_2
if [ ! -f $path2 ]; then
echo "$(date "+%Y-%m-%d %H:%M:%S") 文件不存在 --> $path2"
continue
fi
#binres目录
binresPath=$outBasepath/$name_yangben/$name_xulie/"binres/"
#对比后生成的文件 sam
samPath=$binresPath"final.sam"
if [ ! -d $binresPath ]; then
echo "$(date "+%Y-%m-%d %H:%M:%S") 文件夹不存在,创建binres目录 --> $binresPath"
mkdir -p $binresPath
fi
echo "$(date "+%Y-%m-%d %H:%M:%S") -------------------------------------------------------------------------------------比对start"
if [ -f $samPath ]; then
echo "$(date "+%Y-%m-%d %H:%M:%S") 文件已存在,不进行对比 --> $samPath"
else
echo "$(date "+%Y-%m-%d %H:%M:%S") exec command==> bowtie2 -1 $path1 -2 $path2 -p 16 -x final -S $samPath"
bowtie2 -1 $path1 -2 $path2 -p 16 -x final -S $samPath
fi
echo "$(date "+%Y-%m-%d %H:%M:%S") -------------------------------------------------------------------------------------比对end"
echo "$(date "+%Y-%m-%d %H:%M:%S") -------------------------------------------------------------------------------------转换start"
#格式转换后的文件 bam
convertOutPath=$binresPath"final.bam"
if [ -f $convertOutPath ]; then
echo "$(date "+%Y-%m-%d %H:%M:%S") 转换后文件已存在,不进行转换 --> $convertOutPath"
else
echo "$(date "+%Y-%m-%d %H:%M:%S") exec command==> samtools view -@ 16 -b -S $samPath -o $convertOutPath"
samtools view -@ 16 -b -S $samPath -o $convertOutPath
fi
echo "$(date "+%Y-%m-%d %H:%M:%S") -------------------------------------------------------------------------------------转换end"
echo "$(date "+%Y-%m-%d %H:%M:%S") -------------------------------------------------------------------------------------排序start"
#排序后的文件 sortbam
sortOutPath=$binresPath"final.sorted.bam"
if [ -f $sortOutPath ]; then
echo "$(date "+%Y-%m-%d %H:%M:%S") 排序后文件已存在,不进行排序 --> $sortOutPath"
else
echo "$(date "+%Y-%m-%d %H:%M:%S") exec command==> samtools sort -@ 16 -l 9 -O BAM $convertOutPath -o $sortOutPath"
samtools sort -@ 16 -l 9 -O BAM $convertOutPath -o $sortOutPath
fi
echo "$(date "+%Y-%m-%d %H:%M:%S") -------------------------------------------------------------------------------------排序end"
echo "$(date "+%Y-%m-%d %H:%M:%S")"
done
echo "$(date "+%Y-%m-%d %H:%M:%S") ========================================>>对比转换排序 end<<========================================"
exit 0
计算分箱(batchcalc.sh)
- 脚本按照sequence.txt文件中内容,逐行计算分箱
- 要保证__对比转换排序__执行完全成功
echo "$(date "+%Y-%m-%d %H:%M:%S") ========================================>>计算分箱 start<<========================================"
#根目录
basepath="/home/app/tempwork/"
#数据来源
FILENAME=$basepath"/sequence.txt"
#拼接好的样本根目录
succ_root_path=$basepath"/res"
for line in $(cat $FILENAME); do
echo "$(date "+%Y-%m-%d %H:%M:%S")"
echo "$(date "+%Y-%m-%d %H:%M:%S") sourceline --> ${line}"
#按 : 号分割 序列第一第二部分
arr=(${line//:/ })
name_yangben=${arr[0]}
part_1=${arr[1]}
part_2=${arr[2]}
echo "$(date "+%Y-%m-%d %H:%M:%S") 样本名"$name_yangben",序列1:"$part_1"<><>序列2:"$part_2
#从第一部分找出序列名
templine=${part_1%/*}
name_xulie=${templine##*/}
echo "$(date "+%Y-%m-%d %H:%M:%S") 序列名 ${name_xulie}"
#找出序列名part
sequence_1=${part_1##*/}
sequence_2=${part_2##*/}
#binres目录
binresPath=$succ_root_path/$name_yangben/$name_xulie/"binres/"
#sortbam文件目录
sortBamPath=$binresPath"final.sorted.bam"
#输出文件目录
outputDepth=$binresPath"final.depth.txt"
echo "$(date "+%Y-%m-%d %H:%M:%S") -------------------------------------------------------------------------------------计算start"
if [ ! -f $sortBamPath ]; then
echo "$(date "+%Y-%m-%d %H:%M:%S") 未找到sortbam文件 --> $sortBamPath"
continue
fi
echo "$(date "+%Y-%m-%d %H:%M:%S") exec command==> jgi_summarize_bam_contig_depths --outputDepth $outputDepth $sortBamPath"
jgi_summarize_bam_contig_depths --outputDepth $outputDepth $sortBamPath
echo "$(date "+%Y-%m-%d %H:%M:%S") 未找到sortbam文件 --> $sortBamPath"
echo "$(date "+%Y-%m-%d %H:%M:%S") -------------------------------------------------------------------------------------计算end"
finalcontigPath=$succ_root_path/$name_yangben/$name_xulie/"final.contigs.fa"
#all文件夹
allPath=$binresPath"all/"
echo "$(date "+%Y-%m-%d %H:%M:%S") -------------------------------------------------------------------------------------分箱start"
if [ ! -f $finalcontigPath ]; then
echo "$(date "+%Y-%m-%d %H:%M:%S") 未找到final.contig.fa文件 --> $finalcontigPath"
continue
fi
if [ ! -d $allPath ]; then
echo "$(date "+%Y-%m-%d %H:%M:%S") 文件夹不存在,创建all目录 --> $allPath"
mkdir -p $allPath
fi
echo "$(date "+%Y-%m-%d %H:%M:%S") exec command==> metabat2 -m 1500 -t 16 -i $finalcontigPath -a $outputDepth -o $allPath -v"
metabat2 -m 1500 -t 16 -i $finalcontigPath -a $outputDepth -o $allPath -v
echo "$(date "+%Y-%m-%d %H:%M:%S") -------------------------------------------------------------------------------------分箱end"
echo "$(date "+%Y-%m-%d %H:%M:%S")"
done
echo "$(date "+%Y-%m-%d %H:%M:%S") ========================================>>计算分箱 end<<========================================"
exit 0