#cd /data/lijing/WES_test && mkdir {raw,clean,align,genome,hg19_VCF}
## step 1 预处理
outpath=/data/lijing/WES_test
mkdir -p ${outpath}/{raw,clean,align,genome,hg19_VCF}
path=/data/lijing/WES_test/raw
ls ${path}/*.gz |while read id
do
str1=$(basename $id)
echo ${str1%%.*}>>${outpath}/name.txt
done
# ls ${path}/*.gz |while read id; do echo $(basename $id)>>${outpath}/name.txt;done
cut -d'.' -f 1 ${outpath}/name.txt |sort|uniq > ${outpath}/name.uniq.txt
## step2 原始数据质控
for item in `cat ${outpath}/name.uniq.txt`
do
/home/biosoft/fastp/fastp \
-i ${path}/${item}.1.fq.gz \
-o ${outpath}/clean/${item}.1.clean.fq.gz \
-I ${path}/${item}.2.fq.gz \
-O ${outpath}/clean/${item}.2.clean.fq.gz
done
## step3 比对
for item in `cat ${outpath}/name.uniq.txt`
do
/usr/bin/bwa \
-t 4 -M -R "@RG\tID:lane1\tPL:illumina\tLB:library\tSM:wes" \
/data/lijing/WES_test/genome/hg19.fa \
${outpath}/clean/${item}.1.clean.fq.gz \
${outpath}/clean/${item}.2.clean.fq.gz > \
${outpath}/align/${item}.sam \
2 > ${outpath}/align/${item}.bwa.align.log && \
/usr/bin/samtools view -b -S ${outpath}/align/${item}.sam > ${outpath}/align/${item}.bam && \
/usr/bin/samtools sort ${outpath}/align/${item}.bam -o ${outpath}/align/${item}.sorted.bam && \
/usr/bin/samtools flagstat ${outpath}/align/${item}.sorted.bam > ${outpath}/align/${item}.sorted.bam.flagstat
done
## step4 根据bed找到外显子覆盖度信息
/home/biosoft/gatk-4.0.12.0/gatk CreateSequenceDictionary -R /data/lijing/WES_test/genome/hg19.fa -O /data/lijing/WES_test/genome/hg19.dict
/home/biosoft/gatk-4.0.12.0/gatk BedToIntervalList -I /data/lijing/WES_test/hg19_VCF/S31285117_Regions.bed -O /data/lijing/WES_test/hg19_VCF/Exon.Interval.bed -SD /data/lijing/WES_test/genome/hg19.dict
for item in `cat ${outpath}/name.uniq.txt`
do
/home/biosoft/gatk-4.0.12.0/gatk CollectHsMetrics \
-BI /data/lijing/WES_test/hg19_VCF/Exon.Interval.bed \
-TI /data/lijing/WES_test/hg19_VCF/Exon.Interval.bed \
-I ${outpath}/align/${item}.sorted.bam \
-O ${outpath}/align/${item}.stat.txt
done
## 标记PCR重复序列并建立索引
for item in `cat ${outpath}/name.uniq.txt`
do
/home/biosoft/gatk-4.0.12.0/gatk --java-options \
"-Xmx10G -Djava.io.tmpdir=${outpath}/align" \
MarkDuplicates -I ${outpath}/align/${item}.sorted.bam \
-O ${outpath}/align/${item}.sorted.MarkDuplicates.bam \
-M ${outpath}/align/${item}.sorted.bam.metrics > ${outpath}/align/${item}.log.mark \
2 > ${outpath}/align/${item}.err.mark && \
/usr/bin/samtools index ${outpath}/align/${item}.sorted.MarkDuplicates.bam
done
## /usr/bin/samtools view -f 1024 wes.sorted.MarkDuplicates.bam | less -SN
wget -c -r ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg19 -P /data/lijing/WES_test/hg19_VCF
gunzip /data/lijing/WES_test/hg19_VCF/ftp.broadinstitute.org/bundle/hg19/*.gz
/usr/bin/samtools faidx /data/lijing/WES_test/genome/hg19.fa
## 重新校正碱基质量值
for item in `cat ${outpath}/name.uniq.txt`
do
/home/biosoft/gatk-4.0.12.0/gatk --java-options \
"-Xmx10G -Djava.io.tmpdir=${outpath}/align" \
BaseRecalibrator -R /data/lijing/WES_test/genome/hg19.fa \
-I ${outpath}/align/${item}.sorted.MarkDuplicates.bam \
--known-sites /data/lijing/WES_test/hg19_VCF/ftp.broadinstitute.org/bundle/hg19/1000G_phase1.indels.hg19.sites.vcf \
--known-sites /data/lijing/WES_test/hg19_VCF/ftp.broadinstitute.org/bundle/hg19/Mills_and_1000G_gold_standard.indels.hg19.sites.vcf \
--known-sites /data/lijing/WES_test/hg19_VCF/ftp.broadinstitute.org/bundle/hg19/dbsnp_138.hg19.vcf \
-L /data/lijing/WES_test/hg19_VCF/S31285117_Regions.bed \
-O ${outpath}/align/${item}.recal_data.table && \
/home/biosoft/gatk-4.0.12.0/gatk --java-options \
"-Xmx10G -Djava.io.tmpdir=${outpath}/align" \
ApplyBQSR -R /data/lijing/WES_test/genome/hg19.fa \
-I ${outpath}/align/${item}.sorted.MarkDuplicates.bam \
-bqsr ${outpath}/align/${item}.recal_data.table \
-L /data/lijing/WES_test/hg19_VCF/S31285117_Regions.bed \
-O ${outpath}/align/${item}.sorted.MarkDuplicates.BQSR.bam && \
done
run_line.wes-ngs-pipeline
最新推荐文章于 2024-06-17 15:36:00 发布