WES分析6-BaseRecalibrator

Base Recalibrator

snp=/home/lzn/WES/genome/resources_broad_hg38_v0_1000G_phase1.snps.high_confidence.hg38.vcf.gz
indel=/home/lzn/WES/genome/resources_broad_hg38_v0_Mills_and_1000G_gold_standard.indels.hg38.vcf.gz
dbsnp=/home/lzn/WES/genome/resources_broad_hg38_v0_Homo_sapiens_assembly38.dbsnp138.vcf
ref=/home/lzn/WES/genome/resources_broad_hg38_v0_Homo_sapiens_assembly38.fasta
wkd=/home/lzn/WES

for sample in `cat $wkd/sample.list`
do
time gatk BaseRecalibratorSpark \
-I $wkd/align/$sample.sorted.dedup.bam \
--known-sites $dbsnp \
--known-sites $snp \
--known-sites $indel \
-R $ref \
-O $wkd/align/$sample.recal.table
done
#The current best set of known indels to be used for local realignment (note that we don't use dbSNP for this anymore); use both files:
#1000G_phase1.indels.b37.vcf (currently from the 1000 Genomes Phase I indel calls)
#Mills_and_1000G_gold_standard.indels.b37.sites.vcf

#A USER ERROR has occurred: Fasta dict file for reference /home/lzn/WES/genome/resources_broad_hg38_v0_Homo_sapiens_assembly38.fasta does not exist.
#生成参考基因组的dict⽂件
gatk CreateSequenceDictionary -R $ref -O $wkd/genome/resources_broad_hg38_v0_Homo_sapiens_assembly38.dict
-rw-r--r-- 1 lzn lzn 519K 10月  4 18:57 resources_broad_hg38_v0_Homo_sapiens_assembly38.dict


#Unable to parse header with error: Your input file has a malformed header: We never saw the required CHROM header line (starting with one #) for the input VCF file, for input source: /tmp/spark-150caabe-db69-4100-9e81-a407a6108bc0/userFiles-5f8c99c2-f11b-4592-bfd2-53de9221536f/resources_broad_hg38_v0_1000G_phase1.snps.high_confidence.hg38.vcf.vcf
https://gatk.broadinstitute.org/hc/en-us/community/posts/360060067271-BaseRecalibrator-raised-htsjdk-tribble-TribbleException-MalformedFeatureFile
#在https://console.cloud.google.com/storage/browser/genomics-public-data/resources/broad/hg38/v0
#重新下载文件,用浏览器自带的下载器下载,不能用IDM下载。


#UserException$MissingIndex: An index is required but was not found for file /tmp/spark-353b4306-b709-4d04-b54c-8584ad07bc7d/userFiles-81f7295f-4251-4a19-b0f5-56e79d52032a/resources_broad_hg38_v0_1000G_phase1.snps.high_confidence.hg38.vcf.gz:/tmp/spark-353b4306-b709-4d04-b54c-8584ad07bc7d/userFiles-81f7295f-4251-4a19-b0f5-56e79d52032a/resources_broad_hg38_v0_1000G_phase1.snps.high_confidence.hg38.vcf.gz. Support for unindexed block-compressed files has been temporarily disabled. Try running IndexFeatureFile on the input.
gatk IndexFeatureFile -I $snp
gatk IndexFeatureFile -I $indel
-rw-r--r-- 1 lzn lzn 2.7M 10月  4 19:16 resources_broad_hg38_v0_1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi
-rw-r--r-- 1 lzn lzn 2.1M 10月  4 19:17 resources_broad_hg38_v0_Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi


#UserException$MissingReferenceFaiFile: Fasta index file file:///tmp/spark-7410e8cd-e804-4bb2-8921-f28e65d0107b/userFiles-47a14da6-6a91-4588-96bc-eb2f137097ee/resources_broad_hg38_v0_Homo_sapiens_assembly38.fasta.fai for reference file:///tmp/spark-7410e8cd-e804-4bb2-8921-f28e65d0107b/userFiles-47a14da6-6a91-4588-96bc-eb2f137097ee/resources_broad_hg38_v0_Homo_sapiens_assembly38.fasta does not exist.
samtools faidx $ref
-rw-r--r-- 1 lzn lzn 158K 10月  4 19:25 resources_broad_hg38_v0_Homo_sapiens_assembly38.fasta.fai

#UserException$MissingContigInSequenceDictionary: Contig chr5_KV575243v1_alt not present in the sequence dictionary  主要有可能是ref或参考的vcf文件的序列名称可能包含很多细节的片段,而这些在另一个文件中并没有。
#下载 resources_broad_hg38_v0_Homo_sapiens_assembly38.dbsnp138.vcf
#	resources_broad_hg38_v0_Homo_sapiens_assembly38.dbsnp138.vcf.idx
#依然报错。
#发现自己生成的resources_broad_hg38_v0_Homo_sapiens_assembly38.dict文件大小(519KB)比网站提供的文件小(568kb),下载网站提供的文件试试。  依旧报错。
#在http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/ 下载human_g1k_v37.fasta.fai和human_g1k_v37.fasta.gz 
wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.fai &
wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz &

重新下载数据

cd /home/lzn/WES/genome
#ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/b37/

wget ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/b37/human_g1k_v37.fasta.gz
wget ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/b37/human_g1k_v37.fasta.fai.gz
wget ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/b37/human_g1k_v37.dict.gz
gunzip -f human_g1k_v37*.gz

wget ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/b37/dbsnp_138.b37.vcf.gz
wget ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/b37/dbsnp_138.b37.vcf.idx.gz
wget ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/b37/1000G_phase1.indels.b37.vcf.gz
wget ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/b37/1000G_phase1.indels.b37.vcf.idx.gz
gunzip -f dbsnp_138.b37.vcf*.gz
gunzip 1000G_phase1.indels.b37.vcf*.gz


snp=/home/lzn/WES/genome/dbsnp_138.b37.vcf
indel=/home/lzn/WES/genome/1000G_phase1.indels.b37.vcf 
ref=/home/lzn/WES/genome/human_g1k_v37.fasta 
wkd=/home/lzn/WES

for sample in `cat $wkd/sample.list`
do
time gatk BaseRecalibratorSpark \
-I $wkd/align/$sample.sorted.dedup.bam \
--known-sites $snp \
--known-sites $indel \
-R $ref \
-O $wkd/align/$sample.recal.table
done

#A USER ERROR has occurred: Input files reference and reads have incompatible contigs: No overlapping contigs found.
#原因:reference contigs = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, X, Y, MT...]
#reads contigs = [chr1, chr10, chr11, chr11_KI270721v1_random, chr12, chr13, chr14, chr14_GL000009v2_random...] 名字不同,并且一些contigs在reference contigs 中没有。
#解决办法:use the rererence that was used to map with bwa

ref=/home/lzn/WES/genome/resources_broad_hg38_v0_Homo_sapiens_assembly38.fasta
#UserException$MissingContigInSequenceDictionary: Contig chr2_KQ031384v1_fix not present in the sequence dictionary 

wget ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/b37/human_g1k_v37_decoy.fasta.gz
wget ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/b37/human_g1k_v37_decoy.dict.gz
wget ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/b37/human_g1k_v37_decoy.fasta.fai.gz
gunzip human_g1k_v37_decoy*.gz

ref=/home/lzn/WES/genome/human_g1k_v37_decoy.fasta


重新比对后

snp=/home/lzn/WES/genome/resources_broad_hg38_v0_1000G_phase1.snps.high_confidence.hg38.vcf.gz
indel=/home/lzn/WES/genome/resources_broad_hg38_v0_Mills_and_1000G_gold_standard.indels.hg38.vcf.gz
dbsnp=/home/lzn/WES/genome/resources_broad_hg38_v0_Homo_sapiens_assembly38.dbsnp138.vcf
ref=/home/lzn/WES/genome/resources_broad_hg38_v0_Homo_sapiens_assembly38.fasta
wkd=/home/lzn/WES

for sample in `cat $wkd/sample.list`
do
time gatk BaseRecalibratorSpark \
-I $wkd/align/$sample.sorted.dedup.bam \
--known-sites $dbsnp \
--known-sites $snp \
--known-sites $indel \
-R $ref \
-O $wkd/align/$sample.recal.table
done
#上述问题已解决 map和BaseRecalibrator必须使用同一reference

for sample in `cat $wkd/sample.list`
do
time gatk ApplyBQSRSpark \
-bqsr $wkd/align/$sample.recal.table \
-R $ref \
-I $wkd/align/$sample.sorted.dedup.bam \
-O $wkd/align/$sample.sorted.dedup.recal.bam
done

#Plot a single recalibration table
   gatk AnalyzeCovariates \
     -bqsr recal1.table \
     -plots AnalyzeCovariates.pdf
#缺少‘gplots’包
libarry(BiocManager)
install("gplots")
install("gsalib")
  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值