SomaticReferenceConfidenceModel
是 GATK(Genome Analysis Toolkit)中的一个类,专门用于体细胞突变的参考置信度计算。体细胞突变(somatic mutations)是指在个体的体细胞中发生的突变,通常与癌症等疾病相关。相比于种系突变(germline mutations),体细胞突变的等位基因频率通常较低,并且它们只出现在肿瘤样本中,而不会出现在正常样本中。
主要功能
SomaticReferenceConfidenceModel
继承自 ReferenceConfidenceModel
,其主要功能是:
-
计算参考置信度:评估在某个基因组位置上,没有观察到变异时的置信度。这对于过滤假阳性结果,或在无突变情况下仍需要进行置信度报告的场景中非常重要。
-
处理低等位基因频率的突变:体细胞突变的等位基因频率往往较低,因此该模型使用了专门的贝塔分布(Beta Distribution)作为等位基因频率的先验分布,以在低频突变中提升准确性。
-
基于堆积(pileup)的基因型似然计算:计算在一个基因组位置上,观察到的读数支持野生型(参考)基因型和其他任何突变基因型的可能性。
-
可选的等位基因过滤:在某些情况下,模型可以选择性地过滤掉弱或噪声等位基因,以提高基因型呼叫的质量。
关键方法
-
calcGenotypeLikelihoodsOfRefVsAny:
- 计算给定样本的堆积(pileup)中参考基因型与其他任何变异基因型的基因型似然。该方法返回一个
ReferenceConfidenceResult
对象,包含了参考基因型的置信度信息。
- 计算给定样本的堆积(pileup)中参考基因型与其他任何变异基因型的基因型似然。该方法返回一个
-
addGenotypeData:
- 将基因型相关的数据添加到
GenotypeBuilder
对象中,通常包括对数似然比(log-odds ratio),用于后续分析。
- 将基因型相关的数据添加到
-
doIndelRefConfCalc:
- 此方法用于处理插入/缺失变异的参考置信度计算。在这个模型中,该方法被设计为不执行任何操作,因为对体细胞插入/缺失变异的处理方式尚未优化。
构造函数
SomaticReferenceConfidenceModel
的构造函数接收多个参数,这些参数定义了如何处理样本、如何使用堆积信息、以及如何设置参考模型的各种参数。重要的参数包括样本列表、最小等位基因频率(minAF
)、是否使用软剪切碱基等。
应用场景
该类主要应用于癌症研究和其他体细胞突变分析领域,特别是在使用 GATK 的 Mutect2 工具进行变异检测时。通过计算参考置信度,SomaticReferenceConfidenceModel
能够帮助研究人员更好地过滤和识别体细胞突变,从而提高变异检测的准确性。
总结来说,SomaticReferenceConfidenceModel
是一个专门为体细胞突变检测优化的参考置信度模型,能够处理低频突变和复杂的基因型环境,为突变检测提供可靠的支持。
SomaticReferenceConfidenceModel类源码
package org.broadinstitute.hellbender.tools.walkers.mutect;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.ReferenceConfidenceModel;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.ReferenceConfidenceResult;
import org.broadinstitute.hellbender.tools.walkers.readorientation.BetaDistributionShape;
import org.broadinstitute.hellbender.utils.MathUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.SampleList;
import org.broadinstitute.hellbender.utils.pileup.PileupElement;
import org.broadinstitute.hellbender.utils.pileup.ReadPileup;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import java.util.*;
public class SomaticReferenceConfidenceModel extends ReferenceConfidenceModel {
private final SampleList samples;
private final Optional<BetaDistributionShape> afPrior;
/**
* Create a new ReferenceConfidenceModel
* @param samples the list of all samples we'll be considering with this model
* @param header the SAMFileHeader describing the read information (used for debugging)
* @param indelInformativeDepthIndelSize the max size of indels to consider when calculating indel informative depths
* @param minAF soft threshold for allele fractions -- above this value prior is nearly flat, below, prior is nearly zero
* @param refModelDelQual reference model deletion quality (if constant)
* @param useSoftClippedBases should soft clipped bases be counted against the reference
* @param isFlowBasedModel is the error model flow based
*/
SomaticReferenceConfidenceModel(final SampleList samples, final SAMFileHeader header, final int indelInformativeDepthIndelSize,
final double minAF, final byte refModelDelQual, final boolean useSoftClippedBases,
final boolean isFlowBasedModel){
super(samples, header, indelInformativeDepthIndelSize, 0, refModelDelQual, useSoftClippedBases, isFlowBasedModel);
Utils.validateArg(minAF >= 0.0 && minAF < 1, "minAF must be < 1 and >= 0");
// To softly cut off allele fractions below minAF, we use a Beta prior of the form Beta(1+epsilon, 1); that is
// the prior on allele fraction f is proportional to f^epsilon. If epsilon is small this prior vanishes as f -> 0
// and very rapidly becomes flat. We choose epsilon such that minAF^epsilon = 0.5.
afPrior = minAF == 0.0 ? Optional.empty() : Optional.of(new BetaDistributionShape(1 - Math.log(2)/Math.log(minAF), 1));
this.samples = samples;
}
/**
* Calculate the genotype likelihoods for the sample in pileup for being hom-ref contrasted with being ref vs. alt
*
* @param ploidy target sample ploidy.
* @param pileup the read backed pileup containing the data we want to evaluate
* @param refBase the reference base at this pileup position
* @param minBaseQual the min base quality for a read in the pileup at the pileup position to be included in the calculation
* @param hqSoftClips running average data structure (can be null) to collect information about the number of high quality soft clips
* @return a RefVsAnyResult genotype call.
*/
@Override
public ReferenceConfidenceResult calcGenotypeLikelihoodsOfRefVsAny(final int ploidy,
final ReadPileup pileup,
final byte refBase,
final byte minBaseQual,
final MathUtils.RunningAverage hqSoftClips,
final boolean readsWereRealigned) {
final SomaticRefVsAnyResult result = new SomaticRefVsAnyResult();
final Map<String, List<GATKRead>> perSampleReadMap = new HashMap<>();
perSampleReadMap.put(samples.getSample(0), pileup.getReads());
final List<Byte> altQuals = new ArrayList<>(pileup.size() / 20);
for (final PileupElement element : pileup) {
if (!element.isDeletion() && element.getQual() <= minBaseQual) {
continue;
}
final boolean isAlt = readsWereRealigned ? isAltAfterAssembly(element, refBase) : isAltBeforeAssembly(element, refBase);
if (isAlt) {
altQuals.add(element.getQual());
result.nonRefDepth++;
} else {
result.refDepth++;
}
}
final double logOdds = Mutect2Engine.logLikelihoodRatio(result.refDepth, altQuals, 1, afPrior);
result.lods = new PerAlleleCollection<>(PerAlleleCollection.Type.ALT_ONLY);
result.lods.set(Allele.NON_REF_ALLELE, logOdds);
return result;
}
@Override
public void addGenotypeData(final ReferenceConfidenceResult result, final GenotypeBuilder gb) {
gb.attribute(GATKVCFConstants.TUMOR_LOG_10_ODDS_KEY, MathUtils.logToLog10(((SomaticRefVsAnyResult)result).lods.get(Allele.NON_REF_ALLELE)));
}
@Override
public void doIndelRefConfCalc(final int ploidy, final byte[] ref, final ReadPileup pileup, final int refOffset, final ReferenceConfidenceResult homRefCalc) {
//NOTE:
// For germline we evaluate the alternative indel reference confidence model, compare with the SNP ref conf
// model results, and return the less confident likelihoods. The existing indel model finds the number of reads
// spanning the current position that are informative for indels of size [-10, 10]bp and calculates a diploid
// genotype likelihood with a constant indel "quality" and up to 40 informative reads. I'm not convinced that
// low allele fraction variants or errors derived from PCR error are well represented in that model.
// Additionally, the first application of this somatic ref conf is for mitochondrial calling. The MT reference
// is quite high complexity so the SNP model should dominate. Until we develop a better model for somatic
// indels, we will rely on the SNP model for all applications and this method (which is called by the parent class)
// will be a noop.
}
}
ReferenceConfidenceModel类源码
package org.broadinstitute.hellbender.tools.walkers.haplotypecaller;
import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.Cigar;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.util.Locatable;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import htsjdk.variant.variantcontext.GenotypeLikelihoods;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFSimpleHeaderLine;
import org.apache.commons.lang3.tuple.Pair;
import org.broadinstitute.hellbender.engine.AssemblyRegion;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.tools.walkers.genotyper.PloidyModel;
import org.broadinstitute.hellbender.tools.walkers.variantutils.PosteriorProbabilitiesUtils;
import org.broadinstitute.hellbender.utils.MathUtils;
import org.broadinstitute.hellbender.utils.Nucleotide;
import org.broadinstitute.hellbender.utils.QualityUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
import org.broadinstitute.hellbender.utils.genotyper.SampleList;
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
import org.broadinstitute.hellbender.utils.pileup.PileupElement;
import org.broadinstitute.hellbender.utils.pileup.ReadPileup;
import org.broadinstitute.hellbender.utils.read.AlignmentUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.hellbender.utils.variant.HomoSapiensConstants;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
/**
* Code for estimating the reference confidence
*
* This code can estimate the probability that the data for a single sample is consistent with a
* well-determined REF/REF diploid genotype.
*
*/
public class ReferenceConfidenceModel {
// Annotation used to cache reference confidence information
public static final String INDEL_INFORMATIVE_BASES_CACHE_ATTRIBUTE_NAME = "IDL";
public static final boolean USE_CACHED_READ_INDEL_INFORMATIVENESS_VALUES = true;
// these are filled only when the softclipping is being reversed
public static final String ORIGINAL_SOFTCLIP_START_TAG = "os";
public static final String ORIGINAL_SOFTCLIP_END_TAG = "oe";
private final SampleList samples;
private final int indelInformativeDepthIndelSize;
private final int numRefSamplesForPrior;
private final byte refModelDeletionQuality;
private final boolean useSoftClippedBases;
private final boolean flowBasedModel;
@VisibleForTesting
protected static final String NON_REF_ALLELE_DESCRIPTION = "Represents any possible alternative allele not already represented at this location by REF and ALT";
private final PosteriorProbabilitiesUti