在GATK(Genome Analysis Toolkit)中,FeatureDataSource类和FeatureManager类是处理基因组特征数据的关键组件。下面是对这两个类的简要介绍:
FeatureDataSource类
FeatureDataSource是一个用于管理和访问与基因组数据相关的特征数据(如变异、基因组注释等)的类。它主要用于提供对这些数据源的访问,支持从不同的数据格式和位置加载特征数据。这个类通常在GATK的工具和算法中用于读取和处理基因组特征数据。
主要功能:
- 加载数据:从文件、数据库等位置加载基因组特征数据。
- 提供接口:为其他组件和算法提供访问基因组特征数据的接口。
- 数据管理:管理数据源的元数据和配置。
源码:
package org.broadinstitute.hellbender.engine;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Locatable;
import htsjdk.tribble.*;
import htsjdk.variant.bcf2.BCF2Codec;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFCodec;
import htsjdk.variant.vcf.VCFHeader;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.IndexFeatureFile;
import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBConstants;
import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBOptions;
import org.broadinstitute.hellbender.utils.IndexUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.tools.sv.SVFeaturesHeader;
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
import org.broadinstitute.hellbender.utils.io.BlockCompressedIntervalStream.Reader;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.genomicsdb.model.GenomicsDBExportConfiguration;
import org.genomicsdb.reader.GenomicsDBFeatureReader;
import java.io.File;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.nio.channels.SeekableByteChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.function.Function;
import static org.broadinstitute.hellbender.tools.genomicsdb.GATKGenomicsDBUtils.createExportConfiguration;
import static org.broadinstitute.hellbender.utils.io.BlockCompressedIntervalStream.BCI_FILE_EXTENSION;
/**
* Enables traversals and queries over sources of Features, which are metadata associated with a location
* on the genome in a format supported by our file parsing framework, Tribble. Examples of Features are
* VCF records and hapmap records.
* <p>
* Two basic operations are available on this data source:
* <p>
* -Iteration over all Features in this data source, optionally restricted to Features overlapping
* a set of intervals if intervals are provided via {@link #setIntervalsForTraversal(List)}. Traversal
* by a set of intervals requires the file to have been indexed using the bundled tool IndexFeatureFile.
* The set of intervals provided MUST be non-overlapping and sorted in increasing order of start position.
* <p>
* -Targeted queries by one interval at a time. This also requires the file to have been indexed using
* the bundled tool IndexFeatureFile. Targeted queries by one interval at a time are unaffected by
* any intervals for full traversal set via {@link #setIntervalsForTraversal(List)}.
* <p>
* To improve performance in the case of targeted queries by one interval at a time, this class uses a caching
* scheme that is optimized for the common access pattern of multiple separate queries over intervals with
* gradually increasing start positions. It optimizes for this use case by pre-fetching records immediately
* following each interval during a query and caching them. Performance will suffer if the access pattern is
* random, involves queries over intervals with DECREASING start positions instead of INCREASING start positions,
* or involves lots of very large jumps forward on the genome or lots of contig switches. Query caching
* can be disabled, if desired.
*
* @param <T> The type of Feature returned by this data source
*/
public final class FeatureDataSource<T extends Feature> implements GATKDataSource<T>, AutoCloseable {
/**
* Logger for this class.
*/
private static final Logger logger = LogManager.getLogger(FeatureDataSource.class);
/**
* Feature reader used to retrieve records from our file
*/
private final FeatureReader<T> featureReader;
/**
* Iterator representing an open traversal over this data source initiated via a call to {@link #iterator}
* (null if there is no open traversal). We need this to ensure that each iterator is properly closed,
* and to enforce the constraint (required by Tribble) that we never have more than one iterator open
* over our feature reader.
*/
private CloseableTribbleIterator<T> currentIterator;
/**
* Our intervals for traversal. If set, restricts full traversals initiated via {@link #iterator} to
* return only Features overlapping this set of intervals. Does not affect individual queries
* initiated via {@link #query(SimpleInterval)} and/or {@link #queryAndPrefetch(Locatable)}.
*/
private List<SimpleInterval> intervalsForTraversal;
/**
* Cache containing Features from recent queries initiated via {@link #query(SimpleInterval)} and/or
* {@link #queryAndPrefetch(Locatable)}. This is guaranteed to start at the start position of the
* most recent query, but will typically end well after the end of the most recent query. Designed to
* improve performance of the common access pattern involving multiple queries across nearby intervals
* with gradually increasing start positions.
*/
private final FeatureCache<T> queryCache;
/**
* When we experience a cache miss (i.e., a query interval not fully contained within our cache) and need
* to re-populate the Feature cache from disk to satisfy a query, this controls the number of extra bases
* AFTER the end of our interval to fetch. Should be sufficiently large so that typically a significant number
* of subsequent queries will be cache hits (i.e., query intervals fully contained within our cache) before
* we have another cache miss and need to go to disk again.
*/
private final int queryLookaheadBases;
/**
* Holds information about the path this datasource reads from.
*/
private final FeatureInput<T> featureInput;
/**
* True if this datasource is backed by a file that has an associated index file, false if it doesn't
*/
private final boolean hasIndex;
/**
* True if this datasource supports efficient random access queries.
* <p>
* For a file, this is the same as {@link #hasIndex}, but there are non-file data sources (e.g., GenomicsDB)
* that don't have a separate index file but do support random access.
*/
private final boolean supportsRandomAccess;
/**
* Default value for queryLookaheadBases (a count of bases), if none is specified. This is designed to be
* large enough so that in typical usage (i.e., query intervals with gradually increasing start locations)
* there will be a substantial number of cache hits between cache misses, reducing the number of times we
* need to repopulate the cache from disk.
*/
public static final int DEFAULT_QUERY_LOOKAHEAD_BASES = 1000;
/**
* Creates a FeatureDataSource backed by the provided File, looking ahead the default number of bases
* ({@link #DEFAULT_QUERY_LOOKAHEAD_BASES}) during queries that produce cache misses.
* <p>
* No logical name is supplied: {@code null} is forwarded as the name, and the String-path constructor
* that this delegation chain reaches substitutes the feature path (here, the file's absolute path)
* whenever the name is null.
*
* @param featureFile file containing Features
*/
public FeatureDataSource(final File featureFile) {
// Delegate to the (File, String) constructor with a null logical name.
this(featureFile, null);
}
/**
* Creates a FeatureDataSource backed by the provided path, looking ahead the default number of bases
* ({@link #DEFAULT_QUERY_LOOKAHEAD_BASES}) during queries that produce cache misses.
* <p>
* Both the logical name and the target feature type are passed as {@code null}. The four-argument
* constructor replaces a null name with {@code featurePath} itself, and documents a null target
* feature type as an unrestricted codec search.
*
* @param featurePath path or URI to source of Features
*/
public FeatureDataSource(final String featurePath) {
// null name -> falls back to featurePath; null targetFeatureType -> unrestricted codec search.
this(featurePath, null, DEFAULT_QUERY_LOOKAHEAD_BASES, null);
}
/**
* Creates a FeatureDataSource backed by the provided File and assigns this data source the specified logical
* name. We will look ahead the default number of bases ({@link #DEFAULT_QUERY_LOOKAHEAD_BASES}) during queries
* that produce cache misses.
* <p>
* Simply delegates to the (File, String, int) constructor, supplying the default lookahead.
*
* @param featureFile file containing Features
* @param name logical name for this data source (may be null)
*/
public FeatureDataSource(final File featureFile, final String name) {
this(featureFile, name, DEFAULT_QUERY_LOOKAHEAD_BASES);
}
/**
* Creates a FeatureDataSource backed by the provided File and assigns this data source the specified logical
* name. We will look ahead the specified number of bases during queries that produce cache misses.
* <p>
* The file is converted to its absolute path and handed to the String-path constructor together with a
* {@code null} target feature type, which that constructor documents as an unrestricted codec search.
*
* @param featureFile file containing Features; passed through {@code Utils.nonNull} before use
* (NOTE(review): presumably this rejects a null file -- confirm against Utils)
* @param name logical name for this data source (may be null)
* @param queryLookaheadBases look ahead this many bases during queries that produce cache misses
*/
public FeatureDataSource(final File featureFile, final String name, final int queryLookaheadBases) {
// The File is resolved to an absolute path string; no target feature type is specified.
this(Utils.nonNull(featureFile).getAbsolutePath(), name, queryLookaheadBases, null);
}
/**
 * Builds a FeatureDataSource that reads from the resource located at {@code featurePath}.
 * <p>
 * The path is wrapped in a new {@link FeatureInput}. When {@code name} is null, the path itself is used
 * as the input's name. Construction then continues in the {@link FeatureInput}-based constructor.
 *
 * @param featurePath         path to file or GenomicsDB url containing features
 * @param name                logical name for this data source (may be null, in which case
 *                            {@code featurePath} is used instead)
 * @param queryLookaheadBases number of bases to look ahead during queries that produce cache misses
 * @param targetFeatureType   restricts the {@link FeatureCodec} search for this data source to codecs
 *                            producing this type of Feature; may be null for an unrestricted search
 */
public FeatureDataSource(final String featurePath, final String name, final int queryLookaheadBases, final Class<? extends Feature> targetFeatureType) {
    this(
            new FeatureInput<>(featurePath, name == null ? featurePath : name),
            queryLookaheadBases,
            targetFeatureType);
}
/**
* Creates a FeatureDataSource backed by the provided FeatureInput. We will look ahead the specified number of bases
* during queries that produce cache misses.
* <p>
* Delegates to the constructor that also accepts cloud prefetch buffer sizes, passing 0 for both the data
* and the index buffer; that constructor documents 0 as disabling the corresponding prefetch wrapper.
*
* @param featureInput a FeatureInput specifying a source of Features
* @param queryLookaheadBases look ahead this many bases during queries that produce cache misses
* @param targetFeatureType When searching for a {@link FeatureCodec} for this data source, restrict the search to codecs
* that produce this type of Feature. May be null, which results in an unrestricted search.
*/
public FeatureDataSource(final FeatureInput<T> featureInput, final int queryLookaheadBases, final Class<? extends Feature> targetFeatureType) {
// 0, 0 = cloudPrefetchBuffer and cloudIndexPrefetchBuffer.
this(featureInput, queryLookaheadBases, targetFeatureType, 0, 0);
}
/**
 * Builds a FeatureDataSource that reads from the resource located at {@code featurePath}, with
 * caller-specified cloud prefetch buffer sizes.
 * <p>
 * The path is wrapped in a new {@link FeatureInput}. When {@code name} is null, the path itself is used
 * as the input's name. Construction then continues in the {@link FeatureInput}-based constructor.
 *
 * @param featurePath              path to file or GenomicsDB url containing features
 * @param name                     logical name for this data source (may be null, in which case
 *                                 {@code featurePath} is used instead)
 * @param queryLookaheadBases      number of bases to look ahead during queries that produce cache misses
 * @param targetFeatureType        restricts the {@link FeatureCodec} search for this data source to codecs
 *                                 producing this type of Feature; may be null for an unrestricted search
 * @param cloudPrefetchBuffer      MB size of caching/prefetching wrapper for the data, if on Google Cloud
 *                                 (0 to disable)
 * @param cloudIndexPrefetchBuffer MB size of caching/prefetching wrapper for the index, if on Google Cloud
 *                                 (0 to disable)
 */
public FeatureDataSource(final String featurePath, final String name, final int queryLookaheadBases, final Class<? extends Feature> targetFeatureType,
                         final int cloudPrefetchBuffer, final int cloudIndexPrefetchBuffer) {
    this(
            new FeatureInput<>(featurePath, name == null ? featurePath : name),
            queryLookaheadBases,
            targetFeatureType,
            cloudPrefetchBuffer,
            cloudIndexPrefetchBuffer);
}
/**
* Creates a FeatureDataSource backed by the provided FeatureInput. We will look ahead the specified number of bases
* during queries that produce cache misses.
* <p>
* Delegates to a seven-argument constructor, supplying a default-constructed {@link GenomicsDBOptions}
* and {@code false} for the final argument.
* NOTE(review): that constructor is declared outside the portion of the file visible here, so the meaning
* of the trailing boolean should be confirmed against its declaration.
*
* @param featureInput a FeatureInput specifying a source of Features
* @param queryLookaheadBases look ahead this many bases during queries that produce cache misses
* @param targetFeatureType When searching for a {@link FeatureCodec} for this data source, restrict the search to codecs
* that produce this type of Feature. May be null, which results in an unrestricted search.
* @param cloudPrefetchBuffer MB size of caching/prefetching wrapper for the data, if on Google Cloud (0 to disable).
* @param cloudIndexPrefetchBuffer MB size of caching/prefetching wrapper for the index, if on Google Cloud (0 to disable).
*/
public FeatureDataSource(final FeatureInput<T> featureInput, final int queryLookaheadBases, final Class<? extends Feature> targetFeatureType,
final int cloudPrefetchBuffer, final int cloudIndexPrefetchBuffer) {
this(featureInput, queryLookaheadBases, targetFeatureType, cloudPrefetchBuffer, cloudIndexPrefetchBuffer,
new GenomicsDBOptions(), false);
}
/**
* Creates a FeatureDataSource backed by the provided FeatureInput. We will look ahead the specified number of bases
* during queries that produce cache misses.
*
* @param featureInput a FeatureInput specifying a source of Features
* @param queryLookaheadBases look ahead this many bases during queries that produce cache misses
* @