在 GATK(Genome Analysis Toolkit)库中,ReferenceDataSource 接口用于表示与参考基因组相关的数据源,为访问和操作不同来源的参考基因组数据提供了一种标准化的方式。
ReferenceMemorySource 类和 ReferenceFileSource 类是 ReferenceDataSource 接口的两个实现类,分别用于管理内存中和文件中的参考基因组数据。
ReferenceDataSource 接口概述
ReferenceDataSource 接口定义了对参考基因组数据的访问操作,包括按区间查询参考序列以及获取参考序列字典。它通常用于 GATK 工具中的参考数据处理和访问。
主要功能
- 访问参考数据:提供对参考基因组数据的标准化访问方式。
- 支持不同的数据源:可以从不同的参考数据源(如 FASTA 文件)中获取参考序列。
接口实现:
ReferenceMemorySource 类和 ReferenceFileSource 类实现了 ReferenceDataSource 接口。
ReferenceDataSource接口源代码:
package org.broadinstitute.hellbender.engine;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.reference.ReferenceSequence;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.iterators.ByteArrayIterator;
import org.broadinstitute.hellbender.utils.reference.ReferenceBases;
import java.nio.file.Path;
import java.util.Iterator;
/**
 * Entry point for reading reference bases, primarily through interval queries.
 *
 * The static {@code of} factories choose the backing implementation: a fasta on disk
 * ({@link ReferenceFileSource}) or bases that are already held in memory
 * ({@link ReferenceMemorySource}).
 */
public interface ReferenceDataSource extends GATKDataSource<Byte>, AutoCloseable {

    /**
     * Creates a data source that reads from a fasta file.
     *
     * The fasta must be accompanied by its .fai and .dict companion files.
     *
     * @param fastaPath reference fasta Path
     * @return a new {@link ReferenceFileSource} for {@code fastaPath}
     */
    public static ReferenceDataSource of(final Path fastaPath) {
        final ReferenceDataSource fileBacked = new ReferenceFileSource(fastaPath);
        return fileBacked;
    }

    /**
     * Creates a data source that reads from a fasta file, optionally keeping the bases exactly
     * as they appear in the file.
     *
     * The fasta must be accompanied by its .fai and .dict companion files.
     *
     * When {@code preserveAmbiguityCodesAndCapitalization} is {@code true}, IUPAC bases in the
     * file are NOT converted to `N` and lower-case bases are NOT capitalized.
     *
     * NOTE: Most GATK tools do not support data created with
     * {@code preserveAmbiguityCodesAndCapitalization} set to {@code true}.
     *
     * @param fastaPath reference fasta Path
     * @param preserveAmbiguityCodesAndCapitalization whether to preserve the original bases in
     *        the given reference file
     * @return a new {@link ReferenceFileSource} for {@code fastaPath}
     */
    public static ReferenceDataSource of(final Path fastaPath, final boolean preserveAmbiguityCodesAndCapitalization) {
        final ReferenceDataSource fileBacked =
                new ReferenceFileSource(fastaPath, preserveAmbiguityCodesAndCapitalization);
        return fileBacked;
    }

    /**
     * Creates a data source over bases that are already in memory.
     *
     * @param bases the in-memory reference bases
     * @param referenceSequenceDictionary sequence dictionary corresponding to {@code bases}
     * @return a new {@link ReferenceMemorySource} wrapping the arguments
     */
    public static ReferenceDataSource of(final ReferenceBases bases, final SAMSequenceDictionary referenceSequenceDictionary) {
        final ReferenceDataSource memoryBacked = new ReferenceMemorySource(bases, referenceSequenceDictionary);
        return memoryBacked;
    }

    /**
     * Fetches every base spanning {@code interval} in one call.
     * Use getBases() on the returned ReferenceSequence to obtain the actual bases; the BaseUtils
     * class has guidance on working with bases in this format.
     *
     * This default simply unpacks the interval and forwards to
     * {@link #queryAndPrefetch(String, long, long)}.
     *
     * @param interval query interval
     * @return a ReferenceSequence containing all bases spanning the query interval, prefetched
     */
    public default ReferenceSequence queryAndPrefetch(final SimpleInterval interval) {
        final String contig = interval.getContig();
        final long first = interval.getStart();
        final long last = interval.getEnd();
        return queryAndPrefetch(contig, first, last);
    }

    /**
     * Fetches every base between {@code start} and {@code stop} on {@code contig} in one call.
     * Use getBases() on the returned ReferenceSequence to obtain the actual bases; the BaseUtils
     * class has guidance on working with bases in this format.
     *
     * @param contig query interval contig
     * @param start query interval start
     * @param stop query interval stop
     * @return a ReferenceSequence containing all bases spanning the query interval, prefetched
     */
    public ReferenceSequence queryAndPrefetch(final String contig, final long start , final long stop);

    /**
     * Returns an iterator over the bases spanning {@code interval}.
     *
     * The default prefetches the whole interval via {@link #queryAndPrefetch(SimpleInterval)}
     * and iterates over the resulting array. See the BaseUtils class for guidance on how to
     * work with bases in this format.
     *
     * @param interval query interval
     * @return iterator over the bases spanning the query interval
     */
    @Override
    public default Iterator<Byte> query(final SimpleInterval interval) {
        // TODO: need a way to iterate lazily over reference bases without necessarily loading them all into memory at once
        final ReferenceSequence prefetched = queryAndPrefetch(interval);
        final byte[] rawBases = prefetched.getBases();
        return new ByteArrayIterator(rawBases);
    }

    /**
     * Returns the sequence dictionary describing this reference.
     *
     * @return SAMSequenceDictionary for this reference
     */
    public SAMSequenceDictionary getSequenceDictionary();

    /**
     * Permanently closes this data source. This default has nothing to release, so it is a no-op.
     */
    @Override
    public default void close() {
        // intentionally empty
    }
}
ReferenceMemorySource类源代码:
package org.broadinstitute.hellbender.engine;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.reference.ReferenceSequence;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.hellbender.utils.iterators.ByteArrayIterator;
import org.broadinstitute.hellbender.utils.reference.ReferenceBases;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.Iterator;
/**
 * Manages traversals and queries over in-memory reference data.
 *
 * The data is a single {@link ReferenceBases} block plus the sequence dictionary it belongs to.
 * Supports targeted queries over the reference by interval, but does not
 * yet support complete iteration over the entire reference.
 */
public final class ReferenceMemorySource implements ReferenceDataSource {

    /** The in-memory block of bases; queries are resolved relative to {@code bases.getInterval()}. */
    private final ReferenceBases bases;

    /** Dictionary used to translate contig names into sequence indices. */
    private final SAMSequenceDictionary sequenceDictionary;

    /**
     * Initialize this data source using ReferenceBases and corresponding sequence dictionary.
     *
     * @param bases the in-memory reference bases; passed through {@code Utils.nonNull}
     * @param referenceSequenceDictionary dictionary for the reference; stored as given, without a
     *        null check, and dereferenced by {@link #queryAndPrefetch(String, long, long)}
     */
    public ReferenceMemorySource(final ReferenceBases bases, final SAMSequenceDictionary referenceSequenceDictionary) {
        this.bases = Utils.nonNull(bases);
        this.sequenceDictionary = referenceSequenceDictionary;
    }

    /**
     * Start an iteration over the entire reference. Not yet supported!
     *
     * See the BaseUtils class for guidance on how to work with bases in this format.
     *
     * @return never returns normally
     * @throws UnsupportedOperationException always
     */
    @Override
    public Iterator<Byte> iterator() {
        throw new UnsupportedOperationException("Iteration over entire reference not yet implemented");
    }

    /**
     * Query a specific interval on this reference, and get back an iterator over the bases spanning that interval.
     *
     * The iterator walks the stored array directly, from the offset of {@code interval}'s start
     * relative to the start of the stored bases, for {@code interval.size()} positions.
     * See the BaseUtils class for guidance on how to work with bases in this format.
     *
     * @param interval query interval
     * @return iterator over the bases spanning the query interval
     */
    @Override
    public Iterator<Byte> query( final SimpleInterval interval ) {
        final int startIndex = interval.getStart() - bases.getInterval().getStart();
        final int stopIndex = startIndex + interval.size();
        return new ByteArrayIterator(bases.getBases(), startIndex, stopIndex);
    }

    /**
     * Query a specific interval on this reference, and get back all bases spanning that interval at once.
     * Call getBases() on the returned ReferenceSequence to get the actual reference bases. See the BaseUtils
     * class for guidance on how to work with bases in this format.
     *
     * @param interval query interval
     * @return a ReferenceSequence containing all bases spanning the query interval, prefetched
     */
    @Override
    public ReferenceSequence queryAndPrefetch( final SimpleInterval interval ) {
        return queryAndPrefetch(interval.getContig(), interval.getStart(), interval.getEnd());
    }

    /**
     * Query a specific interval on this reference, and get back all bases spanning that interval at once.
     * Call getBases() on the returned ReferenceSequence to get the actual reference bases. See the BaseUtils
     * class for guidance on how to work with bases in this format.
     *
     * When the request covers exactly the stored block, the array obtained from
     * {@code bases.getBases()} is handed to the result as-is; otherwise the requested range is copied.
     *
     * NOTE(review): {@code contig} is only used to look up the dictionary index and to label the
     * result; it is not compared against the contig of the stored bases.
     *
     * @param contig query interval contig
     * @param start query interval start
     * @param stop query interval stop (included)
     * @return a ReferenceSequence containing all bases spanning the query interval, prefetched
     */
    @Override
    public ReferenceSequence queryAndPrefetch( final String contig, final long start , final long stop) {
        final int contigIndex = sequenceDictionary.getSequenceIndex(contig);
        final byte[] basesBytes = bases.getBases();

        // Keep the arithmetic in long until the request has been validated: narrowing to int first
        // would let a far out-of-range request wrap around into a valid-looking index.
        final long startOffset = start - bases.getInterval().getStart();
        final long length = stop - start + 1;

        if (startOffset == 0 && length == basesBytes.length) {
            // special case: no need to make a copy
            return new ReferenceSequence(contig, contigIndex, basesBytes);
        }

        Utils.validateArg(startOffset >= Integer.MIN_VALUE && startOffset <= Integer.MAX_VALUE,
                () -> String.format("Asking for start %d on contig %s but the ReferenceData starts at %d.", start, contig, bases.getInterval().getStart()));
        Utils.validIndex((int) startOffset, basesBytes.length);
        Utils.validateArg(startOffset + length <= basesBytes.length, () -> String.format("Asking for stop %d on contig %s but the ReferenceData only has data until %d.", stop, contig, bases.getInterval().getEnd()));
        Utils.validateArg(length >= 0, () -> String.format("Asking for stop<start (%d < %d)", stop, start));

        // Both values are now known to lie within [0, basesBytes.length], so the casts are lossless.
        final int from = (int) startOffset;
        final int to = from + (int) length;
        return new ReferenceSequence(contig, contigIndex, Arrays.copyOfRange(basesBytes, from, to));
    }

    /**
     * Get the sequence dictionary for this reference
     *
     * @return SAMSequenceDictionary for this reference, exactly as passed to the constructor
     */
    @Override
    public SAMSequenceDictionary getSequenceDictionary() {
        return sequenceDictionary;
    }

    /**
     * no-op (nothing's open)
     */
    @Override
    public void close() {}
}
ReferenceFileSource类源代码:
package org.broadinstitute.hellbender.engine;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.reference.ReferenceSequence;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.fasta.CachingIndexedFastaSequenceFile;
import java.nio.file.Path;
import java.util.Iterator;
/**
 * Reference data source backed by a file on disk (for now, fasta files only).
 *
 * Interval queries are supported; walking the entire reference through
 * {@link #iterator()} is not implemented yet.
 */
public final class ReferenceFileSource implements ReferenceDataSource {

    /** Message used when whole-reference iteration is requested. */
    private static final String ITERATION_UNSUPPORTED_MESSAGE =
            "Iteration over entire reference not yet implemented";

    /**
     * The underlying reference file. The caching flavour of IndexedFastaSequenceFile is used
     * so that repeated queries over nearby locations stay efficient (the primary reference
     * access pattern in most traversals).
     */
    private final CachingIndexedFastaSequenceFile reference;

    /**
     * Opens a fasta file as a reference data source.
     *
     * The fasta must be accompanied by its .fai and .dict companion files.
     *
     * @param fastaPath reference fasta file
     */
    public ReferenceFileSource(final Path fastaPath) {
        final Path checkedPath = Utils.nonNull(fastaPath);
        // Will throw a UserException if the .fai and/or .dict are missing
        this.reference = new CachingIndexedFastaSequenceFile(checkedPath);
    }

    /**
     * Opens a fasta file as a reference data source, optionally keeping the bases exactly as
     * they appear in the file.
     *
     * The fasta must be accompanied by its .fai and .dict companion files.
     *
     * When {@code preserveFileBases} is {@code true}, IUPAC bases in the file are NOT converted
     * to `N` and lower-case bases are NOT capitalized.
     * NOTE: Most GATK tools do not support data created with {@code preserveFileBases} set to {@code true}.
     *
     * @param fastaPath reference fasta file
     * @param preserveFileBases whether to preserve the original bases in the given reference file
     */
    public ReferenceFileSource(final Path fastaPath, final boolean preserveFileBases) {
        final Path checkedPath = Utils.nonNull(fastaPath);
        // Will throw a UserException if the .fai and/or .dict are missing
        this.reference = new CachingIndexedFastaSequenceFile(checkedPath, preserveFileBases);
    }

    /**
     * Would iterate over the entire reference. Not yet supported!
     *
     * See the BaseUtils class for guidance on how to work with bases in this format.
     *
     * @return never returns normally
     * @throws UnsupportedOperationException always
     */
    @Override
    public Iterator<Byte> iterator() {
        throw new UnsupportedOperationException(ITERATION_UNSUPPORTED_MESSAGE);
    }

    /**
     * Fetches every base between {@code start} and {@code stop} on {@code contig} in one call,
     * by delegating to the underlying fasta file.
     * Use getBases() on the returned ReferenceSequence to obtain the actual bases; the BaseUtils
     * class has guidance on working with bases in this format.
     *
     * @param contig query interval contig
     * @param start query interval start
     * @param stop query interval stop
     * @return a ReferenceSequence containing all bases spanning the query interval, prefetched
     */
    @Override
    public ReferenceSequence queryAndPrefetch(final String contig, final long start, final long stop) {
        final ReferenceSequence subsequence = this.reference.getSubsequenceAt(contig, start, stop);
        return subsequence;
    }

    /**
     * Returns the sequence dictionary reported by the underlying fasta file.
     *
     * @return SAMSequenceDictionary for this reference
     */
    @Override
    public SAMSequenceDictionary getSequenceDictionary() {
        final SAMSequenceDictionary dictionary = this.reference.getSequenceDictionary();
        return dictionary;
    }

    /**
     * Permanently closes this data source by closing the underlying fasta file.
     */
    @Override
    public void close() {
        this.reference.close();
    }
}