在 HTSJDK 库中,SAMSequenceDictionary
和 SAMSequenceRecord
类用于处理和管理基因组数据中的序列信息(contigs)。这两个类通常一起使用,提供了对基因组中所有 contig 的详细描述和访问。
SAMSequenceDictionary
类
主要功能
- 存储序列信息:
SAMSequenceDictionary
存储了一个基因组的所有 contig 的信息,包括 contig 的名称和长度。 - 提供访问接口:提供方法以获取特定 contig 的信息,方便进行序列数据的访问和操作。
SAMSequenceDictionary
类源码:
/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package htsjdk.samtools;
import htsjdk.beta.plugin.HtsHeader;
import htsjdk.samtools.util.Log;
import java.io.Serializable;
import java.math.BigInteger;
import java.security.MessageDigest;
import java.util.*;
import java.util.stream.Collectors;
import static htsjdk.samtools.SAMSequenceRecord.*;
/**
* Collection of SAMSequenceRecords.
*/
public class SAMSequenceDictionary implements HtsHeader, Serializable {
public static final long serialVersionUID = 1L;
private List<SAMSequenceRecord> mSequences = new ArrayList<>();
private final Map<String, SAMSequenceRecord> mSequenceMap = new HashMap<>();
public SAMSequenceDictionary() {
}
public SAMSequenceDictionary(final List<SAMSequenceRecord> list) {
this();
setSequences(list);
}
public List<SAMSequenceRecord> getSequences() {
return Collections.unmodifiableList(mSequences);
}
private static Log log = Log.getInstance(SAMSequenceDictionary.class);
public SAMSequenceRecord getSequence(final String name) {
return mSequenceMap.get(name);
}
/**
* Replaces the existing list of SAMSequenceRecords with the given list.
* Reset the aliases
*
* @param list This value is copied and validated.
*/
public void setSequences(final List<SAMSequenceRecord> list) {
mSequences = new ArrayList<>(list.size());
mSequenceMap.clear();
list.forEach(this::addSequence);
}
public void addSequence(final SAMSequenceRecord sequenceRecord) {
if (mSequenceMap.containsKey(sequenceRecord.getSequenceName())) {
throw new IllegalArgumentException("Cannot add sequence that already exists in SAMSequenceDictionary: " +
sequenceRecord.getSequenceName());
}
sequenceRecord.setSequenceIndex(mSequences.size());
mSequences.add(sequenceRecord);
mSequenceMap.put(sequenceRecord.getSequenceName(), sequenceRecord);
sequenceRecord.getAlternativeSequenceNames().forEach(an -> addSequenceAlias(sequenceRecord.getSequenceName(), an));
}
/**
* @return The SAMSequenceRecord with the given index, or null if index is out of range.
*/
public SAMSequenceRecord getSequence(final int sequenceIndex) {
if (sequenceIndex < 0 || sequenceIndex >= mSequences.size()) {
return null;
}
return mSequences.get(sequenceIndex);
}
/**
* @return The index for the given sequence name, or {@value SAMSequenceRecord#UNAVAILABLE_SEQUENCE_INDEX} if the name is not found.
*/
public int getSequenceIndex(final String sequenceName) {
final SAMSequenceRecord record = mSequenceMap.get(sequenceName);
if (record == null) {
return UNAVAILABLE_SEQUENCE_INDEX;
}
return record.getSequenceIndex();
}
/**
* @return number of SAMSequenceRecord(s) in this dictionary
*/
public int size() {
return mSequences.size();
}
/**
* @return The sum of the lengths of the sequences in this dictionary
*/
public long getReferenceLength() {
return getSequences()
.stream()
.mapToLong(SAMSequenceRecord::getSequenceLength)
.sum();
}
/**
* @return true is the dictionary is empty
*/
public boolean isEmpty() {
return mSequences.isEmpty();
}
private static String DICT_MISMATCH_TEMPLATE = "SAM dictionaries are not the same: %s.";
/**
* Non-comprehensive {@link #equals(Object)}-assertion: instead of calling {@link SAMSequenceRecord#equals(Object)} on constituent
* {@link SAMSequenceRecord}s in this dictionary against its pair in the target dictionary, in order, call
* {@link SAMSequenceRecord#isSameSequence(SAMSequenceRecord)}.
* Aliases are ignored.
*
* @throws AssertionError When the dictionaries are not the same, with some human-readable information as to why
*/
public void assertSameDictionary(final SAMSequenceDictionary that) {
if (this == that) return;
final Iterator<SAMSequenceRecord> thatSequences = that.mSequences.iterator();
for (final SAMSequenceRecord thisSequence : mSequences) {
if (!thatSequences.hasNext()) {
throw new AssertionError(String.format(DICT_MISMATCH_TEMPLATE, thisSequence + " is present in only one dictionary"));
} else {
final SAMSequenceRecord thatSequence = thatSequences.next();
if(!thatSequence.isSameSequence(thisSequence)) {
throw new AssertionError(
String.format(DICT_MISMATCH_TEMPLATE, thatSequence + " was found when " + thisSequence + " was expected")
);
}
}
}
if (thatSequences.hasNext())
throw new AssertionError(String.format(DICT_MISMATCH_TEMPLATE, thatSequences.next() + " is present in only one dictionary"));
}
/**
* Non-comprehensive {@link #equals(Object)}-validation: instead of calling {@link SAMSequenceRecord#equals(Object)} on constituent
* {@link SAMSequenceRecord}s in this dictionary against its pair in the target dictionary, in order, call
* {@link SAMSequenceRecord#isSameSequence(SAMSequenceRecord)}.
*
* @param that {@link SAMSequenceDictionary} to compare against
* @return true if the dictionaries are the same, false otherwise
*
*/
public boolean isSameDictionary(final SAMSequenceDictionary that) {
if (that == null || that.mSequences == null) return false;
if (this == that) return true;
final Iterator<SAMSequenceRecord> thatSequences = that.mSequences.iterator();
for (final SAMSequenceRecord thisSequence : mSequences) {
if (!thatSequences.hasNext()) {
return false;
} else {
final SAMSequenceRecord thatSequence = thatSequences.next();
if (!thatSequence.isSameSequence(thisSequence)) {
return false;
}
}
}
return !thatSequences.hasNext();
}
/**
* Returns {@code true} if the two dictionaries are the same.
*
* <p>NOTE: Aliases are NOT considered, but alternative sequence names (AN tag) names ARE.
*/
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
SAMSequenceDictionary that = (SAMSequenceDictionary) o;
return mSequences.equals(that.mSequences);
}
/**
* Add an alias to a SAMSequenceRecord. This can be use to provide some
* alternate names fo a given contig. e.g:
* <code>1,chr1,chr01,01,CM000663,NC_000001.10</code> e.g:
* <code>MT,chrM</code>
*
* <p>NOTE: this method does not add the alias to the alternative sequence name tag (AN) in the SAMSequenceRecord.
* If you would like to add it to the AN tag, use {@link #addAlternativeSequenceName(String, String)} instead.
*
* @param originalName existing contig name
* @param altName new contig name
* @return the contig associated to the 'originalName/altName'
*/
public SAMSequenceRecord addSequenceAlias(final String originalName,
final String altName) {
if (originalName == null) throw new IllegalArgumentException("original name cannot be null");
if (altName == null) throw new IllegalArgumentException("alt name cannot be null");
final SAMSequenceRecord originalSeqRecord = getSequence(originalName);
if (originalSeqRecord == null) throw new IllegalArgumentException("Sequence " + originalName + " doesn't exist in dictionary.");
// same name, nothing to do
if (originalName.equals(altName)) return originalSeqRecord;
final SAMSequenceRecord altSeqRecord = getSequence(altName);
if (altSeqRecord != null) {
// alias was already set to the same record
if (altSeqRecord.equals(originalSeqRecord)) return originalSeqRecord;
// alias was already set to another record
throw new IllegalArgumentException("Alias " + altName + " for " + originalSeqRecord +
" was already set to " + altSeqRecord.getSequenceName());
}
mSequenceMap.put(altName, originalSeqRecord);
return originalSeqRecord;
}
/**
* Add an alternative sequence name (AN tag) to a SAMSequenceRecord, including it into the aliases
* to retrieve the contigs (as with {@link #addSequenceAlias(String, String)}.
*
* <p>This can be use to provide some alternate names fo a given contig. e.g:
* <code>1,chr1,chr01,01,CM000663</code> or
* <code>MT,chrM</code>.
*
* @param originalName existing contig name
* @param altName new contig name
* @return the contig associated to the 'originalName/altName', with the AN tag including the altName
*/
public SAMSequenceRecord addAlternativeSequenceName(final String originalName,
final String altName) {
final SAMSequenceRecord record = addSequenceAlias(originalName, altName);
record.addAlternativeSequenceName(altName);
return record;
}
/**
* return a MD5 sum for ths dictionary, the checksum is re-computed each
* time this method is called.
*
* <pre>
* md5( (seq1.md5_if_available) + ' '+(seq2.name+seq2.length) + ' '+...)
* </pre>
*
* @return a MD5 checksum for this dictionary or the empty string if it is
* empty
*/
public String md5() {
if (isEmpty())
return "";
try {
final MessageDigest md5 = MessageDigest.getInstance("MD5");
md5.reset();
for (final SAMSequenceRecord samSequenceRecord : mSequences) {
if (samSequenceRecord.getSequenceIndex() > 0)
md5.update((byte) ' ');
final String md5_tag = samSequenceRecord.getAttribute(SAMSequenceRecord.MD5_TAG);
if (md5_tag != null) {
md5.update(md5_tag.getBytes());
} else {
md5.update(samSequenceRecord.getSequenceName().getBytes());
md5.update(String.valueOf(samSequenceRecord.getSequenceLength()).getBytes());
}
}
String hash = new BigInteger(1, md5.digest()).toString(16);
if (hash.length() != 32) {
final String zeros = "00000000000000000000000000000000";
hash = zeros.substring(0, 32 - hash.length()) + hash;
}
return hash;
} catch (Exception e) {
throw new RuntimeException(e);
}
}
@Override
public int hashCode() {
return mSequences.hashCode();
}
@Override
public String toString() {
return "SAMSequenceDictionary:( sequences:"+ size()+
" length:"+ getReferenceLength()+" "+
" md5:"+md5()+")";
}
public static final List<String> DEFAULT_DICTIONARY_EQUAL_TAG = Arrays.asList(
SAMSequenceRecord.MD5_TAG,
SAMSequenceRecord.SEQUENCE_LENGTH_TAG);
/**
* Will merge dictionaryTags from two dictionaries into one focusing on merging the tags rather than the sequences.
*
* Requires that dictionaries have the same SAMSequence records in the same order.
* For each sequenceIndex, the union of the tags from both sequences will be added to the new sequence, mismatching
* values (for tags that are in both) will generate a warning, and the value from dict1 will be used.
* For tags that are in tagsToEquate an unequal value will generate an error (an IllegalArgumentException will
* be thrown.) tagsToEquate must include LN and MD.
*
* @param dict1 first dictionary
* @param dict2 first dictionary
* @param tagsToMatch list of tags that must be equal if present in both sequence. Must contain MD, and LN
* @return dictionary consisting of the same sequences as the two inputs with the merged values of tags.
*/
public static SAMSequenceDictionary mergeDictionaries(final SAMSequenceDictionary dict1,
final SAMSequenceDictionary dict2,
final List<String> tagsToMatch) {
// We require MD and LN to match.
if (!tagsToMatch.contains(MD5_TAG) || !tagsToMatch.contains(SEQUENCE_LENGTH_TAG)) {
throw new IllegalArgumentException("Both " + MD5_TAG + " and " + SEQUENCE_LENGTH_TAG + " must be matched " +
"when merging dictionaries. Found: " + String.join(",", tagsToMatch));
}
if (!dict1.getSequences().stream().map(SAMSequenceRecord::getSequenceName).collect(Collectors.toList()).equals(
dict2.getSequences().stream().map(SAMSequenceRecord::getSequenceName).collect(Collectors.toList()))) {
throw new IllegalArgumentException(String.format("Do not use this function to merge dictionaries with " +
"different sequences in them. Sequences must be in the same order as well. Found [%s] and [%s].",
dict1.getSequences().stream().map(SAMSequenceRecord::getSequenceName).collect(Collectors.joining(", ")),
dict2.getSequences().stream().map(SAMSequenceRecord::getSequenceName).collect(Collectors.joining(", "))));
}
final SAMSequenceDictionary finalDict = new SAMSequenceDictionary();
for (int sequenceIndex = 0; sequenceIndex < dict1.getSequences().size(); sequenceIndex++) {
final SAMSequenceRecord s1 = dict1.getSequence(sequenceIndex);
final SAMSequenceRecord s2 = dict2.getSequence(sequenceIndex);
final String sName = s1.getSequenceName();
final SAMSequenceRecord sMerged = new SAMSequenceRecord(sName, UNKNOWN_SEQUENCE_LENGTH);
finalDict.addSequence(sMerged);
final Set<String> allTags = new HashSet<>();
s1.getAttributes().forEach(a -> allTags.add(a.getKey()));
s2.getAttributes().forEach(a -> allTags.add(a.getKey()));
for (final String tag : allTags) {
final String value1 = s1.getAttribute(tag);
final String value2 = s2.getAttribute(tag);
if (value1 != null && value2 != null && !value1.equals(value2)) {
String baseMessage = String.format("Found sequence entry for which " +
"tags differ: %s and tag %s has the two values: %s and %s.",
sName, tag, value1, value2);
if (tagsToMatch.contains(tag)) {
log.error("Cannot merge dictionaries. ", baseMessage);
throw new IllegalArgumentException("Cannot merge dictionaries. " + baseMessage);
} else {
log.warn(baseMessage, " Using ", value1);
}
}
sMerged.setAttribute(tag, value1 == null ? value2 : value1);
}
final int length1 = s1.getSequenceLength();
final int length2 = s2.getSequenceLength();
if (length1 != UNKNOWN_SEQUENCE_LENGTH && length2 != UNKNOWN_SEQUENCE_LENGTH && length1 != length2) {
throw new IllegalArgumentException(String.format("Cannot merge the two dictionaries. " +
"Found sequence entry for which " + "lengths differ: %s has lengths %s and %s", sName, length1, length2));
}
sMerged.setSequenceLength(length1 == UNKNOWN_SEQUENCE_LENGTH ? length2 : length1);
}
return finalDict;
}
}
SAMSequenceRecord
类
SAMSequenceRecord
是一个类,用于表示单个 contig 的详细信息。它包含了 contig 的基本信息,如名称和长度。
主要功能
- 描述 contig:提供关于 contig 的详细信息,如名称和长度。
- 与
SAMSequenceDictionary
配合使用:SAMSequenceRecord
对象通常通过SAMSequenceDictionary
来管理和访问。
SAMSequenceRecord
类源码:
/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package htsjdk.samtools;
import htsjdk.samtools.util.Locatable;
import htsjdk.samtools.util.StringUtil;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
* Header information about a reference sequence. Corresponds to @SQ header record in SAM text header.
*/
public class SAMSequenceRecord extends AbstractSAMHeaderRecord implements Cloneable, Locatable {
public static final long serialVersionUID = 1L; // AbstractSAMHeaderRecord implements Serializable
public static final int UNAVAILABLE_SEQUENCE_INDEX = -1;
private final String mSequenceName; // Value must be interned() if it's ever set/modified
private Set<String> mAlternativeSequenceName = new LinkedHashSet<>();
private int mSequenceIndex = UNAVAILABLE_SEQUENCE_INDEX;
private int mSequenceLength = 0;
public static final String SEQUENCE_NAME_TAG = "SN";
public static final String ALTERNATIVE_SEQUENCE_NAME_TAG = "AN";
public static final String SEQUENCE_LENGTH_TAG = "LN";
public static final String MD5_TAG = "M5";
public static final String ASSEMBLY_TAG = "AS";
public static final String URI_TAG = "UR";
public static final String SPECIES_TAG = "SP";
public static final String DESCRIPTION_TAG = "DS";
/**
* If one sequence has this length, and another sequence had a different length, isSameSequence will
* not complain that they are different sequences.
*/
public static final int UNKNOWN_SEQUENCE_LENGTH = 0;
/**
* This is not a valid sequence name, because it is reserved in the RNEXT field of SAM text format
* to mean "same reference as RNAME field."
*/
public static final String RESERVED_RNEXT_SEQUENCE_NAME = "=";
/* use RESERVED_RNEXT_SEQUENCE_NAME instead. */
@Deprecated
public static final String RESERVED_MRNM_SEQUENCE_NAME = RESERVED_RNEXT_SEQUENCE_NAME;
/**
* The standard tags are stored in text header without type information, because the type of these tags is known.
*/
public static final Set<String> STANDARD_TAGS =
new HashSet<>(Arrays.asList(SEQUENCE_NAME_TAG, SEQUENCE_LENGTH_TAG, ASSEMBLY_TAG, ALTERNATIVE_SEQUENCE_NAME_TAG, MD5_TAG, URI_TAG, SPECIES_TAG));
// These are the chars matched by \\s.
private static final char[] WHITESPACE_CHARS = {' ', '\t', '\n', '\013', '\f', '\r'}; // \013 is vertical tab
// alternative sequence name separator
private static final String ALTERNATIVE_SEQUENCE_NAME_SEPARATOR = ",";
private static final Pattern LEGAL_RNAME_PATTERN = Pattern.compile("[0-9A-Za-z!#$%&+./:;?@^_|~-][0-9A-Za-z!#$%&*+./:;=?@^_|~-]*");
/**
* @deprecated Use {@link #SAMSequenceRecord(String, int)} instead.
* sequenceLength is required for the object to be considered valid.
*/
@Deprecated
public SAMSequenceRecord(final String name) {
this(name, UNKNOWN_SEQUENCE_LENGTH);
}
public SAMSequenceRecord(final String name, final int sequenceLength) {
if (name != null) {
validateSequenceName(name);
mSequenceName = name.intern();
} else {
mSequenceName = null;
}
mSequenceLength = sequenceLength;
}
public String getSequenceName() {
return mSequenceName;
}
public int getSequenceLength() {
return mSequenceLength;
}
public SAMSequenceRecord setSequenceLength(final int value) {
mSequenceLength = value;
return this;
}
public String getAssembly() {
return (String) getAttribute(ASSEMBLY_TAG);
}
public SAMSequenceRecord setAssembly(final String value) {
setAttribute(ASSEMBLY_TAG, value);
return this;
}
public String getSpecies() {
return (String) getAttribute(SPECIES_TAG);
}
public SAMSequenceRecord setSpecies(final String value) {
setAttribute(SPECIES_TAG, value);
return this;
}
public String getMd5() {
return (String) getAttribute(MD5_TAG);
}
public SAMSequenceRecord setMd5(final String value) {
setAttribute(MD5_TAG, value);
return this;
}
public String getDescription() {
return getAttribute(DESCRIPTION_TAG);
}
public SAMSequenceRecord setDescription(final String value) {
setAttribute(DESCRIPTION_TAG, value);
return this;
}
/**
* @return Index of this record in the sequence dictionary it lives in.
*/
public int getSequenceIndex() {
return mSequenceIndex;
}
// Private state used only by SAM implementation.
public SAMSequenceRecord setSequenceIndex(final int value) {
mSequenceIndex = value;
return this;
}
/**
* Returns unmodifiable set with alternative sequence names.
*/
public Set<String> getAlternativeSequenceNames() {
final String anTag = getAttribute(ALTERNATIVE_SEQUENCE_NAME_TAG);
return (anTag == null) ? Collections.emptySet()
: Collections.unmodifiableSet(new LinkedHashSet<>(Arrays.asList(anTag.split(ALTERNATIVE_SEQUENCE_NAME_SEPARATOR))));
}
/**
* Adds an alternative sequence name if it is not the same as the sequence name or it is not present already.
*/
public void addAlternativeSequenceName(final String name) {
final Set<String> altSequences = new HashSet<>(getAlternativeSequenceNames());
if (!mSequenceName.equals(name)) {
altSequences.add(name);
}
encodeAltSequences(altSequences);
}
/**
* Sets the alternative sequence names in the order provided by iteration, removing the previous values.
*/
public SAMSequenceRecord setAlternativeSequenceName(final Collection<String> alternativeSequences) {
if (alternativeSequences == null) {
setAttribute(ALTERNATIVE_SEQUENCE_NAME_TAG, null);
} else {
// encode all alt sequence names
encodeAltSequences(alternativeSequences);
}
return this;
}
private static void validateAltRegExp(final String name) {
if (!LEGAL_RNAME_PATTERN.matcher(name).matches()) {
throw new IllegalArgumentException(String.format("Invalid alternative sequence name '%s': do not match the pattern %s", name, LEGAL_RNAME_PATTERN));
}
}
private void encodeAltSequences(final Collection<String> alternativeSequences) {
//make sure that the order in which alternate names are joined is determined
setAttribute(ALTERNATIVE_SEQUENCE_NAME_TAG, alternativeSequences.isEmpty() ? null : alternativeSequences.stream()
.sorted()
.distinct()
.peek(SAMSequenceRecord::validateAltRegExp)
.collect(Collectors.joining(ALTERNATIVE_SEQUENCE_NAME_SEPARATOR)));
}
/**
* Returns {@code true} if there are alternative sequence names; {@code false} otherwise.
*/
public boolean hasAlternativeSequenceNames() {
return getAttribute(ALTERNATIVE_SEQUENCE_NAME_TAG) != null;
}
/**
* Looser comparison than equals(). We look only at sequence index, sequence length, and MD5 tag value
* (or sequence names, if there is no MD5 tag in either record.
*/
public boolean isSameSequence(final SAMSequenceRecord that) {
if (this == that) {
return true;
}
if (that == null) {
return false;
}
if (mSequenceIndex != that.mSequenceIndex) {
return false;
}
// PIC-439. Allow undefined length.
if (mSequenceLength != UNKNOWN_SEQUENCE_LENGTH && that.mSequenceLength != UNKNOWN_SEQUENCE_LENGTH && mSequenceLength != that.mSequenceLength) {
return false;
}
if (this.getAttribute(SAMSequenceRecord.MD5_TAG) != null && that.getAttribute(SAMSequenceRecord.MD5_TAG) != null) {
final BigInteger thisMd5 = new BigInteger((String) this.getAttribute(SAMSequenceRecord.MD5_TAG), 16);
final BigInteger thatMd5 = new BigInteger((String) that.getAttribute(SAMSequenceRecord.MD5_TAG), 16);
if (!thisMd5.equals(thatMd5)) {
return false;
}
} else {
// Compare using == since we intern() the Strings
if (mSequenceName != that.mSequenceName) {
// if they are different, they could still be the same based on the alternative sequences
if (getAlternativeSequenceNames().contains(that.mSequenceName) ||
that.getAlternativeSequenceNames().contains(mSequenceName)) {
return true;
}
return false;
}
}
return true;
}
@Override
public boolean equals(final Object o) {
if (this == o) {
return true;
}
if (!(o instanceof SAMSequenceRecord)) {
return false;
}
final SAMSequenceRecord that = (SAMSequenceRecord) o;
if (mSequenceIndex != that.mSequenceIndex) {
return false;
}
if (mSequenceLength != that.mSequenceLength) {
return false;
}
if (!attributesEqual(that)) {
return false;
}
if (mSequenceName != that.mSequenceName) {
return false; // Compare using == since we intern() the name
}
if (!getAlternativeSequenceNames().equals(that.getAlternativeSequenceNames())) {
return false;
}
return true;
}
@Override
public int hashCode() {
return mSequenceName != null ? mSequenceName.hashCode() : 0;
}
@Override
Set<String> getStandardTags() {
return STANDARD_TAGS;
}
@Override
public final SAMSequenceRecord clone() {
final SAMSequenceRecord ret = new SAMSequenceRecord(this.mSequenceName, this.mSequenceLength);
ret.mSequenceIndex = this.mSequenceIndex;
for (final Map.Entry<String, String> entry : this.getAttributes()) {
ret.setAttribute(entry.getKey(), entry.getValue());
}
return ret;
}
/**
* Truncate sequence name at first whitespace.
*/
public static String truncateSequenceName(final String sequenceName) {
/*
* Instead of using regex split, do it manually for better performance.
*/
int truncateAt = sequenceName.length();
for (final char c : WHITESPACE_CHARS) {
int index = sequenceName.indexOf(c);
if (index != UNAVAILABLE_SEQUENCE_INDEX && index < truncateAt) {
truncateAt = index;
}
}
return sequenceName.substring(0, truncateAt);
}
/**
* Throw an exception if the sequence name is not valid.
*/
public static void validateSequenceName(final String name) {
if (!LEGAL_RNAME_PATTERN.matcher(name).useAnchoringBounds(true).matches()) {
throw new SAMException(String.format("Sequence name '%s' doesn't match regex: '%s' ", name, LEGAL_RNAME_PATTERN));
}
}
@Override
public String toString() {
return String.format(
"SAMSequenceRecord(name=%s,length=%s,dict_index=%s,assembly=%s,alternate_names=%s)",
getSequenceName(),
getSequenceLength(),
getSequenceIndex(),
getAssembly(),
getAlternativeSequenceNames()
);
}
@Override
public String getSAMString() {
return new SAMTextHeaderCodec().getSQLine(this);
}
/**
* always returns <code>getSequenceName()</code>
*
* @see #getSequenceName()
*/
@Override
public final String getContig() {
return this.getSequenceName();
}
/**
* always returns 1
*/
@Override
public final int getStart() {
return 1;
}
/**
* always returns <code>getSequenceLength()</code>
*
* @see #getSequenceLength()
*/
@Override
public final int getEnd() {
return this.getSequenceLength();
}
}