htsjdk库SAMSequenceDictionary和SAMSequenceRecord类介绍

最新推荐文章于 2024-08-03 21:02:21 发布

qq_27390023

最新推荐文章于 2024-08-03 21:02:21 发布

阅读量164

点赞数 1

文章标签： java 生物信息学

本文链接：https://blog.csdn.net/qq_27390023/article/details/140891400

版权

在 HTSJDK 库中，SAMSequenceDictionary 和 SAMSequenceRecord 类用于处理和管理基因组数据中的序列信息（contigs）。这两个类通常一起使用，提供了对基因组中所有 contig 的详细描述和访问。

`SAMSequenceDictionary` 类

主要功能

存储序列信息：SAMSequenceDictionary 存储了一个基因组的所有 contig 的信息，包括 contig 的名称和长度。
提供访问接口：提供方法以获取特定 contig 的信息，方便进行序列数据的访问和操作。

`SAMSequenceDictionary` 类源码：

/*
 * The MIT License
 *
 * Copyright (c) 2009 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package htsjdk.samtools;

import htsjdk.beta.plugin.HtsHeader;
import htsjdk.samtools.util.Log;

import java.io.Serializable;
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.*;
import java.util.stream.Collectors;


import static htsjdk.samtools.SAMSequenceRecord.*;

/**
 * Ordered collection of {@link SAMSequenceRecord}s.
 *
 * <p>Records are kept in insertion order in a list, and are additionally indexed by sequence name
 * and by any registered alias in a name-to-record map.
 */
public class SAMSequenceDictionary implements HtsHeader, Serializable {
    public static final long serialVersionUID = 1L;

    /**
     * Tag list containing {@link SAMSequenceRecord#MD5_TAG} and {@link SAMSequenceRecord#SEQUENCE_LENGTH_TAG},
     * the two tags that {@link #mergeDictionaries(SAMSequenceDictionary, SAMSequenceDictionary, List)}
     * requires to be present in its {@code tagsToMatch} argument.
     */
    public static final List<String> DEFAULT_DICTIONARY_EQUAL_TAG = Arrays.asList(
            SAMSequenceRecord.MD5_TAG,
            SAMSequenceRecord.SEQUENCE_LENGTH_TAG);

    private static final Log log = Log.getInstance(SAMSequenceDictionary.class);

    private static final String DICT_MISMATCH_TEMPLATE = "SAM dictionaries are not the same: %s.";

    /** Records in dictionary order; a record's sequence index is its position in this list. */
    private List<SAMSequenceRecord> mSequences = new ArrayList<>();

    /** Lookup by sequence name and by alias; several keys may map to the same record. */
    private final Map<String, SAMSequenceRecord> mSequenceMap = new HashMap<>();

    /** Creates an empty dictionary. */
    public SAMSequenceDictionary() {
    }

    /**
     * Creates a dictionary holding the given records, in the given order.
     *
     * @param list records to add; see {@link #setSequences(List)}
     */
    public SAMSequenceDictionary(final List<SAMSequenceRecord> list) {
        this();
        setSequences(list);
    }

    /**
     * @return an unmodifiable view of the records, in dictionary order
     */
    public List<SAMSequenceRecord> getSequences() {
        return Collections.unmodifiableList(mSequences);
    }

    /**
     * @param name a sequence name or a registered alias
     * @return the record registered under {@code name}, or null if there is none
     */
    public SAMSequenceRecord getSequence(final String name) {
        return mSequenceMap.get(name);
    }

    /**
     * Replaces the existing list of SAMSequenceRecords with the given list.
     * All previously registered names and aliases are discarded.
     *
     * @param list records to add, in order; each one goes through {@link #addSequence(SAMSequenceRecord)}
     */
    public void setSequences(final List<SAMSequenceRecord> list) {
        mSequences = new ArrayList<>(list.size());
        mSequenceMap.clear();
        list.forEach(this::addSequence);
    }

    /**
     * Appends a record to the dictionary, sets the record's sequence index to its position, and
     * registers each of the record's alternative sequence names as an alias.
     *
     * <p>NOTE: the record is appended before its alternative names are registered, so if one of them
     * collides with a name already mapped to a different record, the exception from
     * {@link #addSequenceAlias(String, String)} leaves the record (and any aliases registered so far)
     * in the dictionary.
     *
     * @param sequenceRecord record to append
     * @throws IllegalArgumentException if the record's name is already registered in this dictionary
     */
    public void addSequence(final SAMSequenceRecord sequenceRecord) {
        final String name = sequenceRecord.getSequenceName();
        if (mSequenceMap.containsKey(name)) {
            throw new IllegalArgumentException("Cannot add sequence that already exists in SAMSequenceDictionary: " +
                    name);
        }
        sequenceRecord.setSequenceIndex(mSequences.size());
        mSequences.add(sequenceRecord);
        mSequenceMap.put(name, sequenceRecord);
        for (final String alternativeName : sequenceRecord.getAlternativeSequenceNames()) {
            addSequenceAlias(name, alternativeName);
        }
    }

    /**
     * @return The SAMSequenceRecord with the given index, or null if index is out of range.
     */
    public SAMSequenceRecord getSequence(final int sequenceIndex) {
        final boolean inRange = sequenceIndex >= 0 && sequenceIndex < mSequences.size();
        return inRange ? mSequences.get(sequenceIndex) : null;
    }

    /**
     * @return The index for the given sequence name, or {@value SAMSequenceRecord#UNAVAILABLE_SEQUENCE_INDEX} if the name is not found.
     */
    public int getSequenceIndex(final String sequenceName) {
        final SAMSequenceRecord record = mSequenceMap.get(sequenceName);
        return record == null ? UNAVAILABLE_SEQUENCE_INDEX : record.getSequenceIndex();
    }

    /**
     * @return number of SAMSequenceRecord(s) in this dictionary
     */
    public int size() {
        return mSequences.size();
    }

    /**
     * @return The sum of the lengths of the sequences in this dictionary
     */
    public long getReferenceLength() {
        return mSequences.stream()
                .mapToLong(SAMSequenceRecord::getSequenceLength)
                .sum();
    }

    /**
     * @return true if the dictionary is empty
     */
    public boolean isEmpty() {
        return mSequences.isEmpty();
    }

    /**
     * Non-comprehensive {@link #equals(Object)}-assertion: instead of calling {@link SAMSequenceRecord#equals(Object)} on constituent
     * {@link SAMSequenceRecord}s in this dictionary against its pair in the target dictionary, in order, call
     * {@link SAMSequenceRecord#isSameSequence(SAMSequenceRecord)}.
     * Aliases are ignored.
     *
     * @param that dictionary to compare against; a null dictionary is reported as a mismatch
     * @throws AssertionError When the dictionaries are not the same, with some human-readable information as to why
     */
    public void assertSameDictionary(final SAMSequenceDictionary that) {
        if (this == that) {
            return;
        }
        // Report a null argument the same way as any other mismatch instead of failing with an NPE.
        if (that == null) {
            throw new AssertionError(String.format(DICT_MISMATCH_TEMPLATE, "the other dictionary is null"));
        }

        final Iterator<SAMSequenceRecord> thatSequences = that.mSequences.iterator();
        for (final SAMSequenceRecord thisSequence : mSequences) {
            if (!thatSequences.hasNext()) {
                throw new AssertionError(String.format(DICT_MISMATCH_TEMPLATE, thisSequence + " is present in only one dictionary"));
            }
            final SAMSequenceRecord thatSequence = thatSequences.next();
            if (!thatSequence.isSameSequence(thisSequence)) {
                throw new AssertionError(
                        String.format(DICT_MISMATCH_TEMPLATE, thatSequence + " was found when " + thisSequence + " was expected")
                );
            }
        }
        if (thatSequences.hasNext()) {
            throw new AssertionError(String.format(DICT_MISMATCH_TEMPLATE, thatSequences.next() + " is present in only one dictionary"));
        }
    }

    /**
     * Non-comprehensive {@link #equals(Object)}-validation: instead of calling {@link SAMSequenceRecord#equals(Object)} on constituent
     * {@link SAMSequenceRecord}s in this dictionary against its pair in the target dictionary, in order, call
     * {@link SAMSequenceRecord#isSameSequence(SAMSequenceRecord)}.
     *
     * @param that {@link SAMSequenceDictionary} to compare against
     * @return true if the dictionaries are the same, false otherwise
     */
    public boolean isSameDictionary(final SAMSequenceDictionary that) {
        if (that == null || that.mSequences == null) {
            return false;
        }
        if (this == that) {
            return true;
        }
        if (mSequences.size() != that.mSequences.size()) {
            return false;
        }
        for (int i = 0; i < mSequences.size(); i++) {
            if (!that.mSequences.get(i).isSameSequence(mSequences.get(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Returns {@code true} if the two dictionaries are the same.
     *
     * <p>NOTE: Aliases are NOT considered, but alternative sequence names (AN tag) names ARE.
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }
        final SAMSequenceDictionary that = (SAMSequenceDictionary) o;
        return mSequences.equals(that.mSequences);
    }

    /**
     * Add an alias to a SAMSequenceRecord. This can be used to provide some
     * alternate names for a given contig. e.g:
     * <code>1,chr1,chr01,01,CM000663,NC_000001.10</code> e.g:
     * <code>MT,chrM</code>
     *
     * <p>NOTE: this method does not add the alias to the alternative sequence name tag (AN) in the SAMSequenceRecord.
     * If you would like to add it to the AN tag, use {@link #addAlternativeSequenceName(String, String)} instead.
     *
     * @param originalName  existing contig name
     * @param altName       new contig name
     * @return the contig associated to the 'originalName/altName'
     * @throws IllegalArgumentException if either name is null, if {@code originalName} is not in the
     *                                  dictionary, or if {@code altName} is already mapped to a different record
     */
    public SAMSequenceRecord addSequenceAlias(final String originalName,
            final String altName) {
        if (originalName == null) throw new IllegalArgumentException("original name cannot be null");
        if (altName == null) throw new IllegalArgumentException("alt name cannot be null");
        final SAMSequenceRecord originalSeqRecord = getSequence(originalName);
        if (originalSeqRecord == null) throw new IllegalArgumentException("Sequence " + originalName + " doesn't exist in dictionary.");
        // same name, nothing to do
        if (originalName.equals(altName)) return originalSeqRecord;
        final SAMSequenceRecord altSeqRecord = getSequence(altName);
        if (altSeqRecord != null) {
            // alias was already set to the same record
            if (altSeqRecord.equals(originalSeqRecord)) return originalSeqRecord;
            // alias was already set to another record
            throw new IllegalArgumentException("Alias " + altName + " for " + originalSeqRecord +
                    " was already set to " + altSeqRecord.getSequenceName());
        }
        mSequenceMap.put(altName, originalSeqRecord);
        return originalSeqRecord;
    }

    /**
     * Add an alternative sequence name (AN tag) to a SAMSequenceRecord, including it into the aliases
     * to retrieve the contigs (as with {@link #addSequenceAlias(String, String)}).
     *
     * <p>This can be used to provide some alternate names for a given contig. e.g:
     * <code>1,chr1,chr01,01,CM000663</code> or
     * <code>MT,chrM</code>.
     *
     * @param originalName  existing contig name
     * @param altName       new contig name
     * @return the contig associated to the 'originalName/altName', with the AN tag including the altName
     */
    public SAMSequenceRecord addAlternativeSequenceName(final String originalName,
            final String altName) {
        final SAMSequenceRecord record = addSequenceAlias(originalName, altName);
        record.addAlternativeSequenceName(altName);
        return record;
    }

    /**
     * return a MD5 sum for this dictionary, the checksum is re-computed each
     * time this method is called.
     *
     * <pre>
     * md5( (seq1.md5_if_available) + ' '+(seq2.name+seq2.length) + ' '+...)
     * </pre>
     *
     * <p>Text is encoded as UTF-8 so the result does not depend on the platform default charset.
     *
     * @return a MD5 checksum for this dictionary, as 32 lowercase hexadecimal characters, or the
     *         empty string if the dictionary is empty
     */
    public String md5() {
        if (isEmpty()) {
            return "";
        }
        final MessageDigest digest;
        try {
            digest = MessageDigest.getInstance("MD5");
        } catch (final NoSuchAlgorithmException e) {
            throw new RuntimeException(e);
        }

        // The separator is driven by the position in this dictionary rather than by the record's own
        // sequence index, because that index is mutable state that another dictionary may overwrite.
        boolean first = true;
        for (final SAMSequenceRecord record : mSequences) {
            if (!first) {
                digest.update((byte) ' ');
            }
            first = false;

            final String md5Tag = record.getAttribute(SAMSequenceRecord.MD5_TAG);
            if (md5Tag != null) {
                digest.update(md5Tag.getBytes(StandardCharsets.UTF_8));
            } else {
                digest.update(record.getSequenceName().getBytes(StandardCharsets.UTF_8));
                digest.update(String.valueOf(record.getSequenceLength()).getBytes(StandardCharsets.UTF_8));
            }
        }

        // BigInteger drops leading zeros, so left-pad back to the full 32 hex digits.
        final StringBuilder hex = new StringBuilder(new BigInteger(1, digest.digest()).toString(16));
        while (hex.length() < 32) {
            hex.insert(0, '0');
        }
        return hex.toString();
    }

    @Override
    public int hashCode() {
        return mSequences.hashCode();
    }

    @Override
    public String toString() {
        return "SAMSequenceDictionary:( sequences:"+ size()+
                " length:"+ getReferenceLength()+" "+
                " md5:"+md5()+")";
    }

    /**
     * Will merge dictionaryTags from two dictionaries into one focusing on merging the tags rather than the sequences.
     *
     * Requires that dictionaries have the same SAMSequence records in the same order.
     * For each sequenceIndex, the union of the tags from both sequences will be added to the new sequence, mismatching
     * values (for tags that are in both) will generate a warning, and the value from dict1 will be used.
     * For tags that are in tagsToMatch an unequal value will generate an error (an IllegalArgumentException will
     * be thrown.) tagsToMatch must include LN and M5.
     *
     * @param dict1 first dictionary
     * @param dict2 second dictionary
     * @param tagsToMatch list of tags that must be equal if present in both sequence. Must contain M5, and LN
     * @return dictionary consisting of the same sequences as the two inputs with the merged values of tags.
     * @throws IllegalArgumentException if {@code tagsToMatch} lacks M5 or LN, if the two dictionaries do not
     *                                  list the same sequence names in the same order, if a tag in
     *                                  {@code tagsToMatch} has different values in the two records, or if
     *                                  two known sequence lengths differ
     */
    public static SAMSequenceDictionary mergeDictionaries(final SAMSequenceDictionary dict1,
                                                          final SAMSequenceDictionary dict2,
                                                          final List<String> tagsToMatch) {

        // We require M5 and LN to match.
        if (!tagsToMatch.contains(MD5_TAG) || !tagsToMatch.contains(SEQUENCE_LENGTH_TAG)) {
            throw new IllegalArgumentException("Both " + MD5_TAG + " and " + SEQUENCE_LENGTH_TAG + " must be matched " +
                    "when merging dictionaries. Found: " + String.join(",", tagsToMatch));
        }

        final List<String> names1 = dict1.getSequences().stream()
                .map(SAMSequenceRecord::getSequenceName)
                .collect(Collectors.toList());
        final List<String> names2 = dict2.getSequences().stream()
                .map(SAMSequenceRecord::getSequenceName)
                .collect(Collectors.toList());
        if (!names1.equals(names2)) {
            throw new IllegalArgumentException(String.format("Do not use this function to merge dictionaries with " +
                            "different sequences in them. Sequences must be in the same order as well. Found [%s] and [%s].",
                    String.join(", ", names1),
                    String.join(", ", names2)));
        }

        final SAMSequenceDictionary finalDict = new SAMSequenceDictionary();
        for (int sequenceIndex = 0; sequenceIndex < dict1.size(); sequenceIndex++) {
            final SAMSequenceRecord s1 = dict1.getSequence(sequenceIndex);
            final SAMSequenceRecord s2 = dict2.getSequence(sequenceIndex);

            final String sName = s1.getSequenceName();
            final SAMSequenceRecord sMerged = new SAMSequenceRecord(sName, UNKNOWN_SEQUENCE_LENGTH);
            finalDict.addSequence(sMerged);

            // Union of the attribute keys present on either record.
            final Set<String> allTags = new HashSet<>();
            s1.getAttributes().forEach(a -> allTags.add(a.getKey()));
            s2.getAttributes().forEach(a -> allTags.add(a.getKey()));

            for (final String tag : allTags) {
                final String value1 = s1.getAttribute(tag);
                final String value2 = s2.getAttribute(tag);

                if (value1 != null && value2 != null && !value1.equals(value2)) {
                    final String baseMessage = String.format("Found sequence entry for which " +
                                    "tags differ: %s and tag %s has the two values: %s and %s.",
                            sName, tag, value1, value2);

                    if (tagsToMatch.contains(tag)) {
                        log.error("Cannot merge dictionaries. ", baseMessage);
                        throw new IllegalArgumentException("Cannot merge dictionaries. " + baseMessage);
                    } else {
                        log.warn(baseMessage, " Using ", value1);
                    }
                }
                // dict1's value wins when both are present.
                sMerged.setAttribute(tag, value1 == null ? value2 : value1);
            }

            final int length1 = s1.getSequenceLength();
            final int length2 = s2.getSequenceLength();

            // An unknown length on one side is compatible with any length on the other.
            if (length1 != UNKNOWN_SEQUENCE_LENGTH && length2 != UNKNOWN_SEQUENCE_LENGTH && length1 != length2) {
                throw new IllegalArgumentException(String.format("Cannot merge the two dictionaries. " +
                        "Found sequence entry for which " + "lengths differ: %s has lengths %s and %s", sName, length1, length2));
            }
            sMerged.setSequenceLength(length1 == UNKNOWN_SEQUENCE_LENGTH ? length2 : length1);
        }
        return finalDict;
    }
}

`SAMSequenceRecord` 类

SAMSequenceRecord 是一个类，用于表示单个 contig 的详细信息。它包含了 contig 的基本信息，如名称和长度。

主要功能

描述 contig：提供关于 contig 的详细信息，如名称和长度。
与 SAMSequenceDictionary 配合使用：SAMSequenceRecord 对象通常通过 SAMSequenceDictionary 来管理和访问。

`SAMSequenceRecord` 类源码：

/*
 * The MIT License
 *
 * Copyright (c) 2009 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package htsjdk.samtools;

import htsjdk.samtools.util.Locatable;
import htsjdk.samtools.util.StringUtil;

import java.math.BigInteger;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
 * Header information about a reference sequence.  Corresponds to {@code @SQ} header record in SAM text header.
 *
 * <p>The sequence name, length and dictionary index are held in fields; every other tag (including the
 * alternative sequence names, AN) is stored through the inherited attribute accessors.
 */
public class SAMSequenceRecord extends AbstractSAMHeaderRecord implements Cloneable, Locatable {
    public static final long serialVersionUID = 1L; // AbstractSAMHeaderRecord implements Serializable

    /** Sequence index of a record that has not been given one. */
    public static final int UNAVAILABLE_SEQUENCE_INDEX = -1;

    private final String mSequenceName; // Always interned (or null), which is what allows == comparisons below
    // NOTE(review): this field is never read or written by any code in this class; the alternative
    // names actually live in the AN attribute. Kept as-is; confirm before removing.
    private Set<String> mAlternativeSequenceName = new LinkedHashSet<>();
    private int mSequenceIndex = UNAVAILABLE_SEQUENCE_INDEX;
    private int mSequenceLength = 0;

    public static final String SEQUENCE_NAME_TAG = "SN";
    public static final String ALTERNATIVE_SEQUENCE_NAME_TAG = "AN";
    public static final String SEQUENCE_LENGTH_TAG = "LN";
    public static final String MD5_TAG = "M5";
    public static final String ASSEMBLY_TAG = "AS";
    public static final String URI_TAG = "UR";
    public static final String SPECIES_TAG = "SP";
    public static final String DESCRIPTION_TAG = "DS";

    /**
     * If one sequence has this length, and another sequence had a different length, isSameSequence will
     * not complain that they are different sequences.
     */
    public static final int UNKNOWN_SEQUENCE_LENGTH = 0;

    /**
     * This is not a valid sequence name, because it is reserved in the RNEXT field of SAM text format
     * to mean "same reference as RNAME field."
     */
    public static final String RESERVED_RNEXT_SEQUENCE_NAME = "=";

    /**
     * @deprecated use {@link #RESERVED_RNEXT_SEQUENCE_NAME} instead.
     */
    @Deprecated
    public static final String RESERVED_MRNM_SEQUENCE_NAME = RESERVED_RNEXT_SEQUENCE_NAME;

    /**
     * The standard tags are stored in text header without type information, because the type of these tags is known.
     */
    public static final Set<String> STANDARD_TAGS =
            new HashSet<>(Arrays.asList(SEQUENCE_NAME_TAG, SEQUENCE_LENGTH_TAG, ASSEMBLY_TAG, ALTERNATIVE_SEQUENCE_NAME_TAG, MD5_TAG, URI_TAG, SPECIES_TAG));

    // These are the chars matched by \\s.
    private static final char[] WHITESPACE_CHARS = {' ', '\t', '\n', '\013', '\f', '\r'}; // \013 is vertical tab

    // alternative sequence name separator
    private static final String ALTERNATIVE_SEQUENCE_NAME_SEPARATOR = ",";
    private static final Pattern LEGAL_RNAME_PATTERN = Pattern.compile("[0-9A-Za-z!#$%&+./:;?@^_|~-][0-9A-Za-z!#$%&*+./:;=?@^_|~-]*");

    /**
     * @deprecated Use {@link #SAMSequenceRecord(String, int)} instead.
     * sequenceLength is required for the object to be considered valid.
     */
    @Deprecated
    public SAMSequenceRecord(final String name) {
        this(name, UNKNOWN_SEQUENCE_LENGTH);
    }

    /**
     * Creates a record with the given name and length.
     *
     * @param name           sequence name; may be null, otherwise it is validated with
     *                       {@link #validateSequenceName(String)} and interned
     * @param sequenceLength sequence length
     * @throws SAMException if a non-null name is not a legal sequence name
     */
    public SAMSequenceRecord(final String name, final int sequenceLength) {
        if (name == null) {
            mSequenceName = null;
        } else {
            validateSequenceName(name);
            mSequenceName = name.intern();
        }
        mSequenceLength = sequenceLength;
    }

    public String getSequenceName() {
        return mSequenceName;
    }

    public int getSequenceLength() {
        return mSequenceLength;
    }

    public SAMSequenceRecord setSequenceLength(final int value) {
        mSequenceLength = value;
        return this;
    }

    public String getAssembly() {
        return getAttribute(ASSEMBLY_TAG);
    }

    public SAMSequenceRecord setAssembly(final String value) {
        setAttribute(ASSEMBLY_TAG, value);
        return this;
    }

    public String getSpecies() {
        return getAttribute(SPECIES_TAG);
    }

    public SAMSequenceRecord setSpecies(final String value) {
        setAttribute(SPECIES_TAG, value);
        return this;
    }

    public String getMd5() {
        return getAttribute(MD5_TAG);
    }

    public SAMSequenceRecord setMd5(final String value) {
        setAttribute(MD5_TAG, value);
        return this;
    }

    public String getDescription() {
        return getAttribute(DESCRIPTION_TAG);
    }

    public SAMSequenceRecord setDescription(final String value) {
        setAttribute(DESCRIPTION_TAG, value);
        return this;
    }

    /**
     * @return Index of this record in the sequence dictionary it lives in.
     */
    public int getSequenceIndex() {
        return mSequenceIndex;
    }

    /**
     * Sets the index of this record. Private state used only by SAM implementation.
     *
     * @return this record, for chaining
     */
    public SAMSequenceRecord setSequenceIndex(final int value) {
        mSequenceIndex = value;
        return this;
    }

    /**
     * Returns unmodifiable set with alternative sequence names, decoded from the AN attribute;
     * empty if the attribute is not set.
     */
    public Set<String> getAlternativeSequenceNames() {
        final String anTag = getAttribute(ALTERNATIVE_SEQUENCE_NAME_TAG);
        if (anTag == null) {
            return Collections.emptySet();
        }
        final String[] names = anTag.split(ALTERNATIVE_SEQUENCE_NAME_SEPARATOR);
        return Collections.unmodifiableSet(new LinkedHashSet<>(Arrays.asList(names)));
    }

    /**
     * Adds an alternative sequence name if it is not the same as the sequence name or it is not present already.
     *
     * @param name alternative name to add; must not be null
     * @throws IllegalArgumentException if {@code name} is null, or if any resulting alternative name is
     *                                  not a legal sequence name
     */
    public void addAlternativeSequenceName(final String name) {
        if (name == null) {
            throw new IllegalArgumentException("alternative sequence name cannot be null");
        }
        final Set<String> altSequences = new HashSet<>(getAlternativeSequenceNames());

        // Compare from the non-null side: the record's own name may be null.
        if (!name.equals(mSequenceName)) {
            altSequences.add(name);
        }
        encodeAltSequences(altSequences);
    }

    /**
     * Sets the alternative sequence names, removing the previous values. The names are stored
     * sorted and de-duplicated, not in the iteration order of the argument.
     *
     * @param alternativeSequences names to store; null or empty clears the AN attribute
     * @return this record, for chaining
     * @throws IllegalArgumentException if any name is not a legal sequence name
     */
    public SAMSequenceRecord setAlternativeSequenceName(final Collection<String> alternativeSequences) {
        if (alternativeSequences == null) {
            setAttribute(ALTERNATIVE_SEQUENCE_NAME_TAG, null);
        } else {
            // encode all alt sequence names
            encodeAltSequences(alternativeSequences);
        }
        return this;
    }

    /** Rejects an alternative name that does not match the legal sequence-name pattern. */
    private static void validateAltRegExp(final String name) {
        if (!LEGAL_RNAME_PATTERN.matcher(name).matches()) {
            throw new IllegalArgumentException(String.format("Invalid alternative sequence name '%s': do not match the pattern %s", name, LEGAL_RNAME_PATTERN));
        }
    }

    /** Validates the given names and writes them to the AN attribute; an empty collection clears it. */
    private void encodeAltSequences(final Collection<String> alternativeSequences) {
        if (alternativeSequences.isEmpty()) {
            setAttribute(ALTERNATIVE_SEQUENCE_NAME_TAG, null);
            return;
        }

        //make sure that the order in which alternate names are joined is determined
        final List<String> names = alternativeSequences.stream()
                .sorted()
                .distinct()
                .collect(Collectors.toList());
        // Validate explicitly rather than as a side effect inside the stream pipeline.
        names.forEach(SAMSequenceRecord::validateAltRegExp);
        setAttribute(ALTERNATIVE_SEQUENCE_NAME_TAG, String.join(ALTERNATIVE_SEQUENCE_NAME_SEPARATOR, names));
    }

    /**
     * Returns {@code true} if there are alternative sequence names; {@code false} otherwise.
     */
    public boolean hasAlternativeSequenceNames() {
        return getAttribute(ALTERNATIVE_SEQUENCE_NAME_TAG) != null;
    }

    /**
     * Looser comparison than equals().  We look only at sequence index, sequence length (an unknown
     * length on either side is accepted), and the MD5 tag value when both records have one. When at
     * least one record has no MD5 tag, the sequence names are compared instead, and a name found in
     * the other record's alternative sequence names also counts as a match.
     *
     * @throws NumberFormatException if both records have an MD5 tag and either value is not valid hexadecimal
     */
    public boolean isSameSequence(final SAMSequenceRecord that) {
        if (this == that) {
            return true;
        }
        if (that == null) {
            return false;
        }

        if (mSequenceIndex != that.mSequenceIndex) {
            return false;
        }
        // PIC-439.  Allow undefined length.
        if (mSequenceLength != UNKNOWN_SEQUENCE_LENGTH && that.mSequenceLength != UNKNOWN_SEQUENCE_LENGTH && mSequenceLength != that.mSequenceLength) {
            return false;
        }

        final String thisMd5Tag = this.getAttribute(SAMSequenceRecord.MD5_TAG);
        final String thatMd5Tag = that.getAttribute(SAMSequenceRecord.MD5_TAG);
        if (thisMd5Tag != null && thatMd5Tag != null) {
            // Compare numerically so that case and leading zeros do not matter.
            return new BigInteger(thisMd5Tag, 16).equals(new BigInteger(thatMd5Tag, 16));
        }

        // Compare using == since we intern() the Strings
        if (mSequenceName == that.mSequenceName) {
            return true;
        }
        // if they are different, they could still be the same based on the alternative sequences
        return getAlternativeSequenceNames().contains(that.mSequenceName)
                || that.getAlternativeSequenceNames().contains(mSequenceName);
    }

    @Override
    public boolean equals(final Object o) {
        if (this == o) {
            return true;
        }
        if (!(o instanceof SAMSequenceRecord)) {
            return false;
        }

        final SAMSequenceRecord that = (SAMSequenceRecord) o;

        return mSequenceIndex == that.mSequenceIndex
                && mSequenceLength == that.mSequenceLength
                && attributesEqual(that)
                && mSequenceName == that.mSequenceName // Compare using == since we intern() the name
                && getAlternativeSequenceNames().equals(that.getAlternativeSequenceNames());
    }

    @Override
    public int hashCode() {
        return mSequenceName != null ? mSequenceName.hashCode() : 0;
    }

    @Override
    Set<String> getStandardTags() {
        return STANDARD_TAGS;
    }

    /**
     * @return a new record with the same name, length, index and attributes as this one
     */
    @Override
    public final SAMSequenceRecord clone() {
        final SAMSequenceRecord ret = new SAMSequenceRecord(this.mSequenceName, this.mSequenceLength);
        ret.mSequenceIndex = this.mSequenceIndex;
        for (final Map.Entry<String, String> entry : this.getAttributes()) {
            ret.setAttribute(entry.getKey(), entry.getValue());
        }
        return ret;
    }

    /**
     * Truncate sequence name at first whitespace.
     *
     * @param sequenceName name to truncate; must not be null
     * @return the part of {@code sequenceName} before its first whitespace character, or the whole
     *         name if it contains none
     */
    public static String truncateSequenceName(final String sequenceName) {
        /*
         * Instead of using regex split, do it manually for better performance.
         */
        int truncateAt = sequenceName.length();
        for (final char c : WHITESPACE_CHARS) {
            final int index = sequenceName.indexOf(c);
            // indexOf reports "not found" as a negative value
            if (index >= 0 && index < truncateAt) {
                truncateAt = index;
            }
        }
        return sequenceName.substring(0, truncateAt);
    }

    /**
     * Throw an exception if the sequence name is not valid.
     *
     * @throws SAMException if {@code name} does not match the legal sequence-name pattern
     */
    public static void validateSequenceName(final String name) {
        if (!LEGAL_RNAME_PATTERN.matcher(name).matches()) {
            throw new SAMException(String.format("Sequence name '%s' doesn't match regex: '%s' ", name, LEGAL_RNAME_PATTERN));
        }
    }

    @Override
    public String toString() {
        return String.format(
                "SAMSequenceRecord(name=%s,length=%s,dict_index=%s,assembly=%s,alternate_names=%s)",
                getSequenceName(),
                getSequenceLength(),
                getSequenceIndex(),
                getAssembly(),
                getAlternativeSequenceNames()
        );
    }

    @Override
    public String getSAMString() {
        return new SAMTextHeaderCodec().getSQLine(this);
    }

    /**
     * always returns <code>getSequenceName()</code>
     *
     * @see #getSequenceName()
     */
    @Override
    public final String getContig() {
        return this.getSequenceName();
    }

    /**
     * always returns 1
     */
    @Override
    public final int getStart() {
        return 1;
    }

    /**
     * always returns <code>getSequenceLength()</code>
     *
     * @see #getSequenceLength()
     */
    @Override
    public final int getEnd() {
        return this.getSequenceLength();
    }
}

qq_27390023

关注

1
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
htsjdk库SAMSequenceDictionary和SAMSequenceRecord类介绍

在 HTSJDK 库中，SAMSequenceDictionary 和 SAMSequenceRecord 类用于处理和管理基因组数据中的序列信息（contigs）。这两个类通常一起使用，提供了对基因组中所有 contig 的详细描述和访问。
复制链接

扫一扫