HTSJDK库Cigar类介绍

HTSJDK 库中的 Cigar 类用于表示比对过程中读取序列与参考序列之间的比对信息。CIGAR(Compact Idiosyncratic Gapped Alignment Report)字符串由多个操作符组成,这些操作符表示在比对过程中如何处理读取序列中的每个碱基。Cigar 类封装了这些信息,并提供了操作和访问这些信息的方法。

类简介

Cigar 类主要由一系列 CigarElement 对象组成,每个 CigarElement 包含一个操作符和一个长度。操作符包括匹配(M)、插入(I)、删除(D)、跳跃(N)、软剪切(S)、硬剪切(H)、匹配或不匹配(X)、匹配或不匹配(=)等。

Cigar.java源码

/*
 * The MIT License
 *
 * Copyright (c) 2009 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package htsjdk.samtools;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;

/**
 * A list of CigarElements, which describes how a read aligns with the reference.
 * E.g. the Cigar string 10M1D25M means
 * * match or mismatch for 10 bases
 * * deletion of 1 base
 * * match or mismatch for 25 bases
 *
 * c.f. https://samtools.github.io/hts-specs/SAMv1.pdf for complete CIGAR specification.
 */
public class Cigar implements Serializable, Iterable<CigarElement> {
    public static final long serialVersionUID = 1L;

    private final List<CigarElement> cigarElements = new ArrayList<CigarElement>();

    public Cigar() {
    }

    public Cigar(final List<CigarElement> cigarElements) {
        this.cigarElements.addAll(cigarElements);
    }

    public List<CigarElement> getCigarElements() {
        return Collections.unmodifiableList(cigarElements);
    }

    public CigarElement getCigarElement(final int i) {
        return cigarElements.get(i);
    }

    public void add(final CigarElement cigarElement) {
        cigarElements.add(cigarElement);
    }

    public int numCigarElements() {
        return cigarElements.size();
    }

    public boolean isEmpty() {
        return cigarElements.isEmpty();
    }

    /**
     * @return The number of reference bases that the read covers, excluding padding.
     */
    public int getReferenceLength() {
        int length = 0;
        for (final CigarElement element : cigarElements) {
            switch (element.getOperator()) {
                case M:
                case D:
                case N:
                case EQ:
                case X:
                    length += element.getLength();
                    break;
                default: break;
            }
        }
        return length;
    }

    /**
     * @return The number of reference bases that the read covers, including padding.
     */
    public int getPaddedReferenceLength() {
        int length = 0;
        for (final CigarElement element : cigarElements) {
            switch (element.getOperator()) {
                case M:
                case D:
                case N:
                case EQ:
                case X:
                case P:
                    length += element.getLength();
                    break;
                default: break;
            }
        }
        return length;
    }

    /**
     * @return The number of read bases that the read covers.
     */
    public int getReadLength() {
        return getReadLength(cigarElements);
    }

    /**
     * @return The number of read bases that the read covers.
     */
    public static int getReadLength(final List<CigarElement> cigarElements) {
        int length = 0;
        for (final CigarElement element : cigarElements) {
            if (element.getOperator().consumesReadBases()){
                    length += element.getLength();
            }
        }
        return length;
    }

    /**
     * Exhaustive validation of CIGAR.
     * Note that this method deliberately returns null rather than Collections.emptyList() if there
     * are no validation errors, because callers tend to assume that if a non-null list is returned, it is modifiable.
     * @param readName For error reporting only.  May be null if not known.
     * @param recordNumber For error reporting only.  May be -1 if not known.
     * @return List of validation errors, or null if no errors.
     */
    public List<SAMValidationError> isValid(final String readName, final long recordNumber) {
        if (this.isEmpty()) {
            return null;
        }
        List<SAMValidationError> ret = null;
        boolean seenRealOperator = false;
        for (int i = 0; i < cigarElements.size(); ++i) {
            final CigarElement element = cigarElements.get(i);
            if (element.getLength() == 0) {
                if (ret == null) ret = new ArrayList<SAMValidationError>();
                ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_CIGAR,
                        "CIGAR element with zero length", readName, recordNumber));
            }
            // clipping operator can only be at start or end of CIGAR
            final CigarOperator op = element.getOperator();
            if (isClippingOperator(op)) {
                if (op == CigarOperator.H) {
                    if (i != 0 && i != cigarElements.size() - 1) {
                        if (ret == null) ret = new ArrayList<SAMValidationError>();
                        ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_CIGAR,
                                "Hard clipping operator not at start or end of CIGAR", readName, recordNumber));
                    }
                } else {
                    if (op != CigarOperator.S) throw new IllegalStateException("Should never happen: " + op.name());
                    if (i == 0 || i == cigarElements.size() - 1) {
                        // Soft clip at either end is fine
                    } else if (i == 1) {
                        if (cigarElements.size() == 3 && cigarElements.get(2).getOperator() == CigarOperator.H) {
                            // Handle funky special case in which S operator is both one from the beginning and one
                            // from the end.
                        } else if (cigarElements.get(0).getOperator() != CigarOperator.H) {
                            if (ret == null) ret = new ArrayList<SAMValidationError>();
                            ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_CIGAR,
                                "Soft clipping CIGAR operator can only be inside of hard clipping operator",
                                    readName, recordNumber));
                        }
                    } else if (i == cigarElements.size() - 2) {
                        if (cigarElements.get(cigarElements.size() - 1).getOperator() != CigarOperator.H) {
                            if (ret == null) ret = new ArrayList<SAMValidationError>();
                            ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_CIGAR,
                                "Soft clipping CIGAR operator can only be inside of hard clipping operator",
                                    readName, recordNumber));
                        }
                    } else {
                        if (ret == null) ret = new ArrayList<SAMValidationError>();
                        ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_CIGAR,
                            "Soft clipping CIGAR operator can at start or end of read, or be inside of hard clipping operator",
                                readName, recordNumber));
                    }

                }
            } else if (isRealOperator(op)) {
                // Must be at least one real operator (MIDN)
                seenRealOperator = true;
                // There should be an M or P operator between any pair of IDN operators
                if (isInDelOperator(op)) {
                    for (int j = i+1; j < cigarElements.size(); ++j) {
                        final CigarOperator nextOperator = cigarElements.get(j).getOperator();
                        // Allow
                        if ((isRealOperator(nextOperator) && !isInDelOperator(nextOperator)) || isPaddingOperator(nextOperator)) {
                            break;
                        }
                        if (isInDelOperator(nextOperator) && op == nextOperator) {
                            if (ret == null) ret = new ArrayList<SAMValidationError>();
                            ret.add(new SAMValidationError(SAMValidationError.Type.ADJACENT_INDEL_IN_CIGAR,
                                    "No M or N operator between pair of " + op.name() + " operators in CIGAR", readName, recordNumber));
                        }
                    }
                }
            } else if (isPaddingOperator(op)) {
                if (i == 0) {
                    /*
                     * Removed restriction that padding not be the first operator because if a read starts in the middle of a pad
                     * in a padded reference, it is necessary to precede the read with padding so that alignment start refers to a
                     * position on the unpadded reference.
                    */
                } else if (i == cigarElements.size() - 1) {
                    if (ret == null) ret = new ArrayList<SAMValidationError>();
                    ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_CIGAR,
                            "Padding operator not valid at end of CIGAR", readName, recordNumber));
                } else if (!isRealOperator(cigarElements.get(i-1).getOperator()) ||
                        !isRealOperator(cigarElements.get(i+1).getOperator())) {
                    if (ret == null) ret = new ArrayList<SAMValidationError>();
                    ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_CIGAR,
                            "Padding operator not between real operators in CIGAR", readName, recordNumber));
                }
            }
        }
        if (!seenRealOperator) {
            if (ret == null) ret = new ArrayList<SAMValidationError>();
            ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_CIGAR,
                    "No real operator (M|I|D|N) in CIGAR", readName, recordNumber));
        }
        return ret;
    }

    private static boolean isRealOperator(final CigarOperator op) {
        return op == CigarOperator.M || op == CigarOperator.EQ || op == CigarOperator.X || 
               op == CigarOperator.I || op == CigarOperator.D || op == CigarOperator.N;
    }

    private static boolean isInDelOperator(final CigarOperator op) {
        return op !=null && op.isIndel();
    }

    private static boolean isClippingOperator(final CigarOperator op) {
        return op !=null && op.isClipping();
    }

    private static boolean isPaddingOperator(final CigarOperator op) {
        return op !=null && op.isPadding();
    }

    @Override
    public boolean equals(final Object o) {
        if (this == o) return true;
        if (!(o instanceof Cigar)) return false;

        final Cigar cigar = (Cigar) o;

        return cigarElements.equals(cigar.cigarElements);
    }
    
    /** build a new Cigar object from a list of cigar operators.
     * This can be used if you have the  operators associated to
     * each base in the read.
     * 
     * e.g: read length =10 with cigar= <code>[M,M,M,M,M,M,M,M,M,M]</code>, here
     * fromCigarOperators would generate the cigar '10M'
     * 
     * later the user resolved the 'M' to '=' or 'X', the array is now
     * 
     * <code>[=,=,=,=,=,X,X,=,=,=]</code>
     * 
     * fromCigarOperators would generate the cigar '5M2X3M'
     * 
     * */
    public static Cigar fromCigarOperators(final List<CigarOperator> cigarOperators) {
        if (cigarOperators == null) throw new IllegalArgumentException("cigarOperators is null");
        final List<CigarElement> cigarElementList = new ArrayList<>();
        int i = 0;
        // find adjacent operators and build list of cigar elements
        while (i < cigarOperators.size() ) {
            final CigarOperator currentOp = cigarOperators.get(i);
            int j = i + 1;
            while (j < cigarOperators.size() && cigarOperators.get(j).equals(currentOp)) {
                j++;
            }
            cigarElementList.add(new CigarElement(j - i, currentOp));
            i = j;
        }
        return new Cigar(cigarElementList);
    }

    /**
     * Decode a Cigar from a SAM formatted CIGAR String, uses {@link TextCigarCodec#decode(String)}
     * Only performs minimal validation.
     * @param cigarString A SAM formatted CIGAR string.
     * @return a new Cigar
     */
    public static Cigar fromCigarString(String cigarString){
        return TextCigarCodec.decode(cigarString);
    }
    
    /** shortcut to <code>getCigarElements().iterator()</code> */
    @Override
    public Iterator<CigarElement> iterator() {
        return this.getCigarElements().iterator();
    }
    
    /** returns true if the cigar string contains the given operator */
    public boolean containsOperator(final CigarOperator operator) {
        return this.cigarElements.stream().anyMatch( element -> element.getOperator() == operator);
    }
    
    /** returns the first cigar element */
    public CigarElement getFirstCigarElement() {
        return isEmpty() ? null : this.cigarElements.get(0); 
    }
    
    /** returns the last cigar element */
    public CigarElement getLastCigarElement() {
        return isEmpty() ? null : this.cigarElements.get(this.numCigarElements() - 1 ); 
    }
    
    /** returns true if the cigar string starts With a clipping operator */
    public boolean isLeftClipped() {
        return !isEmpty() && isClippingOperator(getFirstCigarElement().getOperator());
    }

    /** returns true if the cigar string ends With a clipping operator */
    public boolean isRightClipped() {
        return !isEmpty() && isClippingOperator(getLastCigarElement().getOperator());
    }

    /** returns true if the cigar is clipped */
    public boolean isClipped() {
        return isLeftClipped() || isRightClipped();
    }
    
    @Override
    public int hashCode() {
        return cigarElements.hashCode();
    }

    public String toString() {
        return TextCigarCodec.encode(this);
    }
}

CigarElement.java源码

/*
 * The MIT License
 *
 * Copyright (c) 2009 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package htsjdk.samtools;

import java.io.Serializable;

/**
 * One component of a cigar string.  The component comprises the operator, and the number of bases to which
 * the  operator applies.
 */
public class CigarElement implements Serializable {
    public static final long serialVersionUID = 1L;

    private final int length;
    private final CigarOperator operator;

    public CigarElement(final int length, final CigarOperator operator) {
        if (length < 0) throw new IllegalArgumentException(String.format("Cigar element being constructed with negative length: %d and operation: %s" , length, operator.name()));
        this.length = length;
        this.operator = operator;
    }

    public int getLength() {
        return length;
    }

    public CigarOperator getOperator() {
        return operator;
    }

    @Override
    public boolean equals(final Object o) {
        if (this == o) return true;
        if (!(o instanceof CigarElement)) return false;

        final CigarElement that = (CigarElement) o;

        if (length != that.length) return false;
        if (operator != that.operator) return false;

        return true;
    }

    @Override
    public int hashCode() {
        int result = length;
        result = 31 * result + (operator != null ? operator.hashCode() : 0);
        return result;
    }
    
    @Override
    public String toString() {
        return String.valueOf(this.length)+this.operator;
    }
}

CigarOperator.java源码

/*
 * The MIT License
 *
 * Copyright (c) 2009 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package htsjdk.samtools;

/**
 * The operators that can appear in a cigar string, and information about their disk representations.
 */
public enum CigarOperator {
    /** Match or mismatch */
    M(true, true,   'M'),
    /** Insertion vs. the reference. */
    I(true, false,  'I'),
    /** Deletion vs. the reference. */
    D(false, true,  'D'),
    /** Skipped region from the reference. */
    N(false, true,  'N'),
    /** Soft clip. */
    S(true, false,  'S'),
    /** Hard clip. */
    H(false, false, 'H'),
    /** Padding. */
    P(false, false, 'P'),
    /** Matches the reference. */
    EQ(true, true,  '='),
    /** Mismatches the reference. */
    X(true, true,   'X')
    ;

    // Representation of CigarOperator in BAM file
    private static final byte OP_M = 0;
    private static final byte OP_I = 1;
    private static final byte OP_D = 2;
    private static final byte OP_N = 3;
    private static final byte OP_S = 4;
    private static final byte OP_H = 5;
    private static final byte OP_P = 6;
    private static final byte OP_EQ = 7;
    private static final byte OP_X = 8;

    private final boolean consumesReadBases;
    private final boolean consumesReferenceBases;
    private final byte character;
    private final String string;

    // Readable synonyms of the above enums
    public static final CigarOperator MATCH_OR_MISMATCH = M;
    public static final CigarOperator INSERTION = I;
    public static final CigarOperator DELETION = D;
    public static final CigarOperator SKIPPED_REGION = N;
    public static final CigarOperator SOFT_CLIP = S;
    public static final CigarOperator HARD_CLIP = H;
    public static final CigarOperator PADDING = P;

    /** Default constructor. */
    CigarOperator(boolean consumesReadBases, boolean consumesReferenceBases, char character) {
        this.consumesReadBases = consumesReadBases;
        this.consumesReferenceBases = consumesReferenceBases;
        this.character = (byte) character;
        this.string = new String(new char[] {character}).intern();
    }

    /** If true, represents that this cigar operator "consumes" bases from the read bases. */
    public boolean consumesReadBases() { return consumesReadBases; }

    /** If true, represents that this cigar operator "consumes" bases from the reference sequence. */
    public boolean consumesReferenceBases() { return consumesReferenceBases; }

    /**
     * @param b CIGAR operator in character form as appears in a text CIGAR string
     * @return CigarOperator enum value corresponding to the given character.
     */
    public static CigarOperator characterToEnum(final int b) {
        switch (b) {
        case 'M':
            return M;
        case 'I':
            return I;
        case 'D':
            return D;
        case 'N':
            return N;
        case 'S':
            return S;
        case 'H':
            return H;
        case 'P':
            return P;
        case '=':
            return EQ;
        case 'X':
            return X;
        default:
            throw new IllegalArgumentException("Unrecognized CigarOperator: " + b);
        }
    }

    /**
     * @param i CIGAR operator in binary form as appears in a BAMRecord.
     * @return CigarOperator enum value corresponding to the given int value.
     */
    public static CigarOperator binaryToEnum(final int i) {
        switch(i) {
            case OP_M:
                return M;
            case OP_I:
                return I;
            case OP_D:
                return D;
            case OP_N:
                return N;
            case OP_S:
                return S;
            case OP_H:
                return H;
            case OP_P:
                return P;
            case OP_EQ:
                return EQ;
            case OP_X:
                return X;
            default:
                throw new IllegalArgumentException("Unrecognized CigarOperator: " + i);
        }
    }

    /**
     *
     * @param e CigarOperator enum value.
     * @return CIGAR operator corresponding to the enum value in binary form as appears in a BAMRecord.
     */
    public static int enumToBinary(final CigarOperator e) {
        switch(e) {
            case M:
                return OP_M;
            case I:
                return OP_I;
            case D:
                return OP_D;
            case N:
                return OP_N;
            case S:
                return OP_S;
            case H:
                return OP_H;
            case P:
                return OP_P;
            case EQ:
                return OP_EQ;
            case X:
                return OP_X;
            default:
                throw new IllegalArgumentException("Unrecognized CigarOperator: " + e);
        }
    }

    /** Returns the character that should be used within a SAM file. */
    public static byte enumToCharacter(final CigarOperator e) {
        return e.character;
    }

    /** Returns true if the operator is a clipped (hard or soft) operator */
    public boolean isClipping() {
        return this == S || this == H;
    }

    /** Returns true if the operator is a Insertion or Deletion operator */
    public boolean isIndel() {
        return this == I || this == D;
    }

    /** Returns true if the operator is a Skipped Region Insertion or Deletion operator */
    public boolean isIndelOrSkippedRegion() {
        return this == N || isIndel();
    }

    /** Returns true if the operator is a M, a X or a EQ */
    public boolean isAlignment() {
        return this == M || this == X || this == EQ;
    }
    
    /** Returns true if the operator is a Padding operator */
    public boolean isPadding() {
        return this == P;
    }
    
    /** Returns the cigar operator as it would be seen in a SAM file. */
    @Override public String toString() {
        return this.string;
    }
}


                
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值