HTSJDK 库中的 Cigar
类用于表示比对过程中读取序列与参考序列之间的比对信息。CIGAR(Compact Idiosyncratic Gapped Alignment Report)字符串由多个操作符组成,这些操作符表示在比对过程中如何处理读取序列中的每个碱基。Cigar
类封装了这些信息,并提供了操作和访问这些信息的方法。
类简介
Cigar
类主要由一系列 CigarElement
对象组成,每个 CigarElement
包含一个操作符和一个长度。操作符包括匹配(M)、插入(I)、删除(D)、跳跃(N)、软剪切(S)、硬剪切(H)、匹配或不匹配(X)、匹配或不匹配(=)等。
Cigar.java源码
/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package htsjdk.samtools;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
/**
* A list of CigarElements, which describes how a read aligns with the reference.
* E.g. the Cigar string 10M1D25M means
* * match or mismatch for 10 bases
* * deletion of 1 base
* * match or mismatch for 25 bases
*
* c.f. https://samtools.github.io/hts-specs/SAMv1.pdf for complete CIGAR specification.
*/
public class Cigar implements Serializable, Iterable<CigarElement> {
public static final long serialVersionUID = 1L;
private final List<CigarElement> cigarElements = new ArrayList<CigarElement>();
public Cigar() {
}
public Cigar(final List<CigarElement> cigarElements) {
this.cigarElements.addAll(cigarElements);
}
public List<CigarElement> getCigarElements() {
return Collections.unmodifiableList(cigarElements);
}
public CigarElement getCigarElement(final int i) {
return cigarElements.get(i);
}
public void add(final CigarElement cigarElement) {
cigarElements.add(cigarElement);
}
public int numCigarElements() {
return cigarElements.size();
}
public boolean isEmpty() {
return cigarElements.isEmpty();
}
/**
* @return The number of reference bases that the read covers, excluding padding.
*/
public int getReferenceLength() {
int length = 0;
for (final CigarElement element : cigarElements) {
switch (element.getOperator()) {
case M:
case D:
case N:
case EQ:
case X:
length += element.getLength();
break;
default: break;
}
}
return length;
}
/**
* @return The number of reference bases that the read covers, including padding.
*/
public int getPaddedReferenceLength() {
int length = 0;
for (final CigarElement element : cigarElements) {
switch (element.getOperator()) {
case M:
case D:
case N:
case EQ:
case X:
case P:
length += element.getLength();
break;
default: break;
}
}
return length;
}
/**
* @return The number of read bases that the read covers.
*/
public int getReadLength() {
return getReadLength(cigarElements);
}
/**
* @return The number of read bases that the read covers.
*/
public static int getReadLength(final List<CigarElement> cigarElements) {
int length = 0;
for (final CigarElement element : cigarElements) {
if (element.getOperator().consumesReadBases()){
length += element.getLength();
}
}
return length;
}
/**
* Exhaustive validation of CIGAR.
* Note that this method deliberately returns null rather than Collections.emptyList() if there
* are no validation errors, because callers tend to assume that if a non-null list is returned, it is modifiable.
* @param readName For error reporting only. May be null if not known.
* @param recordNumber For error reporting only. May be -1 if not known.
* @return List of validation errors, or null if no errors.
*/
public List<SAMValidationError> isValid(final String readName, final long recordNumber) {
if (this.isEmpty()) {
return null;
}
List<SAMValidationError> ret = null;
boolean seenRealOperator = false;
for (int i = 0; i < cigarElements.size(); ++i) {
final CigarElement element = cigarElements.get(i);
if (element.getLength() == 0) {
if (ret == null) ret = new ArrayList<SAMValidationError>();
ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_CIGAR,
"CIGAR element with zero length", readName, recordNumber));
}
// clipping operator can only be at start or end of CIGAR
final CigarOperator op = element.getOperator();
if (isClippingOperator(op)) {
if (op == CigarOperator.H) {
if (i != 0 && i != cigarElements.size() - 1) {
if (ret == null) ret = new ArrayList<SAMValidationError>();
ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_CIGAR,
"Hard clipping operator not at start or end of CIGAR", readName, recordNumber));
}
} else {
if (op != CigarOperator.S) throw new IllegalStateException("Should never happen: " + op.name());
if (i == 0 || i == cigarElements.size() - 1) {
// Soft clip at either end is fine
} else if (i == 1) {
if (cigarElements.size() == 3 && cigarElements.get(2).getOperator() == CigarOperator.H) {
// Handle funky special case in which S operator is both one from the beginning and one
// from the end.
} else if (cigarElements.get(0).getOperator() != CigarOperator.H) {
if (ret == null) ret = new ArrayList<SAMValidationError>();
ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_CIGAR,
"Soft clipping CIGAR operator can only be inside of hard clipping operator",
readName, recordNumber));
}
} else if (i == cigarElements.size() - 2) {
if (cigarElements.get(cigarElements.size() - 1).getOperator() != CigarOperator.H) {
if (ret == null) ret = new ArrayList<SAMValidationError>();
ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_CIGAR,
"Soft clipping CIGAR operator can only be inside of hard clipping operator",
readName, recordNumber));
}
} else {
if (ret == null) ret = new ArrayList<SAMValidationError>();
ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_CIGAR,
"Soft clipping CIGAR operator can at start or end of read, or be inside of hard clipping operator",
readName, recordNumber));
}
}
} else if (isRealOperator(op)) {
// Must be at least one real operator (MIDN)
seenRealOperator = true;
// There should be an M or P operator between any pair of IDN operators
if (isInDelOperator(op)) {
for (int j = i+1; j < cigarElements.size(); ++j) {
final CigarOperator nextOperator = cigarElements.get(j).getOperator();
// Allow
if ((isRealOperator(nextOperator) && !isInDelOperator(nextOperator)) || isPaddingOperator(nextOperator)) {
break;
}
if (isInDelOperator(nextOperator) && op == nextOperator) {
if (ret == null) ret = new ArrayList<SAMValidationError>();
ret.add(new SAMValidationError(SAMValidationError.Type.ADJACENT_INDEL_IN_CIGAR,
"No M or N operator between pair of " + op.name() + " operators in CIGAR", readName, recordNumber));
}
}
}
} else if (isPaddingOperator(op)) {
if (i == 0) {
/*
* Removed restriction that padding not be the first operator because if a read starts in the middle of a pad
* in a padded reference, it is necessary to precede the read with padding so that alignment start refers to a
* position on the unpadded reference.
*/
} else if (i == cigarElements.size() - 1) {
if (ret == null) ret = new ArrayList<SAMValidationError>();
ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_CIGAR,
"Padding operator not valid at end of CIGAR", readName, recordNumber));
} else if (!isRealOperator(cigarElements.get(i-1).getOperator()) ||
!isRealOperator(cigarElements.get(i+1).getOperator())) {
if (ret == null) ret = new ArrayList<SAMValidationError>();
ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_CIGAR,
"Padding operator not between real operators in CIGAR", readName, recordNumber));
}
}
}
if (!seenRealOperator) {
if (ret == null) ret = new ArrayList<SAMValidationError>();
ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_CIGAR,
"No real operator (M|I|D|N) in CIGAR", readName, recordNumber));
}
return ret;
}
private static boolean isRealOperator(final CigarOperator op) {
return op == CigarOperator.M || op == CigarOperator.EQ || op == CigarOperator.X ||
op == CigarOperator.I || op == CigarOperator.D || op == CigarOperator.N;
}
private static boolean isInDelOperator(final CigarOperator op) {
return op !=null && op.isIndel();
}
private static boolean isClippingOperator(final CigarOperator op) {
return op !=null && op.isClipping();
}
private static boolean isPaddingOperator(final CigarOperator op) {
return op !=null && op.isPadding();
}
@Override
public boolean equals(final Object o) {
if (this == o) return true;
if (!(o instanceof Cigar)) return false;
final Cigar cigar = (Cigar) o;
return cigarElements.equals(cigar.cigarElements);
}
/** build a new Cigar object from a list of cigar operators.
* This can be used if you have the operators associated to
* each base in the read.
*
* e.g: read length =10 with cigar= <code>[M,M,M,M,M,M,M,M,M,M]</code>, here
* fromCigarOperators would generate the cigar '10M'
*
* later the user resolved the 'M' to '=' or 'X', the array is now
*
* <code>[=,=,=,=,=,X,X,=,=,=]</code>
*
* fromCigarOperators would generate the cigar '5M2X3M'
*
* */
public static Cigar fromCigarOperators(final List<CigarOperator> cigarOperators) {
if (cigarOperators == null) throw new IllegalArgumentException("cigarOperators is null");
final List<CigarElement> cigarElementList = new ArrayList<>();
int i = 0;
// find adjacent operators and build list of cigar elements
while (i < cigarOperators.size() ) {
final CigarOperator currentOp = cigarOperators.get(i);
int j = i + 1;
while (j < cigarOperators.size() && cigarOperators.get(j).equals(currentOp)) {
j++;
}
cigarElementList.add(new CigarElement(j - i, currentOp));
i = j;
}
return new Cigar(cigarElementList);
}
/**
* Decode a Cigar from a SAM formatted CIGAR String, uses {@link TextCigarCodec#decode(String)}
* Only performs minimal validation.
* @param cigarString A SAM formatted CIGAR string.
* @return a new Cigar
*/
public static Cigar fromCigarString(String cigarString){
return TextCigarCodec.decode(cigarString);
}
/** shortcut to <code>getCigarElements().iterator()</code> */
@Override
public Iterator<CigarElement> iterator() {
return this.getCigarElements().iterator();
}
/** returns true if the cigar string contains the given operator */
public boolean containsOperator(final CigarOperator operator) {
return this.cigarElements.stream().anyMatch( element -> element.getOperator() == operator);
}
/** returns the first cigar element */
public CigarElement getFirstCigarElement() {
return isEmpty() ? null : this.cigarElements.get(0);
}
/** returns the last cigar element */
public CigarElement getLastCigarElement() {
return isEmpty() ? null : this.cigarElements.get(this.numCigarElements() - 1 );
}
/** returns true if the cigar string starts With a clipping operator */
public boolean isLeftClipped() {
return !isEmpty() && isClippingOperator(getFirstCigarElement().getOperator());
}
/** returns true if the cigar string ends With a clipping operator */
public boolean isRightClipped() {
return !isEmpty() && isClippingOperator(getLastCigarElement().getOperator());
}
/** returns true if the cigar is clipped */
public boolean isClipped() {
return isLeftClipped() || isRightClipped();
}
@Override
public int hashCode() {
return cigarElements.hashCode();
}
public String toString() {
return TextCigarCodec.encode(this);
}
}
CigarElement.java源码
/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package htsjdk.samtools;
import java.io.Serializable;
/**
* One component of a cigar string. The component comprises the operator, and the number of bases to which
* the operator applies.
*/
public class CigarElement implements Serializable {
public static final long serialVersionUID = 1L;
private final int length;
private final CigarOperator operator;
public CigarElement(final int length, final CigarOperator operator) {
if (length < 0) throw new IllegalArgumentException(String.format("Cigar element being constructed with negative length: %d and operation: %s" , length, operator.name()));
this.length = length;
this.operator = operator;
}
public int getLength() {
return length;
}
public CigarOperator getOperator() {
return operator;
}
@Override
public boolean equals(final Object o) {
if (this == o) return true;
if (!(o instanceof CigarElement)) return false;
final CigarElement that = (CigarElement) o;
if (length != that.length) return false;
if (operator != that.operator) return false;
return true;
}
@Override
public int hashCode() {
int result = length;
result = 31 * result + (operator != null ? operator.hashCode() : 0);
return result;
}
@Override
public String toString() {
return String.valueOf(this.length)+this.operator;
}
}
CigarOperator.java源码
/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package htsjdk.samtools;
/**
* The operators that can appear in a cigar string, and information about their disk representations.
*/
public enum CigarOperator {
/** Match or mismatch */
M(true, true, 'M'),
/** Insertion vs. the reference. */
I(true, false, 'I'),
/** Deletion vs. the reference. */
D(false, true, 'D'),
/** Skipped region from the reference. */
N(false, true, 'N'),
/** Soft clip. */
S(true, false, 'S'),
/** Hard clip. */
H(false, false, 'H'),
/** Padding. */
P(false, false, 'P'),
/** Matches the reference. */
EQ(true, true, '='),
/** Mismatches the reference. */
X(true, true, 'X')
;
// Representation of CigarOperator in BAM file
private static final byte OP_M = 0;
private static final byte OP_I = 1;
private static final byte OP_D = 2;
private static final byte OP_N = 3;
private static final byte OP_S = 4;
private static final byte OP_H = 5;
private static final byte OP_P = 6;
private static final byte OP_EQ = 7;
private static final byte OP_X = 8;
private final boolean consumesReadBases;
private final boolean consumesReferenceBases;
private final byte character;
private final String string;
// Readable synonyms of the above enums
public static final CigarOperator MATCH_OR_MISMATCH = M;
public static final CigarOperator INSERTION = I;
public static final CigarOperator DELETION = D;
public static final CigarOperator SKIPPED_REGION = N;
public static final CigarOperator SOFT_CLIP = S;
public static final CigarOperator HARD_CLIP = H;
public static final CigarOperator PADDING = P;
/** Default constructor. */
CigarOperator(boolean consumesReadBases, boolean consumesReferenceBases, char character) {
this.consumesReadBases = consumesReadBases;
this.consumesReferenceBases = consumesReferenceBases;
this.character = (byte) character;
this.string = new String(new char[] {character}).intern();
}
/** If true, represents that this cigar operator "consumes" bases from the read bases. */
public boolean consumesReadBases() { return consumesReadBases; }
/** If true, represents that this cigar operator "consumes" bases from the reference sequence. */
public boolean consumesReferenceBases() { return consumesReferenceBases; }
/**
* @param b CIGAR operator in character form as appears in a text CIGAR string
* @return CigarOperator enum value corresponding to the given character.
*/
public static CigarOperator characterToEnum(final int b) {
switch (b) {
case 'M':
return M;
case 'I':
return I;
case 'D':
return D;
case 'N':
return N;
case 'S':
return S;
case 'H':
return H;
case 'P':
return P;
case '=':
return EQ;
case 'X':
return X;
default:
throw new IllegalArgumentException("Unrecognized CigarOperator: " + b);
}
}
/**
* @param i CIGAR operator in binary form as appears in a BAMRecord.
* @return CigarOperator enum value corresponding to the given int value.
*/
public static CigarOperator binaryToEnum(final int i) {
switch(i) {
case OP_M:
return M;
case OP_I:
return I;
case OP_D:
return D;
case OP_N:
return N;
case OP_S:
return S;
case OP_H:
return H;
case OP_P:
return P;
case OP_EQ:
return EQ;
case OP_X:
return X;
default:
throw new IllegalArgumentException("Unrecognized CigarOperator: " + i);
}
}
/**
*
* @param e CigarOperator enum value.
* @return CIGAR operator corresponding to the enum value in binary form as appears in a BAMRecord.
*/
public static int enumToBinary(final CigarOperator e) {
switch(e) {
case M:
return OP_M;
case I:
return OP_I;
case D:
return OP_D;
case N:
return OP_N;
case S:
return OP_S;
case H:
return OP_H;
case P:
return OP_P;
case EQ:
return OP_EQ;
case X:
return OP_X;
default:
throw new IllegalArgumentException("Unrecognized CigarOperator: " + e);
}
}
/** Returns the character that should be used within a SAM file. */
public static byte enumToCharacter(final CigarOperator e) {
return e.character;
}
/** Returns true if the operator is a clipped (hard or soft) operator */
public boolean isClipping() {
return this == S || this == H;
}
/** Returns true if the operator is a Insertion or Deletion operator */
public boolean isIndel() {
return this == I || this == D;
}
/** Returns true if the operator is a Skipped Region Insertion or Deletion operator */
public boolean isIndelOrSkippedRegion() {
return this == N || isIndel();
}
/** Returns true if the operator is a M, a X or a EQ */
public boolean isAlignment() {
return this == M || this == X || this == EQ;
}
/** Returns true if the operator is a Padding operator */
public boolean isPadding() {
return this == P;
}
/** Returns the cigar operator as it would be seen in a SAM file. */
@Override public String toString() {
return this.string;
}
}