HTSJDK库BAMRecordCodec类介绍

qq_27390023

已于 2024-07-17 14:13:27 修改

阅读量213

点赞数 2

文章标签：生物信息学 java

于 2024-07-17 14:11:51 首次发布

本文链接：https://blog.csdn.net/qq_27390023/article/details/140493557

版权

BAMRecordCodec 类是 HTSJDK 库中用于编码和解码 BAM 文件中记录的关键类。它将 SAMRecord 对象编码为 BAM 格式的二进制数据并写入输出流，也可以从输入流中读取 BAM 格式的二进制数据并解码为 SAMRecord 对象。以下是对 BAMRecordCodec 类的详细介绍。

类简介

BAMRecordCodec 是 HTSJDK 库中用于处理 BAM 文件中记录的编解码器类。BAM 文件是二进制格式的 SAM 文件，包含了相同的信息，但使用了更高效的存储格式。BAMRecordCodec 类的主要功能是将 SAMRecord 对象转换为 BAM 格式的字节数组，并将 BAM 格式的字节数组转换回 SAMRecord 对象。

BAMRecordCodec.java源代码：

/*
 * The MIT License
 *
 * Copyright (c) 2009 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package htsjdk.samtools;

import htsjdk.samtools.util.BinaryCodec;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.RuntimeEOFException;
import htsjdk.samtools.util.SortingCollection;

import java.io.InputStream;
import java.io.OutputStream;
import java.util.Arrays;

import static htsjdk.samtools.SAMTag.CG;

/**
 * Class for translating between in-memory and disk representation of BAMRecord.
 * <p>
 * Records are written to the stream set via {@link #setOutputStream(OutputStream)} and read from the
 * stream set via {@link #setInputStream(InputStream)}.  Each instance owns its own binary codecs, so
 * {@link #clone()} must be used to obtain an instance that can be used independently of this one.
 */
public class BAMRecordCodec implements SortingCollection.Codec<SAMRecord> {
    private final static Log LOG = Log.getInstance(BAMRecordCodec.class);

    private final SAMFileHeader header;
    private final BinaryCodec binaryCodec = new BinaryCodec();
    private final BinaryTagCodec binaryTagCodec = new BinaryTagCodec(binaryCodec);
    private final SAMRecordFactory samRecordFactory;

    // Ensures the "reference too large" warning is logged at most once per codec instance.
    private boolean isReferenceSizeWarningShowed = false;

    /**
     * Creates a codec that builds decoded records with a {@link DefaultSAMRecordFactory}.
     *
     * @param header header assigned to decoded records; may be null, in which case no header is set on them.
     */
    public BAMRecordCodec(final SAMFileHeader header) {
        this(header, new DefaultSAMRecordFactory());
    }

    /**
     * Creates a codec that builds decoded records with the given factory.
     *
     * @param header  header assigned to decoded records; may be null, in which case no header is set on them.
     * @param factory factory used by {@link #decode()} to instantiate records.
     */
    public BAMRecordCodec(final SAMFileHeader header, final SAMRecordFactory factory) {
        this.header = header;
        this.samRecordFactory = factory;
    }

    /**
     * @return a new codec sharing this codec's header and record factory, but with its own binary codecs
     * (and therefore no input or output stream set).
     */
    @Override
    public BAMRecordCodec clone() {
        // Do not clone the references to codecs, as they must be distinct for each instance.
        return new BAMRecordCodec(this.header, this.samRecordFactory);
    }

    /**
     * Sets the output stream that records will be written to.
     */
    @Override
    public void setOutputStream(final OutputStream os) {
        this.binaryCodec.setOutputStream(os);
    }

    /**
     * Sets the output stream that records will be written to, together with the name of the file
     * backing that stream.
     *
     * @param os       stream that records will be written to.
     * @param filename name passed on to the underlying binary codec as the output file name.
     */
    public void setOutputStream(final OutputStream os, final String filename) {
        this.binaryCodec.setOutputStream(os);
        this.binaryCodec.setOutputFileName(filename);
    }

    /**
     * Sets the input stream that records will be read from.
     */
    @Override
    public void setInputStream(final InputStream is) {
        this.binaryCodec.setInputStream(is);
    }

    /**
     * Sets the input stream that records will be read from, together with the name of the file
     * backing that stream.
     *
     * @param is       stream that records will be read from.
     * @param filename name passed on to the underlying binary codec as the input file name.
     */
    public void setInputStream(final InputStream is, final String filename) {
        this.binaryCodec.setInputStream(is);
        this.binaryCodec.setInputFileName(filename);
    }

    /**
     * Write object to OutputStream.
     * Reference and mate reference indices must be resolvable, which either means that these have been set into the
     * SAMRecord directly, or the SAMRecord must have a header assigned into it so that reference names can be
     * resolved into indices.
     * <p>
     * If the cigar has more than {@link BAMRecord#MAX_CIGAR_OPERATORS} elements, the real cigar is written into a
     * temporary CG attribute and a sentinel cigar is written in its place.  The temporary attribute is always
     * removed from the record again before this method returns, whether or not writing succeeded.
     *
     * @param alignment Record to be written.
     * @throws IllegalArgumentException if a sentinel cigar is needed but cannot be built
     *                                  (see {@link #makeSentinelCigar(Cigar)}).
     * @throws IllegalStateException    if the read bases cannot be converted to their compressed form.
     * @throws RuntimeException         if the base qualities are non-empty and differ in length from the read.
     */
    @Override
    public void encode(final SAMRecord alignment) {
        // If cigar is too long, put into CG tag and replace with sentinel value.
        // Using alignment.getCigarLength() here causes problems, so access the cigar instead
        final Cigar actualCigar = alignment.getCigar();
        final boolean cigarSwitcharoo = actualCigar.numCigarElements() > BAMRecord.MAX_CIGAR_OPERATORS;

        final Cigar cigarToWrite;
        if (cigarSwitcharoo) {
            // Build the sentinel BEFORE touching the record: makeSentinelCigar may throw, and in that case
            // the caller's record must not be left carrying a CG attribute it never had.
            cigarToWrite = makeSentinelCigar(actualCigar);
            final int[] cigarEncoding = BinaryCigarCodec.encode(actualCigar);
            alignment.setAttribute(CG.name(), cigarEncoding);
        } else {
            cigarToWrite = actualCigar;
        }

        try {
            writeRecord(alignment, cigarToWrite);
        } finally {
            // Undo the temporary mutation even if writing failed part way through.
            if (cigarSwitcharoo) {
                alignment.setAttribute(CG.name(), null);
            }
        }
    }

    /**
     * Writes the block size, the fixed-length fields and the variable-length block of one record.
     *
     * @param alignment    record to be written (already carrying the CG attribute if a sentinel cigar is in use).
     * @param cigarToWrite cigar to store in the cigar field: the record's own cigar or its sentinel.
     */
    private void writeRecord(final SAMRecord alignment, final Cigar cigarToWrite) {
        // Compute block size, as it is the first element of the file representation of SAMRecord
        final int blockSize = computeBlockSize(alignment, cigarToWrite);

        // shouldn't interact with the long-cigar above since the Sentinel Cigar has the same referenceLength as
        // the actual cigar.
        int indexBin = 0;
        if (alignment.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START) {
            if (!warnIfReferenceIsTooLargeForBinField(alignment)) {
                indexBin = alignment.computeIndexingBin();
            }
        }

        // Blurt out the elements
        this.binaryCodec.writeInt(blockSize);
        this.binaryCodec.writeInt(alignment.getReferenceIndex());
        // 0-based!!
        this.binaryCodec.writeInt(alignment.getAlignmentStart() - 1);
        this.binaryCodec.writeUByte((short) (alignment.getReadNameLength() + 1));
        this.binaryCodec.writeUByte((short) alignment.getMappingQuality());
        this.binaryCodec.writeUShort(indexBin);
        this.binaryCodec.writeUShort(cigarToWrite.numCigarElements());
        this.binaryCodec.writeUShort(alignment.getFlags());
        this.binaryCodec.writeInt(alignment.getReadLength());
        this.binaryCodec.writeInt(alignment.getMateReferenceIndex());
        this.binaryCodec.writeInt(alignment.getMateAlignmentStart() - 1);
        this.binaryCodec.writeInt(alignment.getInferredInsertSize());

        final byte[] variableLengthBinaryBlock = alignment.getVariableBinaryRepresentation();
        if (variableLengthBinaryBlock != null) {
            // Don't need to encode variable-length block, because it is unchanged from
            // when the record was read from a BAM file.
            this.binaryCodec.writeBytes(variableLengthBinaryBlock);
        } else {
            writeVariableLengthBlock(alignment, cigarToWrite);
        }
    }

    /**
     * Computes the number of bytes the record occupies after the leading block-size field.
     *
     * @param alignment    record to be written.
     * @param cigarToWrite cigar that will actually be stored in the cigar field.
     * @return fixed block size + null-terminated read name + cigar + packed bases + qualities + attributes.
     */
    private static int computeBlockSize(final SAMRecord alignment, final Cigar cigarToWrite) {
        final int readLength = alignment.getReadLength();

        int blockSize = BAMFileConstants.FIXED_BLOCK_SIZE + alignment.getReadNameLength() + 1 + // null terminated
                cigarToWrite.numCigarElements() * BAMRecord.CIGAR_SIZE_MULTIPLIER +
                (readLength + 1) / 2 + // 2 bases per byte, round up
                readLength;

        final int attributesSize = alignment.getAttributesBinarySize();
        if (attributesSize != -1) {
            // binary attribute size already known, don't need to compute.
            blockSize += attributesSize;
        } else {
            SAMBinaryTagAndValue attribute = alignment.getBinaryAttributes();
            while (attribute != null) {
                blockSize += (BinaryTagCodec.getTagSize(attribute.value));
                attribute = attribute.getNext();
            }
        }
        return blockSize;
    }

    /**
     * Encodes and writes read name, cigar, bases, qualities and attributes, in that order.
     *
     * @param alignment    record to be written.
     * @param cigarToWrite cigar that will actually be stored in the cigar field.
     */
    private void writeVariableLengthBlock(final SAMRecord alignment, final Cigar cigarToWrite) {
        // Empty qualities are allowed (they are replaced by a filler below); any other length must match the read.
        if (alignment.getReadLength() != alignment.getBaseQualities().length &&
                alignment.getBaseQualities().length != 0) {
            throw new RuntimeException("Mismatch between read length and quals length writing read " +
                    alignment.getReadName() + "; read length: " + alignment.getReadLength() +
                    "; quals length: " + alignment.getBaseQualities().length);
        }

        this.binaryCodec.writeString(alignment.getReadName(), false, true);

        final int[] binaryCigar = BinaryCigarCodec.encode(cigarToWrite);
        for (final int cigarElement : binaryCigar) {
            // Assumption that this will fit into an integer, despite the fact
            // that it is spec'ed as a uint.
            this.binaryCodec.writeInt(cigarElement);
        }

        try {
            this.binaryCodec.writeBytes(SAMUtils.bytesToCompressedBases(alignment.getReadBases()));
        } catch (final IllegalArgumentException ex) {
            final String msg = ex.getMessage() + " in read: " + alignment.getReadName();
            throw new IllegalStateException(msg, ex);
        }

        byte[] qualities = alignment.getBaseQualities();
        if (qualities.length == 0) {
            // No qualities on the record: write one 0xFF byte per base instead.
            qualities = new byte[alignment.getReadLength()];
            Arrays.fill(qualities, (byte) 0xFF);
        }
        this.binaryCodec.writeBytes(qualities);

        SAMBinaryTagAndValue attribute = alignment.getBinaryAttributes();
        while (attribute != null) {
            this.binaryTagCodec.writeTag(attribute.tag, attribute.value, attribute.isUnsignedArray());
            attribute = attribute.getNext();
        }
    }

    /**
     * Create a "Sentinel" cigar that will be placed in BAM file when the actual cigar has more than 0xffff operators,
     * which are not supported by the bam format. The actual cigar will be encoded and placed in the CG attribute.
     * @param cigar actual cigar to create sentinel cigar for
     * @return sentinel cigar xSyN with readLength (x) and referenceLength (y) matching the input cigar.
     * @throws IllegalArgumentException if the read length or the reference length of {@code cigar} exceeds
     *                                  {@link BAMRecord#MAX_CIGAR_ELEMENT_LENGTH}.
     */
    public static Cigar makeSentinelCigar(final Cigar cigar) {
        // in BAM there are only 28 bits for a cigar operator, so this a protection against overflow.
        if (cigar.getReadLength() > BAMRecord.MAX_CIGAR_ELEMENT_LENGTH) {
            throw new IllegalArgumentException(
                    String.format(
                            "Cannot encode (to BAM) a record with more than %d cigar operations and a read-length greater than %d.",
                            BAMRecord.MAX_CIGAR_OPERATORS, BAMRecord.MAX_CIGAR_ELEMENT_LENGTH));
        }

        if (cigar.getReferenceLength() > BAMRecord.MAX_CIGAR_ELEMENT_LENGTH) {
            throw new IllegalArgumentException(
                    String.format(
                            "Cannot encode (to BAM) a record that has more than %d cigar operations and spans more than %d bases on the reference.",
                            BAMRecord.MAX_CIGAR_OPERATORS, BAMRecord.MAX_CIGAR_ELEMENT_LENGTH));
        }

        return new Cigar(Arrays.asList(
                new CigarElement(cigar.getReadLength(), CigarOperator.S),
                new CigarElement(cigar.getReferenceLength(), CigarOperator.N)));
    }

    /** Emits a warning the first time a reference too large for binning indexing is encountered.
     * The warning is suppressed when the record's validation stringency is SILENT.
     *
     * @param rec the SAMRecord to examine
     * @return true if the sequence is too large, false otherwise (including when the record has no header
     * or its reference sequence cannot be found in the header)
     */
    private boolean warnIfReferenceIsTooLargeForBinField(final SAMRecord rec) {
        final SAMSequenceRecord sequence = rec.getHeader() != null ? rec.getHeader().getSequence(rec.getReferenceName()) : null;
        final boolean tooLarge = sequence != null && SAMUtils.isReferenceSequenceIncompatibleWithBAI(sequence);
        if (!isReferenceSizeWarningShowed && tooLarge && rec.getValidationStringency() != ValidationStringency.SILENT) {
            LOG.warn("Reference length is too large for BAM bin field.");
            LOG.warn("Reads on references longer than " + GenomicIndexUtil.BIN_GENOMIC_SPAN + "bp will have bin set to 0.");
            isReferenceSizeWarningShowed = true;
        }

        return tooLarge;
    }

    /**
     * Read the next record from the input stream and convert into a java object.
     *
     * @return null if no more records.  Should throw exception if EOF is encountered in the middle of
     * a record.
     * @throws SAMFormatException if the record length read from the stream is smaller than the fixed block size.
     */
    @Override
    public SAMRecord decode() {
        final int recordLength;
        try {
            recordLength = this.binaryCodec.readInt();
        } catch (final RuntimeEOFException e) {
            // EOF while reading the length field is treated as the normal end of the record stream.
            return null;
        }

        if (recordLength < BAMFileConstants.FIXED_BLOCK_SIZE) {
            throw new SAMFormatException("Invalid record length: " + recordLength);
        }

        // Fixed-length fields, in on-disk order. Coordinates are stored 0-based, hence the +1.
        final int referenceID = this.binaryCodec.readInt();
        final int coordinate = this.binaryCodec.readInt() + 1;
        final short readNameLength = this.binaryCodec.readUByte();
        final short mappingQuality = this.binaryCodec.readUByte();
        final int bin = this.binaryCodec.readUShort();
        final int cigarLen = this.binaryCodec.readUShort();
        final int flags = this.binaryCodec.readUShort();
        final int readLen = this.binaryCodec.readInt();
        final int mateReferenceID = this.binaryCodec.readInt();
        final int mateCoordinate = this.binaryCodec.readInt() + 1;
        final int insertSize = this.binaryCodec.readInt();

        // Everything after the fixed block is handed to the record undecoded.
        final byte[] restOfRecord = new byte[recordLength - BAMFileConstants.FIXED_BLOCK_SIZE];
        this.binaryCodec.readBytes(restOfRecord);
        final BAMRecord ret = this.samRecordFactory.createBAMRecord(
                header, referenceID, coordinate, readNameLength, mappingQuality,
                bin, cigarLen, flags, readLen, mateReferenceID, mateCoordinate, insertSize, restOfRecord);

        if (null != header) {
            // don't reset a null header as this will clobber the reference and mate reference indices
            ret.setHeader(header);
        }
        return ret;
    }
}

SortingCollection.java 源代码（BAMRecordCodec 实现了其中定义的 SortingCollection.Codec 接口）：

/*
 * The MIT License
 *
 * Copyright (c) 2018 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package htsjdk.samtools.util;

import htsjdk.samtools.Defaults;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.lang.reflect.Array;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.TreeSet;

/**
 * Collection to which many records can be added.  After all records are added, the collection can be
 * iterated, and the records will be returned in order defined by the comparator.  Records may be spilled
 * to a temporary directory if there are more records added than will fit in memory.  As a result of this,
 * the objects returned may not be identical to the objects added to the collection, but they should be
 * equal as determined by the codec used to write them to disk and read them back.
 * <p>
 * When iterating over the collection, the number of file handles required is numRecordsInCollection/maxRecordsInRam.
 * If this becomes a limiting factor, a file handle cache could be added.
 * <p>
 * If Snappy DLL is available and snappy.disable system property is not set to true, then Snappy is used
 * to compress temporary files.
 */
public class SortingCollection<T> implements Iterable<T> {
    private static final Log log = Log.getInstance(SortingCollection.class);

    /**
     * Client must implement this class, which defines the way in which records are written to and
     * read from file.
     * <p>
     * The collection uses one codec instance to write records, and uses it as a prototype from which
     * further codecs are cloned for reading.
     *
     * @param <T> type of the records being encoded and decoded
     */
    public interface Codec<T> extends Cloneable {
        /**
         * Where to write encoded output
         *
         * @param os stream that subsequent {@link #encode} calls write to
         */
        void setOutputStream(OutputStream os);

        /**
         * Where to read encoded input from
         *
         * @param is stream that subsequent {@link #decode} calls read from
         */
        void setInputStream(InputStream is);

        /**
         * Write object to output stream
         *
         * @param val what to write
         */
        void encode(T val);

        /**
         * Read the next record from the input stream and convert into a java object.
         *
         * @return null if no more records.  Should throw exception if EOF is encountered in the middle of
         * a record.
         */
        T decode();

        /**
         * Must return a cloned copy of the codec that can be used independently of
         * the original instance.  This is required so that multiple codecs can exist simultaneously
         * that each is reading a separate file.
         *
         * @return an independent copy of this codec
         */
        Codec<T> clone();
    }

    /**
     * Directories where files of sorted records go.
     */
    private final Path[] tmpDirs;

    /**
     * Used to write records to file, and used as a prototype to create codecs for reading.
     */
    private final SortingCollection.Codec<T> codec;

    /**
     * For sorting, both when spilling records to file, and merge sorting.
     */
    private final Comparator<T> comparator;

    /**
     * How many records to accumulate in RAM before spilling to disk.  Always greater than zero.
     */
    private final int maxRecordsInRam;

    /**
     * Number of records currently buffered at the front of {@link #ramRecords}.
     */
    private int numRecordsInRam = 0;

    /**
     * In-memory buffer of added records.  Set to null by doneAdding() once records have been spilled to file.
     */
    private T[] ramRecords;

    /**
     * Set to true by iterator() and by cleanup(); add() is rejected once this is true.
     */
    private boolean iterationStarted = false;

    /**
     * Set to true by doneAdding(); add() is rejected once this is true.
     */
    private boolean doneAdding = false;

    /**
     * Set to true when all temp files have been cleaned up
     */
    private boolean cleanedUp = false;

    /**
     * List of files in tmpDir containing sorted records
     */
    private final List<Path> files = new ArrayList<>();

    /**
     * Whether data may be discarded during iteration to reduce memory footprint.  True by default.
     */
    private boolean destructiveIteration = true;

    /**
     * Wraps the output streams of the temporary files written by spillToDisk().
     */
    private final TempStreamFactory tempStreamFactory = new TempStreamFactory();

    /**
     * If true, record size is sampled around each spill triggered by add() and output at DEBUG log level.
     */
    private final boolean printRecordSizeSampling;

    /**
     * Builds an empty collection ready to accept records via {@code add}.
     *
     * @param componentType   Class of the record to be sorted; needed to allocate the typed in-memory buffer.
     * @param codec           Writes records to file and reads them back into RAM
     * @param comparator      Defines the output sort order
     * @param maxRecordsInRam number of records buffered in memory before a spill to disk; must be &gt; 0
     * @param printRecordSizeSampling If true record size will be sampled and output at DEBUG log level
     * @param tmpDir          One or more directories for files of records that will not fit in RAM
     * @throws IllegalArgumentException if {@code maxRecordsInRam} is not positive, or no temp directory is given
     */
    private SortingCollection(final Class<T> componentType, final SortingCollection.Codec<T> codec,
                              final Comparator<T> comparator, final int maxRecordsInRam,
                              final boolean printRecordSizeSampling, final Path... tmpDir) {
        if (maxRecordsInRam <= 0) {
            throw new IllegalArgumentException("maxRecordsInRam must be > 0");
        }

        final boolean noTempDirs = (tmpDir == null) || (tmpDir.length == 0);
        if (noTempDirs) {
            throw new IllegalArgumentException("At least one temp directory must be provided.");
        }

        // Array.newInstance returns Object; the cast is safe because componentType is Class<T>.
        @SuppressWarnings("unchecked")
        final T[] buffer = (T[]) Array.newInstance(componentType, maxRecordsInRam);

        this.ramRecords = buffer;
        this.maxRecordsInRam = maxRecordsInRam;
        this.printRecordSizeSampling = printRecordSizeSampling;
        this.comparator = comparator;
        this.codec = codec;
        this.tmpDirs = tmpDir;
    }

    /**
     * Adds a record to the collection.  When the in-memory buffer is already full, its contents are
     * first sorted and spilled to a temporary file.
     *
     * @param rec record to add
     * @throws IllegalStateException if doneAdding() or iterator() has already been called
     */
    public void add(final T rec) {
        if (doneAdding) {
            throw new IllegalStateException("Cannot add after calling doneAdding()");
        }
        if (iterationStarted) {
            throw new IllegalStateException("Cannot add after calling iterator()");
        }

        final boolean bufferFull = (numRecordsInRam == maxRecordsInRam);
        if (bufferFull) {
            if (!printRecordSizeSampling) {
                spillToDisk();
            } else {
                // Sample free memory on either side of the spill to estimate the footprint of the buffer.
                final Runtime runtime = Runtime.getRuntime();
                runtime.gc();
                final long freeBefore = runtime.freeMemory();

                spillToDisk();

                runtime.gc();
                final long freeAfter = runtime.freeMemory();

                final long usedBytes = freeAfter - freeBefore;
                final String total = StringUtil.humanReadableByteCount(usedBytes);
                final String perRecord = StringUtil.humanReadableByteCount(usedBytes / maxRecordsInRam);
                log.debug(String.format("%d records in ram required approximately %s memory or %s per record. ",
                        maxRecordsInRam, total, perRecord));
            }
        }

        ramRecords[numRecordsInRam] = rec;
        numRecordsInRam++;
    }

    /**
     * Signals that no more records will be added, giving the collection a chance to release memory.
     * Calling this is optional when iterator() is invoked right after the last add(), because
     * iterator() performs the same step.  Repeated calls are no-ops.
     *
     * @throws IllegalStateException if cleanup() has already been called
     */
    public void doneAdding() {
        if (this.cleanedUp) {
            throw new IllegalStateException("Cannot call doneAdding() after cleanup() was called.");
        }

        if (!doneAdding) {
            doneAdding = true;

            // Only when something has already been spilled do the remaining in-memory records go to disk too;
            // otherwise everything stays in RAM.
            if (!this.files.isEmpty()) {
                if (this.numRecordsInRam > 0) {
                    spillToDisk();
                }

                // Facilitate GC
                this.ramRecords = null;
            }
        }
    }

    /**
     * @return True if this collection may discard data while being iterated, which lowers the memory
     * footprint but rules out iterating over the collection a second time.
     */
    public boolean isDestructiveIteration() {
        return this.destructiveIteration;
    }

    /**
     * Controls whether this collection may discard data while being iterated.  Allowing it lowers the
     * memory footprint but rules out a second iteration.  The setting is true by default.
     *
     * @param destructiveIteration true to allow data to be discarded during iteration
     */
    public void setDestructiveIteration(final boolean destructiveIteration) {
        this.destructiveIteration = destructiveIteration;
    }

    /**
     * Sorts the buffered records, writes them in order to a new temporary file, and empties the buffer.
     *
     * @throws RuntimeIOException if the temporary file cannot be created or written
     */
    public void spillToDisk() {
        try {
            final int count = this.numRecordsInRam;
            Arrays.parallelSort(this.ramRecords, 0, count, this.comparator);

            final Path spillFile = newTempFile();
            try (OutputStream out = tempStreamFactory.wrapTempOutputStream(
                    Files.newOutputStream(spillFile), Defaults.BUFFER_SIZE)) {
                this.codec.setOutputStream(out);
                int index = 0;
                while (index < count) {
                    this.codec.encode(this.ramRecords[index]);
                    // Facilitate GC
                    this.ramRecords[index] = null;
                    index++;
                }
                out.flush();
            } catch (RuntimeIOException writeFailure) {
                final String message = "Problem writing temporary file " + spillFile.toUri() +
                        ".  Try setting TMP_DIR to a file system with lots of space.";
                throw new RuntimeIOException(message, writeFailure);
            }

            // Only a fully written file is registered for later merging.
            this.numRecordsInRam = 0;
            this.files.add(spillFile);
        } catch (IOException ioFailure) {
            throw new RuntimeIOException(ioFailure);
        }
    }


    /**
     * Creates a new tmp file on one of the available temp filesystems, registers it for deletion
     * on JVM exit and then returns it.
     *
     * @return path of the newly created temporary file
     * @throws IOException if the temporary file cannot be created
     */
    private Path newTempFile() throws IOException {
        /* The minimum amount of space free on a temp filesystem to write a file there. */
        final long minimumFreeSpace = IOUtil.FIVE_GBS;
        return IOUtil.newTempPath("sortingcollection.", ".tmp", this.tmpDirs, minimumFreeSpace);
    }

    /**
     * Prepares for ordered iteration over the records.  May be called more than once, but add() is
     * rejected once this method has been called.
     *
     * @return an in-memory iterator if nothing was spilled to file, otherwise an iterator merging the files
     * @throws IllegalStateException if cleanup() has already been called
     */
    @Override
    public CloseableIterator<T> iterator() {
        if (this.cleanedUp) {
            throw new IllegalStateException("Cannot call iterator() after cleanup() was called.");
        }
        doneAdding();

        this.iterationStarted = true;
        if (!this.files.isEmpty()) {
            return new MergingIterator();
        }
        return new InMemoryIterator();
    }

    /**
     * Deletes the temporary files written by this collection.  Once this has been called, iterator()
     * and doneAdding() throw, and add() is rejected.
     */
    public void cleanup() {
        this.cleanedUp = true;
        this.iterationStarted = true;

        IOUtil.deletePaths(this.files);
    }

    /**
     * Factory taking temp directories as {@link File}s; each is converted to a {@link Path}.
     * Record size sampling is disabled.
     *
     * @param componentType   Class of the record to be sorted.  Necessary because of Java generic lameness.
     * @param codec           For writing records to file and reading them back into RAM
     * @param comparator      Defines output sort order
     * @param maxRecordsInRAM how many records to accumulate in memory before spilling to disk
     * @param tmpDir          Where to write files of records that will not fit in RAM
     * @deprecated since 2017-09. Use {@link #newInstance(Class, Codec, Comparator, int, Path...)} instead
     */
    @Deprecated
    public static <T> SortingCollection<T> newInstance(final Class<T> componentType,
                                                       final SortingCollection.Codec<T> codec,
                                                       final Comparator<T> comparator,
                                                       final int maxRecordsInRAM,
                                                       final File... tmpDir) {
        final Path[] tmpPaths = new Path[tmpDir.length];
        for (int i = 0; i < tmpDir.length; i++) {
            tmpPaths[i] = tmpDir[i].toPath();
        }
        return new SortingCollection<>(componentType, codec, comparator, maxRecordsInRAM, false, tmpPaths);
    }

    /**
     * Factory taking temp directories as a collection of {@link File}s; each is converted to a {@link Path}.
     * Record size sampling is disabled.
     *
     * @param componentType   Class of the record to be sorted.  Necessary because of Java generic lameness.
     * @param codec           For writing records to file and reading them back into RAM
     * @param comparator      Defines output sort order
     * @param maxRecordsInRAM how many records to accumulate in memory before spilling to disk
     * @param tmpDirs         Where to write files of records that will not fit in RAM
     * @deprecated since 2017-09. Use {@link #newInstanceFromPaths(Class, Codec, Comparator, int, Collection)} instead
     */
    @Deprecated
    public static <T> SortingCollection<T> newInstance(final Class<T> componentType,
                                                       final SortingCollection.Codec<T> codec,
                                                       final Comparator<T> comparator,
                                                       final int maxRecordsInRAM,
                                                       final Collection<File> tmpDirs) {
        final List<Path> tmpPaths = new ArrayList<>();
        for (final File dir : tmpDirs) {
            tmpPaths.add(dir.toPath());
        }
        final Path[] asArray = tmpPaths.toArray(new Path[0]);
        return new SortingCollection<>(componentType, codec, comparator, maxRecordsInRAM, false, asArray);
    }

    /**
     * Factory that spills to the directory named by the {@code java.io.tmpdir} system property.
     *
     * @param componentType    Class of the record to be sorted.  Necessary because of Java generic lameness.
     * @param codec            For writing records to file and reading them back into RAM
     * @param comparator       Defines output sort order
     * @param maxRecordsInRAM  how many records to accumulate in memory before spilling to disk
     * @param printRecordSizeSampling If true record size will be sampled and output at DEBUG log level
     */
    public static <T> SortingCollection<T> newInstance(final Class<T> componentType,
                                                       final SortingCollection.Codec<T> codec,
                                                       final Comparator<T> comparator,
                                                       final int maxRecordsInRAM,
                                                       final boolean printRecordSizeSampling) {
        return new SortingCollection<>(
                componentType,
                codec,
                comparator,
                maxRecordsInRAM,
                printRecordSizeSampling,
                Paths.get(System.getProperty("java.io.tmpdir")));
    }

    /**
     * Factory exposing every constructor option.
     *
     * @param componentType    Class of the record to be sorted.  Necessary because of Java generic lameness.
     * @param codec            For writing records to file and reading them back into RAM
     * @param comparator       Defines output sort order
     * @param maxRecordsInRAM  how many records to accumulate in memory before spilling to disk
     * @param printRecordSizeSampling If true record size will be sampled and output at DEBUG log level
     * @param tmpDir           Where to write files of records that will not fit in RAM
     */
    public static <T> SortingCollection<T> newInstance(final Class<T> componentType,
                                                       final SortingCollection.Codec<T> codec,
                                                       final Comparator<T> comparator,
                                                       final int maxRecordsInRAM,
                                                       final boolean printRecordSizeSampling,
                                                       final Path... tmpDir) {
        return new SortingCollection<>(
                componentType,
                codec,
                comparator,
                maxRecordsInRAM,
                printRecordSizeSampling,
                tmpDir);
    }

    /**
     * Factory that spills to the directory named by the {@code java.io.tmpdir} system property, with
     * record size sampling disabled.
     *
     * @param componentType   Class of the record to be sorted.  Necessary because of Java generic lameness.
     * @param codec           For writing records to file and reading them back into RAM
     * @param comparator      Defines output sort order
     * @param maxRecordsInRAM how many records to accumulate in memory before spilling to disk
     */
    public static <T> SortingCollection<T> newInstance(final Class<T> componentType,
                                                       final SortingCollection.Codec<T> codec,
                                                       final Comparator<T> comparator,
                                                       final int maxRecordsInRAM) {
        return new SortingCollection<>(
                componentType,
                codec,
                comparator,
                maxRecordsInRAM,
                false,
                Paths.get(System.getProperty("java.io.tmpdir")));
    }

    /**
     * Factory taking explicit temp directories, with record size sampling disabled.
     *
     * @param componentType   Class of the record to be sorted.  Necessary because of Java generic lameness.
     * @param codec           For writing records to file and reading them back into RAM
     * @param comparator      Defines output sort order
     * @param maxRecordsInRAM how many records to accumulate in memory before spilling to disk
     * @param tmpDir          Where to write files of records that will not fit in RAM
     */
    public static <T> SortingCollection<T> newInstance(final Class<T> componentType,
                                                       final SortingCollection.Codec<T> codec,
                                                       final Comparator<T> comparator,
                                                       final int maxRecordsInRAM,
                                                       final Path... tmpDir) {
        return new SortingCollection<>(
                componentType,
                codec,
                comparator,
                maxRecordsInRAM,
                false,
                tmpDir);
    }

    /**
     * Convenience factory that accepts the spill directories as a {@link Collection} instead of varargs.
     * Record-size sampling output is disabled.
     *
     * @param componentType   Class of the records being sorted; needed because the element type is erased at runtime.
     * @param codec           Used to write records to file and to read them back into RAM
     * @param comparator      Determines the output sort order
     * @param maxRecordsInRAM number of records to hold in memory before spilling to disk
     * @param tmpDirs         Directories in which to write records that do not fit in RAM
     */
    public static <T> SortingCollection<T> newInstanceFromPaths(final Class<T> componentType,
                                                                final SortingCollection.Codec<T> codec,
                                                                final Comparator<T> comparator,
                                                                final int maxRecordsInRAM,
                                                                final Collection<Path> tmpDirs) {
        // The constructor takes the directories as an array, so copy the collection into one.
        final Path[] tmpDirArray = tmpDirs.toArray(new Path[tmpDirs.size()]);
        return new SortingCollection<>(componentType, codec, comparator, maxRecordsInRAM, false, tmpDirArray);
    }

    /**
     * Iterator used when the number of records added stayed below the spill-to-disk threshold,
     * so every record is still held in the in-RAM array. The populated part of that array is
     * sorted once, when the iterator is constructed, and then handed out by index.
     */
    class InMemoryIterator implements CloseableIterator<T> {
        private int iterationIndex = 0;

        /**
         * Sorts the populated prefix {@code [0, numRecordsInRam)} of the RAM buffer with the
         * collection's comparator.
         */
        InMemoryIterator() {
            Arrays.parallelSort(
                    SortingCollection.this.ramRecords,
                    0,
                    SortingCollection.this.numRecordsInRam,
                    SortingCollection.this.comparator);
        }

        /** No resources are held, so closing is a no-op. */
        @Override
        public void close() {
        }

        /** @return true while there are in-RAM records that have not been returned yet */
        @Override
        public boolean hasNext() {
            return SortingCollection.this.numRecordsInRam > this.iterationIndex;
        }

        /**
         * Returns the next record in sort order.
         *
         * @throws NoSuchElementException if all in-RAM records have already been returned
         */
        @Override
        public T next() {
            if (!hasNext()) {
                throw new NoSuchElementException();
            }
            final int index = this.iterationIndex;
            final T record = SortingCollection.this.ramRecords[index];
            if (destructiveIteration) {
                // Drop the buffer's reference to the record that is being handed out.
                SortingCollection.this.ramRecords[index] = null;
            }
            this.iterationIndex = index + 1;
            return record;
        }

        /** Removal is not supported. */
        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }
    }

    /**
     * For iteration when spilling to disk has occurred.
     * Each file has records in sort order within the file.
     * Each file's iterator is closed as soon as it is exhausted, but if not iterating
     * to the end it is a good idea to call close().
     * <p>
     * Algorithm: MergingIterator maintains a sorted set (a TreeSet) of PeekFileRecordIterators.
     * Each PeekFileRecordIterator iterates through a file in which the records are sorted.
     * The comparator for PeekFileRecordIterator used by the set peeks at the next record from
     * the file, so the first element in the set is the file that has the next record to be emitted.
     * In order to get the next record, the first PeekFileRecordIterator in the set is removed,
     * the record is obtained from that iterator, and then if that iterator is not empty, it is added back into
     * the set.  Because it now has a different record as its next element, it may go into another
     * location in the set.
     */
    class MergingIterator implements CloseableIterator<T> {
        private final TreeSet<PeekFileRecordIterator> queue;

        /**
         * Opens every spill file and queues an iterator for each file that contains at least one record.
         * Files that turn out to be empty are closed immediately.
         */
        MergingIterator() {
            this.queue = new TreeSet<>(new PeekFileRecordIteratorComparator());
            // Serial number handed to each queued iterator; used by the comparator to break ties.
            int n = 0;
            log.debug(String.format("Creating merging iterator from %d files", files.size()));
            int suggestedBufferSize = checkMemoryAndAdjustBuffer(files.size());
            for (final Path f : files) {
                final FileRecordIterator it = new FileRecordIterator(f, suggestedBufferSize);
                if (it.hasNext()) {
                    this.queue.add(new PeekFileRecordIterator(it, n++));
                } else {
                    it.close();
                }
            }
        }

        /**
         * Since we need to open and buffer all temp files in the sorting collection at once it is important
         * to have enough memory left to do this. This method checks to make sure that, given the number of files and
         * the size of the buffer, we can reasonably open all files. If we can't it will return a buffer size that
         * is appropriate given the number of temp files and the amount of memory left on the heap. If there isn't
         * enough memory for buffering it will return zero and all reading will be unbuffered.
         *
         * @param numFiles number of temp files that will be open at the same time
         * @return the buffer size to use for each file; never larger than {@code Defaults.BUFFER_SIZE}
         */
        private int checkMemoryAndAdjustBuffer(int numFiles) {
            int bufferSize = Defaults.BUFFER_SIZE;

            // Nothing will be opened, so there is nothing to budget for (and no divisor to divide by).
            if (numFiles <= 0) {
                return bufferSize;
            }

            // garbage collect so that our calculation is accurate.
            final Runtime rt = Runtime.getRuntime();
            rt.gc();

            //                             free in heap       space available to expand heap
            final long allocatableMemory = rt.freeMemory() + (rt.maxMemory() - rt.totalMemory());

            // There is ~20k in overhead per file. Computed in long so a large file count cannot overflow int.
            final long freeMemory = allocatableMemory - (numFiles * 20L * 1024L);
            // Use the floor value from the divide. Kept as a long: on large heaps the per-file share can
            // exceed Integer.MAX_VALUE, and narrowing it to int before comparing would wrap around
            // (possibly to a negative value) and wrongly disable or shrink buffering.
            final long memoryPerFile = freeMemory / numFiles;

            if (memoryPerFile < 0) {
                log.warn("There is not enough memory per file for buffering. Reading will be unbuffered.");
                bufferSize = 0;
            } else if (bufferSize > memoryPerFile) {
                // Safe narrowing: 0 <= memoryPerFile < bufferSize <= Integer.MAX_VALUE here.
                final int reducedBufferSize = (int) memoryPerFile;
                log.warn(String.format("Default io buffer size of %s is larger than available memory per file of %s.",
                        StringUtil.humanReadableByteCount(bufferSize),
                        StringUtil.humanReadableByteCount(reducedBufferSize)));
                bufferSize = reducedBufferSize;
            }
            return bufferSize;
        }

        /** @return true while at least one file iterator still has records to emit */
        @Override
        public boolean hasNext() {
            return !this.queue.isEmpty();
        }

        /**
         * Returns the smallest pending record across all files, as ordered by the queue's comparator.
         *
         * @throws NoSuchElementException if every file has been exhausted
         */
        @Override
        public T next() {
            if (!hasNext()) {
                throw new NoSuchElementException();
            }

            // The iterator must be removed before it is advanced: its position in the set depends on
            // the record it peeks at, which changes once next() is called.
            final PeekFileRecordIterator fileIterator = queue.pollFirst();
            final T ret = fileIterator.next();
            if (fileIterator.hasNext()) {
                this.queue.add(fileIterator);
            } else {
                ((CloseableIterator<T>) fileIterator.getUnderlyingIterator()).close();
            }

            return ret;
        }

        /** Removal is not supported. */
        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }

        /** Closes the underlying iterator of every file that is still queued and empties the queue. */
        @Override
        public void close() {
            while (!this.queue.isEmpty()) {
                final PeekFileRecordIterator it = this.queue.pollFirst();
                ((CloseableIterator<T>) it.getUnderlyingIterator()).close();
            }
        }
    }

    /**
     * Read a file of records in format defined by the codec.
     * The iterator reads one record ahead: the record that the next call to {@link #next()} will
     * return is decoded eagerly and held in {@code currentRecord}.
     */
    class FileRecordIterator implements CloseableIterator<T> {
        private final Path file;
        private final InputStream is;
        private final Codec<T> codec;
        // The record to be returned by the next call to next(); null once nothing more could be decoded.
        private T currentRecord = null;

        /**
         * Opens the file, attaches a clone of the collection's codec to it and decodes the first record.
         * If any of these steps fails, the stream that was opened is closed before the exception propagates.
         *
         * @param file       file of records to read
         * @param bufferSize buffer size passed on to the temp-stream factory when wrapping the input stream
         * @throws RuntimeIOException if the file cannot be opened
         */
        FileRecordIterator(final Path file, final int bufferSize) {
            this.file = file;
            InputStream stream = null;
            boolean initialized = false;
            try {
                stream = Files.newInputStream(file);
                this.is = stream;
                this.codec = SortingCollection.this.codec.clone();
                this.codec.setInputStream(tempStreamFactory.wrapTempInputStream(this.is, bufferSize));
                advance();
                initialized = true;
            } catch (IOException e) {
                throw new RuntimeIOException(e);
            } finally {
                // Nobody can call close() on an iterator whose constructor threw, so release
                // the already-opened stream here instead of leaking the file handle.
                if (!initialized && stream != null) {
                    CloserUtil.close(stream);
                }
            }
        }

        /** @return true if a record has been decoded and is waiting to be returned */
        @Override
        public boolean hasNext() {
            return this.currentRecord != null;
        }

        /**
         * Returns the pre-decoded record and decodes the following one.
         *
         * @throws NoSuchElementException if no record is pending
         */
        @Override
        public T next() {
            if (!hasNext()) {
                throw new NoSuchElementException();
            }
            final T ret = this.currentRecord;
            advance();
            return ret;
        }

        /** Removal is not supported. */
        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }

        /** Decodes the next record from the stream; a null result makes {@link #hasNext()} return false. */
        private void advance() {
            this.currentRecord = this.codec.decode();
        }

        /** Closes the file's input stream. */
        @Override
        public void close() {
            CloserUtil.close(this.is);
        }
    }


    /**
     * A PeekIterator over one file's records that additionally carries a serial number,
     * which PeekFileRecordIteratorComparator uses to break ties between iterators whose
     * next records compare as equal.
     */
    class PeekFileRecordIterator extends PeekIterator<T> {
        final int n; // A serial number used for tie-breaking in the sort

        /**
         * @param underlyingIterator iterator over the records of one file
         * @param n                  serial number identifying this iterator for tie-breaking
         */
        PeekFileRecordIterator(final Iterator<T> underlyingIterator, final int n) {
            super(underlyingIterator);
            this.n = n;
        }
    }

    /**
     * Orders PeekFileRecordIterators by the record each one will return next, using the
     * collection's record comparator. When the two pending records compare as equal, the
     * iterators' serial numbers decide, so two distinct iterators never compare as equal.
     */
    class PeekFileRecordIteratorComparator implements Comparator<PeekFileRecordIterator> {
        /**
         * @param lhs first iterator; its next record is peeked at, not consumed
         * @param rhs second iterator; its next record is peeked at, not consumed
         * @return negative, zero or positive as {@code lhs} sorts before, with or after {@code rhs}
         */
        @Override
        public int compare(final PeekFileRecordIterator lhs, final PeekFileRecordIterator rhs) {
            final int result = comparator.compare(lhs.peek(), rhs.peek());
            if (result != 0) {
                return result;
            }
            // Integer.compare rather than subtraction: same sign for the tie-break, and immune to
            // int overflow regardless of what serial numbers are ever assigned.
            return Integer.compare(lhs.n, rhs.n);
        }
    }
}