对已比对(mapped)和未比对(unmapped)的reads按参考位置(reference position)进行排序
源码:
/**
 * Sorts the reads of the underlying RDD by reference position.
 *
 * Mapped reads are keyed by the ReferencePosition built from the read itself.
 * Unmapped reads are keyed by their read name prefixed with three tildes
 * ("~", ASCII 126) at position 0, so that they are spread out by name instead
 * of swamping a single partition, and so that their keys compare
 * lexicographically "after" the contig names.
 *
 * The whole body runs inside SortReads.time (presumably an instrumentation
 * timer -- its definition is not shown here).
 *
 * @return the reads ordered by their sort key, with the key dropped again
 */
def adamSortReadsByReferencePosition(): RDD[AlignmentRecord] = SortReads.time {
  log.info("Sorting reads by reference position")

  // Attach the sort key to every read.
  val keyedReads = rdd.keyBy { read =>
    if (read.getReadMapped) ReferencePosition(read)
    else ReferencePosition(s"~~~${read.getReadName}", 0)
  }

  // Sort on the key, then keep only the read.
  keyedReads.sortByKey().map { case (_, read) => read }
}
/**
 * Builds a ReferencePosition from an alignment record, using the name of the
 * record's contig and the record's start.
 *
 * The contig is dereferenced without a null check, so the record must carry
 * a contig.
 *
 * @param record the alignment record to read the contig name and start from
 * @return the position made of that contig name and start
 */
def apply(record: AlignmentRecord): ReferencePosition = {
  val contigName = record.getContig.getContigName
  val start = record.getStart
  new ReferencePosition(contigName, start)
}
分析:
先利用keyBy为每条read生成key:已比对的read通过ReferencePosition的apply方法,由ContigName和start构造ReferencePosition作为key;未比对的read则以"~~~"加readName、位置0构造key,使其排在所有contig之后
然后进行sortByKey,最后返回value
另外,测试用例(Suite)中用到了zipWithIndex,需要注意:它在collect之后为每条read记录其在排序结果中的下标,后面的断言依靠这个下标来判断未比对的reads是否都排在最后:
// Checks that adamSortReadsByReferencePosition orders mapped reads by
// (contig name, start) and places every unmapped read after all mapped reads.
sparkTest("sorting reads") {
  // Fixed seed so the same reads are generated on every run.
  val random = new Random("sorting".hashCode)
  val numReadsToCreate = 1000
  val reads = for (i <- 0 until numReadsToCreate) yield {
    val mapped = random.nextBoolean()
    val builder = AlignmentRecord.newBuilder().setReadMapped(mapped)
    if (mapped) {
      // Only mapped reads get a contig and coordinates.
      val contig = Contig.newBuilder
        .setContigName(random.nextInt(numReadsToCreate / 10).toString)
        .build
      val start = random.nextInt(1000000)
      builder.setContig(contig).setStart(start).setEnd(start)
    }
    // Read name: 20 random numbers in [64, 164) concatenated as decimal text.
    builder.setReadName((0 until 20).map(_ => random.nextInt(100) + 64).mkString)
    builder.build()
  }
  val rdd = sc.parallelize(reads)
  /*********add by xubo**************/
  rdd.foreach(println)
  // zipWithIndex records each read's position in the sorted output, so the
  // positions are still known after the partition below.
  val sortedReads = rdd.adamSortReadsByReferencePosition().collect().zipWithIndex
  /*********add by xubo**************/
  sortedReads.foreach(println)
  val (mapped, unmapped) = sortedReads.partition(_._1.getReadMapped)
  // Make sure that all the unmapped reads are placed at the end.
  // lastOption keeps the check well-defined even if no read is mapped.
  assert(mapped.lastOption.forall(last => unmapped.forall(_._2 > last._2)))
  // Make sure that we appropriately sorted the reads: compare contig names
  // first and fall back to the start only when the contig names are equal.
  // The previous comparator (contigA < contigB && startA < startB) was not a
  // valid ordering and could not detect reads of one contig being out of order.
  val expectedSortedReads = mapped.sortWith { (a, b) =>
    val aContig = a._1.getContig.getContigName.toString
    val bContig = b._1.getContig.getContigName.toString
    aContig < bContig || (aContig == bContig && a._1.getStart < b._1.getStart)
  }
  assert(expectedSortedReads === mapped)
}
Suite结果:
```
FromEnd”: 0, “readPaired”: false, “properPair”: false, “readMapped”: true, “mateMapped”: false, “failedVendorQualityChecks”: false, “duplicateRead”: false, “readNegativeStrand”: false, “mateNegativeStrand”: false, “primaryAlignment”: false, “secondaryAlignment”: false, “supplementaryAlignment”: false, “mismatchingPositions”: null, “origQual”: null, “attributes”: null, “recordGroupName”: null, “recordGroupSample”: null, “mateAlignmentStart”: null, “mateAlignmentEnd”: null, “mateContig”: null, “inferredInsertSize”: null}
{“readInFragment”: 0, “contig”: null, “start”: null, “oldPosition”: null, “end”: null, “mapq”: null, “readName”: “12883104691551041117664107129911551229210174103145162”, “sequence”: null, “qual”: null, “cigar”: null, “oldCigar”: null, “basesTrimmedFromStart”: 0, “basesTrimmedFromEnd”: 0, “readPaired”: false, “properPair”: false, “readMapped”: false, “mateMapped”: false, “failedVendorQualityChecks”: false, “duplicateRead”: false, “readNegativeStrand”: false, “mateNegativeStrand”: false, “primaryAlignment”: false, “secondaryAlignment”: false, “supplementaryAlignment”: false, “mismatchingPositions”: null, “origQual”: null, “attributes”: null, “recordGroupName”: null, “recordGroupSample”: null, “mateAlignmentStart”: null, “mateAlignmentEnd”: null, “mateContig”: null, “inferredInsertSize”: null}
{“readInFragment”: 0, “contig”: {“contigName”: “42”, “contigLength”: null, “contigMD5”: null, “referenceURL”: null, “assembly”: null, “species”: null, “referenceIndex”: null}, “start”: 514051, “oldPosition”: null, “end”: 514051, “mapq”: null, “readName”: “163161661111411576912510268859175146127949114514283”, “sequence”: null, “qual”: null, “cigar”: null, “oldCigar”: null, “basesTrimmedFromStart”: 0, “basesTrimmedFromEnd”: 0, “readPaired”: false, “properPair”: false, “readMapped”: true, “mateMapped”: false, “failedVendorQualityChecks”: false, “duplicateRead”: false, “readNegativeStrand”: false, “mateNegativeStrand”: false, “primaryAlignment”: false, “secondaryAlignment”: false, “supplementaryAlignment”: false, “mismatchingPositions”: null, “origQual”: null, “attributes”: null, “recordGroupName”: null, “recordGroupSample”: null, “mateAlignmentStart”: null, “mateAlignmentEnd”: null, “mateContig”: null, “inferredInsertSize”: null}
。。。