// Imports for the packages/classes this method depends on
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.seqdoop.hadoop_bam.FastqInputFormat;
import org.seqdoop.hadoop_bam.SequencedFragment;
import scala.Tuple2;
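/**
 * Interleaves two paired-end FASTQ files into a single RDD of
 * (read name, fragment) pairs. Corresponding splits of the two files are
 * read in lockstep so that mates end up next to each other in the output.
 */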
private static JavaPairRDD<Text, SequencedFragment> interleaveReads(String fastq, String fastq2,
        int splitlen, JavaSparkContext sc) throws IOException {
    FileSystem fs = FileSystem.get(new Configuration());
    FileStatus fst = fs.getFileStatus(new Path(fastq));
    FileStatus fst2 = fs.getFileStatus(new Path(fastq2));

    // Split both FASTQ files into chunks of splitlen lines so that the nth split of
    // the first file lines up with the nth split of the second.
    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
    List<FileSplit> nlif2 = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);

    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    JavaRDD<FileSplit> splitRDD2 = sc.parallelize(nlif2);
    // zip pairs corresponding splits; it requires both RDDs to have the same number
    // of partitions and the same number of elements in each partition.
    JavaPairRDD<FileSplit, FileSplit> zips = splitRDD.zip(splitRDD2);
    return zips.flatMapToPair(splits -> {
        // Read the two splits in lockstep with hadoop-bam's FASTQ record reader.
        FastqInputFormat.FastqRecordReader fqreader =
                new FastqInputFormat.FastqRecordReader(new Configuration(), splits._1);
        FastqInputFormat.FastqRecordReader fqreader2 =
                new FastqInputFormat.FastqRecordReader(new Configuration(), splits._2);

        ArrayList<Tuple2<Text, SequencedFragment>> reads = new ArrayList<>();
        while (fqreader.nextKeyValue()) {
            // Keep only the read name; drop the Illumina-style comment after the
            // first space (e.g. " 1:N:0:1").
            String key = fqreader.getCurrentKey().toString();
            String[] keysplit = key.split(" ");
            key = keysplit[0];

            SequencedFragment sf = new SequencedFragment();
            sf.setQuality(new Text(fqreader.getCurrentValue().getQuality().toString()));
            sf.setSequence(new Text(fqreader.getCurrentValue().getSequence().toString()));
            if (fqreader2.nextKeyValue()) {
                String key2 = fqreader2.getCurrentKey().toString();
                String[] keysplit2 = key2.split(" ");
                key2 = keysplit2[0];
                //key2 = key2.replace(" 2:N:0:1","/2");

                SequencedFragment sf2 = new SequencedFragment();
                sf2.setQuality(new Text(fqreader2.getCurrentValue().getQuality().toString()));
                sf2.setSequence(new Text(fqreader2.getCurrentValue().getSequence().toString()));

                // Emit the two mates back to back so the output is interleaved.
                reads.add(new Tuple2<>(new Text(key), sf));
                reads.add(new Tuple2<>(new Text(key2), sf2));
            }
        }
        // Close the readers to release the underlying input streams.
        fqreader.close();
        fqreader2.close();
        return reads.iterator();
    });
}
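// A minimal usage sketch, not part of the original source: the HDFS paths, the split
// length, and the output directory are hypothetical placeholders. Writing the result
// back out assumes hadoop-bam's org.seqdoop.hadoop_bam.FastqOutputFormat and Spark's
// org.apache.spark.SparkConf are also on the classpath and imported.
private static void interleaveExample() throws IOException {
    JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("InterleaveReads"));
    JavaPairRDD<Text, SequencedFragment> interleaved =
            interleaveReads("hdfs:///data/sample_1.fastq", "hdfs:///data/sample_2.fastq", 400, sc);
    // Each (name, fragment) pair is written as one FASTQ record, keeping mates adjacent.
    interleaved.saveAsNewAPIHadoopFile("hdfs:///data/interleaved", Text.class,
            SequencedFragment.class, FastqOutputFormat.class, sc.hadoopConfiguration());
    sc.stop();
}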