Adam学习27之序列化问题解决办法

1.问题:

1.1 描述

当读入fastq文件后,需要进行转换,比如collect或者读取属性,会出现没有序列化的问题

1.2 问题代码:

package org.dsw.core

import org.apache.spark.{SparkContext, SparkConf}
import org.bdgenomics.adam.rdd.ADAMContext._

/**
  * Created by xubo on 2016/12/16.
  */
// Empty placeholder class; the runnable entry point lives in the companion object below.
class DSAAdamTest {


}

object DSAAdamTest {
  // Demonstrates the failure: this version configures NO serializer, so Spark
  // falls back to the default JavaSerializer (see the stack trace below, which
  // goes through JavaSerializationStream.writeObject).
  def main(args: Array[String]) {

    val conf = new SparkConf().setMaster("local[8]").setAppName("adam")
    val sc = new SparkContext(conf)
    val input = "file/input/fastq/fix1.fastq"
    // loadAlignments is added to SparkContext by the ADAMContext._ implicit
    // imported above; the resulting RDD holds Avro-generated
    // org.bdgenomics.formats.avro.AlignmentRecord objects.
    val rdd = sc.loadAlignments(input)
    //    println(rdd)
    rdd.foreach(println)
    // collect() must serialize AlignmentRecord back to the driver; the
    // Avro-generated class does not implement java.io.Serializable, so this
    // line throws java.io.NotSerializableException (logged below).
    println(rdd.collect().length)
    sc.stop()

  }
}

1.3 记录:

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/C:/all/java/jar/spark-assembly-1.5.2-hadoop2.6.0.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/C:/all/java/jar/adam-cli_2.10-0.18.3-SNAPSHOT.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/C:/Users/xubo/.m2/repository/org/slf4j/slf4j-log4j12/1.7.12/slf4j-log4j12-1.7.12.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/C:/Users/xubo/.m2/repository/org/bdgenomics/adam/adam-cli_2.10/0.18.2/adam-cli_2.10-0.18.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
2016-12-16 17:20:50 WARN  NativeCodeLoader:62 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2016-12-16 17:20:51 WARN  MetricsSystem:71 - Using default name DAGScheduler for source because spark.app.id is not set.
2016-12-16 17:20:53 WARN  :139 - Your hostname, xubo-PC resolves to a loopback/non-reachable address: fe80:0:0:0:200:5efe:ca26:541f%20, but we couldn't find any external IP address!
{"readNum": null, "contig": null, "start": null, "oldPosition": null, "end": null, "mapq": null, "readName": "chr1-1\tRG\tID:foo\tSM:bar", "sequence": "CATATTTACCAATTAAAGTCACAAAATATTTCTCATTATTTATTCATGCAGGTAACTGAGACAAAGATAGTGCAGAAATCAACTTTAAATAAAAAATTAT", "qual": "@C@D@FFDFHHHHIJ.JBIJJGJGIJ:G47JHJ@IJJ91BJJIGHHHEIJDGD=IJJJBJJ'DG=3D)<D?HCHBFAE?GEDC5D5ECD<CD<DBADDBE", "cigar": null, "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": null, "readMapped": false, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": null, "mateNegativeStrand": null, "primaryAlignment": null, "secondaryAlignment": null, "supplementaryAlignment": null, "mismatchingPositions": null, "origQual": null, "attributes": null, "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null}
2016-12-16 17:20:54 ERROR Executor:96 - Exception in task 0.0 in stage 1.0 (TID 1)
java.io.NotSerializableException: org.bdgenomics.formats.avro.AlignmentRecord
Serialization stack:
    - object not serializable (class: org.bdgenomics.formats.avro.AlignmentRecord, value: {"readNum": null, "contig": null, "start": null, "oldPosition": null, "end": null, "mapq": null, "readName": "chr1-1\tRG\tID:foo\tSM:bar", "sequence": "CATATTTACCAATTAAAGTCACAAAATATTTCTCATTATTTATTCATGCAGGTAACTGAGACAAAGATAGTGCAGAAATCAACTTTAAATAAAAAATTAT", "qual": "@C@D@FFDFHHHHIJ.JBIJJGJGIJ:G47JHJ@IJJ91BJJIGHHHEIJDGD=IJJJBJJ'DG=3D)<D?HCHBFAE?GEDC5D5ECD<CD<DBADDBE", "cigar": null, "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": null, "readMapped": false, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": null, "mateNegativeStrand": null, "primaryAlignment": null, "secondaryAlignment": null, "supplementaryAlignment": null, "mismatchingPositions": null, "origQual": null, "attributes": null, "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null})
    - element of array (index: 0)
    - array (class [Lorg.bdgenomics.formats.avro.AlignmentRecord;, size 1)
    at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
    at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:47)
    at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:84)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:240)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
    at java.lang.Thread.run(Thread.java:745)
2016-12-16 17:20:54 ERROR TaskSetManager:75 - Task 0.0 in stage 1.0 (TID 1) had a not serializable result: org.bdgenomics.formats.avro.AlignmentRecord
Serialization stack:
    - object not serializable (class: org.bdgenomics.formats.avro.AlignmentRecord, value: {"readNum": null, "contig": null, "start": null, "oldPosition": null, "end": null, "mapq": null, "readName": "chr1-1\tRG\tID:foo\tSM:bar", "sequence": "CATATTTACCAATTAAAGTCACAAAATATTTCTCATTATTTATTCATGCAGGTAACTGAGACAAAGATAGTGCAGAAATCAACTTTAAATAAAAAATTAT", "qual": "@C@D@FFDFHHHHIJ.JBIJJGJGIJ:G47JHJ@IJJ91BJJIGHHHEIJDGD=IJJJBJJ'DG=3D)<D?HCHBFAE?GEDC5D5ECD<CD<DBADDBE", "cigar": null, "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": null, "readMapped": false, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": null, "mateNegativeStrand": null, "primaryAlignment": null, "secondaryAlignment": null, "supplementaryAlignment": null, "mismatchingPositions": null, "origQual": null, "attributes": null, "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null})
    - element of array (index: 0)
    - array (class [Lorg.bdgenomics.formats.avro.AlignmentRecord;, size 1); not retrying
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0.0 in stage 1.0 (TID 1) had a not serializable result: org.bdgenomics.formats.avro.AlignmentRecord
Serialization stack:
    - object not serializable (class: org.bdgenomics.formats.avro.AlignmentRecord, value: {"readNum": null, "contig": null, "start": null, "oldPosition": null, "end": null, "mapq": null, "readName": "chr1-1\tRG\tID:foo\tSM:bar", "sequence": "CATATTTACCAATTAAAGTCACAAAATATTTCTCATTATTTATTCATGCAGGTAACTGAGACAAAGATAGTGCAGAAATCAACTTTAAATAAAAAATTAT", "qual": "@C@D@FFDFHHHHIJ.JBIJJGJGIJ:G47JHJ@IJJ91BJJIGHHHEIJDGD=IJJJBJJ'DG=3D)<D?HCHBFAE?GEDC5D5ECD<CD<DBADDBE", "cigar": null, "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": null, "readMapped": false, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": null, "mateNegativeStrand": null, "primaryAlignment": null, "secondaryAlignment": null, "supplementaryAlignment": null, "mismatchingPositions": null, "origQual": null, "attributes": null, "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null})
    - element of array (index: 0)
    - array (class [Lorg.bdgenomics.formats.avro.AlignmentRecord;, size 1)
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1283)
		at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1271)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1270)
		at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
		at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
		at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1270)
		at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
		at scala.Option.foreach(Option.scala:236)
		at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:697)
		at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1496)
		at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1458)
		at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1447)
		at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1824)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1837)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1850)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1921)
    at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:909)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
    at org.apache.spark.rdd.RDD.collect(RDD.scala:908)
    at org.dsw.core.DSAAdamTest$.main(DSAAdamTest.scala:24)
    at org.dsw.core.DSAAdamTest.main(DSAAdamTest.scala)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:606)
    at com.intellij.rt.execution.application.AppMain.main(AppMain.java:144)

1.4 原因分析

主要是由于类没有序列化:

java.io.NotSerializableException: org.bdgenomics.formats.avro.AlignmentRecord

但是该类在引用的jar包里面,不方便直接修改,所以需要换思路解决

2 解决办法

2.1 解决思路

查看adam源码中的相关操作,比如搜索所有调用.collect的地方,看源码中是否做了相关的序列化处理

发现adam测试类的特质(trait)中配置了序列化相关的属性

org.bdgenomics.adam.util.ADAMFunSuite:

  override val appName: String = "adam"
  override val properties: Map[String, String] = Map(
    ("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
    ("spark.kryo.registrator", "org.bdgenomics.adam.serialization.ADAMKryoRegistrator"),
    ("spark.kryo.referenceTracking", "true"))

所以想到应该可以尝试对SparkConf进行序列化属性设置

2.2 解决办法尝试

在SparkContext定义中加入序列化配置

.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

发现问题解决了

详细的代码请见:

package org.dsw.core

import org.apache.spark.{SparkContext, SparkConf}
import org.bdgenomics.adam.rdd.ADAMContext._

/**
  * Created by xubo on 2016/12/16.
  */
// Empty placeholder class; the runnable entry point lives in the companion object below.
class DSAAdamTest {


}

object DSAAdamTest {
  def main(args: Array[String]) {

    // Fix: switch Spark to Kryo serialization, which can serialize the
    // Avro-generated AlignmentRecord even though it does not implement
    // java.io.Serializable. This mirrors the properties ADAM's own test
    // suite (ADAMFunSuite) sets, as noted above.
    val conf = new SparkConf().setMaster("local[8]").setAppName("adam")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    val sc = new SparkContext(conf)
    val input = "file/input/fastq/fix1.fastq"
    // loadAlignments comes from the ADAMContext._ implicit imported above.
    val rdd = sc.loadAlignments(input)
    //    println(rdd)
    rdd.foreach(println)
    // With Kryo configured, collect() now succeeds (output shown below).
    println(rdd.collect().length)
    sc.stop()

  }
}

2.3 问题解决运行成功记录

{"readNum": null, "contig": null, "start": null, "oldPosition": null, "end": null, "mapq": null, "readName": "chr1-1\tRG\tID:foo\tSM:bar", "sequence": "CATATTTACCAATTAAAGTCACAAAATATTTCTCATTATTTATTCATGCAGGTAACTGAGACAAAGATAGTGCAGAAATCAACTTTAAATAAAAAATTAT", "qual": "@C@D@FFDFHHHHIJ.JBIJJGJGIJ:G47JHJ@IJJ91BJJIGHHHEIJDGD=IJJJBJJ'DG=3D)<D?HCHBFAE?GEDC5D5ECD<CD<DBADDBE", "cigar": null, "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": null, "readMapped": false, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": null, "mateNegativeStrand": null, "primaryAlignment": null, "secondaryAlignment": null, "supplementaryAlignment": null, "mismatchingPositions": null, "origQual": null, "attributes": null, "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null}
1

换一个大一点的文件(49行)测试也没有问题

3 数据

fix1.fastq

@chr1-1 RG  ID:foo  SM:bar
CATATTTACCAATTAAAGTCACAAAATATTTCTCATTATTTATTCATGCAGGTAACTGAGACAAAGATAGTGCAGAAATCAACTTTAAATAAAAAATTAT
+
@C@D@FFDFHHHHIJ.JBIJJGJGIJ:G47JHJ@IJJ91BJJIGHHHEIJDGD=IJJJBJJ'DG=3D)<D?HCHBFAE?GEDC5D5ECD<CD<DBADDBE
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值