求助帖：我在spark的structured streaming进行流计算，想要将每个批次的结果自定义写入hdfs文件中，但是报错无法序列化。

最新推荐文章于 2021-09-10 17:42:18 发布

a997002636

最新推荐文章于 2021-09-10 17:42:18 发布

阅读量439

点赞数

本文链接：https://blog.csdn.net/a997002636/article/details/114869555

版权

求助帖：我在spark的structured streaming进行流计算，想要将每个批次的结果自定义写入hdfs文件中，但是报错无法序列化。
自定义写文件代码如下：
val query = sessionUpdates.writeStream.outputMode(“update”).foreach(new ForeachWriter[NewSessionUpdate] {

  var batchCount = 0
  val conf = new Configuration()
  conf.set("fs.defaultFS", "hdfs://mu01:9000")
  val fs: FileSystem = FileSystem.get(conf)
  var output: FSDataOutputStream = _
  var writer: PrintWriter = _

  // 一般用于 打开链接. 返回 false 表示跳过该分区的数据,
  override def open(partitionId: Long, epochId: Long): Boolean = {
    //        println("open ..." + partitionId + "  " + epochId)
    output = fs.create(new Path("hdfs://mu01:9000/liyixiu/output/result"  + ".txt"))
    writer = new PrintWriter(output)
    true
  }

  // 把数据写入到连接
  override def process(value: NewSessionUpdate): Unit = {
    //        println("process ...." + value)
    val word: String = value.id
    val count: String = value.totalstring
    writer.write(value.id + "," + value.totalstring + "," + value.GPS + "\n")
  }

  // 用户关闭连接
  override def close(errorOrNull: Throwable): Unit = {
    //        println("close...")
    writer.close()
  }
})

query.start().awaitTermination()

报错原因如下：

21/03/16 10:08:10 ERROR v2.WriteToDataSourceV2Exec: Data source writer org.apache.spark.sql.execution.streaming.sources.MicroBatchWriter@520c5cf3 is aborting.
21/03/16 10:08:10 ERROR v2.WriteToDataSourceV2Exec: Data source writer org.apache.spark.sql.execution.streaming.sources.MicroBatchWriter@520c5cf3 aborted.
21/03/16 10:08:10 ERROR streaming.MicroBatchExecution: Query [id = bd0ff581-1a95-48d2-8e30-ff3f3f90b1cd, runId = 65303f9d-3e2d-4208-b4e9-eacbda7ab018] terminated with error
org.apache.spark.SparkException: Writing job aborted.
	at org.apache.spark.sql.execution.datasources.v2.WriteToDataSourceV2Exec.doExecute(WriteToDataSourceV2Exec.scala:92)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:247)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:296)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3383)
	at org.apache.spark.sql.Dataset$$anonfun$collect$1.apply(Dataset.scala:2782)
	at org.apache.spark.sql.Dataset$$anonfun$collect$1.apply(Dataset.scala:2782)
	at org.apache.spark.sql.Dataset$$anonfun$53.apply(Dataset.scala:3364)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3363)
	at org.apache.spark.sql.Dataset.collect(Dataset.scala:2782)
	at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch$5$$anonfun$apply$17.apply(MicroBatchE
xecution.scala:540)	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch$5.apply(MicroBatchExecution.scala:535
)	at org.apache.spark.sql.execution.streaming.ProgressReporter$class.reportTimeTaken(ProgressReporter.scala:351)
	at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:58)
	at org.apache.spark.sql.execution.streaming.MicroBatchExecution.org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch(MicroBatchExecution.scala:534)
	at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply$mcV$sp(MicroBatchExecution.scala:198)
	at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply(MicroBatchExecution.scala:166)
	at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply(MicroBatchExecution.scala:166)
	at org.apache.spark.sql.execution.streaming.ProgressReporter$class.reportTimeTaken(ProgressReporter.scala:351)
	at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:58)
	at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1.apply$mcZ$sp(MicroBatchExecution.scala:166)
	at org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor.execute(TriggerExecutor.scala:56)
	at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runActivatedStream(MicroBatchExecution.scala:160)
	at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:281)
	at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:193)
Caused by: org.apache.spark.SparkException: Task not serializable
	at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:403)
	at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:393)
	at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:162)
	at org.apache.spark.SparkContext.clean(SparkContext.scala:2326)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2056)
	at org.apache.spark.sql.execution.datasources.v2.WriteToDataSourceV2Exec.doExecute(WriteToDataSourceV2Exec.scala:64)
	... 35 more
Caused by: java.io.NotSerializableException: org.apache.hadoop.conf.Configuration
Serialization stack:
	- object not serializable (class: org.apache.hadoop.conf.Configuration, value: Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml,
 yarn-site.xml, hdfs-default.xml, hdfs-site.xml)	- field (class: DataAnalysis.Test1$$anon$1, name: conf, type: class org.apache.hadoop.conf.Configuration)
	- object (class DataAnalysis.Test1$$anon$1, DataAnalysis.Test1$$anon$1@1b044485)
	- field (class: org.apache.spark.sql.execution.streaming.sources.ForeachWriterFactory, name: writer, type: class org.apache.spark.sql.ForeachWriter)
	- object (class org.apache.spark.sql.execution.streaming.sources.ForeachWriterFactory, ForeachWriterFactory(DataAnalysis.Test1$$anon$1@1b044485,<function1>))
	- field (class: org.apache.spark.sql.execution.datasources.v2.WriteToDataSourceV2Exec$$anonfun$doExecute$2, name: writeTask$1, type: interface org.apache.spark.sql.sources.v2.writer
.DataWriterFactory)	- object (class org.apache.spark.sql.execution.datasources.v2.WriteToDataSourceV2Exec$$anonfun$doExecute$2, <function2>)
	at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
	at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
	at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:100)
	at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:400)
	... 40 more
Exception in thread "main" org.apache.spark.sql.streaming.StreamingQueryException: Writing job aborted.
=== Streaming Query ===
Identifier: [id = bd0ff581-1a95-48d2-8e30-ff3f3f90b1cd, runId = 65303f9d-3e2d-4208-b4e9-eacbda7ab018]
Current Committed Offsets: {}
Current Available Offsets: {FileStreamSource[hdfs://mu01:9000/liyixiu/input2021]: {"logOffset":0}}

Current State: ACTIVE
Thread State: RUNNABLE

Logical Plan:
SerializeFromObject [staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, assertnotnull(assertnotnull(input[0, DataAnalysis.Test1$NewSessionUpdate, true])).i
d, true, false) AS id#34, staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, assertnotnull(assertnotnull(input[0, DataAnalysis.Test1$NewSessionUpdate, true])).totalstring, true, false) AS totalstring#35, staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, assertnotnull(assertnotnull(input[0, DataAnalysis.Test1$NewSessionUpdate, true])).GPS, true, false) AS GPS#36]+- FlatMapGroupsWithState <function3>, cast(value#27 as string).toString, newInstance(class DataAnalysis.Test1$NewSession), [value#27], [sessionId#23, value#24], obj#33: DataAnalysis.Test1$
NewSessionUpdate, class[totalstring[0]: string, GPS[0]: string], Update, true, NoTimeout   +- AppendColumns <function1>, class DataAnalysis.Test1$NewSession, [StructField(sessionId,StringType,true), StructField(value,StringType,true)], newInstance(class DataAnalysis.Test1$NewS
ession), [staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, input[0, java.lang.String, true], true, false) AS value#27]      +- SerializeFromObject [staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, assertnotnull(assertnotnull(input[0, DataAnalysis.Test1$NewSession, true])
).sessionId, true, false) AS sessionId#23, staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, assertnotnull(assertnotnull(input[0, DataAnalysis.Test1$NewSession, true])).value, true, false) AS value#24]         +- MapElements <function1>, class java.lang.String, [StructField(value,StringType,true)], obj#22: DataAnalysis.Test1$NewSession
            +- DeserializeToObject cast(value#17 as string).toString, obj#21: java.lang.String
               +- SerializeFromObject [staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, input[0, java.lang.String, true], true, false) AS value#17]
                  +- MapElements <function1>, interface org.apache.spark.sql.Row, [StructField(carID,StringType,true), StructField(Time,StringType,true), StructField(longitude,StringType,tr
ue), StructField(latitude,StringType,true), StructField(speed,StringType,true), StructField(angle,StringType,true), StructField(occupied,StringType,true)], obj#16: java.lang.String                     +- DeserializeToObject createexternalrow(carID#0.toString, Time#1.toString, longitude#2.toString, latitude#3.toString, speed#4.toString, angle#5.toString, occupied#6.to
String, StructField(carID,StringType,true), StructField(Time,StringType,true), StructField(longitude,StringType,true), StructField(latitude,StringType,true), StructField(speed,StringType,true), StructField(angle,StringType,true), StructField(occupied,StringType,true)), obj#15: org.apache.spark.sql.Row                        +- Repartition 50, true
                           +- StreamingExecutionRelation FileStreamSource[hdfs://mu01:9000/liyixiu/input2021], [carID#0, Time#1, longitude#2, latitude#3, speed#4, angle#5, occupied#6]

	at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:297)
	at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:193)
Caused by: org.apache.spark.SparkException: Writing job aborted.
	at org.apache.spark.sql.execution.datasources.v2.WriteToDataSourceV2Exec.doExecute(WriteToDataSourceV2Exec.scala:92)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:247)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:296)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3383)
	at org.apache.spark.sql.Dataset$$anonfun$collect$1.apply(Dataset.scala:2782)
	at org.apache.spark.sql.Dataset$$anonfun$collect$1.apply(Dataset.scala:2782)
	at org.apache.spark.sql.Dataset$$anonfun$53.apply(Dataset.scala:3364)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3363)
	at org.apache.spark.sql.Dataset.collect(Dataset.scala:2782)
	at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch$5$$anonfun$apply$17.apply(MicroBatchE
xecution.scala:540)	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch$5.apply(MicroBatchExecution.scala:535
)	at org.apache.spark.sql.execution.streaming.ProgressReporter$class.reportTimeTaken(ProgressReporter.scala:351)
	at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:58)
	at org.apache.spark.sql.execution.streaming.MicroBatchExecution.org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch(MicroBatchExecution.scala:534)
	at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply$mcV$sp(MicroBatchExecution.scala:198)
	at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply(MicroBatchExecution.scala:166)
	at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply(MicroBatchExecution.scala:166)
	at org.apache.spark.sql.execution.streaming.ProgressReporter$class.reportTimeTaken(ProgressReporter.scala:351)
	at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:58)
	at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1.apply$mcZ$sp(MicroBatchExecution.scala:166)
	at org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor.execute(TriggerExecutor.scala:56)
	at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runActivatedStream(MicroBatchExecution.scala:160)
	at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:281)
	... 1 more
Caused by: org.apache.spark.SparkException: Task not serializable
	at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:403)
	at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:393)
	at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:162)
	at org.apache.spark.SparkContext.clean(SparkContext.scala:2326)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2056)
	at org.apache.spark.sql.execution.datasources.v2.WriteToDataSourceV2Exec.doExecute(WriteToDataSourceV2Exec.scala:64)
	... 35 more
Caused by: java.io.NotSerializableException: org.apache.hadoop.conf.Configuration
Serialization stack:
	- object not serializable (class: org.apache.hadoop.conf.Configuration, value: Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml,
 yarn-site.xml, hdfs-default.xml, hdfs-site.xml)	- field (class: DataAnalysis.Test1$$anon$1, name: conf, type: class org.apache.hadoop.conf.Configuration)
	- object (class DataAnalysis.Test1$$anon$1, DataAnalysis.Test1$$anon$1@1b044485)
	- field (class: org.apache.spark.sql.execution.streaming.sources.ForeachWriterFactory, name: writer, type: class org.apache.spark.sql.ForeachWriter)
	- object (class org.apache.spark.sql.execution.streaming.sources.ForeachWriterFactory, ForeachWriterFactory(DataAnalysis.Test1$$anon$1@1b044485,<function1>))
	- field (class: org.apache.spark.sql.execution.datasources.v2.WriteToDataSourceV2Exec$$anonfun$doExecute$2, name: writeTask$1, type: interface org.apache.spark.sql.sources.v2.writer
.DataWriterFactory)	- object (class org.apache.spark.sql.execution.datasources.v2.WriteToDataSourceV2Exec$$anonfun$doExecute$2, <function2>)
	at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
	at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
	at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:100)
	at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:400)

a997002636

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
1
评论
求助帖：我在spark的structured streaming进行流计算，想要将每个批次的结果自定义写入hdfs文件中，但是报错无法序列化。

求助帖：我在spark的structured streaming进行流计算，想要将每个批次的结果自定义写入hdfs文件中，但是报错无法序列化。自定义写文件代码如下：val query = sessionUpdates.writeStream.outputMode(“update”).foreach(new ForeachWriter[NewSessionUpdate] { var batchCount = 0 val conf = new Configuration() conf.set("
复制链接

扫一扫