Spark SQL throws org.apache.spark.sql.AnalysisException: Found duplicate column(s) when writing a JSON file


 

Error Scenario

Consider the following two JSON files:

person.json

{"name":"路飞","age":17,"deptno":1,"money":15}
{"name":"索隆","age":18,"deptno":1,"money":9}
{"name":"乔巴","age":5,"deptno":1,"money":5}
{"name":"艾斯","age":18,"deptno":2,"money":18}
{"name":"萨博","age":18,"deptno":2,"money":16}
{"name":"香克斯","age":32,"deptno":3,"money":30}

dept.json

{"name":"草帽","deptno":1}
{"name":"白胡子","deptno":2}
{"name":"红发","deptno":3}

The error occurs when the two tables are joined and the result is then written to a new JSON file with the following code:

    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.sql.SQLContext

    val sparkConf = new SparkConf().setAppName("SQLContext").setMaster("local[*]")
    val sc = new SparkContext(sparkConf)
    val sqlContext = new SQLContext(sc)

    // 1) Read the two JSON files
    val person = sqlContext.read.format("json").load("C:\\Users\\LUFFY\\Desktop\\testData\\person.json")
    val dept = sqlContext.read.format("json").load("C:\\Users\\LUFFY\\Desktop\\testData\\dept.json")

    // Join the two tables and write the result to a new JSON file
    person.join(dept, person("deptno") === dept("deptno"), "outer").repartition(1)
        .write.format("json").save("C:\\Users\\LUFFY\\Desktop\\testData\\joinJson")

The resulting error:

org.apache.spark.sql.AnalysisException: Found duplicate column(s) when inserting into file:/C:/Users/LUFFY/Desktop/testData/joinJson: `deptno`, `name`;

	at org.apache.spark.sql.util.SchemaUtils$.checkColumnNameDuplication(SchemaUtils.scala:85)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:68)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:104)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:102)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.doExecute(commands.scala:122)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:80)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:80)
	at org.apache.spark.sql.DataFrameWriter.$anonfun$runCommand$1(DataFrameWriter.scala:676)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:676)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:290)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:271)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:229)
	at sparkSql.demo1.testError(demo1.scala:142)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:50)
	at org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12)
	at org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:47)
	at org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17)
	at org.junit.runners.ParentRunner.runLeaf(ParentRunner.java:325)
	at org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:78)
	at org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:57)
	at org.junit.runners.ParentRunner$3.run(ParentRunner.java:290)
	at org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:71)
	at org.junit.runners.ParentRunner.runChildren(ParentRunner.java:288)
	at org.junit.runners.ParentRunner.access$000(ParentRunner.java:58)
	at org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:268)
	at org.junit.runners.ParentRunner.run(ParentRunner.java:363)
	at org.junit.runner.JUnitCore.run(JUnitCore.java:137)
	at com.intellij.junit4.JUnit4IdeaTestRunner.startRunnerWithArgs(JUnit4IdeaTestRunner.java:68)
	at com.intellij.rt.execution.junit.IdeaTestRunner$Repeater.startRunnerWithArgs(IdeaTestRunner.java:47)
	at com.intellij.rt.execution.junit.JUnitStarter.prepareStreamsAndStart(JUnitStarter.java:242)
	at com.intellij.rt.execution.junit.JUnitStarter.main(JUnitStarter.java:70)

 

Root Cause

Duplicate column names. The join result carries two `deptno` columns and two `name` columns, one from each input table. Before writing, Spark's file-based data sources validate the output schema via `SchemaUtils.checkColumnNameDuplication` (the first frame in the stack trace above) and refuse to write any schema containing duplicate column names.
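You can confirm the duplication by printing the joined schema before attempting the write. A minimal sketch, reusing the `person` and `dept` DataFrames from the failing example; the annotated output is approximate (Spark sorts JSON-inferred fields alphabetically):

    // Build the join but inspect the schema instead of writing it out
    val joined = person.join(dept, person("deptno") === dept("deptno"), "outer")
    joined.printSchema()
    // root
    //  |-- age: long (nullable = true)
    //  |-- deptno: long (nullable = true)   <- from person
    //  |-- money: long (nullable = true)
    //  |-- name: string (nullable = true)   <- from person
    //  |-- deptno: long (nullable = true)   <- from dept
    //  |-- name: string (nullable = true)   <- from dept

Note that `show()` and `printSchema()` tolerate the duplicate names; only the file write path rejects them.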

Solution

Rename the columns so that no two collide, for example by prefixing each column with its table name:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import org.junit.Test

@Test
def testJoinSave() = {
  val sparkConf = new SparkConf().setAppName("SQLContext").setMaster("local[*]")
  val sc = new SparkContext(sparkConf)
  val sqlContext = new SQLContext(sc)

  // 1) Read the two JSON files
  val person = sqlContext.read.format("json").load("C:\\Users\\LUFFY\\Desktop\\testData\\person.json")
  val dept = sqlContext.read.format("json").load("C:\\Users\\LUFFY\\Desktop\\testData\\dept.json")

  // Collect each table's column names
  val c_person = person.columns
  val c_dept = dept.columns

  // Alias every column with a table-name prefix so nothing collides
  val person_tmp = person.select(c_person.map(n => person(n).as("person_" + n)): _*)
  val dept_tmp = dept.select(c_dept.map(n => dept(n).as("dept_" + n)): _*)

  // Join on the prefixed key columns and save as JSON
  person_tmp.join(dept_tmp, person_tmp("person_deptno") === dept_tmp("dept_deptno"), "outer").repartition(1)
      .write.format("json").save("C:\\Users\\LUFFY\\Desktop\\testData\\joinJson")
}
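If prefixing every column is more renaming than you need, an alternative is the `Seq`-based `join` overload, which joins by column name and keeps a single `deptno` in the output; the remaining clash on `name` can then be removed with `withColumnRenamed`. A minimal sketch; the `dept_name` alias and the `joinJson2` output path are illustrative choices, not from the original post:

    // Rename dept's `name` so it no longer collides with person's `name`
    val deptRenamed = dept.withColumnRenamed("name", "dept_name")

    // Joining with Seq("deptno") emits the join key only once
    person.join(deptRenamed, Seq("deptno"), "outer").repartition(1)
        .write.format("json").save("C:\\Users\\LUFFY\\Desktop\\testData\\joinJson2")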

 
