[Spark Basics] -- join and cogroup in Spark

A worked example of both operations follows:

package com.scala

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

/**
 * Scala demo of join vs. cogroup: for each key, join emits one output
 * record per combination of matching values, while cogroup emits a single
 * record holding all of that key's values grouped into collections.
 */
object JoinAndCogroup {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("joinAndCogroup").setMaster("local[1]")
    // Create the context
    val sc = new SparkContext(conf)
    // Build the sample key-value lists: (id, name) and (id, score)
    val stuList = List((1, "tom"), (2, "jim"), (3, "cassie"))
    val scoreList = List((1, 20), (1, 90), (1, 30), (2, 23), (2, 23), (2, 80), (3, 90), (3, 100), (3, 100))
    // Turn them into pair RDDs
    val stuRDD = sc.parallelize(stuList)
    val scoreRDD = sc.parallelize(scoreList)

    // join: one record per (name, score) combination for each id
    val joinRDD = stuRDD.join(scoreRDD)
    for (join2 <- joinRDD.collect()) {
      println("===========")
      println("id is " + join2._1)
      println("name is " + join2._2._1)
      println("score is " + join2._2._2)
    }

    // cogroup: one record per id, with all names and all scores grouped
    val cogroupRDD = stuRDD.cogroup(scoreRDD)
    for (group2 <- cogroupRDD.collect()) {
      println("===========")
      println("id is " + group2._1)
      println("names are " + group2._2._1)
      println("scores are " + group2._2._2)
    }

    sc.stop()
  }
}
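For reference, with the sample data above the two operations yield results along these lines (element order is not guaranteed across partitions, and the grouped collections typically print as CompactBuffer; shown here as raw tuples rather than the formatted println output):

join -- one record per matching (name, score) combination:
(1,(tom,20))  (1,(tom,90))  (1,(tom,30))
(2,(jim,23))  (2,(jim,23))  (2,(jim,80))
(3,(cassie,90))  (3,(cassie,100))  (3,(cassie,100))

cogroup -- one record per key, with the values grouped:
(1,(CompactBuffer(tom),CompactBuffer(20, 90, 30)))
(2,(CompactBuffer(jim),CompactBuffer(23, 23, 80)))
(3,(CompactBuffer(cassie),CompactBuffer(90, 100, 100)))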

 


SparkSQL group by statement fails

09-26

Begging the experts for help. The code is as follows:

val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import spark.implicits._
val testRDD = spark.sparkContext.textFile("hdfs://ip-172-31-26-254:9000/eth-data/done-eth-trx-5125092-5491171.csv").
  filter(line => line.split(",")(25) == "0xa74476443119a942de498590fe1f2454d7d4ac0d")
val rdd = testRDD.map(line => (line.split(",")(25), line.split(",")(15), line.split(",")(18).substring(0, 10)))
case class Row(fromadd: String, amount: Int, date: String)
val rowRDD = rdd.map(p => Row(p._1, p._2.toInt, p._3))
val testDF = rowRDD.toDF()
testDF.registerTempTable("test")

The contents of test look like this:

+--------------------+------+----------+
|             fromadd|amount|      date|
+--------------------+------+----------+
|0xa74476443119a94...| 28553|2018-02-20|
|0xa74476443119a94...| 30764|2018-02-20|
|0xa74476443119a94...| 32775|2018-02-20|
|0xa74476443119a94...| 29439|2018-02-20|
|0xa74476443119a94...| 35810|2018-02-20|
|0xa74476443119a94...| 35810|2018-02-20|
|0xa74476443119a94...| 35810|2018-02-20|
|0xa74476443119a94...| 28926|2018-02-20|
|0xa74476443119a94...| 36229|2018-02-20|
|0xa74476443119a94...| 33235|2018-02-20|
|0xa74476443119a94...| 34104|2018-02-20|
|0xa74476443119a94...| 29425|2018-02-20|
|0xa74476443119a94...| 29568|2018-02-20|
|0xa74476443119a94...| 33473|2018-02-20|
|0xa74476443119a94...| 31344|2018-02-20|
|0xa74476443119a94...| 34399|2018-02-20|
|0xa74476443119a94...| 34080|2018-02-20|
|0xa74476443119a94...| 34080|2018-02-20|
|0xa74476443119a94...| 27165|2018-02-20|
|0xa74476443119a94...| 33512|2018-02-20|
+--------------------+------+----------+

Running this SQL works fine:

val data = sqlContext.sql("select * from test where amount>27000").show()

But running:

val res = sqlContext.sql("select count(amount) from test where group by date").show()

fails with:

org.apache.spark.SparkException: Job aborted due to stage failure: Task 55 in stage 5.0 failed 1 times, most recent failure: Lost task 55.0 in stage 5.0 (TID 82, localhost, executor driver): java.lang.ArrayIndexOutOfBoundsException: 25
  at $anonfun$1.apply(<console>:27)
  at $anonfun$1.apply(<console>:27)
  at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:463)
  at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
  at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
  at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
  at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:395)
  at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
  at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
  at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
  at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
  at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1517)
  ...
  at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:336)
  at org.apache.spark.sql.Dataset.show(Dataset.scala:605)
  ... 50 elided
Caused by: java.lang.ArrayIndexOutOfBoundsException: 25
  at $anonfun$1.apply(<console>:27)
  ... (same executor stack as above)

Many thanks!
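Reading the question from the outside, two things stand out: the failing query has a stray "where" before "group by", and the ArrayIndexOutOfBoundsException: 25 points at the line.split(",")(25) lambda from the RDD pipeline, which is evaluated lazily when the aggregation job actually runs and throws on any CSV line with fewer than 26 comma-separated fields. A minimal fix sketch, assuming the same file path, column layout, and a SparkSession named spark as in the question (the guards and column names are illustrative, not from the thread):

import spark.implicits._

// Split each line once and drop malformed lines before indexing into the fields.
val rows = spark.sparkContext
  .textFile("hdfs://ip-172-31-26-254:9000/eth-data/done-eth-trx-5125092-5491171.csv")
  .map(_.split(",", -1))                            // -1 keeps trailing empty fields
  .filter(f => f.length > 25 && f(18).length >= 10) // guard: skip short/malformed lines
  .filter(_(25) == "0xa74476443119a942de498590fe1f2454d7d4ac0d")
  .map(f => (f(25), f(15).toInt, f(18).substring(0, 10)))
  .toDF("fromadd", "amount", "date")
rows.createOrReplaceTempView("test")

// The aggregate also needs the stray "where" removed (or given a real predicate):
spark.sql("select date, count(amount) from test group by date").show()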
