scala> val list = List("hadoop","spark","hive","spark")
list: List[String] = List(hadoop, spark, hive, spark)
scala> val rdd = sc.parallelize(list)
rdd: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[0] at parallelize at <console>:26
scala> val pairRdd = rdd.map((_,1))
pairRdd: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[1] at map at <console>:28
//分组并计算
scala> pairRdd.reduceByKey(_+_).collect.foreach(println)
(hive,1)
(spark,2)
(hadoop,1)
//只分组 不计算
scala> pairRdd.groupByKey().collect.foreach(println)
(hive,CompactBuffer(1))
(spark,CompactBuffer(1, 1))
(hadoop,CompactBuffer(1))
//groupByKey+map 可以达到和reduceByKey一样的效果,但是效率较低
scala> pairRdd.groupByKey().map(t => (t._1,t._2.sum)).collect.foreach(println)
(hive,1)
(spark,2)
(hadoop,1)
scala> pairRdd.groupByKey().mapValues(_.toList).collect.foreach(println)
(hive,List(1))
(spark,List(1, 1))
(hadoop,List(1))
//groupByKey+mapValues 可以达到和reduceByKey一样的效果,但是效率较低
scala> pairRdd.groupByKey().mapValues(_.toList.sum).collect.foreach(println)
(hive,1)
(spark,2)
(hadoop,1)
scala> pairRdd.groupByKey().mapValues(_.toList.sum.sorted).collect.foreach(println)
<console>:31: error: value sorted is not a member of Int
pairRdd.groupByKey().mapValues(_.toList.sum.sorted).collect.foreach(println)
^
scala> pairRdd.groupByKey().mapValues(_.toList.sum.sortWith((x,y)=(x>y)).collect.foreach(println)
<console>:1: error: ')' expected but '=' found.
pairRdd.groupByKey().mapValues(_.toList.sum.sortWith((x,y)=(x>y)).collect.foreach(println)
^
scala> pairRdd.groupByKey().mapValues(_.toList.sum.sortWith((x,y)=>(x>y)).collect.foreach(println)
|
|
You typed two blank lines. Starting a new command.
scala> pairRdd.groupByKey().mapValues(_.toList.sum.sortWith((x,y)=>(x>y))).collect.foreach(println)
<console>:31: error: value sortWith is not a member of Int
pairRdd.groupByKey().mapValues(_.toList.sum.sortWith((x,y)=>(x>y))).collect.foreach(println)
^
scala> pairRdd.groupByKey().mapValues(_.toList.sum).sorted.collect.foreach(println)
<console>:31: error: value sorted is not a member of org.apache.spark.rdd.RDD[(String, Int)]
pairRdd.groupByKey().mapValues(_.toList.sum).sorted.collect.foreach(println)
^
scala> pairRdd.groupByKey().mapValues(_.toList.sum).sortWith((x._2,y._2)=>(x._2>y._2)).collect.foreach(println)
<console>:1: error: not a legal formal parameter.
Note: Tuples cannot be directly destructured in method or function parameters.
Either create a single parameter accepting the Tuple2,
or consider a pattern matching anonymous function: `{ case (param1, param2) => ... }`
pairRdd.groupByKey().mapValues(_.toList.sum).sortWith((x._2,y._2)=>(x._2>y._2)).collect.foreach(println)
//正确写法:sorted/sortWith 是 Scala 集合的方法,RDD 没有;对 RDD 按 value 降序排序应使用 sortBy
scala> pairRdd.groupByKey().mapValues(_.toList.sum).sortBy(_._2, false).collect.foreach(println)
(spark,2)
(hive,1)
(hadoop,1)
rdd groupByKey入门到熟悉
最新推荐文章于 2023-07-05 18:45:00 发布