RDD groupByKey 入门到熟悉

scala> val list = List("hadoop","spark","hive","spark")
list: List[String] = List(hadoop, spark, hive, spark)

scala> val rdd = sc.parallelize(list)
rdd: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[0] at parallelize at <console>:26

scala> val pairRdd = rdd.map((_,1))
pairRdd: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[1] at map at <console>:28
//分组并计算
scala> pairRdd.reduceByKey(_+_).collect.foreach(println)
(hive,1)
(spark,2)
(hadoop,1)
//只分组,不计算
scala> pairRdd.groupByKey().collect.foreach(println)
(hive,CompactBuffer(1))
(spark,CompactBuffer(1, 1))
(hadoop,CompactBuffer(1))
//groupByKey+map 可以达到和reduceByKey一样的效果,但是效率较低
scala> pairRdd.groupByKey().map(t => (t._1,t._2.sum)).collect.foreach(println)
(hive,1)
(spark,2)
(hadoop,1)

scala> pairRdd.groupByKey().mapValues(_.toList).collect.foreach(println)
(hive,List(1))
(spark,List(1, 1))
(hadoop,List(1))
//groupByKey+mapValues 可以达到和reduceByKey一样的效果,但是效率较低
scala> pairRdd.groupByKey().mapValues(_.toList.sum).collect.foreach(println)
(hive,1)
(spark,2)
(hadoop,1)

scala> pairRdd.groupByKey().mapValues(_.toList.sum.sorted).collect.foreach(println)
<console>:31: error: value sorted is not a member of Int
       pairRdd.groupByKey().mapValues(_.toList.sum.sorted).collect.foreach(println)
                                                   ^

scala> pairRdd.groupByKey().mapValues(_.toList.sum.sortWith((x,y)=(x>y)).collect.foreach(println)
<console>:1: error: ')' expected but '=' found.
pairRdd.groupByKey().mapValues(_.toList.sum.sortWith((x,y)=(x>y)).collect.foreach(println)
                                                          ^

scala> pairRdd.groupByKey().mapValues(_.toList.sum.sortWith((x,y)=>(x>y)).collect.foreach(println)
     |
     |
You typed two blank lines.  Starting a new command.

scala> pairRdd.groupByKey().mapValues(_.toList.sum.sortWith((x,y)=>(x>y))).collect.foreach(println)
<console>:31: error: value sortWith is not a member of Int
       pairRdd.groupByKey().mapValues(_.toList.sum.sortWith((x,y)=>(x>y))).collect.foreach(println)
                                                   ^

scala> pairRdd.groupByKey().mapValues(_.toList.sum).sorted.collect.foreach(println)
<console>:31: error: value sorted is not a member of org.apache.spark.rdd.RDD[(String, Int)]
       pairRdd.groupByKey().mapValues(_.toList.sum).sorted.collect.foreach(println)
                                                    ^

scala> pairRdd.groupByKey().mapValues(_.toList.sum).sortWith((x._2,y._2)=>(x._2>y._2)).collect.foreach(println)
<console>:1: error: not a legal formal parameter.
Note: Tuples cannot be directly destructured in method or function parameters.
      Either create a single parameter accepting the Tuple2,
      or consider a pattern matching anonymous function: `{ case (param1, param2) => ... }`
pairRdd.groupByKey().mapValues(_.toList.sum).sortWith((x._2,y._2)=>(x._2>y._2)).collect.foreach(println)
//注意:sortWith/sorted 是 Scala 集合的方法,RDD 上并没有;要按 value 排序应使用 RDD 的 sortBy 方法,例如:
//pairRdd.reduceByKey(_+_).sortBy(_._2, false).collect.foreach(println)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值