transformation

 

// map,遍历操作集合的每一个元素,处理之后返回,组成一个新的list
val rdd1: RDD[Int] = sc.parallelize( List (5,6,4,7,3,8,2,9,1,10)).map(_*2)

// flatMap,先对每个元素做map得到一个集合(相当于list<list>),再把这些内层集合压平成一个总的list
val rdd2 = sc.parallelize( Array ("a b c", "d e f", "h i j"))
rdd2.flatMap(_.split(' ')).collect
res3: Array[String] = Array(a, b, c, d, e, f, h, i, j) 
// 第一个flatMap是spark的,第二个flatMap是scala的
val rdd3 = sc.parallelize( List ( List ("a b c", "a b b"), List ("e f g", "a f g")))
rdd3.flatMap(_.flatMap(_.split(" "))).collect
res4: Array[String] = Array(a, b, c, a, b, b, e, f, g, a, f, g)

// filter,遍历操作集合的每一个元素,处理之后得到true留下,false过滤掉
val rdd4 = sc.parallelize( List (5,6,4,7,3,8,2,9,1,10)).filter(_ % 2 == 0)
rdd4.collect
res5: Array[Int] = Array(6, 4, 8, 2, 10)

//mapPartitions,遍历出rdd所有的分区,拿到分区(迭代器)后可以对分区内的值进行处理
val rdd1 = sc.parallelize( List (1, 2, 3, 4, 5), 2)
rdd1.mapPartitions(it=>it.map(_*10)).collect
res6: Array[Int] = Array(10, 20, 30, 40, 50)

//mapPartitionsWithIndex,遍历出rdd所有的分区和分区索引值
val rdd1 = sc.parallelize( List (1,2,3,4,5,6,7,8,9), 2)
rdd1.mapPartitionsWithIndex((index,it)=>it.map((index+"fq",_))).collect
res7: Array[(String, Int)] = Array((0fq,1), (0fq,2), (0fq,3), (0fq,4), (1fq,5), (1fq,6), (1fq,7), (1fq,8), (1fq,9))

// sortBy, 排序,第二个参数指定顺序 true 升序,false 降序,默认true
sc.parallelize( List (5,11,22,13,2,1,10)).sortBy(x=>x,true).collect
res9: Array[Int] = Array(1, 2, 5, 10, 11, 13, 22)       

// sortBy, 排序,将参数变成string后再排序(按字符串的字典序比较)
sc.parallelize( List (5,11,22,13,2,1,10)).sortBy(x=>x+"",true).collect
res10: Array[Int] = Array(1, 10, 11, 13, 2, 22, 5)

// sortByKey,根据key排序,参数 true 升序,false 降序,默认true
val rdd1 = sc.parallelize( List (("dd", 9), ("aa", 8), ("cc", 7), ("bb", 2)))
rdd1.sortByKey(false).collect
res17: Array[(String, Int)] = Array((dd,9), (cc,7), (bb,2), (aa,8))

// groupBy,对一堆元组,根据指定的key进行分组,组中元素是元组本身
val rdd1 = sc.parallelize( List (("hello", 9), ("tom", 8), ("kitty", 7), ("tom", 2)))
val rdd2 = rdd1.groupBy(_._1).collect
rdd2: Array[(String, Iterable[(String, Int)])] = Array((tom,CompactBuffer((tom,2), (tom,8))), (hello,CompactBuffer((hello,9))), (kitty,CompactBuffer((kitty,7))))      

// groupByKey,根据key进行分组,组中元素是value集合
val rdd2 = sc.parallelize( List (("jerry", 9), ("tom", 8), ("shuke", 7), ("tom", 2)))
rdd2.groupByKey().collect
res1: Array[(String, Iterable[Int])] = Array((tom,CompactBuffer(2, 8)), (jerry,CompactBuffer(9)), (shuke,CompactBuffer(7)))

// reduceByKey,根据key进行分组,分完组后,用给定的函数把同一个key的value两两聚合成一个值。
val rdd2 = sc.parallelize( List (("jerry", 9), ("tom", 8), ("shuke", 7), ("tom", 2)))
rdd2.reduceByKey(_+_).collect
res2: Array[(String, Int)] = Array((tom,10), (jerry,9), (shuke,7))

// distinct, 去重,其实内部是做了一个reduceByKey
sc.parallelize( List (5,5,6,6,7,8,8,8)).distinct.collect
res3: Array[Int] = Array(6, 7, 8, 5)

// union,两个集合求并集(不去重,重复元素会保留),两个 RDD 类型要一样
val rdd6 = sc.parallelize( List (5,6,4,7), 2)
val rdd7 = sc.parallelize( List (1,2,3,4), 3)
rdd6.union(rdd7).collect
res4: Array[Int] = Array(5, 6, 4, 7, 1, 2, 3, 4)

// intersection,两个集合求交集
val rdd6 = sc.parallelize( List (5,6,4,7))
val rdd7 = sc.parallelize( List (1,2,3,4))
rdd6.intersection(rdd7).collect
res8: Array[Int] = Array(4)

// subtract,两个集合差集
val rdd6 = sc.parallelize( List (5,6,4,7))
val rdd7 = sc.parallelize( List (1,2,3,4))
rdd6.subtract(rdd7).collect
res10: Array[Int] = Array(6, 7, 5)

// join, 内关联,根据key组成一个新的元组,key是相同的key,value是相同key的value的集合
val rdd1 = sc.parallelize( List (("tom", 1), ("jerry", 2), ("kitty", 3)))
val rdd2 = sc.parallelize( List (("jerry", 9), ("tom", 8), ("shuke", 7), ("tom", 2)))
rdd1.join(rdd2).collect
res12: Array[(String, (Int, Int))] = Array((tom,(1,2)), (tom,(1,8)), (jerry,(2,9)))

// leftOuterJoin, 左关联,根据key组成一个新的元组,key是相同的key,value是相同key的value的组成的some集合,如果匹配不上就是none
val rdd1 = sc.parallelize( List (("tom", 1), ("jerry", 2), ("kitty", 3)))
val rdd2 = sc.parallelize( List (("jerry", 9), ("tom", 8), ("shuke", 7), ("tom", 2)))
rdd1.leftOuterJoin(rdd2).collect
res13: Array[(String, (Int, Option[Int]))] = Array((tom,(1,Some(2))), (tom,(1,Some(8))), (jerry,(2,Some(9))), (kitty,(3,None)))

// rightOuterJoin, 右关联,根据key组成一个新的元组,key是相同的key,value是相同key的value的组成的some集合,如果匹配不上就是none
val rdd1 = sc.parallelize( List (("tom", 1), ("jerry", 2), ("kitty", 3)))
val rdd2 = sc.parallelize( List (("jerry", 9), ("tom", 8), ("shuke", 7), ("tom", 2)))
rdd1.rightOuterJoin(rdd2).collect
res1: Array[(String, (Option[Int], Int))] = Array((tom,(Some(1),2)), (tom,(Some(1),8)), (jerry,(Some(2),9)), (shuke,(None,7)))

// fullOuterJoin, 全关联,根据key组成一个新的元组,key是相同的key,value是相同key的value的组成的some集合,如果匹配不上就是none
val rdd1 = sc.parallelize( List (("tom", 1), ("jerry", 2), ("kitty", 3)))
val rdd2 = sc.parallelize( List (("jerry", 9), ("tom", 8), ("shuke", 7), ("tom", 2)))
rdd1.fullOuterJoin(rdd2).collect
res3: Array[(String, (Option[Int], Option[Int]))] = Array((tom,(Some(1),Some(8))), (tom,(Some(1),Some(2))), (jerry,(Some(2),Some(9))), (shuke,(None,Some(7))), (kitty,(Some(3),None)))

// cartesian 笛卡尔积
val rdd1 = sc.parallelize( List ("tom", "jerry"))
val rdd2 = sc.parallelize( List ("tom", "kitty", "shuke"))
rdd1.cartesian(rdd2).collect
res4: Array[(String, String)] = Array((tom,tom), (tom,kitty), (tom,shuke), (jerry,tom), (jerry,kitty), (jerry,shuke))

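// aggregateByKey,按照 key 进行聚合,跟 reduceByKey 类似,可以输入两个函数,第一个函数局部聚合,第二个函数全局聚合。初始值只在局部聚合时使用(每个分区内每个key使用一次),全局聚合不使用
// 下面的数据分成2个分区:初始值为0时结果为 cat->17, mouse->6, dog->12;初始值为100时结果为 cat->200, mouse->200, dog->100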
val pairRDD = sc.parallelize( List (("cat",2), ("cat", 5), ("mouse", 4),
     | ("cat", 12), ("dog", 12), ("mouse", 2)), 2)
pairRDD.aggregateByKey(0)(math. max (_, _), _ + _).collect
pairRDD.aggregateByKey(100)(math. max (_, _), _ + _).collect

// combineByKey,需要输入三个函数参数:第一个函数用每个key在分区内遇到的第一个value创建初始值,第二个函数为局部聚合函数,第三个函数为全局聚合函数
val pairRDD = sc.parallelize( List (("cat",2), ("cat", 5), ("mouse", 4),
     | ("cat", 12), ("dog", 12), ("mouse", 2)), 2)
pairRDD.combineByKey(x => x, (a: Int, b: Int) => a + b, (m: Int, n: Int) => m + n).collect
res7: Array[(String, Int)] = Array((dog,12), (cat,19), (mouse,6))

// zip,对应角标的元素组合起来,要求两个RDD的分区数相同,且每个分区内的元素个数相同
val a = sc.parallelize( List ("dog","cat","gnu","salmon",
     | "rabbit","turkey","wolf","bear","bee"), 3)
val b = sc.parallelize( List (1,1,2,2,2,1,2,2,2), 3)
a.zip(b).collect
res0: Array[(String, Int)] = Array((dog,1), (cat,1), (gnu,2), (salmon,2), (rabbit,2), (turkey,1), (wolf,2), (bear,2), (bee,2))

 

 

 

 

 

 

 

 

 

 

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值