Spark RDD Operators

This article walks through the basic operations on Spark RDDs: transformations such as map, filter, and join, and actions such as collect, reduce, and count. It also covers partitioning, serialization, and creating RDDs via parallelize, with small examples that show how each operator is used.

#If an RDD is created via parallelize and the number of partitions is not specified, the default is tied to the number of cores

#If an RDD is created by reading data from HDFS, the number of partitions matches the number of input splits
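
#A quick sketch to check both rules in spark-shell, assuming a local[4] master and a hypothetical file hdfs:///data/words.txt
val rddA = sc.parallelize(1 to 100)
rddA.partitions.length      // defaults to spark.default.parallelism, e.g. 4 on local[4]
val rddB = sc.parallelize(1 to 100, 6)
rddB.partitions.length      // 6: the slice count was given explicitly
val rddC = sc.textFile("hdfs:///data/words.txt")
rddC.partitions.length      // equals the number of input splits (minPartitions, default 2, is only a hint)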

 

#Check the number of partitions

rdd4.partitions.length

 

#Create an RDD via parallelize

val rdd1 = sc.parallelize(List(5,6,4,7,3,8,2,9,1,10))

val rdd1 = sc.parallelize(List(5,6,4,7,3,8,2,9,1,10)).map(_*2).sortBy(x=>x,true)

val rdd2 = rdd1.filter(_>10)

val rdd2 = sc.parallelize(List(5,6,4,7,3,8,2,9,1,10)).map(_*2).sortBy(x=>x+"",true)

val rdd2 = sc.parallelize(List(5,6,4,7,3,8,2,9,1,10)).map(_*2).sortBy(x=>x.toString,true)

 

val rdd4 = sc.parallelize(Array("a b c","d e f","h ij"))

rdd4.flatMap(_.split(" ")).collect

 

val rdd5 = sc.parallelize(List(List("a b c","a b b"),List("ef g","a f g"),List("h i j","a a b")))

rdd5.flatMap(_.flatMap(_.split(" "))).collect

 

#union: returns the union of two RDDs; the element types must match

val rdd6 = sc.parallelize(List(5,6,4,7))

val rdd7 = sc.parallelize(List(1,2,3,4))

val rdd8 = rdd6.union(rdd7)

rdd8.distinct.sortBy(x=>x).collect

 

#intersection: returns the intersection of two RDDs

val rdd9 = rdd6.intersection(rdd7)

 

val rdd1 = sc.parallelize(List(("tom",1),("jerry",2),("kitty",3)))

val rdd2 = sc.parallelize(List(("jerry",9),("tom",8),("shuke",7),("tom",2)))

 

#join

val rdd3 = rdd1.join(rdd2)

val rdd3 = rdd1.leftOuterJoin(rdd2)

val rdd3 = rdd1.rightOuterJoin(rdd2)

 

#groupByKey

val rdd3 = rdd1 union rdd2

rdd3.groupByKey

rdd3.groupByKey.map(x=>(x._1,x._2.sum))

rdd3.groupByKey.mapValues(_.sum).collect

#example output of rdd3.groupByKey.collect: Array((tom,CompactBuffer(1,8,2)), (jerry,CompactBuffer(2,9)), (kitty,CompactBuffer(3)), (shuke,CompactBuffer(7)))

 

#WordCount

sc.textFile("/root/word.txt").flatMap(x=>x.split(",")).map((_,1)).reduceByKey(_+_).sortBy(_._2,false).collect

sc.textFile("/root/word.txt").flatMap(x=>x.split(",")).map((_,1)).groupByKey.map(t=>(t._1,t._2.sum)).collect

sc.textFile("/root/word.txt").flatMap(x=>x.split(",")).map((_,1)).groupByKey.mapValues(_.sum).collect

 

 

#cogroup

val rdd1 = sc.parallelize(List(("tom",1),("tom",2),("jerry",3),("kitty",2)))

val rdd2 = sc.parallelize(List(("jerry",2),("tom",1),("shuke",2)))

val rdd3 = rdd1.cogroup(rdd2)

val rdd4 = rdd3.map(t=>(t._1,t._2._1.sum+t._2._2.sum))

 

#cartesian: Cartesian product

val rdd1 = sc.parallelize(List("tom","jerry"))

val rdd2 = sc.parallelize(List("tom","kitty","shuke"))

val rdd3 = rdd1.cartesian(rdd2)

 

#################################################################################################

#spark action

val rdd1 = sc.parallelize(List(1,2,3,4,5),2)

 

#collect

rdd1.collect

 

#reduce

val rdd2 = rdd1.reduce(_+_)

 

#count

rdd1.count

 

#top

rdd1.top(2)

 

#take

rdd1.take(2)

 

#first (similar to take(1))

rdd1.first

 

#takeOrdered

rdd1.takeOrdered(3)

 

 

#mapPartitionsWithIndex: gives access to the partition index together with the values in that partition

#aggregate: aggregates within each partition, then combines the per-partition results

def func1 = (index: Int, iter: Iterator[Int]) => {

  iter.toList.map(x => "[partID:" + index + ", val:" + x + "]").iterator

}

val rdd1 = sc.parallelize(List(1,2,3,4,5,6,7,8,9),2)

rdd1.mapPartitionsWithIndex(func1).collect

#Add up the maximum value of each partition

rdd1.aggregate(0)(math.max(_,_),_+_)

#result is 5 + 5 + 9 = 19: the zero value 5 enters each partition's max and the final combine

rdd1.aggregate(5)(math.max(_,_),_+_)
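
#A further aggregate sketch: compute sum and count in one pass, then the average (seqOp runs inside each partition, combOp merges the partition results)
val (sum, cnt) = rdd1.aggregate((0, 0))(
  (acc, v) => (acc._1 + v, acc._2 + 1),      // seqOp: add the value, bump the count
  (a, b) => (a._1 + b._1, a._2 + b._2)       // combOp: merge the per-partition (sum, count) pairs
)
sum.toDouble / cnt                           // 45 / 9 = 5.0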

 

 

val rdd2 = sc.parallelize(List("a","b","c","d","e","f"),2)

def func2 = (index: Int, iter: Iterator[String]) => {

  iter.toList.map(x => "[partID:" + index + ", val:" + x + "]").iterator

}

rdd2.aggregate("")(_+_,_+_)

rdd2.aggregate("=")(_+_,_+_)

 

val rdd3 = sc.parallelize(List("12","23","345","4567"),2)

rdd3.aggregate("")((x,y)=>math.max(x.length,y.length).toString,(x,y)=>x+y)

 

val rdd4 = sc.parallelize(List("12","23","345",""),2)

rdd4.aggregate("")((x,y)=>math.min(x.length,y.length).toString,(x,y)=>x+y)

 

val rdd5 = sc.parallelize(List("12","23","","345"),2)

rdd5.aggregate("")((x,y)=>math.min(x.length,y.length).toString,(x,y)=>x+y)

 

 

#aggregateByKey

val pairRDD = sc.parallelize(List(("cat",2),("cat",5),("mouse",4),("cat",12),("dog",12),("mouse",2)), 2)

def func2(index: Int, iter: Iterator[(String,Int)]): Iterator[String] = {

  iter.toList.map(x => "[partID:" + index + ", val:" + x + "]").iterator

}

pairRDD.mapPartitionsWithIndex(func2).collect

pairRDD.aggregateByKey(0)(math.max(_,_),_+_).collect

pairRDD.aggregateByKey(100)(math.max(_,_),_+_).collect
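
#Expected output, assuming partition 0 holds (cat,2),(cat,5),(mouse,4) and partition 1 holds the rest (array order may vary):
#aggregateByKey(0):   Array((dog,12), (cat,17), (mouse,6))      -- per-key max within each partition, then summed across partitions
#aggregateByKey(100): Array((dog,100), (cat,200), (mouse,200))  -- the zero value 100 exceeds every per-partition max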

 

#checkpoint

sc.setCheckpointDir("hdfs://usr/ck")

val rdd = sc.textFile("hdfs://root/wc.txt").flatMap(_.split(" ")).map((_,1)).reduceByKey(_+_)

rdd.checkpoint

rdd.isCheckpointed    // false: the checkpoint data is written only when an action runs

rdd.count             // triggers the job and materializes the checkpoint

rdd.isCheckpointed    // true

rdd.getCheckpointFile

 

#coalesce / repartition

val rdd1 = sc.parallelize(1 to 10, 10)

val rdd2 = rdd1.coalesce(2, false)

rdd2.partitions.length
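
#repartition(n) is coalesce(n, shuffle = true), so unlike plain coalesce it can also increase the partition count; a quick sketch:
val rdd3 = rdd1.repartition(5)
rdd3.partitions.length      // 5: the shuffle redistributes data across the new partitions
val rdd4 = rdd1.coalesce(20, false)
rdd4.partitions.length      // still 10: without a shuffle, coalesce cannot increase the partition count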

 

#collectAsMap

val rdd = sc.parallelize(List(("a",1),("b",2)))

rdd.collectAsMap

 

#combineByKey

val rdd1 = sc.textFile("hdfs://root/wc").flatMap(_.split(" ")).map((_,1))

val rdd2 = rdd1.combineByKey(x=>x, (a:Int,b:Int)=>a+b, (m:Int,n:Int)=>m+n)

rdd2.collect

 

val rdd3 = rdd1.combineByKey(x=>x+10, (a:Int,b:Int)=>a+b, (m:Int,n:Int)=>m+n)

rdd3.collect

 

val rdd4 = sc.parallelize(List("dog","cat","gnu","salmon","rabbit","turkey","wolf","bear","bee"),3)

val rdd5 = sc.parallelize(List(1,1,2,2,2,1,2,2,2),3)

val rdd6 = rdd5.zip(rdd4)

val rdd7 = rdd6.combineByKey(List(_), (x:List[String],y:String)=>x:+y, (m:List[String],n:List[String])=>m++n)
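
#Expected result: the keys come from rdd5 and the values are the animal names grouped per key (element order may vary), e.g.:
rdd7.collect
// e.g. Array((1,List(dog, cat, turkey)), (2,List(gnu, salmon, rabbit, wolf, bear, bee)))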

 

#countByKey

val rdd1 = sc.parallelize(List(("a",1),("b",2),("b",2),("c",2),("c",1)))

rdd1.countByKey

rdd1.countByValue

 

#filterByRange

val rdd1 = sc.parallelize(List(("e",5),("c",3),("d",4),("c",2),("a",1)))

val rdd2 = rdd1.filterByRange("b","d")

rdd2.collect

 

#flatMapValues

val a = sc.parallelize(List(("a","1 2"),("b","34")))

a.flatMapValues(_.split(" "))

 

#foldByKey

val rdd1 = sc.parallelize(List("dog","wolf","cat","bear"),2)

val rdd2 = rdd1.map(x=>(x.length,x))

val rdd3 = rdd2.foldByKey("")(_+_)
