# If you create an RDD via parallelize without specifying a partition count, the count is tied to your core count
# If you create an RDD by reading data from HDFS, the partition count equals the number of input splits
# Check the partition count
rdd4.partitions.length
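# quick check (the default below is spark.default.parallelism, which tracks the core count noted above):
sc.parallelize(1 to 100).partitions.length
# with an explicit partition count:
sc.parallelize(1 to 100, 4).partitions.length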
# Creating an RDD via parallelize
val rdd1 = sc.parallelize(List(5,6,4,7,3,8,2,9,1,10))
val rdd1 = sc.parallelize(List(5,6,4,7,3,8,2,9,1,10)).map(_ * 2).sortBy(x => x, true)
val rdd2 = rdd1.filter(_ > 10)
val rdd2 = sc.parallelize(List(5,6,4,7,3,8,2,9,1,10)).map(_ * 2).sortBy(x => x + "", true)
val rdd2 = sc.parallelize(List(5,6,4,7,3,8,2,9,1,10)).map(_ * 2).sortBy(x => x.toString, true)
val rdd4 = sc.parallelize(Array("a b c", "d e f", "h i j"))
rdd4.flatMap(_.split(" ")).collect
val rdd5 = sc.parallelize(List(List("a b c", "a b b"), List("e f g", "a f g"), List("h i j", "a a b")))
rdd5.flatMap(_.flatMap(_.split(" "))).collect
# union: union of two RDDs; the element types must match
val rdd6 = sc.parallelize(List(5,6,4,7))
val rdd7 = sc.parallelize(List(1,2,3,4))
val rdd8 = rdd6.union(rdd7)
rdd8.distinct.sortBy(x => x).collect
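# expected: Array(1, 2, 3, 4, 5, 6, 7)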
# intersection: intersection of two RDDs
val rdd9 = rdd6.intersection(rdd7)
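# expected: Array(4)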
valrdd1=sc.parallelize(List(("tom",1),("jerry",2),("kitty",3)))
valrdd2=sc.parallelize(List(("jerry",9),("tom",8),("shuke",7),("tom",2)))
#join
val rdd3 = rdd1.join(rdd2)
val rdd3 = rdd1.leftOuterJoin(rdd2)
val rdd3 = rdd1.rightOuterJoin(rdd2)
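# expected (order may vary):
# join           -> Array((tom,(1,8)), (tom,(1,2)), (jerry,(2,9)))
# leftOuterJoin  -> Array((tom,(1,Some(8))), (tom,(1,Some(2))), (jerry,(2,Some(9))), (kitty,(3,None)))
# rightOuterJoin -> Array((tom,(Some(1),8)), (tom,(Some(1),2)), (jerry,(Some(2),9)), (shuke,(None,7)))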
#groupByKey
val rdd3 = rdd1 union rdd2
rdd3.groupByKey
rdd3.groupByKey.map(x => (x._1, x._2.sum))
rdd3.groupByKey.mapValues(_.sum).collect
# sample groupByKey output (key order may vary):
# Array((tom,CompactBuffer(1,8,2)), (jerry,CompactBuffer(2,9)), (kitty,CompactBuffer(3)), (shuke,CompactBuffer(7)))
#WordCount
sc.textFile("/root/word.txt").flatMap(x=>x.split(",")).map((_,1)).reduceByKey(_+_).sortBy(_._2,false).collect
sc.textFile("/root/word.txt").flatMap(x=>x.split(",")).map((_,1)).groupByKey.map(t=>(t._1,t._2.sum)).collect
sc.textFile("/root/word.txt").flatMap(x=>x.split(",")).map((_,1)).groupByKey.mapValues(_.sum).collect
#cogroup
valrdd1=sc.parallelize(List(("tom",1),("tom",2),("jerry",3),("kitty",2)))
valrdd2=sc.parallelize(List(("jerry",2),("tom",1),("shuke",2)))
valrdd3=rdd1.cogroup(rdd2)
valrdd4=rdd3.map(t=>(t._1,t._2._1.sum+t._2._2.sum))
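# expected (key order may vary):
# rdd3 -> Array((tom,(CompactBuffer(1,2),CompactBuffer(1))), (jerry,(CompactBuffer(3),CompactBuffer(2))),
#               (kitty,(CompactBuffer(2),CompactBuffer())), (shuke,(CompactBuffer(),CompactBuffer(2))))
# rdd4 -> Array((tom,4), (jerry,5), (kitty,2), (shuke,2))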
# cartesian: Cartesian product
valrdd1=sc.parallelize(List("tom","jerry"))
valrdd2=sc.parallelize(List("tom","kitty","shuke"))
valrdd3=rdd1.cartesian(rdd2)
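# expected: Array((tom,tom), (tom,kitty), (tom,shuke), (jerry,tom), (jerry,kitty), (jerry,shuke))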
#################################################################################################
# Spark actions
val rdd1 = sc.parallelize(List(1,2,3,4,5), 2)
#collect
rdd1.collect
#reduce
val rdd2 = rdd1.reduce(_ + _)
#count
rdd1.count
#top
rdd1.top(2)
#take
rdd1.take(2)
# first (similar to take(1))
rdd1.first
# takeOrdered
rdd1.takeOrdered(3)
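# expected results:
# collect -> Array(1, 2, 3, 4, 5); reduce -> 15; count -> 5
# top(2) -> Array(5, 4); take(2) -> Array(1, 2); first -> 1; takeOrdered(3) -> Array(1, 2, 3)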
# mapPartitionsWithIndex: access each partition's index along with the values in that partition
# aggregate: aggregates within each partition, then combines the partition results
def func1 = (index: Int, iter: Iterator[Int]) => {
  iter.toList.map(x => "[partID:" + index + ", val:" + x + "]").iterator
}
val rdd1 = sc.parallelize(List(1,2,3,4,5,6,7,8,9), 2)
rdd1.mapPartitionsWithIndex(func1).collect
# add up the maximum value of each partition: max of (1,2,3,4) is 4, max of (5..9) is 9, so 0 + 4 + 9 = 13
rdd1.aggregate(0)(math.max(_, _), _ + _)
# the zero value 5 joins each partition's max and the final combine: 5 + 9 + 5 = 19
rdd1.aggregate(5)(math.max(_, _), _ + _)
valrdd2=sc.parallelize(List("a","b","c","d","e","f"),2)
deffunc2=(index:Int,iter:Iterator[(String)])=>{
iter.toList.map(x=>"[partID:"+index+",val:"+x+"]").iterator
}
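# inspect which elements land in which partition:
rdd2.mapPartitionsWithIndex(func2).collect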
rdd2.aggregate("")(_+_,_+_)
rdd2.aggregate("=")(_+_,_+_)
valrdd3=sc.parallelize(List("12","23","345","4567"),2)
rdd3.aggregate("")((x,y)=>math.max(x.length,y.length).toString,(x,y)=>x+y)
valrdd4=sc.parallelize(List("12","23","345",""),2)
rdd4.aggregate("")((x,y)=>math.min(x.length,y.length).toString,(x,y)=>x+y)
valrdd5=sc.parallelize(List("12","23","","345"),2)
rdd5.aggregate("")((x,y)=>math.min(x.length,y.length).toString,(x,y)=>x+y)
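# expected (partition order may swap the digits):
# rdd3 -> "24" or "42"
# rdd4 -> "10" or "01"  (the empty string makes min(...) collapse that partition to "0")
# rdd5 -> "11"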
#aggregateByKey
val pairRDD = sc.parallelize(List(("cat",2),("cat",5),("mouse",4),("cat",12),("dog",12),("mouse",2)), 2)
def func2(index: Int, iter: Iterator[(String, Int)]): Iterator[String] = {
  iter.toList.map(x => "[partID:" + index + ", val:" + x + "]").iterator
}
pairRDD.mapPartitionsWithIndex(func2).collect
pairRDD.aggregateByKey(0)(math.max(_, _), _ + _).collect
pairRDD.aggregateByKey(100)(math.max(_, _), _ + _).collect
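# partition 0: (cat,2),(cat,5),(mouse,4); partition 1: (cat,12),(dog,12),(mouse,2)
# zero 0   -> Array((dog,12), (cat,17), (mouse,6))     (per-partition max per key, summed across partitions)
# zero 100 -> Array((dog,100), (cat,200), (mouse,200)) (every per-partition max is lifted to at least 100)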
#checkpoint
sc.setCheckpointDir("hdfs://usr/ck")
val rdd = sc.textFile("hdfs://root/wc.txt").flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
rdd.checkpoint
rdd.isCheckpointed
rdd.count
rdd.isCheckpointed
rdd.getCheckpointFile
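# checkpointing is lazy: isCheckpointed stays false until an action (count above) materializes the RDD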
# coalesce / repartition
val rdd1 = sc.parallelize(1 to 10, 10)
val rdd2 = rdd1.coalesce(2, false)
rdd2.partitions.length
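# repartition(n) is shorthand for coalesce(n, shuffle = true):
val rdd3 = rdd1.repartition(3)
rdd3.partitions.length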
#collectAsMap
valrdd=sc.parallelize(List(("a",1),("b",2)))
rdd.collectAsMap
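# expected: Map(b -> 2, a -> 1)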
#combineByKey
val rdd1 = sc.textFile("hdfs://root/wc").flatMap(_.split(" ")).map((_, 1))
val rdd2 = rdd1.combineByKey(x => x, (a: Int, b: Int) => a + b, (m: Int, n: Int) => m + n)
rdd2.collect
val rdd3 = rdd1.combineByKey(x => x + 10, (a: Int, b: Int) => a + b, (m: Int, n: Int) => m + n)
rdd3.collect
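# the createCombiner x => x + 10 runs once per key per partition, so each key's count
# in rdd3 is inflated by 10 for every partition that key appears in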
valrdd4=sc.parallelize(List("dog","cat","gnu","salmon","rabbit","turkey","wolf","bear","bee"),3)
valrdd5=sc.parallelize(List(1,1,2,2,2,1,2,2,2),3)
valrdd6=rdd5.zip(rdd4)
valrdd7=rdd6.combineByKey(List(_),(x:List[String],y:String)=>x:+y,(m:List[String],n:List[String])=>m++n)
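# expected (merge order may vary):
# Array((1,List(dog, cat, turkey)), (2,List(gnu, salmon, rabbit, wolf, bear, bee)))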
#countByKey
valrdd1=sc.parallelize(List("a",1),("b",2),("b",2),("c",2),("c",1))
rdd1.countByKey
rdd1.countByValue
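# countByKey   -> Map(a -> 1, b -> 2, c -> 2)                          (counts per key)
# countByValue -> Map((a,1) -> 1, (b,2) -> 2, (c,2) -> 1, (c,1) -> 1)  (counts whole pairs)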
#filterByRange
valrdd1=sc.parallelize(List(("e",5),("c",3),("d",4),("c",2),("a",1)))
valrdd2=rdd1.filterByRange("b","d")
rdd2.collect
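# keeps keys in the inclusive range ["b","d"]: Array((c,3), (d,4), (c,2))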
#flatMapValues
vala=sc.parallelize(List(("a","1 2"),("b","34")))
rdd3.flatMapValues(_.split(""))
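# expected: Array((a,1), (a,2), (b,3), (b,4))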
#foldByKey
valrdd1=sc.parallelize(List("dog","wolf","cat","bear"),2)
valrdd2=rdd1.map(x=>(x.length,x))
valrdd3=rdd2.foldByKey("")(_+_)
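# expected (concatenation order may vary): Array((4,wolfbear), (3,dogcat))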