// A quick tour of the RDD transformation functions from the official Spark documentation, written in Scala; 20 functions in total.
package spark_day2
import org.apache.spark.rdd.RDD
import org.apache.spark._
object TransformationsTest {
val conf = new SparkConf().setMaster("local").setAppName("spark")
val sc = new SparkContext(conf)
//map applies a function to each element of the RDD and returns a new RDD of the results
def map1(): Unit ={
val array = Array("a","b","c")
// equivalent version using makeRDD:
// val mapRdd = sc.makeRDD(array)
// val nameRdd = mapRdd.map(ss=>"hello "+ss)
// nameRdd.foreach(name => println(name))
sc.parallelize(array).map(a=>a + " ok").foreach(
println(_)
)
}
//filter keeps only the elements of the RDD that satisfy the given predicate
def filter(): Unit ={
val array = Array(1,2,3,4,5,6,7,8)
sc.parallelize(array).filter(_ % 2 == 0).foreach(
println(_)
)
}
//flatMap maps each element to a sequence and flattens the results, i.e. it removes one level of nesting
def flatmap(): Unit ={
val array = Array("a b c","1 2 3")
sc.parallelize(array).flatMap(_.split(" ")).foreach(
println(_)
)
}
//mapPartitions is like map, but instead of one element at a time the function receives an iterator over an entire partition
//with the 2 partitions used here the result is (2,4,6,8)(10,12,14,16,18); the main use is amortizing per-partition setup such as database connections (see the sketch after this method)
def mappartition(): Unit ={
val array = Array(1,2,3,4,5,6,7,8,9)
sc.parallelize(array,2).mapPartitions(
_.map(a=>a*2), preservesPartitioning = false)
.foreach(
println(_)
)
}
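//A minimal sketch of the connection-per-partition pattern mentioned above;
//openConnection/save/close are hypothetical placeholders for a real database
//client, stubbed out with println so the sketch runs standalone.
def mappartitiondb(): Unit = {
  sc.parallelize(1 to 9, 2).mapPartitions { ite =>
    //real code would do: val conn = openConnection() -- once per partition, not per element
    println("open connection for this partition")
    val saved = ite.map { n => println("save " + n); n }.toList //consume before "closing"
    //real code would do: conn.close()
    println("close connection")
    saved.iterator
  }.foreach(println(_))
}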
//the second parameter is preservesPartitioning: when true the output RDD keeps the parent's partitioner (see the sketch after this method); the iterator can also be consumed with a while loop instead of map
def mappartition1(): Unit ={
val array = Array(("a",1),("b",2),("c",3),("d",4),("e",5),("f",6),("g",7))
sc.parallelize(array,2).mapPartitions(
_.map(kv=>(kv._1+"ok",kv._2+10))
// while-loop alternative:
// ite=>{
//   var res = List[(String,Int)]()
//   while (ite.hasNext){
//     val a = ite.next()
//     res = (a._1+"ok", a._2+10) :: res
//   }
//   res.toIterator
// }
, preservesPartitioning = false)
.foreach(
println(_)
)
}
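//A sketch of what preservesPartitioning controls, based on the Spark API docs:
//with true the child RDD keeps the parent's partitioner (which can avoid a later
//shuffle in key-based operations); with false the partitioner is dropped.
def preservespartitioning(): Unit = {
  val pairs = sc.parallelize(Seq(("a",1),("b",2),("c",3)), 2)
    .partitionBy(new HashPartitioner(2))
  val kept = pairs.mapPartitions(_.map(kv=>(kv._1, kv._2+1)), preservesPartitioning = true)
  val dropped = pairs.mapPartitions(_.map(kv=>(kv._1, kv._2+1)), preservesPartitioning = false)
  println(kept.partitioner)    //Some(...) -- partitioner retained
  println(dropped.partitioner) //None -- partitioner discarded
}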
//mapPartitionsWithIndex works like mapPartitions but the function also receives index, the partition number
//here: read the tuples partition by partition and tag each one with its partition number
def mappartitionwithindex(): Unit ={
val array = Array(("a",1),("b",2),("c",3),("d",4),("e",5),("f",6),("g",7))
sc.parallelize(array,2)
.mapPartitionsWithIndex((index,ite)=> //the extra index parameter
ite.map(tuple=>(index,tuple._1 + "ok",tuple._2 + 10))
).foreach(
println(_)
)
}
//sample draws a random sample; three parameters:
// withReplacement: Boolean : whether to sample with replacement
// fraction: Double : expected fraction of the data in the sample (not an exact count)
// seed: Long : random seed, optional
def sample(): Unit ={
val array = Array(("a",1),("b",2),("c",3),("d",4),("e",5),("f",6),("g",7))
sc.makeRDD(array).sample(true,0.5)
.foreach(
println(_)
)
}
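//Passing the optional seed makes the sample reproducible across runs; a small sketch:
def sampleseed(): Unit = {
  sc.makeRDD(1 to 10)
    .sample(false, 0.5, seed = 42L) //same seed => same sample on every run
    .foreach(println(_))
}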
//union concatenates two RDDs of the same element type; duplicates are kept
def union(): Unit ={
val array1 = Array(("a",1),("b",2),("c",3),("d",4))
val array2 = Array(("e",5),("f",6),("g",7))
sc.makeRDD(array1).union(sc.makeRDD(array2))
.foreach(
println(_)
)
}
//A.intersection(B) returns the elements that appear in both A and B
def intersection(): Unit ={
val array1 = Array(("a",1),("b",2),("c",3),("d",4))
val array2 = Array(("a",1),("b",2),("c",33),("d",44))
val RDD1 = sc.makeRDD(array1)
val RDD2 = sc.makeRDD(array2)
RDD1.intersection(RDD2)
.foreach(
println(_)
)
}
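//The complementary set operation is subtract, which returns the elements of A
//that do not appear in B; a small sketch:
def subtract(): Unit = {
  val a = sc.makeRDD(Array(("a",1),("b",2),("c",3)))
  val b = sc.makeRDD(Array(("a",1),("b",2)))
  a.subtract(b).foreach(println(_)) //(c,3)
}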
//distinct removes duplicates; two elements are duplicates only if they are equal as a whole (here: the entire tuple)
//result: (a,1) (b,2) (b,6) (c,3) (d,4)
def distinct(): Unit ={
val array = Array(("a",1),("b",2),("c",3),("d",4),("a",1),("b",6),("c",3))
sc.makeRDD(array)
.distinct().sortByKey()
.foreach(a=>
print(a+"\t")
)
}
//groupByKey groups the values for each key; numPartitions can be passed to control the number of tasks
//(a,CompactBuffer(1, 1)) (b,CompactBuffer(2, 6)) (c,CompactBuffer(3, 3)) (d,CompactBuffer(4))
def groupbykey(): Unit ={
val array = Array(("a",1),("b",2),("c",3),("d",4),("a",1),("b",6),("c",3))
sc.makeRDD(array)
.groupByKey(2).sortByKey()
.foreach(a=>
print(a+"\t")
)
}
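//For aggregations reduceByKey is usually preferred over groupByKey, because it
//combines values on the map side before the shuffle instead of moving every value;
//a minimal comparison computing the same per-key sums both ways:
def groupvsreduce(): Unit = {
  val pairs = sc.parallelize(Seq(("a",1),("b",2),("a",3)))
  pairs.groupByKey().mapValues(_.sum).foreach(println(_)) //shuffles all values
  pairs.reduceByKey(_+_).foreach(println(_))              //pre-aggregates per partition
}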
//reduceByKey merges the values of each key with the given function (sum, max, ...)
//the number of partitions can also be specified
//result: (d,4) (b,6) --- (a,1) (c,3) (one group per output partition)
def reducebykey(): Unit ={
val array = Array(("a",1),("b",2),("c",3),("d",4),("a",1),("b",6),("c",3))
sc.makeRDD(array)
.reduceByKey(math.max(_,_),2)
.foreach(a=>
print(a+"\t")
)
}
//aggregateByKey merges values per key with separate within-partition and cross-partition steps
//(zeroValue: U) initial accumulator value, applied per key per partition
// (seqOp: (U, V) => U) merges a value into the accumulator within a partition
// (combOp: (U, U) => U) merges accumulators from different partitions
def aggregatebykey(): Unit ={
// val array = Array(("a",1),("b",2),("c",3),("d",4),("a",1),("b",6),("c",3))
// sc.makeRDD(array)
// .aggregateByKey(10)(_+_ //within-partition merge
// ,_+_) //cross-partition merge
// .foreach(a=>
// print(a+"\t")
// )
val array2 = Array("you,jump", "i,jump")
sc.parallelize(array2).flatMap(_.split(","))
.map((_,1))
.aggregateByKey(0)(_+_,_+_)
.foreach(a=>
print(a+"\t"))
}
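//Because zeroValue is applied once per key per partition, a nonzero value can be
//counted more than once when a key occurs in several partitions; a small check:
def aggregatezerovalue(): Unit = {
  val pairs = sc.parallelize(Seq(("a",1),("a",1)), 2) //"a" lands in both partitions
  pairs.aggregateByKey(10)(_+_, _+_)
    .foreach(println(_)) //(a,22): 10+1 in each partition, then 11+11
}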
//sortByKey sorts the RDD by key
//param 1: true = ascending, false = descending
//param 2: number of partitions, optional
def sortbykey(): Unit ={
val array = Array(("a",1),("b",2),("c",3),("d",4),("a",1),("b",6),("c",3))
sc.parallelize(array)
.sortByKey(false,2)
.foreach(a=>
print(a+"\t")
)
}
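//sortBy is the more general variant, sorting by any derived key rather than only
//the tuple's first element; a small sketch sorting pairs by their value:
def sortby1(): Unit = {
  val array = Array(("a",3),("b",1),("c",2))
  sc.parallelize(array)
    .sortBy(_._2) //ascending by the second field
    .foreach(a=>print(a+"\t"))
}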
//join
//joins two RDDs by key; the number of partitions can be specified (optional)
//结果(d,(4,d4)) (b,(2,b2)) ------- (a,(1,a1)) (c,(3,c3))
def join(): Unit ={
val array1 = Array(("a",1),("b",2),("c",3),("d",4))
val array2 = Array(("a","a1"),("b","b2"),("c","c3"),("d","d4"))
val RDD1 = sc.makeRDD(array1)
val RDD2 = sc.makeRDD(array2)
RDD1.join(RDD2,2)
.foreach(a=>
print(a+"\t")
)
}
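//Besides the inner join, pair RDDs also provide leftOuterJoin, rightOuterJoin and
//fullOuterJoin; a sketch of leftOuterJoin, where keys missing on the right become None:
def leftouterjoin(): Unit = {
  val left = sc.makeRDD(Array(("a",1),("b",2)))
  val right = sc.makeRDD(Array(("a","a1")))
  left.leftOuterJoin(right)
    .foreach(println(_)) //(a,(1,Some(a1))) (b,(2,None))
}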
//cogroup groups 2 or 3 RDDs by key, producing one buffer of values per input RDD
//the number of partitions can be specified (optional)
//结果:(d,(CompactBuffer(4, 4),CompactBuffer(d4, d4),CompactBuffer(4.8, 4.8)))
//(a,(CompactBuffer(1),CompactBuffer(a1),CompactBuffer(1.8)))
//(b,(CompactBuffer(2),CompactBuffer(b2, b2),CompactBuffer(2.8)))
//(c,(CompactBuffer(3, 3),CompactBuffer(c3, c3),CompactBuffer(3.8, 3.8)))
def cogroup(): Unit ={
val array1 = Array(("a",1),("b",2),("c",3),("d",4),("c",3),("d",4))
val array2 = Array(("a","a1"),("b","b2"),("c","c3"),("d","d4"),("b","b2")
,("c","c3"),("d","d4"))
val array3 = Array(("a",1.8),("b",2.8),("c",3.8),("d",4.8),("c",3.8),("d",4.8))
val RDD1 = sc.makeRDD(array1)
val RDD2 = sc.makeRDD(array2)
val RDD3 = sc.makeRDD(array3)
RDD1.cogroup(RDD2,RDD3)
.foreach(a=>
println(a+"\t"))
}
//cartesian returns the Cartesian product of two RDDs, i.e. every pairing of their elements
//结果:((a,1),(a,a1)) ((a,1),(b,b2)) ((a,1),(c,c3)) ((b,2),(a,a1)) ((b,2),(b,b2))
// ((b,2),(c,c3)) ((c,3),(a,a1)) ((c,3),(b,b2)) ((c,3),(c,c3))
def cartesian(): Unit ={
val array1 = Array(("a",1),("b",2),("c",3))
val array2 = Array(("a","a1"),("b","b2"),("c","c3"))
val RDD1 = sc.makeRDD(array1)
val RDD2 = sc.makeRDD(array2)
RDD1.cartesian(RDD2)
.foreach(a=>
print(a+"\t")
)
}
//coalesce changes the number of partitions, typically to reduce an excessive number of input tasks; the effect depends on the shuffle flag (see the check after this method)
//1. requested partitions P > current partitions S:
//   with shuffle = false, coalesce has no effect and the count stays at S
//   with shuffle = true, a shuffle runs and the count becomes P; this is exactly repartition
//2. requested partitions P < current partitions S:
//   if P and S are close, use shuffle = false: partitions are merged proportionally without a shuffle
//   if P is much smaller than S, set shuffle = true so the upstream stages keep their parallelism
def coalesce(): Unit ={
val array1 = Array("a","b","c","d","c","d","d","c")
sc.makeRDD(array1,4)
.coalesce(2,false)
.foreach(a=>
print(a+"\t"))
}
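//A quick check of the rules above via getNumPartitions; the expected values follow
//the documented coalesce semantics:
def coalescecheck(): Unit = {
  val rdd = sc.makeRDD(1 to 8, 2)
  println(rdd.coalesce(4, false).getNumPartitions) //2: cannot grow without a shuffle
  println(rdd.coalesce(4, true).getNumPartitions)  //4: with a shuffle P is honored
  println(rdd.coalesce(1, false).getNumPartitions) //1: shrinking needs no shuffle
}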
//repartition is coalesce with shuffle = true: the data is shuffled and the RDD ends up with exactly the requested number of partitions, here growing from 2 to 4
def repartition(): Unit ={
val array1 = Array("a","b","c","d","c","d","d","c")
sc.makeRDD(array1,2)
.repartition(4)
.foreach(a=>
print(a+"\t"))
}
//repartitionAndSortWithinPartitions repartitions by the given Partitioner and sorts
//every resulting partition by key
//usage: supply a Partitioner; either implement numPartitions and getPartition yourself,
//or use new HashPartitioner(n),
//or new RangePartitioner(a, b) where a is the partition count and b is an RDD sampled to determine the key ranges
def repartitionandsortwithinpartitions(): Unit ={
val array1 = Array((1,1),(2,2),(3,3),(4,4),(5,5),(6,6))
// custom Partitioner variant: keys 1-3 go to partition 0, the rest to partition 1
// val RDD1 = sc.makeRDD(array1,2)
// .repartitionAndSortWithinPartitions(new Partitioner {
//   override def numPartitions: Int = 2
//   override def getPartition(key: Any): Int = if (key.asInstanceOf[Int] <= 3) 0 else 1
// })
// .foreach(a=>
//   print(a+"\t"))
// val RDD1 = sc.makeRDD(array1)
// .repartitionAndSortWithinPartitions(new HashPartitioner(2))
// .foreach(a=>
// print(a+"\t"))
val RDD1 = sc.makeRDD(array1)
.repartitionAndSortWithinPartitions(new RangePartitioner(4
,sc.makeRDD(array1)))
.foreach(a=>
print(a+"\t"))
}
def main(args: Array[String]): Unit = {
// map1()
// filter()
// flatmap()
// mappartition()
// mappartition1()
// mappartitionwithindex()
// sample()
// union()
// intersection()
// distinct()
// groupbykey()
// reducebykey()
// aggregatebykey()
// sortbykey()
// join()
// cogroup()
// cartesian()
// coalesce()
// repartition()
repartitionandsortwithinpartitions()
}
}