1、combineByKey
combineByKey takes three functions. First, createCombiner turns the first value seen for each key in a partition into an initial combined value; next, mergeValue folds the remaining values of that key within the same partition into that combined value;
finally, mergeCombiners merges the per-partition results for the same key across partitions.
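The code below shows only the main methods; they assume the usual imports, roughly:
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import scala.collection.mutable.{ArrayBuffer, ListBuffer}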
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local").setAppName("combineByKey")
val sc = new SparkContext(conf)
val rdd1: RDD[(String, Int)] = sc.makeRDD(List[(String, Int)](
("zhangsan", 10), ("zhangsan", 20), ("wangwu", 30),
("lisi", 40), ("zhangsan", 50), ("lisi", 60),
("wangwu", 70), ("wangwu", 80), ("lisi", 90)
), 3)
rdd1.mapPartitionsWithIndex((index,iter)=>{
val arr = ArrayBuffer[(String,Int)]()
iter.foreach(tp=>{
arr.append(tp)
println("rdd1 partition index = "+index+",value = "+tp)
})
arr.iterator
}).count()
/**
* partition 0: ("zhangsan", 10), ("zhangsan", 20), ("wangwu", 30)
* partition 1: ("lisi", 40), ("zhangsan", 50), ("lisi", 60)
* partition 2: ("wangwu", 70), ("wangwu", 80), ("lisi", 90)
*
* after createCombiner on the first value of each key:
* partition 0: ("zhangsan", 10hello),("wangwu", 30hello)
* partition 1: ("lisi", 40hello), ("zhangsan", 50hello)
* partition 2: ("wangwu", 70hello),("lisi", 90hello)
*
* after merging the remaining values within each partition (mergeValue):
* partition 0: ("zhangsan", 10hello@20),("wangwu", 30hello)
* partition 1: ("lisi", 40hello@60), ("zhangsan", 50hello)
* partition 2: ("wangwu", 70hello@80),("lisi", 90hello)
*
* after merging across partitions (mergeCombiners): ("zhangsan", 10hello@20#50hello),("lisi", 40hello@60#90hello),("wangwu", 30hello#70hello@80)
*/
// rdd1.combineByKey((v:Int)=>{v+"hello"},(s:String,v:Int)=>{s+"@"+v},(s1:String,s2:String)=>{s1+"#"+s2})
val result: RDD[(String, String)] = rdd1.combineByKey(v=>{v+"hello"}, (s:String, v)=>{s+"@"+v}, (s1:String, s2:String)=>{s1+"#"+s2})
result.foreach(println)
}
2、aggregateByKey
aggregateByKey takes a zero value. Within each partition, the values of each key are folded into the zero value by the first function; then the per-partition results for the same key are merged across partitions by the second function.
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("aggregateByKey").setMaster("local")
val sc = new SparkContext(conf)
val rdd1 = sc.makeRDD(List[(String,Int)](
("zhangsan",10),("zhangsan",20),("wangwu",30),
("lisi",40),("zhangsan",50),("lisi",60),
("wangwu",70),("wangwu",80),("lisi",90)
),3)
rdd1.mapPartitionsWithIndex((index,iter)=>{
val arr = ArrayBuffer[(String,Int)]()
iter.foreach(tp=>{
arr.append(tp)
println("rdd1 partition index = "+index+",value = "+tp)
})
arr.iterator
}).count()
/**
* partition 0:
* ("zhangsan",10)
* ("zhangsan",20)
* ("wangwu",30)
* partition 1:
* ("lisi",40)
* ("zhangsan",50)
* ("lisi",60)
* partition 2:
* ("wangwu",70)
* ("wangwu",80)
* ("lisi",90)
*
* after folding within each partition, starting from the zero value "hello":
* 0:("zhangsan",hello~10~20),("wangwu",hello~30)
* 1:("zhangsan",hello~50),("lisi",hello~40~60)
* 2:("lisi",hello~90),("wangwu",hello~70~80)
*
* after merging across partitions: ("zhangsan",hello~10~20#hello~50),("lisi",hello~40~60#hello~90),("wangwu",hello~30#hello~70~80)
*/
val result: RDD[(String, String)] = rdd1.aggregateByKey("hello")((s, v)=>{s+"~"+v}, (s1, s2)=>{s1+"#"+s2})
result.foreach(print)
}
3、coalesce
coalesce(numPartitions, shuffle = false) increases or decreases the number of partitions; by default there is no shuffle.
If you try to grow from fewer to more partitions with shuffle disabled, coalesce has no effect.
coalesce(num, true) = repartition(num)
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setMaster("local").setAppName("coalesce")
val sc = new SparkContext(conf)
val rdd1: RDD[String] = sc.parallelize(List[String](
"love1", "love2", "love3", "love4",
"love5", "love6", "love7", "love8",
"love9", "love10", "love11", "love12"),3)
val rdd2 :RDD[String] = rdd1.mapPartitionsWithIndex((index,iter)=>{
val list = ListBuffer[String]()
iter.foreach(one=>{
list.append(s"rdd1 partition = 【$index】,value = 【$one】")
})
list.iterator
},true)
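// growing 3 -> 4 partitions with shuffle = false has no effect: rdd3 keeps 3 partitions; coalesce(4, true) (i.e. repartition(4)) would really give 4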
val rdd3 = rdd2.coalesce(4,false)
val rdd4 = rdd3.mapPartitionsWithIndex((index,iter)=>{
val arr = ArrayBuffer[String]()
iter.foreach(one=>{
arr.append(s"rdd3 partition = 【$index】,value = 【$one】")
})
arr.iterator
})
val results : Array[String] = rdd4.collect()
results.foreach(println)
sc.stop()
}
4、cogroup
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setMaster("local")
conf.setAppName("cogroup")
val sc = new SparkContext(conf)
val rdd1 = sc.parallelize(List[(String,String)](("zhangsan","female"),("zhangsan","female1"),("lisi","male"),("wangwu","female"),("maliu","male")),3)
val rdd2 = sc.parallelize(List[(String,Int)](("zhangsan",18),("lisi",19),("lisi",190),("wangwu",20),("tianqi",21)),4)
val resultRDD: RDD[(String, (Iterable[String], Iterable[Int]))] = rdd1.cogroup(rdd2)
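// every key from either RDD is kept; values are grouped per side, e.g. zhangsan -> (Iterable(female, female1), Iterable(18)) and tianqi -> (Iterable(), Iterable(21))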
resultRDD.foreach(info=>{
val key = info._1
val value1: List[String] = info._2._1.toList
val value2: List[Int] = info._2._2.toList
println("key = "+key+",value1 = "+value1+",value2 = "+value2)
})
println("resultRDD partitioin length = "+resultRDD.getNumPartitions)
sc.stop()
}
5、distinct
distinct removes duplicate elements. It produces a shuffle and is internally implemented as map + reduceByKey + map (see the sketch after the example).
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setMaster("local")
conf.setAppName("distinct")
val sc = new SparkContext(conf)
val infos = sc.parallelize(List[String]("a","a","b","b","c","c","d"),4)
val result: RDD[String] = infos.distinct()
result.foreach(println)
sc.stop()
}
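As noted above, distinct is built from map + reduceByKey + map; a sketch of the equivalent pipeline, to be read as part of the main method above:
val manual: RDD[String] = infos.map(s => (s, null)).reduceByKey((a, _) => a).map(_._1)
manual.foreach(println) // prints the same distinct elements as infos.distinct()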
6、filter
filter keeps only the records for which the given function returns true.
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("filter")
conf.setMaster("local")
val sc = new SparkContext(conf)
val infos = sc.makeRDD(List[Int](1,2,3,4,5))
val result = infos.filter(one=>{
one>3
})
result.foreach(println)
sc.stop()
}
7、flatMap
flatMap is one-to-many: processing one input record can produce multiple output records.
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("map").setMaster("local")
val sc = new SparkContext(conf)
val infos = sc.parallelize(Array[String]("hello spark","hello hdfs","hello bjsxt"))
val result = infos.flatMap(one=>{
one.split(" ")
})
result.foreach(println)
}
8、flatMapValues
(K,V) -> (K,V). Works on a (K,V)-format RDD: for each key, a single value can be expanded into multiple values, each paired with the original key.
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("flatMapValues")
conf.setMaster("local")
val sc = new SparkContext(conf)
val infos: RDD[(String, String)] = sc.makeRDD(List[(String, String)](("zhangsan", "18"), ("lisi", "20"), ("wangwu", "30")))
val transInfo: RDD[(String, String)] = infos.mapValues(s => {
s + " " + "zhangsan18"
})
val result = transInfo.flatMapValues(s=>{
s.split(" ")
})
result.foreach(print)
sc.stop()
}
9、fullOuterJoin
fullOuterJoin of a (K,V)-format RDD with a (K,W)-format RDD keeps every key that appears in either RDD and yields (K,(Option[V],Option[W]))
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setMaster("local")
conf.setAppName("fullOuterJoin")
val sc = new SparkContext(conf)
val nameRDD = sc.parallelize(List[(String,String)](("zhangsan","female"),("lisi","male"),("wangwu","female"),("maliu","male")),3)
val scoreRDD = sc.parallelize(List[(String,Int)](("zhangsan",18),("lisi",19),("wangwu",20),("tianqi",21)),4)
val fullOuterJoin: RDD[(String, (Option[String], Option[Int]))] = nameRDD.fullOuterJoin(scoreRDD)
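// keys present on only one side are padded with None, e.g. (maliu,(Some(male),None)) and (tianqi,(None,Some(21)))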
fullOuterJoin.foreach(println)
println("fullOuterJoin RDD partition length = "+fullOuterJoin.getNumPartitions)
sc.stop()
}
10、groupBy
groupBy groups the data according to the rule given by the function.
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local").setAppName("groupBy")
val sc = new SparkContext(conf)
val rdd = sc.parallelize(List[(String,Double)](("zhangsan",66.5),("lisi",33.2),("zhangsan",66.7),("lisi",33.4),("zhangsan",66.8),("wangwu",29.8)))
val result: RDD[(Boolean, Iterable[(String, Double)])] = rdd.groupBy(one => {
one._2 > 34
})
result.foreach(print)
// val rdd1: RDD[String] = sc.parallelize(List[String](
// "love1", "love2", "love3", "love4",
// "love5", "love6", "love7", "love8",
// "love9", "love10", "love11", "love12"),3)
//
// val result: RDD[(String, Iterable[String])] = rdd1.groupBy(one=>{one.split("")(4)})
// result.foreach(print)
}
11、groupByKey
groupByKey collects all values with the same key together: (K,V) => (K,Iterable[V])
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local").setAppName("groupByKey")
val sc = new SparkContext(conf)
val rdd = sc.parallelize(List[(String,Double)](("zhangsan",66.5),("lisi",33.2),("zhangsan",66.7),("lisi",33.4),("zhangsan",66.8),("wangwu",29.8)))
val rdd1 = rdd.groupByKey()
rdd1.foreach(info=>{
val name = info._1
val value: Iterable[Double] = info._2
val list: List[Double] = info._2.toList
print("name = "+name+",list = "+list)
})
sc.stop()
}
12、intersection
intersection returns the intersection of two RDDs. The two RDDs must have the same element type, and the result RDD takes the partition count of the parent with more partitions.
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setMaster("local")
conf.setAppName("intersection")
val sc = new SparkContext(conf)
val rdd1 = sc.parallelize(List[String]("zhangsan","lisi","wangwu"),5)
val rdd2 = sc.parallelize(List[String]("zhangsan","lisi","maliu"),4)
val intersectionRDD: RDD[String] = rdd1.intersection(rdd2)
intersectionRDD.foreach(println)
println("intersectionRDD partition length = "+intersectionRDD.getNumPartitions)
sc.stop()
}
13、join
join produces a shuffle. A (K,V)-format RDD joined with a (K,W)-format RDD on equal keys
yields (K,(V,W)).
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("join")
conf.setMaster("local")
val sc = new SparkContext(conf)
val nameRDD = sc.parallelize(List[(String,String)](("zhangsan","female"),("lisi","male"),("wangwu","female")))
val scoreRDD = sc.parallelize(List[(String,Int)](("zhangsan",18),("lisi",19),("wangwu",20)))
val joinRDD: RDD[(String, (String, Int))] = nameRDD.join(scoreRDD)
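// only keys present in both RDDs are kept: (zhangsan,(female,18)),(lisi,(male,19)),(wangwu,(female,20))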
joinRDD.foreach(println)
}
14、leftOuterJoin
leftOuterJoin of a (K,V)-format RDD with a (K,W)-format RDD
keeps every key that appears in the left RDD and yields (K,(V,Option[W]))
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setMaster("local")
conf.setAppName("leftOuterJoin")
val sc = new SparkContext(conf)
val nameRDD = sc.parallelize(List[(String,String)](("zhangsan","female"),("lisi","male"),("wangwu","female"),("maliu","male")))
val scoreRDD = sc.parallelize(List[(String,Int)](("zhangsan",18),("lisi",19),("wangwu",20),("tianqi",21)))
val leftOuterJoin: RDD[(String, (String, Option[Int]))] = nameRDD.leftOuterJoin(scoreRDD)
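// maliu has no score, so it appears as (maliu,(male,None)); tianqi exists only on the right side and is dropped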
leftOuterJoin.foreach(println)
sc.stop()
}
15、map
map is one-to-one: one record goes in and one record comes out.
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("map").setMaster("local")
val sc = new SparkContext(conf)
val infos = sc.parallelize(Array[String]("hello spark","hello hdfs","hello bjsxt"))
val result = infos.map(one=>{one.split(" ")})
result.foreach(arr=>{arr.foreach(println)})
sc.stop()
}
16、mapPartitions
mapPartitions iterates over the data one whole partition at a time. Compared with map, which handles one record per call, it performs better when there is per-partition setup cost, as in the database-connection example below.
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setMaster("local")
conf.setAppName("mapPartitions")
val sc = new SparkContext(conf)
val infos = sc.parallelize(List[String]("a","b","c","d","e","f","g"),4)
val result = infos.mapPartitions(iter=>{
println("创建数据库连接... ... ")
val array = ArrayBuffer[String]()
while(iter.hasNext){
val s = iter.next()
println("拼接sql... ... "+s)
array.append(s)
}
println("关闭数据库连接... ... ")
array.iterator
})
result.count()
sc.stop()
}
17、mapPartitionsWithIndex
mapPartitionsWithIndex gives you each partition's index together with the data in that partition.
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setMaster("local")
conf.setAppName("mapPartitionWithIndex")
val sc = new SparkContext(conf)
val lines = sc.textFile("./data/words",5)
val result = lines.mapPartitionsWithIndex((index,iter)=>{
val arr = ArrayBuffer[String]()
iter.foreach(one=>{
arr.append(s"partition = 【$index】,value = $one")
})
arr.iterator
},true)
result.foreach(println)
sc.stop()
}
18、mapValues
mapValues works on (K,V)-format data and transforms only the value; the key stays unchanged.
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("mapValues")
conf.setMaster("local")
val sc = new SparkContext(conf)
val infos: RDD[(String, String)] = sc.makeRDD(List[(String, String)](("zhangsan", "18"), ("lisi", "20"), ("wangwu", "30")))
val result: RDD[(String, String)] = infos.mapValues(s => {
s + " " + "zhangsan18"
})
result.foreach(print)
sc.stop()
}
19、reduceByKey
reduceByKey first groups records by key, then aggregates the values within each group with the given function. It works on (K,V)-format RDDs.
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("reduceByKey")
conf.setMaster("local")
val sc = new SparkContext(conf)
val infos = sc.parallelize(List[(String,Int)](("zhangsan",1),("zhangsan",2),("zhangsan",3),("lisi",100),("lisi",200)))
val result = infos.reduceByKey((v1,v2)=>{v1+v2})
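// zhangsan: 1+2+3 = 6, lisi: 100+200 = 300, so the result is (zhangsan,6),(lisi,300)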
result.foreach(println)
sc.stop()
}
20、repartition
repartition changes the number of partitions (up or down) and always produces a shuffle. coalesce(num, true) = repartition(num)
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local").setAppName("repartition")
val sc = new SparkContext(conf)
val rdd1: RDD[String] = sc.parallelize(List[String](
"love1", "love2", "love3", "love4",
"love5", "love6", "love7", "love8",
"love9", "love10", "love11", "love12"),3)
val rdd2 :RDD[String] = rdd1.mapPartitionsWithIndex((index,iter)=>{
val list = ListBuffer[String]()
iter.foreach(one=>{
list.append(s"rdd1 partition = 【$index】,value = 【$one】")
})
list.iterator
},true)
// val rdd3 = rdd2.repartition(4)
val rdd3 = rdd2.repartition(3)
val rdd4 = rdd3.mapPartitionsWithIndex((index,iter)=>{
val arr = ArrayBuffer[String]()
iter.foreach(one=>{
arr.append(s"rdd3 partition = 【$index】,value = 【$one】")
})
arr.iterator
})
val results : Array[String] = rdd4.collect()
results.foreach(println)
sc.stop()
}
21、rightOuterJoin
rightOuterJoin of a (K,V)-format RDD with a (K,W)-format RDD
keeps every key that appears in the right RDD and yields (K,(Option[V],W))
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setMaster("local")
conf.setAppName("rightOuterJoin")
val sc = new SparkContext(conf)
val nameRDD = sc.parallelize(List[(String,String)](("zhangsan","female"),("lisi","male"),("wangwu","female"),("maliu","male")),3)
val scoreRDD = sc.parallelize(List[(String,Int)](("zhangsan",18),("lisi",19),("wangwu",20),("tianqi",21)),4)
val rightOuterJoin: RDD[(String, (Option[String], Int))] = nameRDD.rightOuterJoin(scoreRDD)
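// tianqi has no name entry, so it appears as (tianqi,(None,21)); maliu exists only on the left side and is dropped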
rightOuterJoin.foreach(println)
println("rightOuterJoin RDD partition length = "+rightOuterJoin.getNumPartitions)
sc.stop()
}
22、sample
sample draws a random sample; the parameters are sample(withReplacement, fraction, seed).
Difference between using and not using a seed:
with a seed, the same data source and the same arguments produce exactly the same sample on every run;
without a seed, each run over the same data source produces a different random sample.
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("sample")
conf.setMaster("local")
val sc = new SparkContext(conf)
val lines = sc.textFile("./data/sampleData.txt")
val result = lines.sample(true,0.01,100)
result.foreach(println)
sc.stop()
}
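A minimal sketch of the seed behaviour described above, using an in-memory RDD instead of the file (names are illustrative):
val nums = sc.parallelize(1 to 1000)
println(nums.sample(false, 0.01, 100).collect().mkString(","))  // same seed (100) => the same elements every run
println(nums.sample(false, 0.01, 100).collect().mkString(","))  // identical to the previous line
println(nums.sample(false, 0.01).collect().mkString(","))       // no seed => a different sample each run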
23、sortBy
sortBy sorts by the key extracted by the first argument; the second argument (true/false) selects ascending or descending order.
It does not have to be applied to a (K,V)-format RDD.
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("sortBy")
conf.setMaster("local")
val sc = new SparkContext(conf)
val infos = sc.parallelize(Array[(String,String)](("f","f"),("a","a"),("c","c"),("b","b")))
val result = infos.sortBy(tp=>{
tp._1
},false)
result.foreach(println)
val infos1 = sc.parallelize(Array[Int](400,200,500,100,300))
val result1 = infos1.sortBy(one=>{one/100},false)
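// sorts by one/100 in descending order: 500, 400, 300, 200, 100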
result1.foreach(println)
sc.stop()
}
24、sortByKey
sortByKey sorts by key (ascending by default) and works on (K,V)-format RDDs.
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("sortByKey")
conf.setMaster("local")
val sc = new SparkContext(conf)
val infos = sc.parallelize(Array[(String,String)](("f","f"),("a","a"),("c","c"),("b","b")))
val result = infos.sortByKey(false)
result.foreach(println)
sc.stop()
}
25、subtract
subtract returns the difference of two RDDs: the elements of the first RDD that do not appear in the second.
The two RDDs must have the same element type; the result keeps the partition count of the RDD on which subtract is called.
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setMaster("local")
conf.setAppName("subtract")
val sc = new SparkContext(conf)
val rdd1 = sc.parallelize(List[String]("zhangsan","lisi","wangwu"),5)
val rdd2 = sc.parallelize(List[String]("zhangsan","lisi","maliu"),4)
val subtractRDD: RDD[String] = rdd1.subtract(rdd2)
subtractRDD.foreach(println)
println("subtractRDD partition length = "+subtractRDD.getNumPartitions)
sc.stop()
}
26、union
union merges two RDDs. They must have the same element type; they do not have to be (K,V)-format RDDs.
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setMaster("local")
conf.setAppName("union")
val sc = new SparkContext(conf)
val rdd1 = sc.parallelize(List[String]("zhangsan","lisi","wangwu","maliu"),3)
val rdd2 = sc.parallelize(List[String]("a","b","c","d"),4)
val unionRDD: RDD[String] = rdd1.union(rdd2)
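// union simply concatenates partitions, so the result has 3 + 4 = 7 partitions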
unionRDD.foreach(println)
println("unionRDD partitioin length = "+unionRDD.getNumPartitions)
sc.stop()
}
27、zip
zip combines two RDDs into one (K,V)-format RDD. Both RDDs must have the same number of partitions and the same number of elements in each partition.
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local").setAppName("zip")
val sc = new SparkContext(conf)
val rdd1 = sc.parallelize(List[String]("a","b","c"),2)
val rdd2 = sc.parallelize(List[Int](1,2,3),numSlices = 2)
val result: RDD[(String, Int)] = rdd1.zip(rdd2)
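// elements are paired position by position: (a,1),(b,2),(c,3)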
result.foreach(print)
}
28、zipWithIndex
zipWithIndex pairs each element of the RDD with its index, producing a (K,V)-format RDD whose value is the element's Long index.
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local").setAppName("zip")
val sc = new SparkContext(conf)
val rdd1 = sc.parallelize(List[String]("a","b","c"),2)
val rdd2 = sc.parallelize(List[Int](1,2,3),numSlices = 2)
val result: RDD[(String, Long)] = rdd1.zipWithIndex()
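// (a,0),(b,1),(c,2); rdd2 is not used by zipWithIndex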
result.foreach(print)
}