目录
sparkCore
sparkCore rdd中算子的简单用法以及理解
获取SparkContext的工具类
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession

/** Factory helpers for the local-mode Spark entry points used by the examples below. */
object getSparkSessionUtils {

  /** SparkSession built from an explicit SparkConf (delegates to getSparkConf). */
  def getSparkSession1(): SparkSession =
    SparkSession.builder().config(getSparkConf()).getOrCreate()

  /** SparkSession built directly through the builder DSL. */
  def getSparkSession2(): SparkSession =
    SparkSession.builder().master("local").appName("sparkSession").getOrCreate()

  /** Bare SparkContext for the RDD examples (delegates to getSparkConf). */
  def getSparkContext(): SparkContext =
    new SparkContext(getSparkConf())

  /** Single source of truth for master/appName; the original duplicated this
    * SparkConf construction in three methods, which risks them drifting apart. */
  def getSparkConf(): SparkConf =
    new SparkConf().setMaster("local").setAppName("sparkSession")
}
wordcount举例
val sc: SparkContext = getSparkSessionUtils.getSparkContext()
val array =Array("A;B;C;D;B;C")
// split the single semicolon-delimited string into individual words
val value3: RDD[String] = sc.parallelize(array)
val words: RDD[String] = value3.flatMap(_.split(";"))
// turn every word into a tuple (word, 1)
val tuples: RDD[(String, Int)] = words.map((_,1))
// reduceByKey groups identical keys and sums their values
val sumed: RDD[(String, Int)] = tuples.reduceByKey(_+_)
sumed.foreach(print)
// res : (B,2)(A,1)(C,2)(D,1)
map
// silence Spark's INFO logging so the example output stays readable
Logger.getLogger("org").setLevel(Level.WARN)
val sc: SparkContext = getSparkSessionUtils.getSparkContext()
val a = sc.parallelize(List("dog", "salmon", "salmon", "rat", "elephant"), 3)
// map transforms each element, one output per input
val b: RDD[(String, Int)] = a.map(x => {
(x, x.length)
})
b.foreach(println)
/* res
map can change the structure of the elements, returning a new RDD
(dog,3)
(salmon,6)
(salmon,6)
(rat,3)
(elephant,8)
*/
filter
val a = sc.parallelize(1 to 10, 3)
// keep only the even numbers
val b = a.filter(x => x % 2 == 0)
b.foreach(println)
// Custom predicate: any Int => Boolean function works with filter.
// (The original `if (cond) true else false` was redundant — the
// comparison itself is already the Boolean result.)
def myFilter(x: Int): Boolean = x % 2 == 0
val c = a.filter(myFilter(_))
c.foreach(println)
/* res
=================b
2
4
6
8
10
=================c
2
4
6
8
10
*/
// filter returns a new RDD containing only the elements for which
// the predicate function returned true
flatMap
// flatMap needs elements that can themselves be flattened (collections),
// e.g. value1 below; value holds plain Ints, which cannot be flat-mapped as-is
val value: RDD[Int] = sc.parallelize(List(1, 2, 3, 4, 5, 6))
val value1: RDD[List[Int]] = sc.parallelize(List(List(1, 2), List(3, 4, 5, 6)))
// value.flatMap(x=>x).foreach(println)// error: Int is not a collection
value1.flatMap(x=>x).foreach(println)
val arr1=sc.parallelize(Array(("A",1),("B",2),("C",3)))
// x._1 + x._2 builds a String (e.g. "A1"); a String is a sequence of
// Chars, so flatMap emits every character as a separate element
arr1.flatMap(x=>(x._1+x._2)).foreach(println)
/* A
1
B
2
C
3
*/
arr1.map(x=>(x._1+x._2)).foreach(println)
/*
A1
B2
C3
*/
// flatMap expands (flattens) every element of the collection
// map transforms every element, one output per input
val arrayRDD: RDD[String] = sc.parallelize(Array(("a_b"), ("c_d"), ("e_f")))
arrayRDD.foreach(println)
/*
a_b
c_d
e_f
*/
arrayRDD.map(string=>{
string.split("_")
}).foreach(x=>{
println(x.mkString(",")) // result 2: each element is an Array[String]
})
/*
a,b
c,d
e,f
*/
arrayRDD.flatMap(string=>{
string.split("_")
}).foreach(x=>{
println(x.mkString(","))// result 3: the arrays were flattened into Strings
})
/*
a
b
c
d
e
f
*/
// In flatMap(x => x), x is each element of the RDD (a whole string such as
// "a_b"), not each item of the flattened output; the println shows when each
// element is consumed, interleaved with the flattened characters
arrayRDD.flatMap(x=>{println("x= "+x) ;x}).foreach(println)
/*
x= a_b
a
_
b
x= c_d
c
_
d
x= e_f
e
_
f
*/
foldByKey
// 初始值:前一个操作用于合并分区中的值,后一个操作用于合并分区之间的值
// foldByKey算子的操作对象是分区内和分区间
[源码解释](https://www.jianshu.com/p/e831dc317302)
val a = sc.parallelize(List("do3", "ca3", "ow3", "gnu4", "ants5"), 2)
val b = a.map(x => (x.length, x))
// inspect the physical partitioning of b
b.glom().foreach(x=>{println(x.mkString(""))})
/*
res : the data sits in two partitions  (3,do3)(3,ca3)
(3,ow3)(4,gnu4)(5,ants5)
*/
// (the original re-declared `val b = a.map(...)` here — a duplicate
// definition in a single scope; b from above is reused instead)
b.foldByKey("")(_ + _).foreach(println)
b.foldByKey("QQ")(_ + _).foreach(println)
/*
res
(4,gnu4)
(3,do3ca3ow3)
(5,ants5)
The zero value "QQ" is applied once per key per partition: the first 3 in a
partition ((3,do3)) gets QQ prepended, the second 3 in the same partition
((3,ca3)) does not, and the 3 in the other partition ((3,ow3)) gets its own
QQ again. Keys 4 and 5 each live in a single partition, so each gets one QQ.
(4,QQgnu4)
(3,QQdo3ca3QQow3)
(5,QQants5)
*/
glom
-- 参考 foldByKey 里glom 的用法
-- 返回rdd的分区
groupBy(func)
val rdd = sc.parallelize(1 to 20)
// group elements by the return value of the function (here: parity)
rdd.groupBy(_%2).foreach(println)
/*
res groups by the function's return value; elements mapping to the same key end up in one iterator
(0,CompactBuffer(2, 4, 6, 8, 10, 12, 14, 16, 18, 20))
(1,CompactBuffer(1, 3, 5, 7, 9, 11, 13, 15, 17, 19))
*/
groupByKey
val words = Array("one", "two", "two", "three", "three", "three")
val wordPairsRDD = sc.parallelize(words).map(word => (word, 1))
// groupByKey gathers all values of a key into one Iterable, which we then
// sum. (The original bound the Unit result of foreach to a val named
// wordCountsWithGroup — a misleading binding that has been dropped.)
wordPairsRDD
.groupByKey()
.map(t => (t._1, t._2.sum))
.foreach(println)
/*
res groups by key, yielding an RDD of (K, Iterable[V])
(two,2)
(one,1)
(three,3)
*/
keyBy
// keyBy applies a function to each element to derive a key, producing
// (key, element) pairs; note charAt(1) takes the SECOND character of each word
val a = sc.parallelize(List("dog", "salmon", "salmon", "rat", "elephant"), 3)
val b = a.keyBy(_.charAt(1))
b.foreach(println)
/*
res
(o,dog)
(a,salmon)
(a,salmon)
(a,rat)
(l,elephant)
*/
mapPartitions&mapPartitionsWithIndex
//mapPartitions is a variant of map: map's function is applied to every
//element, while mapPartitions' function runs once per partition and
//receives the partition's whole content as an iterator
val rdd = sc.parallelize(List("dog", "salmon", "salmon", "rat", "elephant"),2)
// inspect the partitioning
rdd.glom().foreach(x=> println(x.mkString(",")))
/*
res: two partitions
dog,salmon
salmon,rat,elephant
*/
// keep, per partition, the strings longer than 3 characters
// (the original comment said "longer than 2", but the code filters > 3,
// which matches the output below — dog/rat of length 3 are dropped)
val value: RDD[String] = rdd.mapPartitions(iter => iter.filter(_.length > 3))
value.foreach(println)
/*
res:
salmon
salmon
elephant
*/
//todo mapPartitionsWithIndex makes the partition boundaries visible
//todo define the function passed to mapPartitionsWithIndex: it prefixes
// every element with the index of the partition it lives in
def myfunc(index: Int, iter: Iterator[String]): Iterator[String] = {
iter.map(x => index + ":" + x)
}
value.mapPartitionsWithIndex(myfunc).foreach(println)
/*
res :
0:salmon
1:salmon
1:elephant
*/
mapValues
val a = sc.parallelize(List("dog", "tiger", "lion", "cat", "panther", "eagle"), 2)
val b = a.map(x => (x.length, x))
b.foreach(println)
/*
(3,dog)
(5,tiger)
(4,lion)
(3,cat)
(7,panther)
(5,eagle)
*/
// wrap each value in "x...x"; the keys pass through untouched
b.mapValues("x" + _ + "x").foreach(println)
/*
(3,xdogx)
(5,xtigerx)
(4,xlionx)
(3,xcatx)
(7,xpantherx)
(5,xeaglex)
*/
// mapValues maps only the Value part of (Key, Value) pairs, never the Key
reduceByKey
val words = Array("one", "two", "two", "three", "three", "three")
val wordPairsRDD = sc.parallelize(words).map(word => (word, 1))
// Fixed: the original read `.reduceByKey(pre, after)=>pre + after)`, which is
// missing the lambda's opening parenthesis and does not compile. The Unit
// result of foreach is also no longer bound to a val.
wordPairsRDD
.reduceByKey((pre, after) => pre + after)
.foreach(println)
// pre and after are two values belonging to the same key
/*
res:
(two,2)
(one,1)
(three,3)
*/
distinct
val c = sc.parallelize(List("Gnu", "Cat", "Rat", "Dog", "Gnu", "Rat"), 2)
// distinct removes duplicate elements (output order is not guaranteed)
c.distinct().foreach(println)
/*
res:
Dog
Cat
Gnu
Rat
*/
join
val a = sc.parallelize(List("dog", "salmon", "salmon", "rat", "elephant"), 3)
val b = a.keyBy(_.length)
println("=====================")
b.foreach(println)
println("=====================")
val c = sc.parallelize(List("dog","cat","gnu","salmon","rabbit","turkey","wolf","bear","bee"), 3)
val d = c.keyBy(_.length)
d.foreach(println)
println("=====================")
// join emits every combination of values that share a key; keys present in
// only one RDD (here 4 and 8) are dropped
b.join(d).foreach(println)
/*
=====================
(3,dog)
(6,salmon)
(6,salmon)
(3,rat)
(8,elephant)
=====================
(3,dog)
(3,cat)
(3,gnu)
(6,salmon)
(6,rabbit)
(6,turkey)
(4,wolf)
(4,bear)
(3,bee)
=====================
(6,(salmon,salmon))
(6,(salmon,rabbit))
(6,(salmon,turkey))
(6,(salmon,salmon))
(6,(salmon,rabbit))
(6,(salmon,turkey))
(3,(dog,dog))
(3,(dog,cat))
(3,(dog,gnu))
(3,(dog,bee))
(3,(rat,dog))
(3,(rat,cat))
(3,(rat,gnu))
(3,(rat,bee))
*/
// inner join by default; rightOuterJoin or leftOuterJoin are also available
union
// concatenates two RDDs of the same element type without deduplicating;
// ++ is an alias for union
val a = sc.parallelize(1 to 3,1)
val b = sc.parallelize(5 to 7,1)
(a++b).foreach(println)
println()
a.union(b).foreach(println)
sortBy
val z = sc.parallelize(Array(("H", 10), ("A", 26), ("Z", 1), ("L", 5)))
// second argument: true = ascending, false = descending
z.sortBy(c => c._1, true).foreach(println)
z.sortBy(c => c._1, false).foreach(println)
/*
(A,26)
(H,10)
(L,5)
(Z,1)
(Z,1)
(L,5)
(H,10)
(A,26)
*/
sortByKey
val aa = sc.parallelize(List("dog", "cat", "owl", "gnu", "ant"), 1)
// zip pairs each word with its 1-based position (aa.count is 5 here)
val bb = sc.parallelize(1 to aa.count.toInt, 1)
val cc = aa.zip(bb)
cc.foreach(print)
// sortByKey(true) sorts ascending by key; sortByKey(false) descending
cc.sortByKey(true).foreach(println)
println("=================")
cc.sortByKey(false).foreach(println)
/* res
(ant,5)
(cat,2)
(dog,1)
(gnu,4)
(owl,3)
=================
(owl,3)
(gnu,4)
(dog,1)
(cat,2)
(ant,5)
*/
intersection
val x = sc.parallelize(1 to 20)
val y = sc.parallelize(10 to 30)
val z = y.intersection(x)
z.sortBy(x=>x).foreach(println)
/*
res returns the intersection of the two RDDs, without duplicate elements
10
11
12
13
14
15
16
17
18
19
20
*/
repartition
// redistribute the data across a different number of partitions
val rdd = sc.parallelize(List(1, 2, 10, 4, 5, 2, 1, 1, 1), 3)
rdd.partitions.length //3
val rdd2 = rdd.repartition(5)
rdd2.partitions.length //5
/* partitionBy only works on a PairRDD.
On a PairRDD, repartition and partitionBy behave differently:
repartition shuffles the data randomly but evenly across the partitions;
partitionBy uses the given Partitioner (HashPartitioner by default) to compute each (K,V) pair's target partition from K. Used at the right moment, partitionBy can reduce the number of shuffles and improve efficiency.
*/
sample
val a = sc.parallelize(1 to 10000, 3)
println(a.sample(false, 0.1).count)
/*
res first argument: whether to sample with replacement; second argument: the sampling fraction
*/
reduce
val a = sc.parallelize(1 to 100,10)
println(a.reduce(_ + _))
// reduce feeds the first two elements to the function, then combines that
// result with the next element, and so on until a single value remains
fold
val a = sc.parallelize(List("1","2","3"),3)
// show the three partitions first
a.glom().foreach(x=>println(x.mkString(",")))
// the zero value is applied once per partition AND once more when the
// partition results themselves are combined
println(a.fold("9")(_ + _))
println(a.fold("999")(_ + _))
res
/*9919293
999999199929993
result = the zero value repeated (number of partitions + 1) times, interleaved with the elements
*/
collect
val sc: SparkContext = getSparkSessionUtils.getSparkContext()
val c = sc.parallelize(List("Gnu", "Cat", "Rat", "Dog", "Gnu", "Rat"), 2)
println(c.collect.mkString(","))
val collect1: Array[String] = c.collect
// collect materializes the distributed RDD as a local Scala Array on the driver
collectAsMap
// Use the shared helper instead of the original `new SparkContext(conf)`,
// which referenced an undeclared `conf` and was inconsistent with the
// other examples in this file.
val sc: SparkContext = getSparkSessionUtils.getSparkContext()
val a = sc.parallelize(List(1, 2, 1, 3), 1)
val b: RDD[(Int, Int)] = a.zip(a)
// collectAsMap keeps one entry per key (later pairs overwrite earlier
// ones), which is why the duplicate 1 disappears below
val map: collection.Map[Int, Int] = b.collectAsMap
/* res deduplicates by key
(2,2)
(1,1)
(3,3)
*/
aggregateByKey
aggregate