1. The SparkContext object and RDDs (Resilient Distributed Datasets)
1) textFile() loads a file as an RDD:
scala> val lines = sc.textFile("/user/hadoop/helloSpark.txt")
lines: org.apache.spark.rdd.RDD[String] = /user/hadoop/helloSpark.txt MapPartitionsRDD[1] at textFile at <console>:24
scala> lines.count()
res0: Long = 3
2) parallelize() (handy for testing): takes the collection to parallelize and the number of partitions:
scala> var rdd = sc.parallelize(Array(1,2,2,4),4)
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[2] at parallelize at <console>:24
scala> rdd.count()
res1: Long = 4
scala> rdd.foreach(print)
2124
scala> rdd.foreach(println)
2
1
4
2
3) Scala anonymous functions and type inference (equivalent forms are sketched after the transcript below):
scala> val lines2 = lines.filter(line=>line.contains("world"))
lines2: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[3] at filter at <console>:26
scala> lines2.foreach(println)
hello world
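A rough sketch of the equivalent forms (reusing the lines RDD loaded above; these are illustrative statements, not captured console output). The parameter type is inferred from RDD[String], so the explicit annotation, the inferred form, and the placeholder syntax all build the same filtered RDD:
// explicit parameter type
val withType = lines.filter((line: String) => line.contains("world"))
// type inferred from the element type of lines
val inferred = lines.filter(line => line.contains("world"))
// placeholder syntax
val shorthand = lines.filter(_.contains("world"))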
2. RDD transformations
1) Element-wise map():
scala> val lines = sc.parallelize(Array("hello","spark","hello","world","!"))
lines: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[4] at parallelize at <console>:24
scala> lines.foreach(println)
hello
spark
hello
world
!
scala> val lines2 = lines.map(word=>(word,1))
lines2: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[5] at map at <console>:26
scala> lines2.foreach(println)
(hello,1)
(spark,1)
(hello,1)
(world,1)
(!,1)
2)filter():
scala> val lines3 = lines.filter(word=>word.contains("hello"))
lines3: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[6] at filter at <console>:26
scala> lines3.foreach(println)
hello
hello
3)flatMap():
scala> val inputs = sc.textFile("/user/hadoop/helloSpark.txt")
inputs: org.apache.spark.rdd.RDD[String] = /user/hadoop/helloSpark.txt MapPartitionsRDD[8] at textFile at <console>:24
scala> inputs.foreach(println)
hello spark
hello world
hello !
scala> val lines = inputs.flatMap(line=>line.split(" "))
lines: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[9] at flatMap at <console>:26
scala> lines.foreach(println)
hello
spark
hello
world
hello
!
scala> lines.foreach(print)
hellosparkhelloworldhello!
4) Set operations:
scala> val rdd1 = sc.parallelize(Array("coffe","coffe","panda","monkey","tea"))
rdd1: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[10] at parallelize at <console>:24
scala> rdd1.foreach(println)
panda
monkey
tea
coffe
coffe
scala> val rdd2 = sc.parallelize(Array("coffe","monkey","kitty"))
rdd2: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[11] at parallelize at <console>:24
scala> rdd2.foreach(println)
coffe
monkey
kitty
scala> var rdd_distinct = rdd1.distinct()
rdd_distinct: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[14] at distinct at <console>:26
scala> rdd_distinct.foreach(println)
monkey
coffe
panda
tea
scala> var rdd_union = rdd1.union(rdd2)
rdd_union: org.apache.spark.rdd.RDD[String] = UnionRDD[15] at union at <console>:28
scala> rdd_union.foreach(println)
coffe
coffe
panda
monkey
tea
coffe
monkey
kitty
scala> var rdd_inter = rdd1.intersection(rdd2)
rdd_inter: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[21] at intersection at <console>:28
scala> rdd_inter.foreach(println)
monkey
coffe
scala> var rdd_sub = rdd1.subtract(rdd2)
rdd_sub: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[25] at subtract at <console>:28
scala> rdd_sub.foreach(println)
panda
tea
3. RDD actions (compute results from an RDD)
1)reduce()
scala> val rdd = sc.parallelize(Array(1,2,3,3))
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[26] at parallelize at <console>:24
scala> rdd.reduce((x,y)=>x+y)
res20: Int = 9
2)collect()
scala> rdd.collect()
res19: Array[Int] = Array(1, 2, 3, 3)
3)take(n)
scala> rdd.take(2)
res21: Array[Int] = Array(1, 2)
4)top()
scala> rdd.top(2)
res22: Array[Int] = Array(3, 3)
5) foreach(): applies a function to every element on the executors (used above for printing) and returns nothing to the driver.
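A minimal sketch of foreach versus collect (reusing the rdd from the reduce example above; illustrative, not captured output): foreach runs on the executors, while collect() first copies the elements back to the driver.
// runs the function on the executors; nothing is returned to the driver
rdd.foreach(x => println(x))
// brings all elements back to the driver as a local Array (only safe for small RDDs)
val local = rdd.collect()
local.foreach(println)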
4. RDD properties
Lineage graph of RDDs; lazy evaluation; persistence with rdd.persist(), which sets the storage (cache) level.
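A short sketch of persist() (the file path is reused from above; the storage level shown is just one common choice, not taken from these notes). Without persist(), an RDD is recomputed from its lineage on every action:
import org.apache.spark.storage.StorageLevel

val words = sc.textFile("/user/hadoop/helloSpark.txt").flatMap(line => line.split(" "))
words.persist(StorageLevel.MEMORY_ONLY)   // keep computed partitions in memory
words.count()                              // first action computes and caches
words.count()                              // later actions reuse the cached partitions
words.unpersist()                          // release the cache when finished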
5. Key-value pair RDDs
1)reduceByKey
scala> val rdd = sc.textFile("/user/hadoop/helloSpark.txt")
rdd: org.apache.spark.rdd.RDD[String] = /user/hadoop/helloSpark.txt MapPartitionsRDD[29] at textFile at <console>:24
scala> val rdd2 = rdd.map(line=>(line.split(" ")(0),line))
rdd2: org.apache.spark.rdd.RDD[(String, String)] = MapPartitionsRDD[30] at map at <console>:26
scala> rdd2.foreach(println)
(hello,hello !)
(hello,hello spark)
(hello,hello world)
scala> val rdd3 = sc.parallelize(Array((1,2),(3,4),(3,6)))
rdd3: org.apache.spark.rdd.RDD[(Int, Int)] = ParallelCollectionRDD[33] at parallelize at <console>:24
scala> var rdd4 = rdd3.reduceByKey((x,y)=>x+y)
rdd4: org.apache.spark.rdd.RDD[(Int, Int)] = ShuffledRDD[34] at reduceByKey at <console>:26
scala> rdd4.foreach(println)
(1,2)
(3,10)
2)groupByKey
scala> val rdd5 = rdd3.groupByKey()
rdd5: org.apache.spark.rdd.RDD[(Int, Iterable[Int])] = ShuffledRDD[35] at groupByKey at <console>:26
scala> rdd5.foreach(println)
(1,CompactBuffer(2))
(3,CompactBuffer(4, 6))
3) mapValues(func), flatMapValues(func), keys, values, sortByKey() (mapValues/flatMapValues are sketched after the transcript below)
scala> val rdd6 = rdd3.keys
rdd6: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[36] at keys at <console>:26
scala> rdd6.foreach(println)
3
3
1
scala> val rdd7 = rdd3.values
rdd7: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[37] at values at <console>:26
scala> rdd7.foreach(println)
2
4
6
scala> val rdd8 = rdd3.sortByKey()
rdd8: org.apache.spark.rdd.RDD[(Int, Int)] = ShuffledRDD[40] at sortByKey at <console>:26
scala> rdd8.foreach(println)
(1,2)
(3,4)
(3,6)
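mapValues and flatMapValues do not appear in the transcript above; a rough sketch using the same rdd3 (the commented results are what the API should return, not captured output):
// transforms only the values, keeping keys and partitioning: (1,20), (3,40), (3,60)
val scaled = rdd3.mapValues(v => v * 10)
// expands each value into several values, pairing every one with the original key
val expanded = rdd3.flatMapValues(v => 1 to v)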
4)combineByKey
Parameters: createCombiner, mergeValue, mergeCombiners, partitioner
scala> val scores = sc.parallelize(Array(("jake",80.0),("jake",90.0),("jake",85.0),("mike",85.0),("mike",92.0),("mike",90.0)))
scores: org.apache.spark.rdd.RDD[(String, Double)] = ParallelCollectionRDD[41] at parallelize at <console>:24
scala> scores.foreach(println)
(mike,85.0)
(mike,92.0)
(mike,90.0)
(jake,80.0)
(jake,90.0)
(jake,85.0)
scala> val score2 = scores.combineByKey(score=>(1,score),(c1:(Int,Double),newScore)=>(c1._1+1,c1._2+newScore),(c1:(Int,Double),c2:(Int,Double))=>(c1._1+c2._1,c1._2+c2._2))
score2: org.apache.spark.rdd.RDD[(String, (Int, Double))] = ShuffledRDD[42] at combineByKey at <console>:26
scala> score2.foreach(println)
(mike,(3,267.0))
(jake,(3,255.0))
scala> val average=score2.map{case(name,(num,score))=>(name,score/num)}
average: org.apache.spark.rdd.RDD[(String, Double)] = MapPartitionsRDD[43] at map at <console>:28
scala> average.foreach(println)
(mike,89.0)
(jake,85.0)
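The combineByKey call above, restated with each of the three functions bound to a name so the parameter list in the note maps directly onto the code (a sketch of the same computation, not new console output):
// createCombiner: the first score seen for a key starts a (count, sum) pair
val createCombiner = (score: Double) => (1, score)
// mergeValue: fold a further score from the same partition into the pair
val mergeValue = (c: (Int, Double), score: Double) => (c._1 + 1, c._2 + score)
// mergeCombiners: merge partial (count, sum) pairs computed on different partitions
val mergeCombiners = (c1: (Int, Double), c2: (Int, Double)) => (c1._1 + c2._1, c1._2 + c2._2)

val score2 = scores.combineByKey(createCombiner, mergeValue, mergeCombiners)
val average = score2.mapValues { case (num, sum) => sum / num }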