Testing Spark 2.0 Syntax

1. The SparkContext object and RDDs (Resilient Distributed Datasets)

1) textFile: load a dataset from a file:

scala> val lines = sc.textFile("/user/hadoop/helloSpark.txt")

lines: org.apache.spark.rdd.RDD[String] = /user/hadoop/helloSpark.txt MapPartitionsRDD[1] at textFile at <console>:24

scala> lines.count()

res0: Long = 3

2) parallelize (handy for testing): the arguments are the collection to parallelize and the number of partitions. A partition-count check follows the transcript below.

scala> var rdd = sc.parallelize(Array(1,2,2,4),4)

rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[2] at parallelize at <console>:24

scala> rdd.count()

res1: Long = 4

scala> rdd.foreach(print)

2124

scala> rdd.foreach(println)

2

1

4

2
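
The second argument to parallelize sets the partition count. A quick check on the rdd built above (a minimal sketch, run in the same spark-shell session):

// partitions is the array of partitions the data was split into; 4 were requested above
println(rdd.partitions.length)
// getNumPartitions is an equivalent shortcut
println(rdd.getNumPartitions)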

3) Scala anonymous functions and type inference

scala> val lines2 = lines.filter(line=>line.contains("world"))

lines2: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[3] at filter at <console>:26

scala> lines2.foreach(println)

hello world
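
The filter above uses a Scala anonymous function, and the parameter type of line is inferred from the RDD's element type. The equivalent explicit and placeholder forms below are a sketch, assuming the same lines RDD:

// parameter type spelled out instead of inferred
val lines2a = lines.filter((line: String) => line.contains("world"))
// placeholder syntax: the underscore stands for the single argument
val lines2b = lines.filter(_.contains("world"))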

2. RDD transformations

1) map(): element-wise transformation

scala> val lines = sc.parallelize(Array("hello","spark","hello","world","!"))

lines: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[4] at parallelize at <console>:24

scala> lines.foreach(println)

hello

spark

hello

world

!

scala> val lines2 = lines.map(word=>(word,1))

lines2: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[5] at map at <console>:26

scala> lines2.foreach(println)

(hello,1)

(spark,1)

(hello,1)

(world,1)

(!,1)

2) filter():

scala> val lines3 = lines.filter(word=>word.contains("hello"))

lines3: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[6] at filter at <console>:26

scala> lines3.foreach(println)

hello

hello

3) flatMap()

scala> val inputs = sc.textFile("/user/hadoop/helloSpark.txt")

inputs: org.apache.spark.rdd.RDD[String] = /user/hadoop/helloSpark.txt MapPartitionsRDD[8] at textFile at <console>:24

scala> inputs.foreach(println)

hello spark

hello world

hello !

scala> val lines = inputs.flatMap(line=>line.split(" "))

lines: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[9] at flatMap at <console>:26

scala> lines.foreach(println)

hello

spark

hello

world

hello

!

scala> lines.foreach(print)

hellosparkhelloworldhello!

4) Set operations (distinct, union, intersection, subtract):

scala> val rdd1 = sc.parallelize(Array("coffe","coffe","panda","monkey","tea"))

rdd1: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[10] at parallelize at <console>:24

scala> rdd1.foreach(println)

panda

monkey

tea

coffe

coffe

scala> val rdd2 = sc.parallelize(Array("coffe","monkey","kitty"))

rdd2: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[11] at parallelize at <console>:24

scala> rdd2.foreach(println)

coffe

monkey

kitty

scala> var rdd_distinct = rdd1.distinct()

rdd_distinct: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[14] at distinct at <console>:26

scala> rdd_distinct.foreach(println)

monkey

coffe

panda

tea

scala> var rdd_union = rdd1.union(rdd2)

rdd_union: org.apache.spark.rdd.RDD[String] = UnionRDD[15] at union at <console>:28

scala> rdd_union.foreach(println)

coffe

coffe

panda

monkey

tea

coffe

monkey

kitty

scala> var rdd_inter = rdd1.intersection(rdd2)

rdd_inter: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[21] at intersection at <console>:28

scala> rdd_inter.foreach(println)

monkey

coffe

scala> var rdd_sub = rdd1.subtract(rdd2)

rdd_sub: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[25] at subtract at <console>:28

scala> rdd_sub.foreach(println)

panda

tea

3. RDD actions: computing results

1) reduce()

scala> val rdd = sc.parallelize(Array(1,2,3,3))

rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[26] at parallelize at <console>:24

scala> rdd.reduce((x,y)=>x+y)

res20: Int = 9

2) collect()

scala> rdd.collect()

res19: Array[Int] = Array(1, 2, 3, 3)

3) take(n)

scala> rdd.take(2)

res21: Array[Int] = Array(1, 2)

4) top()

scala> rdd.top(2)

res22: Array[Int] = Array(3, 3)

5) foreach()
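
foreach() applies a function to each element for its side effects and returns nothing to the driver; in cluster mode the println output goes to executor logs rather than the driver console. A minimal sketch, assuming the rdd from the action examples above:

// runs on the executors; nothing is collected back to the driver
rdd.foreach(x => println(x))
// to print reliably on the driver, collect first (only safe for small RDDs)
rdd.collect().foreach(println)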

4. RDD characteristics

Lineage (dependency) graph of RDDs; lazy evaluation; persistence with RDD.persist(), which takes the desired storage level. A sketch follows below.
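
A minimal persistence sketch, assuming the rdd from the action examples above is still defined; StorageLevel.MEMORY_ONLY is one of the standard Spark storage levels:

import org.apache.spark.storage.StorageLevel

// mark the RDD for caching; nothing is materialized until an action runs (lazy evaluation)
rdd.persist(StorageLevel.MEMORY_ONLY)
rdd.count()     // the first action computes the partitions and caches them
rdd.count()     // later actions reuse the cached data
// toDebugString prints the lineage (dependency) graph Spark uses to recompute lost partitions
println(rdd.toDebugString)
rdd.unpersist() // release the cached partitions when done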

5. Key-value RDDs

1) reduceByKey (a word-count sketch follows the transcript below)

scala> val rdd = sc.textFile("/user/hadoop/helloSpark.txt")

rdd: org.apache.spark.rdd.RDD[String] = /user/hadoop/helloSpark.txt MapPartitionsRDD[29] at textFile at <console>:24

scala> val rdd2 = rdd.map(line=>(line.split(" ")(0),line))

rdd2: org.apache.spark.rdd.RDD[(String, String)] = MapPartitionsRDD[30] at map at <console>:26

scala> rdd2.foreach(println)

(hello,hello !)

(hello,hello spark)

(hello,hello world)

scala> val rdd3 = sc.parallelize(Array((1,2),(3,4),(3,6)))

rdd3: org.apache.spark.rdd.RDD[(Int, Int)] = ParallelCollectionRDD[33] at parallelize at <console>:24

scala> var rdd4 = rdd3.reduceByKey((x,y)=>x+y)

rdd4: org.apache.spark.rdd.RDD[(Int, Int)] = ShuffledRDD[34] at reduceByKey at <console>:26

scala> rdd4.foreach(println)

(1,2)

(3,10)
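
Combining flatMap, map, and reduceByKey gives the classic word count over the same helloSpark.txt used earlier; a sketch, assuming that file is still in place:

// split each line into words, pair every word with 1, then sum the counts per word
val wordCounts = sc.textFile("/user/hadoop/helloSpark.txt")
  .flatMap(line => line.split(" "))
  .map(word => (word, 1))
  .reduceByKey((x, y) => x + y)

wordCounts.collect().foreach(println) // e.g. (hello,3), (spark,1), (world,1), (!,1)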

2) groupByKey

scala> val rdd5 = rdd3.groupByKey()

rdd5: org.apache.spark.rdd.RDD[(Int, Iterable[Int])] = ShuffledRDD[35] at groupByKey at <console>:26

scala> rdd5.foreach(println)

(1,CompactBuffer(2))

(3,CompactBuffer(4, 6))

3) mapValues(func), flatMapValues(func), keys, values, sortByKey (the transcript below covers keys, values, and sortByKey; a mapValues/flatMapValues sketch follows it)

scala> val rdd6 = rdd3.keys

rdd6: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[36] at keys at <console>:26

scala> rdd6.foreach(println)

3

3

1

scala> val rdd7 = rdd3.values

rdd7: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[37] at values at <console>:26

scala> rdd7.foreach(println)

2

4

6

scala> val rdd8 = rdd3.sortByKey()

rdd8: org.apache.spark.rdd.RDD[(Int, Int)] = ShuffledRDD[40] at sortByKey at <console>:26

scala> rdd8.foreach(println)

(1,2)

(3,4)

(3,6)
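
The transcript above shows keys, values, and sortByKey; a minimal mapValues/flatMapValues sketch on the same rdd3 (the (Int, Int) pairs):

// mapValues transforms only the value, leaving the key and partitioning untouched
val rddMV = rdd3.mapValues(v => v * 10)              // (1,20), (3,40), (3,60)
// flatMapValues expands each value into zero or more values under the same key
val rddFMV = rdd3.flatMapValues(v => List(v, v + 1)) // (1,2), (1,3), (3,4), (3,5), (3,6), (3,7)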

4) combineByKey

Parameters: createCombiner, mergeValue, mergeCombiners, partitioner (an annotated sketch follows the transcript below).

scala> val scores = sc.parallelize(Array(("jake",80.0),("jake",90.0),("jake",85.0),("mike",85.0),("mike",92.0),("mike",90.0)))

scores: org.apache.spark.rdd.RDD[(String, Double)] = ParallelCollectionRDD[41] at parallelize at <console>:24

scala> scores.foreach(println)

(mike,85.0)

(mike,92.0)

(mike,90.0)

(jake,80.0)

(jake,90.0)

(jake,85.0)

scala> val score2 = scores.combineByKey(score=>(1,score),(c1:(Int,Double),newScore)=>(c1._1+1,c1._2+newScore),(c1:(Int,Double),c2:(Int,Double))=>(c1._1+c2._1,c1._2+c2._2))

score2: org.apache.spark.rdd.RDD[(String, (Int, Double))] = ShuffledRDD[42] at combineByKey at <console>:26

scala> score2.foreach(println)

(mike,(3,267.0))

(jake,(3,255.0))

scala> val average=score2.map{case(name,(num,score))=>(name,score/num)}

average: org.apache.spark.rdd.RDD[(String, Double)] = MapPartitionsRDD[43] at map at <console>:28

scala> average.foreach(println)

(mike,89.0)

(jake,85.0)
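
The same combineByKey call, rewritten with the three functions named so they line up with the parameter list above (a sketch; the optional partitioner argument is left at its default):

// createCombiner: the first score seen for a key becomes (count = 1, sum = score)
val createCombiner = (score: Double) => (1, score)
// mergeValue: fold another score from the same partition into the running (count, sum)
val mergeValue = (c: (Int, Double), score: Double) => (c._1 + 1, c._2 + score)
// mergeCombiners: merge partial (count, sum) results from different partitions
val mergeCombiners = (c1: (Int, Double), c2: (Int, Double)) => (c1._1 + c2._1, c1._2 + c2._2)

val score2b = scores.combineByKey(createCombiner, mergeValue, mergeCombiners)
val averageB = score2b.map { case (name, (num, sum)) => (name, sum / num) }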

References:

http://twitter.github.io/scala_school/zh_cn/basics.html
