1. Purpose
Use the Scala shell that ships with Spark (spark-shell) to write small examples and exercise Scala/Spark syntax.
2. Examples
2.1 Reading a local file
# test.txt contains:
abcd
heihei
heihei
# Read the file and count how many times each line occurs, using reduceByKey
val lines = sc.textFile("file:///home/xx/test.txt")
val pairs = lines.map(s => (s, 1))
pairs.reduceByKey((x, y) => x + y).foreach(println)
# Output:
(abcd,1)
(heihei,2)
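# For small data the same counts can come straight from an action:
# countByValue returns a Map of (value, count) to the driver. A minimal sketch,
# appropriate only while the number of distinct lines stays small:
lines.countByValue().foreach(println)  // collects all counts to the driver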
# reduceByKey can often replace groupByKey and shuffles far less data, because
# values are merged within each partition before the shuffle.
# The per-group top N below could also be done with groupByKey, but then every
# value of a group must be shuffled to one place before it is sorted and the
# top N taken (a groupByKey version is sketched after the reduceByKey ones below).
scala> val a = List((1, 2), (1, 3), (1,4), (1,5), (2,10), (2,11), (2,12))
a: List[(Int, Int)] = List((1,2), (1,3), (1,4), (1,5), (2,10), (2,11), (2,12))
val N = 3
# Build the RDD, then sort each group and take its first N values (ascending,
# so the N smallest):
import scala.collection.mutable.ListBuffer
val rdda = sc.parallelize(a)
rdda.map { case (name, time) =>
  val listBuffer = new ListBuffer[Int]()
  listBuffer.append(time)
  (name, listBuffer)
}.reduceByKey { case (t1, t2) =>
  val listBuffer = new ListBuffer[Int]()
  listBuffer.appendAll(t1)
  // Note: in reduceByKey, t1 and t2 may themselves be already-merged buffers,
  // so use appendAll rather than append
  listBuffer.appendAll(t2)
  listBuffer.sorted.take(N)
}.collect
# A cleaner version of the above replaces ListBuffer with an immutable List:
rdda.map { case (name, time) =>
  (name, List(time))
}.reduceByKey { case (t1, t2) =>
  (t1 ++ t2).sorted.take(N)
}.collect
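# For contrast, a sketch of the groupByKey version mentioned above: every value
# of a key is pulled to a single task before sorting, which is why reduceByKey
# is preferred here.
rdda.groupByKey().mapValues(vs => vs.toList.sorted.take(N)).collect  // all values per key shuffled together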
2.2 join example
val a1 = List((2,(200,300)), (3,(400,500)), (4,(500,600)))
val a2 = List((2,(200,300)), (3,(400,500)), (1,(500,600)))
val rdd1 = sc.parallelize(a1)
val rdd2 = sc.parallelize(a2)
rdd1.leftOuterJoin(rdd2).foreach(println)
// Unpacking the Option produced by leftOuterJoin:
val rdd3 = rdd1.leftOuterJoin(rdd2)
rdd3.map { case (id, (x, y)) =>
  val (y1, y2) = y.getOrElse((null, null))  // widens the tuple to (Any, Any); see the pattern-match sketch below
  (id, x._1, x._2, y1, y2)
}.foreach(println)
// Output:
(4,500,600,null,null)
(3,400,500,400,500)
(2,200,300,200,300)
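// A more idiomatic sketch that pattern-matches the Option instead of calling
// getOrElse with nulls (the -1 defaults are placeholders, not from the original):
rdd3.map {
  case (id, ((x1, x2), Some((y1, y2)))) => (id, x1, x2, y1, y2)
  case (id, ((x1, x2), None))           => (id, x1, x2, -1, -1)  // placeholder defaults
}.foreach(println)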
// Full interactive session:
scala> val a1 = List((2,(200,300)), (3,(400,500)), (4,(500,600)))
a1: List[(Int, (Int, Int))] = List((2,(200,300)), (3,(400,500)), (4,(500,600)))
scala> val a2 = List((2,(200,300)), (3,(400,500)), (1,(500,600)))
a2: List[(Int, (Int, Int))] = List((2,(200,300)), (3,(400,500)), (1,(500,600)))
scala> val rdd1 = sc.parallelize(a1)
rdd1: org.apache.spark.rdd.RDD[(Int, (Int, Int))] = ParallelCollectionRDD[14] at parallelize at <console>:26
scala> val rdd2 = sc.parallelize(a2)
rdd2: org.apache.spark.rdd.RDD[(Int, (Int, Int))] = ParallelCollectionRDD[15] at parallelize at <console>:26
scala> rdd1.join(rdd2).foreach(println)
(3,((400,500),(400,500)))
(2,((200,300),(200,300)))
scala> rdd2.leftOuterJoin(rdd1).foreach(println)
(3,((400,500),Some((400,500))))
(2,((200,300),Some((200,300))))
(1,((500,600),None))
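// fullOuterJoin keeps keys from both sides and wraps both values in Option;
// a minimal sketch (key 4, present only in rdd1, would yield (Some((500,600)),None),
// and key 1, present only in rdd2, (None,Some((500,600)))):
rdd1.fullOuterJoin(rdd2).foreach(println)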
2.3 groupByKey example
// Goal: build a pair RDD that groupByKey can run on
var arr1 = new Array[(Int, Int)](10)
arr1(0) = (1,1)
arr1(1) = (1,2)
arr1(2) = (2,1)
arr1(3) = (3,1)
arr1(4) = (10,1)
val rddA = sc.parallelize(arr1.filter(_ != null))  // drop the unset (null) slots
rddA.groupByKey().foreach(println)
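// The grouped Iterable can then be processed per key, e.g. summing each group
// (a sketch; for a plain sum, reduceByKey would avoid shuffling every value, as noted in 2.1):
rddA.groupByKey().mapValues(_.sum).foreach(println)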
2.4 cogroup example
scala> val rdd1 = sc.parallelize(List((1,2),(1,3),(1,4),(2,10),(2,10),(3,11),(3,12),(4,100)))
scala> val rdd2 = sc.parallelize(List((1,2), (1,3),(10,111)))
scala> rdd1.cogroup(rdd2).map{case (id, (f1,f2)) =>
| val f = if (f1.isEmpty) -1 else f1
| (id, f, f1, f2)}.foreach(println)
(1,CompactBuffer(2, 3, 4),CompactBuffer(2, 3, 4),CompactBuffer(2, 3))
(4,CompactBuffer(100),CompactBuffer(100),CompactBuffer())
(3,CompactBuffer(11, 12),CompactBuffer(11, 12),CompactBuffer())
(10,-1,CompactBuffer(),CompactBuffer(111))
(2,CompactBuffer(10, 10),CompactBuffer(10, 10),CompactBuffer())
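// The joins above can be expressed on top of cogroup; a sketch of an inner
// join built from cogroup and flatMapValues:
rdd1.cogroup(rdd2).flatMapValues { case (vs, ws) =>
  for (v <- vs; w <- ws) yield (v, w)  // cross product of both sides per key; keys with an empty side vanish
}.foreach(println)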
2.5 Option[Boole