Creating RDDs
// 1) Use sc.parallelize directly
val makerdd1 = sc.parallelize(Array(1, 2, 3, 4, 5, 6, 7, 8))
// 2) Use makeRDD, which calls parallelize under the hood
val makerdd2 = sc.makeRDD(List(1, 2, 3))
// 3) Create from a dataset in an external storage system
val makerdd3 = sc.textFile("hdfs://master:9000/RELEASE")
map and mapValues
// map transforms each element of the source RDD into a new element by applying the user-defined function f
val source = sc.parallelize(1 to 10)
val mapadd = source.map(_ * 2)
mapadd.collect().foreach(println)
mapValues operates only on the V of a (K, V) pair RDD; the keys are left unchanged
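A minimal sketch of mapValues (the pair RDD below is made up for illustration):
val mapValuesRdd = sc.parallelize(Array(("a", 1), ("b", 2), ("c", 3)))
// Only the values are transformed; the keys stay as-is
mapValuesRdd.mapValues(_ * 10).collect().foreach(println)
// (a,10) (b,20) (c,30)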
filter
// filter returns a new RDD made up of the input elements for which func returns true
val sourceFilter = sc.parallelize(Array("xiaoming", "xiaojiang", "xiaohe", "dazhi"))
val filter = sourceFilter.filter(_.contains("xiao"))
filter.collect().foreach(println)
flatMap
// flatMap maps each element of the source RDD to a collection with the function f, then flattens all the resulting collections into a single RDD
val sourceFlat = sc.parallelize(1 to 5)
val flatMap = sourceFlat.flatMap(1 to _)
flatMap.collect().foreach(println)
union
// union returns a new RDD containing the union of the source RDD and the argument RDD
val unionRdd1 = sc.parallelize(1 to 5)
val unionRdd2 = sc.parallelize(5 to 10)
val unionRdd3 = unionRdd1.union(unionRdd2)
unionRdd3.collect().foreach(println)
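Note that union does not deduplicate: 5 appears in both inputs above, so it shows up twice in the result. Chaining distinct (covered below) removes the duplicate:
// 5 now appears only once
unionRdd3.distinct().collect().foreach(println)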
intersection
// intersection returns a new RDD containing the intersection of the source RDD and the argument RDD
val intersectRdd1 = sc.parallelize(1 to 7)
val intersectRdd2 = sc.parallelize(5 to 10)
val intersectRdd3 = intersectRdd1.intersection(intersectRdd2)
intersectRdd3.collect().foreach(println)
distinct
// distinct returns a new RDD with duplicate elements of the source RDD removed
val distinctRdd = sc.parallelize(List(1, 2, 1, 5, 2, 9, 6, 1))
val distinctResult = distinctRdd.distinct()
distinctResult.collect().foreach(println)
partitionBy
// partitionBy repartitions a pair RDD. If the RDD's existing partitioner equals the requested one, no repartitioning happens; otherwise a ShuffledRDD is produced.
val partitionRdd1 = sc.parallelize(Array((1, "aaa"), (2, "bbb"), (3, "ccc"), (4, "ddd")), 4)
println(partitionRdd1.partitions.size)
val partitionRdd2 = partitionRdd1.partitionBy(new org.apache.spark.HashPartitioner(2))
println(partitionRdd2.partitions.size)
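The no-shuffle behavior described above can be checked directly. Assuming partitionBy returns the source RDD itself when the partitioners are equal (as Spark's implementation does), the following prints true:
// Repartitioning with an equal partitioner returns the same RDD instance, i.e. no ShuffledRDD is created
println(partitionRdd2.partitionBy(new org.apache.spark.HashPartitioner(2)) eq partitionRdd2)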
reduceByKey groupByKey combineByKey aggregateByKey foldByKey sortByKey
// reduceByKey, called on a (K,V) RDD, returns a (K,V) RDD in which the values for each key are aggregated with the given reduce function. The number of reduce tasks can be set with an optional second argument.
val reduceByKeyRdd = sc.parallelize(List(("female", 1), ("male", 5), ("female", 5), ("male", 2)))
val reduceByKeyResult = reduceByKeyRdd.reduceByKey((x, y) => x + y)
reduceByKeyResult.collect().foreach(println)
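As noted above, an optional second argument sets the number of reduce tasks; a minimal sketch:
// Same aggregation, but with 4 output partitions
val reduceByKeyResult4 = reduceByKeyRdd.reduceByKey((x, y) => x + y, 4)
println(reduceByKeyResult4.partitions.size) // 4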
// groupByKey also operates per key, but it simply collects the values for each key into one sequence
val words = Array("one", "two", "two", "three", "three", "three")
val wordPairsRDD = sc.parallelize(words).map(word => (word, 1))
// Values with the same key are grouped together: (two,CompactBuffer(1, 1)) (one,CompactBuffer(1)) (three,CompactBuffer(1, 1, 1))
val group = wordPairsRDD.groupByKey()
group.collect().foreach(println)
val result = group.map(t => (t._1, t._2.sum))
result.collect().foreach(println)
// combineByKey merges the values of each key; here it builds a (sum, count) pair per key
val scores = Array(("Fred", 88), ("Fred", 95), ("Wilma", 91), ("Wilma", 93), ("Mike", 95), ("Mike", 98))
val input = sc.parallelize(scores)
val combine = input.combineByKey(
(v) => (v, 1), // createCombiner
(acc: (Int, Int), v) => (acc._1 + v, acc._2 + 1), // mergeValue
(acc1: (Int, Int), acc2: (Int, Int)) => (acc1._1 + acc2._1, acc1._2 + acc2._2) // mergeCombiners: merge the per-partition combiners
)
// (Wilma,(184,2)) (Mike,(193,2)) (Fred,(183,2))
combine.collect.foreach(println)
val combineByKeyResult = combine.map { case (key, value) => (key, value._1 / value._2.toDouble) }
// (Wilma,92.0) (Mike,96.5) (Fred,91.5)
combineByKeyResult.collect.foreach(println)
// aggregateByKey: with 3 partitions, take the per-partition max value for each key, then add the maxima across partitions
val aggregateByKeyRdd = sc.parallelize(List((1, 3), (1, 2), (1, 4), (2, 3), (3, 6), (3, 8)), 3)
// zeroValue is the initial value applied per key within each partition
val agg = aggregateByKeyRdd.aggregateByKey(0)(math.max(_, _), _ + _)
// With 3 partitions the data splits as [(1,3),(1,2)], [(1,4),(2,3)], [(3,6),(3,8)];
// the per-partition maxima are (1,3), (1,4), (2,3), (3,8), and summing them across partitions gives (3,8), (1,7), (2,3)
agg.collect.foreach(println)
// Print the number of partitions
println(agg.partitions.size)
// foldByKey is a simplified aggregateByKey where seqOp and combOp are the same function
val foldByKeyRdd = sc.parallelize(List((1, 3), (1, 2), (1, 4), (2, 3), (3, 6), (3, 8)), 3)
val foldByKeyAgg = foldByKeyRdd.foldByKey(0)(_ + _)
// Sums per key: (1,9), (2,3), (3,14)
foldByKeyAgg.collect.foreach(println)
// sortByKey sorts a (K,V) RDD by key
val sortByKeyRdd = sc.parallelize(Array((3, "aa"), (6, "cc"), (2, "bb"), (1, "dd")))
// ascending
sortByKeyRdd.sortByKey(true).collect().foreach(println)
// descending
sortByKeyRdd.sortByKey(false).collect().foreach(println)
// sortBy is more flexible than sortByKey: it sorts by the result of an arbitrary key function
val sortByRdd = sc.parallelize(List(1, 2, 3, 4))
sortByRdd.sortBy(x => x).collect().foreach(println)
sortByRdd.sortBy(x => x % 3).collect().foreach(println)
cogroup
// cogroup, called on RDDs of type (K,V) and (K,W), returns an RDD of type (K, (Iterable[V], Iterable[W]))
val cogroupRDD = sc.parallelize(Array((1, "a"), (2, "b"), (3, "c")))
val cogroupRDD1 = sc.parallelize(Array((1, 4), (2, 5), (3, 6)))
cogroupRDD.cogroup(cogroupRDD1).collect().foreach(println)
val cogroupRDD2 = sc.parallelize(Array((4, 4), (2, 5), (3, 6)))
cogroupRDD.cogroup(cogroupRDD2).collect().foreach(println)
val cogroupRDD3 = sc.parallelize(Array((1, "a"), (1, "d"), (2, "b"), (3, "c")))
cogroupRDD3.cogroup(cogroupRDD2).collect().foreach(println)
cartesian computes the Cartesian product of two RDDs
val cartesianRDD1 = sc.parallelize(1 to 3)
val cartesianRDD2 = sc.parallelize(2 to 5)
// 3 × 4 = 12 pairs: (1,2), (1,3), ..., (3,5)
cartesianRDD1.cartesian(cartesianRDD2).collect().foreach(println)
glom turns each partition into an array, producing a new RDD of type RDD[Array[T]]
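Since glom has no example above, here is a minimal sketch assuming an RDD with 2 partitions:
val glomRdd = sc.parallelize(1 to 8, 2)
// Each partition becomes one Array
glomRdd.glom().collect().foreach(arr => println(arr.mkString(",")))
// 1,2,3,4
// 5,6,7,8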
subtract (set difference)
// subtract removes from the source RDD every element that also appears in the argument RDD; only the elements unique to the source RDD are kept
val subtractRDD = sc.parallelize(3 to 8)
val subtractRDD1 = sc.parallelize(1 to 5)
// 6, 7, 8: the elements of 3 to 8 that are not in 1 to 5
subtractRDD.subtract(subtractRDD1).collect().foreach(println)