RDD的算子大全
- RDD的算子大全
- 基础转换操作
- 0. Spark初始化
- 1.map操作
- 2.flatMap操作
- 3.filter操作
- 4.distinct去重操作
- 5.mapPartitions操作
- 6. mapPartitionsWithIndex 操作
- 7.union操作
- 8. intersection交集
- 9. subtract差集
- 10. cartesian 笛卡尔积
- 11.repartition重新分区
- 12. randomSplit按权重拆分
- 13. glom分区变数组
- 14.zip 拉链操作
- 15.zipPartitions操作
- 16.zipWithIndex操作
- 17. zipWithUniqueId操作
- 18. sortBy操作-- 按指定的列排序
- 19. sortByKey操作- 按键排序
- 20. sample抽样操作
- 键值对转换操作
- RDD 行动操作
RDD的算子大全
基础转换操作
0. Spark初始化
val conf = new SparkConf().setMaster("local").setAppName("api01")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
推荐的写法
val sparkSession = SparkSession.builder()
.master("local")
.appName("ProduceClientLog")
// .config("hive.metastore.uris", "thrift://192.168.7.11:9083")
.config(new SparkConf())
.enableHiveSupport()
.getOrCreate()
val sc = sparkSession.sparkContext
sc.setLogLevel("ERROR")
1.map操作
// 遍历RDD的每一个数据集,每一次应用f函数,产生一个新的RDD
// Demo of RDD.map: multiply every element of 1..10 by 10 and print the results.
def main(args: Array[String]): Unit = {
  val sparkConf = new SparkConf().setMaster("local").setAppName("test01")
  val sparkContext = new SparkContext(sparkConf)
  // map applies the function to each element; collect brings the results to the driver
  val multiplied: Array[Int] = sparkContext.parallelize(1 to 10).map(n => n * 10).collect()
  multiplied.foreach(println)
}
// 原理 对每一个元素应用函数,返回函数处理结果
f(self.next())
// k-v翻转
map(_.swap)
2.flatMap操作
// 将RDD中的每一个元素通过f函数转换为新的元素,并封装到新的RDD中
// f函数返回的是集合,会保存在自己的迭代器中,自己的迭代器中没有值再去父迭代器
val text = sc.parallelize(Array("hello,world", "how are you?", "and you"))
// 根据逗号和空格切分
text.flatMap(_.split("[, ]")).foreach(println)
// 原理 函数处理值,返回迭代器
f(self.next()).toIterator
// 例子
val data: RDD[(String, Int)] = sc.parallelize(Array(
("zhangsan", 12),
("zhangsan", 23),
("zhangsan", 34),
("lisi", 25),
("lisi", 53),
("lisi", 45),
("wangwu", 56),
("wangwu", 78)
))
//key value -> 一组 行转列
val group: RDD[(String, Iterable[Int])] = data.groupByKey()
// group.foreach(println)
// 行列转换
group.flatMap(e => e._2.map(x => (e._1,x)).iterator).foreach(println)
group.flatMapValues(e => e.iterator).foreach(println)
group.mapValues(e => e.toList.sorted).foreach(println)
group.flatMapValues(e => e.toList.sorted.take(2)).foreach(println)
3.filter操作
// 条件过滤
// 对RDD中的每一个元素应用f函数,返回值是true,该元素会被添加到新的RDD
// Demo of RDD.filter: keep the multiples of 3 from 1..30 and print them.
def main(args: Array[String]): Unit = {
  val sparkConf = new SparkConf().setMaster("local").setAppName("test01")
  val sparkContext = new SparkContext(sparkConf)
  // filter keeps elements for which the predicate returns true
  val multiplesOfThree = sparkContext.parallelize(1 to 30).filter(n => n % 3 == 0).collect()
  multiplesOfThree.foreach(println)
}
// 原理,对每一个元素做判断,符合判断,跳出do-while循环,返回值
4.distinct去重操作
// 去重
sc.parallelize(List(1, 2, 3, 4, 5, 4, 3, 2, 1)).distinct().foreach(println)
// 原理
data.map(x => (x, null)).reduceByKey((x,y)=>x).map(_._1).foreach(println)
5.mapPartitions操作
// 遍历分区中的数据进行批处理
// Demo of RDD.mapPartitions: process each partition's records as a batch.
def main(args: Array[String]): Unit = {
  val conf: SparkConf = new SparkConf().setMaster("local").setAppName("test01")
  val sc: SparkContext = new SparkContext(conf)
  sc.setLogLevel("ERROR")
  val rddDate: RDD[(String, Int)] = sc.parallelize(
    Array(
      ("202100001", 83),
      ("202100002", 95),
      ("202100003", 93),
      ("202100004", 69),
      ("202100005", 62),
      ("202100006", 74),
      ("202100007", 97),
      ("202100008", 100)
    ), 2
  )
  // Variant 1: buffer the whole partition in a List — OOM risk on large partitions.
  val rddData2 = rddDate.mapPartitions(iter => {
    var result = List[String]() // grows with the partition size
    while (iter.hasNext) {
      result = iter.next() match {
        // "::" prepends the formatted record to the list
        case (id, grade) if grade >= 80 => id + "_" + grade :: result
        case _ => result
      }
    }
    result.iterator
  }
  )
  rddData2.collect.foreach(println)
  // Variant 2: hand-rolled iterator — streams records one at a time, no buffering.
  // FIX: the original referenced an undefined `rddData`; the RDD above is `rddDate`.
  rddDate.mapPartitions(iter => {
    new Iterator[String]{
      override def hasNext: Boolean = iter.hasNext
      override def next(): String = {
        val value = iter.next()
        value._1 + " 分数: " + value._2
      }
    }
  }).foreach(println)
}
// Hand-rolled iterator that also filters, mimicking how the filter operator works:
// it lazily pulls from the parent iterator and skips records below the threshold,
// so no intermediate collection is built per partition.
rddDate.mapPartitions((iter: Iterator[(String, Int)]) => {
new Iterator[(String, Int)] {
// whether `hd` currently holds a fetched-but-not-yet-consumed element
private var hdDefined: Boolean = false
// look-ahead buffer for the next matching element
private var hd: (String, Int) = _
override def hasNext: Boolean = hdDefined || {
do {
// parent exhausted: no more matching elements (local return from hasNext)
if (!iter.hasNext) return false
hd = iter.next()
} while (hd._2 < 80) // skip grades below 80
hdDefined = true
true
}
override def next(): (String, Int) = {
if (hasNext) {
// hand out the buffered element and mark the buffer empty
hdDefined = false
hd
} else {
throw new NoSuchElementException("next on empty iterator")
}
}
}
}).foreach(println)
6. mapPartitionsWithIndex 操作
// mapPartitionsWithIndex: like mapPartitions, but the function also receives the
// partition index as its first argument.
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf().setMaster("local").setAppName("test01")
val sc: SparkContext = new SparkContext(conf)
sc.setLogLevel("ERROR")
val rddData: RDD[(String, Int)] = sc.parallelize(
Array(
("202100001", 83),
("202100002", 95),
("202100003", 93),
("202100004", 69),
("202100005", 62),
("202100006", 74),
("202100007", 97),
("202100007", 93),
("202100008", 100)
), 2
)
// Keep entries with grade >= 80, tagging each with its partition index.
val rddData2 = rddData.mapPartitionsWithIndex( (index,iter) => {
var result = List[String]() // NOTE: buffers the partition in a List — OOM risk on large partitions
while (iter.hasNext) {
result = iter.next() match {
// format: "<id>_<grade>[<partition index>]", prepended with "::"
case (id, grade) if grade >= 80 => id + "_" + grade + "[" + index + "]" :: result
case _ => result
}
}
result.iterator
}
)
rddData2.collect.foreach(println)
}
// 数据库操作
val data = sc.parallelize(1 to 10, 2)
data.mapPartitionsWithIndex(
(index: Int, piter: Iterator[Int]) => {
val listBuffer = new ListBuffer[String]() // 内存会炸
println(s"--$index----conn--mysql----")
while (piter.hasNext){
val value: Int = piter.next()
println(s"--$index----select--mysql----")
listBuffer.+= (value + "select")
println("-----close--mysql------")
}
listBuffer.iterator
}
)
// 数据库操作,自己写迭代器
data.mapPartitionsWithIndex((pindex,piter) => {
new Iterator[String]() {
println(s"---$pindex--conn--mysql------")
override def hasNext: Boolean = {
if (!piter.hasNext) {println("close conn mysql");false}
else true
}
override def next(): String = {
val value = piter.next()
println(s"--- $pindex --select $value-----")
s"selected $value"
}
}
}).foreach(println)
// 抽样显示数据在哪个分区
data.mapPartitionsWithIndex(
(pi, pt) => {
pt.map(e => (pi, e))
}
).sample(false,0.3).foreach(println)
7.union操作
// 对两个RDD做并集,并返回新的RDD
val data: RDD[Int] = sc.parallelize(1 to 5)
val data2: RDD[Int] = sc.parallelize(3 to 7)
// 并集
data.union(data2).foreach(println)
// 差集
data.subtract(data2).foreach(println)
// 交集
data.intersection(data2).foreach(println)
// 笛卡尔积
data.cartesian(data2).foreach(println)
8. intersection交集
// 两个RDD取交集
val data: RDD[Int] = sc.parallelize(1 to 5)
val data2: RDD[Int] = sc.parallelize(3 to 7)
// 差集
data.subtract(data2).foreach(println)
// 并集
data.union(data2).foreach(println)
// 交集
data.intersection(data2).foreach(println)
// 笛卡尔积
data.cartesian(data2).foreach(println)
9. subtract差集
// 两个RDD取差集
val data: RDD[Int] = sc.parallelize(1 to 5)
val data2: RDD[Int] = sc.parallelize(3 to 7)
// 差集
data.subtract(data2).foreach(println)
// 并集
data.union(data2).foreach(println)
// 交集
data.intersection(data2).foreach(println)
// 笛卡尔积
data.cartesian(data2).foreach(println)
10. cartesian 笛卡尔积
val data: RDD[Int] = sc.parallelize(1 to 5)
val data2: RDD[Int] = sc.parallelize(3 to 7)
// 差集
data.subtract(data2).foreach(println)
// 并集
data.union(data2).foreach(println)
// 交集
data.intersection(data2).foreach(println)
// 笛卡尔积
data.cartesian(data2).foreach(println)
11.repartition重新分区
// 内部操作是coalesce
// Demo of changing an RDD's partition count with coalesce/repartition.
def main(args: Array[String]): Unit = {
  val conf: SparkConf = new SparkConf().setMaster("local").setAppName("test01")
  val sc: SparkContext = new SparkContext(conf)
  sc.setLogLevel("ERROR")
  val data: RDD[Int] = sc.parallelize(1 to 5,3)
  val data2: RDD[Int] = sc.parallelize(3 to 7,6)
  // Increasing the partition count requires a shuffle.
  val data01: RDD[Int] = data.coalesce(4,shuffle = true)
  println(data01.partitions.length)
  // Decreasing the partition count can skip the shuffle — use coalesce without
  // shuffle; repartition always shuffles.
  // FIX: the original reassigned the immutable `val data2` (compile error);
  // bind the narrowed RDD to a new val instead.
  val data02: RDD[Int] = data2.coalesce(2)
  println(data02.partitions.length)
}
// 原理
coalesce(numPartitions, shuffle = true)
// 查看分区数
data.partitions.length
data.getNumPartitions
12. randomSplit按权重拆分
val data: RDD[Int] = sc.parallelize(1 to 10,3)
val array = data.randomSplit(Array(1, 4, 5))
array(0).foreach(println)
array(1).foreach(println)
array(2).foreach(println)
13. glom分区变数组
// 将RDD中的每一个分区转数组,放置到新的RDD,数组元素类型跟分区元素类型一致
val rddData1 = sc.parallelize(1 to 10,2)
val myArray = rddData1.glom().collect()
sc.parallelize(1 to 30, 2).glom().flatMap(_.iterator).foreach(println)
14.zip 拉链操作
// 将两个RDD以键值对的形式合并,其中key为第一个RDD,value为第二个RDD
val rddData1 = sc.parallelize(1 to 5, 2)
val rddData2 = sc.parallelize(Array("A","B","C","D","E"), 2)
// 加入序号
rddData2.zipWithIndex().map(_.swap).foreach(println)
// 两个rdd拼成map
rddData1.zip(rddData2).foreach(println)
15.zipPartitions操作
// 两个RDD的分区数必须要相同
// 第二个隐式参数是一个函数,约定如何返回值
// 代码
val rddData1 = sc.parallelize(1 to 10, 2)
val rddData2 = sc.parallelize(20 to 25, 2)
// 使用内部List集合存储,数据量大会有内存溢出风险
rddData1.zipPartitions(rddData2)((rddIter1,rddIter2) => {
var result = List[(Int, Int)]()
while (rddIter1.hasNext && rddIter2.hasNext){
result ::= (rddIter1.next(),rddIter2.next())
}
result.iterator
} )
// 自定义迭代器实现
rddData1.zipPartitions(rddData2)((rddIter1: Iterator[Int], rddIter2: Iterator[Int]) => {
new Iterator[(Int,Int)]{
override def hasNext: Boolean = rddIter1.hasNext && rddIter2.hasNext
override def next(): (Int, Int) = {
(rddIter1.next(),rddIter2.next())
}
}
}).foreach(println)
16.zipWithIndex操作
// 将数据整合成二元组
val rddData = sc.parallelize(List("A", "B", "C", "D", "E"), 2)
rddData.zipWithIndex().foreach(println)
// 加入序号
rddData2.zipWithIndex().map(_.swap).foreach(println)
17. zipWithUniqueId操作
// 将RDD中的元素与该元素对应的唯一ID进行zip操作
// ID生成算法: xn + k (x为递增自然数,n为总分区数,k为当前元素所在的分区号)
val rddData = sc.parallelize(List("A", "B", "C", "D", "E"), 2)
rddData.zipWithUniqueId().foreach(print)
// (A,0)(B,2)(C,1)(D,3)(E,5)
18. sortBy操作-- 按指定的列排序
// 指定需要排序的位置
// 隐式函数,会自动作用在sortBy函数
// Implicit String ordering picked up by sortBy: longer strings sort first
// (i.e. descending by length; equal lengths compare as equal).
implicit object myOrderOps extends Ordering[String] {
  override def compare(x: String, y: String): Int =
    Ordering.Int.compare(y.length, x.length)
}
val rddData1 = sc.parallelize(Array(("dog", 3), ("cat", 2), ("scala", 3), ("hadoop", 5), ("spark", 2), ("zookeeper", 8)))
rddData1.sortBy(_._1, ascending = false).foreach(println) // scala.math.Ordering
}
// 例子
// 26.176.71.174 上海 2018-11-12 1542011088724 8415804143655616455 www.dangdang.com Click
println("---------------UV---------------")
file.map(line => {
val strLIst = line.split("\t")
(strLIst(5), strLIst(0))
}).distinct()
.map(m => (m._1, 1))
.reduceByKey(_ + _)
.sortBy(_._2, ascending = false)
.take(5)
.foreach(println)
19. sortByKey操作- 按键排序
sortByKey(ascending = false)
20. sample抽样操作
val data: RDD[Int] = sc.parallelize(1 to 10,5)
println("----------------------")
// 参数1: 是否放回抽样(withReplacement,放回则可能重复) 参数2: 抽样比例fraction(期望占比,并非精确数量) 参数3: 随机种子seed
data.sample(true,0.3,222).foreach(println)
println("----------------------")
data.sample(true,0.3,222).foreach(println)
println("----------------------")
data.sample(false,0.1,221).foreach(println)
键值对转换操作
1. partitionBy操作
// 按照分区器重新分区
// 可以使用spark的默认分区器,也可以继承Partitionner自己实现分区器
val rddData = sc.parallelize(Array(("Chauncy", 18), ("Bob", 19), ("Alice", 33), ("Thomas", 24), ("Tom", 24)), 2)
import org.apache.spark.HashPartitioner
rddData.partitionBy(new HashPartitioner(4))
.mapPartitionsWithIndex((pindex, iter) => {
new Iterator[(String, Int, Int)] {
override def hasNext: Boolean = iter.hasNext
override def next(): (String, Int, Int) = {
val value = iter.next()
(value._1, value._2, pindex)
}
}
}).foreach(println)
// 自定义分区器
// Custom partitioner demo: null keys go to partition 0, all other keys to partition 1.
// NOTE(review): partitions with index >= 2 are never used even when `partitions` > 2 —
// intentional here only to demonstrate writing a Partitioner.
class MyPartitioner(partitions: Int) extends Partitioner {
require(partitions >= 0, s"Number of partitions ($partitions) cannot be negative.")
// total number of partitions Spark will allocate for this RDD
override def numPartitions: Int = partitions
// choose the target partition index for a key
override def getPartition(key: Any): Int = key match {
case null => 0
case _ => 1
}
}
// main
val rddData = sc.parallelize(Array(("Chauncy", 18), ("Bob", 19), ("Alice", 33), ("Thomas", 24), ("Tom", 24)), 2)
rddData.partitionBy(new MyPartitioner(3))
.mapPartitionsWithIndex((pindex, piter) => {
var result = List[String]()
while (piter.hasNext) {
result = s"--$pindex --- ${piter.next()}" :: result
}
result.iterator
}).collect.foreach(println)
2. reduceByKey操作
// 针对RDD[K,V]类型的数据根据K对V聚合
// 根据key缩减,函数的参数是oldValue和newValue
sc.parallelize(List(1, 2, 3, 4, 5, 4, 3, 2, 1)).map((_, 1)).reduceByKey(_ + _).foreach(println)
// 原理
data.map((_,1)).combineByKey(
// createCombiner: V => C,
// 第一条记录的value,怎么放入hashmap
(value: Int) => value,
// mergeValue: (C, V) => C,
// 如果有第二条记录,第二条以及之后的value怎么放到hashmap
(v1:Int,v2:Int)=> v1 + v2,
// mergeCombiners: (C, C) => C,
// 合并溢写结果的函数
(v1:Int,v2:Int)=> v1 + v2
).foreach(println)
3. groupByKey操作
// 按key分组
val data: RDD[(String, String)] = sc.parallelize(List(
("班级1", "Chauncy"),
("班级3", "张大壮"),
("班级2", "李小璐"),
("班级1", "王潇"),
("班级2", "潘伟"),
("班级1", "张善为"),
("班级3", "李敢存"),
("班级3", "王友山")
))
//key value -> 一组 行转列
val group = data.groupByKey()
// 行列转换
group.flatMap(e => e._2.map(x => (e._1,x)).iterator).foreach(println)
group.flatMapValues(e => e.iterator).foreach(println)
group.mapValues(e => e.toList).foreach(println)
group.flatMapValues(e => e.toList.sorted.take(2)).foreach(println)
4. combineByKey操作
// 按key聚合
// 案例 计算每月前3的温度
val file = sc.parallelize(Array[String](
"2021-6-1 39",
"2019-5-21 33",
"2021-6-1 38",
"2021-6-2 31",
"2018-3-11 18",
"2018-4-23 22",
"2021-6-21 34",
"1970-8-23 23",
"1970-8-8 32"
))
// 将行处理成 (2021,6,1,39)
// Parse each line like "2021-6-1 39" into a (year, month, day, temperature) tuple.
file.map(line => {
  val ve = line.split("[- \t]+") // split on '-', spaces, or tabs
  (ve(0), ve(1), ve(2), ve(3))
}).foreach(println)
// FIX: removed a stray "})" that followed the foreach in the original — it closed nothing.
// 隐式转换,可以替换掉scala.util.Sorting.quickSort(ov) 后面的参数
// implicit val myOrder = new Ordering[(Int, Int)] {
// override def compare(x: (Int, Int), y: (Int, Int)) = y._2.compareTo(x._2)
// }
// 5th-generation tuning with combineByKey: instead of grouping all values per key,
// keep a fixed 3-slot buffer of top temperatures per (year, month) key.
// Key: (year, month); value: (day, temperature).
val res: RDD[((Int, Int), Array[(Int, Int)])] = data.map(t4 => ((t4._1, t4._2), (t4._3, t4._4))).combineByKey(
// createCombiner: seed the 3-slot array with the first record; (0, 0) marks empty slots
(v1: (Int, Int)) => {
Array(v1, (0, 0), (0, 0))
},
// mergeValue: fold each further record in the same partition into the 3-slot buffer
(ov: Array[(Int, Int)], nv: (Int, Int)) => {
var flag = 0 // 0 = day not in buffer, 1 = same day with higher temp (replaced), 2 = same day with lower temp (ignored)
for (i <- ov.indices) {
if (ov(i)._1 == nv._1) { // same day already buffered
if (ov(i)._2 < nv._2) { // new temperature is higher — replace it
flag = 1
ov(i) = nv
} else flag = 2
}
}
if (flag == 0) { // a day not yet in the buffer: overwrite the last (smallest) slot
ov(ov.length - 1) = nv
}
// in-place sort, descending by temperature (ov.sorted would allocate a new array)
scala.util.Sorting.quickSort(ov)(new Ordering[(Int, Int)] {
override def compare(x: (Int, Int), y: (Int, Int)) = y._2.compareTo(x._2)
})
ov
},
// mergeCombiners: merge per-partition (spilled) buffers.
// NOTE(review): union + sorted uses the default tuple ordering (ascending by day),
// not the descending-temperature ordering above, and keeps up to 6 slots — verify intent.
(v1: Array[(Int, Int)], v2: Array[(Int, Int)]) => {
v1.union(v2).sorted
}
)
// materialize buffers as Lists and print, sorted by (year, month) key
res.map(x => (x._1, x._2.toList)).sortByKey().foreach(println)
// 案列2
// combinerByKey算子
val data: RDD[(String, String)] = sc.parallelize(List(
("班级1", "Chauncy"),
("班级3", "张大壮"),
("班级2", "李小璐"),
("班级1", "王潇"),
("班级2", "潘伟"),
("班级1", "张善为"),
("班级3", "李敢存"),
("班级3", "王友山")
))
data.combineByKey(
(v1: String) => ListBuffer(v1), // ListBuffer[String]
(buf: ListBuffer[String], v: String) => buf += v,
(c1:ListBuffer[String],c2:ListBuffer[String])=> c1 ++= c2
).foreach(println)
// 案列3:求平均数
val data: RDD[(String, Int)] = sc.parallelize(Array(
("chauncy", 12),
("chauncy", 23),
("chauncy", 34),
("mft", 45),
("mft", 56),
("mft", 67),
("yiyun", 78),
("yiyun", 89)
))
data.combineByKey(
(score:Int) => (score,1), // 分数,计数count
(s1:(Int,Int),s2:Int) => (s1._1+s2,s1._2+1),// 分数相加,count加一
(c1:(Int,Int),c2:(Int,Int)) => (c1._1+c2._1,c1._2+c2._2) // 合并时,分数相加,count相加
).mapValues((scoce_count: (Int, Int)) => {
val scoce_sum = scoce_count._1
val count_sum = scoce_count._2
val avg = scoce_sum /count_sum
(scoce_sum,count_sum,avg)
}).foreach(println)
/*
(yiyun,(167,2,83))
(mft,(168,3,56))
(chauncy,(69,3,23))
*/
5. aggregateByKey操作
def aggregateByKey(zeroValue: U)(seqOp: (U, V) => U,combOp: (U, U) => U)
zeroValue: 设置聚合时的初始值,可以是任意值,也可以是集合
seqOp: 将值V聚合到类型为U的对象中
combOp: 跨分区合并,汇总数据
// 案列: 求每个用户访问过的接口,不能有重复的
val data: RDD[(String, String)] = sc.parallelize(Array(
("chauncy", "接口1"),
("chauncy", "接口2"),
("chauncy", "接口3"),
("mft", "接口3"),
("mft", "接口3"),
("mft", "接口2"),
("yiyun", "接口3"),
("yiyun", "接口2")
))
import collection.mutable
data.aggregateByKey(mutable.Set[String]())(
(v1: mutable.Set[String], v2: String) => v1 += v2,
(c1: mutable.Set[String], c2: mutable.Set[String]) => c1 ++= c2
).foreach(println)
// 等价于上面的
data.combineByKey(
(name1: String) => {
mutable.Set[String](name1)
},
(v1: mutable.Set[String], v2: String) => v1 += v2,
(c1: mutable.Set[String], c2: mutable.Set[String]) => c1 ++= c2
).foreach(println)
6. foldByKey操作
// aggregateByKey,reduceByKey和foldByKey底层都是调用combineByKey
// aggregateByKey,reduceByKey的输入输出类型可以不一致,但是foldByKey输入输出类型必须一致
// 案列,每人使用100元优惠券,计算购买商品后应付多少钱
val data: RDD[(String, Int)] = sc.parallelize(Array(
("chauncy", 12),
("chauncy", 23),
("chauncy", 34),
("mft", 45),
("mft", 56),
("mft", 67),
("yiyun", 78),
("yiyun", 89)
))
data.foldByKey(-100)(_+_).mapValues(money => if (money < 0) 0 else money).foreach(println)
// Same result written with combineByKey (equivalent to foldByKey(-100)(_ + _)).
// FIX: `data` is an RDD[(String, Int)] — its records have no `_3`; use `_2`.
// NOTE(review): createCombiner applies the -100 coupon once per key PER PARTITION,
// so a key spread over several partitions is discounted more than once
// (foldByKey's zeroValue behaves the same way) — verify partitioning assumptions.
data.map(line => (line._1, line._2))
  .combineByKey(
    (v1: Int) => -100 + v1,        // first value for a key: apply the coupon
    (o1: Int, o2: Int) => o1 + o2, // fold further values within a partition
    (n1: Int, n2: Int) => n1 + n2  // merge per-partition results
  ).mapValues(
    money => if (money > 0) money else 0 // never charge a negative amount
  ).foreach(println)
7. sortByKey操作
// 按键排序
// 例子
file.map(line => (line.split("\t")(5),1)) // 切分取第6个,拼接成二元组
.reduceByKey(_+_) // 缩减,将value相加 oldValue + newValue
.map(_.swap) // k-v 翻转
.sortByKey(false) // 倒序排序
.map(_.swap)
.take(5) // 取前5个
.foreach(println)
// 加强版 自定义排序的值
.sortBy(_._2, ascending = false) // 按元组中的第二位进行倒序排序
8. join操作 将value连接
// 按照相同的Key,将不同RDD的Value联合在一起
val kv1: RDD[(String, Int)] = sc.parallelize(List(
("zhangsan", 11),
("zhangsan", 12),
("lisi", 13),
("wangwu", 14)
))
val kv2: RDD[(String, Int)] = sc.parallelize(List(
("zhangsan", 21),
("zhangsan", 22),
("lisi", 23),
("zhaoliu", 28)
))
// join 取交集连接
val join: RDD[(String, (Int, Int))] = kv1.join(kv2)
join.foreach(println)
/*
(zhangsan,(11,21))
(zhangsan,(11,22))
(zhangsan,(12,21))
(zhangsan,(12,22))
(lisi,(13,23))
*/
// 左外连接
val left: RDD[(String, (Int, Option[Int]))] = kv1.leftOuterJoin(kv2)
left.foreach(println)
// 右外连接
val right: RDD[(String, (Option[Int], Int))] = kv1.rightOuterJoin(kv2)
right.foreach(println)
// 全连接
val full: RDD[(String, (Option[Int], Option[Int]))] = kv1.fullOuterJoin(kv2)
full.foreach(println)
// 原理
cogroup(...).flatMapValues(...)
this.cogroup(other, partitioner).flatMapValues {
case (vs, Seq()) => vs.iterator.map(v => (Some(v), None))
case (Seq(), ws) => ws.iterator.map(w => (None, Some(w)))
case (vs, ws) => for (v <- vs.iterator; w <- ws.iterator) yield (Some(v), Some(w))
}
9. cogroup操作
// 对多个RDD将相同的Key进行合并
// 代码
val rddData1 = sc.parallelize(Array(("Alice", 19), ("Tom", 24), ("Chauncy", 24)))
val rddData2 = sc.parallelize(Array(("Alice", "女"), ("Tom", "男"), ("Chauncy", "男")))
val rddData3 = sc.parallelize(Array(("Alice", "美国"), ("Tom", "加拿大"), ("Chauncy", "中国")))
// rddData1.join(rddData2).foreach(println)
rddData1.cogroup(rddData2,rddData3).mapValues(
values => (values._1.toList.head,values._2.toList(0),values._3.toList(0))
).foreach(println)
10. mapValues操作
// 对每一个k-v数据的value做映射
val kv2: RDD[(String, Int)] = sc.parallelize(List(
("zhangsan", 21),
("zhangsan", 22),
("lisi", 23),
("zhaoliu", 28)
))
kv2.mapValues(_ * 2).foreach(println)
// 例子
val data: RDD[(String, Int)] = sc.parallelize(List(
("zhangsan", 234),
("zhangsan", 5667),
("zhangsan", 343),
("lisi", 212),
("lisi", 44),
("lisi", 33),
("wangwu", 535),
("wangwu", 22)
))
val group: RDD[(String, Iterable[Int])] = data.groupByKey()
group.mapValues(
(e: Iterable[Int]) => e.toList.sorted.take(2)
).foreach(println)
11. flatMapValues操作 (参数为迭代器)
// 按value映射,将value拆分,拼接为key,v1 key,v2 ... key,vn
sc.parallelize(Array("hello@world how are you")).map(x => {
val strings = x.split("@")
(strings(0),strings(1))
}).flatMapValues(_.split(" ")).foreach(println)
// 底层调用了flatmap
flatMap { case (k, v) => cleanF(v).map(x => (k, x))}
// 行转列
val data: RDD[(String, Int)] = sc.parallelize(List(
("zhangsan", 234),
("zhangsan", 5667),
("zhangsan", 343),
("lisi", 212),
("lisi", 44),
("lisi", 33),
("wangwu", 535),
("wangwu", 22)
))
val group = data.groupByKey()
group.flatMapValues(_.iterator).foreach(println)
// 取前两个
group.flatMapValues((e: Iterable[Int]) => e.toList.sorted.take(2)).foreach(println)
RDD 行动操作
让RDD运算开始执行,行动操作不会生成新的RDD,而是将RDD中的数据封装输出到Scala类型的实例中,或直接输出到外部存储系统
基础行动操作
1. collect 收集算子
// //收集多个Executor的执行结果到一个driver上,方便查看结果,不然在集群模式下看不到打印信息
// 但是如果数据量大的话容易OOM,可以使用 rdd.take(100).foreach(println)
val rddData = sc.parallelize(1 to 10, 2)
val array: Array[Int] = rddData.collect()
println(array.toList)
2. first操作,返回第一个
// 返回RDD中的第一个元素,不会排序
println(sc.parallelize(1 to 10, 2).first())
3, take操作,范围获取
// 返回RDD中的[0,num)范围的元素
sc.parallelize(1 to 10, 2).take(3).foreach(println)
4. top返回前几个
// 降序排列后返回前几个,返回Array
val array = sc.parallelize(1 to 10, 2).top(3)
println(array.toList)
val array = sc.parallelize(1 to 10, 2).top(3)(Ordering.by(t => t))
println(array.toList)
// k-v排序去前几个
val array = sc.parallelize(
Array(
("alice", 95),
("Chauncy", 99),
("Tom", 59),
("Tomato", 89),
("Patato", 79),
("Banana", 30)
),3).top(3)(Ordering.by(t => t._2))
println(array.toList)
5. takeOrdered排序后返回
// 升序排列后返回前几个,返回Array
val array = sc.parallelize(
Array(
("alice", 95),
("Chauncy", 99),
("Tom", 59),
("Tomato", 89),
("Patato", 79),
("Banana", 30)
), 3)
.takeOrdered(3)(Ordering.by(t => t._2))
println(array.toList)
6. reduce聚合操作
// 聚合RDD中的每一个元素,通过f函数实现
def reduce(f: (T, T) => T):T
// 计算所有科目的总分
println(sc.parallelize(
Array(
("语文", 95),
("化学", 99),
("生物", 59),
("数学", 89),
("英语", 79),
("物理", 30)
), 3)
.reduce(
(t1, t2) => (t1._1 + "_" + t2._1, t1._2 + t2._2)
))
7. aggregate聚合操作
// 对元素进行聚合操作
zeroValue: 设置聚合时的初始值,可以是任意值,也可以是集合,在每个分区中计算一次,在溢写时再计算一次
seqOp: 将值V聚合到类型为U的对象中
combOp: 跨分区合并,汇总数据
val data: RDD[(String, String)] = sc.parallelize(Array(
("chauncy", "接口1"),
("chauncy", "接口2"),
("chauncy", "接口3"),
("mft", "接口3"),
("mft", "接口3"),
("mft", "接口2"),
("yiyun", "接口3"),
("yiyun", "接口2")
))
data.aggregate(ListBuffer[String]("cc"))(
(v1: ListBuffer[String], v2:(String,String)) => v1 += v2._2,
(c1: ListBuffer[String], c2:ListBuffer[String]) => c1 ++= c2
).foreach(println)
8. fold操作
// aggregate聚合操作的简化版
zeroValue: 设置聚合时的初始值,可以是任意值,也可以是集合,在每个分区中计算一次,在溢写时再计算一次
seqOp: 将值V聚合到类型为U的对象中
combOp: 跨分区合并,汇总数据
println(sc.parallelize(1 to 10).fold(2)(_ + _))
9. foreach遍历操作
sc.parallelize(1 to 10).foreach(println)
10. foreachPartition遍历
// 遍历当前RDD中的每一个分区,进而遍历分区中的每一个数据元素
val rddData = sc.parallelize(Array("chauncy","tom","alice","aoto"), 3)
// 查看元素对应的分区
rddData.mapPartitionsWithIndex((pindex,iter) => {
new Iterator[(String,Int)]{
override def hasNext: Boolean = iter.hasNext
override def next(): (String, Int) = (iter.next(),pindex)
}
}).foreach(println)
// 遍历分区中的元素
rddData.foreachPartition(iter => {
while (iter.hasNext){
println(iter.next())
}
})
11. count操作
// 返回RDD中的元素个数
val rddData = sc.parallelize(Array("chauncy","tom","alice","aoto"), 3)
println(rddData.count())
12. Sample取样
// 取样操作
val data = sc.parallelize(1 to 10, 3)
data.sample(true,0.3,222).foreach(println) // 是否重复,比列,种子
键值对行动操作
1. lookup操作
// 根据key查找所有的value
val data: RDD[(String, Int)] = sc.parallelize(List(
("zhangsan", 234),
("zhangsan", 5667),
("zhangsan", 343),
("lisi", 212),
("lisi", 44),
("lisi", 33),
("wangwu", 535),
("wangwu", 22)
))
println(data.lookup("zhangsan"))
2. countByKey统计指定key的个数
val data: RDD[(String, Int)] = sc.parallelize(List(
("zhangsan", 234),
("zhangsan", 5667),
("zhangsan", 343),
("lisi", 212),
("lisi", 44),
("lisi", 33),
("wangwu", 535),
("wangwu", 22)
))
println(data.countByKey())
// 结果
// Map(zhangsan -> 3, wangwu -> 2, lisi -> 3)
数值行动操作
sum,max,min
// rdd.sum rdd.min rdd.max
// 自己的操作
val data = sc.parallelize(List(
("zhangsan", 234),
("zhangsan", 354),
("zhangsan", 564),
("lisi", 756),
("lisi", 246),
("lisi", 357),
("maliu", 257)
))
val sum = data.reduceByKey(_ + _)
val max = data.reduceByKey((ov, nv) => if (ov > nv) ov else nv)
val min = data.reduceByKey((ov, nv) => if (ov < nv) ov else nv)
val count = data.mapValues(e => 1).reduceByKey(_ + _)
val avg = sum.join(count).mapValues(e => e._1 / e._2)
println("sum")
sum.foreach(println)
println("max")
max.foreach(println)
println("min")
min.foreach(println)
println("count")
count.foreach(println)
println("avg")
avg.foreach(println)
println("-------avg------combine-----")
val com: RDD[(String, (Int, Int))] = data.combineByKey(
// 第一个value如何放入到hashMap
(line: Int) => (line, 1),
// 如果有第二个及以上,如何放入到hashMap
((ov: (Int, Int), nv: Int) => (ov._1 + nv /*sum */ , ov._2 + 1 /*count */ )),
// 合并溢写结果
((v1: (Int, Int), v2: (Int, Int)) => (v1._1 + v2._1, v1._2 + v2._2))
)
com.map(e => (e._1,e._2._1/e._2._2)).foreach(println)
com.mapValues(e => e._1/e._2).foreach(println)