note 1
这里只是将我学习初期笔记拿来分享,没有做太多精细的推理验证,如有错误,希望指正。
note 2
整个算子系列应用的测试数据是相同的,在本系列第一篇Spark-Operator-Map中有完整的测试数据
note 3
因为工作环境如此,我个人使用Java+Scala混合开发,请悉知
note 4
代码版本
-Spark2.2
-Scala2.11
- 源码
/**
* Return a new RDD containing the distinct elements in this RDD.
*
* Each element is paired with `null` as a placeholder value, the pairs are reduced by key
* into `numPartitions` partitions (the reduce function just keeps one of the placeholder
* values), and finally only the keys are kept.
*
* @param numPartitions partition count handed to `reduceByKey`
* @param ord implicit ordering, defaulting to `null`; it is not referenced in this method body
* @return an RDD holding the keys that survive the `reduceByKey` step
*/
def distinct(numPartitions: Int)(implicit ord: Ordering[T] = null): RDD[T] = withScope {
map(x => (x, null)).reduceByKey((x, y) => x, numPartitions).map(_._1)
}
/**
* Return a new RDD containing the distinct elements in this RDD.
*
* Delegates to `distinct(numPartitions)` using this RDD's current partition count
* (`partitions.length`).
*/
def distinct(): RDD[T] = withScope {
distinct(partitions.length)
}
/**
* Return the union of this RDD and another one. Any identical elements will appear multiple
* times (use `.distinct()` to eliminate them).
*
* Delegates to `sc.union(this, other)`.
*
* @param other the RDD to combine with this one
*/
def union(other: RDD[T]): RDD[T] = withScope {
sc.union(this, other)
}
/**
* Return the union of this RDD and another one. Any identical elements will appear multiple
* times (use `.distinct()` to eliminate them).
*
* Operator alias: simply forwards to `this.union(other)`.
*
* @param other the RDD to combine with this one
*/
def ++(other: RDD[T]): RDD[T] = withScope {
this.union(other)
}
- union 和 ++ 实际上是同一个方法:++ 内部直接调用 this.union(other),只是 union 的运算符别名
union测试代码
/**
 * Demo driver comparing `RDD.++` with `RDD.union` on two reads of the same text file.
 *
 * Prints both inputs, then the result of each operator under its own label, so the
 * two outputs can be compared line by line.
 */
object DistinctAndUnion {
  val ss = SparkSession.builder().appName("1").master("local").getOrCreate()
  val sc = ss.sparkContext
  val rdd = sc.textFile("/home/missingli/IdeaProjects/SparkLearn/src/main/resources/sparkbasic.txt")
  sc.setLogLevel("error")

  /**
   * Builds the two combined RDDs and prints every RDD involved.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val rdd2 = sc.textFile("/home/missingli/IdeaProjects/SparkLearn/src/main/resources/sparkbasic.txt")
    val rdd3 = rdd ++ rdd2
    val rdd4 = rdd.union(rdd2)
    println("rdd")
    rdd.foreach(println)
    println("rdd2")
    rdd2.foreach(println)
    println("rdd3 = rdd1 ++rdd2")
    rdd3.foreach(println)
    println("rdd4 = rdd.union(rdd2)")
    // Print rdd4 here: the original printed rdd3 a second time, so the result of
    // union() was never actually displayed under this label.
    rdd4.foreach(println)
  }
}
打印结果
rdd
1,a,c,b
2,w,gd,h
3,h,r,x
4,6,s,b
5,h,d,o
6,q,w,e
rdd2
1,a,c,b
2,w,gd,h
3,h,r,x
4,6,s,b
5,h,d,o
6,q,w,e
rdd3 = rdd1 ++rdd2
1,a,c,b
2,w,gd,h
3,h,r,x
4,6,s,b
5,h,d,o
6,q,w,e
1,a,c,b
2,w,gd,h
3,h,r,x
4,6,s,b
5,h,d,o
6,q,w,e
rdd4 = rdd.union(rdd2)
1,a,c,b
2,w,gd,h
3,h,r,x
4,6,s,b
5,h,d,o
6,q,w,e
1,a,c,b
2,w,gd,h
3,h,r,x
4,6,s,b
5,h,d,o
6,q,w,e
Distinct测试代码 - 对 String 类型(String 并非基本类型,但已自带基于内容的 equals/hashCode 实现)
/**
 * Demo driver: concatenates two reads of the same text file and prints the lines
 * before and after `distinct()`.
 */
object DistinctAndUnion {
  val ss =
    SparkSession
      .builder()
      .appName("1")
      .master("local")
      .getOrCreate()
  val sc = ss.sparkContext
  val rdd = sc.textFile("/home/missingli/IdeaProjects/SparkLearn/src/main/resources/sparkbasic.txt")
  sc.setLogLevel("error")

  /**
   * Prints the duplicated data set, a separator, then the de-duplicated data set.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val secondRead = sc.textFile("/home/missingli/IdeaProjects/SparkLearn/src/main/resources/sparkbasic.txt")
    val doubled = rdd ++ secondRead

    doubled.foreach(line => println(line))
    println("---------------")

    val unique = doubled.distinct()
    unique.foreach(line => println(line))
  }
}
测试结果
1,a,c,b
2,w,gd,h
3,h,r,x
4,6,s,b
5,h,d,o
6,q,w,e
1,a,c,b
2,w,gd,h
3,h,r,x
4,6,s,b
5,h,d,o
6,q,w,e
---------------
2,w,gd,h
5,h,d,o
6,q,w,e
1,a,c,b
4,6,s,b
3,h,r,x
Distinct测试 - 引用类型 - Collection
/**
 * Demo driver: wraps every line in a one-element `java.util.ArrayList` and checks
 * whether `distinct()` de-duplicates those list objects.
 */
object DistinctAndUnion {
  val ss =
    SparkSession
      .builder()
      .appName("1")
      .master("local")
      .getOrCreate()
  val sc = ss.sparkContext
  val rdd = sc.textFile("/home/missingli/IdeaProjects/SparkLearn/src/main/resources/sparkbasic.txt")
  sc.setLogLevel("error")

  /**
   * Prints the first element of every list before and after `distinct()`.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val secondRead = sc.textFile("/home/missingli/IdeaProjects/SparkLearn/src/main/resources/sparkbasic.txt")

    // Each record becomes a fresh ArrayList holding just that line.
    val wrapped = (rdd ++ secondRead).map { line =>
      val holder = new util.ArrayList[String]()
      holder.add(line)
      holder
    }

    wrapped.map(_.get(0)).foreach(first => println(first))
    println("---------------")
    wrapped.distinct().map(_.get(0)).foreach(first => println(first))
  }
}
结果- 成功去重
1,a,c,b
2,w,gd,h
3,h,r,x
4,6,s,b
5,h,d,o
6,q,w,e
1,a,c,b
2,w,gd,h
3,h,r,x
4,6,s,b
5,h,d,o
6,q,w,e
---------------
4,6,s,b
6,q,w,e
1,a,c,b
3,h,r,x
2,w,gd,h
5,h,d,o
Distinct测试 - 引用类型 - Collection - 补充测试
/**
 * Demo driver: wraps every line in a `java.util.ArrayList` that randomly receives a
 * second element, to see whether `distinct()` compares the full list contents.
 */
object DistinctAndUnion {
  val ss =
    SparkSession
      .builder()
      .appName("1")
      .master("local")
      .getOrCreate()
  val sc = ss.sparkContext
  val rdd = sc.textFile("/home/missingli/IdeaProjects/SparkLearn/src/main/resources/sparkbasic.txt")
  sc.setLogLevel("error")

  /**
   * Prints the first element of every list before and after `distinct()`.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val secondRead = sc.textFile("/home/missingli/IdeaProjects/SparkLearn/src/main/resources/sparkbasic.txt")

    // The list contents are partly random on purpose: this probes whether distinct
    // treats two lists as equal only when their entire contents match.
    // NOTE(review): this RDD is not cached, so presumably the random branch is
    // re-evaluated for each action below — confirm if exact reproducibility matters.
    val maybeTagged = (rdd ++ secondRead).map { line =>
      val holder = new util.ArrayList[String]()
      holder.add(line)
      if (2 * Math.random() > 1) {
        holder.add("limingshun")
      }
      holder
    }

    maybeTagged.map(_.get(0)).foreach(first => println(first))
    println("---------------")
    maybeTagged.distinct().map(_.get(0)).foreach(first => println(first))
  }
}
结果
1,a,c,b
2,w,gd,h
3,h,r,x
4,6,s,b
5,h,d,o
6,q,w,e
1,a,c,b
2,w,gd,h
3,h,r,x
4,6,s,b
5,h,d,o
6,q,w,e
---------------
4,6,s,b
6,q,w,e
3,h,r,x
1,a,c,b
3,h,r,x
5,h,d,o
2,w,gd,h
5,h,d,o
2,w,gd,h
从结果看,distinct 比较的是整个 list 的内容是否完全相同:只有内容完全一致的 list 才会被合并,随机多出一个元素的 list 与原 list 不相等,因此去重后剩下 9 条而不是 6 条
Distinct - 自定义对象测试
//对象
/**
 * Serializable wrapper around a single name string.
 *
 * It deliberately declares no `equals`/`hashCode`, so instances keep the default
 * implementations inherited from `AnyRef`.
 *
 * @param name the wrapped value
 */
class User(val name: String) extends Serializable {

  /** Java-style accessor returning the wrapped name. */
  def getName(): String = name
}

/** Empty companion object for [[User]]. */
object User
/**
 * Demo driver: maps every line to a `User` object and prints the names before and
 * after `distinct()`, to observe how distinct treats a custom class.
 */
object DistinctAndUnion {
  val ss = SparkSession.builder().appName("1").master("local").getOrCreate()
  val sc = ss.sparkContext
  val rdd = sc.textFile("/home/missingli/IdeaProjects/SparkLearn/src/main/resources/sparkbasic.txt")
  sc.setLogLevel("error")

  /**
   * Prints the user names of the duplicated data set, a separator, then the user
   * names that remain after `distinct()`.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val rdd2 = sc.textFile("/home/missingli/IdeaProjects/SparkLearn/src/main/resources/sparkbasic.txt")
    // The unused ArrayList-based RDD left over from the previous experiment was
    // removed: no action was ever run on it, so it had no effect on the output.
    println("====================================")
    val rdd4 = (rdd ++ rdd2).map(r => new User(r))
    rdd4.map(r => r.getName()).foreach(println)
    println("----------------------")
    rdd4.distinct().map(r => r.getName()).foreach(println)
  }
}
结果
====================================
1,a,c,b
2,w,gd,h
3,h,r,x
4,6,s,b
5,h,d,o
6,q,w,e
1,a,c,b
2,w,gd,h
3,h,r,x
4,6,s,b
5,h,d,o
6,q,w,e
----------------------
4,6,s,b
5,h,d,o
6,q,w,e
1,a,c,b
3,h,r,x
3,h,r,x
5,h,d,o
1,a,c,b
2,w,gd,h
2,w,gd,h
6,q,w,e
4,6,s,b
从结果看出,distinct 没有起到去重作用:User 没有重写 equals/hashCode,默认按对象引用比较,每个 new User(...) 都被视为不同的元素,所以 12 条记录全部保留
Distinct - 自定义对象测试 - 增强版本 - 重写 hashCode 和 equals 方法
//对象有如下变化 其他代码不变
/**
 * Serializable wrapper around a single name string whose equality is defined by
 * the name, so that equal names are treated as the same element.
 *
 * @param name the wrapped value
 */
class User(val name: String) extends Serializable {

  /** Java-style accessor returning the wrapped name. */
  def getName(): String = {
    name
  }

  /**
   * Hash code derived from the name only.
   *
   * `String.hashCode` is the same 31-based polynomial the hand-written loop
   * computed (and yields 0 for the empty string), so the values are unchanged.
   */
  override def hashCode(): Int = name.hashCode

  /**
   * Two users are equal when the other object is a `User` with the same name.
   *
   * The trace line is kept on purpose: it shows when equality is consulted.
   * Comparing hash codes alone (the previous implementation) is not a valid
   * equality check: it threw on `null`, and reported `true` for any object whose
   * hash happened to collide, e.g. a plain String or names like "Aa" and "BB".
   *
   * @param obj the object to compare against; may be `null` or of any type
   * @return `true` only for a `User` carrying an equal name
   */
  override def equals(obj: Any): Boolean = {
    println(" here equals ")
    obj match {
      case that: User => this.name == that.name
      case _ => false
    }
  }
}

/** Empty companion object for [[User]]. */
object User
结果
====================================
1,a,c,b
2,w,gd,h
3,h,r,x
4,6,s,b
5,h,d,o
6,q,w,e
1,a,c,b
2,w,gd,h
3,h,r,x
4,6,s,b
5,h,d,o
6,q,w,e
----------------------
here equals
here equals
2,w,gd,h
5,h,d,o
here equals
here equals
here equals
here equals
6,q,w,e
1,a,c,b
4,6,s,b
3,h,r,x
- 结论: distinct 基于 reduceByKey 实现,以元素本身作为 key:hashCode 相同的 key 才会进一步调用 equals 判断是否相同,因此自定义对象必须同时正确重写 hashCode 和 equals 才能被 distinct 去重