1. countApproxDistinct
package com.latrobe.spark

import org.apache.spark.{SparkConf, SparkContext}

/**
 * countApproxDistinct: an RDD method that returns an approximate count of
 * the distinct elements in the RDD. The result is an estimate; the
 * relativeSD parameter controls its accuracy: the smaller relativeSD is,
 * the more accurate the result.
 */
object CountApproxDistinct {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("spark-demo").setMaster("local")
    val sc = new SparkContext(conf)

    // Build a dataset of 1 to 10000, split into 20 partitions.
    val a = sc.parallelize(1 to 10000, 20)
    // Concatenate five copies of a, for 50000 elements in total.
    val b = a ++ a ++ a ++ a ++ a

    // Prints 9760; with no argument, relativeSD defaults to 0.05.
    println(b.countApproxDistinct())
    // Prints 9760.
    println(b.countApproxDistinct(0.05))
    // Prints 8224.
    println(b.countApproxDistinct(0.1))
    // Prints 10000.
    println(b.countApproxDistinct(0.001))
  }
}
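To gauge what the approximation trades away, it helps to compare it against an exact count. Below is a minimal sketch under the same local setup; the object name CountApproxVsExact is just for illustration, and distinct().count() gives the exact baseline of 10000.

package com.latrobe.spark

import org.apache.spark.{SparkConf, SparkContext}

/**
 * Sketch comparing countApproxDistinct with the exact distinct().count().
 * Exact counting shuffles all 50000 elements; countApproxDistinct only
 * merges a small sketch per partition, at the cost of bounded error.
 */
object CountApproxVsExact {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("spark-demo").setMaster("local")
    val sc = new SparkContext(conf)

    val a = sc.parallelize(1 to 10000, 20)
    val b = a ++ a ++ a ++ a ++ a

    // Exact: full shuffle, always returns 10000.
    println(b.distinct().count())
    // Approximate: constant memory per partition, result within ~1% here.
    println(b.countApproxDistinct(0.01))

    sc.stop()
  }
}

The trade-off is memory and shuffle cost versus accuracy: on a large, widely partitioned RDD the exact count can be far more expensive than the sketch-based estimate.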
2. countApproxDistinctByKey
Similar to countApproxDistinct, but computes the approximate number of distinct values for each distinct key. The RDD must therefore consist of two-component tuples (key-value pairs). For large RDDs spread across many nodes, this can run faster than other counting methods. The relativeSD parameter controls the accuracy of the computation.
package com.cb.spark.sparkrdd;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class CountApproxDistinctByKeyExample {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("CountApproxDistinctByKey").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Four keys, sampled with replacement 10000 times, so each key repeats often.
        JavaRDD<String> a = sc.parallelize(Arrays.asList("Gnu", "Cat", "Rat", "Dog"), 2);
        JavaRDD<String> b = sc.parallelize(a.takeSample(true, 10000, 0), 20);

        // 10000 unique integers to zip with the sampled keys as values.
        List<Integer> tmp = new ArrayList<>();
        for (int i = 1; i <= 10000; i++) {
            tmp.add(i);
        }
        JavaRDD<Integer> c = sc.parallelize(tmp, 20);
        JavaPairRDD<String, Integer> d = b.zip(c);

        // Per-key approximate distinct-value counts, at decreasing relativeSD
        // (i.e. increasing accuracy).
        d.countApproxDistinctByKey(0.1).foreach(x -> System.out.print(x + " "));
        System.out.println();
        d.countApproxDistinctByKey(0.01).foreach(x -> System.out.print(x + " "));
        System.out.println();
        d.countApproxDistinctByKey(0.001).foreach(x -> System.out.print(x + " "));
        System.out.println();

        sc.stop();
    }
}
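For comparison, the same pipeline is considerably shorter in Scala. A minimal sketch under the same assumptions (local master, fixed sample seed); since the zipped values are all unique, each of the four keys should report roughly 2500 distinct values, and the estimates tighten as relativeSD shrinks.

package com.latrobe.spark

import org.apache.spark.{SparkConf, SparkContext}

/**
 * Scala sketch of the countApproxDistinctByKey example above: four animal
 * names sampled 10000 times as keys, zipped with 10000 unique integers.
 */
object CountApproxDistinctByKeyScala {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("spark-demo").setMaster("local")
    val sc = new SparkContext(conf)

    val a = sc.parallelize(List("Gnu", "Cat", "Rat", "Dog"), 2)
    // Sample with replacement, so each key appears about 2500 times.
    val b = sc.parallelize(a.takeSample(true, 10000, 0), 20)
    val c = sc.parallelize(1 to 10000, 20)
    val d = b.zip(c)

    // collect() brings the per-key estimates to the driver before printing,
    // which keeps the output readable when running locally.
    d.countApproxDistinctByKey(0.1).collect().foreach(println)
    d.countApproxDistinctByKey(0.01).collect().foreach(println)

    sc.stop()
  }
}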
Reference: https://blog.csdn.net/hi_1234567/article/details/42835441