import org.apache.spark.{SparkConf, SparkContext}
/**
* Created by liupeng on 2017/6/16.
*/
object T_sample {
  // Point Spark at a local Hadoop install (needed for winutils on Windows).
  System.setProperty("hadoop.home.dir", "F:\\hadoop-2.6.5")

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("sample_test").setMaster("local")
    val sc = new SparkContext(conf)
    val rddData = sc.parallelize(1 to 1000, 1)

    // sample draws elements at random: the first parameter (withReplacement)
    // allows duplicate elements when true, the second is the fraction of the
    // data to sample, and the third is the random seed.

    // Sample with a fixed seed, so the result is reproducible across runs.
    val result = rddData.sample(false, 0.005, 0)
      .collect()
    for (x <- result) {
      println(x)
    }

    // Sample with a random seed.
    val result1 = rddData.sample(false, 0.005, scala.util.Random.nextInt(1000)).collect()
    for (x <- result1) {
      println(x)
    }
  }
}
Run results:
192
795
826
192
795
826
5
340
963
340
963
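Both calls above sample without replacement, so the first parameter is never exercised. As a minimal sketch of withReplacement = true (the object name and the small 1-to-10 RDD are illustrative, not from the original post), the same element can then be drawn more than once, and the fraction is interpreted as the expected number of draws per element, so it may exceed 1.0:

import org.apache.spark.{SparkConf, SparkContext}

object T_sample_withReplacement {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("sample_dup_test").setMaster("local")
    val sc = new SparkContext(conf)
    val rddData = sc.parallelize(1 to 10, 1)

    // withReplacement = true: the same element may appear several times.
    // With replacement, the fraction is the expected number of times each
    // element is chosen (here about 2 draws per element).
    val result = rddData.sample(true, 2.0, 0).collect()
    for (x <- result) {
      println(x)
    }
    sc.stop()
  }
}

Running this would typically print around 20 numbers drawn from 1 to 10 with visible repeats. If an exact sample size is needed rather than a fraction, takeSample(withReplacement, num, seed) is the companion API.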