1.sampleByKey
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Demonstrates stratified sampling with `PairRDDFunctions.sampleByKey`:
 * reads lines from "kimi.txt", keys each line by its text with a value of
 * 1 (length == 3) or 2 (otherwise), then draws a per-key sample.
 */
object testVector {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("testVector")
    val sc = new SparkContext(conf)
    try {
      // Key each line by its text; value is 1 when the line has exactly
      // 3 characters, otherwise 2.
      val data = sc.textFile("kimi.txt")
        .map { row =>
          if (row.length == 3) (row, 1)
          else (row, 2)
        }

      // Per-key sampling fractions. With withReplacement = false each
      // fraction must lie in [0, 1] — the original value 2.0 makes Spark's
      // Bernoulli sampler throw IllegalArgumentException. 1.0 keeps every
      // "aa" record, preserving the documented output (aa,2).
      // NOTE(review): sampleByKey expects a fraction for every key present
      // in the RDD — assumes kimi.txt contains only the line "aa"; confirm
      // against the actual input file.
      val fractions: Map[String, Double] = Map("aa" -> 1.0)

      // Approximate stratified sample: one pass, per-key Bernoulli sampling.
      val approxSample = data.sampleByKey(withReplacement = false, fractions, seed = 0L)
      approxSample.foreach(println) // expected program output: (aa,2)
    } finally {
      sc.stop() // release the local SparkContext even if the job fails
    }
  }
}
withReplacement:每次抽样是否放回
fractions:为每个 key(即分层条件)指定对应的采样概率;当 withReplacement 为 false 时,概率值必须在 [0, 1] 区间内,且 RDD 中出现的每个 key 都需要在该 Map 中有对应项。
seed:随机数种子