// Bootstrap: build a local SparkSession and wrap its SparkContext in the
// legacy Java API object needed for the RDD operations below.
SparkSession sc = SparkSession
.builder()
.master("local[*]")
.appName("testjob")
.getOrCreate();
JavaSparkContext scs = new JavaSparkContext(sc.sparkContext());
// Read the raw CSV lines of the student data set, one String per line.
// NOTE(review): hard-coded absolute Windows path — consider making this a
// program argument or classpath resource so the job runs on other machines.
JavaRDD<String> sturow = scs.textFile("E:\\sparkgbase_onhibe\\src\\main\\resources\\student.txt");
// Parse each CSV line into a Kudu_Student record.
// Fix: the original lambda parsed every column and then returned null, so
// the resulting RDD — and the DataFrame built from it — held only nulls.
// Also splits each row once instead of once per column (8x previously).
JavaRDD<Kudu_Student> stumap = sturow.map(row -> {
    String[] cols = row.split(",", -1); // limit -1 keeps trailing empty fields
    Integer id = Integer.parseInt(cols[0]);
    String name = cols[1];
    Integer age = Integer.parseInt(cols[2]);
    String ads = cols[3];
    String birthday = cols[4];
    String hobday = cols[5];
    String gzads = cols[6];
    Double source = Double.parseDouble(cols[7]);
    // NOTE(review): assumes Kudu_Student exposes an all-args constructor in
    // declaration order (id, name, age, ads, birthday, hobday, gzads, source)
    // — confirm against the class; if it is a no-arg JavaBean, use setters.
    return new Kudu_Student(id, name, age, ads, birthday, hobday, gzads, source);
});
// Expose the parsed students to Spark SQL as a temp view and preview it,
// then run the grouped aggregation twice via UNION ALL (each group therefore
// appears twice in the output) and collapse the result to one partition.
// NOTE(review): the fourth column is an unnamed count(1) — verify downstream
// consumers index it positionally, as the flatMapToPair below does.
Dataset<Row> studf = sc.createDataFrame(stumap, Kudu_Student.class);
studf.createOrReplaceTempView("kudu_student");
sc.sql("select * from kudu_student").show();
String unionSql =
        "select ads,gzads,birthday,count(1)" +
        " from kudu_student" +
        " group by " +
        " ads,gzads,birthday" +
        " union all " +
        " select ads,gzads,birthday,count(1) " +
        " from kudu_student " +
        " group by " +
        " ads,gzads,birthday";
JavaRDD<Row> stuRowRDD = sc.sql(unionSql).toJavaRDD().coalesce(1);
// Key every SQL result row as "ads|gzads|birthday|count" with a count of 1.
// Exactly one pair is emitted per row, so mapToPair replaces the original
// one-element-list flatMapToPair — same output pairs, simpler shape.
// NOTE(review): the key includes the count(1) column (row.get(3)), so rows
// only merge when both the group AND its count match — confirm intended.
JavaPairRDD<String, Integer> stukvrdd = stuRowRDD.mapToPair(row ->
        new Tuple2<>(row.get(0) + "|" + row.get(1) + "|" + row.get(2) + "|" + row.get(3), 1));
// Sum the per-key 1-counts (the UNION ALL above makes each key appear twice,
// so totals come out as 2 per key), then print each key with its grouped
// values, and finally the number of entries per key via countByKey.
JavaPairRDD<String, Integer> keyTotals = stukvrdd.reduceByKey(Integer::sum);
JavaPairRDD<String, Iterable<Integer>> groupedTotals = keyTotals.groupByKey();
groupedTotals.foreach(entry -> {
    System.out.println("key:" + entry._1);
    // after reduceByKey each Iterable holds a single summed value
    for (Integer total : entry._2) {
        System.out.println(total);
    }
});
Map<String, Long> stringLongMap = keyTotals.countByKey();
System.out.println(stringLongMap);
// Teardown: release the JavaSparkContext wrapper, then stop the session.
scs.close();
sc.close();
// JavaSpark template
// (web-scrape residue: original Chinese blog metadata read "JavaSpark模板 /
//  latest recommended article published 2024-07-25 10:44:22" — not code)