Level 1: Creating an RDD by Parallelizing a Collection
package step1;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

import java.util.Arrays;
import java.util.List;

public class JStudent {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("JStudent");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // (city, score) records for three cities.
        List<Tuple2<String, Integer>> list = Arrays.asList(
                new Tuple2<>("bj", 88), new Tuple2<>("sh", 67), new Tuple2<>("gz", 92),
                new Tuple2<>("bj", 94), new Tuple2<>("sh", 85), new Tuple2<>("gz", 95),
                new Tuple2<>("bj", 72), new Tuple2<>("sh", 69), new Tuple2<>("gz", 98));

        // Parallelize the local collection into a pair RDD.
        JavaPairRDD<String, Integer> rdd = sc.parallelizePairs(list);

        // Sum the scores per city.
        JavaPairRDD<String, Integer> totalScores = rdd.reduceByKey(Integer::sum);

        // Bring the results back to the driver and print them.
        List<Tuple2<String, Integer>> results = totalScores.collect();
        for (Tuple2<String, Integer> result : results) {
            System.out.println(result);
        }

        sc.stop();
    }
}
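Run with the local master, the job prints one total per city: (bj,254), (sh,221), and (gz,285), though reduceByKey makes no ordering guarantee. If you also want to control how the collection is split across workers, parallelizePairs takes an optional slice count. A minimal sketch of that overload (the class name JStudentPartitions and the choice of 3 slices are ours, not part of the exercise):

package step1;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

import java.util.Arrays;
import java.util.List;

public class JStudentPartitions {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("JStudentPartitions");
        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Tuple2<String, Integer>> list = Arrays.asList(
                new Tuple2<>("bj", 88), new Tuple2<>("sh", 67), new Tuple2<>("gz", 92));

        // The second argument is the number of slices (partitions)
        // to cut the collection into.
        JavaPairRDD<String, Integer> rdd = sc.parallelizePairs(list, 3);
        System.out.println(rdd.getNumPartitions()); // prints 3

        sc.stop();
    }
}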
Level 2: Creating an RDD from an External Dataset
package step2;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

import java.util.List;

public class JTeachers {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("JTeachers");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Read the external dataset; textFile accepts a single file,
        // a directory, or a glob pattern.
        String dataFile = "file:///root/step2_files";
        JavaRDD<String> linesRDD = sc.textFile(dataFile);

        // Each line is a comma-separated record with the teacher name in the
        // second field; emit (teacher, 1) for every line.
        JavaPairRDD<String, Integer> teacherCountsRDD = linesRDD
                .mapToPair(line -> {
                    String[] parts = line.split(",");
                    return new Tuple2<>(parts[1], 1);
                });

        // Add up the per-line counts to get each teacher's total.
        JavaPairRDD<String, Integer> aggregatedRDD = teacherCountsRDD.reduceByKey(Integer::sum);

        // Collect and print on the driver.
        List<Tuple2<String, Integer>> results = aggregatedRDD.collect();
        for (Tuple2<String, Integer> result : results) {
            System.out.println(result);
        }

        sc.stop();
    }
}
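The split assumes every line of step2_files is a two-field CSV record such as studentName,teacherName; the real file contents aren't shown in the exercise, so that layout and the sample data below are assumptions. On messy input, parts[1] would throw ArrayIndexOutOfBoundsException for blank or single-field lines, so a defensive variant filters first. A sketch of the same pipeline run against an in-memory stand-in for the file (class name JTeachersSafe and all records are hypothetical):

package step2;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

import java.util.Arrays;

public class JTeachersSafe {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("JTeachersSafe");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // In-memory stand-in for sc.textFile(dataFile), in the assumed
        // studentName,teacherName format, with one deliberately blank line.
        JavaRDD<String> linesRDD = sc.parallelize(Arrays.asList(
                "zhangsan,wangwu", "lisi,wangwu", "", "zhaoliu,qianqi"));

        JavaPairRDD<String, Integer> counts = linesRDD
                .filter(line -> line.contains(","))                      // drop blank/malformed lines
                .mapToPair(line -> new Tuple2<>(line.split(",")[1], 1))  // (teacher, 1)
                .reduceByKey(Integer::sum);

        // Prints (wangwu,2) and (qianqi,1), in no guaranteed order.
        counts.collect().forEach(System.out::println);

        sc.stop();
    }
}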