第1关:集合并行化创建RDD
代码
package step1;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;
import java.util.*;
public class JStudent {
    /**
     * Exercise 1: create an RDD by parallelizing an in-memory collection.
     *
     * Builds a pair RDD from a local list of (city, score) tuples, sums the
     * scores per city with {@code reduceByKey}, and prints each (city, total)
     * pair to stdout. Runs on a local Spark master; no external input.
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("JStudent");
        JavaSparkContext sc = new JavaSparkContext(conf);
        // Diamond operator instead of raw Tuple2 — keeps the list type-safe.
        List<Tuple2<String, Integer>> list = Arrays.asList(
                new Tuple2<>("bj", 88), new Tuple2<>("sh", 67), new Tuple2<>("gz", 92),
                new Tuple2<>("bj", 94), new Tuple2<>("sh", 85), new Tuple2<>("gz", 95),
                new Tuple2<>("bj", 72), new Tuple2<>("sh", 69), new Tuple2<>("gz", 98));
        /********** Begin **********/
        // Step 1: parallelize the local collection into a pair RDD.
        JavaPairRDD<String, Integer> listRDD = sc.parallelizePairs(list);
        // Step 2: sum the values sharing the same key, then collect the
        // aggregated pairs back to the driver.
        List<Tuple2<String, Integer>> result = listRDD.reduceByKey(Integer::sum).collect();
        // Step 3: print each (city, total) pair.
        for (Tuple2<String, Integer> s : result) {
            System.out.println(s);
        }
        /********** End **********/
        sc.stop();
    }
}
第2关:读取外部数据集创建RDD
代码
package step2;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;
import java.util.Arrays;
import java.util.List;
public class JTeachers {
    /**
     * Exercise 2: create an RDD from an external dataset.
     *
     * Reads lines from an external text file, extracts the second
     * comma-separated field of each line, counts occurrences per field value
     * with {@code reduceByKey}, and prints each (value, count) pair.
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("JTeachers");
        JavaSparkContext sc = new JavaSparkContext(conf);
        String dataFile = "file:///root/step2_files";
        /********** Begin **********/
        // Step 1: create an RDD from the external file. Use the declared
        // dataFile variable instead of duplicating the path literal.
        JavaRDD<String> rdd = sc.textFile(dataFile);
        // Step 2: split each line on "," and map the second field to (field, 1).
        // NOTE(review): assumes every line has at least two comma-separated
        // fields — a malformed line would throw ArrayIndexOutOfBoundsException.
        JavaPairRDD<String, Integer> listRDD = rdd.mapToPair(
                line -> new Tuple2<>(line.split(",")[1], 1));
        // Step 3: sum the counts for identical keys.
        // Step 4: collect the aggregated pairs back to the driver.
        List<Tuple2<String, Integer>> result = listRDD.reduceByKey(Integer::sum).collect();
        // Step 5: print each (value, count) pair.
        for (Tuple2<String, Integer> s : result) {
            System.out.println(s);
        }
        /********** End **********/
        sc.stop();
    }
}