import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.storage.StorageLevel;
import scala.Tuple2;
import java.util.*;
/**
 * Monthly textbook-usage report job.
 *
 * <p>Reads a CSV export, truncates the timestamp column (index 9) from
 * "yyyy-MM-dd HH:mm:ss" down to "yyyyMM" so every aggregation is per month,
 * then computes three per-key sums over the composite key built from
 * columns 1-4 and 6: a row count ("people"), the sum of column 11 ("times"),
 * and the sum of column 10 ("learn times"). Only the learn-times summary and
 * the normalized input rows are printed, matching the original behavior.
 */
public class baobiao {

    /**
     * Entry point.
     *
     * @param args optional; args[0] overrides the input CSV path
     *             (defaults to the original hard-coded location)
     */
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("dup")
                .master("local[4]")
                .getOrCreate();

        // Generalized: the path can now be supplied on the command line;
        // with no arguments the job behaves exactly as before.
        String inputPath = args.length > 0
                ? args[0]
                : "/Users/yangyang/Desktop/textbook_exercise_use_report.csv";

        // Cached because three independent aggregations re-read these rows.
        JavaRDD<String> input = spark.sparkContext()
                .textFile(inputPath, 1)
                .toJavaRDD()
                .map(baobiao::normalizeRow)
                .persist(StorageLevel.MEMORY_ONLY());

        // -1 means "count one per row" (head count); 11 and 10 are the
        // numeric columns summed per key.
        JavaPairRDD<String, Integer> allPeople     = sumByKey(input, -1);
        JavaPairRDD<String, Integer> allTimes      = sumByKey(input, 11);
        JavaPairRDD<String, Integer> allLearnTimes = sumByKey(input, 10);

        // NOTE(review): the first two lists were collected but never printed in
        // the original; kept so the same Spark jobs still run — confirm whether
        // they should be reported or dropped.
        List<Tuple2<String, Integer>> allPeopleList     = allPeople.collect();
        List<Tuple2<String, Integer>> allTimesList      = allTimes.collect();
        List<Tuple2<String, Integer>> allLearnTimesList = allLearnTimes.collect();

        for (Tuple2<String, Integer> entry : allLearnTimesList) {
            System.out.println(entry._1() + "\t" + entry._2());
        }
        input.foreach(line -> System.out.println(line));

        input.unpersist();
        spark.close();
    }

    /**
     * Rebuilds one CSV row with column 9 truncated to "yyyyMM" (the day part
     * of the date is dropped so later grouping is monthly).
     *
     * <p>NOTE(review): a plain split(",") assumes no field contains an
     * embedded comma — confirm against the export format.
     *
     * @param line one raw CSV row
     * @return the row re-joined with a trailing comma, as the original emitted
     */
    private static String normalizeRow(String line) {
        String[] fields = line.split(",");
        String[] datePieces = fields[9].split(" ")[0].split("-");
        StringBuilder month = new StringBuilder();
        // Keep all date pieces except the last ("dd") -> "yyyyMM".
        for (int i = 0; i < datePieces.length - 1; i++) {
            month.append(datePieces[i]);
        }
        fields[9] = month.toString();
        // The original appended "," after every field, so the rebuilt row
        // carries a trailing comma; preserve that exactly.
        return String.join(",", fields) + ",";
    }

    /**
     * Sums one value per row, grouped by the composite key made of columns
     * 1-4 and 6 (tab-separated).
     *
     * @param rows     normalized CSV rows
     * @param valueCol index of the integer column to sum, or any negative
     *                 value to count 1 per row
     * @return pair RDD mapping composite key to its per-key sum
     */
    private static JavaPairRDD<String, Integer> sumByKey(JavaRDD<String> rows, int valueCol) {
        return rows
                .mapToPair(line -> {
                    String[] f = line.split(",");
                    String key = String.join("\t", f[1], f[2], f[3], f[4], f[6]);
                    int value = valueCol < 0 ? 1 : Integer.parseInt(f[valueCol]);
                    return new Tuple2<>(key, value);
                })
                // reduceByKey combines partially on the map side; the original
                // groupByKey + manual loop shuffled every raw value across the
                // network before summing.
                .reduceByKey(Integer::sum);
    }
}
// Note: developed with IntelliJ IDEA on macOS; Alt+/ shows a method's return type (a personal keymap setting, written down here as a reminder).