package cn.hhb.spark.sql;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import scala.Tuple2;
import java.util.ArrayList;
import java.util.List;
/**
 * Spark SQL example that combines two JSON data sources.
 *
 * <p>Steps performed by {@link #main(String[])}:
 * <ol>
 *   <li>Load student scores from a JSON file and keep the rows with {@code score >= 80}.</li>
 *   <li>Build a second DataFrame from in-memory JSON strings holding each student's age.</li>
 *   <li>Join both data sets by student name using pair RDDs.</li>
 *   <li>Convert the joined RDD back into a DataFrame with a programmatically built schema,
 *       print it, and save it as JSON.</li>
 * </ol>
 *
 * Created by dell on 2017/7/27.
 */
public class JSONDataSource {

    /** Input JSON file used when no path is given; it is queried for {@code name} and {@code score}. */
    private static final String DEFAULT_INPUT_PATH = "c://students.json";

    /** Output location used when no path is given on the command line. */
    private static final String DEFAULT_OUTPUT_PATH = "c://goodStudent.json";

    /**
     * Program entry point.
     *
     * @param args optional; {@code args[0]} overrides the input JSON path and
     *             {@code args[1]} overrides the output path. Missing values fall back
     *             to the original hard-coded locations.
     */
    public static void main(String[] args) {
        String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;
        String outputPath = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT_PATH;

        // Local, single-JVM configuration.
        // NOTE(review): spark.testing.memory is presumably set so the local JVM passes
        // Spark's minimum-memory check -- confirm before changing.
        SparkConf conf = new SparkConf()
                .setAppName("JSONDataSource").setMaster("local")
                .set("spark.testing.memory", "2147480000");

        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            run(sc, inputPath, outputPath);
        } finally {
            // Always release the context, even when a step above fails.
            sc.close();
        }
    }

    /**
     * Runs the whole pipeline on an already created Spark context.
     *
     * @param sc         the Spark context to run on (not closed by this method)
     * @param inputPath  JSON file with the student scores
     * @param outputPath destination for the joined result, written in JSON format
     */
    private static void run(JavaSparkContext sc, String inputPath, String outputPath) {
        SQLContext sqlContext = new SQLContext(sc);

        // Create a DataFrame from the JSON file, register it as a temp table and
        // query the students whose score is at least 80.
        DataFrame studentScoresDF = sqlContext.read().json(inputPath);
        studentScoresDF.registerTempTable("student_scores");
        DataFrame goodStudentScoresDF =
                sqlContext.sql("select name,score from student_scores where score >= 80");

        // DataFrame -> RDD, then a transformation that extracts the student names.
        List<String> goodStudentNames = goodStudentScoresDF.javaRDD().map(new Function<Row, String>() {
            @Override
            public String call(Row row) throws Exception {
                return row.getString(0);
            }
        }).collect();

        // Create a DataFrame from a JavaRDD<String> whose elements are JSON documents.
        List<String> studentInfoJSONs = new ArrayList<String>();
        studentInfoJSONs.add("{\"name\":\"Leo\", \"age\":18}");
        studentInfoJSONs.add("{\"name\":\"Marry\", \"age\":17}");
        studentInfoJSONs.add("{\"name\":\"Jack\", \"age\":19}");
        JavaRDD<String> studentInfoJSONsRDD = sc.parallelize(studentInfoJSONs);
        DataFrame studentInfosDF = sqlContext.read().json(studentInfoJSONsRDD);

        // Keep only the basic info of the good students. The Column API is used instead
        // of concatenating an "IN (...)" SQL string: an empty name list would yield the
        // invalid SQL "in()", and a name containing a quote would corrupt the statement.
        DataFrame goodStudentInfosDF = studentInfosDF
                .filter(studentInfosDF.col("name").isin(goodStudentNames.toArray()))
                .select("name", "age");

        // Convert both DataFrames to pair RDDs keyed by name and join them:
        // name -> (score, age).
        JavaPairRDD<String, Tuple2<Integer, Integer>> goodStudentsRDD =
                goodStudentScoresDF.javaRDD().mapToPair(nameToIntValue())
                        .join(goodStudentInfosDF.javaRDD().mapToPair(nameToIntValue()));

        // Flatten every joined tuple into a Row(name, score, age).
        JavaRDD<Row> goodStudentRowsRDD = goodStudentsRDD.map(
                new Function<Tuple2<String, Tuple2<Integer, Integer>>, Row>() {
                    @Override
                    public Row call(Tuple2<String, Tuple2<Integer, Integer>> tuple) throws Exception {
                        return RowFactory.create(tuple._1, tuple._2._1, tuple._2._2);
                    }
                });

        // Build the schema programmatically and turn the JavaRDD<Row> into a DataFrame.
        List<StructField> structFields = new ArrayList<StructField>();
        structFields.add(DataTypes.createStructField("name", DataTypes.StringType, true));
        structFields.add(DataTypes.createStructField("score", DataTypes.IntegerType, true));
        structFields.add(DataTypes.createStructField("age", DataTypes.IntegerType, true));
        StructType structType = DataTypes.createStructType(structFields);
        DataFrame goodStudentsDF = sqlContext.createDataFrame(goodStudentRowsRDD, structType);

        // Collect the result to the driver and print it.
        List<Row> rows = goodStudentsDF.javaRDD().collect();
        for (Row row : rows) {
            System.out.println(row);
        }

        // Persist the complete information of the good students in JSON format.
        // NOTE(review): no save mode is set; per the Spark docs the default mode errors
        // out when the target already exists -- confirm and clean up before re-running.
        goodStudentsDF.write().format("json").save(outputPath);
    }

    /**
     * Creates the mapping used on both sides of the join: column 0 (a string) becomes
     * the key and column 1 (read as a long) becomes the {@code Integer} value.
     *
     * @return a pair function turning a two-column row into a (name, int value) tuple
     */
    private static PairFunction<Row, String, Integer> nameToIntValue() {
        return new PairFunction<Row, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(Row row) throws Exception {
                return new Tuple2<String, Integer>(row.getString(0), toInt(row.getLong(1)));
            }
        };
    }

    /**
     * Narrows a {@code long} to an {@code int} without silent truncation.
     *
     * @param value the value to narrow
     * @return {@code value} as an {@code int}
     * @throws NumberFormatException if {@code value} is outside the {@code int} range
     *                               (same exception type the previous string round-trip raised)
     */
    private static int toInt(long value) {
        if (value < Integer.MIN_VALUE || value > Integer.MAX_VALUE) {
            throw new NumberFormatException("Value out of int range: " + value);
        }
        return (int) value;
    }
}
// Reposted from: https://my.oschina.net/hehongbo/blog/1490262