package cn.hhb.spark.sql;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import scala.Tuple2;
import java.util.ArrayList;
import java.util.List;
/**
* Created by dell on 2017/7/27.
*/
public class JSONDataSource {
public static void main(String[] args) {
// 创建SparkConf
SparkConf conf = new SparkConf()
.setAppName("JSONDataSource").setMaster("local")
.set("spark.testing.memory", "2147480000");
// 创建javasparkContext
JavaSparkContext sc = new JavaSparkContext(conf);
SQLContext sqlContext = new SQLContext(sc);
// 针对json文件,创建dataframe(针对json文件创建dataframe)
DataFrame studentScoresDF = sqlContext.read().json("c://students.json");
// 针对学生成绩信息的dataframe,注册临时表,查询分数大于80的学生
// (注册临时表,针对临时表执行sql语句)
studentScoresDF.registerTempTable("student_scores");
DataFrame goodStudentScoresDF = sqlContext.sql("select name,score from student_scores where score >= 80");
// (将dataframe转换为rdd,执行transframation操作)
List goodStudentNames = goodStudentScoresDF.javaRDD().map(new Function() {
@Override
public String call(Row row) throws Exception {
return row.getString(0);
}
}).collect();
// 然后针对javaRDD,创建DataFrame
// (针对包含json串的javardd,创建dataframe)
List studentInfoJSONs = new ArrayList();
studentInfoJSONs.add("{\"name\":\"Leo\", \"age\":18}");
studentInfoJSONs.add("{\"name\":\"Marry\", \"age\":17}");
studentInfoJSONs.add("{\"name\":\"Jack\", \"age\":19}");
JavaRDD studentInfoJSONsRDD = sc.parallelize(studentInfoJSONs);
DataFrame studentInfosDF = sqlContext.read().json(studentInfoJSONsRDD);
// 针对学生基本信息DataFrame,注册临时表,然后查询分数大于80分的学生的基本信息
studentInfosDF.registerTempTable("student_infos");
String sql = "select name,age from student_infos where name in(";
for (int i=0; i
sql += "'" + goodStudentNames.get(i) + "'";
if (i < goodStudentNames.size() - 1){
sql += ",";
}
}
sql += ")";
DataFrame goodStudentInfosDF = sqlContext.sql(sql);
// 然后将两份数据的dataframe,转换为javaPairRDD,执行join transframation
// (将dataframe转换为javardd,再map为JavaPairRDD,然后进行join)
JavaPairRDD> goodStudentsRDD =
goodStudentScoresDF.javaRDD().mapToPair(new PairFunction() {
@Override
public Tuple2 call(Row row) throws Exception {
return new Tuple2(
row.getString(0),
Integer.valueOf(String.valueOf(row.getLong(1)))
);
}
}).join(goodStudentInfosDF.javaRDD().mapToPair(new PairFunction() {
@Override
public Tuple2 call(Row row) throws Exception {
return new Tuple2(
row.getString(0),
Integer.valueOf(String.valueOf(row.getLong(1)))
);
}
}));
// 将封装在rdd中的好学生的全部信息,转换为一个javardd的格式
// (将javardd转换为dataframe)
// 就是之前 以编程方式动态指定元数据,将rdd转换为dataframe 的知识点
JavaRDD goodStudentRowsRDD = goodStudentsRDD.map(new Function>, Row>() {
@Override
public Row call(Tuple2> tuple) throws Exception {
return RowFactory.create(tuple._1, tuple._2._1, tuple._2._2);
}
});
// 然后动态构造一份元数据,将javardd转换为dataframe
List structFields = new ArrayList();
structFields.add(DataTypes.createStructField("name", DataTypes.StringType, true));
structFields.add(DataTypes.createStructField("score", DataTypes.IntegerType, true));
structFields.add(DataTypes.createStructField("age", DataTypes.IntegerType, true));
StructType structType = DataTypes.createStructType(structFields);
// 使用动态构造的元数据,将rdd转换为dataframe
DataFrame goodStudentsDF = sqlContext.createDataFrame(goodStudentRowsRDD, structType);
// 将dataframe转换为javardd,然后打印
List rows = goodStudentsDF.javaRDD().collect();
for (Row row : rows){
System.out.println(row);
}
// 将好学生的全部信息保存到一个json文件中去
// (将dataframe中的数据保存到json文件中去)
goodStudentsDF.write().format("json").save("c://goodStudent.json");
sc.close();
}
}