1. JSON Data Sources
Spark SQL can automatically infer the schema of a JSON file, load its data, and create a DataFrame from it.
The SQLContext.read.json() method can be used against either an RDD whose elements are JSON strings or a path to a JSON file.
- Note: the JSON files used here are not conventional JSON files. Every line must contain exactly one self-contained, valid JSON object; a single JSON object must not span multiple lines, otherwise an error is thrown.
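The student.json file used in the example below is not reproduced in these notes. As an illustration only, a file in the expected one-object-per-line format might look like this (the specific names and scores are assumptions):
{"name":"ning","score":95}
{"name":"ji","score":82}
{"name":"eason","score":60}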
2. A more involved example: query the basic information and scores of students whose score is 80 or higher
- Java version
package pz.spark.study.sql;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import scala.Tuple2;
import java.util.ArrayList;
import java.util.List;
/**
 * JSON data source
 * Example: query the basic information and scores of students whose score is 80 or higher
 */
public class JSONDataSource {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setAppName("JSONDataSource").setMaster("local");
JavaSparkContext sc = new JavaSparkContext(conf);
SQLContext sqlContext = new SQLContext(sc);
// Create a DataFrame from the JSON file
Dataset<Row> goodstudentScoresDF = sqlContext.read().json("/Users/pengzhe/IDEA_workspace/RDD_test/student.json");
// Register the student-score DataFrame as a temporary table and query the names and scores of students scoring 80 or higher
// (register a temporary table, then run SQL against it)
goodstudentScoresDF.registerTempTable("student_scores");
Dataset<Row> goodStudentNamesDF = sqlContext.sql("select name, score from student_scores where score >= 80");
// Convert the DataFrame to an RDD and apply a transformation to collect the qualifying names
// (a DataFrame will be created from a JavaRDD of JSON strings in the next step)
List<String> goodStudentNames = goodStudentNamesDF.javaRDD().map(new Function<Row, String>() {
    @Override
    public String call(Row row) throws Exception {
        return row.getString(0);
    }
}).collect();
// Next, create a DataFrame from a JavaRDD<String> of JSON strings
List<String> studentInfoJSONs = new ArrayList<>();
studentInfoJSONs.add("{\"name\":\"ning\",\"age\":18}");
studentInfoJSONs.add("{\"name\":\"ji\",\"age\":17}");
studentInfoJSONs.add("{\"name\":\"eason\",\"age\":19}");
JavaRDD<String> studentInfoJSONsRDD = sc.parallelize(studentInfoJSONs);
Dataset<Row> studentInfoDF = sqlContext.read().json(studentInfoJSONsRDD);
// Register the student-info DataFrame as a temporary table, then query the basic information of the students scoring 80 or higher
studentInfoDF.registerTempTable("student_infos");
String sql = "select name,age from student_infos where name in (";
for (int i = 0; i < goodStudentNames.size(); i++) {
    sql += "'" + goodStudentNames.get(i) + "'";
    if (i < goodStudentNames.size() - 1) {
        sql += ",";
    }
}
sql += ")";
Dataset<Row> goodStudentInfosDF = sqlContext.sql(sql);
// Convert both DataFrames to JavaPairRDDs and perform a join transformation
// (convert each DataFrame to a JavaRDD, map it into a JavaPairRDD, then join)
JavaPairRDD<String, Tuple2<Integer, Integer>> goodStudentsRDD =
goodstudentScoresDF.javaRDD().mapToPair(new PairFunction<Row, String, Integer>() {
@Override
public Tuple2<String, Integer> call(Row row) throws Exception {
// the score column is inferred from JSON as a Long, so narrow it to an int here
return new Tuple2<String, Integer>(row.getString(0), (int) row.getLong(1));
}
}).join(goodStudentInfosDF.javaRDD().mapToPair(new PairFunction<Row, String, Integer>() {
@Override
public Tuple2<String, Integer> call(Row row) throws Exception {
// the age column is likewise inferred as a Long
return new Tuple2<String, Integer>(row.getString(0), (int) row.getLong(1));
}
}));
// Then convert the complete good-student information wrapped in the RDD into a JavaRDD<Row>
// (so the JavaRDD can be converted back into a DataFrame)
JavaRDD<Row> goodStudentRowsRDD = goodStudentsRDD.map(new Function<Tuple2<String, Tuple2<Integer, Integer>>, Row>() {
@Override
public Row call(Tuple2<String, Tuple2<Integer, Integer>> student) throws Exception {
    // student._1 = name, student._2._1 = score, student._2._2 = age
    return RowFactory.create(student._1, student._2._1, student._2._2);
}
});
// Create the schema and use it to convert the JavaRDD<Row> into a DataFrame
List<StructField> structFields = new ArrayList<StructField>();
structFields.add(DataTypes.createStructField("name",DataTypes.StringType,true));
structFields.add(DataTypes.createStructField("score",DataTypes.IntegerType,true));
structFields.add(DataTypes.createStructField("age",DataTypes.IntegerType,true));
StructType structType = DataTypes.createStructType(structFields);
Dataset<Row> goodStudentsDF = sqlContext.createDataFrame(goodStudentRowsRDD, structType);
// Save the complete good-student information to a JSON file
// (writing the data in a DataFrame out to an external JSON file)
goodStudentsDF.write().format("json").save("./good_students");
}
}
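To sanity-check the output of the Java example, the directory written by save() can be read back through the same JSON data source. This is a minimal sketch, assuming it is appended at the end of main() above and that ./good_students was produced by the previous run:
// Read the JSON part files written by save() and inspect schema and contents
Dataset<Row> savedGoodStudentsDF = sqlContext.read().json("./good_students");
savedGoodStudentsDF.printSchema();
savedGoodStudentsDF.show();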
- Scala version (note: the first attempt failed with a type error; the cause was the getAs("name") call in the join missing its [String] type parameter, annotated in the code below)
package pz.spark.study.sql
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
object JSONDataSource_scala {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("JSONDataSource_scala").setMaster("local")
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)
// Create the student-score DataFrame
val studentScoresDF = sqlContext.read.json("./student.json")
// Query the names and scores of students with score >= 80
studentScoresDF.registerTempTable("student_score")
val goodStudentScoresDF = sqlContext.sql("select name,score from student_score where score>=80")
val goodStudentNames = goodStudentScoresDF.rdd.map(row => row(0)).collect()
// Create the student basic-info DataFrame
val studentInfosJSONs = Array(
"{\"name\":\"ning\",\"age\":18}",
"{\"name\":\"ji\",\"age\":17}",
"{\"name\":\"eason\",\"age\":19}")
val studentInfoJSONsRDD = sc.parallelize(studentInfosJSONs, 3)
val studentInfosDF = sqlContext.read.json(studentInfoJSONsRDD)
// Query the basic information of the students scoring 80 or higher
studentInfosDF.registerTempTable("student_infos")
var sql = "select name,age from student_infos where name in ("
for(i <- 0 until goodStudentNames.length){
sql += "'" + goodStudentNames(i) + "'"
if(i < goodStudentNames.length -1){
sql += ","
}
}
sql += ")"
val goodStudentInfosDF = sqlContext.sql(sql)
// Join the score information and basic information of the students scoring 80 or higher
// getAs needs an explicit type parameter: the original untyped getAs("name") left the join key type as Nothing, which caused the reported type error
val goodStudentsRDD = goodStudentScoresDF.rdd.map(row => (row.getAs[String]("name"), row.getAs[Long]("score")))
.join(goodStudentInfosDF.rdd.map(row => (row.getAs[String]("name"), row.getAs[Long]("age"))))
// Convert the RDD back into a DataFrame
val goodStudentsRowsRDD = goodStudentsRDD.map(info => Row(info._1,info._2._1.toInt,info._2._2.toInt))
val structType = StructType(Array(
StructField("name",StringType,true),
StructField("score",IntegerType,true),
StructField("age",IntegerType,true)))
val goodStudentsDF = sqlContext.createDataFrame(goodStudentsRowsRDD,structType)
// Save the DataFrame contents as JSON
goodStudentsDF.write.format("json").save("./student_json_scala")
}
}
This article is a set of study notes for the 北风网 Spark 2.0 training videos.
Video link:
https://www.bilibili.com/video/av19995678?p=111