查询学生成绩在80之上的学生信息

Java:

package cn.spark.sql;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.catalyst.expressions.In;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import scala.Tuple2;

import java.util.ArrayList;
import java.util.List;

/**
 * Spark SQL JSON data source example.
 *
 * Loads student scores from a JSON file, selects the students whose score is
 * above 80, looks up their basic info (name, age) from an in-memory JSON
 * dataset, joins both by name and writes (name, score, age) out as JSON.
 */
public class JSONDataSource {

    /**
     * Maps a Row whose column 0 is a string key and column 1 is a long value
     * to a (key, value-as-Integer) pair. JSON integers are read back as longs,
     * hence the narrowing conversion.
     */
    private static final class NameToIntColumn implements PairFunction<Row, String, Integer> {
        @Override
        public Tuple2<String, Integer> call(Row row) throws Exception {
            return new Tuple2<String, Integer>(row.getString(0),
                    Integer.valueOf(String.valueOf(row.getLong(1))));
        }
    }

    /**
     * Builds the query selecting name and age of the given students from the
     * student_infos temp table.
     *
     * @param names student names to look up; may be empty
     * @return a syntactically valid SQL statement (matches no rows when names is empty)
     */
    private static String buildStudentInfoQuery(List<String> names) {
        // An empty "in()" list is not valid SQL, so fall back to a predicate that matches nothing.
        if (names.isEmpty()) {
            return "select name,age from student_infos where 1 = 0";
        }
        // NOTE(review): names are embedded verbatim; a name containing a quote would break the query.
        StringBuilder sql = new StringBuilder("select name,age from student_infos where name in(");
        for (int i = 0; i < names.size(); i++) {
            // Separator goes between items only, never after the last one.
            if (i > 0) {
                sql.append(",");
            }
            sql.append("'").append(names.get(i)).append("'");
        }
        return sql.append(")").toString();
    }

    public static void main(String[] args){
        SparkConf conf = new SparkConf().setAppName("JSONDataSource");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);

        // Create a DataFrame from the JSON file of student scores.
        // NOTE(review): this path uses "spark-study" while the output path uses "spark_study" - confirm which is right.
        DataFrame studentScoreDF = sqlContext.read().json(
                "hdfs://master:9000/spark-study/students.json"
        );

        // Register the scores as a temp table and select the students scoring above 80.
        // The table name must match the one used in the query below.
        studentScoreDF.registerTempTable("student_score");
        DataFrame goodStudentScoresDF = sqlContext.sql("select name,score from student_score where score>80");

        // Collect the names of the good students to the driver.
        List<String> goodStudentNames = goodStudentScoresDF.javaRDD().map(new Function<Row, String>() {
            @Override
            public String call(Row row) throws Exception {
                return row.getString(0);
            }
        }).collect();

        // Create a DataFrame of basic student info from an RDD of JSON strings.
        List<String> studentInfoJSONs = new ArrayList<String>();
        studentInfoJSONs.add("{\"name\":\"leo\",\"age\":18}");
        studentInfoJSONs.add("{\"name\":\"Marry\",\"age\":17}");
        studentInfoJSONs.add("{\"name\":\"Jack\",\"age\":19}");
        JavaRDD<String> studentInfoJSONsRDD = sc.parallelize(studentInfoJSONs);
        DataFrame studentInfosDF = sqlContext.read().json(studentInfoJSONsRDD);

        // Register the basic info as a temp table and query the info of the good students.
        studentInfosDF.registerTempTable("student_infos");
        DataFrame goodStudentInfosDF = sqlContext.sql(buildStudentInfoQuery(goodStudentNames));

        // Join (name, score) from the score query with (name, age) from the info query.
        JavaPairRDD<String, Integer> nameToScore =
                goodStudentScoresDF.javaRDD().mapToPair(new NameToIntColumn());
        JavaPairRDD<String, Integer> nameToAge =
                goodStudentInfosDF.javaRDD().mapToPair(new NameToIntColumn());
        JavaPairRDD<String, Tuple2<Integer, Integer>> goodStudentRDD = nameToScore.join(nameToAge);

        // Flatten each joined record into a Row of (name, score, age).
        JavaRDD<Row> goodStudentRowsRDD = goodStudentRDD.map(new Function<Tuple2<String, Tuple2<Integer, Integer>>, Row>() {
            @Override
            public Row call(Tuple2<String, Tuple2<Integer, Integer>> joined) throws Exception {
                return RowFactory.create(joined._1, joined._2._1, joined._2._2);
            }
        });

        // Describe the row layout; score and age are Integers in the rows above,
        // so the schema must declare them as IntegerType.
        List<StructField> structFields = new ArrayList<StructField>();
        structFields.add(DataTypes.createStructField("name", DataTypes.StringType, true));
        structFields.add(DataTypes.createStructField("score", DataTypes.IntegerType, true));
        structFields.add(DataTypes.createStructField("age", DataTypes.IntegerType, true));
        StructType structType = DataTypes.createStructType(structFields);

        DataFrame goodStudentsDF = sqlContext.createDataFrame(goodStudentRowsRDD, structType);

        // Save the complete good-student records (name, score, age) as JSON.
        goodStudentsDF.write().format("json").save("hdfs://master:9000/spark_study/java/good-students");
    }

}

Scala:

package cn.spark.sql

import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Spark SQL JSON data source example.
 *
 * Loads student scores from a JSON file, selects the students whose score is
 * above 80, looks up their basic info (name, age) from an in-memory JSON
 * dataset, joins both by name and writes (name, score, age) out as JSON.
 */
object JSONDataSource {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("JSONDataSource")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // Load the student scores and expose them as a temp table.
    // registerTempTable returns Unit, so there is nothing useful to bind here.
    sqlContext.read.json("hdfs://master:9000/spark_study/students.json").registerTempTable("student_score")

    // Select name and score of the students scoring above 80, and collect their names.
    val goodStudentScoresDF = sqlContext.sql("select name,score from student_score where score>80")
    val goodStudentNames = goodStudentScoresDF.rdd.map(row => row(0)).collect()

    // Build the basic student info dataset from JSON strings and expose it as a temp table.
    val studentInfoJSONs = Array("{\"name\":\"Leo\", \"age\":18}",
      "{\"name\":\"Marry\", \"age\":17}",
      "{\"name\":\"Jack\", \"age\":19}")
    val studentInfoJSONsRDD = sc.parallelize(studentInfoJSONs, 3)
    sqlContext.read.json(studentInfoJSONsRDD).registerTempTable("student_infos")

    // Query the basic info of the good students.
    // An empty "in()" list is not valid SQL, so fall back to a predicate that matches nothing.
    // NOTE(review): names are embedded verbatim; a name containing a quote would break the query.
    val nameFilter =
      if (goodStudentNames.isEmpty) "1 = 0"
      else goodStudentNames.map(name => s"'$name'").mkString("name in(", ",", ")")
    val goodStudentInfosDF = sqlContext.sql(s"select name,age from student_infos where $nameFilter")

    // Join (name, score) from the score query with (name, age) from the info query.
    // The score column only exists in goodStudentScoresDF, so it must be read from there.
    val nameToScore = goodStudentScoresDF.rdd.map { row =>
      (row.getAs[String]("name"), row.getAs[Long]("score"))
    }
    val nameToAge = goodStudentInfosDF.rdd.map { row =>
      (row.getAs[String]("name"), row.getAs[Long]("age"))
    }
    val goodStudentRDD = nameToScore.join(nameToAge)

    // Flatten each joined record into a Row of (name, score, age) matching the schema below.
    val goodStudentRowsRDD = goodStudentRDD.map {
      case (name, (score, age)) => Row(name, score.toInt, age.toInt)
    }
    val schema = StructType(Array(
      StructField("name", StringType, true),
      StructField("score", IntegerType, true),
      StructField("age", IntegerType, true)))

    val goodStudentDF = sqlContext.createDataFrame(goodStudentRowsRDD, schema)

    // Save the complete good-student records as JSON.
    goodStudentDF.write.format("json").save("hdfs://master:9000/root/spark_study/scala/good-students-scala")
  }
}
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值