64、Spark SQL之使用编程方式将RDD转换为DataFrame

最新推荐文章于 2023-05-31 08:44:43 发布

ZFH__ZJ

最新推荐文章于 2023-05-31 08:44:43 发布

阅读量1.7k

点赞数

分类专栏： Spark入坑

本文链接：https://blog.csdn.net/ZJ__ZFH/article/details/86578277

版权

Spark入坑专栏收录该内容

207 篇文章 8 订阅

订阅专栏

使用编程方式指定元数据

Java版本：当JavaBean无法预先定义和知道的时候，比如要动态从一个文件中读取数据结构，那么就只能用编程方式动态指定元数据了。首先要从原始RDD创建一个元素为Row的RDD；其次要创建一个StructType，来描述Row的结构（各字段的名称和类型）；最后通过createDataFrame将动态定义的元数据应用到RDD<Row>上。
Scala版本：Scala的实现方式，与Java是基本一样的。
Java版本

/**
 * Converts an RDD into a DataFrame by building the schema programmatically.
 *
 * <p>The input text file is read line by line; each line is split on "," and
 * mapped to a {@code Row} of (int, String, int). A {@code StructType} with the
 * fields id, name and age is then built at runtime and applied to the
 * {@code JavaRDD<Row>} through {@code SQLContext.createDataFrame}.
 */
public class RDD2DataFrameProgrammatically {

    public static void main(String[] args) {
        // Create SparkConf, JavaSparkContext and SQLContext.
        SparkConf conf = new SparkConf().setAppName("RDD2DataFrameProgrammaticallyJava").setMaster("local");
        JavaSparkContext sparkContext = new JavaSparkContext(conf);

        try {
            SQLContext sqlContext = new SQLContext(sparkContext);

            // Step 1: create an ordinary RDD, then convert it into an RDD of Row.
            JavaRDD<String> lines = sparkContext.textFile("E:\\testdata\\sparksql\\students.txt");

            // Each value put into a Row must already have the type that the
            // schema declares for that position, so the numeric columns are parsed here.
            JavaRDD<Row> studentRDD = lines.map(new Function<String, Row>() {
                @Override
                public Row call(String s) throws Exception {
                    String[] strings = s.split(",");
                    return RowFactory.create(Integer.parseInt(strings[0]),
                            strings[1],
                            Integer.parseInt(strings[2]));
                }
            });

            // Step 2: build the metadata (schema) dynamically.
            // Field names and types may only be known at runtime, e.g. loaded
            // from a database or a configuration file, which is exactly the
            // case this programmatic approach is meant for.
            List<StructField> fieldList = new ArrayList<StructField>();
            fieldList.add(DataTypes.createStructField("id", DataTypes.IntegerType, true));
            fieldList.add(DataTypes.createStructField("name", DataTypes.StringType, true));
            fieldList.add(DataTypes.createStructField("age", DataTypes.IntegerType, true));
            StructType structType = DataTypes.createStructType(fieldList);

            // Step 3: apply the dynamically built schema to the RDD to get a DataFrame.
            DataFrame df = sqlContext.createDataFrame(studentRDD, structType);

            // The usual DataFrame operations still apply.
            df.show();
            // Print the schema; the returned StructType of schema() was previously discarded.
            df.printSchema();

            df.select(df.col("id")).show();
            df.select(df.col("name")).show();
            df.select(df.col("age")).show();
            // select() is a lazy transformation: without show() nothing is computed or printed.
            df.select(df.col("id"), df.col("name"), df.col("age").plus(1)).show();
            df.filter(df.col("age").gt(18)).show();
            df.groupBy(df.col("age")).count().show();

            // Register the DataFrame as a temporary table so it can be queried with SQL.
            df.registerTempTable("students");

            DataFrame teenagerDF = sqlContext.sql("select * from students where age < 18");

            teenagerDF.show();

            // Bring the result back to the driver and print one "id,name,age" line per row.
            List<Row> collect = teenagerDF.javaRDD().collect();
            for (Row row : collect) {
                System.out.println(row.getInt(0) + "," + row.getString(1) + "," + row.getInt(2));
            }
        } finally {
            // Always release the Spark context, even when a step above fails.
            sparkContext.stop();
        }
    }
}

Scala版本

/**
 * Converts an RDD into a DataFrame by building the schema programmatically.
 *
 * The input text file is read line by line; each line is split on "," and
 * mapped to a `Row` of (Int, String, Int). A `StructType` with the fields
 * id, name and age is built at runtime and applied to the RDD of rows
 * through `SQLContext.createDataFrame`.
 */
object RDD2DataFrameProgrammatically {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("RDD2DataFrameProgrammaticallyScala")
    val sparkContext = new SparkContext(conf)

    try {
      val sqlContext = new SQLContext(sparkContext)

      // Step 1: build an ordinary RDD whose elements are Rows.
      val lines = sparkContext.textFile("E:\\testdata\\sparksql\\students.txt")
      val students = lines.map { line =>
        val strings = line.split(",")
        // Values must already have the types the schema declares for each position.
        Row(strings(0).toInt, strings(1), strings(2).toInt)
      }

      // Step 2: build the metadata (schema) programmatically.
      val structType = StructType(Array(
        StructField("id", DataTypes.IntegerType, true),
        StructField("name", DataTypes.StringType, true),
        StructField("age", DataTypes.IntegerType, true)
      ))

      // Step 3: convert the RDD into a DataFrame using that schema.
      val df = sqlContext.createDataFrame(students, structType)

      // The usual DataFrame operations still apply.
      df.show()
      // Print the schema; the value of the bare `df.schema` expression was previously discarded.
      df.printSchema()
      df.select(df.col("id")).show()
      df.select(df.col("name")).show()
      df.select(df.col("age")).show()
      // select() is a lazy transformation: without show() nothing is computed or printed.
      df.select(df.col("id"), df.col("name"), df.col("age").plus(1)).show()
      df.filter(df.col("age").gt(18)).show()
      df.groupBy(df.col("age")).count().show()

      // Register the DataFrame as a temporary table so it can be queried with SQL.
      df.registerTempTable("students")
      val teenagerDF = sqlContext.sql("select * from students where age < 18")
      teenagerDF.show()

      // Bring the result back to the driver and print one "id,name,age" line per row.
      val rows = teenagerDF.rdd.collect()
      for (row <- rows) {
        // getAs needs an explicit type argument: without it T is inferred as
        // Nothing and the inserted cast fails at runtime with ClassCastException.
        val id = row.getAs[Int]("id")
        val name = row.getAs[String]("name")
        val age = row.getAs[Int]("age")
        println(s"$id,$name,$age")
      }
      teenagerDF.rdd.foreach(row => println(row))
    } finally {
      // Always release the Spark context, even when a step above fails.
      sparkContext.stop()
    }
  }
}