package com.lyzx.spark.sql;

public class People implements java.io.Serializable {
    private String name;
    private int age;
    private String code;

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public int getAge() {
        return age;
    }

    public void setAge(int age) {
        this.age = age;
    }

    public String getCode() {
        return code;
    }

    public void setCode(String code) {
        this.code = code;
    }

    @Override
    public String toString() {
        return "People{" +
                "name='" + name + '\'' +
                ", age=" + age +
                ", code='" + code + '\'' +
                '}';
    }

    public People() {}

    public People(String name, int age, String code) {
        this.name = name;
        this.age = age;
        this.code = code;
    }
}
With the entity class above in place, here is the test code:
package com.lyzx.spark.sql;

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.junit.Test;

import java.util.Arrays;
import java.util.Collections;

import static org.apache.spark.sql.functions.col;

public class T1 {

    // SparkSession is the entry point to Spark SQL functionality (new in Spark 2.0)
    private static SparkSession spark;
    private static Dataset<Row> people;

    static {
        spark = SparkSession.builder()
                .master("local[4]")
                .appName("lyzx_hero_sql")
                .getOrCreate();
        spark.sparkContext().setLogLevel("ERROR");
        people = spark.read().json("people.json");
    }
    /**
     * With a SparkSession, applications can create DataFrames from an existing RDD,
     * from a Hive table, or from Spark data sources.
     * As an example, the following creates a DataFrame based on the content of a JSON file.
     * (A bean-based example follows this test.)
     */
    @Test
    public void test1() {
        // Displays the content of the DataFrame to stdout
        people.show();
        System.out.println("==========================");
        people.select(col("name")).show();
    }
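    /**
     * Sketch (not from the original post): the "existing RDD" path mentioned above.
     * createDataFrame accepts a local List of JavaBeans (or a JavaRDD<People>)
     * plus the bean class, and infers the schema from the bean via reflection.
     * The sample row here is illustrative.
     */
    @Test
    public void test1b() {
        Dataset<Row> df = spark.createDataFrame(
                Arrays.asList(new People("lyh7", 28, "4")), People.class);
        df.show();
    }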
    @Test
    public void test2() {
        // print the schema
        people.printSchema();
        System.out.println("=====================");
        // select rows whose age is greater than 23
        people.filter(col("age").gt(23)).show();
        System.out.println("======================");
        people.groupBy(col("code")).avg("age").show();
        System.out.println("======================");
        people.groupBy(col("code")).count().show();
    }
    @Test
    public void test3() {
        // join defaults to an inner join; an explicit join type is shown in the next test
        Dataset<Row> clazz = spark.read().format("json").load("clazz.json");
        people.join(clazz, clazz.col("clazz_code").equalTo(people.col("code"))).show();
    }
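    /**
     * Sketch (not from the original post): Dataset.join also takes an explicit
     * join type as its third argument, e.g. "left_outer", "right_outer", "full_outer".
     * A left outer join keeps people rows whose code has no matching clazz_code.
     */
    @Test
    public void test3b() {
        Dataset<Row> clazz = spark.read().format("json").load("clazz.json");
        people.join(clazz, clazz.col("clazz_code").equalTo(people.col("code")), "left_outer").show();
    }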
    @Test
    public void test4() throws AnalysisException {
        // The sql function on a SparkSession enables applications to run SQL queries
        // programmatically and returns the result as a DataFrame
        people.createTempView("people");
        spark.sql("select * from people where age > 24").show();
        /*
         * A view created with createTempView is session-scoped and is not visible
         * to other sessions. The following line would therefore throw:
         * org.apache.spark.sql.AnalysisException: Table or view not found: people;
         *
         * spark.newSession().sql("select * from people").show();
         */
    }
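    /**
     * Sketch (not from the original post): createOrReplaceTempView replaces an
     * existing view of the same name instead of throwing, so the registration can
     * be repeated within the same session without an AnalysisException.
     */
    @Test
    public void test4b() {
        people.createOrReplaceTempView("people");
        people.createOrReplaceTempView("people");  // second call replaces, does not throw
        spark.sql("select * from people where age > 24").show();
    }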
    /**
     * Temporary views in Spark SQL are session-scoped and will disappear if the session
     * that creates them terminates. If you want a temporary view that is shared among
     * all sessions and kept alive until the Spark application terminates, you can create
     * a global temporary view. Global temporary views are tied to a system-preserved
     * database called global_temp, and must be referred to by the qualified name,
     * e.g. SELECT * FROM global_temp.view1.
     *
     * createOrReplaceGlobalTempView / createGlobalTempView register the view in that
     * pre-created global_temp database.
     *
     * By default Spark resolves table names against the current session; only with the
     * global_temp prefix does it look the view up in that database.
     * @throws AnalysisException
     */
    @Test
    public void test5() throws AnalysisException {
        // people.createOrReplaceGlobalTempView("people");
        people.createGlobalTempView("people");
        spark.sql("select * from global_temp.people where age > 24").show();
        // a new session can see the global view
        spark.newSession().sql("select * from global_temp.people").show();
    }
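    /**
     * Sketch (not from the original post): a global temp view can be replaced and
     * dropped explicitly; dropGlobalTempView returns whether a view was removed.
     */
    @Test
    public void test5b() {
        people.createOrReplaceGlobalTempView("people");
        spark.sql("select count(*) from global_temp.people").show();
        boolean dropped = spark.catalog().dropGlobalTempView("people");
        System.out.println("dropped: " + dropped);
    }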
    /**
     * Datasets are similar to RDDs; however, instead of using Java serialization or Kryo,
     * they use a specialized Encoder to serialize the objects for processing or
     * transmitting over the network. While both encoders and standard serialization are
     * responsible for turning an object into bytes, encoders are code-generated
     * dynamically and use a format that allows Spark to perform many operations like
     * filtering, sorting and hashing without deserializing the bytes back into an object.
     */
    @Test
    public void test6() {
        // Encoders for most common types are provided in the Encoders class
        Encoder<People> bean = Encoders.bean(People.class);
        Dataset<People> ds_people = spark.createDataset(
                Collections.singletonList(new People("lyh", 11, "1")), bean);
        ds_people.show();
    }
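    /**
     * Sketch (not from the original post): encoders also drive typed transformations.
     * Here map() turns each People into its name, producing a Dataset<String> whose
     * output encoder is passed explicitly.
     */
    @Test
    public void test6b() {
        Dataset<People> ds = spark.createDataset(
                Collections.singletonList(new People("lyh", 11, "1")), Encoders.bean(People.class));
        Dataset<String> names = ds.map((MapFunction<People, String>) People::getName, Encoders.STRING());
        names.show();
    }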
    @Test
    public void test7() {
        // for data without a schema, the default column name is "value"
        spark.createDataset(Arrays.asList(1, 2, 3, 4, 5, 6, 7), Encoders.INT())
                .map((MapFunction<Integer, Integer>) x -> x + 1, Encoders.INT())
                .filter(col("value").gt(4))
                .show();

        // DataFrames can be converted to a Dataset by providing a class; mapping is by column name
        System.out.println("========================================");
        Dataset<Row> dataSet_row = spark.read().json("people.json");
        Dataset<People> dataSet_People = dataSet_row.as(Encoders.bean(People.class));
        dataSet_People.show();
    }
}
The data file people.json:
{"name":"lyh1","age":22,"code":"1"}
{"name":"lyh2","age":23,"code":"1"}
{"name":"lyh3","age":24,"code":"2"}
{"name":"lyh4","age":25,"code":"2"}
{"name":"lyh5","age":26,"code":"3"}
{"name":"lyh6","age":27,"code":"3"}
and clazz.json:
{"clazz_code":"1","name":"昊天宗"}
{"clazz_code":"2","name":"七宝琉璃宗"}
{"clazz_code":"4","name":"象甲宗"}