package com.lyzx.spark.sql;

public class People implements java.io.Serializable {
    private String name;
    private int age;
    private String code;

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public int getAge() {
        return age;
    }

    public void setAge(int age) {
        this.age = age;
    }

    public String getCode() {
        return code;
    }

    public void setCode(String code) {
        this.code = code;
    }

    @Override
    public String toString() {
        return "People{" +
                "name='" + name + '\'' +
                ", age=" + age +
                ", code='" + code + '\'' +
                '}';
    }

    public People() {}

    public People(String name, int age, String code) {
        this.name = name;
        this.age = age;
        this.code = code;
    }
}
With the entity class above in place, here is the test code:
package com.lyzx.spark.sql;

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.junit.Test;

import java.util.Arrays;
import java.util.Collections;

import static org.apache.spark.sql.functions.col;

public class T1 {

    // SparkSession is the entry point to Spark SQL functionality (new in Spark 2.0)
    private static SparkSession spark;
    private static Dataset<Row> people;

    static {
        spark = SparkSession.builder()
                .master("local[4]")
                .appName("lyzx_hero_sql")
                .getOrCreate();
        spark.sparkContext().setLogLevel("ERROR");
        people = spark.read().json("people.json");
    }
    /**
     * With a SparkSession, applications can create DataFrames from an existing RDD,
     * from a Hive table, or from Spark data sources.
     * As an example, the following creates a DataFrame based on the content of a JSON file.
     * (A bean-based example follows this test.)
     */
    @Test
    public void test1() {
        // Displays the content of the DataFrame to stdout
        people.show();
        System.out.println("==========================");
        people.select(col("name")).show();
    }
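    /**
     * Sketch (not from the original post): the "existing RDD" path mentioned above.
     * createDataFrame accepts a local List of JavaBeans (or a JavaRDD<People>)
     * plus the bean class, and infers the schema from the bean via reflection.
     * The sample row here is illustrative.
     */
    @Test
    public void test1b() {
        Dataset<Row> df = spark.createDataFrame(
                Arrays.asList(new People("lyh7", 28, "4")), People.class);
        df.show();
    }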
    @Test
    public void test2() {
        // print the schema
        people.printSchema();
        System.out.println("=====================");
        // select rows whose age is greater than 23
        people.filter(col("age").gt(23)).show();
        System.out.println("======================");
        people.groupBy(col("code")).avg("age").show();
        System.out.println("======================");
        people.groupBy(col("code")).count().show();
    }
    @Test
    public void test3() {
        // join defaults to an inner join; an explicit join type is shown in the next test
        Dataset<Row> clazz = spark.read().format("json").load("clazz.json");
        people.join(clazz, clazz.col("clazz_code").equalTo(people.col("code"))).show();
    }
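    /**
     * Sketch (not from the original post): Dataset.join also takes an explicit
     * join type as its third argument, e.g. "left_outer", "right_outer", "full_outer".
     * A left outer join keeps people rows whose code has no matching clazz_code.
     */
    @Test
    public void test3b() {
        Dataset<Row> clazz = spark.read().format("json").load("clazz.json");
        people.join(clazz, clazz.col("clazz_code").equalTo(people.col("code")), "left_outer").show();
    }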
    @Test
    public void test4() throws AnalysisException {
        // The sql function on a SparkSession enables applications to run SQL queries
        // programmatically and returns the result as a DataFrame
        people.createTempView("people");
        spark.sql("select * from people where age > 24").show();
        /*
         * A view created with createTempView is session-scoped and is not visible
         * to other sessions. The following line would therefore throw:
         * org.apache.spark.sql.AnalysisException: Table or view not found: people;
         *
         * spark.newSession().sql("select * from people").show();
         */
    }
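    /**
     * Sketch (not from the original post): createOrReplaceTempView replaces an
     * existing view of the same name instead of throwing, so the registration can
     * be repeated within the same session without an AnalysisException.
     */
    @Test
    public void test4b() {
        people.createOrReplaceTempView("people");
        people.createOrReplaceTempView("people");  // second call replaces, does not throw
        spark.sql("select * from people where age > 24").show();
    }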
    /**
     * Temporary views in Spark SQL are session-scoped and will disappear if the session
     * that creates them terminates. If you want a temporary view that is shared among
     * all sessions and kept alive until the Spark application terminates, you can create
     * a global temporary view. Global temporary views are tied to a system-preserved
     * database called global_temp, and must be referred to by the qualified name,
     * e.g. SELECT * FROM global_temp.view1.
     *
     * createOrReplaceGlobalTempView / createGlobalTempView register the view in that
     * pre-created global_temp database.
     *
     * By default Spark resolves table names against the current session; only with the
     * global_temp prefix does it look the view up in that database.
     * @throws AnalysisException
     */
    @Test
    public void test5() throws AnalysisException {
        // people.createOrReplaceGlobalTempView("people");
        people.createGlobalTempView("people");
        spark.sql("select * from global_temp.people where age > 24").show();
        // a new session can see the global view
        spark.newSession().sql("select * from global_temp.people").show();
    }
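    /**
     * Sketch (not from the original post): a global temp view can be replaced and
     * dropped explicitly; dropGlobalTempView returns whether a view was removed.
     */
    @Test
    public void test5b() {
        people.createOrReplaceGlobalTempView("people");
        spark.sql("select count(*) from global_temp.people").show();
        boolean dropped = spark.catalog().dropGlobalTempView("people");
        System.out.println("dropped: " + dropped);
    }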
    /**
     * Datasets are similar to RDDs; however, instead of using Java serialization or Kryo,
     * they use a specialized Encoder to serialize the objects for processing or
     * transmitting over the network. While both encoders and standard serialization are
     * responsible for turning an object into bytes, encoders are code-generated
     * dynamically and use a format that allows Spark to perform many operations like
     * filtering, sorting and hashing without deserializing the bytes back into an object.
     */
    @Test
    public void test6() {
        // Encoders for most common types are provided in the Encoders class
        Encoder<People> bean = Encoders.bean(People.class);
        Dataset<People> ds_people = spark.createDataset(
                Collections.singletonList(new People("lyh", 11, "1")), bean);
        ds_people.show();
    }
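    /**
     * Sketch (not from the original post): encoders also drive typed transformations.
     * Here map() turns each People into its name, producing a Dataset<String> whose
     * output encoder is passed explicitly.
     */
    @Test
    public void test6b() {
        Dataset<People> ds = spark.createDataset(
                Collections.singletonList(new People("lyh", 11, "1")), Encoders.bean(People.class));
        Dataset<String> names = ds.map((MapFunction<People, String>) People::getName, Encoders.STRING());
        names.show();
    }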
    @Test
    public void test7() {
        // for data without a schema, the default column name is "value"
        spark.createDataset(Arrays.asList(1, 2, 3, 4, 5, 6, 7), Encoders.INT())
                .map((MapFunction<Integer, Integer>) x -> x + 1, Encoders.INT())
                .filter(col("value").gt(4))
                .show();

        // DataFrames can be converted to a Dataset by providing a class; mapping is by column name
        System.out.println("========================================");
        Dataset<Row> dataSet_row = spark.read().json("people.json");
        Dataset<People> dataSet_People = dataSet_row.as(Encoders.bean(People.class));
        dataSet_People.show();
    }
}
The data file people.json:
{"name":"lyh1","age":22,"code":"1"}
{"name":"lyh2","age":23,"code":"1"}
{"name":"lyh3","age":24,"code":"2"}
{"name":"lyh4","age":25,"code":"2"}
{"name":"lyh5","age":26,"code":"3"}
{"name":"lyh6","age":27,"code":"3"}
and clazz.json:
{"clazz_code":"1","name":"昊天宗"}
{"clazz_code":"2","name":"七宝琉璃宗"}
{"clazz_code":"4","name":"象甲宗"}