先准备好测试数据和相关的pojo类
Person.java pojo文件
/**
 * JavaBean used as the row schema for {@code SparkSession.createDataFrame}.
 *
 * <p>Spark's bean encoder discovers columns via getter/setter reflection, so this
 * class must follow the JavaBean convention: a public no-arg constructor plus
 * getters and setters for every field. The original version had none of these,
 * and also lacked the (String, long) constructor that the example code
 * ({@code new Person("name", 12)}) requires.
 */
public class Person implements Serializable {
    // Explicit serialVersionUID so serialized instances stay compatible across edits.
    private static final long serialVersionUID = 1L;

    private String name;
    private long age;

    /** No-arg constructor required by the JavaBean specification (and Spark's encoder). */
    public Person() {
    }

    /**
     * Convenience constructor matching the tutorial's usage.
     *
     * @param name person name
     * @param age  person age in years
     */
    public Person(String name, long age) {
        this.name = name;
        this.age = age;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public long getAge() {
        return age;
    }

    public void setAge(long age) {
        this.age = age;
    }
}
person.json文件内容
{"name":"Michael", "age":29}
{"name":"Andy", "age":30}
{"name":"Justin", "age":19}
代码示例
import org.apache.spark.sql.Row;
/**
 * Spark SQL walkthrough: builds a local SparkSession, creates Datasets from a
 * range / a Java List / a JSON file, runs a SQL query through a temp view, and
 * lists catalog metadata.
 */
public static void main( String[] args )
{
// Fix: "${system:user.dir}" is Hive-style substitution and is NOT expanded by Java,
// so the original produced a literal (invalid) warehouse path. Resolve it in Java.
String warehouseLocation = "file://" + System.getProperty("user.dir") + "/sparktest/spark-warehouse";
SparkSession sparkSession = SparkSession.builder()
.appName("sparktest_1")
.config("spark.sql.warehouse.dir", warehouseLocation)
.enableHiveSupport()
.master("local")
.getOrCreate();
// Quiet Spark's verbose INFO logging for readable console output.
sparkSession.sparkContext().setLogLevel("ERROR");
//1: 设置spark运行时的配置
//set new runtime options
sparkSession.conf().set("spark.sql.shuffle.partitions", 6);
sparkSession.conf().set("spark.executor.memory", "2g");
//2: 创建Dataset
// 用spark.range创建一个Dataset: values 5, 10, ..., 95 (start inclusive, end exclusive, step 5)
Dataset<Long> numDS = sparkSession.range(5, 100, 5);
// 对numDS按照id逆序排序并取出前五个
numDS.orderBy(numDS.col("id").desc()).show(5,false);
// 用spark.createDataFrame从List中创建一个DataFrame
// (relies on Person being a proper JavaBean with getters/setters)
Dataset<Row> personDF =
sparkSession.createDataFrame(Arrays.asList(new Person("name", 12), new Person("name2", 13)), Person.class);
personDF.printSchema();
personDF.orderBy(personDF.col("name").desc()).show(false);
// 从json文件中读取数据来创建DataFrame
// Fix: the tutorial's data file is person.json in the working directory; the original
// pointed at file:///people.json (filesystem root, wrong name).
personDF = sparkSession.read().json("file://" + System.getProperty("user.dir") + "/person.json");
// Fix: Datasets are immutable — filter() returns a NEW Dataset; the original discarded it,
// making the line a no-op. Keep the filtered result so the temp view reflects it.
personDF = personDF.filter(personDF.col("age").gt(21));
//3: sql查询api
personDF.createOrReplaceTempView("people");
// Cache before querying: the view is read twice (SQL below + any reuse).
personDF.cache();
Dataset<Row> resultsDF = sparkSession.sql("SELECT * FROM people");
resultsDF.show(10,false);
//4: 访问catalog元数据
sparkSession.catalog().listDatabases().show(false);
sparkSession.catalog().listTables().show(false);
// Release the local Spark context and its resources.
sparkSession.stop();
}