// Test data and the Person POJO used by this example are described at:
// https://blog.csdn.net/qq_41712271/article/details/107812188
// Static import of the col() helper used by the DataFrame DSL below.
import static org.apache.spark.sql.functions.col;
/**
 * Walks through the core Spark SQL Java API on a small people.json file:
 * creating DataFrames and typed Datasets, converting between them,
 * inspecting schemas, running SQL on a temp view, the untyped DataFrame
 * DSL, and the typed functional (map/groupByKey) API.
 *
 * <p>Expects {@code file:///e:/people.json} to exist locally; the expected
 * console output for each step is shown in the inline comments.
 */
public static void main(String[] args) {
    // 0: SparkSession is the unified entry point for Spark SQL.
    SparkSession spark = SparkSession
            .builder()
            .config("spark.driver.host", "localhost")
            .appName("SparkSqlDabbler")
            .master("local")
            .getOrCreate();
    String jsonDataPath = "file:///e:/people.json";

    // 1: Create an untyped DataFrame (Dataset<Row>) and a typed Dataset<Person>.
    Dataset<Row> dataFrame = spark.read().json(jsonDataPath);
    Dataset<Person> dataset = spark.read().json(jsonDataPath).as(Encoders.bean(Person.class));

    // 2: Convert a DataFrame into a typed Dataset via a bean encoder.
    Dataset<Person> datasetFromDF = dataFrame.as(Encoders.bean(Person.class));

    // 3: Inspect the schema inferred from the JSON data.
    dataFrame.schema();
    dataFrame.printSchema();
    dataset.schema();
    dataset.printSchema();

    // 4: Register a temp view and query it with SQL.
    // NOTE: both registrations use the same view name, so the second
    // (dataset-backed) call replaces the first — the SQL below runs
    // against the dataset's view.
    dataFrame.createOrReplaceTempView("people");
    dataset.createOrReplaceTempView("people");
    Dataset<Row> sqlDF = spark.sql("select age, count(*) from people where age > 21 group by age");
    sqlDF.show();

    // 5: The untyped DataFrame DSL.
    // Display the content of the DataFrame on stdout:
    dataFrame.show();
    // +----+-------+
    // | age|   name|
    // +----+-------+
    // |  29|Michael|
    // |  30|   Andy|
    // |  19| Justin|
    // +----+-------+

    // Select only the "name" column:
    dataFrame.select("name").show();
    // +-------+
    // |   name|
    // +-------+
    // |Michael|
    // |   Andy|
    // | Justin|
    // +-------+

    // Select everybody, but increment the age by 1:
    dataFrame.select(col("name"), col("age").plus(1)).show();
    // +-------+---------+
    // |   name|(age + 1)|
    // +-------+---------+
    // |Michael|       30|
    // |   Andy|       31|
    // | Justin|       20|
    // +-------+---------+

    // Select people older than 21:
    dataFrame.filter(col("age").gt(21)).show();
    // +---+----+
    // |age|name|
    // +---+----+
    // | 30|Andy|
    // +---+----+

    // Count people by age:
    dataFrame.groupBy("age").count().show();
    // +----+-----+
    // | age|count|
    // +----+-----+
    // |  19|    1|
    // |  29|    1|
    // |  30|    1|
    // +----+-----+

    // 6: The typed Dataset API.
    dataset.show();
    // +---+----+
    // |age|name|
    // +---+----+
    // | 32|Andy|
    // +---+----+

    // The DSL works on typed Datasets as well:
    Dataset<Row> nameDF = dataset.select("name");
    nameDF.show();
    Dataset<String> nameDS = dataset.select(dataset.col("name")).as(Encoders.STRING());
    nameDS.show();

    // Functional-style (typed) transformations: cap every age at 10,
    // then group the records by name and count per group.
    // The lambda casts pick the MapFunction overloads of map/groupByKey
    // (they are overloaded with Scala function types); the parameterized
    // KeyValueGroupedDataset<String, Person> avoids the raw-type
    // unchecked warning of the original.
    KeyValueGroupedDataset<String, Person> kvgd = dataset
            .map((MapFunction<Person, Person>) person -> {
                if (person.getAge() > 10) {
                    person.setAge(10);
                }
                return person;
            }, Encoders.bean(Person.class))
            .groupByKey((MapFunction<Person, String>) Person::getName, Encoders.STRING());
    kvgd.count().show();

    spark.stop();
}