1. 运行环境
本文使用Spark SQL 2.1.0版本
2. 使用代码
例如有个对象,除了包含简单的基本数据String,int之外还包含一个Location对象,就是所说的嵌套对象:
Java代码:
import java.io.Serializable;
/**
 * JavaBean carrying a person's basic fields plus a nested {@link Location},
 * used to demonstrate how Spark SQL infers a schema for nested objects.
 * Spark's bean encoder requires the no-arg constructor and getter/setter pairs.
 */
public class Person implements Serializable {

    private static final long serialVersionUID = 1L;

    // Simple scalar properties.
    private String name;
    private int age;
    // Nested bean property — shows up as a struct column in the schema.
    private Location location;

    /** @return the person's name */
    public String getName() {
        return this.name;
    }

    /** @param value the name to store */
    public void setName(String value) {
        this.name = value;
    }

    /** @return the person's age in years */
    public int getAge() {
        return this.age;
    }

    /** @param value the age to store */
    public void setAge(int value) {
        this.age = value;
    }

    /** @return the nested location object (may be null if never set) */
    public Location getLocation() {
        return this.location;
    }

    /** @param value the nested location to store */
    public void setLocation(Location value) {
        this.location = value;
    }
}
/**
 * JavaBean for the nested location of a {@code Person}; becomes a
 * struct-typed column ({@code city}, {@code country}) in the Spark SQL schema.
 * Spark's bean encoder requires the no-arg constructor and getter/setter pairs.
 */
public class Location implements Serializable {

    private static final long serialVersionUID = 1L;

    private String city;
    private String country;

    /** @return the city name */
    public String getCity() {
        return this.city;
    }

    /** @param value the city name to store */
    public void setCity(String value) {
        this.city = value;
    }

    /** @return the country name */
    public String getCountry() {
        return this.country;
    }

    /** @param value the country name to store */
    public void setCountry(String value) {
        this.country = value;
    }
}
Spark SQL代码:
// 初始化 spark session
SparkSession spark = SparkSession
.builder()
.appName("Java Spark SQL Schema test")
.master("local[*]")
.getOrCreate();
// 创建Person的JavaRDD
JavaRDD<Person> peopleRDD = spark
.read()
.textFile("examples/src/main/resources/people.txt")
.javaRDD().map(line -> {
String[] parts = line.split(",");
Person person = new Person();
person.setName(parts[0]);
person.setAge(Integer.parseInt(parts[1].trim()));
Location location = new Location();
location.setCity(parts[2