DataFrame: both reading data and executing a query return a DataFrame. Conceptually it is an RDD of Row objects together with schema information that records each column's data type. A Row object is just a wrapper around an array of basic-type values.
Row object: represents one record in a DataFrame; essentially a fixed-length array of fields.
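To make the Row concept concrete, here is a minimal sketch of creating and reading a Row by hand (assuming the Spark SQL RowFactory/Row Java API; the class name RowDemo is only illustrative):
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
public class RowDemo {
    public static void main(String[] args) {
        // A Row is just an ordered, fixed-length sequence of field values
        Row row = RowFactory.create("Michael", 29);
        System.out.println(row.getString(0)); // field 0: name
        System.out.println(row.getInt(1));    // field 1: age
    }
}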
Two ways Spark converts an RDD into a DataFrame
1. Specifying the schema via reflection
The metadata of the RDD's elements is inferred through reflection. A single RDD record carries no metadata by itself: a Person, for example, has fields such as name and id, but the raw record knows nothing about them. A DataFrame, however, must know this schema behind the scenes; reflection is used to discover that metadata, and the RDD can then be converted into a DataFrame.
When the column names are known in advance: define a bean class and let Spark infer the schema from it by reflection.
Create a people.txt file under the project:
null Michael
30 Andy
19 Justin
Then create a Person bean class.
Requirements for the bean class:
1. All fields are private
2. Provide a default (no-argument) constructor
3. Provide getters and setters
4. Implement the Serializable interface
package org.xtwy.sql;

import java.io.Serializable;

public class Person implements Serializable {
    private static final long serialVersionUID = 1L;

    private int age;
    private String name;

    // Default (no-argument) constructor, as required by the bean conventions above
    public Person() {
    }

    public Person(int age, String name) {
        super();
        this.age = age;
        this.name = name;
    }

    public int getAge() {
        return age;
    }

    public void setAge(int age) {
        this.age = age;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }
}
- Java code:
SparkConf conf = new SparkConf();
conf.setAppName("RDD2DataFrameReflection").setMaster("local");
JavaSparkContext sc = new JavaSparkContext(conf);
// Create the SQLContext
SQLContext sqlContext = new SQLContext(sc);
JavaRDD<Person> personRDD = sc.textFile("hdfs://hadoop1:9000/examples/src/main/resources/people.txt")
        .map(new Function<String, Person>() {
            public Person call(String line) throws Exception {
                String[] strs = line.split(",");
                String name = strs[0];
                // Integer.parseInt: converts a String to an int; trim() removes leading/trailing spaces
                int age = Integer.parseInt(strs[1].trim());
                Person person = new Person(age, name);
                return person;
            }
        });
// Create the DataFrame via reflection: Spark discovers Person's fields through reflection
// and combines them with the RDD itself to produce a DataFrame
DataFrame personDF = sqlContext.createDataFrame(personRDD, Person.class);
// Register a temporary table
personDF.registerTempTable("person");
DataFrame resultperson = sqlContext.sql("select name,age from person where age > 13 and age <= 19");
// Convert the SQL result back to an RDD, then foreach() over it
resultperson.javaRDD().foreach(new VoidFunction<Row>() {
    private static final long serialVersionUID = 1L;
    public void call(Row row) throws Exception {
        // Each record is a Row: row(0) = name, row(1) = age
        System.out.println("name: " + row.getString(0));
        System.out.println("age: " + row.getInt(1));
    }
});
resultperson.javaRDD().saveAsTextFile("hdfs://hadoop1:9000/reflectionresult");
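As a side note, the same query can also be written with the DataFrame API instead of registering a temporary table and using SQL. This is just a sketch that reuses personDF from the code above (Spark 1.x DataFrame API):
// Equivalent query expressed with the DataFrame API instead of SQL
DataFrame teenagers = personDF
        .filter(personDF.col("age").gt(13).and(personDF.col("age").leq(19)))
        .select("name", "age");
// Print the schema inferred from the Person bean, then the matching rows
teenagers.printSchema();
teenagers.show();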
2. Specifying the schema programmatically
The column names of the data are not known in advance; they are obtained dynamically, for example from a database (one possible approach is sketched below).
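As one illustrative way to obtain the column names dynamically, the schema string could be built from JDBC metadata. This is only a sketch under the assumption that the columns live in a relational database table; the helper method name, JDBC URL, and table name are hypothetical:
// Hypothetical helper: build a space-separated schema string (e.g. "name age") from a table's JDBC metadata
public static String loadSchemaString(String jdbcUrl, String table) throws Exception {
    java.sql.Connection conn = java.sql.DriverManager.getConnection(jdbcUrl);
    try {
        // Query zero rows: we only need the result set's metadata
        java.sql.ResultSet rs = conn.createStatement()
                .executeQuery("SELECT * FROM " + table + " WHERE 1 = 0");
        java.sql.ResultSetMetaData md = rs.getMetaData();
        StringBuilder sb = new StringBuilder();
        for (int i = 1; i <= md.getColumnCount(); i++) {
            if (i > 1) {
                sb.append(" ");
            }
            sb.append(md.getColumnName(i));
        }
        return sb.toString();
    } finally {
        conn.close();
    }
}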
package org.xtwy.sql;

import java.util.ArrayList;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class RDD2DataFrameProgrammactically {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setAppName("RDD2DataFrameProgrammactically");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);
        JavaRDD<String> personRDD = sc.textFile("hdfs://hadoop1:9000/examples/src/main/resources/people.txt");
        /**
         * The column names are obtained dynamically, e.g. from a database;
         * in real development extra code would be written to fetch them.
         */
        String schemaString = "name age";
        // Create the schema: one nullable StringType field per column name
        ArrayList<StructField> list = new ArrayList<StructField>();
        for (String str : schemaString.split(" ")) {
            list.add(DataTypes.createStructField(str, DataTypes.StringType, true));
        }
        StructType schema = DataTypes.createStructType(list);
        /**
         * The RDD of lines must be converted into a JavaRDD<Row>
         */
        JavaRDD<Row> rowRDD = personRDD.map(new Function<String, Row>() {
            public Row call(String line) throws Exception {
                String[] fields = line.split(",");
                return RowFactory.create(fields[0], fields[1].trim());
            }
        });
        DataFrame personDF = sqlContext.createDataFrame(rowRDD, schema);
        personDF.registerTempTable("person");
        DataFrame resultperson = sqlContext.sql("select name,age from person where age > 13 and age <= 19");
        resultperson.javaRDD().foreach(new VoidFunction<Row>() {
            private static final long serialVersionUID = 1L;
            public void call(Row row) throws Exception {
                // Each record is a Row: row(0) = name, row(1) = age
                System.out.println("name: " + row.getString(0));
                // age was declared as StringType in the schema above, so read it back as a String
                System.out.println("age: " + row.getString(1));
            }
        });
        resultperson.javaRDD().saveAsTextFile("hdfs://hadoop1:9000/reflectionresult");
    }
}
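For reference only: from Spark 2.x onward, SQLContext and DataFrame are superseded by SparkSession and Dataset<Row>. A minimal sketch of the same programmatic-schema step under that newer API (rowRDD and schema built exactly as above; this is an assumption-laden sketch, not part of the original example):
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
SparkSession spark = SparkSession.builder().appName("RDD2DataFrameProgrammactically").getOrCreate();
// createDataFrame accepts the same JavaRDD<Row> plus StructType pair
Dataset<Row> personDF = spark.createDataFrame(rowRDD, schema);
personDF.createOrReplaceTempView("person");
Dataset<Row> resultperson = spark.sql("select name,age from person where age > 13 and age <= 19");
resultperson.show();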