一:RDD与DataFrame转换 1. 通过反射的方式来推断RDD元素中的元数据。因为RDD的一条数据本身是没有元数据的,例如Person,而Person有name,id等,而record是不知道这些的,但是变成DataFrame背后一定知道,通过反射的方式就可以了解到背后这些元数据,进而转换成DataFrame。 如何反射? Scala: 通过case class映射,在case class里面声明我们这个RDD里面每个record的不同列的元数据是什么。 Java: 如何描述数据的元数据?构建Java Bean,使用Java Bean构建元数据信息,然后变换成DataFrame,但是此种方法不可以构建DataFrame嵌套类型。 2. 动态获取Schema,我们并不知道RDD的元数据信息,所以只能在运行时动态构建一份具体的元数据,然后将具体的元数据应用在已存在的RDD上。而且这种情况比较常见。 二:代码实战
package com.dt.spark.SparkApps.sql;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
/**
* 使用反射的方式将RDD转换成为DataFrame
Person [id=1, name=Spark, age=7]
Person [id=2, name=Hadoop, age=10]
*/
public class RDDToDataFrameByReflection {
public static void main (String[] args) {
SparkConf conf = new SparkConf().setMaster("local" ).setAppName("RDDToDataFrameByReflection" );
JavaSparkContext sc = new JavaSparkContext(conf);
SQLContext sqlContext = new SQLContext(sc);
JavaRDD<String> lines = sc.textFile("E://persons.txt" );
JavaRDD<Person> persons = lines.map(new Function<String,Person>(){
private static final long serialVersionUID = 1 L;
@Override
public Person call (String line) throws Exception {
String[] splited = line.split("," );
Person p = new Person();
p.setId(Integer.valueOf(splited[0 ].trim()));
p.setName(splited[1 ].trim());
p.setAge(Integer.valueOf(splited[2 ].trim()));
return p;
}
});
DataFrame df = sqlContext.createDataFrame(persons, Person.class);
df.registerTempTable("persons" );
DataFrame bigDatas = sqlContext.sql("select * from persons where age >= 6" );
JavaRDD<Row> bigDataRDD = bigDatas.javaRDD();
JavaRDD<Person> result = bigDataRDD.map(new Function<Row,Person>(){
private static final long serialVersionUID = 1 L;
@Override
public Person call (Row row) throws Exception {
Person p = new Person();
/**
* 由于数据在DataFrame会进行优化,里面会对元数据进行排序
* 顺序可能就不是id name age的顺序了。
*/
p.setId(row.getInt(1 ));
p.setName(row.getString(2 ));
p.setAge(row.getInt(0 ));
return p;
}
});
List<Person> personList = result.collect();
for (Person p : personList){
System.out.println(p);
}
}
}
Person.java源码如下:
package com.dt.spark.SparkApps.sql;
import java.io.Serializable;
/**
 * JavaBean describing one record of the persons dataset.
 *
 * Spark SQL inspects this bean's getters via reflection to derive the
 * DataFrame schema (fields: id, name, age), so it must be a public
 * Serializable class with standard getter/setter pairs and a no-arg
 * constructor.
 */
public class Person implements Serializable {
    private static final long serialVersionUID = 1L;
    private int id;      // record identifier (first CSV column)
    private String name; // person name (second CSV column)
    private int age;     // age in years (third CSV column)

    public int getId() {
        return id;
    }

    public void setId(int id) {
        this.id = id;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public int getAge() {
        return age;
    }

    public void setAge(int age) {
        this.age = age;
    }

    @Override
    public String toString() {
        return "Person [id=" + id + ", name=" + name + ", age=" + age + "]";
    }
}
1.作业: 使用Scala在IDE中实战RDD和DataFrame转换操作
package com.dataguru.xzl.two.com.dt
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
/**
* Created by xzl on 2016/3/16.
*/
/**
 * Converts an RDD to a DataFrame via reflection on a case class, then
 * filters it with SQL and prints the surviving rows.
 *
 * Input file format (one record per line): id,name,age
 */
object RDD2DataFrameByReflection {

  /** Schema carrier: Spark derives column names and types from this case class. */
  case class Person(id: Int, name: String, age: Int)

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("RDD2DataFrameByReflection")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    // Brings the .toDF() conversion for RDDs of case classes into scope.
    import sqlContext.implicits._

    val lines = sc.textFile("d://persons.txt")
    val df = lines.map(_.split(",")).map { fields =>
      // Trim every field: the original trimmed id and age but not name,
      // which would leave stray whitespace in the name column.
      Person(fields(0).trim.toInt, fields(1).trim, fields(2).trim.toInt)
    }.toDF()

    df.registerTempTable("persons")
    val bigDatas = sqlContext.sql("select * from persons where age >= 6")
    // Collect the Scala RDD directly; no need to round-trip through javaRDD
    // from Scala code.
    bigDatas.rdd.collect().foreach(println)
  }
}