Point 1: Java
package com.spark.sparksql.dataframe.java;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import java.util.List;
public class RDD2DataFrameReflection {

    /**
     * Demonstrates converting a JavaRDD of JavaBeans to a DataFrame via
     * reflection, querying it with SQL, and mapping the result rows back to
     * beans. Expects ./data/students.txt with lines of the form "id,name,age".
     *
     * Fix over the original: the JavaSparkContext is now released in a
     * finally block instead of being leaked when the job ends or fails.
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("RDD2DataFrameReflection").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            SQLContext sqlContext = new SQLContext(sc);
            JavaRDD<String> lines = sc.textFile("./data/students.txt");
            JavaRDD<Student> studentRDD = lines.map(new Function<String, Student>() {
                private static final long serialVersionUID = 1L;
                @Override
                public Student call(String line) throws Exception {
                    // Each input line is "id,name,age"; no validation is done here,
                    // so malformed lines will fail the task with a parse exception.
                    String[] lineSplited = line.split(",");
                    Student stu = new Student();
                    stu.setId(Integer.valueOf(lineSplited[0]));
                    stu.setName(lineSplited[1]);
                    stu.setAge(Integer.valueOf(lineSplited[2]));
                    return stu;
                }
            });
            // Convert the RDD to a DataFrame by reflecting over the Student JavaBean.
            DataFrame studentDF = sqlContext.createDataFrame(studentRDD, Student.class);
            studentDF.printSchema();
            // Register a temporary table so the teenagers (age <= 18) can be selected with SQL.
            studentDF.registerTempTable("student");
            DataFrame teenagerDF = sqlContext.sql("select * from student where age <= 18");
            JavaRDD<Row> teenagerRDD = teenagerDF.toJavaRDD();
            JavaRDD<Student> teenagerStudentRDD = teenagerRDD.map(new Function<Row, Student>() {
                private static final long serialVersionUID = 1L;
                @Override
                public Student call(Row row) throws Exception {
                    // When the DataFrame was built via reflection, positional access such as
                    // row.getInt(index) sees the columns in alphabetical order (age, id, name):
                    // int id = row.getInt(1);
                    // String name = row.getString(2);
                    // int age = row.getInt(0);
                    // Fetching by column name avoids any dependence on that ordering.
                    int id = row.getAs("id");
                    int age = row.getAs("age");
                    String name = row.getAs("name");
                    Student stu = new Student();
                    stu.setId(id);
                    stu.setName(name);
                    stu.setAge(age);
                    return stu;
                }
            });
            List<Student> studentList = teenagerStudentRDD.collect();
            for (Student stu : studentList) {
                System.out.println(stu);
            }
        } finally {
            // Always release the SparkContext, even if the job above fails.
            sc.close();
        }
    }
}
package com.spark.sparksql.dataframe.java;
import java.io.Serializable;
/**
 * Serializable JavaBean holding one student record (id, name, age).
 * The getter/setter pairs follow the JavaBean convention, which is what
 * the reflective DataFrame conversion elsewhere in this file relies on.
 */
public class Student implements Serializable {

    // Must stay stable so previously serialized instances still deserialize.
    private static final long serialVersionUID = 6033246342674289568L;

    private int id;
    private String name;
    private int age;

    public int getId() {
        return id;
    }

    public void setId(int id) {
        this.id = id;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public int getAge() {
        return age;
    }

    public void setAge(int age) {
        this.age = age;
    }

    /** Same output as the original concatenation-based implementation. */
    @Override
    public String toString() {
        return String.format("Student [id=%d, name=%s, age=%d]", id, name, age);
    }
}
Point 2: Scala
package com.spark.sparksql.dataframe.scala
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
/** Immutable student record; its fields become the DataFrame column names. */
case class Student(id: Int, name: String, age: Int)
object RDD2DataFrameReflection {

  /**
   * Reads ./data/students.txt ("id,name,age" per line), converts the RDD of
   * case classes to a DataFrame via reflection (toDF()), selects students
   * with age <= 18 by SQL, and demonstrates three ways of reading the
   * result rows back into Student instances.
   *
   * Fix over the original: the SparkContext is now stopped in a finally
   * block instead of being leaked when the job ends or fails.
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("RDD2DataFrameReflection").setMaster("local")
    val sc = new SparkContext(conf)
    try {
      val sqlContext = new SQLContext(sc)
      // The RDD-to-DataFrame conversion via reflection needs these implicits in scope.
      import sqlContext.implicits._
      val students = sc.textFile("./data/students.txt", 1)
        .map { line => line.split(",") }
        .map { array => Student(array(0).trim().toInt, array(1), array(2).trim().toInt) }
      // With the implicits imported, the RDD of case classes converts directly with toDF().
      val studentDF = students.toDF()
      studentDF.registerTempTable("students")
      val teenagerDF = sqlContext.sql("select * from students where age <= 18")
      val teenagerRDD = teenagerDF.rdd
      // 1) Positional access: works because the case class fixes the column order (see above).
      teenagerRDD.map { row => Student(row(0).toString().toInt, row(1).toString(), row(2).toString().toInt) }
        .collect().foreach { stu => println(stu.id + ":" + stu.name + ":" + stu.age) }
      // 2) Access by column name: independent of ordering; Scala's Row API is
      //    richer than the Java one used in the companion example.
      teenagerRDD.map { row => Student(row.getAs[Int]("id"), row.getAs[String]("name"), row.getAs[Int]("age")) }
        .collect().foreach { stu => println(stu.id + ":" + stu.name + ":" + stu.age) }
      // 3) Bulk extraction of the wanted columns into a name -> value map.
      teenagerRDD.map { row =>
        val map = row.getValuesMap[Any](Array("id", "name", "age"))
        Student(map("id").toString().toInt, map("name").toString(), map("age").toString().toInt)
      }
        .collect().foreach { stu => println(stu.id + ":" + stu.name + ":" + stu.age) }
    } finally {
      // Always stop the SparkContext, even if the job above fails.
      sc.stop()
    }
  }
}