一、目录
1、Case Class 创建DataFrame
2、Tuple 创建DataFrame
3、JavaBean 创建DataFrame
4、Json 创建DataFrame
5、Row 创建DataFrame
6、set集合 创建DataFrame
7、map集合 创建DataFrame
8、Array数组 创建DataFrame
9、Parquet文件 创建DataFrame
二、数据源
1,张飞,21,北京,80
2,关羽,23,北京,82
3,赵云,20,上海,88
4,刘备,26,上海,83
5,曹操,30,深圳,90.8
三、Maven的依赖
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>2.11.11</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>2.3.3</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.39</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
<version>2.3.3</version>
</dependency>
四、创建DataFrame的方式思路分析及源码解析
方式一:
通过sparksql的隐式转换
import spark.implicits._
rdd.toDF()
rdd.toDF(colNames: String*)
方式二:
通过sparksql的createDataFrame( RDD[T] , schema )
spark.createDataFrame(data, schema)
package blog
import java.util.Properties
import org.apache.log4j.{Level, Logger}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{DataTypes, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
/**
* @author: 余辉
* @blog: https://blog.csdn.net/silentwolfyh
* @create: 2020-01-04 18:16
* @description:
*
**/
/** Immutable student record inferred into a DataFrame schema by Spark's
  * reflection-based encoder (fields become columns in declaration order).
  * Marked `final`: case classes should not be extended.
  */
final case class Stu(id: Int, name: String, age: Int, city: String, score: Double)
object Spark_Create_DataFrame {

  // Silence Spark/Hadoop INFO chatter so the demo output stays readable.
  Logger.getLogger("org").setLevel(Level.WARN)

  def main(args: Array[String]): Unit = {
    // Local SparkSession using all available cores.
    val spark = SparkSession
      .builder()
      .appName("Spark_Create_DataFrame")
      .master("local[*]")
      .getOrCreate()
    try {
      caseClassToDataframe(spark) // 1. case class
      tupleToDataFrame(spark)     // 2. tuple
      javaToDataFrame(spark)      // 3. JavaBean
      jsonToDataFrame(spark)      // 4. JSON file
      rowToDataFrame(spark)       // 5. Row + explicit schema
      setToDataFrame(spark)       // 6. Set collection
      mapToDataFrame(spark)       // 7. Map collection
      arrayToDataFrame(spark)     // 8. Array
      parquetToDataFrame(spark)   // 9. Parquet file
    } finally {
      spark.stop() // always release the local Spark resources
    }
  }

  /**
   * Builds a DataFrame from an RDD of the `Stu` case class.
   * Column names and types are inferred from the case-class fields
   * via the implicit conversions in `spark.implicits`.
   */
  def caseClassToDataframe(spark: SparkSession): Unit = {
    val rdd: RDD[String] = spark.sparkContext.textFile("spark_sql/doc/stu.csv")
    // Parse each CSV line "id,name,age,city,score" into a Stu.
    val data: RDD[Stu] = rdd.map { line =>
      val arr = line.split(",")
      Stu(arr(0).toInt, arr(1), arr(2).toInt, arr(3), arr(4).toDouble)
    }
    import spark.implicits._
    data.toDF().show()
  }

  /**
   * Builds a DataFrame from an RDD of tuples. Column names are supplied
   * explicitly to `toDF` and MUST follow the tuple element order.
   */
  def tupleToDataFrame(spark: SparkSession): Unit = {
    val rdd: RDD[String] = spark.sparkContext.textFile("spark_sql/doc/stu.csv")
    // Tuple element order: (id, name, age, city, score).
    val data: RDD[(Int, String, Int, String, Double)] = rdd.map { line =>
      val arr: Array[String] = line.split(",")
      (arr(0).toInt, arr(1), arr(2).toInt, arr(3), arr(4).toDouble)
    }
    import spark.implicits._
    // BUG FIX: the names previously read ("id", "age", "name", ...), which
    // swapped the labels of the 2nd and 3rd columns; they must match the
    // tuple order above.
    val frame: DataFrame = data.toDF("id", "name", "age", "city", "score")
    frame.printSchema()
    frame.show()
  }

  /**
   * Builds a DataFrame from an RDD of JavaBeans: `createDataFrame(rdd, beanClass)`
   * derives the schema from the bean's getters via reflection.
   */
  def javaToDataFrame(spark: SparkSession): Unit = {
    val rdd: RDD[String] = spark.sparkContext.textFile("spark_sql/doc/stu.csv")
    val data: RDD[StuJava] = rdd.map { line =>
      val arr: Array[String] = line.split(",")
      new StuJava(arr(0).toInt, arr(1), arr(2).toInt, arr(3), arr(4).toDouble)
    }
    val df: DataFrame = spark.createDataFrame(data, classOf[StuJava])
    df.printSchema()
    df.show()
  }

  /**
   * Builds a DataFrame from an RDD[Row] plus an explicit StructType schema.
   *
   * Relevant API shapes:
   * {{{
   * def createDataFrame(rowRDD: RDD[Row], schema: StructType)
   * case class StructType(fields: Array[StructField])
   * case class StructField(
   *   name: String,
   *   dataType: DataType,
   *   nullable: Boolean = true,
   *   metadata: Metadata = Metadata.empty)
   * }}}
   */
  def rowToDataFrame(spark: SparkSession): Unit = {
    val rdd: RDD[String] = spark.sparkContext.textFile("spark_sql/doc/stu.csv")
    val data: RDD[Row] = rdd.map { line =>
      val arr: Array[String] = line.split(",")
      // e.g. 1,张飞,21,北京,80 — Row field types must match the schema below.
      Row(arr(0).toInt, arr(1), arr(2).toInt, arr(3), arr(4).toDouble)
    }
    val schema = StructType(Array(
      StructField("id", DataTypes.IntegerType),
      StructField("name", DataTypes.StringType),
      StructField("age", DataTypes.IntegerType),
      StructField("city", DataTypes.StringType),
      StructField("score", DataTypes.DoubleType)
    ))
    val frame: DataFrame = spark.createDataFrame(data, schema)
    frame.printSchema()
    frame.show()
  }

  /** Reads a JSON file directly into a DataFrame; schema is inferred by Spark. */
  def jsonToDataFrame(spark: SparkSession): Unit = {
    val frame: DataFrame = spark.read.json("spark_sql/doc/student.json")
    frame.printSchema()
    frame.show()
  }

  /** Builds a DataFrame from an RDD of Sets; each Set becomes one array column. */
  def setToDataFrame(spark: SparkSession): Unit = {
    import spark.implicits._
    val set1: Set[Int] = Set(1, 2, 3)
    val set2: Set[Int] = Set(11, 21, 31)
    val df1: RDD[Set[Int]] = spark.sparkContext.parallelize(List(set1, set2))
    val frame: DataFrame = df1.toDF("value")
    frame.printSchema()
    // Elements are addressed positionally once the Set is encoded as an array.
    frame.selectExpr("value[0]", "value[1]", "value[2]").show()
  }

  /** Builds a DataFrame from an RDD of Maps; each Map becomes one map column. */
  def mapToDataFrame(spark: SparkSession): Unit = {
    import spark.implicits._
    val map1: Map[String, String] = Map("key1" -> "value1")
    val map2: Map[String, String] = Map("key2" -> "value2")
    val rdd1: RDD[Map[String, String]] = spark.sparkContext.parallelize(List(map1, map2))
    val frame1: DataFrame = rdd1.toDF("keyvalue")
    // Demonstrates the built-in map functions on the encoded column.
    frame1.selectExpr("keyvalue['key1']", "size(keyvalue)", "map_keys(keyvalue)", "map_values(keyvalue)").show()
  }

  /** Builds a DataFrame from an RDD of Arrays; each Array becomes one array column. */
  def arrayToDataFrame(spark: SparkSession): Unit = {
    import spark.implicits._
    val arr = Array(1, 2, 3, 4, 5)
    val rdd21: RDD[Array[Int]] = spark.sparkContext.parallelize(List(arr))
    val frame2: DataFrame = rdd21.toDF("value")
    frame2.selectExpr("value[0]", "value[1]", "value[2]").show()
  }

  /** Reads a Parquet directory into a DataFrame; the schema is stored in the files. */
  def parquetToDataFrame(spark: SparkSession): Unit = {
    // Requires existing Parquet output at this path.
    val frame: DataFrame = spark.read.parquet("spark_sql/doc/output")
    frame.printSchema()
    frame.show()
  }
}
JavaBean对象
package blog;
/**
* @author: 余辉
* @blog: https://blog.csdn.net/silentwolfyh
* @create: 2019-12-29 11:13
* @description:
**/
/**
 * Mutable JavaBean describing one student row. Spark derives a DataFrame
 * schema from the getter methods via reflection, so the bean contract
 * (no-arg constructor + getters/setters) must be preserved.
 */
public class StuJava {

    // Bean state: one field per CSV column (id,name,age,city,score).
    int id;
    String name;
    int age;
    String city;
    Double score;

    /** No-arg constructor required by the JavaBean specification. */
    public StuJava() {
    }

    /** Convenience constructor populating every field at once. */
    public StuJava(int id, String name, int age, String city, Double score) {
        this.id = id;
        this.name = name;
        this.age = age;
        this.city = city;
        this.score = score;
    }

    // --- accessors / mutators -------------------------------------------

    public int getId() {
        return id;
    }

    public void setId(int id) {
        this.id = id;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public int getAge() {
        return age;
    }

    public void setAge(int age) {
        this.age = age;
    }

    public String getCity() {
        return city;
    }

    public void setCity(String city) {
        this.city = city;
    }

    public Double getScore() {
        return score;
    }

    public void setScore(Double score) {
        this.score = score;
    }

    /** Debug representation; format kept identical to the original implementation. */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder("StuJava{");
        sb.append("id=").append(id);
        sb.append(", name='").append(name).append('\'');
        sb.append(", age=").append(age);
        sb.append(", city='").append(city).append('\'');
        sb.append(", score=").append(score);
        sb.append('}');
        return sb.toString();
    }
}