Another day of studying hard.
1: Creating a DataFrame by reading a JSON file:
package com.bjsxt.scalaspark.sql.DataSetAndDataFrame
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
/**
  * Load a DataFrame by reading a JSON file.
  * Notes:
  * 1. There are two ways to read JSON (both shown below).
  * 2. df.show displays the first 20 rows by default; use df.show(n) to display more.
  * 3. A DataFrame loaded this way sorts its columns by the ASCII order of their names.
  * 4. df.printSchema prints the schema of the columns.
  * 5. Two ways to create a temporary view, and the difference: createOrReplaceTempView | createGlobalTempView
  */
object CreateDataFrameFromJsonFile {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder()
.appName("DataSetFromJsonFile")
.master("local")
.getOrCreate()
/**
  * Import the implicit conversions; the $"..." syntax and toDF below rely on them.
  */
import spark.implicits._
// val df: DataFrame = spark.read.json("./data/json")
val df: DataFrame= spark.read.format("json").load("./data/json")
//Show the DataFrame; the first 20 rows by default
df.show()
//Specify the number of rows to show
df.show(100)
//Print the schema
df.printSchema()
/**
  * Retrieving values from the DataFrame
  */
// val rows1: Array[Row] = df.take(100)
// val row2: Row = df.first()
// val rows3: Array[Row] = df.head(4)
// println(rows1.toBuffer)
// println(row2)
// println(rows3.toBuffer)
/**
  * Using the native DataFrame API
  */
// val frame: DataFrame = df.select(df.col("name"))
// val frame = df.select("name")
// val frame = df.select("name","age")
// val frame = df.filter($"age">18)//people with age over 18
// val frame = df.filter(df.col("name").equalTo("zhangsan"))
// val frame = df.filter("name='zhangsan4' or name = 'zhangsan5'")//expression strings are allowed
// val frame = df.sort($"age".asc,$"name".desc)//age ascending, name descending
// val frame = df.sort(df.col("age").asc,df.col("name").desc)//age ascending, name descending
// val frame = df.select(df.col("name").as("studentName"),df.col("age").alias("studentAge"))//assign aliases
// val frame = df.select($"name".alias("studentName"),$"age")//assign aliases
// val frame = df.select($"name",($"age"+1).as("addAge"))//without the alias, the new column would be named (age + 1)
// val frame = df.groupBy("age").count()//group by age
// frame.show(100)
/**
  * Using SQL
  *
  * createOrReplaceTempView: creates (or replaces) a temporary view
  * createTempView: like the above, but fails if the view name already exists
  * createGlobalTempView: creates a global view; query it as global_temp.<table>,
  * and it can be shared across Spark sessions
  *
  */
// df.createOrReplaceTempView("students")
// df.createGlobalTempView("students")
// val frame: DataFrame = spark.sql("select * from students")
// frame.show()
// import spark._
// sql("select * from students where age > 18 and name = 'zhangsan5'").show(100)
// sql("select * from global_temp.students where name like 'wang%'").show(100)
//
// //Create a new session; only the global view is visible from it
// spark.newSession().sql("SELECT * FROM global_temp.students").show()
// spark.newSession().sql("SELECT * FROM students").show() //fails: a plain temp view is per-session
/**
  * Converting the DataFrame to an RDD
  */
// val rdd: RDD[Row] = df.rdd
// rdd.foreach(row=>{
// //print the whole row
// println(row)
// /**
// * Two ways to get a value out of a Row: 1. getAs("fieldName") 2. getAs(index)
// */
// val name = row.getAs[String]("name")
// val age = row.getAs[Long]("age")
//
// //by index: age is column 0 and name column 1, because the columns are ASCII-sorted
// val name2 = row.getAs[String](1)
// val age2 = row.getAs[Long](0)
// println("name = "+name2+",age = "+age2)
// })
spark.stop()
}
}
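A related trick: if the ASCII column ordering (note 3 above) gets in the way, you can pass an explicit schema instead of letting Spark infer one. A minimal sketch, assuming the same ./data/json file with name/age fields (the object name is made up):
package com.bjsxt.scalaspark.sql.DataSetAndDataFrame
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}
object ReadJsonWithExplicitSchema {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local").appName("readJsonWithExplicitSchema").getOrCreate()
    //An explicit schema skips inference and lets us fix the column order ourselves
    val schema = StructType(List(
      StructField("name", StringType, nullable = true),
      StructField("age", LongType, nullable = true)
    ))
    val df = spark.read.schema(schema).json("./data/json")
    df.printSchema() //columns come out in the declared order, not ASCII order
    df.show()
    spark.stop()
  }
}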
2: Creating a DataFrame from a JSON-formatted Dataset (Spark 2.x deprecated the RDD-based way)
package com.bjsxt.scalaspark.sql.DataSetAndDataFrame
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
/**
  * Convert JSON-formatted data into a DataFrame. Older versions used
  * spark.read.json(jsonRDD: RDD[String]); that approach is deprecated as of 2.3+.
  * In 2.3+ you convert a JSON-formatted Dataset[String] into a DataFrame instead.
  *
  * Notes:
  * 1. The loaded DataFrame automatically sorts its columns in ASCII order.
  * 2. A DataFrame produced by your own SQL keeps the column order you wrote.
  */
object CreateDataFrameFromJsonDataSet {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder().appName("createDFFromJsonRDD").master("local").getOrCreate()
val jsonList = List[String](
"{\"name\":\"zhangsan\",\"age\":20}",
"{\"name\":\"lisi\",\"age\":21}",
"{\"name\":\"wangwu\",\"age\":22}"
)
val jsonList2 = List[String](
"{\"name\":\"zhangsan\",\"score\":100}",
"{\"name\":\"lisi\",\"score\":200}",
"{\"name\":\"wangwu\",\"score\":300}"
)
/**
  * The 1.6-era way
  */
val jsonRDD: RDD[String] = spark.sparkContext.makeRDD(jsonList)
val df1: DataFrame = spark.read.json(jsonRDD)
df1.show(100)
/**
  * The 2.3+ way
  */
import spark.implicits._
val jsonDs: Dataset[String] = jsonList.toDS()
val scoreDs: Dataset[String] = jsonList2.toDS()
val df2: DataFrame = spark.read.json(jsonDs)
val df3 :DataFrame = spark.read.json(scoreDs)
df2.show()
df2.createOrReplaceTempView("person")
df3.createOrReplaceTempView("score")
val frame: DataFrame = spark.sql("select * from person")
frame.show()
spark.sql("select t1.name ,t1.age,t2.score from person t1, score t2 where t1.name = t2.name").show()
}
}
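To make the two notes above concrete: inferred JSON columns come out ASCII-sorted, while a hand-written select keeps your order. A minimal self-contained sketch (the object name is made up):
package com.bjsxt.scalaspark.sql.DataSetAndDataFrame
import org.apache.spark.sql.{Dataset, SparkSession}
object ColumnOrderDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local").appName("columnOrderDemo").getOrCreate()
    import spark.implicits._
    val ds: Dataset[String] = List("{\"name\":\"zhangsan\",\"age\":20}").toDS()
    val df = spark.read.json(ds)
    df.printSchema() //age before name: inferred columns are sorted by name
    df.createOrReplaceTempView("person")
    spark.sql("select name, age from person").printSchema() //name before age: your order wins
    spark.stop()
  }
}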
3: Creating a DataFrame via reflection
package com.bjsxt.scalaspark.sql.DataSetAndDataFrame
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
/**
  * Convert a Dataset (or RDD) into a DataFrame via reflection.
  * 1. The generated DataFrame shows columns in the order of the case class fields.
  */
/**
  * First define the case classes the rows map onto.
  */
case class Student(name:String,age:Long)
case class Person(id: Int, name: String, age: Int, score: Double)
object CreateDataFrameFromRDDWithReflection {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder().master("local").appName("createDataFrameFromRDDWithReflection").getOrCreate()
import spark.implicits._
/**
  * Read the file directly as a Dataset
  */
// val person: Dataset[String] = spark.read.textFile("./data/people.txt")
// val personDs: Dataset[Person] = person.map(one => {
// val arr = one.split(",")
// Person(arr(0).toInt, arr(1).toString, arr(2).toInt, arr(3).toDouble)
// })
/**
  * Read the file directly as an RDD
  */
val rdd: RDD[String] = spark.sparkContext.textFile("./data/people.txt")
val personDs: RDD[Person] = rdd.map(one => {
val arr = one.split(",")
Person(arr(0).toInt, arr(1).toString, arr(2).toInt, arr(3).toDouble)
})
val frame: DataFrame = personDs.toDF()
frame.show()
/**
  * DataFrame operations via a temp view and SQL
  */
frame.createOrReplaceTempView("people")
val teenagersDF: DataFrame = spark.sql("SELECT name, age FROM people WHERE age BETWEEN 13 AND 19")
teenagersDF.show()
//get values from the Row by index
teenagersDF.map(teenager => "Name: " + teenager(0)).show()
//get values from the Row by field name
teenagersDF.map(teenager => "Name: " + teenager.getAs[String]("name")).show()
/**
  * Dataset[Map[K,V]] has no predefined encoder, so define one here
  */
// implicit val mapEncoder = org.apache.spark.sql.Encoders.kryo[Map[String, Any]]
// //Map has no built-in encoder; this implicit kryo encoder is needed during the conversion
// val result: Dataset[Map[String, Any]] = teenagersDF.map(teenager=>{teenager.getValuesMap[Any](List("name","age"))})
// result.collect().foreach(println)
}
}
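The reflection route also works in reverse: a DataFrame whose column names and types line up with a case class can be turned back into a typed Dataset with as[...]. A small sketch reusing the Person case class defined above (the sample row and object name are made up):
package com.bjsxt.scalaspark.sql.DataSetAndDataFrame
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
object DataFrameBackToDataset {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local").appName("dataFrameBackToDataset").getOrCreate()
    import spark.implicits._
    val df: DataFrame = Seq((1, "zhangsan", 18, 100.0)).toDF("id", "name", "age", "score")
    //column names must match the Person fields; types must be compatible
    val ds: Dataset[Person] = df.as[Person]
    ds.map(p => p.name + " scored " + p.score).show()
    spark.stop()
  }
}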
4: Creating a DataFrame by building the schema dynamically
package com.bjsxt.scalaspark.sql.DataSetAndDataFrame
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.types._
/**
  * Load a DataFrame by constructing the schema at runtime
  */
object CreateDataFrameFromRDDWithSchema {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder().master("local").appName("createdataframefromrddwithschema").getOrCreate()
val peopleRDD: RDD[String] = spark.sparkContext.textFile("./data/people.txt")
/**
  * Convert peopleRDD into an RDD[Row]
  */
val rowRDD: RDD[Row] = peopleRDD.map(one => {
val arr: Array[String] = one.split(",")
Row(arr(0).toInt, arr(1), arr(2).toInt, arr(3).toDouble) //score as Double, matching the reflection example above
})
val structType: StructType = StructType(List[StructField](
StructField("id", IntegerType, nullable = true),
StructField("name", StringType, nullable = true),
StructField("age", IntegerType, nullable = true),
StructField("score", LongType, nullable = true)
))
val frame: DataFrame = spark.createDataFrame(rowRDD,structType)
frame.show()
frame.printSchema()
// val schemaString = "id name age score"
// /**
// * Building the schema dynamically from a string
// */
// val fields: Array[StructField] = schemaString.split(" ").map(fieldName => StructField(fieldName, StringType, nullable = true))
// val schema: StructType = StructType(fields)
//
// val rowRDD: RDD[Row] = peopleRDD
// .map(_.split(","))
// .map(attributes => Row(attributes(0).trim, attributes(1).trim, attributes(2).trim, attributes(3).trim))
//
// //Create the DataFrame
// import spark.implicits._
// val peopleDF: DataFrame = spark.createDataFrame(rowRDD,schema)
// peopleDF.show()
// peopleDF.printSchema()
//
// //Register a temporary view
// peopleDF.createOrReplaceTempView("people")
// val results: DataFrame = spark.sql("SELECT name FROM people")
// results.map(attributes => "Name: " + attributes(0)).as("myCol").show()
}
}
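On Spark 2.3+ there is an even shorter way to build the schema dynamically: parse it from a DDL string with StructType.fromDDL. A sketch with made-up inline data (the object name is hypothetical):
package com.bjsxt.scalaspark.sql.DataSetAndDataFrame
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.StructType
object SchemaFromDDLString {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local").appName("schemaFromDDLString").getOrCreate()
    //Spark 2.3+: parse a DDL string straight into a StructType
    val schema: StructType = StructType.fromDDL("id INT, name STRING, age INT, score DOUBLE")
    val rowRDD = spark.sparkContext.makeRDD(Seq(Row(1, "zhangsan", 18, 100.0)))
    val df = spark.createDataFrame(rowRDD, schema)
    df.printSchema()
    df.show()
    spark.stop()
  }
}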
5: Creating a DataFrame by reading a MySQL table
package com.bjsxt.scalaspark.sql.DataSetAndDataFrame
import java.util.Properties
import org.apache.spark.sql.{DataFrame, DataFrameReader, SaveMode, SparkSession}
/**
  * Load a MySQL table as a DataFrame
  */
object CreateDataFrameFromMySQL {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder().master("local").appName("createdataframefrommysql")
.config("spark.sql.shuffle.partitions",1)
.getOrCreate()
/**
  * First way to read a MySQL table
  */
val properties = new Properties()
properties.setProperty("user", "root")
properties.setProperty("password", "123456")
val person: DataFrame = spark.read.jdbc("jdbc:mysql://192.168.179.4:3306/spark","person",properties)
person.show()
/**
  * Second way to read a MySQL table
  */
val map = Map[String,String](
"url"->"jdbc:mysql://192.168.179.4:3306/spark",
"driver"->"com.mysql.jdbc.Driver",
"user"->"root",
"password"->"123456",
"dbtable"->"score"//表名
)
val score: DataFrame = spark.read.format("jdbc").options(map).load()
score.show()
/**
  * Third way to read a MySQL table
  */
val reader: DataFrameReader = spark.read.format("jdbc")
.option("url", "jdbc:mysql://192.168.179.4:3306/spark")
.option("driver", "com.mysql.jdbc.Driver")
.option("user", "root")
.option("password", "123456")
.option("dbtable", "score")
val score2: DataFrame = reader.load()
score2.show()
//Register both tables as temp views and run a join across them
person.createOrReplaceTempView("person")
score.createOrReplaceTempView("score")
spark.sql("select person.id,person.name,person.age,score.score from person ,score where person.id = score.id").show()
//Save the result to a MySQL table. String columns are created as TEXT by default; if you don't want that, create the table with explicit column types before saving (see the sketch after this listing).
val result: DataFrame = spark.sql("select person.id,person.name,person.age,score.score from person ,score where person.id = score.id")
result.write.mode(SaveMode.Append).jdbc("jdbc:mysql://192.168.179.4:3306/spark", "result", properties)
/**
  * Fourth way: push a whole query down as a subquery aliased as a table
  */
spark.read.jdbc("jdbc:mysql://192.168.179.4:3306/spark","(select person.id,person.name,person.age,score.score from person ,score where person.id = score.id) T",properties).show()
}
}
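Two follow-ups worth knowing. First, the jdbc reader has a partitioned overload that splits the read across several parallel connections. Second, the createTableColumnTypes writer option pins the column types Spark uses when it creates the target table, instead of mapping String to TEXT (the fix hinted at before the write above). A sketch against the same person table; person_copy is a hypothetical target:
package com.bjsxt.scalaspark.sql.DataSetAndDataFrame
import java.util.Properties
import org.apache.spark.sql.{SaveMode, SparkSession}
object MySQLParallelReadAndTypedWrite {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local").appName("mysqlParallelReadAndTypedWrite").getOrCreate()
    val properties = new Properties()
    properties.setProperty("user", "root")
    properties.setProperty("password", "123456")
    //Partitioned read: split the table into 4 JDBC partitions on the numeric id column.
    //The bounds only decide how the ranges are cut; rows outside them are still read.
    val person = spark.read.jdbc(
      "jdbc:mysql://192.168.179.4:3306/spark", "person",
      "id", 1L, 1000L, 4, properties)
    println(person.rdd.getNumPartitions) //4
    //Typed write: the option only applies when Spark itself creates the table
    person.write
      .option("createTableColumnTypes", "name VARCHAR(64), age INT")
      .mode(SaveMode.Append)
      .jdbc("jdbc:mysql://192.168.179.4:3306/spark", "person_copy", properties)
    spark.stop()
  }
}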
6: Creating a DataFrame from Hive
package com.bjsxt.scalaspark.sql.DataSetAndDataFrame
import org.apache.spark.sql.{SaveMode, SparkSession}
/**
  * Read data from Hive.
  * enableHiveSupport must be turned on.
  */
object CreateDataFrameFromHive {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder().appName("CreateDataFrameFromHive").enableHiveSupport().getOrCreate()
spark.sql("use spark")
spark.sql("drop table if exists student_infos")
spark.sql("create table if not exists student_infos (name string,age int) row format delimited fields terminated by '\t'")
spark.sql("load data local inpath '/root/test/student_infos' into table student_infos")
spark.sql("drop table if exists student_scores")
spark.sql("create table if not exists student_scores (name string,score int) row format delimited fields terminated by '\t'")
spark.sql("load data local inpath '/root/test/student_scores' into table student_scores")
val df = spark.sql("select si.name,si.age,ss.score from student_infos si,student_scores ss where si.name = ss.name")
spark.sql("drop table if exists good_student_infos")
/**
  * Write the result to a Hive table
  */
df.write.mode(SaveMode.Overwrite).saveAsTable("good_student_infos")
}
}
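Reading the table back later is just as short: spark.table (equivalent to a plain select) loads a Hive table as a DataFrame. A sketch assuming the good_student_infos table created above still exists:
package com.bjsxt.scalaspark.sql.DataSetAndDataFrame
import org.apache.spark.sql.SparkSession
object ReadHiveTable {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("ReadHiveTable").enableHiveSupport().getOrCreate()
    spark.sql("use spark")
    val df = spark.table("good_student_infos") //same as select * from good_student_infos
    df.show()
    spark.stop()
  }
}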
7: Creating a DataFrame by reading parquet files
package com.bjsxt.scalaspark.sql.DataSetAndDataFrame
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
object CreateDataFrameFromParquet {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder().master("local").appName("createdataframefromparquet").getOrCreate()
val df1: DataFrame = spark.read.json("./data/json")
df1.show()
/**
  * Save as a parquet file
  */
df1.write.mode(SaveMode.Append).format("parquet").save("./data/parquet")
/**
  * Read the parquet file back
  */
val df2: DataFrame = spark.read.parquet("./data/parquet")
df2.show()
}
}
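Parquet pairs nicely with partitionBy: the writer lays out one subdirectory per value of the partition column, and partition discovery restores that column on read. A sketch, using a hypothetical ./data/parquetByAge output path:
package com.bjsxt.scalaspark.sql.DataSetAndDataFrame
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
object WritePartitionedParquet {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local").appName("writePartitionedParquet").getOrCreate()
    val df: DataFrame = spark.read.json("./data/json")
    //one subdirectory per distinct age, e.g. ./data/parquetByAge/age=20/
    df.write.mode(SaveMode.Overwrite).partitionBy("age").parquet("./data/parquetByAge")
    //partition discovery puts the age column back when reading the root path
    spark.read.parquet("./data/parquetByAge").printSchema()
    spark.stop()
  }
}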
Appendix: a simple word-count exercise
package com.wuyue.sparkSQL
import org.apache.spark.sql.{DataFrame, Dataset, RelationalGroupedDataset, SparkSession}
object SQL_WC {
def main(args: Array[String]): Unit = {
val spark: SparkSession = SparkSession.builder()
.master("local")
.appName("wc")
.config("spark.sql.shuffle.partitions", 1) //设置job的task分区数量 数据量低时设置低一点 提升性能
.getOrCreate()
//Datasets are similar to RDDs, but operate on encoded binary data instead of fully deserializing each record, so they are generally faster
val lineList = List[String](
"hello wuyue",
"hello sushui",
"hello mingtian ",
"hello spark"
)
import spark.implicits._
val lineDs: Dataset[String] = lineList.toDS()
lineDs.show()
import org.apache.spark.sql.functions._
val words: Dataset[String] = lineDs.flatMap(line=>{line.split(" ")})
val groupDs: RelationalGroupedDataset = words.groupBy($"value" as "word")
val aggDs: DataFrame = groupDs.agg(count("*")as "totalcount")
val frame: DataFrame = aggDs.toDF()
frame.show()
frame.createOrReplaceTempView("t1")
saprk.sql("select * from t1 where word='wuyue'").show()
}
}
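For comparison, the same count can stay in the typed Dataset API with groupByKey, skipping the untyped groupBy/agg step. A minimal sketch (object name made up):
package com.wuyue.sparkSQL
import org.apache.spark.sql.{Dataset, SparkSession}
object SQL_WC_Typed {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local").appName("wcTyped").getOrCreate()
    import spark.implicits._
    val lines: Dataset[String] = List("hello wuyue", "hello spark").toDS()
    val words: Dataset[String] = lines.flatMap(_.split(" "))
    //groupByKey keeps the typed API end to end; count() yields Dataset[(String, Long)]
    val counts: Dataset[(String, Long)] = words.groupByKey(identity).count()
    counts.show()
    spark.stop()
  }
}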