1. Importing the dependencies
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.11</artifactId>
    <version>2.4.5</version>
</dependency>
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.49</version>
</dependency>
2. WordCount
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}

//create the Spark session
val spark: SparkSession = SparkSession
  .builder()
  .appName("wc")
  .master("local")
  .getOrCreate()
/*
  DataFrame: a table-like structure --
  an RDD plus column names and column types
 */
val frame: DataFrame = spark
  .read
  .format("csv")          //input format; csv here means plain text
  .option("sep", "\t")    //field separator; defaults to a comma if not set
  .schema("line STRING")  //column name and column type
  .load("data/words.txt") //input path
//register the DataFrame as a temporary view under the given name
frame.createOrReplaceTempView("lines")
//run a SQL query against the view
val wordCountDF: DataFrame = spark.sql(
"""
|select
|T.word,count(1) as c
|from (select explode(split(line,',')) word from lines) as T
|group by T.word
|""".stripMargin
)
wordCountDF.show() //show the result table
//write the DataFrame back out (e.g. to HDFS)
wordCountDF
  .write
  .format("csv")
  .option("sep", "\t")
  .mode(SaveMode.Overwrite) //overwrite the output path if it already exists
  .save("data/wc")
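For comparison, the same word count can be written with DataFrame methods instead of a temporary view; a minimal sketch reusing the frame DataFrame from above (wordCountDSL is just an illustrative name):

import org.apache.spark.sql.functions.{col, explode, split}

//split each line on commas, flatten into one word per row, then count per word
val wordCountDSL: DataFrame = frame
  .select(explode(split(col("line"), ",")) as "word")
  .groupBy("word")
  .count() //adds a "count" column
wordCountDSL.show()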
3. Reading files of various formats
1) csv: plain-text format
//read the file and build the table structure (a DataFrame)
val frame: DataFrame = spark
.read
.format("csv")
.option("sep", ",")
.schema("id STRING , name STRING, age INT , gender STRING , clazz STRING")
.load("data/students.txt")
//print the schema
frame.printSchema()
//view the data; show() prints the first 20 rows by default, or the number you pass in
frame.show(5)
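If a csv file starts with a header line, Spark can take the column names from it and sample the data to guess the types instead of you writing the schema by hand; a minimal sketch, where data/students_with_header.csv is a hypothetical file with a header row:

//header: use the first line as column names; inferSchema: sample the data to guess column types
val headerDF: DataFrame = spark
  .read
  .format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .load("data/students_with_header.csv") //hypothetical file
headerDF.printSchema()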
2) json format
{"age":22,"clazz":"文科六班","gender":"女","id":"1500100001","name":"施笑槐"}
{"age":24,"clazz":"文科六班","gender":"男","id":"1500100002","name":"吕金鹏"}
{"age":22,"clazz":"理科六班","gender":"女","id":"1500100003","name":"单乐蕊"}
{"age":24,"clazz":"理科三班","gender":"男","id":"1500100004","name":"葛德曜"}
/*
  read json data into a DataFrame;
  Spark parses the json structure automatically
 */
val frameJs: DataFrame = spark
.read
.format("json")
.load("data/students.json")
//print the schema
frameJs.printSchema()
//view the data; show() prints the first 20 rows by default, or the number you pass in
frameJs.show()
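Schema inference means Spark scans the json file before reading it; if the columns are already known, the schema can be passed explicitly to skip that pass. A minimal sketch reusing the same file:

//with an explicit schema Spark does not need to infer the column types from the file
val frameJsTyped: DataFrame = spark
  .read
  .format("json")
  .schema("id STRING, name STRING, age INT, gender STRING, clazz STRING")
  .load("data/students.json")
frameJsTyped.printSchema()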
3) jdbc: connecting to a database
/*
  read data via jdbc:
  query the MySQL table remotely over the network
 */
val jdbcDF: DataFrame = spark
.read
.format("jdbc")
.option("url", "jdbc:mysql://master:3306")
.option("dbtable", "shujia.student")
.option("user", "root")
.option("password", "123456")
.load()
jdbcDF.printSchema()
jdbcDF.show()
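Writing a DataFrame back to MySQL goes through the same jdbc source; a minimal sketch, where shujia.student_copy is a hypothetical target table that the root user can write to:

jdbcDF
  .write
  .format("jdbc")
  .option("url", "jdbc:mysql://master:3306")
  .option("dbtable", "shujia.student_copy") //hypothetical target table
  .option("user", "root")
  .option("password", "123456")
  .mode(SaveMode.Overwrite) //by default Overwrite drops and recreates the table
  .save()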
4) parquet: compressed file format
/**
 * Read parquet data.
 * Parquet files carry the column names and types themselves,
 * and the data is compressed: smaller on disk, but compressing
 * and decompressing costs extra time.
 */
//first, save a DataFrame in parquet format
studentDF
.write
.format("parquet")
.mode(SaveMode.Overwrite)
.save("data/parquet")
//read the parquet data back
val parquetDF: DataFrame = spark
.read
.format("parquet")
.load("data/parquet")
parquetDF.printSchema()
parquetDF.show()
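For the built-in sources Spark also offers shorthand reader and writer methods that are equivalent to the format(...).load/save calls above; a brief sketch:

//shorthand equivalents of .format("parquet").save / .load
studentDF.write.mode(SaveMode.Overwrite).parquet("data/parquet")
val parquetDF2: DataFrame = spark.read.parquet("data/parquet")
parquetDF2.show()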
4. DSL
Using the SQL-style methods that Spark provides on DataFrames instead of writing SQL strings.
def main(args: Array[String]): Unit = {
val spark: SparkSession = SparkSession
.builder()
.appName("df")
.master("local")
//number of partitions Spark SQL uses for a DataFrame after a shuffle; the default is 200
.config("spark.sql.shuffle.partitions", 1)
.getOrCreate()
val scoreDF: DataFrame = spark.read
.format("csv")
.option("sep", ",")
.schema("id STRING , cid STRING ,sco INT")
.load("data/score.txt")
val studentDF: DataFrame = spark
.read
.format("csv")
.option("sep", ",")
.schema("id STRING , name STRING, age INT , gender STRING , clazz STRING")
.load("data/students.txt")
//select: choose columns, just like select in SQL
studentDF.select("id", "age").show()
//to transform columns, use selectExpr with ordinary SQL expressions
studentDF.selectExpr("id", "age + 1 as age").show()
//to do the same with select and column objects, import the implicit conversions
import spark.implicits._
studentDF.select($"id", $"age" + 2 as "age").show()
/*
  where: filter rows
 */
//with a string expression
studentDF.where("gender='男'").show()
//with column objects
studentDF.where($"gender" === "男").show()
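//Sketch (not from the original): column-object conditions can be combined, and filter is a synonym for where
studentDF.where($"gender" === "男" and $"age" > 22).show()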
//group by
studentDF
  .groupBy("clazz") //group by class
  .count()          //count the rows per group
  .show()
//import all of the Spark SQL functions (sum, avg, count, ...);
//this has to come before they are used below
import org.apache.spark.sql.functions._

//orderBy
scoreDF
  .groupBy($"id")
  .agg(sum("sco") as "sumSco")
  .orderBy($"sumSco")
  .show()
//agg: run aggregate functions after groupBy
studentDF
  .groupBy($"clazz")
  .agg(count($"clazz") as "c", avg($"age") as "avgAge") //row count and average age per class
  .show()
//join
//when the join columns have different names; this form cannot be used if the names are the same
//val joinDF: DataFrame = studentDF.join(scoreDF, $"id" === $"sid", "inner") //arguments: the other DataFrame, the join condition, the join type
//when the join column has the same name on both sides
val joinDF: DataFrame = studentDF.join(scoreDF, "id")
joinDF.show()
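//Sketch (not from the original): when the join columns share a name you can still choose the
//join type by passing them as a Seq; "left" keeps students that have no rows in the score data
val leftJoinDF: DataFrame = studentDF.join(scoreDF, Seq("id"), "left")
leftJoinDF.show()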
/**
 * Top 2 students by total score in each class.
 * withColumn: add a new column to the DataFrame
 */
import org.apache.spark.sql.expressions.Window

joinDF
  .groupBy($"id", $"clazz")
  .agg(sum($"sco") as "sumSco")
  .withColumn("r", row_number() over Window.partitionBy($"clazz").orderBy($"sumSco".desc))
  .where($"r" <= 2)
  .show()
//the same query in SQL: register a temporary view first
joinDF.createOrReplaceTempView("stu_sco")
spark.sql(
"""
|select * from (
|select
| id,
| clazz,
| sumSco,
| row_number() over(partition by clazz order by sumSco desc) as r
| from (
| select id,clazz,sum(sco) as sumSco
| from stu_sco
| group by id,clazz
|) as a
|) as b
|where r <= 2
""".stripMargin).show()
}
}
5. Converting between Spark SQL DataFrames and RDDs
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

val spark: SparkSession = SparkSession
  .builder()
  .master("local")
  .appName("rddondf")
  .config("spark.sql.shuffle.partitions", 1)
  .getOrCreate()
import spark.implicits._ //the implicit conversions are required for the RDD <-> DF methods
/**
 * Once the SparkSession exists, the SparkContext can be taken from it directly.
 */
val sc: SparkContext = spark.sparkContext
//read the file as an RDD of lines
val lineRDD: RDD[String] = sc.textFile("data/students.txt")
//1. split each line into its fields
val studentRDD: RDD[(String, String, Int, String, String)] = lineRDD.map(line => {
  val split: Array[String] = line.split(",")
  (split(0), split(1), split(2).toInt, split(3), split(4))
})
/**
 * Convert the RDD to a DataFrame.
 * The column names must match the number and the order of the tuple fields.
 */
val studentDF: DataFrame = studentRDD.toDF("id", "name", "age", "gender", "clazz")
studentDF.printSchema()
studentDF.show()
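/**
 * Sketch (not in the original): the same RDD-to-DataFrame conversion with an explicit
 * schema, using spark.createDataFrame over an RDD[Row] instead of toDF.
 */
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

val studentSchema: StructType = StructType(Seq(
  StructField("id", StringType),
  StructField("name", StringType),
  StructField("age", IntegerType),
  StructField("gender", StringType),
  StructField("clazz", StringType)
))
val rowRDD: RDD[Row] = studentRDD.map(t => Row(t._1, t._2, t._3, t._4, t._5))
val studentDF2: DataFrame = spark.createDataFrame(rowRDD, studentSchema)
studentDF2.printSchema()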
/**
 * Convert the DataFrame back to an RDD[Row].
 */
val stuRDD: RDD[Row] = studentDF.rdd
//1. pull the fields out of each Row by column name and type
val kvRDD: RDD[(String, String, Int, String, String)] = stuRDD.map(row => {
  //look up each value by its column name
  val id: String = row.getAs[String]("id")
  val name: String = row.getAs[String]("name")
  val age: Int = row.getAs[Int]("age")
  val gender: String = row.getAs[String]("gender")
  val clazz: String = row.getAs[String]("clazz")
  (id, name, age, gender, clazz)
})
// kvRDD.foreach(println)
//2. parse each Row with a pattern match
val caseRDD: RDD[(String, String, Int, String, String)] = stuRDD.map {
  case Row(id: String, name: String, age: Int, gender: String, clazz: String) =>
    (id, name, age, gender, clazz)
}
caseRDD.foreach(println)
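Another common pattern, sketched below, is to go through a case class so the result is a typed Dataset rather than tuples; Student is an illustrative name here, and Spark's encoders need the case class to be defined at the top level (outside any method):

import org.apache.spark.sql.Dataset

case class Student(id: String, name: String, age: Int, gender: String, clazz: String)

//toDS comes from spark.implicits._ and yields a Dataset[Student] with typed field access
val studentDS: Dataset[Student] = studentRDD
  .map(t => Student(t._1, t._2, t._3, t._4, t._5))
  .toDS()
studentDS.printSchema()
studentDS.show()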