I. Converting JSON-format data into a table
1. SparkSQL data sources
SparkSQL can read from JSON strings, JDBC, Parquet, Hive, HDFS, and other sources. Sample JSON file (one JSON object per line):
{"name":"zhangsan","age":20}
{"name":"lisi"}
{"name":"wangwu","age":18}
{"name":"wangwu","age":18}
2. Dependency (the 2.4.4 release is published for both Scala 2.11 and 2.12; make sure it matches the Scala version of the spark-core dependency you use, as shown below)
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.11</artifactId>
    <version>2.4.4</version>
</dependency>
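The Scala suffix in the artifactId is what has to line up. For example, if the project already uses spark-core built for Scala 2.11, a matching core dependency would look like this (illustrative, assuming the same 2.4.4 release):

<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>2.4.4</version>
</dependency>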
3. Scala code
def main(args: Array[String]): Unit = {
  val spark: SparkSession = SparkSession.builder().appName("SQLTest").master("local").getOrCreate()
  val df: DataFrame = spark.read.format("json").load("e:\\json")
  // print the schema, equivalent to running `desc` on a table
  // df.printSchema()
  // equivalent to querying the table
  // df.show()
  // pull the rows out of the table as an RDD
  val rdd: RDD[Row] = df.rdd
  rdd.foreach(row => {
    // getAs looks fields up by name, so name maps to the string column and age to the long column
    val name: String = row.getAs[String]("name")
    val age: Long = row.getAs[Long]("age")
    println(s"name = $name, age = $age")
  })
}
df.printSchema() output: for the sample JSON above, Spark infers `age` as a nullable long and `name` as a nullable string.
df.show() output: one row per JSON line; the missing `age` for lisi shows up as null.
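Once the JSON file has been loaded as a DataFrame, it can also be registered as a temporary view and queried with plain SQL, which is what "converting to a table" usually means in practice. A minimal sketch (the view name person is just an example):

df.createOrReplaceTempView("person")
// ordinary SQL over the JSON-backed table
spark.sql("select name, age from person where age is not null").show()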
II. Converting an RDD into a table
1. Data source: a plain text file (e:\\persion) with comma-separated lines in the form id,name,age,score, for example 1,zhangsan,20,90 (the values here are illustrative; the format follows the parsing code below).
2. Scala code
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

object SQLTest {
  def main(args: Array[String]): Unit = {
    // An application can only have one SparkContext. getOrCreate() returns the existing one
    // if it has already been created, otherwise it creates a new one; creating another
    // SparkContext after this call would throw an error.
    val spark: SparkSession = SparkSession.builder().appName("SQLTest").master("local").getOrCreate()
    val lines: RDD[String] = spark.sparkContext.textFile("e:\\persion")
    val rowRDD: RDD[Row] = lines.map(line => {
      val arr: Array[String] = line.split(",")
      Row(arr(0).toInt, arr(1), arr(2).toInt, arr(3).toLong)
    })
    rowRDD.foreach(println)
    // define the schema of the table
    val structType = StructType(List[StructField](
      StructField("id", IntegerType, nullable = true),
      StructField("name", StringType, nullable = true),
      StructField("age", IntegerType, nullable = true),
      StructField("score", LongType, nullable = true)
    ))
    // convert the RDD of Rows into a table (DataFrame) using that schema
    val frame: DataFrame = spark.createDataFrame(rowRDD, structType)
    frame.show()
  }
}
3. Result: rowRDD.foreach(println) prints the raw Row objects, and frame.show() prints the table with columns id, name, age, and score.
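Besides declaring a StructType by hand, the schema can also be derived by reflection: map each line to a case class and call toDF. A minimal sketch under the same assumed file format (the case class name Person is illustrative, and it should be declared outside main so Spark can find an encoder for it):

case class Person(id: Int, name: String, age: Int, score: Long)

// inside main, after lines: RDD[String] has been created
import spark.implicits._
val personDF: DataFrame = lines.map(line => {
  val arr = line.split(",")
  Person(arr(0).toInt, arr(1), arr(2).toInt, arr(3).toLong)
}).toDF()
personDF.show()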
III. Reading a database table directly into a DataFrame
def main(args: Array[String]): Unit = {
  val spark: SparkSession = SparkSession.builder().appName("SQLTest").master("local").getOrCreate()
  spark.sparkContext.setLogLevel("ERROR")
  // read from MySQL via JDBC (Properties is java.util.Properties)
  val properties = new Properties()
  properties.setProperty("user", "root")
  properties.setProperty("password", "123456")
  val result: DataFrame = spark.read.jdbc("jdbc:mysql://localhost:3306/userdb", "user", properties)
  result.show()
}
Result: result.show() prints the contents of the user table.
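The same read can be expressed with the option-based API, which keeps the connection settings in one place; either way the MySQL JDBC driver (mysql-connector-java) has to be on the classpath. A minimal sketch with the same illustrative connection details:

val result: DataFrame = spark.read
  .format("jdbc")
  .option("url", "jdbc:mysql://localhost:3306/userdb")
  .option("dbtable", "user")
  .option("user", "root")
  .option("password", "123456")
  .load()
result.show()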
IV. User-defined functions (UDF)
def main(args: Array[String]): Unit = {
  val spark: SparkSession = SparkSession.builder().appName("SQLTest").master("local").getOrCreate()
  val name: List[String] = List[String]("zhangsan", "lisi", "wangwu", "zhaoliu")
  import spark.implicits._
  val df: DataFrame = name.toDF("name")
  df.createOrReplaceTempView("student")
  // register the UDF; it can then be called by name inside SQL
  spark.udf.register("STRLEN", (name: String) => {
    name.length
  })
  spark.sql("select name,STRLEN(name) as length from student sort by length desc").show()
}
Execution result: every name together with its length, ordered by length in descending order.
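The same logic can be used through the DataFrame API instead of SQL by wrapping the function with org.apache.spark.sql.functions.udf; a minimal sketch over the same df (the column name length is chosen for illustration):

import org.apache.spark.sql.functions.{col, udf}

val strlen = udf((s: String) => s.length)
df.withColumn("length", strlen(col("name")))
  .orderBy(col("length").desc)
  .show()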
V. User-defined aggregate functions (UDAF)
package com.bjsxt.scalaspark.sql.UDF_UDAF

import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

/**
 * UserDefinedAggregateFunction: the abstract class for user-defined aggregate functions.
 */
class MyUDAF extends UserDefinedAggregateFunction {
  // type of the input data
  def inputSchema: StructType = {
    DataTypes.createStructType(Array(DataTypes.createStructField("uuuu", StringType, true)))
  }
  /**
   * Initialize the aggregation value for each group.
   * Initialization happens in two places:
   * 1. On the map side, within each RDD partition the data is grouped by the group-by
   *    columns, and every group gets an initial value.
   * 2. On the reduce side, every group-by group again gets an initial value.
   */
  def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer(0) = 0
  }
  // called whenever a new input row arrives for a group: update that group's aggregation value
  def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    buffer(0) = buffer.getAs[Int](0) + 1
  }
  // at the end, the partial aggregation values computed on different nodes are merged together
  def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    buffer1(0) = buffer1.getAs[Int](0) + buffer2.getAs[Int](0)
  }
  // type of the intermediate data handled during aggregation (the buffer)
  def bufferSchema: StructType = {
    DataTypes.createStructType(Array(DataTypes.createStructField("QQQQ", IntegerType, true)))
  }
  // compute the final result for a group from its aggregation buffer
  def evaluate(buffer: Row): Any = {
    buffer.getAs[Int](0)
  }
  // type of the final return value of the function
  def dataType: DataType = {
    DataTypes.IntegerType
  }
  // the same input always produces the same output, ensuring consistency across repeated runs
  def deterministic: Boolean = {
    true
  }
}
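// Illustrative trace of how initialize/update/merge/evaluate cooperate, assuming (purely
// hypothetically) that the ten sample names used below are split across two partitions:
//   partition 1: zhangsan, lisi, lisi, lisi, lisi          -> buffers after update: zhangsan = 1, lisi = 4
//   partition 2: wangwu, zhangsan, lisi, zhangsan, wangwu  -> buffers after update: wangwu = 2, zhangsan = 2, lisi = 1
//   merge combines the partial buffers per group           -> zhangsan = 3, lisi = 5, wangwu = 2
//   evaluate then returns the merged count for each group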
object UDAF {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local").appName("UDAF").getOrCreate()
    val nameList: List[String] = List[String]("zhangsan", "lisi", "lisi", "lisi", "lisi", "wangwu", "zhangsan", "lisi", "zhangsan", "wangwu")
    import spark.implicits._
    val frame: DataFrame = nameList.toDF("name")
    frame.createOrReplaceTempView("students")
    // equivalent SQL: select name, count(*) from students group by name
    // register the UDAF under the name NAMECOUNT
    spark.udf.register("NAMECOUNT", new MyUDAF())
    spark.sql("select name,NAMECOUNT(name) as count from students group by name").show(100)
  }
}
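A quick way to sanity-check the UDAF is to compute the built-in aggregate alongside it and confirm the two columns agree; an illustrative query:

spark.sql("select name, NAMECOUNT(name) as count, count(*) as builtin from students group by name").show(100)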