SparkSQL
使用DataFrame创建表
将DataFrame注册成临时的一张表,这张表临时注册到内存中,是逻辑上的表,不会物化到磁盘,注册之后就可以对它使用sql
frame.createOrReplaceTempView("mytable")
将RDD转为DataSet
Reflection 反射的方式创建DataSet
import session.implicits._
/**
 * Read the text file directly as a Dataset[String], then parse each
 * comma-separated line into a Person case class.
 * Relies on `import session.implicits._` being in scope for the Person encoder.
 */
val person: Dataset[String] = session.read.textFile("./data/people.txt")
val personDs: Dataset[Person] = person.map(one => {
// Fields in file order: id, name, age, score.
// NOTE(review): assumes every line has exactly 4 comma-separated, well-formed values — confirm input format.
val arr = one.split(",")
Person(arr(0).toInt, arr(1).toString, arr(2).toInt, arr(3).toDouble)
})
/**
 * Reflection approach: read the file as an RDD[String], map each line to a
 * Person case class, then convert to a DataFrame with toDF() —
 * the schema is inferred by reflection from Person's fields.
 */
val textFile: RDD[String] = session.sparkContext.textFile("./data/people.txt")
val personRDD: RDD[Person] = textFile.map(_.split(",")).map(line=>{Person(line(0).toInt,line(1).toString, line(2).toInt, line(3).toDouble)})
val frame: DataFrame = personRDD.toDF()
frame.show()
动态Schema创建DataSet
// Dynamic-schema approach: build Row objects by hand and pair them with an
// explicitly constructed StructType instead of relying on case-class reflection.
val lines = session.sparkContext.textFile("./data/people.txt")
val rowRDD = lines.map { text =>
  val parts = text.split(",")
  Row(parts(0).toInt, parts(1).toString, parts(2).toInt, parts(3).toDouble)
}
// Column definitions, matching the Row field order: id, name, age, score.
val schema: StructType = StructType(Array(
  StructField("id", IntegerType, nullable = true),
  StructField("name", StringType, nullable = true),
  StructField("age", IntegerType, nullable = true),
  StructField("score", DoubleType, nullable = true)
))
val dataFrame: DataFrame = session.createDataFrame(rowRDD, schema)
dataFrame.show()
dataFrame.printSchema()
读取json,parquet
// Read a JSON directory as a DataFrame; the schema is inferred from the JSON documents.
val dataFrame: DataFrame = session.read.json("./data/json")
// Print the application name from the underlying SparkContext configuration.
println(session.sparkContext.getConf.get("spark.app.name"))
dataFrame.show()
/**
 * Save as parquet. SaveMode.Append means each run adds new files to the directory.
 */
dataFrame.write.mode(SaveMode.Append).format("parquet").parquet("./data/parquet")
/**
 * Read the parquet directory back into a DataFrame.
 */
val parquetDS: DataFrame = session.read.format("parquet").parquet("./data/parquet")
//val parquetDS = session.read.parquet("./data/parquet")   // equivalent shorthand
parquetDS.show()
// Register as a temp view so it can be queried with SQL, then rename the column.
parquetDS.createOrReplaceTempView("parquet")
val frame: DataFrame = parquetDS.sqlContext.sql("select name from parquet")
frame.withColumnRenamed("name", "myname").show()
读取Mysql
// SparkSession for the JDBC examples; one shuffle partition keeps demo output small.
val session = SparkSession.builder().master("local")
.config("spark.sql.shuffle.partitions", 1).getOrCreate()
// Way 1 to read MySQL: jdbc(url, table, properties).
val properties = new Properties()
properties.setProperty("user", "root")
// NOTE(review): plain-text demo credentials — never hard-code these in production.
properties.setProperty("password", "123")
val person: DataFrame = session.read.jdbc("jdbc:mysql://node11:3306/spark", "person", properties)
person.show()
// Way 2 to read MySQL: pass every connection setting through an options map.
val map = Map[String, String](
"url" -> "jdbc:mysql://node11:3306/spark",
"driver" -> "com.mysql.jdbc.Driver",
"user" -> "root",
"password" -> "123",
"dbtable" -> "score"
)
val score = session.read.format("jdbc").options(map).load()
score.show()
// Way 3 to read MySQL: chain individual option(key, value) calls.
val scoreDF: DataFrame = session.read.format("jdbc").option("url", "jdbc:mysql://node11:3306/spark")
.option("driver", "com.mysql.jdbc.Driver").option("user", "root")
.option("password", "123").option("dbtable", "score").load()
scoreDF.show()
Spark on Hive
1.配置spark on hive
cp /opt/apache-hive-1.2.1-bin/conf/hive-site.xml /opt/spark-2.3.1-bin-hadoop2.6/conf/
vim hive-site.xml
<property>
<name>hive.metastore.uris</name>
<value>thrift://node13:9083</value>
</property>
2.进入Hive安装目录的 bin 目录,用 vim 编辑 hive 启动脚本,在命令模式下输入 /spark-assembly 并回车,定位到 spark-assembly 所在的行
将
sparkAssemblyPath=ls ${SPARK_HOME}/lib/spark-assembly-*.jar
修改为
sparkAssemblyPath=ls ${SPARK_HOME}/jars/*.jar
3.启动hive
hive --service metastore
UDF
val session = SparkSession.builder().appName("udf").master("local").getOrCreate()
val list = List[String]("zhangsan", "lisi", "wangwu", "zhaoshun", "liuba")
session.udf.register("strlen", (str: String)=>{
str.length
})
import session.implicits._
val nameDF = list.toDF("name")
nameDF.createOrReplaceTempView("student")
session.sql("select name,strlen(name) as length from student sort by length desc").show()
UDAF(聚合函数)
继承UserDefinedAggregateFunction
/**
 * Count-style UDAF: counts the non-null input values in each group.
 * Input: one string column; buffer: a single Long counter; result: Long.
 */
class MyUDAF extends UserDefinedAggregateFunction {
  // Schema of the input arguments: a single (nullable) string column.
  override def inputSchema: StructType =
    StructType(Array(StructField("inputColumn", StringType, true)))

  // Schema of the intermediate aggregation buffer: the running count.
  override def bufferSchema: StructType =
    StructType(Array(StructField("count", LongType, true)))

  // Type of the final aggregate value.
  override def dataType: DataType = LongType

  // The same input always produces the same output.
  override def deterministic: Boolean = true

  // Start each group's counter at zero.
  override def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer(0) = 0L
  }

  // Fold one input row into the buffer: bump the counter unless the value is null.
  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    if (!input.isNullAt(0)) {
      buffer(0) = buffer.getLong(0) + 1
    }
  }

  // Combine two partial buffers (e.g. computed on different partitions/nodes).
  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    buffer1(0) = buffer1.getLong(0) + buffer2.getLong(0)
  }

  // Emit the final value; its type must match dataType (LongType).
  override def evaluate(buffer: Row): Any = buffer.getLong(0)
}
// Demo data: each name appears three times so the aggregation has something to count.
val personRDD: RDD[String] = session.sparkContext.parallelize(List[String](
"zhangsan", "lisi", "wangwu", "tianqi",
"zhangsan", "lisi", "wangwu", "tianqi",
"zhangsan", "lisi", "wangwu", "tianqi"
))
// toDF on an RDD needs the session's implicit encoders in scope.
import session.implicits._
val dataFrame = personRDD.toDF("name")
dataFrame.createOrReplaceTempView("person")
dataFrame.show()
// Register the custom aggregate under the SQL name "countName".
session.udf.register("countName", new MyUDAF)
// Each name occurs 3 times, so countName(name) should be 3 for every group.
session.sql("select name, countName(name) as length from person group by name").show()
窗口函数
窗口函数 row_number() over(...) 需要数据库支持:Hive、Oracle 以及 MySQL 8.0 及之后的版本才支持 over 窗口函数
select riqi,leibie,jine from (
select riqi,leibie,jine,row_number() over (partition by leibie order by jine desc) as rank from 表名
) t where t.rank<=3
SparkStreaming: https://blog.csdn.net/happiless/article/details/107308181