1. Create the source data
val seq = Seq(("1","xiaoming",15),("2","xiaohong",18),("3","tom",19))
2. Convert the data to an RDD
val rdd1 = sc.parallelize(seq)
3. Assign column names
val df = rdd1.toDF("id","name","age")
4. Query the data
df.show
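df.show renders the DataFrame as a text table; with the three rows above, the output should look like this:
+---+--------+---+
| id|    name|age|
+---+--------+---+
|  1|xiaoming| 15|
|  2|xiaohong| 18|
|  3|     tom| 19|
+---+--------+---+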
DSL style
Conditional query
df.filter("age>15").show
Select specific columns
df.select("name","age").show
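select and filter can also be chained, and with the implicits that spark-shell imports automatically, columns can be referenced with the $"name" syntax:
df.select($"name", $"age").filter($"age" > 15).show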
SQL-style syntax
Create a temporary view named people
df.createOrReplaceTempView("people")
Query the people view
val sqlDF = spark.sql("select * from people")
sqlDF.show
Note: a temporary view is session-scoped; once the session exits, the view is gone.
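For example, querying the view from a fresh session fails, because people exists only in the session that created it (the exact message varies by Spark version):
spark.newSession().sql("select * from people").show()
// org.apache.spark.sql.AnalysisException: Table or view not found: people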
Create a global temporary view
df.createGlobalTempView("people")
Querying a global temporary view requires the fully qualified name global_temp.people
spark.sql("select * from global_temp.people").show()
spark.newSession().sql("select * from global_temp.people").show()
User-defined function: spark.udf.register(name, function body)
spark.udf.register("addName", (x : String) => "Name:" + x)
spark.sql("select id, addName(name), age from people").show()
Writing Spark SQL in IDEA
Method 1
package com.zpark.stu.sparksql

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}

object SparkSqlDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SparkSqlDemo").setMaster("local[*]")
    val sc = new SparkContext(conf)
    val sqlc = new SQLContext(sc)
    // Read the text file and split each line into fields
    val testRdd: RDD[String] = sc.textFile("E:\\hadoop\\spark\\WordCount\\InPut\\person.txt")
    val mapRdd: RDD[Array[String]] = testRdd.map(_.split(","))
    // mapRdd.foreach(y => println(y.toBuffer))
    // Map each record onto the Person case class
    val mapRddTwo: RDD[Person] = mapRdd.map(x => Person(x(0).toInt, x(1), x(2).toInt))
    // mapRddTwo.foreach(y => println(y))
    import sqlc.implicits._
    // DSL style
    val df: DataFrame = mapRddTwo.toDF()
    df.show()
    df.filter($"age" > 18).show()
    df.filter("age > 16").show()
    df.registerTempTable("people")
    // SQL style
    val sqlLimit: DataFrame = sqlc.sql("select * from people where id = 2")
    sqlLimit.show()
    // sqlLimit.write.mode("append").save("E:\\hadoop\\spark\\WordCount\\sparkSqlOut")
    sc.stop()
  }

  case class Person(id: Int, name: String, age: Int)
}
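Since Spark 2.x, SQLContext has been superseded by SparkSession as the entry point; the sketch below shows the same flow against that API, assuming the same input path and Person case class (SparkSqlDemoSession is a hypothetical object name):

package com.zpark.stu.sparksql

import org.apache.spark.sql.SparkSession

object SparkSqlDemoSession {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("SparkSqlDemoSession").master("local[*]").getOrCreate()
    import spark.implicits._
    // Same read-split-map pipeline, then toDF via the SparkSession implicits
    val df = spark.sparkContext
      .textFile("E:\\hadoop\\spark\\WordCount\\InPut\\person.txt")
      .map(_.split(","))
      .map(x => Person(x(0).toInt, x(1), x(2).toInt))
      .toDF()
    df.createOrReplaceTempView("people")
    spark.sql("select * from people where id = 2").show()
    spark.stop()
  }

  case class Person(id: Int, name: String, age: Int)
}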
Method 2:
package com.zpark.stu.sparksql

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SQLContext}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.{SparkConf, SparkContext}

object SparkSqlDemo2 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SparkSqlDemo2").setMaster("local[*]")
    val sc = new SparkContext(conf)
    val sqlc = new SQLContext(sc)
    val fileRdd: RDD[String] = sc.textFile("E:\\hadoop\\spark\\WordCount\\InPut\\person.txt")
    val mapRdd: RDD[Array[String]] = fileRdd.map(_.split(","))
    // Define the table schema explicitly
    val structType: StructType = StructType(
      List(
        StructField("id", IntegerType, false),
        StructField("name", StringType, true),
        StructField("age", IntegerType, false)
      )
    )
    // Convert each record into a Row matching the schema
    val rowRdd: RDD[Row] = mapRdd.map(arr => Row(arr(0).toInt, arr(1), arr(2).toInt))
    val df: DataFrame = sqlc.createDataFrame(rowRdd, structType)
    df.show()
    sc.stop()
  }
}
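Inside main (before sc.stop()), printSchema is a quick way to confirm that the nullable flags from the schema took effect, and the DataFrame can be registered and queried just as in Method 1; a short sketch (people2 is a hypothetical view name):

df.printSchema()
df.registerTempTable("people2")
sqlc.sql("select name, age from people2 where age > 16").show()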
Saving data to a MySQL database (requires the mysql-connector-java driver on the classpath)
package com.zpark.stu.sparksql

import java.util.Properties

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SQLContext}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.{SparkConf, SparkContext}

object SparkSqlDemo3 {
  /**
   * Save data to a MySQL database.
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SparkSqlDemo3").setMaster("local[*]")
    val sc = new SparkContext(conf)
    val sqlc = new SQLContext(sc)
    val fileRdd: RDD[Array[String]] = sc.textFile("E:\\hadoop\\spark\\WordCount\\InPut\\person.txt").map(_.split(","))
    val structType = StructType(
      Array(
        StructField("id", IntegerType, false),
        StructField("name", StringType, false),
        StructField("age", IntegerType, false)
      )
    )
    val rowRdd: RDD[Row] = fileRdd.map(arr => Row(arr(0).toInt, arr(1), arr(2).toInt))
    val df: DataFrame = sqlc.createDataFrame(rowRdd, structType)
    // JDBC connection properties
    val conn = new Properties()
    conn.put("user", "root")
    conn.put("password", "403411")
    conn.put("driver", "com.mysql.cj.jdbc.Driver")
    val table = "sparksql"
    val url = "jdbc:mysql://localhost/frame?characterEncoding=utf-8&serverTimezone=UTC"
    // Writing over JDBC needs the url, table, and connection properties created above;
    // this appends the data to the sparksql table in the frame database.
    df.write.mode("append").jdbc(url, table, conn)
    println("Done!")
    sc.stop()
  }
}
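Reading the table back is the mirror image, reusing the url, table, and conn defined above; sqlc.read.jdbc returns a DataFrame:

val dfFromMysql: DataFrame = sqlc.read.jdbc(url, table, conn)
dfFromMysql.show()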