Spark SQL支持的外部数据源
Hive（通过远程 Hive metastore 读取）
// Read a Hive table with Spark SQL: point the session at the remote
// metastore, enable Hive support, then query with plain SQL.
val spark: SparkSession = SparkSession.builder()
  .appName("demo0")
  .master("local[*]")
  .config("hive.metastore.uris", "thrift://192.168.232.211:9083")
  .enableHiveSupport()
  .getOrCreate()

// val frame: DataFrame = spark.sql("show databases")
// frame.show()

// Query the "toronto" table, then inspect its schema and rows.
val torontoDF: DataFrame = spark.sql("select * from toronto")
torontoDF.printSchema()
torontoDF.show()
MySQL（通过 JDBC 数据源读取）
// Read a MySQL table through Spark's JDBC data source.
// NOTE(review): the original copied the Hive metastore URI and
// enableHiveSupport() from the previous snippet; a plain JDBC read
// needs neither, so they are dropped here.
val spark: SparkSession = SparkSession.builder().appName("demo0")
  .master("local[*]")
  .getOrCreate()

// Connection settings — in real code load credentials from config, not literals.
val url = "jdbc:mysql://192.168.232.211:3306"
val user = "root"
val pwd = "ok"
val properties = new java.util.Properties()
properties.setProperty("user", user)
properties.setProperty("password", pwd)
// Driver class for MySQL Connector/J 5.x; with Connector/J 8.x use
// "com.mysql.cj.jdbc.Driver" instead.
properties.setProperty("driver", "com.mysql.jdbc.Driver")

// "test.stu" is database.table, since the URL carries no default schema.
val tableDF: DataFrame = spark.read.jdbc(url, table = "test.stu", properties)
tableDF.printSchema()
tableDF.show()

// Built-in aggregate function: max student id.
import org.apache.spark.sql.functions._
tableDF.agg(max("stu_id")).show()
JSON（文件数据源）
// Read a JSON file into a DataFrame and query it via Spark SQL.
// NOTE(review): the original also built a separate SparkContext through a
// SparkConf, but never used it — SparkSession manages its own context, so
// those unused locals are removed.
val spark: SparkSession = SparkSession.builder().master("local[*]").appName("demo")
  .getOrCreate()
import spark.implicits._

// .json(path) already implies format("json"); no need to set both.
val userDF: DataFrame = spark.read.json("in/user.json")
// DataFrame-API examples:
// userDF.printSchema()
// userDF.select("name").show()
// userDF.select(userDF("name"), (userDF("age") + 1).as("ageincrease")).filter($"ageincrease" > 22).show()
// val countDF: DataFrame = userDF.groupBy("age").count()
// val countDF1: DataFrame = countDF.withColumnRenamed("count","number")
// countDF1.printSchema()
// countDF1.show()

// SQL-API equivalent: register a temp view and query it with SQL text.
userDF.createOrReplaceTempView("user")
val frame1: DataFrame = spark.sql("select name,age from user where name = 'lisi'")
frame1.show()
内置函数（定义在 org.apache.spark.sql.functions._ 中）
内置函数的使用示例
// Demonstrate a built-in aggregate (countDistinct) on a hand-built DataFrame:
// compute daily UV (distinct visitors per day) from simulated access logs.
val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("demo")
val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
val sc: SparkContext = spark.sparkContext

// Simulated access-log lines in "date,userid" form.
val accessLog = Array(
  "2016-12-27,001",
  "2016-12-27,001",
  "2016-12-27,002",
  "2016-12-28,003",
  "2016-12-28,004",
  "2016-12-28,002",
  "2016-12-28,002",
  "2016-12-28,001"
)

// Parse each log line into a Row(day: String, userid: Int).
val rowRDD: RDD[Row] = sc.parallelize(accessLog).map { line =>
  val fields = line.split(",")
  Row(fields(0), fields(1).toInt)
}

// Explicit schema matching the Row layout above.
val logSchema = StructType(Array(
  StructField("day", StringType),
  StructField("userid", IntegerType)
))

val logDF: DataFrame = spark.createDataFrame(rowRDD, logSchema)
logDF.printSchema()
logDF.show()

// UV per day = number of distinct user ids within each day.
import org.apache.spark.sql.functions._
logDF
  .groupBy("day")
  .agg(countDistinct("userid").as("uv"))
  .select("day", "uv")
  .show()
package function
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions._
/** Immutable student record used by the built-in-function demo.
  *
  * Marked `final` per Scala convention: case classes should not be extended.
  * Fields use `java.lang.Integer` (boxed) as in the original declaration,
  * so they remain compatible with existing callers.
  */
final case class Student(id: Integer, name: String, gender: String, age: Integer)
object InnerFunction2 {

  /** Demo of Spark SQL built-in aggregate/sort functions over a small
    * in-memory Student dataset: avg/max/min, grouping, and ordering.
    */
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("demo")
    val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
    val sc: SparkContext = spark.sparkContext

    // Sample rows to aggregate over.
    val students = Seq(
      Student(1, "a", "F", 10),
      Student(2, "b", "F", 20),
      Student(3, "c", "M", 19),
      Student(4, "d", "F", 22),
      Student(5, "e", "F", 35),
      Student(6, "f", "M", 18),
      Student(7, "g", "F", 23),
      Student(8, "h", "F", 22),
      Student(9, "i", "M", 15)
    )

    import spark.implicits._
    // Two equivalent ways to turn a Seq of case classes into a DataFrame.
    val df: DataFrame = students.toDF()
    val frame: DataFrame = spark.createDataFrame(students)
    df.printSchema()

    // Overall average age: select avg(age) from student
    val avgAgeDF: DataFrame = frame.agg(avg("age"))
    avgAgeDF.show()

    // Average age per gender, aliased; then avg/max/min in one pass.
    frame.groupBy("gender").agg(avg("age").as("avgage")).show()
    frame.groupBy("gender").agg("age" -> "avg", "age" -> "max", "age" -> "min").show()

    // Head count per (gender, age) combination.
    val frame2: DataFrame = frame.groupBy("gender", "age").count()
    frame2.printSchema()
    frame2.show()

    // Ordering: select * from student order by age asc / desc
    frame.sort("age").show() // ascending by default
    frame.sort($"age".desc).show()

    // Per-gender average age, ordered by the aggregated column.
    frame.groupBy("gender").agg(avg("age").as("avgage"))
      .sort($"avgage").show()
  }
}