Parquet Files
A Parquet file is a popular columnar storage format: it is stored in binary form and contains both the data and its metadata.
// TODO 1. Create a SparkSession (imports below are shared by the snippets in this section)
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.types._
val spark: SparkSession = SparkSession.builder()
.master("local[4]").appName("test07")
.getOrCreate()
// import implicits
import spark.implicits._
val sc: SparkContext = spark.sparkContext
val rdd1: RDD[(String, String, Array[Int])] = sc.parallelize(List(
("zhangsan", "green", Array(3, 5, 6, 9)),
("zhangsan", null, Array(3, 5, 6, 10)),
("lisi", "red", Array(3, 5, 6, 33)),
("zhangsan2", "green", Array(3, 5, 223, 9)),
("zhangsan3", "green", Array(3, 43, 44, 9))
))
// TODO define the schema
val structType = StructType(Array(
StructField("name", StringType),
StructField("color", StringType),
StructField("numbers", ArrayType(IntegerType))
))
val rowRDD: RDD[Row] = rdd1.map(p => Row(p._1, p._2, p._3))
val df: DataFrame = spark.createDataFrame(rowRDD, structType)
// TODO write the DataFrame out as a Parquet file (uncomment to run the write)
//df.write.parquet("file:///D:\\project\\2020\\Scala\\Spark_day0812\\src\\data\\user")
// TODO read the Parquet file back
val parquetDF: DataFrame = spark.read.parquet("file:///D:\\project\\2020\\Scala\\Spark_day0812\\src\\data\\user")
parquetDF.printSchema()
parquetDF.show()
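// A sketch of the write side (the commented-out line above) with an explicit save mode;
// "overwrite" replaces any existing output directory:
df.write.mode("overwrite").parquet("file:///D:\\project\\2020\\Scala\\Spark_day0812\\src\\data\\user")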
spark.stop()
Spark SQL on Hive Tables
Spark SQL and Hive integration:
1. Copy hive-site.xml into ${SPARK_HOME}/conf
2. Check that hive.metastore.uris is set correctly
3. Start the metastore service: nohup hive --service metastore &
// Once integrated, Hive tables can be queried directly from spark-shell
val df = spark.table("toronto")
df.printSchema
df.show
// The same from an IDEA project
// TODO 1. Create a SparkSession with Hive support enabled
val spark: SparkSession = SparkSession.builder()
.master("local[2]").appName("test02")
.enableHiveSupport()
//.config("hive.metastore.uris","thrift://hadoop101:9083")
.getOrCreate()
val df1: DataFrame = spark.sql("select * from emp")
df1.printSchema()
df1.show()
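// A minimal sketch of the reverse direction, writing a DataFrame back to Hive
// (assumes a writable warehouse; "emp_backup" is a hypothetical table name):
df1.write.mode("overwrite").saveAsTable("emp_backup")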
Writing a DataFrame to MySQL
import java.util.Properties
val spark: SparkSession = SparkSession.builder().appName("demo4").master("local[*]").getOrCreate()
val sc: SparkContext = spark.sparkContext
val rdd: RDD[Array[String]] = sc.textFile("file:///D:\\IDEA\\_20200821_kaoshi\\data\\scala.csv").map(_.split(","))
val structType = StructType(
Array(
StructField("time", StringType, true),
StructField("id", IntegerType, true),
StructField("salary", DoubleType, true)
)
)
val rowRDD: RDD[Row] = rdd.map(x => Row(x(0), x(1).toInt, x(2).toDouble))
val frame: DataFrame = spark.createDataFrame(rowRDD, structType)
val prop = new Properties()
prop.setProperty("user","kb07")
prop.setProperty("password","ok")
prop.setProperty("driver","com.mysql.jdbc.Driver")
frame.write.jdbc("jdbc:mysql://zcy01:3306/test","demo4",prop)
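// write.jdbc fails by default when the target table already exists (SaveMode.ErrorIfExists);
// a sketch of appending to an existing table instead:
frame.write.mode("append").jdbc("jdbc:mysql://zcy01:3306/test","demo4",prop)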
Spark SQL on MySQL
val spark: SparkSession = SparkSession.builder().master("local[2]").appName("mysql")
.getOrCreate()
// TODO configure url, tableName, and connection properties
val url="jdbc:mysql://hadoop101:3306/test"
val tableName="dept_manager"
val prop = new Properties()
prop.setProperty("user","root")
prop.setProperty("password","123456")
prop.setProperty("driver","com.mysql.jdbc.Driver")
// TODO connect to MySQL and load the table into a DataFrame
val mysqlDF: DataFrame = spark.read.jdbc(url,tableName,prop)
mysqlDF.printSchema()
mysqlDF.show()
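// A sketch of pushing a filter down to MySQL by passing a subquery as the table name;
// the dept_no predicate and the alias "t" are hypothetical:
val filteredDF: DataFrame = spark.read.jdbc(url, "(select * from dept_manager where dept_no = 'd001') t", prop)
filteredDF.show()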
Spark SQL Built-in Functions
val spark: SparkSession = SparkSession.builder()
.master("local[*]").appName("test02")
.getOrCreate()
// import implicits
import spark.implicits._
val sc: SparkContext = spark.sparkContext
// TODO import Spark's built-in functions
import org.apache.spark.sql.functions._
val accessLog = Array(
"2020-08-13,1",
"2020-08-13,1",
"2020-08-13,2",
"2020-08-13,2",
"2020-08-13,3",
"2020-08-14,1",
"2020-08-14,1",
"2020-08-14,2",
"2020-08-14,3",
"2020-08-15,1",
"2020-08-15,2",
"2020-08-15,2",
"2020-08-15,3"
)
val accessLogRDD: RDD[Row] = sc.parallelize(accessLog).map(row => {
val fields: Array[String] = row.split(",")
Row(fields(0), fields(1).toInt)
})
val structType = StructType(Array(
StructField("day", StringType),
StructField("userId", IntegerType)
))
val logDF: DataFrame = spark.createDataFrame(accessLogRDD, structType)
logDF.printSchema()
logDF.show()
// TODO Requirement 1: page views (pv) per day
//logDF.groupBy("day").agg(count("userId")).show()
//logDF.groupBy("day").agg(count("userId").as("pv")).show()
logDF.groupBy("day").agg(count("userId").alias("pv")).show()
// TODO Requirement 2: distinct daily visitors (uv)
logDF.groupBy("day").agg(countDistinct("userId").alias("uv")).show()
logDF.createOrReplaceTempView("logs")
spark.sql(
"""
|select day,count(userId) as pv
|from logs
|group by day
""".stripMargin).show()
// Test 2: grouping, aggregation, and sorting on a case-class DataFrame
val spark: SparkSession = SparkSession.builder()
.master("local[*]").appName("test02")
.getOrCreate()
// import implicits
import spark.implicits._
val sc: SparkContext = spark.sparkContext
// TODO import Spark's built-in functions
import org.apache.spark.sql.functions._
// Student is assumed to be a case class matching the columns below,
// e.g. case class Student(id: Int, name: String, gender: String, age: Int), defined outside the method body
val stuDF: DataFrame = Seq(
Student(1001, "zhangsan", "F", 20),
Student(1002, "zhang2", "F", 18),
Student(1003, "zhang3", "M", 20),
Student(1004, "zhang4", "M", 25),
Student(1005, "zhang5", "M", 20),
Student(1006, "zhang6", "F", 20)
).toDF()
stuDF.printSchema()
stuDF.show()
// expected columns: | id| name|gender|age|
// TODO group by gender and compute the average age
stuDF.groupBy("gender").agg(avg("age")).show()
stuDF.groupBy("gender").avg("age").show()
// TODO group by gender and compute the avg, max, and min age
stuDF.groupBy("gender").agg("age"->"avg","age"->"max","age"->"min").show()
// TODO group by both gender and age
stuDF.groupBy("gender","age").count().show()
// TODO sort by age
stuDF.sort("age").show()
stuDF.sort($"age".desc).show()
stuDF.sort($"age".desc,$"id".desc).show()
stuDF.orderBy($"age".desc).limit(2).show()
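// An equivalent multi-aggregation written with column functions (a sketch on the same stuDF):
stuDF.groupBy("gender").agg(avg("age").alias("avgAge"), max("age").alias("maxAge"), min("age").alias("minAge")).show()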
spark.stop()
User-Defined Functions (UDF)
SparkSession.udf.register(): the registered function is only available inside sql()
functions.udf(): the wrapped function works throughout the DataFrame API (see the sketch at the end of this section)
val spark: SparkSession = SparkSession.builder()
.master("local[*]").appName("test02")
.getOrCreate()
// import implicits
import spark.implicits._
val sc: SparkContext = spark.sparkContext
val info: RDD[String] = sc.textFile("file:///D:\\project\\2020\\Scala\\Spark_day0812\\src\\data\\hobbies.txt")
// Hobbies is assumed to be case class Hobbies(name: String, hobbies: String), defined outside the method body
val hobbyDF: DataFrame = info.map(_.split("\t")).map(p => Hobbies(p(0), p(1))).toDF()
hobbyDF.createOrReplaceTempView("hobby")
// sample input line (tab-separated): "lina\ttravel,dance"
// TODO register a UDF and use it in SQL
spark.udf.register("hobby_num",(s:String)=>s.split(",").size)
spark.sql(
"""
|select name,hobbies,hobby_num(hobbies) as hobbyNum
|from hobby
""".stripMargin).show()
spark.stop()