package com.lenovo.sparkSqlDemo
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession}
// DataFrame API usage examples
object SparkSqlTest {
  // Simple record type kept for reference (not used by the examples below).
  case class User(id: Int, name: String, age: Int)

  def main(args: Array[String]): Unit = {
    // Build the SparkSession (local mode, 3 worker threads).
    val spark = SparkSession.builder().master("local[3]").appName("sparkSqlTest").getOrCreate()
    // Silence Spark's log output. NOTE: this must happen BEFORE any job runs;
    // the original code called it at the very end, after all work completed,
    // which had no effect on the logs actually printed.
    spark.sparkContext.setLogLevel("FATAL")
    // Implicit conversions enabling the $"col" syntax used in the examples.
    import spark.implicits._

    // Names referenced by the commented-out `isin` example below.
    val list = List("suns", "suns2", "suns3")

    // Parse one "int,string,int" CSV line into a Row, splitting only once
    // (the original split each line three times).
    def parseRow(line: String): Row = {
      val Array(id, text, num) = line.split(",")
      Row(id.toInt, text, num.toInt)
    }

    val lines = spark.sparkContext.parallelize(List(
      "1,suns,18",
      "2,suns2,19",
      "3,suns3,20",
      "7,suns7,20",
      "4,suns4,21",
      "5,suns5,22",
      "6,xiaohei,20").map(parseRow))
    val rows = spark.sparkContext.parallelize(List(
      "1,student,0",
      "2,student,0",
      "3,worker,1",
      "8,worker,1",
      "4,worker,2",
      "5,teacher,3",
      "6,teacher,4").map(parseRow))

    // Schemas for the two demo DataFrames (vals — never reassigned).
    val structType = StructType(StructField("id", IntegerType) :: StructField("name", StringType) :: StructField("age", IntegerType) :: Nil)
    val structType2 = StructType(StructField("id", IntegerType) :: StructField("work", StringType) :: StructField("working_years", IntegerType) :: Nil)
    val frame1 = spark.createDataFrame(lines, structType)
    val frame2 = spark.createDataFrame(rows, structType2)

    /* // join + filter usage
    frame1.filter(frame1.col("name")isin(list:_*))
      .join(frame2.filter(frame2.col("id")>2),frame1.col("id")===frame2.col("id"))
      .show() */
    /* // select + where usage
    frame1.select($"id",$"name",$"age"*3 as "age*3")
      .where($"id">2).show() */
    /* // show usage
    // with no argument, only the first 20 rows are displayed
    //frame1.show()
    // display the first n rows
    //frame1.show(3)
    // first n rows; boolean controls truncation of long values (true = truncate)
    //frame2.show(4,true)
    // first n rows; int gives the max display width, longer values are truncated
    frame2.show(4,15) */
    /* // groupBy -> avg(col1) && count(a): count per job title, average seniority, sort by work
    frame2.select($"id",$"work",$"working_years")
      .groupBy($"work")
      // "working_years"->"avg" or ("id","count") — both forms work (and may be mixed)
      .agg("working_years"->"avg",("id","count"))
      //.orderBy(frame2.col("work").desc) // orderBy = global sort (same as sort); sortWithinPartitions = per-partition sort
      //.orderBy($"work".desc)
      .show() */
    /* // per-partition sort
    frame1.sortWithinPartitions("age").show() */
    /* // left join (keep all left rows)
    frame1.join(frame2,Seq("id"),"left").show */
    /* // right join (keep all right rows)
    frame1.join(frame2,Seq("id"),"right").show */
    /* // full outer join (keep all rows)
    frame1.join(frame2,Seq("id"),"full").show */
    /* // inner join (intersection on id)
    frame1.join(frame2,Seq("id"),"inner").show */
    // left-semi join: rows of frame1 whose id also appears in frame2, left columns only
    frame1.join(frame2, Seq("id"), "leftsemi").show
    /* // fetch first 10 rows — very inefficient (pulls data to the driver)
    frame1.head(10)
    frame1.take(10) */
    /* // fetch the first row — very inefficient
    //frame1.first() */
    // agg test
    //frame2.groupBy("work").agg(sum($"working_years") as "totalYears",avg("working_years") as "avgYears").show()

    spark.close()
  }
}
// Spark DataFrame API operations (source article published 2024-10-11 11:21:26)