一、DataFrame表
package com. shujia. spark. sql
import org. apache. spark. sql. { DataFrame, SaveMode, SparkSession}
object Demo1WordCount {
  /** Word count over a tab-separated text file, expressed in Spark SQL. */
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .appName("wc")
      .master("local")
      .getOrCreate()

    // Read each physical line of the file as a single string column "line".
    val linesDF: DataFrame = spark
      .read
      .format("csv")
      .option("sep", "\t")
      .schema("line String")
      .load("data/words.txt")

    // Register a temp view so the aggregation can be written as SQL.
    linesDF.createOrReplaceTempView("lines")

    // Split every line on commas, explode into one row per word, count per word.
    val wordCountDF: DataFrame = spark.sql(
      """
        |select word
        |,count(1) as c
        |from
        |(select explode(split(line,",")) word
        |from lines
        |) as a
        |group by word
        |
        |""".stripMargin)

    wordCountDF.show()

    // Persist the counts as tab-separated text, replacing any previous output.
    wordCountDF
      .write
      .format("csv")
      .option("sep", "\t")
      .mode(SaveMode.Overwrite)
      .save("data/wc")
  }
}
二、创建DataFrame表
package com. shujia. spark. sql
import org. apache. spark. sql. { DataFrame, SaveMode, SparkSession}
object Demo2CreateDataFrame {
  /** Builds DataFrames from four sources: CSV, JSON, JDBC, and Parquet. */
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .master("local")
      .appName("df")
      .getOrCreate()

    // 1. CSV with an explicit schema (the file has no header row).
    val csvStudents: DataFrame = spark
      .read
      .format("csv")
      .option("sep", ",")
      .schema("id STRING, name STRING, age INT, gender STRING, clazz STRING")
      .load("data/students.txt")
    csvStudents.printSchema()
    csvStudents.show()

    // 2. JSON — the schema is inferred from the documents themselves.
    val jsonStudents: DataFrame = spark
      .read
      .format("json")
      .load("data/students.json")
    jsonStudents.printSchema()
    jsonStudents.show()

    // 3. JDBC — reads the bigdata.students table from MySQL.
    val jdbcStudents: DataFrame = spark
      .read
      .format("jdbc")
      .option("url", "jdbc:mysql://master:3306")
      .option("dbtable", "bigdata.students")
      .option("user", "root")
      .option("password", "123456")
      .load()
    jdbcStudents.printSchema()
    jdbcStudents.show()

    // 4. Round-trip through Parquet: write the CSV data out, then read it back.
    csvStudents
      .write
      .format("parquet")
      .mode(SaveMode.Overwrite)
      .save("data/parquet")

    val parquetStudents: DataFrame = spark
      .read
      .format("parquet")
      .load("data/parquet")
    parquetStudents.printSchema()
    parquetStudents.show()
  }
}
三、DataFrameAPI
package com. shujia. spark. sql
import org. apache. spark. sql. expressions. Window
import org. apache. spark. sql. { DataFrame, SparkSession}
object Demo3DFApi {
  /**
   * Tour of the DataFrame API: select / where / groupBy / agg / join plus a
   * window-function top-N query, with its SQL equivalent.
   *
   * Fix: the Column equality operator had been garbled to `== =` (a syntax
   * error); restored to Spark's `===`.
   */
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .master("local")
      .appName("api")
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()

    val studentDF: DataFrame = spark
      .read
      .format("csv")
      .option("sep", ",")
      .schema("id STRING,name STRING,age INT,gender STRING,clazz STRING")
      .load("data/students.txt")

    // show() variants: default 20 rows, explicit row count, untruncated cells.
    studentDF.show()
    studentDF.show(100)
    studentDF.show(false)

    // Column selection: by name, by SQL expression string, by Column objects.
    studentDF.select("id", "age").show()
    studentDF.selectExpr("id", "age +1 as age").show()

    import spark.implicits._
    studentDF.select($"id", $"age" + 2 as "age").show()

    // Filtering: SQL predicate text vs. typed Column comparison (===).
    studentDF.where("gender = '男'").show()
    studentDF.where($"gender" === "女").show

    // Head count per class.
    studentDF
      .groupBy($"clazz")
      .count()
      .show()

    import org.apache.spark.sql.functions._
    // Multiple aggregates computed in a single pass.
    studentDF
      .groupBy($"clazz")
      .agg(count($"clazz") as "c", avg($"age") as "avgAge")
      .show()

    val scoreDF: DataFrame = spark
      .read
      .format("csv")
      .option("sep", ",")
      .schema("id STRING, cid STRING, sco INT")
      .load("data/score.txt")

    // Inner join on the shared "id" column.
    val joinDF: DataFrame = studentDF.join(scoreDF, "id")
    joinDF.show()

    // Top 2 students per class by total score, via the DataFrame API.
    joinDF
      .groupBy($"id", $"clazz")
      .agg(sum($"sco") as "sumSco")
      .select($"id", $"clazz", row_number() over Window.partitionBy($"clazz").orderBy($"sumSco".desc) as "r")
      .where($"r" <= 2)
      .show()

    // The same top-2-per-class query expressed in SQL.
    joinDF.createOrReplaceTempView("stu_sco")
    spark.sql(
      """
        |select * from(
        |select
        |id
        |,clazz,
        |sumSco,
        |row_number() over(partition by clazz order by sumSco desc) as r
        |from(
        |select id,clazz,sum(sco) as sumSco
        |from stu_sco
        |group by id,clazz
        |) as a
        |) as b
        |where r <= 2
        |
        |""".stripMargin).show()

    // Total score per student, highest first.
    scoreDF
      .groupBy($"id")
      .agg(sum($"sco") as "sumSco")
      .orderBy($"sumSco".desc)
      .show()
  }
}
四、DataFrameTest
1、统计每个公司每年按月累计收入
package com. shujia. spark. sql
import org. apache. spark. sql. expressions. Window
import org. apache. spark. sql. { Column, DataFrame, SparkSession}
object Demo4Burk {
  /**
   * Per-company (burk) monthly income: unpivots the 12 monthly columns into
   * (month, pic) rows, then computes a running yearly total and a
   * year-over-year ratio with window functions.
   *
   * Fix: the first line of the main SQL literal was missing its `|` margin
   * character, so `stripMargin` left it untouched only by accident; it now
   * carries the margin like every other line. The query text produced after
   * stripMargin is unchanged.
   */
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .master("local")
      .appName("bur1")
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()

    import spark.implicits._
    import org.apache.spark.sql.functions._

    // Wide table: one row per company/year with 12 monthly amount columns.
    val burks: DataFrame = spark
      .read
      .format("csv")
      .option("sep", ",")
      .schema("burk STRING,year STRING,tsl01 DOUBLE,tsl02 DOUBLE,tsl03 DOUBLE,tsl04 DOUBLE,tsl05 DOUBLE,tsl06 DOUBLE,tsl07 DOUBLE,tsl08 DOUBLE,tsl09 DOUBLE,tsl10 DOUBLE,tsl11 DOUBLE,tsl12 DOUBLE")
      .load("data/income.txt")
    burks.show()
    burks.createOrReplaceTempView("burks")

    // explode() on an array: one output row per element.
    spark.sql(
      """
        |
        |select explode(Array(1,2,3,4,5,6,7))
        |""".stripMargin)
      .show()

    // explode() on a map: one output row per (key, value) pair.
    spark.sql(
      """
        |select explode(map('001',23,'002',24,'003',25))
        |""".stripMargin)
      .show()

    // SQL version: unpivot via lateral view explode, then two window sums
    // (running total by month, and the full-year total).
    spark.sql(
      """
        |select
        |burk,year,month,pic,
        |sum(pic) over(partition by burk,year order by month) as sumPic,
        |sum(pic) over(partition by burk,year) as sumP
        |from(
        |select burk,year,month,pic
        |from burks
        |lateral view explode(map(1,tsl01,2,tsl02,3,tsl03,4,tsl04,5,tsl05,6,tsl06,7,tsl07,8,tsl08,9,tsl09,10,tsl10,11,tsl11,12,tsl12)) T as month,pic
        |) as a
        |
        |""".stripMargin)
      .show()

    // Column-API version of the same unpivot: a map literal of month -> column.
    val mapCol: Column = map(
      expr("1"), $"tsl01",
      expr("2"), $"tsl02",
      expr("3"), $"tsl03",
      expr("4"), $"tsl04",
      expr("5"), $"tsl05",
      expr("6"), $"tsl06",
      expr("7"), $"tsl07",
      expr("8"), $"tsl08",
      expr("9"), $"tsl09",
      expr("10"), $"tsl10",
      expr("11"), $"tsl11",
      expr("12"), $"tsl12"
    )

    // explode(map) yields two columns; alias them via the Array overload of as.
    val burksMonthDF: DataFrame = burks
      .select($"burk", $"year", explode(mapCol) as Array("month", "pic"))

    // Running income total within each company/year, ordered by month.
    burksMonthDF
      .withColumn("sumPic", sum($"pic") over Window.partitionBy($"burk", $"year").orderBy("month"))
      .show()

    // Year-over-year ratio: compare each month to the same month of the
    // previous year; coalesce covers the first year (division by the 0.0 default).
    burksMonthDF
      .withColumn("lagPic", lag($"pic", 1, 0.0) over Window.partitionBy($"burk", $"month").orderBy($"year"))
      .withColumn("p", coalesce($"pic" / $"lagPic" - 1, expr("1.0")))
      .show()
  }
}
2、统计人的工作履历
package com. shujia. spark. sql
import org. apache. spark. sql. expressions. Window
import org. apache. spark. sql. { DataFrame, SparkSession}
object Demo5Shebao {
  /**
   * Work-history reconstruction: for each person (id), find the start and end
   * date of each continuous stint at a company (burk).
   *
   * Fixes over the original:
   *  - config key typo "spark.sql.seuffle.partitions" -> "spark.sql.shuffle.partitions"
   *    (the misspelled key was silently ignored, leaving the 200-partition default);
   *  - the garbled `== =` token restored to the Column operator `===`;
   *  - the stint label ("clazz") was computed but never used, so two separate
   *    stints at the same company were merged into one; it now takes part in
   *    the groupBy, and the final select keeps the original output columns.
   */
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .master("local")
      .appName("shebao")
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()

    import spark.implicits._
    import org.apache.spark.sql.functions._

    val dataDF: DataFrame = spark
      .read
      .format("csv")
      .option("sep", ",")
      .schema("id STRING,burk STRING,sdate STRING")
      .load("data/data.txt")

    dataDF
      // Previous company for the same person, in date order (null on first row).
      .withColumn("lastBurk", lag($"burk", 1) over Window.partitionBy($"id").orderBy("sdate"))
      // 1 marks the start of a new stint (company changed or first record), 0 a continuation.
      .withColumn("flag", when($"burk" === $"lastBurk", 0).otherwise(1))
      // Running sum of the flags numbers the stints per person.
      .withColumn("clazz", sum($"flag") over Window.partitionBy($"id").orderBy($"sdate"))
      // Group by the stint label too, so returning to a former company starts a new stint.
      .groupBy($"id", $"burk", $"clazz")
      .agg(min($"sdate") as "start_date", max($"sdate") as "end_date")
      .select($"id", $"burk", $"start_date", $"end_date")
      .show()
  }
}
五、submit-项目在服务器中运行
package com. shujia. spark. sql
import org. apache. spark. sql. { DataFrame, SaveMode, SparkSession}
object Demo06Submit {
  /**
   * Cluster-submit demo: note there is no .master() call here — the master
   * is supplied by spark-submit on the server.
   */
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .appName("submit")
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()

    import spark.implicits._
    import org.apache.spark.sql.functions._

    // Absolute paths ("/data/...") — presumably HDFS paths on the cluster; verify.
    val students: DataFrame = spark
      .read
      .format("csv")
      .option("sep", ",")
      .schema("id STRING,name STRING,age INT,gender STRING,clazz STRING")
      .load("/data/students.txt")

    // Head count per gender.
    val genderCounts: DataFrame = students
      .groupBy($"gender")
      .agg(count($"gender") as "gender_sum")

    // Write the counts back out as CSV, replacing any previous run's output.
    genderCounts
      .write
      .format("csv")
      .option("sep", ",")
      .mode(SaveMode.Overwrite)
      .save("/data/gender_sum")
  }
}
六、对多次使用的DF进行缓存
package com. shujia. spark. sql
import org. apache. spark. sql. { DataFrame, SparkSession}
object Demo07DFCache {
  /**
   * Caching demo: one DataFrame reused by several jobs, plus a broadcast-join
   * hint on a small table.
   */
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .master("local")
      .appName("submit")
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()

    import spark.implicits._
    import org.apache.spark.sql.functions._

    val students: DataFrame = spark
      .read
      .format("csv")
      .option("sep", ",")
      .schema("id STRING,name STRING,age INT,gender STRING,clazz STRING")
      .load("data/students.txt")
    students.show()

    // Cache before the jobs below so the source file is scanned at most once more.
    students.cache()

    // Job 1: students per class.
    students
      .groupBy($"clazz")
      .agg(count($"clazz") as "c")
      .show()

    // Job 2: students per gender — served from the cache.
    students
      .groupBy($"gender")
      .agg(count($"gender") as "c")
      .show()

    val scores: DataFrame = spark
      .read
      .format("csv")
      .option("sep", ",")
      .schema("id STRING, cid STRING, sco INT")
      .load("data/score.txt")
    scores.show()

    // Hint the optimizer to broadcast the (cached) student side of the join.
    students
      .hint("broadcast")
      .join(scores, "id")
      .show()

    // NOTE(review): busy-wait keeps the JVM alive, presumably so the Spark web
    // UI can be inspected after the jobs finish — confirm before reusing elsewhere.
    while (true) {
    }
  }
}
七、Window开窗函数
package com. shujia. spark. sql
import org. apache. spark. sql. { DataFrame, SparkSession}
object Demo08Window {
  /**
   * Window-function examples (count/avg/max over partitions, median trick,
   * DENSE_RANK vs RANK, lag) over the student and score tables.
   *
   * Fix: the DENSE_RANK/RANK query aliased BOTH ranking columns as "r1";
   * the RANK column is now "r2" so the two rankings can be told apart.
   *
   * NOTE(review): several queries below are built but never executed — Spark
   * SQL is lazy, so without an action such as .show() they produce no output.
   * Left as in the original to preserve behavior.
   */
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .master("local")
      .appName("window")
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()

    import spark.implicits._
    import org.apache.spark.sql.functions._

    val studentDF: DataFrame = spark
      .read
      .format("csv")
      .option("sep", ",")
      .schema("id STRING,name STRING,age INT,gender STRING,clazz STRING")
      .load("data/students.txt")
    studentDF.createOrReplaceTempView("student")

    val scoreDF: DataFrame = spark
      .read
      .format("csv")
      .option("sep", ",")
      .schema("id STRING,cid STRING,sco INT")
      .load("data/score.txt")
    scoreDF.createOrReplaceTempView("score")

    // Class size next to every student row (no action: never executed).
    spark.sql(
      """
        |select
        |id,name,age,gender,clazz,
        |count(1) over(partition by clazz) as c
        |from student
        |""".stripMargin)

    // Students whose total score beats the average of their grade
    // (grade = first characters of the class name). No action: never executed.
    spark.sql(
      """
        |select * from (
        |select
        |a.id,b.name,b.age,b.gender,b.clazz,sumSco
        |,avg(sumSco) over(partition by subStr(clazz,0,2)) as avgSco
        |from (
        |select id
        |,sum(sco) as sumSco
        |from score
        |group by id
        |) as a
        |join student as b
        |on a.id=b.id
        |) as c
        |where sumSco > avgSco
        |""".stripMargin)

    // Gap to the best total score within each class. No action: never executed.
    spark.sql(
      """
        |select *,
        |maxSco-sumSco from(
        |select
        |a.id,a.sumSco,b.name,b.age,b.clazz,b.gender,
        |max(sumSco) over (partition by clazz) as maxSco
        |from (
        |select id,sum(sco) as sumSco from score
        |group by id
        |) as a
        |join student as b
        |on a.id=b.id
        |)as c
        |""".stripMargin)

    // Median score per course: pick the row whose row_number equals half the
    // partition size.
    spark.sql(
      """
        |select *,
        |sum(case when round(s/2)=r then sco else 0 end) over (partition by cid) as med
        |from (
        |select
        |*,
        |row_number() over (partition by cid order by sco) as r,
        |count(1) over (partition by cid) as s
        |from score
        |)as a
        |""".stripMargin)
      .show(10000)

    // DENSE_RANK (no gaps) vs RANK (gaps after ties). No action: never executed.
    spark.sql(
      """
        |select *,
        |DENSE_RANK() over (partition by cid order by sco) as r1,
        |RANK() over (partition by cid order by sco) as r2
        |from score
        |""".stripMargin)

    // Difference between each student's total and the previous total (lag).
    spark.sql(
      """
        |select *,
        |sumSco-lagSco as chaSco
        |from (
        |select
        |*,
        |lag(sumSco,1) over (order by sumSco) lagSco
        |from (
        |select id,sum(sco) as sumSco from score
        |group by id
        |) as b
        |) as c
        |""".stripMargin)
      .show()
  }
}
八、DFToRDD&RDDToDF
package com. shujia. spark. sql
import org. apache. spark. SparkContext
import org. apache. spark. rdd. RDD
import org. apache. spark. sql. { DataFrame, Row, SparkSession}
object Demo9DFToRDD {
  /** Converts an RDD to a DataFrame and back, reading Rows in two styles. */
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .master("local")
      .appName("rddondf")
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()

    import spark.implicits._

    val sc: SparkContext = spark.sparkContext

    // RDD -> DataFrame: parse CSV lines into tuples, then name the columns.
    val lineRDD: RDD[String] = sc.textFile("data/students.txt")
    val studentRDD: RDD[(String, String, Int, String, String)] = lineRDD.map(line => {
      val fields = line.split(",")
      (fields(0), fields(1), fields(2).toInt, fields(3), fields(4))
    })
    val studentDF: DataFrame = studentRDD.toDF("id", "name", "age", "gender", "clazz")
    studentDF.printSchema()
    studentDF.show()

    // DataFrame -> RDD of untyped Row objects.
    val stuRDD: RDD[Row] = studentDF.rdd

    // Style 1: pull each field out of the Row by column name.
    val kvRDD: RDD[(String, String, Int, String, String)] = stuRDD.map(row => {
      val id = row.getAs[String]("id")
      val name = row.getAs[String]("name")
      val age = row.getAs[Int]("age")
      val gender = row.getAs[String]("gender")
      val clazz = row.getAs[String]("clazz")
      (id, name, age, gender, clazz)
    })
    kvRDD.foreach(println)

    // Style 2: destructure the Row with a pattern match.
    val caseRDD: RDD[(String, String, Int, String, String)] = stuRDD.map {
      case Row(id: String, name: String, age: Int, gender: String, clazz: String) =>
        (id, name, age, gender, clazz)
    }
    caseRDD.foreach(println)
  }
}
九、自定义UDF函数
package com. shujia. spark. sql
import org. apache. spark. sql. { DataFrame, SparkSession}
import org. apache. spark. sql. expressions. UserDefinedFunction
object Demo10UDF {
  /** Defines a UDF and calls it from both the DataFrame API and SQL text. */
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .master("local")
      .appName("udf")
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()

    import spark.implicits._
    import org.apache.spark.sql.functions._

    // UDF that adds one to an age.
    val ageAdd: UserDefinedFunction = udf((age: Int) => age + 1)
    // Registering under a name makes the same function callable from SQL.
    spark.udf.register("ageAdd", ageAdd)

    val studentDF: DataFrame = spark
      .read
      .format("csv")
      .option("sep", ",")
      .schema("id STRING,name STRING, age INT, gender STRING, clazz STRING")
      .load("data/students.txt")
    studentDF.createOrReplaceTempView("student")

    // DataFrame API usage.
    studentDF
      .select($"id", ageAdd($"age") as "age")
      .show()

    // SQL usage of the registered function.
    spark.sql(
      """
        |select id,ageAdd(age) from student
        |
        |""".stripMargin)
      .show()
  }
}
十、DFTest
package com. shujia. spark. sql
import org. apache. spark. sql. expressions. Window
import org. apache. spark. sql. { DataFrame, Dataset, Row, SparkSession}
object Demo11CarBon {
  /**
   * Ant Forest exercise over user low-carbon records and plant costs.
   *
   * Fix: three occurrences of the Column equality operator were garbled to
   * `== =` (syntax errors); restored to `===`.
   *
   * NOTE(review): dates are compared as raw "yyyy/M/d" strings (e.g.
   * $"data_dt" < "2017/10/1"); without zero padding, string order does not
   * match chronological order ("2017/11/1" < "2017/2/1") — confirm against
   * the dataset before relying on the date filter.
   */
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .master("local")
      .appName("udf")
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()

    import spark.implicits._
    import org.apache.spark.sql.functions._

    val lowCarBon: DataFrame = spark
      .read
      .format("csv")
      .option("sep", "\t")
      .schema("user_id STRING, data_dt STRING, low_carbon INT")
      .load("data/ant_user_low_carbon.txt")

    val plantCarBon: DataFrame = spark
      .read
      .format("csv")
      .option("sep", "\t")
      .schema("plant_id STRING, plant_name STRING, plant_carbon INT")
      .load("data/ant_plant_carbon.txt")

    lowCarBon.show()
    plantCarBon.show()

    // Single-row lookup tables for the two plant types used in question 1.
    val huyang: DataFrame = plantCarBon.where($"plant_id" === "p004")
    val shaliu: DataFrame = plantCarBon.where($"plant_id" === "p002")

    // Q1: top-10 users by how many shaliu plants the energy left after buying
    // one huyang can afford, plus the lead over the next-ranked user.
    lowCarBon
      .where($"data_dt" < "2017/10/1" and $"data_dt" >= "2017/1/1")
      .groupBy($"user_id")
      .agg(sum($"low_carbon") as "sum_low_carbon")
      .crossJoin(huyang)
      .select($"user_id", $"sum_low_carbon" - $"plant_carbon" as "sy_low_carbon")
      .crossJoin(shaliu)
      .withColumn("plant_count", floor($"sy_low_carbon" / $"plant_carbon"))
      .withColumn("less_count", lead($"plant_count", 1, 0) over Window.orderBy($"plant_count".desc))
      .withColumn("less_num", $"plant_count" - $"less_count")
      .limit(10)
      .select($"user_id", $"plant_count", $"less_num")
      .show()

    // Q2: users who collected more than 100g on at least 3 consecutive days in 2017.
    lowCarBon
      .where(year(regexp_replace($"data_dt", "/", "-")) === "2017")
      .groupBy($"user_id", $"data_dt")
      .agg(sum($"low_carbon") as "sum_low_carbon")
      .where($"sum_low_carbon" > 100)
      // Row number per user by date; date minus row number is constant within
      // a run of consecutive days, which labels the run ("clazz").
      .withColumn("r", row_number() over Window.partitionBy($"user_id").orderBy($"data_dt"))
      .withColumn("clazz", expr("date_sub(regexp_replace(data_dt,'/','-'),r)"))
      .withColumn("clazz_count", count($"user_id") over Window.partitionBy($"user_id", $"clazz"))
      .where($"clazz_count" >= 3)
      // Join back for the individual records of the qualifying days.
      .join(lowCarBon, Seq("user_id", "data_dt"))
      .select($"user_id", $"data_dt", $"low_carbon")
      .show()
  }
}