Big Data Learning - Spark
1.Spark-core
1.Demo1WordCount
package com.shujia.core
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object Demo1WordCount {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName("wc")
    val sparkContext: SparkContext = new SparkContext(conf)
    val wordsLine: RDD[String] = sparkContext.textFile("spark/data/words.txt")
    val words: RDD[String] = wordsLine.flatMap(_.split("\\|"))
    val wordsTuple2: RDD[(String, Int)] = words.map((_, 1))
    val wordsTuple2Group: RDD[(String, Iterable[(String, Int)])] = wordsTuple2.groupBy(_._1)
    val wordCount: RDD[(String, Int)] = wordsTuple2Group.map((kv: (String, Iterable[(String, Int)])) => (kv._1, kv._2.size))
    wordCount.saveAsTextFile("spark/data/word_count")
  }
}
2.Demo2Partition
package com.shujia.core
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
object Demo2Partition {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName("wc")
    val sparkContext: SparkContext = new SparkContext(conf)
    val wordsLine: RDD[String] = sparkContext.textFile("spark/data/ws/*")
    println(s"wordsLineRDD分区数是: ${wordsLine.getNumPartitions}")
    val words: RDD[String] = wordsLine.flatMap(_.split("\\|"))
    println(s"wordsRDD分区数是: ${words.getNumPartitions}")
    val wordsTuple2: RDD[(String, Int)] = words.map((_, 1))
    println(s"wordsTuple2RDD分区数是: ${wordsTuple2.getNumPartitions}")
    val wordsTuple2Group: RDD[(String, Iterable[(String, Int)])] = wordsTuple2.groupBy(_._1, 5)
    println(s"wordsTuple2GroupRDD分区数是: ${wordsTuple2Group.getNumPartitions}")
    val wordCount: RDD[(String, Int)] = wordsTuple2Group.map((kv: (String, Iterable[(String, Int)])) => (kv._1, kv._2.size))
    println(s"wordCountRDD分区数是: ${wordCount.getNumPartitions}")
    wordCount.saveAsTextFile("spark/data/word_count2")
  }
}
The five key properties of an RDD (a must-know for interviews!!) 1) An RDD is made up of a set of partitions. When a file is read, the RDD gets one partition per block. Note: by default every RDD in a job has the same number of partitions, before and after a shuffle alike; the number is decided when the data is first loaded.
2) Functions actually operate on the partitions of an RDD. Each partition is processed by one task, so there are as many tasks as there are partitions. Note: in Spark these functions are called operators (transformation operators: RDD -> RDD; action operators: RDD -> some other data type).
3) RDDs depend on one another: the data in a downstream RDD is computed from the result of the upstream RDD, and data flows between RDDs like water. Notes: **3.1 There are two kinds of dependencies between RDDs: a. narrow dependency - each partition of the downstream RDD maps to exactly one partition of the upstream RDD (a one-to-one relationship).**
**b. wide dependency - a partition of the downstream RDD is built from several partitions of the upstream RDD (a one-to-many relationship); this requires a shuffle.**
**3.2 Because of these dependencies, the whole job is split into stages: sumNum(stage) = Num(wide dependencies) + 1.**
**3.3 With narrow dependencies the partition count cannot change (it is fixed by the first RDD); with wide dependencies the partition count can be set on the operator that triggers the shuffle (see the sketch after this list).**
4) Partitioner-based operators such as groupByKey and reduceByKey can only be applied to key-value RDDs. 5) Spark gives each task the preferred location of its data: move the computation rather than the data.
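A minimal sketch of how the wide-dependency count shows up as stages, assuming the same words.txt input as above (the object name DemoStages is illustrative, not part of the original project): the lineage has one shuffle boundary, so the job has Num(wide dependencies) + 1 = 2 stages, and the partition count is set on the shuffle operator.
package com.shujia.core
import org.apache.spark.{SparkConf, SparkContext}
object DemoStages {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("stages"))
    val words = sc.textFile("spark/data/words.txt").flatMap(_.split("\\|")) // narrow dependency
    val counts = words.map((_, 1)).reduceByKey(_ + _, 5)                    // wide dependency: shuffle, 5 partitions
    println(counts.toDebugString) // the printed lineage shows one ShuffledRDD boundary -> 2 stages
    counts.count()
  }
}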
3.Demo3Map
package com.shujia.core
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object Demo3Map {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName("map算子演示")
    val context = new SparkContext(conf)
    val studentRDD: RDD[String] = context.textFile("spark/data/students.csv")
    val splitRDD: RDD[List[String]] = studentRDD.map((s: String) => {
      println("============数加防伪码================")
      s.split(",").toList
    })
  }
}
4.Demo4Filter
package com.shujia.core
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
object Demo4Filter {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName("map算子演示")
    val context = new SparkContext(conf)
    val studentRDD: RDD[String] = context.textFile("spark/data/students.csv")
    val filterRDD: RDD[String] = studentRDD.filter((s: String) => {
      val strings: Array[String] = s.split(",")
      "男".equals(strings(3))
    })
    filterRDD.foreach(println)
  }
}
5.Demo5flatMap
package com.shujia.core
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object Demo5flatMap {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName("flatMap算子演示")
    val context = new SparkContext(conf)
    val linesRDD: RDD[String] = context.textFile("spark/data/words.txt")
    val wordsRDD: RDD[String] = linesRDD.flatMap((line: String) => line.split("\\|"))
    wordsRDD.foreach(println)
  }
}
6.Demo6sample
package com.shujia.core
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object Demo6sample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName("flatMap算子演示")
    val context = new SparkContext(conf)
    val studentRDD: RDD[String] = context.textFile("spark/data/students.csv")
    val sampleRDD: RDD[String] = studentRDD.sample(withReplacement = true, 0.1)
    sampleRDD.foreach(println)
  }
}
7.Demo7GroupBy
package com.shujia.core
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
object Demo7GroupBy {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName("groupBy算子演示")
    val context = new SparkContext(conf)
    val studentRDD: RDD[String] = context.textFile("spark/data/students.csv")
    val splitRDD: RDD[Array[String]] = studentRDD.map((s: String) => s.split(","))
    val clazzWithAgeRDD: RDD[(String, Int)] = splitRDD.map {
      case Array(_, _, age: String, _, clazz: String) => (clazz, age.toInt)
    }
    val kvRDD: RDD[(String, Iterable[(String, Int)])] = clazzWithAgeRDD.groupBy(_._1)
    val clazzAvgAgeRDD: RDD[(String, Double)] = kvRDD.map {
      case (clazz: String, itr: Iterable[(String, Int)]) =>
        val allAge: Iterable[Int] = itr.map((kv: (String, Int)) => kv._2)
        val avgAge: Double = allAge.sum.toDouble / allAge.size
        (clazz, avgAge)
    }
    clazzAvgAgeRDD.foreach(println)
    // endless loop only keeps the application alive so the Spark web UI can be inspected
    while (true) {
    }
  }
}
8.Demo8GroupByKey
package com.shujia.core
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
object Demo8GroupByKey {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName("groupByKey算子演示")
    val context = new SparkContext(conf)
    val studentRDD: RDD[String] = context.textFile("spark/data/students.csv")
    val splitRDD: RDD[Array[String]] = studentRDD.map((s: String) => s.split(","))
    val clazzWithAgeRDD: RDD[(String, Int)] = splitRDD.map {
      case Array(_, _, age: String, _, clazz: String) => (clazz, age.toInt)
    }
    val kvRDD: RDD[(String, Iterable[Int])] = clazzWithAgeRDD.groupByKey()
    val clazzAvgAgeRDD: RDD[(String, Double)] = kvRDD.map {
      case (clazz: String, ageItr: Iterable[Int]) =>
        (clazz, ageItr.sum.toDouble / ageItr.size)
    }
    clazzAvgAgeRDD.foreach(println)
    while (true) {
    }
  }
}
Differences between groupBy and groupByKey (a common Spark interview question)
* 1. Code-level difference: any RDD can call the groupBy operator, but only a key-value RDD can call groupByKey.
* 2. The RDD produced by groupByKey has a simpler structure, which makes downstream processing easier.
* 3. groupByKey performs better and runs faster, because compared with groupBy it shuffles less data.
9.Demo9ReduceByKey
package com.shujia.core
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
object Demo9ReduceByKey {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName("reduceByKey算子演示")
    val context = new SparkContext(conf)
    val studentRDD: RDD[String] = context.textFile("spark/data/students.csv")
    val splitRDD: RDD[Array[String]] = studentRDD.map((s: String) => s.split(","))
    val clazzKVRDD: RDD[(String, Int)] = splitRDD.map {
      case Array(_, _, _, _, clazz: String) => (clazz, 1)
    }
    val countRDD: RDD[(String, Int)] = clazzKVRDD.reduceByKey((x: Int, y: Int) => x + y)
    countRDD.foreach(println)
    while (true) {
    }
  }
}
Differences between reduceByKey and groupByKey
1. reduceByKey adds a map-side pre-aggregation step that groupByKey does not have, so far less data is shuffled and performance is better than groupByKey. 2. In terms of flexibility, reduceByKey is less flexible than groupByKey: for example, reduceByKey cannot compute a variance directly, while groupByKey can do it in a follow-up step (see the sketch below).
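A minimal sketch of that variance case, assuming the same clazzWithAgeRDD of (clazz, age) pairs as in Demo8GroupByKey above; because groupByKey keeps every value of a key, the mean and the variance can both be computed afterwards:
// assumes clazzWithAgeRDD: RDD[(String, Int)] holds (clazz, age) pairs as in Demo8GroupByKey
val varianceRDD: RDD[(String, Double)] = clazzWithAgeRDD
  .groupByKey()
  .map {
    case (clazz, ages) =>
      val mean: Double = ages.sum.toDouble / ages.size
      val variance: Double = ages.map(a => math.pow(a - mean, 2)).sum / ages.size
      (clazz, variance)
  }
varianceRDD.foreach(println)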
10.Demo10Union
package com.shujia.core
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object Demo10Union {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName("Union算子演示")
    val context = new SparkContext(conf)
    val w1RDD: RDD[String] = context.textFile("spark/data/ws/w1.txt")
    val w2RDD: RDD[String] = context.textFile("spark/data/ws/w2.txt")
    val unionRDD: RDD[String] = w1RDD.union(w2RDD)
    println(unionRDD.getNumPartitions)
    unionRDD.foreach(println)
    while (true) {
    }
  }
}
11.Demo11Join
package com.shujia.core
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object Demo11Join {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName("Join算子演示")
    val context = new SparkContext(conf)
    val rdd1: RDD[(String, String)] = context.parallelize(
      List(
        ("1001", "尚平"),
        ("1002", "丁义杰"),
        ("1003", "徐昊宇"),
        ("1004", "包旭"),
        ("1005", "朱大牛"),
        ("1006", "汪权")
      )
    )
    val rdd2: RDD[(String, String)] = context.parallelize(
      List(
        ("1001", "崩坏"),
        ("1002", "原神"),
        ("1003", "王者"),
        ("1004", "修仙"),
        ("1005", "学习"),
        ("1007", "敲代码")
      )
    )
    val leftJoinRDD: RDD[(String, (String, Option[String]))] = rdd1.leftOuterJoin(rdd2)
    val leftJoinRDD2: RDD[(String, String, String)] = leftJoinRDD.map {
      case (id: String, (name: String, Some(like))) => (id, name, like)
      case (id: String, (name: String, None)) => (id, name, "无爱好")
    }
    leftJoinRDD2.foreach(println)
    println("=================================")
    val fullJoinRDD: RDD[(String, (Option[String], Option[String]))] = rdd1.fullOuterJoin(rdd2)
    val fullJoinRDD2: RDD[(String, String, String)] = fullJoinRDD.map {
      case (id: String, (Some(name), Some(like))) => (id, name, like)
      case (id: String, (Some(name), None)) => (id, name, "无爱好")
      case (id: String, (None, Some(like))) => (id, "无姓名", like)
    }
    fullJoinRDD2.foreach(println)
  }
}
12.Demo12Student
package com.shujia.core
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object Demo12Student {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName("Join算子演示")
    val context = new SparkContext(conf)
    val scoreRDD: RDD[(String, String, String)] = context.textFile("spark/data/score.txt")
      .map((s: String) => s.split(","))
      .filter((arr: Array[String]) => arr.length == 3)
      .map {
        case Array(sid: String, subject_id: String, score: String) => (sid, subject_id, score)
      }
    val sumScoreWithSidRDD: RDD[(String, Int)] = scoreRDD.map {
      case (sid: String, _: String, score: String) => (sid, score.toInt)
    }.reduceByKey((x: Int, y: Int) => x + y)
    val sumScoreTop10: Array[(String, Int)] = sumScoreWithSidRDD.sortBy(-_._2).take(10)
    val ids: Array[String] = sumScoreTop10.map(_._1)
    val top10StuScore: RDD[(String, String, String)] = scoreRDD.filter {
      case (id: String, _, _) => ids.contains(id)
    }
    top10StuScore.foreach(println)
  }
}
13.Demo13MapValues
package com.shujia.core
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
object Demo13MapValues {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName("Join算子演示")
    val context = new SparkContext(conf)
    val scoreRDD: RDD[(String, String, String)] = context.textFile("spark/data/score.txt")
      .map((s: String) => s.split(","))
      .filter((arr: Array[String]) => arr.length == 3)
      .map {
        case Array(sid: String, subject_id: String, score: String) => (sid, subject_id, score)
      }
    val sumScoreWithSidRDD: RDD[(String, Int)] = scoreRDD.map {
      case (sid: String, _: String, score: String) => (sid, score.toInt)
    }.reduceByKey((x: Int, y: Int) => x + y)
    val resRDD: RDD[(String, Int)] = sumScoreWithSidRDD.mapValues(_ + 1000)
    resRDD.foreach(println)
    val res2RDD: RDD[(String, Int)] = sumScoreWithSidRDD.map((kv: (String, Int)) => (kv._1, kv._2 + 1000))
  }
}
14.Demo14mapPartition
package com.shujia.core
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
object Demo14mapPartition {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName("mapPartition算子演示")
    val context = new SparkContext(conf)
    val scoreRDD: RDD[String] = context.textFile("spark/data/ws/*")
    println(scoreRDD.getNumPartitions)
    scoreRDD.mapPartitionsWithIndex {
      case (index: Int, itr: Iterator[String]) =>
        println(s"当前所处理的分区编号是: ${index}")
        itr.flatMap(_.split("\\|"))
    }.foreach(println)
  }
}
15.Demo15Actions
package com.shujia.core
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object Demo15Actions {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName("Action算子演示")
    val context = new SparkContext(conf)
    val studentRDD: RDD[String] = context.textFile("spark/data/students.csv")
    println("$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$")
    val studentsRDD: RDD[(String, String, String, String, String)] = studentRDD.map(_.split(","))
      .map {
        case Array(id: String, name: String, age: String, gender: String, clazz: String) =>
          println("**************************** 数加防伪码 ^_^ ********************************")
          (id, name, age, gender, clazz)
      }
    println("$$$$$$$$$$$$$$$$$$$$$$***__***$$$$$$$$$$$$$$$$$$$$$$$$$")
    val tuples: Array[(String, String, String, String, String)] = studentsRDD.collect()
  }
}
16.Demo16Catch
package com.shujia.core
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel
object Demo16Catch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName("Action算子演示")
    val context = new SparkContext(conf)
    context.setCheckpointDir("spark/data/checkpoint")
    val linesRDD: RDD[String] = context.textFile("spark/data/students.csv")
    val splitRDD: RDD[Array[String]] = linesRDD.map(_.split(","))
    val studentsRDD: RDD[(String, String, String, String, String)] = splitRDD.map {
      case Array(id: String, name: String, age: String, gender: String, clazz: String) =>
        (id, name, age, gender, clazz)
    }
    studentsRDD.checkpoint()
    val clazzKVRDD: RDD[(String, Int)] = studentsRDD.map {
      case (_, _, _, _, clazz: String) => (clazz, 1)
    }
    val clazzNumRDD: RDD[(String, Int)] = clazzKVRDD.reduceByKey(_ + _)
    clazzNumRDD.saveAsTextFile("spark/data/clazz_num")
    val genderKVRDD: RDD[(String, Int)] = studentsRDD.map {
      case (_, _, _, gender: String, _) => (gender, 1)
    }
    val genderNumRDD: RDD[(String, Int)] = genderKVRDD.reduceByKey(_ + _)
    genderNumRDD.saveAsTextFile("spark/data/gender_num")
  }
}
17.Demo17SparkStandaloneSubmit
package com.shujia.core
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object Demo17SparkStandaloneSubmit {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    val sparkContext = new SparkContext(conf)
    val linesRDD: RDD[String] = sparkContext.parallelize(List("java,hello,world", "hello,scala,spark", "java,hello,spark"))
    val wordRDD: RDD[String] = linesRDD.flatMap(_.split(","))
    val wordKVRDD: RDD[(String, Int)] = wordRDD.map((_, 1))
    val countRDD: RDD[(String, Int)] = wordKVRDD.reduceByKey(_ + _)
    countRDD.foreach(println)
  }
}
18.Demo18SparkYarnSubmit
package com.shujia.core
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object Demo18SparkYarnSubmit {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setAppName("yarn submit")
    val context = new SparkContext(conf)
    val linesRDD: RDD[String] = context.textFile("/bigdata29/data/students.csv")
    println("=" * 100)
    println(s"分区数为: ${linesRDD.getNumPartitions}")
    println("=" * 100)
    val classKVRDD: RDD[(String, Int)] = linesRDD.map((line: String) => {
      val clazz: String = line.split(",")(4)
      (clazz, 1)
    })
    val clazzNumRDD: RDD[(String, Int)] = classKVRDD.reduceByKey(_ + _)
    val resRDD: RDD[String] = clazzNumRDD.map((kv: (String, Int)) => s"${kv._1}\t${kv._2}")
    // delete the output directory first if it already exists, otherwise saveAsTextFile fails
    val hadoopConf = new Configuration()
    val fileSystem: FileSystem = FileSystem.get(hadoopConf)
    if (fileSystem.exists(new Path("/bigdata29/sparkout1"))) {
      fileSystem.delete(new Path("/bigdata29/sparkout1"), true)
    }
    resRDD.saveAsTextFile("/bigdata29/sparkout1")
  }
}
19.Demo19PI
package com.shujia.core
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import scala.util.Random
object Demo19PI {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setAppName("yarn submit")
    val context = new SparkContext(conf)
    val list: Range.Inclusive = 0 to 1000000000
    val rangeRDD: RDD[Int] = context.parallelize(list)
    // Monte Carlo: throw random points into the [-1,1] x [-1,1] square
    val dianRDD: RDD[(Double, Double)] = rangeRDD.map((i: Int) => {
      val x: Double = Random.nextDouble() * 2 - 1
      val y: Double = Random.nextDouble() * 2 - 1
      (x, y)
    })
    // keep only the points that fall inside the unit circle
    val yuanZuoRDD: RDD[(Double, Double)] = dianRDD.filter {
      case (x: Double, y: Double) =>
        x * x + y * y < 1
    }
    println("=" * 100)
    println(s"PI的值为: ${(yuanZuoRDD.count().toDouble / dianRDD.count()) * 4}")
    println("=" * 100)
  }
}
20.Demo20Accumulator
package com.shujia.core
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.util.LongAccumulator
object Demo20Accumulator {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName("map算子演示")
    val context = new SparkContext(conf)
    val studentRDD: RDD[String] = context.textFile("spark/data/students.csv")
    val scoreRDD: RDD[String] = context.textFile("spark/data/score.txt")
  }
}
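The demo above only loads the two RDDs and never uses the imported LongAccumulator. A minimal sketch of the typical usage, reusing the context and studentRDD defined above: the counter is incremented on the executors and its value is read back on the driver after an action has run.
// assumes context and studentRDD from Demo20Accumulator above
val lineCount: LongAccumulator = context.longAccumulator("lineCount")
studentRDD.foreach((line: String) => {
  lineCount.add(1) // executed on the executors
})
// the accumulated value is only reliable on the driver, after the action has finished
println(s"total lines: ${lineCount.value}")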
21.Demo21Broadcast
package com.shujia.core
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import scala.io.Source
object Demo21Broadcast {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName("广播变量演示")
    val context = new SparkContext(conf)
    val studentsMap: Map[String, String] = Source.fromFile("spark/data/students.csv")
      .getLines()
      .toList
      .map((line: String) => {
        val infos: Array[String] = line.split(",")
        val stuInfo: String = infos.mkString(",")
        (infos(0), stuInfo)
      }).toMap
    val scoresRDD: RDD[String] = context.textFile("spark/data/score.txt")
    val studentsMapBroadcast: Broadcast[Map[String, String]] = context.broadcast(studentsMap)
    val resMapRDD: RDD[(String, String)] = scoresRDD.map((score: String) => {
      val id: String = score.split(",")(0)
      val stuMap: Map[String, String] = studentsMapBroadcast.value
      val studentInfo: String = stuMap.getOrElse(id, "无学生信息")
      (score, studentInfo)
    })
    resMapRDD.foreach(println)
  }
}
2.Spark-sql
Demo1WordCount
package com.shujia.sql
import org.apache.spark.sql.{DataFrame, Dataset, Row, SaveMode, SparkSession}
object Demo1WordCount {
  def main(args: Array[String]): Unit = {
    val sparkSession: SparkSession = SparkSession.builder()
      .master("local")
      .appName("wc spark sql")
      .getOrCreate()
    val linesDF: DataFrame = sparkSession.read
      .format("csv")
      .schema("line STRING")
      .option("sep", "\n")
      .load("spark/data/words.txt")
    linesDF.createOrReplaceTempView("lines")
    val resDF: DataFrame = sparkSession.sql(
      """
        |select
        |t1.word as word,
        |count(1) as counts
        |from
        |(select
        | explode(split(line,'\\|')) as word from lines) t1
        | group by t1.word
        |""".stripMargin)
    val resDS: Dataset[Row] = resDF.repartition(1)
    resDS.write
      .format("csv")
      .option("sep", "\t")
      .mode(SaveMode.Overwrite)
      .save("spark/data/sqlout1")
  }
}
Demo2DSLWordCount
package com.shujia.sql
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
object Demo2DSLWordCount {
  def main(args: Array[String]): Unit = {
    val sparkSession: SparkSession = SparkSession.builder()
      .master("local")
      .appName("wc spark sql")
      .getOrCreate()
    val linesDF: DataFrame = sparkSession.read
      .format("csv")
      .schema("line STRING")
      .option("sep", "\n")
      .load("spark/data/words.txt")
    import org.apache.spark.sql.functions._
    import sparkSession.implicits._
    val resultDF: DataFrame = linesDF.select(explode(split($"line", "\\|")) as "word")
      .groupBy($"word")
      .agg(count($"word") as "counts")
    resultDF
      .repartition(1)
      .write
      .format("csv")
      .option("sep", "\t")
      .mode(SaveMode.Overwrite)
      .save("spark/data/sqlout2")
  }
}
Demo3DSLAPI
package com.shujia.sql
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.{DataFrame, SparkSession}
object Demo3DSLAPI {
  def main(args: Array[String]): Unit = {
    val sparkSession: SparkSession = SparkSession.builder()
      .master("local")
      .appName("dsl语法api演示")
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()
    import org.apache.spark.sql.functions._
    import sparkSession.implicits._
    val stuDF: DataFrame = sparkSession.read
      .format("json")
      .load("spark/data/students.json")
    val scoreDF: DataFrame = sparkSession.read
      .format("csv")
      .option("sep", ",")
      .schema("id STRING,subject_id STRING,score INT")
      .load("spark/data/score.txt")
    val joinStuAndScoreWithIDDF: DataFrame = stuDF.join(scoreDF, "id")
    // top 3 total scores per class
    joinStuAndScoreWithIDDF.groupBy($"id", $"clazz")
      .agg(sum($"score") as "sumScore")
      .withColumn("rn", row_number() over Window.partitionBy($"clazz").orderBy($"sumScore".desc))
      .where($"rn" <= 3)
      .show()
  }
}
Demo4DataSourceAPI
package com.shujia.sql
import org.apache.spark.sql.{DataFrame, SparkSession}
object Demo4DataSourceAPI {
  def main(args: Array[String]): Unit = {
    val sparkSession: SparkSession = SparkSession.builder()
      .master("local")
      .appName("dsl语法api演示")
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()
    val stuJsonDF: DataFrame = sparkSession.read
      .format("json")
      .load("spark/data/students2.json")
    val jdDF: DataFrame = sparkSession.read
      .format("jdbc")
      .option("url", "jdbc:mysql://192.168.220.100:3306")
      .option("dbtable", "bigdata29.jd_goods")
      .option("user", "root")
      .option("password", "123456")
      .load()
    jdDF.show(10, truncate = false)
  }
}
Demo5RDDToDF
package com.shujia.sql
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
object Demo5RDDToDF {
  def main(args: Array[String]): Unit = {
    val sparkSession: SparkSession = SparkSession.builder()
      .master("local")
      .appName("RDD和DF互相转换演示")
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()
    import sparkSession.implicits._
    val sc: SparkContext = sparkSession.sparkContext
    val linesRDD: RDD[String] = sc.textFile("spark/data/students.csv")
    val studentsRDD: RDD[(String, String, Int, String, String)] = linesRDD.map((line: String) => line.split(","))
      .map {
        case Array(id: String, name: String, age: String, gender: String, clazz: String) =>
          (id, name, age.toInt, gender, clazz)
      }
    val studentsDF: DataFrame = studentsRDD.toDF("id", "name", "age", "gender", "clazz")
    studentsDF.createOrReplaceTempView("students")
    val resultDF: DataFrame = sparkSession.sql(
      """
        |select
        |clazz,
        |count(1) as number
        |from
        |students
        |group by clazz
        |""".stripMargin)
    val studentsRDD2: RDD[Row] = resultDF.rdd
    studentsRDD2.map {
      case Row(clazz: String, number: Long) =>
        (clazz, number)
    }.foreach(println)
  }
}
Demo6Window
package com.shujia.sql
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.{DataFrame, SparkSession}
object Demo6Window {
  def main(args: Array[String]): Unit = {
    val sparkSession: SparkSession = SparkSession.builder()
      .master("local")
      .appName("开窗函数DSL API演示")
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()
    import org.apache.spark.sql.functions._
    import sparkSession.implicits._
    val studentsDF: DataFrame = sparkSession.read
      .format("csv")
      .option("sep", ",")
      .schema("id STRING,name STRING,age INT,gender STRING,clazz STRING")
      .load("spark/data/students.csv")
    val scoresDF: DataFrame = sparkSession.read
      .format("csv")
      .option("sep", ",")
      .schema("sid STRING,subject_id STRING,score INT")
      .load("spark/data/score.txt")
    val subjectDF: DataFrame = sparkSession.read
      .format("csv")
      .option("sep", ",")
      .schema("subject_id STRING,subject_name STRING,subject_sum_score INT")
      .load("spark/data/subject.csv")
    val joinDF: DataFrame = studentsDF.join(scoresDF, $"id" === $"sid")
    joinDF
      .withColumn("sumScore", sum($"score") over Window.partitionBy($"id"))
      .orderBy($"sumScore".desc)
      .limit(60)
    scoresDF
      .join(subjectDF, "subject_id")
      .where($"score" >= $"subject_sum_score" * 0.6)
      .withColumn("jiGeCounts", count($"sid") over Window.partitionBy($"sid"))
      .where($"jiGeCounts" === 6)
    joinDF
      .withColumn("sumScore", sum($"score") over Window.partitionBy($"id"))
      .withColumn("avgScore", avg($"sumScore") over Window.partitionBy(substring($"clazz", 0, 2)))
      .where($"sumScore" > $"avgScore")
    joinDF
      .groupBy($"id", $"clazz")
      .agg(sum($"score") as "sumScore")
      .withColumn("rn", row_number() over Window.partitionBy($"clazz").orderBy($"sumScore".desc))
      .withColumn("front_score", lag($"sumScore", 1, 750) over Window.partitionBy($"clazz").orderBy($"sumScore".desc))
      .withColumn("cha", $"front_score" - $"sumScore")
      .show(100)
  }
}
Demo7BurksTest1
package com.shujia.sql
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.{Column, DataFrame, SparkSession}
object Demo7BurksTest1 {
  def main(args: Array[String]): Unit = {
    val sparkSession: SparkSession = SparkSession.builder()
      .master("local")
      .appName("公司营收额数据需求演示")
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()
    import org.apache.spark.sql.functions._
    import sparkSession.implicits._
    val burksDF: DataFrame = sparkSession.read
      .format("csv")
      .option("sep", ",")
      .schema("burk STRING,year STRING" +
        ",tsl01 DOUBLE,tsl02 DOUBLE,tsl03 DOUBLE,tsl04 DOUBLE" +
        ",tsl05 DOUBLE,tsl06 DOUBLE,tsl07 DOUBLE,tsl08 DOUBLE" +
        ",tsl09 DOUBLE,tsl10 DOUBLE,tsl11 DOUBLE,tsl12 DOUBLE")
      .load("spark/data/burks.txt")
    // pack the 12 month columns into a map so explode can turn the columns into rows
    val m: Column = map(
      expr("1"), $"tsl01",
      expr("2"), $"tsl02",
      expr("3"), $"tsl03",
      expr("4"), $"tsl04",
      expr("5"), $"tsl05",
      expr("6"), $"tsl06",
      expr("7"), $"tsl07",
      expr("8"), $"tsl08",
      expr("9"), $"tsl09",
      expr("10"), $"tsl10",
      expr("11"), $"tsl11",
      expr("12"), $"tsl12"
    )
    burksDF
      .select($"burk", $"year", explode(m) as Array("month", "tsl"))
      .withColumn("leijia", sum($"tsl") over Window.partitionBy($"burk", $"year").orderBy($"month"))
      .show()
  }
}
Demo8SubmitYarn
package com.shujia.sql
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
object Demo8SubmitYarn {
  def main(args: Array[String]): Unit = {
    val sparkSession: SparkSession = SparkSession.builder()
      .master("local")
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()
    import sparkSession.implicits._
    import org.apache.spark.sql.functions._
    val stuendsDF: DataFrame = sparkSession.read.format("csv").option("sep", ",").schema("id STRING,name STRING,age INT,gender STRING,clazz STRING")
      .load("/bigdata29/spark_in/data/student")
    val genderCountsDF: DataFrame = stuendsDF.groupBy($"gender")
      .agg(count($"gender") as "counts")
    genderCountsDF.write.format("csv").option("sep", ",").mode(SaveMode.Overwrite).save("/bigdata29/spark_out/out2")
  }
}
Demo9SparkOnHive
package com.shujia.sql
import org.apache.spark.sql.SparkSession
object Demo9SparkOnHive {
  def main(args: Array[String]): Unit = {
    val sparkSession: SparkSession = SparkSession.builder()
      .master("local")
      .appName("spark读取hive数据")
      .enableHiveSupport()
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()
    import sparkSession.implicits._
    import org.apache.spark.sql.functions._
    sparkSession.sql("use bigdata29")
    sparkSession.sql("select clazz,count(1) as counts from students group by clazz").show()
  }
}
Demo10Student
package com.shujia.sql
import org.apache.spark.sql.{Column, DataFrame, SaveMode, SparkSession}
object Demo10Student {
  def main(args: Array[String]): Unit = {
    val sparkSession: SparkSession = SparkSession.builder()
      .master("local")
      .appName("行列互相转换演示")
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()
    import org.apache.spark.sql.functions._
    import sparkSession.implicits._
    val tb1DF: DataFrame = sparkSession.read
      .format("csv")
      .option("sep", ",")
      .schema("name STRING,item STRING,score INT")
      .load("/bigdata29/tb1.txt")
  }
}
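The demo above stops after loading tb1DF, while its app name announces row/column conversion. A minimal row-to-column (pivot) sketch under that assumption, reusing the tb1DF of (name, item, score) rows defined above:
// assumes tb1DF from Demo10Student above: name, item, score
val pivotDF: DataFrame = tb1DF
  .groupBy($"name")
  .pivot("item")            // each distinct item becomes its own column
  .agg(sum($"score"))
pivotDF.show()
// the reverse (column-to-row) direction can be done with stack()/explode()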
Demo11UDF
package com.shujia.sql
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.{DataFrame, SparkSession}
object Demo11UDF {
  def main(args: Array[String]): Unit = {
    val sparkSession: SparkSession = SparkSession.builder()
      .master("local")
      .appName("udf函数演示")
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()
    import org.apache.spark.sql.functions._
    val studentsDF: DataFrame = sparkSession.read
      .format("csv")
      .option("sep", ",")
      .schema("id STRING,name STRING,age INT,gender STRING,clazz STRING")
      .load("spark/data/students.csv")
    val shujia_fun1: UserDefinedFunction = udf((str: String) => "数加:" + str)
    studentsDF.createOrReplaceTempView("students")
    sparkSession.udf.register("shujia_str", shujia_fun1)
    sparkSession.sql(
      """
        |select clazz,shujia_str(clazz) as new_clazz from students
        |""".stripMargin).show()
  }
}
Demo12ShuJiaStr
package com.shujia.sql
import org.apache.hadoop.hive.ql.exec.UDF
class Demo12ShuJiaStr extends UDF {
  def evaluate(str: String): String = {
    "shujia: " + str
  }
}
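This class is a Hive-style UDF, so it has to be registered through Hive support before SQL can call it. A hedged usage sketch, assuming a SparkSession created with enableHiveSupport() as in Demo9SparkOnHive; the jar path and the function name shujia_str2 are placeholders, not taken from the original notes:
// register the Hive UDF class packaged in a jar, then call it from SQL
sparkSession.sql(
  """
    |create temporary function shujia_str2 as 'com.shujia.sql.Demo12ShuJiaStr'
    |using jar 'hdfs:///path/to/spark-demo.jar'
    |""".stripMargin)
sparkSession.sql("select shujia_str2(clazz) from students").show()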
Demo13SheBao
package com.shujia.sql
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.{DataFrame, SparkSession}
object Demo13SheBao {
  def main(args: Array[String]): Unit = {
    val sparkSession: SparkSession = SparkSession.builder()
      .master("local")
      .appName("作业社保演示")
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()
    import org.apache.spark.sql.functions._
    import sparkSession.implicits._
    val sheBaoDF: DataFrame = sparkSession.read
      .format("csv")
      .option("sep", ",")
      .schema("id STRING,burk STRING,sdate STRING")
      .load("spark/data/shebao.txt")
    sheBaoDF
      // flag is 1 whenever the company changes compared with the previous month
      .withColumn("last_burk", lag($"burk", 1) over Window.partitionBy($"id").orderBy($"sdate"))
      .withColumn("flag", when($"burk" === $"last_burk", 0).otherwise(1))
      // a running sum of the flags groups consecutive months at the same company together
      .withColumn("tmp", sum($"flag") over Window.partitionBy($"id").orderBy($"sdate"))
      .groupBy($"id", $"burk", $"tmp")
      .agg(min($"sdate") as "start_date", max($"sdate") as "end_start")
      .show(100)
  }
}
Demo14MaYi
package com.shujia.sql
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
object Demo14MaYi {
  def main(args: Array[String]): Unit = {
    val sparkSession: SparkSession = SparkSession.builder()
      .master("local")
      .appName("作业社保演示")
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()
    import org.apache.spark.sql.functions._
    import sparkSession.implicits._
    val ant_user_low_carbon: DataFrame = sparkSession.read
      .format("csv")
      .option("sep", "\t")
      .schema("user_id STRING,data_dt STRING,low_carbon DOUBLE")
      .load("spark/data/ant_user_low_carbon.txt")
    val ant_plant_carbon: DataFrame = sparkSession.read
      .format("csv")
      .option("sep", "\t")
      .schema("plant_id STRING,plant_name STRING,plant_carbon DOUBLE")
      .load("spark/data/ant_plant_carbon.txt")
    val huYangCarbon: Double = ant_plant_carbon
      .where($"plant_name" === "胡杨")
      .select($"plant_carbon")
      .rdd
      .collect()
      .map {
        case Row(plant_carbon: Double) => plant_carbon
      }.head
    val shaLiuCarbon: Double = ant_plant_carbon
      .where($"plant_name" === "沙柳")
      .select($"plant_carbon")
      .rdd
      .collect()
      .map {
        case Row(plant_carbon: Double) => plant_carbon
      }.head
    println(s"胡杨所需的碳排放量: ${huYangCarbon},沙柳所需要的碳排放量: ${shaLiuCarbon}")
    println("---------------------------------------------------------------------------")
    ant_user_low_carbon
      .where($"data_dt" >= "2017/1/1" and $"data_dt" <= "2017/10/1")
      .groupBy($"user_id")
      .agg(sum($"low_carbon") as "sum_low_carbon")
      .withColumn("shengYu_carbon", when($"sum_low_carbon" >= huYangCarbon, $"sum_low_carbon" - huYangCarbon).otherwise($"sum_low_carbon"))
      .withColumn("number", floor($"shengYu_carbon" / shaLiuCarbon))
      .withColumn("lead_number", lead($"number", 1, 0) over Window.orderBy($"number".desc))
      .withColumn("duo", $"number" - $"lead_number")
      .limit(10)
    ant_user_low_carbon
      .where(substring($"data_dt", 0, 4) === "2017")
      .groupBy($"user_id", $"data_dt")
      .agg(sum($"low_carbon") as "sum_low_carbon")
      .where($"sum_low_carbon" > 100)
      .withColumn("rn", row_number() over Window.partitionBy($"user_id").orderBy($"data_dt"))
      .withColumn("flag_dt", date_sub(regexp_replace($"data_dt", "/", "-"), $"rn"))
      .withColumn("lianxu_days", count($"data_dt") over Window.partitionBy($"user_id", $"flag_dt"))
      .where($"lianxu_days" >= 3)
      .join(ant_user_low_carbon, List("user_id", "data_dt"))
      .select($"user_id", $"data_dt", $"low_carbon")
      .show(100)
  }
}
Test
package com.shujia.sql
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.{DataFrame, SparkSession}
object Test {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .master("local[2]")
      .appName("sql")
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()
    import org.apache.spark.sql.functions._
    val str_split: UserDefinedFunction = udf((line: String) => {
      "数加:" + line
    })
    val studentsDF: DataFrame = spark.read
      .format("csv")
      .option("sep", ",")
      .schema("id STRING,name STRING,age INT,gender STRING,clazz STRING")
      .load("spark/data/students.csv")
    studentsDF.createOrReplaceTempView("lines")
    spark.udf.register("str_split", (line: String) => "数加:" + line)
    spark.sql(
      """
        |select str_split(clazz) from lines
        |""".stripMargin).show()
  }
}
3.Spark-streaming
Demo1WordCount
package com.shujia.streaming
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Durations, StreamingContext}
object Demo1WordCount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local[2]")
    conf.setAppName("spark streaming 单词统计案例")
    val sparkContext = new SparkContext(conf)
    val streamingContext = new StreamingContext(sparkContext, Durations.seconds(5))
    val lineDS: ReceiverInputDStream[String] = streamingContext.socketTextStream("master", 12345)
    val flatMapDS: DStream[String] = lineDS.flatMap(_.split(" "))
    val wordsKVDS: DStream[(String, Int)] = flatMapDS.map((_, 1))
    val resultDS: DStream[(String, Int)] = wordsKVDS.reduceByKey((x: Int, y: Int) => x + y)
    resultDS.print()
    streamingContext.start()
    streamingContext.awaitTermination()
    streamingContext.stop()
  }
}
Demo2UpdateStateByKey
package com.shujia.streaming
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Durations, StreamingContext}
object Demo2UpdateStateByKey {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local[2]")
    conf.setAppName("spark streaming 单词统计案例1")
    val sparkContext = new SparkContext(conf)
    val streamingContext = new StreamingContext(sparkContext, Durations.seconds(5))
    streamingContext.checkpoint("spark/data/stream_state_wc_checkpoint")
    val lineDS: ReceiverInputDStream[String] = streamingContext.socketTextStream("master", 12345)
    val flatMapDS: DStream[String] = lineDS.flatMap(_.split(" "))
    val wordsKVDS: DStream[(String, Int)] = flatMapDS.map((_, 1))
    val res2DS: DStream[(String, Int)] = wordsKVDS.updateStateByKey((seq: Seq[Int], option: Option[Int]) => {
      val currentCount: Int = seq.sum
      val count: Int = option.getOrElse(0)
      Option(currentCount + count)
    })
    res2DS.print()
    streamingContext.start()
    streamingContext.awaitTermination()
    streamingContext.stop()
  }
}
Demo3ReduceByKeyAndWindow
package com.shujia.streaming
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Durations, StreamingContext}
object Demo3ReduceByKeyAndWindow {
  def main(args: Array[String]): Unit = {
    val sparkSession: SparkSession = SparkSession.builder()
      .master("local[2]")
      .appName("窗口演示")
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()
    val sparkContext: SparkContext = sparkSession.sparkContext
    val streamingContext = new StreamingContext(sparkContext, Durations.seconds(5))
    val linesDS: ReceiverInputDStream[String] = streamingContext.socketTextStream("master", 12345)
    val wordsKVDS: DStream[(String, Int)] = linesDS.flatMap(_.split(" ")).map((_, 1))
    val res1DS: DStream[(String, Int)] = wordsKVDS.reduceByKeyAndWindow(
      (x: Int, y: Int) => x + y,
      Durations.seconds(10),
      Durations.seconds(10))
    res1DS.print()
    streamingContext.start()
    streamingContext.awaitTermination()
    streamingContext.stop()
  }
}
Demo4DStreamToRDD
package com.shujia.streaming
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Durations, StreamingContext}
object Demo4DStreamToRDD {
  def main(args: Array[String]): Unit = {
    val sparkSession: SparkSession = SparkSession.builder()
      .master("local[2]")
      .appName("DS2RDD演示")
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()
    import sparkSession.implicits._
    import org.apache.spark.sql.functions._
    val sparkContext: SparkContext = sparkSession.sparkContext
    val streamingContext = new StreamingContext(sparkContext, Durations.seconds(5))
    val linesDS: ReceiverInputDStream[String] = streamingContext.socketTextStream("master", 12345)
    val new_linesDS: DStream[String] = linesDS.window(Durations.seconds(15), Durations.seconds(5))
    new_linesDS.foreachRDD((rdd: RDD[String]) => {
      println("===================================================")
      println("正在处理当前批次的数据.....")
      println("===================================================")
      val linesDF: DataFrame = rdd.toDF("line")
      linesDF.createOrReplaceTempView("words")
      sparkSession.sql(
        """
          |select
          |t1.word as word,
          |count(1) as number
          |from
          |(select
          | explode(split(line,' ')) as word
          |from
          |words) t1
          |group by t1.word
          |""".stripMargin).show()
    })
    streamingContext.start()
    streamingContext.awaitTermination()
    streamingContext.stop()
  }
}
Demo5RDDToDStream
package com.shujia.streaming
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.streaming.{Durations, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
object Demo5RDDToDStream {
  def main(args: Array[String]): Unit = {
    val sparkSession: SparkSession = SparkSession.builder()
      .master("local[2]")
      .appName("DS2RDD演示")
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()
    import sparkSession.implicits._
    import org.apache.spark.sql.functions._
    val sparkContext: SparkContext = sparkSession.sparkContext
    val streamingContext = new StreamingContext(sparkContext, Durations.seconds(5))
    val linesDS: ReceiverInputDStream[String] = streamingContext.socketTextStream("master", 12345)
    val resDS: DStream[(String, Long)] = linesDS.transform((rdd: RDD[String]) => {
      val linesDF: DataFrame = rdd.toDF("line")
      linesDF.createOrReplaceTempView("words")
      val resRDD: RDD[(String, Long)] = sparkSession.sql(
        """
          |select
          |t1.word as word,
          |count(1) as number
          |from
          |(select
          | explode(split(line,' ')) as word
          |from
          |words) t1
          |group by t1.word
          |""".stripMargin)
        .rdd
        .map((row: Row) => {
          (row.getAs[String](0), row.getAs[Long](1))
        })
      resRDD
    })
    resDS.print()
    streamingContext.start()
    streamingContext.awaitTermination()
    streamingContext.stop()
  }
}
Demo6Submit
package com.shujia.streaming
import org.apache.spark.SparkContext
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.{Durations, StreamingContext}
object Demo6Submit {
  def main(args: Array[String]): Unit = {
    val sparkSession: SparkSession = SparkSession.builder()
      .appName("提交命令执行")
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()
    val sparkContext: SparkContext = sparkSession.sparkContext
    val streamingContext = new StreamingContext(sparkContext, Durations.seconds(5))
    val linesDS: ReceiverInputDStream[String] = streamingContext.socketTextStream("master", 12345)
    linesDS
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _)
      .print()
    streamingContext.start()
    streamingContext.awaitTermination()
    streamingContext.stop()
  }
}
Demo7SaveToFile
package com.shujia.streaming
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Durations, StreamingContext}
object Demo7SaveToFile {
  def main(args: Array[String]): Unit = {
    val sparkSession: SparkSession = SparkSession.builder()
      .master("local[2]")
      .appName("提交命令执行")
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()
    val sparkContext: SparkContext = sparkSession.sparkContext
    val streamingContext = new StreamingContext(sparkContext, Durations.seconds(5))
    val linesDS: ReceiverInputDStream[String] = streamingContext.socketTextStream("master", 12345)
    val resultDS: DStream[(String, Int)] = linesDS
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _)
      .transform((rdd: RDD[(String, Int)]) => {
        println("=======================")
        println("正在处理批次数据")
        rdd
      })
    resultDS.saveAsTextFiles("spark/data/streams/stream", "txt")
    streamingContext.start()
    streamingContext.awaitTermination()
    streamingContext.stop()
  }
}
Demo8SaveToMysql
package com.shujia.streaming
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Durations, StreamingContext}
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import java.sql.{Connection, DriverManager, PreparedStatement}
object Demo8SaveToMysql {
  def main(args: Array[String]): Unit = {
    val sparkSession: SparkSession = SparkSession.builder()
      .master("local[2]")
      .appName("提交命令执行")
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()
    val sparkContext: SparkContext = sparkSession.sparkContext
    val streamingContext = new StreamingContext(sparkContext, Durations.seconds(5))
    val linesDS: ReceiverInputDStream[String] = streamingContext.socketTextStream("master", 12345)
    linesDS.foreachRDD((rdd: RDD[String]) => {
      println("------------正在处理一批数据-------------------")
      println(s"该批次的分区数: ${rdd.getNumPartitions}")
      rdd.foreachPartition((itr: Iterator[String]) => {
        println("------------数加 防伪码-------------------")
        // one JDBC connection per partition, created on the executor side
        Class.forName("com.mysql.jdbc.Driver")
        val conn: Connection = DriverManager.getConnection(
          "jdbc:mysql://master:3306/bigdata29?useUnicode=true&characterEncoding=utf-8&useSSL=false",
          "root",
          "123456"
        )
        val ps: PreparedStatement = conn.prepareStatement("insert into student values(?,?,?,?,?)")
        itr.foreach((line: String) => {
          println("....正在处理一条数据....")
          val infos: Array[String] = line.split(",")
          val id: Int = infos(0).toInt
          val name: String = infos(1)
          val age: Int = infos(2).toInt
          val gender: String = infos(3)
          val clazz: String = infos(4)
          ps.setInt(1, id)
          ps.setString(2, name)
          ps.setInt(3, age)
          ps.setString(4, gender)
          ps.setString(5, clazz)
          ps.executeUpdate()
        })
        ps.close()
        conn.close()
        println()
      })
    })
    streamingContext.start()
    streamingContext.awaitTermination()
    streamingContext.stop()
  }
}
Demo9CardCondition
package com.shujia.streaming
import com.alibaba.fastjson.{JSON, JSONObject}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Durations, StreamingContext}
import java.sql.{Connection, DriverManager, PreparedStatement}
import java.text.SimpleDateFormat
import java.util.Date
object Demo9CardCondition {
  def main(args: Array[String]): Unit = {
    val sparkSession: SparkSession = SparkSession.builder()
      .master("local[2]")
      .appName("提交命令执行")
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()
    val sparkContext: SparkContext = sparkSession.sparkContext
    val sparkStreaming: StreamingContext = new StreamingContext(sparkContext, Durations.seconds(5))
    val InfoDS: ReceiverInputDStream[String] = sparkStreaming.socketTextStream("master", 12345)
    val carJsonDS: DStream[String] = InfoDS.window(Durations.seconds(15), Durations.seconds(5))
    val cardAndSpeedDS: DStream[(Long, (Double, Int))] = carJsonDS.map((line: String) => {
      val jSONObject: JSONObject = JSON.parseObject(line)
      val cardId: Long = jSONObject.getLong("card")
      val carSpeed: Double = jSONObject.getDouble("speed")
      (cardId, (carSpeed, 1))
    })
    // keep (sum of speeds, car count) while reducing, then compute the average once at the end;
    // averaging inside the reduce function would give a wrong result when more than two records are combined
    val cardSumDS: DStream[(Long, (Double, Int))] = cardAndSpeedDS.reduceByKey((kv1: (Double, Int), kv2: (Double, Int)) => {
      (kv1._1 + kv2._1, kv1._2 + kv2._2)
    })
    val cardConditionDS: DStream[(Long, (Double, Int))] = cardSumDS.mapValues {
      case (sumSpeed: Double, carNumber: Int) => (sumSpeed / carNumber, carNumber)
    }
    cardConditionDS.foreachRDD((rdd: RDD[(Long, (Double, Int))]) => {
      rdd.foreachPartition((itr: Iterator[(Long, (Double, Int))]) => {
        println("------------数加 防伪码-------------------")
        Class.forName("com.mysql.jdbc.Driver")
        val conn: Connection = DriverManager.getConnection(
          "jdbc:mysql://master:3306/bigdata29?useUnicode=true&characterEncoding=utf-8&useSSL=false",
          "root",
          "123456"
        )
        val ps: PreparedStatement = conn.prepareStatement("insert into card_condition values(?,?,?,?)")
        val piCiTime: String = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date())
        itr.foreach((f: (Long, (Double, Int))) => {
          val cardId: Long = f._1
          val avgSpeed: Double = f._2._1
          val carNumber: Int = f._2._2
          ps.setLong(1, cardId)
          ps.setDouble(2, avgSpeed)
          ps.setInt(3, carNumber)
          ps.setString(4, piCiTime)
          ps.executeUpdate()
        })
        ps.close()
        conn.close()
        println()
      })
    })
    sparkStreaming.start()
    sparkStreaming.awaitTermination()
    sparkStreaming.stop()
  }
}
4.Spark-opt
Demo1Cache
package com.shujia.opt
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel
object Demo1Cache {
  def main(args: Array[String]): Unit = {
    val sparkSession: SparkSession = SparkSession.builder()
      .master("local")
      .appName("cache")
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()
    val sparkContext: SparkContext = sparkSession.sparkContext
    val studentsRDD: RDD[String] = sparkContext.textFile("spark/data/students.csv")
    // cache the RDD because two separate jobs below reuse it
    studentsRDD.persist(StorageLevel.MEMORY_AND_DISK_SER)
    studentsRDD.map((line: String) => (line.split(",")(4), 1))
      .reduceByKey(_ + _)
      .saveAsTextFile("spark/data/opt/clazz_num")
    studentsRDD.map((line: String) => (line.split(",")(3), 1))
      .reduceByKey(_ + _)
      .saveAsTextFile("spark/data/opt/gender_num")
    while (true) {
    }
  }
}
Demo2AggregateByKey
package com.shujia.opt
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
object Demo2AggregateByKey {
  def main(args: Array[String]): Unit = {
    val sparkSession: SparkSession = SparkSession.builder()
      .master("local")
      .appName("cache")
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()
    val sparkContext: SparkContext = sparkSession.sparkContext
    val studentsRDD: RDD[String] = sparkContext.textFile("spark/data/students.csv")
    val clazzKVRDD: RDD[(String, Int)] = studentsRDD.map((line: String) => (line.split(",")(4), 1))
    clazzKVRDD.reduceByKey(_ + _).foreach(println)
  }
}
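The demo above is named after aggregateByKey but only calls reduceByKey. A minimal aggregateByKey sketch that does the same class count, reusing the clazzKVRDD defined above (zero value, then the in-partition and cross-partition combine functions):
// assumes clazzKVRDD: RDD[(String, Int)] from Demo2AggregateByKey above
val clazzNumRDD: RDD[(String, Int)] = clazzKVRDD.aggregateByKey(0)(
  (acc: Int, v: Int) => acc + v,   // seqOp: combine values inside one partition
  (a: Int, b: Int) => a + b        // combOp: merge the per-partition results
)
clazzNumRDD.foreach(println)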
Demo3MapPartition
package com.shujia.opt
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import java.text.SimpleDateFormat
import java.util.Date
object Demo3MapPartition {
  def main(args: Array[String]): Unit = {
    val sparkSession: SparkSession = SparkSession.builder()
      .master("local")
      .appName("cache")
      .config("spark.sql.shuffle.partitions", 2)
      .getOrCreate()
    val sparkContext: SparkContext = sparkSession.sparkContext
    val dataRDD: RDD[String] = sparkContext.textFile("spark/data/ant_user_low_carbon.txt")
    val kvRDD: RDD[(String, String, String)] = dataRDD.mapPartitions((itr: Iterator[String]) => {
      itr.map((line: String) => {
        val infos: Array[String] = line.split("\t")
        (infos(0), infos(1), infos(2))
      })
    })
    val resRDD2: RDD[(String, Long, String)] = kvRDD.mapPartitions((itr: Iterator[(String, String, String)]) => {
      // the SimpleDateFormat is created once per partition instead of once per record
      val sdf = new SimpleDateFormat("yyyy/MM/dd")
      println("----------------创建了一个SimpleDateFormat对象----------------")
      itr.map((kv: (String, String, String)) => {
        val dateObj: Date = sdf.parse(kv._2)
        val ts: Long = dateObj.getTime
        (kv._1, ts, kv._3)
      })
    })
    resRDD2.foreach(println)
  }
}
Demo4Coalesce
package com.shujia.opt
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
object Demo4Coalesce {
  def main(args: Array[String]): Unit = {
    val sparkSession: SparkSession = SparkSession.builder()
      .master("local")
      .appName("cache")
      .getOrCreate()
    val sparkContext: SparkContext = sparkSession.sparkContext
    val studentsRDD: RDD[String] = sparkContext.textFile("spark/data/students.csv")
    println(s"studentsRDD的分区数量为: ${studentsRDD.getNumPartitions}")
    val studentsRDD2: RDD[String] = studentsRDD.coalesce(10, shuffle = true)
    println(s"studentsRDD2的分区数量为: ${studentsRDD2.getNumPartitions}")
    studentsRDD2.foreach(println)
    val studentRDD3: RDD[String] = studentsRDD2.coalesce(2, shuffle = false)
    println(s"studentRDD3的分区数量为: ${studentRDD3.getNumPartitions}")
    studentRDD3.foreach(println)
    while (true) {
    }
  }
}
Demo5Coalesce2
package com.shujia.opt
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
object Demo5Coalesce2 {
  def main(args: Array[String]): Unit = {
    val sparkSession: SparkSession = SparkSession.builder()
      .master("local")
      .appName("cache")
      .getOrCreate()
    val sparkContext: SparkContext = sparkSession.sparkContext
    val studentsRDD: RDD[String] = sparkContext.textFile("spark/data/studentsinfo/*")
    println(s"studentsRDD的分区数量为: ${studentsRDD.getNumPartitions}")
    val students2RDD: RDD[String] = studentsRDD.coalesce(1, shuffle = false)
    println(s"students2RDD的分区数量为: ${students2RDD.getNumPartitions}")
    students2RDD.saveAsTextFile("spark/data/studentsinfo2")
  }
}
Demo6MapJoin
package com.shujia.opt
import org.apache.spark.sql.{DataFrame, SparkSession}
object Demo6MapJoin {
  def main(args: Array[String]): Unit = {
    val sparkSession: SparkSession = SparkSession.builder()
      .master("local")
      .appName("cache")
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()
    val studentsDF: DataFrame = sparkSession.read
      .format("csv")
      .option("sep", ",")
      .schema("id STRING,name STRING,age INT,gender STRING,clazz STRING")
      .load("spark/data/students.csv")
    val scoresDF: DataFrame = sparkSession.read
      .format("csv")
      .option("sep", ",")
      .schema("id STRING,subject_id STRING,score INT")
      .load("spark/data/score.txt")
    // broadcast hint: the small students table is broadcast to every executor to avoid a shuffle join
    val resDF: DataFrame = scoresDF.join(studentsDF.hint("broadcast"), "id")
    resDF.show()
    while (true) {
    }
  }
}
Demo7Kryo
package com.shujia.opt
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel
case class Student(id: String, name: String, age: Int, gender: String, clazz: String)
object Demo7Kryo {
  def main(args: Array[String]): Unit = {
    val sparkSession: SparkSession = SparkSession.builder()
      .master("local")
      .appName("cache")
      .config("spark.sql.shuffle.partitions", 1)
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .config("spark.kryo.registrator", "com.shujia.opt.Demo8KryoRegistrator")
      .getOrCreate()
    val sparkContext: SparkContext = sparkSession.sparkContext
    val studentsRDD: RDD[Student] = sparkContext.textFile("spark/data/students.csv").map((line: String) => {
      val infos: Array[String] = line.split(",")
      Student(infos(0), infos(1), infos(2).toInt, infos(3), infos(4))
    })
    studentsRDD.persist(StorageLevel.MEMORY_AND_DISK_SER)
    studentsRDD.map((stu: Student) => (stu.clazz, 1))
      .reduceByKey(_ + _)
      .saveAsTextFile("spark/data/opt/clazz_num")
    studentsRDD.map((stu: Student) => (stu.gender, 1))
      .reduceByKey(_ + _)
      .saveAsTextFile("spark/data/opt/gender_num")
    while (true) {
    }
  }
}
Demo8KryoRegistrator
package com.shujia.opt
import com.esotericsoftware.kryo.Kryo
import org.apache.spark.serializer.KryoRegistrator
class Demo8KryoRegistrator extends KryoRegistrator {
  override def registerClasses(kryo: Kryo): Unit = {
    kryo.register(classOf[Student])
    kryo.register(classOf[String])
    kryo.register(classOf[Int])
  }
}
Demo9FilterKey
package com.shujia.opt
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object Demo9FilterKey {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
      .setMaster("local")
      .setAppName("app")
    val sc: SparkContext = new SparkContext(conf)
    val lines: RDD[String] = sc.textFile("spark/data/ws/*")
    println("第一个RDD分区数量:" + lines.getNumPartitions)
    val countRDD: RDD[(String, Int)] = lines
      .flatMap(_.split("\\|"))
      .map((_, 1))
      .groupByKey()
      .map((x: (String, Iterable[Int])) => (x._1, x._2.toList.sum))
    println("聚合之后RDD分区的数量" + countRDD.getNumPartitions)
    val wordRDD: RDD[(String, Int)] = lines
      .flatMap(_.split(","))
      .map((_, 1))
    // filter out the skewed / useless "null" key before the shuffle
    wordRDD
      .filter(t => !"null".equals(t._1))
      .groupByKey()
      .map(x => (x._1, x._2.toList.sum))
      .foreach(println)
    while (true) {
    }
  }
}
Demo10DoubleReduce
package com.shujia.opt
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import scala.util.Random
object Demo10DoubleReduce {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local").setAppName("app")
    val sc: SparkContext = new SparkContext(conf)
    val lines: RDD[String] = sc.textFile("data/word")
    val wordRDD: RDD[String] = lines
      .flatMap(_.split(","))
      .filter(!_.equals(""))
    // two-stage aggregation: add a random prefix for a local aggregation,
    // then strip the prefix and aggregate globally to ease data skew
    wordRDD
      .map(word => {
        val pix: Int = Random.nextInt(5)
        (pix + "-" + word, 1)
      })
      .groupByKey()
      .map(t => (t._1, t._2.toList.sum))
      .map(t => {
        (t._1.split("-")(1), t._2)
      })
      .groupByKey()
      .map(t => (t._1, t._2.toList.sum))
      .foreach(println)
    while (true) {
    }
  }
}
Demo11DoubleJoin
package com.shujia.opt
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object Demo11DoubleJoin {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("app").setMaster("local")
    val sc = new SparkContext(conf)
    val dataList1 = List(
      ("java", 1),
      ("shujia", 2),
      ("shujia", 3),
      ("shujia", 1),
      ("shujia", 1))
    val dataList2 = List(
      ("java", 100),
      ("java", 99),
      ("shujia", 88),
      ("shujia", 66))
    val RDD1: RDD[(String, Int)] = sc.parallelize(dataList1)
    val RDD2: RDD[(String, Int)] = sc.parallelize(dataList2)
    // sample RDD1 to find the most frequent (skewed) key
    val sampleRDD: RDD[(String, Int)] = RDD1.sample(false, 1.0)
    val skewedKey: String = sampleRDD.map(x => (x._1, 1))
      .reduceByKey(_ + _)
      .map(x => (x._2, x._1))
      .sortByKey(ascending = false)
      .take(1)(0)._2
    // split both RDDs into a skewed part and a common part
    val skewedRDD1: RDD[(String, Int)] = RDD1.filter(tuple => {
      tuple._1.equals(skewedKey)
    })
    val commonRDD1: RDD[(String, Int)] = RDD1.filter(tuple => {
      !tuple._1.equals(skewedKey)
    })
    val skewedRDD2: RDD[(String, Int)] = RDD2.filter(tuple => {
      tuple._1.equals(skewedKey)
    })
    val commonRDD2: RDD[(String, Int)] = RDD2.filter(tuple => {
      !tuple._1.equals(skewedKey)
    })
    val n = 2
    // the skewed part is joined via a broadcast map (map join), the common part via a normal shuffle join
    val skewedMap: Map[String, Int] = skewedRDD2.collect().toMap
    val bro: Broadcast[Map[String, Int]] = sc.broadcast(skewedMap)
    val resultRDD1: RDD[(String, (Int, Int))] = skewedRDD1.map(kv => {
      val word: String = kv._1
      val i: Int = bro.value.getOrElse(word, 0)
      (word, (kv._2, i))
    })
    val resultRDD2: RDD[(String, (Int, Int))] = commonRDD1.join(commonRDD2)
    resultRDD1.union(resultRDD2)
      .foreach(println)
  }
}
}