Dependencies
<dependencies>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming_2.11</artifactId>
        <version>2.1.1</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
        <version>2.1.1</version>
    </dependency>
    <dependency>
        <groupId>redis.clients</groupId>
        <artifactId>jedis</artifactId>
        <version>2.9.0</version>
    </dependency>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.27</version>
    </dependency>
</dependencies>
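The `_2.11` suffix is the Scala binary version, so these artifacts must match the project's Scala version. The `spark-streaming-kafka-0-8` artifact is listed for Kafka sources but is not used by the examples below, which read from a socket.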
1. updateStateByKey
Function signature:
def updateStateByKey[S: ClassTag](
    updateFunc: (Seq[V], Option[S]) => Option[S]): DStream[(K, S)]
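For intuition: Spark invokes updateFunc once per key per batch, handing it all of that key's new values together with the previously saved state, and the returned Option becomes the new state. A minimal sketch with V = Int and state S = Int (word counting):

// seq: the key's values from the current batch; state: the previously
// checkpointed value for that key (None the first time the key appears)
val update = (seq: Seq[Int], state: Option[Int]) => Some(seq.sum + state.getOrElse(0))
// e.g. seq = Seq(1, 1, 1), state = Some(2)  =>  new state Some(5)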
getActiveOrCreate signature: it returns the currently active StreamingContext if one exists; otherwise it rebuilds one from the checkpoint directory, falling back to creatingFunc when there is no usable checkpoint:
def getActiveOrCreate(
    checkpointPath: String,
    creatingFunc: () => StreamingContext,
    hadoopConf: Configuration = SparkHadoopUtil.get.conf,
    createOnError: Boolean = false
  ): StreamingContext = {
  ACTIVATION_LOCK.synchronized {
    getActive().getOrElse { getOrCreate(checkpointPath, creatingFunc, hadoopConf, createOnError) }
  }
}
Goal: cache the result of previous computations so that after an application restart the previous totals are reloaded. Here the stream is read from a socket.
package com.gc.sparkStreaming.day01.transform
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
object RddToDStream {
  // State update function: seq holds the current batch's values for a key,
  // option holds the value accumulated over previous batches
  val updateFun = (seq: Seq[Int], option: Option[Int]) => {
    Some(seq.sum + option.getOrElse(0))
  }
  // Checkpoint directory; a relative path is resolved against fs.defaultFS from core-default.xml
  val checkPoint: String = "./ssss"
  // Function that builds the StreamingContext; all business logic must live in here
  val createStream: () => StreamingContext = () => {
    val conf = new SparkConf().setAppName("rdd").setMaster("local[*]")
    val spark: StreamingContext = new StreamingContext(conf, Seconds(2))
    spark.checkpoint(checkPoint) // updateStateByKey requires a checkpoint directory
    val dStream: ReceiverInputDStream[String] = spark.socketTextStream("hadoop103", 9999)
    val ds: DStream[(String, Int)] = dStream.flatMap(_.split("\\W+")).map((_, 1))
    val res: DStream[(String, Int)] = ds.updateStateByKey(updateFun)
    res.print() // an output operation is required to trigger execution
    spark
  }
  def main(args: Array[String]): Unit = {
    val spark = StreamingContext.getActiveOrCreate(checkPoint, createStream)
    spark.start()
    spark.awaitTermination()
  }
}
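To try this out: feed the socket on hadoop103 (for example with netcat, `nc -lk 9999`, assuming netcat is available on that host), type some words, then kill and restart the application. Because getActiveOrCreate recreates the context from the checkpoint directory, the printed counts continue from the previous totals instead of restarting at zero.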
Notes:
- updateStateByKey requires a checkpoint directory to be set
- To keep the previous results across an application restart, the StreamingContext must be created via StreamingContext.getActiveOrCreate(checkPoint, createStream)
- When creating the StreamingContext this way, all business logic must go inside the createStream function, otherwise an error is thrown
2. Storing intermediate state in MySQL
Accumulation boils down to combining the previously computed result with the newly arrived batch; all that is needed is to persist the accumulated result somewhere, in this case a MySQL table.
import java.sql.{Connection, DriverManager, PreparedStatement}
import java.util.Properties
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
object UseMysqlToCheckpoint {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("UseMysqlToCheckpoint")
    val spark = new StreamingContext(conf, Seconds(2))
    // Business logic: read the stream from a socket
    val sourceDstream = spark.socketTextStream("hadoop102", 9999)
    val wordCount: DStream[(String, Int)] = sourceDstream.flatMap(_.split("\\W+")).map((_, 1)).reduceByKey(_ + _)
    // Write this batch's results to MySQL
    wordCount.foreachRDD(rdd => {
      rdd.foreachPartition(it => {
        val conn = getConnection() // one connection per partition
        conn.setAutoCommit(false) // disable auto-commit so the whole batch commits as one transaction
        // word is the primary key of the wordcount table, so duplicates are rejected; when the
        // key already exists, the row is updated instead: if wcount is not null, this batch's
        // count is added to the stored value
        val sql: String = "insert into wordcount values(?,?) on duplicate key update wcount=if(wcount is not null, wcount+?, wcount)"
        val ps: PreparedStatement = conn.prepareStatement(sql)
        it.foreach(result => {
          ps.setString(1, result._1)
          ps.setInt(2, result._2)
          ps.setInt(3, result._2)
          ps.addBatch() // queue this row into the batch
        })
        ps.executeBatch() // execute the partition's writes in one round trip
        conn.commit() // commit the transaction
        ps.close() // close the statement
        conn.close() // close the connection
      })
    })
    spark.start()
    spark.awaitTermination()
  }
  def getConnection(): Connection = {
    Class.forName("com.mysql.jdbc.Driver") // register the MySQL 5.x JDBC driver
    val pro = new Properties()
    pro.setProperty("user", "root")
    pro.setProperty("password", "root")
    val conn = DriverManager.getConnection("jdbc:mysql://hadoop102:3306/sparkdemo", pro)
    conn
  }
}
- Connections to external storage should be created inside foreachPartition, one connection per partition
- For simplicity, the word itself serves as the primary key of the MySQL table (see the schema sketch below)
- ps.addBatch() and ps.executeBatch() batch all of a partition's writes into a single execution
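For reference, the table could be created like this (a hypothetical schema; the DDL is not shown in the original, so the column names and types below are assumptions inferred from the insert statement):

// Hypothetical DDL, run once via the same helper; word must be the primary
// key for ON DUPLICATE KEY UPDATE to trigger on repeated words
val conn = getConnection()
conn.createStatement()
  .execute("create table if not exists wordcount(word varchar(100) primary key, wcount int)")
conn.close()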
3. Storing intermediate state in Redis
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import redis.clients.jedis.{JedisPool, JedisPoolConfig}
object UseRedisToCheckpoint {
  private val jedisPoolConfig: JedisPoolConfig = new JedisPoolConfig()
  jedisPoolConfig.setMaxTotal(100) // maximum number of connections
  jedisPoolConfig.setMaxIdle(20) // maximum number of idle connections
  jedisPoolConfig.setMinIdle(20) // minimum number of idle connections
  jedisPoolConfig.setBlockWhenExhausted(true) // block and wait when the pool is exhausted
  jedisPoolConfig.setMaxWaitMillis(500) // maximum wait time in milliseconds when exhausted
  jedisPoolConfig.setTestOnBorrow(false) // do not validate each connection when it is borrowed
  private val jedisPool: JedisPool = new JedisPool(jedisPoolConfig, "hadoop102", 6379)
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("UseRedisToCheckpoint")
    val spark = new StreamingContext(conf, Seconds(2))
    // Business logic
    val sourceDstream = spark.socketTextStream("hadoop102", 9999)
    val wordCount: DStream[(String, Int)] = sourceDstream.flatMap(_.split("\\W+")).map((_, 1)).reduceByKey(_ + _)
    // Write this batch's results to Redis
    wordCount.foreachRDD(rdd => {
      rdd.foreachPartition(it => {
        val jedis = getJedis()
        it.foreach(result => {
          // hincrBy adds result._2 to the field if it exists; otherwise the
          // field is created with result._2 as its initial value
          jedis.hincrBy("wordcount", result._1, result._2)
        })
        jedis.close() // return the connection to the pool
      })
    })
    spark.start()
    spark.awaitTermination()
  }
  def getJedis() = {
    jedisPool.getResource // borrow a connection from the pool
  }
}
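Recovery after a restart is implicit here: hincrBy keeps adding to whatever totals are already in the hash, so no explicit reload step is needed. To inspect or consume the accumulated state, the hash can be read back, for example (a minimal sketch using the same pool; hgetAll returns a java.util.Map[String, String]):

import scala.collection.JavaConverters._
val jedis = jedisPool.getResource
val totals = jedis.hgetAll("wordcount").asScala // current word -> count map
totals.foreach { case (word, count) => println(s"$word -> $count") }
jedis.close()

Note also that jedisPool is defined in an object (a per-JVM singleton), so when this runs on a cluster each executor initializes its own pool the first time the object is referenced; the pool itself is never serialized from the driver.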