Spark connecting to MySQL
First, create the two tables in MySQL that will receive the data (a sketch of possible table definitions follows).
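The original does not show the table definitions. A minimal sketch, assuming two plain two-column InnoDB tables whose columns match the (Int, String) values inserted by the demo below (the column names and types here are assumptions, and the object name CreateDemoTables is only illustrative):

import java.sql.DriverManager

object CreateDemoTables {
  def main(args: Array[String]): Unit = {
    //same connection settings as the transaction demo below
    val connection = DriverManager.getConnection(
      "jdbc:mysql://localhost:3306/bigdata?useSSL=false", "root", "123456")
    val stmt = connection.createStatement()
    //two plain (id, name) tables; InnoDB so that transactions are supported
    stmt.execute("CREATE TABLE IF NOT EXISTS t_a (id INT, name VARCHAR(100)) ENGINE=InnoDB")
    stmt.execute("CREATE TABLE IF NOT EXISTS t_b (id INT, name VARCHAR(100)) ENGINE=InnoDB")
    stmt.close()
    connection.close()
  }
}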
package cn._51doit.spark.day09

import java.sql.{Connection, DriverManager, PreparedStatement, SQLException}

object MySQLTransactionTest {

  def main(args: Array[String]): Unit = {
    //MySQL's InnoDB engine supports transactions
    var connection: Connection = null
    var ps1: PreparedStatement = null
    var ps2: PreparedStatement = null
    try {
      connection = DriverManager.getConnection("jdbc:mysql://localhost:3306/bigdata?useSSL=false", "root", "123456")
      //start the transaction by disabling auto-commit
      connection.setAutoCommit(false)
      //the first ? is bound with parameter index 1, the second ? with index 2
      ps1 = connection.prepareStatement("INSERT INTO t_a VALUES (?, ?)")
      ps2 = connection.prepareStatement("INSERT INTO t_b VALUES (?, ?)")
      ps1.setInt(1, 4000)
      ps1.setString(2, "nihao")
      //execute the first insert
      ps1.executeUpdate()
      ps2.setInt(1, 2)
      ps2.setString(2, "hadoop")
      //execute the second insert
      ps2.executeUpdate()
      //commit the transaction
      connection.commit()
    } catch {
      case e: Exception => {
        e.printStackTrace()
        //roll back the transaction: if anything goes wrong, undo both inserts as if nothing happened
        //this is the basis of ExactlyOnce (exactly-once semantics)
        if (connection != null) {
          connection.rollback()
        }
      }
    } finally {
      //release resources
      if (ps2 != null) {
        ps2.close()
      }
      if (ps1 != null) {
        ps1.close()
      }
      if (connection != null) {
        connection.close()
      }
    }
  }
}
Execution result in MySQL:
2 doit123
2 doit123
2 doit123
3 doit123
4 ??
4000 nihao
4000 nihao
Next, write the data from Spark Streaming's Kafka direct stream and its metadata (the offsets) to MySQL in a single transaction.
The computation whose results we write is a word count.
For example, suppose the first batch inserts (name, 5); when the program later produces (name, 3), a plain INSERT into MySQL fails with a duplicate-key error. Redis handles this kind of accumulation conveniently (a single increment command such as incrBy is enough), but MySQL cannot do it that way. There is, however, a clause that achieves it: append the following to your SQL statement:
ON DUPLICATE KEY UPDATE counts = counts + 3 // this implements the accumulation; we add this clause to the SQL in the Spark program (the assumed table definitions are sketched below)
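ON DUPLICATE KEY UPDATE only takes effect when the insert would violate a PRIMARY KEY or UNIQUE constraint, so both the result table and the offset table need a suitable key. The original does not show their definitions; a minimal sketch, assuming word is the primary key of t_wordcount and (app_gid, topic_partition) is the primary key of t_kafka_offsets (column types are assumptions; the column names follow the listings at the end of this section):

import java.sql.DriverManager

object CreateWordCountTables {
  def main(args: Array[String]): Unit = {
    val connection = DriverManager.getConnection(
      "jdbc:mysql://localhost:3306/bigdata?useSSL=false", "root", "123456")
    val stmt = connection.createStatement()
    //word is the primary key, so ON DUPLICATE KEY UPDATE accumulates counts per word
    stmt.execute("CREATE TABLE IF NOT EXISTS t_wordcount (word VARCHAR(100) PRIMARY KEY, counts INT) ENGINE=InnoDB")
    //(app_gid, topic_partition) identifies one partition of one job, so its offset is upserted in place
    stmt.execute("CREATE TABLE IF NOT EXISTS t_kafka_offsets (app_gid VARCHAR(100), topic_partition VARCHAR(100), offset BIGINT, PRIMARY KEY (app_gid, topic_partition)) ENGINE=InnoDB")
    stmt.close()
    connection.close()
  }
}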
The project also contains two utility objects, OffsetUtils and JedisConnectionPool (a sketch of the OffsetUtils method used here follows below).
Before running this program, start HDFS, ZooKeeper and Kafka on the virtual machines, create the wordcount topic, and feed data into it. You will then see the data in MySQL being updated dynamically; that is real time!
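OffsetUtils itself is not shown in the original. A minimal sketch of what its queryHistoryOffsetFromMySQL method might look like, assuming it reads back the rows that the program below writes into t_kafka_offsets (the connection settings and the parsing of the topic_partition column are assumptions):

package Utils

import java.sql.DriverManager
import org.apache.kafka.common.TopicPartition

object OffsetUtils {

  //read the offsets committed by previous runs so the next run resumes exactly where the last transaction left off
  def queryHistoryOffsetFromMySQL(appName: String, groupId: String): Map[TopicPartition, Long] = {
    var offsets = Map[TopicPartition, Long]()
    val connection = DriverManager.getConnection(
      "jdbc:mysql://localhost:3306/bigdata?useSSL=false", "root", "123456")
    val ps = connection.prepareStatement(
      "SELECT topic_partition, offset FROM t_kafka_offsets WHERE app_gid = ?")
    ps.setString(1, appName + "_" + groupId)
    val rs = ps.executeQuery()
    while (rs.next()) {
      //topic_partition was stored as "<topic>_<partition>", e.g. "wordcount_0"
      val topicAndPartition = rs.getString(1)
      val offset = rs.getLong(2)
      val idx = topicAndPartition.lastIndexOf("_")
      val topic = topicAndPartition.substring(0, idx)
      val partition = topicAndPartition.substring(idx + 1).toInt
      offsets += new TopicPartition(topic, partition) -> offset
    }
    rs.close()
    ps.close()
    connection.close()
    offsets
  }
}

If no offsets have been saved yet, the returned map is empty and the consumer falls back to auto.offset.reset = earliest.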
package cn._51doit.spark.day09

import java.sql.{Connection, DriverManager, PreparedStatement}
import Utils.OffsetUtils
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Milliseconds, StreamingContext}

/**
 * Implements WordCount with ExactlyOnce semantics
 *
 * Both the computed results and the offsets are saved to MySQL
 */
object ExactlyOnceWordCountWithMySQL {

  def main(args: Array[String]): Unit = {

    val appName = "wordcount"
    val groupId = "g10"

    val conf = new SparkConf()
      .setAppName(appName)
      .setMaster("local[*]")

    val ssc = new StreamingContext(conf, Milliseconds(5000))
    //set the log level at runtime
    ssc.sparkContext.setLogLevel("WARN")

    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "doit01:9092,doit02:9092,doit03:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "gwc02", //the Kafka consumer group; the offsets themselves are managed manually in MySQL, keyed by appName and groupId
      "auto.offset.reset" -> "earliest",
      "enable.auto.commit" -> (false: java.lang.Boolean) //do not auto-commit offsets; if not set, the default is true
    )

    val topics = Array("wordcount")

    //read the offsets saved to MySQL by previous runs
    val historyOffsets: Map[TopicPartition, Long] = OffsetUtils.queryHistoryOffsetFromMySQL(appName, groupId)

    //create the Kafka direct stream
    val kafkaDStream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](
      ssc,
      LocationStrategies.PreferConsistent, //location strategy
      ConsumerStrategies.Subscribe[String, String](topics, kafkaParams, historyOffsets) //subscription strategy
    )

    //use foreachRDD to get the offsets and program against the RDD API
    kafkaDStream.foreachRDD(rdd => {
      if (!rdd.isEmpty()) {
        //get the offsets of this batch
        val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
        //use the RDD API and collect the results to the Driver
        val reduced: RDD[(String, Int)] = rdd.map(_.value()).flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
        //trigger an Action
        val res: Array[(String, Int)] = reduced.collect()

        var connection: Connection = null
        var ps1: PreparedStatement = null
        var ps2: PreparedStatement = null
        try {
          connection = DriverManager.getConnection("jdbc:mysql://localhost:3306/bigdata?useSSL=false", "root", "123456")
          //start the transaction by disabling auto-commit
          connection.setAutoCommit(false)
          //write the computed results
          ps1 = connection.prepareStatement("INSERT INTO t_wordcount VALUES (?, ?) ON DUPLICATE KEY UPDATE counts = counts + ?")
          for (t <- res) {
            ps1.setString(1, t._1)
            ps1.setInt(2, t._2)
            ps1.setInt(3, t._2)
            ps1.executeUpdate()
          }
          //write the offsets in the same transaction
          ps2 = connection.prepareStatement("INSERT INTO t_kafka_offsets VALUES (?, ?, ?) ON DUPLICATE KEY UPDATE offset = ?")
          for (range <- offsetRanges) {
            val topic = range.topic
            val partition = range.partition
            val untilOffset = range.untilOffset
            //update the offset
            ps2.setString(1, appName + "_" + groupId)
            ps2.setString(2, topic + "_" + partition)
            ps2.setLong(3, untilOffset)
            ps2.setLong(4, untilOffset)
            ps2.executeUpdate()
          }
          //commit the transaction
          connection.commit()
        } catch {
          case e: Exception => {
            //roll back the transaction
            if (connection != null) {
              connection.rollback()
            }
            e.printStackTrace()
            //stop the current application
            ssc.stop(true)
          }
        } finally {
          //release resources
          if (ps2 != null) {
            ps2.close()
          }
          if (ps1 != null) {
            ps1.close()
          }
          if (connection != null) {
            connection.close()
          }
        }
      }
    })

    //start the streaming application
    ssc.start()
    //wait for termination
    ssc.awaitTermination()
  }
}
Enter data into Kafka on the virtual machine:
[root@doit01 kafka_2.11-2.2.2]# bin/kafka-console-producer.sh --broker-list doit01:9092,doit02:9092,doit03:9092 --topic wordcount
>shizijun
>shizijun
>zhangliya
>zhangliya
>zhangliya
>zhangliya
>zhangci
>shihu
>a
>s
>d
>f
>g
>h
>j
>k
>l
>
The data is then dynamically written into MySQL:
t_wordcount
word counts
a 1
d 1
e 1
ew 1
ewr 2
f 1
g 1
h 1
j 1
k 1
l 1
laobanbu 1
nisho 1
pengchengytyuan 1
s 1
shangliya 1
shihu 1
shizijn 1
shizijun 6
werewr 1
weweewr 1
wr 1
zhangci 1
The offsets are dynamically written as well:
t_kafka_offsets
app_gid topic_partition offset
wordcount_g10 wordcount_0 49