Spark -- Structured Streaming Chapter 3: Integrating with Other Technologies -- Writing Kafka Data into a MySQL Table (with a Primary Key)

This post shows how to consume data from Kafka with Spark Structured Streaming (subscribing to a single topic, to multiple topics, and to topics matched by a pattern), how to process the data, and how to write it to MySQL. Because the Structured Streaming API does not currently support MySQL directly, a custom JDBCSink is built by implementing the ForeachWriter interface.

Integrating with Kafka

From the official documentation

●Creating a Kafka Source for Streaming Queries

// Subscribe to 1 topic
val df = spark
  .readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
  .option("subscribe", "topic1")
  .load()

df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
  .as[(String, String)]

// Subscribe to multiple topics
val df = spark
  .readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
  .option("subscribe", "topic1,topic2")
  .load()

df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
  .as[(String, String)]

// Subscribe to a pattern (wildcard topics)
val df = spark
  .readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
  .option("subscribePattern", "topic.*")
  .load()

df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
  .as[(String, String)]

●Creating a Kafka Source for Batch Queries

// Subscribe to 1 topic
// defaults to the earliest and latest offsets
val df = spark
  .read
  .format("kafka")
  .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
  .option("subscribe", "topic1")
  .load()

df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
  .as[(String, String)]

// Subscribe to multiple topics,
// specifying explicit Kafka offsets
val df = spark
  .read
  .format("kafka")
  .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
  .option("subscribe", "topic1,topic2")
  .option("startingOffsets", """{"topic1":{"0":23,"1":-2},"topic2":{"0":-2}}""")
  .option("endingOffsets", """{"topic1":{"0":50,"1":-1},"topic2":{"0":-1}}""")
  .load()

df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
  .as[(String, String)]

// Subscribe to a pattern (wildcard topics), at the earliest and latest offsets
val df = spark
  .read
  .format("kafka")
  .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
  .option("subscribePattern", "topic.*")
  .option("startingOffsets", "earliest")
  .option("endingOffsets", "latest")
  .load()

df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
  .as[(String, String)]

●Note: the schema of the data read from Kafka is fixed and contains the following columns:

Column         Type    Description
key            binary  the message key
value          binary  the message value
topic          string  the topic
partition      int     the partition
offset         long    the offset
timestamp      long    the timestamp
timestampType  int     the timestamp type
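For reference, here is a minimal sketch of my own that pulls out all of these columns and casts the binary key and value to strings; the broker address and topic name are the same placeholders used in the snippets above:

val df = spark
  .readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
  .option("subscribe", "topic1")
  .load()

// key and value arrive as binary, so cast them; the metadata columns can be kept as-is
val kafkaColumns = df.selectExpr(
  "CAST(key AS STRING) AS key",
  "CAST(value AS STRING) AS value",
  "topic", "partition", "offset", "timestamp", "timestampType")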

●Note: the following parameters cannot be set, otherwise Kafka will throw an exception:

group.id: the Kafka source automatically creates a unique group id for every query

auto.offset.reset: instead of setting startingOffsets by hand every time, Structured Streaming manages offsets internally while consuming, which guarantees no data is lost when subscribing to dynamically created topics. For a streaming query, startingOffsets only applies to the very first start; every later run resumes from the saved offsets (see the sketch after this list).

key.deserializer, value.deserializer, key.serializer, value.serializer: keys and values are always handled as byte arrays (ByteArraySerializer / ByteArrayDeserializer)

enable.auto.commit: the Kafka source does not commit any offsets
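A minimal sketch of how that offset handling looks in practice; the topic name and the checkpoint path below are placeholders of my own, not from the original demo:

val df = spark
  .readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
  .option("subscribe", "topic1")
  // only honored on the very first start of the query
  .option("startingOffsets", "earliest")
  .load()

df.selectExpr("CAST(value AS STRING)")
  .writeStream
  .format("console")
  // offsets are tracked in the checkpoint, not via group.id or auto.offset.reset
  .option("checkpointLocation", "/tmp/kafka-demo-checkpoint")
  .start()
  .awaitTermination()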

On to the code demo!!!

package cn.itcast.structedstreaming

import org.apache.spark.SparkContext
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object KafkaStructuredStreamingDemo {

  def main(args: Array[String]): Unit = {
    //1. Create a SparkSession
    val spark: SparkSession =
      SparkSession.builder().master("local[*]").appName("SparkSQL").getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")
    import spark.implicits._

    //2. Connect to Kafka and consume data
    val dataDF: DataFrame = spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "node01:9092")
      .option("subscribe", "spark_kafka")
      .load()

    //3. Process the data
    //Note: the data Structured Streaming reads from Kafka is binary, so as the
    //official documentation requires, cast it to the actual type you need
    val dataDS: Dataset[String] = dataDF.selectExpr("CAST(value AS STRING)").as[String]
    val wordDS: Dataset[String] = dataDS.flatMap(_.split(" "))
    val result: Dataset[Row] = wordDS.groupBy("value").count().sort($"count".desc)

    result.writeStream
      .format("console")
      .outputMode("complete")
      .trigger(Trigger.ProcessingTime(0))
      .option("truncate", false) //do not truncate long columns, show them in full
      .start()
      .awaitTermination()
  }
}
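To feed the demo with some test input, you can produce a few space-separated lines into the spark_kafka topic. Below is a minimal producer sketch of my own (the object name TestDataProducer and the sample sentences are made up; the broker address matches the demo):

package cn.itcast.structedstreaming

import java.util.Properties

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

object TestDataProducer {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put("bootstrap.servers", "node01:9092")
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    val producer = new KafkaProducer[String, String](props)
    //a few space-separated sentences for the word count to chew on
    Seq("hadoop spark hive spark", "spark kafka spark").foreach { line =>
      producer.send(new ProducerRecord[String, String]("spark_kafka", line))
    }
    producer.close()
  }
}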

Integrating with MySQL

Overview

●Requirement

In development we often need to write streaming results out to an external database such as MySQL, but unfortunately the Structured Streaming API does not support external databases as sinks out of the box.

If support is added in the future, the API will be very simple, something like:

format("jdbc").option("url","jdbc:mysql://...").start()

For now, we have to define our own JdbcSink by extending ForeachWriter and implementing its methods.
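As a side note, on Spark 2.4 or later there is another route that avoids a hand-written sink: foreachBatch hands you each micro-batch as a regular DataFrame, which can then be written with the built-in batch JDBC writer. A minimal sketch under that assumption follows; result stands for the aggregated word-count Dataset from the demo below, the connection details match it, and the table name t_word_batch is made up (Spark creates and replaces that table itself, without the hand-defined primary key used below):

import java.util.Properties
import org.apache.spark.sql.{DataFrame, SaveMode}

val jdbcProps = new Properties()
jdbcProps.put("user", "root")
jdbcProps.put("password", "root")

result.writeStream
  .outputMode("complete")
  .foreachBatch { (batchDF: DataFrame, batchId: Long) =>
    //in complete mode every micro-batch carries the full aggregation,
    //so simply overwrite the target table each time
    batchDF.withColumnRenamed("value", "word")
      .write
      .mode(SaveMode.Overwrite)
      .jdbc("jdbc:mysql://localhost:3306/bigdata?characterEncoding=UTF-8", "t_word_batch", jdbcProps)
  }
  .start()
  .awaitTermination()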

On to the code demo!!!

package cn.itcast.structedstreaming

import java.sql.{Connection, DriverManager, PreparedStatement}

import org.apache.spark.SparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.streaming.Trigger

object JDBCSinkDemo {

  def main(args: Array[String]): Unit = {
    //1. Create a SparkSession
    val spark: SparkSession =
      SparkSession.builder().master("local[*]").appName("SparkSQL").getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")
    import spark.implicits._

    //2. Connect to Kafka and consume data
    val dataDF: DataFrame = spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "node01:9092")
      .option("subscribe", "spark_kafka")
      .load()

    //3. Process the data
    //Note: the data Structured Streaming reads from Kafka is binary, so as the
    //official documentation requires, cast it to the actual type you need
    val dataDS: Dataset[String] = dataDF.selectExpr("CAST(value AS STRING)").as[String]
    val wordDS: Dataset[String] = dataDS.flatMap(_.split(" "))
    val result: Dataset[Row] = wordDS.groupBy("value").count().sort($"count".desc)

    val writer = new JDBCSink("jdbc:mysql://localhost:3306/bigdata?characterEncoding=UTF-8", "root", "root")
    result.writeStream
      .foreach(writer)
      .outputMode("complete")
      .trigger(Trigger.ProcessingTime(0))
      .start()
      .awaitTermination()
  }

  class JDBCSink(url: String, username: String, password: String) extends ForeachWriter[Row] with Serializable {

    var connection: Connection = _ //_ is a placeholder; the variables are assigned later
    var preparedStatement: PreparedStatement = _

    //Open the connection
    override def open(partitionId: Long, version: Long): Boolean = {
      connection = DriverManager.getConnection(url, username, password)
      true
    }

    /*
    CREATE TABLE `t_word` (
      `id` int(11) NOT NULL AUTO_INCREMENT,
      `word` varchar(255) NOT NULL,
      `count` int(11) DEFAULT NULL,
      PRIMARY KEY (`id`),
      UNIQUE KEY `word` (`word`)
    ) ENGINE=InnoDB AUTO_INCREMENT=26 DEFAULT CHARSET=utf8;
    */
    //REPLACE INTO `bigdata`.`t_word` (`id`, `word`, `count`) VALUES (NULL, NULL, NULL);

    //Process one row: write it to MySQL
    override def process(row: Row): Unit = {
      val word: String = row.get(0).toString
      val count: String = row.get(1).toString
      println(word + ":" + count)
      //REPLACE INTO: insert the row if it does not exist yet, otherwise replace it
      //Note: REPLACE INTO requires the table to have a primary key or a unique index
      val sql = "REPLACE INTO `t_word` (`id`, `word`, `count`) VALUES (NULL, ?, ?);"
      preparedStatement = connection.prepareStatement(sql)
      preparedStatement.setString(1, word)
      preparedStatement.setInt(2, Integer.parseInt(count))
      preparedStatement.executeUpdate()
    }

    //Release resources (close the statement before the connection)
    override def close(errorOrNull: Throwable): Unit = {
      if (preparedStatement != null) {
        preparedStatement.close()
      }
      if (connection != null) {
        connection.close()
      }
    }
  }
}
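To double-check what landed in MySQL, one option is to read the table back with Spark's batch JDBC reader; a plain mysql client query works just as well. The connection details in this sketch match the demo above:

val words: DataFrame = spark.read
  .format("jdbc")
  .option("url", "jdbc:mysql://localhost:3306/bigdata?characterEncoding=UTF-8")
  .option("dbtable", "t_word")
  .option("user", "root")
  .option("password", "root")
  .load()

//print the whole word-count table without truncating long values
words.show(false)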
