项目场景:
sapark strem 读取socket 数据保存到mysql
问题描述
spark Stream 连接 mysql 写入数据乱码
代码如下
建表语句
create table hotwords(time varchar(100),word varchar(100),count varchar(100)) charset=utf8mb4;
pom文件
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.example</groupId>
<artifactId>maven_scala</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
</properties>
<dependencies>
<!-- https://mvnrepository.com/artifact/log4j/log4j -->
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.17</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-core -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.13</artifactId>
<version>3.2.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.13</artifactId>
<version>3.2.0</version>
</dependency>
<!--mysql数据库访问-->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.37</version>
</dependency>
</dependencies>
</project>
def WordCountAndTopN02(): Unit ={
/*
* @param null
*
* @Description: 模拟排行榜每隔10s计算最近20s的top3数据
* 将统计的数据自定义输出。如:mysql、hdfs等
*/
val url = "jdbc:mysql://"+ip+":3306/bigdata_test?useUnicode=true&characterEncoding=utf-8"
val username = "root"
val passwd = "fan123456"
//TODO 0.准备环境
val conf:SparkConf = new SparkConf().setAppName("wc").setMaster("local[*]")
val sc:SparkContext = new SparkContext(conf)
sc.setLogLevel("WARN")
val ssc = new StreamingContext(sc,Seconds(5)) // 每隔5秒分一个批次
//TODO 1.加载数据
val lines: ReceiverInputDStream[String] = ssc.socketTextStream(ip,9999)
//TODO 2.处理数据
// resultDS 是20s内统计的rdd集合
val resultDS: DStream[(String, Int)] = lines.flatMap(_.split("\\s+"))
.map((_, 1))
// .reduceByKey(_+_)
// windowDuration 窗口长度(大小),表示要计算最近多长时间的数据
// slideDuration 滑动间隔,表示每隔多长时间计算一次
// windowDuration和slideDuration必须是batchDuration(批次)的倍数
.reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(20), Seconds(10))
val sortRDDs: DStream[(String, Int)] = resultDS.transform(rdd => { // rdd 是统计集合内的每个rdd个体
val sortRdDD: RDD[(String, Int)] = rdd.sortBy(_._2, false)
val top3: Array[(String, Int)] = rdd.sortBy(_._2, false).take(3)
println("==========top3============")
top3.foreach(println)
println("==========top3============")
sortRdDD
})
//TODO 3.输出结果
// sortRDDs.print() // 默认输出
// 自定义输出
sortRDDs.foreachRDD((rdd,time) => {
val milliseconds: Long = time.milliseconds
println("=======自定义输出======")
println("batchtime---->"+milliseconds)
println("=======自定义输出======")
// 将结果自定义输出
// 控制台输出
rdd.foreach(println)
// 输出到hdfs/本地文件
rdd.coalesce(1).saveAsTextFile("data/output/result-"+milliseconds)
// 输出到Mysql
rdd.foreachPartition(iter => {
// 开启连接
val conn: Connection = DriverManager.getConnection(url,username,passwd)
val sql = "insert into `hotwords` (`time`,`word`,`count`) values (?,?,?)"
val ps: PreparedStatement = conn.prepareStatement(sql)
iter.foreach(t =>{
val word: String = t._1
val count: Int = t._2
ps.setTimestamp(1,new Timestamp(milliseconds))
ps.setString(2,word)
ps.setInt(3,count)
ps.addBatch()
})
ps.executeBatch()
// 关闭连接
conn.close()
ps.close()
})
})
//TODO 4.启动并等待结果
ssc.start()
ssc.awaitTermination() // 流式应用启动之后需要一直运行等待手动停止/等待数据到来
//TODO 5.关闭资源
ssc.stop(stopSparkContext = true,stopGracefully = true)
}
原因分析:
连接mysql 的连接串没指定编码
解决方案:
修改连接串
val url = “jdbc:mysql://”+ip+“:3306/bigdata_test?useUnicode=true&characterEncoding=utf-8”
增加字符编码