Review
Previous article:
package com.ruozedata.flink.flink02

import java.sql.{Connection, PreparedStatement}

import com.ruozedata.flink.fink01.Domain.Student
import com.ruozedata.flink.utils.MySQLUtils
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.source.{RichSourceFunction, SourceFunction}

/**
 * Reads data from MySQL.
 */
class MySQLSource extends RichSourceFunction[Student] {

  var connection: Connection = _
  var pstmt: PreparedStatement = _

  // establish the connection in open()
  override def open(parameters: Configuration): Unit = {
    super.open(parameters)
    connection = MySQLUtils.getConnection()
    pstmt = connection.prepareStatement("select * from student")
  }

  // release resources
  override def close(): Unit = {
    super.close()
    MySQLUtils.closeResource(connection, pstmt)
  }

  override def cancel(): Unit = {
  }

  override def run(ctx: SourceFunction.SourceContext[Student]): Unit = {
    val rs = pstmt.executeQuery()
    while (rs.next()) {
      val student = Student(rs.getInt("id"), rs.getString("name"), rs.getString("password"), rs.getInt("age"))
      ctx.collect(student)
    }
  }
}
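For a quick local test, this source can be wired up the same way as the SourceApp further below:

val env = StreamExecutionEnvironment.getExecutionEnvironment
env.addSource(new MySQLSource).print()
env.execute("MySQLSourceTest")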
Note:
class MySQLSource extends RichSourceFunction[Student]
RichSourceFunction is not one of the three Functions covered in the previous article.
So what is the parallelism of a RichSourceFunction?
Source code:
public abstract class RichSourceFunction<OUT> extends AbstractRichFunction implements SourceFunction<OUT> {
    private static final long serialVersionUID = 1L;
}
When I tested it with the parallelism set to 3 it threw an error. In the source code it implements SourceFunction (the non-parallel interface), so its parallelism can only be 1.
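For reference, a minimal sketch of that failure (the exact exception message varies between Flink versions):

// setting parallelism > 1 on a non-parallel SourceFunction is rejected with an
// IllegalArgumentException complaining that the source is not a parallel source
env.addSource(new MySQLSource).setParallelism(3).print()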
That raises a question.
Suppose there are two streams:
stream1: parallelism 4, the emp table (the log side, e.g. carrying domain)
stream2: parallelism 1, the dept table (the MySQL side, e.g. carrying user_id)
and we want to merge the two streams:
stream1.connect(stream2).map(x=>{})
That is:
logStream   emp:  deptno empno ename
mysqlStream dept: deptno dname
After merging, the output outputStream should contain: empno, ename, deptno, dname.
This scenario comes up all the time at work, e.g.:
Some of the log records arriving in stream1 are incomplete. access.log only carries domain, but within a company one user_id can own several domains. If the requirement is "aggregate domains per user_id across various dimensions", there is no way to do it, because the log simply has no user_id field.
So once the data comes in you have to do data cleansing / ETL (there is no escaping this, whether real-time or offline).
// what we really need here is something like a join
outputStream = stream1.connect(stream2).map(x=>{
....
})
So will this connect work??
In the 4-parallelism stream, some tasks need to join on deptno,
but they cannot get the corresponding dname from the 1-parallelism stream (after connect, the single MySQL subtask's records only reach one of the four parallel subtasks).
How do we solve this? We'll come back to it later.
First, let's fix the parallelism-of-1 problem by switching to RichParallelSourceFunction:
package com.ruozedata.flink.flink02

import com.ruozedata.flink.fink01.Domain.Student
import org.apache.flink.streaming.api.functions.source.{RichParallelSourceFunction, SourceFunction}
import scalikejdbc._
import scalikejdbc.config._

class ScalikeJDBCMySQLSource extends RichParallelSourceFunction[Student] {

  override def run(ctx: SourceFunction.SourceContext[Student]): Unit = {
    println("~~~run~~~~")
    DBs.setupAll() // parse configuration file
    DB.readOnly { implicit session =>
      SQL("select * from student").map(rs => {
        val student = Student(rs.int("id"), rs.string("name"), rs.string("password"), rs.int("age"))
        ctx.collect(student)
      }).list().apply()
    }
  }

  override def cancel(): Unit = {
  }
}
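DBs.setupAll() reads the connection settings from the classpath (typically src/main/resources/application.conf, via the scalikejdbc-config module). A minimal configuration might look like this; the host, database and credentials below are placeholders:

db.default.driver="com.mysql.jdbc.Driver"
db.default.url="jdbc:mysql://hadoop001:3306/ruozedata"
db.default.user="root"
db.default.password="123456"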
package com.ruozedata.flink.flink02

import java.util.Properties

import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.api.scala._
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer010

object SourceApp {

  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment

    // for testing
    env.addSource(new ScalikeJDBCMySQLSource).setParallelism(2).print()

    env.execute(this.getClass.getSimpleName)
  }
}
Result:
~~~run~~~~
~~~run~~~~
3> Student(2,tonny,222,19)
4> Student(3,hello,333,20)
1> Student(3,hello,333,20)
2> Student(1,kairis,leo123,18)
4> Student(2,tonny,222,19)
3> Student(1,kairis,leo123,18)
With parallelism 2, each parallel instance runs the query, so the result set is pulled twice: every row appears two times.
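If that duplication is not wanted, one option (a sketch, not part of the original code) is to shard the query by subtask inside the rich parallel source, e.g. in ScalikeJDBCMySQLSource:

override def run(ctx: SourceFunction.SourceContext[Student]): Unit = {
  val subtask = getRuntimeContext.getIndexOfThisSubtask
  val numSubtasks = getRuntimeContext.getNumberOfParallelSubtasks
  DBs.setupAll()
  DB.readOnly { implicit session =>
    // hypothetical sharding: each subtask only reads its own slice of the table
    SQL("select * from student where mod(id, ?) = ?").bind(numSubtasks, subtask).map(rs => {
      ctx.collect(Student(rs.int("id"), rs.string("name"), rs.string("password"), rs.int("age")))
    }).list().apply()
  }
}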
Second question:
Back to the connect problem from the beginning.
1. The log stream has parallelism 4 (say 1,000,000 records).
Does the MySQL stream's parallelism also need to be 4? (MySQL only holds about 10,000 rows.)
Answer:
No, there is no need.
If one parallel instance can keep up, one is enough.
But a 1-parallelism stream connected with the 4-parallelism log stream will still hit the problem above.
So what do we do about it? Later.
Partitioners exist in MR, Spark, Flink and Kafka alike. In MR and Spark the default is a hash of the key modulo the number of partitions, and Flink works the same way; we already verified earlier which domain gets routed where.
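As a quick illustration, a hedged sketch of plugging your own partitioning rule into a DataStream (assuming a stream of (domain, traffic) pairs; partitionCustom and Partitioner are part of the DataStream API):

import org.apache.flink.api.common.functions.Partitioner

val custom = stream.partitionCustom(new Partitioner[String] {
  override def partition(key: String, numPartitions: Int): Int =
    math.abs(key.hashCode) % numPartitions // the same hash-and-mod idea as the defaults
}, (record: (String, Int)) => record._1)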
Data Sinks
https://ci.apache.org/projects/flink/flink-docs-release-1.9/dev/datastream_api.html#data-sinks
Example
Sink Kafka
Here Flink plays the role of the Kafka producer:
package com.ruozedata.flink.flink03

import com.ruozedata.flink.fink01.Domain.Access
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.api.scala._
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer010

object SinkApp {

  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment

    val stream = env.readTextFile("data/access.log").map(x => {
      val splits = x.split(",")
      Access(splits(0).toLong, splits(1), splits(2).toLong).toString
    })

    val producer = new FlinkKafkaProducer010[String](
      "hadoop001:9092",       // broker list
      "ruozedata_offset",     // target topic
      new SimpleStringSchema) // serialization schema

    stream.addSink(producer) // 2Kafka
    stream.print()           // 2Local

    env.execute(this.getClass.getSimpleName)
  }
}
Result:
[hadoop@hadoop001 bin]$ ./kafka-console-consumer.sh --bootstrap-server hadoop001:9092 --topic ruozedata_offset
Access(201912120010,dongqiudi.com,1000)
Access(201912120010,ruozedata.com,4000)
Access(201912120010,zhibo8.com,5000)
Access(201912120010,ruozedata.com,2000)
Access(201912120010,dongqiudi.com,6000)
Kafka to Kafka
package com.ruozedata.flink.flink03

import java.util.Properties

import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.api.scala._
import org.apache.flink.streaming.connectors.kafka.{FlinkKafkaConsumer010, FlinkKafkaProducer010}

object SinkApp {

  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment

    val properties = new Properties()
    properties.setProperty("bootstrap.servers", "hadoop001:9092")
    properties.setProperty("group.id", "ruozedata-flink-test")
    val consumer = new FlinkKafkaConsumer010[String]("ruozedata_offset", new SimpleStringSchema(), properties)
    val stream = env.addSource(consumer)

    // TODO... Kafka2Kafka: ruozedata_offset ==> ruozedata_offset_test
    val producer = new FlinkKafkaProducer010[String](
      "hadoop001:9092",        // broker list
      "ruozedata_offset_test", // target topic
      new SimpleStringSchema)  // serialization schema

    stream.addSink(producer) // 2Kafka
    stream.print()           // 2Local

    env.execute(this.getClass.getSimpleName)
  }
}
Result:
[hadoop@hadoop001 bin]$ ./kafka-console-consumer.sh --bootstrap-server hadoop001:9092 --topic ruozedata_offset_test
f
a
b
d
b
d
Sink MySQL
/**
* A {@link org.apache.flink.api.common.functions.RichFunction} version of {@link SinkFunction}.
*/
@Public
public abstract class RichSinkFunction<IN> extends AbstractRichFunction implements SinkFunction<IN> {
private static final long serialVersionUID = 1L;
}
/**
* Interface for implementing user defined sink functionality.
*
* @param <IN> Input type parameter.
*/
@Public
public interface SinkFunction<IN> extends Function, Serializable {
So:
IN is the Input type parameter,
i.e. the type of the records being sunk, which maps to the columns of the target MySQL table.
package com.ruozedata.flink.flink03

import java.sql.{Connection, PreparedStatement}

import com.ruozedata.flink.utils.MySQLUtils
import org.apache.flink.streaming.api.functions.sink.{RichSinkFunction, SinkFunction}
import org.apache.flink.configuration.Configuration

class RuozedataMySQLSink extends RichSinkFunction[(String, Int)] {

  var connection: Connection = _
  var insertPstmt: PreparedStatement = _
  var updatePstmt: PreparedStatement = _

  // open the connection, etc.
  override def open(parameters: Configuration): Unit = {
    super.open(parameters)

    /**
     * This is different from Spark: Spark writes one batch at a time,
     * whereas Flink writes record by record.
     *
     * So two PreparedStatements are created here. A single statement would also work
     * if the table is created with a (unique) key on the domain column.
     *
     * Two statements keep it simple: update if the row exists, insert if it does not.
     */
    connection = MySQLUtils.getConnection()
    insertPstmt = connection.prepareStatement("insert into domain(domain,traffic) values (?,?)")
    updatePstmt = connection.prepareStatement("update domain set traffic=? where domain=?")
  }

  /**
   * Write the data.
   */
  override def invoke(value: (String, Int), context: SinkFunction.Context[_]): Unit = {
    // TODO insert / update
    /**
     * The parameter indexes below are not a mistake; they follow the order of the
     * ? placeholders in each SQL statement.
     *
     * Update first: if a row was updated we are done, otherwise insert.
     */
    updatePstmt.setInt(1, value._2)
    updatePstmt.setString(2, value._1)
    updatePstmt.execute()

    if (updatePstmt.getUpdateCount == 0) {
      insertPstmt.setString(1, value._1)
      insertPstmt.setInt(2, value._2)
      insertPstmt.execute()
    }
  }

  // release resources
  override def close(): Unit = {
    super.close()
    if (insertPstmt != null) insertPstmt.close()
    if (updatePstmt != null) updatePstmt.close()
    if (connection != null) connection.close()
  }
}
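As the comment in open() suggests, the two statements can be collapsed into one when the table has a unique key on domain. A hedged sketch of that variant (same imports as RuozedataMySQLSink above; the unique key on domain is an assumption about the table definition):

class RuozedataMySQLUpsertSink extends RichSinkFunction[(String, Int)] {

  var connection: Connection = _
  var upsertPstmt: PreparedStatement = _

  override def open(parameters: Configuration): Unit = {
    super.open(parameters)
    connection = MySQLUtils.getConnection()
    // requires: domain column declared PRIMARY KEY or UNIQUE
    upsertPstmt = connection.prepareStatement(
      "insert into domain(domain, traffic) values (?, ?) on duplicate key update traffic = ?")
  }

  override def invoke(value: (String, Int), context: SinkFunction.Context[_]): Unit = {
    upsertPstmt.setString(1, value._1) // domain
    upsertPstmt.setInt(2, value._2)    // traffic for the insert branch
    upsertPstmt.setInt(3, value._2)    // traffic for the update branch
    upsertPstmt.execute()
  }

  override def close(): Unit = {
    super.close()
    if (upsertPstmt != null) upsertPstmt.close()
    if (connection != null) connection.close()
  }
}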
package com.ruozedata.flink.flink03

import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.api.scala._

object SinkApp {

  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment

    val stream = env.readTextFile("data/access.log").map(x => {
      val splits = x.split(",")
      (splits(1), splits(2).toInt)
    }).keyBy(0).sum(1)

    stream.addSink(new RuozedataMySQLSink)

    env.execute(this.getClass.getSimpleName)
  }
}
Result: I ran the Flink job twice, and the output is idempotent:
mysql> select * from domain;
+---------------+---------+
| domain | traffic |
+---------------+---------+
| dongqiudi.com | 7000 |
| ruozedata.com | 6000 |
| zhibo8.com | 5000 |
+---------------+---------+
3 rows in set (0.00 sec)
mysql> select * from domain;
+---------------+---------+
| domain | traffic |
+---------------+---------+
| dongqiudi.com | 7000 |
| ruozedata.com | 6000 |
| zhibo8.com | 5000 |
+---------------+---------+
3 rows in set (0.00 sec)
mysql>
Comparing how the two engines read and write:
Spark read/write (external data sources):
spark.read.format("").option("", "").load
spark.write.format("")....save
Flink read/write:
addSource(new XXXSourceFunction)
addSink(new XXXSinkFunction)
Both are pluggable, like Spark's custom external data sources.
Spark is still the more convenient of the two here.
When you build something, think in generic terms, generic!!! It should be independent of the underlying execution engine. Think of the Beam model: Spark 1.6, Spark 2.x, Flink. Your own API must never expose the underlying engine's API, so that it can be adapted to both Spark and Flink.
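A hedged sketch of what such an engine-agnostic layer could look like (all names below are invented for illustration, not from the article):

// business code only sees these abstractions; a Spark-backed or Flink-backed
// module implements them, so the engine can be swapped without touching the API
trait RecordSource[T] {
  def read(): Iterator[T]
}

trait RecordSink[T] {
  def write(records: Iterator[T]): Unit
}

def pipeline[T](source: RecordSource[T], sink: RecordSink[T]): Unit =
  sink.write(source.read())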
Debugging ***
Debugging techniques ***
This part exists to make testing easier. Production data will certainly not arrive this way; for real-time jobs most of the input comes from a message queue. When testing there is no need to stand up a message queue: all you need are the rules, i.e. which field means what, and that is enough.
Iterator Data Sink
https://ci.apache.org/projects/flink/flink-docs-release-1.9/dev/datastream_api.html#iterator-data-sink
Note:
When debugging:
1. You don't need to hook up Kafka; a custom source is enough, e.g. RichParallelSourceFunction.
2. Nor do you need to sink to the real storage,
e.g. Kudu, HBase, etc.
Here we use the Iterator Data Sink instead.
This really matters:
During a closed-door development sprint I had 154 fields to sink into Kudu.
If you use the provided connector and spell out all 154 fields by hand, when would you ever finish?
It is far better to make the field mapping parse itself automatically, as sketched below.
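A hedged sketch of the "parse it automatically" idea, assuming the wide record is a case class (the helper and the Demo class are made up; a real Kudu writer would replace the println):

// derive (columnName, value) pairs from any case class instead of hand-writing 154 setters
def toColumns(record: Product): Seq[(String, Any)] =
  record.getClass.getDeclaredFields.map(_.getName).toSeq.zip(record.productIterator.toSeq)

case class Demo(id: Int, domain: String, traffic: Long) // stand-in for the 154-field record

toColumns(Demo(1, "ruozedata.com", 4000L)).foreach { case (col, v) => println(s"$col = $v") }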
Note:
My business scenario happened to need exactly this Iterator Data Sink.
From the official docs:
import org.apache.flink.streaming.experimental.DataStreamUtils
import scala.collection.JavaConverters.asScalaIteratorConverter
val myResult: DataStream[(String, Int)] = ...
val myOutput: Iterator[(String, Int)] = DataStreamUtils.collect(myResult.javaStream).asScala
Example:
object SourceApp {

  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment

    // for testing
    val data: DataStream[Domain.Access] = env.addSource(new AccessSource03).setParallelism(1)

    import scala.collection.JavaConverters.asScalaIteratorConverter
    val output: Iterator[Domain.Access] = DataStreamUtils.collect(data.javaStream).asScala
    output.take(1).foreach(println(_))

    env.execute(this.getClass.getSimpleName)
  }
}
Result:
Access(1577975577015,zhibo8.cc,78)
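AccessSource03 comes from an earlier part of this series and is not shown here; a rough stand-in that would produce similar output might look like this (the field values and sleep interval are assumptions):

import com.ruozedata.flink.fink01.Domain.Access
import org.apache.flink.streaming.api.functions.source.{RichParallelSourceFunction, SourceFunction}
import scala.util.Random

class AccessSource03 extends RichParallelSourceFunction[Access] {

  @volatile private var running = true
  private val domains = Array("ruozedata.com", "zhibo8.cc", "dongqiudi.com")

  override def run(ctx: SourceFunction.SourceContext[Access]): Unit = {
    val random = new Random()
    while (running) {
      ctx.collect(Access(System.currentTimeMillis(), domains(random.nextInt(domains.length)), random.nextInt(1000)))
      Thread.sleep(1000)
    }
  }

  override def cancel(): Unit = running = false
}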
Sink Redis
https://ci.apache.org/projects/flink/flink-docs-release-1.9/dev/connectors/
Note the version chosen in the pom:
<dependency>
<groupId>org.apache.bahir</groupId>
<artifactId>flink-connector-redis_2.11</artifactId>
<version>1.1-SNAPSHOT</version>
</dependency>
This version (1.1-SNAPSHOT) may not be available for download,
so use this one instead:
<!-- https://mvnrepository.com/artifact/org.apache.bahir/flink-connector-redis -->
<dependency>
<groupId>org.apache.bahir</groupId>
<artifactId>flink-connector-redis_2.11</artifactId>
<version>1.0</version>
</dependency>
Flink Redis Connector
https://bahir.apache.org/docs/flink/current/flink-streaming-redis/
package com.ruozedata.flink.flink03

import org.apache.flink.streaming.connectors.redis.common.mapper.{RedisCommand, RedisCommandDescription, RedisMapper}

class RuozedataRedisSink extends RedisMapper[(String, Int)] {

  override def getCommandDescription: RedisCommandDescription = {
    new RedisCommandDescription(RedisCommand.HSET, "ruozedata_traffic")
  }

  override def getValueFromData(data: (String, Int)): String = {
    data._2 + ""
  }

  override def getKeyFromData(data: (String, Int)): String = {
    data._1
  }
}
package com.ruozedata.flink.flink03

import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.api.scala._
import org.apache.flink.streaming.connectors.redis.RedisSink
import org.apache.flink.streaming.connectors.redis.common.config.FlinkJedisPoolConfig

object SinkApp {

  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment

    val stream = env.readTextFile("data/access.log").map(x => {
      val splits = x.split(",")
      (splits(1), splits(2).toInt)
    }).keyBy(0).sum(1)

    val conf = new FlinkJedisPoolConfig.Builder().setHost("hadoop001").build()
    stream.addSink(new RedisSink[(String, Int)](conf, new RuozedataRedisSink))

    env.execute(this.getClass.getSimpleName)
  }
}
Result:
hadoop001:6379> HGETALL ruozedata_traffic
1) "ruozedata.com"
2) "6000"
3) "dongqiudi.com"
4) "7000"
5) "zhibo8.com"
6) "5000"
hadoop001:6379>
Same result as with MySQL earlier.
// Comparing the two, keeping this data in a NoSQL store is actually more convenient, since it can upsert.
Solving the earlier problem
mysql> select * from user_domains;
+---------+---------------+
| user_id | domain |
+---------+---------------+
| 11111 | ruozedata.com |
| 22222 | zhibo8.cc |
| 33333 | dongqiudi.com |
+---------+---------------+
3 rows in set (0.00 sec)
I prepared this table. The incoming log only has domain, no user_id, so the job is to enrich the records in one stream with the data from the other stream.
package com.ruozedata.flink.flink03

import java.sql.{Connection, PreparedStatement}

import com.ruozedata.flink.utils.MySQLUtils
import org.apache.flink.streaming.api.functions.source.SourceFunction

import scala.collection.mutable

/**
 * Reads data from MySQL.
 */
// No need for a parallel source here, there are only a few rows.
// HashMap[String, String]: key is the domain, value is the user_id.
class UserDomainMySQLSource extends SourceFunction[mutable.HashMap[String, String]] {

  var connection: Connection = _
  var pstmt: PreparedStatement = _

  override def run(ctx: SourceFunction.SourceContext[mutable.HashMap[String, String]]): Unit = {
    // build a map to hold the result
    val map = mutable.HashMap[String, String]()

    connection = MySQLUtils.getConnection()
    pstmt = connection.prepareStatement("select user_id , domain from user_domains;")
    val rs = pstmt.executeQuery()
    while (rs.next()) {
      // the database uses snake_case, while the code uses camelCase
      val userId = rs.getString("user_id").trim
      val domain = rs.getString("domain").trim
      map.put(domain, userId)
    }

    if (map.size > 0) {
      ctx.collect(map)
    } else {
      println("No data fetched from MySQL") // TODO......log4j
    }

    MySQLUtils.closeResource(connection, pstmt)
  }

  override def cancel(): Unit = {
  }
}
package com.ruozedata.flink.flink03

import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.api.scala._

object JoinApp {

  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment

    env.addSource(new UserDomainMySQLSource).print()

    env.execute(this.getClass.getSimpleName)
  }
}
Result:
3> Map(dongqiudi.com -> 33333, ruozedata.com -> 11111, zhibo8.cc -> 22222)
That was the first piece of code; the second piece reads the data from Kafka.
package com.ruozedata.flink.flink03

import java.util.Properties

import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.api.scala._
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer010

object JoinApp {

  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment

    val properties = new Properties()
    properties.setProperty("bootstrap.servers", "hadoop001:9092") // list as many brokers as you have; not necessarily all of them
    properties.setProperty("group.id", "ruozedata-flink-test")
    val consumer = new FlinkKafkaConsumer010[String]("ruozedata_offset", new SimpleStringSchema(), properties)
    val logStream = env.addSource(consumer).print()

    // env.addSource(new UserDomainMySQLSource).print()

    env.execute(this.getClass.getSimpleName)
  }
}
Now we have both the log stream and the MySQL stream; the log stream of course still needs its ETL.
Then we connect them.
What needs to happen is: match the domain from the log stream against the domain from the MySQL stream and pull out the user_id.
CoFlatMapFunction
When you test this you will find that some records can find their user_id while others cannot; the root cause is exactly the problem described at the beginning.
This is where a function called broadcast comes in.
All we need to do is broadcast the mysqlStream, as sketched below.
…
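A minimal sketch of that idea, placed inside JoinApp's main where org.apache.flink.api.scala._ is already imported (the shape of the parsed log records and of the output tuple below are assumptions, not the article's final code):

import org.apache.flink.streaming.api.functions.co.CoFlatMapFunction
import org.apache.flink.util.Collector
import scala.collection.mutable

// mysqlStream:     DataStream[mutable.HashMap[String, String]]  (domain -> user_id)
// parsedLogStream: DataStream[(String, String)]                 (domain, rest of the log line)
val joined = parsedLogStream
  .connect(mysqlStream.broadcast) // broadcast, so every log subtask sees the full mapping
  .flatMap(new CoFlatMapFunction[(String, String), mutable.HashMap[String, String], (String, String, String)] {

    // local cache of the broadcast MySQL mapping
    private val userDomains = mutable.HashMap[String, String]()

    // stream 1: the log records, enriched with the user_id looked up by domain
    override def flatMap1(value: (String, String), out: Collector[(String, String, String)]): Unit = {
      val userId = userDomains.getOrElse(value._1, "-")
      out.collect((userId, value._1, value._2))
    }

    // stream 2: the broadcast MySQL map, merged into the cache
    override def flatMap2(value: mutable.HashMap[String, String], out: Collector[(String, String, String)]): Unit = {
      userDomains ++= value
    }
  })

joined.print()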