Flink Consuming Data from Kafka
In this setup, Filebeat collects the Nginx access.log and ships it to Kafka; that step is not covered here. We only look at the Kafka-to-Flink part. Without further ado, here is the code.
1. Kafka source data
This is the raw record that Filebeat sends to Kafka. Since Filebeat cannot drop the @timestamp and @metadata fields, they end up in Kafka as well. Below is one raw record, followed by the same record pretty-printed.
{"@timestamp":"2020-04-18T09:30:41.525Z","@metadata":{"beat":"filebeat","type":"_doc","version":"7.2.0","topic":"bigdata_nginx_access"},"message":"192.168.25.1 - - [18/Apr/2020:14:15:45 +0800] \"GET /nocar/Download?nocarUrl=a/b/E/n/i== HTTP/1.1\" 200 103580 \"-\" \"Mozilla/5.0 (iPhone; CPU iPhone OS 12_3_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/7.0.12(0x17000c27) NetType/WIFI Language/zh_CN\""}
{
  "@timestamp": "2020-04-18T09:30:41.525Z",
  "@metadata": {
    "beat": "filebeat",
    "type": "_doc",
    "version": "7.2.0",
    "topic": "bigdata_nginx_access"
  },
  "message": "192.168.25.1 - - [18/Apr/2020:14:15:45 +0800] \"GET /nocar/Download?nocarUrl=a/b/E/n/i== HTTP/1.1\" 200 103580 \"-\" \"Mozilla/5.0 (iPhone; CPU iPhone OS 12_3_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/7.0.12(0x17000c27) NetType/WIFI Language/zh_CN\""
}
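Everything the job cares about lives in the message field, which is a standard Nginx combined-format log line. As a minimal sketch of its structure (the NginxLineParser object, its regex, and the AccessLog case class are my own illustrations, not part of the job below), the fields can be pulled apart like this:

object NginxLineParser {
  // combined-format prefix: ip, ident, user, [time], "method uri protocol", status, bytes
  private val LogPattern =
    """^(\S+) \S+ \S+ \[([^\]]+)\] "(\S+) (\S+) [^"]*" (\d{3}) (\d+|-)""".r.unanchored

  case class AccessLog(ip: String, time: String, method: String,
                       uri: String, status: Int, bytes: String)

  def parse(line: String): Option[AccessLog] = line match {
    case LogPattern(ip, time, method, uri, status, bytes) =>
      Some(AccessLog(ip, time, method, uri, status.toInt, bytes))
    case _ => None
  }

  def main(args: Array[String]): Unit = {
    val sample = "192.168.25.1 - - [18/Apr/2020:14:15:45 +0800] " +
      "\"GET /nocar/Download?nocarUrl=a/b/E/n/i== HTTP/1.1\" 200 103580 \"-\" \"Mozilla/5.0 ...\""
    // prints Some(AccessLog(192.168.25.1, 18/Apr/2020:14:15:45 +0800, GET, ..., 200, 103580))
    println(parse(sample))
  }
}

The Flink job below takes a simpler route and just splits on spaces, which is enough because it only needs the first token, the client IP.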
2. Flink processing code
The job mainly does four things: extract the client IP, filter the data, check that each record is complete and the IP is valid, and aggregate the counts.
package com.bigdata.appNginx

import java.util.regex.{Matcher, Pattern}

import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.table.api.{Table, Types}
import org.apache.flink.table.api.scala.StreamTableEnvironment
import org.apache.flink.table.descriptors.{Json, Kafka, Schema}
import org.apache.flink.types.Row

object FlinktableToKafka {
  // Kafka / ZooKeeper connection settings
  val topicSource: String = "bigdata_nginx_access"
  val topicSink: String = "bigdata_nginx_access_message"
  val topicVersion: String = "0.10"
  val zkCluster: String = "192.168.25.1:2181,192.168.25.2:2181,192.168.25.3:2181"
  val kafkaCluster: String = "192.168.25.1:9092,192.168.25.2:9092,192.168.25.3:9092"
  val groupId: String = "flink_access"

  def main(args: Array[String]): Unit = {
    val (env: StreamExecutionEnvironment, tableEnv: StreamTableEnvironment) = initTableKafka
    // SQL: project the message column from the registered table
    val sql: String = "select a.message from message a"
    val table: Table = tableEnv.sqlQuery(sql)
    //val stream: Table = tableEnv.scan("message")
    // convert the Table into an append-only stream of Rows
    val message: DataStream[Row] = tableEnv.toAppendStream[Row](table).setParallelism(1)
    //message.print()
    // filter out dirty data: keep rows whose first token is a valid IP
    // and that have enough fields to be a complete access-log line
    val stream: DataStream[Row] = message.filter { row =>
      val fields = row.toString.split(" ")
      isIp(fields(0)) && fields.length > 15
    }
    // map each record to (clientIp, 1) for counting
    val tupleValue: DataStream[(String, Int)] = stream.map(row => (row.toString.split(" ")(0), 1))
    val result: DataStream[(String, Int)] = tupleValue.keyBy(0)
      // sliding window: a 10-minute window, re-evaluated every 10 seconds
      .timeWindow(Time.minutes(10), Time.seconds(10))
      .sum(1)
    result.print() // print the per-IP counts
    table.printSchema()
    env.execute("example")
  }

  /**
   * Validate an IPv4 address with a regex.
   */
  def isIp(addr: String): Boolean = {
    // anything shorter than "1.1.1.1" or longer than "255.255.255.255" cannot be valid
    if (addr == null || addr.length < 7 || addr.length > 15) return false
    val pat: Pattern = Pattern.compile("([1-9]|[1-9]\\d|1\\d{2}|2[0-4]\\d|25[0-5])(\\.(\\d|[1-9]\\d|1\\d{2}|2[0-4]\\d|25[0-5])){3}")
    val mat: Matcher = pat.matcher(addr)
    mat.matches()
  }

  /**
   * Initialize the Kafka source and register it as table "message".
   * @return the stream and table environments
   */
  private def initTableKafka = {
    // for batch programs use ExecutionEnvironment instead of StreamExecutionEnvironment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    // create a TableEnvironment
    val tableEnv: StreamTableEnvironment = StreamTableEnvironment.create(env)
    tableEnv
      .connect(
        new Kafka()
          .version(topicVersion)
          .topic(topicSource)
          .startFromLatest()
          .property("group.id", groupId)
          .property("zookeeper.connect", zkCluster)
          .property("bootstrap.servers", kafkaCluster))
      .withFormat(new Json().failOnMissingField(true).deriveSchema())
      .withSchema(
        new Schema()
          .field("@timestamp", Types.STRING)
          .field("@metadata", Types.MAP(Types.STRING, Types.STRING))
          .field("message", Types.STRING)
      )
      .inAppendMode()
      .registerTableSource("message")
    (env, tableEnv)
  }
}
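Two notes on the code. First, the original isIp had a subtle bug: in Scala, "if (cond) false" without an explicit return does not exit the method, so the length guard was silently ignored; the version above uses "return false" and mat.matches() instead of mat.find(), so partial matches like "abc192.168.1.1" no longer pass. Second, since the job silently drops every record isIp rejects, the helper is worth sanity-checking on its own. A minimal standalone check (the sample addresses are my own):

import java.util.regex.Pattern

object IsIpCheck {
  // same validation logic as FlinktableToKafka.isIp
  def isIp(addr: String): Boolean = {
    if (addr == null || addr.length < 7 || addr.length > 15) return false
    Pattern
      .compile("([1-9]|[1-9]\\d|1\\d{2}|2[0-4]\\d|25[0-5])(\\.(\\d|[1-9]\\d|1\\d{2}|2[0-4]\\d|25[0-5])){3}")
      .matcher(addr)
      .matches()
  }

  def main(args: Array[String]): Unit = {
    assert(isIp("192.168.25.1"))    // valid private address
    assert(isIp("255.255.255.255")) // upper bound of every octet
    assert(!isIp("256.1.1.1"))      // octet out of range
    assert(!isIp("not-an-ip"))      // not numeric at all
    assert(!isIp(""))               // empty string
    println("all isIp checks passed")
  }
}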
3. Problems encountered
1. A JSON error was thrown at the new Json() call inside withFormat.
Fix: the flink-json dependency was missing. The first time I read the Flink docs I did not look closely and only copied the dependencies from the pom shown there: https://ci.apache.org/projects/flink/flink-docs-release-1.8/dev/projectsetup/dependencies.html. If the data is in JSON format, the following dependency is required.
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-json</artifactId>
    <version>1.8.0</version>
</dependency>
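For the same reason, the descriptor-based Kafka source also needs the matching connector artifact on the classpath. Assuming Kafka 0.10 and Scala 2.11 as in this setup (adjust the artifact suffix and version to your own build), the dependency would look like this:

<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-kafka-0.10_2.11</artifactId>
    <version>1.8.0</version>
</dependency>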