I. Integrating Flink SQL with Kafka
1. Create the topic flink-topic (one partition, one replica)
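A typical creation command (a sketch assuming a local broker on localhost:9092; older Kafka versions take --zookeeper localhost:2181 instead of --bootstrap-server):
kafka-topics.sh --create --bootstrap-server localhost:9092 --replication-factor 1 --partitions 1 --topic flink-topic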
2. Get flink-sql-connector-kafka_2.12-1.13.1.jar and put it under flink/lib
3. Start the SQL client, pointing it at the connector jar
./sql-client.sh embedded -j ../lib/flink-sql-connector-kafka_2.12-1.13.1.jar shell
Set the result display mode: set execution.result-mode=tableau;
4. Create a table mapped to the Kafka topic
Data in the Kafka topic is in CSV format with three fields: user_id, item_id, and behavior. Consumption is set to start from the latest offset.
CREATE TABLE test_kafka(
`user_id` BIGINT,
`item_id` BIGINT,
`behavior` STRING
)
WITH(
'connector' = 'kafka',
'topic'='flink-topic',
'properties.bootstrap.servers' = 'localhost:9092',
'properties.group.id' = 'test-group-10001',
'scan.startup.mode' = 'latest-offset',
'format' = 'csv'
);
Flink SQL> select * from test_kafka;
+----+----------------------+----------------------+--------------------------------+
| op | user_id | item_id | behavior |
+----+----------------------+----------------------+--------------------------------+
5. Write data to Kafka
kafka-console-producer.sh --broker-list localhost:9092 --topic flink-topic
1001,90001,click
1001,90001,browser
1001,90001,click
1002,90002,click
1002,90003,click
1003,90001,order
1004,90001,order
The data can then be queried and processed in real time:
Flink SQL> select * from test_kafka;
+----+----------------------+----------------------+--------------------------------+
| op | user_id | item_id | behavior |
+----+----------------------+----------------------+--------------------------------+
| +I | 1001 | 90001 | click |
| +I | 1001 | 90001 | browser |
| +I | 1001 | 90001 | click |
| +I | 1002 | 90002 | click |
| +I | 1002 | 90003 | click |
| +I | 1003 | 90001 | order |
| +I | 1004 | 90001 | order |
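Continuous aggregations work the same way; for example, counting events per behavior (a sketch, output not shown) produces an updating changelog, so the op column would also contain -U/+U retraction rows as the counts change:
Flink SQL> select behavior, count(*) as cnt from test_kafka group by behavior;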
II. Code implementation
package com.zhen.hudi;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import static org.apache.flink.table.api.Expressions.$;
/**
 * @Author FengZhen
 * @Date 3/9/22 10:17 PM
 * @Description Based on the Flink SQL connectors: consume data from a Kafka topic in real time, transform it, and stream the result into a Hudi table
 */
public class FlinkSQLHudiDemo {
public static void main(String[] args) {
// 1. Set up the execution environment
StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment();
// set the parallelism to 1
env.setParallelism(1);
// NOTE: data is written to the Hudi table incrementally, so Flink checkpointing must be enabled
env.enableCheckpointing(5 * 1000);
EnvironmentSettings settings = EnvironmentSettings
.newInstance()
.inStreamingMode() // streaming mode
.build();
StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env, settings);
// 2. Create the input table that consumes data from Kafka
tableEnv.executeSql(
"CREATE TABLE order_kafka_source(\n" +
" `orderId` STRING,\n" +
" `userId` STRING,\n" +
" `orderTime` STRING,\n" +
" `ip` STRING,\n" +
" `orderMoney` DOUBLE,\n" +
" `orderStatus` INT\n" +
")\n" +
"WITH(\n" +
" 'connector' = 'kafka',\n" +
" 'topic'='order-topic',\n" +
" 'properties.bootstrap.servers' = 'localhost:9092',\n" +
" 'properties.group.id' = 'gid-1001',\n" +
" 'scan.startup.mode' = 'latest-offset',\n" +
" 'format' = 'json',\n" +
" 'json.fail-on-missing-field' = 'false',\n" +
" 'json.ignore-parse-errors' = 'true'\n" +
")\n"
);
// 3. Transform the data; this can be done in SQL or with the Table API
Table etlTable = tableEnv
.from("order_kafka_source")
// add column: the timestamp field Hudi uses to merge (precombine) records
.addColumns(
$("orderId").substring(0,17).as("ts")
)
// add column: the Hudi partition field, derived from orderTime, e.g. "orderTime": 2022-03-09 22:21:13.124
.addColumns(
$("orderTime").substring(0, 10).as("partition_day")
);
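// The same derivation expressed in SQL would look roughly like this
// (a sketch; note that SQL SUBSTRING positions are 1-based):
//   SELECT *,
//     SUBSTRING(orderId FROM 1 FOR 17) AS ts,
//     SUBSTRING(orderTime FROM 1 FOR 10) AS partition_day
//   FROM order_kafka_source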
tableEnv.createTemporaryView("view_order", etlTable);
// 4. Create the output table mapped to the Hudi table: table name, storage path, field names, etc.
tableEnv.executeSql(
"CREATE TABLE order_hudi_sink(\n" +
" `orderId` STRING PRIMARY KEY NOT ENFORCED,\n" +
" `userId` STRING,\n" +
" `orderTime` STRING,\n" +
" `ip` STRING,\n" +
" `orderMoney` DOUBLE,\n" +
" `orderStatus` INT,\n" +
" `ts` STRING,\n" +
" `partition_day` STRING\n" +
")\n" +
"PARTITIONED BY (partition_day)\n" +
"WITH(\n" +
" 'connector' = 'hudi',\n" +
" 'path'='hdfs://localhost:9000/hudi-warehouse/flink_hudi_order',\n" +
" 'table.type' = 'MERGE_ON_READ',\n" +
" 'write.operation' = 'upsert',\n" +
" 'hoodie.datasource.write.recordkey.field' = 'orderId',\n" +
" 'write.precombine.field' = 'ts',\n" +
" 'write.tasks' = '1'\n" +
")\n"
);
// 5. Write the data into the output table via a subquery
tableEnv.executeSql(
"INSERT INTO order_hudi_sink " +
"SELECT orderId, userId, orderTime, ip, orderMoney, orderStatus, ts, partition_day FROM view_order"
);
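// Note: executeSql submits the INSERT job asynchronously and returns immediately;
// to keep a standalone main() alive until the job terminates, the returned
// TableResult can be blocked on via .await() (available since Flink 1.12)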
}
}
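Once the job is running, the Hudi table can be queried back from the SQL client. A minimal sketch, reusing the connector options from the sink table above (it assumes the hudi-flink bundle jar is on the SQL client classpath):
CREATE TABLE order_hudi(
`orderId` STRING PRIMARY KEY NOT ENFORCED,
`userId` STRING,
`orderTime` STRING,
`ip` STRING,
`orderMoney` DOUBLE,
`orderStatus` INT,
`ts` STRING,
`partition_day` STRING
)
PARTITIONED BY (partition_day)
WITH(
'connector' = 'hudi',
'path' = 'hdfs://localhost:9000/hudi-warehouse/flink_hudi_order',
'table.type' = 'MERGE_ON_READ'
);
Flink SQL> select * from order_hudi;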
Kafka data generator utility:
package com.zhen.hudi.streaming
import java.util.Properties
import org.apache.commons.lang3.time.FastDateFormat
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
import org.apache.kafka.common.serialization.StringSerializer
import org.json4s.jackson.Json
import scala.util.Random
/**
 * Order entity (case class)
 *
 * @param orderId     order ID
 * @param userId      user ID
 * @param orderTime   order date-time
 * @param ip          IP address the order was placed from
 * @param orderMoney  order amount
 * @param orderStatus order status
 */
case class OrderRecord(
orderId: String,
userId: String,
orderTime: String,
ip: String,
orderMoney: Double,
orderStatus: Int
)
/**
 * @Author FengZhen
 * @Date 3/3/22 9:54 PM
 * @Description
 * Simulates order data and sends it to a Kafka topic.
 * Each message in the topic is a String in JSON format.
 * Conversion: OrderRecord instances are serialized to JSON strings (using the json4s library)
 */
object MockOrderProducer {
def main(args: Array[String]): Unit = {
var producer: KafkaProducer[String, String] = null
try {
// 1. Kafka producer configuration
val props = new Properties()
props.put("bootstrap.servers", "localhost:9092")
props.put("acks", "1")
props.put("retries", "3")
// props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
// props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
props.put("key.serializer", classOf[StringSerializer].getName)
props.put("value.serializer", classOf[StringSerializer].getName)
// 2. Create the KafkaProducer with the configuration
producer = new KafkaProducer[String, String](props)
// random number generator
val random: Random = new Random()
// order status: 0 = open, 1 = cancelled, 2 = closed, 3 = completed (the array is weighted so most orders stay open)
val allStatus = Array(0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
while (true) {
// number of orders to generate per iteration (nextInt(1) is always 0, so this is always 1)
val batchNumber: Int = random.nextInt(1) + 1
(1 to batchNumber).foreach { number =>
val currentTime: Long = System.currentTimeMillis()
val orderId: String = s"${getDate(currentTime)}%06d".format(number)
val userId: String = s"${1 + random.nextInt(5)}%08d".format(random.nextInt(1000))
val orderTime: String = getDate(currentTime, format = "yyyy-MM-dd HH:mm:ss.SSS")
val orderMoney: String = s"${5 + random.nextInt(500)}.%02d".format(random.nextInt(100))
val orderStatus: Int = allStatus(random.nextInt(allStatus.length))
// 3. Build the order record
val orderRecord: OrderRecord = OrderRecord(
orderId, userId, orderTime, getRandomIp, orderMoney.toDouble, orderStatus
)
// serialize to a JSON string
val orderJson = new Json(org.json4s.DefaultFormats).write(orderRecord)
println(orderJson)
// 4. Build the ProducerRecord
val record = new ProducerRecord[String, String]("order-topic", orderId, orderJson)
// 5. Send the record to the topic
producer.send(record)
}
Thread.sleep(random.nextInt(500) + 5000)
}
} catch {
case e: Exception => e.printStackTrace()
} finally {
if (null != producer) producer.close()
}
}
/** Format the given epoch milliseconds as a date string */
def getDate(time: Long, format: String = "yyyyMMddHHmmssSSS"): String = {
val fastFormat: FastDateFormat = FastDateFormat.getInstance(format)
val formatDate: String = fastFormat.format(time) // format the date
formatDate
}
/** Generate a random IP address */
def getRandomIp: String = {
// IP ranges (encoded as signed Int boundaries)
val range: Array[(Int, Int)] = Array(
(607649792, 608174079), //36.56.0.0-36.63.255.255
(1038614528, 1039007743), //61.232.0.0-61.237.255.255
(1783627776, 1784676351), //106.80.0.0-106.95.255.255
(2035023872, 2035154943), //121.76.0.0-121.77.255.255
(2078801920, 2079064063), //123.232.0.0-123.235.255.255
(-1950089216, -1948778497), //139.196.0.0-139.215.255.255
(-1425539072, -1425014785), //171.8.0.0-171.15.255.255
(-1236271104, -1235419137), //182.80.0.0-182.92.255.255
(-770113536, -768606209), //210.25.0.0-210.47.255.255
(-569376768, -564133889) //222.16.0.0-222.95.255.255
)
// random index into the IP range array
val random = new Random()
val index = random.nextInt(10)
val ipNumber: Int = range(index)._1 + random.nextInt(range(index)._2 - range(index)._1)
// convert the Int-encoded IP address to IPv4 dotted notation
number2IpString(ipNumber)
}
/** Convert an Int-encoded IPv4 address to a dotted string */
def number2IpString(ip: Int): String = {
val buffer: Array[Int] = new Array[Int](4)
buffer(0) = (ip >> 24) & 0xff
buffer(1) = (ip >> 16) & 0xff
buffer(2) = (ip >> 8) & 0xff
buffer(3) = ip & 0xff
// return the IPv4 address
buffer.mkString(".")
}
}
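To run the pipeline end to end (a suggested sequence under the same local setup as above): create the order-topic topic, start MockOrderProducer, then run FlinkSQLHudiDemo. Since the Hudi writer commits on Flink checkpoints, data files should start appearing under hdfs://localhost:9000/hudi-warehouse/flink_hudi_order/ (partitioned by partition_day) within a few checkpoint intervals (5s each here).
kafka-topics.sh --create --bootstrap-server localhost:9092 --replication-factor 1 --partitions 1 --topic order-topic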