Flink Summary
- Create the topics: 2 partitions, replication factor 1, Zookeeper chroot /kafka
kafka-topics.sh --create --topic ods_mall_log --zookeeper bigdata1:2181,bigdata2:2181,bigdata3:2181/kafka --partitions 2 --replication-factor 1
kafka-topics.sh --create --topic ods_mall_data --zookeeper bigdata1:2181,bigdata2:2181,bigdata3:2181/kafka --partitions 2 --replication-factor 1
Question 1: Real-time data collection with Flume
#### 1. From netcat to Kafka
scp the cluster's ${FLUME_HOME} over to the Ubuntu machine, then open ${FLUME_HOME}/docs/FlumeUserGuide.html with the see command (or paste the path straight into a browser) to look up the configuration reference.
# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# Describe/configure the source (oddly, the bind address has to be localhost)
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 25001
# Describe the sink
a1.sinks.k1.channel = c1
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.kafka.topic = ods_mall_log
a1.sinks.k1.kafka.bootstrap.servers = bigdata1:9092,bigdata2:9092,bigdata3:9092
a1.sinks.k1.kafka.flumeBatchSize = 20
a1.sinks.k1.kafka.producer.acks = 1
a1.sinks.k1.kafka.producer.linger.ms = 1
a1.sinks.k1.kafka.producer.compression.type = snappy
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
Start Flume:
flume-ng agent -n a1 -c ${FLUME_HOME}/conf -f netcat_kafka.conf
Run the /data_log/gen_ds_data_to_socket script, then check the result in Kafka:
kafka-console-consumer.sh --topic ods_mall_log --bootstrap-server bigdata1:9092,bigdata2:9092,bigdata3:9092 --max-messages 2 --group group1 --from-beginning
2. Maxwell configuration
Copy and edit Maxwell's config.properties:
cp ${MAXWELL_HOME}/config.properties.example ${MAXWELL_HOME}/config.properties
Maxwell config file:
producer=kafka
kafka.bootstrap.servers=bigdata1:9092,bigdata2:9092,bigdata3:9092
host=localhost
user=root
password=123456
kafka_topic=ods_mall_data
jdbc_options=useSSL=false&serverTimezone=Asia/Shanghai
Start Maxwell:
${MAXWELL_HOME}/bin/maxwell --config ${MAXWELL_HOME}/config.properties --daemon
Check the result:
kafka-console-consumer.sh --topic ods_mall_data --bootstrap-server bigdata1:9092,bigdata2:9092,bigdata3:9092 --max-messages 2 --group group1 --from-beginning
Question 2: Migrating data with Flink into Kafka, HBase and Hive
1. Kafka (ods_mall_data) to Kafka (fact_order_master and fact_order_detail), keeping only the data field
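Maxwell wraps every row change in a JSON envelope with table, type and data fields, and the program below keeps only the data payload of the two order tables. As a reminder of what the flatMap is pulling apart, a tiny standalone json4s sketch; the record and its field values are made up for illustration:

import org.json4s.jackson.JsonMethods._

object MaxwellEnvelopeSketch {
  def main(args: Array[String]): Unit = {
    // hypothetical Maxwell message; the real ones carry the full changed row in "data"
    val record = """{"database":"ds_db","table":"order_master","type":"insert","data":{"order_id":1,"order_sn":"sn-001"}}"""
    val json = parse(record)
    val table = (json \ "table").values.toString // "order_master"
    val data = compact(json \ "data")            // {"order_id":1,"order_sn":"sn-001"}
    println(s"$table -> $data")
  }
}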
package TestTwo
import org.apache.flink.api.common.eventtime.WatermarkStrategy
import org.apache.flink.api.common.functions.FlatMapFunction
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.configuration.Configuration
import org.apache.flink.connector.kafka.sink.{KafkaRecordSerializationSchema, KafkaSink}
import org.apache.flink.connector.kafka.source.KafkaSource
import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector
import org.json4s.jackson.JsonMethods._
case class TableAnddata(table: String, data: String)
object gamework1 {
def main(args: Array[String]): Unit = {
val BOOTSTRAP_SERVER = "bigdata1:9092,bigdata2:9092,bigdata3:9092"
//read from Kafka
val kafkaSource = KafkaSource.builder()
.setTopics("ods_mall_data")
.setBootstrapServers(BOOTSTRAP_SERVER)
.setValueOnlyDeserializer(new SimpleStringSchema())
.setStartingOffsets(OffsetsInitializer.earliest())
.build()
var env = StreamExecutionEnvironment.getExecutionEnvironment
if ((args.length > 0 && args(0).equals("local")) || args.length == 0) {
val configuration: Configuration = new Configuration()
configuration.setString("rest.bind-port", "8020-8080")
env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(configuration)
}
//read from the source only after env has its final value, otherwise the stream is bound to the discarded environment
val data = env.fromSource(kafkaSource, WatermarkStrategy.noWatermarks(), "kafka Source")
val master_detail = data.flatMap(new FlatMapFunction[String, TableAnddata] {
override def flatMap(t: String, collector: Collector[TableAnddata]): Unit = {
val value = parse(t)
val table = (value \ "table").values.toString
if (table.nonEmpty && (table.equals("order_master") || table.equals("order_detail"))) {
collector.collect(TableAnddata(table, compact(value \ "data")))
}
}
})
master_detail.filter(_.table.equals("order_master")).map(_.data).sinkTo(KafkaSink.builder()
.setBootstrapServers("bigdata1:9092,bigdata1:9092,bigdata1:9092")
.setRecordSerializer(KafkaRecordSerializationSchema.builder()
.setTopic("fact_order_master")
.setValueSerializationSchema(new SimpleStringSchema())
.build()).build())
master_detail.filter(_.table.equals("order_detail")).map(_.data).sinkTo(KafkaSink.builder()
.setBootstrapServers("bigdata1:9092,bigdata1:9092,bigdata1:9092")
.setRecordSerializer(KafkaRecordSerializationSchema.builder()
.setTopic("fact_order_detail")
.setValueSerializationSchema(new SimpleStringSchema())
.build()).build())
env.execute("work2")
}
}
2. Kafka (ods_mall_log) to Kafka (log_product_browse), keeping only product_browse records
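The log topic carries plain text lines of the form <event_type>:<payload> (the sample record under task 3 shows the product_browse payload), so the job below only has to check the prefix. A minimal sketch of that predicate with made-up lines:

object LogPrefixFilterSketch {
  def main(args: Array[String]): Unit = {
    // made-up lines following the <event_type>:<payload> convention
    val lines = Seq(
      "product_browse:123231,13452,0,0,20221221012143",
      "start_log:whatever,20221221012200")
    // same predicate as the Flink filter below
    lines.filter(_.split(":")(0).equals("product_browse")).foreach(println)
  }
}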
package TestTwo
import org.apache.flink.api.common.eventtime.WatermarkStrategy
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.configuration.Configuration
import org.apache.flink.connector.kafka.sink.{KafkaRecordSerializationSchema, KafkaSink}
import org.apache.flink.connector.kafka.source.KafkaSource
import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer
import org.apache.flink.streaming.api.scala._
object gamework2 {
def main(args: Array[String]): Unit = {
val BOOTSTRAP_SERVER = "bigdata1:9092,bigdata2:9092,bigdata3:9092"
//read from Kafka
val kafkaSource = KafkaSource.builder()
.setTopics("ods_mall_log")
.setBootstrapServers(BOOTSTRAP_SERVER)
.setValueOnlyDeserializer(new SimpleStringSchema())
.setStartingOffsets(OffsetsInitializer.earliest())
.build()
var env = StreamExecutionEnvironment.getExecutionEnvironment
if ((args.length > 0 && args(0).equals("local")) || args.length == 0) {
val configuration: Configuration = new Configuration()
configuration.setString("rest.bind-port", "8020-8080")
env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(configuration)
}
env.fromSource(kafkaSource, WatermarkStrategy.noWatermarks(), "kafka Source")
.filter(data=>data.split(":")(0).equals("product_browse"))
.sinkTo(KafkaSink.builder()
.setBootstrapServers("bigdata1:9092,bigdata1:9092,bigdata1:9092")
.setRecordSerializer(KafkaRecordSerializationSchema.builder()
.setTopic("fact_order_detail")
.setValueSerializationSchema(new SimpleStringSchema())
.build()).build())
env.execute("work2")
}
}
By the time I got this far it was already 12 o'clock; the next three hours went into debugging Question 2 and writing the code for Question 3.
2. From Kafka (fact_order_master, fact_order_detail and log_product_browse) to HBase (ods:order_master, ods:order_detail, ods:product_browse)
package TestTwo
import org.apache.flink.api.common.eventtime.WatermarkStrategy
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.configuration.Configuration
import org.apache.flink.connector.kafka.sink.{KafkaRecordSerializationSchema, KafkaSink}
import org.apache.flink.connector.kafka.source.KafkaSource
import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer
import org.apache.flink.streaming.api.scala._
import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment
import org.apache.flink.table.catalog.hive.HiveCatalog
import org.apache.flink.table.functions.ScalarFunction
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods._
import java.io.File
import java.text.SimpleDateFormat
import java.util.Date
import scala.util.Random
object gamework3 {
def main(args: Array[String]): Unit = {
val BOOTSTRAP_SERVER = "bigdata1:9092,bigdata2:9092,bigdata3:9092"
//read from Kafka
val order_master_kafka=KafkaSource.builder()
.setTopics("fact_order_master")
.setBootstrapServers(BOOTSTRAP_SERVER)
.setValueOnlyDeserializer(new SimpleStringSchema())
.setStartingOffsets(OffsetsInitializer.earliest())
.build()
val order_detail_kafka=KafkaSource.builder()
.setTopics("fact_order_detail")
.setBootstrapServers(BOOTSTRAP_SERVER)
.setValueOnlyDeserializer(new SimpleStringSchema())
.setStartingOffsets(OffsetsInitializer.earliest())
.build()
val product_brows_kafka=KafkaSource.builder()
.setTopics("log_product_browse")
.setBootstrapServers(BOOTSTRAP_SERVER)
.setValueOnlyDeserializer(new SimpleStringSchema())
.setStartingOffsets(OffsetsInitializer.earliest())
.build()
var env = StreamExecutionEnvironment.getExecutionEnvironment
var tableEnv=StreamTableEnvironment.create(env)
if ((args.length > 0 && args(0).equals("local")) || args.length == 0) {
val configuration: Configuration = new Configuration()
configuration.setString("rest.bind-port", "8020-8080")
env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(configuration)
tableEnv=StreamTableEnvironment.create(env)
}
var fileConf="/opt/module/hive/conf"
if(!(new File(fileConf).exists())){
fileConf="./src/main/resources"
}
val myhive=new HiveCatalog("myhive","test",fileConf)
tableEnv.registerCatalog("myhive",myhive)
tableEnv.useCatalog("myhive")
tableEnv.getConfig.getConfiguration.setString("table.dynamic-table-options.enabled","true")
//Creating the tables: getting the schemas and data formats right ate a huge amount of time
val order_master = env.fromSource(order_master_kafka, WatermarkStrategy.noWatermarks(), "kafka Source")
.map(data => {
implicit val format = DefaultFormats
parse(data).extract[Order_master]
})
val order_detail =env.fromSource(order_detail_kafka, WatermarkStrategy.noWatermarks(), "kafka Source").map(data=>{
implicit val format=DefaultFormats
parse(data).extract[Order_detail]
})
val product_browse =env.fromSource(product_brows_kafka, WatermarkStrategy.noWatermarks(), "kafka Source").map(data=>{
implicit val format=DefaultFormats
parse(data).extract[Product_browse]
})
tableEnv.createTemporaryView("order_master",order_master)
tableEnv.createTemporaryView("order_detail",order_detail)
tableEnv.createTemporaryView("product_browse",product_browse)
//TODO huge time sink: writing out the table schemas
//1.order_master
tableEnv.executeSql(
"""
|create table hbase_order_master
|(rowkey STRING, info ROW<...full schema, lots of columns...>)
|with
|(
|'connector'='
|)
|""".stripMargin)
//2.order_detail
tableEnv.executeSql(
"""
|create table hbase_order_detail
|(rowkey STRING, info ROW<...full schema, lots of columns...>)
|with
|(
|'connector'='
|)
|""".stripMargin)
//3.product_browse
tableEnv.executeSql(
"""
|create table hbase_product_browse
|(rowkey STRING, info ROW<...full schema, lots of columns...>)
|with
|(
|'connector'='
|)
|""".stripMargin)
//Gotcha: insertInto from the Table API did not work here, only SQL INSERT did
//register the rowkey UDF
tableEnv.registerFunction("getRowkey",new getRowKey)
tableEnv.executeSql(
"""
|insert into hbase_order_master
|select
|getRowkey() as rowkey,
|Row(
|cast(... each field as its target type ...)
|)
|from order_master
|""".stripMargin)
tableEnv.executeSql(
"""
|insert into hbase_order_detail
|select
|getRowkey() as rowkey,
|Row(
|cast(... each field as its target type ...)
|)
|from order_detail
|""".stripMargin)
//product_browse needs one extra nesting level so the generated log_id can be reused in the rowkey
tableEnv.executeSql(
"""
|insert into hbase_product_browse
|select concat_ws("",substring(log_id,1,2),substring(log_id,3,6),substring(log_id,6)) as rowkey,
|Row(
|*
|)
|from(
| select
| getRowkey() as log_id,
| cast(... each field as its target type ...)
| from product_browse
|) a
|""".stripMargin)
env.execute("work3")
}
}
//huge time sink: writing out the case classes (fields elided here)
case class Order_master()
case class Order_detail()
case class Product_browse()
class getRowKey extends ScalarFunction{
//the rowkey is odd: a random digit 0-9 concatenated with a yyyyMMddHHmmssSSS timestamp
def eval(): String ={
val format = new SimpleDateFormat("yyyyMMddHHmmssSSS")
val date = new Date()
Random.nextInt(10).toString + format.format(date)
}
}
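The create-table and insert bodies above are only placeholders in my notes (the real schemas were huge). For reference, a hedged minimal sketch of one Flink SQL HBase table plus insert; the two columns, the hbase-2.2 connector version, the znode parent and the demo source are assumptions, not the exam schema:

import org.apache.flink.streaming.api.scala._
import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment

object HbaseConnectorSketch {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val tableEnv = StreamTableEnvironment.create(env)
    // throwaway in-memory source so the sketch is self-contained
    tableEnv.executeSql(
      """
        |create temporary view order_master as
        |select * from (values ('o-001', 12.5), ('o-002', 3.0)) as t(order_id, order_money)
        |""".stripMargin)
    // one rowkey column plus one ROW per column family
    tableEnv.executeSql(
      """
        |create table hbase_demo (
        |  rowkey STRING,
        |  info ROW<order_id STRING, order_money DOUBLE>,
        |  PRIMARY KEY (rowkey) NOT ENFORCED
        |) with (
        |  'connector' = 'hbase-2.2',
        |  'table-name' = 'ods:demo',
        |  'zookeeper.quorum' = 'bigdata1:2181,bigdata2:2181,bigdata3:2181',
        |  'zookeeper.znode.parent' = '/hbase'
        |)
        |""".stripMargin)
    // the insert pairs a rowkey with a ROW(...) matching the info family definition
    tableEnv.executeSql(
      """
        |insert into hbase_demo
        |select cast(order_id as STRING) as rowkey,
        |       ROW(cast(order_id as STRING), cast(order_money as DOUBLE))
        |from order_master
        |""".stripMargin)
  }
}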
3. Compute pv (product page views) and uv (distinct user views) from log_product_browse
1. The result needs: product id, product name, uv, pv, modified_time
2. It has to be written to HBase and queried through Hive (the Hive external table over HBase is already created)
3. The product name comes from joining the Hive table on the product id
4. Structure of log_product_browse
Sample log_product_browse record: product_browse:123231,13452,0,0,20221221012143
i.e. product_id,product_name,-----,----,modified_time
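pv is a plain count of browse events per product, while uv counts each user at most once per product. A tiny batch-style sketch over made-up records of exactly what the keyed state below accumulates:

object PvUvSketch {
  def main(args: Array[String]): Unit = {
    // (product_id, user_id) pairs; values are made up
    val browses = Seq(("123231", "13452"), ("123231", "13452"), ("123231", "99999"), ("555555", "13452"))
    browses.groupBy(_._1).foreach { case (pid, events) =>
      val pv = events.size                    // every browse counts
      val uv = events.map(_._2).distinct.size // each user counts once
      println(s"product $pid: pv=$pv uv=$uv") // 123231: pv=3 uv=2, 555555: pv=1 uv=1
    }
  }
}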
package TestTwo
import org.apache.flink.api.common.eventtime.WatermarkStrategy
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.api.common.state.{MapState, MapStateDescriptor, ValueState, ValueStateDescriptor}
import org.apache.flink.configuration.Configuration
import org.apache.flink.connector.kafka.source.KafkaSource
import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.connectors.redis.RedisSink
import org.apache.flink.streaming.connectors.redis.common.config.FlinkJedisPoolConfig
import org.apache.flink.streaming.connectors.redis.common.mapper.{RedisCommand, RedisCommandDescription, RedisMapper}
import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment
import org.apache.flink.table.catalog.hive.HiveCatalog
import org.apache.flink.table.functions.ScalarFunction
import org.apache.flink.util.Collector
import java.io.File
import java.text.SimpleDateFormat
import java.time.Duration
import java.util.Date
import scala.collection.mutable
import scala.util.Random
//Hive external table mapped onto HBase; #b marks the non-string columns, which are transferred as bytes
/*
* create external table pv_uv_result
* (product_id string,product_name string,uv bigint,pv bigint,modified_time timestamp)
* stored by 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
* with
* SERDEPROPERTIES ("hbase.columns.mapping" = ":key,info:product_id,info:product_name,info:uv#b,info:pv#b,info:modified_time")
* TBLPROPERTIES ("hbase.table.name" = "pv_uv_result",
*"hbase.mapred.output.outputtable" = "pv_uv_result");
* */
object gamework4 {
def main(args: Array[String]): Unit = {
val BOOTSTRAP_SERVER = "bigdata1:9092,bigdata2:9092,bigdata3:9092"
//read from Kafka
val kafkaSource = KafkaSource.builder()
.setTopics("log_product_browse")
.setBootstrapServers(BOOTSTRAP_SERVER)
.setValueOnlyDeserializer(new SimpleStringSchema())
.setStartingOffsets(OffsetsInitializer.earliest())
.build()
var env = StreamExecutionEnvironment.getExecutionEnvironment
var tableEnv=StreamTableEnvironment.create(env)
if ((args.length > 0 && args(0).equals("local")) || args.length == 0) {
val configuration: Configuration = new Configuration()
configuration.setString("rest.bind-port", "8020-8080")
env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(configuration)
tableEnv=StreamTableEnvironment.create(env)
}
val redisConf = new FlinkJedisPoolConfig.Builder().setHost("bigdata1").setPort(6379).build()
var fileConf="/opt/module/hive/conf"
if(!(new File(fileConf).exists())){
fileConf="./src/main/resources"
}
val myhive=new HiveCatalog("myhive","ods",fileConf)
tableEnv.registerCatalog("myhive",myhive)
tableEnv.useCatalog("myhive")
tableEnv.getConfig.getConfiguration.setString("table.dynamic-table-options.enabled","true")
val toJoinHive = env.fromSource(kafkaSource, WatermarkStrategy.noWatermarks(), "kafka Source")
.map(data => {
//payload: product_id,user_id,..,..,modified_time (the notes above label the second field product_name, but it is used here as the user id for uv)
val str = data.split(":")(1).split(",")
val format = new SimpleDateFormat("yyyyMMddHHmmss")
Product_browse(str(0), str(1), format.parse(str(4)).getTime)
}).assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor[Product_browse](Time.milliseconds(3000)) {
override def extractTimestamp(t: Product_browse): Long = {
t.modified_time
}
})
.keyBy(_.pid)
.process(new MyKeyedFunction())
val data = toJoinHive
tableEnv.createTemporaryView("toJoinHive",toJoinHive)
//write to HBase
//----- create the table, then insert into it
tableEnv.executeSql(
"""
|create table hbase_pvuv
|(rowkey STRING, info ROW<...columns...>)
|with (
|'connector' = 'hbase-1.4',
|'table-name' = 'pv_uv_result',
|'zookeeper.quorum' = 'bigdata1:2181,bigdata2:2181,bigdata3:2181'
|)
|""".stripMargin)
tableEnv.registerFunction("getRowkey",new getRowKey)
//in hindsight this could probably have been done directly in SQL, without the ProcessFunction
tableEnv.executeSql(
"""
|insert into hbase_pvuv
|select getRowkey() as rowkey, Row(dim.pid, pinfo.product_name, dim.uv, dim.pv, dim.modified_time)
|from toJoinHive as dim
|left join myhive.product_info as pinfo
|on dim.pid = pinfo.product_id
|""".stripMargin)
env.execute("work4")
//the Redis write for the last sub-question would be attached here
// .addSink(new RedisSink(redisConf,new MyRedisMapper()))
}
}
case class Product_browse(pid:String,uid:String,modified_time:Long)
case class TOJoinHive(pid:String,uid:String,uv:Long,pv:Long,modified_time:Long)
class MyKeyedFunction extends KeyedProcessFunction[String,Product_browse,TOJoinHive] {
var pvState:ValueState[Long]=_
var uvState:MapState[String,Long]=_
override def open(parameters: Configuration): Unit = {
//the state handles have to be assigned, not just created
pvState = getRuntimeContext.getState(new ValueStateDescriptor[Long]("pv",classOf[Long]))
uvState = getRuntimeContext.getMapState(new MapStateDescriptor[String,Long]("uv",classOf[String],classOf[Long]))
}
override def processElement(i: Product_browse, context: KeyedProcessFunction[String, Product_browse, TOJoinHive]#Context, collector: Collector[TOJoinHive]): Unit = {
//pv: every browse event for this product counts
val pv=pvState.value()+1
pvState.update(pv)
//uv: one entry per user, keeping the latest browse time
val isExist=uvState.contains(i.uid)
if((isExist && uvState.get(i.uid) < i.modified_time) || !isExist){
uvState.put(i.uid,i.modified_time)
}
import scala.collection.JavaConversions._
val uv=uvState.keys().toList.size
collector.collect(TOJoinHive(context.getCurrentKey,i.uid,uv,pv,i.modified_time))
}
}
class MyRedisMapper extends RedisMapper[String] {
//redisCommand: RedisCommand, additionalKey: String
override def getCommandDescription: RedisCommandDescription = new RedisCommandDescription(RedisCommand.SET)
override def getKeyFromData(t: String): String = "key"
override def getValueFromData(t: String): String = t
}
class getRowKey extends ScalarFunction{
//the rowkey is odd: a random digit 0-9 concatenated with a yyyyMMddHHmmssSSS timestamp
def eval(): String ={
val format = new SimpleDateFormat("yyyyMMddHHmmssSSS")
val date = new Date()
Random.nextInt(10).toString + format.format(date)
}
}
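The Redis write for the final sub-question is only a commented-out addSink above. A minimal sketch of how the Bahir RedisSink and the mapper fit together; the key name and the sample value are assumptions:

import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.redis.RedisSink
import org.apache.flink.streaming.connectors.redis.common.config.FlinkJedisPoolConfig
import org.apache.flink.streaming.connectors.redis.common.mapper.{RedisCommand, RedisCommandDescription, RedisMapper}

object RedisSinkSketch {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val redisConf = new FlinkJedisPoolConfig.Builder().setHost("bigdata1").setPort(6379).build()
    // one made-up result string just to show the wiring; the real job streams the pv/uv output here
    env.fromElements("123231,some_product,2,3,20221221012143")
      .addSink(new RedisSink[String](redisConf, new RedisMapper[String] {
        // write everything under a single assumed key using SET
        override def getCommandDescription: RedisCommandDescription = new RedisCommandDescription(RedisCommand.SET)
        override def getKeyFromData(t: String): String = "pv_uv_result"
        override def getValueFromData(t: String): String = t
      }))
    env.execute("redis sink sketch")
  }
}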
Check the result:
scan '<table>', {FORMATTER => 'toString', LIMIT => 2, COLUMNS => ['info:<column>']}
4. Something called GMC(?), never heard of it, can't remember.
–March 23, 2023
–Cao