![在这里插入图片描述](https://i-blog.csdnimg.cn/blog_migrate/c994a8323a0b113ae21f55d8322f0b26.png)
基本操作
list存在的topic
bin/kafka-topics.sh --list --zookeeper linux01:2181就是看里面的库
----------------------------------------------------------------------------------
创建topic
bin/kafka-topics.sh --create --topic doitedu19 --partitions 2 --replication-factor 3 --zookeeper linux01:2181
--create --topic 指定创建topic
doitedu19 创建topic的名字
--partitions 2 指定分区
--replication-factor 3 指定副本
------------------------------------------------------------------------------------
查看topic信息(就是查看创建这个库的详细信息)
bin/kafka-topics.sh --describe --topic doitedu19 --zookeeper linux01:2181
Topic:doitedu19 PartitionCount:2 ReplicationFactor:3 Configs:
哪个topic 分区的数量 副本的数量 其他配置信息
下面两个是两个分区的详细信息0分区和1分区
Topic: doitedu19 Partition: 0 Leader: 1 Replicas: 1,0,2 Isr: 1,0,2
Topic: doitedu19 Partition: 1 Leader: 2 Replicas: 2,1,0 Isr: 2,1,0
----------------------------------------------------------------------------------
控制台生产者工具(这是一个交互式命令行:启动后保持窗口打开,直接输入内容即可向topic写入消息)
bin/kafka-console-producer.sh --broker-list linux01:9092 --topic doitedu19
linux01:9092 broker地址(9092是kafka的默认端口号)
--topic doitedu19 指定要写入的topic
--------------------------------------------------------------------------------
控制台消费者工具
bin/kafka-console-consumer.sh --bootstrap-server linux01:9092,linux02:9092,linux03:9092 --topic doitedu19 --from-beginning
linux01:9092,linux02:9092,linux03:9092 指定broker服务器列表(与上面命令中的地址一致)
topic doitedu19 指定消费的库
--from-beginning 指定从哪开始消费(这边是从开头)
kafka的source
a1.sources = r1
a1.channels = c1
a1.sinks = k1
a1.sources.r1.channels = c1
# Kafka source: consumes messages from Kafka and feeds them into the channel
a1.sources.r1.type = org.apache.flume.source.kafka.KafkaSource
# Kafka broker list, comma separated
a1.sources.r1.kafka.bootstrap.servers = linux01:9092,linux02:9092,linux03:9092
# Kafka consumer group id for this source
a1.sources.r1.kafka.consumer.group.id = doitedu
# comma-separated list of topics to consume
a1.sources.r1.kafka.topics = doitedu19
# max number of messages written to the channel in one batch
a1.sources.r1.batchSize = 1000
a1.sources.r1.useFlumeEventFormat = false
# whether to add a header recording which topic each event came from
a1.sources.r1.setTopicHeader = true
# header key used when setTopicHeader is true (value = topic name)
a1.sources.r1.topicHeader = doit_topic
# start from the earliest offset when the group has no committed offset
a1.sources.r1.kafka.consumer.auto.offset.reset = earliest
a1.channels.c1.type = memory
a1.channels.c1.capacity = 100000
a1.channels.c1.transactionCapacity = 2000
a1.sinks.k1.channel = c1
a1.sinks.k1.type = logger
启动命令
bin/flume-ng agent -c conf/ -f agentsconf/ka -n a1 -Dflume.root.logger=INFO,console
channel file可靠的channel
a1.sources = r1
a1.channels = c1
a1.sinks = k1
a1.sources.r1.channels = c1
# TAILDIR source: tails all files matching the filegroup pattern
a1.sources.r1.type = TAILDIR
a1.sources.r1.filegroups = g1
a1.sources.r1.filegroups.g1 = /root/logs/event.*
a1.sources.r1.batchSize = 100
# file channel: persists events on disk, so data survives agent restarts
a1.channels.c1.type = file
# directory for checkpoint snapshots of the channel state
a1.channels.c1.checkpointDir = /opt/data/flumedata/file-channel/checkpoint
# directory where the channel stores the event data itself
a1.channels.c1.dataDirs = /opt/data/flumedata/file-channel/data
a1.sinks.k1.channel = c1
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = hdfs://linux01:8020/doitedu05/
a1.sinks.k1.hdfs.filePrefix = DoitEduData
a1.sinks.k1.hdfs.fileSuffix = .log
# roll the current file every 60 seconds
a1.sinks.k1.hdfs.rollInterval = 60
# or when it reaches 256 MB, whichever comes first
a1.sinks.k1.hdfs.rollSize = 268435456
# 0 disables rolling by event count
a1.sinks.k1.hdfs.rollCount = 0
a1.sinks.k1.hdfs.batchSize = 100
# write raw event bodies (plain text) instead of SequenceFile
a1.sinks.k1.hdfs.fileType = DataStream
# use the agent's local time for time-based path escapes
a1.sinks.k1.hdfs.useLocalTimeStamp = true
启动命令和创建文件脚本
for i in {1..1000000}; do echo "${i},pppppppppppppppppppppp" >> /root/logs/event.log; sleep 0.5; done
bin/flume-ng agent -c conf/ -f agentsconf/file.cont -n a1 -Dflume.root.logger=INFO,console
kafka channel:把kafka当目标存储,不需要sink
# 把kafka当目标存储,不需要sink
a1.sources = r1
a1.channels = c1
a1.sources.r1.channels = c1
a1.sources.r1.type = exec
a1.sources.r1.batchSize = 100
a1.sources.r1.command = tail -F /root/logs/event.log
# Kafka channel: events are stored directly in a Kafka topic,
# so no sink is configured — Kafka itself is the destination
a1.channels.c1.type = org.apache.flume.channel.kafka.KafkaChannel
# Kafka broker list
a1.channels.c1.kafka.bootstrap.servers = linux01:9092,linux02:9092,linux03:9092
# topic the channel writes to
a1.channels.c1.kafka.topic = flume-channel
# false = store plain event bodies; must match whatever reads this topic downstream
a1.channels.c1.parseAsFlumeEvent = false
启动命令和制造文件命令
bin/flume-ng agent -c conf/ -f agentsconf/xiaoka -n a1 -Dflume.root.logger=INFO,console
for i in {1..1000000}; do echo "${i},pppppppppppppppppppppp" >> /root/logs/event.log; sleep 0.5; done
把kafka当数据源,不需要source
# 把kafka当数据源,不需要source
a1.channels = c1
a1.sinks = k1
# Kafka channel used as the data source: no source is configured,
# the sink drains events straight out of the Kafka topic
a1.channels.c1.type = org.apache.flume.channel.kafka.KafkaChannel
a1.channels.c1.kafka.bootstrap.servers = linux01:9092,linux02:9092,linux03:9092
# topic to read from
a1.channels.c1.kafka.topic = flume-channel
# false = treat records as plain bodies; must match how the topic was written
a1.channels.c1.parseAsFlumeEvent = false
# only consume messages produced after this consumer group first attaches
a1.channels.c1.kafka.consumer.auto.offset.reset = latest
a1.sinks.k1.type = logger
a1.sinks.k1.channel = c1
kafka sink就是向kafka里写东西(实际用处不大,用上面的kafka channel方式就可以了)
#向kafka里写东西
a1.sources = r1
a1.channels = c1
a1.sinks = k1
a1.sources.r1.channels = c1
a1.sources.r1.type = exec
a1.sources.r1.batchSize = 100
a1.sources.r1.command = tail -F /root/logs/wxlog/event.log
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 200
a1.sinks.k1.channel = c1
# Kafka sink: publishes each event to a Kafka topic
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.kafka.bootstrap.servers = linux01:9092,linux02:9092,linux03:9092
# topic to write to
a1.sinks.k1.kafka.topic = mytopic
hive sink(往hive里写东西)
# hive的表结构
-- Target table for the Flume hive sink below.
-- The hive sink writes via the Hive Streaming API, which requires the table
-- to be transactional, stored as ORC, AND bucketed. The original DDL was
-- missing the "clustered by" clause, so streaming ingestion would be
-- rejected on Hive 1.x/2.x (non-bucketed ACID tables are a Hive 3 feature).
set hive.support.concurrency=true;
set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
drop table if exists mytable;
create table mytable(
id int,
name string,
addr string
)
partitioned by (dt string)
clustered by (id) into 2 buckets
row format delimited fields terminated by ','
stored as orc
location '/mytable/'
tblproperties('transactional'='true')
;
# 数据的样例
# 1,zs,18,beijing
# 2,ls,18,shanghai
# 3,ww,18,hebei
a1.sources = r1
a1.channels = c1
a1.sinks = k1
a1.sources.r1.channels = c1
a1.sources.r1.type = exec
a1.sources.r1.batchSize = 100
a1.sources.r1.command = tail -F /root/log/event.log
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000000
a1.channels.c1.transactionCapacity = 30000
a1.sinks.k1.channel = c1
# hive sink: streams events into a transactional Hive table
a1.sinks.k1.type = hive
# thrift URI of the Hive metastore service
a1.sinks.k1.hive.metastore = thrift://linux01:9083
# target Hive database
a1.sinks.k1.hive.database = default
# target Hive table
a1.sinks.k1.hive.table = mytable
# value for the dt partition; %Y-%m-%d is expanded from the event timestamp
a1.sinks.k1.hive.partition = %Y-%m-%d
# use local time (not a timestamp header) when expanding the escapes above
a1.sinks.k1.useLocalTimeStamp = true
# parse each event body as delimited text
a1.sinks.k1.serializer = DELIMITED
# field delimiter of the incoming data
a1.sinks.k1.serializer.delimiter = ,
# maps input fields to table columns; the empty name between "name" and
# "addr" deliberately skips the third CSV field (age in the sample data),
# since the table has only id, name, addr
a1.sinks.k1.serializer.fieldnames = id,name,,addr