![在这里插入图片描述](https://i-blog.csdnimg.cn/blog_migrate/c994a8323a0b113ae21f55d8322f0b26.png)
基本操作
list存在的topic
bin/kafka-topics.sh --list --zookeeper linux01:2181就是看里面的库
----------------------------------------------------------------------------------
创建topic
bin/kafka-topics.sh --create --topic doitedu19 --partitions 2 --replication-factor 3 --zookeeper linux01:2181
--create --topic 指定创建topic
doitedu19 创建topic的名字
--partitions 2 指定分区
--replication-factor 3 指定副本
------------------------------------------------------------------------------------
查看topic信息(就是查看创建这个库的详细信息)
bin/kafka-topics.sh --describe --topic doitedu19 --zookeeper linux01:2181
Topic:doitedu19 PartitionCount:2 ReplicationFactor:3 Configs:
哪个topic 分区的数量 副本的数量 其他配置信息
下面两个是两个分区的详细信息0分区和1分区
Topic: doitedu19 Partition: 0 Leader: 1 Replicas: 1,0,2 Isr: 1,0,2
Topic: doitedu19 Partition: 1 Leader: 2 Replicas: 2,1,0 Isr: 2,1,0
----------------------------------------------------------------------------------
控制台生产者工具(这是一个交互式命令行:启动后保持窗口打开,直接输入内容即可向topic写入消息)
bin/kafka-console-producer.sh --broker-list linux01:9092 --topic doitedu19
linux01:9092 broker地址(9092是kafka的默认端口号)
--topic doitedu19 指定要写入的topic
--------------------------------------------------------------------------------
控制台消费者工具
bin/kafka-console-consumer.sh --bootstrap-server linux01:9092,linux02:9092,linux03:9092 --topic doitedu19 --from-beginning
linux01:9092,linux02:9092,linux03:9092 指定broker服务器列表(与上面命令中的地址一致)
topic doitedu19 指定消费的库
--from-beginning 指定从哪开始消费(这边是从开头)
kafka的source
a1.sources = r1
a1.channels = c1
a1.sinks = k1
a1.sources.r1.channels = c1
# Kafka source: consumes messages from Kafka and feeds them into the channel
a1.sources.r1.type = org.apache.flume.source.kafka.KafkaSource
# Kafka broker list, comma separated
a1.sources.r1.kafka.bootstrap.servers = linux01:9092,linux02:9092,linux03:9092
# Kafka consumer group id for this source
a1.sources.r1.kafka.consumer.group.id = doitedu
# comma-separated list of topics to consume
a1.sources.r1.kafka.topics = doitedu19
# max number of messages written to the channel in one batch
a1.sources.r1.batchSize = 1000
a1.sources.r1.useFlumeEventFormat = false
# whether to add a header recording which topic each event came from
a1.sources.r1.setTopicHeader = true
# header key used when setTopicHeader is true (value = topic name)
a1.sources.r1.topicHeader = doit_topic
# start from the earliest offset when the group has no committed offset
a1.sources.r1.kafka.consumer.auto.offset.reset = earliest
a1.channels.c1.type = memory
a1.channels.c1.capacity = 100000
a1.channels.c1.transactionCapacity = 2000
a1.sinks.k1.channel = c1
a1.sinks.k1.type = logger
启动命令
bin/flume-ng agent -c conf/ -f agentsconf/ka -n a1 -Dflume.root.logger=INFO,console
channel file可靠的channel
a1.sources = r1
a1.channels = c1
a1.sinks = k1
a1.sources.r1.channels = c1
# TAILDIR source: tails all files matching the filegroup pattern
a1.sources.r1.type = TAILDIR
a1.sources.r1.filegroups = g1
a1.sources.r1.filegroups.g1 = /root/logs/event.*
a1.sources.r1.batchSize = 100
# file channel: persists events on disk, so data survives agent restarts
a1.channels.c1.type = file
# directory for checkpoint snapshots of the channel state
a1.channels.c1.checkpointDir = /opt/data/flumedata/file-channel/checkpoint
# directory where the channel stores the event data itself
a1.channels.c1.dataDirs = /opt/data/flumedata/file-channel/data
a1.sinks.k1.channel = c1
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = hdfs://linux01:8020/doitedu05/
a1.sinks.k1.hdfs.filePrefix = DoitEduData
a1.sinks.k1.hdfs.fileSuffix = .log
# roll the current file every 60 seconds
a1.sinks.k1.hdfs.rollInterval = 60
# or when it reaches 256 MB, whichever comes first
a1.sinks.k1.hdfs.rollSize = 268435456
# 0 disables rolling by event count
a1.sinks.k1.hdfs.rollCount = 0
a1.sinks.k1.hdfs.batchSize = 100
# write raw event bodies (plain text) instead of SequenceFile
a1.sinks.k1.hdfs.fileType = DataStream
# use the agent's local time for time-based path escapes
a1.sinks.k1.hdfs.useLocalTimeStamp = true
启动命令和创建文件脚本
for i in {1..1000000}; do echo "${i},pppppppppppppppppppppp" >> /root/logs/event.log; sleep 0.5; done
bin/flume-ng agent -c conf/ -f agentsconf/file.cont -n a1 -Dflume.root.logger=INFO,console
kafka channel:把kafka当目标存储,不需要sink
# 把kafka当目标存储,不需要sink
a1.sources = r1
a1.channels = c1
a1.sources.r1.channels = c1
a1.sources.r1.type = exec
a1.sources.r1.batchSize = 100
a1.sources.r1.command = tail -F /root/logs/event.log
# Kafka channel: events are stored directly in a Kafka topic,
# so no sink is configured — Kafka itself is the destination
a1.channels.c1.type = org.apache.flume.channel.kafka.KafkaChannel
# Kafka broker list
a1.channels.c1.kafka.bootstrap.servers = linux01:9092,linux02:9092,linux03:9092
# topic the channel writes to
a1.channels.c1.kafka.topic = flume-channel
# false = store plain event bodies; must match whatever reads this topic downstream
a1.channels.c1.parseAsFlumeEvent = false
启动命令和制造文件命令
bin/flume-ng agent -c conf/ -f agentsconf/xiaoka -n a1 -Dflume.root.logger=INFO,console
for i in {1..1000000}; do echo "${i},pppppppppppppppppppppp" >> /root/logs/event.log; sleep 0.5; done
把kafka当数据源,不需要source
# 把kafka当数据源,不需要source
a1.channels = c1
a1.sinks = k1
# Kafka channel used as the data source: no source is configured,
# the sink drains events straight out of the Kafka topic
a1.channels.c1.type = org.apache.flume.channel.kafka.KafkaChannel
a1.channels.c1.kafka.bootstrap.servers = linux01:9092,linux02:9092,linux03:9092
# topic to read from
a1.channels.c1.kafka.topic = flume-channel
# false = treat records as plain bodies; must match how the topic was written
a1.channels.c1.parseAsFlumeEvent = false
# only consume messages produced after this consumer group first attaches
a1.channels.c1.kafka.consumer.auto.offset.reset = latest
a1.sinks.k1.type = logger
a1.sinks.k1.channel = c1
kafka sink就是向kafka里写东西(实际用处不大,用上面的kafka channel方式就可以了)
#向kafka里写东西
a1.sources = r1
a1.channels = c1
a1.sinks = k1
a1.sources.r1.channels = c1
a1.sources.r1.type = exec
a1.sources.r1.batchSize = 100
a1.sources.r1.command = tail -F /root/logs/wxlog/event.log
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 200
a1.sinks.k1.channel = c1
# Kafka sink: publishes each event to a Kafka topic
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.kafka.bootstrap.servers = linux01:9092,linux02:9092,linux03:9092
# topic to write to
a1.sinks.k1.kafka.topic = mytopic
hive sink(往hive里写东西)
# hive的表结构
-- Target table for the Flume hive sink below.
-- The hive sink writes via the Hive Streaming API, which requires the table
-- to be transactional, stored as ORC, AND bucketed. The original DDL was
-- missing the "clustered by" clause, so streaming ingestion would be
-- rejected on Hive 1.x/2.x (non-bucketed ACID tables are a Hive 3 feature).
set hive.support.concurrency=true;
set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
drop table if exists mytable;
create table mytable(
id int,
name string,
addr string
)
partitioned by (dt string)
clustered by (id) into 2 buckets
row format delimited fields terminated by ','
stored as orc
location '/mytable/'
tblproperties('transactional'='true')
;
# 数据的样例
# 1,zs,18,beijing
# 2,ls,18,shanghai
# 3,ww,18,hebei
a1.sources = r1
a1.channels = c1
a1.sinks = k1
a1.sources.r1.channels = c1
a1.sources.r1.type = exec
a1.sources.r1.batchSize = 100
a1.sources.r1.command = tail -F /root/log/event.log
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000000
a1.channels.c1.transactionCapacity = 30000
a1.sinks.k1.channel = c1
# hive sink: streams events into a transactional Hive table
a1.sinks.k1.type = hive
# thrift URI of the Hive metastore service
a1.sinks.k1.hive.metastore = thrift://linux01:9083
# target Hive database
a1.sinks.k1.hive.database = default
# target Hive table
a1.sinks.k1.hive.table = mytable
# value for the dt partition; %Y-%m-%d is expanded from the event timestamp
a1.sinks.k1.hive.partition = %Y-%m-%d
# use local time (not a timestamp header) when expanding the escapes above
a1.sinks.k1.useLocalTimeStamp = true
# parse each event body as delimited text
a1.sinks.k1.serializer = DELIMITED
# field delimiter of the incoming data
a1.sinks.k1.serializer.delimiter = ,
# maps input fields to table columns; the empty name between "name" and
# "addr" deliberately skips the third CSV field (age in the sample data),
# since the table has only id, name, addr
a1.sinks.k1.serializer.fieldnames = id,name,,addr