Migrating data with Flume into Kafka and HDFS

Migrate users.csv, events.csv, ... into Kafka and HDFS.

#Create the local source directory and copy the data files into it
mkdir -p /opt/eventsource
#Datasets to migrate:
users
user_friends_raw
user_friends
events
event_attendees_raw
event_attendees
test
train

#Start the ZooKeeper and Kafka services
zkServer.sh start
nohup kafka-server-start.sh /opt/software/kafka_2.12-2.8.0/config/server.properties 1>/dev/null 2>&1 &
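
To confirm both services came up, the Java processes can be checked (ZooKeeper appears in jps output as QuorumPeerMain, the broker as Kafka):

#Both processes should be listed
jps | grep -E 'QuorumPeerMain|Kafka'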

#Create the Kafka topic
kafka-topics.sh --bootstrap-server single01:9092 --create --topic users --partitions 1 --replication-factor 1
#List the Kafka topics (either form works)
kafka-topics.sh --bootstrap-server single01:9092 --list  
kafka-topics.sh --zookeeper single01:2181 --list
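
Each of the remaining datasets needs its own topic as well; a minimal sketch that creates them in one loop (assuming the same single-broker, single-partition layout; adjust the name list to the topics actually needed):

#--if-not-exists makes the loop safe to re-run
for t in users user_friends_raw user_friends events event_attendees_raw event_attendees test train; do
  kafka-topics.sh --bootstrap-server single01:9092 --create --topic "$t" --partitions 1 --replication-factor 1 --if-not-exists
done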

#Create the working directories
mkdir /opt/software/flume-1.9.0/conf/kb16eventconf    #directory for the Flume agent configuration files
mkdir -p /opt/kb16tmp/flumelogfile/users    #spool (source file) directory
mkdir -p /opt/kb16tmp/checkpoint/users    #file-channel checkpoint directory
mkdir -p /opt/kb16tmp/datadir/users    #file-channel data directory
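
Every dataset needs the same three per-dataset directories (spool, checkpoint, data); a sketch that creates them all in one pass, assuming the same naming as above:

for d in users user_friends_raw user_friends events event_attendees_raw event_attendees test train; do
  mkdir -p /opt/kb16tmp/flumelogfile/$d /opt/kb16tmp/checkpoint/$d /opt/kb16tmp/datadir/$d
done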

#Increase the Flume JVM heap (initial and maximum memory)
vim /opt/software/flume-1.9.0/conf/flume-env.sh
#export JAVA_OPTS="-Xms100m -Xmx2000m -Dcom.sun.management.jmxremote"
export JAVA_OPTS="-Xms2048m -Xmx2048m -Dcom.sun.management.jmxremote"
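
Once an agent is running, it is easy to confirm the new heap settings were picked up; a quick check of the running JVM's arguments (assuming the agent shows up in ps output):

#Look for -Xms2048m / -Xmx2048m among the Flume JVM arguments
ps -ef | grep '[f]lume' | tr ' ' '\n' | grep -E '^-Xm[sx]'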

#Create the Flume agent configuration file  //uploads users.csv to Kafka
(base) [root@single01 kb16eventconf]# vim users.conf
users.sources=usersSource
users.channels=usersChannel
users.sinks=usersSink

users.sources.usersSource.type=spooldir
#Directory the source monitors (spooling directory)
users.sources.usersSource.spoolDir=/opt/kb16tmp/flumelogfile/users
#Deserialize the input line by line
users.sources.usersSource.deserializer=LINE
#Maximum length of a single line (in characters)
users.sources.usersSource.deserializer.maxLineLength=320000
#Regex used to select the files to pick up
users.sources.usersSource.includePattern=users_[0-9]{4}-[0-9]{2}-[0-9]{2}.csv
#Interceptor that drops the header line (the line starting with "user_id")
users.sources.usersSource.interceptors=head_filter
users.sources.usersSource.interceptors.head_filter.type=regex_filter
users.sources.usersSource.interceptors.head_filter.regex=^user_id*
users.sources.usersSource.interceptors.head_filter.excludeEvents=true

users.channels.usersChannel.type=file
#Checkpoint directory
users.channels.usersChannel.checkpointDir=/opt/kb16tmp/checkpoint/users
#Data directory
users.channels.usersChannel.dataDirs=/opt/kb16tmp/datadir/users

users.sinks.usersSink.type=org.apache.flume.sink.kafka.KafkaSink
users.sinks.usersSink.batchSize=640
#Kafka broker address
users.sinks.usersSink.brokerList=single01:9092
users.sinks.usersSink.topic=users

users.sources.usersSource.channels=usersChannel
users.sinks.usersSink.channel=usersChannel


#Count the lines in the source file (the header line will be dropped by the interceptor, so the topic should end up with one message fewer)
wc -l users.csv

#Copy the local source file into the spool directory (the target filename must match includePattern)
(base) [root@single01 eventsource]# cp users.csv /opt/kb16tmp/flumelogfile/users/users_2022-04-05.csv
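
The date in the target filename only has to satisfy the includePattern regex; to stamp it with the current date automatically, something like this can be used instead:

cp users.csv /opt/kb16tmp/flumelogfile/users/users_$(date +%F).csv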


#Kafka console consumer (to watch the data arriving in the topic)
(base) [root@single01 ~]# kafka-console-consumer.sh --bootstrap-server single01:9092 --topic users --from-beginning


#Start the Flume agent
(base) [root@single01 flume-1.9.0]# ./bin/flume-ng agent --name users --conf ./conf/ --conf-file ./conf/kb16eventconf/users.conf  -Dflume.root.logger=INFO,console
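
Once the console run looks correct, the agent can be kept running in the background in the same way the Kafka broker was started above; a sketch:

nohup ./bin/flume-ng agent --name users --conf ./conf/ --conf-file ./conf/kb16eventconf/users.conf 1>/dev/null 2>&1 &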


#Check the number of messages in the topic
(base) [root@single01 eventsource]# kafka-run-class.sh kafka.tools.GetOffsetShell --broker-list single01:9092 --topic users --time -1 --offsets 1
users:0:38209
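
Here the topic has a single partition, so the count (38209) can be read straight off; with more partitions the per-partition end offsets would have to be summed, e.g.:

kafka-run-class.sh kafka.tools.GetOffsetShell --broker-list single01:9092 --topic users --time -1 | awk -F: '{sum += $3} END {print sum}'

With the header line filtered out by the interceptor, this total should equal the wc -l count of users.csv minus one.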

................................................................................................................

//train.conf

Upload train.csv to both Kafka and HDFS; the Flume configuration file is below.

train.sources=trainSource
train.channels=fileChannel memoryChannel
train.sinks=kafkaSink hdfsSink

train.sources.trainSource.type=spooldir
train.sources.trainSource.spoolDir=/opt/kb16tmp/flumelogfile/train
train.sources.trainSource.deserializer=LINE
train.sources.trainSource.deserializer.maxLineLength=320000
train.sources.trainSource.includePattern=train_[0-9]{4}-[0-9]{2}-[0-9]{2}.csv
train.sources.trainSource.interceptors=head_filter
train.sources.trainSource.interceptors.head_filter.type=regex_filter
train.sources.trainSource.interceptors.head_filter.regex=^user*
train.sources.trainSource.interceptors.head_filter.excludeEvents=true

train.channels.fileChannel.type=file
train.channels.fileChannel.checkpointDir=/opt/kb16tmp/checkpoint/train
train.channels.fileChannel.dataDirs=/opt/kb16tmp/datadir/train

train.channels.memoryChannel.type=memory
train.channels.memoryChannel.capacity=64000
train.channels.memoryChannel.transactionCapacity=16000

train.sinks.kafkaSink.type=org.apache.flume.sink.kafka.KafkaSink
train.sinks.kafkaSink.batchSize=640
train.sinks.kafkaSink.brokerList=192.168.43.200:9092
train.sinks.kafkaSink.topic=train

train.sinks.hdfsSink.type=hdfs
#Write plain text rather than SequenceFiles
train.sinks.hdfsSink.hdfs.fileType=DataStream
train.sinks.hdfsSink.hdfs.filePrefix=train
train.sinks.hdfsSink.hdfs.fileSuffix=.csv
#%Y-%m-%d needs a timestamp; useLocalTimeStamp supplies the local time
train.sinks.hdfsSink.hdfs.path=hdfs://192.168.43.200:9000/kb16file/train/%Y-%m-%d
train.sinks.hdfsSink.hdfs.useLocalTimeStamp=true
train.sinks.hdfsSink.hdfs.batchSize=640
#Roll files by size (~64 MB) or every 30 s, never by event count
train.sinks.hdfsSink.hdfs.rollCount=0
train.sinks.hdfsSink.hdfs.rollSize=64000000
train.sinks.hdfsSink.hdfs.rollInterval=30
train.sinks.hdfsSink.hdfs.minBlockReplicas=1

#The default replicating selector copies every event to both channels
train.sources.trainSource.channels=fileChannel memoryChannel
#The Kafka sink drains the file channel, the HDFS sink drains the memory channel
train.sinks.kafkaSink.channel=fileChannel
train.sinks.hdfsSink.channel=memoryChannel
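
The run sequence mirrors the users flow; a sketch of the remaining steps, assuming the paths and topic name above (the HDFS output directory is date-stamped by the %Y-%m-%d escape; run flume-ng from the flume-1.9.0 directory as before):

#Create the topic, stage the file, start the agent
kafka-topics.sh --bootstrap-server single01:9092 --create --topic train --partitions 1 --replication-factor 1 --if-not-exists
cp train.csv /opt/kb16tmp/flumelogfile/train/train_$(date +%F).csv
./bin/flume-ng agent --name train --conf ./conf/ --conf-file ./conf/kb16eventconf/train.conf -Dflume.root.logger=INFO,console

#Verify the Kafka side and the HDFS side
kafka-console-consumer.sh --bootstrap-server single01:9092 --topic train --from-beginning
hdfs dfs -ls /kb16file/train/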
