Exercise: Data Migration and Data Cleansing

Prerequisites

Start ZooKeeper and Kafka:

zkServer.sh start
[root@cp145 data]# nohup kafka-server-start.sh /opt/soft/kafka212/config/server.properties &
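A quick sanity check that both services are up: jps should list the ZooKeeper and Kafka JVM processes.

[root@cp145 data]# jps
# expect QuorumPeerMain (ZooKeeper) and Kafka among the listed processes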

Example

users

First, create all the Kafka topics used in this exercise:

[root@cp145 ~]# kafka-topics.sh --create --zookeeper 192.168.153.145:2181 --topic users --partitions 1 --replication-factor 1
[root@cp145 ~]# kafka-topics.sh --create --zookeeper 192.168.153.145:2181 --topic user_friends --partitions 1 --replication-factor 1
[root@cp145 ~]# kafka-topics.sh --create --zookeeper 192.168.153.145:2181 --topic user_friends_raw --partitions 1 --replication-factor 1
[root@cp145 ~]# kafka-topics.sh --create --zookeeper 192.168.153.145:2181 --topic events --partitions 1 --replication-factor 1
[root@cp145 ~]# kafka-topics.sh --create --zookeeper 192.168.153.145:2181 --topic event_attendees --partitions 1 --replication-factor 1
[root@cp145 ~]# kafka-topics.sh --create --zookeeper 192.168.153.145:2181 --topic event_attendees_raw --partitions 1 --replication-factor 1
[root@cp145 ~]# kafka-topics.sh --create --zookeeper 192.168.153.145:2181 --topic train --partitions 1 --replication-factor 1
[root@cp145 ~]# kafka-topics.sh --create --zookeeper 192.168.153.145:2181 --topic test --partitions 1 --replication-factor 1
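To confirm that all eight topics were created:

[root@cp145 ~]# kafka-topics.sh --list --zookeeper 192.168.153.145:2181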

Create a new users.conf in /opt/soft/flume190/conf/events:

users.sources=usersSource
users.channels=usersChannel
users.sinks=userSink

users.sources.usersSource.type=spooldir
users.sources.usersSource.spoolDir=/opt/flumelogfile/users
users.sources.usersSource.deserializer=LINE
users.sources.usersSource.deserializer.maxLineLength=320000
users.sources.usersSource.includePattern=users_[0-9]{4}-[0-9]{2}-[0-9]{2}.csv
## Interceptor: drop the CSV header row
users.sources.usersSource.interceptors=head_filter
users.sources.usersSource.interceptors.head_filter.type=regex_filter
users.sources.usersSource.interceptors.head_filter.regex=^user_id*
users.sources.usersSource.interceptors.head_filter.excludeEvents=true

users.channels.usersChannel.type=file
users.channels.usersChannel.checkpointDir=/opt/flumelogfile/checkpoint/users
users.channels.usersChannel.dataDirs=/opt/flumelogfile/data/users

users.sinks.userSink.type=org.apache.flume.sink.kafka.KafkaSink
users.sinks.userSink.batchSize=640
users.sinks.userSink.brokerList=192.168.153.145:9092
users.sinks.userSink.topic=users

users.sources.usersSource.channels=usersChannel
users.sinks.userSink.channel=usersChannel

Create three directories (Flume seems to create them automatically if they are missing, but creating them up front is safer):

/opt/flumelogfile/users
/opt/flumelogfile/checkpoint/users
/opt/flumelogfile/data/users
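All three can be created in one command, using the paths from the config above:

[root@cp145 ~]# mkdir -p /opt/flumelogfile/users /opt/flumelogfile/checkpoint/users /opt/flumelogfile/data/users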

Start Flume:

[root@cp145 flume190]# ./bin/flume-ng agent --name users --conf ./conf/ --conf-file ./conf/events/users.conf -Dflume.root.logger=INFO,console


Start a console consumer:

[root@cp145 data]# kafka-console-consumer.sh --bootstrap-server 192.168.153.145:9092 --topic users

Copy the data file into the spooling directory, renaming it to match includePattern; this triggers ingestion:

[root@cp145 eventdata]# cp ./users.csv /opt/flumelogfile/users/users_2023-04-01.csv


userfriends

Create userfriends.conf in the same directory:

userfriends.sources=userfriendsSource
userfriends.channels=userfriendsChannel
userfriends.sinks=userfriendsSink

userfriends.sources.userfriendsSource.type=spooldir
userfriends.sources.userfriendsSource.spoolDir=/opt/flumelogfile/userfriends
userfriends.sources.userfriendsSource.deserializer=LINE
userfriends.sources.userfriendsSource.deserializer.maxLineLength=320000
userfriends.sources.userfriendsSource.includePattern=uf_[0-9]{4}-[0-9]{2}-[0-9]{2}.csv
## Interceptor: drop the CSV header row
userfriends.sources.userfriendsSource.interceptors=head_filter
userfriends.sources.userfriendsSource.interceptors.head_filter.type=regex_filter
userfriends.sources.userfriendsSource.interceptors.head_filter.regex=^user*
userfriends.sources.userfriendsSource.interceptors.head_filter.excludeEvents=true

userfriends.channels.userfriendsChannel.type=file
userfriends.channels.userfriendsChannel.checkpointDir=/opt/flumelogfile/checkpoint/userfriends
userfriends.channels.userfriendsChannel.dataDirs=/opt/flumelogfile/data/userfriends

userfriends.sinks.userfriendsSink.type=org.apache.flume.sink.kafka.KafkaSink
userfriends.sinks.userfriendsSink.batchSize=640
userfriends.sinks.userfriendsSink.brokerList=192.168.153.145:9092
userfriends.sinks.userfriendsSink.topic=user_friends_raw

userfriends.sources.userfriendsSource.channels=userfriendsChannel
userfriends.sinks.userfriendsSink.channel=userfriendsChannel

Create three directories:

/opt/flumelogfile/userfriends
/opt/flumelogfile/checkpoint/userfriends
/opt/flumelogfile/data/userfriends

Start Flume:

[root@cp145 flume190]# ./bin/flume-ng agent --name userfriends --conf ./conf/ --conf-file ./conf/events/userfriends.conf -Dflume.root.logger=INFO,console

Start a console consumer:

[root@cp145 flumelogfile]# kafka-console-consumer.sh --bootstrap-server 192.168.153.145:9092 --topic user_friends_raw

Copy the file into the spooling directory to trigger ingestion:

cp /opt/eventdata/user_friends.csv /opt/flumelogfile/userfriends/uf_2023-04-01.csv
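The user_friends topic created at the start is the target for the cleaned data. As a minimal sketch of that cleansing step, assuming each raw record has the user_friends.csv layout (user,friends, with the friend ids space-separated), the console tools plus awk can flatten the raw topic into one user,friend pair per record:

# sketch only: assumes raw lines look like "user,friend1 friend2 friend3"
[root@cp145 ~]# kafka-console-consumer.sh --bootstrap-server 192.168.153.145:9092 --topic user_friends_raw --from-beginning \
  | awk -F',' '{ n = split($2, f, " "); for (i = 1; i <= n; i++) print $1 "," f[i] }' \
  | kafka-console-producer.sh --broker-list 192.168.153.145:9092 --topic user_friends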


train

Create train.conf:

train.sources=trainSource
train.channels=trainChannel
train.sinks=trainSink

train.sources.trainSource.type=spooldir
train.sources.trainSource.spoolDir=/opt/flumelogfile/train
train.sources.trainSource.deserializer=LINE
train.sources.trainSource.deserializer.maxLineLength=320000
train.sources.trainSource.includePattern=train_[0-9]{4}-[0-9]{2}-[0-9]{2}.csv
## Interceptor: drop the CSV header row
train.sources.trainSource.interceptors=head_filter
train.sources.trainSource.interceptors.head_filter.type=regex_filter
train.sources.trainSource.interceptors.head_filter.regex=^user*
train.sources.trainSource.interceptors.head_filter.excludeEvents=true

train.channels.trainChannel.type=file
train.channels.trainChannel.checkpointDir=/opt/flumelogfile/checkpoint/train
train.channels.trainChannel.dataDirs=/opt/flumelogfile/data/train

train.sinks.trainSink.type=org.apache.flume.sink.kafka.KafkaSink
train.sinks.trainSink.batchSize=640
train.sinks.trainSink.brokerList=192.168.153.145:9092
train.sinks.trainSink.topic=train

train.sources.trainSource.channels=trainChannel
train.sinks.trainSink.channel=trainChannel

Create the spooling, checkpoint, and data directories from the config above.

Start a console consumer:

[root@cp145 flumelogfile]# kafka-console-consumer.sh --bootstrap-server 192.168.153.145:9092 --topic train

Start Flume:

[root@cp145 flume190]# ./bin/flume-ng agent --name train --conf ./conf/ --conf-file ./conf/events/train.conf -Dflume.root.logger=INFO,console

Copy the file in to trigger ingestion:
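Following the pattern of the earlier sections (the source path here is an assumption):

[root@cp145 eventdata]# cp ./train.csv /opt/flumelogfile/train/train_2023-04-01.csv   # assumed source path, mirroring the earlier cp commands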


events

Create events.conf:

events.sources=eventsSource
events.channels=eventsChannel
events.sinks=eventsSink

events.sources.eventsSource.type=spooldir
events.sources.eventsSource.spoolDir=/opt/flumelogfile/events
events.sources.eventsSource.deserializer=LINE
events.sources.eventsSource.deserializer.maxLineLength=320000
events.sources.eventsSource.includePattern=events_[0-9]{4}-[0-9]{2}-[0-9]{2}.csv
## Interceptor: drop the CSV header row
events.sources.eventsSource.interceptors=head_filter
events.sources.eventsSource.interceptors.head_filter.type=regex_filter
events.sources.eventsSource.interceptors.head_filter.regex=^event_id*
events.sources.eventsSource.interceptors.head_filter.excludeEvents=true

events.channels.eventsChannel.type=file
events.channels.eventsChannel.checkpointDir=/opt/flumelogfile/checkpoint/events
events.channels.eventsChannel.dataDirs=/opt/flumelogfile/data/events

events.sinks.eventsSink.type=org.apache.flume.sink.kafka.KafkaSink
events.sinks.eventsSink.batchSize=640
events.sinks.eventsSink.brokerList=192.168.153.145:9092
events.sinks.eventsSink.topic=events

events.sources.eventsSource.channels=eventsChannel
events.sinks.eventsSink.channel=eventsChannel

Create the three directories (spooling, checkpoint, and data) as before.

Start a console consumer:

[root@cp145 flumelogfile]# kafka-console-consumer.sh --bootstrap-server 192.168.153.145:9092 --topic events

Start Flume:

[root@cp145 flume190]# ./bin/flume-ng agent --name events --conf ./conf/ --conf-file ./conf/events/events.conf -Dflume.root.logger=INFO,console 
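Then copy the file into the spooling directory (the source path here is an assumption, mirroring the earlier sections):

[root@cp145 eventdata]# cp ./events.csv /opt/flumelogfile/events/events_2023-04-01.csv   # assumed path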


event_attendees_raw

Create ea.conf:

ea.sources=eaSource
ea.channels=eaChannel
ea.sinks=eaSink

ea.sources.eaSource.type=spooldir
ea.sources.eaSource.spoolDir=/opt/flumelogfile/ea
ea.sources.eaSource.deserializer=LINE
ea.sources.eaSource.deserializer.maxLineLength=320000
ea.sources.eaSource.includePattern=ea_[0-9]{4}-[0-9]{2}-[0-9]{2}.csv
## Interceptor: drop the CSV header row
ea.sources.eaSource.interceptors=head_filter
ea.sources.eaSource.interceptors.head_filter.type=regex_filter
ea.sources.eaSource.interceptors.head_filter.regex=^event*
ea.sources.eaSource.interceptors.head_filter.excludeEvents=true

ea.channels.eaChannel.type=file
ea.channels.eaChannel.checkpointDir=/opt/flumelogfile/checkpoint/ea
ea.channels.eaChannel.dataDirs=/opt/flumelogfile/data/ea

ea.sinks.eaSink.type=org.apache.flume.sink.kafka.KafkaSink
ea.sinks.eaSink.batchSize=640
ea.sinks.eaSink.brokerList=192.168.153.145:9092
ea.sinks.eaSink.topic=event_attendees_raw

ea.sources.eaSource.channels=eaChannel
ea.sinks.eaSink.channel=eaChannel

Start a console consumer (after creating the ea spooling, checkpoint, and data directories, as in the earlier sections):

[root@cp145 flumelogfile]# kafka-console-consumer.sh --bootstrap-server 192.168.153.145:9092 --topic event_attendees_raw

Start Flume:

[root@cp145 flume190]# ./bin/flume-ng agent --name ea --conf ./conf/ --conf-file ./conf/events/ea.conf -Dflume.root.logger=INFO,console
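Copy the file into the spooling directory (the source path here is an assumption, mirroring the earlier sections):

[root@cp145 eventdata]# cp ./event_attendees.csv /opt/flumelogfile/ea/ea_2023-04-01.csv   # assumed path

The event_attendees topic created at the start is the target for the cleaned data. As a sketch of that step, assuming the event_attendees.csv layout (event,yes,maybe,invited,no, where each column holds space-separated user ids), the raw topic can be flattened into event,user,status records:

# sketch only: the column layout is an assumption
[root@cp145 ~]# kafka-console-consumer.sh --bootstrap-server 192.168.153.145:9092 --topic event_attendees_raw --from-beginning \
  | awk -F',' '{
      split("yes maybe invited no", s, " ");   # status label for columns 2-5
      for (c = 2; c <= 5; c++) {
        n = split($c, u, " ");
        for (i = 1; i <= n; i++) print $1 "," u[i] "," s[c-1];
      }
    }' \
  | kafka-console-producer.sh --broker-list 192.168.153.145:9092 --topic event_attendees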
