2020.12.08 Class Notes (Reading Data from Flume into Kafka)

Create the topics in Kafka:

Example:

kafka-topics.sh --create --zookeeper 192.168.237.100:2181 --topic kb09demo2 --partitions 5 --replication-factor 1
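
Whether the topic came out as intended can be checked with --describe (partition count and replication factor should match what was requested):

kafka-topics.sh --describe --zookeeper 192.168.237.100:2181 --topic kb09demo2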

The following topics need to be created:
users
user_friends_raw
user_friends
events
event_attendees_raw
event_attendees
train
test

kafka-topics.sh --delete --topic users --zookeeper 192.168.237.100:2181
kafka-topics.sh --delete --topic user_friends_raw --zookeeper 192.168.237.100:2181
kafka-topics.sh --delete --topic user_friends --zookeeper 192.168.237.100:2181
kafka-topics.sh --delete --topic events --zookeeper 192.168.237.100:2181
kafka-topics.sh --delete --topic event_attendees_raw --zookeeper 192.168.237.100:2181
kafka-topics.sh --delete --topic event_attendees --zookeeper 192.168.237.100:2181
kafka-topics.sh --delete --topic train --zookeeper 192.168.237.100:2181
kafka-topics.sh --delete --topic test --zookeeper 192.168.237.100:2181

kafka-topics.sh --create --zookeeper 192.168.237.100:2181 --topic users --partitions 1 --replication-factor 1
kafka-topics.sh --create --zookeeper 192.168.237.100:2181 --topic user_friends_raw --partitions 1 --replication-factor 1
kafka-topics.sh --create --zookeeper 192.168.237.100:2181 --topic user_friends --partitions 1 --replication-factor 1
kafka-topics.sh --create --zookeeper 192.168.237.100:2181 --topic events --partitions 1 --replication-factor 1
kafka-topics.sh --create --zookeeper 192.168.237.100:2181 --topic event_attendees_raw --partitions 1 --replication-factor 1
kafka-topics.sh --create --zookeeper 192.168.237.100:2181 --topic event_attendees --partitions 1 --replication-factor 1
kafka-topics.sh --create --zookeeper 192.168.237.100:2181 --topic train --partitions 1 --replication-factor 1
kafka-topics.sh --create --zookeeper 192.168.237.100:2181 --topic test --partitions 1 --replication-factor 1
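
The eight create commands can also be collapsed into a shell loop; a sketch, assuming the same ZooKeeper address and the same partition/replication settings:

for t in users user_friends_raw user_friends events event_attendees_raw event_attendees train test; do
  kafka-topics.sh --create --zookeeper 192.168.237.100:2181 --topic $t --partitions 1 --replication-factor 1
done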

Write the Flume agent configuration file:

# Name the three components: source, channel, sink
userFriends.sources=userFriendsSource
userFriends.channels=userFriendsChannel
userFriends.sinks=userFriendsSink
# Configure the source
userFriends.sources.userFriendsSource.type=spooldir
userFriends.sources.userFriendsSource.spoolDir=/opt/flume160/conf/jobkb09/dataSourceFile/userFriends
userFriends.sources.userFriendsSource.includePattern=userFriends_[0-9]{4}-[0-9]{2}-[0-9]{2}.csv
userFriends.sources.userFriendsSource.deserializer=LINE
userFriends.sources.userFriendsSource.deserializer.maxLineLength=10000
userFriends.sources.userFriendsSource.interceptors=head_filter
userFriends.sources.userFriendsSource.interceptors.head_filter.type=regex_filter
userFriends.sources.userFriendsSource.interceptors.head_filter.regex=^user*
userFriends.sources.userFriendsSource.interceptors.head_filter.excludeEvents=true
# Configure the channel
userFriends.channels.userFriendsChannel.type=file
userFriends.channels.userFriendsChannel.checkpointDir=/opt/flume160/conf/jobkb09/checkPointFile/userFriends
userFriends.channels.userFriendsChannel.dataDirs=/opt/flume160/conf/jobkb09/dataChannelFile/userFriends
# Configure the sink
userFriends.sinks.userFriendsSink.type=org.apache.flume.sink.kafka.KafkaSink
userFriends.sinks.userFriendsSink.batchSize=640
userFriends.sinks.userFriendsSink.brokerList=192.168.237.100:9092
userFriends.sinks.userFriendsSink.topic=user_friends_raw
# Bind the source and sink to the channel
userFriends.sources.userFriendsSource.channels=userFriendsChannel
userFriends.sinks.userFriendsSink.channel=userFriendsChannel
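
Before starting the agent, the spool, checkpoint and data directories referenced above must exist, and the source CSV has to be placed in the spool directory under a name that matches includePattern. A sketch mirroring the steps used for the event_attendees example further down (the source path of user_friends.csv is an assumption):

mkdir -p /opt/flume160/conf/jobkb09/dataSourceFile/userFriends
mkdir -p /opt/flume160/conf/jobkb09/checkPointFile/userFriends
mkdir -p /opt/flume160/conf/jobkb09/dataChannelFile/userFriends
# source location of user_friends.csv is assumed; the target name must match includePattern
cp /opt/flume160/conf/jobkb09/tmp/user_friends.csv /opt/flume160/conf/jobkb09/dataSourceFile/userFriends/userFriends_2020-12-08.csv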

Run flume-ng to import the data into Kafka:

[root@hadoop100 flume160]# ./bin/flume-ng agent --name userFriends --conf ./conf/ --conf-file ./conf/jobkb09/tmp/userFriend-flume-Kafka.conf -Dflume.root.logger=INFO,console
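
When the spooling directory source has finished reading a file, Flume renames it with a .COMPLETED suffix, so a quick listing of the spool directory shows whether the import is done:

ls /opt/flume160/conf/jobkb09/dataSourceFile/userFriends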

Process the Flume-imported data with Kafka Streams

The data imported by Flume lands in the topic user_friends_raw.
Write a Kafka Streams program that processes it and writes the result to user_friends.

First, look at the data in user_friends_raw:

[root@hadoop100 flume160]# kafka-console-consumer.sh --topic user_friends_raw --bootstrap-server  hadoop100:9092 --from-beginning

One record looks like this:

54820382,3887786716 976480913 1549373723 1266340021 2737922063 3328501661 2095013200 2218536804 2826821951 1090749315 1608429064 657947649 2299229236 2277081711 2551833867 2753318997 3255598992 2317748319 3311999073 1659964719 344171892 4257555652 2797450456 45318777 1041683181 1599555618 114577094 886302587 1538472134 3160042675 2563532717 3098505050 1786701735 2440963474 2382320222 3562047852 1689952259 3880284955 4211207471 959390435 961124340 3725363088 97398613 3333292364 2492255667 3846936613 1715899829 1383012353 1309948019 3782295401 811727857 850419243 140602867 2019669204 3716414836 1123599659 2921815 2610662061

The desired output format is:

54820382,3887786716
54820382,976480913
54820382,1549373723
54820382,1266340021
54820382,2737922063
54820382,3328501661
54820382,2095013200
54820382,2218536804
54820382,2826821951
54820382,1090749315
54820382,1608429064
54820382,657947649
------ remainder omitted ------
package nj.zb.kb09.kafka;

import org.apache.kafka.common.serialization.Serdes;
import org.apache.kafka.streams.*;


import java.util.ArrayList;

import java.util.Properties;
import java.util.concurrent.CountDownLatch;

/**
 * @Author: ChaoKeAiMuZhi
 * @Date: 2020/12/16 16:24
 * @Description:
 **/
public class UserFriendStream {
    public static void main(String[] args) {
        Properties prop = new Properties();
        prop.put(StreamsConfig.APPLICATION_ID_CONFIG, "userfriend");
        prop.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.237.100:9092");
//        prop.put(StreamsConfig.COMMIT_INTERVAL_MS_CONFIG,3000);
//        prop.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG,"earliest");
//        prop.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG,"false");
        prop.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass());
        prop.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass());

        StreamsBuilder builder = new StreamsBuilder();
        
        // Topology: each input line is "userId,friend1 friend2 ...";
        // emit one "userId,friendId" record per friend.
        builder.stream("user_friends_raw").flatMap((k, v) -> {
            ArrayList<KeyValue<String, String>> list = new ArrayList<>();
            String[] info = v.toString().split(",");
            if (info.length == 2) {                        // skip rows without a friends column
                String[] friends = info[1].split("\\s+");
                if (friends[0].trim().length() > 0) {      // skip rows whose friends column is empty
                    for (String friend : friends) {
                        System.out.println(info[0] + "  " + friend);
                        list.add(new KeyValue<String, String>(null, info[0] + "," + friend));
                    }
                }
            }
            return list;
        }).to("user_friends");


        final Topology topo = builder.build();
        final KafkaStreams streams = new KafkaStreams(topo, prop);

        // Shut the streams application down cleanly on Ctrl+C and release the main thread
        final CountDownLatch latch = new CountDownLatch(1);
        Runtime.getRuntime().addShutdownHook(new Thread("stream") {
            @Override
            public void run() {
                streams.close();
                latch.countDown();
            }
        });

        streams.start();
        try {
            latch.await();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
        System.exit(0);
    }
}
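
Once the stream job is running, the transformed records can be checked with a console consumer on the output topic, using the same pattern as for the raw topic:

kafka-console-consumer.sh --topic user_friends --bootstrap-server hadoop100:9092 --from-beginning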

Example 2: event_attendees

[root@hadoop100 ~]# kafka-topics.sh --delete --zookeeper 192.168.237.100:2181 --topic event_attendees
[root@hadoop100 ~]# kafka-topics.sh --delete --zookeeper 192.168.237.100:2181 --topic event_attendees_raw
[root@hadoop100 ~]# kafka-topics.sh --list --zookeeper 192.168.237.100:2181
[root@hadoop100 flume160]# kafka-topics.sh --create  --zookeeper 192.168.237.100:2181 --topic event_attendees_raw --partitions 1 --replication-factor 1

Create the Flume agent configuration file:

eventattend.sources=eventAttendSource
eventattend.channels=eventAttendChannel
eventattend.sinks=eventAttendSink

eventattend.sources.eventAttendSource.type=spooldir
eventattend.sources.eventAttendSource.spoolDir=/opt/flume160/conf/jobkb09/dataSourceFile/eventattend
eventattend.sources.eventAttendSource.includePattern=eventattend_[0-9]{4}-[0-9]{2}-[0-9]{2}.csv
eventattend.sources.eventAttendSource.deserializer=LINE
eventattend.sources.eventAttendSource.deserializer.maxLineLength=20000
eventattend.sources.eventAttendSource.interceptors=head_filter
eventattend.sources.eventAttendSource.interceptors.head_filter.type=regex_filter
eventattend.sources.eventAttendSource.interceptors.head_filter.regex=^event
eventattend.sources.eventAttendSource.interceptors.head_filter.excludeEvents=true

eventattend.channels.eventAttendChannel.type=file
eventattend.channels.eventAttendChannel.checkpointDir=/opt/flume160/conf/jobkb09/checkPointFile/eventattend
eventattend.channels.eventAttendChannel.dataDirs=/opt/flume160/conf/jobkb09/dataChannelFile/eventattend

eventattend.sinks.eventAttendSink.type=org.apache.flume.sink.kafka.KafkaSink
eventattend.sinks.eventAttendSink.batchSize=640
eventattend.sinks.eventAttendSink.brokerList=192.168.237.100:9092
eventattend.sinks.eventAttendSink.topic=event_attendees_raw


eventattend.sources.eventAttendSource.channels=eventAttendChannel
eventattend.sinks.eventAttendSink.channel=eventAttendChannel

Create the working directories first, then copy the source CSV into the spool directory and start the agent:

mkdir -p /opt/flume160/conf/jobkb09/dataSourceFile/eventattend
mkdir -p /opt/flume160/conf/jobkb09/checkPointFile/eventattend
mkdir -p /opt/flume160/conf/jobkb09/dataChannelFile/eventattend

[root@hadoop100 jobkb09]# cp /opt/flume160/conf/jobkb09/tmp/event_attendees.csv /opt/flume160/conf/jobkb09/dataSourceFile/eventattend/eventattend_2020-11-18.csv

[root@hadoop100 flume160]# ./bin/flume-ng agent -n eventattend -c ./conf/ -f ./conf/jobkb09/eventsattend-flume-kafka.conf -Dflume.root.logger=INFO,console

[root@hadoop100 tmp]# kafka-console-consumer.sh --bootstrap-server 127.0.0.1:9092 --topic event_attendees_raw --from-beginning

[root@hadoop100 tmp]# wc -l event_attendees.csv
24145 event_attendees.csv
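
To confirm the import, the end offset of event_attendees_raw can be compared with the file's line count (24145, minus the header line dropped by the interceptor); a sketch using the GetOffsetShell tool shipped with Kafka:

kafka-run-class.sh kafka.tools.GetOffsetShell --broker-list 192.168.237.100:9092 --topic event_attendees_raw --time -1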

Process the data with Kafka Streams:

package nj.zb.kb09.kafka;

import org.apache.kafka.common.serialization.Serdes;
import org.apache.kafka.streams.*;
import org.apache.kafka.streams.kstream.KStream;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Properties;
import java.util.concurrent.CountDownLatch;

/**
 * @Author: ChaoKeAiMuZhi
 * @Date: 2020/12/18 14:56
 * @Description:
 **/
public class EventAttendStream2 {
    public static void main(String[] args) {
        Properties prop = new Properties();
        prop.put(StreamsConfig.APPLICATION_ID_CONFIG,"eventattendstream2");
        prop.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG,"192.168.237.100:9092");
        prop.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass());
        prop.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG,Serdes.String().getClass());

        // Create the stream builder
        StreamsBuilder builder = new StreamsBuilder();

        KStream<Object, Object> ear = builder.stream("event_attendees_raw");

        KStream<String, String> eventKStream = ear.flatMap((k, v) -> {    // event,yes,maybe,invite,no
            System.out.println(v.toString());
            String[] split = v.toString().split(",");
            ArrayList<KeyValue<String, String>> list = new ArrayList<>();
            if (split.length >= 2 && split[1].trim().length() > 0) {
                System.out.println(Arrays.toString(split));   // debug: print the parsed columns
                String[] yes = split[1].split("\\s+");
                for (String y : yes) {
                    list.add(new KeyValue<String, String>(null, split[0] + "," + y + ",yes"));
                }
            }
            if (split.length >= 3 && split[2].trim().length() > 0) {
                String[] maybe = split[2].split("\\s+");
                for (String m : maybe) {
                    list.add(new KeyValue<String, String>(null, split[0] + "," + m + ",maybe"));
                }
            }
            if (split.length >= 4 && split[3].trim().length() > 0) {
                String[] invite = split[3].split("\\s+");
                for (String i : invite) {
                    list.add(new KeyValue<String, String>(null, split[0] + "," + i + ",invite"));
                }
            }
            if (split.length >= 5 && split[4].trim().length() > 0) {
                String[] no = split[4].split("\\s+");
                for (String n : no) {
                    list.add(new KeyValue<String, String>(null, split[0] + "," + n + ",no"));
                }
            }
            return list;
        });

        eventKStream.to("event_attendees");


        final Topology topo = builder.build();
        final KafkaStreams streams = new KafkaStreams(topo, prop);

        // Shut the streams application down cleanly on Ctrl+C and release the main thread
        final CountDownLatch latch = new CountDownLatch(1);
        Runtime.getRuntime().addShutdownHook(new Thread("stream") {
            @Override
            public void run() {
                streams.close();
                latch.countDown();
            }
        });

        streams.start();
        try {
            latch.await();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }

        System.exit(0);

    }
}

Consume the processed data with a Kafka console consumer:

[root@hadoop100 eventattend]# kafka-topics.sh --create  --zookeeper 192.168.237.100:2181 --topic event_attendees --partitions 1 --replication-factor 1
[root@hadoop100 eventattend]# kafka-console-consumer.sh --bootstrap-server 192.168.237.100:9092 --topic event_attendees  --from-beginning