环境
CentOS 7.0
hadoop 2.7.3(参见《CentOS 7.0+hadoop 2.7搭建集群》)
zookeeper 3.4.11(参见《CentOS 7.0安装zookeeper3.4.11》)
flume 1.8.0(参见《CentOS 7.0安装flume》)
kafka 1.0.0(参见《CentOS 7.0安装kafka》)
下载安装
见前几篇文章
配置
修改配置文件
cp flume-conf.properties flume-conf3.properties
vi flume-conf3.properties
a1.sources=r1
a1.channels=c1 c2
a1.sinks=k1 k2
# For each one of the sources, the type is defined
# agent.sources.seqGenSrc.type = seq
a1.sources.r1.type=exec
a1.sources.r1.channels=c1 c2
a1.sources.r1.command=tail -f /tmp/flume_test.log
# channel采用复制模式,保证不同channel的数据相同
a1.sources.r1.selector.type=replicating
# c1
a1.channels.c1.type=memory
a1.channels.c1.capacity=1000
a1.channels.c1.transactionCapacity=100
# c2
a1.channels.c2.type=file
a1.channels.c2.checkpointDir=/opt/flume/apache-flume-1.8.0/checkpoint
a1.channels.c2.dataDirs=/opt/flume/apache-flume-1.8.0/data
# k1
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.topic = bitopic
a1.sinks.k1.brokerList = 192.168.122.128:9092,192.168.122.129:9092,192.168.122.130:9092
a1.sinks.k1.requiredAcks = 1
a1.sinks.k1.batchSize = 20
a1.sinks.k1.channel = c1
# k2
a1.sinks.k2.type=hdfs
a1.sinks.k2.hdfs.path=hdfs://master:9000/test/flume/output
a1.sinks.k2.hdfs.fileType = DataStream
a1.sinks.k2.hdfs.writeFormat=Text
a1.sinks.k2.hdfs.maxOpenFiles = 1
a1.sinks.k2.hdfs.rollCount = 0
a1.sinks.k2.hdfs.rollInterval = 0
a1.sinks.k2.hdfs.rollSize = 1000
a1.sinks.k2.hdfs.batchSize = 100
a1.sinks.k2.channel=c2
新建配置文件所需目录
cd /opt/flume/apache-flume-1.8.0/
mkdir data
mkdir checkpoint
启动
启动kafka
cd /opt/kafka/kafka_2.11-1.0.0/bin
./kafka-server-start.sh -daemon ../config/server.properties
测试
创建topic
./kafka-topics.sh --create --zookeeper 192.168.122.128:2181 --replication-factor 1 --partitions 1 --topic bitopic
启动flume(在flume安装目录 /opt/flume/apache-flume-1.8.0/ 下执行)
./bin/flume-ng agent --conf ./conf --conf-file ./conf/flume-conf3.properties --name a1 -Dflume.root.logger=INFO,console
创建生产者
cd /tmp
ping master >> flume_test.log
创建消费者(在kafka的bin目录 /opt/kafka/kafka_2.11-1.0.0/bin 下执行)
./kafka-console-consumer.sh --bootstrap-server 192.168.122.129:9092 --topic bitopic --from-beginning
./kafka-console-consumer.sh --bootstrap-server 192.168.122.130:9092 --topic bitopic --from-beginning
结果
消费者能够查看到生产者生产的数据,同时hdfs上也生成了相应的文件。
...
...
64 bytes from master (192.168.122.128): icmp_seq=20 ttl=64 time=0.036 ms
64 bytes from master (192.168.122.128): icmp_seq=21 ttl=64 time=0.028 ms
64 bytes from master (192.168.122.128): icmp_seq=22 ttl=64 time=0.040 ms
64 bytes from master (192.168.122.128): icmp_seq=23 ttl=64 time=0.081 ms
64 bytes from master (192.168.122.128): icmp_seq=24 ttl=64 time=0.073 ms
64 bytes from master (192.168.122.128): icmp_seq=25 ttl=64 time=0.032 ms
--- master ping statistics ---
25 packets transmitted, 25 received, 0% packet loss, time 24017ms
rtt min/avg/max/mdev = 0.028/0.038/0.081/0.016 ms
PING master (192.168.122.128) 56(84) bytes of data.
64 bytes from master (192.168.122.128): icmp_seq=1 ttl=64 time=0.020 ms
64 bytes from master (192.168.122.128): icmp_seq=2 ttl=64 time=0.028 ms
64 bytes from master (192.168.122.128): icmp_seq=3 ttl=64 time=0.048 ms
64 bytes from master (192.168.122.128): icmp_seq=4 ttl=64 time=0.029 ms
...
...
hadoop fs -ls /test/flume/output
# 输出
Found 10 items
-rw-r--r-- 2 root supergroup 1065 2017-12-07 17:24 /test/flume/output/FlumeData.1512638646782
-rw-r--r-- 2 root supergroup 951 2017-12-07 17:24 /test/flume/output/FlumeData.1512638646783
-rw-r--r-- 2 root supergroup 586 2017-12-08 09:41 /test/flume/output/FlumeData.1512697094586
-rw-r--r-- 2 root supergroup 1070 2017-12-08 09:43 /test/flume/output/FlumeData.1512697302493
-rw-r--r-- 2 root supergroup 1019 2017-12-08 09:44 /test/flume/output/FlumeData.1512697302494
-rw-r--r-- 2 root supergroup 878 2017-12-08 09:45 /test/flume/output/FlumeData.1512697302495
-rw-r--r-- 2 root supergroup 586 2017-12-08 09:49 /test/flume/output/FlumeData.1512697575258
-rw-r--r-- 2 root supergroup 1070 2017-12-08 09:51 /test/flume/output/FlumeData.1512697855497
-rw-r--r-- 2 root supergroup 1019 2017-12-08 09:52 /test/flume/output/FlumeData.1512697855498
-rw-r--r-- 2 root supergroup 146 2017-12-08 09:52 /test/flume/output/FlumeData.1512697855499.tmp
hadoop fs -cat /test/flume/output/FlumeData.1512697855499.tmp
# 输出
64 bytes from master (192.168.122.128): icmp_seq=21 ttl=64 time=0.034 ms
64 bytes from master (192.168.122.128): icmp_seq=22 ttl=64 time=0.030 ms
64 bytes from master (192.168.122.128): icmp_seq=23 ttl=64 time=0.031 ms
64 bytes from master (192.168.122.128): icmp_seq=24 ttl=64 time=0.035 ms
64 bytes from master (192.168.122.128): icmp_seq=25 ttl=64 time=0.037 ms
64 bytes from master (192.168.122.128): icmp_seq=26 ttl=64 time=0.027 ms
64 bytes from master (192.168.122.128): icmp_seq=27 ttl=64 time=0.031 ms
64 bytes from master (192.168.122.128): icmp_seq=28 ttl=64 time=0.028 ms
至此,单个source经过两个channel、发布到两个sink的流程已经实现;可以进一步引申:实时监控日志文件,通过kafka进行消息分发和实时分析,同时将数据持久化到hdfs。
参考: