1. Transferring data from Flume to HDFS
Upload apache-flume-1.9.0-bin.tar.gz to /opt/software on hadoop102
tar -zxvf apache-flume-1.9.0-bin.tar.gz -C /opt/module
cd /opt/module/
mv apache-flume-1.9.0-bin flume-1.9.0
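As a quick sanity check, you can print the Flume version from the new install directory:
[xx@hadoop102 flume-1.9.0]$ bin/flume-ng version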
Start HDFS
hdp.sh start
# Run a script to simulate log production
# Create a.log under /opt/module/datas and append to it:
while true; do echo $RANDOM >> a.log; sleep 0.01; done
1.1 Configure Flume's taildir-m-hdfs.conf
a1.sources = r1
a1.sinks = k1
a1.channels = c1
a1.sources.r1.channels = c1
a1.sources.r1.type = TAILDIR
a1.sources.r1.filegroups = g1
a1.sources.r1.filegroups.g1 = /opt/module/datas/a.*
a1.sources.r1.headers.g1.x = y
a1.sources.r1.fileHeader = true
a1.sources.r1.fileHeaderKey = filepath
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = timestamp
a1.sources.r1.interceptors.i1.headerName = timestamp
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 1000
a1.sinks.k1.channel = c1
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = hdfs://hadoop102:8020/flumedata/%Y-%m-%d/%H
a1.sinks.k1.hdfs.filePrefix = mylog-
a1.sinks.k1.hdfs.fileSuffix = .log
a1.sinks.k1.hdfs.rollSize = 268435456
a1.sinks.k1.hdfs.rollInterval = 120
a1.sinks.k1.hdfs.rollCount = 0
a1.sinks.k1.hdfs.batchSize = 1000
a1.sinks.k1.hdfs.fileType = CompressedStream
a1.sinks.k1.hdfs.codeC = snappy
a1.sinks.k1.hdfs.useLocalTimeStamp = false
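Note that fileType = CompressedStream with codeC = snappy needs the Hadoop native snappy library on the agent host; you can check native codec support beforehand:
hadoop checknative -a
If snappy reports false, install the native library or fall back to fileType = DataStream.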
1.2 Start the Flume agent
[xx@hadoop102 flume-1.9.0]$ bin/flume-ng agent -c ./conf -f ./agentConf/taildir-m-hdfs.conf -n a1 -Dflume.root.logger=DEBUG,console
Success!
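To confirm that files are landing in HDFS, list the sink directory for the current date (the path pattern follows hdfs.path above):
hdfs dfs -ls /flumedata/$(date +%Y-%m-%d)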
2. Flume cascade (multi-hop) example
2.1 Flume configuration on hadoop102: taildir-f-avrosink.conf
# Tier-1 agent configuration; every tier-1 node uses the same configuration
a1.sources = r1
a1.sinks = k1
a1.channels = c1
a1.sources.r1.channels = c1
a1.sources.r1.type = TAILDIR
a1.sources.r1.filegroups = g1
a1.sources.r1.filegroups.g1 = /opt/module/datas/a.*
a1.sources.r1.fileHeader = false
a1.channels.c1.type = file
a1.sinks.k1.channel = c1
a1.sinks.k1.type = avro
a1.sinks.k1.hostname = hadoop103
a1.sinks.k1.port = 4444
2.2 Flume configuration on hadoop103: avro-f-kfksink.conf
# Tier-2 agent configuration
a1.sources = r1
a1.sinks = k1
a1.channels = c1
a1.sources.r1.channels = c1
a1.sources.r1.type = avro
a1.sources.r1.bind = hadoop103
a1.sources.r1.port = 4444
a1.sources.r1.batchSize = 100
a1.channels.c1.type = file
a1.sinks.k1.channel = c1
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.kafka.bootstrap.servers = hadoop102:9092,hadoop103:9092,hadoop104:9092
a1.sinks.k1.kafka.topic = doitedu17
a1.sinks.k1.kafka.producer.acks = 1
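Unless the brokers auto-create topics, the doitedu17 topic should exist before events arrive. A creation command in the same style as the Kafka commands shown under 2.4 (partition and replica counts here are only examples):
bin/kafka-topics.sh --create --topic doitedu17 --partitions 3 --replication-factor 2 --zookeeper hadoop102:2181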
2.3 Simulate log production
On hadoop102, in the /opt/module/datas directory:
while true; do echo $RANDOM >> a.log; sleep 0.01; done
2.4 Start ZooKeeper and Kafka (the zk.sh and kafka.sh scripts are listed later under "Script files")
[xx@hadoop102 ~]$ zk.sh start
JMX enabled by default
Using config: /opt/module/zookeeper-3.4.6/bin/../conf/zoo.cfg
Starting zookeeper ... STARTED
JMX enabled by default
Using config: /opt/module/zookeeper-3.4.6/bin/../conf/zoo.cfg
Starting zookeeper ... STARTED
JMX enabled by default
Using config: /opt/module/zookeeper-3.4.6/bin/../conf/zoo.cfg
Starting zookeeper ... STARTED
[xx@hadoop102 ~]$ kafka.sh start
--------Starting hadoop102 Kafka-------
--------Starting hadoop103 Kafka-------
--------Starting hadoop104 Kafka-------
[xx@hadoop102 ~]$ xcall.sh jps
=======hadoop102=======
2130 Kafka
2233 Jps
1803 QuorumPeerMain
=======hadoop103=======
1776 QuorumPeerMain
2199 Jps
2105 Kafka
=======hadoop104=======
1771 QuorumPeerMain
2189 Jps
2094 Kafka
Basic Kafka command operations
## List topics
bin/kafka-topics.sh --list --zookeeper doitedu01:2181
## Create a topic
bin/kafka-topics.sh --create --topic topic2 --partitions 2 --replication-factor 2 --zookeeper doitedu01:2181
## Start a console producer to produce data
bin/kafka-console-producer.sh --broker-list doitedu01:9092,doitedu02:9092,doitedu03:9092 --topic topic2
>hello tom
## Start a console consumer to consume data
bin/kafka-console-consumer.sh --bootstrap-server doitedu01:9092,doitedu02:9092,doitedu03:9092 --topic topic2 --from-beginning
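## Inspect a topic's partition and replica layout (optional)
bin/kafka-topics.sh --describe --topic topic2 --zookeeper doitedu01:2181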
2.5 Hands-on steps for cascaded collection
1. Start tier 2 of the cascaded Flume system (hadoop103)
[xx@hadoop103 flume-1.9.0]$ bin/flume-ng agent -c conf/ -f agentConf/avro-f-kfksink.conf -n a1 -Dflume.root.logger=DEBUG,console
2. Start tier 1 of the cascaded Flume system (hadoop102)
[xx@hadoop102 flume-1.9.0]$ bin/flume-ng agent -c conf/ -f agentConf/taildir-f-avrosink.conf -n a1 -Dflume.root.logger=DEBUG,console
3. Check that the data reached Kafka by starting a consumer
[xx@hadoop104 kafka]$ bin/kafka-console-consumer.sh --bootstrap-server hadoop102:9092 --topic doitedu17
26088
1901
15413
32216
18167
11155
2907
22079
1172
16349
7927
2059
Success!
3. Channel selector with a custom interceptor: splitting the project's data streams
0. Write an interceptor
package com.zh.flume;

import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;

import java.util.List;

public class MultiplexingInterceptor implements Interceptor {

    private Integer flagfield = 0;
    private Integer timestampfield = 0;

    public MultiplexingInterceptor(Integer flagfield, Integer timestampfield) {
        this.flagfield = flagfield;
        this.timestampfield = timestampfield;
    }

    /**
     * Initialization work after the interceptor instance is constructed
     */
    @Override
    public void initialize() {
    }

    // Log format:
    // u01,ev1,mall,1568738583468
    @Override
    public Event intercept(Event event) {
        // Derive header values from the event body, using the field
        // indices supplied as configuration parameters
        byte[] body = event.getBody();
        String line = new String(body);
        String[] split = line.split(",");
        // Extract the business flag and add it to the headers
        event.getHeaders().put("flag", split[flagfield]);
        // Extract the event timestamp and add it to the headers
        event.getHeaders().put("timestamp", split[timestampfield]);
        return event;
    }

    @Override
    public List<Event> intercept(List<Event> list) {
        for (Event event : list) {
            intercept(event);
        }
        return list;
    }

    /**
     * Cleanup work before the interceptor is destroyed
     */
    @Override
    public void close() {
    }

    public static class MultiplexingInterceptorBuilder implements Interceptor.Builder {

        Integer flagfield = 0;
        Integer timestampfield = 0;

        /**
         * Builds an interceptor instance
         */
        @Override
        public Interceptor build() {
            return new MultiplexingInterceptor(flagfield, timestampfield);
        }

        /**
         * Entry point for reading the configuration parameters
         */
        @Override
        public void configure(Context context) {
            flagfield = context.getInteger("flagfield");
            timestampfield = context.getInteger("timestampfield");
        }
    }
}
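Note that the field indices are zero-based: in the sample line u01,ev1,mall,1568738583468, index 2 is mall and index 3 is the timestamp, which is why the agent config below sets flagfield = 2 and timestampfield = 3. A quick illustrative check from the shell (cut counts fields from 1, so field 3 corresponds to index 2):
echo "u01,ev1,mall,1568738583468" | cut -d, -f3    # prints: mall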
Package the jar and upload it to Flume's lib directory.
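A minimal packaging sketch, assuming the interceptor lives in a Maven project (the jar name below is hypothetical):
mvn clean package
scp target/flume-interceptor-1.0.jar hadoop102:/opt/module/flume-1.9.0/lib/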
1. Script to simulate log generation:
while true
do
if [ $(($RANDOM % 2)) -eq 0 ]
then
echo "u$RANDOM,e1,waimai,`date +%s`000" >> a.log
else
echo "u$RANDOM,e1,mall,`date +%s`000" >> a.log
fi
sleep 0.2
done
2. Flume agent configuration: 2 channels, 1 selector, 1 custom interceptor
[xx@hadoop102 flume-1.9.0]$ vim agentConf/multiplexing.conf
a1.sources = r1
a1.channels = c1 c2
a1.sinks = k1 k2
a1.sources.r1.channels = c1 c2
a1.sources.r1.type = TAILDIR
a1.sources.r1.filegroups = g1
a1.sources.r1.filegroups.g1 = /opt/module/datas/a.*
a1.sources.r1.fileHeader = false
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = com.zh.flume.MultiplexingInterceptor$MultiplexingInterceptorBuilder
a1.sources.r1.interceptors.i1.flagfield = 2
a1.sources.r1.interceptors.i1.timestampfield = 3
a1.sources.r1.selector.type = multiplexing
a1.sources.r1.selector.header = flag
a1.sources.r1.selector.mapping.mall = c1
a1.sources.r1.selector.mapping.waimai = c2
a1.sources.r1.selector.default = c2
a1.channels.c1.type = memory
a1.channels.c1.capacity = 2000
a1.channels.c1.transactionCapacity = 1000
a1.channels.c2.type = memory
a1.channels.c2.capacity = 2000
a1.channels.c2.transactionCapacity = 1000
a1.sinks.k1.channel = c1
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.kafka.bootstrap.servers = hadoop102:9092,hadoop103:9092,hadoop104:9092
a1.sinks.k1.kafka.topic = mall
a1.sinks.k1.kafka.producer.acks = 1
a1.sinks.k2.channel = c2
a1.sinks.k2.type = hdfs
a1.sinks.k2.hdfs.path = hdfs://hadoop102:8020/waimai/%Y-%m-%d/%H
a1.sinks.k2.hdfs.filePrefix = doitedu-log-
a1.sinks.k2.hdfs.fileSuffix = .log
a1.sinks.k2.hdfs.rollSize = 268435456
a1.sinks.k2.hdfs.rollInterval = 120
a1.sinks.k2.hdfs.rollCount = 0
a1.sinks.k2.hdfs.batchSize = 1000
a1.sinks.k2.hdfs.fileType = DataStream
a1.sinks.k2.hdfs.useLocalTimeStamp = false
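As with the cascade example, create the mall topic beforehand unless broker auto-creation is enabled (counts are only examples):
bin/kafka-topics.sh --create --topic mall --partitions 3 --replication-factor 2 --zookeeper hadoop102:2181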
3. Start HDFS, ZooKeeper, and Kafka
hdp.sh start
zk.sh start
kafka.sh start
[xx@hadoop102 ~]$ xcall.sh jps
=======hadoop102=======
2000 NameNode
3093 Kafka
2139 DataNode
3196 Jps
2477 NodeManager
2765 QuorumPeerMain
2655 JobHistoryServer
=======hadoop103=======
2545 QuorumPeerMain
2023 ResourceManager
2874 Kafka
2970 Jps
1836 DataNode
2156 NodeManager
=======hadoop104=======
2528 Kafka
2625 Jps
2037 NodeManager
1830 DataNode
1946 SecondaryNameNode
2205 QuorumPeerMain
Script files:
hdp.sh
#!/bin/bash
if [ $# -lt 1 ]
then
echo "No Args Input..."
exit ;
fi
case $1 in
"start"){
echo " =================== 启动 hadoop集群 ==================="
echo " --------------- 启动 hdfs ---------------"
ssh hadoop102 "/opt/module/hadoop-3.1.3/sbin/start-dfs.sh"
echo " --------------- 启动 yarn ---------------"
ssh hadoop103 "/opt/module/hadoop-3.1.3/sbin/start-yarn.sh"
echo " --------------- 启动 historyserver ---------------"
ssh hadoop102 "/opt/module/hadoop-3.1.3/bin/mapred --daemon start historyserver"
};;
"stop"){
echo " =================== 关闭 hadoop集群 ==================="
echo " --------------- 关闭 yarn ---------------"
ssh hadoop103 "/opt/module/hadoop-3.1.3/sbin/stop-yarn.sh"
echo " --------------- 关闭 hdfs ---------------"
ssh hadoop102 "/opt/module/hadoop-3.1.3/sbin/stop-dfs.sh"
echo " --------------- 关闭 historyserver ---------------"
ssh hadoop102 "/opt/module/hadoop-3.1.3/bin/mapred --daemon stop historyserver"
};;
*){
echo "Input Args Error..."
};;
esac
zk.sh:
#!/bin/bash
case $1 in
"start") {
for i in hadoop102 hadoop103 hadoop104
do
ssh $i "source /etc/profile; /opt/module/zookeeper-3.4.6/bin/zkServer.sh start"
done
};;
"stop") {
for i in hadoop102 hadoop103 hadoop104
do
ssh $i "source /etc/profile; /opt/module/zookeeper-3.4.6/bin/zkServer.sh stop"
done
};;
"status") {
for i in hadoop102 hadoop103 hadoop104
do
ssh $i "source /etc/profile; /opt/module/zookeeper-3.4.6/bin/zkServer.sh status"
done
};;
esac
kafka.sh:
#! /bin/bash
case $1 in
"start"){
for i in hadoop102 hadoop103 hadoop104
do
echo " --------启动 $i Kafka-------"
ssh $i "source /etc/profile; /opt/module/kafka/bin/kafka-server-start.sh -daemon /opt/module/kafka/config/server.properties "
done
};;
"stop"){
for i in hadoop102 hadoop103 hadoop104
do
echo " --------停止 $i Kafka-------"
ssh $i "source /etc/profile; /opt/module/kafka/bin/kafka-server-stop.sh stop"
done
};;
esac
4. Start the Flume agent
[xx@hadoop102 flume-1.9.0]$ bin/flume-ng agent -c conf/ -f agentConf/multiplexing.conf -n a1 -Dflume.root.logger=DEBUG,console
5. Test: check the data in Kafka and HDFS
[xx@hadoop103 kafka]$ bin/kafka-console-consumer.sh --bootstrap-server hadoop102:9092,hadoop103:9092,hadoop104:9092 --topic mall
u27323,e1,mall,1616769650000
u27407,e1,mall,1616769650000
u31325,e1,mall,1616769651000
u5017,e1,mall,1616769651000
u4703,e1,mall,1616769651000
u26035,e1,mall,1616769651000
u2202,e1,mall,1616769652000
u16182,e1,mall,1616769652000
u30472,e1,mall,1616769653000
u15990,e1,mall,1616769653000
u32291,e1,mall,1616769654000
u1236,e1,mall,1616769654000
u17401,e1,mall,1616769654000
u22316,e1,mall,1616769654000
[xx@hadoop103 kafka]$ hdfs dfs -tail hdfs://hadoop102:8020/waimai/2021-03-26/22/doitedu-log-.1616769585211.log
u27887,e1,waimai,1616769691000
u4557,e1,waimai,1616769692000
u3,e1,waimai,1616769692000
u932,e1,waimai,1616769692000
u2070,e1,waimai,1616769692000
u17029,e1,waimai,1616769693000
u15891,e1,waimai,1616769693000
u17386,e1,waimai,1616769693000
u28482,e1,waimai,1616769693000
u19256,e1,waimai,1616769695000
u3694,e1,waimai,1616769695000
u22488,e1,waimai,1616769696000
u12503,e1,waimai,1616769696000
u963,e1,waimai,1616769696000
u20473,e1,waimai,1616769696000
u15916,e1,waimai,1616769697000
u15052,e1,waimai,1616769697000
u8609,e1,waimai,1616769697000
u14148,e1,waimai,1616769697000
u24441,e1,waimai,1616769698000
Success!
4. HA for the cascade aggregation tier (failover)
Tier-1 cascade HA configuration (hadoop102): failover.conf
a1.sources = r1
a1.sinks = k1 k2
a1.channels = c1
a1.sources.r1.channels = c1
a1.sources.r1.type = TAILDIR
a1.sources.r1.filegroups = g1
a1.sources.r1.filegroups.g1 = /opt/module/datas/a.*
a1.sources.r1.fileHeader = false
a1.channels.c1.type = memory
a1.channels.c1.capacity = 2000
a1.channels.c1.transactionCapacity = 1000
a1.sinks.k1.channel = c1
a1.sinks.k1.type = avro
a1.sinks.k1.hostname = hadoop103
a1.sinks.k1.port = 4444
a1.sinks.k2.channel = c1
a1.sinks.k2.type = avro
a1.sinks.k2.hostname = hadoop104
a1.sinks.k2.port = 4444
a1.sinkgroups = g1
a1.sinkgroups.g1.sinks = k1 k2
a1.sinkgroups.g1.processor.type = failover
a1.sinkgroups.g1.processor.priority.k1 = 200
a1.sinkgroups.g1.processor.priority.k2 = 100
a1.sinkgroups.g1.processor.maxpenalty = 5000
Tier-2 cascade HA configuration (hadoop103): failover.conf
a1.sources = r1
a1.sinks = k1
a1.channels = c1
a1.sources.r1.channels = c1
a1.sources.r1.type = avro
a1.sources.r1.bind = hadoop103
a1.sources.r1.port = 4444
a1.sources.r1.batchSize = 100
a1.channels.c1.type = memory
a1.channels.c1.capacity = 2000
a1.channels.c1.transactionCapacity = 1000
a1.sinks.k1.channel = c1
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.kafka.bootstrap.servers = hadoop102:9092,hadoop103:9092,hadoop104:9092
a1.sinks.k1.kafka.topic = failover
a1.sinks.k1.kafka.producer.acks = 1
Tier-2 cascade HA configuration (hadoop104): failover.conf
a1.sources = r1
a1.sinks = k1
a1.channels = c1
a1.sources.r1.channels = c1
a1.sources.r1.type = avro
a1.sources.r1.bind = hadoop104
a1.sources.r1.port = 4444
a1.sources.r1.batchSize = 100
a1.channels.c1.type = memory
a1.channels.c1.capacity = 2000
a1.channels.c1.transactionCapacity = 1000
a1.sinks.k1.channel = c1
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.kafka.bootstrap.servers = hadoop102:9092,hadoop103:9092,hadoop104:9092
a1.sinks.k1.kafka.topic = failover
a1.sinks.k1.kafka.producer.acks = 1
Generate test logs on hadoop102 in /opt/module/datas:
while true
do
if [ $(($RANDOM % 2)) -eq 0 ]
then
echo "u$RANDOM,e1,waimai,`date +%s`000" >> a.log
else
echo "u$RANDOM,e1,mall,`date +%s`000" >> a.log
fi
sleep 0.2
done
Copy Flume to hadoop104
[xx@hadoop102 module]$ scp -r flume-1.9.0/ hadoop104:$PWD
Start the agents on hadoop103 and hadoop104:
bin/flume-ng agent -c conf/ -f agentConf/failover.conf -n a1 -Dflume.root.logger=DEBUG,console
Start the agent on hadoop102:
bin/flume-ng agent -c conf/ -f agentConf/failover.conf -n a1 -Dflume.root.logger=DEBUG,console
Check the failover topic in Kafka:
bin/kafka-console-consumer.sh --bootstrap-server \
hadoop102:9092,hadoop103:9092,hadoop104:9092 \
--topic failover
u21356,e1,waimai,1616817611000
u11451,e1,mall,1616817611000
u15814,e1,waimai,1616817612000
u20755,e1,waimai,1616817612000
Success!
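To exercise the failover itself, stop the higher-priority tier-2 agent on hadoop103 and watch the consumer: after a short pause, events should keep arriving via hadoop104, and shift back once the hadoop103 agent is restarted. An illustrative way to stop it:
ssh hadoop103 "pkill -f flume"    # illustrative; kills the Flume agent process on hadoop103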