Flume跨服务器采集文件数据到HDFS完整案例

最新推荐文章于 2022-03-31 23:25:15 发布

此木Y

最新推荐文章于 2022-03-31 23:25:15 发布

阅读量2.7k

点赞数 5

文章标签： flume hdfs 数据采集 大数据

本文链接：https://blog.csdn.net/weixin_43909382/article/details/117126432

版权

一：整体架构如下图所示，有两台服务器A和B，要把服务器A上的数据采集到服务器B上的HDFS。
在这里插入图片描述

二：首先是原始数据，我这里一直保存的是近3天的数据。
可以使用下面脚本实现

#!/bin/bash
# Retention job for the FTP landing directory: remove every entry whose name
# starts with "202" and that find's "-mtime +2" test considers old enough,
# so that only the most recent days of raw data stay on disk.
find /home/ftpuser/home/ftpuser -name "202*" -mtime +2 -exec rm -rf {} \;

在这里插入图片描述
Flume的配置：
1.在服务器A的Flume安装目录的conf目录下新建aserver.conf

# Server A: Flume agent "a1".
# Four exec sources (one per daily data file) feed one memory channel, which
# is drained by a single Avro sink that forwards the events to another agent.
a1.sources = r1 r2 r3 r4
a1.sinks = k1
a1.channels = c1
# Author's note: do not drop a file with an already-used name into the watched
# directory, otherwise the agent crashes.
# NOTE(review): this caveat appears to concern a spooldir source rather than
# the exec sources configured below -- confirm before relying on it.
# Source r1: pick up data appended to a file (exec source; spoolDir was the
# alternative the author mentions)
a1.sources.r1.type = exec
# Run the command through bash so the backtick substitution below is expanded
a1.sources.r1.shell = /bin/bash -c
# The first line of the data file is not wanted, so collection starts at line 2
# ("tail -n +2"). The `date +%Y%m%d` part is expanded by the shell when the
# command is launched, so the file name carries the date of that moment.
a1.sources.r1.command = tail -n +2 -F  "/home/ftpuser/home/ftpuser/`date +%Y%m%d`_acct_flow"
a1.sources.r1.interceptors = i1 i2
a1.sources.r1.interceptors.i1.type = static
# The static interceptor adds a fixed key/value pair to the event header.
# Two interceptors are configured for this source: i1 (static) and i2 (timestamp).
a1.sources.r1.interceptors.i1.key = type
# The value equals the file-name suffix, to keep things easy to follow
a1.sources.r1.interceptors.i1.value = acct_flow
a1.sources.r1.interceptors.i2.type = timestamp

# Source r2: same layout as r1, for the *_ntdflow_record file
a1.sources.r2.type = exec
a1.sources.r2.shell = /bin/bash -c
a1.sources.r2.command  = tail -n +2 -F  "/home/ftpuser/home/ftpuser/`date +%Y%m%d`_ntdflow_record"
a1.sources.r2.interceptors = i1 i2
a1.sources.r2.interceptors.i1.type = static
# Static header "type" = ntdflow_record; i2 is the timestamp interceptor
a1.sources.r2.interceptors.i1.key = type
a1.sources.r2.interceptors.i1.value = ntdflow_record
a1.sources.r2.interceptors.i2.type = timestamp

# Source r3: same layout as r1, for the *_online_detail file
a1.sources.r3.type = exec
a1.sources.r3.shell = /bin/bash -c
a1.sources.r3.command  = tail -n +2 -F  "/home/ftpuser/home/ftpuser/`date +%Y%m%d`_online_detail"
a1.sources.r3.interceptors = i1 i2
a1.sources.r3.interceptors.i1.type = static
# Static header "type" = online_detail; i2 is the timestamp interceptor
a1.sources.r3.interceptors.i1.key = type
a1.sources.r3.interceptors.i1.value = online_detail
a1.sources.r3.interceptors.i2.type = timestamp

# Source r4: same layout as r1, for the *_userinfo file
a1.sources.r4.type = exec
a1.sources.r4.shell = /bin/bash -c
a1.sources.r4.command  = tail -n +2 -F  "/home/ftpuser/home/ftpuser/`date +%Y%m%d`_userinfo"
a1.sources.r4.interceptors = i1 i2
a1.sources.r4.interceptors.i1.type = static
# Static header "type" = userinfo; i2 is the timestamp interceptor
a1.sources.r4.interceptors.i1.key = type
a1.sources.r4.interceptors.i1.value = userinfo
a1.sources.r4.interceptors.i2.type = timestamp

# Sink: forward events over Avro to hostname:port.
# NOTE(review): presumably the address of the agent running bserver.conf
# (its Avro source uses the same port 44444) -- confirm.
a1.sinks.k1.type = avro
a1.sinks.k1.hostname=192.168.xxx.xx
a1.sinks.k1.port = 44444

# Channel: memory channel shared by all four sources and the sink
a1.channels.c1.type = memory
a1.channels.c1.capacity = 30000
a1.channels.c1.transactionCapacity = 10000
# Attach every source and the sink to channel c1
a1.sources.r1.channels = c1
a1.sources.r2.channels = c1
a1.sources.r3.channels = c1
a1.sources.r4.channels = c1
a1.sinks.k1.channel = c1

2.在服务器B的Flume安装目录的conf目录下新建bserver.conf

# Server B: Flume agent "b1".
# Receives events over Avro and writes them to HDFS through one memory channel.
b1.sources = r2
b1.sinks = k2
b1.channels = c2
# Avro source that accepts the events sent by the upstream agent's Avro sink
b1.sources.r2.type = avro
# Address this source binds to and listens on: server B's own IP/hostname
# (or 0.0.0.0 for all interfaces) -- not server A's address
b1.sources.r2.bind=192.168.xxx.xx
b1.sources.r2.port = 44444
#b1.sources.r2.interceptors = i1
#b1.sources.r2.interceptors.i1.type = timestamp
# HDFS sink
b1.sinks.k2.type =hdfs
# Target directory on HDFS. %{type} is replaced by the event's "type" header
# (set by the static interceptors in aserver.conf), %Y%m%d by the date.
b1.sinks.k2.hdfs.path = hdfs://192.168.xxx.xx/user/hive/warehouse/ods.db/%{type}/dt=%Y%m%d/
# Prefix of the generated file names
#b1.sinks.k2.hdfs.filePrefix = events-
b1.sinks.k2.hdfs.filePrefix = %{type}
# Round down the timestamp used for the time escapes in hdfs.path ...
b1.sinks.k2.hdfs.round = true
# ... to a multiple of this value ...
b1.sinks.k2.hdfs.roundValue = 10
# ... expressed in this unit
b1.sinks.k2.hdfs.roundUnit = minute
# Roll to a new HDFS file every 60 seconds
b1.sinks.k2.hdfs.rollInterval = 60
# Roll to a new HDFS file once the current one reaches 128 MB
b1.sinks.k2.hdfs.rollSize = 134217728
# Do not roll files based on the number of events
b1.sinks.k2.hdfs.rollCount = 0
# Number of events accumulated before they are flushed to HDFS
b1.sinks.k2.hdfs.batchSize = 100
# Use the local time when expanding the time escapes
b1.sinks.k2.hdfs.useLocalTimeStamp = true
# Output file type; the default is SequenceFile, DataStream writes plain text
b1.sinks.k2.hdfs.fileType = DataStream
# Channel: memory channel between the Avro source and the HDFS sink
b1.channels.c2.type = memory
b1.channels.c2.capacity = 10000
b1.channels.c2.transactionCapacity = 100
# Wire source -> channel -> sink. This is the only place the sink's channel is
# set; it must name a channel declared in b1.channels (c2).
b1.sources.r2.channels = c2
b1.sinks.k2.channel = c2

3.脚本运行：这里使用脚本来启动和停止Flume（即下文定时任务中调用的 timingFlume.sh），也可以直接执行单条启动命令。

#!/bin/bash
# Start/stop/restart helper for a Flume agent.
# Globals defined here: JAVA_HOME (exported), path, JAR.

# Substring used to recognise the Flume JVM in the `ps` listing
JAR="flume"
# Flume installation directory
path=/home/flume/flume-1.9
# JDK location, exported so child processes inherit it
export JAVA_HOME=/opt/jdk1.8.0_181

echo "$path"

#######################################
# Launch Flume agent "a1" in the background unless one is already running.
# Globals:
#   path (read) - Flume installation directory
#   JAR  (read) - substring identifying the Flume JVM in `ps` output
# Outputs:
#   Progress messages on stdout; the agent's own output is appended to
#   $path/logs/flume.log.
# Returns:
#   0 after the agent has been launched, 1 if a matching java process already
#   exists (nothing is started in that case). The function returns instead of
#   exiting so that callers can continue after it.
#######################################
function start(){
    local num
    echo "开始启动 ...."
    # Count java processes whose command line mentions $JAR.
    num=$(ps -ef | grep java | grep -c -- "$JAR")
    echo "进程数:$num"
    if [ "$num" != "0" ]; then
        echo "进程已经存在，启动失败，请检查....."
        return 1
    fi
    # The redirect below fails if the log directory is missing.
    mkdir -p -- "$path/logs"
    # Adjust the agent arguments below as needed.
    nohup "$path/bin/flume-ng" agent --name a1 --conf "$path/conf" \
        --conf-file "$path/conf/aserver.conf" >>"$path/logs/flume.log" 2>&1 &
    echo "启动成功...."
    echo "日志路径: $path/logs/flume.log"
    return 0
}

#######################################
# Send the default kill signal to every java process whose `ps -ef` line
# contains $JAR.
# Globals:
#   JAR (read), num (written: number of matching processes found)
# Outputs:
#   Progress messages on stdout.
#######################################
function stop(){
    echo "开始stop ....."
    num=$(ps -ef|grep java|grep $JAR|wc -l)
    # Nothing matched: report it and leave.
    if [ "$num" = "0" ] ; then
        echo "服务未启动，无需停止..."
        return
    fi
    # Forceful variant, intentionally left disabled:
    #ps -ef|grep java|grep $JAR|awk '{print $2;}'|xargs kill -9
    # Plain kill so Flume can shut down normally; column 2 of `ps -ef` is the PID.
    ps -ef|grep java|grep $JAR|awk '{print $2;}'|xargs kill
    echo "进程已经关闭..."
}


#######################################
# Stop the agent, wait until no matching java process is left, then start it.
# Globals:
#   JAR (read), num (written)
# Outputs:
#   Progress messages on stdout, plus whatever stop and start print.
#######################################
function restart(){
    echo "begin stop process ..."
    stop
    # Poll once per second until the process list no longer shows a match.
    while :; do
        num=$(ps -ef|grep java|grep $JAR|wc -l)
        [ $num -gt 0 ] || break
        sleep 1
    done
    echo "process stoped,and starting ..."
    start
    echo "started ..."
}

# Entry point: dispatch on the first command-line argument.
# The script exits with the status of the selected action, so a failed action
# is visible to the caller. A missing or unknown action prints the usage line
# to stderr and exits 1.
case "$1" in
    "start")
        start "$@"
        exit "$?"
        ;;
    "stop")
        stop
        exit "$?"
        ;;
    "restart")
        restart
        exit "$?"
        ;;
    *)
        echo "用法： $0 {start|stop|restart}" >&2
        exit 1
        ;;
esac

4.定时启动(centos7)输入： crontab -e

# Every day at 02:30: run the control script with the "start" action
30 2 * * * /home/flume/flume-1.9/timingFlume.sh start
# Every day at 02:50: run the control script with the "stop" action
50 2 * * * /home/flume/flume-1.9/timingFlume.sh stop

因为我的数据量不多，几分钟就结束了，所以采集完就stop了。
5.HDFS上的数据
在这里插入图片描述
注意：要先启动服务器B上的Flume，再启动服务器A上的Flume。因为A的avro sink需要连接B上avro source监听的44444端口，B没有启动时A无法建立连接。
如有问题，欢迎留言交流。

此木Y

关注

5
点赞
踩
2

收藏

觉得还不错? 一键收藏
5
评论
Flume跨服务器采集文件数据到HDFS完整案例

一：整体架构如下图所示，有两台服务器A和B，要把服务器A上的数据采集到服务器B上的HDFS。二：首先是原始数据，我这里一直保存的是近3天的数据。可以使用下面脚本实现#!/bin/bashfind /home/ftpuser/home/ftpuser -mtime +2 -name "202*" -exec rm -rf {} \;Flume的配置：1.在服务器A的Flume安装目录的conf目录下新建aserver.conf#服务器Aa1.sources = r1 r2 r3 r4a
复制链接

扫一扫