Flume拦截器
一.使用正则过滤拦截器(去掉首行)
属性参数
- type 组件类型
regex_filter
- regex 用于匹配Event内容的正则表达式
- excludeEvents 如果为true,被正则匹配到的Event会被丢弃;如果为false,不被正则匹配到的Event会被丢弃
需求:
使用Spooling directory source监督符合格式的文件进行上传(格式:users_年-月-日.csv);
使用正则拦截器去除首行;
使用file channel进行缓存;
以规定的文件格式(文件名前缀user,后缀.csv)上传到HDFS上规定文件夹下
# Create the spool, checkpoint and channel-data directories used by the agent.
# -p creates missing parent directories and does not fail if they already exist.
[root@hadoop1 user]# mkdir -p /opt/flume160/conf/jobkb09/dataSourceFile/user
[root@hadoop1 user]# mkdir -p /opt/flume160/conf/jobkb09/checkPointFile/user
[root@hadoop1 user]# mkdir -p /opt/flume160/conf/jobkb09/dataChannelFile/user
#agent文件
[root@hadoop1 jobkb09]# vi user-flume-hdfs.conf
# Declare the components of the "users" agent: one source, one channel, one sink.
users.sources = userSource
users.channels = userChannel
users.sinks = userSink
# Spooling-directory source: ingests files placed in spoolDir.
users.sources.userSource.type=spooldir
users.sources.userSource.spoolDir=/opt/flume160/conf/jobkb09/dataSourceFile/user
# Only pick up files named users_YYYY-MM-DD.csv. The pattern must stay on one
# line; [.] matches a literal dot without needing a properties-file backslash
# escape.
users.sources.userSource.includePattern=users_[0-9]{4}-[0-9]{2}-[0-9]{2}[.]csv
users.sources.userSource.deserializer=LINE
users.sources.userSource.deserializer.maxLineLength=10000
# Regex filtering interceptor used to remove the CSV header line.
users.sources.userSource.interceptors=head_filter
users.sources.userSource.interceptors.head_filter.type=regex_filter
# Match events whose body starts with "user_id" (the header row).
users.sources.userSource.interceptors.head_filter.regex=^user_id
# true = events matching the regex are dropped. This comment must be on its own
# line: properties files have no inline comments, so text after the value would
# become part of the value and it would no longer be read as true.
users.sources.userSource.interceptors.head_filter.excludeEvents=true
# File channel: buffers events on local disk between the source and the sink.
users.channels.userChannel.type = file
# Directory where the channel keeps its checkpoint.
users.channels.userChannel.checkpointDir = /opt/flume160/conf/jobkb09/checkPointFile/user
# Directory where the channel stores its temporary data files.
users.channels.userChannel.dataDirs = /opt/flume160/conf/jobkb09/dataChannelFile/user
# HDFS sink: writes the events as plain (uncompressed) stream files.
users.sinks.userSink.type = hdfs
# Target directory; %Y-%m-%d yields one sub-directory per day.
users.sinks.userSink.hdfs.path = hdfs://192.168.36.100:9000/kb09file/user/users/%Y-%m-%d
# Resolve the date escapes from the agent's local time rather than from an
# event timestamp header.
users.sinks.userSink.hdfs.useLocalTimeStamp = true
# Output file naming and format.
users.sinks.userSink.hdfs.filePrefix = user
users.sinks.userSink.hdfs.fileSuffix = .csv
users.sinks.userSink.hdfs.fileType = DataStream
# Number of events written per flush to HDFS.
users.sinks.userSink.hdfs.batchSize = 640
# File rolling: every 20 seconds or at ~120 MB, whichever comes first;
# rollCount = 0 turns off rolling by event count.
users.sinks.userSink.hdfs.rollInterval = 20
users.sinks.userSink.hdfs.rollSize = 120000000
users.sinks.userSink.hdfs.rollCount = 0
# Wire the source and the sink to the channel. A source takes a list
# ("channels"); a sink reads from exactly one channel ("channel").
users.sources.userSource.channels=userChannel
users.sinks.userSink.channel=userChannel