Flume 是 Cloudera 提供的一个高可用、高可靠、分布式的海量日志采集、聚合和传输系统。Flume 支持在日志系统中定制各类数据发送方,用于收集数据;同时,Flume 还提供对数据进行简单处理,并写到各种(可定制的)数据接收方的能力。
示例:监听文件夹(spooldir source),以文件(file channel)为通道,将数据传到 HDFS
# Flume agent "user_friends": spooling-directory source -> file channel -> HDFS sink.

# Component names declared for this agent.
user_friends.sources = userFriendsSource
user_friends.channels = userFriendsChannel
user_friends.sinks = userFriendsSink

# Source: watch a local directory and ingest files dropped into it.
user_friends.sources.userFriendsSource.type = spooldir
user_friends.sources.userFriendsSource.spoolDir = /software/kb07file/flumeFile/events/
# Read files line by line; allow very long lines (up to 600000 characters per event).
user_friends.sources.userFriendsSource.deserializer = LINE
user_friends.sources.userFriendsSource.deserializer.maxLineLength = 600000
# Only pick up files named like user_friends_YYYY-MM-DD.csv.
# The regex must stay on a single line: a line break inside it truncates the
# property value and turns the remainder into unrelated keys.
# NOTE(review): includePattern is not available in older Flume releases - confirm
# the deployed Flume version supports it.
user_friends.sources.userFriendsSource.includePattern = user_friends_[0-9]{4}-[0-9]{2}-[0-9]{2}.csv

# Channel: durable, file-backed buffer between source and sink.
user_friends.channels.userFriendsChannel.type = file
user_friends.channels.userFriendsChannel.checkpointDir = /software/kb07file/flumeFile/checkpointDir/user_friendscheckpoint/
user_friends.channels.userFriendsChannel.dataDir = /software/kb07file/flumeFile/data/user_friends/

# Sink: write events to HDFS as plain (uncompressed, non-SequenceFile) data.
user_friends.sinks.userFriendsSink.type = hdfs
user_friends.sinks.userFriendsSink.hdfs.fileType = DataStream
user_friends.sinks.userFriendsSink.hdfs.filePrefix = user
user_friends.sinks.userFriendsSink.hdfs.fileSuffix = .csv
# Target directory is partitioned by date via the %Y-%m-%d escape sequences.
user_friends.sinks.userFriendsSink.hdfs.path = hdfs://192.168.211.151:9000/data/flume/%Y-%m-%d
# Use the agent's local time for the date escapes, since events carry no timestamp header.
user_friends.sinks.userFriendsSink.hdfs.useLocalTimeStamp = true
# Number of events written per flush to HDFS.
user_friends.sinks.userFriendsSink.hdfs.batchSize = 640
# File rolling: 0 disables rolling by event count; roll at 64000000 bytes
# or after 30 seconds, whichever comes first.
user_friends.sinks.userFriendsSink.hdfs.rollCount = 0
user_friends.sinks.userFriendsSink.hdfs.rollSize = 64000000
user_friends.sinks.userFriendsSink.hdfs.rollInterval = 30

# Wiring: a source may feed several channels ("channels"), a sink drains exactly one ("channel").
user_friends.sources.userFriendsSource.channels = userFriendsChannel
user_friends.sinks.userFriendsSink.channel = userFriendsChannel
自定义过滤器
使用 Java 编写过滤器
导入依赖
<!-- Flume core library: supplies the org.apache.flume.* classes (e.g. org.apache.flume.Context)
     imported by the Java code that follows. Goes inside the <dependencies> section of pom.xml. -->
<dependency>
<groupId>org.apache.flume</groupId>
<artifactId>flume-ng-core</artifactId>
<version>1.6.0</version>
</dependency>
package cn.kgc.Tset;
import org.apache.flume.Context;
import org.apache