具体流程
1.Nginx
产生格式化的日志信息,修改 /usr/local/nginx/conf/nginx.conf 文件
# Log format and access-log location (placed inside the http{} block of
# /usr/local/nginx/conf/nginx.conf).
# NOTE: nginx comments start with '#', not '//' — the original trailing
# '//...' text is not valid nginx syntax and would break the config load.
log_format main '$remote_addr,$remote_user,$time_local';  # comma-separated fields, matching the Hive table's ',' delimiter
access_log logs/log.frame.access.log main;  # where the formatted access log is written
Nginx生成日期的两种格式
$time_iso8601 生成格式:2013-09-25T15:16:35+08:00
$time_local 生成格式: 25/Sep/2013:15:12:13 +0800
2.编写两个flume配置文件
1.flume-kafka.conf
# Agent "a1": tail the nginx access log and publish each line to Kafka.
# Components: one exec source, one memory channel, one Kafka sink.
a1.sources = source1
a1.sinks = k1
a1.channels = c1
# Exec source: 'tail -F' keeps following the file across log rotation.
a1.sources.source1.type = exec
a1.sources.source1.command = tail -F /usr/local/nginx/logs/log.frame.access.log
# Describe the sink
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
# NOTE(review): 'topic' and 'brokerList' are the deprecated property names
# in Flume 1.7 ('kafka.topic' / 'kafka.bootstrap.servers' are current);
# both spellings still work in 1.7 — confirm before upgrading Flume.
a1.sinks.k1.topic = first
a1.sinks.k1.brokerList = hdp-1:9092,hdp-2:9092,hdp-3:9092
a1.sinks.k1.requiredAcks = 1
a1.sinks.k1.batchSize = 20
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# Bind the source and sink to the channel
a1.sources.source1.channels = c1
a1.sinks.k1.channel = c1
2.kafka-hdfs.conf
# Agent "agent": consume events from Kafka topic "first" and write them
# to HDFS under the Hive table's warehouse directory.
# Name of the source
agent.sources = kafkaSource
# Name of the channel; convention: name it after its type
agent.channels = memoryChannel
# Name of the sink; convention: name it after its destination
agent.sinks = hdfsSink
# Channel the source writes into (plural key: 'channels')
agent.sources.kafkaSource.channels = memoryChannel
# Channel the sink reads from (note: singular 'channel' here)
agent.sinks.hdfsSink.channel = memoryChannel
#-------- kafkaSource settings -----------------
# Source type: Kafka consumer
agent.sources.kafkaSource.type = org.apache.flume.source.kafka.KafkaSource
# ZooKeeper ensemble used by Kafka
#
# Important: this must be KAFKA's ZooKeeper address
#
# NOTE(review): 'zookeeperConnect' is deprecated in Flume 1.7; the current
# property is 'kafka.bootstrap.servers' pointing at the brokers — confirm
# before upgrading Flume.
agent.sources.kafkaSource.zookeeperConnect = hdp-1:2181,hdp-2:2181,hdp-3:2181
# Kafka topic to consume (same topic the first agent publishes to)
agent.sources.kafkaSource.topic = first
# Consumer group id (commented out — the default is used)
#agent.sources.kafkaSource.groupId = flume
# Consumer poll timeout (ms). Any Kafka consumer option can be set the same
# way: properties prefixed with 'kafka.' are passed through to the consumer.
agent.sources.kafkaSource.kafka.consumer.timeout.ms = 100
#------- memoryChannel settings -------------------------
# Channel type
agent.channels.memoryChannel.type = memory
# Max number of events held in the channel
agent.channels.memoryChannel.capacity=10000
# Max events per channel transaction
agent.channels.memoryChannel.transactionCapacity=1000
#---------hdfsSink settings------------------
agent.sinks.hdfsSink.type = hdfs
# Write straight into the Hive table's warehouse directory so closed files
# become immediately queryable via hdp_1_hive.t_kafka_hive
agent.sinks.hdfsSink.hdfs.path = hdfs://hdp-1:9000/user/hive/warehouse/hdp_1_hive.db/t_kafka_hive
agent.sinks.hdfsSink.hdfs.writeFormat = Text
# DataStream = plain text files (no SequenceFile wrapping)
agent.sinks.hdfsSink.hdfs.fileType = DataStream
# Roll files at 1024 bytes; event-count rolling disabled; also roll every 60 s
agent.sinks.hdfsSink.hdfs.rollSize = 1024
agent.sinks.hdfsSink.hdfs.rollCount = 0
agent.sinks.hdfsSink.hdfs.rollInterval = 60
# File name prefix and suffix for completed files
agent.sinks.hdfsSink.hdfs.filePrefix=test
agent.sinks.hdfsSink.hdfs.fileSuffix=.data
# Prefix in-progress files with '_' (and no in-use suffix) so downstream
# readers such as Hive skip files that are still being written
agent.sinks.hdfsSink.hdfs.inUsePrefix=_
agent.sinks.hdfsSink.hdfs.inUseSuffix=
# Custom interceptor (disabled)
#agent.sources.kafkaSource.interceptors=i1
#agent.sources.kafkaSource.interceptors.i1.type=com.hadoop.flume.FormatInterceptor$Builder
3.启动hive在hive中创建hdp_1_hive数据库,创建t_kafka_hive表
创建数据库命令
-- Create the database that will hold the Kafka-fed table
create database hdp_1_hive;
创建表命令
-- Table over the Flume HDFS sink's output directory; the three columns
-- match the comma-separated fields of the nginx log_format
-- ($remote_addr, $remote_user, $time_local)
create table hdp_1_hive.t_kafka_hive(ip string, user_name string, user_local string)
row format delimited
fields terminated by ',';
4.启动flume 启动之前将集群启动,zookeeper启动,nginx启动,web工程启动
# Start agent "a1" (nginx log -> Kafka) from flume-kafka.conf;
# -n must match the agent name used inside the config file
/root/apps/flume-1.7.0/bin/flume-ng agent \
-c conf \
-n a1 \
-f /root/apps/flume-1.7.0/confdir/flume-kafka.conf \
-Dflume.root.logger=DEBUG,console
# Start agent "agent" (Kafka -> HDFS) from kafka-hdfs.conf
/root/apps/flume-1.7.0/bin/flume-ng agent \
-c conf \
-n agent \
-f /root/apps/flume-1.7.0/confdir/kafka-hdfs.conf \
-Dflume.root.logger=DEBUG,console
5.到hdp-1:50070查看文件是否已经上传
6.进入hive数据库,查看数据库中是否有数据
-- Query the table to confirm rows arrived through the pipeline
select * from hdp_1_hive.t_kafka_hive;