1. Collecting a directory into HDFS
Requirement: a particular directory on a server keeps receiving new files; whenever a new file appears, it must be collected into HDFS.
Based on this requirement, first define the following three key elements:
- Collection source (source): a monitored spooling directory, i.e. the spooldir source
- Delivery target (sink): the HDFS file system, i.e. the hdfs sink
- Transfer channel between source and sink: either a file channel or a memory channel
Configuration file (spooldir-hdfs.conf):
agent1.sources = source1
agent1.sinks = sink1
agent1.channels = channel1
agent1.sources.source1.type = spooldir
agent1.sources.source1.spoolDir = /logs
agent1.sources.source1.fileHeader = false
agent1.sources.source1.interceptors = i1
agent1.sources.source1.interceptors.i1.type = host
agent1.sources.source1.interceptors.i1.hostHeader = hostname
# Charset of the input log files (only one value should be set)
#agent1.sources.source1.deserializer.outputCharset = UTF-8
agent1.sources.source1.deserializer.outputCharset = ISO-8859-1
# Do not collect files that are still being written (*.tmp)
agent1.sources.source1.ignorePattern = ^(.)*\\.tmp$
agent1.sinks.sink1.type = hdfs
agent1.sinks.sink1.hdfs.path = hdfs://10.1.20.174:8020/web_log
agent1.sinks.sink1.hdfs.filePrefix = log_
agent1.sinks.sink1.hdfs.maxOpenFiles = 5000
agent1.sinks.sink1.hdfs.batchSize = 100
agent1.sinks.sink1.hdfs.fileType = DataStream
agent1.sinks.sink1.hdfs.writeFormat = Text
agent1.sinks.sink1.hdfs.rollSize = 102400
agent1.sinks.sink1.hdfs.rollCount = 1000000
agent1.sinks.sink1.hdfs.rollInterval = 60
#agent1.sinks.sink1.hdfs.round = true
#agent1.sinks.sink1.hdfs.roundValue = 10
#agent1.sinks.sink1.hdfs.roundUnit = minute
agent1.sinks.sink1.hdfs.useLocalTimeStamp = true
agent1.sinks.sink1.hdfs.callTimeout=30000
# Use a channel which buffers events in memory
agent1.channels.channel1.type = memory
agent1.channels.channel1.keep-alive = 120
agent1.channels.channel1.capacity = 500000
agent1.channels.channel1.transactionCapacity = 600
# Bind the source and sink to the channel
agent1.sources.source1.channels = channel1
agent1.sinks.sink1.channel = channel1
Start command (run from the Flume installation root):
bin/flume-ng agent -c ./conf -f ./conf/spooldir-hdfs.conf -n agent1 -Dflume.root.logger=INFO,console
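For a quick check, drop a finished file into the spooling directory and list the target HDFS path (a minimal sketch; the sample file name is hypothetical, and an HDFS client is assumed on this host):
cp /tmp/access.log /logs/                         # hypothetical sample file
hdfs dfs -ls hdfs://10.1.20.174:8020/web_log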
2. Collecting an appended file into HDFS
Requirement: a business application writes its log with log4j and the log keeps growing; the data appended to the log file must be collected into HDFS in near real time.
Based on this requirement, first define the following three key elements:
- Collection source (source): monitor appended file content with an exec source running 'tail -F file'
- Delivery target (sink): the HDFS file system, i.e. the hdfs sink
- Transfer channel between source and sink: either a file channel or a memory channel
Configuration file (tail-hdfs.conf):
agent1.sources = source1
agent1.sinks = sink1
agent1.channels = channel1
# Describe/configure tail -F source1
agent1.sources.source1.type = exec
agent1.sources.source1.command = tail -F /logs/server.log
#configure host for source
agent1.sources.source1.interceptors = i1
agent1.sources.source1.interceptors.i1.type = host
agent1.sources.source1.interceptors.i1.hostHeader = hostname
# Describe sink1
agent1.sinks.sink1.type = hdfs
agent1.sinks.sink1.hdfs.path = hdfs://10.1.20.174:8020/web_log
agent1.sinks.sink1.hdfs.filePrefix = server.log
agent1.sinks.sink1.hdfs.maxOpenFiles = 5000
agent1.sinks.sink1.hdfs.batchSize = 100
agent1.sinks.sink1.hdfs.fileType = DataStream
agent1.sinks.sink1.hdfs.writeFormat = Text
agent1.sinks.sink1.hdfs.rollSize = 102400
agent1.sinks.sink1.hdfs.rollCount = 1000000
agent1.sinks.sink1.hdfs.rollInterval = 60
#agent1.sinks.sink1.hdfs.round = true
#agent1.sinks.sink1.hdfs.roundValue = 10
#agent1.sinks.sink1.hdfs.roundUnit = minute
agent1.sinks.sink1.hdfs.useLocalTimeStamp = true
# Use a channel which buffers events in memory
agent1.channels.channel1.type = memory
agent1.channels.channel1.keep-alive = 120
agent1.channels.channel1.capacity = 500000
agent1.channels.channel1.transactionCapacity = 600
# Bind the source and sink to the channel
agent1.sources.source1.channels = channel1
agent1.sinks.sink1.channel = channel1
Start command (run from the Flume installation root):
bin/flume-ng agent -c conf -f conf/tail-hdfs.conf -n agent1 -Dflume.root.logger=INFO,console
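To verify, append a line to the monitored log and confirm that new files appear under the configured HDFS path (assuming an HDFS client on this host):
echo "test line $(date)" >> /logs/server.log
hdfs dfs -ls hdfs://10.1.20.174:8020/web_log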
3. End-to-end example
Requirement: collect the log file produced on hadoop02, forward it to hadoop01, and let hadoop01 save it to HDFS.
Steps:
1. Write the Java EE application and configure log4j with the corresponding log format.
2. Package the application and deploy it to the hadoop02 server (the Java EE server).
3. Start Flume.
3.1 On hadoop01 (the HDFS side), start a Flume agent with avro-hdfs.conf:
[root@cloudera1 conf]# vim avro-hdfs.conf
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# Describe/configure the source
a1.sources.r1.type = avro
a1.sources.r1.bind = 10.1.20.174
a1.sources.r1.port = 4141
# Describe the sink
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = /web_log
a1.sinks.k1.hdfs.filePrefix = server.log
a1.sinks.k1.hdfs.round = true
a1.sinks.k1.hdfs.roundValue = 10
a1.sinks.k1.hdfs.roundUnit = minute
a1.sinks.k1.hdfs.rollInterval = 3
a1.sinks.k1.hdfs.rollSize = 20
a1.sinks.k1.hdfs.rollCount = 5
a1.sinks.k1.hdfs.batchSize = 1
a1.sinks.k1.hdfs.useLocalTimeStamp = true
a1.sinks.k1.hdfs.fileType = DataStream
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
Start it:
[root@cloudera1 flume-1.7.0]# bin/flume-ng agent -c conf -f conf/avro-hdfs.conf -n a1 -Dflume.root.logger=INFO,console
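Once this agent is running, the avro source should be listening on port 4141; a quick check on hadoop01 (assuming netstat is installed):
netstat -tlnp | grep 4141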
3.2 On hadoop02, start an agent with tail-avro.conf:
[root@cloudera2 conf]# vim tail-avro.conf
# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# Describe/configure the source
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /logs/server.log
# Describe the sink
a1.sinks.k1.type = avro
a1.sinks.k1.hostname = 10.1.20.174
a1.sinks.k1.port = 4141
a1.sinks.k1.batch-size = 2
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
Start it:
[root@cloudera2 flume-1.7.0]# bin/flume-ng agent -c conf -f conf/tail-avro.conf -n a1 -Dflume.root.logger=INFO,console
4. Package and deploy the web application
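With both agents running and the web application from step 4 deployed on hadoop02, an end-to-end check is to hit the servlet and then look for new files on HDFS; the URL below is a hypothetical example, adjust the host, port, and context path to your Tomcat deployment:
curl "http://hadoop02:8080/webapp/item?id=1001"   # hypothetical context path
tail -n 5 /logs/server.log                        # on hadoop02: the record written by log4j
hdfs dfs -ls /web_log                             # on hadoop01: files delivered by the hdfs sink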
package com.bocai;

import java.io.IOException;

import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import org.apache.log4j.Logger;

@WebServlet("/item")
public class ItemServlet extends HttpServlet {

    private static final long serialVersionUID = -586155548844563441L;

    public static Logger logger = Logger.getLogger(ItemServlet.class);

    @Override
    protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {
        // One tab-separated access record per request; log4j routes it to /logs/server.log,
        // which the exec source on hadoop02 follows with tail -F
        logger.info(req.getRemoteAddr() + "\t" + req.getRequestURL() + "\t" + req.getParameter("id") + "\t" + "item");
        resp.getWriter().print("You have visited the #### system");
    }

    @Override
    protected void doPost(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {
        // POST is not used in this example
    }
}
log4j.properties
log4j.rootLogger=INFO,Console,DailyRollingFile
#Console
log4j.appender.Console=org.apache.log4j.ConsoleAppender
log4j.appender.Console.layout=org.apache.log4j.PatternLayout
log4j.appender.Console.layout.ConversionPattern=%-d{yyyy-MM-dd HH\:mm\:ss} %m %n
log4j.appender.DailyRollingFile=org.apache.log4j.DailyRollingFileAppender
log4j.appender.DailyRollingFile.layout=org.apache.log4j.PatternLayout
log4j.appender.DailyRollingFile.layout.ConversionPattern=%-d{yyyy-MM-dd HH\:mm\:ss} %m %n
log4j.appender.DailyRollingFile.Append=true
log4j.appender.DailyRollingFile.DatePattern ='_'yyyy-MM-dd'.log'
log4j.appender.DailyRollingFile.File=/logs/server.log
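The servlet and the log4j configuration above need the following libraries on the web application's classpath; a minimal Maven sketch (version numbers are assumptions, adjust to your environment; the servlet API is provided by the container):
<dependency>
    <groupId>log4j</groupId>
    <artifactId>log4j</artifactId>
    <version>1.2.17</version>
</dependency>
<dependency>
    <groupId>javax.servlet</groupId>
    <artifactId>javax.servlet-api</artifactId>
    <version>3.1.0</version>
    <scope>provided</scope>
</dependency>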