1. Collecting a directory into HDFS
Requirement: a particular directory on a server keeps receiving new files; whenever a new file appears, it must be collected into HDFS.
Based on this requirement, first define the following three key elements:
- Collection source (source): a monitored spooling directory, i.e. the spooldir source
- Delivery target (sink): the HDFS file system, i.e. the hdfs sink
- Transfer channel between source and sink: either a file channel or a memory channel
Configuration file (spooldir-hdfs.conf):
agent1.sources = source1
agent1.sinks = sink1
agent1.channels = channel1
agent1.sources.source1.type = spooldir
agent1.sources.source1.spoolDir = /logs
agent1.sources.source1.fileHeader = false
agent1.sources.source1.interceptors = i1
agent1.sources.source1.interceptors.i1.type = host
agent1.sources.source1.interceptors.i1.hostHeader = hostname
# Charset of the input log files (only one value should be set)
#agent1.sources.source1.deserializer.outputCharset = UTF-8
agent1.sources.source1.deserializer.outputCharset = ISO-8859-1
# Do not collect files that are still being written (*.tmp)
agent1.sources.source1.ignorePattern = ^(.)*\\.tmp$
agent1.sinks.sink1.type = hdfs
agent1.sinks.sink1.hdfs.path = hdfs://10.1.20.174:8020/web_log
agent1.sinks.sink1.hdfs.filePrefix = log_
agent1.sinks.sink1.hdfs.maxOpenFiles = 5000
agent1.sinks.sink1.hdfs.batchSize = 100
agent1.sinks.sink1.hdfs.fileType = DataStream
agent1.sinks.sink1.hdfs.writeFormat = Text
agent1.sinks.sink1.hdfs.rollSize = 102400
agent1.sinks.sink1.hdfs.rollCount = 1000000
agent1.sinks.sink1.hdfs.rollInterval = 60
#agent1.sinks.sink1.hdfs.round = true
#agent1.sinks.sink1.hdfs.roundValue = 10
#agent1.sinks.sink1.hdfs.roundUnit = minute
agent1.sinks.sink1.hdfs.useLocalTimeStamp = true
agent1.sinks.sink1.hdfs.callTimeout=30000
# Use a channel which buffers events in memory
agent1.channels.channel1.type = memory
agent1.channels.channel1.keep-alive = 120
agent1.channels.channel1.capacity = 500000
agent1.channels.channel1.transactionCapacity = 600
# Bind the source and sink to the channel
agent1.sources.source1.channels = channel1
agent1.sinks.sink1.channel = channel1
Start command (run from the Flume installation root):
bin/flume-ng agent -c ./conf -f ./conf/spooldir-hdfs.conf -n agent1 -Dflume.root.logger=INFO,console
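For a quick check, drop a finished file into the spooling directory and list the target HDFS path (a minimal sketch; the sample file name is hypothetical, and an HDFS client is assumed on this host):
cp /tmp/access.log /logs/                         # hypothetical sample file
hdfs dfs -ls hdfs://10.1.20.174:8020/web_log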
2. Collecting an appended file into HDFS
Requirement: a business application writes its log with log4j and the log keeps growing; the data appended to the log file must be collected into HDFS in near real time.
Based on this requirement, first define the following three key elements:
- Collection source (source): monitor appended file content with an exec source running 'tail -F file'
- Delivery target (sink): the HDFS file system, i.e. the hdfs sink
- Transfer channel between source and sink: either a file channel or a memory channel
Configuration file (tail-hdfs.conf):
agent1.sources = source1
agent1.sinks = sink1
agent1.channels = channel1
# Describe/configure tail -F source1
agent1.sources.source1.type = exec
agent1.sources.source1.command = tail -F /logs/server.log
#configure host for source
agent1.sources.source1.interceptors = i1
agent1.sources.source1.interceptors.i1.type = host
agent1.sources.source1.interceptors.i1.hostHeader = hostname
# Describe sink1
agent1.sinks.sink1.type = hdfs
agent1.sinks.sink1.hdfs.path = hdfs://10.1.20.174:8020/web_log
agent1.sinks.sink1.hdfs.filePrefix = server.log
agent1.sinks.sink1.hdfs.maxOpenFiles = 5000
agent1.sinks.sink1.hdfs.batchSize = 100
agent1.sinks.sink1.hdfs.fileType = DataStream
agent1.sinks.sink1.hdfs.writeFormat = Text
agent1.sinks.sink1.hdfs.rollSize = 102400
agent1.sinks.sink1.hdfs.rollCount = 1000000
agent1.sinks.sink1.hdfs.rollInterval = 60
#agent1.sinks.sink1.hdfs.round = true
#agent1.sinks.sink1.hdfs.roundValue = 10
#agent1.sinks.sink1.hdfs.roundUnit = minute
agent1.sinks.sink1.hdfs.useLocalTimeStamp = true
# Use a channel which buffers events in memory
agent1.channels.channel1.type = memory
agent1.channels.channel1.keep-alive = 120
agent1.channels.channel1.capacity = 500000
agent1.channels.channel1.transactionCapacity = 600
# Bind the source and sink to the channel
agent1.sources.source1.channels = channel1
agent1.sinks.sink1.channel = channel1
Start command (run from the Flume installation root):
bin/flume-ng agent -c conf -f conf/tail-hdfs.conf -n agent1 -Dflume.root.logger=INFO,console
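To verify, append a line to the monitored log and confirm that new files appear under the configured HDFS path (assuming an HDFS client on this host):
echo "test line $(date)" >> /logs/server.log
hdfs dfs -ls hdfs://10.1.20.174:8020/web_log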
3. End-to-end example
Requirement: collect the log file produced on hadoop02, forward it to hadoop01, and let hadoop01 save it to HDFS.
Steps:
1. Write the Java EE application and configure log4j with the corresponding log format.
2. Package the application and deploy it to the hadoop02 server (the Java EE server).
3. Start Flume.
3.1 On hadoop01 (the HDFS side), start a Flume agent with avro-hdfs.conf:
[root@cloudera1 conf]# vim avro-hdfs.conf
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# Describe/configure the source
a1.sources.r1.type = avro
a1.sources.r1.bind = 10.1.20.174
a1.sources.r1.port = 4141
# Describe the sink
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = /web_log
a1.sinks.k1.hdfs.filePrefix = server.log
a1.sinks.k1.hdfs.round = true
a1.sinks.k1.hdfs.roundValue = 10
a1.sinks.k1.hdfs.roundUnit = minute
a1.sinks.k1.hdfs.rollInterval = 3
a1.sinks.k1.hdfs.rollSize = 20
a1.sinks.k1.hdfs.rollCount = 5
a1.sinks.k1.hdfs.batchSize = 1
a1.sinks.k1.hdfs.useLocalTimeStamp = true
a1.sinks.k1.hdfs.fileType = DataStream
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
Start it:
[root@cloudera1 flume-1.7.0]# bin/flume-ng agent -c conf -f conf/avro-hdfs.conf -n a1 -Dflume.root.logger=INFO,console
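Once this agent is running, the avro source should be listening on port 4141; a quick check on hadoop01 (assuming netstat is installed):
netstat -tlnp | grep 4141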
3.2 On hadoop02, start an agent with tail-avro.conf:
[root@cloudera2 conf]# vim tail-avro.conf
# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# Describe/configure the source
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /logs/server.log
# Describe the sink
a1.sinks.k1.type = avro
a1.sinks.k1.hostname = 10.1.20.174
a1.sinks.k1.port = 4141
a1.sinks.k1.batch-size = 2
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
Start it:
[root@cloudera2 flume-1.7.0]# bin/flume-ng agent -c conf -f conf/tail-avro.conf -n a1 -Dflume.root.logger=INFO,console
4. Package and deploy the web application
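With both agents running and the web application from step 4 deployed on hadoop02, an end-to-end check is to hit the servlet and then look for new files on HDFS; the URL below is a hypothetical example, adjust the host, port, and context path to your Tomcat deployment:
curl "http://hadoop02:8080/webapp/item?id=1001"   # hypothetical context path
tail -n 5 /logs/server.log                        # on hadoop02: the record written by log4j
hdfs dfs -ls /web_log                             # on hadoop01: files delivered by the hdfs sink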
package com.bocai;

import java.io.IOException;

import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import org.apache.log4j.Logger;

@WebServlet("/item")
public class ItemServlet extends HttpServlet {

    private static final long serialVersionUID = -586155548844563441L;

    public static Logger logger = Logger.getLogger(ItemServlet.class);

    @Override
    protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {
        // One tab-separated access record per request; log4j routes it to /logs/server.log,
        // which the exec source on hadoop02 follows with tail -F
        logger.info(req.getRemoteAddr() + "\t" + req.getRequestURL() + "\t" + req.getParameter("id") + "\t" + "item");
        resp.getWriter().print("You have visited the #### system");
    }

    @Override
    protected void doPost(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {
        // POST is not used in this example
    }
}
log4j.properties
log4j.rootLogger=INFO,Console,DailyRollingFile
#Console
log4j.appender.Console=org.apache.log4j.ConsoleAppender
log4j.appender.Console.layout=org.apache.log4j.PatternLayout
log4j.appender.Console.layout.ConversionPattern=%-d{yyyy-MM-dd HH\:mm\:ss} %m %n
log4j.appender.DailyRollingFile=org.apache.log4j.DailyRollingFileAppender
log4j.appender.DailyRollingFile.layout=org.apache.log4j.PatternLayout
log4j.appender.DailyRollingFile.layout.ConversionPattern=%-d{yyyy-MM-dd HH\:mm\:ss} %m %n
log4j.appender.DailyRollingFile.Append=true
log4j.appender.DailyRollingFile.DatePattern ='_'yyyy-MM-dd'.log'
log4j.appender.DailyRollingFile.File=/logs/server.log
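The servlet and the log4j configuration above need the following libraries on the web application's classpath; a minimal Maven sketch (version numbers are assumptions, adjust to your environment; the servlet API is provided by the container):
<dependency>
    <groupId>log4j</groupId>
    <artifactId>log4j</artifactId>
    <version>1.2.17</version>
</dependency>
<dependency>
    <groupId>javax.servlet</groupId>
    <artifactId>javax.servlet-api</artifactId>
    <version>3.1.0</version>
    <scope>provided</scope>
</dependency>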