讲解一下如何自定义一个 Flume 的 Sink。其实很简单:下面是一个自定义 Sink 的 Demo,它将数据写入到 HDFS。
package death.flume;
import java.io.IOException;
import java.net.URI;
import java.text.SimpleDateFormat;
import java.util.Date;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import com.google.common.base.Preconditions;
import org.apache.flume.*;
import org.apache.flume.conf.Configurable;
import org.apache.flume.sink.AbstractSink;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
/**
*自定义Sink首先要继承AbstractSink抽象类,实现Configurable接口,并实现相应的方法
*/
/**
 * A custom Flume sink that appends event bodies to date-partitioned files on HDFS.
 *
 * <p>A custom sink extends {@link AbstractSink} and implements {@link Configurable}.
 * Flume calls {@link #configure(Context)} once at initialization, then {@link #start()},
 * then {@link #process()} repeatedly until {@link #stop()}.
 */
public class FlumeSinkDemo extends AbstractSink implements Configurable {
    private String hdfsURI;     // e.g. hdfs://hostname:port
    private String username;    // HDFS user to connect as
    private String dataDir;     // base output directory on HDFS
    private String dateFormat;  // SimpleDateFormat pattern used to partition files by date
    private URI uri;
    private Configuration conf;
    private FileSystem fileSystem;
    private FSDataOutputStream out = null;

    /**
     * Takes at most one event from the channel per call and appends its body to an
     * HDFS file named after the current date.
     *
     * @return {@link Status#READY} if an event was written; {@link Status#BACKOFF}
     *         if the channel was empty, so Flume throttles polling instead of this
     *         sink busy-waiting inside an open transaction
     * @throws EventDeliveryException if the write fails; the transaction is rolled
     *         back so the event remains in the channel for redelivery
     */
    @Override
    public Status process() throws EventDeliveryException {
        // SimpleDateFormat is not thread-safe, so create a fresh one per call.
        String fileDate = new SimpleDateFormat(dateFormat).format(new Date());
        String filePath = dataDir + "/" + fileDate + "/" + fileDate + "-log.txt";
        Channel channel = getChannel();
        Transaction ts = channel.getTransaction();
        ts.begin();
        try {
            Event event = channel.take();
            if (event == null) {
                // Channel is empty: commit the empty transaction and signal BACKOFF
                // instead of spinning in a busy loop until data arrives.
                ts.commit();
                return Status.BACKOFF;
            }
            fileSystem = FileSystem.get(uri, conf, username);
            Path path = new Path(filePath);
            if (!fileSystem.exists(path)) {
                fileSystem.createNewFile(path);
            }
            // NOTE(review): append() requires the cluster to support appends
            // (dfs.support.append) — confirm for your HDFS version.
            out = fileSystem.append(path);
            // Write the raw body bytes directly; no String round-trip, so no
            // platform-default-charset corruption of the payload.
            out.write(event.getBody());
            // Close (and thereby flush) before committing so the data is durable
            // before the event is removed from the channel.
            out.close();
            out = null;
            fileSystem.close();
            fileSystem = null;
            ts.commit();
            return Status.READY;
        } catch (Throwable th) {
            ts.rollback();
            if (th instanceof Error) {
                throw (Error) th;
            }
            throw new EventDeliveryException(th);
        } finally {
            ts.close();
            // Failure-path cleanup only: on success these were closed and nulled
            // above, so there is no double-close here.
            try {
                if (out != null) {
                    out.close();
                    out = null;
                }
                if (fileSystem != null) {
                    fileSystem.close();
                    fileSystem = null;
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Reads this sink's settings from the agent configuration. Called once when
     * the sink is initialized. Expected properties, e.g.:
     * <pre>
     * customelog.sinks.sink1.type=death.flume.FlumeSinkDemo
     * customelog.sinks.sink1.channel=channel1
     * customelog.sinks.sink1.hdfsURI=hdfs://hostname:port
     * customelog.sinks.sink1.username=hdfs
     * customelog.sinks.sink1.dataDir=/death/data_sampling
     * customelog.sinks.sink1.dateFormat=yyyy-MM-dd
     * </pre>
     * Use lower-case {@code yyyy}: upper-case {@code YYYY} is the week-based year
     * in {@code SimpleDateFormat} and yields wrong dates around New Year.
     */
    @Override
    public void configure(Context context) {
        hdfsURI = context.getString("hdfsURI");
        Preconditions.checkNotNull(hdfsURI, "hdfsURI must be set");
        username = context.getString("username");
        Preconditions.checkNotNull(username, "username must be set");
        dataDir = context.getString("dataDir");
        // BUG FIX: the original passed only the message string, so checkNotNull
        // validated the literal instead of dataDir and a missing value slipped through.
        Preconditions.checkNotNull(dataDir, "dataDir must be set");
        dateFormat = context.getString("dateFormat");
        Preconditions.checkNotNull(dateFormat, "dateFormat must be set");
    }

    /** Called when the sink starts: parses the HDFS URI and prepares the Hadoop config. */
    @Override
    public synchronized void start() {
        super.start();
        try {
            uri = new URI(hdfsURI);
            conf = new Configuration();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Called when the sink is taken out of service. */
    @Override
    public synchronized void stop() {
        super.stop();
    }
}
编写好Sink代码之后,打成jar包,放到FLUME_HOME/lib下,就可以调用了。下面是调用自定义Sink的一些简单配置。
### Sink Configuration
customelog.sinks.sink1.type=death.flume.FlumeSinkDemo
customelog.sinks.sink1.channel=channel1
customelog.sinks.sink1.hdfsURI=hdfs://cxhadoop
customelog.sinks.sink1.username=hdfs
customelog.sinks.sink1.dataDir=/death/data_sampling
customelog.sinks.sink1.dateFormat=yyyy-MM-dd
customelog.sinks.sink1.flumeBatchSize=2000