Flume performance on our production cluster has been consistently poor, so I have been digging into Flume NG recently.
Here Flume NG is used to process a log file and insert the records into HBase. Without further ado, straight to the code:
The configuration file:
#Name the components on this agent
agent1.sources = source1
agent1.sinks = sink1
agent1.channels = channel1
agent1.sources.source1.type = exec
agent1.sources.source1.command = cat /home/test.log
agent1.sources.source1.batchSize = 100
agent1.sources.source1.channels = channel1
# Configure Hbase Sink sink1
agent1.sinks.sink1.type = org.apache.flume.sink.hbase.HBaseSink
agent1.sinks.sink1.channel = channel1
agent1.sinks.sink1.table = longer_60
agent1.sinks.sink1.columnFamily = info
agent1.sinks.sink1.serializer = org.apache.flume.sink.hbase.SimpleHbaseEventSerializerPerson
agent1.sinks.sink1.serializer.payloadColumn = test
# Use a channel which buffers events in memory
agent1.channels.channel1.type = memory
agent1.channels.channel1.capacity = 800
agent1.channels.channel1.transactionCapacity = 100
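Two caveats about this configuration. The exec source runs cat once and stops at end-of-file, which is fine for a one-shot benchmark like this one; to tail a growing log continuously you would normally point the command at tail instead, e.g.:

agent1.sources.source1.command = tail -F /home/test.log

Also, HBaseSink does not create the target table, so it must exist before the agent starts, for example via the HBase shell:

create 'longer_60', 'info'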
SimpleHbaseEventSerializerPerson.java is a custom serializer that splits each log line into HBase row data; it was adapted directly from SimpleHbaseEventSerializer.java:
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.flume.sink.hbase;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.FlumeException;
import org.apache.flume.conf.ComponentConfiguration;
import org.apache.hadoop.hbase.client.Increment;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Row;
import org.apache.hadoop.hbase.util.Bytes;
import com.google.common.base.Charsets;
/**
* A simple serializer that returns puts from an event, by writing the event
* body into it. The headers are discarded. It also updates a row in hbase which
* acts as an event counter.
*
* Takes optional parameters:
* <p>
* <tt>rowPrefix:</tt> The prefix to be used. Default: <i>default</i>
* <p>
* <tt>incrementRow</tt> The row to increment. Default: <i>incRow</i>
* <p>
* <tt>suffix:</tt> <i>uuid/random/timestamp.</i> Default: <i>uuid</i>
* <p>
*
* Mandatory parameters:
* <p>
* <tt>cf:</tt> Column family.
* <p>
* Components that have no defaults and will not be used if null:
* <tt>payloadColumn:</tt> Which column to put payload in. If it is null, event
* data will not be written.
* <p>
* <tt>incColumn:</tt> Which column to increment. Null means no column is
* incremented.
*/
public class SimpleHbaseEventSerializerPerson implements HbaseEventSerializer {
private String rowPrefix;
private byte[] incrementRow;
private byte[] cf;
private byte[] plCol;
private byte[] incCol;
private KeyType keyType;
private byte[] payload;
private String interfaceName;
public SimpleHbaseEventSerializerPerson() {
}
@Override
public void configure(Context context) {
rowPrefix = context.getString("rowPrefix", "default");
incrementRow = context.getString("incrementRow", "incRow").getBytes(
Charsets.UTF_8);
String suffix = context.getString("suffix", "uuid");
String payloadColumn = context.getString("payloadColumn");
String incColumn = context.getString("incrementColumn");
if (payloadColumn != null && !payloadColumn.isEmpty()) {
if (suffix.equals("timestamp")) {
keyType = KeyType.TS;
} else if (suffix.equals("random")) {
keyType = KeyType.RANDOM;
} else if (suffix.equals("nano")) {
keyType = KeyType.TSNANO;
} else {
keyType = KeyType.UUID;
}
plCol = payloadColumn.getBytes(Charsets.UTF_8);
}
if (incColumn != null && !incColumn.isEmpty()) {
incCol = incColumn.getBytes(Charsets.UTF_8);
}
}
@Override
public void configure(ComponentConfiguration conf) {
}
@Override
public void initialize(Event event, byte[] cf) {
this.payload = event.getBody();
this.cf = cf;
}
@Override
public List<Row> getActions() throws FlumeException {
List<Row> actions = new LinkedList<Row>();
if (plCol != null) {
byte[] rowKey;
try {
if (payload.length != 0) {
String data = new String(payload, Charsets.UTF_8);
// Extract the interface name from the request path,
// e.g. "POST /login.php?gid=..." yields "login".
String inter = data.split("POST")[1].split("\\.php")[0]
.split("/")[1];
// Keep the query string from "gid=" up to the next space and
// rewrite it as "gid:v1,k2:v2,..." for SpliDataObject.
data = "gid:"
+ data.split("gid=")[1].split(" ")[0].replaceAll(
"&", ",").replaceAll("=", ":");
SpliDataObject jsoData = new SpliDataObject(data);
jsoData.put("interface", inter);
String time = jsoData.getString("time").substring(0, 10);
// Row key prefix: <date>-dept-sid-<time>-
// (assuming the author's timeConvert returns yyyyMMddHHmmss)
String converted = CusDataUtil.timeConvert(time);
StringBuilder sb = new StringBuilder();
sb.append(converted.substring(0, 8)).append('-');
sb.append(jsoData.getString("dept")).append('-');
sb.append(jsoData.getString("sid")).append('-');
sb.append(converted.substring(8, 14)).append('-');
rowPrefix = sb.toString();
if (keyType == KeyType.TS) {
rowKey = SimpleRowKeyGenerator.getTimestampKey(rowPrefix);
} else if (keyType == KeyType.RANDOM) {
rowKey = SimpleRowKeyGenerator.getRandomKey(rowPrefix);
} else if (keyType == KeyType.TSNANO) {
rowKey = SimpleRowKeyGenerator
.getNanoTimestampKey(rowPrefix);
} else {
rowKey = SimpleRowKeyGenerator.getUUIDKey(rowPrefix);
}
Put p = new Put(rowKey);
Iterator keys = jsoData.keys();
this.interfaceName = jsoData.getString("interface");
while (keys.hasNext()) {
String tempKey = (String) keys.next();
if ("interface".equals(tempKey)) {
continue; // kept in memory only, not written to HBase
}
if ("time".equals(tempKey)) {
p.add(cf, Bytes.toBytes(tempKey), Bytes.toBytes(time));
} else {
p.add(cf, Bytes.toBytes(tempKey),
Bytes.toBytes(String.valueOf(jsoData.get(tempKey))));
}
}
actions.add(p);
}
} catch (Exception e) {
// Lines that do not match the expected format are silently skipped;
// re-throwing here would make the sink retry the whole batch.
// throw new FlumeException("Could not get row key!", e);
}
}
return actions;
}
@Override
public List<Increment> getIncrements() {
List<Increment> incs = new LinkedList<Increment>();
if (incCol != null) {
Increment inc = new Increment(incrementRow);
inc.addColumn(cf, incCol, 1);
incs.add(inc);
}
return incs;
}
@Override
public void close() {
}
public enum KeyType {
UUID, RANDOM, TS, TSNANO;
}
}
One thing worth noting: the rowkey generation still borrows the scheme from SimpleHbaseEventSerializer.java, only slightly modified. When I tried a fully custom rowkey scheme instead, performance dropped by several dozen times, so I have to tip my hat to the Java veterans behind the original.
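For illustration, here is a minimal sketch of how the prefix built above turns into a final rowkey. The input line format and the sample values are assumptions made up for the example; SimpleRowKeyGenerator is the stock Flume helper already used in the serializer:

import org.apache.flume.sink.hbase.SimpleRowKeyGenerator;

public class RowKeyDemo {
    public static void main(String[] args) throws Exception {
        // Assumed input line: "POST /login.php?gid=1&dept=7&sid=42&time=1385000000 ..."
        // The serializer would build a prefix of the form yyyyMMdd-dept-sid-HHmmss-
        String prefix = "20131121-7-42-083015-"; // hypothetical values
        // getUUIDKey appends a random UUID, so the final key looks like
        // 20131121-7-42-083015-550e8400-e29b-41d4-a716-446655440000
        byte[] rowKey = SimpleRowKeyGenerator.getUUIDKey(prefix);
        System.out.println(new String(rowKey, "UTF-8"));
    }
}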
Performance test results:
test.log is about 55 MB with 220,617 lines in total. Running on a virtual machine, the job took about 61 seconds, with CPU usage holding at around 1/8 and memory at around 4%. That works out to 220,617 / 61, i.e. over 3,600 rows written per second; deployed on the production cluster the numbers should be even better.
Finally, an example of a custom Flume NG source:
import org.apache.flume.Channel;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.EventDeliveryException;
import org.apache.flume.PollableSource;
import org.apache.flume.Transaction;
import org.apache.flume.conf.Configurable;
import org.apache.flume.source.AbstractSource;

public class MySource extends AbstractSource implements Configurable, PollableSource {
private String myProp;
@Override
public void configure(Context context) {
String myProp = context.getString("myProp", "defaultValue");
// Process the myProp value (e.g. validation, convert to another type, ...)
// Store myProp for later retrieval by process() method
this.myProp = myProp;
}
@Override
public void start() {
// Initialize the connection to the external client
}
@Override
public void stop() {
// Disconnect from external client and do any additional cleanup
// (e.g. releasing resources or nulling-out field values) ..
}
@Override
public Status process() throws EventDeliveryException {
Status status = null;
// Start transaction
Channel ch = getChannel();
Transaction txn = ch.getTransaction();
txn.begin();
try {
// This try clause includes whatever Channel operations you want to do
// Receive new data (getSomeData() is a placeholder for your client logic)
Event e = getSomeData();
// Store the Event into this Source's associated Channel(s)
getChannelProcessor().processEvent(e);
txn.commit();
status = Status.READY;
} catch (Throwable t) {
txn.rollback();
// Log exception, handle individual exceptions as needed
status = Status.BACKOFF;
// re-throw all Errors
if (t instanceof Error) {
throw (Error)t;
}
} finally {
txn.close();
}
return status;
}
}
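To wire such a source into an agent, reference it in the configuration by its fully qualified class name and make sure its jar is on Flume's classpath (com.example.MySource and the property value below are hypothetical):

agent1.sources = source1
agent1.sources.source1.type = com.example.MySource
agent1.sources.source1.myProp = someValue
agent1.sources.source1.channels = channel1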