Flume performance on our production cluster has been consistently poor, so I have been digging into Flume NG recently.
Here Flume NG is used to process a log file and insert the records into HBase. Without further ado, straight to the code:
The configuration file:
#Name the components on this agent
agent1.sources = source1
agent1.sinks = sink1
agent1.channels = channel1
agent1.sources.source1.type = exec
agent1.sources.source1.command = cat /home/test.log
agent1.sources.source1.batchSize = 100
agent1.sources.source1.channels = channel1
# Configure Hbase Sink sink1
agent1.sinks.sink1.type = org.apache.flume.sink.hbase.HBaseSink
agent1.sinks.sink1.channel = channel1
agent1.sinks.sink1.table = longer_60
agent1.sinks.sink1.columnFamily = info
agent1.sinks.sink1.serializer = org.apache.flume.sink.hbase.SimpleHbaseEventSerializerPerson
agent1.sinks.sink1.serializer.payloadColumn = test
# Use a channel which buffers events in memory
agent1.channels.channel1.type = memory
agent1.channels.channel1.capacity = 800
agent1.channels.channel1.transactionCapacity = 100
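Two caveats about this configuration. The exec source runs cat once and stops at end-of-file, which is fine for a one-shot benchmark like this one; to tail a growing log continuously you would normally point the command at tail instead, e.g.:

agent1.sources.source1.command = tail -F /home/test.log

Also, HBaseSink does not create the target table, so it must exist before the agent starts, for example via the HBase shell:

create 'longer_60', 'info'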
SimpleHbaseEventSerializerPerson.java is a custom serializer that splits each log line into HBase row data; it was adapted directly from SimpleHbaseEventSerializer.java:
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.flume.sink.hbase;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.FlumeException;
import org.apache.flume.conf.ComponentConfiguration;
import org.apache.hadoop.hbase.client.Increment;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Row;
import org.apache.hadoop.hbase.util.Bytes;
import com.google.common.base.Charsets;
/**
* A simple serializer that returns puts from an event, by writing the event
* body into it. The headers are discarded. It also updates a row in hbase which
* acts as an event counter.
*
* Takes optional parameters:
* <p>
* <tt>rowPrefix:</tt> The prefix to be used. Default: <i>default</i>
* <p>
* <tt>incrementRow</tt> The row to increment. Default: <i>incRow</i>
* <p>
* <tt>suffix:</tt> <i>uuid/random/timestamp.</i> Default: <i>uuid</i>
* <p>
*
* Mandatory parameters:
* <p>
* <tt>cf:</tt> Column family.
* <p>
* Components that have no defaults and will not be used if null:
* <tt>payloadColumn:</tt> Which column to put payload in. If it is null, event
* data will not be written.
* <p>
* <tt>incColumn:</tt> Which column to increment. Null means no column is
* incremented.
*/
public class SimpleHbaseEventSerializerPerson implements HbaseEventSerializer {
private String rowPrefix;
private byte[] incrementRow;
private byte[] cf;
private byte[] plCol;
private byte[] incCol;
private KeyType keyType;
private byte[] payload;
private String interfaceName;
public SimpleHbaseEventSerializerPerson() {
}
@Override
public void configure(Context context) {
rowPrefix = context.getString("rowPrefix", "default");
incrementRow = context.getString("incrementRow", "incRow").getBytes(
Charsets.UTF_8);
String suffix = context.getString("suffix", "uuid");
String payloadColumn = context.getString("payloadColumn");
String incColumn = context.getString("incrementColumn");
if (payloadColumn != null && !payloadColumn.isEmpty()) {
if (suffix.equals("timestamp")) {
keyType = KeyType.TS;
} else if (suffix.equals("random")) {
keyType = KeyType.RANDOM;
} else if (suffix.equals("nano")) {
keyType = KeyType.TSNANO;
} else {
keyType = KeyType.UUID;
}
plCol = payloadColumn.getBytes(Charsets.UTF_8);
}
if (incColumn != null && !incColumn.isEmpty()) {
incCol = incColumn.getBytes(Charsets.UTF_8);
}
}
@Override
public void configure(ComponentConfiguration conf) {
}
@Override
public void initialize(Event event, byte[] cf) {
this.payload = event.getBody();
this.cf = cf;
}
@Override
public List<Row> getActions() throws FlumeException {
List<Row> actions = new LinkedList<Row>();
if (plCol != null) {
byte[] rowKey;
try {
if (payload.length != 0) {
String data = new String(payload, Charsets.UTF_8);
// Extract the interface name from the request path,
// e.g. "POST /login.php?gid=..." yields "login".
String inter = data.split("POST")[1].split("\\.php")[0]
.split("/")[1];
// Keep the query string from "gid=" up to the next space and
// rewrite it as "gid:v1,k2:v2,..." for SpliDataObject.
data = "gid:"
+ data.split("gid=")[1].split(" ")[0].replaceAll(
"&", ",").replaceAll("=", ":");
SpliDataObject jsoData = new SpliDataObject(data);
jsoData.put("interface", inter);
String time = jsoData.getString("time").substring(0, 10);
// Row key prefix: <date>-dept-sid-<time>-
// (assuming the author's timeConvert returns yyyyMMddHHmmss)
String converted = CusDataUtil.timeConvert(time);
StringBuilder sb = new StringBuilder();
sb.append(converted.substring(0, 8)).append('-');
sb.append(jsoData.getString("dept")).append('-');
sb.append(jsoData.getString("sid")).append('-');
sb.append(converted.substring(8, 14)).append('-');
rowPrefix = sb.toString();
if (keyType == KeyType.TS) {
rowKey = SimpleRowKeyGenerator.getTimestampKey(rowPrefix);
} else if (keyType == KeyType.RANDOM) {
rowKey = SimpleRowKeyGenerator.getRandomKey(rowPrefix);
} else if (keyType == KeyType.TSNANO) {
rowKey = SimpleRowKeyGenerator
.getNanoTimestampKey(rowPrefix);
} else {
rowKey = SimpleRowKeyGenerator.getUUIDKey(rowPrefix);
}
Put p = new Put(rowKey);
Iterator keys = jsoData.keys();
this.interfaceName = jsoData.getString("interface");
while (keys.hasNext()) {
String tempKey = (String) keys.next();
if ("interface".equals(tempKey)) {
continue; // kept in memory only, not written to HBase
}
if ("time".equals(tempKey)) {
p.add(cf, Bytes.toBytes(tempKey), Bytes.toBytes(time));
} else {
p.add(cf, Bytes.toBytes(tempKey),
Bytes.toBytes(String.valueOf(jsoData.get(tempKey))));
}
}
actions.add(p);
}
} catch (Exception e) {
// Lines that do not match the expected format are silently skipped;
// re-throwing here would make the sink retry the whole batch.
// throw new FlumeException("Could not get row key!", e);
}
}
return actions;
}
@Override
public List<Increment> getIncrements() {
List<Increment> incs = new LinkedList<Increment>();
if (incCol != null) {
Increment inc = new Increment(incrementRow);
inc.addColumn(cf, incCol, 1);
incs.add(inc);
}
return incs;
}
@Override
public void close() {
}
public enum KeyType {
UUID, RANDOM, TS, TSNANO;
}
}
One thing worth noting: the rowkey generation still borrows the scheme from SimpleHbaseEventSerializer.java, only slightly modified. When I tried a fully custom rowkey scheme instead, performance dropped by several dozen times, so I have to tip my hat to the Java veterans behind the original.
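For illustration, here is a minimal sketch of how the prefix built above turns into a final rowkey. The input line format and the sample values are assumptions made up for the example; SimpleRowKeyGenerator is the stock Flume helper already used in the serializer:

import org.apache.flume.sink.hbase.SimpleRowKeyGenerator;

public class RowKeyDemo {
    public static void main(String[] args) throws Exception {
        // Assumed input line: "POST /login.php?gid=1&dept=7&sid=42&time=1385000000 ..."
        // The serializer would build a prefix of the form yyyyMMdd-dept-sid-HHmmss-
        String prefix = "20131121-7-42-083015-"; // hypothetical values
        // getUUIDKey appends a random UUID, so the final key looks like
        // 20131121-7-42-083015-550e8400-e29b-41d4-a716-446655440000
        byte[] rowKey = SimpleRowKeyGenerator.getUUIDKey(prefix);
        System.out.println(new String(rowKey, "UTF-8"));
    }
}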
Performance test results:
test.log is about 55 MB with 220,617 lines in total. Running on a virtual machine, the job took about 61 seconds, with CPU usage holding at around 1/8 and memory at around 4%. That works out to 220,617 / 61, i.e. over 3,600 rows written per second; deployed on the production cluster the numbers should be even better.
Finally, an example of a custom Flume NG source:
import org.apache.flume.Channel;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.EventDeliveryException;
import org.apache.flume.PollableSource;
import org.apache.flume.Transaction;
import org.apache.flume.conf.Configurable;
import org.apache.flume.source.AbstractSource;

public class MySource extends AbstractSource implements Configurable, PollableSource {
private String myProp;
@Override
public void configure(Context context) {
String myProp = context.getString("myProp", "defaultValue");
// Process the myProp value (e.g. validation, convert to another type, ...)
// Store myProp for later retrieval by process() method
this.myProp = myProp;
}
@Override
public void start() {
// Initialize the connection to the external client
}
@Override
public void stop() {
// Disconnect from external client and do any additional cleanup
// (e.g. releasing resources or nulling-out field values) ..
}
@Override
public Status process() throws EventDeliveryException {
Status status = null;
// Start transaction
Channel ch = getChannel();
Transaction txn = ch.getTransaction();
txn.begin();
try {
// This try clause includes whatever Channel operations you want to do
// Receive new data (getSomeData() is a placeholder for your client logic)
Event e = getSomeData();
// Store the Event into this Source's associated Channel(s)
getChannelProcessor().processEvent(e);
txn.commit();
status = Status.READY;
} catch (Throwable t) {
txn.rollback();
// Log exception, handle individual exceptions as needed
status = Status.BACKOFF;
// re-throw all Errors
if (t instanceof Error) {
throw (Error)t;
}
} finally {
txn.close();
}
return status;
}
}
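To wire such a source into an agent, reference it in the configuration by its fully qualified class name and make sure its jar is on Flume's classpath (com.example.MySource and the property value below are hypothetical):

agent1.sources = source1
agent1.sources.source1.type = com.example.MySource
agent1.sources.source1.myProp = someValue
agent1.sources.source1.channels = channel1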