HBaseMiniCluster
背景和介绍
在测试 HBase CDC 时使用了 HBase 的 Observer 和 Endpoint 协处理器。由于协处理器是运行在服务器端(即 RegionServer)上的代码,每次测试时都需要先打包上传到服务器,卸载旧的协处理器后再重新关联,非常麻烦,而且一旦出错也无法调试。
为了方便开发人员,HBase 以及其它 Hadoop 组件提供了 Mini Cluster 的运行方式:在一个 JVM 里模拟运行 Hadoop 集群,包括 HDFS、ZooKeeper、HBase 以及 MapReduce。如果你需要运行一些简单的代码或测试用例,或者想在 IDE 中通过断点来调试,那么用 Mini Cluster 就再合适不过了。HBase 自身的单元测试中已经频繁使用这种方式。
使用
POM
<!-- Maven dependencies needed to build the replication endpoint and run it against an in-JVM HBase mini cluster. -->
<dependencies>
<!-- HBase server-side classes; version 2.2.6, scope "provided". -->
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-server</artifactId>
<version>2.2.6</version>
<scope>provided</scope>
</dependency>
<!-- HBase testing utilities (presumably the source of HBaseTestingUtility used by the tests below). -->
<!-- NOTE(review): scope is "provided"; "test" may be the more conventional scope for a test-only artifact; confirm where the test classes live. -->
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-testing-util</artifactId>
<version>2.2.6</version>
<scope>provided</scope>
</dependency>
<!-- JUnit 4 for the @Before/@After/@Test annotations; test scope only. -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.13.1</version>
<scope>test</scope>
</dependency>
</dependencies>
创建Endpoint
/**
 * Replication endpoint that turns replicated WAL edits into JSON rows and hands them to
 * {@code BaseProducer} (presumably a Kafka producer wrapper; confirm against its definition).
 *
 * <p>For each batch passed to {@link #replicate(ReplicateContext)} the WAL entries are grouped by
 * table name, the cells of every entry are grouped by row key, and one JSON document per row is
 * produced with the table name as the first argument to {@code BaseProducer.produce}.
 */
public class HbaseEndpoint extends BaseReplicationEndpoint {
    private static final Logger LOG = LoggerFactory.getLogger(HbaseEndpoint.class);
    private static final ToHRowJson TO_HROW_JSON = new ToHRowJson();

    /**
     * Returns a newly generated random UUID on every call.
     *
     * @return a random UUID; it is not stable across calls
     */
    @Override
    public UUID getPeerUUID() {
        return UUID.randomUUID();
    }

    /**
     * Publishes every row contained in the given batch of WAL entries.
     *
     * @param context the replication batch supplied by HBase
     * @return always {@code true}; this method does not inspect the outcome of
     *     {@code BaseProducer.produce}
     */
    @Override
    public boolean replicate(ReplicateContext context) {
        final List<WAL.Entry> entries = context.getEntries();
        final Map<String, List<WAL.Entry>> entriesByTable = entries.stream()
            .collect(groupingBy(entry -> entry.getKey().getTableName().getNameAsString()));
        // Tables are handled one after another on the calling thread (the stream is sequential,
        // not parallel).
        entriesByTable.forEach((tableName, tableEntries) -> {
            LOG.info("table: {}", tableName);
            for (WAL.Entry tableEntry : tableEntries) {
                publishEntry(tableName, tableEntry);
            }
        });
        return true;
    }

    /** Groups the cells of one WAL entry by row key and produces one JSON message per row. */
    private static void publishEntry(String tableName, WAL.Entry walEntry) {
        final List<Cell> cells = walEntry.getEdit().getCells();
        final Map<String, List<Cell>> cellsByRow = cells.stream()
            .collect(groupingBy(cell -> Bytes.toString(CellUtil.cloneRow(cell))));
        cellsByRow.forEach((rowKey, rowCells) -> {
            final HRowJson rowJson = TO_HROW_JSON.apply(rowKey, rowCells);
            final String jsonResult = JSONObject.toJSONString(rowJson);
            LOG.info(jsonResult);
            BaseProducer.produce(tableName, jsonResult);
        });
    }

    /** Logs the start and begins the asynchronous service start-up. */
    @Override
    public void start() {
        LOG.info("Hbase replication to Kafka started at {}", LocalDate.now());
        this.startAsync();
    }

    /** Logs the stop and begins the asynchronous service shutdown. */
    @Override
    public void stop() {
        // The original message said "started" here, which made start and stop
        // indistinguishable in the logs.
        LOG.info("Hbase replication to Kafka stopped at {}", LocalDate.now());
        this.stopAsync();
    }

    /** Service start hook: nothing to initialise, so the service is reported as started. */
    @Override
    protected void doStart() {
        LOG.info("Hbase replication to Kafka doStarted at {}", LocalDate.now());
        notifyStarted();
    }

    /** Service stop hook: reports the service as stopped. */
    @Override
    protected void doStop() {
        LOG.info("Hbase replication to Kafka doStoped at {}", LocalDate.now());
        // NOTE(review): the producer is not closed here; the original had
        // producer.close() / BaseProducer.close() commented out. Confirm whether
        // BaseProducer needs an explicit close on shutdown.
        notifyStopped();
    }
}
创建测试类,使用MiniCluster调试
/**
 * Base class for tests that run against an in-JVM HBase mini cluster.
 *
 * <p>{@link #setUp()} starts the cluster before each test and {@link #tearDown()} shuts it down
 * afterwards. Subclasses can register the custom replication endpoint with
 * {@link #addPeer(Configuration, String, Map)}.
 */
public abstract class BaseTest {
    protected HBaseTestingUtility utility;
    /** Configuration shared with {@link #utility}; assigned in {@link #setUp()}. */
    protected Configuration hbaseConf;
    protected int numRegionServers;

    /**
     * Starts the mini cluster and records how many region server threads it runs.
     *
     * @throws Exception if the mini cluster fails to start
     */
    @Before
    public void setUp() throws Exception {
        // On Windows, uncomment the two lines below and point them at a local data directory
        // and a local Hadoop installation.
        //System.setProperty("test.build.data.basedirectory", "E:/Download/hbase-test");
        //System.setProperty("hadoop.home.dir", "F:/Dev/Hadoop-2.8.5");
        utility = new HBaseTestingUtility();
        // hbaseConf was used without ever being declared; take it from the utility so both
        // always refer to the same Configuration instance.
        hbaseConf = utility.getConfiguration();
        // The original left a note that REPLICATION_ENABLE_KEY was the previously used key.
        hbaseConf.setBoolean(HConstants.REPLICATION_BULKLOAD_ENABLE_KEY, true);
        utility.startMiniCluster();
        numRegionServers = utility.getHBaseCluster().getRegionServerThreads().size();
    }

    /**
     * Registers a replication peer whose endpoint implementation is {@link HbaseEndpoint}.
     *
     * @param configuration cluster configuration used to open the connection and to derive the
     *     ZooKeeper cluster key
     * @param peerName id of the replication peer to create
     * @param tableCFs tables and column families to replicate; when {@code null} or empty the
     *     peer keeps the builder's default table selection
     * @throws ReplicationException declared for compatibility with existing callers
     * @throws IOException if the connection cannot be opened or the peer cannot be added
     */
    protected void addPeer(final Configuration configuration, String peerName, Map<TableName, List<String>> tableCFs)
            throws ReplicationException, IOException {
        ReplicationPeerConfig peerConfig = ReplicationPeerConfig.newBuilder()
            .setClusterKey(ZKConfig.getZooKeeperClusterKey(configuration))
            .setReplicationEndpointImpl(HbaseEndpoint.class.getName())
            .build();
        // tableCFs used to be ignored; honour it when the caller actually supplied tables.
        if (tableCFs != null && !tableCFs.isEmpty()) {
            peerConfig = ReplicationPeerConfig.newBuilder(peerConfig)
                .setReplicateAllUserTables(false)
                .setTableCFsMap(tableCFs)
                .build();
        }
        // Close both the connection and the admin; the original leaked them on every call.
        try (Connection connection = ConnectionFactory.createConnection(configuration);
             Admin admin = connection.getAdmin()) {
            admin.addReplicationPeer(peerName, peerConfig);
        }
    }

    /**
     * Shuts the mini cluster down if the utility was created.
     *
     * @throws Exception if the shutdown fails
     */
    @After
    public void tearDown() throws Exception {
        if (utility != null) {
            utility.shutdownMiniCluster();
        }
    }
}
/**
 * Runs the custom replication endpoint inside the mini cluster: creates a replicated table,
 * registers the peer, writes a few rows and removes the peer again.
 *
 * <p>NOTE(review): the test makes no assertions and does not wait for replication to happen
 * before the peer is removed. It is useful for stepping through {@code HbaseEndpoint} with a
 * debugger; add a wait and assertions if it should verify delivery.
 */
public class TestKafkaReplication extends BaseTest {
    public static final Logger LOG = LoggerFactory.getLogger(TestKafkaReplication.class);
    private static final String PEER_NAME = "hbase.cdc.kafka";
    protected final TableName TABLE_NAME = TableName.valueOf("testings");
    protected final String ROWKEY = "rk-%s";
    protected final String COLUMN_FAMILY = "d";
    protected final String QUALIFIER = "q";
    protected final String VALUE = "v";

    /**
     * Creates the table, adds the peer, writes ten rows and finally removes the peer.
     *
     * @throws Exception if any step fails
     */
    @Test
    public void testCustomReplicationEndpoint() throws Exception {
        // Only remove the peer if it was really added; otherwise a failure in the cleanup
        // would hide the exception that caused the test to fail.
        boolean peerAdded = false;
        try {
            Map<TableName, List<String>> tableCfs = new HashMap<>();
            List<String> cfs = new ArrayList<>();
            cfs.add(COLUMN_FAMILY);
            tableCfs.put(TABLE_NAME, cfs);
            createTestTable();
            addPeer(utility.getConfiguration(), PEER_NAME, tableCfs);
            peerAdded = true;
            int numberOfRecords = 10;
            addData(numberOfRecords);
        } finally {
            if (peerAdded) {
                removePeer();
            }
        }
    }

    /**
     * Creates the HBase table with a single column family whose replication scope is global,
     * then waits until all of its regions are assigned.
     *
     * @throws IOException if the table cannot be created
     */
    private void createTestTable() throws IOException {
        try (HBaseAdmin hBaseAdmin = utility.getHBaseAdmin()) {
            final HTableDescriptor hTableDescriptor = new HTableDescriptor(TABLE_NAME);
            final HColumnDescriptor hColumnDescriptor = new HColumnDescriptor(COLUMN_FAMILY);
            hColumnDescriptor.setScope(HConstants.REPLICATION_SCOPE_GLOBAL);
            hTableDescriptor.addFamily(hColumnDescriptor);
            hBaseAdmin.createTable(hTableDescriptor);
        }
        utility.waitUntilAllRegionsAssigned(TABLE_NAME);
    }

    /**
     * Writes {@code numberOfRecords} rows with keys {@code rk-0 .. rk-(n-1)} into the test table.
     *
     * @param numberOfRecords number of rows to write
     * @throws IOException if the connection cannot be opened or a put fails
     */
    private void addData(int numberOfRecords) throws IOException {
        // The connection is a resource of its own; the original closed only the table and
        // leaked the connection.
        try (Connection connection = ConnectionFactory.createConnection(utility.getConfiguration());
             Table hTable = connection.getTable(TABLE_NAME)) {
            for (int i = 0; i < numberOfRecords; i++) {
                Put put = new Put(toBytes(String.format(ROWKEY, i)));
                put.addColumn(toBytes(COLUMN_FAMILY), toBytes(QUALIFIER), toBytes(VALUE));
                hTable.put(put);
            }
        }
    }

    /**
     * Removes the replication peer registered under {@code PEER_NAME}.
     *
     * @throws IOException if the admin client cannot be created or closed
     * @throws ReplicationException if the peer cannot be removed
     */
    private void removePeer() throws IOException, ReplicationException {
        try (ReplicationAdmin replicationAdmin = new ReplicationAdmin(utility.getConfiguration())) {
            replicationAdmin.removePeer(PEER_NAME);
        }
    }
}
问题
如果你直接使用上面的代码,或者从别的地方拷贝类似的代码,在 Windows 上运行时大概率会报下面这些错误:
- All datanodes are bad. Aborting…
java.io.IOException: All datanodes [DatanodeInfoWithStorage[127.0.0.1:2785,DS-384e5701-c6b0-453f-a79d-a14c96a12397,DISK]] are bad. Aborting...
at org.apache.hadoop.hdfs.DataStreamer.handleBadDatanode(DataStreamer.java:1530)
at org.apache.hadoop.hdfs.DataStreamer.setupPipelineForAppendOrRecovery(DataStreamer.java:1465)
at org.apache.hadoop.hdfs.DataStreamer.processDatanodeError(DataStreamer.java:1237)
at org.apache.hadoop.hdfs.DataStreamer.run(DataStreamer.java:657)
- EOFException: Unexpected EOF while trying to read response from server
java.io.EOFException: Unexpected EOF while trying to read response from server
at org.apache.hadoop.hdfs.protocolPB.PBHelperClient.vintPrefixed(PBHelperClient.java:402)
at org.apache.hadoop.hdfs.protocol.datatransfer.PipelineAck.readFields(PipelineAck.java:213)
at org.apache.hadoop.hdfs.DataStreamer$ResponseProcessor.run(DataStreamer.java:1073)
- 系统找不到指定的路径。
Caused by: 3: 系统找不到指定的路径。
at org.apache.hadoop.io.nativeio.NativeIO.renameTo0(Native Method)
at org.apache.hadoop.io.nativeio.NativeIO.renameTo(NativeIO.java:877)
at org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsDatasetImpl.moveBlockFiles(FsDatasetImpl.java:890)
... 7 more
- Failed to move meta file for ReplicaBeingWritten
IOException in BlockReceiver.run():
java.io.IOException: Failed to move meta file for ReplicaBeingWritten, blk_1073741828_1004, RBW
getNumBytes() = 7
getBytesOnDisk() = 7
getVisibleLength()= 7
getVolume() = F:\Cache\IDEA202001\WorkSpace\Wisers\SparkPro\test-hbase2-cdc\target\test-data\f54971a6-6884-3ae6-8bb0-03479e546d76\cluster_510be0ed-46bb-cf4d-096a-78f10d101f39\dfs\data\data2\current
getBlockFile() = F:\Cache\IDEA202001\WorkSpace\Wisers\SparkPro\test-hbase2-cdc\target\test-data\f54971a6-6884-3ae6-8bb0-03479e546d76\cluster_510be0ed-46bb-cf4d-096a-78f10d101f39\dfs\data\data2\current\BP-1066010076-127.0.0.1-1613700403468\current\rbw\blk_1073741828
bytesAcked=7
bytesOnDisk=7 from F:\Cache\IDEA202001\WorkSpace\Wisers\SparkPro\test-hbase2-cdc\target\test-data\f54971a6-6884-3ae6-8bb0-03479e546d76\cluster_510be0ed-46bb-cf4d-096a-78f10d101f39\dfs\data\data2\current\BP-1066010076-127.0.0.1-1613700403468\current\rbw\blk_1073741828_1004.meta to F:\Cache\IDEA202001\WorkSpace\Wisers\SparkPro\test-hbase2-cdc\target\test-data\f54971a6-6884-3ae6-8bb0-03479e546d76\cluster_510be0ed-46bb-cf4d-096a-78f10d101f39\dfs\data\data2\current\BP-1066010076-127.0.0.1-1613700403468\current\finalized\subdir0\subdir0\blk_1073741828_1004.meta
at org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsDatasetImpl.moveBlockFiles(FsDatasetImpl.java:892)
at org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.BlockPoolSlice.addBlock(BlockPoolSlice.java:315)
at org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsVolumeImpl.addFinalizedBlock(FsVolumeImpl.java:879)
at org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsDatasetImpl.finalizeReplica(FsDatasetImpl.java:1786)
at org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsDatasetImpl.finalizeBlock(FsDatasetImpl.java:1752)
at org.apache.hadoop.hdfs.server.datanode.BlockReceiver$PacketResponder.finalizeBlock(BlockReceiver.java:1441)
at org.apache.hadoop.hdfs.server.datanode.BlockReceiver$PacketResponder.run(BlockReceiver.java:1398)
at java.lang.Thread.run(Thread.java:748)
解决
参考:Running Hbase Testing Utility On Windows
由于我本地已经有了 Hadoop winutils 工具并配置好了环境变量,所以直接执行了该文章的第 4 步,即在 setUp 方法中增加两个本地路径的配置。也就是说,只需要将上面代码中 setUp 方法里的两行注释去掉即可:
System.setProperty("test.build.data.basedirectory", "E:/Download/hbase-test");
System.setProperty("hadoop.home.dir", "F:/Dev/Hadoop-2.8.5");