【HBase Coprocessor】Syncing HBase Data to Elasticsearch with a Coprocessor
1) Writing the code
1.1. POM dependencies
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>hbase-observer-es</groupId>
<artifactId>langya</artifactId>
<version>20190921</version>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<java.version>1.8</java.version>
<spark.cloudera.version>2.10.cloudera2</spark.cloudera.version>
<hbase.cloudera.version>1.2.0-cdh5.13.3</hbase.cloudera.version>
<maven-assembly-plugin.version>2.6</maven-assembly-plugin.version>
</properties>
<dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.40</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch</artifactId>
<version>7.6.1</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.elasticsearch.client</groupId>
<artifactId>transport</artifactId>
<version>7.6.1</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>1.2.0</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-server</artifactId>
<version>1.2.0</version>
<scope>provided</scope>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<configuration>
<testFailureIgnore>true</testFailureIgnore>
<skipTests>true</skipTests>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<version>${maven-assembly-plugin.version}</version>
<configuration>
<archive>
<manifest>
<mainClass></mainClass>
</manifest>
</archive>
<descriptors>
<descriptor>assembly.xml</descriptor>
</descriptors>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>8</source>
<target>8</target>
</configuration>
</plugin>
</plugins>
</build>
</project>
1.2. assembly.xml
<?xml version="1.0" encoding="UTF-8"?>
<assembly xmlns="http://maven.apache.org/ASSEMBLY/2.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/ASSEMBLY/2.0.0 http://maven.apache.org/xsd/assembly-2.0.0.xsd">
<id>test</id>
<formats>
<format>jar</format>
</formats>
<includeBaseDirectory>false</includeBaseDirectory>
<dependencySets>
<dependencySet>
<unpack>true</unpack>
<scope>runtime</scope>
</dependencySet>
</dependencySets>
<fileSets>
<fileSet>
<directory>${project.build.outputDirectory}</directory>
<outputDirectory>./</outputDirectory>
</fileSet>
</fileSets>
</assembly>
1.3. Elasticsearch-related code
1.3.1. ElasticsearchPoolUtil
package com.langya.elasticsearch.util; // package inferred from the wildcard import in ElasticsearchBulkOperator

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.TransportAddress;
import org.elasticsearch.transport.client.PreBuiltTransportClient;
import java.net.InetAddress;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
public class ElasticsearchPoolUtil {
private static final Log LOGGER = LogFactory.getLog(ElasticsearchPoolUtil.class);
public static Integer MAX_CONNECT_SIZE = 1; // maximum number of pooled connections
private static Integer MAX_RETRY_SIZE = 600; // maximum number of init retries
// static queue of pooled clients
private static LinkedList<TransportClient> clientQueue = null;
public static String CLUSTER_NAME = "itcast-es";
public static List<String> esHostTcpList = new ArrayList<>();
public synchronized static TransportClient getClient() throws Exception{
if(clientQueue ==null ){
clientQueue = new LinkedList<>();
for(int i=0;i<MAX_CONNECT_SIZE;i++){
clientQueue.push(clientPush());
}
}else if(clientQueue.size() == 0){
clientQueue.push(clientPush());
}
return clientQueue.poll();
}
public static TransportAddress[] initTransportAddress(){
TransportAddress[] transportAddresses = new TransportAddress[esHostTcpList.size()];
int offset = 0;
for(int i=0;i<esHostTcpList.size();i++){
String[] ipHost = esHostTcpList.get(i).split(":");
try{
transportAddresses[offset] = new TransportAddress(InetAddress.getByName(ipHost[0].trim()),Integer.valueOf(ipHost[1].trim()));
offset++;
}catch(Exception e){
LOGGER.error("exec init transport address error:",e);
}
}
return transportAddresses;
}
public static void pilotConnection(){
if(clientQueue == null){
return;
}
synchronized (clientQueue){
long startTime = System.currentTimeMillis();
LOGGER.warn("trimming connection pool, current size: " + clientQueue.size());
if(clientQueue.size() > MAX_CONNECT_SIZE){
clientQueue.getLast().close();
clientQueue.removeLast();
LOGGER.warn("closing the connection took " + (System.currentTimeMillis() - startTime) + " ms");
pilotConnection();
}
}
}
public static void destroyAllConnection(){
if(clientQueue == null){
return;
}
LOGGER.warn("destroying all connections, current size: " + clientQueue.size());
synchronized (clientQueue){
long startTime = System.currentTimeMillis();
if(clientQueue.size() > 0){
clientQueue.getLast().close();
clientQueue.removeLast();
LOGGER.warn("closing the connection took " + (System.currentTimeMillis() - startTime) + " ms");
destroyAllConnection(); // recurse until the queue is empty
}
}
}
private synchronized static TransportClient clientPush() throws Exception{
TransportClient client = null;
int upCount = 0;
while(client == null && upCount < MAX_RETRY_SIZE){
try{
client = init();
}catch(Exception e){
LOGGER.error("ES client init failed, retrying:", e);
Thread.sleep(100); // back off briefly before the next attempt
}
upCount++;
}
if(client == null){
throw new Exception("ES client init failed after " + MAX_RETRY_SIZE + " retries (~60s)");
}
return client;
}
public static TransportClient init() throws Exception{
// avoid Netty's "availableProcessors is already set" error when running inside another Netty application such as HBase
System.setProperty("es.set.netty.runtime.available.processors","false");
Settings esSettings = Settings.builder().put("cluster.name",CLUSTER_NAME).put("client.transport.sniff",false).build();
TransportClient client = new PreBuiltTransportClient(esSettings);
client.addTransportAddresses(initTransportAddress());
return client;
}
public synchronized static void returnClient(TransportClient client){
if(clientQueue ==null){
clientQueue = new LinkedList<>();
}
clientQueue.push(client);
}
}
1.3.2. ElasticsearchBulkOperator
package com.langya.elasticsearch.util; // assumed to live in the same package as the utility class above

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.elasticsearch.action.DocWriteRequest;
import org.elasticsearch.action.bulk.BulkRequestBuilder;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.delete.DeleteRequestBuilder;
import org.elasticsearch.action.update.UpdateRequestBuilder;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.unit.TimeValue;
import java.util.List;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
public class ElasticsearchBulkOperator {
private static final Log LOGGER = LogFactory.getLog(ElasticsearchBulkOperator.class);
private static final Integer BULK_TIME_OUT = 5; // bulk request timeout, in minutes
private static final Integer CORE_POOL_SIZE = 3; // scheduler thread pool size
private static final Integer PUT_INITIAL_DELAY = 10; // flush start delay, in seconds
private static final Integer PUT_PERIOD = 10; // flush period, in seconds
private static Integer idleTimeAccumulation = 0; // accumulated coprocessor idle time; once it reaches half an hour, all connections are released to relieve Elasticsearch and HBase
private static final Integer SERVICE_MAXIMUM_IDLE_TIME = 1800; // maximum service idle time, in seconds (default: half an hour)
private static final Integer DESTROY_CONNECTION_INITIAL_DELAY = 10; // in seconds
private static final Integer DESTROY_CONNECTION_PERIOD = 3; // in seconds; interval for checking and closing redundant connections
public static int MAX_BULK_COUNT = 10000; // maximum batch size, in rows; caps the request queue under sustained writes (can be overridden via a coprocessor install parameter)
public static int MIN_BULK_COUNT = 0; // minimum batch size, in rows; the periodic flush commits whenever the queue holds more than this, so slow trickles of data are not delayed until the queue hits the maximum (can be overridden via a coprocessor install parameter)
private static BulkRequestBuilder bulkRequestBuilder = null;
private static final Lock commitLock = new ReentrantLock();
private static ScheduledExecutorService scheduledExecutorService;
static {
initBulkRequestBuilder();
scheduledExecutorService = Executors.newScheduledThreadPool(CORE_POOL_SIZE);
final Runnable beeper = () -> run();
final Runnable destroyRedundantConnection = () -> destroyRedundantConnections();
scheduledExecutorService.scheduleAtFixedRate(beeper, PUT_INITIAL_DELAY, PUT_PERIOD, TimeUnit.SECONDS); // periodically flush queued requests to Elasticsearch
scheduledExecutorService.scheduleAtFixedRate(destroyRedundantConnection, DESTROY_CONNECTION_INITIAL_DELAY, DESTROY_CONNECTION_PERIOD, TimeUnit.SECONDS); // periodically close redundant Elasticsearch connections
}
private static void serviceIdleCheck() {
synchronized (bulkRequestBuilder) {
LOGGER.info("request queue size: " + bulkRequestBuilder.numberOfActions() + ", accumulated idle time: " + idleTimeAccumulation + "s");
if (bulkRequestBuilder.numberOfActions() == 0) {
if (idleTimeAccumulation < SERVICE_MAXIMUM_IDLE_TIME) {
idleTimeAccumulation += DESTROY_CONNECTION_PERIOD;
} else {
// idle for half an hour: release all pooled connections to relieve Elasticsearch and HBase
ElasticsearchPoolUtil.destroyAllConnection();
idleTimeAccumulation = 0;
}
} else {
idleTimeAccumulation = 0;
}
}
}
private static void initBulkRequestBuilder() {
try {
TransportClient client = ElasticsearchPoolUtil.getClient();
bulkRequestBuilder = client.prepareBulk();
bulkRequestBuilder.setTimeout(TimeValue.timeValueMinutes(BULK_TIME_OUT)); // set inside the try so a failed getClient() cannot NPE here
ElasticsearchPoolUtil.returnClient(client);
} catch (Exception e) {
LOGGER.error(e);
}
}
private static void run() {
commitLock.lock();
try {
bulkRequest(MIN_BULK_COUNT);
} catch (Exception ex) {
LOGGER.error("scheduled bulk index error: " + ex.getMessage());
} finally {
commitLock.unlock();
}
}
private static void destroyRedundantConnections() {
ElasticsearchPoolUtil.pilotConnection();
serviceIdleCheck();
}
public static void shutdownScheduEx() {
if (null != scheduledExecutorService && !scheduledExecutorService.isShutdown()) {
scheduledExecutorService.shutdown();
}
}
private static void bulkRequest(int threshold) {
if (bulkRequestBuilder.numberOfActions() > threshold) {
try {
BulkResponse bulkItemResponses = bulkRequestBuilder.execute().actionGet();
if (!bulkItemResponses.hasFailures()) {
// success: start a fresh builder for the next batch
TransportClient client = ElasticsearchPoolUtil.getClient();
bulkRequestBuilder = client.prepareBulk();
LOGGER.info("bulk committed");
ElasticsearchPoolUtil.returnClient(client);
}
// on partial failure the old builder is kept, so the whole batch is retried on the next flush
} catch (Exception ex) {
try {
// move the pending requests onto a fresh builder so they can be retried later
TransportClient client = ElasticsearchPoolUtil.getClient();
List<DocWriteRequest<?>> tempRequests = bulkRequestBuilder.request().requests();
bulkRequestBuilder = client.prepareBulk();
bulkRequestBuilder.request().add(tempRequests);
ElasticsearchPoolUtil.returnClient(client);
} catch (Exception es) {
LOGGER.error(es);
}
LOGGER.error(ex);
}
}
}
public static void addUpdateBuilderToBulk(UpdateRequestBuilder builder) {
commitLock.lock();
try {
bulkRequestBuilder.add(builder);
bulkRequest(MAX_BULK_COUNT);
} catch (Exception e) {
LOGGER.error(e);
} finally {
commitLock.unlock();
}
}
public static void addDeleteBuilderToBulk(DeleteRequestBuilder builder) {
commitLock.lock();
try {
bulkRequestBuilder.add(builder);
bulkRequest(MAX_BULK_COUNT);
} catch (Exception e) {
LOGGER.error(e);
} finally {
commitLock.unlock();
}
}
}
1.4. HBase-related code
1.4.1. HbaseDataSyncEsObserver
package com.langya.hbase.observer.util; // matches the class name used in the alter statement below

import com.langya.elasticsearch.util.*;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.CoprocessorEnvironment;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Durability;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.coprocessor.BaseRegionObserver;
import org.apache.hadoop.hbase.coprocessor.ObserverContext;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
import org.apache.hadoop.hbase.regionserver.wal.WALEdit;
import org.apache.hadoop.hbase.util.Bytes;
import org.elasticsearch.action.delete.DeleteRequestBuilder;
import org.elasticsearch.action.update.UpdateRequestBuilder;
import org.elasticsearch.client.transport.TransportClient;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
public class HbaseDataSyncEsObserver extends BaseRegionObserver {
private static final Log LOGGER = LogFactory.getLog(HbaseDataSyncEsObserver.class);
private String indexName;
private String indexType;
@Override
public void start(CoprocessorEnvironment e) throws IOException {
super.start(e);
initIndexConfiguration(e);
}
private void initIndexConfiguration(CoprocessorEnvironment e){
loadEsClientInfo(e);
}
private void loadEsClientInfo(CoprocessorEnvironment e){
// cluster name
ElasticsearchPoolUtil.CLUSTER_NAME = e.getConfiguration().get("cluster.name","itcast-es");
this.setIndexName(e.getConfiguration().get("indexName",""));
this.setIndexType(e.getConfiguration().get("indexType",""));
// the only value you normally need to change; multiple hosts are separated by "-"
// String esClientInfo = e.getConfiguration().get("esClientInfo","192.168.1.1:9300-192.168.1.2:9300");
String esClientInfo = e.getConfiguration().get("esClientInfo","10.70.71.52:9300");
String[] esClientInfoList = esClientInfo.split("-");
for(String esClientInfoTemp : esClientInfoList){
ElasticsearchPoolUtil.esHostTcpList.add(esClientInfoTemp);
}
}
@Override
public void stop(CoprocessorEnvironment e) throws IOException {
super.stop(e);
ElasticsearchBulkOperator.shutdownScheduEx();
ElasticsearchPoolUtil.destroyAllConnection();
}
@Override
public void postPut(ObserverContext<RegionCoprocessorEnvironment> e, Put put, WALEdit edit, Durability durability) throws IOException {
super.postPut(e, put, edit, durability);
String rowKey = Bytes.toString(put.getRow());
NavigableMap<byte[],List<Cell>> families = put.getFamilyCellMap();
Map<String,Object> result = new HashMap<>();
// flatten every cell of the Put into qualifier -> value pairs
for(Map.Entry<byte[],List<Cell>> entry : families.entrySet()){
for(Cell cell : entry.getValue()){
String key = Bytes.toString(CellUtil.cloneQualifier(cell));
String value = Bytes.toString(CellUtil.cloneValue(cell));
result.put(key,value);
}
}
try{
TransportClient transportClient = ElasticsearchPoolUtil.getClient();
UpdateRequestBuilder updateRequestBuilder = transportClient.prepareUpdate(this.indexName,this.indexType,rowKey);
updateRequestBuilder.setDoc(result);
updateRequestBuilder.setDocAsUpsert(true);
ElasticsearchPoolUtil.returnClient(transportClient);
ElasticsearchBulkOperator.addUpdateBuilderToBulk(updateRequestBuilder);
}catch (Exception e1){
LOGGER.error(e1);
}
}
@Override
public void postDelete(ObserverContext<RegionCoprocessorEnvironment> e, Delete delete, WALEdit edit, Durability durability) throws IOException {
super.postDelete(e, delete, edit, durability);
String rowKey = Bytes.toString(delete.getRow());
try{
TransportClient transportClient = ElasticsearchPoolUtil.getClient();
DeleteRequestBuilder deleteRequestBuilder = transportClient.prepareDelete(this.indexName,this.indexType,rowKey);
ElasticsearchPoolUtil.returnClient(transportClient);
ElasticsearchBulkOperator.addDeleteBuilderToBulk(deleteRequestBuilder);
}catch (Exception e1){
LOGGER.error(e1);
}
}
public void setIndexName(String indexName) {
this.indexName = indexName;
}
public void setIndexType(String indexType) {
this.indexType = indexType;
}
public String getIndexName() {
return indexName;
}
public String getIndexType() {
return indexType;
}
}
1.5. Packaging on Windows (cmd)
mvn assembly:assembly
2) Usage steps
2.1. Preparation
# enter the HBase shell
hbase shell
# create the HBase table with column family 'user'
create 'tbl_profile','user'
2.2. Installing the coprocessor
(1) Disable the table
disable 'tbl_profile'
(2) Attach the coprocessor
Parameter description:
- table_att: sets a table attribute
- coprocessor: the attribute to set; registers the coprocessor
- hdfs:///user/hbase/lib/langya-20190926-test.jar: location of the coprocessor jar on HDFS
- com.langya.hbase.observer.util.HbaseDataSyncEsObserver: fully qualified class name of the observer inside that jar
- 1001: coprocessor priority
- cluster.name: ES cluster name
- indexName: ES index name
- indexType: ES index type
alter 'tbl_profile',METHOD=> 'table_att','coprocessor'=>'hdfs:///user/hbase/lib/langya-20190926-test.jar|com.langya.hbase.observer.util.HbaseDataSyncEsObserver|1001|cluster.name=itcast-es,indexName=tbl_profile,indexType=user'
(3) Enable the table
enable 'tbl_profile'
(4) View the table details
desc 'tbl_profile'
3) Notes
1. If the alter succeeds, the shell reports how many seconds the command took.
2. If the coprocessor jar or the alter statement is faulty, this step can bring the cluster down, and HBase will then fail to restart. If that happens:
① Add the following to hbase-site.xml:
<property>
<name>hbase.coprocessor.abortonerror</name>
<value>false</value>
</property>
② Restart HBase; coprocessor errors will then be logged instead of aborting the region servers.
3. Remember to update the IP addresses in the code.
4. The jar must be uploaded to HDFS (see the upload commands in section 1.5).
4) Testing
put 'tbl_profile','1','user:name','zhangsan'
Then check Elasticsearch: if the synced document shows up, the pipeline works.
5) Unbinding the coprocessor
(1) Disable the table
disable 'tbl_profile'
(2) Unbind
alter 'tbl_profile', METHOD => 'table_att_unset',NAME => 'coprocessor$1'
(3) Enable the table and check: if desc no longer lists a coprocessor, the unbind succeeded
enable 'tbl_profile'
desc 'tbl_profile'