Syncing HBase to ES
HBase version: 1.1.2
ES version: 6.3.2
I started from code found online that mostly worked, but after running for a while it threw errors and lost data. After more searching and some cleanup of my own, the version below finally works reliably.
Without further ado, here is the code.
public class ElasticSearchBulkOperator {
private static final Log LOG = LogFactory.getLog(ElasticSearchBulkOperator.class);
//maximum number of actions buffered before a bulk commit
private static final int MAX_BULK_COUNT = 5000;
private BulkRequestBuilder bulkRequestBuilder = null;
private Lock commitLock = new ReentrantLock();
private ScheduledExecutorService scheduledExecutorService = null;
private EsClient esClient = null;
public ElasticSearchBulkOperator(final EsClient esClient) {
LOG.info("----------------- Init Bulk Operator for Table: " + " ----------------");
this.esClient = esClient;
// init es bulkRequestBuilder
this.bulkRequestBuilder = esClient.getClient().prepareBulk();
// init thread pool and set size 1
this.scheduledExecutorService = Executors.newScheduledThreadPool(1);
// create the beeper thread (it periodically syncs buffered data to the ES cluster); commitLock keeps the bulk request thread-safe
Runnable beeper = new Runnable() {
@Override
public void run() {
commitLock.lock();
try {
//LOG.info("Scheduled Thread start run for ");
bulkRequest(0);
} catch (Exception ex) {
LOG.error("Time Bulk " + " index error : " + ex.getMessage());
} finally {
commitLock.unlock();
}
}
};
// schedule the beeper thread: first run after 15 seconds, then every 25 seconds
scheduledExecutorService.scheduleAtFixedRate(beeper, 15, 25, TimeUnit.SECONDS);
}
/**
* shutdown time task immediately
*/
public void shutdownScheduEx() {
if (null != scheduledExecutorService && !scheduledExecutorService.isShutdown()) {
scheduledExecutorService.shutdown();
}
}
/**
* bulk request when the number of buffered actions is greater than the threshold
*
* @param threshold
*/
public void bulkRequest(int threshold) {
int count = bulkRequestBuilder.numberOfActions();
if (bulkRequestBuilder.numberOfActions() > threshold) {
try {
LOG.info("Bulk Request Run " + ", the row count is: " + count);
BulkResponse bulkItemResponse = bulkRequestBuilder.execute().actionGet();
if (bulkItemResponse.hasFailures()) {
LOG.error("------------- Begin: Error Response Items of Bulk Requests to ES ------------");
LOG.error(bulkItemResponse.buildFailureMessage());
LOG.error("------------- End: Error Response Items of Bulk Requests to ES ------------");
}
bulkRequestBuilder = esClient.getClient().prepareBulk();
} catch (Exception e) {// two likely causes: 1. the transport client is closed 2. none of the configured nodes are available
LOG.error(" Bulk Request " + " index error : " + e.getMessage());
LOG.error("Reconnect the ES server...");
List<DocWriteRequest> tempRequests = bulkRequestBuilder.request().requests();
esClient.getClient().close();
esClient.repeatInitEsClient();
bulkRequestBuilder = esClient.getClient().prepareBulk();
bulkRequestBuilder.request().add(tempRequests);
}
}
}
/**
* add an update builder to the bulk request; commitLock keeps the bulk request
* thread-safe
*
* @param builder
*/
public void addUpdateBuilderToBulk(UpdateRequestBuilder builder) {
commitLock.lock();
try {
bulkRequestBuilder.add(builder);
bulkRequest(MAX_BULK_COUNT);
} catch (Exception ex) {
LOG.error(" Add Bulk index error : " + ex.getMessage());
} finally {
commitLock.unlock();
}
}
/**
* add a delete builder to the bulk request; commitLock keeps the bulk request
* thread-safe
*
* @param builder
*/
public void addDeleteBuilderToBulk(DeleteRequestBuilder builder) {
commitLock.lock();
try {
bulkRequestBuilder.add(builder);
bulkRequest(MAX_BULK_COUNT);
} catch (Exception ex) {
LOG.error(" delete Bulk index error : " + ex.getMessage());
} finally {
commitLock.unlock();
}
}
}
/**
* ES Client class
*/
public class EsClient {
static {
// must be set before the transport client spins up Netty; a @PostConstruct hook would never
// fire here because this class is not managed by a DI container inside the region server
System.setProperty("es.set.netty.runtime.available.processors", "false");
}
// ElasticSearch cluster name
private String clusterName;
// ElasticSearch host
private String nodeHost;
// ElasticSearch port (the Java API uses the transport port, i.e. TCP)
private int nodePort;
private TransportClient client = null;
private static final Log LOG = LogFactory.getLog(EsClient.class);
/**
* construct the client from the ES config
*/
public EsClient(String clusterName, String nodeHost, int nodePort) {
this.clusterName = clusterName;
this.nodeHost = nodeHost;
this.nodePort = nodePort;
this.client = initEsClient();
}
public String getInfo() {
List<String> fields = new ArrayList<String>();
try {
for (Field f : EsClient.class.getDeclaredFields()) {
fields.add(f.getName() + "=" + f.get(this));
}
} catch (IllegalAccessException ex) {
ex.printStackTrace();
}
return StringUtils.join(fields, ", ");
}
/* public String getOneNodeHost() {
if (this.nodeHost == null || this.nodeHost.length == 0) {
return "";
}
Random rand = new Random();
return nodeHost[rand.nextInt(this.nodeHost.length)];
}
*/
/**
* init ES client
*/
public TransportClient initEsClient() {
LOG.info("---------- Init ES Client " + this.clusterName + " -----------");
TransportClient client = null;
Settings settings = Settings.builder().put("cluster.name", this.clusterName).put("client.transport.sniff", true).build();
try {
client = new PreBuiltTransportClient(settings)
.addTransportAddress(new TransportAddress(InetAddress.getByName(this.nodeHost), this.nodePort))
// note: a second ES node address is hard-coded here; adjust or remove it for your own cluster
.addTransportAddress(new TransportAddress(InetAddress.getByName("192.168.31.203"), this.nodePort));
} catch (UnknownHostException e) {
e.printStackTrace();
LOG.error("---------- Init ES Client jieshu " + this.clusterName + " -----------");
}
return client;
}
public void repeatInitEsClient() {
this.client = initEsClient();
}
/**
* @return the clusterName
*/
public String getClusterName() {
return clusterName;
}
/**
* @param clusterName the clusterName to set
*/
public void setClusterName(String clusterName) {
this.clusterName = clusterName;
}
/**
* @return the nodePort
*/
public int getNodePort() {
return nodePort;
}
/**
* @param nodePort the nodePort to set
*/
public void setNodePort(int nodePort) {
this.nodePort = nodePort;
}
/**
* @return the client
*/
public TransportClient getClient() {
return client;
}
/**
* @param client the client to set
*/
public void setClient(TransportClient client) {
this.client = client;
}
public static void main(String[] args) {
System.out.println("nihaoo");
}
}
/**
* HBase observer that syncs data to ES
*/
public class HbaseDataSyncEsObserver extends BaseRegionObserver {
private static final Log LOG = LogFactory.getLog(HbaseDataSyncEsObserver.class);
public String clusterName;
public String nodeHost;
public String indexName;
public String typeName;
public Integer nodePort;
public EsClient EsClient;
public ElasticSearchBulkOperator elasticSearchBulkOperator;
/**
* read es config from params
* @param env
*/
private void readConfiguration(CoprocessorEnvironment env) {
Configuration conf = env.getConfiguration();
clusterName = conf.get("es_cluster");
nodeHost = conf.get("es_host");
nodePort = conf.getInt("es_port", -1);
indexName = conf.get("es_index");
typeName = conf.get("es_type");
}
/**
* start
* @param e
*/
@Override
public void start(CoprocessorEnvironment e) {
// read config
readConfiguration(e);
// init ES client
// the EsClient constructor already initializes the transport client, so no extra initEsClient() call is needed
EsClient = new EsClient(clusterName, nodeHost, nodePort);
elasticSearchBulkOperator = new ElasticSearchBulkOperator(EsClient);
LOG.info("------observer init EsClient start------" + EsClient.getInfo());
}
/**
* stop
* @param e
* @throws IOException
*/
@Override
public void stop(CoprocessorEnvironment e) throws IOException {
// close es client
EsClient.getClient().close();
// shutdown the scheduled flush task (note: any actions still buffered in the bulk builder are not flushed here)
elasticSearchBulkOperator.shutdownScheduEx();
}
/**
* Called after the client stores a value.
* After data is put to HBase, prepare an update builder and add it to the ES bulk request.
*
* @param e
* @param put
* @param edit
* @param durability
* @throws IOException
*/
@Override
public void postPut(ObserverContext<RegionCoprocessorEnvironment> e, Put put, WALEdit edit, Durability durability) throws IOException {
String indexId = new String(put.getRow());
try {
NavigableMap<byte[], List<Cell>> familyMap = put.getFamilyCellMap();
// Map<String, Object> infoJson = new HashMap<String, Object>();
Map<String, Object> json = new HashMap<String, Object>();
for (Map.Entry<byte[], List<Cell>> entry : familyMap.entrySet()) {
for (Cell cell : entry.getValue()) {
String key = Bytes.toString(CellUtil.cloneQualifier(cell));
String value = Bytes.toString(CellUtil.cloneValue(cell));
//normalize the date format so ES can parse it as a date ("2019-01-01 10:00:00" -> "2019-01-01T10:00:00+0800")
if ("date".equals(key)){
value=value.replace(" ","T")+"+0800";
}
json.put(key, value);
}
}
// set hbase family to es
//infoJson.put("info", json);
elasticSearchBulkOperator.addUpdateBuilderToBulk(EsClient.getClient().prepareUpdate(indexName,typeName, indexId).setDocAsUpsert(true).setDoc(json));
} catch (Exception ex) {
LOG.error("observer put a doc, index [ " + EsClient.getClusterName() + " ]" + "indexId [" + indexId + "] error : " + ex.getMessage());
}
}
/**
* Called after the client deletes a value.
* After data is deleted from HBase, prepare a delete builder and add it to the ES bulk request.
* @param e
* @param delete
* @param edit
* @param durability
* @throws IOException
*/
@Override
public void postDelete(ObserverContext<RegionCoprocessorEnvironment> e, Delete delete, WALEdit edit, Durability durability) throws IOException {
String indexId = new String(delete.getRow());
try {
elasticSearchBulkOperator.addDeleteBuilderToBulk(EsClient.getClient().prepareDelete(indexName,typeName, indexId));
} catch (Exception ex) {
LOG.error(ex);
LOG.error("observer delete a doc, index [ " + EsClient.getClusterName() + " ]" + "indexId [" + indexId + "] error : " + ex.getMessage());
}
}
}
pom.xml
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<elasticsearch.version>6.3.2</elasticsearch.version>
<hbase-server.version>1.1.2</hbase-server.version>
<maven-assembly-plugin.version>2.6</maven-assembly-plugin.version>
<commons-logging.version>1.2</commons-logging.version>
<junit.version>4.12</junit.version>
</properties>
<dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>${junit.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.elasticsearch.client</groupId>
<artifactId>transport</artifactId>
<version>${elasticsearch.version}</version>
</dependency>
<dependency>
<groupId>io.netty</groupId>
<artifactId>netty-all</artifactId>
<version>4.1.13.Final</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.4</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<version>2.10.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.logging.log4j/log4j-api -->
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-api</artifactId>
<version>2.10.0</version>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-server</artifactId>
<version>${hbase-server.version}</version>
</dependency>
<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch</artifactId>
<version>${elasticsearch.version}</version>
</dependency>
</dependencies>
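The properties above reference maven-assembly-plugin, but the build section is not shown. The coprocessor jar must bundle the ES client classes, so a fat jar is needed; below is a minimal sketch of such a build section (the configuration is my assumption, adjust it to your project):
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<version>${maven-assembly-plugin.version}</version>
<configuration>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>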
What remains is registering the HBase coprocessor. It can be registered globally in the configuration file, but I register it per table, which is more flexible.
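For reference, the global alternative (not used here) registers the observer for every region via hbase-site.xml, roughly like this; the class name is the one used in the alter statement below, and with this approach the jar has to be on the region server classpath rather than loaded from HDFS:
<property>
<name>hbase.coprocessor.region.classes</name>
<value>org.eminem.hbase.observer.HbaseDataSyncEsObserver</value>
</property>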
Note: double-check every command below before running it; even an extra space will make it fail, so be careful.
First upload the jar to HDFS and change its permissions:
hadoop fs -put hbase-observer-elasticsearch-1.0-SNAPSHOT-zcestestrecord.jar /hbase_es
hadoop fs -chmod -R 777 /hbase_es
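To confirm the upload and the permissions, you can simply list the directory (same path as above):
hadoop fs -ls /hbase_es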
Enter the HBase shell.
Create the table:
create 'dcs','info'
Disable the table (do not forget this step):
disable 'dcs'
Attach the coprocessor (be very careful here: a single extra space will break it; if the jar cannot be downloaded, check the HDFS path for stray spaces):
alter 'dcs', METHOD => 'table_att', 'coprocessor' => 'hdfs://ambari-1:8020/jar/test03/hbase-elasticsearch-test3.jar|org.eminem.hbase.observer.HbaseDataSyncEsObserver|1001|es_cluster=elasticsearch,es_type=test01,es_index=test01,es_port=9300,es_host=ambari-4'
Explanation:
dcs is the HBase table name
hdfs://ambari-1:8020/jar/test03/hbase-elasticsearch-test3.jar is the location of the jar on HDFS
org.eminem.hbase.observer.HbaseDataSyncEsObserver is the observer class to load
es_cluster=elasticsearch is the ES cluster name
es_type=test01 is the ES type
es_index=test01 is the ES index
es_port=9300 is the ES transport port
es_host=ambari-4 is the ES node to connect to; here I point it at a coordinating (load-balancing) node
Enable the table:
enable 'dcs'
If it succeeds, the shell simply reports how many seconds the command took.
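You can also verify the binding by describing the table; the coprocessor should appear among the table attributes as coprocessor$1:
desc 'dcs'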
If the coprocessor code or the alter statement is wrong, this step can bring the whole cluster down.
If the cluster goes down, HBase cannot simply be restarted.
In that case, do the following:
in hbase-site.xml,
add the hbase.coprocessor.abortonerror property shown in the "How to suppress coprocessor errors" section below,
then restart; the error is suppressed, and you can then unbind the coprocessor.
Unbind the coprocessor:
disable 'dcs'
alter 'dcs', METHOD => 'table_att_unset', NAME => 'coprocessor$1'
enable 'dcs'
desc 'dcs'
If desc no longer lists the coprocessor, it has been removed.
Finally, the summary. Be sure to read it; it is important.
Summary:
After binding, if there are errors or data is not syncing, check the region server logs under the logs directory on the HBase slave nodes. The coprocessor runs on the region servers, so look at the slave nodes' logs, not the master's.
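A quick way to scan a region server log for observer errors; the log location and file name pattern are assumptions based on a typical install, so adjust them to your distribution:
grep -iE "HbaseDataSyncEsObserver|ERROR" /var/log/hbase/hbase-*-regionserver-*.log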
Remember: one coprocessor jar per HBase table. If you have two tables, upload the coprocessor jar twice, and the two paths must not be the same.
If something goes wrong and you unbind the coprocessor, the next upload must also go to a different path; if the path is the same, the replacement may not take effect and HBase may keep using the old, unbound coprocessor.
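One simple convention is a new versioned directory per upload (the directory names here are only illustrative):
hadoop fs -mkdir -p /hbase_es/dcs_v2
hadoop fs -put hbase-observer-elasticsearch-1.0-SNAPSHOT-zcestestrecord.jar /hbase_es/dcs_v2
hadoop fs -chmod -R 777 /hbase_es/dcs_v2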
One more thing: after unbinding, remove the abortonerror setting described below from hbase-site.xml and restart HBase.
If that setting is left in place, a broken coprocessor raises no error at all and simply does nothing, so remove it while testing and turn it on in production.
How to suppress coprocessor errors
Add the following to hbase-site.xml:
<property>
<name>hbase.coprocessor.abortonerror</name>
<value>false</value>
</property>
With this setting in place, errors are suppressed: even if the coprocessor fails, no alert is raised and HBase will not crash because of a coprocessor replacement. If data is not written to ES after attaching the coprocessor with this setting on, the problem is almost certainly in the code or in the alter statement; the alter statement is extremely strict, and not even an extra space is allowed.