Demo: Integrating Elasticsearch with HBase for a Secondary Index

This post walks through a secondary-index scheme built on Elasticsearch and HBase, aimed at storing massive amounts of data while still answering queries in seconds. Concretely, data is imported from Excel into both ES and HBase, and ES's inverted index is used to quickly locate the detailed content stored in HBase.



Requirement: store massive volumes of data and still answer queries over them in seconds.

In production, an article is split into a title and a body. The body is the bulky part, so the usual approach is to store the title in ES and the body in HBase (HBase is built precisely for massive-scale storage). A keyword search walks ES's inverted index to obtain the matching document IDs, and those IDs are then used to fetch the full body from HBase.

The data lives in an Excel spreadsheet, as shown below:
[Figure: sample rows of the source Excel file]

(1) Create the index

Create the index from the Kibana console:

PUT /articles
{
  "settings": {
    "number_of_shards": 3,
    "number_of_replicas": 1,
    "analysis": {
      "analyzer": {
        "ik": {
          "tokenizer": "ik_max_word"
        }
      }
    }
  },
  "mappings": {
    "article": {
      "dynamic": "strict",
      "_source": {
        "includes": ["id", "title", "from", "readCounts", "times"],
        "excludes": ["content"]
      },
      "properties": {
        "id": {"type": "keyword", "store": true},
        "title": {"type": "text", "store": true, "index": true, "analyzer": "ik_max_word"},
        "from": {"type": "keyword", "store": true},
        "readCounts": {"type": "integer", "store": true},
        "content": {"type": "text", "store": false, "index": false},
        "times": {"type": "keyword", "index": false}
      }
    }
  }
}
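Two things are worth noting about this mapping. First, content appears under properties but is excluded from _source, not stored, and not indexed, so ES keeps none of the article body; the body lives only in HBase, which is exactly the division of labor described above. Second, the custom ik analyzer declared under analysis is never actually referenced (the title field uses ik_max_word directly), so it could be removed. Before indexing anything, you can sanity-check that the IK analyzer plugin is installed with a quick _analyze request in Kibana (the sample text is arbitrary):

POST /articles/_analyze
{
  "analyzer": "ik_max_word",
  "text": "机器人"
}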
(2) Define the Article entity class
/**
 * User: Devin Kim
 * Date: 2019/12/14 13:31
 * Description: Article entity shared by the ES and HBase writers.
 */
public class Article {
    private String id;
    private String title;
    private String from;
    private String times;
    private String readCounts;
    private String content;

    public Article() {
    }

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public Article(String id, String title, String from, String times, String readCounts, String content) {
        this.id = id;
        this.title = title;
        this.from = from;
        this.times = times;
        this.readCounts = readCounts;
        this.content = content;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getFrom() {
        return from;
    }

    public void setFrom(String from) {
        this.from = from;
    }

    public String getTimes() {
        return times;
    }

    public void setTimes(String times) {
        this.times = times;
    }

    public String getReadCounts() {
        return readCounts;
    }

    public void setReadCounts(String readCounts) {
        this.readCounts = readCounts;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }
}
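Field names matter here: the index was created with "dynamic": "strict", so the JSON that Gson produces from this class must use exactly the property names declared in the mapping (id, title, from, readCounts, times, content), or the bulk insert in step (4) will be rejected. A quick way to eyeball the serialized form (the sample values below are made up):

Gson gson = new Gson();
Article a = new Article("1", "一文看懂机器人", "百家号", "2019-12-14", "3065", "正文……");
// Typically prints (Gson follows field declaration order in practice):
// {"id":"1","title":"一文看懂机器人","from":"百家号","times":"2019-12-14","readCounts":"3065","content":"正文……"}
System.out.println(gson.toJson(a));

Also note that readCounts is a String in Java but an integer in the mapping; ES coerces numeric strings like "3065" by default, so this works, though declaring the field as an int would be cleaner.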
(3) Define the Excel parsing utility class
import com.devinkim.bean.Article;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;

import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * User: Devin Kim
 * Date: 2019/12/14 13:10
 * Description: Parses the Excel file into a list of Article objects.
 */
public class ExcelUtil {
    public static List<Article> getExcelList() throws IOException {
        FileInputStream fileInputStream = new FileInputStream("D:\\code\\Review\\ELk\\eshbase\\src\\main\\resources\\baijia.xlsx");

        // Workbook object used to parse the Excel file
        XSSFWorkbook workbook = new XSSFWorkbook(fileInputStream);
        // First sheet of the workbook
        XSSFSheet sheet = workbook.getSheetAt(0);

        // 0-based index of the last row that contains data
        int lastRowNum = sheet.getLastRowNum();
        List<Article> articleList = new ArrayList<Article>();
        // Start at row 1: row 0 holds the column headers.
        // Note the bound must be <= lastRowNum, otherwise the last row is skipped.
        for (int i = 1; i <= lastRowNum; i++) {
            Article article = new Article();
            XSSFRow row = sheet.getRow(i);
            XSSFCell title = row.getCell(0);
            XSSFCell from = row.getCell(1);
            XSSFCell time = row.getCell(2);
            XSSFCell readCount = row.getCell(3);
            XSSFCell content = row.getCell(4);

            // The row number becomes the record id, which later serves as both
            // the ES document id and the HBase rowkey
            article.setId("" + i);
            article.setTitle(title.toString());
            article.setContent(content.toString());
            article.setFrom(from.toString());
            article.setReadCounts(readCount.toString());
            article.setTimes(time.toString());

            articleList.add(article);
        }

        workbook.close();
        fileInputStream.close();
        return articleList;
    }
}
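One caveat with this parser: row.getCell(...) returns null for blank cells, and XSSFCell.toString() renders numeric cells as, e.g., 3065.0. If the spreadsheet can contain blanks or numeric columns, a null-safe reader built on POI's DataFormatter is more robust. A minimal sketch (the CellReader helper is ours, not part of the original code):

import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.usermodel.Row;

public class CellReader {
    private static final DataFormatter FORMATTER = new DataFormatter();

    // Returns the cell's displayed text ("3065" rather than "3065.0"), or "" for blank cells
    public static String text(Row row, int column) {
        Cell cell = row.getCell(column);
        return cell == null ? "" : FORMATTER.formatCellValue(cell);
    }
}

In getExcelList() the assignments would then read, for example, article.setTitle(CellReader.text(row, 0)).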
(4) Write the data into ES
import com.devinkim.bean.Article;
import com.devinkim.utils.ExcelUtil;
import com.google.gson.Gson;
import org.elasticsearch.action.bulk.BulkRequestBuilder;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.index.IndexRequestBuilder;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.TransportAddress;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.transport.client.PreBuiltTransportClient;

import java.io.IOException;
import java.net.InetAddress;
import java.util.List;

/**
 * User: Devin Kim
 * Date: 2019/12/14 13:30
 * Description: Bulk-loads the parsed articles into Elasticsearch.
 */
public class SaveArticleToElasticsearch {
    public static void main(String[] args) throws IOException {
        // Load the articles from Excel
        List<Article> excelList = ExcelUtil.getExcelList();

        // Build the TransportClient used to talk to the ES cluster
        Settings settings = Settings.builder().put("cluster.name", "myes").build();
        TransportClient transportClient = new PreBuiltTransportClient(settings)
                .addTransportAddress(new TransportAddress(InetAddress.getByName("node01"), 9300))
                .addTransportAddress(new TransportAddress(InetAddress.getByName("node02"), 9300));

        // Index all articles in a single bulk request, using the article id as the document id
        BulkRequestBuilder bulk = transportClient.prepareBulk();
        Gson gson = new Gson();
        for (Article article : excelList) {
            String jsonStr = gson.toJson(article);
            IndexRequestBuilder indexRequestBuilder = transportClient.prepareIndex("articles", "article", article.getId());
            indexRequestBuilder.setSource(jsonStr, XContentType.JSON);
            bulk.add(indexRequestBuilder);
        }
        // Check the bulk response: individual documents can fail even when the request succeeds
        BulkResponse bulkResponse = bulk.get();
        if (bulkResponse.hasFailures()) {
            System.err.println(bulkResponse.buildFailureMessage());
        }
        transportClient.close();
    }
}
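TransportClient was deprecated in ES 7.0 and removed in 8.0. On a 7.x cluster (where mapping types are also gone), the same bulk write would look roughly like the sketch below using RestHighLevelClient from the elasticsearch-rest-high-level-client artifact. This is a sketch, not tested against this exact setup; it assumes the same node names with HTTP on port 9200, and the class name SaveArticleToElasticsearch7 is ours:

import com.devinkim.bean.Article;
import com.devinkim.utils.ExcelUtil;
import com.google.gson.Gson;
import org.apache.http.HttpHost;
import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.common.xcontent.XContentType;

import java.io.IOException;
import java.util.List;

public class SaveArticleToElasticsearch7 {
    public static void main(String[] args) throws IOException {
        List<Article> excelList = ExcelUtil.getExcelList();
        // High-level REST client over HTTP (port 9200) instead of the transport protocol (9300)
        RestHighLevelClient client = new RestHighLevelClient(
                RestClient.builder(new HttpHost("node01", 9200, "http"),
                                   new HttpHost("node02", 9200, "http")));
        BulkRequest bulk = new BulkRequest();
        Gson gson = new Gson();
        for (Article article : excelList) {
            // No mapping type in 7.x: index name and document id only
            bulk.add(new IndexRequest("articles")
                    .id(article.getId())
                    .source(gson.toJson(article), XContentType.JSON));
        }
        BulkResponse response = client.bulk(bulk, RequestOptions.DEFAULT);
        if (response.hasFailures()) {
            System.err.println(response.buildFailureMessage());
        }
        client.close();
    }
}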
(5) Write the data into HBase
import com.devinkim.bean.Article;
import com.devinkim.utils.ExcelUtil;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * User: Devin Kim
 * Date: 2019/12/14 13:30
 * Description: Writes the parsed articles into HBase.
 */
public class SaveArticelToHbase {
    public static void main(String[] args) throws IOException {
        // Get an HBase client connection
        Configuration configuration = HBaseConfiguration.create();
        configuration.set("hbase.zookeeper.quorum", "node01:2181,node02:2181,node03:2181");
        Connection connection = ConnectionFactory.createConnection(configuration);
        Admin admin = connection.getAdmin();

        // Create the table (a single column family, f1) if it does not exist yet
        TableName tableName = TableName.valueOf("hbase_es_article");
        HTableDescriptor hTableDescriptor = new HTableDescriptor(tableName);
        String familyName = "f1";
        HColumnDescriptor f1 = new HColumnDescriptor(familyName);
        hTableDescriptor.addFamily(f1);
        if (!admin.tableExists(tableName)) {
            admin.createTable(hTableDescriptor);
        }

        // Get a Table handle and write all articles in one batched put
        Table table = connection.getTable(tableName);
        List<Article> excelList = ExcelUtil.getExcelList();
        long startTime = System.currentTimeMillis();
        ArrayList<Put> putList = new ArrayList<>();
        for (Article article : excelList) {
            // Use the Article id as the rowkey (the same value used as the ES document id)
            Put put = new Put(Bytes.toBytes(article.getId()));
            // Bytes.toBytes always encodes UTF-8; String.getBytes() would depend on the platform charset
            put.addColumn(Bytes.toBytes(familyName), Bytes.toBytes("title"), Bytes.toBytes(article.getTitle()));
            put.addColumn(Bytes.toBytes(familyName), Bytes.toBytes("from"), Bytes.toBytes(article.getFrom()));
            put.addColumn(Bytes.toBytes(familyName), Bytes.toBytes("times"), Bytes.toBytes(article.getTimes()));
            put.addColumn(Bytes.toBytes(familyName), Bytes.toBytes("readCounts"), Bytes.toBytes(article.getReadCounts()));
            put.addColumn(Bytes.toBytes(familyName), Bytes.toBytes("content"), Bytes.toBytes(article.getContent()));
            putList.add(put);
        }
        table.put(putList);

        long endTime = System.currentTimeMillis();
        System.out.println("Writing to HBase took " + (endTime - startTime) / 1000 + " seconds");

        table.close();
        admin.close();
        connection.close();
    }
}
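Collecting every Put into one ArrayList is fine for a demo-sized spreadsheet, but it holds the entire dataset in memory and ships it as one batch. For larger imports, HBase's BufferedMutator (in org.apache.hadoop.hbase.client, already covered by the wildcard import above) buffers and flushes writes incrementally. A minimal sketch of the write loop rewritten that way, using the same table and column family (only the content column is shown):

// Sketch: stream the puts through a BufferedMutator instead of one big ArrayList<Put>
try (BufferedMutator mutator = connection.getBufferedMutator(TableName.valueOf("hbase_es_article"))) {
    for (Article article : excelList) {
        Put put = new Put(Bytes.toBytes(article.getId()));
        put.addColumn(Bytes.toBytes("f1"), Bytes.toBytes("content"), Bytes.toBytes(article.getContent()));
        // ... add the remaining columns exactly as in the loop above ...
        mutator.mutate(put);   // buffered client-side, flushed in batches
    }
    mutator.flush();           // push any mutations still sitting in the buffer
}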
(6) Query
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.*;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.TransportAddress;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.SearchHits;
import org.elasticsearch.transport.client.PreBuiltTransportClient;

import java.io.IOException;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.ArrayList;

/**
 * User: Devin Kim
 * Date: 2019/12/14 14:57
 * Description: Looks up rowkeys in ES by keyword, then fetches the article details from HBase.
 */
public class EsToHbaseQuery {
    public static void main(String[] args) throws IOException {
        ArrayList<String> rowkeyList = getRowKeyFromEs("机器人");
        System.out.println(rowkeyList);
        getInfoFromHbase(rowkeyList);
    }

    public static ArrayList<String> getRowKeyFromEs(String keyword) throws UnknownHostException {
        // Build the TransportClient used to talk to the ES cluster
        Settings settings = Settings.builder().put("cluster.name", "myes").build();
        TransportClient transportClient = new PreBuiltTransportClient(settings)
                .addTransportAddress(new TransportAddress(InetAddress.getByName("node01"), 9300))
                .addTransportAddress(new TransportAddress(InetAddress.getByName("node02"), 9300));

        ArrayList<String> rowkeyList = new ArrayList<>();

        // termQuery matches a single analyzed token; for free-form, multi-word input,
        // matchQuery("title", keyword) is usually the safer choice on an analyzed field
        SearchResponse searchResponse = transportClient.prepareSearch("articles")
                .setTypes("article")
                .setQuery(QueryBuilders.termQuery("title", keyword))
                .get();

        // The ES document ids double as the HBase rowkeys
        SearchHits hits = searchResponse.getHits();
        for (SearchHit hit : hits) {
            rowkeyList.add(hit.getId());
        }
        transportClient.close();
        return rowkeyList;
    }

    private static void getInfoFromHbase(ArrayList<String> rowkeyList) throws IOException {
        // Get an HBase client connection
        Configuration configuration = HBaseConfiguration.create();
        configuration.set("hbase.zookeeper.quorum", "node01:2181,node02:2181,node03:2181");
        Connection connection = ConnectionFactory.createConnection(configuration);

        // Fetch the full row for every rowkey returned by ES
        Table table = connection.getTable(TableName.valueOf("hbase_es_article"));
        for (String rowkey : rowkeyList) {
            Get get = new Get(Bytes.toBytes(rowkey));
            Result result = table.get(get);
            // Print every column of the row, labeled by its qualifier
            for (Cell cell : result.rawCells()) {
                String qualifier = Bytes.toString(CellUtil.cloneQualifier(cell));
                String value = Bytes.toString(CellUtil.cloneValue(cell));
                System.out.println("Article " + rowkey + " - " + qualifier + ": " + value);
            }
        }
        table.close();
        connection.close();
    }
}
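Since the point of the split design is that ES already returns the metadata, the HBase read often only needs the body. The Get can be restricted to the content column instead of pulling the whole row; a small variant of the loop body above:

// Variant: fetch only f1:content for each rowkey instead of every column
Get get = new Get(Bytes.toBytes(rowkey));
get.addColumn(Bytes.toBytes("f1"), Bytes.toBytes("content"));
Result result = table.get(get);
byte[] content = result.getValue(Bytes.toBytes("f1"), Bytes.toBytes("content"));
System.out.println("Article " + rowkey + " content: " + Bytes.toString(content));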