Integrating Elasticsearch with HBase to Implement a Secondary Index
Requirement: store massive volumes of data and still answer queries over that data within seconds.
In production, an article is split into a title and a body. The body is usually large, so the common approach is to store the title in ES and the body in HBase (HBase is built for massive data storage). A keyword search against ES's inverted index returns the matching document IDs, and those IDs are then used as row keys to fetch the full body from HBase.
The source data lives in an Excel spreadsheet (baijia.xlsx); each row contains a title, source, publish time, read count, and body.
(1) Create the index
Create the index from the Kibana console:
PUT /articles
{
  "settings": {
    "number_of_shards": 3,
    "number_of_replicas": 1,
    "analysis": {
      "analyzer": {
        "ik": {
          "tokenizer": "ik_max_word"
        }
      }
    }
  },
  "mappings": {
    "article": {
      "dynamic": "strict",
      "_source": {
        "includes": [
          "id", "title", "from", "readCounts", "times"
        ],
        "excludes": [
          "content"
        ]
      },
      "properties": {
        "id": {"type": "keyword", "store": true},
        "title": {"type": "text", "store": true, "index": true, "analyzer": "ik_max_word"},
        "from": {"type": "keyword", "store": true},
        "readCounts": {"type": "integer", "store": true},
        "content": {"type": "text", "store": false, "index": false},
        "times": {"type": "keyword", "index": false}
      }
    }
  }
}
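Optionally, you can confirm from Java that the index was created before moving on. The snippet below is only a sketch: it assumes the same "myes" cluster name and node01:9300 transport address used in the later steps, and the class name CheckArticlesIndex is made up for this example.
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.TransportAddress;
import org.elasticsearch.transport.client.PreBuiltTransportClient;
import java.net.InetAddress;

public class CheckArticlesIndex {
    public static void main(String[] args) throws Exception {
        // Connect to the same "myes" cluster used later in this post
        Settings settings = Settings.builder().put("cluster.name", "myes").build();
        TransportClient client = new PreBuiltTransportClient(settings)
                .addTransportAddress(new TransportAddress(InetAddress.getByName("node01"), 9300));
        // Ask the cluster whether the "articles" index exists
        boolean exists = client.admin().indices().prepareExists("articles").get().isExists();
        System.out.println("articles index exists: " + exists);
        client.close();
    }
}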
(2) Define the Article entity class
/**
* User: Devin Kim
* Date: 2019/12/14 13:31
* Description:
*/
public class Article {
    private String id;
    private String title;
    private String from;
    private String times;
    private String readCounts;
    private String content;

    public Article() {
    }

    public Article(String id, String title, String from, String times, String readCounts, String content) {
        this.id = id;
        this.title = title;
        this.from = from;
        this.times = times;
        this.readCounts = readCounts;
        this.content = content;
    }

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getFrom() {
        return from;
    }

    public void setFrom(String from) {
        this.from = from;
    }

    public String getTimes() {
        return times;
    }

    public void setTimes(String times) {
        this.times = times;
    }

    public String getReadCounts() {
        return readCounts;
    }

    public void setReadCounts(String readCounts) {
        this.readCounts = readCounts;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }
}
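Because the mapping above sets "dynamic": "strict", the JSON field names produced for an Article must line up exactly with the mapped fields, otherwise the index request is rejected. As a quick, purely illustrative check (the class name ArticleJsonPreview and the sample values are made up), you can print the JSON that Gson will produce:
import com.devinkim.bean.Article;
import com.google.gson.Gson;

public class ArticleJsonPreview {
    public static void main(String[] args) {
        // Build a sample Article and print the JSON that Gson will send to Elasticsearch.
        Article article = new Article("1", "sample title", "baijia", "2019-12-14", "100", "sample body");
        // Gson uses the Java field names (id, title, from, times, readCounts, content),
        // which match the fields declared in the strict mapping above.
        System.out.println(new Gson().toJson(article));
    }
}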
(3) Define the Excel parsing utility class
import com.devinkim.bean.Article;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
* User: Devin Kim
* Date: 2019/12/14 13:10
* Description:
*/
public class ExcelUtil {
    public static List<Article> getExcelList() throws IOException {
        FileInputStream fileInputStream = new FileInputStream("D:\\code\\Review\\ELk\\eshbase\\src\\main\\resources\\baijia.xlsx");
        // Workbook object used to parse the Excel file
        XSSFWorkbook workbook = new XSSFWorkbook(fileInputStream);
        // First sheet of the workbook
        XSSFSheet sheet = workbook.getSheetAt(0);
        // Zero-based index of the last row that contains data
        int lastRowNum = sheet.getLastRowNum();
        List<Article> articleList = new ArrayList<Article>();
        // Row 0 is the header, so the data rows run from 1 to lastRowNum inclusive
        for (int i = 1; i <= lastRowNum; i++) {
            Article article = new Article();
            XSSFRow row = sheet.getRow(i);
            // Column layout: title, source, publish time, read count, body
            XSSFCell title = row.getCell(0);
            XSSFCell from = row.getCell(1);
            XSSFCell time = row.getCell(2);
            XSSFCell readCount = row.getCell(3);
            XSSFCell content = row.getCell(4);
            // Use the row index as the id; it later doubles as the HBase row key
            article.setId("" + i);
            article.setTitle(title.toString());
            article.setContent(content.toString());
            article.setFrom(from.toString());
            article.setReadCounts(readCount.toString());
            article.setTimes(time.toString());
            articleList.add(article);
        }
        workbook.close();
        fileInputStream.close();
        return articleList;
    }
}
(4) Write the data to Elasticsearch
import com.devinkim.bean.Article;
import com.devinkim.utils.ExcelUtil;
import com.google.gson.Gson;
import org.elasticsearch.action.bulk.BulkRequestBuilder;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.index.IndexRequestBuilder;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.TransportAddress;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.transport.client.PreBuiltTransportClient;
import java.io.IOException;
import java.net.InetAddress;
import java.util.List;
/**
* User: Devin Kim
* Date: 2019/12/14 13:30
* Description:
*/
public class SaveArticleToElasticsearch {
    public static void main(String[] args) throws IOException {
        // Read the articles from the Excel file
        List<Article> excelList = ExcelUtil.getExcelList();
        // Build the TransportClient used to talk to the ES cluster
        Settings settings = Settings.builder().put("cluster.name", "myes").build();
        TransportClient transportClient = new PreBuiltTransportClient(settings)
                .addTransportAddress(new TransportAddress(InetAddress.getByName("node01"), 9300))
                .addTransportAddress(new TransportAddress(InetAddress.getByName("node02"), 9300));
        BulkRequestBuilder bulk = transportClient.prepareBulk();
        Gson gson = new Gson();
        for (Article article : excelList) {
            // Serialize the article to JSON; the field names must match the strict mapping
            String jsonStr = gson.toJson(article);
            IndexRequestBuilder indexRequestBuilder = transportClient.prepareIndex("articles", "article", article.getId());
            indexRequestBuilder.setSource(jsonStr, XContentType.JSON);
            bulk.add(indexRequestBuilder);
        }
        // Execute the bulk request and report any per-document failures
        BulkResponse bulkResponse = bulk.get();
        if (bulkResponse.hasFailures()) {
            System.out.println(bulkResponse.buildFailureMessage());
        }
        transportClient.close();
    }
}
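To sanity-check the bulk load, you can ask ES how many documents the index now holds. This is only a sketch that reuses the same cluster settings as above; the class name CountArticlesInEs is made up for this example.
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.TransportAddress;
import org.elasticsearch.transport.client.PreBuiltTransportClient;
import java.net.InetAddress;

public class CountArticlesInEs {
    public static void main(String[] args) throws Exception {
        Settings settings = Settings.builder().put("cluster.name", "myes").build();
        TransportClient client = new PreBuiltTransportClient(settings)
                .addTransportAddress(new TransportAddress(InetAddress.getByName("node01"), 9300));
        // setSize(0) skips fetching documents; we only want the total hit count
        SearchResponse response = client.prepareSearch("articles").setSize(0).get();
        System.out.println("documents in articles: " + response.getHits().getTotalHits());
        client.close();
    }
}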
(5) Write the data to HBase
import com.devinkim.bean.Article;
import com.devinkim.utils.ExcelUtil;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
* User: Devin Kim
* Date: 2019/12/14 13:30
* Description:
*/
public class SaveArticleToHbase {
    public static void main(String[] args) throws IOException {
        // Open an HBase client connection
        Configuration configuration = HBaseConfiguration.create();
        configuration.set("hbase.zookeeper.quorum", "node01:2181,node02:2181,node03:2181");
        Connection connection = ConnectionFactory.createConnection(configuration);
        Admin admin = connection.getAdmin();
        // Create the table with a single column family if it does not exist yet
        TableName tableName = TableName.valueOf("hbase_es_article");
        HTableDescriptor hTableDescriptor = new HTableDescriptor(tableName);
        String familyName = "f1";
        HColumnDescriptor f1 = new HColumnDescriptor(familyName);
        hTableDescriptor.addFamily(f1);
        if (!admin.tableExists(tableName)) {
            admin.createTable(hTableDescriptor);
        }
        // Get the Table handle
        Table table = connection.getTable(tableName);
        List<Article> excelList = ExcelUtil.getExcelList();
        long startTime = System.currentTimeMillis();
        ArrayList<Put> putList = new ArrayList<>();
        for (Article article : excelList) {
            // Use the Article id as the row key so ES hits map directly to HBase rows
            Put put = new Put(Bytes.toBytes(article.getId()));
            put.addColumn(Bytes.toBytes(familyName), Bytes.toBytes("title"), Bytes.toBytes(article.getTitle()));
            put.addColumn(Bytes.toBytes(familyName), Bytes.toBytes("from"), Bytes.toBytes(article.getFrom()));
            put.addColumn(Bytes.toBytes(familyName), Bytes.toBytes("times"), Bytes.toBytes(article.getTimes()));
            put.addColumn(Bytes.toBytes(familyName), Bytes.toBytes("readCounts"), Bytes.toBytes(article.getReadCounts()));
            put.addColumn(Bytes.toBytes(familyName), Bytes.toBytes("content"), Bytes.toBytes(article.getContent()));
            putList.add(put);
        }
        // Write all rows in one batch
        table.put(putList);
        long endTime = System.currentTimeMillis();
        System.out.println("Writing to HBase took " + (endTime - startTime) + " ms");
        table.close();
        admin.close();
        connection.close();
    }
}
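To confirm the rows landed in HBase, a small scan over the first few rows is enough. Again, just a sketch, assuming the same ZooKeeper quorum, table, and column family as above; the class name ScanArticlesInHbase is made up for this example.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;

public class ScanArticlesInHbase {
    public static void main(String[] args) throws Exception {
        Configuration configuration = HBaseConfiguration.create();
        configuration.set("hbase.zookeeper.quorum", "node01:2181,node02:2181,node03:2181");
        try (Connection connection = ConnectionFactory.createConnection(configuration);
             Table table = connection.getTable(TableName.valueOf("hbase_es_article"));
             ResultScanner scanner = table.getScanner(new Scan())) {
            int printed = 0;
            for (Result result : scanner) {
                // Print the row key and the stored title for the first five rows
                String rowkey = Bytes.toString(result.getRow());
                String title = Bytes.toString(result.getValue(Bytes.toBytes("f1"), Bytes.toBytes("title")));
                System.out.println(rowkey + " -> " + title);
                if (++printed >= 5) {
                    break;
                }
            }
        }
    }
}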
(6) Query
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.*;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.TransportAddress;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.SearchHits;
import org.elasticsearch.transport.client.PreBuiltTransportClient;
import java.io.IOException;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.ArrayList;
/**
* User: Devin Kim
* Date: 2019/12/14 14:57
* Description:
*/
public class EsToHbaseQuery {
    public static void main(String[] args) throws IOException {
        ArrayList<String> rowkeyList = getRowKeyFromEs("机器人");
        System.out.println(rowkeyList);
        getInfoFromHbase(rowkeyList);
    }

    public static ArrayList<String> getRowKeyFromEs(String keyword) throws UnknownHostException {
        // Build the TransportClient used to talk to the ES cluster
        Settings settings = Settings.builder().put("cluster.name", "myes").build();
        TransportClient transportClient = new PreBuiltTransportClient(settings)
                .addTransportAddress(new TransportAddress(InetAddress.getByName("node01"), 9300))
                .addTransportAddress(new TransportAddress(InetAddress.getByName("node02"), 9300));
        ArrayList<String> rowkeyList = new ArrayList<>();
        // termQuery matches a single analyzed token in the title field;
        // for multi-word keywords, QueryBuilders.matchQuery analyzes the query string first
        SearchResponse searchResponse = transportClient.prepareSearch("articles")
                .setTypes("article")
                .setQuery(QueryBuilders.termQuery("title", keyword))
                .get();
        SearchHits hits = searchResponse.getHits();
        for (SearchHit hit : hits) {
            // The ES document id is also the HBase row key
            String id = hit.getId();
            rowkeyList.add(id);
        }
        transportClient.close();
        return rowkeyList;
    }

    private static void getInfoFromHbase(ArrayList<String> rowkeyList) throws IOException {
        // Open an HBase client connection
        Configuration configuration = HBaseConfiguration.create();
        configuration.set("hbase.zookeeper.quorum", "node01:2181,node02:2181,node03:2181");
        Connection connection = ConnectionFactory.createConnection(configuration);
        // Get the Table handle
        Table table = connection.getTable(TableName.valueOf("hbase_es_article"));
        for (String rowkey : rowkeyList) {
            Get get = new Get(Bytes.toBytes(rowkey));
            Result result = table.get(get);
            System.out.println("Article " + rowkey + ":");
            for (Cell cell : result.rawCells()) {
                // Print each column (title/from/times/readCounts/content) and its value
                String qualifier = Bytes.toString(CellUtil.cloneQualifier(cell));
                String value = Bytes.toString(CellUtil.cloneValue(cell));
                System.out.println("  " + qualifier + ": " + value);
            }
        }
        table.close();
        connection.close();
    }
}
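If a keyword matches many documents, issuing one Get per row key means one round trip each. Table.get(List<Get>) fetches all rows in a single batch; the method below is a sketch of that variant (the name getInfoFromHbaseBatched is made up) and would live alongside getInfoFromHbase in the class above, reusing its imports.
// Batched variant: one multi-get instead of one Get per row key.
// Assumes the same table and column family ("f1") as above.
private static void getInfoFromHbaseBatched(ArrayList<String> rowkeyList) throws IOException {
    Configuration configuration = HBaseConfiguration.create();
    configuration.set("hbase.zookeeper.quorum", "node01:2181,node02:2181,node03:2181");
    Connection connection = ConnectionFactory.createConnection(configuration);
    Table table = connection.getTable(TableName.valueOf("hbase_es_article"));
    ArrayList<Get> gets = new ArrayList<>();
    for (String rowkey : rowkeyList) {
        gets.add(new Get(Bytes.toBytes(rowkey)));
    }
    // table.get(List<Get>) returns one Result per Get, in the same order
    Result[] results = table.get(gets);
    for (Result result : results) {
        if (result.isEmpty()) {
            // The row key came from ES but has no matching row in HBase; skip it
            continue;
        }
        String rowkey = Bytes.toString(result.getRow());
        String content = Bytes.toString(result.getValue(Bytes.toBytes("f1"), Bytes.toBytes("content")));
        System.out.println("Article " + rowkey + " content: " + content);
    }
    table.close();
    connection.close();
}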