1. ES is built on Lucene and extends it with distributed features (clustering, sharding, replication) among others.
2. ES cluster installation
3. Install the head plugin
4. Install the IK analyzer
# Compare the standard analyzer with ik_max_word on the same text
POST _analyze
{
  "analyzer": "standard",
  "text": "我爱你中国"
}
POST _analyze
{
  "analyzer": "ik_max_word",
  "text": "我爱你中国"
}
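The same comparison can be run from Java; a minimal sketch with the high-level client (the restHighLevelClient setup is the one shown in section 8 below):
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.indices.AnalyzeRequest;
import org.elasticsearch.client.indices.AnalyzeResponse;

// Ask the cluster to analyze a string with the IK analyzer and print each token.
AnalyzeRequest analyzeRequest = AnalyzeRequest.withGlobalAnalyzer("ik_max_word", "我爱你中国");
AnalyzeResponse analyzeResponse = restHighLevelClient.indices().analyze(analyzeRequest, RequestOptions.DEFAULT);
for (AnalyzeResponse.AnalyzeToken token : analyzeResponse.getTokens()) {
    System.out.println(token.getTerm());
}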
5. Basic operations - indices
# Create an index, specifying the IK analyzer for the text fields
PUT /job_idx
{
  "mappings": {
    "properties": {
      "name":   { "type": "text", "store": true, "analyzer": "ik_max_word" },
      "salary": { "type": "text", "store": true },
      "jd":     { "type": "text", "store": true, "analyzer": "ik_max_word" }
    }
  }
}
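The same index can also be created from Java; a minimal sketch with the high-level client (client setup as in section 8 below):
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.indices.CreateIndexRequest;
import org.elasticsearch.common.xcontent.XContentType;

// Create job_idx with the same IK mapping as the console request above.
CreateIndexRequest createIndexRequest = new CreateIndexRequest("job_idx");
createIndexRequest.mapping(
        "{\"properties\":{"
        + "\"name\":{\"type\":\"text\",\"store\":true,\"analyzer\":\"ik_max_word\"},"
        + "\"salary\":{\"type\":\"text\",\"store\":true},"
        + "\"jd\":{\"type\":\"text\",\"store\":true,\"analyzer\":\"ik_max_word\"}}}",
        XContentType.JSON);
restHighLevelClient.indices().create(createIndexRequest, RequestOptions.DEFAULT);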
# View the mapping of an index
GET /job_idx/_mapping
# List all indices
GET _cat/indices
# Delete an index
DELETE /job_idx
6. Basic operations - documents
# Index (add) a document
PUT /job_idx/_doc/29097
{
  "name": "java开发",
  "salary": "10k/月",
  "jd": "完成java开发工作"
}
# Partially update fields of a document
POST /job_idx/_update/29097
{
  "doc": {
    "salary": "15-20k/月"
  }
}
# Delete a document
DELETE /job_idx/_doc/29097
# Query documents by ID
GET /job_idx/_search
{
  "query": {
    "ids": {
      "values": ["46313"]
    }
  }
}
# Keyword search
GET /job_idx/_search
{
  "query": {
    "match": {
      "jd": "spark"
    }
  }
}
# Paginated keyword search (from is a zero-based offset, not a page number)
GET /job_idx/_search
{
  "from": 1,
  "size": 5,
  "query": {
    "match": {
      "jd": "spark"
    }
  }
}
7. Scroll pagination
With regular from/size pagination, every request re-executes the query and re-sorts all matches up to the requested offset, which becomes expensive as the offset grows (the "deep paging" problem). Scroll queries avoid this: the first request creates a snapshot of the result set, and each subsequent request uses a cursor (the scroll_id) to read the next batch from that snapshot.
# First scroll query; the response contains a scroll_id
GET /job_idx/_search?scroll=1m
{
  "query": {
    "multi_match": {
      "query": "销售",
      "fields": ["name", "jd"]
    }
  },
  "size": 100
}
# Subsequent queries: pass the scroll_id from the previous response
GET /_search/scroll?scroll=1m
{
  "scroll_id": "xxxxxxxxxxxx"
}
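A scroll context holds cluster resources until its keep-alive expires. Once paging is done it can be released explicitly; a minimal Java sketch with the high-level client (client setup as in section 8 below, scrollId being the cursor returned by the previous search):
import org.elasticsearch.action.search.ClearScrollRequest;
import org.elasticsearch.action.search.ClearScrollResponse;
import org.elasticsearch.client.RequestOptions;

// Release the scroll context immediately instead of waiting for the keep-alive to expire.
ClearScrollRequest clearScrollRequest = new ClearScrollRequest();
clearScrollRequest.addScrollId(scrollId);
ClearScrollResponse clearScrollResponse =
        restHighLevelClient.clearScroll(clearScrollRequest, RequestOptions.DEFAULT);
System.out.println("scroll cleared: " + clearScrollResponse.isSucceeded());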
8. Java API programming
Maven dependency:
<dependency>
    <groupId>org.elasticsearch.client</groupId>
    <artifactId>elasticsearch-rest-high-level-client</artifactId>
    <version>7.6.1</version>
</dependency>
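The entity class below also relies on Lombok annotations and FastJSON. If they are not already on the classpath, dependencies along these lines are needed (the versions here are assumptions, not from the original notes):
<dependency>
    <groupId>org.projectlombok</groupId>
    <artifactId>lombok</artifactId>
    <version>1.18.12</version>
    <scope>provided</scope>
</dependency>
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.68</version>
</dependency>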
Entity class
import com.alibaba.fastjson.annotation.JSONField;
import lombok.Getter;
import lombok.Setter;
import lombok.ToString;

@Getter
@Setter
@ToString
public class JobDetail {
    // The id is not serialized into the document source (it is used as the document _id instead)
    @JSONField(serialize = false)
    private long id;         // unique identifier
    private String area;     // region of the position
    private String exp;      // required work experience
    private String edu;      // education requirement
    private String salary;   // salary range
    private String job_type; // job type (full-time/part-time)
    private String cmp;      // company name
    private String pv;       // page views
    private String title;    // job title
    private String jd;       // job description
}
Service implementation: JobFullTextServiceImpl.java
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.apache.http.HttpHost;
import org.elasticsearch.action.delete.DeleteRequest;
import org.elasticsearch.action.get.GetRequest;
import org.elasticsearch.action.get.GetResponse;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.search.SearchScrollRequest;
import org.elasticsearch.action.update.UpdateRequest;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.common.text.Text;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.index.query.MultiMatchQueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.SearchHits;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightField;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class JobFullTextServiceImpl implements JobFullTextService {
    private RestHighLevelClient restHighLevelClient;
    private static final String JOB_IDX_NAME = "job_idx";

    public JobFullTextServiceImpl() {
        restHighLevelClient = new RestHighLevelClient(RestClient.builder(
                new HttpHost("node1.itcast.cn", 9200, "http")
                , new HttpHost("node2.itcast.cn", 9200, "http")
                , new HttpHost("node3.itcast.cn", 9200, "http")
        ));
    }
    @Override
    public void add(JobDetail jobDetail) {
        // 1. Build an IndexRequest describing the data sent to ES.
        IndexRequest indexRequest = new IndexRequest(JOB_IDX_NAME);
        // 2. Set the document ID.
        indexRequest.id(jobDetail.getId() + "");
        // 3. Serialize the entity object to JSON with FastJSON.
        String json = JSON.toJSONString(jobDetail);
        // 4. Attach the request body via IndexRequest.source.
        indexRequest.source(json, XContentType.JSON);
        try {
            // 5. Send the request with the high-level client's index method
            restHighLevelClient.index(indexRequest, RequestOptions.DEFAULT);
        } catch (IOException e) {
            e.printStackTrace();
        }
        System.out.println("Document indexed successfully!");
    }
    @Override
    public void update(JobDetail jobDetail) throws IOException {
        // 1. Check whether a document with this ID exists
        // a) Build a GetRequest
        GetRequest getRequest = new GetRequest(JOB_IDX_NAME, jobDetail.getId() + "");
        // b) Call the client's exists method to check for the document
        boolean exists = restHighLevelClient.exists(getRequest, RequestOptions.DEFAULT);
        if (!exists) return;
        // 2. Build the UpdateRequest
        UpdateRequest updateRequest = new UpdateRequest(JOB_IDX_NAME, jobDetail.getId() + "");
        // 3. Set the partial document on the UpdateRequest, as JSON
        updateRequest.doc(JSON.toJSONString(jobDetail), XContentType.JSON);
        // 4. Send the update request with the client
        restHighLevelClient.update(updateRequest, RequestOptions.DEFAULT);
    }
    @Override
    public JobDetail findById(long id) throws IOException {
        // 1. Build the GetRequest.
        GetRequest getRequest = new GetRequest(JOB_IDX_NAME, id + "");
        // 2. Send it with RestHighLevelClient.get and receive the server's response.
        GetResponse response = restHighLevelClient.get(getRequest, RequestOptions.DEFAULT);
        // 3. Extract the document source as a JSON string
        String json = response.getSourceAsString();
        // 4. Deserialize the JSON string into a JobDetail object with FastJSON
        JobDetail jobDetail = JSONObject.parseObject(json, JobDetail.class);
        // 5. Set the ID field (it is not part of the document source)
        jobDetail.setId(id);
        return jobDetail;
    }
    @Override
    public void deleteById(long id) throws IOException {
        // 1. Build the DeleteRequest
        DeleteRequest deleteRequest = new DeleteRequest(JOB_IDX_NAME, id + "");
        // 2. Send it with the client
        restHighLevelClient.delete(deleteRequest, RequestOptions.DEFAULT);
    }
    @Override
    public List<JobDetail> searchByKeywords(String keywords) throws IOException {
        // 1. Build the SearchRequest
        SearchRequest searchRequest = new SearchRequest(JOB_IDX_NAME);
        // 2. Create a SearchSourceBuilder to hold the query definition
        SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
        // 3. Build a multi_match query over the jd and title fields and set it on the builder
        MultiMatchQueryBuilder queryBuilder = QueryBuilders.multiMatchQuery(keywords, "jd", "title");
        searchSourceBuilder.query(queryBuilder);
        // 4. Attach the query definition to the request via SearchRequest.source
        searchRequest.source(searchSourceBuilder);
        // 5. Send the request with RestHighLevelClient.search
        SearchResponse searchResponse = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);
        // 6. Iterate over the results
        SearchHits hits = searchResponse.getHits();
        List<JobDetail> jobDetailList = new ArrayList<>();
        for (SearchHit hit : hits) {
            // 1) Get the source of the hit
            String json = hit.getSourceAsString();
            // 2) Deserialize the JSON string into an object
            JobDetail jobDetail = JSON.parseObject(json, JobDetail.class);
            // 3) Set the document ID from SearchHit.getId
            jobDetail.setId(Long.parseLong(hit.getId()));
            jobDetailList.add(jobDetail);
        }
        return jobDetailList;
    }
    @Override
    public Map<String, Object> searchByPage(String keywords, int pageNum, int pageSize) throws IOException {
        // 1. Build the SearchRequest
        SearchRequest searchRequest = new SearchRequest(JOB_IDX_NAME);
        // 2. Create a SearchSourceBuilder to hold the query definition
        SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
        // 3. Build a multi_match query over the jd and title fields and set it on the builder
        MultiMatchQueryBuilder queryBuilder = QueryBuilders.multiMatchQuery(keywords, "jd", "title");
        searchSourceBuilder.query(queryBuilder);
        // 4. Configure paging with from and size; from is a zero-based offset,
        //    so convert the 1-based page number into a start offset first
        searchSourceBuilder.from((pageNum - 1) * pageSize);
        searchSourceBuilder.size(pageSize);
        // 5. Attach the query definition to the request via SearchRequest.source
        searchRequest.source(searchSourceBuilder);
        // 6. Send the request with RestHighLevelClient.search
        SearchResponse searchResponse = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);
        // 7. Iterate over the results
        SearchHits hits = searchResponse.getHits();
        List<JobDetail> jobDetailList = new ArrayList<>();
        for (SearchHit hit : hits) {
            // 1) Get the source of the hit
            String json = hit.getSourceAsString();
            // 2) Deserialize the JSON string into an object
            JobDetail jobDetail = JSON.parseObject(json, JobDetail.class);
            // 3) Set the document ID from SearchHit.getId
            jobDetail.setId(Long.parseLong(hit.getId()));
            jobDetailList.add(jobDetail);
        }
        // 8. Wrap the results in a Map together with paging info
        // a) total -> total number of matching documents, from SearchHits.getTotalHits().value
        // b) content -> the documents on the current page
        Map<String, Object> result = new HashMap<>();
        result.put("total", hits.getTotalHits().value);
        result.put("content", jobDetailList);
        return result;
    }
    @Override
    public Map<String, Object> searchByScrollPage(String keywords, String scrollId, int pageSize) throws IOException {
        SearchResponse searchResponse = null;
        if (scrollId == null) {
            // 1. Build the SearchRequest
            SearchRequest searchRequest = new SearchRequest(JOB_IDX_NAME);
            // 2. Create a SearchSourceBuilder to hold the query definition
            SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
            // 3. Build a multi_match query over the jd and title fields and set it on the builder
            MultiMatchQueryBuilder queryBuilder = QueryBuilders.multiMatchQuery(keywords, "jd", "title");
            searchSourceBuilder.query(queryBuilder);
            searchSourceBuilder.size(pageSize);
            // Configure highlighting for the title and jd fields
            HighlightBuilder highlightBuilder = new HighlightBuilder();
            highlightBuilder.preTags("<font color='red'>");
            highlightBuilder.postTags("</font>");
            highlightBuilder.field("title");
            highlightBuilder.field("jd");
            searchSourceBuilder.highlighter(highlightBuilder);
            // 4. Set how long the scroll snapshot stays alive via SearchRequest.scroll
            searchRequest.scroll(TimeValue.timeValueMinutes(10));
            // 5. Attach the query definition to the request via SearchRequest.source
            searchRequest.source(searchSourceBuilder);
            // 6. Send the request with RestHighLevelClient.search
            searchResponse = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);
        } else {
            // Subsequent pages: continue the scroll with the scroll_id from the previous response
            SearchScrollRequest searchScrollRequest = new SearchScrollRequest(scrollId);
            searchScrollRequest.scroll(TimeValue.timeValueMinutes(10));
            searchResponse = restHighLevelClient.scroll(searchScrollRequest, RequestOptions.DEFAULT);
        }
        if (searchResponse != null) {
            // 7. Iterate over the results
            SearchHits hits = searchResponse.getHits();
            List<JobDetail> jobDetailList = new ArrayList<>();
            for (SearchHit hit : hits) {
                // 1) Get the source of the hit
                String json = hit.getSourceAsString();
                // 2) Deserialize the JSON string into an object
                JobDetail jobDetail = JSON.parseObject(json, JobDetail.class);
                // 3) Set the document ID from SearchHit.getId
                jobDetail.setId(Long.parseLong(hit.getId()));
                // 4) Get the highlight fields for title and jd
                Map<String, HighlightField> highlightFieldMap = hit.getHighlightFields();
                HighlightField titleHl = highlightFieldMap.get("title");
                HighlightField jdHl = highlightFieldMap.get("jd");
                // 5) Replace the plain fields with the highlighted ones: if a highlight
                //    exists, join its fragments and overwrite the original field value
                if (titleHl != null) {
                    Text[] fragments = titleHl.getFragments();
                    StringBuilder stringBuilder = new StringBuilder();
                    for (Text fragment : fragments) {
                        stringBuilder.append(fragment.string());
                    }
                    jobDetail.setTitle(stringBuilder.toString());
                }
                if (jdHl != null) {
                    Text[] fragments = jdHl.getFragments();
                    StringBuilder stringBuilder = new StringBuilder();
                    for (Text fragment : fragments) {
                        stringBuilder.append(fragment.string());
                    }
                    jobDetail.setJd(stringBuilder.toString());
                }
                jobDetailList.add(jobDetail);
            }
            // 8. Wrap the results in a Map together with the scroll cursor
            // a) scrollId -> cursor for fetching the next batch
            // b) content -> the documents in the current batch
            Map<String, Object> result = new HashMap<>();
            result.put("scrollId", searchResponse.getScrollId());
            result.put("content", jobDetailList);
            return result;
        }
        return null;
    }
    @Override
    public void close() {
        try {
            restHighLevelClient.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
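A minimal usage sketch of the service (JobSearchDemo is a made-up name; it assumes the JobFullTextService interface declares the methods implemented above and that the cluster from the constructor is reachable):
import java.io.IOException;
import java.util.List;

public class JobSearchDemo {
    public static void main(String[] args) throws IOException {
        JobFullTextServiceImpl service = new JobFullTextServiceImpl();
        // Index a document, then search it by keyword.
        JobDetail job = new JobDetail();
        job.setId(29097L);
        job.setTitle("java开发");
        job.setJd("完成java开发工作");
        service.add(job);
        // Note: with the default refresh_interval the new document only becomes
        // searchable after roughly one second (see section 11 below).
        List<JobDetail> hits = service.searchByKeywords("java");
        hits.forEach(System.out::println);
        service.close();
    }
}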
9. ES node types
The master node is mainly responsible for:
- Managing indices (creating and deleting them) and allocating shards
- Maintaining cluster metadata
- Tracking the state of cluster nodes
- It does not handle document writes or searches, so it is relatively lightweight
Data nodes are mainly responsible for:
- Document writes and searches; most of the load in an Elasticsearch cluster falls on data nodes
- In production, data nodes should be given generous memory
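Node roles are configured per node in elasticsearch.yml; a minimal sketch for a dedicated master-eligible node and a dedicated data node (settings as used in ES 7.x):
# elasticsearch.yml on a dedicated master-eligible node
node.master: true
node.data: false
node.ingest: false
# elasticsearch.yml on a dedicated data node
node.master: false
node.data: true
node.ingest: false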
10. Key Elasticsearch workflows
Document write path: the client sends a write to any node (the coordinating node), which hashes the document's routing key (the _id by default) to pick a primary shard; the primary indexes the document and forwards it to its replica shards before acknowledging the client.
Search path: a search runs in two phases. In the query phase, the coordinating node fans the request out to one copy of every shard and collects matching document IDs and sort values; in the fetch phase, it retrieves the full documents for the top hits and returns the merged result.
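The routing step in the write path follows a simple rule, sketched below for illustration only (pickShard is a made-up name, and the real implementation hashes with Murmur3 rather than String.hashCode):
// Which primary shard receives a document:
// shard = hash(routing) % number_of_primary_shards, where routing defaults to the _id.
static int pickShard(String routing, int numberOfPrimaryShards) {
    int hash = routing.hashCode(); // illustrative; ES actually uses a Murmur3 hash
    return Math.floorMod(hash, numberOfPrimaryShards);
}
This rule is also why the number of primary shards cannot be changed after an index is created: every document would hash to a different shard.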
11. How Elasticsearch achieves near-real-time indexing
- Refresh to the filesystem cache
  - When data is written to an ES shard, it first goes into an in-memory buffer; the buffer is then turned into a segment and written to the filesystem cache, at which point the data becomes searchable (note: it has not yet been flushed to disk)
  - By default ES refreshes once per second; a tuning sketch follows this list
- Write the translog for fault tolerance
  - While data sits in the in-memory buffer, it is also appended to the translog; if a failure occurs between refreshes, data is recovered by replaying the translog
  - Once the segments in the filesystem cache have been flushed to disk, the translog is cleared
- Flush to disk
  - By default ES flushes the filesystem cache to disk every 30 minutes
- Segment merging
  - When segments accumulate, ES periodically merges small segments into larger ones to reduce I/O during searches; it is at this stage that previously deleted documents are physically removed
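The refresh interval can be tuned per index; a minimal sketch with the high-level client (index name from the examples above; the 30s value is only an example):
import org.elasticsearch.action.admin.indices.settings.put.UpdateSettingsRequest;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.common.settings.Settings;

// Trade freshness for indexing throughput: refresh every 30s instead of every 1s.
UpdateSettingsRequest settingsRequest = new UpdateSettingsRequest("job_idx");
settingsRequest.settings(Settings.builder().put("index.refresh_interval", "30s"));
restHighLevelClient.indices().putSettings(settingsRequest, RequestOptions.DEFAULT);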
12. Elasticsearch SQL
Syntax:
SELECT select_expr [, ...]
[ FROM table_name ]
[ WHERE condition ]
[ GROUP BY grouping_element [, ...] ]
[ HAVING condition]
[ ORDER BY expression [ ASC | DESC ] [, ...] ]
[ LIMIT [ count ] ]
[ PIVOT ( aggregation_expr FOR column IN ( value [ [ AS ] alias ] [, ...] ) ) ]
Examples:
// Query job postings
GET /_sql?format=txt
{
  "query": "SELECT * FROM job_idx LIMIT 1"
}
// Translate SQL into Query DSL
GET /_sql/translate
{
  "query": "SELECT * FROM job_idx LIMIT 1"
}
// Cursor-based paging - first request (the response contains a cursor)
GET /_sql?format=json
{
  "query": "SELECT * FROM job_idx",
  "fetch_size": 10,
  "page_timeout": "10m"
}
// Cursor-based paging - subsequent requests (pass the cursor from the previous response)
GET /_sql?format=json
{
  "cursor": "xxxxx"
}
// Full-text match
GET /_sql?format=txt
{
  "query": "SELECT * FROM job_idx WHERE MATCH(title, 'hadoop') OR MATCH(jd, 'hadoop') LIMIT 10"
}
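The SQL endpoint can also be called from Java by going through the low-level REST client bundled with the high-level client; a minimal sketch (assumes the restHighLevelClient from the service class above; the 7.6 high-level client has no dedicated SQL API as far as these notes assume):
import org.apache.http.util.EntityUtils;
import org.elasticsearch.client.Request;
import org.elasticsearch.client.Response;

// Send a raw SQL query over HTTP and print the JSON response.
Request sqlRequest = new Request("POST", "/_sql?format=json");
sqlRequest.setJsonEntity("{\"query\": \"SELECT title, salary FROM job_idx LIMIT 5\"}");
Response sqlResponse = restHighLevelClient.getLowLevelClient().performRequest(sqlRequest);
System.out.println(EntityUtils.toString(sqlResponse.getEntity()));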
13. Beats
Beats is a family of lightweight data shippers that run on the servers being collected from and ship the specified data to ES or Logstash.
Using FileBeat:
1. Install (on the server being collected from):
wget https://artifacts.elastic.co/downloads/beats/filebeat/filebeat-7.6.1-linux-x86_64.tar.gz
tar -xvzf filebeat-7.6.1-linux-x86_64.tar.gz -C ../server/es/
2. Create a configuration file (on the server being collected from), e.g. filebeat_kafka_log.yml:
filebeat.inputs:
- type: log
  enabled: true
  paths:
    - /var/kafka/log/server.log.*
  multiline.pattern: '^\['
  multiline.negate: true
  multiline.match: after
output.elasticsearch:
  hosts: ["node1.itcast.cn:9200", "node2.itcast.cn:9200", "node3.itcast.cn:9200"]
3. Run (on the server being collected from):
./filebeat -c filebeat_kafka_log.yml -e
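Before shipping data for real, the configuration and the ES connection can be verified first with Filebeat's built-in test subcommands:
./filebeat test config -c filebeat_kafka_log.yml
./filebeat test output -c filebeat_kafka_log.yml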