一、基本操作过程
1、ik中文分词(两种模式)
ik_max_word:最细粒度分词,会穷尽各种可能的词语组合,因此索引占用更多存储空间,搜索耗时也更长
ik_smart:最粗粒度分词,按照一定的语法习惯拆分,词语之间不重叠,但有些细粒度的关键词可能没有被拆分出来而无法搜索到
请求:post http://localhost:9200/_analyze
参数:
{
"analyzer": "ik_smart",
"text": "经核实,我司从未与腾讯公司或授权他人与腾讯公司就“老干妈”品牌签署《联合市场推广合作协议》"
}
结果:
{
"tokens": [
{
"token": "经",
"start_offset": 0,
"end_offset": 1,
"type": "CN_CHAR",
"position": 0
},
{
"token": "核实",
"start_offset": 1,
"end_offset": 3,
"type": "CN_WORD",
"position": 1
},
{
"token": "我",
"start_offset": 4,
"end_offset": 5,
"type": "CN_CHAR",
"position": 2
},
{
"token": "司",
"start_offset": 5,
"end_offset": 6,
"type": "CN_CHAR",
"position": 3
},
{
"token": "从未",
"start_offset": 6,
"end_offset": 8,
"type": "CN_WORD",
"position": 4
},
{
"token": "与",
"start_offset": 8,
"end_offset": 9,
"type": "CN_CHAR",
"position": 5
},
{
"token": "腾讯",
"start_offset": 9,
"end_offset": 11,
"type": "CN_WORD",
"position": 6
},
{
"token": "公司",
"start_offset": 11,
"end_offset": 13,
"type": "CN_WORD",
"position": 7
},
{
"token": "或",
"start_offset": 13,
"end_offset": 14,
"type": "CN_CHAR",
"position": 8
},
{
"token": "授权",
"start_offset": 14,
"end_offset": 16,
"type": "CN_WORD",
"position": 9
},
{
"token": "他人",
"start_offset": 16,
"end_offset": 18,
"type": "CN_WORD",
"position": 10
},
{
"token": "与",
"start_offset": 18,
"end_offset": 19,
"type": "CN_CHAR",
"position": 11
},
{
"token": "腾讯",
"start_offset": 19,
"end_offset": 21,
"type": "CN_WORD",
"position": 12
},
{
"token": "公司",
"start_offset": 21,
"end_offset": 23,
"type": "CN_WORD",
"position": 13
},
{
"token": "就",
"start_offset": 23,
"end_offset": 24,
"type": "CN_CHAR",
"position": 14
},
{
"token": "老干妈",
"start_offset": 25,
"end_offset": 28,
"type": "CN_WORD",
"position": 15
},
{
"token": "品牌",
"start_offset": 29,
"end_offset": 31,
"type": "CN_WORD",
"position": 16
},
{
"token": "签署",
"start_offset": 31,
"end_offset": 33,
"type": "CN_WORD",
"position": 17
},
{
"token": "联合",
"start_offset": 34,
"end_offset": 36,
"type": "CN_WORD",
"position": 18
},
{
"token": "市场推广",
"start_offset": 36,
"end_offset": 40,
"type": "CN_WORD",
"position": 19
},
{
"token": "合作",
"start_offset": 40,
"end_offset": 42,
"type": "CN_WORD",
"position": 20
},
{
"token": "协议",
"start_offset": 42,
"end_offset": 44,
"type": "CN_WORD",
"position": 21
}
]
}
2、安装插件处理文档附件
./bin/elasticsearch-plugin install ingest-attachment
安装完毕并重启es后,即可通过该插件完成常用文档(word、pdf等)的文本解析
3、安装IK分词器插件
到官网下载
https://github.com/medcl/elasticsearch-analysis-ik/releases
然后解压到es的插件目录 es/plugins/ 下(插件版本需与es版本一致),重启es即可,目录结构如下:
[root@localhost plugins]# pwd
/opt/soft/es/elasticsearch-6.8.1/plugins
[root@localhost plugins]# ls
analysis-ik ingest-attachment
3、创建附件抽取处理器(管道)
管道类似于拦截器,能在文档写入索引之前对进入的数据做预处理。这里定义了两个处理器,其中data字段存放附件的base64编码内容,由插件负责解析。
attachment:解析附件中的文本
remove:删除data字段内容,因为我们只需要解析出的文本及其分词信息,不需要保留附件的原始内容。
put http://localhost:9200/_ingest/pipeline/attachment
输入参数:
{
"description": "解析附件文档",
"processors": [
{
"attachment": {"field": "data", "ignore_missing": true}
},
{
"remove": {"field": "data"}
}
]}
返回结果:
{
"acknowledged": true
}
4、创建索引(同时映射表结构)
创建索引:demo
put http://localhost:9200/demo
输入:
{
"settings":{
"number_of_shards": "6",
"number_of_replicas": "1",
"analysis":{
"analyzer":{
"ik":{
"tokenizer":"ik_smart"
}
}
}
},
"mappings": {
"book": {
"properties": {
"id": {
"type": "keyword"
},
"title": {
"type": "text",
"analyzer": "ik_smart"
},
"desc": {
"type": "text",
"analyzer": "ik_max_word"
},
"path": {
"type": "keyword"
},
"create_time": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis"
},
"update_time": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis"
},
"attachment": {
"properties": {
"content": {
"type": "text",
"analyzer": "ik_smart"
}
}
}
}
}
}
}
输出:
{
"acknowledged": true,
"shards_acknowledged": true,
"index": "demo"
}
5、写入测试数据
post http://localhost:9200/demo/book/1?pipeline=attachment
注意1:这里的URL一定要加上前面定义的pipeline参数,否则处理器不会拦截请求(即不会解析文件内容)
注意2:URL中的1是es文档的_id,可以自行指定,不指定时由es自动生成(此_id与业务数据中自己的id字段不是同一个概念)
输入参数:
{
"id":1,
"title":"免疫力是第一生命力",
"desc":"新冠疫情让我们对健康有了全新的认识,面对病毒,免疫力是最有效的药物;无数事实告诉我们,免疫力是生命力。本书将专业的免疫力知识用通俗易懂、轻松幽默的方式呈现给读者,尤其针对免疫系统的组成、免疫防御系统的作战法则、免疫系统的阴阳平衡、如何增强免疫力、生病后如何进行免疫调节等进行了详尽的阐释。",
"path":"d:/file/mianyili.doc",
"create_time":"2020-07-01 16:45:00",
"update_time":"2020-07-01 16:45:00",
"data":"文件流base64编码"
}
输出:
{
"_index": "demo",
"_type": "book",
"_id": "o2FJC3MBJt57Ks2xYp2o",
"_version": 1,
"result": "created",
"_shards": {
"total": 2,
"successful": 1,
"failed": 0
},
"_seq_no": 0,
"_primary_term": 1
}
6、查询测试
post http://localhost:9200/demo/book/_search?pretty=true
输入:
{
"query":{
"multi_match": {
"query":"免疫力 识别",
"fields":[ "title", "attachment.content" ]
}
}
}
输出:
{
"took": 72,
"timed_out": false,
"_shards": {
"total": 6,
"successful": 6,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.2876821,
"hits": [
{
"_index": "demo",
"_type": "book",
"_id": "o2FJC3MBJt57Ks2xYp2o",
"_score": 0.2876821,
"_source": {
"id": 1,
"title": "免疫力是第一生命力",
。。。}
整条记录都出来了,下面进一步优化
二、优化调整
1、查询结果中排除不要的字段
{
"_source":{
"include":[
"id",
"title",
"path","update_time","attachment.content"
],
"exclude":[
"data","desc"
]
},
"query":{
"multi_match": {
"query":"读者法则",
"fields":[ "title", "desc", "attachment.content" ]
}
}
}
2、查询组合
比如:id必须等于10,并且包含其他关键词的文章
{
"_source":{
"exclude":["data","attachment.content"]
},
"query":{
"bool":{
"must":[
{"multi_match":{"query":"账号 密码","fields":["title","desc","attachment.content"]} },
// 如果存在多个并列的搜索条件,可以添加多个 multi_match
//{"multi_match":{"query":"账号 密码","fields":["title","desc","attachment.content"]} },
{"match":{"id":"10"} }
]
}
}
}
3、添加高亮
在title、desc、attachment.content三个字段中命中关键词的记录,返回结果里都会多出一个"highlight"属性,其中包含带高亮标签的命中片段。
{
"_source":{
"exclude":["data","attachment.content"]
},
"query":{
"bool":{
"must":[
{"multi_match":{"query":"账号 密码","fields":["title","desc","attachment.content"]} },
{"match":{"id":"10"} }
]
}
},
"from":0, "size":10,
"highlight": {
"pre_tags": [
"<em class=\"c_color\">"
],
"post_tags": [
"</em>"
],
"fields": {
"title": {}, "desc":{}, "attachment.content":{}
}
}
}
输出中多出高亮属性:
"highlight": {
"attachment.content": [
"访问相关文档 \n\n如果您安装了 DM数据库,可在<em class=\"c_color\">安装</em>目录的“\\doc”子目录中找到 DM数据库的各种手册\n\n与技术丛书。",
"在<em class=\"c_color\">安装</em> DM的过程中,用户可以选择是否创建初始\n\n数据库。如果当时没有创建,那么在<em class=\"c_color\">安装</em>完成之后,可以利用创建数据库工具 dminit来创\n\n建。",
"找到 dminit所在<em class=\"c_color\">安装</em>目录/bin,输入 dminit和参\n\n数后回车。参数在下一节详细介绍。"
],
"desc": [
"在<em class=\"c_color\">安装</em> DM 的过程中,用户可以选择是否创建初始数据库。如果当时没有创建,那么在<em class=\"c_color\">安装</em>完成之后,可以利用创建数据库工具 dminit 来创建。",
"该工具位于<em class=\"c_color\">安装</em>目录的/bin 目录下"
]
}
4、java客户端调用
5、按照id搜索
"query":{
"constant_score" : {
"filter" : {
"term" : {
"_id" : "3"
}
}
}
}
或者
"query":{
"term" : {"_id":"3"}
}
输出:
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 6,
"successful": 6,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "news",
"_type": "_doc",
"_id": "3",
"_score": 1,
"_source": {
"path": "http://localhost:80/test/3",
"update_time": "2020-07-27 10:52:36",
"create_time": "2020-07-27 10:52:36",
"attachment": {
"content_type": "text/plain; charset=UTF-8",
"language": "lt",
"content_length": 2
},
"user_id": "0d6c0b94ae634bc88378a6d0240149c7",
"user_name": "demo",
"org_id": "b1b4a329bed74fe1b352820c3b1a50d5",
"id": "3",等等。
6、组合查询
{
"from":0,
"size":50,
"query":{
"bool":{
"must":[
{"term":{"xm_id":"xm001"}},
{"term":{"file_id":"file002"}},
{"multi_match":{"query":"科技","fields":["content"]}},
{"multi_match":{"query":"经济","fields":["content"]}}
]
}
}
}