ES原理简单介绍:
- 倒排索引(反向索引) ==> 根据内容的切词token指向整句的内容
- 分析 ==> 分词、词语标准化
- 相关性排序 ==> 检索词频率、反向文档频率、字段长度准则
基本查询 (不安全 弃用)
- 根据文档ID
curl -X GET 127.0.0.1:9200/articles/article/1
curl -X GET 127.0.0.1:9200/articles/article/1?_source=title,user_id
curl -X GET 127.0.0.1:9200/articles/article/1?_source=false
- 查询所有
curl -X GET 127.0.0.1:9200/articles/article/_search?_source=title,user_id\&pretty
- 分页
# from: 起始
# size: 每页数量
curl -X GET 127.0.0.1:9200/articles/article/_search?_source=title,user_id\&size=3
curl -X GET 127.0.0.1:9200/articles/article/_search?_source=title,user_id\&size=3\&from=10
- 全文检索
curl -X GET 127.0.0.1:9200/articles/article/_search?q=content:python%20web\&_source=title,article_id\&pretty
curl -X GET 127.0.0.1:9200/articles/article/_search?q=title:python%20web,content:python%20web\&_source=title,article_id\&pretty
curl -X GET 127.0.0.1:9200/articles/article/_search?q=_all:python%20web\&_source=title,article_id\&pretty
Query DSL (请求体查询)
match | 分词匹配 |
match_all | 匹配所有文档 。默认的查询方式 |
multi_match | 在多个字段上执行相同的 match 查询 |
range | 范围查询 (gt:大于、gte:大于等于、lt:小于、lte:小于等于) |
term | 精确匹配 |
missing | IS_NULL |
exists | NOT IS_NULL |
- 全文检索 match
curl -X GET 127.0.0.1:9200/articles/article/_search -d '
{
"query":{
"match":{
"title": "python web"
}
}
}'
curl -X GET 127.0.0.1:9200/articles/article/_search -d '
{
"from": 0,
"size": 5,
"query":{
"match":{
"title": "python web"
}
}
}'
- 短语搜索 match_phrase
curl -X GET 127.0.0.1:9200/articles/article/_search?pretty -d'
{
"size": 5,
"_source": ["article_id","title"],
"query" : {
"match_phrase" : {
"_all" : "python web"
}
}
}'
- 精确查找 term
curl -X GET 127.0.0.1:9200/articles/article/_search?pretty -d'
{
"size": 5,
"_source": ["article_id","title", "user_id"],
"query" : {
"term" : {
"user_id" : 1
}
}
}'
- 范围查找 range
curl -X GET 127.0.0.1:9200/articles/article/_search?pretty -d'
{
"size": 5,
"_source": ["article_id","title", "user_id"],
"query" : {
"range" : {
"article_id": {
"gte": 3,
"lte": 5
}
}
}
}'
- 高亮搜索 highlight
curl -X GET 127.0.0.1:9200/articles/article/_search?pretty -d '
{
"size":2,
"_source": ["article_id", "title", "user_id"],
"query": {
"match": {
"title": "python web 编程"
}
},
"highlight":{
"fields": {
"title": {}
}
}
}
'
组合查询
must | 匹配 |
must_not | 不 匹配 |
should | 满足任意一条即可,会增加_score,用于修正得分 |
filter | 必须 匹配,不评分过滤器 |
curl -X GET 127.0.0.1:9200/articles/article/_search?pretty -d '
{
"_source": ["title", "user_id"],
"query": {
"bool": {
"must": {
"match": {
"title": "python web"
}
},
"filter": {
"term": {
"user_id": 2
}
}
}
}
}
'
- 排序
curl -X GET 127.0.0.1:9200/articles/article/_search?pretty -d'
{
"size": 5,
"_source": ["article_id","title"],
"query" : {
"match" : {
"_all" : "python web"
}
},
"sort": [
{ "create_time": { "order": "desc" }},
{ "_score": { "order": "desc" }}
]
}'
- boost 提升权重,优化排序
curl -X GET 127.0.0.1:9200/articles/article/_search?pretty -d'
{
"size": 5,
"_source": ["article_id","title"],
"query" : {
"match" : {
"title" : {
"query": "python web",
"boost": 4
}
}
}
}'
suggest查询详解
es为开发者提供了四种suggest模式:
- Term suggester
- Phrase Suggester
- completion suggester
- context suggester
Term Suggester 示例
只基于analyze过的单个term去提供建议,并不会考虑多个term之间的关系。API调用方只需为每个token挑选options里的词,组合在一起返回给用户前端即可。
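(注:下面的请求体使用的是 phrase 类型的 suggester)
curl -X GET 127.0.0.1:9200/articles/article/_search?pretty -d '
{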
"from": 0,
"size": 10,
"_source": false,
"suggest": {
"text": "phtyon web",
"word-phrase": {
"phrase": {
"field": "_all",
"size": 1
}
}
}
}'
Phrase Suggester 示例
Phrase suggester在Term suggester的基础上,会考量多个term之间的关系,比如是否同时出现在索引的原文里,相邻程度,以及词频等等。
Completion Suggester 示例
最后来谈一下Completion Suggester,它主要针对的应用场景就是"Auto Completion"。 此场景下用户每输入一个字符的时候,就需要即时发送一次查询请求到后端查找匹配项,在用户输入速度较高的情况下对后端响应速度要求比较苛刻。因此实现上它和前面两个Suggester采用了不同的数据结构,索引并非通过倒排来完成,而是将analyze过的数据编码成FST和索引一起存放。对于一个open状态的索引,FST会被ES整个装载到内存里,进行前缀查找速度极快。但是FST只能用于前缀查找。
{
"suggest": {
"title-suggest" : {
"prefix" : "pyth",
"completion" : {
"field" : "suggest"
}
}
}
}
Python实践
pip install elasticsearch5
from elasticsearch5 import Elasticsearch
# Addresses of the Elasticsearch cluster servers
ES = ['127.0.0.1:9200']

# Create the Elasticsearch client
es = Elasticsearch(
    ES,
    sniff_on_start=True,  # sniff the cluster servers before starting
    sniff_on_connection_fail=True,  # refresh node info when a node connection fails
    sniffer_timeout=60,  # refresh node info every 60 seconds
)
# Search usage example: a bool query that must match 'python web' on the
# `_all` field and is filtered (without scoring) to documents whose
# `status` term equals 2.
query = {
    'query': {
        'bool': {
            'must': [{'match': {'_all': 'python web'}}],
            'filter': [{'term': {'status': 2}}],
        },
    },
}
ret = es.search(index='articles', doc_type='article', body=query)
# 接口视图实现 search.py
from flask_restful import Resource
from flask_restful.reqparse import RequestParser
from flask_restful import inputs
from flask import g, current_app
from redis.exceptions import RedisError
from . import constants
from cache import article as cache_article
from cache import user as cache_user
from models.user import Search
from models import db
class SearchResource(Resource):
    """
    Search results.
    """

    def get(self):
        """
        Return one page of article search results.

        Query-string arguments:
            q: search keywords, 1 to 50 characters (required).
            page: positive page number; defaults to 1 when omitted.
            per_page: page size, validated against
                constants.DEFAULT_SEARCH_PER_PAGE_MIN and
                constants.DEFAULT_SEARCH_PER_PAGE_MAX; falls back to the
                minimum when omitted.

        Returns:
            dict with the keys ``total_count``, ``page``, ``per_page`` and
            ``results``.
        """
        parser = RequestParser()
        parser.add_argument('q', type=inputs.regex(r'^.{1,50}$'), required=True, location='args')
        parser.add_argument('page', type=inputs.positive, required=False, location='args')
        parser.add_argument(
            'per_page',
            type=inputs.int_range(
                constants.DEFAULT_SEARCH_PER_PAGE_MIN,
                constants.DEFAULT_SEARCH_PER_PAGE_MAX,
                'per_page',
            ),
            required=False,
            location='args',
        )
        args = parser.parse_args()

        q = args.q
        page = args.page if args.page is not None else 1
        per_page = args.per_page or constants.DEFAULT_SEARCH_PER_PAGE_MIN

        # Search from Elasticsearch. `_source` is disabled because only the
        # document ids of the hits are read below.
        query = {
            'from': (page - 1) * per_page,
            'size': per_page,
            '_source': False,
            'query': {
                'bool': {
                    'must': [{'match': {'_all': q}}],
                    'filter': [{'term': {'status': 2}}],
                },
            },
        }
        ret = current_app.es.search(index='articles', doc_type='article', body=query)
        hits_section = ret['hits']
        total_count = hits_section['total']

        # Resolve each hit id through the article cache; hits for which the
        # cache returns a falsy value are left out of the page.
        results = []
        for hit in hits_section['hits']:
            article = cache_article.ArticleInfoCache(int(hit['_id'])).get()
            if article:
                results.append(article)

        # Record user search history: only when g.user_id is truthy and the
        # first page is requested. A Redis failure is logged, not raised.
        if g.user_id and page == 1:
            try:
                cache_user.UserSearchingHistoryStorage(g.user_id).save(q)
            except RedisError as e:
                current_app.logger.error(e)

        return {
            'total_count': total_count,
            'page': page,
            'per_page': per_page,
            'results': results,
        }
# constants.py
# Default number of search results per page: lower bound
DEFAULT_SEARCH_PER_PAGE_MIN = 10
# Default number of search results per page: upper bound
DEFAULT_SEARCH_PER_PAGE_MAX = 50
# Add the index data for a new article to Elasticsearch.
# The document is stored under the article's own id.
doc = {
    'article_id': article.id,
    'user_id': article.user_id,
    'title': article.title,
    'content': article.content.content,
    'status': article.status,
    'create_time': article.ctime,
}
current_app.es.index(
    index='articles',
    doc_type='article',
    id=article.id,
    body=doc,
)
建议查询python 代码
class SuggestionResource(Resource):
    """
    Search suggestions.
    """

    def get(self):
        """
        Return suggestion texts for the query-string argument ``q``.

        A completion-suggester query against the ``completions`` index is
        tried first. When it yields no options, a phrase-suggester
        (correction) query against the ``articles`` index is used instead.

        Returns:
            dict ``{'options': [...]}`` holding the option texts with
            duplicates removed, in the order they were returned.
        """
        parser = RequestParser()
        parser.add_argument('q', type=inputs.regex(r'^.{1,50}$'), required=True, location='args')
        q = parser.parse_args().q

        # First try the auto-completion suggestion query
        completion_query = {
            'from': 0,
            'size': 10,
            '_source': False,
            'suggest': {
                'word-completion': {
                    'prefix': q,
                    'completion': {'field': 'suggest'},
                },
            },
        }
        resp = current_app.es.search(index='completions', body=completion_query)
        options = resp['suggest']['word-completion'][0]['options']

        # If nothing came back, fall back to the correction suggestion query
        if not options:
            phrase_query = {
                'from': 0,
                'size': 10,
                '_source': False,
                'suggest': {
                    'text': q,
                    'word-phrase': {
                        'phrase': {'field': '_all', 'size': 1},
                    },
                },
            }
            resp = current_app.es.search(index='articles', doc_type='article', body=phrase_query)
            options = resp['suggest']['word-phrase'][0]['options']

        # Keep the first occurrence of each suggestion text, preserving order
        texts = []
        for option in options:
            text = option['text']
            if text not in texts:
                texts.append(text)

        return {'options': texts}