1. 分词
1.1 Analysis与Analyzer
- Analysis: 文本分析是把全文本转换成一系列单词(term/token)的过程,也叫分词
- Analysis是通过Analyzer来实现的。可使用ElasticSearch内置分析器或按需定制化分析器
- 除了在数据写入时转换词条,匹配Query语句时候也需要相同的分析器对查询语句进行分析
1.2 Analyzer组成
分词器Analyzer由三部分组成
- Character Filters(针对原始文本处理,例如去除html)
- Tokenizer(按照规则切分单词)
- Token Filter(将切分的单词进行加工,小写,删除stopwords,增加同义词)
1.3 ElasticSearch内置分词器
- Standard Analyzer - 默认分词器,按词切分,小写处理
- Keyword Analyzer - 不分词,直接将输入当作输出
- Custom Analyzer - 自定义分词器
- Simple Analyzer/Stop Analyzer/Whitespace Analyzer/Pattern Analyzer/Language
1.4 使用_analyze API
# 直接使用analyzer进行测试
GET _analyze
{
"analyzer": "standard",
"text": "I Love China"
}
# 指定索引的字段进行测试
POST books/_analyze
{
"field": "title",
"text": "I Love China"
}
# 自定义分词进行测试
POST _analyze
{
"tokenizer": "standard",
"filter": ["lowercase"],
"text": "I Love China"
}
# 结果
{
"tokens" : [
{
"token" : "i",
"start_offset" : 0,
"end_offset" : 1,
"type" : "<ALPHANUM>",
"position" : 0
},
{
"token" : "love",
"start_offset" : 2,
"end_offset" : 6,
"type" : "<ALPHANUM>",
"position" : 1
},
{
"token" : "china",
"start_offset" : 7,
"end_offset" : 12,
"type" : "<ALPHANUM>",
"position" : 2
}
]
}
1.5 中文分词IK安装与使用
下载对应版本分词器https://github.com/medcl/elasticsearch-analysis-ik/releases
解压至ES安装目录plugins下并重命名为ik,重启ES即可
对于docker-compose方式部署,可参考https://gitee.com/SJshenjian/blog-code/tree/master/src/main/java/online/shenjian/es并按照docker-compose.yml中挂载plugins目录重启即可
验证中文分词ik
# 最粗粒度分词
GET _analyze
{
"analyzer": "ik_smart",
"text": "我爱中华人民共和国"
}
{
"tokens" : [
{
"token" : "我",
"start_offset" : 0,
"end_offset" : 1,
"type" : "CN_CHAR",
"position" : 0
},
{
"token" : "爱",
"start_offset" : 1,
"end_offset" : 2,
"type" : "CN_CHAR",
"position" : 1
},
{
"token" : "中华人民共和国",
"start_offset" : 2,
"end_offset" : 9,
"type" : "CN_WORD",
"position" : 2
}
]
}
# 最细粒度分词
GET _analyze
{
"analyzer": "ik_max_word",
"text": "我爱中华人民共和国"
}
{
"tokens" : [
{
"token" : "我",
"start_offset" : 0,
"end_offset" : 1,
"type" : "CN_CHAR",
"position" : 0
},
{
"token" : "爱",
"start_offset" : 1,
"end_offset" : 2,
"type" : "CN_CHAR",
"position" : 1
},
{
"token" : "中华人民共和国",
"start_offset" : 2,
"end_offset" : 9,
"type" : "CN_WORD",
"position" : 2
},
{
"token" : "中华人民",
"start_offset" : 2,
"end_offset" : 6,
"type" : "CN_WORD",
"position" : 3
},
{
"token" : "中华",
"start_offset" : 2,
"end_offset" : 4,
"type" : "CN_WORD",
"position" : 4
},
{
"token" : "华人",
"start_offset" : 3,
"end_offset" : 5,
"type" : "CN_WORD",
"position" : 5
},
{
"token" : "人民共和国",
"start_offset" : 4,
"end_offset" : 9,
"type" : "CN_WORD",
"position" : 6
},
{
"token" : "人民",
"start_offset" : 4,
"end_offset" : 6,
"type" : "CN_WORD",
"position" : 7
},
{
"token" : "共和国",
"start_offset" : 6,
"end_offset" : 9,
"type" : "CN_WORD",
"position" : 8
},
{
"token" : "共和",
"start_offset" : 6,
"end_offset" : 8,
"type" : "CN_WORD",
"position" : 9
},
{
"token" : "国",
"start_offset" : 8,
"end_offset" : 9,
"type" : "CN_CHAR",
"position" : 10
}
]
}
2. 索引模板与索引
2.1 索引模板创建索引
可以通过kibana工具进行创建索引模板
也可以自定义语句,如创建poi索引模板
POST _index_template/poi
{
"index_patterns": ["poi*"],
"template" : {
"settings" : {
"index" : {
"number_of_shards" : "1",
"number_of_replicas" : "1"
}
},
"mappings" : {
"dynamic": "strict", // 严格模式,不允许动态创建字段
"properties" : {
"city" : {
"type" : "keyword"
},
"region" : {
"type" : "keyword"
},
"name" : {
"type" : "text"
},
"location" : {
"type" : "geo_point" // 地址坐标点类型,可以进行范围搜索
}
}
}
}
}
# 查看创建的索引模板
GET _index_template/poi
# 创建索引
PUT poi
# 查看索引
GET poi
2.2 索引新增字段及重建
# 索引poi新增创建时间字段
PUT poi/_mapping
{
"properties": {
"c_date": {
"type": "date"
}
}
}
# 当想对name进行中文分词时,需要重建索引,然后修改模板,删除旧索引,再重建回来,如下步骤
POST _reindex
{
"source": {
"index": "poi"
},
"dest": {
"index": "poi_bak"
}
}
POST _index_template/poi
{
"index_patterns": ["poi*"],
"template" : {
"settings" : {
"index" : {
"number_of_shards" : "1",
"number_of_replicas" : "1"
}
},
"mappings" : {
"dynamic": "strict",
"properties" : {
"city" : {
"type" : "keyword"
},
"region" : {
"type" : "keyword"
},
"name" : {
"type" : "text",
"analyzer": "ik_smart" // 新增分词
},
"location" : {
"type" : "geo_point"
}
}
}
}
}
DELETE poi
PUT poi
POST _reindex
{
"source": {
"index": "poi_bak"
},
"dest": {
"index": "poi"
}
}
// 如果想修改字段名,可在重建时进行修改
POST _reindex?wait_for_completion=false
{
"source": {
"index": "poi_bak"
},
"dest": {
"index": "poi"
},
"script": {
"source": "ctx._source.paiMaiType=ctx._source.remove(\"zcType\")"
}
}
DELETE poi_bak
2.3 远程索引同步
# 远程索引同步至本地,conflicts=proceed遇到错误忽略继续执行
POST _reindex?wait_for_completion=false
{
"source": {
"remote": {
"host": "http://192.168.0.XX:9200",
"socket_timeout": "30s",
"connect_timeout": "30s",
"username": "xxxx",
"password": "xxxx"
},
"index": "poi",
"size": 1000,
"query": {
"match_all": {}
}
},
"dest": {
"index": "poi"
}
}
3. 聚合
3.1 聚合分类
- Bucket Aggregation:系列满足特定条件的文档的集合,如term range
GET kibana_sample_data_flights/_search
{
"size": 0, // 只显示聚合结果
"aggs": {
"flight_dest": {
"terms": {
"field": "DestCountry" // 对该字段聚合
}
}
}
}
// 结果示例
{
"aggregations" : {
"flight_dest" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 3187,
"buckets" : [
{
"key" : "IT",
"doc_count" : 2371
},
{
"key" : "US",
"doc_count" : 1987
}
]
}
}
}
- Metric Aggregation:一些数学运算,可以对文档字段进行统计分析, 如min max sum avg
GET kibana_sample_data_flights/_search
{
"size": 0,
"aggs": {
"avg_price": {
"avg": {
"field": "AvgTicketPrice"
}
},
"max_price": {
"max": {
"field": "AvgTicketPrice"
}
}
}
}
}
// 结果示例
{
"took" : 5,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 10000,
"relation" : "gte"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"max_price" : {
"value" : 628.2536888148849
},
"avg_price" : {
"value" : 628.2536888148849
}
}
}
GET kibana_sample_data_flights/_search
{
"size": 0,
"aggs": {
"flight_dest": {
"terms": {
"field": "DestCountry"
},
"aggs": { // 嵌套聚合出目的地的天气与票价信息
"stat": {
"stats": {
"field": "AvgTicketPrice"
}
},
"weather": {
"terms": {
"field": "DestWeather"
}
}
}
}
}
}
// 结果示例
{
"aggregations" : {
"flight_dest" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 3187,
"buckets" : [
{
"key" : "IT",
"doc_count" : 2371,
"stat" : {
"count" : 2371,
"min" : 100.57646942138672,
"max" : 1195.3363037109375,
"avg" : 586.9627099618385,
"sum" : 1391688.585319519
},
"weather" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Clear",
"doc_count" : 428
},
{
"key" : "Thunder & Lightning",
"doc_count" : 164
}
]
}
}
]
}
}
}
- Pipeline Aggregation: 对其他的聚合结果进行二次聚合
- Matrix Aggregation: 支持对多个字段的操作并提供一个结果矩阵
欢迎关注公众号算法小生或沈健的技术博客shenjian.online