想要为每个token添加负载信息,例如标注词性
- elasticsearch中Token Filter:delimited_payload(旧版名称为delimited_payload_filter,已弃用)
- 注意事项:所选的tokenizer不能把delimited_payload中配置的分界符(delimiter)切分掉或过滤掉,否则负载信息无法解析
- 实现:
PUT /20181105
{
"settings": {
"analysis": {
"filter": {
"payloads":{
"type":"delimited_payload",
"encoding":"int",
"delimiter":"|"
}
},
"analyzer": {
"payloads":{
"tokenizer":"whitespace",
"filter":["payloads"],
"char_filter":[]
}
}
}
},
"mappings": {
"doc":{
"properties": {
"text":{
"type": "text",
"analyzer": "payloads"
}
}
}
}
}
GET 20181105/_analyze
{
"explain": true,
"analyzer": "payloads",
"text": "the|1 Quick|2 fox|3"
}
注:需要 "explain": true 才会返回下面按 tokenfilters 分组的详细结果(含 bytes、payload 等属性)
"tokenfilters": [
{
"name": "payloads",
"tokens": [
{
"token": "the",
"start_offset": 0,
"end_offset": 5,
"type": "word",
"position": 0,
"bytes": "[74 68 65]",
"payload": "[0 0 0 1]",
"positionLength": 1,
"termFrequency": 1
},
{
"token": "Quick",
"start_offset": 6,
"end_offset": 13,
"type": "word",
"position": 1,
"bytes": "[51 75 69 63 6b]",
"payload": "[0 0 0 2]",
"positionLength": 1,
"termFrequency": 1
},
{
"token": "fox",
"start_offset": 14,
"end_offset": 19,
"type": "word",
"position": 2,
"bytes": "[66 6f 78]",
"payload": "[0 0 0 3]",
"positionLength": 1,
"termFrequency": 1
}
]
}
]
}
}