ES使用记录
import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import argparse
class ElasticObj:
    """Build an Elasticsearch index and bulk-import passages into it."""

    def __init__(self, index_name, index_type, ip="127.0.0.1"):
        """Remember the index settings and create the client.

        Args:
            index_name: Name of the index that is created and written to.
            index_type: Value sent as ``_type`` on every bulk action.
            ip: Host handed to ``Elasticsearch``.
        """
        self.index_name = index_name
        self.index_type = index_type
        self.es = Elasticsearch([ip])

    def _read_actions(self, csvfile):
        """Parse ``csvfile`` into bulk actions.

        Each line is split on tabs; column 0 becomes ``docid`` and column 3,
        stripped and with every space removed, becomes ``passage``. Lines
        with fewer than four columns (blank lines included) are skipped
        instead of aborting the whole import with an ``IndexError``.

        Args:
            csvfile: Path of a UTF-8, tab-separated text file.

        Returns:
            ``(actions, line_count)`` where ``actions`` is the list of bulk
            action dicts and ``line_count`` is the number of lines read.
        """
        actions = []
        line_count = 0
        # The context manager closes the file even if parsing fails.
        with open(csvfile, 'r', encoding='utf8') as handle:
            for raw_line in handle:
                line_count += 1
                fields = raw_line.split('\t')
                if len(fields) < 4:
                    continue
                actions.append({
                    "_index": self.index_name,
                    "_type": self.index_type,
                    "_source": {
                        "docid": fields[0],
                        "passage": fields[3].strip().replace(' ', ''),
                    },
                })
        return actions, line_count

    def bulk_Index_Data(self, csvfile):
        """Store the rows of ``csvfile`` in Elasticsearch with one bulk call.

        Prints the number of lines read, the number of documents queued for
        indexing, and the number of actions the bulk helper reports as
        performed.

        Args:
            csvfile: Path of a UTF-8, tab-separated text file; see
                ``_read_actions`` for the column layout.
        """
        ACTIONS, line_count = self._read_actions(csvfile)
        print(line_count)
        print('index_num:', len(ACTIONS))
        skipped = line_count - len(ACTIONS)
        if skipped:
            print('skipped malformed lines:', skipped)
        success, _ = bulk(self.es, ACTIONS, index=self.index_name, raise_on_error=True, request_timeout=1000)
        print('Performed %d actions' % success)

    def create_index(self, index_name, index_type):
        """Create the index named ``self.index_name`` unless it already exists.

        The mapping declares ``passage`` as text analyzed and searched with
        ``ik_max_word`` and ``docid`` as text.

        Args:
            index_name: Unused; the index name is taken from the instance.
                Kept so existing callers keep working.
            index_type: Unused; kept for the same reason.
        """
        _index_mappings = {
            "mappings": {
                "properties": {
                    "passage": {
                        "type": "text",
                        # NOTE(review): presumably supplied by the IK
                        # analysis plugin - confirm it is installed.
                        "analyzer": "ik_max_word",
                        "search_analyzer": "ik_max_word"
                    },
                    "docid": {
                        "type": "text"
                    }
                }
            }
        }
        print("begin construct index")
        # Test truthiness rather than identity with True: a client that
        # returns a response object instead of a literal bool would
        # otherwise always look like "index missing".
        if not self.es.indices.exists(index=self.index_name):
            res = self.es.indices.create(index=self.index_name, body=_index_mappings)
            print(res)
        else:
            print("exists")
if __name__ == "__main__":
    # Indexing script: connect, make sure the index exists, then load the
    # document file into it.
    index_args = ("passage", "_doc")
    indexer = ElasticObj(*index_args)
    print("init done!")
    indexer.create_index(*index_args)
    print("create done!")
    indexer.bulk_Index_Data("./datasets/law/new_doc.txt")
import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
class ElasticObj:
    """Elasticsearch index wrapper used to retrieve passages."""

    def __init__(self, index_name, index_type, ip="127.0.0.1"):
        """Remember the index settings and create the client.

        Args:
            index_name: Name of the index to search.
            index_type: Value passed as ``doc_type`` on every search.
            ip: Host handed to ``Elasticsearch``.
        """
        self.index_name = index_name
        self.index_type = index_type
        self.es = Elasticsearch([ip])

    @staticmethod
    def _collect_answers(response):
        """Convert a search response into ``(passage[:20], docid)`` tuples.

        Args:
            response: Mapping with the hits under ``['hits']['hits']``, each
                hit carrying ``passage`` and ``docid`` in ``_source``.

        Returns:
            One tuple per hit, in response order; the passage is cut to its
            first 20 characters.
        """
        return [
            (hit['_source']['passage'][:20], hit['_source']['docid'])
            for hit in response['hits']['hits']
        ]

    def Get_Data_By_Body(self, question, k):
        """Retrieve documents whose ``passage`` field matches ``question``.

        Args:
            question: Text used in the ``match`` query on ``passage``.
            k: Maximum number of hits requested (the query ``size``).

        Returns:
            A list of ``(passage[:20], docid)`` tuples, or ``None`` when the
            search or the parsing of its response fails; in that case the
            question and the error are printed.
        """
        doc = {
            "size": k,
            "query": {
                "match": {
                    "passage": question
                }
            }
        }
        try:
            # NOTE(review): ``doc_type`` is deprecated in the 7.x client and
            # rejected by 8.x - confirm the installed client version.
            _searched = self.es.search(index=self.index_name, doc_type=self.index_type, body=doc)
            return self._collect_answers(_searched)
        except Exception as err:
            # ``Exception`` rather than a bare ``except`` so Ctrl-C and
            # interpreter exit are not swallowed; still best-effort.
            print('search not exist')
            print(question)
            print('search error:', repr(err))
            return None
def search(query, topk):
    """Search the "passage" index for ``query`` and return up to ``topk`` hits.

    A new ``ElasticObj`` is created on every call; the return value is
    whatever its ``Get_Data_By_Body`` returns.
    """
    return ElasticObj("passage", "_doc").Get_Data_By_Body(query, topk)
if __name__ == "__main__":
    # Search script: run an empty query and print up to three results.
    print(search('', 3))
# List all indices; the URL is quoted so the shell does not treat "?" as a glob.
curl 'localhost:9200/_cat/indices?v'