一.数据源:es的数据源可以是数据库,文档, json文件,以及爬虫爬取的
二.前提准备:
1.安装es,详细请参考 es安装教程
2.安装kibana,详细请参考 kibana安装教程
3.安装ik分词库,详细请参考https://blog.csdn.net/weixin_44062339/article/details/85006948
1.首先在es目录的plugins 文件夹下创建ik文件夹
2.拷贝ik的压缩包到文件夹中
3.解压ik压缩包到当前文件夹(切记不要解压到其他,只选中解压到当前文件夹,不然es会报错)
# coding:utf8
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import json
import time
class ElasticObj:
    """Thin wrapper around an Elasticsearch client, bound to one index/type pair.

    All methods print their results rather than returning them (script-style
    helper class). Requires the ``elasticsearch`` client package and, for
    Chinese full-text search, the ik analyzer plugin installed on the node.
    """

    def __init__(self, index_name, index_type, ip):
        """
        :param index_name: name of the index this object operates on
        :param index_type: document type (mapping type) within the index
        :param ip: host of the Elasticsearch node
        """
        self.index_name = index_name
        self.index_type = index_type
        # Connection without authentication.
        self.es = Elasticsearch([ip])
        # Connection with username/password authentication:
        # self.es = Elasticsearch([ip], http_auth=('elastic', 'password'), port=9200)

    def create_index(self):
        """Create the index with an ik_max_word-analyzed mapping, if it does not already exist.

        'substance' and 'title' are full-text fields analyzed with ik_max_word;
        'id' is stored but not indexed.
        """
        _index_mappings = {
            "mappings": {
                self.index_type: {
                    "properties": {
                        "id": {
                            "type": "long",
                            "index": "false"
                        },
                        "substance": {
                            "type": "text",
                            "index": True,
                            'analyzer': 'ik_max_word',
                            'search_analyzer': 'ik_max_word'
                        },
                        "title": {
                            'type': 'text',
                            "index": True,
                            'analyzer': 'ik_max_word',
                            'search_analyzer': 'ik_max_word'
                        }
                    }
                }
            }
        }
        if not self.es.indices.exists(index=self.index_name):
            res = self.es.indices.create(index=self.index_name, body=_index_mappings, ignore=400)
            print(res)

    def insert_data(self, path1):
        """Bulk-load documents into the index from a JSON file.

        The file is expected to hold an object with a top-level "RECORDS" list,
        each record having '_id.$oid', 'strContent' and 'strTitle' keys
        (mongoexport-style dump).

        :param path1: path to the JSON source file
        """
        # 'r' instead of 'r+': the file is only read, never written.
        with open(path1, 'r', encoding="utf8") as fr:
            records = json.load(fr)["RECORDS"]
        actions = []
        bulk_num = 2000  # flush to ES every 2000 documents
        for i, line in enumerate(records, start=1):
            actions.append({
                "_index": self.index_name,
                "_type": self.index_type,
                "_id": i,  # sequential id; omit to let ES auto-generate
                "_source": {
                    "id": line['_id']['$oid'],
                    "substance": line['strContent'],
                    "title": line['strTitle']}
            })
            # Flush a full batch.
            if len(actions) == bulk_num:
                print('插入', i / bulk_num, '批数据')
                print(len(actions))
                success, _ = bulk(self.es, actions, index=self.index_name, raise_on_error=True)
                actions.clear()
                print(success)
        # Flush the final partial batch, if any.
        if actions:
            success, _ = bulk(self.es, actions, index=self.index_name, raise_on_error=True)
            print('Performed %d actions' % success)

    def Delete_Index(self, index_name):
        """Drop an entire index; 400/404 are ignored so a missing index is not an error."""
        result = self.es.indices.delete(index=index_name, ignore=[400, 404])
        print(result)

    def Delete_Index_Data(self, id):
        """Delete a single document from the configured index by its ES ``_id``."""
        res = self.es.delete(index=self.index_name, doc_type=self.index_type, id=id)
        print(res)

    def Get_Data_Id(self, id):
        """Fetch one document by ``_id`` and print its source fields."""
        res = self.es.get(index=self.index_name, doc_type=self.index_type, id=id)
        print(res['_source'])
        print('------------------------------------------------------------------')
        print(res['_source']['id'], res['_source']['substance'], res['_source']['title'])

    def Update_Index_Data(self, id, data):
        """Update one document by ``_id`` with the given partial body.

        FIX: previously this hard-coded index='news', doc_type='politics',
        ignoring the index/type this object was constructed with; it now uses
        the instance configuration like every other method.

        :param id: Elasticsearch ``_id`` of the document to update
        :param data: update request body (e.g. {"doc": {...}})
        """
        result = self.es.update(index=self.index_name, doc_type=self.index_type, body=data, id=id)
        print(result)

    def seacrh_Data(self, index_name, type_name):
        """Run a match-all search against the given index/type and print it.

        NOTE: method name typo ('seacrh') kept for backward compatibility.
        """
        result = self.es.search(index=index_name, doc_type=type_name)
        print(result)

    def Get_Data_By_Body(self, search_info):
        """Full-text ``match`` query on the 'substance' field; prints the result as JSON.

        :param search_info: query text (analyzed with the field's ik analyzer)
        """
        doc = {
            "query": {
                "match": {
                    "substance": search_info
                }
            }
        }
        result = self.es.search(index=self.index_name, doc_type=self.index_type, body=doc)
        print(json.dumps(result, indent=2, ensure_ascii=False))

    def Find_Data_BY_key(self):
        """Recreate the 'news' index and apply an ik_max_word mapping to its 'title' field.

        NOTE(review): unlike the other methods this deliberately targets the
        hard-coded 'news'/'politics' index — it DELETES and recreates that
        index. Confirm whether it should use self.index_name instead.
        """
        mapping = {
            'properties': {
                'title': {
                    'type': 'text',
                    'analyzer': 'ik_max_word',
                    'search_analyzer': 'ik_max_word'
                }
            }
        }
        self.es.indices.delete(index='news', ignore=[400, 404])
        self.es.indices.create(index='news', ignore=400)
        result = self.es.indices.put_mapping(index='news', doc_type='politics', body=mapping)
        print(result)
if __name__ == '__main__':
    obj = ElasticObj("test1", "en", ip="127.0.0.1")
    # Usage examples (uncomment as needed):
    # obj.create_index()
    # path = r"C:\Users\Administrator\PycharmProjects\python_learn\elasterSerach\dedu_suncn_news.json"
    # obj.insert_data(path)
    # obj.Delete_Index_Data(7)
    # obj.Get_Data_Id(1)
    query = "香港"
    t0 = time.time()
    obj.Get_Data_By_Body(query)
    t1 = time.time()
    print(t1 - t0)  # elapsed search time in seconds