Elasticsearch入门级使用python版

最新推荐文章于 2024-07-20 15:27:02 发布

AllureLove

最新推荐文章于 2024-07-20 15:27:02 发布

阅读量376

点赞数

文章标签： elasticsearch python

本文链接：https://blog.csdn.net/weixin_36488653/article/details/118569523

版权

本文详细介绍了如何在Windows和Mac上安装Elasticsearch，然后使用Python进行索引创建、数据插入和检索。通过示例代码展示了如何批量插入词汇数据，并进行关键词搜索。此外，还提到了从线上表中提取数据的方法。

摘要由CSDN通过智能技术生成

1. Elasticsearch简介

Elasticsearch是一个基于Lucene的搜索服务器。它提供了一个分布式、支持多用户的全文搜索引擎，基于RESTful web接口。Elasticsearch是用Java语言开发的，并作为Apache许可条款下的开放源码发布，是一种流行的企业级搜索引擎。Elasticsearch用于云计算中，能够达到实时搜索，稳定，可靠，快速，安装使用方便。官方客户端在Java、.NET（C#）、PHP、Python、Apache Groovy、Ruby和许多其他语言中都是可用的。
Elasticsearch权威指南：
Elasticsearch权威指南

2. 安装ES

ES安装教程windows版
Mac版安装
首先官网下载：
TODO

3. python使用

首先安装依赖：

$ pip3 install elasticsearch

3.1 创建索引

from elasticsearch import Elasticsearch

# Module-level client used by the functions in this file.
# NOTE(review): no hosts are passed, so the client library's default
# connection settings apply — confirm they match the target cluster.
es = Elasticsearch()
def deleteIndicies(my_index):
	"""Delete the index named ``my_index`` if it already exists.

	Args:
		my_index: Name of the Elasticsearch index to remove.
	"""
	# Index management lives under the client's ``indices`` namespace.
	# ``index`` is passed by keyword so the call does not depend on
	# positional-argument support in the client.
	if es.indices.exists(index=my_index):
		print("删除之前存在的")
		es.indices.delete(index=my_index)

def create_index(my_index, my_doc):
	"""Create the index ``my_index`` with the word mapping.

	The mapping declares an integer ``my_id`` field and a text ``my_word``
	field analysed with ``ik_smart`` at both index and search time.

	Args:
		my_index: Name of the index to create.
		my_doc: Document type name. Accepted for call compatibility; the
			mapping below is typeless, so it is not used here.
	"""
	# NOTE(review): ``ik_smart`` is not a built-in analyzer; presumably the
	# IK analysis plugin is installed on the cluster — confirm.
	settings = {
		"mappings": {
			"properties": {
				"my_id": {"type": "integer"},
				"my_word": {
					"type": "text",
					"analyzer": "ik_smart",
					"search_analyzer": "ik_smart",
				},
			}
		}
	}
	# ignore=400 keeps an HTTP 400 response (e.g. the index already exists)
	# from raising an exception.
	es.indices.create(index=my_index, ignore=400, body=settings)
	print("创建新索引成功")

def mainCreateIndex():
	"""Drop ``word2vec_index`` if present, then create it again."""
	index_name, doc_type = "word2vec_index", "my_doc"
	# Remove any existing copy first, then create the index.
	deleteIndicies(index_name)
	create_index(index_name, doc_type)

3.2 插入数据

以bulk形式批量插入数据

$ pip3 install tqdm

from elasticsearch import helpers
from tqdm import tqdm

def getAllWords(path="vocab.txt"):
	"""Read a vocabulary file and return its lines as a list of words.

	Args:
		path: Path of a UTF-8 text file containing one word per line.

	Returns:
		A list with one entry per line of the file, each stripped of
		leading and trailing whitespace (blank lines become ``""``).
	"""
	with open(path, "r", encoding="utf-8") as f:
		# Iterate the file object directly; there is no need to load every
		# line with readlines() first or to track a line index.
		return [line.strip() for line in f]

def insertData(words, my_index, my_doc, one_bulk):
	"""Bulk-index ``words`` into ``my_index`` in batches of ``one_bulk``.

	Each word becomes one document whose ``my_id`` is the word's position
	in ``words`` and whose ``my_word`` is the word itself.

	Args:
		words: Sequence of words to index.
		my_index: Name of the target index.
		my_doc: Document type name, sent as ``_type`` on every action.
		one_bulk: Maximum number of documents sent per bulk request.
	"""
	# NOTE(review): mapping types are deprecated in Elasticsearch 7 and
	# removed in 8; drop ``_type`` if the cluster is on one of those
	# versions — confirm against the deployed server.
	print("共需要插入%d条数据" % len(words))
	pbar = tqdm(total=len(words))

	body = []
	try:
		# enumerate supplies the document id; ``word_id`` avoids shadowing
		# the builtin ``id``.
		for word_id, word in enumerate(words):
			body.append({
				"_index": my_index,
				"_type": my_doc,
				"_source": {"my_id": word_id, "my_word": word},
			})
			# Flush as soon as the batch is full so that no request carries
			# more than ``one_bulk`` documents.
			if len(body) >= one_bulk:
				helpers.bulk(es, body)
				pbar.update(len(body))
				body = []

		# Send whatever is left over after the last full batch.
		if body:
			helpers.bulk(es, body)
			pbar.update(len(body))
			print("done2")
	finally:
		# Close the progress bar even when a bulk request raises.
		pbar.close()
	print("插入数据完成")

def mainInsert():
	"""Load ``vocab.txt`` and index its words into ``word2vec_index``."""
	vocabulary = getAllWords(path="vocab.txt")
	# one_bulk=5000 is the batch size handed to insertData.
	insertData(vocabulary, "word2vec_index", "my_doc", one_bulk=5000)

3.3 检索数据

def keywordSearch(keyword1, my_index, my_doc):
	"""Search ``my_index`` for documents whose ``my_word`` matches a keyword.

	Runs the same ``match`` query twice: once through ``es.search`` to
	print the reported hit total, and once through ``helpers.scan`` to
	collect and print every matching ``(my_id, my_word)`` pair.

	Args:
		keyword1: Text to match against the ``my_word`` field.
		my_index: Name of the index to search.
		my_doc: Document type name; accepted for call compatibility and not
			used by the query.
	"""
	my_search1 = {
		"query": {
			"match": {
				"my_word": keyword1
			}
		}
	}

	# Direct query: only the reported total is used from this response.
	res = es.search(index=my_index, body=my_search1)
	total = res["hits"]["total"]
	# Elasticsearch 7+ reports the total as {"value": N, "relation": ...},
	# while older servers return a bare integer; accept both.
	if isinstance(total, dict):
		total = total["value"]
	print("共查询到%d条数据" % total)

	# Scroll query: helpers.scan yields every hit rather than one page.
	es_result = list(helpers.scan(
		client=es,
		query=my_search1,
		scroll="10m",
		index=my_index,
		timeout="10m",
	))
	search_res = [
		(hit["_source"]["my_id"], hit["_source"]["my_word"])
		for hit in es_result
	]
	print("共查询到%d条数据" % len(es_result))
	print(search_res)

def mainSearch():
	"""Run a sample keyword search against ``word2vec_index``."""
	index_name, doc_type = "word2vec_index", "my_doc"
	# Fixed sample keyword for the search.
	sample_keyword = "氨基酸"
	keywordSearch(sample_keyword, index_name, doc_type)

3.4 线上表数据提取

import pandas as pd

# Script: pull documents from a remote index into a pandas DataFrame and
# save the selected fields to an Excel file.

# Print wide frames on one line and show up to 1000 rows.
pd.set_option("expand_frame_repr", False)
# The option key is "display.max_rows"; an underscore-joined name is not a
# registered pandas option.
pd.set_option("display.max_rows", 1000)

# Create a client for the remote cluster. The class must be instantiated
# here; the existing ``es`` object is a client instance and is not callable.
ES = Elasticsearch(hosts="远程表地址")
# Count the documents in the index; the count API returns the number under
# the "count" key.
total_count = ES.count(index="表名")["count"]
print("数据总量：%d" % total_count)

# Search request body.
query_json = {
	"query": {
		# More query clauses can be added here to narrow the results.
		"match_all": {},
	},
	# "size" is a top-level search parameter (number of hits to return),
	# not part of the query clause.
	"size": 20,
}

data = ES.search(index="表名", body=query_json)
print(data)

# List of hits, each one a dict.
data = data["hits"]["hits"]
print(data)

# One row per hit, one column per top-level hit key.
df = pd.DataFrame(data)
print(df)

# Expand each hit's "_source" dict into its own columns.
df = pd.DataFrame(list(df["_source"]))
print(df)

# Keep only the fields of interest.
df = df[["字段1", "字段2", ]]
print(df)

df.to_excel("data.xlsx", index=False)
print(df.info())