1. Elasticsearch简介
Elasticsearch是一个基于Lucene的搜索服务器。它提供了一个分布式多用户能力的全文搜索引擎,基于RESTful web接口。Elasticsearch是用Java语言开发的,并作为Apache许可条款下的开放源码发布,是一种流行的企业级搜索引擎。Elasticsearch用于云计算中,能够达到实时搜索,稳定,可靠,快速,安装使用方便。官方客户端在Java、.NET(C#)、PHP、Python、Apache Groovy、Ruby和许多其他语言中都是可用的。
Elasticsearch权威指南:
《Elasticsearch权威指南》中文版:https://www.elastic.co/guide/cn/elasticsearch/guide/current/index.html
2. 安装ES
ES安装教程windows版
Mac版安装
首先官网下载:
TODO
3. python使用
首先安装依赖:
$ pip3 install elasticsearch
3.1 创建索引
# Module-level client shared by the functions below. No connection arguments
# are passed here, so whatever defaults the installed client library applies
# are used.
from elasticsearch import Elasticsearch
es = Elasticsearch()
def deleteIndicies(my_index):
    """Delete the index ``my_index`` if it currently exists.

    Args:
        my_index: Name of the Elasticsearch index to remove.
    """
    # Index management lives under the client's ``indices`` attribute; the
    # original spelling ``indecies`` raised AttributeError.
    if es.indices.exists(index=my_index):
        print("删除之前存在的")
        es.indices.delete(index=my_index)
def create_index(my_index, my_doc):
    """Create the index ``my_index`` with the word-lookup mapping.

    The mapping declares an integer ``my_id`` field and a text ``my_word``
    field analyzed with ``ik_smart`` at both index and search time.

    Args:
        my_index: Name of the index to create.
        my_doc: Not used by this function; kept so existing callers that pass
            it keep working.
    """
    # NOTE(review): "ik_smart" is not a built-in analyzer -- presumably the IK
    # analysis plugin is installed on the cluster; confirm before running.
    settings = {
        "mappings": {
            "properties": {
                "my_id": {"type": "integer"},
                "my_word": {
                    "type": "text",
                    "analyzer": "ik_smart",
                    "search_analyzer": "ik_smart",
                },
            },
        },
    }
    # ignore=400 asks the client not to raise on an HTTP 400 response, so the
    # success message below is printed even when the create request was
    # rejected (for example because the index already exists).
    es.indices.create(index=my_index, ignore=400, body=settings)
    print("创建新索引成功")
def mainCreateIndex():
    """Drop any existing ``word2vec_index`` index and create it again."""
    index_name, doc_name = "word2vec_index", "my_doc"
    deleteIndicies(index_name)
    create_index(index_name, doc_name)
3.2 插入数据
以bulk形式批量插入数据
$ pip3 install tqdm
from tqdm import tqdm
from elasticsearch import helpers

# The client library's utility module is ``helpers`` (plural); importing the
# singular name fails. The singular name is kept as an alias so code written
# against the original import still resolves.
helper = helpers
def getAllWords(path="vocab.txt"):
    """Read a vocabulary file and return one stripped entry per line.

    Args:
        path: Path of a UTF-8 text file; each line is treated as one word.

    Returns:
        A list with one string per line of the file, in file order, with
        leading and trailing whitespace removed. Blank lines are kept as
        empty strings so positions still correspond to line numbers.
    """
    with open(path, "r", encoding="utf-8") as f:
        # Iterating the file yields the same lines as readlines() without
        # building an intermediate list first.
        return [line.strip() for line in f]
def insertData(words, my_index, my_doc, one_bulk):
    """Index ``words`` into ``my_index`` using bulk requests.

    Each word becomes one document whose source is
    ``{"my_id": <position in words>, "my_word": <word>}``.

    Args:
        words: Sequence of strings to index.
        my_index: Name of the target index.
        my_doc: Value sent as the ``_type`` of every document. Pass ``None``
            (or an empty string) to omit ``_type`` from the actions.
        one_bulk: Maximum number of documents sent per bulk request; values
            below 1 are treated as 1.
    """
    # Imported here so the function works regardless of how the module-level
    # import of the helper utilities is spelled.
    from elasticsearch import helpers

    batch_size = max(1, one_bulk)
    print("共需要插入%d条数据" % len(words))

    body = []
    with tqdm(total=len(words)) as pbar:
        # ``words`` holds plain strings, so the document id is taken from the
        # word's position in the list.
        for word_id, word in enumerate(words):
            action = {
                "_index": my_index,
                "_source": {"my_id": word_id, "my_word": word},
            }
            # NOTE(review): mapping types are deprecated in newer
            # Elasticsearch releases -- confirm the cluster version accepts
            # ``_type`` before passing a value for my_doc.
            if my_doc:
                action["_type"] = my_doc
            body.append(action)

            # Flush as soon as the batch is full so every request carries at
            # most batch_size documents.
            if len(body) >= batch_size:
                helpers.bulk(es, body)
                pbar.update(len(body))
                body = []

        # Send whatever is left over from the last, partially filled batch.
        if body:
            helpers.bulk(es, body)
            pbar.update(len(body))
            print("done2")
    print("插入数据完成")
def mainInsert():
    """Load ``vocab.txt`` and index its words into ``word2vec_index``."""
    vocabulary = getAllWords(path="vocab.txt")
    # Documents are sent in bulk requests of 5000.
    insertData(vocabulary, "word2vec_index", "my_doc", one_bulk=5000)
3.3 检索数据
def keywordSearch(keyword1, my_index, my_doc):
    """Run a ``match`` query on ``my_word`` and print what it finds.

    The query is executed twice: once with a plain search to report the hit
    total, and once with the scan helper to collect every matching document.

    Args:
        keyword1: Text to match against the ``my_word`` field.
        my_index: Name of the index to search.
        my_doc: Not used by this function; kept so existing callers that pass
            it keep working.

    Returns:
        A list of ``(my_id, my_word)`` tuples, one per document returned by
        the scan.
    """
    # Imported here because the file does not import ``helpers`` under that
    # name at module level.
    from elasticsearch import helpers

    my_search1 = {"query": {"match": {"my_word": keyword1}}}

    # Direct search: only used to report the total number of hits.
    res = es.search(index=my_index, body=my_search1)
    total = res["hits"]["total"]
    # Newer Elasticsearch versions return an object such as
    # {"value": N, "relation": "eq"} here, older ones a plain integer.
    if isinstance(total, dict):
        total = total["value"]
    print("共查询到%d条数据" % total)

    # Scan helper: iterates over all matching documents via the scroll API.
    es_result = list(
        helpers.scan(
            client=es,
            query=my_search1,
            scroll="10m",
            index=my_index,
            timeout="10m",
        )
    )
    search_res = [
        (hit["_source"]["my_id"], hit["_source"]["my_word"])
        for hit in es_result
    ]
    print("共查询到%d条数据" % len(es_result))
    print(search_res)
    return search_res
def mainSearch():
    """Search ``word2vec_index`` for the sample keyword and print the hits."""
    index_name, doc_name = "word2vec_index", "my_doc"
    keywordSearch("氨基酸", index_name, doc_name)
3.4 线上表数据提取
import pandas as pd
from elasticsearch import Elasticsearch

# Print wide frames without wrapping and show up to 1000 rows. The option
# keys are written in full; "display_max_rows" is not a registered key.
pd.set_option("display.expand_frame_repr", False)
pd.set_option("display.max_rows", 1000)

# Create the client for the remote cluster. The client class is what gets
# instantiated; an existing client instance is not callable.
ES = Elasticsearch(hosts="远程表地址")

# Number of documents in the index: the count API reports it under "count".
total_count = ES.count(index="表名")["count"]
print("数据总量:%d" % total_count)

# Request body: further conditions go inside "query"; "size" (the number of
# hits to return) is a top-level key next to "query", not part of it.
query_json = {
    "query": {
        "match_all": {},
    },
    "size": 20,
}
data = ES.search(index="表名", body=query_json)
print(data)

# List of hit dictionaries.
data = data["hits"]["hits"]
print(data)

# One row per hit, one column per hit-level key.
df = pd.DataFrame(data)
print(df)

# Expand the "_source" column so each document field becomes its own column.
df = pd.DataFrame(list(df["_source"]))
print(df)

# Keep only the fields of interest.
df = df[["字段1", "字段2"]]
print(df)

df.to_excel("data.xlsx", index=False)
# info() prints its summary itself and returns None, so it is not wrapped
# in print().
df.info()