Elasticsearch 创建索引并导入数据

import pandas as pd
from elasticsearch.helpers import bulk
from elasticsearch import Elasticsearch

# Client object used by every Elasticsearch request in this script.
# NOTE(review): "ip:host" looks like a placeholder for the node address
# (presumably "host:port") -- replace it with a real address before running.
es = Elasticsearch("ip:host")

# Index definition passed as the request body when the index is created.
# - One shard and zero replicas: per the original author's note, this is what
#   keeps the cluster health value green.
# - "my_lowercase" is a custom normalizer made of the "lowercase" filter; both
#   keyword fields use it so their values are normalized to lowercase.
mapping = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
        "analysis": {
            "normalizer": {
                "my_lowercase": {"type": "custom", "filter": ["lowercase"]},
            },
        },
    },
    "mappings": {
        "doc": {
            "properties": {
                # Each field gets its own (identical) keyword definition.
                field: {"type": "keyword", "normalizer": "my_lowercase"}
                for field in ("cid", "mol_name")
            },
        },
    },
}


# Name of the target index; Elasticsearch index names must be lowercase.
index_name = "0508_compound_names"

# Start from a clean slate: if an index with this name already exists, drop it
# so that it is recreated with the settings and mappings in `mapping`.
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name, ignore=[400, 404])

# Create the index from `mapping`.
# Deliberately no `ignore=400` here: the index was removed just above, so a
# 400 response can only mean the name, settings or mappings were rejected.
# Swallowing it would let the script continue and load the data into an
# index that lacks the lowercase normalizer; let the client raise instead.
es.indices.create(index=index_name, body=mapping)

# Read the tab-separated source table; the first row supplies the column names.
data = pd.read_csv("compound_properties.txt", sep="\t", index_col=None, header=0)

# Attach the per-document metadata columns consumed by the bulk helper.
# Per the original author's notes, inserting fails when "_index" or "_type"
# is missing. "_id" is a running number starting at 1, one per row.
data = data.assign(
    **{
        "_index": index_name,
        "_type": "doc",
        "_id": range(1, len(data) + 1),
    }
)

# Replace missing values with the literal string "None", then keep only the
# two payload columns plus the metadata columns, in this order.
data = data.fillna("None")[["cid", "mol_name", "_index", "_type", "_id"]]

# Write the exact table that is about to be indexed, for manual inspection.
data.to_csv("compound_names_index.txt", sep="\t", index=False)

# One dict per row -- the shape handed to the bulk helper.
dict_in_list_object = data.to_dict(orient="records")

# Send all rows to Elasticsearch.
bulk(es, dict_in_list_object)

print("ok!")

示例数据格式如下表所示,文件中各列应以 Tab 键分隔

id	cid	status	mol_name
1	CMNPD1	synonym	1,2,4-Trithiolane
2	CMNPD1	synonym	1,2,4-Trithiolan
3	CMNPD1	synonym	1,3,5-Trithiolan
4	CMNPD1	synonym	1.2.4-Trithiolan
5	CMNPD1	synonym	1.3.5-Trithiolan
6	CMNPD1	synonym	[1,2,4]trithiolane
7	CMNPD2	synonym	1,2,4-Trithiolane 4-oxide
8	CMNPD2	synonym	1,2,4-Trithiolan-4-oxid
9	CMNPD2	synonym	1,2,4-trithiolane 4-S-oxide
10	CMNPD2	synonym	1.2.4-Trithiolan-4-oxid
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值