import pandas as pd
from elasticsearch.helpers import bulk
from elasticsearch import Elasticsearch
# Script: (re)create an Elasticsearch index with a lowercase keyword mapping
# and bulk-load the "cid" / "mol_name" columns of a tab-separated file into it.

# NOTE(review): "ip:host" looks like a placeholder address -- replace it with
# the real node address before running.
es = Elasticsearch("ip:host")

# Index settings and mapping.
# - Zero replicas so that the cluster health stays green (per the original
#   author's note; presumably a single-node cluster -- confirm).
# - The "my_lowercase" normalizer lowercases every keyword value.
mapping = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
        "analysis": {
            "normalizer": {
                "my_lowercase": {
                    "type": "custom",
                    "filter": [
                        "lowercase",
                    ],
                },
            },
        },
    },
    "mappings": {
        "doc": {
            "properties": {
                "cid": {
                    "type": "keyword",
                    "normalizer": "my_lowercase",
                },
                "mol_name": {
                    "type": "keyword",
                    "normalizer": "my_lowercase",
                },
            },
        },
    },
}

# Index names must be lowercase.
index_name = "0508_compound_names"

# Drop the index if it already exists so that it is rebuilt from scratch.
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name, ignore=[400, 404])

# Create the index with the mapping above. This runs unconditionally so the
# index is also created on the very first run.
es.indices.create(index=index_name, ignore=400, body=mapping)

# Load the tab-separated source file; the first line is the header row.
data = pd.read_csv("compound_properties.txt", sep="\t", index_col=None, header=0)

# Bulk-action metadata columns. Per the original author's note, inserting
# fails when "_index" or "_type" is missing.
data["_index"] = index_name
data["_type"] = "doc"

# Sequential document ids: 1 .. number of rows.
data["_id"] = range(1, len(data) + 1)

# Replace missing values with the literal string "None".
data = data.fillna("None")

# Keep only the columns that are sent to Elasticsearch.
data = data[["cid", "mol_name", "_index", "_type", "_id"]]

# Save a copy of exactly what will be indexed, for manual inspection.
# index=False keeps the DataFrame's row index out of the file.
data.to_csv("compound_names_index.txt", sep="\t", index=False)

# One dict per row, as passed to the bulk helper below.
dict_in_list_object = data.to_dict("records")

# Send all documents to Elasticsearch.
bulk(es, dict_in_list_object)
print("ok!")
示例数据为下表格式,文件中各列应以 Tab(制表符)分隔:
id | cid | status | mol_name |
---|---|---|---|
1 | CMNPD1 | synonym | 1,2,4-Trithiolane |
2 | CMNPD1 | synonym | 1,2,4-Trithiolan |
3 | CMNPD1 | synonym | 1,3,5-Trithiolan |
4 | CMNPD1 | synonym | 1.2.4-Trithiolan |
5 | CMNPD1 | synonym | 1.3.5-Trithiolan |
6 | CMNPD1 | synonym | [1,2,4]trithiolane |
7 | CMNPD2 | synonym | 1,2,4-Trithiolane 4-oxide |
8 | CMNPD2 | synonym | 1,2,4-Trithiolan-4-oxid |
9 | CMNPD2 | synonym | 1,2,4-trithiolane 4-S-oxide |
10 | CMNPD2 | synonym | 1.2.4-Trithiolan-4-oxid |