#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
/**************************************************************
* Copyright (c) 2021 Baidu.com, Inc. All Rights Reserved
* @author:
* @email:
* @dept: KG
* @file: index_data.py
* @time: 2021/04/01
**************************************************************/
创建索引数据
"""
import os
import sys
import json
import requests
import random
from elasticsearch import Elasticsearch
from elasticsearch import helpers
# Python 2 only: force the process default encoding to UTF-8 so implicit
# str<->unicode conversions of the (non-ASCII) document data do not raise.
# reload()/sys.setdefaultencoding() do not exist on Python 3, where they
# are also unnecessary -- guard so the module at least imports there.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 -- builtin on Python 2 only
    sys.setdefaultencoding("utf-8")

# Elasticsearch connection settings (local single-node cluster).
ip = "127.0.0.1"
port = 9200
es = Elasticsearch(hosts=ip, port=port, timeout=20)
def index_data_bulk(data_file):
    """Bulk-index a file of newline-delimited JSON documents.

    The target index name is derived from the file name: the first two
    underscore-separated parts of the basename, extension stripped
    (e.g. "person_base_20210401.json" -> index "person_base").
    Documents are sent to Elasticsearch in batches of 1000.

    :param data_file: path to a file containing one JSON object per
                      line; each object must carry an "@id" field,
                      which is used as the document _id.
    """
    batch_size = 1000
    index = "_".join(os.path.basename(data_file).split(".")[0].split("_")[0:2])
    actions = []
    with open(data_file, "r") as lines:
        for line in lines:
            doc = json.loads(line)
            actions.append({
                "_op_type": "index",  # one of: index / update / create / delete
                "_index": index,
                "_type": "_doc",  # mapping type; fixed to _doc (ES >= 6)
                "_id": doc["@id"],
                "_source": doc,
            })
            # Flush a full batch as soon as it is complete (the original
            # counter-based logic only flushed on the 1001st document,
            # holding the batch one iteration longer than necessary).
            if len(actions) >= batch_size:
                index_bulk(actions)
                actions = []
    # Flush the final partial batch, if any.
    if actions:
        index_bulk(actions)
def index_bulk(actions):
    """Send one batch of bulk actions to the module-level ES client.

    :param actions: list of bulk action dicts (as built by the caller).
    """
    # helpers.bulk drives the whole batch in one call and raises on errors.
    # Alternatives: helpers.streaming_bulk / helpers.parallel_bulk are lazy
    # generators (they only run when iterated), support a per-chunk size,
    # and parallel_bulk additionally takes a thread count, e.g.:
    # for ok, response in helpers.streaming_bulk(es, actions):
    #     if not ok:
    #         print(response)
    helpers.bulk(es, actions)
def main():
    """Entry point: bulk-index the file named by the first CLI argument."""
    index_data_bulk(sys.argv[1])
if __name__ == '__main__':
    main()

# NOTE(review): the lines below were pasted as bare text after the entry
# guard, which made the whole file a SyntaxError; preserved here as comments.
# ip = "127.0.0.1"  # IP of the elasticsearch-master service (svc)
# port = 9200
# To restore, run:
#   ls | awk '{print $1}' | xargs -i python ../restore_indices.py {}