from elasticsearch import Elasticsearch
from elasticsearch.helpers import reindex
from datetime import datetime
import json
import hashlib
# Elasticsearch-hadoop configuration template:
# ES_CONF = {
#     "es.nodes": "XXX.XX.XX.XX",
#     "es.port": "XXXX",
#     "es.net.http.auth.user": "elastic",
#     "es.net.http.auth.pass": "changeme",
#     "es.nodes.wan.only": "true",
#     "es.input.json": "XX",
#     "es.mapping.id": "XXX"
# }
# The constructor takes the ES index alias, the index type name, and the ES config.
class ElasticsearchClient:
    """Write pyspark DataFrames into Elasticsearch with zero-downtime daily updates.

    Data is written into a dated physical index (``<alias>_YYYYMMDD``); the
    alias is then atomically repointed from the previous day's index to the
    new one, so readers querying the alias never see an empty index.
    """

    def __init__(self, alias, type_name, es_conf):
        """
        :param alias: index alias that readers query.
        :param type_name: document type name (pre-7.x mapping type).
        :param es_conf: elasticsearch-hadoop connector settings dict
            (see the commented ES_CONF template at the top of this module).
            Mutated in place: ``es.resource`` is set to today's index/type.
        """
        self.es = Elasticsearch([es_conf['es.nodes']], port=es_conf['es.port'])
        self.alias = alias
        # Physical index name: alias suffixed with today's date.
        self.new_index = self.alias + "_" + datetime.now().strftime('%Y%m%d')
        es_conf["es.resource"] = self.new_index + '/' + type_name
        self.es_conf = es_conf

    def query_exists_index(self):
        """Find the physical index currently behind the alias.

        Sets ``self.exists_index`` to the first index bound to the alias,
        or ``None`` when the alias does not exist (or ES is unreachable —
        best-effort, matching the original behavior).
        """
        try:
            # BUG FIX: was `get_alias(alias)` with an undefined bare name,
            # which always raised NameError and left exists_index = None.
            exists_indexes = list(self.es.indices.get_alias(self.alias).keys())
            self.exists_index = exists_indexes[0]
        except Exception:
            # Typically NotFoundError when the alias does not exist yet.
            self.exists_index = None

    def write_to_es(self, df):
        """Write a pyspark DataFrame into today's index.

        Each row gets a deterministic SHA-224 hash of its JSON content as
        its document id, so re-running the job upserts rather than
        duplicating documents.

        :param df: pyspark DataFrame whose rows are written as documents.
        """
        rdd_mapped = df.rdd.map(lambda x: x.asDict())
        # ROBUSTNESS FIX: .get() instead of [] so a config without
        # 'es.mapping.id' falls back to 'unique_id' rather than raising KeyError.
        unique_id = self.es_conf.get('es.mapping.id') or 'unique_id'

        # Add a content-hash id to each row.
        def add_id(data, id_field):
            payload = json.dumps(data).encode('utf-8')
            data[id_field] = hashlib.sha224(payload).hexdigest()
            return (data[id_field], json.dumps(data))

        # Wrap in a lambda because the mapped function takes two arguments.
        final_rdd = rdd_mapped.map(lambda x: add_id(x, unique_id))
        self.query_exists_index()
        # If an index with today's name already exists, drop it and rewrite.
        if self.exists_index == self.new_index:
            self.es.indices.delete(index=self.exists_index, ignore=[400, 404])
        final_rdd.saveAsNewAPIHadoopFile(
            path='-',
            outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat",
            keyClass="org.apache.hadoop.io.NullWritable",
            valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
            conf=self.es_conf
        )

    def day_update(self):
        """Repoint the alias to today's index for a zero-downtime update."""
        self.query_exists_index()
        if self.exists_index and self.exists_index != self.new_index:
            # Add the new binding and remove the old one in a single atomic call.
            self.es.indices.update_aliases({
                'actions': [
                    {'add': {'index': self.new_index, 'alias': self.alias}},
                    {'remove': {'index': self.exists_index, 'alias': self.alias}}
                ],
            })
            # The old index is no longer referenced; delete it.
            self.es.indices.delete(index=self.exists_index, ignore=[400, 404])
        elif self.exists_index == self.new_index:
            # Alias already points at today's index; nothing to do.
            pass
        else:
            print("别名无对应索引")
            # Alias has no index yet: bind it to today's index. If the index
            # itself has no data, ES raises and we deliberately do not catch it.
            self.es.indices.update_aliases({
                'actions': [
                    {'add': {'index': self.new_index, 'alias': self.alias}},
                ],
            })
# pyspark writes data to Elasticsearch with zero-downtime daily updates
# (latest recommended article published 2021-12-12 21:32:16)