之前用kclpy读取kinesis流数据,处理并保存到elasticsearch中,现在发现elk中的数据展示与当前时间差越来越大(源数据增加了三倍)。阅读kinesis文档进行相应分片、实例数扩展,均没有明显效果。
重新优化了代码:改用 elasticsearch.helpers.bulk 批量写入,替代逐条索引,存放速率明显提高。
相关示例代码:
from datetime import datetime
import time

import pytz
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

# Shared client for the whole script; constructing it does not open a
# connection, so it is safe at module level.
es = Elasticsearch(hosts=[{'host': "ip", 'port': "9200"}],
                   http_auth=("username", "password"))

# Hoisted: pytz.timezone() is relatively expensive and was previously
# re-created for every converted field of every document.
_LOCAL_TZ = pytz.timezone('Asia/Shanghai')


def _to_local_dt(epoch_seconds):
    """Convert a Unix timestamp (seconds) to an Asia/Shanghai-aware datetime.

    Fractional seconds are truncated (int()), matching the original code.
    """
    return datetime.fromtimestamp(int(epoch_seconds), _LOCAL_TZ)


def index_bulk(index_name="kinesis-2018.11.28", doc_count=500):
    """Generate `doc_count` synthetic kinesis-style documents and index
    them into Elasticsearch with a single helpers.bulk() call.

    Args:
        index_name: Target index for ALL documents. The original code set
            a different index inside each action ("kinesis-2018.07.19")
            than in the bulk() `index` kwarg ("kinesis-2018.11.28"); the
            per-action value silently won. One name is now used everywhere.
        doc_count: Number of documents to build and send.

    Raises:
        elasticsearch.helpers.BulkIndexError: if any document fails
            (raise_on_error=True).
    """
    actions = []
    for i in range(doc_count):
        now = time.time()
        # Synthetic record shaped like the real kinesis payload.
        # NOTE(review): "tags": {i} is a one-element set, later
        # stringified to e.g. "{0}" — looks accidental, but preserved
        # because downstream dashboards may match on that format.
        raw = {
            "priority": 0,
            "tags": {i},
            "threshold": 0,
            "kinesis": True,
            "env": "test",
            "region": "cn",
            "metric": "/var/log/sengled/bulk.log",
            "dataSource": "bulk",
            "service": "bulk",
            "status": "",
            "endpoint": "test-cn-inception-10.12.112.165",
            "starttime": now,
            "product": "bulk",
            "step": 0,
            "value": "bulk",
            "ip": "10.12.112.165",
            "objectType": "dev",
            "endtime": now,
            "timestamp": now,
            "counterType": "",
        }

        doc = raw.copy()
        doc['@timestamp'] = _to_local_dt(raw['timestamp'])
        # A starttime/endtime of 0 means "unset": fall back to the record
        # timestamp (0 is falsy, so `or` reproduces the original if/else).
        doc['starttime'] = _to_local_dt(raw['starttime'] or raw['timestamp'])
        doc['endtime'] = _to_local_dt(raw['endtime'] or raw['timestamp'])
        # ES mappings expect these as strings regardless of source type.
        doc['value'] = str(raw['value'])
        doc['threshold'] = str(raw['threshold'])
        doc['tags'] = str(raw['tags'])
        del doc['timestamp']  # replaced by the @timestamp field above

        actions.append({
            "_index": index_name,
            "_type": "kinesisdata",
            "_source": doc,
        })

    # One round-trip for the whole batch — this is the throughput win over
    # per-document index() calls.
    bulk(es, actions, index=index_name, raise_on_error=True)
    # len(actions), not the last loop index: the old `count = i` ended at
    # doc_count - 1 and under-reported by one.
    print("insert %s lines" % len(actions))


if __name__ == "__main__":
    index_bulk()