一、模块描述
aodh-evaluator评估完告警后,调用self.notifier.notify发送告警,告警通过oslo_messaging发给aodh-notifier,aodh-notifier再发送告警到目标业务地址
二、代码流程(数据流图)
2.1 入口
aodh-notifier先加载告警发送相关插件,然后通过oslo_messaging监听alarming topic;收到告警后,会根据告警的发送url匹配对应的告警发送插件,并调用该插件的notify方法。入口在aodh/notifier/__init__.py:AlarmNotifierService.__init__()
class AlarmNotifierService(cotyledon.Service):
    """Service that receives alarm notifications and hands them to plugins.

    On construction it loads every extension registered under
    ``NOTIFIER_EXTENSIONS_NAMESPACE`` and starts a batch notification
    listener on the topic named by ``conf.notifier_topic``.
    """

    NOTIFIER_EXTENSIONS_NAMESPACE = "aodh.notifier"

    def __init__(self, worker_id, conf):
        """Load the notifier plugins and start the listener.

        :param worker_id: worker identifier forwarded to the base service.
        :param conf: configuration object; ``notifier_topic``,
            ``notifier.batch_size`` and ``notifier.batch_timeout`` are read
            from it, and it is passed to every plugin constructor.
        """
        super(AlarmNotifierService, self).__init__(worker_id)
        self.conf = conf
        msg_transport = messaging.get_transport(self.conf)

        # Load the plugins that actually deliver alarms; each one is
        # instantiated on load with the configuration as its only argument.
        self.notifiers = extension.ExtensionManager(
            self.NOTIFIER_EXTENSIONS_NAMESPACE,
            invoke_on_load=True,
            invoke_args=(self.conf,),
        )

        # Listen on the configured topic (the original note says the
        # default is "alarming").
        topic_target = oslo_messaging.Target(topic=self.conf.notifier_topic)
        endpoints = [AlarmEndpoint(self.notifiers)]
        self.listener = messaging.get_batch_notification_listener(
            msg_transport,
            [topic_target],
            endpoints,
            False,
            self.conf.notifier.batch_size,
            self.conf.notifier.batch_timeout,
        )
        LOG.info('====ready to provide service====')
        self.listener.start()

    def terminate(self):
        """Stop the notification listener, then wait on it."""
        active_listener = self.listener
        active_listener.stop()
        active_listener.wait()
2.2 告警上报
告警目标发送url一般以hw-https开头,在aodh-notifier初始化时加载的插件中,对应的处理入口为aodh/hw_plugins/alarm/notifier/hwrest.py:HwRestAlarmNotifier.notify()。该方法最终会调用父类的notify(),代码如下:
"""Rest alarm notifier with trusted authentication."""
from six.moves.urllib import parse
from oslo_log import log
from aodh.hw_plugins.alarm.notifier import inner_cache
from aodh.hw_plugins.alarm.notifier import rest
LOG = log.getLogger(__name__)
class HwRestAlarmNotifier(rest.RestAlarmNotifier):
    """REST notifier that attaches a keystone token before delegating."""

    def notify(self, action, alarm_id, alarm_name, severity, previous, current,
               reason, reason_data):
        """Rewrite the action URL, add an auth header and call the parent.

        :param action: split URL of the alarm action; its ``scheme``,
            ``netloc``, ``path``, ``query`` and ``fragment`` are read.
        :param alarm_id: identifier of the alarm.
        :param alarm_name: name of the alarm.
        :param severity: severity of the alarm.
        :param previous: previous alarm state.
        :param current: current alarm state.
        :param reason: textual reason for the transition.
        :param reason_data: structured data accompanying the reason.
        :raises Exception: when the keystone client cannot be obtained
            (the error is logged and re-raised).
        """
        try:
            keystone = inner_cache.get_inner_keystone_client(self.conf)
            if keystone is None:
                raise Exception('client is None')
        except Exception as ex:
            LOG.error("hw cannot get keystone client: %s", ex)
            raise

        # Remove the fake user: keep only what follows the first "@".
        bare_netloc = action.netloc.split("@", 1)[-1]
        # Remove the trust prefix: the first three characters of the scheme.
        real_scheme = action.scheme[3:]
        plain_action = parse.SplitResult(
            scheme=real_scheme,
            netloc=bare_netloc,
            path=action.path,
            query=action.query,
            fragment=action.fragment,
        )

        auth_headers = {'X-Auth-Token': keystone.auth_token}
        super(HwRestAlarmNotifier, self).notify(
            plain_action, alarm_id, alarm_name, severity, previous, current,
            reason, reason_data, auth_headers)
2.2.1 RestAlarmNotifier.notify():
此处逻辑为拼接http请求头和请求体,然后通过eventlet绿色线程池(GreenPool)发送告警;绿色线程实际执行的是post_with_retry()(模块级的普通函数,不是实例方法)
class RestAlarmNotifier(notifier.AlarmNotifier):
    """Rest alarm notifier."""

    def __init__(self, conf):
        """Keep the configuration and create the green pool used for posts.

        :param conf: configuration object, also passed to the base class.
        """
        super(RestAlarmNotifier, self).__init__(conf)
        self.conf = conf
        self.green_pool = eventlet.GreenPool(GREEN_POOL_SIZE)

    def notify(self, action, alarm_id, alarm_name, severity, previous,
               current, reason, reason_data, headers=None):
        """Build the HTTP request for an alarm and hand it to the green pool.

        :param action: split URL of the target; ``scheme``, ``query`` and
            ``geturl()`` are used.
        :param alarm_id: identifier of the alarm.
        :param alarm_name: name of the alarm.
        :param severity: severity of the alarm.
        :param previous: previous alarm state.
        :param current: current alarm state.
        :param reason: textual reason for the transition.
        :param reason_data: structured data accompanying the reason.
        :param headers: optional dict of HTTP headers; when given it is
            updated in place with the request id and content type.
        """
        headers = headers or {}
        request_id = headers.get('x-openstack-request-id')
        if not request_id:
            # No usable request id supplied by the caller: generate one.
            request_id = context.generate_request_id()
            headers['x-openstack-request-id'] = request_id

        body = {
            'alarm_name': alarm_name,
            'alarm_id': alarm_id,
            'severity': severity,
            'previous': previous,
            'current': current,
            'reason': reason,
            'reason_data': reason_data,
        }
        LOG.info(f'Notifying alarm {alarm_name} {alarm_id} with severity {severity} from {previous} to {current}'
                 f' with action {action} because {reason}. request-id: {request_id} ')

        timeout = self.conf.alarm_send_timeout
        headers['content-type'] = 'application/json'
        kwargs = dict(data=jsonutils.dumps(body), headers=headers,
                      timeout=timeout)

        if action.scheme == 'https':
            # The URL query may override the configured verification
            # default through "aodh-alarm-ssl-verify"; the last value wins.
            fallback_verify = int(self.conf.rest_notifier_ssl_verify)
            query_options = urlparse.parse_qs(action.query)
            raw_verify = query_options.get('aodh-alarm-ssl-verify',
                                           [fallback_verify])[-1]
            if bool(int(raw_verify)):
                kwargs['verify'] = self.conf.service_credentials.cafile
            else:
                kwargs['verify'] = False

            cert = self.conf.rest_notifier_certificate_file
            key = self.conf.rest_notifier_certificate_key
            if cert:
                kwargs['cert'] = (cert, key) if key else cert

        # Poll until the pool reports a free slot, then schedule the post
        # and yield once.
        while not self.green_pool.free() > 0:
            LOG.debug("wait for green pool free")
            eventlet.sleep(0.1)
        self.green_pool.spawn_n(post_with_retry, alarm_id, action.geturl(),
                                self.conf, **kwargs)
        eventlet.sleep(0)
2.3 post_with_retry()
def post_with_retry(alarm_id, url, conf, **kwargs):
    """Post alarm information to URL.

    Every alarm first goes through a synchronized (locked) step that
    consults a per-URL cache of recent outcomes. Depending on that cache the
    alarm is abandoned, posted while holding the lock, or - when the URL
    succeeded very recently - posted again without the lock so that alarms
    can be delivered concurrently.

    A post is treated as successful when no exception is raised and the
    return code is not 5xx; in that case the retry flag of group alarms is
    cleared through ``_change_group_alarm_retry_field`` (skipped for a 400
    response).

    :param alarm_id: identifier of the alarm being posted.
    :param url: target URL of the alarm action.
    :param conf: configuration object, passed to ``_get_ceilometer_client``.
    :param kwargs: keyword arguments forwarded to ``_check_cert`` and
        ``requests.post`` (data, headers, timeout, verify, cert, ...).
    :returns: the boolean result of the unlocked post when the synchronized
        step reported ``'path_clear'``; otherwise ``None``.
    """
    # A URL that failed less than this many seconds ago is not retried.
    failure_window = 120
    # A URL that succeeded at most this many seconds ago is considered
    # healthy, so alarms may be posted without the lock.
    clear_window = 30

    @lockutils.synchronized('notifier_lock')
    def _post_in_sync(alarm_id, url, **kwargs):
        """Post one alarm while holding the notifier lock.

        Returns ``True`` when the alarm was delivered or abandoned,
        ``'path_clear'`` when the caller should post without the lock, and
        ``False`` when the post failed.
        """
        # post alarm in synchronized lock, ensure all green threads process
        # cache in order
        LOG.info("Post alarm %s in sync", alarm_id)
        cached_urls = inner_cache.get_cached_urls()
        now = datetime.datetime.utcnow()
        is_ok = None
        timestp = None
        entry = cached_urls.get(url)
        if entry:
            is_ok = entry.get("is_ok")
            timestp = entry.get("timestamp")
        LOG.debug("the cached urls is %s", cached_urls)

        # Age of the cached outcome in seconds. total_seconds() is used
        # because timedelta.seconds drops whole days, which would make an
        # entry older than a day look fresh again. An entry without a
        # timestamp is treated like a missing one.
        age = None
        if timestp is not None:
            age = (now - timestp).total_seconds()

        if age is not None and age < failure_window:
            # if this alarm action url has post failed in 120s, do not post
            # it. True is returned, so the next alarm is handled in the
            # synchronized path again.
            if not is_ok:
                LOG.info("post request of alarm %s has abandoned.", alarm_id)
                return True
            # consider network status is ok in 30s if one alarm post
            # success; 'path_clear' tells the caller to post without the
            # lock.
            if age <= clear_window:
                LOG.debug("alarm %s posting path is clear.", alarm_id)
                return 'path_clear'

        # Per the original note: _check_cert returns 'SSL Verify Over' for a
        # localdomain url, otherwise the response of the request it made,
        # or the exception object if one was raised.
        ssl_verify_result = _check_cert(url, **kwargs)
        resp = None
        if ssl_verify_result == "SSL Verify Over":
            # Certificate authentication reliable, start to post alarm.
            kwargs['verify'] = False
            try:
                resp = requests.post(url, **kwargs)
            except Exception as e:
                LOG.info("Failed to post alarm %s because %s", alarm_id, e)
        elif isinstance(ssl_verify_result, requests.models.Response):
            # verify certificate and send alarm request directly through.
            resp = ssl_verify_result
        else:
            LOG.info("Failed to post alarm %s because %s",
                     alarm_id, ssl_verify_result)

        # Compare against None explicitly: a Response is falsy for error
        # status codes, but it must still be processed.
        # _process_resp updates the alarm's retry_policy and caches the url
        # (per the original note).
        if resp is not None and _process_resp(resp, alarm_id, url,
                                              cached_urls):
            return True

        # Record the failure unless a still-recent entry already exists.
        if url not in cached_urls or age is None or age >= failure_window:
            cached_urls[url] = {"timestamp": datetime.datetime.utcnow(),
                                "is_ok": False}
        LOG.info("post alarm %s failed in sync.", alarm_id)
        return False

    def _post_in_unsync(alarm_id, url, **kwargs):
        """Post one alarm without the lock; return True on success."""
        # post alarms without lock, make it concurrence
        LOG.info("Post alarm %s in unsync", alarm_id)
        ssl_verify_result = _check_cert(url, **kwargs)
        if ssl_verify_result == "SSL Verify Over":
            # Certificate authentication reliable, start to post alarm.
            kwargs['verify'] = False
            try:
                resp = requests.post(url, **kwargs)
            except Exception as e:
                LOG.info("Failed to post alarm %s because %s", alarm_id, e)
                return False
        elif isinstance(ssl_verify_result, requests.models.Response):
            # verify certificate and send alarm request directly through.
            resp = ssl_verify_result
        else:
            LOG.info("Failed to post alarm %s because SSL verify failed: %s",
                     alarm_id, ssl_verify_result)
            return False

        LOG.info("Post alarm %s, return code %s, body %s",
                 alarm_id, resp.status_code, resp.content)
        if resp.status_code < 500 or resp.status_code >= 600:
            # Anything but a 5xx counts as delivered; the retry flag is
            # left untouched for a 400 response.
            if resp.status_code != 400:
                _change_group_alarm_retry_field(alarm_id)
            return True
        return False

    _get_ceilometer_client(conf)
    # synchronized post every alarm
    # If last alarm was post success, consider the network status is ok
    # in 30 seconds, then break the lock, alarms will post in concurrence,
    # use this logic to avoid the situation that
    # single post cost to much time
    # The result is compared by value: `is` on a string literal depends on
    # interning and is not a reliable equality test.
    if _post_in_sync(alarm_id, url, **kwargs) == 'path_clear':
        return _post_in_unsync(alarm_id, url, **kwargs)
    return None