aodh-notifier启动流程小结

一、模块描述

aodh-evaluator评估完告警后,调用self.notifier.notify发送告警;告警通过oslo_messaging发给aodh-notifier,再由aodh-notifier把告警发送到目标业务地址。

二、代码流程(数据流图)

2.1 入口

aodh-notifier先加载告警发送相关插件,然后通过oslo_messaging监听alarming topic;收到告警后,根据告警的发送url在已加载的插件中找到匹配的插件,并调用该插件的notify方法。入口在aodh/notifier/__init__.py:AlarmNotifierService.__init__()。

class AlarmNotifierService(cotyledon.Service):
    """Service that listens for alarm notifications and hands them to notifier plugins.

    On construction it loads every extension registered under
    ``NOTIFIER_EXTENSIONS_NAMESPACE`` and starts a batch notification
    listener on the configured notifier topic.
    """

    NOTIFIER_EXTENSIONS_NAMESPACE = "aodh.notifier"

    def __init__(self, worker_id, conf):
        """Load the notifier plugins and start listening.

        :param worker_id: worker identifier forwarded to the base service.
        :param conf: configuration object; ``notifier_topic``,
                     ``notifier.batch_size`` and ``notifier.batch_timeout``
                     are read from it.
        """
        super().__init__(worker_id)
        self.conf = conf
        transport = messaging.get_transport(self.conf)

        # Load the plugins that actually deliver alarms; each one is
        # instantiated at load time with the configuration object.
        self.notifiers = extension.ExtensionManager(
            self.NOTIFIER_EXTENSIONS_NAMESPACE,
            invoke_on_load=True,
            invoke_args=(self.conf,),
        )

        # Listen on the notifier topic (the original note says it defaults
        # to "alarming").
        topic_target = oslo_messaging.Target(topic=self.conf.notifier_topic)
        endpoints = [AlarmEndpoint(self.notifiers)]
        batch_opts = self.conf.notifier
        self.listener = messaging.get_batch_notification_listener(
            transport,
            [topic_target],
            endpoints,
            False,
            batch_opts.batch_size,
            batch_opts.batch_timeout,
        )
        LOG.info('====ready to provide service====')
        self.listener.start()

    def terminate(self):
        """Stop the listener and wait for it to finish."""
        listener = self.listener
        listener.stop()
        listener.wait()

2.2 告警上报

告警目标发送url一般以hw-https开头,在aodh-notifier初始化时加载的插件中,对应的处理入口是aodh/hw_plugins/alarm/notifier/hwrest.py:HwRestAlarmNotifier.notify()。该方法先获取keystone token、还原出真实的url,再调用父类的notify(),代码如下:

"""Rest alarm notifier with trusted authentication."""

from six.moves.urllib import parse
from oslo_log import log

from aodh.hw_plugins.alarm.notifier import inner_cache
from aodh.hw_plugins.alarm.notifier import rest

LOG = log.getLogger(__name__)

class HwRestAlarmNotifier(rest.RestAlarmNotifier):
    """REST alarm notifier that adds a keystone token to the request."""

    def notify(self, action, alarm_id, alarm_name, severity, previous, current,
               reason, reason_data):
        """Rewrite the action URL and delegate to the parent REST notifier.

        A keystone client is fetched through ``inner_cache``; its
        ``auth_token`` is sent as the ``X-Auth-Token`` header. The action's
        netloc is stripped of anything before the first ``@`` and the first
        three characters of the scheme are dropped before delegating.

        :param action: split URL of the alarm action (``scheme``,
                       ``netloc``, ``path``, ``query`` and ``fragment``
                       are read).
        :raises Exception: when the keystone client cannot be obtained or
                           is None; the error is logged and re-raised.
        """
        try:
            keystone = inner_cache.get_inner_keystone_client(self.conf)
            if keystone is None:
                raise Exception('client is None')
        except Exception as err:
            LOG.error("hw cannot get keystone client: %s", err)
            raise

        # Remove the fake user: keep only what follows the first "@".
        real_netloc = action.netloc.split("@", 1)[-1]
        # Drop the three-character scheme prefix (presumably "hw-", as in
        # "hw-https" -- confirm against the registered plugin names).
        real_scheme = action.scheme[3:]

        target = parse.SplitResult(
            real_scheme,
            real_netloc,
            action.path,
            action.query,
            action.fragment,
        )

        auth_headers = {'X-Auth-Token': keystone.auth_token}
        super(HwRestAlarmNotifier, self).notify(
            target, alarm_id, alarm_name, severity, previous, current,
            reason, reason_data, auth_headers)

2.2.1 RestAlarmNotifier.notify():

此处逻辑为拼接http请求头和请求体,然后从绿色线程池(eventlet GreenPool)中起一个绿色线程发送告警;该线程实际执行的是post_with_retry()(它不是实例方法,而是模块级的普通函数)。

class RestAlarmNotifier(notifier.AlarmNotifier):
    """Rest alarm notifier."""

    def __init__(self, conf):
        """Keep the configuration and create the green pool used for posting.

        :param conf: configuration object, also forwarded to the base class.
        """
        super().__init__(conf)
        self.conf = conf
        self.green_pool = eventlet.GreenPool(GREEN_POOL_SIZE)

    def notify(self, action, alarm_id, alarm_name, severity, previous, current, reason, reason_data, headers=None):
        """Build the HTTP request for an alarm and post it from the green pool.

        The JSON body and headers are assembled here; the actual post is
        done by ``post_with_retry`` running in a green thread.

        :param action: split URL of the alarm action (``scheme``, ``query``
                       and ``geturl()`` are used).
        :param headers: optional dict of extra request headers; a request
                        id and a JSON content type are added to it.
        """
        if not headers:
            headers = {}
        # Reuse the caller's request id when there is one, else make one.
        request_id = headers.get('x-openstack-request-id')
        if not request_id:
            request_id = context.generate_request_id()
            headers['x-openstack-request-id'] = request_id

        body = {
            'alarm_name': alarm_name,
            'alarm_id': alarm_id,
            'severity': severity,
            'previous': previous,
            'current': current,
            'reason': reason,
            'reason_data': reason_data,
        }
        LOG.info(f'Notifying alarm {alarm_name} {alarm_id} with severity {severity} from {previous} to {current}'
                 f' with action {action} because {reason}. request-id: {request_id} ')
        timeout = self.conf.alarm_send_timeout
        headers['content-type'] = 'application/json'
        kwargs = {
            'data': jsonutils.dumps(body),
            'headers': headers,
            'timeout': timeout,
        }

        if action.scheme == 'https':
            # The configured default can be overridden per URL through the
            # "aodh-alarm-ssl-verify" query parameter (last value wins).
            default_verify = int(self.conf.rest_notifier_ssl_verify)
            query_options = urlparse.parse_qs(action.query)
            verify_values = query_options.get('aodh-alarm-ssl-verify', [default_verify])
            if bool(int(verify_values[-1])):
                kwargs['verify'] = self.conf.service_credentials.cafile
            else:
                kwargs['verify'] = False

            cert = self.conf.rest_notifier_certificate_file
            key = self.conf.rest_notifier_certificate_key
            if cert:
                # Pass a (cert, key) pair when a key is configured, else
                # the certificate alone.
                if key:
                    kwargs['cert'] = (cert, key)
                else:
                    kwargs['cert'] = cert

        # Poll until the pool has a free slot, then hand the post over.
        while self.green_pool.free() <= 0:
            LOG.debug("wait for green pool free")
            eventlet.sleep(0.1)
        self.green_pool.spawn_n(post_with_retry, alarm_id, action.geturl(), self.conf, **kwargs)
        # Yield so that the spawned green thread gets a chance to run.
        eventlet.sleep(0)

2.3 post_with_retry()

def post_with_retry(alarm_id, url, conf, **kwargs):
    """Post alarm information to URL.

    The alarm is first handled under a lock (``_post_in_sync``) so that
    green threads consult and update the URL cache in order. When the
    cache shows a successful post to ``url`` within the last 30 seconds,
    the locked step returns ``'path_clear'`` without posting, and the
    alarm is then posted without the lock (``_post_in_unsync``) so that
    alarms can be sent concurrently.

    A post is considered successful when no exception is raised and the
    return code is not 5xx; per the original notes, ceilometerclient is
    then used to clear the retry_policy flag for group alarms.

    :param alarm_id: ID of the alarm being posted.
    :param url: target URL of the alarm action.
    :param conf: configuration object passed to ``_get_ceilometer_client``.
    :param kwargs: keyword arguments forwarded to ``_check_cert`` and
                   ``requests.post``.
    :returns: the boolean result of the unlocked post when the path was
              clear, otherwise None.
    """

    def _accept_unsync_response(resp):
        # Shared response handling for the unlocked path: anything but a
        # 5xx counts as delivered, and the group-alarm retry field is
        # updated unless the peer answered 400.
        LOG.info("Post alarm %s, return code %s, body %s",
                 alarm_id, resp.status_code, resp.content)
        if 500 <= resp.status_code < 600:
            return False
        if resp.status_code != 400:
            _change_group_alarm_retry_field(alarm_id)
        return True

    @lockutils.synchronized('notifier_lock')
    def _post_in_sync(alarm_id, url, **kwargs):
        # Post the alarm inside the synchronized lock to ensure all green
        # threads process the cache in order.
        LOG.info("Post alarm %s in sync", alarm_id)

        cached_urls = inner_cache.get_cached_urls()
        now = datetime.datetime.utcnow()
        is_ok = None
        # Age of the cached record in seconds, or None when there is no
        # usable record. total_seconds() is used on purpose:
        # timedelta.seconds ignores whole days, so a record older than a
        # day would otherwise look only seconds old.
        age = None
        entry = cached_urls.get(url)
        if entry:
            is_ok = entry.get("is_ok")
            stamp = entry.get("timestamp")
            if stamp is not None:
                age = (now - stamp).total_seconds()

        LOG.debug("the cached urls is %s", cached_urls)
        # If a post to this alarm action url failed within the last 120s,
        # do not post again; True is returned so the caller does not go on
        # to the unlocked path.
        if age is not None and age < 120 and not is_ok:
            LOG.info("post request of alarm %s has abandoned.", alarm_id)
            return True

        # Consider the network status ok for 30s after a successful post;
        # 'path_clear' tells the caller to post without the lock.
        if is_ok and age is not None and age <= 30:
            LOG.debug("alarm %s posting path is clear.", alarm_id)
            return 'path_clear'

        # Per the original note: _check_cert returns "SSL Verify Over" for
        # a localdomain url, otherwise the response of the request it
        # made, or the exception object if one was raised.
        ssl_verify_result = _check_cert(url, **kwargs)
        if ssl_verify_result == "SSL Verify Over":
            # Certificate authentication reliable, start to post alarm.
            kwargs['verify'] = False
            try:
                resp = requests.post(url, **kwargs)
            except Exception as e:
                LOG.info("Failed to post alarm %s because %s", alarm_id, e)
            else:
                # Per the original note: updates the alarm retry_policy
                # and caches the alarm action url.
                if _process_resp(resp, alarm_id, url, cached_urls):
                    return True
        elif isinstance(ssl_verify_result, requests.models.Response):
            # Certificate verified and alarm request already sent.
            if _process_resp(ssl_verify_result, alarm_id, url, cached_urls):
                return True
        else:
            LOG.info("Failed to post alarm %s because %s",
                     alarm_id, ssl_verify_result)

        # Record the failure unless a record younger than 120s exists.
        if age is None or age >= 120:
            cached_urls[url] = {"timestamp": datetime.datetime.utcnow(),
                                "is_ok": False}

        LOG.info("post alarm %s failed in sync.", alarm_id)
        return False

    def _post_in_unsync(alarm_id, url, **kwargs):
        # Post alarms without the lock so that they can go out concurrently.
        LOG.info("Post alarm %s in unsync", alarm_id)

        ssl_verify_result = _check_cert(url, **kwargs)
        if ssl_verify_result == "SSL Verify Over":
            # Certificate authentication reliable, start to post alarm.
            kwargs['verify'] = False
            try:
                resp = requests.post(url, **kwargs)
            except Exception as e:
                LOG.info("Failed to post alarm %s because %s", alarm_id, e)
            else:
                if _accept_unsync_response(resp):
                    return True
        elif isinstance(ssl_verify_result, requests.models.Response):
            # Certificate verified and alarm request already sent.
            if _accept_unsync_response(ssl_verify_result):
                return True
        else:
            LOG.info("Failed to post alarm %s because SSL verify failed: %s",
                     alarm_id, ssl_verify_result)
        return False

    _get_ceilometer_client(conf)

    # Every alarm goes through the synchronized step first. If the last
    # post succeeded, the network is considered ok for 30 seconds and
    # alarms are posted concurrently, so that a single slow post does not
    # hold everything up. Compared with ==, not `is`: identity of equal
    # strings is an interpreter detail.
    if _post_in_sync(alarm_id, url, **kwargs) == 'path_clear':
        return _post_in_unsync(alarm_id, url, **kwargs)
    return None
  • 2
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值