Nagios数据提取和维护

Nagios数据提取和维护

=======================================================================
提取http服务的主机名,服务名,端口号,及url
create table test1
(
SELECT
    display_name, address, host_object_id
FROM
    nagios_hosts
WHERE
    host_object_id IN (
        SELECT
            host_object_id
        FROM
            nagios_services
        WHERE
            check_command_object_id IN (
                SELECT
                    object_id
                FROM
                    nagios_commands
                WHERE
                    command_line LIKE "%check_http -H%"
                AND config_type = 1
            )
        AND config_type = 1
    )
AND config_type = 1
)
------------
create table test2
(
SELECT
    host_object_id,
    display_name,
    check_command_args
FROM
    nagios_services
WHERE
    check_command_object_id IN (
        SELECT
            object_id
        FROM
            nagios_commands
        WHERE
            command_line LIKE "%check_http -H%"
        AND config_type = 1
    )
AND config_type = 1
)

--------------
SELECT
    test1.display_name,
    test1.address,
    test2.display_name,
    test2.check_command_args
FROM
    test1,
    test2
WHERE
    test1.host_object_id = test2.host_object_id;


------------
drop table test1;
drop table test2;
-------------
----------------------------------------------------------------------------------
主机描述                                 ip            web描述         端口及web
**-**.172.28.**.162.**m1.app.rhl6.4    172.28.**.162    **web    450**!http://**
****-**.172.28.**.163.**m2.app.rhl6.4    172.28.**.163    **web    450**!http://**
----------------------------------------------------------------------------------
=======================================================================

=======================================================================
查询服务组的服务分类列表:
SELECT
    a.servicegroup_id,
    b.alias,
    a.address,
    a.display_name,
    a.t2name,
    a.talias
FROM
    (
        SELECT
            m.address,
            m.display_name,
            m.t2name,
            m.alias talias,
            p.servicegroup_id
        FROM
            nagios_servicegroup_members p,
            (
                SELECT
                    t2.service_object_id,
                    t1.alias,
                    t1.address,
                    t1.display_name,
                    t2.display_name t2name
                FROM
                    nagios_hosts t1,
                    nagios_services t2
                WHERE
                    t1.host_object_id = t2.host_object_id
                AND t1.config_type = '1'
            ) m
        WHERE
            p.service_object_id = m.service_object_id
    ) a,
    nagios_servicegroups b
WHERE
    a.servicegroup_id = b.servicegroup_id
ORDER BY
    1 DESC;

=======================================================================

=======================================================================
已纳入Nagios监控平台进行监控的服务器数量:
SELECT
    'Linux服务器',
    count(DISTINCT(host_name))
FROM
    index_data
WHERE
    host_name LIKE '%rhl%'
UNION
    SELECT
        'AIX服务器',
        count(DISTINCT(host_name))
    FROM
        index_data
    WHERE
        host_name LIKE '%AIX'
    OR host_name LIKE '%aix'
    UNION
        SELECT
            'Windows服务器',
            count(DISTINCT(host_name))
        FROM
            index_data
        WHERE
            host_name LIKE '%win'
        UNION
            SELECT
                '虚拟机平台Esxi物理服务器',
                count(DISTINCT(host_name))
            FROM
                index_data
            WHERE
                host_name LIKE 'ESX%';
-----------------------------------------------------------------------------
Linux服务器    22111
AIX服务器    1011
Windows服务器    5411
虚拟机平台Esxi物理服务器    661
------------------------------------------------------------------------------

一个故障事件即为Nagios监控平台监控到的一次严重级别的事件。
主机存活、网页和数据库连接的监控频率为1分钟一次,其它监控指标一般为5分钟一次
================================================================================

================================================================================
本月具体服务事件前10排名:
SELECT
    service_description,
    count(
        DISTINCT ctime,
        service_description
    ) AS event_number
FROM
    log
WHERE
    from_unixtime(ctime, '%Y%m') = "201409"
AND service_description IS NOT NULL
AND (
    STATUS = "CRITICAL"
    OR STATUS = "DOWN"
)
GROUP BY
    service_description
ORDER BY
    count(*) DESC
LIMIT 10;
-----------------------------------------------------------------------------


-----------------------------------------------------------------------------
==============================================================================

==============================================================================
本月最频繁发生故障的服务器前10:
SELECT
    host_name,
    count(DISTINCT ctime, host_name) AS event_number
FROM
    log
WHERE
    from_unixtime(ctime, '%Y%m') = "201409"
AND service_description IS NOT NULL
AND (
    STATUS = "CRITICAL"
    OR STATUS = "DOWN"
)
GROUP BY
    host_name
ORDER BY
    count(DISTINCT ctime, host_name) DESC
LIMIT 10;

-------------------------------------------------------------------------------
 host_name      event_number
172.16.200.53    163
192.168.9.173    5
192.168.9.178    5
192.168.9.185    5
-------------------------------------------------------------------------------
===============================================================================

===============================================================================

本月最频繁发生故障的服务器前10排名-对应的主要故障事件:
drop procedure if exists top_server_events;
DELIMITER $$
CREATE PROCEDURE top_server_events(in v_ym int(6))
BEGIN
  declare v_host_name varchar(255);
  declare v_count int default 0;
  declare stopFlag int default 0;
  DECLARE v_cursor CURSOR FOR
    select host_name,count(distinct ctime,host_name) as event_number from log
      where from_unixtime(ctime,'%Y%m') = v_ym
        and service_description is not null and ( status="CRITICAL" or status="DOWN" )
      group by host_name
      order by count(distinct ctime,host_name) desc
      limit 10;
  DECLARE CONTINUE HANDLER FOR SQLSTATE '02000' SET stopFlag = 1 ;
  select "start time ... ",now();
  OPEN v_cursor;
  FETCH v_cursor INTO v_host_name,v_count;
  WHILE stopFlag <> 1 DO
    select v_host_name,service_description,count(distinct ctime,service_description) as event_number from log
      where from_unixtime(ctime,'%Y%m')=v_ym and host_name = v_host_name
        and service_description is not null and ( status="CRITICAL" or status="DOWN" )
      group by service_description
      having count(distinct ctime,service_description) > 300
      order by count(distinct ctime,service_description) desc
      limit 10;
      FETCH v_cursor INTO v_host_name,v_count;
  END WHILE;
  CLOSE v_cursor;
  select "end time ... ",now();
END $$
DELIMITER ;


call top_server_events(201406);

生成服务器CPU使用率报表数据:
call cpu_util_linux(201406);
call cpu_util_aix(201406);
call cpu_util_win(201406);
call cpu_util_esxi(201406);

本月"CPU使用率"峰值超过80%的服务器:
select host_name,cpu_max,cpu_avg from cpu_util where ym=201406 and cpu_max > 80;

本月“CPU使用率”峰值超过80%的服务器-对应的主要故障事件:
select host_name,service_description,count(distinct ctime,host_name,service_description) as event_number from log
  where from_unixtime(ctime,'%Y%m')=201406 and host_name in (
    select host_name from cpu_util where ym=201406 and cpu_max > 80 )
  and service_description is not null and ( status="CRITICAL" or status="DOWN" )
  group by host_name,service_description
  order by count(distinct ctime,host_name,service_description) desc
  limit 10 ;


本月"CPU使用率"平均值超过50%的服务器:
select host_name,cpu_avg,cpu_max from cpu_util where ym=201406 and cpu_avg > 50;

本月“CPU使用率”平均值超过50%的服务器-对应的故障事件:
select host_name,service_description,count(distinct ctime,host_name,service_description) as event_number from log
  where from_unixtime(ctime,'%Y%m')=201406 and host_name in (
    select host_name from cpu_util where ym=201406 and cpu_avg > 50 )
  and service_description is not null and ( status="CRITICAL" or status="DOWN" )
  group by host_name,service_description
  order by count(distinct ctime,host_name,service_description) desc
  limit 10;

附1:
create table cpu_util (ym int(6),host_name varchar(255),cpu_max float(5,2),cpu_avg float(5,2));

drop procedure if exists cpu_util_linux;
DELIMITER $$
CREATE PROCEDURE cpu_util_linux(in v_ym int(6))
BEGIN
  declare v_host_name varchar(255);
  declare stopFlag int default 0;
  DECLARE v_cursor CURSOR FOR select distinct(host_name) from index_data where host_name like '%rhl%' ;
/*  DECLARE CONTINUE HANDLER FOR NOT FOUND set stopFlag = 1 ;  */
  DECLARE CONTINUE HANDLER FOR SQLSTATE '02000' SET stopFlag = 1 ;
  select "start time ... ",now();
  OPEN v_cursor;
  FETCH v_cursor INTO v_host_name;
  WHILE stopFlag <> 1 DO
      insert into cpu_util(ym,host_name,cpu_max,cpu_avg)
          select v_ym,v_host_name,max(value),avg(value)
          from data_bin where id_metric = (
            select metric_id from metrics where index_id = (
              select id from index_data c where c.service_description="CPU使用率"
                and c.host_name = v_host_name
              )
              and metric_name = "CpuUser"
            )
          and from_unixtime(ctime,'%Y%m') = v_ym ;
      FETCH v_cursor INTO v_host_name;
  END WHILE;
  CLOSE v_cursor;
  select "end time ... ",now();
END $$
DELIMITER ;


drop procedure if exists cpu_util_aix;
DELIMITER $$
CREATE PROCEDURE cpu_util_aix(in v_ym int(6))
BEGIN
  declare v_host_name varchar(255);
  declare stopFlag int default 0;
  DECLARE v_cursor CURSOR FOR select distinct(host_name) from index_data where host_name like '%AIX' or host_name like '%aix' ;
/*  DECLARE CONTINUE HANDLER FOR NOT FOUND set stopFlag = 1 ;  */
  DECLARE CONTINUE HANDLER FOR SQLSTATE '02000' SET stopFlag = 1 ;
  select "start time ... ",now();
  OPEN v_cursor;
  FETCH v_cursor INTO v_host_name;
  WHILE stopFlag <> 1 DO
      insert into cpu_util(ym,host_name,cpu_max,cpu_avg)
          select v_ym,v_host_name,max(value),avg(value)
          from data_bin where id_metric = (
            select metric_id from metrics where index_id = (
              select id from index_data c where c.service_description="CPU使用率"
                and c.host_name = v_host_name
              )
              and metric_name = "cpuusage"
            )
          and from_unixtime(ctime,'%Y%m') = v_ym ;
      FETCH v_cursor INTO v_host_name;
  END WHILE;
  CLOSE v_cursor;
  select "end time ... ",now();
END $$
DELIMITER ;


drop procedure if exists cpu_util_win;
DELIMITER $$
CREATE PROCEDURE cpu_util_win(in v_ym int(6))
BEGIN
  declare v_host_name varchar(255);
  declare stopFlag int default 0;
  DECLARE v_cursor CURSOR FOR select distinct(host_name) from index_data where host_name like '%win' ;
/*  DECLARE CONTINUE HANDLER FOR NOT FOUND set stopFlag = 1 ;  */
  DECLARE CONTINUE HANDLER FOR SQLSTATE '02000' SET stopFlag = 1 ;
  select "start time ... ",now();
  OPEN v_cursor;
  FETCH v_cursor INTO v_host_name;
  WHILE stopFlag <> 1 DO
      insert into cpu_util(ym,host_name,cpu_max,cpu_avg)
          select v_ym,v_host_name,max(value),avg(value)
          from data_bin where id_metric = (
            select metric_id from metrics where index_id = (
              select id from index_data c where c.service_description="CPU使用率"
                and c.host_name = v_host_name
              )
              and metric_name = "processor usage"
            )
          and from_unixtime(ctime,'%Y%m') = v_ym ;
      FETCH v_cursor INTO v_host_name;
  END WHILE;
  CLOSE v_cursor;
  select "end time ... ",now();
END $$
DELIMITER ;


drop procedure if exists cpu_util_esxi;
DELIMITER $$
CREATE PROCEDURE cpu_util_esxi(in v_ym int(6))
BEGIN
  declare v_host_name varchar(255);
  declare stopFlag int default 0;
  DECLARE v_cursor CURSOR FOR select distinct(host_name) from index_data where host_name like 'ESX%' ;
/*  DECLARE CONTINUE HANDLER FOR NOT FOUND set stopFlag = 1 ;  */
  DECLARE CONTINUE HANDLER FOR SQLSTATE '02000' SET stopFlag = 1 ;
  select "start time ... ",now();
  OPEN v_cursor;
  FETCH v_cursor INTO v_host_name;
  WHILE stopFlag <> 1 DO
      insert into cpu_util(ym,host_name,cpu_max,cpu_avg)
          select v_ym,v_host_name,max(value),avg(value)
          from data_bin where id_metric = (
            select metric_id from metrics where index_id = (
              select id from index_data c where c.service_description="服务器CPU使用率"
                and c.host_name = v_host_name
              )
              and metric_name = "cpu_usage"
            )
          and from_unixtime(ctime,'%Y%m') = v_ym ;
      FETCH v_cursor INTO v_host_name;
  END WHILE;
  CLOSE v_cursor;
  select "end time ... ",now();
END $$
DELIMITER ;
==================================================================================

==================================================================================
本月应用系统网页故障情况

SELECT
    service_description,
    output,
    count(
        DISTINCT ctime,
        service_description,
        output
    ) AS event_number
FROM
    log
WHERE
    from_unixtime(ctime, '%Y%m') = 201406
AND (
    service_description LIKE '%WEB%'
    OR service_description LIKE '%web%'
)
AND service_description IS NOT NULL
AND (
    STATUS = "CRITICAL"
    OR STATUS = "DOWN"
)
GROUP BY
    service_description,
    output
ORDER BY
    count(
        DISTINCT ctime,
        service_description,
        output
    ) DESC
LIMIT 10;
--------------------------------------------------------------------------------------

=======================================================================================

=======================================================================================
SELECT
    host_name,
    count(DISTINCT ctime, host_name)
FROM
    log
WHERE
    from_unixtime(ctime, '%Y%m') = 201406
AND STATUS = "DOWN"
GROUP BY
    host_name
ORDER BY
    count(*) DESC;


========================================================================================


本月数据库访问故障

SELECT
    service_description,
    count(
        DISTINCT ctime,
        service_description
    ) AS event_number
FROM
    log
WHERE
    from_unixtime(ctime, '%Y%m') = 201406
AND (
    service_description LIKE '%数据库监听%'
    OR service_description LIKE '%数据库连接时间%'
)
AND output IS NOT NULL
AND (
    STATUS = "CRITICAL"
    OR STATUS = "DOWN"
)
GROUP BY
    service_description
ORDER BY
    count(
        DISTINCT ctime,
        service_description
    ) DESC;

===========================================================================================


===========================================================================================

可用率计算表:centreon->report->
SELECT
    from_unixtime(
        date_start,
        '%Y-%m-%d %H:%i:%S'
    ) date_start,
    from_unixtime(
        date_end,
        '%Y-%m-%d %H:%i:%S'
    ) date_end,
    OKTimeScheduled,
    OKnbEvent,
    OKTimeAverageAck,
    OKTimeAverageRecovery,
    WARNINGTimeScheduled,
    WARNINGnbEvent,
    WARNINGTimeAverageAck,
    WARNINGTimeAverageRecovery,
    UNKNOWNTimeScheduled,
    UNKNOWNnbEvent,
    UNKNOWNTimeAverageAck,
    UNKNOWNTimeAverageRecovery,
    CRITICALTimeScheduled,
    CRITICALnbEvent,
    CRITICALTimeAverageAck,
    CRITICALTimeAverageRecovery,
    UNDETERMINEDTimeScheduled,
    MaintenanceTime
FROM
    log_archive_service
WHERE
    service_id = (
        SELECT
            service_id
        FROM
            index_data
        WHERE
            service_description = "可用率_新核心数据库"
    );

----------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------
可用率算法:
OKTimeScheduled:    26101
CRITICALTimeScheduled:    60
UNDETERMINEDTimeScheduled: 60239
total:    86400

ok%    = 26101/86400*100=30.20949074           =OKTimeScheduled/total*100
未确定%    = 60239/86400=69.72106481           =UNDETERMINEDTimeScheduled/total*100
critical%    =60/86400=0.069444444             =CRITICALTimeScheduled/total*100
ok(%)    =(1-60/26101)*100=99.77012375         =(1-CRITICALTimeScheduled/OKTimeScheduled)*100


可用率维护操作:

1、由于监控平台本身的原因导致缺少某一天的监控数据,则需要根据实际情况增加1条记录,否则对可用率有很大影响。

select
from_unixtime(date_start,'%Y-%m-%d %H:%i:%S') date_start2,
from_unixtime(date_end,'%Y-%m-%d %H:%i:%S') date_end2,
log_id,
host_id,
service_id,
OKTimeScheduled,
OKnbEvent,
OKTimeAverageAck,
OKTimeAverageRecovery,
WARNINGTimeScheduled,
WARNINGnbEvent,
WARNINGTimeAverageAck,
WARNINGTimeAverageRecovery,
UNKNOWNTimeScheduled,
UNKNOWNnbEvent,
UNKNOWNTimeAverageAck,
UNKNOWNTimeAverageRecovery,
CRITICALTimeScheduled,
CRITICALnbEvent,
CRITICALTimeAverageAck,
CRITICALTimeAverageRecovery,
UNDETERMINEDTimeScheduled,
MaintenanceTime,
date_start,
date_end
from log_archive_service
where service_id=(select service_id
                  from index_data
                  where service_description="可用率_新核心数据库"
                 );

缺少一天的数据,则会产生undetermined 1天:
| 2014-04-09 00:00:00 | 2014-04-10 00:00:00 |  26939 |      59 |        768 |           86400
| 2014-04-11 00:00:00 | 2014-04-12 00:00:00 |  28766 |      59 |        768 |           86400                         

补2014-04-10一条记录:
insert into log_archive_service(
host_id,
service_id,
OKTimeScheduled,
OKnbEvent,
OKTimeAverageAck,
OKTimeAverageRecovery,
WARNINGTimeScheduled,
WARNINGnbEvent,
WARNINGTimeAverageAck,
WARNINGTimeAverageRecovery,
UNKNOWNTimeScheduled,
UNKNOWNnbEvent,
UNKNOWNTimeAverageAck,
UNKNOWNTimeAverageRecovery,
CRITICALTimeScheduled,
CRITICALnbEvent,
CRITICALTimeAverageAck,
CRITICALTimeAverageRecovery,
UNDETERMINEDTimeScheduled,
MaintenanceTime,
date_start,
date_end
)
values
(
59,
768,
86400,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
unix_timestamp('2014-04-10 00:00:00'),
unix_timestamp('2014-04-11 00:00:00')
);


2、由于误报,需要对可用率的数据进行调整

UPDATE log_archive_service
SET OKTimeScheduled = 86400,
 CRITICALTimeScheduled = 0,
 CRITICALnbEvent = 0,
 OKnbEvent = 0
WHERE
    from_unixtime(date_start, '%Y-%m-%d') >= "2014-03-01"
AND from_unixtime(date_start, '%Y-%m-%d') <= "2014-03-31"
AND service_id = (
    SELECT
        service_id
    FROM
        index_data
    WHERE
        service_description = "可用率_核心数据库"
);
===============================================================================


UPDATE log_archive_service
SET OKTimeScheduled = 86400,
 CRITICALTimeScheduled = 0,
 CRITICALnbEvent = 0,
 OKnbEvent = 0,
 MaintenanceTime = 0
WHERE
    from_unixtime(date_start, '%Y-%m-%d') >= "2014-06-25"
AND from_unixtime(date_start, '%Y-%m-%d') <= "2014-06-26"
AND service_id = (
    SELECT
        service_id
    FROM
        index_data
    WHERE
        service_description = "可用率_核心数据库"
);

=================================================================================

来自 “ ITPUB博客 ” ,链接:http://blog.itpub.net/29500582/viewspace-1315905/,如需转载,请注明出处,否则将追究法律责任。

转载于:http://blog.itpub.net/29500582/viewspace-1315905/

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值