搭建nagios监控平台

最新推荐文章于 2024-06-08 23:28:53 发布

YOUNGYUYEAH

最新推荐文章于 2024-06-08 23:28:53 发布

阅读量2.8k

点赞数

分类专栏：监控文章标签： nagios 配置

本文链接：https://blog.csdn.net/YOUNGYUYEAH/article/details/72921210

版权

监控专栏收录该内容

1 篇文章 0 订阅

订阅专栏

Centos7，使用nagios-4.3.1.tar.gz，nagios-plugins-2.2.1 nrpe2.12.tar.gz源码包

安装nagios主程序：./configure --prefix=/usr/local/nagios --with-group=nagios --with-user=nagios && make all && make install && make install-init && make install-commandmode && make install-config

安装nagios-plugins：./configure --prefix=/usr/local/nagios

安装nrpe远程插件：./configure --prefix=/usr/local/nagios

创建nagios用户和用户组（需要在执行上面的make install之前完成）：groupadd nagios && useradd -s /sbin/nologin -g nagios nagios

修改nagios属主：chown -R nagios:nagios /usr/local/nagios

nagios的使用最好是通过web展示或者你也可以只使用它的告警功能，这里介绍下nagios的web部署

①：使用apache做web

首先搭建好apache+php的环境，在测试过php文件可以被正确解析之后，可以配置apache的httpd.conf文件加入Listen 800，我们打算将nagios应用部署在800端口上

httpd.conf 检查如下字段：

#添加监听端口
Listen 800

#修改用户
User nagios
Group nagios

#确保php模块已经开启
LoadModule php5_module  modules/libphp5.so

#确保cgi模块已经开启
LoadModule cgid_module modules/mod_cgid.so

#添加主页php支持
DirectoryIndex index.php

#添加类型映射
AddType application/x-httpd-php .php
AddType application/x-httpd-php-source .php5

#将nagios配置独立出来，使httpd.conf整洁些
#conf/nagios.conf的位置取决于apache主目录（apache不允许在指令行尾写注释，所以注释要单独成行）
Include conf/nagios.conf

然后新建并编辑nagios.conf

#nagios setting
#监听800端口请求的虚拟主机配置
<VirtualHost *:800>
    ServerName localhost:800
    DocumentRoot "/usr/local/nagios/share"

ScriptAlias /nagios/cgi-bin /usr/local/nagios/sbin
<Directory "/usr/local/nagios/sbin">
    AuthType Basic                                             
    Options ExecCGI
    AllowOverride None
    Order allow,deny
    Allow from all 
    AuthName "Nagios server"
    AuthUserFile /usr/local/nagios/etc/htpasswd
    Require valid-user
</Directory>
<Directory "/usr/local/nagios/share">
    AuthType Basic
    Options None
    AllowOverride None
    Order allow,deny
    Allow from all 
    AuthName "Nagios server"
    AuthUserFile /usr/local/nagios/etc/htpasswd
    Require valid-user
</Directory>
</VirtualHost>

这一段其实在安装nagios时执行make install-webconf就会自动生成到apache的配置目录（默认是/etc/httpd/conf.d/nagios.conf），但是如果找不到该配置目录（例如apache是源码安装的）就无法添加，需要像上面这样手动编辑

然后使用apache的htpasswd工具创建密码用于认证：htpasswd -c /usr/local/nagios/etc/htpasswd[存放路径] 用户[nagios]

然后启动apache并且访问localhost:800 看是否可以加载nagios页面

如果可以正常加载就可以进入nagios的编辑

②：使用nginx做web

要使用nginx作为nagios的web展示页面需要通过一些perl-fcgi的插件

需要FCGI-0.74，FCGI-ProcManager-0.28，IO-1.25和IO-All-0.86

这几个都是perl Makefile.PL && make && make install 就可以了

下面这段是perl-fcgi.pl，可以在网上下载到：通过这段pl脚本执行

/usr/bin/perl /usr/local/nginx/conf/perl-fcgi.pl -l /usr/local/nginx/logs/perl-fcgi.log -pid /usr/local/nginx/logs/perl-fcgi.pid -S /usr/local/nginx/logs/perl-fcgi.sock

意思是：使用/usr/bin/perl执行/usr/local/nginx/conf/perl-fcgi.pl脚本，并对应的在/usr/local/nginx/logs/生成日志和pid文件和sock文件

#!/usr/bin/perl
#
#	author		Daniel Dominik Rudnicki
#	thanks to:	Piotr Romanczuk
#	email		daniel@sardzent.org
#	version		0.4.3
#	webpage		http://www.nginx.eu/
#
#	BASED @ http://wiki.codemongers.com/NginxSimpleCGI
#
#
# use strict;
use FCGI;
use Getopt::Long;
use IO::All;
use Socket;

# Parse and validate the command line, open the FastCGI listening socket and
# write the current process id to the PID file.
#
# Recognised options (Getopt::Long):
#   -h            print usage and exit
#   -verbose      print progress messages to STDOUT
#   -l FILE       log file (required)
#   -pid FILE     PID file (required, must not exist yet)
#   -S PATH       listen on a UNIX socket
#   -P PORT       listen on a TCP port
# Exactly one of -S / -P must be given.
#
# The parsed values and the opened socket are left in the package variables
# $help, $verbose, $filepid, $logfile, $unixsocket, $unixport, $portnumber
# and $socket, which the rest of the script reads. Every validation failure
# prints a message and exits with status 1.
sub init {
	GetOptions(	"h"	=> \$help,
			"verbose!"=>\$verbose,
			"pid=s"	=> \$filepid,
			"l=s" => \$logfile,
			"S:s"   => \$unixsocket,
			"P:i"   => \$unixport) or usage();
	usage() if $help;

	print "\tStarting Nginx-fcgi\n" if $verbose;
	print "\tRunning with $> UID\n" if $verbose;
	print "\tPerl $]\n" if $verbose;

	# Running as root is deliberately not rejected; the check that used to
	# refuse UID 0 is disabled in this script.

	if ( ! $logfile ) {
		print "\n\tERROR\t log file must declared\n"
			. "\tuse $0 with option -l filename\n\n";
		exit 1;
	}
	print "\tUsing log file $logfile\n" if $verbose;
	"\n\n" >> io($logfile);
	addlog($logfile, "Starting Nginx-cfgi");
	addlog($logfile, "Running with $> UID");
	addlog($logfile, "Perl $]");
	addlog($logfile, "Testing socket options");

	# Exactly one of -S / -P is accepted: both set or both unset is an error.
	if ( ($unixsocket && $unixport) || (!($unixsocket) && !($unixport)) ) {
		print "\n\tERROR\tOnly one option can be used!\n";
		print "\tSuggested (beacuse of speed) is usage UNIX socket -S \n\n";
		exit 1;
	}

	if ($unixsocket) {
		print "\tDaemon listening at UNIX socket $unixsocket\n" if $verbose;
		addlog($logfile, "Deamon listening at UNIX socket $unixsocket");
	} else {
		print "\tDaemon listening at TCP/IP socket *:$unixport\n" if $verbose;
		addlog($logfile, "Daemon listening at TCP/IP socket *:$unixport");
	}

	# The PID file name must be known before it can be tested for existence,
	# so the "required" check comes first.
	if ( ! $filepid ) {
		print "\n\tERROR\t PID file must declared\n"
			. "\tuse $0 with option -pid filename\n\n";
		exit 1;
	}

	if ( -e $filepid ) {
		print "\n\tERROR\t PID file $filepid already exists\n\n";
		addlog($logfile, "Can not use PID file $filepid, already exists.");
		exit 1;
	}

	if ( $unixsocket ) {
		print "\tCreating UNIX socket\n" if $verbose;
		$socket = FCGI::OpenSocket( $unixsocket, 10 );
		if ( !$socket) {
			print "\tCouldn't create socket\n";
			addlog($logfile, "Couldn't create socket");
			exit 1;
		}
		print "\tUsing UNIX socket $unixsocket\n" if $verbose;
	} else {
		print "\tCreating TCP/IP socket\n" if $verbose;
		# FCGI::OpenSocket treats a path starting with ":" as a TCP port;
		# a bare number would be taken as a UNIX socket file name.
		$portnumber = ":".$unixport;
		$socket = FCGI::OpenSocket( $portnumber, 10 );
		if ( !$socket ) {
			print "\tCouldn't create socket\n";
			addlog($logfile, "Couldn't create socket");
			exit 1;
		}
		print " Using port $unixport\n" if $verbose;
	}
	addlog($logfile, "Socket created");

	print "\tUsing PID file $filepid\n" if $verbose;
	addlog($logfile, "Using PID file $filepid");

	# Record this process's id; the top-level script overwrites the file
	# with the worker's id after forking.
	my $pidnumber = $$;
	$pidnumber > io($filepid);
	print " PID number $$\n" if $verbose;
	addlog($logfile, "PID number $pidnumber");
}

# Left-pad a number below 10 with a single "0"; any other value is
# returned unchanged.
sub addzero {
	my $value = shift;
	return $value < 10 ? "0$value" : $value;
}

# Return the current local time as "YYYY-MM-DD HH:MM" for log lines.
# Every field is zero-padded, including the hour.
sub logformat {
	# localtime() yields month as 0..11 and year as years since 1900.
	my ($min, $hour, $mday, $mon, $year) = (localtime(time))[1 .. 5];
	return sprintf('%04d-%02d-%02d %02d:%02d',
		$year + 1900, $mon + 1, $mday, $hour, $min);
}

# Append one line "[<timestamp>]   <message>" to the given log file.
#   $log_file    - path of the log file to append to
#   $log_message - text to record
# The timestamp comes from logformat(); the entry and its trailing newline
# are appended through IO::All's ">>" operator.
sub addlog {
	my ($log_file, $log_message) = @_;
	my $entry = sprintf '[%s]   %s', scalar logformat(), $log_message;
	$entry >> io($log_file);
	"\n" >> io($log_file);
}

# Print an error banner followed by the given message, then exit with
# status 1.
#   $message - text to show (first argument)
sub printerror {
	# List assignment takes the first argument; a scalar assignment from @_
	# would store the number of arguments instead of the text.
	my ($message) = @_;
	print "\n\tNginx FastCGI\tERROR\n"
		. "\t $message\n\n";
	exit 1;
}

# Print the command-line help and exit with status 1.
# The option names shown match the ones init() registers with GetOptions
# (-h, -verbose, -S, -P, -pid, -l).
sub usage {
	print "\n\tNginx FastCGI \n"
		. "\n\tusage: $0 [-h] [-verbose] -l file -pid file (-S path | -P port)\n"
		. "\n\t-h\t\t: this (help) message"
		. "\n\t-verbose\t: print progress messages"
		. "\n\t-S path\t\t: path for UNIX socket"
		. "\n\t-P port\t\t: port number"
		. "\n\t-pid file\t: path for pid file"
		. "\n\t-l file\t\t: path for logfile"
		. "\n\n\texample: $0 -S /var/run/nginx-perl_cgi.sock -l /var/log/nginx/nginx-cfgi.log -pid /var/run/nginx-fcgi.pid\n\n";
	exit 1;
}


# Script body: initialise, fork one worker that serves FastCGI requests, store
# the worker's PID in the PID file and let the parent exit.
init();
#
# Kept exactly as in the upstream script: an exit() override that dies with a
# "fakeexit" marker is installed and probed once through a string eval; if the
# probe failed with anything other than that marker the script exits here.
END() { } BEGIN() { }
*CORE::GLOBAL::exit = sub { die "fakeexit\nrc=".shift()."\n"; }; eval q{exit}; 
if ($@) { 
	exit unless $@ =~ /^fakeexit/; 
} ;

# fork part
my $pid = fork();

# fork() returns undef on failure. Without this check the "== 0" test below
# would treat undef as 0 and run the worker loop in the parent process.
if ( !defined $pid ) {
	my $fork_error = "$!";
	print STDERR "\n\tERROR\t Can not fork worker process: $fork_error\n\n";
	addlog($logfile, "Can not fork worker process: $fork_error");
	# init() already wrote the PID file; remove it so that the next start is
	# not refused with "already exists".
	unlink $filepid;
	exit 1;
}

if ( $pid == 0 ) {
	# Child: serve requests until the accept loop ends.
	main();
	exit 0;
}

# Parent: replace the PID written by init() with the worker's PID and leave.
print " Forking worker process with PID $pid\n" if $verbose;
addlog($logfile, "Forking worker process with PID $pid");
print " Update PID file $filepid\n" if $verbose;
addlog($logfile, "Update PID file $filepid");
$pid > io($filepid);
print "\tWorker process running.\n" if $verbose;
addlog ($logfile, "Parent process $$ is exiting");
exit 0;

# Worker entry point: bind a FastCGI request object to the listening socket
# opened by init(), serve requests while that object is valid, then close
# the socket. The socket is closed whether or not the request object could
# be created.
sub main {
	$request = FCGI::Request( \*STDIN, \*STDOUT, \*STDERR, \%req_params, $socket );
	request_loop() if $request;
	FCGI::CloseSocket( $socket );
}

# Accept FastCGI requests on the global $request until Accept() fails.
#
# For each request:
#   * for a POST with a positive CONTENT_LENGTH, read that many characters
#     of request body from STDIN;
#   * if SCRIPT_FILENAME is an executable, readable, non-empty file, run it
#     with the request parameters as its environment and relay its output;
#   * otherwise, or if the program cannot be started, answer with a
#     text/plain error message and log it.
#
# Reads the globals $request, %req_params, $verbose and $logfile.
sub request_loop {
	while ( $request->Accept() >= 0 ) {
		# processing any STDIN input from WebServer (for CGI-POST actions)
		my $stdin_passthrough = '';
		my $req_len = 0 + $req_params{'CONTENT_LENGTH'};
		if ( ($req_params{'REQUEST_METHOD'} eq 'POST') && ($req_len > 0) ) {
			while ( $req_len > 0 ) {
				my $char = getc(STDIN);
				# Stop on a short body instead of appending undef.
				last unless defined $char;
				$stdin_passthrough .= $char;
				$req_len--;
			}
		}

		# running the cgi app
		my $script = $req_params{SCRIPT_FILENAME};
		if ( (-x $script) && (-s $script) && (-r $script) ) {
			# Expose the request parameters to the CGI program for this
			# request only; local() restores %ENV at the end of the block so
			# one request's variables do not leak into the next.
			local %ENV = (%ENV, %req_params);

			addlog($logfile, "running $script") if $verbose;

			# http://perldoc.perl.org/perlipc.html#Safe-Pipe-Opens
			# NOTE(review): the request body is handed to the CGI program as
			# a command-line argument, not on its standard input - confirm
			# that the target CGIs expect that.
			if ( open my $cgi_app, '-|', $script, $stdin_passthrough ) {
				print <$cgi_app>;
				close $cgi_app;
			} else {
				# Only report a failure when the program really could not be
				# started; a successful response must not carry this text.
				print("Content-type: text/plain\r\n\r\n");
				print "Error: CGI app returned no output - Executing $script failed !\n";
				addlog($logfile, "Error: CGI app returned no output - Executing $script failed !");
			}
		} else {
			print("Content-type: text/plain\r\n\r\n");
			print "Error: No such CGI app - $script may not exist or is not executable by this process.\n";
			addlog($logfile, "Error: No such CGI app - $script may not exist or is not executable by this process.");
		}
	}
}

生成之后，就要修改nginx的nginx.conf来使用nagios了

#修改使用者为nagios，不然执行有异常
user nagios;

server
{
    listen        80;
    server_name   localhost;
    root   /usr/local/nagios/share;
    index  index.html index.php;
    
    auth_basic    "Nagios server";
    auth_basic_user_file /usr/local/nagios/etc/htpasswd;
    #认证这一块的密钥在apache部分有说，通过同样办法生成

    location ~ .*\.(php|php5)?$
    {
         fastcgi_pass   127.0.0.1:9000;
         fastcgi_index  index.php;
         fastcgi_param  SCRIPT_FILENAME  $document_root$fastcgi_script_name;
         include  /usr/local/nginx/conf/fastcgi_params;
    }

    location ~ .*\.(cgi|pl)?$
    {
         root   /usr/local/nagios/sbin;
         rewrite  ^/nagios/cgi-bin/(.*)\.cgi /$1.cgi break;
         fastcgi_pass   unix:/usr/local/nginx/logs/perl-fcgi.sock;
         fastcgi_index  index.cgi;
         fastcgi_param  SCRIPT_FILENAME  /usr/local/nagios/sbin$fastcgi_script_name;
         fastcgi_param  REMOTE_USER  $remote_user;
         include  /usr/local/nginx/conf/fastcgi_params;
    }

    location /nagios
    {
         alias  /usr/local/nagios/share;
    }  

    location /cgi-bin/images
    {
         alias  /usr/local/nagios/share/images;
    }

    location /cgi-bin/stylesheets
    {
         alias  /usr/local/nagios/share/stylesheets;
    }
    
    location /cgi-bin
    {
         alias  /usr/local/nagios/sbin;
    }

}

然后nginx -t 检查语法，没有异常就可以启动了

编辑nagios配置文件：

nagios的配置文件相当复杂，各个模板文件之间相互套用，下面只讲需要用到的几个

nagios.cfg：主配置文件；

#配置其他文件的读取路径
cfg_file=/usr/local/nagios/etc/objects/commands.cfg
cfg_file=/usr/local/nagios/etc/objects/contacts.cfg
cfg_file=/usr/local/nagios/etc/objects/timeperiods.cfg
cfg_file=/usr/local/nagios/etc/objects/templates.cfg 

#配置读取的目录
cfg_dir=/usr/local/nagios/etc/servers

cgi.cfg：控制cgi访问的配置文件；

#开启验证模式，不开启=0
use_authentication=1

#验证用户默认为nagiosadmin，添加通过htpasswd建立的验证用户nagios，用英文逗号","隔开（不能使用中文逗号"，"）
authorized_for_system_information=nagiosadmin,nagios
authorized_for_configuration_information=nagiosadmin,nagios
authorized_for_system_commands=nagiosadmin,nagios
authorized_for_all_services=nagiosadmin,nagios
authorized_for_all_hosts=nagiosadmin,nagios
authorized_for_all_service_commands=nagiosadmin,nagios
authorized_for_all_host_commands=nagiosadmin,nagios

resource.cfg：变量定义文件；

objects/commands.cfg：命令定义配置文件；

objects/contacts.cfg：联系人配置文件；

define contact{
        contact_name         nagios
        use                  generic-contact 
        alias                Nagios Admin
        email                admin@youngyuyeah.com.cn    ; 接收的邮箱地址
        }   
define contactgroup{
        contactgroup_name    admins
        alias                Nagios Administrators
        members              nagios
        }

objects/templates.cfg：模板文件；

#模板文件的介绍
define contact{
        name                            generic-contact    ; 联系人名称
        service_notification_period     24x7               ; 当服务出现异常时，使用24x7的时间定义
        host_notification_period        24x7               ; 当主机出现异常时，使用24x7的时间定义
        service_notification_options    w,u,c,r            ; 这个定义的是“通知可以被发出的情况”。w即warn，表示警告状态，u即unknown，表示不明状态;
                                                           ; c即criticle，表示紧急状态，r即recover，表示恢复状态;
                                                           ; 也就是在服务出现警告状态、未知状态、紧急状态和重新恢复状态时都发送通知给使用者。
        host_notification_options       d,u,r                   ; 定义主机在什么状态下需要发送通知给使用者，d即down，表示宕机状态;
                                                                ; u即unreachable，表示不可到达状态，r即recovery，表示重新恢复状态。
        service_notification_commands   notify-service-by-email ; 服务故障时，发送邮件;
        host_notification_commands      notify-host-by-email    ; 主机故障时，发送邮件;
        register                        0                    ; DONT REGISTER THIS DEFINITION - ITS NOT A REAL CONTACT, JUST A TEMPLATE!
        }
define host{
        name                            generic-host    ; 主机名称，这里的主机名，并不是直接对应到真正机器的主机名;
                                                        ; 乃是对应到在主机配置文件里所设定的主机名。
        notifications_enabled           1               ; Host notifications are enabled
        event_handler_enabled           1               ; Host event handler is enabled
        flap_detection_enabled          1               ; Flap detection is enabled
        failure_prediction_enabled      1               ; Failure prediction is enabled
        process_perf_data               1               ; 其值可以为0或1，其作用为是否启用Nagios的数据输出功能;
                                                        ; 如果将此项赋值为1，那么Nagios就会将收集的数据写入某个文件中，以备提取。
        retain_status_information       1               ; Retain status information across program restarts
        retain_nonstatus_information    1               ; Retain non-status information across program restarts
        notification_period             24x7            ; 指定“发送通知”的时间段，也就是可以在什么时候发送通知给使用者。
        register                        0               ; DONT REGISTER THIS DEFINITION - ITS NOT A REAL HOST, JUST A TEMPLATE!
        }
define host{
        name                            linux-server    ; 主机名称
        use                             generic-host    ; use表示引用，也就是将主机generic-host的所有属性引用到linux-server中来;
                                                        ; 在nagios配置中，很多情况下会用到引用。
        check_period                    24x7            ; 这里的check_period告诉nagios检查主机的时间段
        check_interval                  5               ; nagios对主机的检查时间间隔，这里是5分钟。
        retry_interval                  1               ; 重试检查时间间隔，单位是分钟。
        max_check_attempts              10              ; nagios对主机的最大检查次数，也就是nagios在检查发现某主机异常时，并不马上判断为异常状况;
                                                        ; 而是多试几次，因为有可能只是一时网络太拥挤，或是一些其他原因，让主机受到了一点影响;
                                                        ; 这里的10就是最多试10次的意思。
        check_command                   check-host-alive ; 指定检查主机状态的命令，其中“check-host-alive”在commands.cfg文件中定义。
        notification_period             24x7            ; 主机故障时，发送通知的时间
        notification_interval           10              ; 在主机出现异常后，故障一直没有解决，nagios再次对使用者发出通知的时间。单位是分钟;
                                                        ; 如果你觉得，所有的事件只需要一次通知就够了，可以把这里的选项设为0
        notification_options            d,u,r           ; 定义主机在什么状态下可以发送通知给使用者，d即down，表示宕机状态;
                                                        ; u即unreachable，表示不可到达状态;r即recovery，表示重新恢复状态。
        contact_groups                  admins          ; 指定联系人组，这个“admins”在contacts.cfg文件中定义。
        register                        0               ; DONT REGISTER THIS DEFINITION - ITS NOT A REAL HOST, JUST A TEMPLATE!
        }
define service{
        name                            generic-service         ; 定义一个服务名称
        active_checks_enabled           1                       ; Active service checks are enabled
        passive_checks_enabled          1                       ; Passive service checks are enabled/accepted
        parallelize_check               1                       ; Active service checks should be parallelized;
                                                                ; (disabling this can lead to major performance problems)
        obsess_over_service             1                       ; We should obsess over this service (if necessary)
        check_freshness                 0                       ; Default is to NOT check service 'freshness'
        notifications_enabled           1                       ; Service notifications are enabled
        event_handler_enabled           1                       ; Service event handler is enabled
        flap_detection_enabled          1                       ; Flap detection is enabled
        failure_prediction_enabled      1                       ; Failure prediction is enabled
        process_perf_data               1                       ; Process performance data
        retain_status_information       1                       ; Retain status information across program restarts
        retain_nonstatus_information    1                       ; Retain non-status information across program restarts
        is_volatile                     0                       ; The service is not volatile
        check_period                    24x7             ; 这里的check_period告诉nagios检查服务的时间段。
        max_check_attempts              3                ; nagios对服务的最大检查次数。
        normal_check_interval           5                ; 此选项是用来设置服务检查时间间隔，也就是说，nagios这一次检查和下一次检查之间所隔的时间;
                                                         ; 这里是5分钟。
        retry_check_interval            2                ; 重试检查时间间隔，单位是分钟。
        contact_groups                  admins           ; 指定联系人组，即contacts.cfg中定义的联系人组admins
        notification_options            w,u,c,r          ; 这个定义的是“通知可以被发出的情况”。w即warn，表示警告状态;
                                                         ; u即unknown，表示不明状态;c即criticle，表示紧急状态，r即recover，表示恢复状态;
        notification_interval           10               ; Re-notify about service problems every hour
        notification_period             24x7             ; 指定“发送通知”的时间段，也就是可以在什么时候发送通知给使用者。
        register                        0                ; DONT REGISTER THIS DEFINITION - ITS NOT A REAL SERVICE, JUST A TEMPLATE!
        }
define service{
        name                            local-service 
        use                             generic-service
        max_check_attempts              4
        normal_check_interval           5 
        retry_check_interval            1 
        register                        0
        }

objects/timeperiods.cfg：时间段配置文件；

#监控基本都是24*7，所以只保留这个就好
define timeperiod{                                           
    timeperiod_name 24x7
    alias           24hoursx7day
    sunday          00:00-24:00
    monday          00:00-24:00
    tuesday         00:00-24:00
    wednesday       00:00-24:00
    thursday        00:00-24:00
    friday          00:00-24:00
    saturday        00:00-24:00
}

然而我们实际中要检测的机器数量很巨大，所以我们最好分组，之前修改nagios.cfg让其读取servers目录，在servers目录下编辑几个实例[下面给出其中一个实例的部分]

define host{
        use                     linux-server        ;这里使用了templates里面linux-server的定义
        host_name               server1             ;机器的主机名，跟实际相同即可
        address                 192.168.100.1       ;机器的地址
}

define service{
        use                             local-service         ; 这里使用了templates里面local-service的定义
        host_name                       server1              ; 机器的主机名
        service_description             PING                  ; 检查的项目，在commands.cfg里定义了比较多的部分，调用该名称
    check_command           check_ping!100.0,20%!500.0,60%    ; 检查项目的预警阈值和警告阀值
        }   
                                                                                             
define service{
        use                             local-service
        host_name                       server1
        service_description             Root Partition
    check_command           check_nrpe!check_disk!20%!10%!/   ;通过check_nrpe去检测远端机器的check_disk，主要是远端机器的大部分需要加check_nrpe
        }   
#后面也以同样的方式定义多个服务检测项目即可

server的目录结构应该如此
├── server1.cfg

├── server2.cfg

├── server3.cfg

└── group.cfg

#通过一个group文件来编排几个server文件为一个组
define  hostgroup{
          hostgroup_name  Server
          alias           Linux Server
          members         server1,server2,server3                  
          }

但是其实要检测远程的机器是需要通过nrpe这个小插件来完成的，所以在远程机器上都需要部署nagios-plugins和nrpe

nrpe主要是用于传输检测到的数据，而nagios-plugins就是检测服务的命令集

pid_file=/usr/local/nagios/var/nrpe.pid
#根据实际安装路径改变
server_port=5666
#传输数据的端口
allowed_hosts=127.0.0.1,192.168.100.10
#添加允许的机器，这里的192.168.100.10是主控端
command[check_disk]=/usr/local/nagios/libexec/check_disk -w 20% -c 10%
command[check_procs]=/usr/local/nagios/libexec/check_procs -w 150 -c 300
command[check_swap]=/usr/local/nagios/libexec/check_swap -w 20% -c 10%
command[check_mem]=/usr/local/nagios/libexec/check_mem -w 90% -c 95%
#配合nagios-plugins制定一些自己想要的检查项目 -w 为warning，-c为critical

然后/usr/local/nagios/bin/nrpe -c /usr/local/nagios/etc/nrpe.cfg -d 以守护进程方式启动服务（nrpe 2.x使用-d，-f前台模式是nrpe 3.x才有的选项）；如果已经为nrpe准备了systemd服务文件，可以用systemctl enable nrpe.service添加自启动

可以使用主控端测试一下nrpe的连通性 /usr/local/nagios/libexec/check_nrpe -H 远程机器的ip，如果返回nrpe版本即正确

最后使用/usr/local/nagios/bin/nagios -v /usr/local/nagios/etc/nagios.cfg如果没有报错或警告就可以启动了