1.Prometheus部署
1.2.Prometheus安装(二进制)
tar -zxvf prometheus-2.17.1.linux-amd64.tar.gz
ln -sv /usr/local/src/prometheus-2.17.1.linux-amd64 /usr/local/prometheus
root@prometheus-server:/usr/local# cd /usr/local/prometheus
root@prometheus-server:/usr/local/prometheus# ll
total 142932
drwxr-xr-x 5 3434 3434 4096 Sep 6 11:38 ./
drwxr-xr-x 3 root root 4096 Sep 6 11:35 ../
drwxr-xr-x 2 3434 3434 4096 Mar 27 02:22 console_libraries/
drwxr-xr-x 2 3434 3434 4096 Mar 27 02:22 consoles/
drwxr-xr-x 3 root root 4096 Sep 6 11:38 data/
-rw-r--r-- 1 3434 3434 11357 Mar 27 02:22 LICENSE
-rw-r--r-- 1 3434 3434 3184 Mar 27 02:22 NOTICE
-rwxr-xr-x 1 3434 3434 84338005 Mar 27 00:20 prometheus*
-rw-r--r-- 1 3434 3434 926 Mar 27 02:22 prometheus.yml
-rwxr-xr-x 1 3434 3434 48235996 Mar 27 00:22 promtool*
-rwxr-xr-x 1 3434 3434 13732141 Mar 27 00:22 tsdb*
root@prometheus-server:/usr/local/prometheus#
1.3 创建Prometheus脚本
root@prometheus-server:/usr/local# cat /etc/systemd/system/prometheus.service
[Unit]
Description=Prometheus Server
Documentation=https://prometheus.io/docs/introduction/overview/
After=network.target
[Service]
Restart=on-failure
WorkingDirectory=/usr/local/prometheus/
ExecStart=/usr/local/prometheus/prometheus --config.file=/usr/local/prometheus/prometheus.yml
[Install]
WantedBy=multi-user.target
root@prometheus-server:/usr/local#
1.4 启动prometheus服务
systemctl start prometheus
systemctl enable prometheus
root@prometheus-server:/usr/local# systemctl status prometheus
● prometheus.service - Prometheus Server
Loaded: loaded (/etc/systemd/system/prometheus.service; enabled; vendor preset: enabled)
Active: active (running) since Sun 2020-09-06 11:38:03 CST; 18min ago
Docs: https://prometheus.io/docs/introduction/overview/
Main PID: 2409 (prometheus)
Tasks: 13 (limit: 4915)
CGroup: /system.slice/prometheus.service
└─2409 /usr/local/prometheus/prometheus --
Sep 06 11:38:03 prometheus-server prometheus[2409]: level=info ts=2020-09-06T03:38:03.282Z caller=head.go:624 component=tsdb msg="WAL segment loaded" segment=0 maxSegment=0
Sep 06 11:38:03 prometheus-server prometheus[2409]: level=info ts=2020-09-06T03:38:03.282Z caller=head.go:627 component=tsdb msg="WAL replay completed" duration=664.654µs
Sep 06 11:38:03 prometheus-server prometheus[2409]: level=info ts=2020-09-06T03:38:03.286Z caller=main.go:683 fs_type=EXT4_SUPER_MAGIC
Sep 06 11:38:03 prometheus-server prometheus[2409]: level=info ts=2020-09-06T03:38:03.287Z caller=main.go:684 msg="TSDB started"
Sep 06 11:38:03 prometheus-server prometheus[2409]: level=info ts=2020-09-06T03:38:03.287Z caller=main.go:788 msg="Loading configuration file" filename=prometheus.yml
Sep 06 11:38:04 prometheus-server prometheus[2409]: level=info ts=2020-09-06T03:38:04.066Z caller=main.go:816 msg="Completed loading of configuration file" filename=prometheus.yml
Sep 06 11:38:04 prometheus-server prometheus[2409]: level=info ts=2020-09-06T03:38:04.067Z caller=main.go:635 msg="Server is ready to receive web requests."
Sep 06 11:38:07 prometheus-server systemd[1]: /etc/systemd/system/prometheus.service:9: Unknown lvalue 'config.file' in section 'Service'
Sep 06 11:49:49 prometheus-server systemd[1]: /etc/systemd/system/prometheus.service:9: Unknown lvalue 'config.file' in section 'Service'
Sep 06 11:50:06 prometheus-server systemd[1]: /etc/systemd/system/prometheus.service:9: Unknown lvalue 'config.file' in section 'Service'
root@prometheus-server:/usr/local#
1.5 访问prometheus页面
2.node exporter安装
- 在每个 master、node、etcd 节点上都要安装 node exporter
tar xvf node_exporter-0.18.1.linux-amd64.tar.gz
ln -sv /usr/local/src/node_exporter-0.18.1.linux-amd64 /usr/local/node_exporter
root@node1:/usr/local/src# cd /usr/local/node_exporter
root@node1:/usr/local/node_exporter# ls
LICENSE node_exporter NOTICE
root@node1:/usr/local/node_exporter# ll
total 16508
drwxr-xr-x 2 3434 3434 4096 Jun 5 2019 ./
drwxr-xr-x 3 root root 4096 Sep 6 10:40 ../
-rw-r--r-- 1 3434 3434 11357 Jun 5 2019 LICENSE
-rwxr-xr-x 1 3434 3434 16878582 Jun 5 2019 node_exporter*
-rw-r--r-- 1 3434 3434 463 Jun 5 2019 NOTICE
root@node1:/usr/local/node_exporter#
2.1 创建node-exporter服务脚本
root@node2:/usr/local/node_exporter# cat /etc/systemd/system/node-exporter.service
[Unit]
Description=Prometheus Node Exporter
After=network.target
[Service]
ExecStart=/usr/local/node_exporter/node_exporter
[Install]
WantedBy=multi-user.target
root@node2:/usr/local/node_exporter#
2.2 启动node-exporter服务
systemctl daemon-reload
systemctl restart node-exporter
systemctl enable node-exporter
2.3 访问node-exporter页面
2.4 prometheus采集node指标数据
root@prometheus-server:/usr/local/prometheus# cat prometheus.yml
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).
# Alertmanager configuration
alerting:
  alertmanagers:
  - static_configs:
    - targets:
      # - alertmanager:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
    - targets: ['localhost:9090']
  - job_name: 'k8s-master'
    static_configs:
    - targets: ['172.16.62.201:9100','172.16.62.202:9100','172.16.62.203:9100']
  - job_name: 'k8s-node'
    static_configs:
    - targets: ['172.16.62.207:9100','172.16.62.208:9100','172.16.62.209:9100']
  - job_name: 'k8s-etcd'
    static_configs:
    - targets: ['172.16.62.210:9100','172.16.62.211:9100','172.16.62.212:9100']
2.5重启服务
systemctl restart prometheus
2.6 访问prometheus
2.7 prometheus验证node节点监控数据
3.配置grafana
- 链接
https://grafana.com/grafana/download
3.1 安装
sudo apt-get install -y adduser libfontconfig1
wget https://dl.grafana.com/oss/release/grafana_6.7.2_amd64.deb
sudo dpkg -i grafana_6.7.2_amd64.deb
apt --fix-broken install -y
3.2 配置文件grafana
root@prometheus-server:/etc/grafana# grep '^[a-z]' grafana.ini
protocol = http
http_addr =0.0.0.0
http_port = 3000
3.3 启动服务
systemctl start grafana-server
systemctl enable grafana-server
3.4 访问grafana
3.4.1 添加数据源选择prometheus
3.4.2保存
3.4.3 import 模板
- 找到dashboard
3.4.4 导入 模板8919
3.4.5 导入成功
3.4.6 安装插件
#列出插件
grafana-cli plugins list-remote
#安装插件grafana-piechart-panel 饼图
root@prometheus-server:/etc/grafana# grafana-cli plugins install grafana-piechart-panel
installing grafana-piechart-panel @ 1.6.0
from: https://grafana.com/api/plugins/grafana-piechart-panel/versions/1.6.0/download
into: /var/lib/grafana/plugins
✔ Installed grafana-piechart-panel successfully
Restart grafana after installing plugins . <service grafana-server restart>
#重启服务
root@prometheus-server:/etc/grafana# service grafana-server restart
root@prometheus-server:/etc/grafana#
4.监控pod资源
4.1 安装cadvisor
cadvisor镜像准备
docker load -i cadvisor_v0.36.0.tar.gz
docker tag gcr.io/google_containers/cadvisor:v0.36.0 harbor.haostack.com/prometheus/gcr.io/google_containers/cadvisor:v0.36.0
docker push harbor.haostack.com/prometheus/gcr.io/google_containers/cadvisor:v0.36.0
4.2 安装cadvisor
- 在每个node节点上都要安装cadvisor
root@prometheus-server:/data# docker run \
> --volume=/:/rootfs:ro \
> --volume=/var/run:/var/run:rw \
> --volume=/sys:/sys:ro \
> --volume=/var/lib/docker/:/var/lib/docker:ro \
> --volume=/dev/disk/:/dev/disk:ro \
> --publish=8080:8080 \
> --detach=true \
> --name=cadvisor \
> harbor.haostack.com/prometheus/gcr.io/google_containers/cadvisor:v0.36.0
1c3dba5e036df2c668a4018b5011c58ffbbe7de84b28953682c2559912eab3d0
4.3 访问cadvisor web页面
4.4 利用prometheus采集cadvisor数据
4.4.1 添加job_name
  - job_name: 'k8s-pods-cadvisor'
    static_configs:
    - targets: ['172.16.62.207:8080','172.16.62.208:8080','172.16.62.209:8080']
4.4.2 重启prometheus
systemctl restart prometheus
4.4.3 验证数据
4.5 grafana添加pod模板
4.5.1 8588
4.5.2 导入模板8588
4.5.3 导入模板395
4.5.4 导入模板893
5 Prometheus报警配置
5.1 alertmanager安装
tar -xvf alertmanager-0.20.0.linux-amd64.tar.gz -C /usr/local/src/
ln -sv /usr/local/src/alertmanager-0.20.0.linux-amd64 /usr/local/alertmanager
5.2 配置alertmanager
- https://prometheus.io/docs/alerting/configuration/ #官方配置文档
root@prometheus-server:/usr/local/alertmanager# cat alertmanager.yml
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.qq.com:465'
  smtp_from: '111111111@qq.com'
  smtp_auth_username: '11111111@qq.com'
  smtp_auth_password: '********'
  smtp_hello: 'qq.com'
  smtp_require_tls: false
route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'web.hook'
receivers:
- name: 'web.hook'
  #webhook_configs:
  #- url: 'http://127.0.0.1:5001/'
  email_configs:
  - to: '66666666@qq.com'
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
5.3启动alertmanager服务
5.3.1 二进制方式启动
./alertmanager --config.file=./alertmanager.yml
5.3.2 脚本方式启动
root@prometheus-server:/usr/local/alertmanager# cat /etc/systemd/system/alertmanager.service
[Unit]
Description=Prometheus Alertmanager
Documentation=https://prometheus.io/docs/introduction/overview/
After=network.target
[Service]
Restart=on-failure
ExecStart=/usr/local/alertmanager/alertmanager --config.file=/usr/local/alertmanager/alertmanager.yml
[Install]
WantedBy=multi-user.target
root@prometheus-server:/usr/local/alertmanager#
5.4 alertmanager 页面访问
5.5 创建报警规则文件
root@prometheus-server:/usr/local/prometheus# cat rule-uat-k8s.yml
groups:
- name: uat_pod.rules
  rules:
  - alert: Pod_all_cpu_usage
    expr: (sum by(name)(rate(container_cpu_usage_seconds_total{image!=""}[5m]))*100) > 10
    for: 5m
    labels:
      severity: critical
      service: pods
    annotations:
      description: 容器 {{ $labels.name }} CPU 资源利用率大于 75% , (current value is {{ $value }})
      summary: Dev CPU 负载告警
  - alert: Pod_all_memory_usage
    expr: sort_desc(avg by(name)(irate(container_memory_usage_bytes{name!=""}[5m]))*100) > 1024*10^3*2
    for: 10m
    labels:
      severity: critical
    annotations:
      description: 容器 {{ $labels.name }} Memory 资源利用率大于 2G , (current value is {{ $value }})
      summary: Dev Memory 负载告警
  - alert: Pod_all_network_receive_usage
    expr: sum by (name) (irate(container_network_receive_bytes_total{container_name="POD"}[1m])) > 1024*1024*50
    for: 10m
    labels:
      severity: critical
    annotations:
      description: 容器 {{ $labels.name }} network_receive 资源利用率大于 50M , (current value is {{ $value }})
root@prometheus-server:/usr/local/prometheus#
5.5.1 prometheus配置文件
# Alertmanager configuration
alerting:
  alertmanagers:
  - static_configs:
    - targets:
      - 172.16.62.213:9093 #指定alertmanager地址
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
  - "/usr/local/prometheus/rule-uat-k8s.yml" #指定rules路径
5.5.2 Alerts状态
5.5.3 prometheus web界面验证报警规则
5.5.4 验证报警邮件
6 prometheus监控haproxy
6.1 部署haproxy_exporter
tar -xvf haproxy_exporter-0.10.0.linux-amd64.tar.gz
ln -sv /usr/local/src/haproxy_exporter-0.10.0.linux-amd64 /usr/local/haproxy_exporter
./haproxy_exporter --haproxy.scrape-uri="http://haadmin:123456@172.16.62.204:9999/haproxy-status;csv"
6.2 脚本启动
root@ha1:/usr/local/haproxy_exporter# cat /etc/systemd/system/haproxy-exporter.service
[Unit]
Description=Prometheus haproxy Exporter
After=network.target
[Service]
ExecStart=/usr/local/haproxy_exporter/haproxy_exporter --haproxy.scrape-uri="http://haadmin:123456@172.16.62.204:9999/haproxy-status;csv"
[Install]
WantedBy=multi-user.target
#启动服务
systemctl start haproxy-exporter
systemctl status haproxy-exporter
6.3 验证haproxy_exporter web 数据
6.4 prometheus server端添加haproxy数据采集
  - job_name: 'haproxy-exporter'
    static_configs:
    - targets: ['172.16.62.204:9101']
6.4.1 重启服务
systemctl restart prometheus
6.5 查看数据
6.6 grafana导入haproxy插件模板
- 导入模板2428
- 导入模板367