1. One-click deployment with the open-source project
Mind the mapping between kube-prometheus releases and Kubernetes versions, and clone the release branch that matches your cluster.
git clone -b release-0.7 --single-branch https://github.com/prometheus-operator/kube-prometheus.git
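If you are not sure which release branches exist, a quick way to list them before picking one (a sketch, nothing cluster-specific assumed):
# List the available release branches of kube-prometheus
git ls-remote --heads https://github.com/prometheus-operator/kube-prometheus.git | grep release-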
Location of the main YAML files:
[root@master setup]# pwd
/root/pro/kube-prometheus/manifests/setup
kubectl apply -f .
Apply everything in the parent directory as well:
[root@master setup]# pwd
/root/pro/kube-prometheus/manifests
kubectl apply -f .
Once everything is up, expose a port or configure an Ingress for access; I have already exposed it here.
Check that all Pods are running normally; you can then access the UIs from a browser via the port. The project ships Grafana with various dashboards built in.
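If a plain NodePort is enough, a minimal sketch of exposing the UIs (grafana and prometheus-k8s are the default Service names kube-prometheus creates in the monitoring namespace):
# Switch the Services to NodePort, then read back the assigned ports
kubectl -n monitoring patch svc grafana -p '{"spec":{"type":"NodePort"}}'
kubectl -n monitoring patch svc prometheus-k8s -p '{"spec":{"type":"NodePort"}}'
kubectl -n monitoring get svc grafana prometheus-k8s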
2. Fixing monitoring of binary-deployed kube-controller-manager and kube-scheduler
[root@master setup]# k get servicemonitors.monitoring.coreos.com -n monitoring
NAME AGE
alertmanager 3h50m
coredns 3h50m
grafana 3h50m
kube-apiserver 3h50m
kube-controller-manager 3h50m
kube-scheduler 3h50m
kube-state-metrics 3h50m
kubelet 3h50m
node-exporter 3h50m
prometheus 3h50m
prometheus-adapter 3h50m
prometheus-operator 3h50m
View it as YAML:
k get servicemonitors.monitoring.coreos.com -n monitoring kube-controller-manager -o yaml
The ServiceMonitor looks for a Service via its label selector, but no Service carries that label, so nothing matches.
Two things need to be done (see the quick check right after this list):
1. Change the listen address to 0.0.0.0
2. Give the Service the label the ServiceMonitor expects (we create it below)
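A quick way to confirm which Service labels the ServiceMonitor selects on, and that nothing matches yet (a sketch; the k8s-app label value follows from the ServiceMonitor above):
# Print the label selector used for discovery, then look for a matching Service
kubectl -n monitoring get servicemonitor kube-controller-manager -o jsonpath='{.spec.selector.matchLabels}{"\n"}'
kubectl -n kube-system get svc -l k8s-app=kube-controller-manager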
[root@master bin]# ss -tlnp|egrep 'controller|schedule'
LISTEN 0 32768 172.26.144.84:10259 *:* users:(("kube-scheduler",pid=1417,fd=8))
LISTEN 0 32768 172.26.144.84:10257 *:* users:(("kube-controller",pid=1429,fd=8))
LISTEN 0 32768 [::]:10251 [::]:* users:(("kube-scheduler",pid=1417,fd=7))
LISTEN 0 32768 [::]:10252 [::]:* users:(("kube-controller",pid=1429,fd=7))
Note: the path to kube-controller-manager.service differs between binary installation methods, so locate yours first.
The monitoring stack here comes from the prometheus-operator/kube-prometheus project; a cluster installed with kubeasz keeps the unit file at /etc/systemd/system/kube-controller-manager.service.
4. Make this change on however many masters you have. If you followed my earlier design of 4 nodes with 2 of them doubling as masters, then adjust the systemd config on those 2 masters.
Change the bind IP on every master:
sed -ri 's+172.24.75.117+0.0.0.0+g' /etc/systemd/system/kube-controller-manager.service
sed -ri 's+172.24.75.117+0.0.0.0+g' /etc/systemd/system/kube-scheduler.service
systemctl daemon-reload
systemctl restart kube-controller-manager.service
systemctl restart kube-scheduler.service
After the change, confirm they all now listen on :: (all interfaces):
ss -tlnp|egrep 'controller|schedule'
5. After the modification, try the access again. OK, it responds now, but it is not fully working yet; see step 6.
curl 172.24.75.91:10251/metrics
curl 172.24.75.119:10252/metrics
6. Because these two core K8s components are deployed as binaries, Prometheus inside the cluster cannot discover them on its own, so we still need to create the corresponding Service and Endpoints objects to tie them together.
Note: replace the node IPs in the Endpoints below with your actual ones.
Save the YAML below as repair-prometheus.yaml and create it (run on node01):
vi repair-prometheus.yaml
kubectl apply -f repair-prometheus.yaml
Confirm after creating them:
kubectl -n kube-system get svc |egrep 'controller|scheduler'
apiVersion: v1
kind: Service
metadata:
  namespace: kube-system
  name: kube-controller-manager
  labels:
    k8s-app: kube-controller-manager
spec:
  type: ClusterIP
  clusterIP: None
  ports:
  - name: http-metrics
    port: 10252
    targetPort: 10252
    protocol: TCP
---
apiVersion: v1
kind: Endpoints
metadata:
  labels:
    k8s-app: kube-controller-manager
  name: kube-controller-manager
  namespace: kube-system
subsets:
- addresses:
  - ip: 172.24.75.119
  - ip: 172.24.75.118
  - ip: 172.24.75.117
  ports:
  - name: http-metrics
    port: 10252
    protocol: TCP
---
apiVersion: v1
kind: Service
metadata:
  namespace: kube-system
  name: kube-scheduler
  labels:
    k8s-app: kube-scheduler
spec:
  type: ClusterIP
  clusterIP: None
  ports:
  - name: http-metrics
    port: 10251
    targetPort: 10251
    protocol: TCP
---
apiVersion: v1
kind: Endpoints
metadata:
  labels:
    k8s-app: kube-scheduler
  name: kube-scheduler
  namespace: kube-system
subsets:
- addresses:
  - ip: 172.24.75.119
  - ip: 172.24.75.118
  - ip: 172.24.75.117
  ports:
  - name: http-metrics
    port: 10251
    protocol: TCP
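A quick sanity check after applying the manifest (a sketch; replace <master-ip> with one of your master IPs):
# The Endpoints should list the master IPs, and the insecure metrics ports should answer
kubectl -n kube-system get endpoints kube-controller-manager kube-scheduler
curl -s http://<master-ip>:10252/metrics | head
curl -s http://<master-ip>:10251/metrics | head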
7. Remember there is one more place to modify.
kubectl -n monitoring edit servicemonitors.monitoring.coreos.com kube-scheduler
# Change https to http in the two places below
port: https-metrics ## line 21
scheme: https
kubectl -n monitoring edit servicemonitors.monitoring.coreos.com kube-controller-manager
# Change https to http in the two places below
port: https-metrics ## line 58
scheme: https
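The same edit can be done non-interactively with a JSON patch (a sketch; the /0 index assumes a single entry under spec.endpoints):
kubectl -n monitoring patch servicemonitor kube-scheduler --type=json \
  -p='[{"op":"replace","path":"/spec/endpoints/0/port","value":"http-metrics"},{"op":"replace","path":"/spec/endpoints/0/scheme","value":"http"}]'
kubectl -n monitoring patch servicemonitor kube-controller-manager --type=json \
  -p='[{"op":"replace","path":"/spec/endpoints/0/port","value":"http-metrics"},{"op":"replace","path":"/spec/endpoints/0/scheme","value":"http"}]'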
8. Then go back to the Prometheus UI and wait a few minutes; the targets show up as discovered. Fully resolved.
kubectl get svc -n monitoring
http://101.133.227.28:30838/targets
Screenshot for step 7: on success all targets come up.
3. Monitoring etcd
etcd certificate location used in the video version
kubeasz certificate location
## After the kubeasz layout change it lives here:
ll /etc/kubeasz/clusters/testbobo/ssl/etcd.pem
etcd has no Service yet, so we need to create a Service plus matching Endpoints.
1. Check that etcd exposes metrics by default (run on node1; remember to change the cluster name and IP):
curl --cacert /etc/kubernetes/ssl/ca.pem --cert /etc/kubeasz/clusters/testbobo/ssl/etcd.pem --key /etc/kubeasz/clusters/testbobo/ssl/etcd-key.pem https://172.24.75.117:2379/metrics
2. Configure things so Prometheus can discover and monitor etcd.
2.1 First create a Secret from the etcd certificates (remember to use your own cluster name):
kubectl -n monitoring create secret generic etcd-certs --from-file=/etc/kubeasz/clusters/test/ssl/etcd.pem --from-file=/etc/kubeasz/clusters/test/ssl/etcd-key.pem --from-file=/etc/kubernetes/ssl/ca.pem
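A quick check that all three files made it into the Secret (a sketch):
kubectl -n monitoring describe secret etcd-certs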
2.2 Next, reference this Secret in the Prometheus resource:
kubectl -n monitoring edit prometheus k8s
spec:
  ## add the two lines below at the very end, after version: v2.22.1
  ...
  secrets:
  - etcd-certs
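The same change can be made without an interactive edit, using a merge patch (a sketch; it simply sets spec.secrets on the Prometheus object):
kubectl -n monitoring patch prometheus k8s --type merge -p '{"spec":{"secrets":["etcd-certs"]}}'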
2.3 After saving and exiting, Prometheus automatically restarts its Pods to load the Secret. After a short wait, exec into the Pod and check that the etcd certificates are there:
kubectl -n monitoring exec -it prometheus-k8s-0 -c prometheus -- sh
### The config change triggers an automatic restart, so you may need to exec in a few times; this is expected.
/prometheus $ ls /etc/prometheus/secrets/etcd-certs/
ca.pem etcd-key.pem etcd.pem
3. Next, prepare the YAML for the Service, Endpoints and ServiceMonitor.
Note: replace the node IPs below with the internal IPs of the nodes actually running etcd.
vi prometheus-etcd.yaml
kubectl apply -f prometheus-etcd.yaml
4. After a short wait, you can see the etcd cluster being monitored in the Prometheus UI.
apiVersion: v1
kind: Service
metadata:
  name: etcd-k8s
  namespace: monitoring
  labels:
    k8s-app: etcd
spec:
  type: ClusterIP
  clusterIP: None
  ports:
  - name: api
    port: 2379
    protocol: TCP
---
apiVersion: v1
kind: Endpoints
metadata:
  name: etcd-k8s
  namespace: monitoring
  labels:
    k8s-app: etcd
subsets:
- addresses:
  - ip: 172.24.75.119
  - ip: 172.24.75.118
  - ip: 172.24.75.117
  ports:
  - name: api
    port: 2379
    protocol: TCP
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: etcd-k8s
  namespace: monitoring
  labels:
    k8s-app: etcd-k8s
spec:
  jobLabel: k8s-app
  endpoints:
  - port: api
    interval: 30s
    scheme: https
    tlsConfig:
      caFile: /etc/prometheus/secrets/etcd-certs/ca.pem
      certFile: /etc/prometheus/secrets/etcd-certs/etcd.pem
      keyFile: /etc/prometheus/secrets/etcd-certs/etcd-key.pem
      # use insecureSkipVerify only if you cannot use Subject Alternative Name
      insecureSkipVerify: true
  selector:
    matchLabels:
      k8s-app: etcd
  namespaceSelector:
    matchNames:
    - monitoring
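Once the ServiceMonitor has been picked up, the new job should appear in the targets API (a sketch; port-forward is just a convenient way in, and the job name comes from the k8s-app label):
kubectl -n monitoring port-forward svc/prometheus-k8s 9090:9090 &
curl -s 'http://127.0.0.1:9090/api/v1/targets' | grep -o '"job":"[^"]*"' | sort -u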
Monitoring summary: if a component already exposes a metrics endpoint and already has a Service in the cluster, you only need to create a ServiceMonitor (model it on the examples above). If it has no Service or Endpoints, such as etcd, create the Service, Endpoints and ServiceMonitor and point them at the metrics endpoint. Components with no /metrics endpoint at all need an exporter, covered in the next section.
4. Monitoring targets without /metrics: kafka-exporter
There are plenty of exporters online; here Kafka serves as the example.
https://github.com/danielqsj/kafka_exporter
[root@master ~]# k create deployment test-kafka --image=danielqsj/kafka-exporter -n monitoring
deployment.apps/test-kafka created
The container dies as soon as it starts because there is no long-running command; adding one line fixes that. Example:
apiVersion: v1
kind: Pod
metadata:
  name: busybox
  namespace: default
  labels:
    app: busybox
spec:
  containers:
  - name: busybox
    image: busybox
    command: ["/bin/sh","-ce","sleep 3600"]
Access its metrics endpoint; normally you should get a response:
curl 10.68.79.192:9308/metrics
Set up the ServiceMonitor by adapting the template from here (a sketch of the matching Service follows the YAML below):
/root/pro/kube-prometheus/manifests/prometheus-serviceMonitor.yaml
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: kafka-exporter
  namespace: monitoring
  labels:
    k8s-app: kafka-exporter
spec:
  jobLabel: kafka-exporter
  endpoints:
  - port: 9308   ### if this does not work, use the port name instead
    interval: 30s
  selector:
    matchLabels:
      k8s-app: kafka-exporter
  namespaceSelector:
    matchNames:
    - monitoring
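The selector above expects a Service in the monitoring namespace labelled k8s-app=kafka-exporter. A minimal sketch of wiring one up from the test deployment created earlier (the Service name and label here are my assumptions, adjust as needed):
kubectl -n monitoring expose deployment test-kafka --name=kafka-exporter --port=9308 --target-port=9308
kubectl -n monitoring label svc kafka-exporter k8s-app=kafka-exporter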
If no data shows up, create a topic and test.
5. redis-exporter
6. Black-box monitoring
GitHub - prometheus/blackbox_exporter: Blackbox prober exporter
Create a ConfigMap; the config file is the one linked below:
blackbox_exporter/blackbox-good.yml at master · prometheus/blackbox_exporter · GitHub
[root@master blackbox]# kubectl create cm good-cm --from-file=good.yaml -n monitoring
configmap/good-cm created
docker pull prom/blackbox-exporter
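A minimal sketch of running the exporter in the cluster with that ConfigMap mounted as its config file (the Deployment name, labels and mount path below are assumptions, not from the original notes):
cat <<'EOF' | kubectl -n monitoring apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
  name: blackbox-exporter
  labels:
    k8s-app: blackbox-exporter
spec:
  replicas: 1
  selector:
    matchLabels:
      k8s-app: blackbox-exporter
  template:
    metadata:
      labels:
        k8s-app: blackbox-exporter
    spec:
      containers:
      - name: blackbox-exporter
        image: prom/blackbox-exporter
        # point the exporter at the file projected from the good-cm ConfigMap
        args: ["--config.file=/etc/blackbox/good.yaml"]
        ports:
        - containerPort: 9115
        volumeMounts:
        - name: config
          mountPath: /etc/blackbox
      volumes:
      - name: config
        configMap:
          name: good-cm
EOF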
7. Alerting: configuring email alerts
The alert rules are defined in:
/root/kube-prometheus/manifests/prometheus-rules.yaml
k get secrets -n monitoring
alertmanager-main is the Secret holding the Alertmanager configuration; open it and take a look.
The corresponding file is /root/kube-prometheus/manifests/alertmanager-secret.yaml
The edited template (email alerting):
[root@master manifests]# vim alertmanager-secret.yaml
apiVersion: v1
kind: Secret
metadata:
  labels:
    alertmanager: main
    app.kubernetes.io/component: alert-router
    app.kubernetes.io/name: alertmanager
    app.kubernetes.io/part-of: kube-prometheus
    app.kubernetes.io/version: 0.21.0
  name: alertmanager-main
  namespace: monitoring
stringData:
  alertmanager.yaml: |-
    "global":
      "resolve_timeout": "5m"
      smtp_from: "15927561940@163.com"
      smtp_smarthost: "smtp.163.com:465"
      smtp_hello: "15927561940"
      smtp_auth_username: "15927561940@163.com"
      smtp_auth_password: "CVGNQUFFWJPBOEIC"
      smtp_require_tls: false
    "inhibit_rules":
    - "equal":
      - "namespace"
      - "alertname"
      "source_match":
        "severity": "critical"
      "target_match_re":
        "severity": "warning|info"
    - "equal":
      - "namespace"
      - "alertname"
      "source_match":
        "severity": "warning"
      "target_match_re":
        "severity": "info"
    "receivers":
    - "name": "Default"
      "email_configs":
      - to: "15927561940@163.com"
        send_resolved: true
    - "name": "Watchdog"
      "email_configs":
      - to: "15927561940@163.com"
        send_resolved: true
    - "name": "Critical"
      "email_configs":
      - to: "15927561940@163.com"
        send_resolved: true
    "route":
      "group_by":
      - "namespace"
      "group_interval": "5m"
      "group_wait": "30s"
      "receiver": "Default"
      "repeat_interval": "12h"
      "routes":
      - "match":
          "alertname": "Watchdog"
        "receiver": "Watchdog"
      - "match":
          "severity": "critical"
        "receiver": "Critical"
type: Opaque
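Apply the Secret and confirm the stored config now carries the SMTP settings (a sketch; alertmanager.yaml is the secret key defined above):
kubectl apply -f alertmanager-secret.yaml
kubectl -n monitoring get secret alertmanager-main -o jsonpath='{.data.alertmanager\.yaml}' | base64 -d | head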
Once the configuration is in place, open Alertmanager and inspect the rendered config:
global:
  resolve_timeout: 5m
  http_config: {}
  smtp_from: 15927561940@163.com
  smtp_hello: "15927561940"
  smtp_smarthost: smtp.163.com:465
  smtp_auth_username: 15927561940@163.com
  smtp_auth_password: <secret>
  pagerduty_url: https://events.pagerduty.com/v2/enqueue
  opsgenie_api_url: https://api.opsgenie.com/
  wechat_api_url: https://qyapi.weixin.qq.com/cgi-bin/
  victorops_api_url: https://alert.victorops.com/integrations/generic/20131114/alert/
route:
  receiver: Default
  group_by:
  - namespace
  routes:
  - receiver: Watchdog
    match:
      alertname: Watchdog
  - receiver: Critical
    match:
      severity: critical
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 12h
inhibit_rules:
- source_match:
    severity: critical
  target_match_re:
    severity: warning|info
  equal:
  - namespace
  - alertname
- source_match:
    severity: warning
  target_match_re:
    severity: info
  equal:
  - namespace
  - alertname
receivers:
- name: Default
  email_configs:
  - send_resolved: true
    to: 15927561940@163.com
    from: 15927561940@163.com
    hello: "15927561940"
    smarthost: smtp.163.com:465
    auth_username: 15927561940@163.com
    auth_password: <secret>
    headers:
      From: 15927561940@163.com
      Subject: '{{ template "email.default.subject" . }}'
      To: 15927561940@163.com
    html: '{{ template "email.default.html" . }}'
    require_tls: false
- name: Watchdog
  email_configs:
  - send_resolved: true
    to: 15927561940@163.com
    from: 15927561940@163.com
    hello: "15927561940"
    smarthost: smtp.163.com:465
    auth_username: 15927561940@163.com
    auth_password: <secret>
    headers:
      From: 15927561940@163.com
      Subject: '{{ template "email.default.subject" . }}'
      To: 15927561940@163.com
    html: '{{ template "email.default.html" . }}'
    require_tls: false
- name: Critical
  email_configs:
  - send_resolved: true
    to: 15927561940@163.com
    from: 15927561940@163.com
    hello: "15927561940"
    smarthost: smtp.163.com:465
    auth_username: 15927561940@163.com
    auth_password: <secret>
    headers:
      From: 15927561940@163.com
      Subject: '{{ template "email.default.subject" . }}'
      To: 15927561940@163.com
    html: '{{ template "email.default.html" . }}'
    require_tls: false
templates: []
At this point the alert emails are arriving.
8. Alerting: configuring WeChat (WeCom) alerts
vim alertmanager-secret.yaml
apiVersion: v1
kind: Secret
metadata:
  labels:
    alertmanager: main
    app.kubernetes.io/component: alert-router
    app.kubernetes.io/name: alertmanager
    app.kubernetes.io/part-of: kube-prometheus
    app.kubernetes.io/version: 0.21.0
  name: alertmanager-main
  namespace: monitoring
stringData:
  alertmanager.yaml: |-
    "global":
      "resolve_timeout": "5m"
      smtp_from: "15927561940@163.com"
      smtp_smarthost: "smtp.163.com:465"
      smtp_hello: "15927561940"
      smtp_auth_username: "15927561940@163.com"
      smtp_auth_password: "CVGNQUFFWJPBOEIC"
      smtp_require_tls: false
      wechat_api_url: "https://qyapi.weixin.qq.com/cgi-bin/"
      wechat_api_secret: JVD7OCJ7B-RYvXhySpUp1FtYcyDYgH-ge12dSM2cV0k
      wechat_api_corp_id: ww08b01c48525f792b
    "inhibit_rules":
    - "equal":
      - "namespace"
      - "alertname"
      "source_match":
        "severity": "critical"
      "target_match_re":
        "severity": "warning|info"
    - "equal":
      - "namespace"
      - "alertname"
      "source_match":
        "severity": "warning"
      "target_match_re":
        "severity": "info"
    "receivers":
    - "name": "Default"
      "email_configs":
      - to: "15927561940@163.com"
        send_resolved: true
    - "name": "Watchdog"
      "email_configs":
      - to: "15927561940@163.com"
        send_resolved: true
    - "name": "Critical"
      "email_configs":
      - to: "15927561940@163.com"
        send_resolved: true
    - "name": "wechat"
      "wechat_configs":
      - send_resolved: true
        agent_id: 1000002
        to_tag: "波波企业"
    "route":
      "group_by":
      - "namespace"
      "group_interval": "5m"
      "group_wait": "30s"
      "receiver": "Default"
      "repeat_interval": "12h"
      "routes":
      - "match":
          "alertname": "Watchdog"
        "receiver": "wechat"
      - "match":
          "severity": "critical"
        "receiver": "Critical"
      - "match":
          "severity": "critical"
        "receiver": "wechat"
type: Opaque
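To exercise the routing end to end you can fire a hand-made alert at Alertmanager (a sketch; amtool ships inside the alertmanager container, and the label values here are arbitrary test data):
kubectl -n monitoring exec alertmanager-main-0 -c alertmanager -- \
  amtool alert add testalert severity=critical --annotation=summary=test --alertmanager.url=http://127.0.0.1:9093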
9. Custom alert templates
Look for examples online.
10. Instrumenting a Java project for monitoring
GitHub - mweirauch/micrometer-jvm-extras: A set of additional JVM process metrics for micrometer.io.
Sample Java project:
GitHub - gongchangwangpi/spring-cloud-demo2: spring cloud 2.x
docker run -it -v /opt/m2:/root/.m2 -v `pwd`:/opt/ -p 18761:8761 maven:3.5.3 bash
Enter the container and confirm the mounts succeeded.
cd spring-cloud-eureka/
Inside the container, go into the eureka directory and edit the pom file.
Insert a new line after line 22 and add:
<!-- Micrometer Prometheus registry -->
<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-actuator</artifactId>
</dependency>
<dependency>
    <groupId>io.micrometer</groupId>
    <artifactId>micrometer-core</artifactId>
</dependency>
<dependency>
    <groupId>io.micrometer</groupId>
    <artifactId>micrometer-registry-prometheus</artifactId>
</dependency>
<!-- finished -->
Open src/main/resources/application.yml:
[root@master spring-cloud-eureka]# pwd
/root/spring-cloud-demo2/spring-cloud-eureka
[root@master spring-cloud-eureka]# vim src/main/resources/application.yml
Expose the management endpoints:
spring:
  application:
    name: cloud-eureka
management:
  endpoints:
    web:
      exposure:
        include: '*'
    shutdown:
      enable: false
  metrics:
    tags:
      application: "${spring.application.name}"
Build it:
mvn clean package -DskipTests
java -jar /absolute-path/SNAPSHOT.jar
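Once the service is up, the Prometheus-format metrics are served by the actuator (a sketch; 18761 is the host port mapped in the docker run above, and the path assumes the default actuator base path):
curl -s http://127.0.0.1:18761/actuator/prometheus | head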