# I. Integrating jmx_exporter into Our Application
1. Download jmx_exporter into the Tomcat bin directory
Download the jmx_exporter agent jar and place it in Tomcat's bin directory:
[root@Jenkins-S ~/Dockerfile/centos-tomcat]# ls -l
total 197408
-rw-r--r-- 1 root root 654 May 17 11:16 Dockerfile
drwxr-xr-x 7 10 143 245 Dec 2 21:14 java
-rw-r--r-- 1 root root 404 May 17 10:35 start-tomcat.sh
drwxr-xr-x 9 root root 220 Dec 3 08:39 tomcat
[root@Jenkins-S ~/Dockerfile/centos-tomcat]# ls -l tomcat/bin/
-rw-r--r-- 1 root root 1211 May 17 10:33 jmx-exporter.yaml
-rw-r--r-- 1 root root 413862 May 17 10:31 jmx_prometheus_javaagent-0.16.0.jar
-rw-r--r-- 1 root root 127 May 17 11:17 setenv.sh
2. Create the jmx-exporter.yaml file in the Tomcat bin directory
2.1 Option 1:
[root@Jenkins-S ~/Dockerfile/centos-tomcat]# cat tomcat/bin/jmx-exporter.yaml
---
rules:
- pattern: '.*'
This catch-all configuration is fine for getting started quickly, but I would not recommend it for production: it makes Prometheus collect far too many metrics, which costs storage and network bandwidth, especially once many hosts are being scraped. The official example configuration is preferable; its rules whitelist the OperatingSystem MBean and rewrite its attributes into metrics such as os_free_physical_memory_bytes and os_system_cpu_load:
2.2 Option 2:
[root@Jenkins-S ~/Dockerfile/centos-tomcat]# cat tomcat/bin/jmx-exporter.yaml
# Official example: https://github.com/prometheus/jmx_exporter/blob/master/example_configs/tomcat.yml
---
lowercaseOutputLabelNames: true
lowercaseOutputName: true
whitelistObjectNames: ["java.lang:type=OperatingSystem"]
blacklistObjectNames: []
rules:
- pattern: 'java.lang<type=OperatingSystem><>(committed_virtual_memory|free_physical_memory|free_swap_space|total_physical_memory|total_swap_space)_size:'
  name: os_$1_bytes
  type: GAUGE
  attrNameSnakeCase: true
- pattern: 'java.lang<type=OperatingSystem><>((?!process_cpu_time)\w+):'
  name: os_$1
  type: GAUGE
  attrNameSnakeCase: true
3. Configure Tomcat
[root@Jenkins-S ~/Dockerfile/centos-tomcat]# cat ./tomcat/bin/setenv.sh
JAVA_OPTS=" -javaagent:/usr/local/tomcat/bin/jmx_prometheus_javaagent-0.16.0.jar=9901:/usr/local/tomcat/bin/jmx-exporter.yaml"
[root@Jenkins-S ~/Dockerfile/centos-tomcat]# chmod a+x ./tomcat/bin/setenv.sh
Note: 9901 is the port on which the agent exposes the JVM metrics.
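The agent flag has the general form `-javaagent:<jar>=<port>:<config.yaml>`: the agent starts a small HTTP server on that port and serves the metrics under /metrics. A quick sanity check once Tomcat is running (a sketch; localhost assumes you are on the same host as the JVM):

```bash
# jmx_exporter javaagent flag, general form:
#   -javaagent:/path/to/jmx_prometheus_javaagent-<version>.jar=<port>:<config.yaml>
# Once Tomcat is up, the agent answers on the configured port:
curl -s http://localhost:9901/metrics | head
```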
4. Update the Dockerfile
[root@Jenkins-S ~/Dockerfile/centos-tomcat]# cat Dockerfile
FROM centos:7.7.1908
COPY ./start-tomcat.sh /usr/local/start-tomcat.sh
COPY ./java /usr/local/java
COPY ./tomcat /usr/local/tomcat
ENV LANG=en_US.UTF-8
RUN rm -rf /etc/yum.repos.d/* && \
    curl -o /etc/yum.repos.d/CentOS-Base.repo https://mirrors.aliyun.com/repo/Centos-7.repo && \
    sed -i -e '/mirrors.cloud.aliyuncs.com/d' -e '/mirrors.aliyuncs.com/d' /etc/yum.repos.d/CentOS-Base.repo && \
    yum clean all && yum makecache && \
    yum install telnet net-tools vim less wget tree traceroute -y && \
    chmod a+x /usr/local/start-tomcat.sh
ENV JAVA_HOME=/usr/local/java
WORKDIR /usr/local/tomcat
EXPOSE 8080 9901
CMD ["/usr/local/start-tomcat.sh"]
[root@Jenkins-S ~/Dockerfile/centos-tomcat]# cat start-tomcat.sh
#!/bin/bash
# Alternative: run Tomcat in the foreground directly
# /usr/local/tomcat/bin/catalina.sh run >> /usr/local/tomcat/logs/catalina.out 2>&1
/usr/local/tomcat/bin/startup.sh
# Keep PID 1 alive so the container does not exit after startup.sh returns
tail -f /usr/local/tomcat/logs/catalina.out > /dev/null 2>&1
5. Rebuild the Tomcat image
[root@Jenkins-S ~/Dockerfile/centos-tomcat]# docker build -t reg.xxx.net/xxx/tomcat8.5.51:centos7.7-jmx .
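Before pushing the image, a quick smoke test is worthwhile (a sketch; the container name and published ports are illustrative):

```bash
docker run -d --name tomcat-jmx-test -p 8080:8080 -p 9901:9901 \
    reg.xxx.net/xxx/tomcat8.5.51:centos7.7-jmx
curl -s http://localhost:9901/metrics | grep '^os_' | head   # exporter metrics
curl -sI http://localhost:8080/                              # Tomcat itself
docker rm -f tomcat-jmx-test                                 # clean up
```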
# II. Configuring Prometheus Service Discovery
1. Auto-discovery for pods behind a Service
For services that are exposed through a Service, we can use the ServiceMonitor CRD defined by the prometheus-operator project to configure discovery. A template follows:
--- # ServiceMonitor auto-discovery rule
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor # CRD defined by prometheus-operator
metadata:
  name: jmx-metrics
  namespace: monitoring
  labels:
    k8s-apps: jmx-metrics
spec:
  jobLabel: metrics # use the value of the "metrics" label as the job label, i.e. the series get job=jmx-metrics
  selector:
    matchLabels:
      metrics: jmx-metrics # discover Services labeled metrics: jmx-metrics
  namespaceSelector:
    matchNames: # namespaces to discover in; multiple entries are allowed
    - lzulms
  endpoints:
  - port: http-metrics # port to scrape; this is the Service port *name*, i.e. spec.ports.name in the Service YAML
    interval: 15s # scrape interval
--- # Service template
apiVersion: v1
kind: Service
metadata:
  labels:
    metrics: jmx-metrics # the label the ServiceMonitor discovers by
  name: jmx-metrics
  namespace: lzulms
spec:
  ports:
  - name: http-metrics # matches spec.endpoints.port in the ServiceMonitor
    port: 9093 # the port jmx-exporter listens on
    targetPort: http-metrics # the named container port in the Pod YAML
  selector:
    metrics: jmx-metrics # the Service's own label selector
This configures auto-discovery for the jmx-metrics Service in the lzulms namespace. Prometheus adds every pod backing this Service to monitoring and keeps the pod list up to date via the apiserver, so new replicas are picked up automatically when the service scales out.
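Assuming the two manifests above are saved as servicemonitor.yaml (a hypothetical file name), applying and verifying them looks like this:

```bash
kubectl apply -f servicemonitor.yaml
kubectl get servicemonitor -n monitoring jmx-metrics   # the discovery rule
kubectl get endpoints -n lzulms jmx-metrics            # pods backing the Service
```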
2. Auto-discovery for pods without a Service
For workloads that have no Service, for example instances exposed outside the cluster via HostPort, we can use a PodMonitor for discovery instead. A sample follows, with a verification sketch after the manifests:
--- # PodMonitor auto-discovery rule; supported in recent releases, may be missing in older ones
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor # CRD defined by prometheus-operator
metadata:
  name: jmx-metrics
  namespace: monitoring
  labels:
    k8s-apps: jmx-metrics
spec:
  jobLabel: metrics # use the value of the "metrics" label as the job label, i.e. the series get job=jmx-metrics
  selector:
    matchLabels:
      metrics: jmx-metrics # discover pods labeled metrics: jmx-metrics
  namespaceSelector:
    matchNames: # namespaces to discover in; multiple entries are allowed
    - lzulms
  podMetricsEndpoints:
  - port: http-metrics # the *name* of the metrics port in the Pod YAML, i.e. spec.containers.ports.name
    interval: 15s # scrape interval
--- # Pod template to be monitored
apiVersion: v1
kind: Pod
metadata:
  labels:
    metrics: jmx-metrics
  name: jmx-metrics
  namespace: lzulms
spec:
  containers:
  - image: tomcat:9.0
    name: tomcat
    ports:
    - containerPort: 9093
      name: http-metrics
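A quick way to verify the PodMonitor (a sketch; assumes the manifests above were saved as podmonitor.yaml, and note the sample pod is only a template — a real pod needs the agent from part I attached before it serves metrics):

```bash
kubectl apply -f podmonitor.yaml
kubectl get podmonitor -n monitoring jmx-metrics
# Scrape the pod directly through its metrics port:
kubectl -n lzulms port-forward pod/jmx-metrics 9093:9093 &
curl -s http://localhost:9093/metrics | head
kill %1   # stop the port-forward
```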
3. Monitoring services outside the Kubernetes cluster
3.1 Add a ServiceMonitor for the external Tomcat
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: jmx-metrics-lms-saas-venus
  namespace: monitoring
  labels:
    k8s-apps: lms-saas-venus
spec:
  jobLabel: metrics
  selector:
    matchLabels:
      metrics: lms-saas-venus
  namespaceSelector:
    matchNames:
    - lzulms # the namespace the Service/Endpoints below are created in
  endpoints:
  - port: lms-saas-venus
    interval: 15s
3.2 In kube-prometheus, add a Service and Endpoints for Tomcat to map the external service into the cluster
cat > tomcatService-venus.yaml << 'EOF'
apiVersion: v1
kind: Endpoints
metadata:
  name: lms-saas-venus-jmx
  namespace: lzulms
  labels:
    metrics: lms-saas-venus
subsets:
- addresses:
  - ip: 10.52.35.181
  ports:
  - name: lms-saas-venus
    port: 9901
    protocol: TCP
---
apiVersion: v1
kind: Service
metadata:
  name: lms-saas-venus-jmx
  namespace: lzulms
  labels:
    metrics: lms-saas-venus
spec:
  type: ClusterIP
  clusterIP: None
  ports:
  - name: lms-saas-venus
    port: 9901
    protocol: TCP
EOF
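Apply the file and confirm that the manually created Endpoints object is attached to the headless Service:

```bash
kubectl apply -f tomcatService-venus.yaml
kubectl get endpoints -n lzulms lms-saas-venus-jmx   # should list 10.52.35.181:9901
```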
4. Grant the Prometheus serviceAccount permissions in the target namespace
# Create a Role in the target namespace
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: prometheus-k8s
  namespace: lzulms
rules:
- apiGroups:
  - ""
  resources:
  - services
  - endpoints
  - pods
  verbs:
  - get
  - list
  - watch
---
# Bind the prometheus-k8s serviceAccount to the Role
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: prometheus-k8s
  namespace: lzulms
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: prometheus-k8s
subjects:
- kind: ServiceAccount
  name: prometheus-k8s # the serviceAccount the Prometheus pods run as; kube-prometheus uses prometheus-k8s by default
  namespace: monitoring
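kubectl auth can-i can impersonate the serviceAccount, which makes it easy to confirm the binding works:

```bash
kubectl auth can-i list pods -n lzulms \
    --as=system:serviceaccount:monitoring:prometheus-k8s    # expect: yes
kubectl auth can-i watch endpoints -n lzulms \
    --as=system:serviceaccount:monitoring:prometheus-k8s    # expect: yes
```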
# III. Deploying the Application Pod
cat <<EOF > tomcat.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    k8s.kuboard.cn/name: tomcat
  name: tomcat
  namespace: lzulms
spec:
  progressDeadlineSeconds: 600
  replicas: 1
  revisionHistoryLimit: 10
  selector:
    matchLabels:
      k8s.kuboard.cn/name: tomcat
  strategy:
    rollingUpdate:
      maxSurge: 25%
      maxUnavailable: 25%
    type: RollingUpdate
  template:
    metadata:
      labels:
        k8s.kuboard.cn/name: tomcat
    spec:
      containers:
      - image: 'reg.xxx.net/xxx/tomcat8.5.51:centos7.7-jmx' # use the image rebuilt above
        imagePullPolicy: Always
        name: tomcat
        ports:
        - containerPort: 8080
          protocol: TCP
        - containerPort: 9901
          protocol: TCP
        volumeMounts:
        - mountPath: /usr/local/tomcat/logs
          name: volume-j7ewz
      dnsPolicy: ClusterFirst
      imagePullSecrets:
      - name: harbor
      restartPolicy: Always
      schedulerName: default-scheduler
      volumes:
      - hostPath:
          path: /nfs/tomcat
        name: volume-j7ewz
---
apiVersion: v1
kind: Service
metadata:
  labels:
    metrics: jmx-metrics
  name: tomcat
  namespace: lzulms
spec:
  ports:
  - name: http-metrics
    port: 9901
    protocol: TCP
    targetPort: 9901
  - name: snakld
    port: 8080
    protocol: TCP
    targetPort: 8080
  selector:
    k8s.kuboard.cn/name: tomcat
  sessionAffinity: None
  type: NodePort
EOF
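Apply the manifest and wait for the pod to become Ready:

```bash
kubectl apply -f tomcat.yaml
kubectl -n lzulms get pods -l k8s.kuboard.cn/name=tomcat -w
```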
kubectl get svc -n lzulms # then hit 10.52.16.21:32563 and check whether jmx metrics are returned
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
tomcat NodePort 10.107.235.181 <none> 9901:32563/TCP,8080:31443/TCP 71m
If metrics come back as described above, the jmx_exporter side is working.
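The same check from the command line, using the node IP and NodePort from the output above:

```bash
curl -s http://10.52.16.21:32563/metrics | grep -E '^(os_|jvm_)' | head
```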
# IV. Checking Service Discovery in the Prometheus UI
In the Prometheus web UI, open Status -> Service Discovery and Status -> Targets, and confirm that the jmx-metrics jobs appear and their targets are UP.
# V. Configuring the Grafana Dashboard
The Grafana dashboard ID is 8563; import it into Grafana and the Tomcat monitoring setup is complete.
# VI. Adding Alerting Rules
vim kube-prometheus/manifests/prometheus-rules.yaml
# append the following at the end of the file
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    prometheus: k8s
    role: alert-rules
  name: jvm-metrics-rules
  namespace: monitoring
spec:
  groups:
  - name: jvm-metrics-rules
    rules:
    # GC took more than 10% of the time over the last 5 minutes (30s out of 300s)
    - alert: GcTimeTooMuch
      expr: increase(jvm_gc_collection_seconds_sum[5m]) > 30
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: "{{ $labels.app }} spent more than 10% of time in GC"
        message: "ns:{{ $labels.namespace }} pod:{{ $labels.pod }} ip:{{ $labels.instance }} GC time ratio above 10%, current value ({{ $value }}%)"
    # Too many GC runs
    - alert: GcCountTooMuch
      expr: increase(jvm_gc_collection_seconds_count[1m]) > 30
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: "{{ $labels.app }} more than 30 GC runs in 1 minute"
        message: "ns:{{ $labels.namespace }} pod:{{ $labels.pod }} ip:{{ $labels.instance }} more than 30 GC runs in 1 minute, current value ({{ $value }})"
    # Too many full GCs
    - alert: FgcCountTooMuch
      expr: increase(jvm_gc_collection_seconds_count{gc="ConcurrentMarkSweep"}[1h]) > 3
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: "{{ $labels.app }} more than 3 full GCs in 1 hour"
        message: "ns:{{ $labels.namespace }} pod:{{ $labels.pod }} ip:{{ $labels.instance }} more than 3 full GCs in 1 hour, current value ({{ $value }})"
    # Non-heap memory usage above 95%
    - alert: NonheapUsageTooMuch
      expr: jvm_memory_bytes_used{area="nonheap"} / jvm_memory_bytes_max * 100 > 95
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: "{{ $labels.app }} non-heap usage above 95%"
        message: "ns:{{ $labels.namespace }} pod:{{ $labels.pod }} ip:{{ $labels.instance }} non-heap usage above 95%, current value ({{ $value }}%)"
    # Heap memory usage above 95%
    - alert: HeapUsageTooMuch
      expr: jvm_memory_bytes_used{area="heap"} / jvm_memory_bytes_max * 100 > 95
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: "{{ $labels.app }} heap usage above 95%"
        message: "ns:{{ $labels.namespace }} pod:{{ $labels.pod }} ip:{{ $labels.instance }} heap usage above 95%, current value ({{ $value }}%)"
kubectl apply -f kube-prometheus/manifests/prometheus-rules.yaml
Then open the Prometheus UI in a browser and verify that the alerting rules were added successfully.
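The rule object can also be checked from the command line before visiting the UI (resource name as defined above):

```bash
kubectl get prometheusrule -n monitoring jvm-metrics-rules
# The operator picks the object up and reloads Prometheus; the new group then
# appears under Status -> Rules and on the Alerts page in the web UI.
```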