场景:
近期生产nodejs服务是用pm2去管理的,那么该如何使用prometheus进行监控告警呢?说干就干
1、在 GitHub 上发现一个开源的 exporter:saikatharryc/pm2-prometheus-exporter(pm2 prometheus exporter),地址:https://github.com/saikatharryc/pm2-prometheus-exporter
废话不多说直接试试看吧。
# Install the pm2-metrics module through pm2
pm2 install pm2-metrics
# Once the download has finished, start it
pm2 start pm2-metrics
暴露的是9209端口,我们访问一下看看
http://<HOST>:9209/metrics
下面我们配置一下prometheus 的配置文件,添加一个job,重启prometheus
# test pm2 -- append this job to the `scrape_configs:` list in prometheus.yml.
# The target is the host running pm2; 9209 is the port the exporter listens on.
- job_name: 'nodejs-app'
  static_configs:
    - targets: ['172.19.143.3:9209']
访问prometheus的targets 检查一下,可以看到是没问题的
那么好,如何绘图呢?首先,我去grafana的官网看了下有没有能直接用的Dashboard。
很遗憾,并没有 (图中的两个用不了,可能是版本原因吧,只有一两个表有数据)。
经过一番查找,找到一个能看的,上json
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "datasource",
"uid": "grafana"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"target": {
"limit": 100,
"matchAny": false,
"tags": [],
"type": "dashboard"
},
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [
{
"asDropdown": false,
"icon": "external link",
"includeVars": false,
"keepTime": false,
"tags": [],
"targetBlank": false,
"title": "New link",
"tooltip": "",
"type": "dashboards",
"url": ""
}
],
"liveNow": false,
"panels": [
{
"datasource": "Prometheus",
"description": "",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"custom": {
"fillOpacity": 70,
"lineWidth": 0,
"spanNulls": false
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "green",
"value": 1
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 24,
"x": 0,
"y": 0
},
"id": 2,
"links": [],
"options": {
"alignValue": "left",
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"mergeValues": true,
"rowHeight": 0.9,
"showValue": "auto",
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"pluginVersion": "8.5.2",
"targets": [
{
"datasource": "Prometheus",
"editorMode": "code",
"exemplar": false,
"expr": "pm2_up{name!=\"pm2-metrics\"}",
"format": "time_series",
"instant": false,
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{name}}",
"range": true,
"refId": "A",
"step": 120
}
],
"title": "Status",
"type": "state-timeline"
},
{
"datasource": "Prometheus",
"description": "Количество запущенных процессов (если больше одного, что-то не так)",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 3,
"w": 24,
"x": 0,
"y": 5
},
"id": 8,
"links": [],
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "8.5.2",
"targets": [
{
"datasource": "Prometheus",
"editorMode": "code",
"expr": "pm2_instances{name!=\"pm2-metrics\"}",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{name}}",
"range": true,
"refId": "A",
"step": 120
}
],
"title": "Instances",
"type": "stat"
},
{
"datasource": "Prometheus",
"description": "Перезапуск срабатывает в случае изменения файлов или при фатальной ошибке",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 3,
"w": 24,
"x": 0,
"y": 8
},
"id": 17,
"links": [],
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"text": {},
"textMode": "value_and_name"
},
"pluginVersion": "8.5.2",
"targets": [
{
"datasource": "Prometheus",
"editorMode": "code",
"expr": "pm2_restarts{name!=\"pm2-metrics\"}",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{name}}",
"range": true,
"refId": "A",
"step": 120
}
],
"title": "Restarts",
"type": "stat"
},
{
"datasource": "Prometheus",
"description": "Сколько времени живет микросервис",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"decimals": 1,
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 3,
"w": 24,
"x": 0,
"y": 11
},
"id": 7,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "vertical",
"reduceOptions": {
  "calcs": ["lastNotNull"],
  "fields": "",
  "values": false
},
"textMode": "auto"
},
"pluginVersion": "8.5.2",
"targets": [
{
"datasource": "Prometheus",
"editorMode": "code",
"exemplar": false,
"expr": "pm2_uptime{name!=\"pm2-metrics\"}",
"format": "time_series",
"instant": false,
"intervalFactor": 1,
"legendFormat": "{{name}}",
"range": true,
"refId": "A",
"step": 1800
}
],
"title": "Uptime",
"type": "stat"
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"decimals": 2,
"description": "",
"fieldConfig": {
"defaults": {
"unit": "%"
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 12,
"x": 0,
"y": 14
},
"hiddenSeries": false,
"id": 4,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"hideEmpty": false,
"hideZero": false,
"max": true,
"min": true,
"rightSide": true,
"show": true,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.5.2",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"datasource": "Prometheus",
"editorMode": "code",
"exemplar": false,
"expr": "pm2_cpu{name!=\"pm2-metrics\"}",
"format": "time_series",
"hide": false,
"instant": false,
"interval": "",
"intervalFactor": 1,
"legendFormat": "{{name}}",
"range": true,
"refId": "A",
"step": 240
}
],
"thresholds": [],
"timeRegions": [],
"title": "CPU",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"mode": "time",
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:87",
"format": "%",
"logBase": 1,
"min": "0",
"show": true
},
{
"$$hashKey": "object:88",
"format": "short",
"logBase": 1,
"show": false
}
],
"yaxis": {
"align": false
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"decimals": 2,
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 12,
"x": 12,
"y": 14
},
"hiddenSeries": false,
"id": 5,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": true,
"rightSide": true,
"show": true,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.5.2",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"datasource": "Prometheus",
"editorMode": "code",
"expr": "pm2_memory{name!=\"pm2-metrics\"}",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{name}}",
"range": true,
"refId": "A",
"step": 240
}
],
"thresholds": [],
"timeRegions": [],
"title": "Memory",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"mode": "time",
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:87",
"format": "decbytes",
"logBase": 1,
"min": "0",
"show": true
},
{
"$$hashKey": "object:88",
"format": "short",
"logBase": 1,
"show": false
}
],
"yaxis": {
"align": false
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"decimals": 2,
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 7,
"w": 12,
"x": 0,
"y": 23
},
"hiddenSeries": false,
"id": 11,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": true,
"rightSide": true,
"show": true,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.5.2",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"datasource": "Prometheus",
"editorMode": "code",
"expr": "pm2_http_mean_latency{name!=\"pm2-metrics\"}",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "mean-{{name}}",
"range": true,
"refId": "A",
"step": 240
},
{
"datasource": "Prometheus",
"editorMode": "code",
"expr": "pm2_http_p95_latency{name!=\"pm2-metrics\"}",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "p95-{{name}}",
"range": true,
"refId": "B"
}
],
"thresholds": [],
"timeRegions": [],
"title": "HTTP Latency",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"mode": "time",
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:310",
"format": "ms",
"logBase": 1,
"min": "0",
"show": true
},
{
"$$hashKey": "object:311",
"format": "short",
"logBase": 1,
"show": false
}
],
"yaxis": {
"align": false
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"decimals": 2,
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 7,
"w": 12,
"x": 12,
"y": 23
},
"hiddenSeries": false,
"id": 12,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": true,
"rightSide": true,
"show": true,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.5.2",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"datasource": "Prometheus",
"editorMode": "code",
"expr": "pm2_event_loop_latency{name!=\"pm2-metrics\"}",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "mean-{{name}}",
"range": true,
"refId": "A",
"step": 240
},
{
"datasource": "Prometheus",
"editorMode": "code",
"expr": "pm2_event_loop_latency_p95{name!=\"pm2-metrics\"}",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "p95-{{name}}",
"range": true,
"refId": "B"
}
],
"thresholds": [],
"timeRegions": [],
"title": "Event Loop Latency",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"mode": "time",
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:673",
"format": "ms",
"logBase": 1,
"min": "0",
"show": true
},
{
"$$hashKey": "object:674",
"format": "short",
"logBase": 1,
"show": false
}
],
"yaxis": {
"align": false
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"decimals": 0,
"fill": 0,
"fillGradient": 0,
"gridPos": {
"h": 7,
"w": 12,
"x": 0,
"y": 30
},
"hiddenSeries": false,
"id": 13,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.5.2",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"datasource": "Prometheus",
"editorMode": "code",
"expr": "pm2_active_handles{name!=\"pm2-metrics\"}",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{name}}",
"range": true,
"refId": "A",
"step": 120
}
],
"thresholds": [],
"timeRegions": [],
"title": "Active handles",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"mode": "time",
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:599",
"format": "short",
"logBase": 1,
"min": "0",
"show": true
},
{
"$$hashKey": "object:600",
"format": "short",
"logBase": 1,
"show": false
}
],
"yaxis": {
"align": false
}
}
],
"refresh": "",
"schemaVersion": 36,
"style": "dark",
"tags": [],
"templating": {
"list": []
},
"time": {
"from": "now-3h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"]
},
"timezone": "",
"title": "Микросервисы",
"uid": "_GcdVy_7z",
"version": 24,
"weekStart": ""
}
导入
ok,看看成果
告警规则
# Prometheus alerting rules for processes managed by pm2.
# The `name` label on each series identifies the pm2 process.
groups:
  - name: pm2_alerts
    rules:
      # Fires when a process stays above 90% CPU for 5 minutes.
      - alert: HighCPUUsage
        expr: pm2_cpu > 90
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 检测到高CPU使用率
          description: '{{ $labels.name }} 进程正在经历高CPU使用率 ({{ $value }}%)'
      # Fires when a process stays above 800 MiB for 5 minutes.
      # pm2_memory is in bytes, so the threshold is written in bytes and the
      # value is rendered with humanize1024 (e.g. "812.3Mi" + "B").
      - alert: HighMemoryUsage
        expr: pm2_memory > 800 * 1024 * 1024
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 检测到高内存使用率
          description: '{{ $labels.name }} 进程正在消耗大量内存 ({{ $value | humanize1024 }}B)'
      # Fires when a process restarted more than 10 times within the last hour.
      # increase() yields the growth of the restart count over the window;
      # rate() would yield a per-second value that never approaches 10.
      - alert: ProcessRestartFrequency
        expr: increase(pm2_restarts[1h]) > 10
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 进程频繁重启
          description: "进程'{{ $labels.name }}'(实例ID:{{ $labels.instance }})在1小时内重启次数超过10次,当前重启次数为{{ $value }}次。"