Prometheus+Grafana同时监控nifi、pg、主机
一、prometheus
- 下载
prometheus下载:https://github.com/prometheus/prometheus/releases ,这里选用2.28.1版本,下载后上传到服务器
tar -zxf prometheus-2.28.1.linux-amd64.tar.gz
mv prometheus-2.28.1.linux-amd64 prometheus
chown -R root:root prometheus
- 配置启动服务
- 配置后台启动
##首先创建启动service
vim /usr/lib/systemd/system/prometheus.service
##在里面添加下面内容
[Unit]
Description=Prometheus
Documentation=https://prometheus.io/
After=network.target
[Service]
Type=simple
ExecStart=/opt/prometheus/prometheus --config.file=/opt/prometheus/prometheus.yml --web.enable-lifecycle --storage.tsdb.path=/disk/prometheus/data --storage.tsdb.retention=60d
Restart=on-failure
[Install]
WantedBy=multi-user.target
##最后设置开机启动和启动应用
systemctl enable prometheus
systemctl start prometheus
##或者直接启动
nohup ./prometheus --config.file=prometheus.yml >prometheus.log 2>&1 &
- 启动参数配置说明:
- web.enable-lifecycle:指明prometheus配置更改后可以进行热加载
配置修改后用下面命令进行配置重载:curl -X POST http://localhost:9090/-/reload
- storage.tsdb.path:指明监控数据存储路径
- storage.tsdb.retention:指明数据保留时间
- web.enable-lifecycle:指明prometheus配置更改后可以进行热加载
二、node_exporter
- 下载node_exporter
- 配置后台启动服务
# 配置服务
vi /usr/lib/systemd/system/node_exporter.service
[Unit]
Description=node_exporter
After=network.target
[Service]
Type=simple
User=prometheus
ExecStart=/disk1/node_exporter/node_exporter
Restart=on-failure
[Install]
WantedBy=multi-user.target
# 设置服务开机自启动
systemctl enable node_exporter
systemctl start node_exporter
# 直接启动
nohup ./node_exporter >node_exporter.log 2>&1 &
# 查看服务
[root@VM_0_13_centos pushgateway]# netstat -lntup |grep node_export
- prometheus配置node采集点信息:
在prometheus.yml中配置node采集点信息,加入以下配置
  - job_name: 'node'
    static_configs:
      - targets: ['172.31.187.18:9100']
        labels:
          name: testNode
三、nifi监控数据采集
- nifi设置prometheus插件
然后就可以通过http://{instance ID}:{Endpoint Port}/metrics进行访问
- prometheus中配置nifi采集点
  - job_name: 'nifi'
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ['172.31.187.18:9110']
四、postgres_exporter监控postgres
- 下载postgres_exporter:https://github.com/prometheus-community/postgres_exporter/releases
- 解压安装之后配置
1.建立一些自定义指标文件:
vi custom.yaml
pg_replication:
query: "SELECT CASE WHEN NOT pg_is_in_recovery() THEN 0 ELSE GREATEST (0, EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))) END AS lag"
master: true
metrics:
- lag:
usage: "GAUGE"
description: "Replication lag behind master in seconds"
pg_postmaster:
query: "SELECT pg_postmaster_start_time as start_time_seconds from pg_postmaster_start_time()"
master: true
metrics:
- start_time_seconds:
usage: "GAUGE"
description: "Time at which postmaster started"
pg_stat_user_tables:
query: |
SELECT
current_database() datname,
schemaname,
relname,
seq_scan,
seq_tup_read,
idx_scan,
idx_tup_fetch,
n_tup_ins,
n_tup_upd,
n_tup_del,
n_tup_hot_upd,
n_live_tup,
n_dead_tup,
n_mod_since_analyze,
COALESCE(last_vacuum, '1970-01-01Z') as last_vacuum,
COALESCE(last_autovacuum, '1970-01-01Z') as last_autovacuum,
COALESCE(last_analyze, '1970-01-01Z') as last_analyze,
COALESCE(last_autoanalyze, '1970-01-01Z') as last_autoanalyze,
vacuum_count,
autovacuum_count,
analyze_count,
autoanalyze_count
FROM
pg_stat_user_tables
metrics:
- datname:
usage: "LABEL"
description: "Name of current database"
- schemaname:
usage: "LABEL"
description: "Name of the schema that this table is in"
- relname:
usage: "LABEL"
description: "Name of this table"
- seq_scan:
usage: "COUNTER"
description: "Number of sequential scans initiated on this table"
- seq_tup_read:
usage: "COUNTER"
description: "Number of live rows fetched by sequential scans"
- idx_scan:
usage: "COUNTER"
description: "Number of index scans initiated on this table"
- idx_tup_fetch:
usage: "COUNTER"
description: "Number of live rows fetched by index scans"
- n_tup_ins:
usage: "COUNTER"
description: "Number of rows inserted"
- n_tup_upd:
usage: "COUNTER"
description: "Number of rows updated"
- n_tup_del:
usage: "COUNTER"
description: "Number of rows deleted"
- n_tup_hot_upd:
usage: "COUNTER"
description: "Number of rows HOT updated (i.e., with no separate index update required)"
- n_live_tup:
usage: "GAUGE"
description: "Estimated number of live rows"
- n_dead_tup:
usage: "GAUGE"
description: "Estimated number of dead rows"
- n_mod_since_analyze:
usage: "GAUGE"
description: "Estimated number of rows changed since last analyze"
- last_vacuum:
usage: "GAUGE"
description: "Last time at which this table was manually vacuumed (not counting VACUUM FULL)"
- last_autovacuum:
usage: "GAUGE"
description: "Last time at which this table was vacuumed by the autovacuum daemon"
- last_analyze:
usage: "GAUGE"
description: "Last time at which this table was manually analyzed"
- last_autoanalyze:
usage: "GAUGE"
description: "Last time at which this table was analyzed by the autovacuum daemon"
- vacuum_count:
usage: "COUNTER"
description: "Number of times this table has been manually vacuumed (not counting VACUUM FULL)"
- autovacuum_count:
usage: "COUNTER"
description: "Number of times this table has been vacuumed by the autovacuum daemon"
- analyze_count:
usage: "COUNTER"
description: "Number of times this table has been manually analyzed"
- autoanalyze_count:
usage: "COUNTER"
description: "Number of times this table has been analyzed by the autovacuum daemon"
pg_statio_user_tables:
query: "SELECT current_database() datname, schemaname, relname, heap_blks_read, heap_blks_hit, idx_blks_read, idx_blks_hit, toast_blks_read, toast_blks_hit, tidx_blks_read, tidx_blks_hit FROM pg_statio_user_tables"
metrics:
- datname:
usage: "LABEL"
description: "Name of current database"
- schemaname:
usage: "LABEL"
description: "Name of the schema that this table is in"
- relname:
usage: "LABEL"
description: "Name of this table"
- heap_blks_read:
usage: "COUNTER"
description: "Number of disk blocks read from this table"
- heap_blks_hit:
usage: "COUNTER"
description: "Number of buffer hits in this table"
- idx_blks_read:
usage: "COUNTER"
description: "Number of disk blocks read from all indexes on this table"
- idx_blks_hit:
usage: "COUNTER"
description: "Number of buffer hits in all indexes on this table"
- toast_blks_read:
usage: "COUNTER"
description: "Number of disk blocks read from this table's TOAST table (if any)"
- toast_blks_hit:
usage: "COUNTER"
description: "Number of buffer hits in this table's TOAST table (if any)"
- tidx_blks_read:
usage: "COUNTER"
description: "Number of disk blocks read from this table's TOAST table indexes (if any)"
- tidx_blks_hit:
usage: "COUNTER"
description: "Number of buffer hits in this table's TOAST table indexes (if any)"
pg_database:
query: "SELECT pg_database.datname, pg_database_size(pg_database.datname) as size_bytes FROM pg_database"
master: true
cache_seconds: 30
metrics:
- datname:
usage: "LABEL"
description: "Name of the database"
- size_bytes:
usage: "GAUGE"
description: "Disk space used by the database"
pg_stat_statements:
query: "SELECT t2.rolname, t3.datname, queryid, calls, total_time / 1000 as total_time_seconds, min_time / 1000 as min_time_seconds, max_time / 1000 as max_time_seconds, mean_time / 1000 as mean_time_seconds, stddev_time / 1000 as stddev_time_seconds, rows, shared_blks_hit, shared_blks_read, shared_blks_dirtied, shared_blks_written, local_blks_hit, local_blks_read, local_blks_dirtied, local_blks_written, temp_blks_read, temp_blks_written, blk_read_time / 1000 as blk_read_time_seconds, blk_write_time / 1000 as blk_write_time_seconds FROM pg_stat_statements t1 JOIN pg_roles t2 ON (t1.userid=t2.oid) JOIN pg_database t3 ON (t1.dbid=t3.oid) WHERE t2.rolname != 'rdsadmin'"
master: true
metrics:
- rolname:
usage: "LABEL"
description: "Name of user"
- datname:
usage: "LABEL"
description: "Name of database"
- queryid:
usage: "LABEL"
description: "Query ID"
- calls:
usage: "COUNTER"
description: "Number of times executed"
- total_time_seconds:
usage: "COUNTER"
description: "Total time spent in the statement, in seconds"
- min_time_seconds:
usage: "GAUGE"
description: "Minimum time spent in the statement, in seconds"
- max_time_seconds:
usage: "GAUGE"
description: "Maximum time spent in the statement, in seconds"
- mean_time_seconds:
usage: "GAUGE"
description: "Mean time spent in the statement, in seconds"
- stddev_time_seconds:
usage: "GAUGE"
description: "Population standard deviation of time spent in the statement, in seconds"
- rows:
usage: "COUNTER"
description: "Total number of rows retrieved or affected by the statement"
- shared_blks_hit:
usage: "COUNTER"
description: "Total number of shared block cache hits by the statement"
- shared_blks_read:
usage: "COUNTER"
description: "Total number of shared blocks read by the statement"
- shared_blks_dirtied:
usage: "COUNTER"
description: "Total number of shared blocks dirtied by the statement"
- shared_blks_written:
usage: "COUNTER"
description: "Total number of shared blocks written by the statement"
- local_blks_hit:
usage: "COUNTER"
description: "Total number of local block cache hits by the statement"
- local_blks_read:
usage: "COUNTER"
description: "Total number of local blocks read by the statement"
- local_blks_dirtied:
usage: "COUNTER"
description: "Total number of local blocks dirtied by the statement"
- local_blks_written:
usage: "COUNTER"
description: "Total number of local blocks written by the statement"
- temp_blks_read:
usage: "COUNTER"
description: "Total number of temp blocks read by the statement"
- temp_blks_written:
usage: "COUNTER"
description: "Total number of temp blocks written by the statement"
- blk_read_time_seconds:
usage: "COUNTER"
description: "Total time the statement spent reading blocks, in seconds (if track_io_timing is enabled, otherwise zero)"
- blk_write_time_seconds:
usage: "COUNTER"
description: "Total time the statement spent writing blocks, in seconds (if track_io_timing is enabled, otherwise zero)"
pg_process_idle:
query: |
WITH
metrics AS (
SELECT
application_name,
SUM(EXTRACT(EPOCH FROM (CURRENT_TIMESTAMP - state_change))::bigint)::float AS process_idle_seconds_sum,
COUNT(*) AS process_idle_seconds_count
FROM pg_stat_activity
WHERE state = 'idle'
GROUP BY application_name
),
buckets AS (
SELECT
application_name,
le,
SUM(
CASE WHEN EXTRACT(EPOCH FROM (CURRENT_TIMESTAMP - state_change)) <= le
THEN 1
ELSE 0
END
)::bigint AS bucket
FROM
pg_stat_activity,
UNNEST(ARRAY[1, 2, 5, 15, 30, 60, 90, 120, 300]) AS le
GROUP BY application_name, le
ORDER BY application_name, le
)
SELECT
application_name,
process_idle_seconds_sum as seconds_sum,
process_idle_seconds_count as seconds_count,
ARRAY_AGG(le) AS seconds,
ARRAY_AGG(bucket) AS seconds_bucket
FROM metrics JOIN buckets USING (application_name)
GROUP BY 1, 2, 3
metrics:
- application_name:
usage: "LABEL"
description: "Application Name"
- seconds:
usage: "HISTOGRAM"
description: "Idle time of server processes"
pg_stat_connetion:
query: "select state,count(*) as num from pg_stat_activity group by state"
metrics:
- state:
usage: "LABEL"
description: "type of session state"
- num:
usage: "GAUGE"
description: "number of sessions in this state"
pg_stat_long:
query: "SELECT pid, runtime from (select usename, pid, EXTRACT(EPOCH FROM (now() - query_start))::INT as runtime FROM pg_stat_activity) as ss where runtime > 180 order by runtime desc limit 5"
metrics:
- pid:
usage: "LABEL"
description: "Pid of the client"
- runtime:
usage: "GAUGE"
description: "seconds elapsed since the current query started"
2.新建postgres_exporter.env环境配置
# postgres_exporter.env
DATA_SOURCE_NAME="postgresql://{user}:{password}@172.31.184.170:5432/?sslmode=disable"
PG_EXPORTER_EXTEND_QUERY_PATH="/opt/postgres_exporter/custom.yaml"
3.创建启动service
vim /usr/lib/systemd/system/postgres_exporter.service
[Unit]
Description=Prometheus exporter for Postgresql
Wants=network-online.target
After=network-online.target
[Service]
User=postgres
Group=postgres
WorkingDirectory=/disk1/postgres_exporter
EnvironmentFile=/disk1/postgres_exporter/postgres_exporter.env
ExecStart=/disk1/postgres_exporter/postgres_exporter
Restart=always
[Install]
WantedBy=multi-user.target
4.配置到prometheus
  - job_name: 'postgres'
    static_configs:
      - targets: ['172.31.184.170:9187']
五、grafana监控ui
1.安装
wget https://dl.grafana.com/enterprise/release/grafana-enterprise-9.0.4-1.x86_64.rpm
sudo yum install grafana-enterprise-9.0.4-1.x86_64.rpm
2.修改配置
vim conf/defaults.ini
data = /disk1/grafana/data
logs = /disk1/grafana/data/log
plugins = /disk1/grafana/data/plugins
provisioning = /disk1/grafana/conf/provisioning
##为了可以嵌入应用界面 这里面配置对应的网关配置 可以解决跨域问题
domain = 172.31.184.170
root_url = %(protocol)s://%(domain)s/grafana/
serve_from_sub_path = true
[auth.anonymous]
# enable anonymous access
# 去掉注释,改为true,允许匿名访问
enabled = true
# specify organization name that should be used for unauthenticated users
# 匿名用户属于的组织
org_name = Main Org.
# specify role for unauthenticated users
# 匿名用户的角色/权限
org_role = Viewer
allow_embedding = true
3.配置nginx网关代理172.31.184.170:3000
location /grafana/ {
root html;
index index.html index.htm;
proxy_pass http://172.31.184.170:3000/;
}
4.启动grafana
systemctl daemon-reload
systemctl enable grafana-server
systemctl start grafana-server