loki安装和配置
loki的下载地址:https://github.com/grafana/loki/
下载二进制包
mkdir /usr/local/loki
cd /usr/local/loki
curl -O -L "https://github.com/grafana/loki/releases/download/v2.8.6/loki-linux-amd64.zip"
解压
unzip loki-linux-amd64.zip
添加执行权限
chmod +x loki-linux-amd64
编写配置文件
vim loki-config.yml
# Loki single-node configuration (local filesystem storage, in-memory ring).
auth_enabled: false

server:
  http_listen_port: 3100
  grpc_listen_port: 9096

ingester:
  lifecycler:
    address: 127.0.0.1
    ring:
      kvstore:
        store: inmemory
      replication_factor: 1
    final_sleep: 0s
  chunk_idle_period: 10m
  chunk_retain_period: 30s

schema_config:
  configs:
    - from: 2020-05-15
      store: boltdb
      object_store: filesystem
      schema: v11
      index:
        prefix: index_
        period: 168h

storage_config:
  boltdb:
    directory: /usr/local/loki/index
  filesystem:
    directory: /usr/local/loki/chunks # chunk storage path

limits_config:
  enforce_metric_name: false
  reject_old_samples: true # whether to reject old samples
  reject_old_samples_max_age: 168h # samples older than 168h are rejected at ingestion (not deleted)
  ingestion_rate_mb: 200
  ingestion_burst_size_mb: 300
  per_stream_rate_limit: 1000MB
  max_entries_limit_per_query: 10000

chunk_store_config:
  max_look_back_period: 168h # to avoid querying past the retention period, must be <= retention_period below

table_manager:
  retention_deletes_enabled: true # enable retention-based deletion
  retention_period: 168h # chunk data older than 168h is deleted

# Alerting via Alertmanager
ruler:
  storage:
    type: local
    local:
      directory: /usr/local/loki/rules # alert rule files location
  rule_path: /usr/local/loki/rules-temp
  alertmanager_url: http://192.168.0.1:9093 # Alertmanager address
  ring:
    kvstore:
      store: inmemory
  enable_api: true
  enable_alertmanager_v2: true
后台启动loki
nohup ./loki-linux-amd64 --config.file=loki-config.yml &
编写启动脚本
vim start.sh
#!/bin/bash
# Restart Loki: stop any running loki-linux-amd64 process, then start a new
# one in the background using loki-config.yml from the script's directory.

# Resolve the binary and config relative to this script, not the caller's cwd.
cd "$(dirname "$0")" || exit 1

echo "stop loki"
if pgrep -f loki-linux-amd64 >/dev/null; then
  # Ask for a graceful shutdown first; SIGKILL is only the fallback.
  pkill -TERM -f loki-linux-amd64
  for _ in {1..10}; do
    pgrep -f loki-linux-amd64 >/dev/null || break
    sleep 1
  done
  if pgrep -f loki-linux-amd64 >/dev/null; then
    pkill -KILL -f loki-linux-amd64
  fi
fi

echo "Begin start loki"
# Send output to a named log file instead of the default nohup.out.
nohup ./loki-linux-amd64 --config.file=loki-config.yml > ./loki.log 2>&1 &
chmod +x start.sh
日志代理Promtail安装和配置
mkdir /usr/local/promtail
cd /usr/local/promtail
下载
curl -O -L "https://github.com/grafana/loki/releases/download/v2.7.4/promtail-linux-amd64.zip"
解压并添加执行权限
unzip promtail-linux-amd64.zip
chmod +x promtail-linux-amd64
编写配置文件
vim promtail-config.yml
# Promtail configuration: tail local log files and push them to Loki.
server:
  http_listen_port: 9080
  grpc_listen_port: 0

positions:
  filename: /usr/local/promtail/positions.yaml

clients:
  - url: http://10.X.X.125:3100/loki/api/v1/push # fill in the Loki address

scrape_configs:
  - job_name: messagelog
    static_configs:
      - targets:
          - localhost
        labels:
          #job: messagelog
          # custom label: name
          host: namenode01
          __path__: /var/log/*.log
  - job_name: agentlog
    static_configs:
      - targets:
          - localhost
        labels:
          #job: xxxx
          host: xxxx
          __path__: /xxx/xxx/xxx.log
编写promtail启动脚本
vi restart-promtail.sh
#!/bin/bash
# Restart Promtail: stop any running promtail-linux-amd64 process, then start
# a new one in the background, logging to ./promtail.log.

# Resolve the binary, config and log relative to this script, not the caller's cwd.
cd "$(dirname "$0")" || exit 1

echo "Begin stop promtail"
if pgrep -f promtail-linux-amd64 >/dev/null; then
  # Ask for a graceful shutdown first; SIGKILL is only the fallback.
  pkill -TERM -f promtail-linux-amd64
  for _ in {1..10}; do
    pgrep -f promtail-linux-amd64 >/dev/null || break
    sleep 1
  done
  if pgrep -f promtail-linux-amd64 >/dev/null; then
    pkill -KILL -f promtail-linux-amd64
  fi
fi

echo "Begin start promtail...."
nohup ./promtail-linux-amd64 --config.file=promtail-config.yml > ./promtail.log 2>&1 &
Grafana可视化安装和配置
下载启动
wget https://dl.grafana.com/enterprise/release/grafana-enterprise-9.4.3-1.x86_64.rpm
yum install grafana-enterprise-9.4.3-1.x86_64.rpm -y
systemctl start grafana-server
docker安装
docker run -itd --name grafana -p 3000:3000 grafana/grafana
访问grafana
ip+端口
初始用户密码admin/admin,登陆后设置新密码
添加数据源
或者这里
选择这个
查看数据
选择数据源,点击标签
Kick start your query是查询语句
Operations可以用来过滤
alertmanager安装配置
下载安装
解压启动
cd /etc/alertmanager
tar zxvf alertmanager-0.22.2.linux-amd64.tar.gz --strip-components=1
vim alertmanager.yml
nohup ./alertmanager --config.file=alertmanager.yml &
docker安装
docker run -d --restart=always --name=alertmanager -p 9093:9093 -v /etc/alertmanager:/etc/alertmanager prom/alertmanager:latest
编辑配置文件
/etc/alertmanager/alertmanager.yml
# Alertmanager configuration: route all alerts to a single e-mail receiver.
global:
  resolve_timeout: 5m
  # Outgoing mail settings (QQ mail is used here)
  smtp_from: 'xxxxx@qq.com'
  smtp_smarthost: 'smtp.qq.com:465'
  smtp_auth_username: 'xxxxx@qq.com'
  # Use the QQ mail authorization code here, not the login password;
  # the code is shown in the mailbox account settings.
  smtp_auth_password: 'xxxxxx'
  smtp_require_tls: false

route:
  group_by: ['alert_node']
  group_wait: 5s
  group_interval: 5s
  repeat_interval: 5m
  receiver: 'email'

receivers:
  - name: 'email'
    email_configs:
      # Replace with your own recipient address(es); separate several with commas.
      - to: 'xxxx@163.com,xxxx@163.com'
        send_resolved: true

# NOTE(review): this inhibit rule is incomplete in the original notes;
# fill in its matchers before relying on it.
inhibit_rules:
  - source_match:
报警规则
在我们配置文里定义的报警规则存放地址里编写
mkdir -p /usr/local/loki/rules/fake && cd /usr/local/loki/rules/fake
vim error.yml
# Loki alert rule group: fire when a host logs too many "error" lines.
groups:
  - name: error
    rules:
      - alert: Too-many-error-logs-alert
        # The selector can be replaced with a custom label, e.g. job=~"xx"
        expr: count_over_time({host=~"namenode01"}|~"error"[1m]) > 10
        for: 1m
        labels:
          severity: warning
          instance: " 主机{{ $labels.host }}的日志: {{ $labels.filename }}"
        annotations:
          summary: Too many error logs in host logs
          description: 1分钟之内日志内error日志出现10次以上