python实现自动更新prometheus规则

由于公司需要监控目标类型较多,不能手动去改动prometheus规则然后reload,所以就通过python写了个程序自动更新prometheus配置

基本环境准备

  • python 3.10.10
  • flask 2.3.2
  • prometheus 2.52.0

基本流程

  1. 将接口传来的prometheus规则信息保存到数据表中
  2. 取数据表中所有prometheus规则生成规则文件保存到本地临时文件夹内
  3. 获取需要修改prometheus机器ip
  4. 根据第三步获取的ip读取之前的prometheus规则备用
  5. 根据第三步获取的ip删除之前prometheus规则文件
  6. 把第二步生成的规则文件上传到第三步获取ip机器上
  7. 通过调用http://{ip}:9090/-/reload接口让配置文件重新生效
  8. 如果新的规则文件未生效,把第四步备用的规则文件重新上传
  9. 清除第二步中的临时文件夹

以上为开发流程,在基本环境准备好的前提下开始开发。本文中导入的kevin模块均为本人自行开发的内部模块。

1.数据表创建及模型开发

-- Table holding one row per Prometheus alerting rule; the rule file pushed to
-- the Prometheus hosts is generated from all rows of this table.
-- WARNING: the DROP below discards every stored rule before recreating the table.
DROP TABLE IF EXISTS `prom_ruler`;
CREATE TABLE `prom_ruler` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  -- Alert expression: metric + comparison symbol + threshold, concatenated by the API layer.
  `expr` varchar(1023) DEFAULT NULL,
  -- Written to the rule's `for` field in the generated rule file.
  `duration` varchar(16) DEFAULT NULL,
  -- Written to the rule's `severity` label.
  `severity` varchar(15) DEFAULT NULL,
  -- Written to the rule's `summary` / `description` annotations.
  `summary` text,
  `description` text,
  -- Id of the data source row whose `url` lists the Prometheus hosts to update.
  `datasource` int(11) DEFAULT NULL,
  -- Id of the group row whose `name` becomes the alert name (`group` is a reserved word, hence the backticks).
  `group` int(11) DEFAULT NULL,
  -- Not written by the application code shown here; presumably maintained by the ORM base model -- TODO confirm.
  `updated_on` datetime DEFAULT NULL,
  `created_on` datetime DEFAULT NULL,
  -- Id of the associated alarm-heal entry; the API layer stores 0 when there is none.
  `heal` int(11) DEFAULT NULL,
  PRIMARY KEY (`id`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;
#!/usr/bin/env python
# -*- coding: UTF-8 -*-

from sqlalchemy import Column, Integer, String, TEXT

from kevin.sqlalchemy_utils.model import Model


class PromRulerModel(Model):
    """ORM mapping of one Prometheus alerting rule stored in ``prom_ruler``.

    The column sizes mirror the ``prom_ruler`` DDL (``expr varchar(1023)``,
    ``duration varchar(16)``, ``severity varchar(15)``) so that a schema
    generated from this model matches the hand-written table. Every string
    column carries an explicit length because MySQL cannot create a VARCHAR
    without one.

    ``id``, ``created_on`` and ``updated_on`` are not declared here; they are
    presumably provided by the ``Model`` base class -- TODO confirm.
    """
    __tablename__ = "prom_ruler"

    # Alert expression: metric + comparison symbol + threshold.
    expr = Column(String(1023))
    # Emitted as the rule's ``for`` field.
    duration = Column(String(16))
    # Emitted as the rule's ``severity`` label.
    severity = Column(String(15))
    # Emitted as the rule's annotations.
    summary = Column(TEXT)
    description = Column(TEXT)

    # Id of the data source whose ``url`` lists the Prometheus hosts.
    datasource = Column(Integer)
    # Id of the group whose ``name`` becomes the alert name.
    group = Column(Integer)

    # Id of the associated alarm-heal entry; 0 means "none".
    heal = Column(Integer)

2.业务逻辑处理

#!/usr/bin/env python
# -*- coding: UTF-8 -*-

import traceback
from typing import List

import yaml

import requests
from flask import request

from kevin.common.http_code import HttpCode
from kevin.log import logger
from kevin.sqlalchemy_utils.model_utils import list_to_dictlist
from kevin.utils import pretty_result

from config import Config
from extensions.ssh_ext import read_remote_file, exec_ssh_one_cmd, upload_file_to_linux
from models.alarm_heal import AlarmHeal
from models.common import GroupModel, DataSourceModel
from models.prometheus import PromRulerModel, PromRuleClassifyModel
from resource.v1.dc2_monitor_mixin import MonitorMixin


class PrometheusDbView(MonitorMixin):
    """Database-side helpers for Prometheus alert rules (``prom_ruler`` rows)."""

    @staticmethod
    def generate_prometheus_rule_model(prometheus_rule: dict) -> PromRulerModel:
        """Build an unsaved ``PromRulerModel`` from a request payload.

        Args:
            prometheus_rule: Payload containing ``monitorMetrics``,
                ``symbols``, ``alarmThreshold``, ``duration``, ``severity``,
                ``summary``, ``description``, ``datasource`` and ``group``;
                ``heal`` is optional.

        Returns:
            A model whose ``expr`` is metric + symbol + threshold and whose
            ``heal`` is 0 when the payload carries no usable heal id.

        Raises:
            KeyError: If a required key is missing from the payload.
        """
        prom_rule_model = PromRulerModel(
            expr=prometheus_rule["monitorMetrics"] + prometheus_rule["symbols"] + prometheus_rule["alarmThreshold"],
            duration=prometheus_rule["duration"],
            severity=prometheus_rule["severity"],
            summary=prometheus_rule["summary"],
            description=prometheus_rule["description"],
            datasource=prometheus_rule["datasource"],
            group=prometheus_rule["group"]
        )
        # A missing, empty or literal "None" heal value all mean "no heal action".
        heal = prometheus_rule.get("heal")
        prom_rule_model.heal = heal if heal and heal != "None" else 0
        return prom_rule_model

    @staticmethod
    def save_prometheus_rule(prom_rule_model) -> None:
        """Insert ``prom_rule_model`` into the database.

        Raises:
            Exception: If the ORM reports that the insert did not succeed.
        """
        add_prom_rule_model_res = prom_rule_model.objects.add()
        if not add_prom_rule_model_res:
            raise Exception("prom rule was failed deposited into the database.")

    @staticmethod
    def delete_prometheus_rule(prometheus_rule_id: int) -> dict:
        """Delete one rule and return a snapshot of its attributes.

        Args:
            prometheus_rule_id: Primary key of the rule to delete.

        Returns:
            A copy of the deleted row's attribute dict (callers read
            ``datasource`` from it to find the hosts to reload).

        Raises:
            ValueError: If no rule with that id exists.
        """
        prom_rule_obj = PromRulerModel.objects_().get_by_id(prometheus_rule_id)
        if prom_rule_obj is None:
            raise ValueError(f"prom rule {prometheus_rule_id} does not exist.")
        # Copy before deleting: the live ``__dict__`` belongs to the ORM
        # instance and may be emptied or changed once the row is gone.
        prom_rule = dict(prom_rule_obj.__dict__)
        PromRulerModel.objects_().delete(PromRulerModel.id == prometheus_rule_id)
        return prom_rule

    def save_prometheus_rule_to_db(self, prometheus_rule: dict) -> None:
        """Convert a request payload to a model and persist it."""
        logger.info("Start save prometheus rule to db.")
        prom_rule_model = self.generate_prometheus_rule_model(prometheus_rule=prometheus_rule)
        self.save_prometheus_rule(prom_rule_model=prom_rule_model)
        logger.info("Success save prometheus rule to db.")

    @staticmethod
    def get_prometheus_rule_by_db() -> List[dict]:
        """Return every stored rule as a dict with ids resolved to names.

        For each rule, ``datasource_id`` / ``group_id`` receive the numeric
        ids while ``datasource`` / ``group`` are replaced by the referenced
        row's name. When a referenced row cannot be resolved (for example it
        was deleted) the failure is logged and the raw id is left in place,
        so one dangling reference does not break the whole listing.
        """
        prom_rules = list_to_dictlist(PromRulerModel.objects_().list() or [])
        for rule in prom_rules:
            datasource = rule.get("datasource")
            if datasource is not None and datasource != "None":
                rule["datasource_id"] = int(datasource)
                try:
                    rule["datasource"] = DataSourceModel.objects_().get_by_id(int(datasource)).name
                except Exception as e:
                    logger.error(f"datasource {datasource} lookup failed, it may have been deleted: {e}")
            group = rule.get("group")
            if group is not None and group != "None":
                rule["group_id"] = int(group)
                try:
                    rule["group"] = GroupModel.objects_().get_by_id(int(group)).name
                except Exception as e:
                    logger.error(f"group {group} lookup failed, it may have been deleted: {e}")
        return prom_rules

    def update_prometheus_rule_by_db(self, prometheus_rule: dict) -> None:
        """Replace the stored rule identified by ``prometheus_rule["id"]``.

        NOTE(review): the update is implemented as delete + insert, so the
        rule receives a new primary key and the two steps are not atomic;
        anything that stores rule ids elsewhere should be checked against
        this behaviour.
        """
        logger.info("Start update prometheus rule to db.")
        # Build the new model first so a malformed payload fails before
        # anything is deleted.
        prom_rule_model = self.generate_prometheus_rule_model(prometheus_rule=prometheus_rule)
        self.delete_prometheus_rule(prometheus_rule_id=prometheus_rule["id"])
        self.save_prometheus_rule(prom_rule_model=prom_rule_model)
        logger.info("Finish update prometheus rule to db.")


class PrometheusRuleView(MonitorMixin):
    """Generate the rule file from the database and push it to Prometheus hosts."""

    @staticmethod
    def generate_prometheus_ips(prometheus_rule: dict) -> list:
        """Return the Prometheus host addresses of the rule's data source.

        The data source's ``url`` field is a comma-separated list; entries
        are stripped of surrounding whitespace and empty entries are dropped
        so that ``"a, b,"`` does not yield unusable host names.
        """
        datasource_id = int(prometheus_rule["datasource"])
        url = DataSourceModel.objects_().get_by_id(datasource_id).url
        return [ip.strip() for ip in url.split(",") if ip.strip()]

    def read_prom_config(self, ip: str) -> list:
        """Read the current remote rule file so it can be restored on failure."""
        host_info = self._get_host_info(ip)
        prom_rule_old_yml = read_remote_file(host_info=host_info, remote_path=Config.PROMETHEUS_CONFIG_PATH)
        return prom_rule_old_yml

    def generate_prometheus_yml(self) -> None:
        """Write a rule file containing every stored rule to the local temp path.

        All rules go into a single group named ``default_group``. A rule that
        cannot be converted (for example because its group row is gone) is
        logged and skipped so the remaining rules are still deployed.
        """
        prometheus_yml = {
            "groups": [
                {
                    "name": "default_group",
                    "rules": []
                }
            ]
        }

        prom_rule_model = PromRulerModel.objects_().list() or []
        prom_rule_list = list_to_dictlist(prom_rule_model)
        for prom_rule in prom_rule_list:
            try:
                prom_rule_dict = {
                    # The alert is named after the rule's group.
                    "alert": GroupModel.objects_().get_by_id(int(prom_rule["group"])).name,
                    "annotations": {
                        "summary": prom_rule["summary"],
                        "description": prom_rule["description"],
                    },
                    "expr": prom_rule["expr"],
                    "for": prom_rule["duration"],
                    "labels": {
                        "prom_id": prom_rule["id"],
                        "severity": prom_rule["severity"]
                    }
                }
                prometheus_yml["groups"][0]["rules"].append(prom_rule_dict)
            except Exception as e:
                logger.error(f"generate prom rule, error: {e}, {traceback.format_exc()}")
                logger.error(f"prom rule {prom_rule}")

        # The temp directory must exist before the file is opened for writing.
        self.generate_conf_tmp_dir()

        with open(Config.PROMETHEUS_CONFIG_LOCAL_PATH, "w", encoding='utf-8') as f:
            yaml.dump(prometheus_yml, f, allow_unicode=True)

    @staticmethod
    def generate_old_prometheus_yml(prom_rule_old_yml) -> None:
        """Write the previously read remote rules to the local "old" temp path.

        NOTE(review): the content is passed through ``yaml.dump``; this only
        reproduces the original file if ``read_remote_file`` returns parsed
        YAML rather than raw text -- confirm against that helper.
        """
        with open(Config.PROMETHEUS_CONFIG_OLD_LOCAL_PATH, "w", encoding='utf-8') as f:
            yaml.dump(prom_rule_old_yml, f, allow_unicode=True)

    def remove_remote_prometheus_rules(self, ip: str) -> None:
        """Delete the rule file on the remote host over SSH."""
        remove_remote_prometheus_rule_cmd = f"rm -f {Config.PROMETHEUS_CONFIG_PATH}"
        host_info = self._get_host_info(ip)
        exec_ssh_one_cmd(host_info=host_info, command=remove_remote_prometheus_rule_cmd)

    def upload_prometheus_rules_file(self, ip: str, local_host: str) -> None:
        """Upload a local rule file to the remote rule path.

        Args:
            ip: Address of the Prometheus host.
            local_host: Path of the local file to upload (the name is kept
                for compatibility with existing keyword callers).
        """
        host_info = self._get_host_info(ip)
        upload_file_to_linux(
            host_info=host_info,
            local_path=local_host,
            remote_path=Config.PROMETHEUS_CONFIG_PATH
        )

    @staticmethod
    def reload_prometheus_rules(ip: str, timeout: float = 30.0) -> bool:
        """Ask Prometheus on ``ip`` to reload its configuration.

        Args:
            ip: Address of the Prometheus host (port 9090 is used).
            timeout: Seconds to wait for the HTTP request before giving up.

        Returns:
            True when the reload endpoint answers with HTTP 200; False on any
            other status or when the request itself fails, so that the caller
            can still roll back instead of aborting on a network error.
        """
        url = f"http://{ip}:9090/-/reload"
        try:
            resp = requests.post(url=url, timeout=timeout)
        except requests.RequestException as e:
            logger.error(f"Failed to reload prom rule {ip}, error: {e}")
            return False
        if resp.status_code == 200:
            logger.info(f"Success reload prom rule {ip}")
            return True
        logger.error(f"Failed to reload prom rule {ip}")
        return False

    def reload_prometheus_conf(self, prometheus_rule: dict) -> None:
        """Deploy the current rule set to every host of the rule's data source.

        For each host the old remote rules are read first; if reloading with
        the new file fails, the old rules are uploaded again and reloaded.
        The local temp directory is removed even when a step raises.
        """
        try:
            # 2. Generate the new rule file locally.
            self.generate_prometheus_yml()
            # 3. Resolve the hosts that need the new file.
            prometheus_ips = self.generate_prometheus_ips(prometheus_rule=prometheus_rule)
            for prometheus_ip in prometheus_ips:
                # 4. Keep the current remote rules as a fallback.
                prom_old_rule = self.read_prom_config(ip=prometheus_ip)
                # 5. Remove the remote rule file.
                self.remove_remote_prometheus_rules(ip=prometheus_ip)
                # 6. Upload the new rule file.
                self.upload_prometheus_rules_file(ip=prometheus_ip, local_host=Config.PROMETHEUS_CONFIG_LOCAL_PATH)
                # 7. Trigger the remote reload.
                if self.reload_prometheus_rules(ip=prometheus_ip):
                    continue
                # 8. Reload failed: regenerate the old rule file, upload it
                #    and reload again.
                self.generate_old_prometheus_yml(prom_old_rule)
                self.upload_prometheus_rules_file(ip=prometheus_ip, local_host=Config.PROMETHEUS_CONFIG_OLD_LOCAL_PATH)
                if not self.reload_prometheus_rules(ip=prometheus_ip):
                    logger.error(f"Failed to roll back prom rule on {prometheus_ip}")
        finally:
            # 9. Always remove the temporary directory.
            self.clear_conf_tmp_dir()


class PrometheusView(PrometheusDbView, PrometheusRuleView):
    """REST resource for listing, creating, updating and deleting alert rules."""

    @staticmethod
    def _list_rules_by_classify(rule_classify_id: int) -> list:
        """Return the rules referenced by one rule classification as dicts.

        A missing classification, or one without rule ids, yields an empty
        list. Dangling datasource/group references fall back to the stored id
        and a dangling heal reference to an empty title.
        """
        prom_rule_classify = PromRuleClassifyModel.objects_().get_by_id(rule_classify_id)
        if prom_rule_classify is None:
            return []
        # ``rule_ids`` is a comma-separated string; drop blanks so that an
        # empty value does not turn into a lookup for the id "".
        rule_ids = [
            rule_id.strip()
            for rule_id in (prom_rule_classify.rule_ids or "").split(",")
            if rule_id.strip()
        ]
        if not rule_ids:
            return []

        prom_rules = []
        for prom_rule in PromRulerModel.objects_().list_by_ids(rule_ids) or []:
            datasource = DataSourceModel.objects_().get_by_id(prom_rule.datasource)
            group = GroupModel.objects_().get_by_id(prom_rule.group)
            heal_title = ""
            if prom_rule.heal and int(prom_rule.heal) != 0:
                heal = AlarmHeal.objects_().get_by_id(int(prom_rule.heal))
                heal_title = heal.title if heal is not None else ""
            prom_rules.append({
                "datasource_id": prom_rule.datasource,
                "datasource": datasource.name if datasource is not None else prom_rule.datasource,
                "description": prom_rule.description,
                "duration": prom_rule.duration,
                "expr": prom_rule.expr,
                "group_id": prom_rule.group,
                "group": group.name if group is not None else prom_rule.group,
                "id": prom_rule.id,
                "severity": prom_rule.severity,
                "summary": prom_rule.summary,
                "heal": prom_rule.heal,
                "heal_title": heal_title,
            })
        return prom_rules

    def get(self):
        """List rules, optionally filtered by classification and paginated.

        Query parameters: ``ruleClassifyId``, ``page``, ``pageSize``.

        * With none of them, the full rule list is returned directly (used by
          the rule-classification page, which has no pagination).
        * Otherwise the response is ``{"prom_rules": [...], "total": n}``.
          The list is sliced only when both ``page`` and ``pageSize`` are
          supplied; without them the whole filtered list is returned.
        """
        rule_classify_id = request.values.get("ruleClassifyId")
        page = request.values.get("page")
        pagesize = request.values.get("pageSize")
        # Rule-classification page: no filter and no pagination.
        if rule_classify_id is None and page is None and pagesize is None:
            prom_rules = self.get_prometheus_rule_by_db()
            return pretty_result(code=HttpCode.OK, data=prom_rules)

        if rule_classify_id is None or rule_classify_id == "0":
            prom_rules = self.get_prometheus_rule_by_db()
        else:
            prom_rules = self._list_rules_by_classify(int(rule_classify_id))

        total = len(prom_rules)
        if page is not None and pagesize is not None:
            prom_rules = prom_rules[(int(page) - 1) * int(pagesize): int(page) * int(pagesize)]
        return pretty_result(code=HttpCode.OK, data={"prom_rules": prom_rules, "total": total})

    def post(self):
        """Create a rule, then regenerate and deploy the rule file."""
        prometheus_rule = request.get_json(force=True)
        # 1. Store the rule in the database.
        self.save_prometheus_rule_to_db(prometheus_rule=prometheus_rule)
        self.reload_prometheus_conf(prometheus_rule=prometheus_rule)
        return pretty_result(code=HttpCode.OK)

    def put(self):
        """Replace an existing rule, then regenerate and deploy the rule file."""
        prometheus_rule = request.get_json(force=True)
        self.update_prometheus_rule_by_db(prometheus_rule=prometheus_rule)
        self.reload_prometheus_conf(prometheus_rule=prometheus_rule)
        return pretty_result(code=HttpCode.OK)

    def delete(self):
        """Delete a rule, then regenerate and deploy the rule file.

        The JSON request body is the rule id itself. The deleted row's
        attributes are passed on because its ``datasource`` is needed to find
        the hosts to reload.
        """
        prometheus_rule_id = request.get_json(force=True)
        prometheus_rule = self.delete_prometheus_rule(prometheus_rule_id=int(prometheus_rule_id))
        self.reload_prometheus_conf(prometheus_rule=prometheus_rule)
        return pretty_result(code=HttpCode.OK)
#!/usr/bin/env python
# -*- coding: UTF-8 -*-

import os
import shutil
import traceback

from flask_restful import Resource

from kevin.log import logger

from config import Config
from data_entity import HostInfo


class MonitorMixin(Resource):
    """Shared helpers for views that manage files on monitoring hosts."""

    @staticmethod
    def _get_host_info(ip):
        """Build the SSH connection details for the monitoring host ``ip``.

        The credentials come from ``Config`` and the SSH port is fixed at 22.
        """
        host_info = HostInfo(
            hostname=ip,
            username=Config.MONITOR_USERNAME,  # user name on the Prometheus host
            password=Config.MONITOR_PASSWORD,  # password on the Prometheus host
            port=22
        )
        return host_info

    @staticmethod
    def clear_conf_tmp_dir() -> None:
        """Remove the project's ``tmp`` directory and everything in it.

        Best effort: any failure (including the directory not existing) is
        logged and not raised, so cleanup never masks the caller's outcome.
        """
        try:
            tmp_path = os.path.join(Config.PROJECT_DIR, "tmp")
            shutil.rmtree(tmp_path)
            logger.info(f"Success clear conf tmp dir: {tmp_path}")
        except Exception as e:
            logger.error(f"Failed to clear conf tmp dir, error: {e}, {traceback.format_exc()}")

    @staticmethod
    def generate_conf_tmp_dir() -> None:
        """Ensure the project's ``tmp`` directory exists.

        ``exist_ok=True`` replaces the former check-then-create sequence, so
        two requests creating the directory at the same time can no longer
        fail with ``FileExistsError``.
        """
        yaml_dir = os.path.join(Config.PROJECT_DIR, "tmp")
        os.makedirs(yaml_dir, exist_ok=True)

3.接口实现

#!/usr/bin/env python
# -*- coding: UTF-8 -*-

from .v1 import *

from flask_restful import Api

# Flask-RESTful API object; every resource registered on it is served under
# the "/api/v1" prefix.
api = Api(prefix="/api/v1")
# Route /api/v1/prometheusconfigs to PrometheusView (presumably brought into
# scope by the star import from ``.v1`` -- confirm).
api.add_resource(PrometheusView, "/prometheusconfigs")


  • 23
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
要连接Prometheus并使用Python脚本进行操作,你可以使用PrometheusPython客户端库来实现。这个库提供了一些方法和函数,可以方便地与Prometheus进行交互。 首先,你需要安装`prometheus_client`库。你可以使用以下命令来安装它: ``` pip install prometheus_client ``` 安装完成后,你可以在Python脚本中导入该库,并使用其中的函数和类来连接和操作Prometheus。 下面是一个简单的示例代码,展示了如何连接Prometheus并获取指标数据: ```python from prometheus_client import start_http_server, Summary # 定义一个Summary指标 REQUEST_TIME = Summary('request_processing_seconds', 'Time spent processing request') # 启动一个HTTP服务器,用于暴露指标 start_http_server(8000) # 模拟一个请求处理函数 @REQUEST_TIME.time() def process_request(): # 处理请求的逻辑 pass # 主程序入口 if __name__ == '__main__': # 模拟处理多个请求 for i in range(10): process_request() ``` 在上面的示例中,我们首先导入了`prometheus_client`库,并定义了一个名为`REQUEST_TIME`的Summary指标。然后,我们启动了一个HTTP服务器,用于暴露指标。接下来,我们定义了一个处理请求的函数,并使用`@REQUEST_TIME.time()`装饰器来记录请求处理时间。最后,在主程序中模拟处理多个请求。 你可以通过访问`http://localhost:8000/metrics`来查看暴露的指标数据。 这只是一个简单的示例,你可以根据自己的需求使用更多的函数和类来操作Prometheus。你可以参考`prometheus_client`库的文档来了解更多详细的用法和功能。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值