prometheus监控mtail

prometheus监控mtail

官方参考: 官方demo-.mtail、mtail-release下载

文档参考: mtail 添加histogram的一个演示beta版 tomcat 应用监控指标

mtail当它与基于时间序列的计算器和警报工具(如Prometheus )搭配使用时效果最佳。

软件版本
prometheus v2.27.1
mtail v3.0.0-rc47
grafana v7.5.9
tomcat v8.0

1、mtail

  1. 安装
tar xf mtail_3.0.0-rc47_Linux_x86_64.tar.gz -C /usr/bin/
chmod 0755 /usr/bin/mtail
mtail --version
  1. 添加配置文件

    mkdir -p /etc/mtail
    cd /etc/mtail/
    touch /etc/mtail/line_count.mtail  ##要求必须以此格式结尾
    
  2. line_count.mtail

    # Parser for the common apache log format as follow.
    # LogFormat "%h %l %u %t \"%r\" %>s %b %D \"%{Referer}i\" \"%{User-agent}i\"
    counter apache_http_requests_total by request_method, http_version, status_code
    counter apache_http_bytes_total by request_method, http_version, status_code
    gauge apache_http_response_time by remote_host, request_method, request_uri, status_code
    gauge apache_http_response_size by remote_host, request_method, request_uri, status_code
     
    histogram apache_http_request_time_millseconds_bucket buckets 5, 10, 25, 50, 100, 250, 500, 1000, 2500, 5000, 10000, 15000 by status_code
     
    /^/ +
    /(?P<remote_host>[0-9A-Za-z\.:-]+) / + # %h
    /(?P<remote_logname>[0-9A-Za-z-]+) / + # %l
    /(?P<remote_username>[0-9A-Za-z-]+) / + # %u
    /\[(?P<timestamp>\d{2}\/\w{3}\/\d{4}:\d{2}:\d{2}:\d{2} (\+|-)\d{4})\] / + # %t
    /"(?P<request_method>[A-Z]+) (?P<request_uri>\S+) (?P<http_version>HTTP\/[0-9\.]+)" / + # \"%r\"
    /(?P<status_code>\d{3}) / + # %>s
    /((?P<response_size>\d+)|-) / + # %b
    /(?P<response_time>\d+) / + # %D
    /"(?P<referer>\S+)" / + # \"%{Referer}i\"
    /"(?P<user_agent>[[:print:]]+)"/ + # \"%{User-agent}i\"
    /$/ {
      strptime($timestamp, "02/Jan/2006:15:04:05 -0700") # for tests
     
      apache_http_requests_total[$request_method][$http_version][$status_code]++
      $response_size > 0 {
          apache_http_bytes_total[$request_method][$http_version][$status_code] += $response_size
          apache_http_response_size[$remote_host][$request_method][$request_uri][$status_code] += $response_size
      }
      $response_time > 3000 {
          apache_http_response_time[$remote_host][$request_method][$request_uri][$status_code] = $response_time
          #apache_http_request_time_seconds_bucket[$remote_host][$request_method][$request_uri][$status_code] = $response_time / 1000
          apache_http_request_time_millseconds_bucket[$status_code] = $response_time
      }
    }
     
    getfilename() !~ /localhost_access_log.?txt/ {
      stop
    }
    
  3. 设置tomcat 日志格式

    vim tomcat/conf/server.xml
    pattern="%h %l %u %t &quot;%r&quot; %s %b %D &quot;%{Referer}i&quot; &quot;%{User-Agent}i&quot;" />
    
    # 与上面line_count.mtail 这个一致 LogFormat "%h %l %u %t \"%r\" %>s %b %D \"%{Referer}i\" \"%{User-agent}i\"
    
  4. 启动mtail

    mtail --progs /etc/mtail --logs='/dir/tomcat/logs/localhost_access_log.*.txt' --logtostderr
    
    # 或使用systemctl
    ]# cat mtail.service 
    [Unit]
    Description=mtail server
    After=network.target
    
    [Service]
    ExecStart=/usr/bin/mtail --progs /opt/apm/exporter/mtail --logs=/dir/tomcat/logs/localhost_access_log.*.txt --logtostderr
    ExecReload=/bin/kill -HUP $MAINPID
    TimeoutStopSec=20s
    Restart=on-failure
    RestartSec=5
    
    [Install]
    WantedBy=multi-user.target
    
  5. 启动日志

    mtail --progs /opt/apm/exporter/mtail --logs='/dir/tomact/logs/localhost_access_log.*.txt' --logtostderr
    I0901 06:07:14.800485   13629 main.go:114] mtail version 3.0.0-rc47 git revision 5e0099f843e4e4f2b7189c21019de18eb49181bf go version go1.16.5 go arch amd64 go os linux
    I0901 06:07:14.800530   13629 main.go:115] Commandline: ["mtail" "--progs" "/opt/apm/exporter/mtail" "--logs=/dir/tomact/logs/localhost_access_log.*.txt" "--logtostderr"]
    I0901 06:07:14.800950   13629 store.go:182] Starting metric store expiry loop every 1h0m0s
    I0901 06:07:14.801933   13629 checker.go:253] capture group reference `remote_username' at line_count.mtail:10:1-3 appears to be unused
    I0901 06:07:14.801949   13629 checker.go:253] capture group reference `remote_username' at line_count.mtail:10:1-3 appears to be unused
    I0901 06:07:14.801953   13629 checker.go:253] capture group reference `user_agent' at line_count.mtail:10:1-3 appears to be unused
    I0901 06:07:14.801957   13629 checker.go:253] capture group reference `referer' at line_count.mtail:10:1-3 appears to be unused
    I0901 06:07:14.801961   13629 checker.go:253] capture group reference `10' at line_count.mtail:10:1-3 appears to be unused
    I0901 06:07:14.801964   13629 checker.go:253] capture group reference `referer' at line_count.mtail:10:1-3 appears to be unused
    I0901 06:07:14.801968   13629 checker.go:253] capture group reference `remote_logname' at line_count.mtail:10:1-3 appears to be unused
    I0901 06:07:14.801971   13629 checker.go:253] capture group reference `5' at line_count.mtail:10:1-3 appears to be unused
    I0901 06:07:14.801976   13629 checker.go:253] capture group reference `user_agent' at line_count.mtail:10:1-3 appears to be unused
    I0901 06:07:14.801980   13629 checker.go:253] capture group reference `remote_logname' at line_count.mtail:10:1-3 appears to be unused
    I0901 06:07:14.802224   13629 runtime.go:188] Loaded program line_count.mtail
    I0901 06:07:14.802266   13629 runtime.go:84] unmarking line_count.mtail
    I0901 06:07:14.802822   13629 tail.go:282] Tailing /dir/tomact/logs/localhost_access_log.2021-08-10.txt
    I0901 06:07:14.802832   13629 logstream.go:61] Parsed url as /dir/tomact/logs/localhost_access_log.2021-08-31.txt
    I0901 06:07:14.802848   13629 tail.go:282] Tailing /dir/tomact/logs/localhost_access_log.2021-08-31.txt
    I0901 06:07:14.802854   13629 logstream.go:61] Parsed url as /dir/tomact/logs/localhost_access_log.2021-09-01.txt
    I0901 06:07:14.802883   13629 tail.go:282] Tailing /dir/tomact/logs/localhost_access_log.2021-09-01.txt
    I0901 06:07:14.802982   13629 mtail.go:126] Listening on [::]:3903
    
  6. 异常处理

    它会出现这个错误:invalid syntax,是因为日志格式不对, 那个得要有值才行
    请添加图片描述
    请添加图片描述

  7. 手动给它整点正常的日志

echo '127.0.0.1 - - [01/Sep/2021:06:54:13 +0000] "GET /console HTTP/1.1" 202 80 1 "-" "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"' >> /dir/tomcat/logs/localhost_access_log.2021-09-01.txt
  1. 注意:先等个几分钟然后再看

  2. 最终正常的采集效果
    请添加图片描述

2、tomcat

  • 下载

    # 最好是直接放到 tomcat的lib目录下
    cd /usr/local/prometheus
    wget https://repo1.maven.org/maven2/io/prometheus/jmx/jmx_prometheus_javaagent/0.15.0/jmx_prometheus_javaagent-0.15.0.jar
    
  • 配置

    # 模板可以直接下载: https://github.com/prometheus/jmx_exporter 官方
    #将文件下载下来放到下面文件中 /tomcat安装目录下的/conf/jmx-exporter.yaml
    
    #cat /tomcat安装目录下的/conf/jmx-exporter.yaml
    ---   
    lowercaseOutputLabelNames: true
    lowercaseOutputName: true
    rules:
    - pattern: 'Catalina<type=GlobalRequestProcessor, name=\"(\w+-\w+)-(\d+)\"><>(\w+):'
      name: tomcat_$3_total
      labels:
        port: "$2"
        protocol: "$1"
      help: Tomcat global $3
      type: COUNTER
    - pattern: 'Catalina<j2eeType=Servlet, WebModule=//([-a-zA-Z0-9+&@#/%?=~_|!:.,;]*[-a-zA-Z0-9+&@#/%=~_|]), name=([-a-zA-Z0-9+/$%~_-|!.]*), J2EEApplication=none, J2EEServer=none><>(requestCount|maxTime|processingTime|errorCount):'
      name: tomcat_servlet_$3_total
      labels:
        module: "$1"
        servlet: "$2"
      help: Tomcat servlet $3 total
      type: COUNTER
    - pattern: 'Catalina<type=ThreadPool, name="(\w+-\w+)-(\d+)"><>(currentThreadCount|currentThreadsBusy|keepAliveCount|pollerThreadCount|connectionCount):'
      name: tomcat_threadpool_$3
      labels:
        port: "$2"
        protocol: "$1"
      help: Tomcat threadpool $3
      type: GAUGE
    - pattern: 'Catalina<type=Manager, host=([-a-zA-Z0-9+&@#/%?=~_|!:.,;]*[-a-zA-Z0-9+&@#/%=~_|]), context=([-a-zA-Z0-9+/$%~_-|!.]*)><>(processingTime|sessionCounter|rejectedSessions|expiredSessions):'
      name: tomcat_session_$3_total
      labels:
        context: "$2"
        host: "$1"
      help: Tomcat session $3 total
      type: COUNTER
    
    # 大饼中需要监控用到的模板
    - pattern: 'java.lang<type=OperatingSystem><>(committed_virtual_memory|free_physical_memory|free_swap_space|total_physical_memory|total_swap_space)_size:'
      name: os_$1_bytes
      type: GAUGE
      attrNameSnakeCase: true
    - pattern: 'java.lang<type=OperatingSystem><>((?!process_cpu_time)\w+):'
      name: os_$1
      type: GAUGE
      attrNameSnakeCase: true
    
  • 配置tomcat

    # 修改文件 tomcat/bin/catalina.sh 
    JAVA_OPTS=" -javaagent:/tomcat/lib/jmx_prometheus_javaagent-0.15.0.jar=9144:/tomcat/conf/jmx-exporter.yaml"
    
    # 这个配置在tomcat8中挺好用的,centos6.9版本的tomcat7 和tomcat8 也都挺好用的。但是centos6.5的tomcat7中,不好用。需要使用下面的配置
    ]# vim tomcat/bin/catalina.sh 
    CATALINA_OPTS="$CATALINA_OPTS -javaagent:/usr/local/prometheus/jmx_prometheus_javaagent-0.13.0.jar=20000:/usr/local/prometheus/jmx-exporter.yaml"; export CATALINA_OPTS
    
    # for  win
    tomcat/bin/catalina.bat
    
  • 启动(如果用的是javaagent这里直接忽略)

    如果是java -jar的java包的话,启动命令如下:
    java -javaagent:/usr/local/prometheus/jmx_prometheus_javaagent-0.13.0.jar=20000:/usr/local/prometheus/jmx-exporter.yaml -jar yourJar.jar
    
  • 扩展(选参)

    -server -Xms1G -Xmx1G -Xmn512m 
    -XX:MetaspaceSize=512m 
    -XX:MaxMetaspaceSize=1024m 
    -Djava.awt.headless=true
    -Dfile.encoding=UTF-8
    -Dsun.jnu.encoding=UTF-8 
    -XX:+DisableExplicitGC  
    # 就是这一行,添加上对应路径
    -javaagent:/tomcat/lib/jmx_prometheus_javaagent-0.15.0.jar=9144:/tomcat/conf/jmx-exporter.yaml
    -XX:NewRatio=4
    -XX:SurvivorRatio=4
    -XX:MaxPermSize=16m
    -XX:MaxTenuringThreshold=0
    -XX:+PrintGCDetails 
    -XX:+PrintGCDateStamps 
    -XX:+PrintTenuringDistribution 
    -XX:+PrintHeapAtGC 
    -XX:+PrintReferenceGC 
    -XX:+PrintGCApplicationStoppedTime
    -XX:+PrintSafepointStatistics 
    -XX:PrintSafepointStatisticsCount=1
    -XX:+HeapDumpOnOutOfMemoryError 
    -XX:HeapDumpPath=/dir/tomcat/logs/java_heapdump.hprof
    -Xloggc:/dir/tomcat/logs/gc-%t.log
    -XX:+UseGCLogFileRotation
    -XX:NumberOfGCLogFiles=5
    -XX:GCLogFileSize=100M
    

3、prometheus

3.1、错误指标

  • apache_http_requests_total

    apache_http_requests_total{request_method="GET",status_code="200"} 3
    apache_http_requests_total{request_method="GET",status_code="404"} 66
    apache_http_requests_total{request_method="POST",status_code="200"} 19
    apache_http_requests_total{request_method="POST",status_code="404"} 535
    
    # 统计日志中,每种类型所访问的总次数
    
  • tomcat_errorcount_total

    # tomcat后台记录的一个错误总数,与 apache_http_requests_total结合,通过查看日志可以更方便快捷的处理所存在的问题
    tomcat_errorcount_total{port="9080",protocol="http-nio",} 7.0
    

3.2、请求和吞吐量

  • apache_http_request_time_millseconds_bucket

    # 总的请求响应时间 / 总的访问次数,得到平均每次的访问耗时时长
    apache_http_request_time_millseconds_bucket_sum{prog="line_count.mtail",status_code="200"} 110000       # 总的请求响应时间
    apache_http_request_time_millseconds_bucket_count{prog="line_count.mtail",status_code="200"} 22           # 总的访问次数
    
  • tomcat_processingtime_total

    # tomcat处理请求的响应时间累加值 counter类型
    topk(5,rate(tomcat_processingtime_total{protocol=~"http.*"}[5m]))
    
  • tomcat_maxtime_total

    # 最大处理时间表示服务器处理一个请求所需的最长时间(从可用线程开始处理请求到返回响应为止)。每当服务器检测到比当前maxTime更长的请求处理时间时,其值就会更新。该指标不包含有关请求,其状态或URL路径的详细信息,因此,为了更好地理解单个请求和特定类型请求的最大处理时间,您需要分析访问日志。单个请求的处理时间激增可能表明JSP页面未加载或相关的进程(例如数据库查询)花费的时间太长而无法完成。由于其中一些问题可能是由于Tomcat之外的操作引起的,因此,与组成基础结构的所有其他服务一起监视Tomcat服务器非常重要。这有助于确保您不会忽略其他对运行应用程序也至关重要的操作或过程
    
    topk(5,rate(tomcat_maxtime_total{protocol=~"http.*"}[5m]))
    
  • tomcat_requestcount_total

    # 说明 访问请求总数对其进行rate计算,得到平均每秒变化率
    topk(5,rate(tomcat_requestcount_total{protocol=~"http.*"}[5m]))
    
  • tomcat_bytesreceived_total & tomcat_bytessent_total

    # 说明 tomcat 实例发送和接收的流量
    topk(5,rate(tomcat_bytesreceived_total{protocol=~"http.*"}[5m]))
    topk(5,rate(tomcat_bytessent_total{protocol=~"http.*"}[5m]))
    

3.3、线程池

  • tomcat_threadpool_currentthreadsbusy

    # 说明 currentThreadsBusy(ThreadPool)和activeCount(Executor)指标可以告诉您当前连接器池中有多少个线程正在处理请求。当您的服务器收到请求时,如果现有线程不足以覆盖工作负载,则Tomcat将启动更多工作线程,直到达到您为池设置的最大线程数为止。该最大线程数对连接器线程池由maxThreads表示,对Executor(执行器)由maximumPoolSize表示。任何后续请求都将放入队列,直到线程可用。
    # 如果队列已满,则服务器将拒绝任何新请求,直到线程可用为止。重要的是要注意繁忙线程的数量,以确保未达到为 maxThreads 设置的值,因为如果持续达到此上限,则可能需要调整为连接器分配的最大线程数。
    # 使用监视工具,可以通过将当前线程数与繁忙线程数进行比较来计算空闲线程数。空闲线程数与忙碌线程数是微调服务器的好方法。如果服务器的空闲线程过多,则可能无法有效地管理线程池。在这种情况下,您可以降低minSpareThreads连接器的值,该值设置池中应始终可用的最小线程数(活动或空闲)。根据应用程序的流量调整此值将确保繁忙线程和空闲线程之间有适当的平衡
    
    topk(5,rate(tomcat_threadpool_currentthreadsbusy{protocol=~"http.*"}[5m]))
    
  • 现有线程

    # 说明 现有线程减去忙碌线程=空闲线程,此值设定要是看当前设定的 sparethread 默认值是否合适
    
    topk(5,sum(tomcat_threadpool_currentthreadcount{protocol=~"http.*"}-tomcat_threadpool_currentthreadsbusy{protocol=~"http.*"})by (instance,port,infra,env,customer,app))
    
  • jvm_memory_bytes_*

    # jvm  堆内存初始化值和最大值 放在总览主要是想看每个tomcat 分配的初始值和最大值
    topk(5,jvm_memory_bytes_init{area="heap"})
    topk(5,jvm_memory_bytes_max{area="heap"})
    
    topk(5,rate(jvm_memory_bytes_committed{area="heap"}[5m]))
    # 保证可用于JVM的内存量。此数量根据内存使用量而变化,并增加到为JVM设置的最大值此值为5分钟内的变化的值
    
    topk(5,jvm_memory_bytes_used{area="heap"})
    # JVM当前使用的内存量(例如,应用程序,垃圾回收)
    

3.4、session相关

  • tomcat_session_expiredsessions_total

    # 过期session会话
    sum(increase(tomcat_session_expiredsessions_total[3m])) by (instance)
    
    # 拒绝session会话
    sum(increase(tomcat_session_rejectedsessions_total[3m])) by (instance)
    
  • tomcat_servlet_requestcount_total

    topk(5,rate(tomcat_servlet_requestcount_total[3m]))
    
    tomcat_servlet_requestcount_total{module="localhost/yyyy",servlet="jsp",} 0.0
    tomcat_servlet_requestcount_total{module="localhost/xxxx",servlet="jsp",} 0.0
    
  • tomcat_servlet_errorcount_total

    topk(5, increase(tomcat_servlet_errorcount_total[3m]))
    
    tomcat_servlet_errorcount_total{module="localhost/yyyy",servlet="jsp",} 0.0
    tomcat_servlet_errorcount_total{module="localhost/xxxx",servlet="jsp",} 0.0
    
  • tomcat_servlet_processingtime_total

    topk(5, rate(tomcat_servlet_processingtime_total[3m]) / rate(tomcat_servlet_requestcount_total[3m]))
    
    tomcat_servlet_processingtime_total{module="localhost/yyyy",servlet="jsp",} 0.0
    tomcat_servlet_processingtime_total{module="localhost/xxxx",servlet="jsp",} 0.0
    

3.5、其它

  • 文件描述符

    # 使用的文件描述符,用完了会报错的,系统每打开一个文件或者建立一个连接都会耗一个fd,按照道理应该能判断并发高低。  当用完描述符, 服务就会报错了, 可以关联预警
    
    process_max_fds		# 最大文件描述符
    process_open_fds	# 已使用的
    
  • jvm_threads_state

    jvm_threads_state	#  线程状态
    
  • jvm_threads_*

    jvm_threads_current			JVM当前活跃线程数
    jvm_threads_peak			从JVM启动开始曾经活着的最大线程数
    jvm_threads_started_total 	从JVM启动开始总共启动过的线程次数
    jvm_threads_daemon			JVM当前活跃的守护线程数
    jvm_threads_deadlocked		等待获取对象监视器或可拥有的同步器处于死锁状态的 JVM 线程的周期
    jvm_threads_deadlocked_monitor		等待获取对象监视器的 JVM 线程处于死锁状态的循环
    
  • 已加载的类

    rate(jvm_gc_collection_seconds_sum{app="$app"}[5m])/rate(jvm_gc_collection_seconds_count{app="$app"}[5m])
    
    # 说明  该表达式得到5分钟内平均每次GC的耗时(GC总耗时的增长率 / GC次数的增长率)
    
  • process_start_time_seconds

    # 服务存活时间
    time() - process_start_time_seconds	
    
  • jvm_memory_pool_bytes_used

    # 可以预警
    jvm_memory_pool_bytes_used			# 使用
    jvm_memory_pool_bytes_committed		# 已提交数
    
  • jvm_gc_collection_seconds_count

# gc垃圾回收次数的平均每秒增长值
rate(jvm_gc_collection_seconds_count[5m])
  • jvm_gc_collection_seconds_sum

    # gc耗时的平均每秒增长值
    rate(jvm_gc_collection_seconds_sum[5m])
    

4、使用

  • 效果图
    请添加图片描述
    请添加图片描述

  • prometheus配置

    [root@cstest targets]# cat mtail.yml 
    - targets:
      - x.x.x.x:3903
      labels:
        app: smailserver
        job_type: smailserverLog
    
    [root@cstest targets]# cat smailserver.yml 
    - targets:
      - x.x.x.x:9144
      - x.x.x.x:9144
      labels:
        app: smailserver
        middleware: smailserver
        job_type: java
        
    # 注意  middleware 这个label是用来关联 mtail的app的, 也可以自己随便定义, 但两边一定要保持一致
    # 一个tomcat对应一个mtail
    
    prometheus.yml配置
    
      - job_name: 'mtail_exporter'
        metrics_path: /metrics
        scrape_interval: 15s
        scrape_timeout: 10s
        file_sd_configs:		# 注意,我这里使用的是 文件自动发现
        - files: ['targets/mtail.yml']
        relabel_configs:
          - source_labels: [__address__]
            target_label: __param_target
          - source_labels: [__param_target]
            target_label: instance
    
      - job_name: 'smailserver_exporter'
        metrics_path: /metrics
        scrape_interval: 15s
        scrape_timeout: 10s
        file_sd_configs:
        - files: ['targets/smailserver.yml']
    
  • 最后导入json

    导入 tomcat-jvm&日志.json
    链接:https://pan.baidu.com/s/1-i7O2VE6ogkhwMtp2ygMzw 
    提取码:eiyv
    
  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 2
    评论
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值