安装
服务端性能监控最佳实践(一)—— 炫酷的Nginx请求分析监控
其中涉及的lua脚本等,github地址
不过其中的代码有问题,我fork了一份,修改后传到新地址了,具体问题见后续分析
nginx+OpenResty
Nginx 平滑升级至 OpenResty
Nginx的启动、停止与重启
bug
安装后,nginx日志报错
2021/01/23 14:38:21 [error] 85214#0: *13184 access forbidden by rule, client: 10.10.10.10, server: localhost, request: "HEAD / HTTP/1.0"
2021/01/23 14:38:21 [error] 85214#0: *13184 [lua] counter.lua:66: log(): latency=0,status=403,endpoint=nil,fullurl=nil while logging request, client: 10.10.10.10, server: localhost, request: "HEAD / HTTP/1.0"
2021/01/23 14:38:21 [error] 85214#0: *13184 [lua] prometheus.lua:317: log_error(): Wrong number of labels for nginx_http_request_duration_seconds. Expected 6, got 4 while logging request, client: 10.10.10.10, server: localhost, request: "HEAD / HTTP/1.0"
看起来是lua脚本有问题,debug看看
Mac+Idea+lua
Emmylua
IDEA+EmmyLua Lua开发环境搭建
在 mac osx 下进行 ulua 远程调试
如果电脑是 M1 芯片(Apple Silicon 架构)的 Mac,可以加上参数
-arch arm64
最终没搞定
Lua
改为用lua插件
bug fix
单步调试搞定后,发现counter.lua有问题
原代码
-- Keep local aliases of the globals and nginx API entries used by this module.
-- pcall is aliased here but not referenced anywhere in the code shown.
local pcall = pcall
local ngx = ngx
local ngx_log = ngx.log
local ngx_err = ngx.ERR
-- Module table: the functions defined on it below form the module's public API.
local _M = {}
--- Set up shared state and register the request-latency histogram.
-- Every name assigned in this function (uris, global_set, prometheus,
-- metric_latency) is written without `local`, so each one is a global;
-- _M.log() later reads the global metric_latency.
function _M.init()
uris = ngx.shared.uri_by_host
global_set = ngx.shared.global_set
-- Reset the "initted" and "looped" flags stored in global_set.
global_set:set("initted", false)
global_set:set("looped", false)
prometheus = require("prometheus").init("prometheus_metrics")
-- Six label names are declared; _M.log() passes a six-element label list to observe().
metric_latency = prometheus:histogram("nginx_http_request_duration_seconds", "HTTP request latency status", {"host", "status", "scheme", "method", "endpoint", "fullurl"})
end
--- Split a string into the non-empty pieces found between separator characters.
-- @param inputstr string to split
-- @param sep separator characters; inserted unescaped into a Lua pattern
--   character class, defaults to "%s" when nil
-- @return array of the pieces, in order of appearance
local function split(inputstr, sep)
if sep == nil then
sep = "%s"
end
-- NOTE(review): `i` is assigned without `local`, so the counter is a global variable.
local t={} ; i=1
for str in string.gmatch(inputstr, "([^"..sep.."]+)") do
t[i] = str
i = i + 1
end
return t
end
--- Derive the "endpoint" and "fullurl" values from a request URI.
-- @param request_uri URI path string
-- @return table with keys "endpoint" and "fullurl", or nil when the URI is skipped
-- NOTE(review): result_table, parts, endpoint and fullurl are assigned without
-- `local`, so all four are globals.
local function parse_fullurl(request_uri)
result_table = {}
-- "%." is an escaped dot: any URI containing a literal "." is skipped.
if string.find(request_uri, "%.") ~= nil then
return nil
end
parts = split(request_uri, "/")
-- URIs consisting of exactly one path segment are skipped.
-- NOTE(review): a URI with no segments (e.g. "/") yields zero parts, passes this
-- check and never enters the loop below, so endpoint/fullurl are not assigned
-- during this call.
if table.getn(parts) == 1 then
return nil
end
for j=1, #parts do
if(j == 1) then
-- The first segment seeds both values.
endpoint = "/"..parts[j]
fullurl = "/"..parts[j]
elseif(j <= 5) then
-- Stop at the first segment that tonumber() can convert; otherwise append it.
if tonumber(parts[j]) ~= nil then
break
end
fullurl = fullurl.."/"..parts[j]
else
-- No more than five segments are used for fullurl.
break
end
end
result_table["endpoint"] = endpoint
result_table["fullurl"] = fullurl
return result_table
end
--- Record one observation for the current request in the latency histogram.
-- Reads request data from ngx.var, derives endpoint/fullurl via parse_fullurl(),
-- and returns without recording when parse_fullurl() returns nil.
function _M.log()
local request_host = ngx.var.host
local request_uri = ngx.unescape_uri(ngx.var.uri)
local request_status = ngx.var.status
local request_scheme = ngx.var.scheme
local request_method = ngx.var.request_method
-- remote_ip and ngx_sent are read but not used below.
local remote_ip = ngx.var.remote_addr
local ngx_sent = ngx.var.body_bytes_sent
-- Falls back to 0 when upstream_response_time is nil.
local latency = ngx.var.upstream_response_time or 0
-- NOTE(review): result_table is assigned without `local` (global).
result_table = parse_fullurl(request_uri)
if result_table == nil then
return
end
-- This line is written through ngx.log with the ngx.ERR level for every recorded request.
ngx_log(ngx_err,"latency=", tonumber(latency), ",status=", request_status, ",endpoint=", result_table["endpoint"], ",fullurl=", result_table["fullurl"])
-- Six label values, in the order declared for the histogram in _M.init().
metric_latency:observe(tonumber(latency), {request_host, request_status, request_scheme, request_method, result_table["endpoint"], result_table["fullurl"]})
end
-- Export the module table.
return _M
其中有2个问题
if string.find(request_uri, "%.") ~= nil then
改为
if string.find(request_uri, "%.") == nil then
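~=在lua里表示不等于。lua的find会返回2个值(匹配的起止位置),找不到时返回nil。需要注意:模式里的 %. 匹配的是字面量的点号“.”,而不是任意字符,所以原代码的含义是“URI中包含点号(通常是静态资源)就跳过”;改成 == nil 之后含义变为“URI中不包含点号就跳过”,报错虽然消失,但不带点号的普通接口请求也不再被统计。日志中报错的真正原因是:请求“/”按“/”切分后parts为空,循环没有执行,endpoint和fullurl都是nil,导致传给observe的label数量不足。更稳妥的做法是保留 ~= nil,并在parts数量不足2个时直接返回nil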
另外一个
table.getn(parts) == 1
table.getn 从 Lua 5.1 起已被废弃(Lua 5.2 中被移除),应改用长度运算符 #
#parts == 1
正确脚本
counter.lua
-- Keep local aliases of the globals and nginx API entries used by this module.
-- pcall is aliased here but not referenced anywhere in the code shown.
local pcall = pcall
local ngx = ngx
local ngx_log = ngx.log
local ngx_err = ngx.ERR
-- Module table: the functions defined on it below form the module's public API.
local _M = {}
--- Prepare shared state and create the request-latency histogram.
-- uris, global_set, prometheus and metric_latency are assigned as globals,
-- exactly as before; _M.log() looks up metric_latency by that global name.
function _M.init()
  local shared = ngx.shared
  uris = shared.uri_by_host
  global_set = shared.global_set

  -- Both flags start out false.
  for _, flag in ipairs({"initted", "looped"}) do
    global_set:set(flag, false)
  end

  -- Label order matters: _M.log() passes its label values in this order.
  local label_names = {"host", "status", "scheme", "method", "endpoint", "fullurl"}
  prometheus = require("prometheus").init("prometheus_metrics")
  metric_latency = prometheus:histogram(
    "nginx_http_request_duration_seconds",
    "HTTP request latency status",
    label_names
  )
end
--- Split a string into the non-empty pieces found between separator characters.
-- @param inputstr string to split
-- @param sep separator characters; inserted as-is into a Lua pattern character
--   class (so a class such as "%s" works), defaults to "%s" when nil
-- @return array of the pieces, in order of appearance (empty when nothing matches)
local function split(inputstr, sep)
  if sep == nil then
    sep = "%s"
  end
  local pieces = {}
  -- Appending through #pieces removes the need for a counter; the previous
  -- counter `i` was assigned without `local` and leaked into the global table.
  for piece in string.gmatch(inputstr, "([^" .. sep .. "]+)") do
    pieces[#pieces + 1] = piece
  end
  return pieces
end
--- Derive the "endpoint" and "fullurl" metric label values from a request URI.
-- "endpoint" is the first path segment; "fullurl" is built from up to the first
-- five segments and is cut off at the first segment that tonumber() can convert.
-- @param request_uri URI path string, e.g. "/api/users/42"
-- @return table {endpoint = ..., fullurl = ...}, or nil when the URI is skipped
local function parse_fullurl(request_uri)
  -- Skip any URI that contains a literal dot (e.g. "/app.js", "/logo.png").
  -- The search is plain-text, so "." is not treated as a pattern wildcard.
  -- Testing `== nil` here would invert the filter and drop every URI without a dot.
  if string.find(request_uri, ".", 1, true) ~= nil then
    return nil
  end

  local parts = split(request_uri, "/")
  -- At least two segments are required. This also covers "/" (zero segments),
  -- which would otherwise leave endpoint/fullurl nil and make the label list
  -- passed to observe() too short.
  if #parts <= 1 then
    return nil
  end

  -- Locals only: globals here could carry stale values from an earlier request.
  local endpoint = "/" .. parts[1]
  local fullurl = endpoint
  for j = 2, math.min(#parts, 5) do
    -- A numeric segment is presumably an id; stop there to limit label cardinality.
    if tonumber(parts[j]) ~= nil then
      break
    end
    fullurl = fullurl .. "/" .. parts[j]
  end

  return { endpoint = endpoint, fullurl = fullurl }
end
--- Record one observation for the current request in the latency histogram.
-- Reads request data from ngx.var, derives the endpoint/fullurl labels via
-- parse_fullurl(), and returns without recording when parse_fullurl() returns nil.
function _M.log()
  local var = ngx.var

  -- Kept local so nothing is shared through a global between requests.
  local url_labels = parse_fullurl(ngx.unescape_uri(var.uri))
  if url_labels == nil then
    return
  end

  -- $upstream_response_time is nil when no upstream was contacted, and may hold
  -- several values separated by commas/colons (or "-") when more than one
  -- upstream was tried. tonumber() on such a string returns nil, so the numeric
  -- parts are summed instead; the result is 0 when there are none.
  local latency = 0
  local raw_latency = var.upstream_response_time
  if raw_latency ~= nil then
    for piece in string.gmatch(tostring(raw_latency), "%d+%.?%d*") do
      latency = latency + tonumber(piece)
    end
  end

  local endpoint = url_labels["endpoint"]
  local fullurl = url_labels["fullurl"]

  -- NOTE(review): this is written at ngx.ERR level for every recorded request.
  ngx_log(ngx_err, "latency=", latency, ",status=", var.status, ",endpoint=", endpoint, ",fullurl=", fullurl)
  -- Label values follow the order declared for the histogram in _M.init().
  metric_latency:observe(latency, {var.host, var.status, var.scheme, var.request_method, endpoint, fullurl})
end
-- Export the module table.
return _M
请求http://xxx.xxxx.xxx.xxx:9145/metrics
正常返回了
配置prometheus + Grafana
按照文章配置好后,发现页面endpoint, host等变量有数据,但是图表没有东西
找个panel点击编辑后
发现这里的 nginx_http_request_duration_seconds:qps_by_instance_host_endpoint_fullurl_2XX ,写的job,但是grafana里没有找到对应的定义,prometheus里也没有。
google了一下,发现
Git地址
里定义了
按照这个prometheus.yml里的rule的定义,拷贝文件到服务器
再看grafana有图了
改进
抓回数据后,发现有对静态资源的请求也在日志范围内,如果想去掉静态请求,可以修改counter.lua文件,例如
-- Fragment meant to be placed inside parse_fullurl(): returns nil for requests
-- that target static resources so they are not recorded.
-- In a Lua pattern an unescaped "." matches ANY character, so a pattern such as
-- ".js" would also match "/json". The extension is therefore extracted from the
-- end of the URI and looked up in a set, and "/static/" is searched as plain text.
local static_extensions = { html = true, js = true, css = true, png = true }
local extension = string.match(request_uri, "%.(%w+)$")
if extension ~= nil and static_extensions[extension] then
  return nil
end
if string.find(request_uri, "/static/", 1, true) ~= nil then
  return nil
end