Background
Some production logs cannot be emitted directly in JSON the way Nginx access logs can.
In those cases we can use Logstash to turn this "unstructured data" into "structured data".
Logstash overview
The basic architecture of Logstash is a pipeline with three stages:
Input: data collection (common plugins: stdin, file, kafka, beats, http)
Filter: data parsing/transformation (common plugins: grok, date, geoip, mutate, useragent)
Output: data output (common plugins: elasticsearch, file, stdout)
The plugin reference is on the official site: https://www.elastic.co/guide/en/logstash/current/index.html
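To make the three stages concrete, below is a minimal sketch of a complete pipeline that reads from stdin, tags each event with mutate, and prints it with the rubydebug codec; the field name "env" and its value "demo" are made up purely for illustration:
input {
  stdin {}
}
filter {
  mutate {
    #"env"/"demo" are illustrative values, not part of any standard
    add_field => { "env" => "demo" }
  }
}
output {
  stdout {
    codec => rubydebug
  }
}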
Common plugins
2. Logstash Input plugins
stdin
file
beats
kafka
2.1 stdin plugin
#Read data from standard input and write events to standard output;
[root@logstash ~]# cat /etc/logstash/conf.d/stdin_logstash.conf
input {
stdin {
type => "stdin" #自定义事件类型,可用于后续判断
tags => "stdin_type" #自定义事件tag,可用于后续判断
}
}
output {
stdout {
codec => "rubydebug"
}
}
[root@logstash ~]# echo "test" | /usr/share/logstash/bin/logstash -f /etc/logstash/conf.d/stdin_logstash.conf
#Check the result
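The rubydebug output should look roughly like the sketch below; the @timestamp and host values are placeholders and depend on when and where you run it:
{
       "message" => "test",
      "@version" => "1",
    "@timestamp" => 2020-08-31T03:00:00.000Z,
          "host" => "logstash",
          "type" => "stdin",
          "tags" => [ "stdin_type" ]
}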
2.2 file plugin
#Read data from a file and write it to standard output;
[root@logstash ~]# cat /etc/logstash/conf.d/file_logstash.conf
input {
file {
path => "/var/log/test.log"
type => "syslog"
exclude => "*.gz" #files not to watch, matched with glob syntax
start_position => "beginning" #where to start reading the file the first time: beginning or end
stat_interval => "3" #how often to check the file for updates, default 1s
}
}
output {
stdout {
codec => rubydebug
}
}
[root@logstash ~]# /usr/share/logstash/bin/logstash -f /etc/logstash/conf.d/file_logstash.conf
#Check the result
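To generate an event for the pipeline above, append a line to the monitored file from another terminal:
[root@logstash ~]# echo "file input test" >> /var/log/test.log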
2.3 beats plugin
#Receive data sent by Filebeat and write it to standard output;
[root@logstash ~]# cat /etc/logstash/conf.d/beats_logstash.conf
input {
beats {
port => 5044
}
}
output {
stdout {
codec => rubydebug
}
}
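The sending side needs a matching output.logstash section in its Filebeat config; a minimal sketch is below (the Logstash address 10.0.0.151 and the log path are assumptions for illustration; a complete Filebeat example appears in the project section later):
filebeat.inputs:
- type: log
  enabled: true
  paths:
    - /var/log/test.log        #assumed path, for illustration only
output.logstash:
  hosts: ["10.0.0.151:5044"]   #must match the beats input port above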
2.4 kafka plugin
#Read data from a Kafka topic and write it to standard output;
#Note: older examples use zk_connect/topic_id; current versions of the kafka input use bootstrap_servers (Kafka brokers, port 9092) and topics
input {
kafka {
bootstrap_servers => "kafka1:9092,kafka2:9092,kafka3:9092"
group_id => "logstash"
topics => ["apache_logs"]
consumer_threads => 16
}
}
3. Logstash Filter plugins
Grok: derive structure from unstructured data
geoip: derive geographic coordinates from an IP address
useragent: derive the operating system and device type from a request's user agent
3.1 Grok plugin
#Example: parse Nginx access log lines into structured fields;
input {
http {
port =>7474
}
}
filter {
#parse the nginx log line into structured fields
grok {
match => {
"message" => "%{COMBINEDAPACHELOG}"
}
}
}
output {
stdout {
codec => rubydebug
}
}
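With this pipeline running, you can feed it a test line over HTTP; the log line below is made up for illustration (using the public IP 8.8.8.8 also lets the same request exercise the geoip example in the next section):
curl -s -XPOST 'http://127.0.0.1:7474' -H 'Content-Type: text/plain' \
  -d '8.8.8.8 - - [30/Dec/2019:11:40:44 +0800] "GET /index.html HTTP/1.1" 200 612 "-" "Mozilla/5.0"'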
3.2 geoip plugin
#Example: use geoip on the clientip field extracted from the Nginx log to get location information;
input {
http {
port =>7474
}
}
filter {
...
#look up the clientip field to get location information
geoip {
source => "clientip"
}
...
}
output {
stdout {
codec => rubydebug
}
}
3.2.1 fields option
#The default geoip output is verbose; the fields option selects only the information you need;
input {
http {
port =>7474
}
}
filter {
...
# look up the clientip field to get location information
geoip {
source => "clientip"
fields => ["country_name","country_code2","timezone","longitude","latitude","continent_code"] # 仅提取需要获取的指标
}
...
}
output {
stdout {
codec => rubydebug
}
}
3.3 Date plugin
#Parse a date string into a date type, then use it to replace @timestamp or another specified field
match: array, specifies the date formats to match; multiple formats can be listed
target: string, the field to assign the parsed date to, default @timestamp
timezone: string, the timezone of the source date
3.3.1 date example
input {
http {
port =>7474
}
}
filter {
...
#parse a date like 30/Dec/2019:11:40:44 +0800
date {
match => ["timestamp", "dd/MMM/yyyy:HH:mm:ss Z" ]
target => "nginx_date"
timezone => "Asia/Shanghai"
}
}
output {
stdout {
codec => rubydebug
}
}
3.4 useragent plugin
#useragent plugin: parse the request's user-agent field into browser, device, operating system and other information;
#Example:
input {
http {
port =>7474
}
}
filter {
...
#take the agent field and parse it
useragent {
source => "agent" #which field to read the user agent string from
target => "useragent" #new field to store the parsed result
}
}
output {
stdout {
codec => rubydebug
}
}
3.5 mutate plugin
mutate performs field operations such as type conversion, deletion, replacement and renaming;
remove_field: delete fields
split: split a string
add_field: add fields
convert: convert field types
gsub: replace text within a string (see the sketch after 3.5.4)
rename: rename a field (see the sketch after 3.5.4)
3.5.1 remove_field
#mutate removes useless fields, e.g. headers, message, agent
input {
http {
port =>7474
}
}
filter {
...
#mutate remove operation
mutate {
remove_field => ["headers", "message", "agent"]
}
...
}
output {
stdout {
codec => rubydebug
}
}
3.5.2 split
split splits a string; here | is used as the field separator.
Example log line: 5607|place order|2020-08-31
input {
http {
port =>7474
}
}
filter {
mutate {
#field separator
split => { "message" => "|" }
}
}
output {
stdout {
codec => rubydebug
}
}
3.5.3 add_field
#Give the split values their own field names, which makes statistics and analysis easier
Example:
input {
http {
port =>7474
}
}
filter {
mutate {
#field separator
split => { "message" => "|" }
#copy the split values into named fields
add_field => {
"UserID" => "%{[message][0]}"
"Action" => "%{[message][1]}"
"Date" => "%{[message][2]}"
}
}
}
output {
stdout {
codec => rubydebug
}
}
3.5.4 convert
#convert changes field types; integer, float, string and other types are supported;
Example:
input {
http {
port =>7474
}
}
filter {
mutate {
#field separator
split => { "message" => "|" }
#copy the split values into named fields
add_field => {
"UserID" => "%{[message][0]}"
"Action" => "%{[message][1]}"
"Date" => "%{[message][2]}"
}
#convert the types of the newly added fields
convert => {
"UserID" => "integer"
"Action" => "string"
"Date" => "string"
}
#remove useless fields
remove_field => ["headers", "message"]
}
}
output {
stdout {
codec => rubydebug
}
}
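gsub and rename were listed above but have no example of their own; the sketch below shows both, continuing with the fields created in 3.5.3 (replacing "-" with "/" in Date and the new name user_id are illustrative choices only):
filter {
mutate {
#gsub takes triples of field, pattern, replacement: replace every "-" in Date with "/"
gsub => ["Date", "-", "/"]
#rename the UserID field to user_id
rename => { "UserID" => "user_id" }
}
}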
4. Logstash Output plugins
stdout
file
elasticsearch
4.1 stdout plugin
#The stdout plugin writes events to the terminal, which is handy for debugging;
output {
stdout {
codec => rubydebug
}
}
4.2 file plugin
#Write output to a file, so logs scattered across many hosts can be gathered in one place
output {
file {
path => "/var/log/web.log"
}
}
4.3 elasticsearch plugin
#Write output to Elasticsearch; this is the most commonly used output plugin;
output {
elasticsearch {
hosts => ["172.16.1.162:9200","172.16.1.163:9200"] #一般写data地址
index => "nginx-%{+YYYY.MM.dd}" #索引名称
template_overwrite => true #覆盖索引模板
}
}
Projects
1. Collect Nginx logs into ES
#Filebeat
[root@web01 ~]# cat /etc/filebeat/nginx_logstash.yml
filebeat.inputs:
- type: log
enabled: true
paths:
- /var/log/nginx/access.log
tags: ["nginx-access"]
- type: log
enabled: true
paths:
- /var/log/nginx/error.log
tags: ["nginx-error"]
output.logstash:
hosts: ["10.0.0.151:5044"]
#Logstash
[root@logstash ~]# cat /etc/logstash/conf.d/nginx_logstash.conf
input {
beats {
port => 5044
}
}
filter {
if "nginx-access" in [tags][0]{
grok {
match => { "message" => "%{IPORHOST:clientip} %{USER:ident} %{USER:auth} \[%{HTTPDATE:timestamp}\] \"(?:%{WORD:verb} %{NOTSPACE:request}(?: HTTP/%{NUMBER:httpversion})?|%{DATA:rawrequest})\" %{NUMBER:response} (?:%{NUMBER:bytes}|-) %{QS:referrer} %{QS:useragent}" }
}
date {
match => ["timestamp", "dd/MMM/yyyy:HH:mm:ss Z"]
target => "@timestamp"
timezone => "Asia/Shanghai"
}
geoip {
source => "clientip"
}
useragent {
source => "useragent"
target => "useragent"
}
mutate {
convert => [ "bytes", "integer" ]
remove_field => [ "message", "agent" , "input","ecs" ]
add_field => { "target_index" => "logstash-nginx-access-%{+YYYY.MM.dd}" }
}
}
else if "nginx-error" in [tags][0] {
mutate {
add_field => { "target_index" => "logstash-nginx-error-%{+YYYY.MM.dd}" }
}
}
}
output {
elasticsearch {
hosts => ["10.0.0.161:9200","10.0.0.162:9200"]
index => "%{[target_index]}"
}
}
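Once events are flowing, you can confirm that the daily indices are being created; this assumes one of the ES nodes above is reachable from the shell:
[root@logstash ~]# curl -s 'http://10.0.0.161:9200/_cat/indices/logstash-nginx-*?v'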
2. Collect the database slow_log into ES
#Filebeat
[root@web01 ~]# cat /etc/filebeat/filebeat.yml
filebeat.inputs:
- type: log
enabled: true
paths:
- /var/log/mariadb/slow.log
exclude_lines: ['^\# Time'] #drop lines we do not need
multiline.pattern: '^\# User'
multiline.negate: true
multiline.match: after
multiline.max_lines: 10000 #default maximum is 500 merged lines; adjust to your logs
output.logstash:
hosts: ["10.0.0.151:5044"]
#Logstash
[root@logstash ~]# cat /etc/logstash/conf.d/mysql_slow_logstash.conf
input {
beats {
port => 5044
}
}
filter {
mutate {
gsub => ["message","\n"," "]
}
grok {
match => {"message" => "(?m)^# User@Host: %{USER:User}\[%{USER-2:User}\] @ (?:(?<Clienthost>\S*) )?\[(?:%{IP:Client_IP})?\] # Thread_id: %{NUMBER:Thread_id:integer}\s+ Schema: (?:(?<DBname>\S*) )\s+QC_hit: (?:(?<QC_hit>\S*) )# Query_time: %{NUMBER:Query_Time}\s+ Lock_time: %{NUMBER:Lock_Time}\s+ Rows_sent: %{NUMBER:Rows_Sent:integer}\s+Rows_examined: %{NUMBER:Rows_Examined:integer} SET timestamp=%{NUMBER:timestamp}; \s*(?<Query>(?<Action>\w+)\s+.*)" }
}
date {
match => ["timestamp","UNIX","YYY-MM-dd HH:mm:ss"]
target => "@timestamp"
timezone => "Asia/Shanghai"
}
mutate {
remove_field => ["message","input","timestamp"]
convert => ["Lock_Time","float"]
convert => ["Query_Time","float"]
add_field => {"[@metadata][target_index]" => "mysql-logstash-%{+YYYY.MM.dd}"}
}
}
output {
elasticsearch {
hosts => ["10.0.0.161:9200","10.0.0.162:9200"]
index => "%{[@metadata][target_index]}"
template_overwrite => true
}
}
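For this pipeline to have anything to ship, the slow query log must be enabled on the MariaDB side; a minimal my.cnf sketch is below (the 1-second threshold is an assumed example, and the file path matches the Filebeat path above):
[mysqld]
slow_query_log = ON
slow_query_log_file = /var/log/mariadb/slow.log
long_query_time = 1    #queries slower than 1s are logged; threshold is an assumed example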