Python scripts
GoAccess - a genuinely good nginx log analysis tool
ngxtop - real-time nginx log analysis
goaccess - nginx log analysis tool
Apache_Nginx access log analysis script - 雷纳科斯's blog, 2013
Analyzing API response times in nginx logs with awk
log_format main '$remote_addr - $remote_user [$time_iso8601] "$request" '
                '$status $body_bytes_sent "$http_referer" '
                '"$http_user_agent" "$http_x_forwarded_for" '
                '"$upstream_addr" "$upstream_status" "$request_time"';
cat website.access.log | awk '{print $(NF)}' | awk -F "\"" '{print $2}' > a.txt
paste -d " " website.access.log a.txt > b.txt
cat b.txt | awk '($NF>1){print $6$7" "$NF}' > c.txt
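For comparison, the same slow-request filter can be written in Python. A minimal sketch, assuming the log_format above, where the quoted $request_time is the last field on each line:

#!/usr/bin/python
# List requests slower than 1 second; assumes the quoted $request_time
# is the last "..." field, per the log_format above.
with open("website.access.log") as f:
    for line in f:
        fields = line.split('"')
        try:
            request_time = float(fields[-2])
        except (IndexError, ValueError):
            continue
        if request_time > 1:
            print(line.strip())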
Analyzing and aggregating nginx logs on Linux with awk, wc, sort, uniq, and grep
b) Field meanings:
column1: ip_address
column2: log_time
column3: request
column4: status_code
column5: send_bytes
column6: referer
Requirement 1: count the total number of records, total successes, and each class of failure (404, 403, 500)
cat data.log | awk -F '\t' '{if($4 > 0) print $4}' | wc -l | awk '{print "Total Items:"$1}'
2. Extract the totals of successes and of each failure class
cat data.log | awk -F '\t' '{if($4>0 && $4==200) print $4}' | wc -l
Requirement 2: among the errors, find which URLs occur most often; deduplicate and list the counts in descending order
cat data.log | awk -F '\t' '{if($4>0 && $4==500) print $3}' | awk '{print $2}' | sort | uniq -c | sort -k1 -nr
Requirement 3: count how often each file name appears in the URLs, keeping Code and Referer in the result. Both the URL and the Referer contain / characters, which interferes with the filtering; using # as the sed delimiter avoids having to escape them:
cat data.log | awk '{print $5,$7,$9}' | grep 200 | sed 's#.*/\(.*\)#\1#' | sort -k1 | uniq -c
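The same tallies can be scripted in Python as well; a minimal sketch of requirements 1 and 2, assuming the tab-separated columns described above (data.log as in the awk examples):

#!/usr/bin/python
# One pass: total record count plus a count per status code (column 4).
total = 0
codes = {}
with open("data.log") as f:
    for line in f:
        cols = line.rstrip("\n").split("\t")
        if len(cols) < 4:
            continue
        total += 1
        codes[cols[3]] = codes.get(cols[3], 0) + 1
print("Total Items: %d" % total)
for code in ("200", "404", "403", "500"):
    print("%s: %d" % (code, codes.get(code, 0)))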
wc -l access.log | awk '{print $1}'   # total requests
awk '{print $1}' access.log | sort | uniq | wc -l   # unique IPs
awk -F'[ []' '{print $5}' access.log | sort | uniq -c | sort -rn | head -5   # requests per second, top 5 busiest seconds
awk '{print $1}' access.log | sort | uniq -c | sort -rn | head -5   # top 5 most frequent client IPs
awk '{print $7}' access.log | sort | uniq -c | sort -rn | head -5   # top 5 most requested URLs
awk '{if ($12 > 10){print $7}}' access.log | sort | uniq -c | sort -rn | head -5   # top 5 URLs with response time over 10 s
awk '{if ($13 != 200){print $13}}' access.log | sort | uniq -c | sort -rn | head -5   # top 5 non-200 status codes
Analyze the behavior of source IPs with more than 50,000 requests
awk '{print $1}' access.log|sort |uniq -c |sort -rn|awk '{if ($1 > 50000){print $2}}' > tmp.txt
for i in $(cat tmp.txt)
do
    echo $i >> analysis.txt
    echo "Access behavior statistics" >> analysis.txt
    grep $i access.log | awk '{print $6}' | sort | uniq -c | sort -rn | head -5 >> analysis.txt
    echo "Accessed endpoint statistics" >> analysis.txt
    grep $i access.log | awk '{print $7}' | sort | uniq -c | sort -rn | head -5 >> analysis.txt
    echo -e "\n" >> analysis.txt
done
If the source IPs come from a proxy server, replace the first command with one that filters on the $http_x_forwarded_for address instead:
awk '{print $NF}' access.log|sort |uniq -c |sort -rn|awk '{if ($1 > 50000){print $2}}' > tmp.txt
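The shell loop above greps the log once per high-volume IP; for large logs a single pass is cheaper. A Python sketch of the same analysis (file names and the 50,000 threshold follow the shell version):

#!/usr/bin/python
# One pass over access.log: tally requests and URLs per client IP,
# then report the top 5 URLs for every IP over the 50,000-request threshold.
counts = {}
urls = {}
with open("access.log") as f:
    for line in f:
        parts = line.split()
        if len(parts) < 7:
            continue
        ip, url = parts[0], parts[6]   # awk's $1 and $7
        counts[ip] = counts.get(ip, 0) + 1
        bucket = urls.setdefault(ip, {})
        bucket[url] = bucket.get(url, 0) + 1
for ip in counts:
    if counts[ip] > 50000:
        top5 = sorted(urls[ip].items(), key=lambda kv: kv[1], reverse=True)[:5]
        print("%s %d %s" % (ip, counts[ip], top5))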
5. Performance metrics
Concurrent connections
A client sends a request and establishes a TCP connection with the server. The total number of TCP connections the server is holding at a given moment is the concurrent connection count.
PV (page views), UV (unique visitors), unique IPs
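For example, PV and unique-IP counts fall out of one pass over the access log; a minimal sketch (true UV needs a cookie or user id, so unique IPs are the usual approximation when only the log is available):

#!/usr/bin/python
# PV = total request lines; unique IPs = distinct $remote_addr values.
pv = 0
ips = set()
with open("access.log") as f:
    for line in f:
        parts = line.split()
        if not parts:
            continue
        pv += 1
        ips.add(parts[0])
print("PV: %d, unique IPs: %d" % (pv, len(ips)))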
Case 1
ip - - [23/Mar/2017:00:17:49 +0800] "GET / HTTP/1.1" 302 0 "-" "PycURL/7.19.7"
log_format access '$http_x_real_ip - $remote_user [$time_local] "$request" '
                  '$status $body_bytes_sent "$http_referer" '
                  '"$http_user_agent" "$http_x_forwarded_for"';
192.168.21.1 - - [27/Jan/2014:11:28:53 +0800] "GET /2.php HTTP/1.1" 200 133 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1707.0 Safari/537.36" "-" 192.168.21.128 200 127.0.0.1:9000 0.119 0.119
#log_format main '$remote_addr - $remote_user [$time_local] "$request" '
# '$status $body_bytes_sent "$http_referer" '
# '"$http_user_agent" "$http_x_forwarded_for"';
$http_host: the URL (IP or domain) the user typed in the browser, e.g. 192.168.21.128
$upstream_status: upstream status, e.g. 200
$upstream_addr: backend upstream address and port, e.g. 127.0.0.1:9000
$request_time: total time spent serving the request, e.g. 0.119
$upstream_response_time: upstream response time within the request, e.g. 0.119
awk field to nginx variable mapping:
$1  = $remote_addr
$6  = $http_user_agent
$7  = $request
$9  = $status
$10 = $body_bytes_sent
$11 = $http_referer
1. Total visits
2. Total bandwidth
3. Unique visitors
4. Per-IP visit statistics
5. Per-URL visit statistics
6. Referer statistics
7. 404 statistics
8. Search engine crawler visits (Google, Baidu)
9. Search engine referer statistics (Google, Baidu)
#!/bin/bash
log_path=/home/www.centos.bz/log/access.log.1
domain="centos.bz"
email="log@centos.bz"
maketime=`date +%Y-%m-%d" "%H":"%M`
logdate=`date -d "yesterday" +%Y-%m-%d`
# Total requests
total_visit=`wc -l ${log_path} | awk '{print $1}'`
# Total bandwidth in MB ($10 is $body_bytes_sent)
total_bandwidth=`awk -v total=0 '{total+=$10}END{print total/1024/1024}' ${log_path}`
# Unique visitors (gawk's asort returns the number of array elements)
total_unique=`awk '{ip[$1]++}END{print asort(ip)}' ${log_path}`
# Top 20 IPs by page views
ip_pv=`awk '{ip[$1]++}END{for (k in ip){print ip[k],k}}' ${log_path} | sort -rn | head -20`
# Top 20 URLs by hits
url_num=`awk '{url[$7]++}END{for (k in url){print url[k],k}}' ${log_path} | sort -rn | head -20`
# Top 20 external referers (excluding our own domain)
referer=`awk -v domain=$domain '$11 !~ /http:\/\/[^\/]*'"$domain"'/{url[$11]++}END{for (k in url){print url[k],k}}' ${log_path} | sort -rn | head -20`
# Top 20 URLs returning 404
notfound=`awk '$9 == 404 {url[$7]++}END{for (k in url){print url[k],k}}' ${log_path} | sort -rn | head -20`
# Baidu and Google crawler visits ($6 is the user agent when splitting on ")
spider=`awk -F'"' '$6 ~ /Baiduspider/ {spider["baiduspider"]++} $6 ~ /Googlebot/ {spider["googlebot"]++}END{for (k in spider){print k,spider[k]}}' ${log_path}`
# Visits referred by Baidu and Google search
search=`awk -F'"' '$4 ~ /http:\/\/www\.baidu\.com/ {search["baidu_search"]++} $4 ~ /http:\/\/www\.google\.com/ {search["google_search"]++}END{for (k in search){print k,search[k]}}' ${log_path}`
echo -e "Summary\nReport generated: ${maketime}\nTotal visits: ${total_visit}\nTotal bandwidth: ${total_bandwidth}M\nUnique visitors: ${total_unique}\n\nPer-IP visits\n${ip_pv}\n\nPer-URL visits\n${url_num}\n\nReferer pages\n${referer}\n\n404 statistics\n${notfound}\n\nSpider statistics\n${spider}\n\nSearch engine referers\n${search}" | mail -s "$domain $logdate log statistics" ${email}
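To produce the report daily, the script can be scheduled from cron after log rotation, e.g. with a crontab entry such as `0 1 * * * /root/log_report.sh` (path hypothetical), so that access.log.1 already holds the previous day's traffic.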
Case 2
# tar zxvf pymongo-1.11.tar.gz
# cd pymongo-1.11
# python setup.py install
Example of connecting to MongoDB from Python:
$ cat conn_mongodb.py
#!/usr/bin/python
import pymongo
import random

conn = pymongo.Connection("127.0.0.1", 27017)
db = conn.tage                                  # connect to the tage database
db.authenticate("tage", "123")                  # authenticate
db.user.drop()                                  # drop the user collection
db.user.save({'id': 1, 'name': 'kaka', 'sex': 'male'})    # insert one document
for id in range(2, 10):
    name = random.choice(['steve', 'koby', 'owen', 'tody', 'rony'])
    sex = random.choice(['male', 'female'])
    db.user.insert({'id': id, 'name': name, 'sex': sex})  # insert a batch of documents in a loop
content = db.user.find()                        # fetch all documents
for i in content:
    print i                                     # print them
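Note that pymongo.Connection was removed in PyMongo 3.x. On a current PyMongo, the equivalent connection and authentication look roughly like this (a sketch, reusing the same hypothetical tage/123 credentials):

from pymongo import MongoClient

# MongoClient replaces the Connection/authenticate calls above in PyMongo 3.x+.
client = MongoClient("mongodb://tage:123@127.0.0.1:27017/tage")
db = client.tage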
Writing the Python script
#encoding=utf8
import re

zuidaima_nginx_log_path = "/usr/local/nginx/logs/www.zuidaima.com.access.log"
pattern = re.compile(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')

def stat_ip_views(log_path):
    ret = {}
    f = open(log_path, "r")
    for line in f:
        match = pattern.match(line)
        if match:
            ip = match.group(0)
            if ip in ret:
                views = ret[ip]
            else:
                views = 0
            views = views + 1
            ret[ip] = views
    return ret

def run():
    ip_views = stat_ip_views(zuidaima_nginx_log_path)
    max_ip_view = {}
    for ip in ip_views:
        views = ip_views[ip]
        if len(max_ip_view) == 0:
            max_ip_view[ip] = views
        else:
            _ip = max_ip_view.keys()[0]
            _views = max_ip_view[_ip]
            if views > _views:
                max_ip_view[ip] = views
                max_ip_view.pop(_ip)
        print "ip:", ip, ",views:", views
    # total number of distinct IPs
    print "total:", len(ip_views)
    # the most frequently seen IP
    print "max_ip_view:", max_ip_view

run()
Case 3
import os, re, sys, datetime

dlog = r"C:\Users\user\Desktop\data.log"
iplist = []
dict = {}
with open(dlog) as f:
    for i in f.readlines():
        ip = i.split()[0].strip()
        iplist.append(ip)
print iplist
myset = set(iplist)
# print myset
for item in myset:
    # print("the %s has found %d" % (item, iplist.count(item)))
    dict[item] = iplist.count(item)
dictr = sorted(dict.items(), key=lambda item: item[1], reverse=True)
print dictr
for i in dictr:
    print i[0], i[1]
# tu = dictr[0]
# print tu[1]   # count for the IP with the most visits

Alternatively, the per-IP counting can be written with dict.get instead of the count() loop:
dict[ip] = dict.get(ip, 0) + 1
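The standard library's collections.Counter (Python 2.7+) covers the same counting pattern directly; a minimal sketch over the iplist built above:

from collections import Counter

# Counter does the per-IP tally and the descending sort in one go.
counter = Counter(iplist)
for ip, n in counter.most_common():
    print ip, n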