61.159.245.95 - - [01/Aug/2003:15:39:39 +0800] "GET /epl.shtml HTTP/1.1" 302 251 "http://www.hao123.com/sport2/football.htm" "Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; Avant Browser [avantbrowser.com])"
该LOG文件超过10万行,如何列出最后10万行中请求次数最多的前十位IP,并显示如下结果:
358 140.113.111.105
263 36.4.216.181
200 86.209.29.99
200 51.117.24.41
200 35.51.17.62
200 245.97.190.176
199 5.209.207.67
199 238.190.2.28
199 201.91.66.208
199 139.93.146.162
LOG可以使用如下脚本来生成:
#!/bin/bash
# Print one pseudo-random dotted-quad IPv4 address, followed by a newline,
# on stdout.
# Each octet is $RANDOM / 128; bash documents RANDOM as 0..32767, so every
# octet lands in 0..255.
# The address is emitted directly by printf, so no variable (in particular
# no global IP) is created or overwritten in the caller's shell.
randomIP() {
  printf '%d.%d.%d.%d\n' \
    "$(( RANDOM / 128 ))" "$(( RANDOM / 128 ))" \
    "$(( RANDOM / 128 ))" "$(( RANDOM / 128 ))"
}
# Generate a sample access log named "test" in the current directory.
# Each pass picks an address via randomIP and a repeat count j in 1..200,
# then writes j identical request lines for that address. Passes continue
# while sum (1 + lines written so far) is <= 110000, so the file ends up
# with at least 110000 lines.
sum=1
# stdout of the whole loop is redirected once (see "done > test"): this
# truncates the file without the blank first line that `echo > test` would
# write, and avoids reopening the file for every generated line.
while (( sum <= 110000 )); do
  j=$(( RANDOM % 200 + 1 ))
  addr=$(randomIP)
  for (( i = 1; i <= j; i++ )); do
    printf '%s%s\n' "$addr" ' - - [01/Aug/2003:15:39:39 +0800] "GET /epl.shtml HTTP/1.1" 302 251 "http://www.hao123.com/"'
  done
  sum=$(( sum + j ))
done > test
三次运行的耗时如下:
358 140.113.111.105
263 36.4.216.181
200 86.209.29.99
200 51.117.24.41
200 35.51.17.62
200 245.97.190.176
199 5.209.207.67
199 238.190.2.28
199 201.91.66.208
199 139.93.146.162
real 0m0.081s
user 0m0.060s
sys 0m0.018s
[root@desktop2 test]# time (tail -n 100000 test | awk '{name[$1]+=1}END{for(i in name){print name[i]" "i}}' | sort -nr | head -n 10)
358 140.113.111.105
263 36.4.216.181
200 86.209.29.99
200 51.117.24.41
200 35.51.17.62
200 245.97.190.176
199 5.209.207.67
199 238.190.2.28
199 201.91.66.208
199 139.93.146.162
real 0m0.069s
user 0m0.047s
sys 0m0.019s
[root@desktop2 test]# time (tail -n 100000 test | awk '{name[$1]+=1}END{for(i in name){print name[i]" "i}}' | sort -nr | head -n 10)
358 140.113.111.105
263 36.4.216.181
200 86.209.29.99
200 51.117.24.41
200 35.51.17.62
200 245.97.190.176
199 5.209.207.67
199 238.190.2.28
199 201.91.66.208
199 139.93.146.162
real 0m0.081s
user 0m0.049s
sys 0m0.020s
结论:对于大量文本处理的情况,应尽量少地直接使用shell下的指令,而尽量多地使用awk来提高处理效率。