python刷新cdn_使用Python解析阿里云CDN日志

最新推荐文章于 2023-09-14 11:27:29 发布

weixin_39840635

最新推荐文章于 2023-09-14 11:27:29 发布

阅读量483

点赞数

文章标签： python刷新cdn

本文链接：https://blog.csdn.net/weixin_39840635/article/details/113508049

版权

本文介绍如何利用Python解析阿里云CDN的日志文件。通过定义正则表达式模式，将每条日志记录转换为字典对象，然后进一步转换成JSON格式，便于分析和处理CDN的日志数据。

摘要由CSDN通过智能技术生成

某些原因，一开始没有设计网站的统计模块

如今需要加上，只能借助于百度统计或者阿里云的cdn日志文件，阿里云cdn的日志文件是web的访问信息

log

[9/Mar/2016:00:00:16 +0800] 222.171.7.89 - 62113 "http://cloud.insta360.com/post/5e7b029d8ed7e3c4b23006a71bab73c8?e=true&m=true" "GET http://cloud.insta360.com/public/media/mp4/5e7b029d8ed7e3c4b23006a71bab73c8_960x480.mp4" 206 509 20516390 HIT "Mozilla/5.0 (iPhone; CPU iPhone OS 8_4_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Mobile/12H321 NewsApp/5.3.2" "video/mp4"

fields

时间

访问IP

回源IP

responsetime

referer

method

访问URL

httpcode

requestsize

responsesize

cache命中状态

UA头

文件类型

# 将单条记录转换为Dict对象

def line2dict(line):
    """Parse one Aliyun CDN access-log line into a dict of named fields.

    Args:
        line: A single log record. Fields are whitespace-separated in the
            order: bracketed timestamp, client IP, origin IP, response time,
            quoted referer, quoted request, HTTP status, request size,
            response size, cache status, quoted user agent, quoted content
            type. Trailing whitespace (including the newline) is ignored.

    Returns:
        A dict of strings keyed by ``time``, ``res_ip``, ``origin_ip``,
        ``res_time``, ``referer``, ``req_url``, ``http_code``, ``req_size``,
        ``res_size``, ``cache_status``, ``ua`` and ``content_type``.

    Raises:
        ValueError: If ``line`` does not match the expected layout.
    """
    # Snippet, thanks to http://www.seehuhn.de/blog/52
    parts = [
        r'\[(?P<time>.+)\]',          # timestamp, without the brackets
        r'(?P<res_ip>\S+)',           # client (visitor) IP
        r'(?P<origin_ip>\S+)',        # back-to-origin IP, '-' when absent
        r'(?P<res_time>[0-9]+)',      # response time
        r'"(?P<referer>.*)"',         # referer, may be empty
        r'"(?P<req_url>.+)"',         # request: method followed by the URL
        r'(?P<http_code>[0-9]+)',     # HTTP status code
        r'(?P<req_size>\S+)',         # request size (careful, can be '-')
        r'(?P<res_size>[0-9]+)',      # response size
        r'(?P<cache_status>\S+)',     # cache hit status, e.g. HIT
        r'"(?P<ua>.*)"',              # user agent
        r'"(?P<content_type>.*)"',    # content type
    ]
    pattern = re.compile(r'\s+'.join(parts) + r'\s*\Z')
    m = pattern.match(line)
    if m is None:
        # Fail with a clear message instead of AttributeError on None.
        raise ValueError("Unrecognized CDN log line: %r" % (line,))
    return m.groupdict()

script

AliyunLog.py

# coding=utf-8

import fileinput

import re

import os

try:

import simplejson as json

except ImportError:

import json

# 读取输入文件并返回Dict对象

def readfile(file):
    """Read a CDN log file and parse every non-blank line.

    Args:
        file: Path of the log file to read.

    Returns:
        A dict mapping each parsed line's 1-based line number to the dict
        returned by ``line2dict`` for that line. Blank lines are counted in
        the numbering but not stored. The dict is empty when the file is
        smaller than 150 bytes; a notice is printed in that case.
    """
    filecontent = {}
    # Just an estimate: a single entry is assumed to hold at least 150 chars.
    if os.stat(file).st_size < 150:
        print("Not a valid log file. It does not have enough data")
        return filecontent
    # The context manager closes the file even if a line fails to parse.
    with open(file) as handle:
        for index, line in enumerate(handle, 1):
            if line.strip():  # skip blank lines, including '\r\n'
                filecontent[index] = line2dict(line)
    return filecontent

# 将单条记录转换为Dict对象

# Compiled once at import time: line2dict runs for every line of the log.
# Snippet, thanks to http://www.seehuhn.de/blog/52
_LOG_LINE_RE = re.compile(
    r'\s+'.join([
        r'\[(?P<time>.+)\]',          # timestamp, without the brackets
        r'(?P<res_ip>\S+)',           # client (visitor) IP
        r'(?P<origin_ip>\S+)',        # back-to-origin IP, '-' when absent
        r'(?P<res_time>[0-9]+)',      # response time
        r'"(?P<referer>.*)"',         # referer, may be empty
        r'"(?P<req_url>.+)"',         # request: method followed by the URL
        r'(?P<http_code>[0-9]+)',     # HTTP status code
        r'(?P<req_size>\S+)',         # request size (careful, can be '-')
        r'(?P<res_size>[0-9]+)',      # response size
        r'(?P<cache_status>\S+)',     # cache hit status, e.g. HIT
        r'"(?P<ua>.*)"',              # user agent
        r'"(?P<content_type>.*)"',    # content type
    ]) + r'\s*\Z'
)


def line2dict(line):
    """Convert a single CDN log record into a dict of named string fields.

    Args:
        line: One log line; trailing whitespace and the newline are ignored.

    Returns:
        A dict keyed by ``time``, ``res_ip``, ``origin_ip``, ``res_time``,
        ``referer``, ``req_url``, ``http_code``, ``req_size``, ``res_size``,
        ``cache_status``, ``ua`` and ``content_type``; all values are strings.

    Raises:
        ValueError: If ``line`` does not match the expected layout.
    """
    match = _LOG_LINE_RE.match(line)
    if match is None:
        # Fail with a clear message instead of AttributeError on None.
        raise ValueError("Unrecognized CDN log line: %r" % (line,))
    return match.groupdict()

# 转换整个记录为Json对象

def toJson(file):
    """Return the parsed records of the log file at ``file`` as JSON text.

    The records come from ``readfile`` and are encoded with a four-space
    indent.
    """
    encoder = json.JSONEncoder(indent=4)
    return encoder.encode(readfile(file))

main.py

#!/usr/bin/env python

# coding=utf-8

import sys

from AliyunLog import *

def main():
    """Command-line entry point: ``python main.py -f <file>``.

    Checks the command-line arguments, then prints the JSON text returned
    by ``toJson`` for the given log file. Exits with status 2 when fewer
    than two arguments are given, the switch is not ``-f``, or the path is
    not an existing file.
    """
    # Imported locally: this script otherwise only reaches ``os`` through
    # the star import from AliyunLog.
    import os

    if len(sys.argv) < 3:
        print("Incorrect Syntax. Usage: python main.py -f <file>")
        sys.exit(2)
    if sys.argv[1] != "-f":
        print("Invalid switch '" + sys.argv[1] + "'")
        sys.exit(2)
    if not os.path.isfile(sys.argv[2]):
        print("File does not exist")
        sys.exit(2)

    print(toJson(sys.argv[2]))

# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()

result

run script

python main.py -f data

terminal

{

"6432": {

"res_time": "1728",

"res_ip": "118.114.213.118",

"req_size": "768",

"req_url": "GET http://cloud.insta360.com/public/media/mp4/f9e4bf15d452440c2884b234854d089c_audio.mp3",

"origin_ip": "-",

"referer": "http://cloud.insta360.com/post/f9e4bf15d452440c2884b234854d089c?m=true&from=timeline&isappinstalled=0",

"content_type": "audio/mpeg",

"time": "9/Mar/2016:00:59:58 +0800",

"ua": "Mozilla/5.0 (iPhone; CPU iPhone OS 9_2 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Mobile/13C75 MicroMessenger/6.3.13 NetType/WIFI Language/zh_CN",

"http_code": "206",

"res_size": "5290084",

"cache_status": "HIT"

},

...

}

weixin_39840635

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫