# File: pd_ng_log_stat.py
#!/usr/bin/env python
#-*- coding: utf-8 -*-
from ng_line_parser import NgLineParser
import pandas as pd
import socket
import struct
class PDNgLogStat(object):
    """Load parsed access-log records into a pandas DataFrame and compute stats.

    Each log line is handed to an ``NgLineParser`` (presumably an nginx
    access-log parser, judging by the name -- confirm against that module) and
    the resulting dicts become the rows of ``self.df``.

    ``self.df`` only exists after ``load_data()`` has been called; the
    statistics methods read it and therefore must be called afterwards.
    """

    # Text types of the running interpreter: (str, unicode) on Python 2,
    # (str, str) on Python 3. Used to recognise a single path passed as a
    # plain string without referencing the Python 2-only name ``basestring``.
    _STRING_TYPES = (type(''), type(u''))

    def __init__(self):
        # One parser instance is reused for every line of every file.
        self.ng_line_parser = NgLineParser()

    def _log_line_iter(self, pathes):
        """Lazily parse every line of the given files, yielding one dict per line.

        Args:
            pathes: a single file path string, or an iterable of file paths.

        Yields:
            Whatever ``self.ng_line_parser.to_dict()`` returns after parsing
            the current line.
        """
        # A bare string is itself iterable; without this guard the loop below
        # would treat every character of the path as a separate file name.
        if isinstance(pathes, self._STRING_TYPES):
            pathes = [pathes]

        for path in pathes:
            with open(path, 'r') as f:
                for line in f:
                    self.ng_line_parser.parse(line)
                    # NOTE(review): assumes to_dict() returns a fresh dict on
                    # each call; if the parser reuses one dict object, every
                    # row of the DataFrame would alias the last line -- confirm.
                    yield self.ng_line_parser.to_dict()

    def load_data(self, path):
        """Build ``self.df`` from the log file(s) at ``path``.

        Args:
            path: a single file path string, or an iterable of file paths.
        """
        self.df = pd.DataFrame(self._log_line_iter(path))

    def pv_day(self):
        """Count page views (rows) per day.

        Returns:
            The result of aggregating the ``access_time`` column with
            ``count``, grouped by day.
        """
        # Only this column is grouped, counted and shown in the result.
        group_by_cols = ['access_time']
        # The grouping key is the first whitespace-separated token of
        # access_time. The original author describes it as the yyyy-mm-dd
        # date part -- presumably access_time looks like
        # "yyyy-mm-dd HH:MM:SS"; verify against NgLineParser.
        day_key = self.df['access_time'].map(lambda x: x.split()[0])
        pv_day_grp = self.df[group_by_cols].groupby(day_key)
        return pv_day_grp.agg(['count'])
def main():
    """Load the sample access log and print the number of page views per day."""
    file_pathes = ['www.ttmark.com.access.log']

    pd_ng_log_stat = PDNgLogStat()
    pd_ng_log_stat.load_data(file_pathes)

    # Daily PV statistics. A single-argument print(...) call produces the same
    # output under Python 2 and is also valid syntax under Python 3.
    print(pd_ng_log_stat.pv_day())
# Run the statistics only when executed as a script, not when imported.
if __name__ == "__main__":
    main()