需求
原数据
ip username starttime stoptime inputoctets outputoctets
10.112.10.233 2019110300 1575043261.02 1575129661.02 2930141 3921287
10.112.61.13 2017010078 1575043321.05 1575129721.05 1254624 752959
10.205.1.239 2016211712 1575089821.07 1575129721.07 1633356392 147034526
10.128.213.84 2017213509 1575043262.05 1575129662.05 616043513 72367093
10.103.240.76 2018140631 1575043321.06 1575129721.06 40562383 3272866
10.108.42.98 2016010188 1575083221.11 1575129721.11 722747848 65725758
10.203.10.244 2014210476 1575117962.1 1575129662.1 1140582086 122980764
10.217.102.220 2015211904 1575100321.11 1575129721.11 4129825669 209006077
10.112.12.39 2019110252 1575084421.11 1575129721.11 5113448884 352169224
10.109.246.244 2018141080 1575043262.12 1575129662.12 2798076084 112478879
10.217.77.101 2019111386 1575126722.11 1575129722.11 27613590 31585687
10.210.54.177 2018111662 1575085021.12 1575129721.12 2490077399 4200337715
读取当前目录内所有的 txt 文件。
源数据文件的命名格式为 20191201_wired.txt(有线)和 20191201_wireless.txt(无线)。
对于编码格式为 UTF-16 的文件,先通过 Python 将其转换为 UTF-8。
然后从每行中提取需要的字段,生成新文件并写入新的文件夹(ipv4)中。
对 20191201_wireless.txt 中的无线数据进行过滤,只保留 IPv4 的数据。
#!/usr/bin/env python3
# coding=utf-8
import os
import codecs
import chardet
import time
from django.utils.encoding import smart_text
# Module-level accumulator: get_all() appends the names of the non-directory
# entries it finds to this list, and the __main__ block passes it to readAllFile().
result = []
# Directory part of this script's path; not referenced by the rest of the code shown here.
bait_type_path = os.path.dirname(__file__)
def get_all(cwd):
    """Record the names of the non-directory entries directly inside *cwd*.

    Each entry of ``cwd`` that is not a directory has its bare name (not its
    full path) appended to the module-level ``result`` list.  Directories are
    not descended into; their joined path is printed instead.

    Args:
        cwd: Path of the directory to scan.
    """
    for entry_name in os.listdir(cwd):
        entry_path = os.path.join(cwd, entry_name)
        if not os.path.isdir(entry_path):
            result.append(entry_name)
            continue
        # Sub-directory: only report it; recursing into it is not needed for now.
        print(entry_path)
def check_file_charset(file):
    """Detect the character encoding of a file.

    Args:
        file: Path of the file to inspect.

    Returns:
        The result of ``chardet.detect`` for the file's bytes (readAllFile
        reads its ``'encoding'`` key), or an empty dict when the file cannot
        be read.
    """
    try:
        with open(file, 'rb') as f:
            raw = f.read()
    except OSError as err:
        # The original ``return {}`` fallback sat after an unconditional
        # return and could never run; this makes the fallback reachable.
        print('cannot read file for charset detection > ' + str(file) + ' (' + str(err) + ')')
        return {}
    return chardet.detect(raw)
"""
遍历读取文件
并生成新文件
"""
def _format_timestamp(raw):
    """Format an epoch-seconds value as local time 'YYYY-MM-DD HH:MM:SS'."""
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(float(raw)))


def _build_row(fields, start_idx, stop_idx):
    """Build one 'ip|username|start|stop' output line from byte-string fields."""
    return '|'.join((
        fields[0].decode(),
        fields[1].decode(),
        _format_timestamp(fields[start_idx].decode()),
        _format_timestamp(fields[stop_idx].decode()),
    )) + '\n'


def _convert_to_utf8(filepath):
    """Rewrite *filepath* in place as UTF-8 when its detected encoding differs."""
    f_type = check_file_charset(filepath)
    encoding = f_type.get('encoding') if f_type else None
    if not encoding or encoding.lower() == 'utf-8':
        return
    try:
        # codecs.open with an encoding already returns decoded text, so no
        # further text conversion is needed before writing it back.
        with codecs.open(filepath, 'rb', encoding) as f:
            content = f.read()
        with codecs.open(filepath, 'wb', 'utf-8') as f:
            f.write(content)
    except (OSError, LookupError, UnicodeError) as err:
        # Conversion stays best-effort, but the failure is reported
        # instead of being silently swallowed.
        print('convert to utf-8 failed > ' + filepath + ' (' + str(err) + ')')


def readAllFile(result):
    """Convert every ``.txt`` file named in *result* into ``ipv4/<name>``.

    For each name ending in ``.txt`` (resolved against the current working
    directory) the file is first re-encoded to UTF-8 in place when the
    detected encoding differs, then read line by line with the first line
    skipped as a header.  Output lines have the form
    ``ip|username|start|stop`` with both timestamps rendered as local time.

    * Names containing ``wireless``: only rows with exactly 8 fields whose
      first field is at most 20 bytes long are kept; start/stop are fields
      4 and 5.
    * Names containing ``wired``: start/stop are fields 2 and 3; rows with
      fewer than 4 fields (e.g. blank lines) are skipped.

    Args:
        result: Iterable of file names located in the current working directory.
    """
    print(result)
    base_dir = os.getcwd()
    out_dir = os.path.join(base_dir, 'ipv4')
    # The output folder may not exist yet on a first run.
    os.makedirs(out_dir, exist_ok=True)
    for i in result:
        if not i.endswith('.txt'):
            continue
        filepath = os.path.join(base_dir, i)
        # Re-encode before opening the source for reading, so the read handle
        # never refers to a file that is truncated and rewritten underneath it.
        _convert_to_utf8(filepath)
        is_wireless = 'wireless' in i
        is_wired = not is_wireless and 'wired' in i
        with open(filepath, 'rb') as oldfile, \
                open(os.path.join(out_dir, i), 'w', encoding='utf-8') as newfile:
            # Skip the header line; the default avoids StopIteration on an empty file.
            next(oldfile, None)
            # Stream line by line instead of loading the whole file into memory.
            for line in oldfile:
                fields = line.split()
                if is_wireless:
                    # NOTE(review): the length test is the original heuristic for
                    # dropping non-IPv4 rows; a short IPv6 address would still
                    # pass it - confirm whether a stricter check is wanted.
                    if len(fields) == 8 and len(fields[0]) <= 20:
                        newfile.write(_build_row(fields, 4, 5))
                elif is_wired:
                    if len(fields) >= 4:
                        newfile.write(_build_row(fields, 2, 3))
        print('translating ok > ' + i)
if __name__ == "__main__":
    # Script entry point: collect the file names found in the current working
    # directory into ``result``, convert them, then report completion.
    working_dir = os.getcwd()
    get_all(working_dir)
    readAllFile(result)
    print("ALL OK!! in dir ipv4")
def time_test():
    """Print the current timestamp in three forms.

    Prints the raw epoch timestamp, the corresponding local ``struct_time``
    and the ``'%Y-%m-%d %H:%M:%S'`` formatted string, each on its own line
    behind a label.
    """
    now_stamp = time.time()
    local_struct = time.localtime(now_stamp)
    formatted = time.strftime('%Y-%m-%d %H:%M:%S', local_struct)
    labelled_values = (
        ('currentTimeStamp:', now_stamp),
        ('time_local:', local_struct),
        ('time_YmdHMS:', formatted),
    )
    for label, value in labelled_values:
        print(label, value)
程序优化:随着待处理的文件增大(数据量达到 10 GB),需要 Python 支持大文件的逐行读取与处理,优化后的代码如下。
去掉了自动识别编码格式的步骤,改为手动指定源文件的编码格式后再转为 UTF-8,因为由程序自动识别编码格式太慢了。
#!/usr/bin/env python3
# coding=utf-8
import os
import codecs
import chardet
import time
from django.utils.encoding import smart_text
from chardet.universaldetector import UniversalDetector
# Module-level accumulator: get_all() appends the names of the non-directory
# entries it finds to this list, and the __main__ block passes it to readAllFile().
result = []
# Directory part of this script's path; not referenced by the rest of the code shown here.
bait_type_path = os.path.dirname(__file__)
def get_all(cwd):
    """Record the names of the non-directory entries directly inside *cwd*.

    Each entry of ``cwd`` that is not a directory has its bare name (not its
    full path) appended to the module-level ``result`` list.  Directories are
    not descended into; their joined path is printed instead.

    Args:
        cwd: Path of the directory to scan.
    """
    for entry_name in os.listdir(cwd):
        entry_path = os.path.join(cwd, entry_name)
        if not os.path.isdir(entry_path):
            result.append(entry_name)
            continue
        # Sub-directory: only report it; recursing into it is not needed for now.
        print(entry_path)
def check_file_charset(file):
    """Detect the character encoding of a (possibly very large) file.

    The file is fed to chardet's ``UniversalDetector`` one line at a time and
    reading stops as soon as the detector reports it is done.  The path and
    the detection result are printed.

    Args:
        file: Path of the file to inspect (a ``str``; it is concatenated
            into the progress message).

    Returns:
        The detector's ``result`` attribute.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(file, 'rb') as bigdata:
        print('read charset from >' + file)
        detector = UniversalDetector()
        try:
            # Iterate the file object itself: readlines() would load every
            # line into memory at once, defeating the incremental detection.
            for line in bigdata:
                detector.feed(line)
                if detector.done:
                    break
        finally:
            detector.close()
    print(detector.result)
    return detector.result
"""
遍历读取文件
并生成新文件
"""
def _format_timestamp(raw):
    """Format an epoch-seconds value as local time 'YYYY-MM-DD HH:MM:SS'."""
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(float(raw)))


def _build_row(fields, start_idx, stop_idx):
    """Build one 'ip|username|start|stop' output line from text fields."""
    return '|'.join((
        fields[0],
        fields[1],
        _format_timestamp(fields[start_idx]),
        _format_timestamp(fields[stop_idx]),
    )) + '\n'


def readAllFile(result, encoding='ascii'):
    """Convert every ``.txt`` file named in *result* into ``ipv4/<name>``.

    Each source file (resolved against the current working directory) is
    streamed line by line, so arbitrarily large files can be processed.
    Rows with fewer than two fields and rows whose first field is ``ip``
    (the header) are ignored.  Output lines have the form
    ``ip|username|start|stop`` with both timestamps rendered as local time.

    * Names containing ``wireless``: only rows with exactly 8 fields whose
      first field is at most 20 characters long are kept; start/stop are
      fields 4 and 5.
    * Names containing ``wired``: only rows with exactly 6 fields are kept;
      start/stop are fields 2 and 3.

    Args:
        result: Iterable of file names located in the current working directory.
        encoding: Encoding used to decode the source files.  Defaults to
            ``'ascii'``, the value that was previously hard-coded.
    """
    print(result)
    base_dir = os.getcwd()
    out_dir = os.path.join(base_dir, 'ipv4')
    # The output folder may not exist yet on a first run.
    os.makedirs(out_dir, exist_ok=True)
    for i in result:
        if not i.endswith('.txt'):
            continue
        is_wireless = 'wireless' in i
        is_wired = 'wired' in i
        in_path = os.path.join(base_dir, i)
        out_path = os.path.join(out_dir, i)
        with open(in_path, 'r', encoding=encoding) as fr, \
                open(out_path, 'w', encoding='utf-8') as newfile:
            # No per-row debug print here: echoing every row of a
            # multi-gigabyte input dominates the run time.
            for data in fr:
                fields = data.split()
                if len(fields) <= 1 or fields[0] == 'ip':
                    continue
                # NOTE(review): the length test is the original heuristic for
                # dropping non-IPv4 rows; a short IPv6 address would still
                # pass it - confirm whether a stricter check is wanted.
                if is_wireless and len(fields) == 8 and len(fields[0]) <= 20:
                    newfile.write(_build_row(fields, 4, 5))
                if is_wired and len(fields) == 6:
                    newfile.write(_build_row(fields, 2, 3))
if __name__ == "__main__":
    # Script entry point: collect the file names found in the current working
    # directory into ``result``, convert them, then report completion.
    working_dir = os.getcwd()
    get_all(working_dir)
    readAllFile(result)
    print("ALL OK!! in dir ipv4")
def time_test():
    """Print the current timestamp in three forms.

    Prints the raw epoch timestamp, the corresponding local ``struct_time``
    and the ``'%Y-%m-%d %H:%M:%S'`` formatted string, each on its own line
    behind a label.
    """
    now_stamp = time.time()
    local_struct = time.localtime(now_stamp)
    formatted = time.strftime('%Y-%m-%d %H:%M:%S', local_struct)
    labelled_values = (
        ('currentTimeStamp:', now_stamp),
        ('time_local:', local_struct),
        ('time_YmdHMS:', formatted),
    )
    for label, value in labelled_values:
        print(label, value)