#coding=utf-8
import re
import os
import json
import hashlib
import base64
from struct import unpack

#maps md5(url + feature) -> number of times the same pair has been seen again
duplication_detected_db = {}
def md5_generator(url, feature_base64_text):
    plain_text = url + feature_base64_text
    crypted_text = hashlib.md5(plain_text)
    return crypted_text.hexdigest()

def is_duplication_detected(url, feature_base64_text):
    md5_str = md5_generator(url, feature_base64_text)
    if md5_str not in duplication_detected_db:
        duplication_detected_db[md5_str] = 0
        return False
    else:
        duplication_detected_db[md5_str] += 1
        return True
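
#Illustration only (not part of the original script): a quick check of the
#de-duplication helpers with a hypothetical URL and payload -- the second call
#with the same pair is reported as a duplicate.
def _demo_duplication_check():
    print is_duplication_detected('http://example.com/page', 'Zm9vYmFy')   #False
    print is_duplication_detected('http://example.com/page', 'Zm9vYmFy')   #True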
def build_feature_json(feature_content):
    reg_expression_list = [r'CC3G:(.*?)&', r'MPTC:(.*?)&']
    index_to_file_list = ['ccon_3g', 'e_symbol']
    json_dict = {}
    #pair each reg_expression with its index file
    for i, reg in enumerate(reg_expression_list):
        pattern = re.compile(reg)
        try:
            #match the source text using the compiled reg_expression
            content = pattern.search(feature_content).group(1).strip(',')
        except AttributeError:
            #this section is absent from the feature text, try the next one
            continue
        #split the matched string on ','
        tag_list = content.split(',')
        print index_to_file_list[i]
        file_handler = open(index_to_file_list[i], 'r')
        attr_dict = {}
        for tag in tag_list:
            if tag:
                #each tag looks like '<serial_num>|<value>'
                id_num_key_value_list = tag.split('|')
                feature_text = get_value_by_sn(int(id_num_key_value_list[0]), file_handler)
                print feature_text
                #add to a tmp dictionary
                attr_dict[feature_text] = id_num_key_value_list[1]
        json_dict[index_to_file_list[i]] = attr_dict
        file_handler.close()
    return json_dict
def get_value_by_sn(serial_num, file_handler):
    #cnt counts the records that have been skipped so far
    cnt = 0
    offset = 0
    file_handler.seek(0)
    #each record is a 1-byte length followed by that many bytes of utf-8 text
    while cnt < serial_num:
        record_len, = unpack("b", file_handler.read(1))
        cnt += 1
        offset += record_len + 1
        file_handler.seek(offset)
    record_len, = unpack("b", file_handler.read(1))
    str_text = file_handler.read(record_len)
    content = unicode(str_text, 'utf-8')
    return content
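
#Illustration only: get_value_by_sn() assumes each index file ('ccon_3g',
#'e_symbol') is a flat sequence of such length-prefixed records. Below is a
#minimal sketch of a writer producing that layout; the helper name and the
#127-byte cap are assumptions that follow from the signed "b" prefix, not
#code from the original post.
from struct import pack

def write_record_file(path, texts):
    with open(path, 'wb') as f:
        for text in texts:
            #each text is expected to be a unicode string
            data = text.encode('utf-8')
            if len(data) > 127:
                raise ValueError('record too long for a signed 1-byte length prefix')
            f.write(pack("b", len(data)))
            f.write(data)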
def parseLog(in_file_path, out_file_path):
    record_count = 0
    file_handler = open(in_file_path)
    #readlines() with a size hint reads the log in small batches
    line_per_loop = 10
    json_dict = {}
    while True:
        lines = file_handler.readlines(line_per_loop)
        if not lines:
            break
        for line in lines:
            info_list = line.split('\t', 10)
            if int(info_list[5]) == 6:
                print line
                base64_plain_text = base64.b64decode(info_list[9])
                print base64_plain_text
                url_type_tag = base64_plain_text.split(',')[1].strip()
                try:
                    url_type = url_type_tag.split(':')[1].strip()
                except IndexError:
                    continue
                if url_type:
                    #extract the url and the base64 text: fields no.6 and no.9
                    if not is_duplication_detected(info_list[6], info_list[9]):
                        record_count += 1
                        json_dict = build_feature_json(base64_plain_text.split('\n')[1])
                        print json_dict
                        print "================================================"
                        #append the dictionary to the json file, one object per line
                        with open(out_file_path, 'a') as f:
                            json.dump(json_dict, f)
                            f.write('\n')
    file_handler.close()
    return record_count
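
#Illustration only: parseLog() never documents the log layout, but the field
#accesses above pin it down -- tab-separated columns, column 5 holds the record
#type ('6' is the one handled), column 6 the URL used for de-duplication, and
#column 9 a base64 payload whose first line carries a ', type:...' tag and whose
#second line carries the 'CC3G:...&MPTC:...&' feature text. Every value in the
#sketch below is invented; only the positions match what the parser reads.
def _make_sample_log_line():
    payload = ('id:123, type:news, rest:ignored\n'
               'CC3G:0|val_a,1|val_b,&MPTC:0|val_c,&')
    fields = ['c0', 'c1', 'c2', 'c3', 'c4',      #columns 0-4: not inspected
              '6',                               #column 5: record type code
              'http://example.com/page',         #column 6: URL (dedup key)
              'c7', 'c8',
              base64.b64encode(payload)]         #column 9: base64 payload
    return '\t'.join(fields) + '\n'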
def dir_walk(dir, file_reg, url_db):
    pattern = re.compile(file_reg)
    file_list = os.listdir(dir)
    for line in file_list:
        file_path = os.path.join(dir, line)
        if os.path.isdir(file_path):
            pass
        elif os.path.isfile(file_path):
            print line
            match = pattern.search(line)
            if match:
                print 'match ok'
                print line
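
#Illustration only: a hypothetical call of dir_walk(); the regular expression
#is an assumption modelled on the log file name used in __main__, and url_db
#is accepted but not used by the function as written.
#    dir_walk('.', r'info\.log\.\d{4}-\d{2}-\d{2}-\d{2}', None)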
if __name__ == "__main__":
    print '%d records detected!' % parseLog('info.log.2016-06-13-15', 'json.data')