# -*- coding: utf-8 -*-
# Weekend homework (周末作业)

import base64
import hashlib
import json
import os
import re
from pprint import pprint
from struct import *

duplication_detected_db = {}

def md5_generator(url, feature_base64_text):
    plain_text = url + feature_base64_text
    crypted_text = hashlib.md5(plain_text)
    return crypted_text.hexdigest()

def is_duplication_detected(url, feature_base64_text):

    md5_str = md5_generator(url, feature_base64_text)

    if md5_str not in duplication_detected_db:
        duplication_detected_db[md5_str] = 0
        return False
    else:
        duplication_detected_db[md5_str] += 1
        return True

def build_feature_json(feature_content):

    reg_expression_list = [r'CC3G:(.*?)&', r'MPTC:(.*?)&'] 

    index_to_file_list = ['ccon_3g', 'e_symbol']

    i = 0
    #compile the reg_expression 
    for reg in reg_expression_list:
        pattern = re.compile(reg)

        try:
            #match the source text using the complied reg_expression
            content = pattern.search(feature_content).group(1).strip(',')
            #print content
        except AttributeError:
            continue

        #split the matched string using the ,
        tag_list = content.split(',')
        #print tag_list

        print index_to_file_list[i]
        file_handler = open(index_to_file_list[i], 'r')


        #for a file
        attr_dict = {}
        for tag in tag_list:
            #print tag
            if tag is not None:
                id_num_key_value_list = tag.split('|')

                #print id_num_key_value_list
                #print id_num_key_value_list[0]
                #print id_num_key_value_list[1]

                feature_text = get_value_by_sn(int(id_num_key_value_list[0]), file_handler)

                print feature_text
                #add to a tmp dictionary
                attr_dict[feature_text] = id_num_key_value_list[1]

        json_dict = {}
        json_dict[index_to_file_list[i]] = attr_dict

        #print json_dict
        #json_str = json.dumps(json_dict)
        #pprint(json_str)

        file_handler.close()

        i += 1

    return json_dict

def get_value_by_sn(serial_num, file_handler):


    #cnt represent the record that has been read
    cnt = 0
    offset = 0
    file_handler.seek(0)

    while cnt < serial_num:

        record_len, = unpack("b", file_handler.read(1))
        cnt += 1
        offset += record_len + 1
        file_handler.seek(offset)

    record_len, = unpack("b", file_handler.read(1))
    str_text = file_handler.read(record_len)
    content = unicode(str_text, 'utf-8')

    return content 

def parseLog(in_file_path, out_file_path):

    record_count = 0
    file_handler = open(in_file_path)
    line_per_loop = 10
    json_dict = {}

    while True:
        lines = file_handler.readlines(line_per_loop)
        if not lines:
            break

        for line in lines:
            info_list = line.split('\t', 10)
            if int(info_list[5]) == 6:
                print line

                base64_plain_text = base64.b64decode(info_list[9])
                print base64_plain_text

                url_type_tag = base64_plain_text.split(',')[1].strip()
                try:
                    url_type = url_type_tag.split(':')[1].strip()
                except IndexError:
                    continue

                if url_type is not None:
                    #extract the url and base64_text: no.6 and no.9
                    if not is_duplication_detected(info_list[6], info_list[9]):
                        record_count += 1
                        json_dict = build_feature_json(base64_plain_text.split('\n')[1])
                        print json_dict
                        print "================================================"
                        #write the dictionary to the json file
                        with open(out_file_path, 'a') as f:
                            json.dump(json_dict, f)
                            f.write('\n')

    return record_count

def dir_walk(dir, file_reg, url_db):
      pattern = re.compile(file_reg)
      list = os.listdir(dir)

      for line in list:
          file_path = os.path.join(dir, line)
          if os.path.isdir(file_path):
              pass
          elif os.path:
              print line
              match = pattern.search(line)
              if match:
                  print 'match ok'
                  print line



if __name__ == "__main__":

    print parseLog('info.log.2016-06-13-15', 'json.data')
    print 'record is detected!'   
# (Removed: scraped CSDN page footer — comment box and "红包"/payment UI
# boilerplate that was captured along with the article and is not part
# of the source code.)