Python提取数据

Python提取数据

Python提取数据
目前实现的代码:

import re

def read_write_file(in_file_path, out_file_path, reg_expression, line_per_loop):
    """Append every line of one file that matches a regex to another file.

    Each matching line is also echoed to stdout with ``print``.

    Args:
        in_file_path: Path of the text file to scan.
        out_file_path: Path of the file matching lines are appended to
            (opened in "a" mode, so existing content is kept).
        reg_expression: Regular expression; a line is kept when
            ``re.search`` finds it anywhere in the line.
        line_per_loop: Size hint passed to ``readlines`` so the input is
            consumed in chunks instead of being loaded all at once.
    """
    # Compile before opening anything so an invalid pattern neither leaks a
    # handle nor creates an empty output file.
    pattern = re.compile(reg_expression)

    # Nested ``with`` blocks close both files even when reading/writing raises.
    with open(in_file_path) as in_file:
        with open(out_file_path, "a") as out_file:
            while True:
                lines = in_file.readlines(line_per_loop)
                if not lines:
                    break
                for line in lines:
                    if pattern.search(line):
                        # The line keeps its own newline, so the echoed
                        # output is followed by a blank line.
                        print(line)
                        out_file.write(line)




if __name__ == "__main__":
    # Copy the lines of log.txt that contain "INCLUDE" into out.txt,
    # reading the log with a size hint of 100000 per chunk.
    source_log, result_log = "log.txt", "out.txt"
    read_write_file(source_log, result_log, r'INCLUDE', 100000)

访问目录的部分

import os, sys

def dir_walk(dir, out_file, depth):
    """Recursively scan a directory tree, filtering every regular file.

    The path of each visited directory is written to ``out_file``; every
    regular file found is handed to ``read_write_file`` together with
    ``out_file`` and the pattern ``warn\\.mse\\.360\\.cn``.

    Args:
        dir: Directory to scan.
        out_file: Open, writable file object.
        depth: Depth counter; it is decremented on each recursive call.
            NOTE(review): the value is never compared against anything, so
            the recursion is not actually depth-limited.
    """
    out_file.write(dir + '\n')

    for entry in os.listdir(dir):
        file_path = os.path.join(dir, entry)
        if os.path.isdir(file_path):
            dir_walk(file_path, out_file, depth - 1)
        elif os.path.isfile(file_path):
            # Only regular files are opened; dangling symlinks and special
            # files are skipped instead of being passed to open().
            # NOTE(review): out_file (an open file object) is passed as the
            # second argument, so read_write_file must accept a file object
            # there rather than a path.
            read_write_file(file_path, out_file, r'warn\.mse\.360\.cn', 100000)


if __name__ == "__main__":
    # The context manager flushes and closes path.txt even if the walk
    # raises; the handle is named out_file to avoid shadowing a builtin.
    with open('path.txt', 'w') as out_file:
        dir_walk('E:\\git', out_file, 2)

数据部分在优盘里

最后完善的代码:

import re
import os, sys

def read_write_file(in_file_path, out_file, reg_expression, line_per_loop):
    """Write every line of a file that matches a regex to an open file.

    Each matching line is also echoed to stdout with ``print``.

    Args:
        in_file_path: Path of the file to read; callers may pass files
            from different directories.
        out_file: Already-open, writable file object that receives the
            extracted lines. It is left open for the caller to close.
        reg_expression: Regular expression selecting the lines to extract;
            a line is kept when ``re.search`` finds it anywhere in the line.
        line_per_loop: Size hint passed to ``readlines`` so the input is
            loaded in chunks rather than all at once.
    """
    # Compile first so an invalid pattern fails before the input is opened.
    pattern = re.compile(reg_expression)

    # ``with`` closes the input file even if a read or write raises.
    with open(in_file_path) as in_file:
        while True:
            lines = in_file.readlines(line_per_loop)
            if not lines:
                break
            for line in lines:
                if pattern.search(line):
                    # The line keeps its own newline, so the echoed output
                    # is followed by a blank line.
                    print(line)
                    out_file.write(line)


def dir_walk(dir, out_file, current_depth, max_depth):
    """Recursively scan a directory tree down to a maximum depth.

    The current depth and every directory entry name are printed as the
    walk proceeds. Regular files found below the starting level are passed
    to ``read_write_file``, which extracts lines matching ``INCLUDE`` into
    ``out_file``.

    Args:
        dir: Directory to search.
        out_file: File object that has already been opened for writing.
        current_depth: Depth of ``dir``; the starting directory is 0.
        max_depth: Deepest level that is still listed; a call with
            ``current_depth > max_depth`` returns without listing ``dir``.
    """
    print(current_depth)
    if current_depth > max_depth:
        return

    for entry in os.listdir(dir):
        print(entry)
        file_path = os.path.join(dir, entry)
        if os.path.isdir(file_path):
            dir_walk(file_path, out_file, current_depth + 1, max_depth)
        elif os.path.isfile(file_path) and current_depth != 0:
            # Files sitting directly in the starting directory (depth 0) are
            # skipped. Only regular files are opened; dangling symlinks and
            # special files are ignored instead of being passed to open().
            read_write_file(file_path, out_file, r'INCLUDE', 100000)

if __name__ == "__main__":
    current_dir = os.getcwd()

    # Walk the current working directory (depth 0) down to depth 3 and
    # append the extracted lines to extract_log.txt. The context manager
    # closes the log even if the walk raises part-way through.
    with open("extract_log.txt", "a") as out_file:
        dir_walk(current_dir, out_file, 0, 3)

在第0层(起始目录)中,代码只会进入子文件夹,不会处理该层的文件;在其他层次中,代码会处理所有文件并继续进入子目录,直到层数超过 max_depth 为止。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值