Python提取数据
python提取数据
目前实现的代码:
import re
def read_write_file(in_file_path, out_file_path, reg_expression, line_per_loop):
    """Append every line of a file that matches a regex to another file.

    Each matching line is also echoed to standard output.

    Args:
        in_file_path: Path of the text file to scan.
        out_file_path: Path of the file the matching lines are appended to.
            It is opened in "a" mode, so existing content is kept.
        reg_expression: Regular expression source. A line is kept when the
            pattern is found anywhere in it (``search``, not ``match``).
        line_per_loop: Size hint handed to ``readlines`` on every pass so a
            large input is consumed in batches. Note that ``readlines``
            treats it as an approximate size, not as a line count.
    """
    # Compile first: an invalid pattern then fails before any file is opened.
    pattern = re.compile(reg_expression)
    # The with-statement closes both files even when reading, matching or
    # writing raises part-way through.
    with open(in_file_path) as in_file, open(out_file_path, "a") as out_file:
        while True:
            lines = in_file.readlines(line_per_loop)
            if not lines:
                break
            for line in lines:
                if pattern.search(line):
                    print(line)
                    out_file.write(line)
if __name__ == "__main__":
    # Script entry point: append every line of log.txt that contains
    # "INCLUDE" to out.txt, reading the log in batches.
    read_write_file(
        in_file_path="log.txt",
        out_file_path="out.txt",
        reg_expression=r'INCLUDE',
        line_per_loop=100000,
    )
访问目录的部分
import os, sys
def dir_walk(dir, out_file, depth):
    """Recursively scan a directory tree for lines matching warn.mse.360.cn.

    The path of every directory visited is written to ``out_file`` on its
    own line, and every regular file found is handed to ``read_write_file``.

    Args:
        dir: Directory to scan.
        out_file: Open, writable file object. It receives the directory
            paths and is also forwarded as the second argument of
            ``read_write_file``, which therefore has to accept a file
            object in that position.
        depth: Decremented by one for every level of recursion. It is only
            passed along and never stops the walk, so the whole tree below
            ``dir`` is always visited.
    """
    out_file.write(dir + '\n')
    for entry in os.listdir(dir):
        file_path = os.path.join(dir, entry)
        if os.path.isdir(file_path):
            dir_walk(file_path, out_file, depth - 1)
        elif os.path.isfile(file_path):
            # Scan regular files only; other entries such as dangling
            # symlinks are skipped instead of being opened.
            read_write_file(file_path, out_file, r'warn\.mse\.360\.cn', 100000)
if __name__ == "__main__":
    # Script entry point: walk E:\git and record the results in path.txt.
    # The with-statement guarantees the output file is flushed and closed
    # whether the walk finishes or raises.
    with open('path.txt', 'w') as out_file:
        dir_walk('E:\\git', out_file, 2)
数据部分在优盘里
最后完善的代码:
import re
import os, sys
def read_write_file(in_file_path, out_file, reg_expression, line_per_loop):
    """Write every line of a file that matches a regex to an open file.

    Each matching line is also echoed to standard output.

    Args:
        in_file_path: Path of the text file to scan.
        out_file: Already-open, writable file object that receives the
            matching lines. It is left open; closing it is up to the caller.
        reg_expression: Regular expression source. A line is kept when the
            pattern is found anywhere in it (``search``, not ``match``).
        line_per_loop: Size hint handed to ``readlines`` on every pass so a
            large input is consumed in batches. Note that ``readlines``
            treats it as an approximate size, not as a line count.
    """
    # Compile first: an invalid pattern then fails before the file is opened.
    pattern = re.compile(reg_expression)
    # The with-statement closes the input file even when reading, matching
    # or writing raises part-way through.
    with open(in_file_path) as in_file:
        while True:
            lines = in_file.readlines(line_per_loop)
            if not lines:
                break
            for line in lines:
                if pattern.search(line):
                    print(line)
                    out_file.write(line)
def dir_walk(dir, out_file, current_depth, max_depth):
    """Recursively scan a directory tree for lines containing INCLUDE.

    The current depth and the name of every directory entry are printed as
    the walk proceeds. Regular files are handed to ``read_write_file``,
    except at depth 0 where only sub-directories are followed.

    Args:
        dir: Directory to scan.
        out_file: Open, writable file object forwarded to
            ``read_write_file`` to collect the matching lines.
        current_depth: Depth of ``dir`` in the walk; the top-level call
            passes 0 and each recursion adds one.
        max_depth: Deepest level that is still listed. A call whose
            ``current_depth`` exceeds it prints the depth and returns.
    """
    print(current_depth)
    if current_depth > max_depth:
        return
    for entry in os.listdir(dir):
        print(entry)
        file_path = os.path.join(dir, entry)
        if os.path.isdir(file_path):
            dir_walk(file_path, out_file, current_depth + 1, max_depth)
        elif current_depth != 0 and os.path.isfile(file_path):
            # Scan regular files only; other entries such as dangling
            # symlinks are skipped instead of being opened.
            read_write_file(file_path, out_file, r'INCLUDE', 100000)
if __name__ == "__main__":
    # Script entry point: walk the current working directory (up to depth 3)
    # and append every matching line to extract_log.txt.
    current_dir = os.getcwd()
    # The with-statement closes the log even if the walk raises, so lines
    # already written are not lost in an unflushed buffer.
    with open("extract_log.txt", "a") as out_file:
        dir_walk(current_dir, out_file, 0, 3)
在第 0 层(即起始目录)中,代码只会进入子文件夹,不会处理该层的文件;在其余各层中,代码会处理所有文件并继续遍历子目录,直到层数超过 max_depth 为止。