使用正则表达式提取所需的文件
利用Python 正则表达式来提取 *.gz后缀的文件。
为此，本人编写了一个类，用于查找并读取这些文件：
import os
import numpy as np
import gzip
import re
import argparse
class function(object):
    """Locate files by name pattern and suffix, and read gzip-compressed files.

    The two methods are independent of instance state: both receive their
    inputs as explicit arguments.
    """

    # A file name qualifies when it contains, somewhere inside it, five
    # non-digit characters followed by six digits and at least one more
    # non-digit. Compiled once here instead of once per file.
    _NAME_PATTERN = re.compile(r"\D{5}\d{6}\D+")

    def __init__(self):
        # Placeholder values; no method in this class reads them.
        self.path = "xxx"
        self.cond = "xxx"

    def load_gz_file(self, path):
        """Return the decompressed contents of the gzip file at *path*.

        Args:
            path: Location of the gzip file to read.

        Returns:
            The decompressed data as ``bytes``. If the file does not exist,
            a notice is printed and empty ``bytes`` is returned, so the
            result has the same type on both paths.
        """
        # Open first and handle the failure, rather than checking existence
        # beforehand: the file could disappear between a check and the open.
        try:
            with gzip.open(path, 'rb') as fp:
                return fp.read()
        except FileNotFoundError:
            print('File not exist! continue')
            return b""

    def load_files_by_condition(self, path, cond):
        """Collect files under *path* whose names match the pattern and suffix.

        The directory tree rooted at *path* is walked recursively. A file is
        selected when its name ends with *cond* and the class name pattern
        matches somewhere in it. The list of pattern matches is printed for
        every selected file.

        Args:
            path: Root directory to search.
            cond: Required file-name suffix (for example ``".gz"``), passed
                to ``str.endswith``.

        Returns:
            A list of full paths (directory joined with file name) of the
            selected files, in the order ``os.walk`` yields them.
        """
        paths = []
        for root, _dirs, files in os.walk(path):
            for file in files:
                # Cheap suffix test first; skip the regex for other files.
                if not file.endswith(cond):
                    continue
                re_match = self._NAME_PATTERN.findall(file)
                if re_match:
                    paths.append(os.path.join(root, file))
                    print(re_match)
        return paths
if __name__ == '__main__':
    # Command-line entry point: find every file under --parent_path whose
    # name matches the class's pattern and ends with --condition, load each
    # one as a gzip file, and print its path.
    parser = argparse.ArgumentParser(description='Enter the parameter')
    parser.add_argument('--parent_path',
                        required=True,
                        metavar="/path/to/coco/",
                        help='Father directory of the dataset')
    parser.add_argument('--condition',
                        required=True,
                        metavar=".gz",
                        help='Set the endswith of the file you want')
    args = parser.parse_args()

    function_process = function()
    file_names = function_process.load_files_by_condition(args.parent_path, args.condition)

    # Iterate over the paths directly instead of indexing by position.
    for file_name in file_names:
        # Decompressed bytes are wrapped in a NumPy array; the array is not
        # used further here.
        data = np.array(bytearray(function_process.load_gz_file(file_name)))
        print(file_name)