"""Flatten DataName/DataValue measurement files under a directory tree into one CSV.

Every ``.csv`` file found while walking the input directory is parsed by
:func:`clean`; each non-empty measurement cell becomes one output row that is
tagged with metadata taken from the file name and from the name of the folder
that contains it. Files without a ``.csv`` suffix are moved to an error
directory.
"""
import os
import re
import shutil
import sys
from pathlib import Path

import pandas as pd

# File names look like:
#   ICES5V [A213911020-T1D1(17) ; 11_7_2021 10_31_26 PM].csv
_TIMESTAMP_RE = re.compile(r";(.*?)]")   # text between ';' and ']'
_WAFER_ID_RE = re.compile(r"\[(.*?)-")   # text between '[' and the first '-'
_PIPELINE_RE = re.compile(r"-(.*?)\(")   # text between '-' and '('


def clean(paths):
    """Extract the non-empty measurement cells from one data file.

    The file is scanned line by line. A line containing ``DataName`` starts a
    new table and supplies its column names; lines containing ``DataValue``
    are the rows of the current table. Each kept line is prefixed with its
    0-based line number and has its spaces turned into commas before being
    split on commas.

    Args:
        paths: Path of the file to read (opened in text mode with the
            platform default encoding).

    Returns:
        A list of ``[column_name, value, line_number]`` lists, one per
        non-empty cell, where ``line_number`` is the 0-based line number of
        the value's line rendered as a string.
    """
    with open(paths, "r") as f:
        text = f.read()

    # Group the interesting lines into tables: one DataName line followed by
    # the DataValue lines that come after it.
    tables = []
    current = []
    for line_no, line in enumerate(text.split("\n")):
        if "DataName" in line:
            if current:
                tables.append(current)
                current = []
            # NOTE(review): the original applied this space-to-comma
            # replacement twice with identical arguments; if one of the two
            # was meant to target a tab or another separator, add it here.
            line = "%s," % line_no + line.replace(" ", ",")
            current.append(line)
        if "DataValue" in line:
            line = "%s," % line_no + line.replace(" ", ",")
            current.append(line)
    if current:
        tables.append(current)

    return_data = []
    for table in tables:
        rows = [entry.split(",") for entry in table]
        width = max(len(row) for row in rows)
        # Pad the header so that any column present in a longer data row
        # still has an (empty) name.
        names = rows[0] + [""] * (width - len(rows[0]))
        # Column 0 is the line number and column 1 the DataName/DataValue
        # tag, so measurement columns start at index 2.
        # NOTE(review): rows[1] (the first line after the header) is skipped,
        # exactly as the original `index > 1` test did -- confirm that the
        # first DataValue row is really meant to be dropped.
        for row in rows[2:]:
            for col in range(2, len(row)):
                if row[col]:  # ignore empty cells
                    return_data.append([names[col], row[col], row[0]])
    return return_data


def fileclean(inputdirs, outputfile, errordir):
    """Collect every ``.csv`` file under ``inputdirs`` into one output CSV.

    For each ``.csv`` file, metadata is parsed from the file name (test name,
    wafer id, test pipeline, timestamp) and from the underscore-separated
    name of the containing folder (seven fields), and one output row is
    produced per cell returned by :func:`clean`. Any other file is moved to
    ``errordir`` when that directory exists; otherwise it is left in place.
    The output file is only written when at least one row was collected.

    Args:
        inputdirs: Root directory to walk recursively.
        outputfile: Path of the CSV file to write.
        errordir: Directory that receives files without a ``.csv`` suffix.

    Raises:
        IndexError: If a ``.csv`` file name does not match the expected
            pattern, or if a folder holding data has fewer than seven
            underscore-separated name parts.
    """
    datas = []
    for folder_name, _subfolders, filenames in os.walk(inputdirs):
        # basename works with either path separator style the OS accepts,
        # unlike splitting on a hard-coded backslash.
        parent = os.path.basename(os.path.normpath(folder_name))
        parent_s = parent.split("_")
        for filename in filenames:
            source = os.path.join(folder_name, filename)
            if not filename.endswith(".csv"):
                if os.path.isdir(errordir):
                    shutil.move(source, errordir)
                continue

            # "11_7_2021 10_31_26 PM" -> "11/7/2021 10:31:26 PM": the first
            # two underscores belong to the date, the rest to the time.
            devices_time = (
                _TIMESTAMP_RE.findall(filename)[0]
                .replace("_", "/", 2)
                .replace("_", ":")
                .strip()
            )
            test_name = filename.split(" ")[0]
            wafer_id = _WAFER_ID_RE.findall(filename)[0]
            test_pipeline = _PIPELINE_RE.findall(filename)[0]

            for data_name, data_value, line_no in clean(paths=source):
                datas.append({
                    "PATH": parent,
                    "FILE_NAME": filename,
                    "DATANAME": data_name,
                    "DATAVALUE": data_value,
                    "PROJECT_TYPE": parent_s[0],
                    "PRODUCT": parent_s[1],
                    "PRODUCT_VERSION": parent_s[2],
                    "LOT": parent_s[3],
                    "TEST_ITEM": parent_s[4],
                    "TEST_NODE": parent_s[5],
                    "FACTORY": parent_s[6],
                    "DEVICES_TIME": devices_time,
                    "TEST_NAME": test_name,
                    "WAFER_ID": wafer_id,
                    "TEST_PIPELINE": test_pipeline,
                    "LINE_NO": line_no,
                })

    if datas:
        table = pd.DataFrame(datas)
        # Normalise the timestamp text into real datetimes before export.
        table["DEVICES_TIME"] = pd.to_datetime(table["DEVICES_TIME"])
        table.to_csv(outputfile, index=False)


if __name__ == '__main__':
    # Intended command line (currently not wired up):
    #   python clean_data.py <input folder> <output file .csv> <error folder>
    fileclean("data", "data2.csv", "error")
大数据扔出错误文件
最新推荐文章于 2023-11-10 15:27:42 发布