import os
import json
import pandas as pd
import zipfile
from tqdm import tqdm
import sqlite3
from sklearn.preprocessing import LabelEncoder
def read_data_set(col_name_file, data_dir, problm_sn_path):
    """Load the raw defect data set.

    Stub — the real implementation is not present in this file.

    Parameters (assumed from the names — TODO confirm against callers):
        col_name_file: path to a text file listing one column name per line.
        data_dir: directory containing the raw data files.
        problm_sn_path: path related to problem serial numbers.
    """
    # Implementation of read_data_set...
    pass
def extract_all_zip(zip_dir):
    """Recursively find every ``.zip`` archive under *zip_dir* and extract
    its contents into the directory the archive itself lives in."""
    for dirpath, _subdirs, names in os.walk(zip_dir):
        archives = (n for n in names if n.endswith('.zip'))
        for archive_name in archives:
            archive_path = os.path.join(dirpath, archive_name)
            with zipfile.ZipFile(archive_path, 'r') as archive:
                # Unpack next to the archive, not into a central directory.
                archive.extractall(dirpath)
def feature_deal(data):
    """Clean a raw defect DataFrame in place and return it.

    Steps (all mutate ``data`` in place):
      1. Drop the ``itemCode`` identifier column (ignored if absent).
      2. Drop every column whose null ratio exceeds 97%.
      3. Detect free-text columns (any string value of length >= 50) and
         exclude them, together with the label columns, from the
         categorical set.
      4. Cast the remaining object-dtype (categorical) columns to ``str``
         and strip surrounding whitespace, so values are uniform for
         later encoding.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw defect records; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same (mutated) DataFrame, for call chaining.
    """
    # 'itemCode' is a row identifier, useless as a model feature.
    # errors='ignore' keeps repeated/partial runs from raising KeyError.
    data.drop(columns=['itemCode'], inplace=True, errors='ignore')
    label_feature = ['defectName-1', 'defectName-2']
    # Drop features that are more than 97% null.
    null_ratio = data.isnull().sum() / len(data)
    mostly_null_cols = null_ratio[null_ratio.values > 0.97].index.tolist()
    data.drop(columns=mostly_null_cols, inplace=True)
    # Columns containing long strings (>= 50 chars) are free text, not categories.
    text_columns = [col for col in data.columns
                    if data[col].apply(lambda x: isinstance(x, str) and len(x) >= 50).any()]
    categorical_columns = data.select_dtypes(include=['object']).columns.to_list()
    excluded = label_feature + text_columns
    # Categorical features = object columns minus labels and free text.
    categorical_columns = [col for col in categorical_columns if col not in excluded]
    print(categorical_columns)
    for col in categorical_columns:
        # Cast FIRST, then strip: calling .str.strip() on a mixed object
        # column turns non-string values into NaN (silently destroying them);
        # astype('str') first preserves them as their string form.
        data[col] = data[col].astype('str').str.strip()
    return data
if __name__ == "__main__":
    # Input locations (hard-coded Windows paths; adjust as needed).
    col_name_file = r"E:\code1\press_column_names.txt"
    data_dir = r"E:\code1\Defect2.0"

    # Read the list of columns to extract, one name per line.
    col_names = []
    with open(col_name_file, 'r') as f:
        for line in f:
            col_names.append(line.strip("\n"))
    print(col_names)

    # NOTE(review): the original file contained a second, broken copy of the
    # walk loop that read from the already-closed handle `f` (ValueError at
    # runtime), re-imported tqdm/sqlite3 mid-script, and shadowed the loop
    # variable `file` with the opened file object. That dead segment is
    # removed; one clean pass over the JSON files remains.
    conn = sqlite3.connect('data3.db')
    t = []  # pending rows, flushed to SQLite in batches
    i = 0   # total rows kept
    for root, dirs, files in os.walk(data_dir):
        for file_name in tqdm(files):
            if not file_name.endswith(".json"):
                continue
            with open(os.path.join(root, file_name), "r", encoding="utf-8") as fp:
                jsonData = json.load(fp)
            # Label = name of the directory one level above this file's folder.
            defectDir = os.path.dirname(root).split(os.path.sep)[-1]
            for item in jsonData:
                defectName_1 = item.get('defectName-1', "")
                defectName_2 = item.get('defectName-2', "")
                defectName_3 = item.get('defectName-3', "")
                # Skip records caused by foreign objects ("异物").
                if "异物" in item.get('problemCause-1', "") or "异物" in item.get('problemCause-2', ""):
                    continue
                # Keep only solder ("锡") defects, plus unlabeled records.
                if "锡" not in defectName_1 and "锡" not in defectName_2 and defectName_1 != "":
                    continue
                row = [item.get(col_name, "") for col_name in col_names]
                row.append(defectDir)
                t.append(row)
                i += 1
                # Flush in batches of 500k rows to bound memory use.
                if len(t) == 500000:
                    df = pd.DataFrame(t, columns=col_names + ["defectDir"])
                    df.to_sql("data_defect2", conn, if_exists="append")
                    t = []
    # Flush whatever remains after the walk.
    df = pd.DataFrame(t, columns=col_names + ["defectDir"])
    df.to_sql("data_defect2", conn, if_exists="append")
    t = []
    conn.close()

    # Reload a small sample and run the feature-cleaning step on it.
    conn = sqlite3.connect('data3.db')
    data = pd.read_sql_query("""select * from data_defect2 LIMIT 1*5000 ;""", conn)  # only 5000 rows to test the pipeline
    data = feature_deal(data)
    data.head(10)
    conn.close()
# NOTE: residual text from the original blog post ("质量检测" / quality-inspection
# article, first published 2023-12-14 14:48:52) — converted to a comment so the
# file parses as valid Python.