import os
import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams["font.sans-serif"] = ["SimHei"] # 设置字体
plt.rcParams["axes.unicode_minus"] = False # 该语句解决图像中的“-”负号的乱码问题
def get_data_dict(path):
df_dict = {}
all_dir_name = os.listdir(path)
print(all_dir_name)
for dir_name in all_dir_name:
df = pd.read_excel(os.path.join(path, dir_name))
df.drop(columns=['Unnamed: 0'], inplace=True)
all_columns = {'日期', '标况瞬时流量', '温度', '进站压力', '表压', '出口压力', '出口温度', '进口压力', '进口温度'}
all_columns.difference_update(df.columns.tolist())
print(df.columns.tolist())
print(all_columns)
if all_columns:
pass
else:
df_dict[dir_name] = df
os.makedirs("../data/处理后的合并数据", exist_ok=True)
df_dict[dir_name].to_excel(os.path.join('../data/处理后的合并数据', dir_name))
return df_dict
# 进行空值统计并删除
def nan_count_delete(df_dict):
delete_user = []
nan_count_dict = {}
for user, user_data in df_dict.items():
# user = user.split('.')[0]
count_dict = {}
for col in user_data.columns.tolist():
count_dict[col] = user_data[col].isnull().sum()
user_data.dropna(axis=0, how='any', inplace=True)
# 若数据量过少,则舍弃,不进入字典
if len(user_data) <= 100:
delete_user.append(user)
df_dict[user] = user_data
nan_count_dict[user] = count_dict
# 去除掉数据量过少的公司
[df_dict.pop(k) for k in delete_user]
return df_dict, nan_count_dict
# 进行零值统计并删除
def zero_count_delete(df_dict):
delete_user = []
zero_count_dict = {}
for user, user_data in df_dict.items():
count_dict = {}
for col in user_data.columns.tolist():
count_dict[col] = (user_data[col] == 0).astype(int).sum(axis=0)
user_data = user_data.drop(user_data[user_data['标况瞬时流量'] == 0].index)
# 若数据量过少,则舍弃,不进入字典
if len(user_data) <= 100:
delete_user.append(user)
df_dict[user] = user_data
zero_count_dict[user] = count_dict
# 去除掉数据量过少的公司
[df_dict.pop(k) for k in delete_user]
for user, data in df_dict.items():
df_dict[user].to_excel(os.path.join('../data/处理后的最终数据', user))
return df_dict, zero_count_dict
# 特征相关性分析并绘图
def feature_corr(df_dict, path):
# all_feature_list = []
for user, user_data in df_dict.items():
user = user.split('.')[0]
os.makedirs(os.path.join(path, user), exist_ok=True)
# feature_list = user_data.columns.tolist()
plt.figure(figsize=(10, 10))
sns.heatmap(user_data.corr(), annot=True)
plt.savefig(os.path.join(path, user, "{}_相关性热力图.png".format(user)), dpi=300)
plt.close()
def draw_data(df_dict, path):
for user, data in df_dict.items():
user_name = user.split('.')[0]
os.makedirs(os.path.join(path, user_name), exist_ok=True)
for i in ['标况瞬时流量', '温度', '进站压力', '表压', '出口压力', '出口温度', '进口压力', '进口温度']:
# 绘图
plt.figure(figsize=(24, 8))
plt.plot(df_dict[user]["日期"], df_dict[user][i], '.-', color='g', label=i, linewidth=0.3,
markeredgewidth=0.1) # o-:圆形
plt.xlabel("time") # 横坐标名字
plt.ylabel(i) # 纵坐标名字
plt.legend(loc="best") # 图例
plt.title("{}数据分布".format(i))
# plt.show()
plt.savefig(os.path.join(path, user_name, "{}_数据分布".format(i)), dpi=300)
plt.close()
if __name__ == '__main__':
# 获取所有数据,并利用字典封装
data_dict = get_data_dict("../data/处理后的数据")
print("data_dict:", len(data_dict))
# 进行空值统计并删除
data_dict, nan_count = nan_count_delete(data_dict)
print("nan_count:", nan_count)
# 进行零值统计并删除
data_dict, zero_count = zero_count_delete(data_dict)
print("zero_count:", zero_count)
# 相关性分析
feature_corr(data_dict, "../picture")
# 绘图
draw_data(data_dict, "../picture")
# 重要结果保存
os.makedirs("../score", exist_ok=True)
writer = pd.ExcelWriter("../score/result_score.xlsx", engine='openpyxl')
# pd.DataFrame(score_dict).to_excel(writer, sheet_name="score")
# pd.DataFrame(len_count_dict).to_excel(writer, sheet_name="len_train_test")
pd.DataFrame(nan_count).to_excel(writer, sheet_name="nan_count")
pd.DataFrame(zero_count).to_excel(writer, sheet_name="zero_count")
writer.save()
数据的处理
于 2023-03-29 13:27:05 首次发布