整体逻辑
- 首先对数据进行清洗:数据标准化、数据集的切分、删除空数据行等。
- 数据中存在缺失值,使用平均值进行填充:训练集按类别用本类别各列的平均值填充,测试集按类别用训练集对应类别的平均值填充
- 使用SMOTE算法对训练集进行过采样,使各类别的样本数量保持均衡
- 调用pandas库统计每个类别的数据个数,并用matplotlib绘制每个类别的数据个数柱状图
- 保存数据为excel文件,注意:测试集不用传入模型训练!
代码实现
import pandas as pd
import matplotlib.pyplot as plt
'''---------------------使用平均值的方法对数据进行填充-----------------------'''
def mean_train_method(data: pd.DataFrame) -> pd.DataFrame:
    """Fill the missing cells of ``data`` with the column means of ``data``.

    Args:
        data: Rows to fill. The caller in this file passes the training
            rows that belong to a single class.

    Returns:
        A new frame in which every missing cell is replaced by the mean of
        its own column. ``data`` itself is not modified. A column with no
        observed value has a NaN mean and therefore stays NaN.
    """
    column_means = data.mean()
    return data.fillna(column_means)
def mean_train_fill(train_data, train_label, classes=(0, 1, 2, 3)):
    """Fill missing training values with the per-class column means.

    The features and the label are joined column-wise, split by class,
    filled class by class with ``mean_train_method`` and stacked back
    together.

    Args:
        train_data: Training feature frame.
        train_label: Training labels; must be a Series named '矿物类型'
            because that column name is used to select and drop the label.
        classes: Encoded class ids to process, in output order. Rows whose
            label is not listed here are dropped from the result.

    Returns:
        ``(features, labels)`` with a fresh 0..n-1 index. Rows are grouped
        by class in the order of ``classes``, not in their input order.
    """
    label_col = '矿物类型'
    merged = pd.concat([train_data, train_label], axis=1)
    merged = merged.reset_index(drop=True)

    # One filled sub-frame per class; each class only sees its own means.
    filled_parts = [
        mean_train_method(merged[merged[label_col] == class_id])
        for class_id in classes
    ]

    df_filled = pd.concat(filled_parts).reset_index(drop=True)
    return df_filled.drop(label_col, axis=1), df_filled[label_col]
def mean_test_method(train_data: pd.DataFrame, test_data: pd.DataFrame) -> pd.DataFrame:
    """Fill the missing cells of ``test_data`` with the column means of ``train_data``.

    Args:
        train_data: Rows the column means are computed from. The caller in
            this file passes the training rows of a single class.
        test_data: Rows to fill. The caller passes the test rows of the
            same class.

    Returns:
        A new frame in which every missing cell of ``test_data`` is replaced
        by the matching training column mean. ``test_data`` itself is not
        modified.
    """
    train_means = train_data.mean()
    return test_data.fillna(train_means)
def mean_test_fill(train_data, train_label, test_data, test_label, classes=(0, 1, 2, 3)):
    """Fill missing test values with the per-class column means of the training set.

    Both sets are joined with their labels and split by class. The test rows
    of each class are then filled by ``mean_test_method`` using the training
    rows of that same class. Note that the test label decides which class
    mean is applied to a test row.

    Args:
        train_data: Training feature frame the means are taken from.
        train_label: Training labels; must be a Series named '矿物类型'.
        test_data: Test feature frame to fill.
        test_label: Test labels; must be a Series named '矿物类型'.
        classes: Encoded class ids to process, in output order. Test rows
            whose label is not listed here are dropped from the result.

    Returns:
        ``(features, labels)`` of the filled test set with a fresh 0..n-1
        index. Rows are grouped by class in the order of ``classes``.
    """
    label_col = '矿物类型'
    train_all = pd.concat([train_data, train_label], axis=1)
    train_all = train_all.reset_index(drop=True)
    test_all = pd.concat([test_data, test_label], axis=1)
    test_all = test_all.reset_index(drop=True)

    # Pair the train and test rows of each class so no test statistic is used.
    filled_parts = [
        mean_test_method(
            train_all[train_all[label_col] == class_id],
            test_all[test_all[label_col] == class_id],
        )
        for class_id in classes
    ]

    df_filled = pd.concat(filled_parts).reset_index(drop=True)
    return df_filled.drop(label_col, axis=1), df_filled[label_col]
# ---- Load the raw sheet and discard the rows labelled 'E' ----
data = pd.read_excel("矿物数据.xls")
data = data[data['矿物类型'] != 'E']

# Missing-value mask and per-column missing counts, kept for inspection.
null_num = data.isnull()
null_total = null_num.sum()

# ---- Separate features and label, encode the label as an integer ----
X_whole = data.drop('矿物类型', axis=1).drop('序号', axis=1)
y_whole = data.矿物类型
label_dict = {"A": 0, "B": 1, "C": 2, "D": 3}
encoded_labels = [label_dict[label] for label in y_whole]
# The new Series gets a fresh 0..n-1 index; X_whole receives the same kind of
# index when it is rebuilt after scaling, so the two line up row by row.
y_whole = pd.Series(encoded_labels, name='矿物类型')

# Cells that cannot be parsed as numbers become NaN and are filled later.
for column_name in X_whole.columns:
    X_whole[column_name] = pd.to_numeric(X_whole[column_name], errors='coerce')

# ---- Standardization (Z-score) ----
# NOTE(review): the scaler is fitted on the full data set before the split, so
# test-set statistics influence the training features. Fitting on the training
# split only would avoid this, but it changes the saved values - confirm first.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_whole_Z = scaler.fit_transform(X_whole)
X_whole = pd.DataFrame(X_whole_Z, columns=X_whole.columns)

# ---- Train / test split ----
from sklearn.model_selection import train_test_split
x_train_w, x_test_w, y_train_w, y_test_w = train_test_split(
    X_whole, y_whole, test_size=0.3, random_state=50000
)

# ---- Fill the missing values (test set uses the training-set means) ----
x_train_fill, y_train_fill = mean_train_fill(x_train_w, y_train_w)
x_test_fill, y_test_fill = mean_test_fill(x_train_fill, y_train_fill, x_test_w, y_test_w)

# ---- Oversample the training set with SMOTE ----
from imblearn.over_sampling import SMOTE
oversampler = SMOTE(k_neighbors=1, random_state=42)
os_x_train, os_y_train = oversampler.fit_resample(x_train_fill, y_train_fill)

# ---- Bar chart of the number of samples per class ----
y_whole = pd.concat([os_y_train, y_test_fill])
# Series.value_counts replaces the deprecated top-level pd.value_counts.
labels_count = y_whole.value_counts()
fig, ax = plt.subplots()
bars = ax.bar(labels_count.index, labels_count.values)
for bar in bars:
    yval = bar.get_height()
    # Write the count just above the centre of each bar.
    ax.text(
        bar.get_x() + bar.get_width() / 2,
        yval,
        round(yval, 2),
        va='bottom',
        ha='center',
        fontsize=10,
        color='black',
    )
# NOTE(review): 'lables' is misspelled and the title mentions removing empty
# data although the values are filled and oversampled; strings left unchanged.
plt.xlabel('lables')
plt.ylabel('numbers')
plt.title('The number of data for each category after removing empty data')
plt.show()

# ---- Save both sets as Excel files ----
import os

# to_excel does not create missing folders, so make sure the target exists.
os.makedirs('./temp_data', exist_ok=True)
# The oversampled training rows are shuffled reproducibly before saving.
data_train = pd.concat([os_y_train, os_x_train], axis=1).sample(frac=1, random_state=4)
data_test = pd.concat([y_test_fill, x_test_fill], axis=1)
data_train.to_excel(r'.//temp_data//训练数据集[平均值填充].xlsx', index=False)
data_test.to_excel(r'.//temp_data//测试数据集[平均值填充].xlsx', index=False)