数据预处理
(1)缺失值处理
from sklearn.impute import SimpleImputer  # sklearn.preprocessing.Imputer was removed in scikit-learn 0.22
1)用平均数填充
2)用中位数填充
3)用最频繁值(众数)填充
import pandas as pd
import numpy as np
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22; `sklearn.impute.SimpleImputer` is its replacement.
from sklearn.impute import SimpleImputer

# Missing-value imputation with three alternative strategies. Each strategy
# produces its own filled copy (data1 / data2 / data3) from the same input.
#
# NOTE(review): `data` is not defined anywhere above this snippet; presumably
# it is a table loaded beforehand (e.g. with pd.read_excel) -- confirm before
# running this section on its own.

# Mean imputation: missing entries are filled with the mean of their column.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
data1 = imp.fit_transform(data)

# Median imputation: missing entries are filled with the median of their column.
imp = SimpleImputer(missing_values=np.nan, strategy='median')
data2 = imp.fit_transform(data)

# Mode imputation: missing entries are filled with the most frequent value of
# their column.
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
data3 = imp.fit_transform(data)
(2)数据标准化
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import StandardScaler

# Load the raw table from the Excel file at "data_path".
data = pd.read_excel("data_path")

# Fit a StandardScaler on the loaded table, then transform that same table;
# the scaled result is bound to data1 while `data` keeps the loaded frame.
scaler = StandardScaler()
data1 = scaler.fit(data).transform(data)