# Chapter 5: Machine learning packages
# 5.2.1 Handling missing values
import os

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
# Switch the working directory first (note the doubled backslashes in the
# Windows path); setting up common packages and paths before anything else
# keeps the rest of the script short.
work_dir = "C:\\Users\\Administrator\\Desktop"
os.chdir(work_dir)

# The workbook is expected to sit in the working directory, so a bare file
# name is enough to load it.
workbook_name = "missing.xlsx"
data = pd.read_excel(workbook_name)
print(data)
# Output:
#      a    b    c    d
# 0  2.0   kj  4.0  7.0
# 1  2.0   kl  6.0  9.0
# 2  NaN   kl  5.0  NaN
# 3  5.0  NaN  NaN  9.0
# 4  6.0   kk  6.0  8.0
# Array containing missing values (np.nan).
c = np.array([[1, 2, 3, 4], [4, 5, 6, np.nan], [5, 6, 7, 8], [9, 4, np.nan, 8]])
# Convert array c into DataFrame C. The data to be imputed must be an array
# or a DataFrame holding numeric values.
C = pd.DataFrame(c)
# NOTE: the legacy `from sklearn.preprocessing import Imputer` raises an error
# on recent scikit-learn releases; `sklearn.impute.SimpleImputer` replaces it.

# Mean imputation
fC = C
# Pass missing_values/strategy by keyword: positional use triggers a
# FutureWarning and is rejected from scikit-learn 1.0 onwards.
imp = SimpleImputer(missing_values=np.nan, strategy="mean")
# fit_transform returns a new NumPy array; the DataFrame C is left untouched.
fC = imp.fit_transform(fC)
print(fC)
# Output:
# [[1.         2.         3.         4.        ]
#  [4.         5.         6.         6.66666667]
#  [5.         6.         7.         8.        ]
#  [9.         4.         5.33333333 8.        ]]
# C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py:70: FutureWarning: Pass missing_values=nan, strategy=mean as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error
#   warnings.warn(f"Pass {args_msg} as keyword args. From version "
# Median imputation
fc = c
# Pass missing_values/strategy by keyword: positional use triggers a
# FutureWarning and is rejected from scikit-learn 1.0 onwards.
imp = SimpleImputer(missing_values=np.nan, strategy="median")
# fit_transform returns a new array; the source array c is not modified.
fc = imp.fit_transform(fc)
print(fc)
# Output:
# [[1. 2. 3. 4.]
#  [4. 5. 6. 8.]
#  [5. 6. 7. 8.]
#  [9. 4. 6. 8.]]
# C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py:70: FutureWarning: Pass missing_values=nan, strategy=median as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error
#   warnings.warn(f"Pass {args_msg} as keyword args. From version "
# Output of a most-frequent-value imputation (strategy=most_frequent, per the
# warning below); the code that produced it is not present in this section.
# [[2. 4.]
#  [2. 6.]
#  [2. 5.]
#  [5. 6.]
#  [6. 6.]]
# C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py:70: FutureWarning: Pass missing_values=nan, strategy=most_frequent as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error
#   warnings.warn(f"Pass {args_msg} as keyword args. From version "