import pandas as pd
import numpy as np
# Read the sample spreadsheet; columns 'a' and 'c' of it are imputed further down.
data = pd.read_excel('missing.xlsx')
# Small 4x4 demo matrix with two missing entries: [1, 3] and [3, 2].
c = np.array([
    [1, 2, 3, 4],
    [4, 5, 6, np.nan],
    [5, 6, 7, 8],
    [9, 4, np.nan, 8],
])
# The same values wrapped in a DataFrame with default integer labels.
C = pd.DataFrame(data=c)
from sklearn.impute import SimpleImputer

try:
    # Imputer was deprecated in scikit-learn 0.20 and removed in 0.22; keep the
    # legacy name importable only where the installed version still ships it.
    from sklearn.preprocessing import Imputer
except ImportError:
    pass
# Column-wise mean imputation of the demo frame C.
# sklearn.preprocessing.Imputer (deprecated in scikit-learn 0.20, removed in
# 0.22) is replaced by SimpleImputer, which takes np.nan as the missing-value
# marker instead of the string 'NaN' and always imputes per column (the old
# axis=0 behaviour).
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
# With default settings fit_transform returns a new array and leaves C as is.
fC = imp.fit_transform(C)
print(fC)
# [[1. 2. 3. 4. ]
# [4. 5. 6. 6.66666667]
# [5. 6. 7. 8. ]
# [9. 4. 5.33333333 8. ]]
# Row-wise median imputation of the raw array c.
# SimpleImputer has no axis argument (the removed Imputer used axis=1 here),
# so transpose to turn each original row into a column, impute per column,
# and transpose the result back to the original orientation.
imp = SimpleImputer(missing_values=np.nan, strategy='median')
fc = imp.fit_transform(c.T).T
print(fc)
# [[1. 2. 3. 4.]
# [4. 5. 6. 5.]
# [5. 6. 7. 8.]
# [9. 4. 8. 8.]]
# Fill the gaps in columns 'a' and 'c' of the spreadsheet data with each
# column's most frequent value.
# SimpleImputer replaces sklearn.preprocessing.Imputer (removed in
# scikit-learn 0.22); it uses np.nan as the missing-value marker and always
# works per column, which is what axis=0 meant before.
fD = data[['a', 'c']]
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
fD = imp.fit_transform(fD)
print(fD)
# [[2. 4.]
# [2. 6.]
# [2. 5.]
# [5. 6.]
# [6. 6.]]
# Load the saved array and keep every column except the first one.
# NOTE: this rebinds `data`, replacing the spreadsheet frame read earlier.
data = np.load('data.npy')[:, 1:]
print(data)
# [[ 17. 66.17647059 32. 1614.96618125 13.15625 ]
# [ 8. 68.6875 36. 143.56458056 3.80555556]
# [ 16. 65.84375 43. 1344.13137674 12.69767442]
# ...
# [ 10. 67.95 24. 115.87417083 2.79166667]
# [ 21. 66.5 41. 538.71289268 20.31707317]
# [ 11. 78.27272727 9. 62.98323333 9.44444444]]
# Replace any NaN in the loaded array with the mean of its column.
# SimpleImputer replaces sklearn.preprocessing.Imputer (removed in
# scikit-learn 0.22); the missing-value marker is np.nan rather than the
# string 'NaN', and imputation is always per column (the old axis=0).
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
data = imp.fit_transform(data)
print(data)
# [[ 17. 66.17647059 32. 1614.96618125 13.15625 ]
# [ 8. 68.6875 36. 143.56458056 3.80555556]
# [ 16. 65.84375 43. 1344.13137674 12.69767442]
# ...
# [ 10. 67.95 24. 115.87417083 2.79166667]
# [ 21. 66.5 41. 538.71289268 20.31707317]
# [ 11. 78.27272727 9. 62.98323333 9.44444444]]
from sklearn.preprocessing import StandardScaler
# Standardise the imputed data: learn the scaling statistics from `data`,
# then apply them to the same array. fit() returns the scaler itself, so
# `scaler` ends up holding the fitted instance.
scaler = StandardScaler().fit(data)
X = scaler.transform(data)
print(X)
# [[ 0.20025842 -0.82760637 0.05554634 2.84353829 0.76954149]
# [-0.68918721 -0.09224269 0.20662516 -0.18554119 -0.65156099]
# [ 0.10143112 -0.92504475 0.47101308 2.28598816 0.69984796]
# ...
# [-0.49153262 -0.30822213 -0.24661129 -0.24254565 -0.80565008]
# [ 0.59556758 -0.73285966 0.39547367 0.62792513 1.85783108]
# [-0.39270533 2.71482439 -0.81315684 -0.35142881 0.20542766]]
from sklearn.preprocessing import MinMaxScaler
# Min-max scale the imputed data: learn the scaling range from `data`, then
# apply it to the same array. fit() returns the scaler itself, so
# `min_max_scaler` ends up holding the fitted instance.
min_max_scaler = MinMaxScaler().fit(data)
X1 = min_max_scaler.transform(data)
print(X1)
# [[0.38095238 0.04406273 0.2519685 0.33941778 0.1381392 ]
# [0.16666667 0.17158327 0.28346457 0.03015525 0.03188131]
# [0.35714286 0.0271658 0.33858268 0.28249311 0.13292812]
# ...
# [0.21428571 0.13412995 0.18897638 0.02433521 0.02035985]
# [0.47619048 0.06049291 0.32283465 0.11320842 0.2195122 ]
# [0.23809524 0.65836106 0.07086614 0.01321848 0.0959596 ]]
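# 本节内容参考自《Python金融数据分析与挖掘实战》(黄恒秋,人民邮电出版社)。
# Reference: "Python Financial Data Analysis and Mining in Practice" by Huang Hengqiu, Posts & Telecom Press.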