如果数据集中存在缺失数据(NaN),就将其替换成所属特征(即所在列)中非缺失值的平均值。
假如文件forTest.txt中有以下数据集:
2,2,2,NaN,2
3,3,NaN,3,3
4,NaN,4,4,4
加载和替换函数:
from numpy import *
def replaceNanWithMean(filename):
    """Load a comma-separated data file and fill each NaN with its column mean.

    The file is read through ``loadDataSet`` using ``','`` as the delimiter.
    For every column, the mean is taken over the non-NaN entries only and is
    then written into the rows of that column that hold NaN.

    Args:
        filename: Path of the comma-delimited file to load.

    Returns:
        The matrix produced by ``loadDataSet``, with its NaN entries
        overwritten in place by the per-column means.
    """
    data = loadDataSet(filename, ',')
    for col in range(shape(data)[1]):
        # Flat boolean mask marking the rows whose value in this column is NaN.
        missing = isnan(data[:, col].A).ravel()
        # The right-hand side is evaluated before the assignment, so the mean
        # only sees the values that were present in the loaded data.
        data[missing, col] = mean(data[~missing, col])
    return data
def loadDataSet(filename, delim='\t'):
    """Load a delimited text file of numbers into a NumPy matrix.

    Each non-blank line is split on ``delim`` and every field is converted
    with ``float``, so the literal text ``NaN`` becomes a floating-point NaN.

    Args:
        filename: Path of the text file to read.
        delim: Field separator; defaults to a tab.

    Returns:
        A ``numpy.matrix`` with one row per non-blank line of the file.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
    """
    # The context manager closes the handle even when parsing fails.
    with open(filename) as f:
        # Skip blank lines (e.g. a trailing empty line) so they never reach
        # float('') and abort the whole load.
        rows = [
            [float(field) for field in line.strip().split(delim)]
            for line in f
            if line.strip()
        ]
    # ``mat`` was removed in NumPy 2.0; ``asmatrix`` is the function it aliased
    # and is available in both old and new NumPy releases.
    return asmatrix(rows)
# Demo run: impute the sample file and print the resulting matrix.
# 'forTest.txt' is resolved relative to the current working directory.
dataMat = replaceNanWithMean(filename='forTest.txt')
print(dataMat)
输出:
[[2.  2.  2.  3.5 2. ]
 [3.  3.  3.  3.  3. ]
 [4.  2.5 4.  4.  4. ]]