The dev (validation) set is used for cross-validation, i.e. to compare how well different algorithms perform,
while the test set is used to measure the accuracy of the algorithm that is finally chosen.
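
The crossValidation routine below calls two helpers, rssError and ridgeTest, that are defined elsewhere and not repeated in this section. As a reminder, here is a minimal sketch of what they are assumed to do: rssError returns the residual sum of squares, and ridgeTest standardizes the training data, centers the labels, and solves ridge regression for 30 values of lambda (assumed here to be exp(i - 10)), returning one weight vector per lambda. Your own definitions may differ in detail.

import numpy as np

def rssError(yArr, yHatArr):
    # residual sum of squares between actual and predicted values
    return ((yArr - yHatArr) ** 2).sum()

def ridgeRegres(xMat, yMat, lam=0.2):
    # closed-form ridge solution: (X^T X + lam*I)^(-1) X^T y
    xTx = xMat.T * xMat
    denom = xTx + np.eye(xMat.shape[1]) * lam
    if np.linalg.det(denom) == 0.0:
        raise ValueError("matrix is singular, cannot invert")
    return denom.I * (xMat.T * yMat)

def ridgeTest(xArr, yArr, numTestPts=30):
    # standardize features, center labels, then solve ridge for numTestPts lambdas
    xMat = np.mat(xArr); yMat = np.mat(yArr).T
    yMat = yMat - np.mean(yMat, 0)
    xMat = (xMat - np.mean(xMat, 0)) / np.var(xMat, 0)
    wMat = np.zeros((numTestPts, xMat.shape[1]))
    for i in range(numTestPts):
        ws = ridgeRegres(xMat, yMat, np.exp(i - 10))  # lambda assumed to be exp(i-10)
        wMat[i, :] = ws.A1                            # one weight vector per row
    return wMat
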
import random

import numpy as np


def crossValidation(xArr, yArr, numVal=10):
    """Cross-validate ridge regression; numVal is the number of validation runs."""
    m = len(yArr)
    indexList = list(range(m))          # must be a list so it can be shuffled in Python 3
    errorMat = np.zeros((numVal, 30))   # one error per run and per ridge lambda (30 of them)
    for i in range(numVal):
        trainX = []
        trainY = []
        testX = []
        testY = []
        random.shuffle(indexList)       # shuffle so the train/test draw is random
        # split into training set (90%) and test set (10%)
        for j in range(m):
            if j < m * 0.9:
                trainX.append(xArr[indexList[j]])
                trainY.append(yArr[indexList[j]])
            else:
                testX.append(xArr[indexList[j]])
                testY.append(yArr[indexList[j]])
        wMat = ridgeTest(trainX, trainY)    # one weight vector per lambda
        for k in range(30):
            # standardize the test data with the training-set mean and variance
            matTestX = np.mat(testX)
            matTrainX = np.mat(trainX)
            meanTrain = np.mean(matTrainX, 0)
            varTrain = np.var(matTrainX, 0)
            matTestX = (matTestX - meanTrain) / varTrain   # same scaling as ridgeTest
            yEst = matTestX * np.mat(wMat[k, :]).T + np.mean(trainY)
            errorMat[i, k] = rssError(yEst.T.A, np.array(testY))
    meanErrors = np.mean(errorMat, 0)       # average error of each lambda over all runs
    minMean = float(min(meanErrors))
    bestWeights = wMat[np.nonzero(meanErrors == minMean)]
    print("the best regression params :", bestWeights)
    xMat = np.mat(xArr)
    yMat = np.mat(yArr).T
    meanX = np.mean(xMat, 0)
    varX = np.var(xMat, 0)
    # undo the standardization so the result can be compared with other fitted models
    unReg = bestWeights / varX
    print('the best model from Ridge Regression is:\n', unReg)
    print('with constant term', -1 * np.sum(np.multiply(meanX, unReg)) + np.mean(yMat))
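
To see the routine run end to end, the snippet below feeds it purely synthetic data (illustration only, not from the original text); any list of feature rows with a matching label list works the same way.

import numpy as np

# synthetic data for illustration only: y = 3*x1 - 2*x2 + 5 plus a little noise
rng = np.random.RandomState(0)
X = rng.rand(200, 2) * 10
y = 3 * X[:, 0] - 2 * X[:, 1] + 5 + rng.randn(200) * 0.5

# crossValidation expects plain Python lists, as in the function above
crossValidation(X.tolist(), y.tolist(), numVal=10)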