'''
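1. Forward stagewise regression: over many iterations, nudge each weight up or down by a small step, measure the resulting squared error, and keep the single change that gives the lowest error.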
'''
from numpy import *
#import numpy as np
import pandas as pd
def rssError(yArr, yHatArr):
    """Return the residual sum of squares between targets and predictions.

    Both arguments must support element-wise subtraction and ``** 2``
    (for example NumPy arrays of the same shape).
    """
    residual = yArr - yHatArr
    squared = residual ** 2
    return squared.sum()
### 2. Load data: read a CSV file with pandas and return the features and labels as lists (dataMat, labelMat)
def loadDataSet(fileName, maxRows=500):
    """Load features and labels from a CSV file.

    The first CSV column is used as the row index. Of the remaining
    columns, position 1 is the label and positions 2 onward are the
    features (position 0 is ignored).

    Args:
        fileName: Path or file-like object accepted by ``pandas.read_csv``.
        maxRows: Maximum number of leading rows to return. Defaults to 500,
            the limit that used to be hard-coded; pass ``None`` to return
            every row.

    Returns:
        A ``(dataMat, labelMat)`` tuple: ``dataMat`` is a list of feature
        rows (one list per sample) and ``labelMat`` is a flat list of labels.
    """
    frame = pd.read_csv(fileName, index_col=0)
    subset = frame.iloc[:maxRows]  # a stop of None selects all rows
    dataMat = subset.iloc[:, 2:].values.tolist()
    labelMat = subset.iloc[:, 1].values.tolist()
    return dataMat, labelMat
# fileName = 'edu_shaoer_afterfeature.csv'
# loadDataSet(fileName)
# def loadDataSet(fileName):
# numFeat = len(open(fileName).readline().split('\t')) - 1
# dataMat = []
# labelMat = []
# fr = open(fileName)
# for line in fr.readlines():
# lineArr = []
# curLine = line.strip().split('\t')
# for i in range(numFeat):
# lineArr.append(float(curLine[i]))
# dataMat.append(lineArr)
# labelMat.append(float(curLine[-1]))
# return dataMat, labelMat
###3、标准化数据
def regularize(xMat):  # regularize by columns
    """Center every column on zero and scale it by its variance.

    NOTE(review): the scale factor is the column *variance*, not the
    standard deviation, so this is not a conventional z-score. That choice
    is kept as-is; confirm whether it is intentional.

    A column whose variance is exactly zero (a constant feature) is mapped
    to all zeros instead of NaN, because 0/0 would otherwise produce NaNs
    that invalidate every downstream error comparison.

    Args:
        xMat: 2-D NumPy matrix or array, samples in rows, features in columns.

    Returns:
        A new object of the same shape; the input is not modified.
    """
    inMeans = mean(xMat, 0)  # per-column mean
    inVar = var(xMat, 0)  # per-column variance
    # Divide constant columns by 1 rather than 0; after centering they are
    # already all zeros, so they simply stay zero.
    safeVar = where(inVar == 0, 1.0, inVar)
    return (xMat - inMeans) / safeVar
### 4. Forward stagewise regression
def stageWise(xArr, yArr, eps=0.01, numIt=100):
    """Fit a linear model by forward stagewise regression.

    Starting from all-zero weights, each iteration tries moving every single
    weight by ``+eps`` and ``-eps`` (one change at a time, always starting
    from the current weights), and keeps the one change that yields the
    lowest residual sum of squares.

    The labels are mean-centered and the features are passed through
    ``regularize`` before fitting.

    Args:
        xArr: Feature rows (m samples x n features), e.g. a list of lists.
        yArr: Sequence of m labels.
        eps: Step size applied to a single weight per iteration.
        numIt: Number of iterations.

    Returns:
        A ``(numIt, n)`` NumPy array; row ``i`` holds the weights after
        iteration ``i``.
    """
    xMat = asmatrix(xArr)
    yMat = asmatrix(yArr).T  # column vector, m x 1
    yMat = yMat - mean(yMat, 0)
    print(yMat[:10])  # diagnostic: first centered labels
    xMat = regularize(xMat)
    n = shape(xMat)[1]
    returnMat = zeros((numIt, n))
    ws = zeros((n, 1))
    wsMax = ws.copy()
    for i in range(numIt):
        lowestError = inf
        for j in range(n):
            for sign in (-1, 1):
                # Start every candidate from the current weights. Reusing one
                # shared array would let the -eps and +eps steps cancel out
                # and would alias wsMax to the array still being mutated.
                wsTest = ws.copy()
                wsTest[j] += eps * sign
                yTest = xMat * wsTest
                rssE = rssError(yMat.A, yTest.A)  # .A: matrix -> ndarray
                if rssE < lowestError:
                    lowestError = rssE
                    wsMax = wsTest
        ws = wsMax.copy()
        returnMat[i, :] = ws.T
    print(returnMat[:5], returnMat.shape,)  # diagnostic: first weight rows
    return returnMat
def main():
    """Run stagewise regression on the bundled CSV data set.

    Loads 'edu_shaoer_afterfeature.csv' and runs ``stageWise`` with a step
    of 0.1 for 5000 iterations; the returned weight history is discarded.
    """
    features, labels = loadDataSet('edu_shaoer_afterfeature.csv')
    stageWise(features, labels, eps=0.1, numIt=5000)


# Script entry point: only runs when the file is executed directly.
if __name__ == '__main__':
    main()
#——————————————————————————————————————————
#岭回归
### Ridge Regression ###
def ridgeRegres(xMat, yMat, lam=0.2):
    """Compute ridge-regression weights ``(X^T X + lam*I)^-1 X^T y``.

    ``xMat`` and ``yMat`` are used with ``*`` as the matrix product and the
    result of the sum with ``.I`` as the inverse, i.e. they are expected to
    be NumPy matrix objects.

    Args:
        xMat: Feature matrix (m x n).
        yMat: Target column matrix (m x 1).
        lam: Ridge penalty added to the diagonal of ``X^T X``.

    Returns:
        The weight column matrix (n x 1), or ``None`` (after printing a
        message) when the determinant of the penalized matrix is exactly 0.
    """
    numFeatures = shape(xMat)[1]
    penalized = xMat.T * xMat + lam * eye(numFeatures)
    if linalg.det(penalized) != 0.0:
        return penalized.I * (xMat.T * yMat)
    print("This matrix is singular, cannot do inverse")
    return None
def ridgeTest(xArr, yArr, numTestPts=30):
    """Compute ridge-regression weights over a range of penalties.

    The features are standardized with ``regularize`` and the labels are
    mean-centered. ``ridgeRegres`` is then solved for
    ``lam = exp(ii - 10)`` with ``ii = 0 .. numTestPts - 1``.

    Args:
        xArr: Feature rows (m samples x n features).
        yArr: Sequence of m labels.
        numTestPts: Number of penalty values to evaluate (default 30).

    Returns:
        A ``(numTestPts, n)`` NumPy array; row ``ii`` holds the weights
        obtained with ``lam = exp(ii - 10)``.

    Raises:
        numpy.linalg.LinAlgError: If ``ridgeRegres`` reports a singular
            system (returns ``None``) for one of the penalties.
    """
    # Reuse the shared column standardization instead of duplicating it.
    xMat = regularize(asmatrix(xArr))
    yMat = asmatrix(yArr).T
    yMat = yMat - mean(yMat, 0)
    wMat = zeros((numTestPts, shape(xMat)[1]))
    for ii in range(numTestPts):
        lam = exp(ii - 10)
        ws = ridgeRegres(xMat, yMat, lam)
        if ws is None:
            # Fail with a clear message rather than an AttributeError on None.
            raise linalg.LinAlgError(
                "ridgeRegres could not solve the system for lam=%g" % lam
            )
        wMat[ii, :] = ws.T
    return wMat