1.线性回归原理
回归的目的就是预测数值型的目标值,回归方程就是回归系数和输入线性组合的方程,求回归系数的过程就叫回归。线性回归是输入项和系数相乘再相加,非线性回归则可能认为输出是各输入项相乘的结果
求回归系数就是求出误差函数并对回归系数求导,令其为0,就可以求出回归系数
import numpy as np
import matplotlib.pyplot as pl
def loadDataSet(filename):
    """Load a tab-separated data file into feature rows and labels.

    Every column except the last is treated as a feature; the last
    column is the target value.

    Args:
        filename: path to a tab-delimited text file, one sample per line.

    Returns:
        (dataMat, labelMat): list of float feature rows and list of
        float target values. Both are empty for an empty file.
    """
    # Read once inside a context manager; the original opened the file
    # twice and never closed either handle.
    with open(filename) as fr:
        lines = fr.readlines()
    if not lines:
        return [], []
    # Number of feature columns = total columns - 1 (last one is the label).
    numFeat = len(lines[0].split('\t')) - 1
    dataMat = []
    labelMat = []
    for line in lines:
        curLine = line.strip().split('\t')
        dataMat.append([float(curLine[i]) for i in range(numFeat)])
        labelMat.append(float(curLine[-1]))
    return dataMat, labelMat
def standRegres(xArr, yArr):
    """Ordinary least squares regression: ws = (X^T X)^{-1} X^T y.

    Args:
        xArr: m x n feature matrix (list of rows or array-like).
        yArr: length-m target vector.

    Returns:
        n x 1 matrix of regression coefficients, or None when X^T X
        is singular (no unique solution exists).
    """
    xMat = np.mat(xArr)
    yMat = np.mat(yArr).T
    xTx = xMat.T * xMat
    # A zero determinant means X^T X has no inverse, so the normal
    # equation cannot be solved.
    if np.linalg.det(xTx) == 0.0:
        print('this matrix is singular,cannot do inverse')
        return
    ws = xTx.I * (xMat.T * yMat)
    return ws
线性回归可能出现的一个问题就是欠拟合,此时我们可以做局部加权线性回归,即对待预测点附近的每个点赋予一个权值,然后再做回归
def lwlr(testPoint, xArr, yArr, k=1.0):
    """Locally weighted linear regression prediction for one point.

    Each training sample is weighted by a Gaussian kernel of its
    distance to testPoint, then a weighted least-squares fit is solved:
    ws = (X^T W X)^{-1} X^T W y.

    Args:
        testPoint: 1 x n feature row to predict.
        xArr: m x n training feature matrix.
        yArr: length-m training targets.
        k: kernel bandwidth; smaller k -> more local (wigglier) fit.

    Returns:
        The predicted value testPoint * ws, or None when the weighted
        normal matrix is singular.
    """
    xMat = np.mat(xArr)
    yMat = np.mat(yArr).T
    m = np.shape(xMat)[0]
    weights = np.mat(np.eye(m))
    for j in range(m):
        diffMat = testPoint - xMat[j, :]
        # Gaussian kernel: weight decays with squared distance to testPoint.
        weights[j, j] = np.exp(diffMat * diffMat.T / (-2.0 * k ** 2))
    xTx = xMat.T * (weights * xMat)
    if np.linalg.det(xTx) == 0.0:
        print('this matrix is singular, cannot do inverse')
        return
    ws = xTx.I * (xMat.T * (weights * yMat))
    return testPoint * ws
def lwlrTest(testArr, xArr, yArr, k=1.0):
    """Predict every row of testArr with locally weighted regression.

    Calls lwlr once per test point with the same training set and
    bandwidth, collecting the scalar predictions into a flat array.
    """
    numPoints = np.shape(testArr)[0]
    predictions = np.zeros(numPoints)
    for idx in range(numPoints):
        predictions[idx] = lwlr(testArr[idx], xArr, yArr, k)
    return predictions
岭回归,当特征的数目比样本的数目还多时,我们可以引入岭回归来解决问题.岭回归就是在XtX上加上一个常数乘以单位矩阵,使其非奇异从而可求逆
def ridgeRegress(xMat, yMat, lam=0.2):
    """Ridge regression: ws = (X^T X + lam * I)^{-1} X^T y.

    The lam * I term makes the normal matrix invertible even when
    features outnumber samples (X^T X singular).

    Args:
        xMat: m x n feature matrix (np.matrix).
        yMat: m x 1 target column (np.matrix).
        lam: ridge penalty; lam = 0 reduces to ordinary least squares.

    Returns:
        n x 1 coefficient matrix, or None if the penalized matrix is
        still singular (only possible when lam is 0 or degenerate).
    """
    xTx = xMat.T * xMat
    denom = xTx + np.eye(np.shape(xMat)[1]) * lam
    if np.linalg.det(denom) == 0.0:
        print('this matrix is singular, cannot do inverse')
        return
    ws = denom.I * (xMat.T * yMat)
    return ws
def ridgeTest(xArr, yArr):
    """Sweep ridge regression over a log-spaced range of lambdas.

    Features are standardized (centered, divided by variance) and the
    target is centered before fitting, as ridge penalties are scale
    sensitive.

    Args:
        xArr: m x n feature matrix (array-like).
        yArr: length-m target vector.

    Returns:
        numTest x n array; row i holds the coefficients fit with
        lam = exp(i - 10).
    """
    xMat = np.mat(xArr)
    yMat = np.mat(yArr).T
    # Center the target.
    yMean = np.mean(yMat, 0)
    yMat = yMat - yMean
    # Standardize each feature column.
    # NOTE(review): divides by variance, not std dev — matches the
    # book's convention; a zero-variance column would divide by zero.
    xMean = np.mean(xMat, 0)
    xVar = np.var(xMat, 0)
    xMat = (xMat - xMean) / xVar
    numTest = 30
    wMat = np.zeros((numTest, np.shape(xMat)[1]))
    # Loop bound now uses numTest instead of a duplicated literal 30.
    for i in range(numTest):
        ws = ridgeRegress(xMat, yMat, np.exp(i - 10))
        wMat[i, :] = ws.T
    return wMat
前向逐步回归
def rssError(yArr, yHatArr):
    """Residual sum of squares between actual and predicted values."""
    residuals = yArr - yHatArr
    return (residuals ** 2).sum()
def regularize(xMat):
    """Standardize columns: subtract the mean, divide by the variance.

    Operates on a copy; the caller's matrix is left untouched.
    """
    scaled = xMat.copy()
    colMeans = np.mean(scaled, 0)
    colVars = np.var(scaled, 0)
    return (scaled - colMeans) / colVars
def stageWise(xArr, yArr, eps=0.01, numIt=100):
    """Forward stagewise regression (greedy coordinate stepping).

    Each iteration tries moving every coefficient up or down by eps and
    keeps whichever single step most reduces the residual sum of
    squares. Data is standardized first, like ridgeTest.

    Args:
        xArr: m x n feature matrix (array-like).
        yArr: length-m target vector.
        eps: step size per coefficient update.
        numIt: number of greedy iterations.

    Returns:
        numIt x n array; row i holds the coefficient vector after
        iteration i.
    """
    xMat = np.mat(xArr)
    yMat = np.mat(yArr).T
    yMean = np.mean(yMat, 0)
    yMat = yMat - yMean
    xMat = regularize(xMat)
    m, n = np.shape(xMat)
    retMat = np.zeros((numIt, n))
    ws = np.ones((n, 1))
    wsTest = ws.copy()
    wsMax = ws.copy()
    for i in range(numIt):
        print(ws.T)
        # np.inf (np.Inf was removed in NumPy 2.0).
        lowestError = np.inf
        # Try +/- eps on every coefficient; keep the single best step.
        for j in range(n):
            for sign in [-1, 1]:
                wsTest = ws.copy()
                wsTest[j] += eps * sign
                yTest = xMat * wsTest
                rssE = rssError(yMat.A, yTest.A)
                if rssE < lowestError:
                    lowestError = rssE
                    wsMax = wsTest
        ws = wsMax.copy()
        retMat[i, :] = ws.T
    return retMat