一,标准回归函数
from numpy import *
def loadDataSet(fileName):
    """Parse a tab-delimited text file of floats.

    Every column except the last is treated as a feature; the last
    column is the target value.

    Args:
        fileName: path to a tab-delimited text file of floats.

    Returns:
        (dataMat, labelMat): list of feature rows (lists of floats)
        and the parallel list of target floats.
    """
    dataMat = []
    labelMat = []
    # 'with' guarantees the handle is closed; the original opened the
    # file twice and never closed either handle.
    with open(fileName) as fr:
        firstLine = fr.readline()
        if not firstLine:
            return dataMat, labelMat  # empty file -> empty result
        numFeat = len(firstLine.split('\t')) - 1  # number of feature columns
        fr.seek(0)  # rewind so the first record is also parsed
        for line in fr:
            curLine = line.strip().split('\t')
            if curLine == ['']:
                continue  # skip blank lines (e.g. trailing newline)
            dataMat.append([float(curLine[i]) for i in range(numFeat)])
            labelMat.append(float(curLine[-1]))
    return dataMat, labelMat
def standRegres(xArr, yArr):
    """Ordinary least squares: solve ws = (X^T X)^-1 X^T y.

    Args:
        xArr: sequence of feature rows (first column typically 1.0 for
              the intercept).
        yArr: sequence of target values.

    Returns:
        Column matrix of regression weights, or None when X^T X is
        singular and cannot be inverted.
    """
    xMat = mat(xArr)
    yMat = mat(yArr).T
    xTx = xMat.T * xMat
    # A zero determinant means X^T X has no inverse (e.g. collinear
    # features or fewer samples than features).
    if linalg.det(xTx) == 0.0:
        # Parenthesized print works in both Python 2 and 3; the bare
        # print statement was Python-2-only.
        print("This matrix is singular, cannot do inverse")
        return
    ws = xTx.I * (xMat.T * yMat)
    return ws
>>> import regression
>>> from numpy import *
>>> xArr,yArr=regression.loadDataSet('ex0.txt')
>>> xArr[0:2]
[[1.0, 0.067732], [1.0, 0.42781]]
>>> ws = regression.standRegres(xArr,yArr)
>>> ws
matrix([[ 3.00774324],
[ 1.69532264]])
>>> xMat=mat(xArr)
>>> yMat=mat(yArr)
>>> import matplotlib.pyplot as plt
>>> fig = plt.figure()
>>> ax = fig.add_subplot(111)
>>> ax.scatter(xMat[:,1].flatten().A[0],yMat.T[:,0].flatten().A[0])
<matplotlib.collections.PathCollection object at 0x7f4905380fd0>
>>> xCopy=xMat.copy()
>>> xCopy.sort(0)
>>> yHat=xCopy*ws
>>> ax.plot(xCopy[:,1],yHat)
[<matplotlib.lines.Line2D object at 0x7f4905391950>]
>>> plt.show()
二,局部加权线性回归
线性回归的一个问题是有可能出现欠拟合现象，因为它求的是具有最小均方误差的无偏估计。所以有些方法允许在估计中引入一些偏差，从而降低预测的均方误差。
def lwlr(testPoint, xArr, yArr, k=1.0):
    """Locally weighted linear regression prediction for one point.

    Each training sample is weighted with a Gaussian kernel centered on
    testPoint, then a weighted least-squares fit is solved and used to
    predict the value at testPoint.

    Args:
        testPoint: feature row to predict for.
        xArr: training feature rows.
        yArr: training target values.
        k: kernel bandwidth; smaller k -> more local (risk of
           overfitting), larger k -> approaches ordinary least squares.

    Returns:
        1x1 matrix with the prediction, or None when the weighted
        X^T W X matrix is singular.
    """
    xMat = mat(xArr)
    yMat = mat(yArr).T
    m = shape(xMat)[0]
    weights = mat(eye((m)))
    for j in range(m):
        diffMat = testPoint - xMat[j, :]
        # Gaussian kernel: weight decays with squared distance.
        weights[j, j] = exp(diffMat * diffMat.T / (-2.0 * k ** 2))
    xTx = xMat.T * (weights * xMat)
    if linalg.det(xTx) == 0.0:
        # Parenthesized print works in both Python 2 and 3.
        print("This matrix is singular, cannot do inverse")
        return
    ws = xTx.I * (xMat.T * (weights * yMat))
    return testPoint * ws
def lwlrTest(testArr, xArr, yArr, k=1.0):
    """Run lwlr() on every row of testArr.

    Returns a 1-D array of predictions, one per test point, all fitted
    against the same training set (xArr, yArr) with bandwidth k.
    """
    numPoints = shape(testArr)[0]
    estimates = zeros(numPoints)
    for idx in range(numPoints):
        estimates[idx] = lwlr(testArr[idx], xArr, yArr, k)
    return estimates
>>> reload(regression)
<module 'regression' from 'regression.py'>
>>> xArr,yArr=regression.loadDataSet('ex0.txt')
>>> yArr[0]
3.176513
>>> regression.lwlr(xArr[0],xArr,yArr,1.0)
matrix([[ 3.12204471]])
>>> regression.lwlr(xArr[0],xArr,yArr,0.001)
matrix([[ 3.20175729]])
>>> yHat=regression.lwlrTest(xArr,xArr,yArr,0.003)
>>> xMat=mat(xArr)
>>> srtInd=xMat[:,1].argsort(0)
>>> xSort=xMat[srtInd][:,0,:]
>>> import matplotlib.pyplot as plt
>>> fig = plt.figure()
>>> ax = fig.add_subplot(111)
>>> ax.plot(xSort[:,1],yHat[srtInd])
[<matplotlib.lines.Line2D object at 0x7f4905a37390>]
>>> ax.scatter(xMat[:,1].flatten().A[0], mat(yArr).T.flatten().A[0], s=2, c='red')
<matplotlib.collections.PathCollection object at 0x7f4905a37a50>
>>> plt.show()
三,示例:预测鲍鱼的年龄
def rssError(yArr, yHatArr):
    """Return the residual sum of squares between two numeric arrays."""
    residuals = yArr - yHatArr
    return (residuals ** 2).sum()
四,缩减系数来"理解"数据
如果数据的特征比样本点还多应该怎么办?是否还可以使用线性回归和之前的方法来做预测?答案是否定的,这是因为在计算(X.T*X)^-1的时候会出错
为了解决这个问题,统计学家引入了岭回归(ridge regression)
def ridgeRegres(xMat, yMat, lam=0.2):
    """Ridge regression: solve ws = (X^T X + lam*I)^-1 X^T y.

    The lam*I term makes the matrix invertible even when there are more
    features than samples, and shrinks the coefficients.

    Args:
        xMat: feature matrix (should be standardized by the caller).
        yMat: column matrix of (centered) targets.
        lam: ridge penalty; 0.0 reduces to ordinary least squares.

    Returns:
        Column matrix of weights, or None when the penalized matrix is
        still singular (only possible with lam == 0).
    """
    xTx = xMat.T * xMat
    denom = xTx + eye(shape(xMat)[1]) * lam
    if linalg.det(denom) == 0.0:
        # Parenthesized print works in both Python 2 and 3.
        print("This matrix is singular, cannot do inverse")
        return
    ws = denom.I * (xMat.T * yMat)
    return ws
def ridgeTest(xArr, yArr):
    """Trace ridge-regression weights over 30 penalty values.

    Targets are centered and features are standardized (centered and
    divided by their variance), then ridgeRegres is run with lambda
    sweeping exp(-10) .. exp(19). Returns a 30 x n matrix whose row i
    holds the weights for lambda = exp(i - 10).
    """
    xMat = mat(xArr)
    yMat = mat(yArr).T
    # Center the targets so no intercept term is needed.
    yMat = yMat - mean(yMat, 0)
    # Standardize features: subtract column means, divide by variance.
    xMat = (xMat - mean(xMat, 0)) / var(xMat, 0)
    numLambdas = 30
    weightTrace = zeros((numLambdas, shape(xMat)[1]))
    for i in range(numLambdas):
        weightTrace[i, :] = ridgeRegres(xMat, yMat, exp(i - 10)).T
    return weightTrace
>>> reload(regression)
<module 'regression' from 'regression.py'>
>>> abX,abY=regression.loadDataSet('abalone.txt')
>>> ridgeWeights=regression.ridgeTest(abX,abY)
>>> import matplotlib.pyplot as plt
>>> fig = plt.figure()
>>> ax = fig.add_subplot(111)
>>> ax.plot(ridgeWeights)
[<matplotlib.lines.Line2D object at 0x7f4905a8e410>, <matplotlib.lines.Line2D object at 0x7f4905a8e750>, <matplotlib.lines.Line2D object at 0x7f4905a8ecd0>, <matplotlib.lines.Line2D object at 0x7f4905a8ee50>, <matplotlib.lines.Line2D object at 0x7f490534d4d0>, <matplotlib.lines.Line2D object at 0x7f490534d6d0>, <matplotlib.lines.Line2D object at 0x7f490534de90>, <matplotlib.lines.Line2D object at 0x7f490534dad0>]
>>> plt.show()
五,lasso回归
def regularize(xMat):
    """Return a standardized copy of xMat (input is not mutated).

    Each column has its mean subtracted and is then divided by its
    variance, matching the standardization used by ridgeTest.
    """
    colMeans = mean(xMat, 0)
    colVars = var(xMat, 0)
    # Subtraction allocates a fresh matrix, so the caller's xMat is
    # left untouched.
    return (xMat - colMeans) / colVars
def stageWise(xArr, yArr, eps=0.01, numIt=100):
    """Forward stagewise linear regression (greedy lasso-like fit).

    At each of numIt iterations, every weight is tentatively nudged by
    +/-eps and the change yielding the lowest residual sum of squares
    is kept. Prints the weight vector at each iteration.

    Args:
        xArr: feature rows (standardized internally via regularize).
        yArr: target values (centered internally).
        eps: step size applied to one weight per iteration.
        numIt: number of iterations.

    Returns:
        numIt x n matrix whose row i is the weight vector after
        iteration i. (The original left the accumulator and the return
        commented out, so the function always returned None; restoring
        it is backward compatible since the old value was unusable.)
    """
    xMat = mat(xArr)
    yMat = mat(yArr).T
    # Center y; features are standardized, so no intercept is needed.
    yMat = yMat - mean(yMat, 0)
    xMat = regularize(xMat)
    m, n = shape(xMat)
    returnMat = zeros((numIt, n))
    ws = zeros((n, 1))
    wsMax = ws.copy()
    for i in range(numIt):
        # Parenthesized print works in both Python 2 and 3.
        print(ws.T)
        lowestError = inf
        for j in range(n):
            for sign in [-1, 1]:
                wsTest = ws.copy()
                wsTest[j] += eps * sign
                yTest = xMat * wsTest
                rssE = rssError(yMat.A, yTest.A)
                if rssE < lowestError:
                    lowestError = rssE
                    wsMax = wsTest
        ws = wsMax.copy()
        returnMat[i, :] = ws.T
    return returnMat
>>> reload(regression)
>>> xArr,yArr=regression.loadDataSet('abalone.txt')
>>> regression.stageWise(xArr,yArr,0.01,200)
>>> regression.stageWise(xArr,yArr,0.001,5000)
>>> xMat=mat(xArr)
>>> yMat=mat(yArr).T
>>> xMat=regression.regularize(xMat)
>>> yM = mean(yMat,0)
>>> yMat=yMat-yM
>>> weights=regression.standRegres(xMat,yMat.T)
>>> weights.T
matrix([[ 0.0430442 , -0.02274163, 0.13214087, 0.02075182, 2.22403814,
-0.99895312, -0.11725427, 0.16622915]])