1.最原始的linear regression
标准回归函数和文本数据导入函数
from numpy import *
def loadDataSet(fileName): #general function to parse tab -delimited floats
numFeat = len(open(fileName).readline().split('\t')) - 1 #get number of fields '\t'是tab,每一行的特征个数
dataMat = []; labelMat = [] #数据矩阵,标签矩阵
fr = open(fileName)
for line in fr.readlines(): #fr.readlines()表示读取每一行
lineArr =[] #该行的列表,注意这里保存的可是数字了
curLine = line.strip().split('\t') #strip()去掉前后的空格,split()把一个字符串分割成字符串数组
for i in range(numFeat): #数字序列,内置函数range() range(10) [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
lineArr.append(float(curLine[i])) #
dataMat.append(lineArr)
labelMat.append(float(curLine[-1])) #-1表示倒数第一个
return dataMat,labelMat #返回数据矩阵和标签矩阵(目标值矩阵)
def standRegres(xArr,yArr): #用来计算最佳拟合直线
xMat = mat(xArr); yMat = mat(yArr).T #搞成矩阵形式 matrix.T transpose:返回矩阵的转置矩阵
xTx = xMat.T*xMat
if linalg.det(xTx) == 0.0: # numpy.linalg模块包含线性代数的函数,计算行列式值是否为0
print "This matrix is singular, cannot do inverse" #奇异矩阵
return
ws = xTx.I * (xMat.T*yMat) #matrix.I inverse:返回矩阵的逆矩阵,就这一步就求出来了,该算法叫做普通最小二乘法(ordinary least squares)
return ws
测试
import regression
import matplotlib.pyplot as plt
from numpy import *
xArr, yArr = regression.loadDataSet('ex0.txt')
# print xArr[0:2] #取不到2
# print yArr
#接下来来看拟合的效果
ws = regression.standRegres(xArr, yArr)
# print ws #变量ws存放的就是回归系数
xMat = mat(xArr)
yMat = mat(yArr)
yHat = xMat*ws #计算预测值
#接下来绘制数据集散点图和最佳拟合直线图
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(xMat[:,1].flatten().A[0],yMat.T[:,0].flatten().A[0])
# flatten()方法能将matrix的元素变成一维的,
# .A能使matrix变成array
xCopy = xMat.copy()
# print xCopy
xCopy.sort(0) #按照升序排序,主要是根据第二个元素
# print xCopy
yHat = xCopy *ws
ax.plot(xCopy[:,1],yHat,'red')
plt.show()
结果:
2. locally weighted linear regression
必要函数
#以下函数,对于x空间中的任意一个testPoint,输出其对应的预测值yHat
def lwlr(testPoint,xArr,yArr,k=1.0): # 参数k控制衰减速度 1.0为默认值; testPoint为输入,函数返回根据局部加权线性回归得出的预测值
xMat = mat(xArr); yMat = mat(yArr).T
m = shape(xMat)[0] #[0]指示的是行数,也就是样本点个数
weights = mat(eye((m))) #eye(m)主对角元素为1----对应于(m,m),其余为0
for j in range(m): #next 2 lines create weights matrix
diffMat = testPoint - xMat[j,:]
weights[j,j] = exp(diffMat*diffMat.T/(-2.0*k**2))
xTx = xMat.T * (weights * xMat)
if linalg.det(xTx) == 0.0:
print "This matrix is singular, cannot do inverse"
return
ws = xTx.I * (xMat.T * (weights * yMat))
return testPoint * ws
def lwlrTest(testArr,xArr,yArr,k=1.0): #loops over all the data points and applies lwlr to each one, k的默认值为1
m = shape(testArr)[0]
yHat = zeros(m) #元素全为0的向量
for i in range(m):
yHat[i] = lwlr(testArr[i],xArr,yArr,k)
return yHat
def lwlrTestPlot(xArr,yArr,k=1.0): #same thing as lwlrTest except it sorts X first
yHat = zeros(shape(yArr)) #easier for plotting
xCopy = mat(xArr)
xCopy.sort(0)
for i in range(shape(xArr)[0]):
yHat[i] = lwlr(xCopy[i],xArr,yArr,k)
return yHat,xCopy
测试
import regression
import matplotlib.pyplot as plt
from numpy import *
xArr, yArr = regression.loadDataSet('ex0.txt')
# print yArr[0]
# print regression.lwlr(xArr[0],xArr,yArr,0.001)
yHat, xSort = regression.lwlrTestPlot(xArr,yArr,1) #此处的这个k值得选取会直接影响到拟合的效果
# print xSort
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(xSort[:,1],yHat)
xMat = mat(xArr)
yMat = mat(yArr)
ax.scatter(xMat[:,1].flatten().A[0],yMat.T[:,0].flatten().A[0], s=2, c='red')
plt.show()
k=1 欠拟合
k=0.01
k=0.003 过拟合
3. 预测鲍鱼的年龄
#预测鲍鱼年龄
import regression
from numpy import *
abX, abY = regression.loadDataSet('abalone.txt')
yHat01=regression.lwlrTest(abX[0:99],abX[0:99],abY[0:99],0.1) #过拟合
yHat1=regression.lwlrTest(abX[0:99],abX[0:99],abY[0:99],1)
yHat10=regression.lwlrTest(abX[0:99],abX[0:99],abY[0:99],10)
print regression.rssError(abY[0:99], yHat01.T)
print regression.rssError(abY[0:99], yHat1.T)
print regression.rssError(abY[0:99], yHat10.T)
yHat01New=regression.lwlrTest(abX[100:199],abX[0:99],abY[0:99],0.1) #过拟合
yHat1New=regression.lwlrTest(abX[100:199],abX[0:99],abY[0:99],1)
yHat10New=regression.lwlrTest(abX[100:199],abX[0:99],abY[0:99],10)
print regression.rssError(abY[100:199], yHat01New.T)
print regression.rssError(abY[100:199], yHat1New.T)
print regression.rssError(abY[100:199], yHat10New.T)
#接下里看看普通的线性回归
ws = regression.standRegres(abX[0:99], abY[0:99])
yHat =mat(abX[100:199])*ws
print regression.rssError(abY[100:199],yHat.T.A)
结果:
56.8843765879
429.89056187
549.118170883
58720.7256135
573.526144189
517.571190538
518.636315325
4. 缩减系数来“理解”数据
4.1 岭回归
#岭回归---在鲍鱼数据集上的效果
import regression
from numpy import *
import matplotlib.pyplot as plt
abX, abY = regression.loadDataSet('abalone.txt')
ridgeWeights = regression.ridgeTest(abX, abY)
print ridgeWeights
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(ridgeWeights)
plt.show()
4.2 前向逐步回归
def regularize(xMat):#regularize by columns
inMat = xMat.copy()
inMeans = mean(inMat,0) #calc mean then subtract it off
inVar = var(inMat,0) #calc variance of Xi then divide by it
inMat = (inMat - inMeans)/inVar
return inMat
def stageWise(xArr,yArr,eps=0.01,numIt=100): #前向逐步线性回归
xMat = mat(xArr); yMat=mat(yArr).T
yMean = mean(yMat,0)
yMat = yMat - yMean #can also regularize ys but will get smaller coef
xMat = regularize(xMat)
m,n=shape(xMat)
returnMat = zeros((numIt,n)) #testing code remove
ws = zeros((n,1)); wsTest = ws.copy(); wsMax = ws.copy()
for i in range(numIt): #numIt表示迭代次数
print ws.T
lowestError = inf; #inf表示无穷
for j in range(n):
for sign in [-1,1]: #分别显示增加和减少该特征系数对结果的影响
wsTest = ws.copy()
wsTest[j] += eps*sign
yTest = xMat*wsTest
rssE = rssError(yMat.A,yTest.A)
if rssE < lowestError:
lowestError = rssE
wsMax = wsTest
ws = wsMax.copy()
returnMat[i,:]=ws.T
return returnMat
测试
#测试前向逐步线性回归的效果
import regression
from numpy import *
import matplotlib.pyplot as plt
xArr, yArr = regression.loadDataSet('abalone.txt')
print regression.stageWise(xArr,yArr,0.001,5000)
#将其结果与最小二乘法进行比较
xMat = mat(xArr)
yMat = mat(yArr).T
xMat = regression.regularize(xMat)
yM = mean(yMat,0)
yMat = yMat - yM
weights=regression.standRegres(xMat, yMat.T)
print weights.T