线性回归原理比较简单:一维特征时的方程我们在高中阶段就学习过了,多维特征的线性回归只是在其基础上的扩展。寻找合适参数的过程可以使用梯度下降的方法来进行,但对于线性回归而言,其实存在解析解(闭式解):$w = (X^T X)^{-1} X^T y$。
其相关代码如下:
import numpy as np
import matplotlib.pyplot as plt
def loaddata(filename=r'E:\学习资料\AI+CS\01 个人\《机器学习实战》-Peter Harriton\MLiA_SourceCode\machinelearninginaction\Ch08\ex0.txt'):
    """Load a whitespace-separated two-feature dataset.

    Each line holds two feature columns followed by the target value.

    Parameters
    ----------
    filename : str
        Path to the data file. Defaults to the original hard-coded path
        so existing ``loaddata()`` calls keep working.

    Returns
    -------
    (data, label) : (list[list[float]], list[list[float]])
        data  -- m rows of [col0, col1] feature values.
        label -- m rows of [y], one single-element list per sample.
    """
    data = []
    label = []
    # 'with' guarantees the handle is closed even if parsing raises
    # (the original left the file open on a parse error).
    with open(filename) as file:
        for line in file:
            parts = line.strip().split()
            data.append([float(parts[0]), float(parts[1])])
            label.append([float(parts[-1])])
    return data, label
#标准线性回归:用全部样本通过正规方程直接求闭式解(并非梯度下降)
def calw(data, label):
    """Solve ordinary least squares via the normal equation.

        w = (X^T X)^{-1} X^T y

    Parameters
    ----------
    data  : array-like, shape (m, n) -- feature rows.
    label : array-like, shape (m, 1) -- target column.

    Returns
    -------
    np.matrix of shape (n, 1) with the fitted weights, or None when
    X^T X is singular (no unique least-squares solution).
    """
    x = np.mat(data)
    y = np.mat(label)
    xTx = np.dot(x.T, x)
    # A zero determinant means xTx cannot be inverted.
    if np.linalg.det(xTx) == 0:
        print('the result is wrong!')
        return None
    w1 = np.dot(xTx.I, x.T)
    w = np.dot(w1, y)
    return w
def plotdata(data, label, w):
    """Scatter the raw points and overlay the fitted regression line.

    Parameters
    ----------
    data  : array-like, shape (m, 2) -- feature rows; column 1 is the x axis.
    label : array-like, shape (m, 1) -- target values (y axis).
    w     : np.matrix, shape (2, 1)  -- weights from calw.
    """
    data = np.array(data)
    label = np.array(label)
    x = data[:, 1]
    fig = plt.figure()
    ax1 = fig.add_subplot(1, 1, 1)
    ax1.scatter(x, label, s=5, color='blue', marker='o')
    yhat = np.dot(data, w)
    # Bug fix: sort by x before drawing; plotting unsorted points makes
    # matplotlib connect them in file order and draw a zigzag line.
    order = x.argsort()
    ax1.plot(x[order], np.asarray(yhat).ravel()[order], color='red')
    plt.title('data')
    plt.xlabel('X')
    plt.show()
#局部加权线性回归(Locally Weighted Linear Regression)
def lwlr(testpoint, data, label, k=1.0):
    """Locally weighted linear regression prediction for one query point.

    Each training sample i gets a Gaussian kernel weight
        w_i = exp(-||testpoint - x_i||^2 / (2 k^2))
    so nearby samples dominate the local fit; k is the bandwidth.

    Parameters
    ----------
    testpoint : array-like, shape (1, n) -- query feature row.
    data      : array-like, shape (m, n) -- training feature rows.
    label     : array-like, shape (m, 1) -- training target column.
    k         : float -- kernel bandwidth; large k approaches plain OLS.

    Returns
    -------
    1x1 np.matrix prediction, or None when the weighted normal matrix
    is singular.
    """
    x = np.mat(data)
    label = np.mat(label)
    # Robustness: ensure matrix semantics for '*' even when the caller
    # passes a plain list/ndarray instead of a matrix row.
    testpoint = np.mat(testpoint)
    m = x.shape[0]
    weight = np.eye(m)
    for i in range(m):
        error = testpoint - x[i, :]
        # Extract the scalar explicitly; assigning a 1x1 matrix into a
        # float slot relies on deprecated NumPy size-1 coercion.
        weight[i, i] = np.exp((error * error.T)[0, 0] / (-2 * k ** 2))
    xTx = x.T * weight * x
    if np.linalg.det(xTx) == 0:
        print('it is wrong')
        return None
    ws = xTx.I * x.T * weight * label
    return testpoint * ws
def lwlrtest(testarr, data, label, k=1.0):
    """Predict every row of testarr with locally weighted linear regression.

    Parameters
    ----------
    testarr : array-like, shape (p, n) -- query feature rows.
    data    : array-like, shape (m, n) -- training feature rows.
    label   : array-like -- training targets, one per training row.
    k       : float -- kernel bandwidth forwarded to lwlr.

    Returns
    -------
    np.ndarray of shape (p,) with one prediction per query row.
    """
    testmat = np.mat(testarr)
    # Bug fix: the loop must run over the *test* rows; the original used
    # the training-set length, which only worked because every caller
    # passed test and training sets of equal size.
    p = testmat.shape[0]
    yhat = np.zeros(p)
    for i in range(p):
        yhat[i] = lwlr(testmat[i], data, label, k)[0, 0]
    return yhat
def plotlwlr(data, label, k=1.0):
    """Compare LWLR fits at three bandwidths (1.0, 0.01, 0.003) in stacked panels.

    NOTE(review): the k argument is accepted but never used -- the three
    bandwidths below are hard-coded; confirm whether k should drive one panel.
    """
    data = np.mat(data)
    label = np.mat(label)
    fig, axes = plt.subplots(3, 1, sharex=False, sharey=False, figsize=(10, 6))
    xs = data[:, 1].flatten().A[0]
    ys = label.flatten().A[0]
    # Same raw scatter in every panel; only the red fit line differs.
    for panel in axes:
        panel.scatter(xs, ys, c='blue', s=5)
    # One full prediction pass per bandwidth; smaller k hugs the data.
    fits = [lwlrtest(data, data, label, k=bw) for bw in (1.0, 0.01, 0.003)]
    order = data[:, 1].argsort(axis=0)
    sorted_x = data[order][:, 0, :]
    for panel, fit in zip(axes, fits):
        panel.plot(sorted_x[:, 1], fit[order], c='red')
    plt.show()
# Script entry: load the sample data, show the three LWLR fits, then run
# one full prediction pass over the training set.
data, label = loaddata()
plotlwlr(data, label, k=1.0)
yhat = lwlrtest(data, data, label, k=1.0)
yhat = np.mat(yhat)
# The original ended with a bare `yhat.shape` expression -- a no-op
# outside a notebook; print it so the script reports the same value.
print(yhat.shape)
鲍鱼年龄预测:
#示例:鲍鱼年龄预测—数据读取
def loaddata_0(filename):
    """Load a whitespace-separated dataset with an arbitrary feature count.

    Every column except the last is a feature; the last column is the label.

    Parameters
    ----------
    filename : str -- path to the data file.

    Returns
    -------
    (data, label) : (list[list[float]], list[float])
        data  -- m rows of feature values.
        label -- m target values as a flat list.
    """
    data = []
    label = []
    # Bug fix: the original opened the file a second time just to count
    # columns on the first line and never closed that handle. Slicing each
    # parsed line gives the same result with a single, safely-closed file.
    with open(filename) as f:
        for row in f:
            parts = row.strip().split()
            data.append([float(v) for v in parts[:-1]])
            label.append(float(parts[-1]))
    return data, label
def lwlr(testpoint, data, label, k=1.0):
    """Locally weighted linear regression for one query point (flat labels).

    Variant used for the abalone data: label is a flat list of floats, so
    it is transposed into a column after conversion to a matrix.

    Parameters
    ----------
    testpoint : array-like, shape (1, n) -- query feature row.
    data      : array-like, shape (m, n) -- training feature rows.
    label     : array-like, length m -- flat list of training targets.
    k         : float -- Gaussian kernel bandwidth.

    Returns
    -------
    1x1 np.matrix prediction, or None when the weighted normal matrix
    is singular.
    """
    x = np.mat(data)
    label = np.mat(label).T  # flat list -> (1, m) matrix -> (m, 1) column
    # Robustness: ensure matrix semantics for '*' for any input type.
    testpoint = np.mat(testpoint)
    m = x.shape[0]
    weight = np.eye(m)
    for i in range(m):
        error = testpoint - x[i, :]
        # Extract the scalar explicitly; assigning a 1x1 matrix into a
        # float slot relies on deprecated NumPy size-1 coercion.
        weight[i, i] = np.exp((error * error.T)[0, 0] / (-2 * k ** 2))
    xTx = x.T * weight * x
    if np.linalg.det(xTx) == 0:
        print('it is wrong')
        return None
    ws = xTx.I * x.T * weight * label
    return testpoint * ws
def lwlrtest(testarr, data, label, k=1.0):
    """Predict every row of testarr via lwlr (flat-label variant).

    Parameters
    ----------
    testarr : array-like, shape (p, n) -- query feature rows.
    data    : array-like, shape (m, n) -- training feature rows.
    label   : array-like, length m -- flat list of training targets.
    k       : float -- kernel bandwidth forwarded to lwlr.

    Returns
    -------
    np.ndarray of shape (p,) with one prediction per query row.
    """
    testmat = np.mat(testarr)
    # Bug fix: iterate over the *test* rows, not the training-set length;
    # the original only worked because callers passed equal-sized slices.
    p = testmat.shape[0]
    yhat = np.zeros(p)
    for i in range(p):
        yhat[i] = lwlr(testmat[i], data, label, k)[0, 0]
    return yhat
def calw_0(data, label):
    """Ordinary least squares via the normal equation (flat-label variant).

    label is a flat list of floats and is transposed into a column.
    Returns the (n, 1) weight matrix, or None when X^T X is singular.
    """
    x = np.mat(data)
    y = np.mat(label).T
    xTx = x.T * x
    # det == 0 means the normal matrix has no inverse.
    if np.linalg.det(xTx) == 0:
        print('the result is wrong!')
        return
    return xTx.I * x.T * y
def error(yhat, label):
    """Sum of squared differences between predictions and targets."""
    diff = np.array(yhat) - np.array(label)
    return (diff ** 2).sum()
# Abalone-age experiment: compare LWLR error at several bandwidths on the
# training slice vs. a held-out slice, then against plain linear regression.
data, label = loaddata_0(r'E:\学习资料\AI+CS\01 个人\《机器学习实战》-Peter Harriton\MLiA_SourceCode\machinelearninginaction\Ch08\abalone.txt')
train_x, train_y = data[0:99], label[0:99]
test_x, test_y = data[100:199], label[100:199]
print('训练集与测试集相同时,查看各误差结果:')
yhat_0 = lwlrtest(train_x, train_x, train_y, k=1.0)
yhat_1 = lwlrtest(train_x, train_x, train_y, k=0.1)
yhat_2 = lwlrtest(train_x, train_x, train_y, k=10)
print('当k=1时,误差为:%f' % error(yhat_0, train_y))
print('当k=0.1时,误差为:%f' % error(yhat_1, train_y))
print('当k=10时,误差为:%f' % error(yhat_2, train_y))
print('')
print('训练集与测试集不同的情况下,查看结果:')
yhat_0 = lwlrtest(test_x, train_x, train_y, k=1.0)
yhat_1 = lwlrtest(test_x, train_x, train_y, k=0.1)
yhat_2 = lwlrtest(test_x, train_x, train_y, k=10)
print('当k=1时,误差为:%f' % error(yhat_0, test_y))
print('当k=0.1时,误差为:%f' % error(yhat_1, test_y))
print('当k=10时,误差为:%f' % error(yhat_2, test_y))
print('')
print('比较简单线性回归和加权线性回归,k = 1.0时的误差大小:')
print('加权线性回归,k=1时,误差为:%f' % error(yhat_0, test_y))
w = calw_0(train_x, train_y)
yhat = test_x * w
print('简单线性回归,误差为:%f' % error(yhat.flatten().A[0], test_y))
岭回归:
import numpy as np
import matplotlib.pyplot as plt
#示例:鲍鱼年龄预测—数据读取
def loaddata_0(filename):
    """Load a whitespace-separated dataset with an arbitrary feature count.

    Every column except the last is a feature; the last column is the label.

    Parameters
    ----------
    filename : str -- path to the data file.

    Returns
    -------
    (data, label) : (list[list[float]], list[float])
        data  -- m rows of feature values.
        label -- m target values as a flat list.
    """
    data = []
    label = []
    # Bug fix: the original opened the file a second time just to count
    # columns on the first line and never closed that handle. Slicing each
    # parsed line gives the same result with a single, safely-closed file.
    with open(filename) as f:
        for row in f:
            parts = row.strip().split()
            data.append([float(v) for v in parts[:-1]])
            label.append(float(parts[-1]))
    return data, label
# Load the abalone dataset for the ridge-regression sweep (machine-specific path).
data,label = loaddata_0(r'E:\学习资料\AI+CS\01 个人\《机器学习实战》-Peter Harriton\MLiA_SourceCode\machinelearninginaction\Ch08\abalone.txt')
def ridgeRegres(data, label, lam):
    """Ridge regression: w = (X^T X + lam*I)^{-1} X^T y.

    Parameters
    ----------
    data  : array-like, shape (m, n) -- feature rows.
    label : array-like, shape (m, 1) -- target column.
    lam   : float -- L2 regularization strength.

    Returns
    -------
    np.matrix of shape (n, 1), or None when the penalized normal matrix
    is still singular.
    """
    x = np.mat(data)
    y = np.mat(label)
    n_features = x.shape[1]
    # Add lam on the diagonal before inverting.
    penalized = x.T * x + lam * np.mat(np.eye(n_features))
    if np.linalg.det(penalized) == 0:
        print('该矩阵为奇异矩阵,不能计算逆矩阵')
        return
    return penalized.I * x.T * y
def normdata(data, label, num=30):
    """Standardize the data and sweep ridge regression over a lambda grid.

    Features are centered and divided by their variance (the original
    code's convention -- note: variance, not standard deviation); labels
    are centered. Ridge weights are computed for lambda = exp(i - 10)
    with i = 0 .. num-1.

    Parameters
    ----------
    data  : array-like, shape (m, n) -- feature rows.
    label : array-like, length m    -- flat list of targets.
    num   : int -- number of lambda values to evaluate (default 30,
            matching the original hard-coded sweep length).

    Returns
    -------
    np.ndarray of shape (num, n): one row of ridge weights per lambda.
    """
    x = np.mat(data)     # shape (m, n)
    y = np.mat(label).T  # shape (m, 1) -- the original comment said (n, 1), which was wrong
    n = x.shape[1]
    x_mean = np.mean(x, axis=0)
    y_mean = np.mean(y, axis=0)
    y_new = y - y_mean
    x_var = np.var(x, axis=0)
    # NOTE(review): a constant feature column makes x_var zero and this
    # divides by zero; confirm the input has no constant column before
    # reusing on other datasets.
    x_new = (x - x_mean) / x_var
    wmat = np.zeros((num, n))
    for i in range(num):
        w_lam = ridgeRegres(x_new, y_new, np.exp(i - 10))
        wmat[i, :] = w_lam.T
    return wmat
# Plot every ridge coefficient's trajectory across the lambda sweep.
wmat = normdata(data, label)
fig = plt.figure()
axis = fig.add_subplot(1, 1, 1)
axis.plot(wmat)
plt.show()