机器学习2:多元线性回归~多项式回归 ~标准方程法 ~特征缩放与交叉验证 ~过拟合正则化
多元线性回归
原理
算法实现
手工实现
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# Load the delivery dataset; per the axis labels used later, the columns
# are [miles, number of deliveries, time].
# NOTE(review): assumes Delivery.csv is in the current working directory.
data=np.genfromtxt(r"Delivery.csv",delimiter=",")
print(data)
# First two columns are the features, last column is the target.
x_data=data[:,0:2]
y_data=data[:,2]
print(x_data)
print(y_data)
# Learning rate for gradient descent
lr=0.0001
# Slope parameters, one per feature, initialised to zero
k1=0
k2=0
# Initial intercept
b=0
# Maximum number of gradient-descent iterations
epochs=1000
#最小二乘法
def compute_error(b, k1, k2, x_data, y_data):
    """Return half the mean squared error of the plane k1*x0 + k2*x1 + b.

    Args:
        b: intercept.
        k1, k2: slope coefficients for the first and second feature column.
        x_data: array of shape (n, 2) holding the feature columns.
        y_data: array of shape (n,) holding the targets.

    Returns:
        sum((y - pred)^2) / n / 2.  The extra 1/2 follows the convention
        that cancels the 2 in the gradient of the squared error.
    """
    # Vectorised over all samples instead of a Python-level loop;
    # numerically this equals the original per-sample accumulation.
    pred = k1 * x_data[:, 0] + k2 * x_data[:, 1] + b
    return np.mean((y_data - pred) ** 2) / 2
def gradient_descent_runner(x_data, y_data, b, k1, k2, lr, epochs):
    """Fit a plane y ≈ k1*x0 + k2*x1 + b by batch gradient descent.

    Args:
        x_data: (n, 2) feature array.
        y_data: (n,) target array.
        b, k1, k2: starting values for intercept and the two slopes.
        lr: learning rate.
        epochs: number of full passes over the data.

    Returns:
        Tuple (b, k1, k2) after `epochs` updates.
    """
    # Total sample count as a float, used for 1/m gradient averaging.
    m = float(len(x_data))
    for _ in range(epochs):
        b_grad = 0
        k1_grad = 0
        k2_grad = 0
        # Accumulate the averaged gradient over every sample.
        for j in range(0, len(x_data)):
            # Residual of the current model on sample j (computed once,
            # not three times as in the original).
            diff = ((k1 * x_data[j, 0]) + (k2 * x_data[j, 1]) + b) - y_data[j]
            b_grad += (1 / m) * diff
            k1_grad += (1 / m) * x_data[j, 0] * diff
            k2_grad += (1 / m) * x_data[j, 1] * diff
        # One simultaneous update of all three parameters.
        b = b - (lr * b_grad)
        k1 = k1 - (lr * k1_grad)
        k2 = k2 - (lr * k2_grad)
    return b, k1, k2
# Report the initial cost, run gradient descent, then report the fitted model.
print("Starting b = {0}, k1 = {1}, k2 = {2}, error = {3}".format(b, k1, k2 , compute_error(b, k1, k2, x_data, y_data)))
print("Running...")
b, k1 ,k2 = gradient_descent_runner(x_data, y_data, b, k1 , k2, lr, epochs)
print("After {0} iterations b = {1}, k1 = {2}, k2 = {3} ,error = {4}".format(epochs, b, k1,k2 ,compute_error(b, k1, k2 ,x_data, y_data)))
Starting b = 0, k1 = 0, k2 = 0, error = 23.639999999999997
Running…
After 1000 iterations b = 0.006971416196678632, k1 = 0.08021042690771771, k2 = 0.07611036240566814 ,error = 0.3865635716109059
ax = plt.figure().add_subplot(111, projection = '3d')
ax.scatter(x_data[:,0], x_data[:,1], y_data, c = 'r', marker = 'o', s = 100) # samples as red circles ('o' marker)
x0 = x_data[:,0]
x1 = x_data[:,1]
# Build the mesh grid for the fitted plane
x0, x1 = np.meshgrid(x0, x1)
z = b + x0*k1 + x1*k2
# Draw the 3D surface
ax.plot_surface(x0,x1,z,rstride=1,cstride=1,cmap=plt.get_cmap('rainbow')) # cmap sets the colour map; rstride/cstride control patch size
# Axis labels
ax.set_xlabel('Miles')
ax.set_ylabel('Num of Deliveries')
ax.set_zlabel('Time')
# Show the figure
plt.show()
sklearn实现
import numpy as np
from numpy import genfromtxt
from sklearn import linear_model
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# Read the data
data = genfromtxt(r"Delivery.csv",delimiter=',')
print(data)
# Split into features (all but last column) and target (last column)
x_data = data[:,:-1]
y_data = data[:,-1]
print(x_data)
print(y_data)
# Create and fit the model (sklearn solves OLS directly)
model = linear_model.LinearRegression()
model.fit(x_data, y_data)
# Coefficients
print("coefficients:",model.coef_)
# Intercept
print("intercept:",model.intercept_)
# Test prediction for one sample: 102 miles, 4 deliveries
x_test = [[102,4]]
predict = model.predict(x_test)
print("predict:",predict)
ax = plt.figure().add_subplot(111, projection = '3d')
ax.scatter(x_data[:,0], x_data[:,1], y_data, c = 'r', marker = 'o', s = 100) # samples as red circles ('o' marker)
x0 = x_data[:,0]
x1 = x_data[:,1]
# Build the mesh grid for the fitted plane
x0, x1 = np.meshgrid(x0, x1)
z = model.intercept_ + x0*model.coef_[0] + x1*model.coef_[1]
# Draw the 3D surface
ax.plot_surface(x0, x1, z)
# Axis labels
ax.set_xlabel('Miles')
ax.set_ylabel('Num of Deliveries')
ax.set_zlabel('Time')
# Show the figure
plt.show()
多项式回归
原理
sklearn实现
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
# Load the data; rows start at 1 to skip the header row,
# column 1 is the position level, column 2 the salary.
data = np.genfromtxt("job.csv", delimiter=",")
x_data = data[1:,1]
y_data = data[1:,2]
plt.scatter(x_data,y_data)
plt.show()
print(x_data)
[ 1. 2. 3. 4. 5. 6. 7. 8. 9. 10.]
# Reshape the 1-D arrays into (n, 1) column vectors, the 2-D
# shape sklearn estimators expect.
x_data = x_data[:,np.newaxis]
y_data = y_data[:,np.newaxis]
print(x_data)
array([[ 1.],
[ 2.],
[ 3.],
[ 4.],
[ 5.],
[ 6.],
[ 7.],
[ 8.],
[ 9.],
[10.]])
# Create and fit a plain linear model (for comparison with the polynomial fit)
model = LinearRegression()
model.fit(x_data, y_data)
# Plot: blue dots for the data, red line for the linear fit
plt.plot(x_data, y_data, 'b.')
plt.plot(x_data, model.predict(x_data), 'r')
plt.show()
# Define the polynomial feature expansion; degree controls how many
# polynomial features are generated (degree=10 will overfit 10 points).
poly_reg = PolynomialFeatures(degree=10)
# Expand x into [1, x, x^2, ..., x^10]
x_poly = poly_reg.fit_transform(x_data)
# Define the regression model
lin_reg = LinearRegression()
# Train on the expanded features
lin_reg.fit(x_poly, y_data)
print(x_poly)
array([[1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
1.00000000e+00, 1.00000000e+00, 1.00000000e+00],
[1.00000000e+00, 2.00000000e+00, 4.00000000e+00, 8.00000000e+00,
1.60000000e+01, 3.20000000e+01, 6.40000000e+01, 1.28000000e+02,
2.56000000e+02, 5.12000000e+02, 1.02400000e+03],
[1.00000000e+00, 3.00000000e+00, 9.00000000e+00, 2.70000000e+01,
8.10000000e+01, 2.43000000e+02, 7.29000000e+02, 2.18700000e+03,
6.56100000e+03, 1.96830000e+04, 5.90490000e+04],
[1.00000000e+00, 4.00000000e+00, 1.60000000e+01, 6.40000000e+01,
2.56000000e+02, 1.02400000e+03, 4.09600000e+03, 1.63840000e+04,
6.55360000e+04, 2.62144000e+05, 1.04857600e+06],
[1.00000000e+00, 5.00000000e+00, 2.50000000e+01, 1.25000000e+02,
6.25000000e+02, 3.12500000e+03, 1.56250000e+04, 7.81250000e+04,
3.90625000e+05, 1.95312500e+06, 9.76562500e+06],
[1.00000000e+00, 6.00000000e+00, 3.60000000e+01, 2.16000000e+02,
1.29600000e+03, 7.77600000e+03, 4.66560000e+04, 2.79936000e+05,
1.67961600e+06, 1.00776960e+07, 6.04661760e+07],
[1.00000000e+00, 7.00000000e+00, 4.90000000e+01, 3.43000000e+02,
2.40100000e+03, 1.68070000e+04, 1.17649000e+05, 8.23543000e+05,
5.76480100e+06, 4.03536070e+07, 2.82475249e+08],
[1.00000000e+00, 8.00000000e+00, 6.40000000e+01, 5.12000000e+02,
4.09600000e+03, 3.27680000e+04, 2.62144000e+05, 2.09715200e+06,
1.67772160e+07, 1.34217728e+08, 1.07374182e+09],
[1.00000000e+00, 9.00000000e+00, 8.10000000e+01, 7.29000000e+02,
6.56100000e+03, 5.90490000e+04, 5.31441000e+05, 4.78296900e+06,
4.30467210e+07, 3.87420489e+08, 3.48678440e+09],
[1.00000000e+00, 1.00000000e+01, 1.00000000e+02, 1.00000000e+03,
1.00000000e+04, 1.00000000e+05, 1.00000000e+06, 1.00000000e+07,
1.00000000e+08, 1.00000000e+09, 1.00000000e+10]])
# Plot the polynomial fit evaluated at the training points only
plt.plot(x_data, y_data, 'b.')
plt.plot(x_data, lin_reg.predict(poly_reg.fit_transform(x_data)), c='r')
plt.title('Truth or Bluff (Polynomial Regression)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()
# Plot again on a dense grid (100 points in [1, 10]) so the
# high-degree curve is drawn smoothly between the samples
plt.plot(x_data, y_data, 'b.')
x_test = np.linspace(1,10,100)
x_test = x_test[:,np.newaxis]
plt.plot(x_test, lin_reg.predict(poly_reg.fit_transform(x_test)), c='r')
plt.title('Truth or Bluff (Polynomial Regression)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()
标准方程法
原理
手工实现
sklearn中的线性回归模型就是利用标准方程法
import numpy as np
from numpy import genfromtxt
import matplotlib.pyplot as plt
## Load the data: column 0 is x, column 1 is y, kept as (n, 1) columns
data=np.genfromtxt("data.csv",delimiter=",")
x_data=data[:,0,np.newaxis]
y_data=data[:,1,np.newaxis]
plt.scatter(x_data,y_data)
plt.show()
print(np.mat(x_data).shape)
print(np.mat(y_data).shape)
# Prepend a bias column of ones to the samples
# NOTE(review): hard-codes 100 samples; np.ones((len(x_data), 1)) would generalize.
X_data=np.concatenate((np.ones((100,1)),x_data),axis=1) # concatenate along columns
print(X_data.shape)
(100, 1)
(100, 1)
(100, 2)
# Peek at the first three augmented rows (bias column of ones + feature)
print(X_data[:3])
[[ 1. 32.50234527]
[ 1. 53.42680403]
[ 1. 61.53035803]]
# Normal-equation method
def weights(xArr, yArr):
    """Solve ordinary least squares via the normal equation.

    ws = (X^T X)^{-1} X^T y

    Args:
        xArr: (n, d) design matrix (should already include a bias column).
        yArr: (n, 1) target column.

    Returns:
        np.matrix of shape (d, 1) with the fitted weights, or None when
        X^T X is singular (zero determinant) and cannot be inverted.
    """
    # Promote to np.matrix so * is matrix multiplication and .I inverts.
    xMat = np.mat(xArr)
    yMat = np.mat(yArr)
    xTx = xMat.T * xMat
    # A zero determinant means X^T X has no inverse; bail out gracefully.
    if np.linalg.det(xTx) == 0.0:
        print("This matrix cannot do inverse")
        return
    ws = xTx.I * xMat.T * yMat
    return ws
# Solve for [intercept, slope] and display them
ws=weights(X_data,y_data)
print(ws)
[[7.99102098]
[1.32243102]]
# Plot the fitted line through two endpoints (x = 20 and x = 80)
x_test=np.array([[20],[80]])
print(x_test)
y_test=ws[0]+ x_test * ws[1]
plt.plot(x_data,y_data,'b.') # 'b.' means blue dots
plt.plot(x_test,y_test,'r')
plt.show()
特征缩放与交叉验证
多个特征数据范围相差太大
数据归一化就是把数据的取值范围处理为0-1或者-1-1之间。
任意数据转化为0-1之间:
newValue = (oldValue-min)/(max-min)
(1,3,5,7,9)
(1-1)/(9-1)=0
(3-1)/(9-1)=1/4
(5-1)/(9-1)=1/2
(7-1)/(9-1)=3/4
(9-1)/(9-1)=1
任意数据转化为-1-1之间:
newValue = ((oldValue-min)/(max-min)-0.5)*2
均值标准化
x为特征数据,u为数据的平均值,s为数据的方差
newValue = (oldValue-u)/s
(1,3,5,7,9)
u = (1+3+5+7+9)/5=5
s = ((1-5)² + (3-5)² + (5-5)² + (7-5)² + (9-5)²)/5 = (16+4+0+4+16)/5 = 8
(1-5)/8=-1/2
(3-5)/8=-1/4
(5-5)/8=0
(7-5)/8=1/4
(9-5)/8=1/2
交叉验证法
过拟合和正则化
过拟合
防止过拟合的方法
1.减少特征
2.增加数据量
3.正则化
(Regularized)
正则化