Theoretical Derivation
Typically, for a set of feature data and their labels $(x_1, y_1), (x_2, y_2), \dots, (x_n, y_n)$, when the feature $x_i$ is used to predict $y_i$, the convention is: if $y_i$ is continuous, the operation (or technique) is called regression; if $y_i$ is discrete, it is usually called classification.
$X \in R^{n \times m}: \{x_1, x_2, \dots, x_n\}$
$Y \in R^{n \times 1}: \{y_1, y_2, \dots, y_n\}$
Here $X$ is the feature data, $n$ samples with $m$ attributes each, and $Y$ is the corresponding vector of labels.
We need to find $w \in R^{m \times 1}$ and $b \in R^{1 \times 1}$ such that $Xw + b = Y$.
Folding $b$ into $w$ gives $Y = Xw^*$, where the augmented $X = \{1, x_1, x_2, \dots, x_n\} \in R^{n \times (m+1)}$ (each sample row is prefixed with a constant 1) and $w^* \in R^{(m+1) \times 1}$.
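As a concrete sketch of these shapes (illustrative values only, not from the original), the augmentation simply prepends a column of ones to the feature matrix:

import numpy as np

n, m = 5, 3                              # n samples, each with m features
X = np.random.rand(n, m)                 # feature matrix, shape (n, m)
Y = np.random.rand(n, 1)                 # label column vector, shape (n, 1)
X_aug = np.hstack((np.ones((n, 1)), X))  # augmented X, shape (n, m+1)
print(X_aug.shape)                       # (5, 4)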
If the augmented $X$ were square and invertible, then $w^* = X^{-1}Y$ would give $w^*$ directly; in general $X \in R^{n \times (m+1)}$ is not square, so the Moore-Penrose pseudo-inverse is used instead: $w^* = X^{+}Y$.
This holds because $y_1 = b + x_{11}w_1 + x_{12}w_2 + \dots + x_{1m}w_m = 1 \cdot w_0 + x_{11}w_1 + x_{12}w_2 + \dots + x_{1m}w_m$ (writing $b$ as $w_0$; the sum runs over the $m$ features).
In matrix form: $y_1 = [1, x_{11}, x_{12}, \dots, x_{1m}] \cdot [w_0, w_1, w_2, \dots, w_m]^T$
Stacking all $n$ samples therefore gives:
$Y = Xw^*$
with $Y = [y_1, y_2, \dots, y_n]^T$, $X = [1, x_1, x_2, \dots, x_n]$ (each row a sample prefixed with 1), and $w^* = [b, w]^T$.
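Since the augmented $X$ is generally not square, the standard way to obtain $w^*$ is to minimize the squared error. The short derivation below is a standard least-squares result (not spelled out in the original) and is exactly what the pseudo-inverse used in the code computes:

$$\min_{w^*} \|Xw^* - Y\|_2^2,\qquad
\nabla_{w^*}\|Xw^* - Y\|_2^2 = 2X^T(Xw^* - Y) = 0
\;\Longrightarrow\;
w^* = (X^TX)^{-1}X^TY = X^{+}Y$$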
Inverse (Pseudo-Inverse) Method
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

def plotData(X, y):
    plt.scatter(X[..., 0], y)
    plt.show()

# closed-form solution (set the derivative to zero)
def matrixSolver(X, y):
    X = np.hstack((np.ones((len(X), 1)), X))  # prepend a column of ones for the bias term
    # w = np.dot(np.linalg.inv(X), y)  # plain inverse (only valid for a square, invertible X)
    w = np.dot(np.linalg.pinv(X), y)   # Moore-Penrose pseudo-inverse
    return w

if __name__ == "__main__":
    X, y = make_regression(200, 1, bias=2, noise=4)
    # split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=40)
    # plotData(X, y)
    # closed-form solution
    w = matrixSolver(X_train, y_train)
    # print(w)
    # plot the fit
    plt.scatter(X_test, y_test, s=30, c='red', marker='o', alpha=0.5, label='C1')
    plt.plot(X_test, np.dot(np.hstack((np.ones((len(X_test), 1)), X_test)), w), c="blue")
    plt.show()
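As a quick sanity check (a minimal sketch, not part of the original, assuming the matrixSolver function above is already defined), the coefficients from the pseudo-inverse can be compared against numpy's built-in least-squares solver:

import numpy as np
from sklearn.datasets import make_regression

X, y = make_regression(200, 1, bias=2, noise=4, random_state=0)
w = matrixSolver(X, y)                        # [bias, slope] via the pseudo-inverse
X_aug = np.hstack((np.ones((len(X), 1)), X))  # same augmented design matrix
w_lstsq, *_ = np.linalg.lstsq(X_aug, y, rcond=None)
print(w, w_lstsq)                             # the two solutions should agree closely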
Gradient Descent
For a detailed derivation, refer to:
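In brief (a standard derivation, matching the update in the __fit method of LinearRegressionSelf2 below), for the mean-squared-error loss the gradients and the gradient-descent update with learning rate $\eta$ are:

$$L(w, b) = \frac{1}{2n}\sum_{i=1}^{n}\left(x_i w + b - y_i\right)^2,\qquad
\frac{\partial L}{\partial w} = \frac{1}{n}X^T(Xw + b - Y),\qquad
\frac{\partial L}{\partial b} = \frac{1}{n}\sum_{i=1}^{n}\left(x_i w + b - y_i\right)$$

$$w \leftarrow w - \eta\,\frac{\partial L}{\partial w},\qquad
b \leftarrow b - \eta\,\frac{\partial L}{\partial b}$$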
Example
"""
Author:wucng
Time: 20200114
Summary: 线性回归对boston数据预测
源代码: https://github.com/wucng/MLAndDL
参考:https://cuijiahua.com/blog/2017/11/ml_3_decision_tree_2.html
"""
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import numpy as np
import scipy,pickle,os,time
import pandas as pd
# 1. Load the dataset (and preprocess it)
def loadData(dataPath: str) -> tuple:
    with open(dataPath, "r") as fp:
        lines = fp.readlines()
    dataset = []
    i = 0
    while i < len(lines):
        line = lines[i]
        i += 1
        if line[0].isdigit():  # line starts with a digit
            data1 = list(map(float, line.strip().split(" ")))
            line = lines[i]
            i += 1
            data2 = list(map(float, line.strip().split(" ")))
            data1.extend(data2)
            dataset.append(data1)
        else:
            continue
    dataset = np.asarray(dataset)
    # split into features and labels
    X, y = dataset[..., :-1], dataset[..., -1]
    # min-max normalization
    X = (X - np.min(X, axis=0)) / (np.max(X, axis=0) - np.min(X, axis=0))
    # the sklearn equivalent (requires sklearn.preprocessing.MinMaxScaler)
    # X = MinMaxScaler().fit_transform(X)
    return (X, y)
class LinearRegressionSelf(object):
    """Inverse (pseudo-inverse) method"""
    def __init__(self, save_file="model.npy"):
        self.save_file = save_file

    def __fit(self, X, y):
        # closed-form solution
        X = np.hstack((np.ones((len(X), 1)), X))
        # w = np.dot(np.linalg.inv(X), y)  # plain inverse
        w = np.dot(np.linalg.pinv(X), y)   # pseudo-inverse
        return w

    def fit(self, X, y, batch_size=32, epochs=20):
        if not os.path.exists(self.save_file):
            length = len(y)
            m = len(y) // batch_size
            last_w = []
            for epoch in range(epochs):
                w = []
                # shuffle the data
                index = np.arange(0, length)
                np.random.seed(epoch)
                np.random.shuffle(index)
                new_X = X[index]
                new_y = y[index]
                for i in range(m):
                    start = i * batch_size
                    end = min((i + 1) * batch_size, length)
                    w.append(self.__fit(new_X[start:end], new_y[start:end]))
                last_w.append(np.mean(w, 0))
            # save the parameters
            np.save(self.save_file, np.mean(last_w, 0))
        self.w = np.load(self.save_file)

    def predict(self, X):
        X = np.hstack((np.ones((len(X), 1)), X))
        return np.dot(X, self.w)

    def error(self, y_true, y_pred):
        # https://www.jianshu.com/p/3a98f33113ac
        # R^2 score: the larger the better; the best value is 1, and it can be negative when the fit is very poor
        return 1 - np.sum((y_pred - y_true) ** 2) / np.sum((y_true - np.mean(y_true)) ** 2)
class LinearRegressionSelf2(object):
    """Gradient descent"""
    def __init__(self, save_file="model.ckpt"):
        self.save_file = save_file

    def __fit(self, X, y, w, b, lr=1e-3):
        # one gradient-descent step on a mini-batch
        diff = np.dot(X, w) + b - y
        w -= lr * (1 / len(y)) * (np.dot(np.transpose(X), diff))
        b -= lr * np.mean(diff)
        return w, b

    def fit(self, X, y, batch_size=32, epochs=50000, lr=5e-4):
        if not os.path.exists(self.save_file):
            length = len(y)
            m = len(y) // batch_size
            w = np.random.random((len(X[0]), 1))  # random initial value
            b = np.random.random((1, 1))          # random initial value
            for epoch in range(epochs):
                # shuffle the data
                index = np.arange(0, length)
                np.random.seed(epoch)
                np.random.shuffle(index)
                new_X = X[index]
                new_y = y[index]
                for i in range(m):
                    start = i * batch_size
                    end = min((i + 1) * batch_size, length)
                    w, b = self.__fit(new_X[start:end], new_y[start:end], w, b, lr)
                # print(w, b)
            # save the parameters
            pickle.dump({"w": w, "b": b}, open(self.save_file, "wb"))
        data = pickle.load(open(self.save_file, "rb"))
        self.w = data["w"]
        self.b = data["b"]

    def predict(self, X):
        return np.dot(X, self.w) + self.b

    def error(self, y_true, y_pred):
        # https://www.jianshu.com/p/3a98f33113ac
        # R^2 score: the larger the better; the best value is 1, and it can be negative when the fit is very poor
        return 1 - np.sum((y_pred - y_true) ** 2) / np.sum((y_true - np.mean(y_true)) ** 2)
if __name__ == "__main__":
    dataPath = "../../dataset/boston.txt"
    X, y = loadData(dataPath)
    if len(y.shape) == 1: y = y[..., None]
    # split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=40)

    start = time.time()
    clf = LinearRegressionSelf()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("error:", clf.error(y_test, y_pred))
    error = np.sum((y_pred - y_test) ** 2) / len(y_test)
    print("cost time:%.6f(s) error:%.3f" % (time.time() - start, error))
    """
    error: 0.7131946712017807
    cost time:0.000985(s) error:32.785
    """

    # the sklearn LinearRegression method
    start = time.time()
    clf = LinearRegression()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    error = 1 - np.sum((y_pred - y_test) ** 2) / np.sum((y_test - np.mean(y_test)) ** 2)
    print("error:", error)
    error = np.sum((y_pred - y_test) ** 2) / len(y_test)
    print("cost time:%.6f(s) error:%.3f" % (time.time() - start, error))
    """
    error: 0.7215519718844166
    cost time:0.001995(s) error:31.830
    """