线性回归
代码实现
# 定义线性回归模型主体 ——回归模型公式、均方损失函数、参数求偏导
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.utils import shuffle # 导入打乱数据集的函数
import matplotlib.pyplot as plt
def linear_loss(X, y, w, b):
    '''
    Linear regression forward pass, mean-squared loss and gradients.

    X: input feature matrix, shape (num_train, num_feature)
    y: label column vector
    w: weight column vector, shape (num_feature, 1)
    b: bias (scalar)
    Returns:
    y_hat: model predictions
    loss: mean squared error over the batch
    dw: first-order gradient of the loss w.r.t. w
    db: first-order gradient of the loss w.r.t. b
    '''
    num_train = X.shape[0]    # number of samples (rows)
    # BUG FIX: the feature count is the column count X.shape[1],
    # not X.shape[0] as the original had.
    num_feature = X.shape[1]
    y_hat = np.dot(X, w) + b  # linear model prediction
    loss = np.sum((y_hat - y) ** 2) / num_train
    dw = np.dot(X.T, (y_hat - y)) / num_train
    db = np.sum((y_hat - y)) / num_train
    return y_hat, loss, dw, db
# 初始化模型参数
def initialize_params(dims):
    '''
    Zero-initialize the linear model parameters.

    dims: number of input features
    Returns (w, b): a (dims, 1) zero weight vector and bias 0.
    '''
    return np.zeros((dims, 1)), 0
# 定义线性回归模型的训练过程
def linear_train(X, y, learning_rate=0.01, epochs=10000):
    '''
    Train linear regression with batch gradient descent.

    X: input feature matrix
    y: label column vector
    learning_rate: gradient-descent step size
    epochs: number of training iterations
    Returns:
    loss_his: mean-squared loss recorded at every iteration
    params: dict with the trained 'w' and 'b'
    grads: dict with the last-iteration gradients 'dw' and 'db'
    '''
    loss_his = []
    w, b = initialize_params(X.shape[1])
    # BUG FIX: the original used range(1, epochs), which performed only
    # epochs - 1 update steps; range(1, epochs + 1) runs exactly `epochs`.
    for i in range(1, epochs + 1):
        y_hat, loss, dw, db = linear_loss(X, y, w, b)
        # Gradient-descent parameter update.
        w += -learning_rate * dw
        b += -learning_rate * db
        loss_his.append(loss)
        if i % 10000 == 0:
            print('epoch %d loss %f' % (i, loss))
    # Parameters after the final update.
    params = {
        'w': w,
        'b': b
    }
    # Gradients from the final iteration.
    grads = {
        'dw': dw,
        'db': db
    }
    return loss_his, params, grads
# 回归模型的预测函数
def predict(X, params):
    '''
    Predict with a trained linear regression model.

    X: test feature matrix
    params: dict holding the trained 'w' and 'b'
    Returns y_pred: predicted values for X.
    '''
    weights = params['w']
    bias = params['b']
    # Apply the linear model to the test set.
    return np.dot(X, weights) + bias
# 回归模型R^2系数
def r2_score(y_test, y_pred):
    '''
    Coefficient of determination (R^2) for a regression model.

    y_test: ground-truth labels
    y_pred: model predictions
    '''
    # Total sum of squares around the label mean.
    ss_tot = np.sum((y_test - np.mean(y_test)) ** 2)
    # Residual sum of squares.
    ss_res = np.sum((y_test - y_pred) ** 2)
    return 1 - ss_res / ss_tot
# Load the diabetes regression data set from sklearn.
diabetes=load_diabetes()
data,target=diabetes.data,diabetes.target
X,y=shuffle(data,target,random_state=13)# shuffle the data set
# Use the first 80% of samples for training, the rest for testing.
offset=int(X.shape[0]*0.8)
X_train,y_train=X[:offset],y[:offset]
X_test,y_test=X[offset:],y[offset:]
# Reshape the training labels into a column vector.
y_train=y_train.reshape((-1,1))
# Reshape the test labels into a column vector.
y_test=y_test.reshape((-1,1))
print("X_train's shape:",X_train.shape)
print("X_test's shape:",X_test.shape)
print("y_train's shape:",y_train.shape)
print("y_test's shape:",y_test.shape)
# Train for 200000 epochs with learning rate 0.01.
loss_his,params,grads=linear_train(X_train,y_train,0.01,200000)
print(params)
'''
(x,y)表示(特征,标签),而(train,test)表示(训练集,测试集)。
西瓜的颜色、瓜蒂的形状、敲击的声音就是特征,而“好瓜”和“坏瓜”这两个判断就是标签。
抽象一点,特征是做出某个判断的证据,标签是结论。
sklearn调用
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.utils import shuffle
from sklearn import linear_model
from sklearn.metrics import mean_squared_error,r2_score
diabetes=load_diabetes()
data,target=diabetes.data,diabetes.target
X,y=shuffle(data,target,random_state=13)# shuffle the data set
# 80/20 train/test split.
offset=int(X.shape[0]*0.8)
X_train,y_train=X[:offset],y[:offset]
X_test,y_test=X[offset:],y[offset:]
# Fit sklearn's ordinary least-squares linear regression.
regr=linear_model.LinearRegression()
regr.fit(X_train,y_train)
y_pred=regr.predict(X_test)
print("Mean squared error:%.2f"%mean_squared_error(y_test,y_pred))
print('R Square score:%.2f'%r2_score(y_test,y_pred))
Logistic回归
代码实现
import numpy as np
import matplotlib.pyplot as plt
# 需要找到一个单调可微函数将真实标签与线性回归模型的预测值进行映射
# 我们需要找到一个映射函数将线性回归模型的预测值转化为0/1值
# 求解对数几率回归模型参数的方法:
# 求偏导+交叉熵损失函数(基于损失函数进行梯度优化)、极大似然法(最大化抽样样本的对数似然概率估计)
#################################################################
# 定义对数几率回归模型
# 定义辅助函数
def sigmoid(x):
    """Logistic function: map any real input into (0, 1)."""
    return 1.0 / (1.0 + np.exp(-x))
# 定义初始化函数
def initialize_params(dims):
    '''
    dims: parameter dimensionality
    Returns W (a (dims, 1) zero weight matrix) and bias b = 0.
    '''
    W = np.zeros(shape=(dims, 1))
    b = 0
    return W, b
# 定义对数几率回归模型主体
def logistic(X, y, W, b):
    '''
    Logistic regression forward pass, cross-entropy cost and gradients.

    X: input feature matrix, shape (num_train, num_feature)
    y: 0/1 label column vector
    W: weight column vector
    b: bias (scalar)
    Returns:
    a: sigmoid activations (model output)
    cost: mean cross-entropy loss (scalar)
    dW: gradient of the cost w.r.t. W
    db: gradient of the cost w.r.t. b
    '''
    # Number of training samples.
    num_train = X.shape[0]
    # Sigmoid activation of the linear model (inlined helper).
    a = 1.0 / (1.0 + np.exp(-(np.dot(X, W) + b)))
    # BUG FIX: clip the activations away from exactly 0/1 before the
    # logs so saturated predictions cannot produce log(0) = -inf/NaN.
    a_safe = np.clip(a, 1e-12, 1 - 1e-12)
    # Cross-entropy loss.
    cost = -1 / num_train * np.sum(y * np.log(a_safe) + (1 - y) * np.log(1 - a_safe))
    dW = np.dot(X.T, (a - y)) / num_train
    db = np.sum(a - y) / num_train
    # Squeeze the cost down to a scalar.
    cost = np.squeeze(cost)
    return a, cost, dW, db
#################################################################
# 定义对数几率回归模型训练过程
def logistic_train(X, y, learning_rate, epochs):
    '''
    Train logistic regression with batch gradient descent.

    X: input feature matrix
    y: 0/1 label column vector
    learning_rate: gradient-descent step size
    epochs: number of training iterations
    Returns:
    cost_list: cross-entropy cost sampled every 100 epochs
    params: dict with the trained 'W' and 'b'
    grads: dict with the last-iteration gradients 'dW' and 'db'
    '''
    W, b = initialize_params(X.shape[1])
    cost_list = []
    for i in range(epochs):
        a, cost, dW, db = logistic(X, y, W, b)
        # Gradient-descent parameter update.
        W = W - learning_rate * dW
        b = b - learning_rate * db
        # FIX: the original tested `i % 100 == 0` twice in a row;
        # one check both records and reports the cost.
        if i % 100 == 0:
            cost_list.append(cost)
            print('epoch %d cost %f' % (i, cost))
    params = {
        'W': W,
        'b': b
    }
    grads = {
        'dW': dW,
        'db': db
    }
    return cost_list, params, grads
#################################################################
# 定义预测函数——对验证集进行预测以方便后续评估模型分类的准确性
def predict(X, params):
    '''
    Predict hard 0/1 class labels on a test set.

    X: test feature matrix
    params: dict with the trained 'W' and 'b'
    Returns y_pred: column vector of 0.0/1.0 labels.
    '''
    # Model probability output (sigmoid inlined for self-containment).
    probs = 1.0 / (1.0 + np.exp(-(np.dot(X, params['W']) + params['b'])))
    # IDIOM FIX: vectorized thresholding at 0.5 replaces the original
    # per-element Python loop; values > 0.5 map to 1.0, else 0.0.
    return (probs > 0.5).astype(float)
# 生成模拟二分类数据集,并进行可视化
# Generate a simulated binary classification data set and visualize it.
from sklearn.datasets._samples_generator import make_classification
X,labels=make_classification(
    n_samples=100,
    n_features=2,
    n_redundant=0,
    n_informative=2,
    random_state=1,
    n_clusters_per_class=2)
# Fixed seed so the added noise is reproducible.
rng=np.random.RandomState(2)
# Add uniformly distributed noise to the generated features.
X+=2*rng.uniform(size=X.shape)
unique_labels=set(labels)
# One color per label class.
colors=plt.cm.Spectral(np.linspace(0,1,len(unique_labels)))
# Scatter-plot the simulated data, one color per class.
for k,col in zip(unique_labels,colors):
    x_k=X[labels==k]
    plt.plot(x_k[:,0],x_k[:,1],'o',
             markerfacecolor=col,
             markeredgecolor='k',
             markersize=14)
plt.title('Simulated binary data set')
plt.show()
#################################################################
# 按照9:1划分训练集和测试集
# Split into training and test sets with a 9:1 ratio.
offset=int(X.shape[0]*0.9)
X_train,y_train=X[:offset],labels[:offset]
X_test,y_test=X[offset:],labels[offset:]
# Reshape the labels into column vectors.
y_train=y_train.reshape((-1,1))
y_test=y_test.reshape((-1,1))
print('X_train=',X_train.shape)
print('X_test=',X_test.shape)
print('y_train=',y_train.shape)
print('y_test=',y_test.shape)
# Train, then predict and evaluate with the learned parameters.
cost_list,params,grads=logistic_train(X_train,y_train,0.01,1000)
print(params)
y_pred=predict(X_test,params)
print(y_pred)
# Use sklearn's classification report to measure test-set performance.
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))
# 绘制对数几率回归分类决策边界
# Plot the logistic regression decision boundary over the training data.
def plot_logistic(X_train,y_train,params):
    '''
    X_train: training feature matrix (2 features)
    y_train: 0/1 training labels
    params: dict with the trained 'W' and 'b'
    Output: scatter plot of both classes with the decision boundary line
    '''
    n=X_train.shape[0]
    xcord1=[]
    ycord1=[]
    xcord2=[]
    ycord2=[]
    # Collect the coordinates of the two classes into separate lists.
    for i in range(n):
        if y_train[i]==1:
            xcord1.append(X_train[i][0])
            ycord1.append(X_train[i][1])
        else:
            xcord2.append(X_train[i][0])
            ycord2.append(X_train[i][1])
    fig=plt.figure()
    ax=fig.add_subplot(111)
    ax.scatter(xcord1,ycord1,s=32,c='red')
    ax.scatter(xcord2,ycord2,s=32,c='green')
    # x-axis range for drawing the boundary line.
    x=np.arange(-1.5,3,0.1)
    # Decision boundary: W0*x + W1*y + b = 0, solved for y.
    y=(-params['b']-params['W'][0]*x)/params['W'][1]
    ax.plot(x,y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()
plot_logistic(X_train,y_train,params)
成果显示
sklearn调用
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.datasets._samples_generator import make_classification

# Generate a simulated binary classification data set.
X, labels = make_classification(
    n_samples=100,
    n_features=2,
    n_redundant=0,
    n_informative=2,
    random_state=1,
    n_clusters_per_class=2)
# Fix the random seed so the noise is reproducible.
rng = np.random.RandomState(2)
# Add uniformly distributed noise to the generated features.
X += 2 * rng.uniform(size=X.shape)
# Split into training and test sets (9:1).
offset = int(X.shape[0] * 0.9)
X_train, y_train = X[:offset], labels[:offset]
X_test, y_test = X[offset:], labels[offset:]
y_train = y_train.reshape((-1, 1))
y_test = y_test.reshape((-1, 1))
# BUG FIX: sklearn expects a 1-D target; fitting with the (n, 1) column
# vector raises a DataConversionWarning, so ravel it for fit().
clf = LogisticRegression(random_state=0).fit(X_train, y_train.ravel())
y_pred = clf.predict(X_test)
print(y_pred)
LASSO回归
代码实现
# LASSO回归模型
# 采用1-范数进行正则化
# 适用于特征数量大于样本量的情形
# LASSO会把大量特征系数压缩成0
# 矩阵的0-范数为矩阵中非0元素的个数,1-范数为所有元素的绝对值之和
# 2-范数是矩阵中各元素平方和再开平方根的结果
# 正则化相当于对目标参数施加了一个惩罚项,使得模型不能过于复杂。
# 在优化过程中,正则化项的存在可以使不重要的特征系数逐渐为0,从而保留关键特征
# 坐标下降法——一种替代LASSO回归寻优的方法,在当前坐标轴上搜索损失函数的最小值
import numpy as np
import sklearn
#########################################################################
# 定义符号函数
# 作为L1损失的梯度计算辅助函数,还需要向量化
# Sign helper used by the L1 gradient: returns -1, 0 or +1.
def sign(x):
    """Return the sign of x as -1, 0 or 1."""
    if x > 0:
        return 1
    if x < 0:
        return -1
    return 0

# Element-wise version for numpy arrays.
vec_sign = np.vectorize(sign)
# 定义LASSO回归损失函数
def l1_loss(X, y, w, b, alpha):
    '''
    LASSO (L1-regularized) regression loss and gradients.

    X: input feature matrix
    y: label column vector
    w: weight column vector
    b: bias (scalar)
    alpha: L1 regularization strength
    Returns:
    y_hat: linear model predictions
    loss: MSE plus the L1 penalty on the weights
    dw: gradient of the loss w.r.t. w
    db: gradient of the loss w.r.t. b
    '''
    num_train = X.shape[0]
    y_hat = np.dot(X, w) + b
    loss = np.sum((y_hat - y) ** 2) / num_train + np.sum(alpha * abs(w))
    # IDIOM FIX: np.sign is the built-in, element-wise equivalent of the
    # hand-rolled vectorized sign helper (identical -1/0/+1 values).
    dw = np.dot(X.T, (y_hat - y)) / num_train + alpha * np.sign(w)
    db = np.sum((y_hat - y)) / num_train
    return y_hat, loss, dw, db
def initialize_params(dims):
    """Return zero-initialized weights of shape (dims, 1) and bias 0."""
    return np.zeros((dims, 1)), 0
# 定义LASSO回归模型的训练过程
def lasso_train(X, y, learning_rate=0.01, epochs=1000):
    '''
    Train LASSO regression (L1-regularized) with gradient descent.

    X: input feature matrix
    y: label column vector
    learning_rate: gradient-descent step size
    epochs: number of training iterations
    Returns (loss, loss_his, params, grads): final L1 loss, per-iteration
    loss history, trained parameters and last-iteration gradients.
    '''
    loss_his = []
    w, b = initialize_params(X.shape[1])
    # BUG FIX: range(1, epochs) ran only epochs - 1 iterations;
    # range(1, epochs + 1) performs exactly `epochs` update steps.
    for i in range(1, epochs + 1):
        y_hat, loss, dw, db = l1_loss(X, y, w, b, 0.1)
        # Gradient descent moves the parameters against the gradient,
        # shrinking the loss toward its minimum.
        w += -learning_rate * dw
        b += -learning_rate * db
        # Record this iteration's loss.
        loss_his.append(loss)
        if i % 50 == 0:
            print('epoch %d loss %f' % (i, loss))
    params = {
        'w': w,
        'b': b
    }
    grads = {
        'dw': dw,
        'db': db
    }
    return loss, loss_his, params, grads
# 导入数据集
data=np.genfromtxt(r'3.回归模型拓展\example.dat',delimiter=',')
x=data[:,0:100]
y=data[:,100].reshape(-1,1)
# 加一项
X=np.column_stack((np.ones((x.shape[0],1)),x))
# 划分训练集和测试集
X_train,y_train=X[:70],y[:70]
X_test,y_test=X[70:],y[70:]
print(X_train.shape,y_train.shape,X_test.shape,y_test.shape)
loss,loss_list,params,grads=lasso_train(X_train,y_train,0.01,3000)
print(loss)
输出均方损失值
sklearn调用
from sklearn import linear_model
import numpy as np
sk_LASSO=linear_model.Lasso(alpha=0.1)
data=np.genfromtxt(r'3.回归模型拓展\example.dat',delimiter=',')
x=data[:,0:100]
y=data[:,100].reshape(-1,1)
# Prepend a column of ones (bias/intercept term).
X=np.column_stack((np.ones((x.shape[0],1)),x))
# Split into training and test sets.
X_train,y_train=X[:70],y[:70]
X_test,y_test=X[70:],y[70:]
# Fit on the training set.
# NOTE(review): the fit/report lines below were commented out in the
# original source (likely the data file is unavailable); kept as-is.
# sk_LASSO.fit(X_train,y_train)
# print("sklearn LASSO intercept:",sk_LASSO.intercept_)
# print("\nsklearn LASSO coefficients:\n",sk_LASSO.coef_)
# print("\nsklearn LASSO number of iterations:",sk_LASSO.n_iter_)
根据输出结果可以看到,LASSO回归模型使大量特征参数被压缩为0。
岭回归
代码实现
# 岭回归
# 使用2-范数进行正则化,最小化参数矩阵的每个元素,使其无限接近0,但是不等于0
# L2正则化中,正则化系数取值越大,系数矩阵中的每个元素都会变小,线性计算结果也会变小
# Ridge回归跟LASSO回归的很相似
import numpy as np
def initialize_params(dims):
    """Zero-initialize the weight column vector and scalar bias."""
    weights = np.zeros((dims, 1))
    return weights, 0
# Ridge的损失函数
def l2_loss(X, y, w, b, alpha):
    '''
    Ridge (L2-regularized) regression loss and gradients.

    X: input feature matrix
    y: label column vector
    w: weight column vector
    b: bias (scalar)
    alpha: L2 regularization strength
    Returns (y_hat, loss, dw, db).
    '''
    m = X.shape[0]
    y_hat = X.dot(w) + b
    residual = y_hat - y
    # Mean squared error plus the squared-norm penalty on the weights.
    loss = np.sum(residual ** 2) / m + alpha * np.sum(np.square(w))
    dw = X.T.dot(residual) / m + 2 * alpha * w
    db = np.sum(residual) / m
    return y_hat, loss, dw, db
# Ridge的训练过程
def ridge_train(X, y, learning_rate=0.01, epochs=1000):
    '''
    Train Ridge regression (L2-regularized) with gradient descent.

    X: input feature matrix
    y: label column vector
    learning_rate: gradient-descent step size
    epochs: number of training iterations
    Returns (loss, loss_his, params, grads): final loss, per-iteration
    loss history, trained parameters and last-iteration gradients.
    '''
    loss_his = []
    w, b = initialize_params(X.shape[1])
    # BUG FIX: range(1, epochs) ran only epochs - 1 iterations;
    # range(1, epochs + 1) performs exactly `epochs` update steps.
    for i in range(1, epochs + 1):
        y_hat, loss, dw, db = l2_loss(X, y, w, b, alpha=0.1)
        # Gradient-descent parameter update.
        w += -learning_rate * dw
        b += -learning_rate * db
        loss_his.append(loss)
        if i % 50 == 0:
            print('epoch %d loss %f' % (i, loss))
    params = {
        'w': w,
        'b': b
    }
    grads = {
        'dw': dw,
        'db': db
    }
    return loss, loss_his, params, grads
# Load the example data set (100 features + 1 target column).
data=np.genfromtxt(r'3.回归模型拓展\example.dat',delimiter=',')
x=data[:,0:100]
y=data[:,100].reshape(-1,1)
# Prepend a column of ones (bias/intercept term).
X=np.column_stack((np.ones((x.shape[0],1)),x))
# Split into training and test sets.
X_train,y_train=X[:70],y[:70]
X_test,y_test=X[70:],y[70:]
loss,loss_list,params,grads=ridge_train(X_train,y_train,0.01,3000)
sklearn调用
from sklearn import linear_model
import numpy as np
sk_Ridge=linear_model.Ridge(alpha=0.1)
data=np.genfromtxt(r'3.回归模型拓展\example.dat',delimiter=',')
x=data[:,0:100]
y=data[:,100].reshape(-1,1)
# Prepend a column of ones (bias/intercept term).
X=np.column_stack((np.ones((x.shape[0],1)),x))
# Split into training and test sets.
X_train,y_train=X[:70],y[:70]
X_test,y_test=X[70:],y[70:]
# Fit sklearn's Ridge model on the training set and report the results.
sk_Ridge.fit(X_train,y_train)
print("sklearn Ridge intercept:",sk_Ridge.intercept_)
print("\nsklearn Ridge coefficients:\n",sk_Ridge.coef_)
print("\nsklearn Ridge number of iterations:",sk_Ridge.n_iter_)
根据输出结果可以看到,Ridge回归模型使大量特征的回归参数被压缩到非常接近0,但是不会直接等于0。