import math

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme()
# Render the minus sign correctly on plots
plt.rcParams["axes.unicode_minus"] = False
def plot_feature_y(X, X_label, Y):
    """
    desc: scatter plots of each feature against the target value
    """
    m, n = X.shape
    axs = []
    # Set up the canvas
    fig = plt.figure(figsize=(14, 14), dpi=100)
    plt.subplots_adjust(bottom=0, right=0.8, top=1, hspace=0.5)
    # number of columns in the subplot grid
    cols = 3
    for i in range(n):
        ax = fig.add_subplot(math.ceil(n / cols), cols, i + 1)
        if i == 0:
            ax.set_ylabel('target y')
        ax.set_xlabel('x')
        ax.set_title(X_label[i])
        # scatter plot of feature i against the target
        ax.scatter(X[:, i], Y)
        axs.append(ax)
def plot_cost(cost):
    """
    desc: plot the cost value per iteration
    """
    fig, ax = plt.subplots()
    ax.set_title("cost per iteration")
    ax.set_xlabel("iteration")
    ax.set_ylabel("cost")
    ax.plot(cost)
def plot_corr(data_set):
    """
    desc: plot the pairwise correlations between the variables
    """
    # compute the correlation coefficients between the variables
    corr = data_set.corr()
    f, ax = plt.subplots(figsize=(9, 6))
    ax.set_title("correlation coefficients between variables")
    sns.heatmap(corr, annot=True, fmt=".2f", linewidths=.5, cmap="YlGn", ax=ax)
def Z_score_normalization(X):
    """
    desc: Z-score normalization
    formula: x* = (x - mu) / sigma
    parameters:
        X      np.array (m, n)  raw data
    return:
        X_nor  np.array (m, n)  normalized data
        Mu     np.array (n,)    per-feature mean
        Sigma  np.array (n,)    per-feature standard deviation
    """
    # per-feature mean and standard deviation of the samples
    Mu = np.mean(X, axis=0)
    Sigma = np.std(X, axis=0)
    X_nor = (X - Mu) / Sigma
    return X_nor, Mu, Sigma
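# New data must be normalized with the *training* Mu and Sigma, not with its
# own statistics, or the learned W and b no longer apply. A minimal sketch
# (the helper name `Z_score_apply` is an illustrative addition, not part of
# the original code):
def Z_score_apply(X_new, Mu, Sigma):
    """Apply a previously fitted Z-score transform to new data."""
    return (X_new - Mu) / Sigma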
# Regularized regression model
def data_dispose(data_set):
    """
    desc: data preprocessing with pandas; returns numpy objects
    parameters:
        data_set   pandas DataFrame  dataset
    return:
        X_dispose  np.array (m, n)  processed features
        Y_dispose  np.array (m, 1)  processed target values
        X_labels   list of length n feature labels
    """
    data_set = pd.DataFrame(data_set)
    # 1. handle missing values: replace (or drop)
    data_set.fillna(0, inplace=True)
    # extract the features, feature labels, and target values
    X_dispose = data_set.iloc[:, :-1]  # features: every column except the last
    Y_dispose = data_set.iloc[:, -1]   # target: the last column
    X_labels = data_set.columns[:-1]   # feature labels
    return np.array(X_dispose), np.array(Y_dispose).reshape(-1, 1), list(X_labels)
def init_W_b(X):
    """
    desc: initialize the model parameters w and b
    parameters:
        X  np.array (m, n)  feature data
    return:
        W  np.array (n, 1)  weight parameters
        b  float            bias parameter
    """
    n = np.array(X).shape[1]
    W = np.zeros((n, 1))
    b = 0.
    return W, b
def Hypothesis_function(X, W, b):
    """
    desc: hypothesis function f_wb(x) = X @ W + b
    parameters:
        X  np.array (m, n)  feature data
        W  np.array (n, 1)  weight parameters
        b  float            bias parameter
    returns:
        f_wb  np.array (m, 1)  predictions
    """
    f_wb = X @ W + b
    return f_wb
def regularize_lambda():
    """
    desc: return the regularization coefficient
    A large lambda shrinks the weights W_j (stronger penalty);
    a small lambda leaves W_j large (weaker penalty).
    return:
        _lambda  float  regularization coefficient
    """
    _lambda = 0.5
    return _lambda
def cost_function(X, Y, W, b):
    """
    desc: regularized cost function
    J(w, b) = (1 / 2m) * sum((f_wb - y)^2) + (lambda / 2m) * sum(w^2)
    parameters:
        X  np.array (m, n)  feature data
        Y  np.array (m, 1)  target values
        W  np.array (n, 1)  weight parameters
        b  float            bias parameter
    return:
        J_wb  float            cost
        Err   np.array (m, 1)  residuals f_wb - y
    """
    m = np.array(X).shape[0]
    # residuals and squared loss
    Err = Hypothesis_function(X, W, b) - Y
    Loss = Err ** 2
    cost = (1 / (2 * m)) * np.sum(Loss)
    # regularization term (the bias b is not penalized)
    _lambda = regularize_lambda()
    regularize = (_lambda / (2 * m)) * np.sum(W ** 2)
    # total cost
    J_wb = cost + regularize
    return J_wb, Err
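# A worked sanity check of the formula above (the values are an illustrative
# assumption, not from the original): with X = [[1], [2]], Y = [[1], [2]],
# W = [[0]] and b = 0 the residuals are [-1, -2], so
# cost = (1 / (2*2)) * (1 + 4) = 1.25, the penalty is 0 (W is zero), and
# J_wb = 1.25:
#   J, Err = cost_function(np.array([[1.], [2.]]), np.array([[1.], [2.]]),
#                          np.zeros((1, 1)), 0.)  # J == 1.25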
def compute_gradient_descent(X, Y, W, Err):
    """
    desc: compute the regularized gradients (partial derivatives)
    dJ/dW = (1 / m) * X.T @ Err + (lambda / m) * W
    dJ/db = (1 / m) * sum(Err)
    parameters:
        X    np.array (m, n)  feature data
        Y    np.array (m, 1)  target values
        W    np.array (n, 1)  weight parameters
        Err  np.array (m, 1)  residuals f_wb - y
    return:
        dJ_dW  np.array (n, 1)  partial derivative of J w.r.t. w
        dJ_db  float            partial derivative of J w.r.t. b
    """
    m = np.array(X).shape[0]
    _lambda = regularize_lambda()
    # partial derivatives; only the weights receive the regularization term
    dJ_dW = (1 / m) * np.dot(X.T, Err) + (_lambda / m) * W
    dJ_db = (1 / m) * np.sum(Err)
    return dJ_dW, dJ_db
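# Analytic gradients are easy to get wrong, so a finite-difference check is a
# useful companion. A minimal sketch (this helper is an illustrative addition,
# not part of the original code): perturb one weight at a time and compare
# (J(w + eps) - J(w - eps)) / (2 * eps) with the analytic dJ_dW.
def numerical_gradient_check(X, Y, W, b, eps=1e-6):
    _, Err = cost_function(X, Y, W, b)
    dJ_dW, _ = compute_gradient_descent(X, Y, W, Err)
    for j in range(W.shape[0]):
        W_plus, W_minus = W.copy(), W.copy()
        W_plus[j, 0] += eps
        W_minus[j, 0] -= eps
        J_plus, _ = cost_function(X, Y, W_plus, b)
        J_minus, _ = cost_function(X, Y, W_minus, b)
        numeric = (J_plus - J_minus) / (2 * eps)
        print(f"w[{j}]: analytic={dJ_dW[j, 0]:.6f} numeric={numeric:.6f}")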
def fit(X_train, Y_train, lr=0.01, iteration=10000):
    """
    desc: train / fit the model with gradient descent
    parameters:
        X_train    np.array (m, n)  training features
        Y_train    np.array (m, 1)  training target values
        lr         float  learning rate, default 0.01
        iteration  int    number of iterations, default 10000
    return:
        W_opt  np.array (n, 1)  fitted weights
        b_opt  float            fitted bias
    """
    X, Y = X_train, Y_train
    # initialize the model parameters
    W, b = init_W_b(X)
    # cost history
    Cost = []
    for index in range(iteration):
        # 1. compute the cost and the residuals
        J_wb, Err = cost_function(X, Y, W, b)
        Cost.append(J_wb)
        # print progress periodically
        if index % 1000 == 0:
            print(f"iteration {index}: cost = {J_wb}")
        # 2. compute the gradients
        gradient_W, gradient_b = compute_gradient_descent(X, Y, W, Err)
        # 3. simultaneously update the model parameters
        W -= lr * gradient_W
        b -= lr * gradient_b
    # fitted parameters
    W_opt = W
    b_opt = b
    # plot the cost curve
    plot_cost(Cost)
    return W_opt, b_opt
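# Gradient descent should approach the closed-form ridge solution, which makes
# a handy cross-check. A minimal sketch (an illustrative addition; it matches
# the unpenalized bias above by prepending a ones column and zeroing the
# intercept entry of the penalty matrix):
def fit_closed_form(X, Y):
    m, n = X.shape
    Xb = np.hstack([np.ones((m, 1)), X])  # prepend the bias column
    L = regularize_lambda() * np.eye(n + 1)
    L[0, 0] = 0.0                         # do not penalize the bias
    theta = np.linalg.solve(Xb.T @ Xb + L, Xb.T @ Y)
    return theta[1:], float(theta[0, 0])  # (W, b)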
def predict(X, W, b):
    """
    desc: model prediction
    parameters:
        X  np.array (m, n)  feature data
        W  np.array (n, 1)  weight parameters
        b  float            bias parameter
    return:
        predict_y  np.array (m, 1)  predictions
    """
    predict_y = Hypothesis_function(X, W, b)
    return predict_y
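# Because the model is trained on z-scored features, W and b live in the
# normalized space. A sketch for recovering coefficients in the original
# feature units (an illustrative addition, derived from
# f(x) = W.T @ ((x - Mu) / Sigma) + b):
def denormalize_params(W, b, Mu, Sigma):
    Sigma = Sigma.reshape(-1, 1)
    Mu = Mu.reshape(-1, 1)
    W_raw = W / Sigma
    b_raw = b - float((Mu * W_raw).sum())
    return W_raw, b_raw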
if __name__ == '__main__':
    # load the dataset
    data_set = pd.read_csv('./data/boston.csv')
    # X = np.arange(1, 10)
    # data_set = pd.DataFrame({
    #     'X1': X,
    #     'Y': 3 * X + np.random.random()
    # })
    # preprocess the data
    X, Y, X_label = data_dispose(data_set)
    # scatter plots of each feature against the target
    plot_feature_y(X, X_label, Y)
    # correlation heatmap of the variables
    plot_corr(data_set)
    # normalization
    X_dispose, mu, sigma = Z_score_normalization(X)
    # train / fit the model
    W_opt, b_opt = fit(X_dispose, Y, lr=0.01, iteration=20000)
    print(f"optimal parameters: W={W_opt} b={b_opt}")
    # prediction
    ## normalize the test rows with the training statistics
    data_test = (X[:4, :] - mu) / sigma
    ## predict
    y_pred = predict(data_test, W_opt, b_opt)
    print(f"predictions: {y_pred}")
    plt.show()