机器学习代码实现（一）

某用

已于 2023-12-28 19:42:31 修改

阅读量470

点赞数 9

文章标签： python 机器学习

于 2023-12-20 20:09:00 首次发布

本文链接：https://blog.csdn.net/jkydsg/article/details/134274583

版权

线性回归算法：

import numpy as np
import matplotlib.pyplot as mp

# Training samples: inputs and the outputs observed for them.
train_x = np.array([0.5, 0.6, 0.8, 1.1, 1.4])
train_y = np.array([5.0, 5.5, 6.0, 6.8, 7.0])

# Histories kept for plotting. w0 (intercept) and w1 (slope) both start at 1
# and receive one new entry per update, so they end up one element longer
# than losses/epoches.
w0 = [1]
w1 = [1]
losses = []
epoches = []
times = 1000
lrate = 0.01

for epoch in range(1, times + 1):
    intercept, slope = w0[-1], w1[-1]

    # Prediction error of the current line on every sample; the loss and
    # both partial derivatives are all expressed through this residual.
    residual = intercept + slope * train_x - train_y
    loss = (residual ** 2).sum() / 2

    epoches.append(epoch)
    losses.append(loss)
    print('{:4}> w0={:.8f},w1={:.8f},loss={:.8f}'.format(epoch, intercept, slope, loss))

    # Gradient-descent step: move each parameter against its partial
    # derivative of the loss (sum of residuals for w0, sum of x * residual
    # for w1), scaled by the learning rate.
    w0.append(intercept - lrate * residual.sum())
    w1.append(slope - lrate * (train_x * residual).sum())

print("w0:", w0[-1])
print("w1:", w1[-1])
# Plotting
# Evaluate the fitted line across the range of the training inputs.
linex = np.linspace(train_x.min(), train_x.max(), 100)
liney = w1[-1] * linex + w0[-1]

mp.figure('线性回归', facecolor='lightgray')
mp.title('线性回归', fontsize=18)
mp.grid(linestyle=':')
mp.scatter(train_x, train_y, s=80, marker='o',
           color='dodgerblue', label='Samples')
mp.plot(linex, liney, color='orangered',
        linewidth=2, label='Regression Line')
mp.legend()

# Training-progress figure: w0, w1 and loss against the epoch number, one
# stacked subplot each. The figure is created once and the title is set after
# the first subplot exists, so it is attached to the top panel instead of to
# an implicit extra axes.
mp.figure('Training Progress', facecolor='lightgray')
progress_curves = (
    (311, r'$w_0$', w0[:-1]),  # drop the final entry so lengths match epoches
    (312, r'$w_1$', w1[:-1]),
    (313, r'$loss$', losses),
)
for position, label, series in progress_curves:
    mp.subplot(position)
    if position == 311:
        mp.title('Training Progress', fontsize=18)
    mp.grid(linestyle=':')
    mp.ylabel(label, fontsize=14)
    mp.plot(epoches, series, color='dodgerblue', label=label)
    mp.legend()
mp.tight_layout()

# Loss surface over a (w0, w1) grid, used by both the 3D plot and the contour
# plot: accumulate the half squared error contributed by every sample.
n = 500
w0_grid, w1_grid = np.meshgrid(
    np.linspace(0, 9, n),
    np.linspace(0, 3.5, n)
)
loss_grid = 0
for x, y in zip(train_x, train_y):
    loss_grid += (w0_grid + w1_grid * x - y)**2 / 2

# 3D surface with the gradient-descent path drawn on top of it.
fig = mp.figure('Loss Function', facecolor='lightgray')
ax3d = fig.add_subplot(111, projection='3d')
ax3d.set_xlabel('w0')
ax3d.set_ylabel('w1')
ax3d.set_zlabel('loss')
ax3d.plot_surface(
    w0_grid, w1_grid, loss_grid,
    cstride=30, rstride=30, cmap='jet'
)
ax3d.plot(w0[:-1], w1[:-1], losses, 'o-', color='red')
mp.tight_layout()

# Contour view of the same descent path.
mp.figure('Batch Gradient Descent', facecolor='lightgray')
mp.title('Batch Gradient Descent', fontsize=20)
mp.xlabel('w0', fontsize=14)
mp.ylabel('w1', fontsize=14)
mp.grid(linestyle=':')
mp.contour(w0_grid, w1_grid, loss_grid, 10, cmap='jet')
# contour() takes the plural keyword `colors`; the singular `color` is not a
# contour argument, so the labelled lines were never drawn in black.
cntr = mp.contour(w0_grid, w1_grid, loss_grid, 10, colors='black', linewidths=0.5)
mp.clabel(cntr, inline_spacing=0.1, fmt='%.2f', fontsize=8)
mp.plot(w0, w1, 'o-', c='orangered', label='BGD')
mp.legend()
mp.tight_layout()


mp.show()

每次预测都要手写这么多代码太麻烦了，这里可以直接调用 scikit-learn 提供的 API：

import numpy as np
import matplotlib.pyplot as mp
import sklearn.linear_model as lm

# Load the samples from a comma-separated text file; unpack=True returns one
# array per column. The path is a raw string: in a normal string literal the
# sequence "\1" is an octal escape (chr(1)) and the other backslash sequences
# are invalid escapes, so the file could not be located.
x, y = np.loadtxt(r"C:\桌面文件\c++\测试结果\1.txt", delimiter=',',  unpack=True)

# The estimator is given a 2-D input set and a 1-D output set.
x = x.reshape(-1, 1)  # reshape to n rows, 1 column

mp.figure('Linear Regression', facecolor='lightgray')
mp.title('Linear Regression', fontsize=18)
mp.grid(linestyle=':')
mp.scatter(x, y, s=70, color='dodgerblue', label='Sample Points')

# Build the linear regression model and train it on the samples.
model = lm.LinearRegression()
model.fit(x, y)

# Predict on the training inputs to draw the fitted line.
pred_y = model.predict(x)

mp.plot(x, pred_y, color='orangered', label='Regression Line')
mp.legend()
mp.show()

评估训练结果误差：

模型的储存：

import numpy as np
import sklearn.linear_model as lm
import pickle

# Read the comma-separated samples; unpack=True yields one array per column.
samples_path = "C:\\桌面文件\\c++\\测试结果\\景.txt"
columns = np.loadtxt(samples_path, delimiter=',', unpack=True)
x, y = columns

# The estimator is given a 2-D input set (n rows, 1 column) and a 1-D output set.
x = x.reshape((-1, 1))

# Build and train the linear regression model (fit returns the estimator).
model = lm.LinearRegression().fit(x, y)

# Serialize the trained model to disk with pickle.
model_path = r"C:\Users\86188\Desktop\机械学习\线性回归\线性回归API模型.txt"
with open(model_path, 'wb') as model_file:
    pickle.dump(model, model_file)
    print('dump success!')

模型的加载：

import numpy as np
import matplotlib.pyplot as mp
import sklearn.linear_model as lm
import pickle

# Read the comma-separated samples; unpack=True yields one array per column.
samples_path = r"C:\Users\86188\Desktop\景.txt"
x, y = np.loadtxt(samples_path, delimiter=',', unpack=True)

# The estimator is given a 2-D input set (n rows, 1 column) and a 1-D output set.
x = x.reshape((-1, 1))

# Scatter plot of the raw samples.
mp.figure('Linear Regression', facecolor='lightgray')
mp.title('Linear Regression', fontsize=18)
mp.grid(linestyle=':')
mp.scatter(
    x, y,
    s=70,
    color='dodgerblue',
    label='Sample Points',
)

# Restore the previously trained model from disk instead of fitting again.
# NOTE: pickle.load can execute arbitrary code, so only open trusted files.
model_path = r"C:\Users\86188\Desktop\机械学习\线性回归\线性回归API模型.txt"
with open(model_path, 'rb') as model_file:
    model = pickle.load(model_file)

# Draw the predictions of the restored model over the samples.
pred_y = model.predict(x)
mp.plot(
    x, pred_y,
    color='orangered',
    label='Regression Line',
)
mp.legend()
mp.show()

岭回归：岭回归是对线性回归的一种改进，在原有损失函数的基础上加上了一个正则化项。

import numpy as np
import matplotlib.pyplot as mp
import sklearn.linear_model as lm

# Read the comma-separated samples; unpack=True yields one array per column.
samples_path = r"C:\Users\86188\Desktop\岭回归.txt"
x, y = np.loadtxt(samples_path, delimiter=',', unpack=True)

# The estimators are given a 2-D input set (n rows, 1 column) and a 1-D output set.
x = x.reshape((-1, 1))

# Scatter plot of the raw samples.
mp.figure('Linear Regression', facecolor='lightgray')
mp.title('Linear Regression', fontsize=18)
mp.grid(linestyle=':')
mp.scatter(
    x, y,
    s=70,
    color='dodgerblue',
    label='Sample Points',
)

# Baseline: ordinary linear regression.
model = lm.LinearRegression().fit(x, y)
pred_y = model.predict(x)
mp.plot(x, pred_y, color='orangered', label='LR')

# Ridge regression on the same data; the first Ridge argument (alpha) is the
# regularization strength, set to 100 here.
model = lm.Ridge(alpha=100, fit_intercept=True, max_iter=1000).fit(x, y)
pred_y = model.predict(x)
mp.plot(x, pred_y, color='green', label='Ridge')

mp.legend()
mp.show()

多项式回归线性方程：

代码实现如下：

import numpy as np
import matplotlib.pyplot as mp
import sklearn.linear_model as lm
import sklearn.pipeline as pl
import sklearn.preprocessing as sp

# Load the samples from a comma-separated text file; unpack=True returns one
# array per column.
x, y = np.loadtxt(r"C:\Users\86188\Desktop\景.txt", delimiter=',',  unpack=True)

# The estimator is given a 2-D input set and a 1-D output set.
x = x.reshape(-1, 1)  # reshape to n rows, 1 column

mp.figure('Poly Regression', facecolor='lightgray')
mp.title('Poly Regression', fontsize=18)
mp.grid(linestyle=':')
mp.scatter(x, y, s=70, color='dodgerblue', label='Sample Points')

# Polynomial regression: expand the input into polynomial features, then fit
# an ordinary linear regression on them. The argument of PolynomialFeatures
# is the polynomial degree (2 here).
model = pl.make_pipeline(
    sp.PolynomialFeatures(2), lm.LinearRegression())
model.fit(x, y)

# Draw the fitted polynomial: split [min, max] into 500 evenly spaced points,
# predict a value for each and connect them in order. The grid gets its own
# name so the sample array x is not overwritten, and the prediction on the raw
# samples that used to precede this was dropped because it was never used.
curve_x = np.linspace(x.min(), x.max(), 500)
pred_y = model.predict(curve_x.reshape(-1, 1))

mp.plot(curve_x, pred_y, color='orangered', label='Regression Line')
mp.legend()
mp.show()

多项式的最高次数需要结合拟合情况来选择：次数过低容易欠拟合，次数过高容易过拟合。

决策树：

这里教学给了案例一，预测波士顿的房价代码如下：

import sklearn.datasets as sd
import sklearn.utils as su

# Case 1: predicting house prices in the Boston area.
# NOTE: load_boston has been removed from scikit-learn since version 1.2, so
# on such versions the call below fails; the snippet is kept unchanged to
# illustrate that error.
boston = sd.load_boston()
print(boston.data.shape)  # input data set
print(boston.data[0])
print(boston.target.shape)  # output data set
print(boston.target[0])
print(boston.feature_names)  # feature names of the input data

但是运行时解释器报错：`load_boston` has been removed from scikit-learn since version 1.2.

根据报错信息，load_boston方法已经被移除，并且在使用波士顿房价数据集时存在道德问题，因此scikit-learn维护者强烈不建议使用该数据集。

如果你想继续学习使用波士顿房价数据集，可以使用原始数据集并进行适当的数据转换。

并且报错信息里给出了一段新的数据加载代码：

import pandas as pd
import numpy as np

# Download the raw Boston housing data and rebuild the input/output arrays.
data_url = "http://lib.stat.cmu.edu/datasets/boston"
# The separator is a regular expression, so it is written as a raw string:
# "\s" in a normal string literal is an invalid escape sequence.
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
# Even rows plus the first two columns of the following odd rows are joined
# into the input matrix; the third column of the odd rows is the target.
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

print(data.shape)  # input data set
print(data[0])
print(target.shape)  # output data set
print(target[0])
# Feature names added by hand.
feature_names = [
    'CRIM',   # crime rate
    'ZN',     # proportion of residential land with lots over 25,000 sq. ft.
    'INDUS',  # proportion of non-retail business land
    'CHAS',   # Charles River indicator variable (1 if on the river, else 0)
    'NOX',    # nitric oxide concentration (parts per 10 million)
    'RM',     # average number of rooms per dwelling
    'AGE',    # proportion of owner-occupied units built before 1940
    'DIS',    # weighted distance to five Boston employment centres
    'RAD',    # index of accessibility to radial highways
    'TAX',    # full-value property-tax rate per $10,000
    'PTRATIO',# pupil-teacher ratio by town
    'B',      # 1000(Bk - 0.63)^2 where Bk is the proportion of Black residents
    'LSTAT'   # percentage of the population with lower socio-economic status
]

print(feature_names)

这里我借用这个波士顿的数据集来进行一个决策树的预估：

import pandas as pd
import numpy as np
import sklearn.utils as su
import sklearn.tree as st
import sklearn.metrics as sm
# Download the raw Boston housing data and rebuild the input/output arrays.
data_url = "http://lib.stat.cmu.edu/datasets/boston"
# The separator is a regular expression, so it is written as a raw string:
# "\s" in a normal string literal is an invalid escape sequence.
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
# Even rows plus the first two columns of the following odd rows are joined
# into the input matrix; the third column of the odd rows is the target.
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

# Shuffle the data set, then split it into training and test sets.
# random_state is the random seed (any number will do); shuffling with the
# same seed always produces the same order, so the split is reproducible.
x, y = su.shuffle(
    data, target, random_state=7)
# The first 80% of the shuffled samples train the model, the rest test it.
train_size = int(len(x) * 0.8)
train_x, test_x, train_y, test_y = \
    x[:train_size], x[train_size:], \
    y[:train_size], y[train_size:]
# Build the decision tree model (depth limited to 4) and train it.
model = st.DecisionTreeRegressor(max_depth=4)
model.fit(train_x, train_y)
pred_test_y = model.predict(test_x)
# Evaluate on the test set: R2 score and mean absolute error.
r = sm.r2_score(test_y,pred_test_y)
print(r)
print(sm.mean_absolute_error(test_y,pred_test_y))