目录
线性回归算法:
import numpy as np
import matplotlib.pyplot as mp
# Training samples: inputs (train_x) and their target outputs (train_y).
train_x = np.array([0.5, 0.6, 0.8, 1.1, 1.4])
train_y = np.array([5.0, 5.5, 6.0, 6.8, 7.0])
# Histories of the two model parameters (both start at 1), of the loss and
# of the epoch numbers; one entry is appended per iteration.
w0, w1, losses, epoches = [1], [1], [], []
times = 1000  # number of gradient-descent iterations
lrate = 0.01  # learning rate (step size of every update)
for i in range(1, times + 1):
    epoches.append(i)
    # Residual of the current model w0 + w1 * x on every sample. It is shared
    # by the loss and by both partial derivatives, so compute it once.
    error = w0[-1] + w1[-1] * train_x - train_y
    loss = (error ** 2).sum() / 2
    losses.append(loss)
    print('{:4}> w0={:.8f},w1={:.8f},loss={:.8f}'.format(i, w0[-1], w1[-1], loss))
    # Partial derivatives of the loss with respect to w0 and w1.
    d0 = error.sum()
    d1 = (train_x * error).sum()
    # Gradient-descent update: step against the gradient, scaled by lrate.
    w0.append(w0[-1] - lrate * d0)
    w1.append(w1[-1] - lrate * d1)
# Final parameters after the last update.
print("w0:", w0[-1])
print("w1:", w1[-1])
# Plot the training samples together with the fitted regression line.
mp.figure('线性回归', facecolor='lightgray')
mp.title('线性回归', fontsize=18)
mp.grid(linestyle=':')
mp.scatter(
    train_x, train_y,
    s=80, marker='o', color='dodgerblue', label='Samples',
)
# Evaluate the final model (last entries of w0 / w1) on 100 evenly spaced
# points spanning the range of the training inputs.
linex = np.linspace(train_x.min(), train_x.max(), 100)
liney = w1[-1] * linex + w0[-1]
mp.plot(
    linex, liney,
    color='orangered', linewidth=2, label='Regression Line',
)
mp.legend()
# Training progress: w0, w1 and the loss plotted against the epoch number,
# stacked as three rows of one figure.
mp.figure('Training Progress', facecolor='lightgray')
# w0 and w1 hold one more entry than epoches (their initial value plus one
# entry per update), so the last entry is dropped to match the x axis.
panels = (
    (w0[:-1], r'$w_0$'),
    (w1[:-1], r'$w_1$'),
    (losses, r'$loss$'),
)
for row, (series, label) in enumerate(panels, start=1):
    mp.subplot(3, 1, row)
    if row == 1:
        # Set the title only once the top subplot exists, so it is drawn on
        # that panel rather than on an implicit full-figure axes.
        mp.title('Training Progress', fontsize=18)
    mp.grid(linestyle=':')
    mp.ylabel(label, fontsize=14)
    mp.plot(epoches, series, color='dodgerblue', label=label)
    mp.legend()
mp.tight_layout()
# 3D surface of the loss over a grid of (w0, w1) values, with the
# gradient-descent trajectory drawn on top of it.
n = 500  # grid resolution along each parameter axis
w0_grid, w1_grid = np.meshgrid(
    np.linspace(0, 9, n),
    np.linspace(0, 3.5, n)
)
# Accumulate the loss sample by sample; every term is evaluated on the whole
# grid at once, using the same formula as the training loss.
loss_grid = 0
for x, y in zip(train_x, train_y):
    loss_grid += (w0_grid + w1_grid * x - y)**2 / 2
fig = mp.figure('Loss Function', facecolor='lightgray')
ax3d = fig.add_subplot(111, projection='3d')
ax3d.set_xlabel('w0')
ax3d.set_ylabel('w1')
ax3d.set_zlabel('loss')
ax3d.plot_surface(
    w0_grid, w1_grid, loss_grid,
    cstride=30, rstride=30, cmap='jet'
)
# Path taken by the parameters; the trailing w0/w1 entry has no recorded
# loss, so it is dropped to keep the three sequences the same length.
ax3d.plot(w0[:-1], w1[:-1], losses, 'o-', color='red')
mp.tight_layout()
# Show the gradient-descent path on a contour map of the loss.
mp.figure('Batch Gradient Descent', facecolor='lightgray')
mp.title('Batch Gradient Descent', fontsize=20)
mp.xlabel('w0', fontsize=14)
mp.ylabel('w1', fontsize=14)
mp.grid(linestyle=':')
# Coloured contour lines first ...
mp.contour(w0_grid, w1_grid, loss_grid, 10, cmap='jet')
# ... then thin black lines on the same levels to carry the value labels.
# contour() takes the plural keyword `colors`; `color` is not one of its
# parameters, so the lines would not be drawn in black.
cntr = mp.contour(w0_grid, w1_grid, loss_grid, 10, colors='black', linewidths=0.5)
mp.clabel(cntr, inline_spacing=0.1, fmt='%.2f', fontsize=8)
# Full parameter trajectory, including the final update.
mp.plot(w0, w1, 'o-', c='orangered', label='BGD')
mp.legend()
mp.tight_layout()
mp.show()
每次做预测都要手写这么多代码太麻烦了,这里可以直接调用 scikit-learn 提供的 API:
import numpy as np
import matplotlib.pyplot as mp
import sklearn.linear_model as lm
# Load the samples from a comma-separated text file; unpack=True returns the
# two columns as separate arrays.
# The path must be a raw string: in a normal string literal "\1" is an octal
# escape (character 0x01) and "\c" is an invalid escape sequence, so the
# file would not be found.
x, y = np.loadtxt(r"C:\桌面文件\c++\测试结果\1.txt", delimiter=',', unpack=True)
# The model expects a 2-D input (n rows, 1 column) and a 1-D output.
x = x.reshape(-1, 1)
mp.figure('Linear Regression', facecolor='lightgray')
mp.title('Linear Regression', fontsize=18)
mp.grid(linestyle=':')
mp.scatter(x, y, s=70, color='dodgerblue', label='Sample Points')
# Build the linear regression model, train it, and predict on the inputs.
model = lm.LinearRegression()
model.fit(x, y)
pred_y = model.predict(x)
mp.plot(x, pred_y, color='orangered', label='Regression Line')
mp.legend()
mp.show()
评估训练结果误差:
模型的存储(代码如下):
import numpy as np
import sklearn.linear_model as lm
import pickle
# Load the samples from a comma-separated text file; unpack=True returns the
# two columns as separate arrays.
x, y = np.loadtxt("C:\\桌面文件\\c++\\测试结果\\景.txt", delimiter=',', unpack=True)
# The model expects a 2-D input (n rows, 1 column) and a 1-D output.
x = x.reshape(-1, 1)
# Build and train the linear regression model.
model = lm.LinearRegression()
model.fit(x, y)
# Persist the trained model with pickle; the file is opened in binary mode
# and closed automatically when the with-block ends.
with open(r"C:\Users\86188\Desktop\机械学习\线性回归\线性回归API模型.txt", 'wb') as f:
    pickle.dump(model, f)
print('dump success!')
模型的加载(代码如下):
import numpy as np
import matplotlib.pyplot as mp
import sklearn.linear_model as lm
import pickle
# Load the samples from a comma-separated text file; unpack=True returns the
# two columns as separate arrays.
x, y = np.loadtxt(r"C:\Users\86188\Desktop\景.txt", delimiter=',', unpack=True)
# The model expects a 2-D input (n rows, 1 column) and a 1-D output.
x = x.reshape(-1, 1)
mp.figure('Linear Regression', facecolor='lightgray')
mp.title('Linear Regression', fontsize=18)
mp.grid(linestyle=':')
mp.scatter(x, y, s=70, color='dodgerblue', label='Sample Points')
# The model is not trained here; it is restored from the pickle file.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only
# load model files that come from a trusted source.
with open(r"C:\Users\86188\Desktop\机械学习\线性回归\线性回归API模型.txt", 'rb') as f:
    model = pickle.load(f)
pred_y = model.predict(x)
mp.plot(x, pred_y, color='orangered', label='Regression Line')
mp.legend()
mp.show()
岭回归:岭回归是对线性回归的一种改进,在原有损失函数的基础上加上了一个正则化项。
import numpy as np
import matplotlib.pyplot as mp
import sklearn.linear_model as lm
# Read the comma-separated samples; the two columns come back as x and y.
x, y = np.loadtxt(r"C:\Users\86188\Desktop\岭回归.txt", delimiter=',', unpack=True)
# Inputs must be 2-D (n rows, 1 column); the outputs stay 1-D.
x = x.reshape(-1, 1)
mp.figure('Linear Regression', facecolor='lightgray')
mp.title('Linear Regression', fontsize=18)
mp.grid(linestyle=':')
mp.scatter(x, y, s=70, color='dodgerblue', label='Sample Points')
# Fit plain linear regression and ridge regression on the same samples and
# draw both fitted lines so they can be compared.
candidates = (
    (lm.LinearRegression(), 'orangered', 'LR'),
    (lm.Ridge(100, fit_intercept=True, max_iter=1000), 'green', 'Ridge'),
)
for model, line_color, line_label in candidates:
    model.fit(x, y)
    pred_y = model.predict(x)
    mp.plot(x, pred_y, color=line_color, label=line_label)
mp.legend()
mp.show()
多项式回归线性方程:
代码实现如下:
import numpy as np
import matplotlib.pyplot as mp
import sklearn.linear_model as lm
import sklearn.pipeline as pl
import sklearn.preprocessing as sp
# Load the samples from a comma-separated text file; unpack=True returns the
# two columns as separate arrays.
x, y = np.loadtxt(r"C:\Users\86188\Desktop\景.txt", delimiter=',', unpack=True)
# The model expects a 2-D input (n rows, 1 column) and a 1-D output.
x = x.reshape(-1, 1)
mp.figure('Poly Regression', facecolor='lightgray')
mp.title('Poly Regression', fontsize=18)
mp.grid(linestyle=':')
mp.scatter(x, y, s=70, color='dodgerblue', label='Sample Points')
# Polynomial regression: expand x into polynomial terms, then fit an ordinary
# linear regression on them. The argument of PolynomialFeatures is the
# highest polynomial degree (2 = quadratic), not a number of features.
model = pl.make_pipeline(
    sp.PolynomialFeatures(2), lm.LinearRegression())
model.fit(x, y)
# Draw the fitted curve: predict on 500 evenly spaced points between the
# smallest and largest sample input and connect them in order. A separate
# name is used so the sample inputs in x are not overwritten.
line_x = np.linspace(x.min(), x.max(), 500)
pred_y = model.predict(line_x.reshape(-1, 1))
mp.plot(line_x, pred_y, color='orangered', label='Regression Line')
mp.legend()
mp.show()
多项式的最高次数需要结合拟合情况来选择:次数过低容易欠拟合,次数过高则容易过拟合。
决策树:
这里教程给出了案例一:预测波士顿地区的房价,代码如下:
import sklearn.datasets as sd
import sklearn.utils as su
# Case 1: predict house prices in the Boston area.
# NOTE: load_boston was removed from scikit-learn in version 1.2, so this
# snippet only runs on older releases; on newer ones the call below fails.
boston = sd.load_boston()
print(boston.data.shape) # input data set
print(boston.data[0])
print(boston.target.shape) # output data set
print(boston.target[0])
print(boston.feature_names) # names of the input features
但是运行时解释器报错:`load_boston` has been removed from scikit-learn since version 1.2.
根据报错信息,load_boston方法已经被移除,并且在使用波士顿房价数据集时存在道德问题,因此scikit-learn维护者强烈不建议使用该数据集。
如果你想继续学习使用波士顿房价数据集,可以使用原始数据集并进行适当的数据转换。
报错信息同时给出了一段替代的数据加载代码:
import pandas as pd
import numpy as np
# Load the original Boston housing data directly from its source URL.
data_url = "http://lib.stat.cmu.edu/datasets/boston"
# The separator is a regular expression (one or more whitespace characters);
# it is written as a raw string because "\s" is an invalid escape sequence
# in a normal string literal.
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
# Inputs: all columns of the even rows joined side by side with the first
# two columns of the odd rows. Output: the third column of the odd rows.
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]
print(data.shape)  # input data set
print(data[0])
print(target.shape)  # output data set
print(target[0])
# Feature names, added by hand; the meaning of each one is noted above it.
feature_names = [
    # crime rate
    'CRIM',
    # proportion of residential land zoned for lots over 25,000 sq. ft.
    'ZN',
    # proportion of non-retail business land
    'INDUS',
    # Charles River indicator variable (1 if on the river, otherwise 0)
    'CHAS',
    # nitric oxide concentration (parts per 10 million)
    'NOX',
    # average number of rooms per dwelling
    'RM',
    # proportion of owner-occupied units built before 1940
    'AGE',
    # weighted distance to five Boston employment centres
    'DIS',
    # index of accessibility to radial highways
    'RAD',
    # full-value property tax rate per 10,000 dollars
    'TAX',
    # pupil-teacher ratio by town
    'PTRATIO',
    # 1000(Bk - 0.63)^2 where Bk is the proportion of Black residents
    'B',
    # percentage of the population with lower socio-economic status
    'LSTAT',
]
print(feature_names)
这里我借用这个波士顿房价数据集来训练一个决策树回归模型,并对房价进行预测:
import pandas as pd
import numpy as np
import sklearn.utils as su
import sklearn.tree as st
import sklearn.metrics as sm
# Load the original Boston housing data directly from its source URL.
data_url = "http://lib.stat.cmu.edu/datasets/boston"
# The separator is a regular expression (one or more whitespace characters);
# it is written as a raw string because "\s" is an invalid escape sequence
# in a normal string literal.
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
# Inputs: all columns of the even rows joined side by side with the first
# two columns of the odd rows. Output: the third column of the odd rows.
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]
# Shuffle the data set, then split it into a training and a test set.
# random_state is the random seed: shuffling again with the same seed gives
# the same order, so the split is reproducible.
x, y = su.shuffle(data, target, random_state=7)
train_size = int(len(x) * 0.8)  # first 80% for training, the rest for testing
train_x, test_x = x[:train_size], x[train_size:]
train_y, test_y = y[:train_size], y[train_size:]
# Build and train the decision tree regressor (tree depth limited to 4).
model = st.DecisionTreeRegressor(max_depth=4)
model.fit(train_x, train_y)
pred_test_y = model.predict(test_x)
# Evaluate on the test set: R2 score and mean absolute error.
r = sm.r2_score(test_y, pred_test_y)
print(r)
print(sm.mean_absolute_error(test_y, pred_test_y))