本文为个人学习笔记,所学课程为B站「数学建模老哥」的《10 Python数学建模实战》(哔哩哔哩 bilibili)。
目录
1基础步骤
2定义问题
# 波士顿房价问题 共14个特征 506条数据
# 导入类库
import numpy as np
import pandas as pd
from numpy import arange
from matplotlib import pyplot, pyplot as plt
from pandas import read_csv
from pandas.plotting import scatter_matrix # 散点图
from sklearn.preprocessing import StandardScaler # 预处理标准化
from sklearn.model_selection import train_test_split # 机器学习中的数据集分割
from sklearn.model_selection import KFold # 机器学习中的交叉验证
from sklearn.model_selection import cross_val_score # 机器学习中的得分
from sklearn.model_selection import GridSearchCV # 机器学习中的随机搜索
from sklearn.linear_model import LinearRegression # 线性回归模型
from sklearn.linear_model import Lasso # 线性回归的一种,增加了L1正则化项
from sklearn.linear_model import ElasticNet # 线性网络回归
from sklearn.tree import DecisionTreeRegressor # 决策树
from sklearn.neighbors import KNeighborsRegressor # K邻近算法
from sklearn.svm import SVR # SVR 支持向量机
from sklearn.pipeline import Pipeline # 优化模型
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor # 集成学习--随机森林
from sklearn.ensemble import GradientBoostingRegressor # 随机梯度回归
from sklearn.ensemble import ExtraTreesClassifier # 分类树
from sklearn.ensemble import AdaBoostRegressor # 集成学习算法
from sklearn.metrics import mean_squared_error # MSE
from sklearn.impute import SimpleImputer
# Load the housing data set from the CSV file one directory above.
# NOTE: the original note mentions delim_whitespace=True as a way to split
# whitespace-separated files automatically; it is not applied here.
data = read_csv('../housing.csv')
3理解数据
# Data understanding: print a series of quick summaries of the raw frame.
dimensions = data.shape  # (number of rows, number of columns)
print(dimensions)
column_types = data.dtypes  # dtype of every column
print(column_types)
preview = data.head(30)  # first 30 rows
print(preview)
summary_stats = data.describe()  # descriptive statistics per column
print(summary_stats)
pearson_corr = data.corr(method='pearson')  # pairwise Pearson correlation
print(pearson_corr)
输出结果:
(506, 14)
CRIM float64
ZN float64
INDUS float64
CHAS float64
NOX float64
RM float64
AGE float64
DIS float64
RAD int64
TAX float64
PIRATIO float64
B float64
LSTAT float64
MEDV float64
dtype: object
CRIM ZN INDUS CHAS NOX ... TAX PIRATIO B LSTAT MEDV
0 0.00632 18.0 2.31 0.0 0.538 ... 296.0 15.3 396.90 4.98 24.0
1 0.02731 0.0 7.07 0.0 0.469 ... 242.0 17.8 396.90 9.14 21.6
2 0.02729 0.0 7.07 0.0 0.469 ... 242.0 17.8 392.83 4.03 34.7
3 0.03237 0.0 2.18 0.0 0.458 ... 222.0 18.7 394.63 2.94 33.4
4 0.06905 0.0 2.18 0.0 0.458 ... 222.0 18.7 396.90 5.33 36.2
5 0.02985 0.0 2.18 0.0 0.458 ... 222.0 18.7 394.12 5.21 28.7
6 0.08829 12.5 7.87 0.0 0.524 ... 311.0 15.2 395.60 12.43 22.9
7 0.14455 12.5 7.87 0.0 0.524 ... 311.0 15.2 396.90 19.15 27.1
8 0.21124 12.5 7.87 0.0 0.524 ... 311.0 15.2 386.63 29.93 16.5
9 0.17004 12.5 7.87 0.0 0.524 ... 311.0 15.2 386.71 17.10 18.9
10 0.22489 12.5 7.87 0.0 0.524 ... 311.0 15.2 392.52 20.45 15.0
11 0.11747 12.5 7.87 0.0 0.524 ... 311.0 15.2 396.90 13.27 18.9
12 0.09378 12.5 7.87 0.0 0.524 ... 311.0 15.2 390.50 15.71 21.7
13 0.62976 0.0 8.14 0.0 0.538 ... 307.0 21.0 396.90 8.26 20.4
14 0.63796 0.0 8.14 0.0 0.538 ... 307.0 21.0 380.02 10.26 18.2
15 0.62739 0.0 8.14 0.0 0.538 ... 307.0 21.0 395.62 8.47 19.9
16 1.05393 0.0 8.14 0.0 0.538 ... 307.0 21.0 386.85 6.58 23.1
17 0.78420 0.0 8.14 0.0 0.538 ... 307.0 21.0 386.75 14.67 17.5
18 0.80271 0.0 8.14 0.0 0.538 ... 307.0 21.0 288.99 11.69 20.2
19 0.72580 0.0 8.14 0.0 0.538 ... 307.0 21.0 390.95 11.28 18.2
20 1.25179 0.0 8.14 0.0 0.538 ... 307.0 21.0 376.57 21.02 13.6
21 0.85204 0.0 8.14 0.0 0.538 ... 307.0 21.0 392.53 13.83 19.6
22 1.23247 0.0 8.14 0.0 0.538 ... 307.0 21.0 396.90 18.72 15.2
23 0.98843 0.0 8.14 0.0 0.538 ... 307.0 21.0 394.54 19.88 14.5
24 0.75026 0.0 8.14 0.0 0.538 ... 307.0 21.0 394.33 16.30 15.6
25 0.84054 0.0 8.14 0.0 0.538 ... 307.0 21.0 303.42 16.51 13.9
26 0.67191 0.0 8.14 0.0 0.538 ... 307.0 21.0 376.88 14.81 16.6
27 0.95577 0.0 8.14 0.0 0.538 ... 307.0 21.0 306.38 17.28 14.8
28 0.77299 0.0 8.14 0.0 0.538 ... 307.0 21.0 387.94 12.80 18.4
29 1.00245 0.0 8.14 0.0 0.538 ... 307.0 21.0 380.23 11.98 21.0
[30 rows x 14 columns]
CRIM ZN INDUS ... B LSTAT MEDV
count 506.000000 506.000000 506.000000 ... 506.000000 506.000000 452.000000
mean 1.269195 13.295257 9.205158 ... 332.791107 11.537806 23.750442
std 2.399207 23.048697 7.169630 ... 125.322456 6.064932 8.808602
min 0.000000 0.000000 0.000000 ... 0.320000 1.730000 6.300000
25% 0.049443 0.000000 3.440000 ... 364.995000 6.877500 18.500000
50% 0.144655 0.000000 6.960000 ... 390.660000 10.380000 21.950000
75% 0.819623 18.100000 18.100000 ... 395.615000 15.015000 26.600000
max 9.966540 100.000000 27.740000 ... 396.900000 34.410000 50.000000
[8 rows x 14 columns]
CRIM ZN INDUS ... B LSTAT MEDV
CRIM 1.000000 -0.288969 0.586719 ... -0.053260 0.392225 -0.286245
ZN -0.288969 1.000000 -0.491587 ... 0.015810 -0.390092 0.331570
INDUS 0.586719 -0.491587 1.000000 ... 0.233471 0.465583 -0.411915
CHAS -0.067536 -0.005843 -0.185873 ... -0.495956 0.011260 0.154409
NOX -0.139448 0.038450 -0.394483 ... -0.856608 0.079688 -0.332778
RM -0.185045 0.078721 -0.448809 ... -0.848289 0.029450 0.740181
AGE 0.462470 -0.488006 0.700699 ... 0.417216 0.414354 -0.299893
DIS -0.312843 0.268317 -0.605973 ... -0.778075 -0.080368 0.138798
RAD -0.151996 0.062767 -0.427834 ... -0.861694 0.056185 -0.217902
TAX 0.754362 -0.256799 0.748951 ... 0.372806 0.284030 -0.345898
PIRATIO -0.140015 0.049491 -0.351166 ... -0.690245 0.049208 -0.461214
B -0.053260 0.015810 0.233471 ... 1.000000 -0.186021 0.264797
LSTAT 0.392225 -0.390092 0.465583 ... -0.186021 1.000000 -0.706255
MEDV -0.286245 0.331570 -0.411915 ... 0.264797 -0.706255 1.000000
4数据准备
4.1缺失值处理
# Fill the missing MEDV values with a random-forest regression.
# Rows whose MEDV is known are used for training; rows whose MEDV is missing
# are the ones to be predicted.
# Assumes the other columns (the predictors) contain no missing values.
X = data.drop('MEDV', axis=1)
y = data['MEDV']
# Boolean mask marking the rows whose MEDV is missing.
missing_indices = y.isnull()
# Only impute when something is actually missing: asking the model to
# predict on an empty selection raises instead of being a no-op.
if missing_indices.any():
    # Train on the complete rows only.
    X_train, y_train = X[~missing_indices], y[~missing_indices]
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    # Predict the missing MEDV values and write them back into the frame.
    y_pred = model.predict(X[missing_indices])
    data.loc[missing_indices, 'MEDV'] = y_pred
print("补充后的数据描述:")
print(data.describe())  # descriptive statistics after imputation
输出结果:
[14 rows x 14 columns]
补充后的数据描述:
CRIM ZN INDUS ... B LSTAT MEDV
count 506.000000 506.000000 506.000000 ... 506.000000 506.000000 506.000000
mean 1.269195 13.295257 9.205158 ... 332.791107 11.537806 25.168093
std 2.399207 23.048697 7.169630 ... 125.322456 6.064932 9.296464
min 0.000000 0.000000 0.000000 ... 0.320000 1.730000 6.300000
25% 0.049443 0.000000 3.440000 ... 364.995000 6.877500 19.000000
50% 0.144655 0.000000 6.960000 ... 390.660000 10.380000 22.750000
75% 0.819623 18.100000 18.100000 ... 395.615000 15.015000 31.500000
max 9.966540 100.000000 27.740000 ... 396.900000 34.410000 50.000000
4.2数据可视化
# Data visualisation.
# Histograms; sharex=False / sharey=False means the subplots do not share
# their x / y axes.
data.hist(sharex=False, sharey=False, xlabelsize=1, ylabelsize=1)
plt.show()

# Density plots, one subplot per column on a 4x4 grid.
data.plot(kind='density', subplots=True, layout=(4, 4), sharex=False, fontsize=1)
plt.show()

# Box plots, one subplot per column on a 4x4 grid.
data.plot(kind='box', subplots=True, layout=(4, 4), sharex=False, fontsize=8)
plt.show()

# Scatter-plot matrix of every pair of columns.
scatter_matrix(data)
plt.show()

# Correlation matrix drawn as a colour-coded image, colour scale fixed to [-1, 1].
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
heatmap = ax.matshow(data.corr(), vmin=-1, vmax=1, interpolation='none')
fig.colorbar(heatmap)
# One tick per column position (0..13); tick labels stay numeric because no
# column names are assigned to the axes here.
ticks = np.arange(14)
ax.set_xticks(ticks)
ax.set_yticks(ticks)
plt.show()
4.3特征选择
# Separate the inputs from the target and hold out a validation set.
array = data.values
# Columns 0..12 are the inputs, column 13 is the target.
X, Y = array[:, :13], array[:, 13]
validation_size = 0.2  # fraction of rows held out for validation
seed = 7  # random seed for the split
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, Y, random_state=seed, test_size=validation_size)
4.4数据标准化
# Compare the baseline regressors on standardised inputs.
# Every model is wrapped in a Pipeline together with a StandardScaler, so
# scaling is part of the estimator that cross_val_score evaluates.

# Cross-validation settings. They are defined here because this section comes
# before the "evaluate algorithms" section that sets the same values; without
# them the loop below fails with a NameError when the file is run top to bottom.
num_folds = 10
scoring = 'neg_mean_squared_error'

piplines = {}
piplines['ScalerLR'] = Pipeline([('Scaler', StandardScaler()), ('LR', LinearRegression())])
piplines['ScalerLasso'] = Pipeline([('Scaler', StandardScaler()), ('LASSO', Lasso())])
piplines['ScalerEN'] = Pipeline([('Scaler', StandardScaler()), ('EN', ElasticNet())])
piplines['ScalerKNN'] = Pipeline([('Scaler', StandardScaler()), ('KNN', KNeighborsRegressor())])
piplines['ScalerCART'] = Pipeline([('Scaler', StandardScaler()), ('CART', DecisionTreeRegressor())])
piplines['ScalerSVM'] = Pipeline([('Scaler', StandardScaler()), ('SVM', SVR())])

# The splitter does not depend on the model, so build it once.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
results = []
for key in piplines:
    # error_score='raise' surfaces fitting errors instead of hiding them.
    cv_result = cross_val_score(piplines[key], X_train, Y_train, cv=kfold, scoring=scoring,
                                error_score='raise')
    results.append(cv_result)
    print('%s:%.3f(%.3f)' % (key, cv_result.mean(), cv_result.std()))

# Box plot of the per-fold scores.
# Font settings so that the Chinese title and minus signs render correctly.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
fig = pyplot.figure()
fig.suptitle('数据标准化')
ax = fig.add_subplot(111)
pyplot.boxplot(results)
# Label each box with the pipeline it belongs to (not with the keys of the
# unrelated `models` dict).
ax.set_xticklabels(list(piplines))
pyplot.show()
输出结果:
ScalerLR:-32.249(10.883)
ScalerLasso:-37.933(15.806)
ScalerEN:-40.444(19.475)
ScalerKNN:-27.802(15.461)
ScalerCART:-21.149(14.050)
ScalerSVM:-37.356(21.108)
5评估算法
# Spot-check the baseline regressors on the unscaled training data.
num_folds = 10  # 10-fold cross-validation
seed = 7
scoring = 'neg_mean_squared_error'  # negated mean squared error
models = {
    'LR': LinearRegression(),
    'LASSO': Lasso(),
    'EN': ElasticNet(),  # elastic net
    'KNN': KNeighborsRegressor(),
    'CART': DecisionTreeRegressor(),
    'SVM': SVR(),
}
results = []
for name, estimator in models.items():
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    # error_score='raise' surfaces fitting errors instead of hiding them.
    fold_scores = cross_val_score(estimator, X_train, Y_train, cv=kfold,
                                  scoring=scoring, error_score='raise')
    results.append(fold_scores)
    print('%s:%.3f(%.3f)' % (name, fold_scores.mean(), fold_scores.std()))

# Box plot of the per-fold scores, one box per model.
# Font settings so that the Chinese title and minus signs render correctly.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
fig = plt.figure()
fig.suptitle('算法比较')
ax = fig.add_subplot(1, 1, 1)
plt.boxplot(results)
ax.set_xticklabels(models.keys())
plt.show()
输出结果:
[8 rows x 14 columns]
LR:-32.249(10.883)
LASSO:-33.567(13.443)
EN:-33.308(12.972)
KNN:-37.080(16.347)
CART:-22.037(15.564)
SVM:-62.368(31.832)
6优化模型
6.1调参-K邻近算法
# Hyper-parameter tuning for the k-nearest-neighbours regressor.
# Standardise the training inputs first.
scaler = StandardScaler()
scaler.fit(X_train)
rescaler = scaler.transform(X_train)
# Equivalent one-liner:
# rescaler = StandardScaler().fit_transform(X_train)

# Candidate neighbour counts: the odd numbers 1, 3, ..., 21.
param_grid = {'n_neighbors': list(range(1, 22, 2))}
model = KNeighborsRegressor()
# Cross-validation splitter (fold count and seed come from earlier sections).
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
# Grid search over the candidate values.
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=rescaler, y=Y_train)

# Best score and the parameters that produced it.
print('最优结果:%s 使用 %s' % (grid_result.best_score_, grid_result.best_params_))
# For every parameter combination, print the mean score, its standard
# deviation and the parameters.
search_table = grid_result.cv_results_
for mean, std, params in zip(search_table['mean_test_score'],
                             search_table['std_test_score'],
                             search_table['params']):
    print(f'{mean:.4f}({std:.4f}) with {params}')
输出结果:
最优结果:-27.682056016192412 使用 {'n_neighbors': 3}
-37.1183(15.9748) with {'n_neighbors': 1}
-27.6821(13.7744) with {'n_neighbors': 3}
-27.7537(15.6042) with {'n_neighbors': 5}
-29.8563(18.3704) with {'n_neighbors': 7}
-29.6829(18.2181) with {'n_neighbors': 9}
-30.2073(18.9710) with {'n_neighbors': 11}
-31.3286(19.0181) with {'n_neighbors': 13}
-31.8489(19.9107) with {'n_neighbors': 15}
-32.2869(19.8036) with {'n_neighbors': 17}
-32.6357(19.9702) with {'n_neighbors': 19}
-33.6418(20.1465) with {'n_neighbors': 21}
6.2集成算法
# Compare ensemble regressors, each behind a StandardScaler in a Pipeline.
ensembles = {}
ensembles['ScaledAB'] = Pipeline([('Scaler', StandardScaler()), ('AB', AdaBoostRegressor())])
ensembles['ScaledAB-KNN'] = Pipeline(
    [('Scaler', StandardScaler()), ('ABKNN', AdaBoostRegressor(KNeighborsRegressor()))])
ensembles['ScaledAB-LR'] = Pipeline([('Scaler', StandardScaler()), ('ABLR', AdaBoostRegressor(LinearRegression()))])
ensembles['ScaledRFR'] = Pipeline([('Scaler', StandardScaler()), ('RFR', RandomForestRegressor())])
ensembles['ScaledETR'] = Pipeline([('Scaler', StandardScaler()), ('ETR', ExtraTreesRegressor())])
ensembles['ScaledGBR'] = Pipeline([('Scaler', StandardScaler()), ('GBR', GradientBoostingRegressor())])

# The splitter does not depend on the model, so build it once.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
results = []
for key in ensembles:
    # error_score='raise' surfaces fitting errors instead of hiding them.
    cv_result = cross_val_score(ensembles[key], X_train, Y_train, cv=kfold, scoring=scoring,
                                error_score='raise')
    results.append(cv_result)
    print('%s:%.3f(%.3f)' % (key, cv_result.mean(), cv_result.std()))

# Box plot of the per-fold scores.
# Font settings so that the Chinese title and minus signs render correctly.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
fig = pyplot.figure()
fig.suptitle('集成算法')
ax = fig.add_subplot(111)
pyplot.boxplot(results)
# Label each box with its ensemble; the previous code used the keys of the
# baseline `models` dict, which mislabelled every box.
ax.set_xticklabels(list(ensembles))
pyplot.show()
输出结果:
ScaledAB:-13.227(8.094)
ScaledAB-KNN:-30.691(9.248)
ScaledAB-LR:-35.337(5.776)
ScaledRFR:-11.519(8.069)
ScaledETR:-8.841(6.084)
ScaledGBR:-10.645(7.395)
6.3集成算法调参
# Tune n_estimators for the gradient boosting regressor.
# Standardise the training inputs.
scaler = StandardScaler()
rescaledX = scaler.fit(X_train).transform(X_train)
# Candidate numbers of estimators to try.
n_estimators_candidates = [10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 950]
param_grid = {'n_estimators': n_estimators_candidates}
model = GradientBoostingRegressor()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
# Grid search over the candidates.
grid = GridSearchCV(model, param_grid=param_grid, cv=kfold, scoring=scoring)
grid_result = grid.fit(rescaledX, Y_train)
print('最优:%s 使用%s' % (grid_result.best_score_, grid_result.best_params_))
输出结果:
最优:-10.394225605634656 使用{'n_estimators': 200}
6.4ET-调参
# Tune n_estimators for the extra-trees regressor.
# Standardise the training inputs.
scaler = StandardScaler()
scaler.fit(X_train)
rescaledX = scaler.transform(X_train)
# Same candidate list as used for the gradient boosting search.
param_grid = dict(n_estimators=[10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 950])
model = ExtraTreesRegressor()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=kfold, scoring=scoring)
grid_result = grid.fit(rescaledX, Y_train)
best_score, best_params = grid_result.best_score_, grid_result.best_params_
print('最优:%s 使用%s' % (best_score, best_params))
输出结果:
最优:-8.302744406197386 使用{'n_estimators': 500}
7结果部署
# Final model: fit an Extra Trees regressor on the standardised training data
# and report its mean squared error on the held-out validation set.

# Build the model.
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
# Use the n_estimators value that the Extra Trees grid search in this file
# reported as best (500) rather than the untuned 100.
# NOTE: the variable keeps its historical name `gbr`, but it holds an
# ExtraTreesRegressor, not a gradient boosting model.
gbr = ExtraTreesRegressor(n_estimators=500)
gbr.fit(X=rescaledX, y=Y_train)

# Evaluate the model: transform the validation inputs with the scaler fitted on
# the training data, then predict.
rescaledX_validation = scaler.transform(X_validation)
predictions = gbr.predict(rescaledX_validation)
print(mean_squared_error(Y_validation, predictions))
输出结果:
13.299587596953911