Abalone Data Case Study (Ridge Regression and LASSO Regression)

Exploratory Analysis of the Dataset

import pandas as pd
import warnings
warnings.filterwarnings('ignore')
data=pd.read_csv(r"E:\大二下\机器学习实践\abalone_dataset.csv")
data.head()
  sex  length  diameter  height  whole weight  shucked weight  viscera weight  shell weight  rings
0   M   0.455     0.365   0.095        0.5140          0.2245          0.1010         0.150     15
1   M   0.350     0.265   0.090        0.2255          0.0995          0.0485         0.070      7
2   F   0.530     0.420   0.135        0.6770          0.2565          0.1415         0.210      9
3   M   0.440     0.365   0.125        0.5160          0.2155          0.1140         0.155     10
4   I   0.330     0.255   0.080        0.2050          0.0895          0.0395         0.055      7
# number of samples and features in the dataset
data.shape
(4177, 9)
# inspect the data types and check for missing values
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
sex               4177 non-null object
length            4177 non-null float64
diameter          4177 non-null float64
height            4177 non-null float64
whole weight      4177 non-null float64
shucked weight    4177 non-null float64
viscera weight    4177 non-null float64
shell weight      4177 non-null float64
rings             4177 non-null int64
dtypes: float64(7), int64(1), object(1)
memory usage: 293.8+ KB
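
As a cross-check (a minimal sketch, not part of the original notebook), missing values can also be counted column by column:

# count missing values per column; all zeros confirms the dataset is complete
data.isnull().sum()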
data.describe()
            length     diameter       height  whole weight  shucked weight  viscera weight  shell weight        rings
count  4177.000000  4177.000000  4177.000000   4177.000000     4177.000000     4177.000000   4177.000000  4177.000000
mean      0.523992     0.407881     0.139516      0.828742        0.359367        0.180594      0.238831     9.933684
std       0.120093     0.099240     0.041827      0.490389        0.221963        0.109614      0.139203     3.224169
min       0.075000     0.055000     0.000000      0.002000        0.001000        0.000500      0.001500     1.000000
25%       0.450000     0.350000     0.115000      0.441500        0.186000        0.093500      0.130000     8.000000
50%       0.545000     0.425000     0.140000      0.799500        0.336000        0.171000      0.234000     9.000000
75%       0.615000     0.480000     0.165000      1.153000        0.502000        0.253000      0.329000    11.000000
max       0.815000     0.650000     1.130000      2.825500        1.488000        0.760000      1.005000    29.000000


# distribution of values in the sex column
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

sns.countplot(x = "sex",data=data)
<matplotlib.axes._subplots.AxesSubplot at 0x27f16455080>

[Figure: count plot of the sex column]

data['sex'].value_counts()
M    1528
I    1342
F    1307
Name: sex, dtype: int64
i = 1  # subplot counter
plt.figure(figsize=(16,8))
for col in data.columns[1:]:
    plt.subplot(4,2,i)
    i = i + 1
    sns.distplot(data[col])
plt.tight_layout()

[Figure: distribution plots of the numeric features]

sns.pairplot(data,hue="sex")
<seaborn.axisgrid.PairGrid at 0x27f16d16eb8>

[Figure: pairplot of the features, colored by sex]

corr_df = data.corr()
corr_df
                  length  diameter    height  whole weight  shucked weight  viscera weight  shell weight     rings
length          1.000000  0.986812  0.827554      0.925261        0.897914        0.903018      0.897706  0.556720
diameter        0.986812  1.000000  0.833684      0.925452        0.893162        0.899724      0.905330  0.574660
height          0.827554  0.833684  1.000000      0.819221        0.774972        0.798319      0.817338  0.557467
whole weight    0.925261  0.925452  0.819221      1.000000        0.969405        0.966375      0.955355  0.540390
shucked weight  0.897914  0.893162  0.774972      0.969405        1.000000        0.931961      0.882617  0.420884
viscera weight  0.903018  0.899724  0.798319      0.966375        0.931961        1.000000      0.907656  0.503819
shell weight    0.897706  0.905330  0.817338      0.955355        0.882617        0.907656      1.000000  0.627574
rings           0.556720  0.574660  0.557467      0.540390        0.420884        0.503819      0.627574  1.000000
fig, ax = plt.subplots(figsize=(12,12))
## draw the correlation heatmap
ax = sns.heatmap(corr_df,linewidths=.5,
                cmap="Greens",
                annot=True,
                xticklabels=corr_df.columns,
                yticklabels=corr_df.index)
ax.xaxis.set_label_position('top')
ax.xaxis.tick_top()

[Figure: correlation heatmap of the numeric features]

Abalone Data Preprocessing

One-hot encode the sex feature so it can enter the model as dummy variables. Only two of the three dummies (sex_F and sex_M) will be used later, which avoids perfect collinearity with the constant "ones" column.

# one-hot encode the sex feature with pandas get_dummies
sex_onehot = pd.get_dummies(data["sex"],prefix="sex")
data[sex_onehot.columns] = sex_onehot
data.head()
  sex  length  diameter  height  whole weight  shucked weight  viscera weight  shell weight  rings  sex_F  sex_I  sex_M
0   M   0.455     0.365   0.095        0.5140          0.2245          0.1010         0.150     15      0      0      1
1   M   0.350     0.265   0.090        0.2255          0.0995          0.0485         0.070      7      0      0      1
2   F   0.530     0.420   0.135        0.6770          0.2565          0.1415         0.210      9      1      0      0
3   M   0.440     0.365   0.125        0.5160          0.2155          0.1140         0.155     10      0      0      1
4   I   0.330     0.255   0.080        0.2050          0.0895          0.0395         0.055      7      0      1      0
data["ones"]=1
data.head()
  sex  length  diameter  height  whole weight  shucked weight  viscera weight  shell weight  rings  sex_F  sex_I  sex_M  ones
0   M   0.455     0.365   0.095        0.5140          0.2245          0.1010         0.150     15      0      0      1     1
1   M   0.350     0.265   0.090        0.2255          0.0995          0.0485         0.070      7      0      0      1     1
2   F   0.530     0.420   0.135        0.6770          0.2565          0.1415         0.210      9      1      0      0     1
3   M   0.440     0.365   0.125        0.5160          0.2155          0.1140         0.155     10      0      0      1     1
4   I   0.330     0.255   0.080        0.2050          0.0895          0.0395         0.055      7      0      1      0     1
data["age"]=data["rings"] + 1.5
data.head()
  sex  length  diameter  height  whole weight  shucked weight  viscera weight  shell weight  rings  sex_F  sex_I  sex_M  ones   age
0   M   0.455     0.365   0.095        0.5140          0.2245          0.1010         0.150     15      0      0      1     1  16.5
1   M   0.350     0.265   0.090        0.2255          0.0995          0.0485         0.070      7      0      0      1     1   8.5
2   F   0.530     0.420   0.135        0.6770          0.2565          0.1415         0.210      9      1      0      0     1  10.5
3   M   0.440     0.365   0.125        0.5160          0.2155          0.1140         0.155     10      0      0      1     1  11.5
4   I   0.330     0.255   0.080        0.2050          0.0895          0.0395         0.055      7      0      1      0     1   8.5

Feature Selection


data.columns
Index(['sex', 'length', 'diameter', 'height', 'whole weight', 'shucked weight',
       'viscera weight', 'shell weight', 'rings', 'sex_F', 'sex_I', 'sex_M',
       'ones', 'age'],
      dtype='object')
y = data["age"] #因变量
features_with_ones = ["length", "diameter", "height", "whole weight", "shucked weight",
       "viscera weight", "shell weight", "sex_F", "sex_M","ones"]
features_without_ones = ["length", "diameter", "height", "whole weight", "shucked weight",
       "viscera weight", "shell weight", "sex_F", "sex_M"]
X=data[features_with_ones]

Splitting the Abalone Dataset into Training and Test Sets


# split into an 80% training set and a 20% test set
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=111)
X
      length  diameter  height  whole weight  shucked weight  viscera weight  shell weight  sex_F  sex_M  ones
0      0.455     0.365   0.095        0.5140          0.2245          0.1010        0.1500      0      1     1
1      0.350     0.265   0.090        0.2255          0.0995          0.0485        0.0700      0      1     1
2      0.530     0.420   0.135        0.6770          0.2565          0.1415        0.2100      1      0     1
3      0.440     0.365   0.125        0.5160          0.2155          0.1140        0.1550      0      1     1
4      0.330     0.255   0.080        0.2050          0.0895          0.0395        0.0550      0      0     1
...      ...       ...     ...           ...             ...             ...           ...    ...    ...   ...
4172   0.565     0.450   0.165        0.8870          0.3700          0.2390        0.2490      1      0     1
4173   0.590     0.440   0.135        0.9660          0.4390          0.2145        0.2605      0      1     1
4174   0.600     0.475   0.205        1.1760          0.5255          0.2875        0.3080      0      1     1
4175   0.625     0.485   0.150        1.0945          0.5310          0.2610        0.2960      1      0     1
4176   0.710     0.555   0.195        1.9485          0.9455          0.3765        0.4950      0      1     1

4177 rows × 10 columns

Implementing Linear Regression and Ridge Regression

Linear Regression with NumPy

The least-squares weights are given by the normal equation w = (X^T X)^(-1) X^T y, which the function below implements directly.

import numpy as np
def linear_regression(X, y):
    # closed-form least squares: w = (X^T X)^{-1} X^T y
    w = np.zeros(X.shape[1])   # fall back to a zero vector if X^T X is singular
    if np.linalg.det(X.T.dot(X)) != 0:
        w = np.linalg.inv(X.T.dot(X)).dot(X.T).dot(y)
    return w
# train the linear regression implemented above on the abalone training set
w1 = linear_regression(X_train,y_train)
w1 = pd.DataFrame(data = w1,index=X.columns,columns =["numpy_w"])
w1.round(decimals=2)
                numpy_w
length            -1.12
diameter          10.00
height            20.74
whole weight       9.61
shucked weight   -20.05
viscera weight   -12.07
shell weight       6.55
sex_F              0.88
sex_M              0.87
ones               4.32
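
If X^T X were singular, the determinant check in linear_regression above would simply return a zero vector. A more robust variant (shown here only as a sketch, not part of the original notebook) uses the Moore-Penrose pseudo-inverse, which yields a least-squares solution even in that case:

# least-squares weights via the pseudo-inverse; handles a singular or ill-conditioned X^T X
w_pinv = np.linalg.pinv(X_train).dot(y_train)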


Linear Regression with scikit-learn

from sklearn.linear_model import LinearRegression 
lr = LinearRegression()
lr.fit(X_train[features_without_ones],y_train)
print(lr.coef_)
[ -1.118146    10.00094599  20.73712616   9.61484657 -20.05079291
 -12.06849193   6.54529076   0.87855188   0.87283083]
w1
                  numpy_w
length          -1.118146
diameter        10.000946
height          20.737126
whole weight     9.614847
shucked weight -20.050793
viscera weight -12.068492
shell weight     6.545291
sex_F            0.878552
sex_M            0.872831
ones             4.324477
w_lr=[]
w_lr.extend(lr.coef_)
w_lr.append(lr.intercept_)
w1["lr_sklearn_w"]=w_lr
w1.round(decimals=2)
                numpy_w  lr_sklearn_w
length            -1.12         -1.12
diameter          10.00         10.00
height            20.74         20.74
whole weight       9.61          9.61
shucked weight   -20.05        -20.05
viscera weight   -12.07        -12.07
shell weight       6.55          6.55
sex_F              0.88          0.88
sex_M              0.87          0.87
ones               4.32          4.32
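
As a quick numerical check (a small sketch, not in the original notebook), the NumPy and scikit-learn solutions can be compared directly:

# should print True: the closed-form weights agree with sklearn's coefficients plus intercept
print(np.allclose(w1["numpy_w"], w1["lr_sklearn_w"]))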
# note: when adding regularization, the intercept b is not penalized, so the lambda*I term must not act on the intercept entry

Ridge Regression with NumPy

def ridge_regression(X, y, ridge_lambda):
    penalty_matrix = np.eye(X.shape[1])
    penalty_matrix[X.shape[1]-1][X.shape[1]-1] = 0   # do not penalize the intercept ("ones") column
    # closed-form ridge solution: w = (X^T X + lambda * P)^{-1} X^T y
    w = np.linalg.inv(X.T.dot(X) + ridge_lambda * penalty_matrix).dot(X.T).dot(y)
    return w
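
For reference, the function above evaluates the closed-form ridge solution, where the penalty matrix P is the identity with a zero at the position of the constant "ones" column, so the intercept is not regularized:

$\hat{w}_{\mathrm{ridge}} = (X^{\top}X + \lambda P)^{-1} X^{\top} y, \qquad P = \mathrm{diag}(1, \ldots, 1, 0)$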

Train a ridge regression model on the abalone training set with the ridge_regression function, setting the regularization coefficient to 1.

w2 = ridge_regression(X_train,y_train,1.0)
print(w2)
[  2.30976528   6.72038628  10.23298909   7.05879189 -17.16249532
  -7.2343118    9.3936994    0.96869974   0.9422174    4.80583032]
w1["numpy_ridge_w"] = w2
w1.round(decimals=2)
                numpy_w  lr_sklearn_w  numpy_ridge_w
length            -1.12         -1.12           2.31
diameter          10.00         10.00           6.72
height            20.74         20.74          10.23
whole weight       9.61          9.61           7.06
shucked weight   -20.05        -20.05         -17.16
viscera weight   -12.07        -12.07          -7.23
shell weight       6.55          6.55           9.39
sex_F              0.88          0.88           0.97
sex_M              0.87          0.87           0.94
ones               4.32          4.32           4.81

Ridge Regression with scikit-learn

For comparison with scikit-learn's ridge regression, the regularization coefficient is likewise set to 1.

from sklearn.linear_model import Ridge
ridge = Ridge(alpha=1.0)
ridge.fit(X_train[features_without_ones],y_train)
w_ridge = []
w_ridge.extend(ridge.coef_)
w_ridge.append(ridge.intercept_)
w1["ridge_sklearn_w"] = w_ridge
w1.round(decimals=2)
                numpy_w  lr_sklearn_w  numpy_ridge_w  ridge_sklearn_w
length            -1.12         -1.12           2.31             2.31
diameter          10.00         10.00           6.72             6.72
height            20.74         20.74          10.23            10.23
whole weight       9.61          9.61           7.06             7.06
shucked weight   -20.05        -20.05         -17.16           -17.16
viscera weight   -12.07        -12.07          -7.23            -7.23
shell weight       6.55          6.55           9.39             9.39
sex_F              0.88          0.88           0.97             0.97
sex_M              0.87          0.87           0.94             0.94
ones               4.32          4.32           4.81             4.81

Ridge Trace Analysis

As the regularization strength α increases, the ridge coefficients shrink toward zero; plotting each coefficient against α gives the ridge trace.

alphas = np.logspace(-10, 10, 20)
coef = pd.DataFrame()
for alpha in alphas:
    ridge_clf = Ridge(alpha=alpha)
    ridge_clf.fit(X_train[features_without_ones], y_train)
    df = pd.DataFrame([ridge_clf.coef_], columns=X_train[features_without_ones].columns)
    df['alpha'] = alpha
    coef = pd.concat([coef, df], ignore_index=True)   # DataFrame.append was removed in pandas 2.0
coef.round(decimals=2)
    length  diameter  height  whole weight  shucked weight  viscera weight  shell weight  sex_F  sex_M         alpha
0    -1.12     10.00   20.74          9.61          -20.05          -12.07          6.55   0.88   0.87  0.000000e+00
1    -1.12     10.00   20.74          9.61          -20.05          -12.07          6.55   0.88   0.87  0.000000e+00
2    -1.12     10.00   20.74          9.61          -20.05          -12.07          6.55   0.88   0.87  0.000000e+00
3    -1.12     10.00   20.74          9.61          -20.05          -12.07          6.55   0.88   0.87  0.000000e+00
4    -1.12     10.00   20.74          9.61          -20.05          -12.07          6.55   0.88   0.87  0.000000e+00
5    -1.12     10.00   20.74          9.61          -20.05          -12.07          6.55   0.88   0.87  0.000000e+00
6    -1.12     10.00   20.73          9.61          -20.05          -12.07          6.55   0.88   0.87  0.000000e+00
7    -1.10      9.98   20.68          9.60          -20.04          -12.05          6.56   0.88   0.87  0.000000e+00
8    -0.88      9.79   20.13          9.50          -19.94          -11.86          6.71   0.88   0.88  3.000000e-02
9     0.73      8.33   15.60          8.55          -18.97          -10.05          7.98   0.92   0.90  3.000000e-01
10    3.20      5.02    5.40          5.11          -13.71           -3.67          9.61   1.07   1.00  3.360000e+00
11    1.66      1.76    1.12          2.53           -3.54           -0.09          3.67   1.33   1.11  3.793000e+01
12    0.51      0.47    0.22          1.63            0.18            0.30          0.79   0.89   0.69  4.281300e+02
13    0.12      0.10    0.04          0.46            0.15            0.09          0.16   0.21   0.16  4.832930e+03
14    0.01      0.01    0.00          0.05            0.02            0.01          0.02   0.02   0.02  5.455595e+04
15    0.00      0.00    0.00          0.00            0.00            0.00          0.00   0.00   0.00  6.158482e+05
16    0.00      0.00    0.00          0.00            0.00            0.00          0.00   0.00   0.00  6.951928e+06
17    0.00      0.00    0.00          0.00            0.00            0.00          0.00   0.00   0.00  7.847600e+07
18    0.00      0.00    0.00          0.00            0.00            0.00          0.00   0.00   0.00  8.858668e+08
19    0.00      0.00    0.00          0.00            0.00            0.00          0.00   0.00   0.00  1.000000e+10
plt.rcParams['figure.dpi'] = 300   # figure resolution
plt.figure(figsize=(9,6))

for feature in X_train.columns[:-1]:
    plt.plot('alpha', feature, data=coef)
ax = plt.gca()
ax.set_xscale('log')
plt.legend(loc='upper right')
plt.xlabel(r'$\alpha$', fontsize=15)
plt.ylabel('coefficient', fontsize=15)
Text(0, 0.5, 'coefficient')




[Figure: ridge trace — coefficient values versus α on a log scale]

Building an Abalone Age Prediction Model with LASSO

LASSO replaces the ridge L2 penalty with an L1 penalty, minimizing the squared error plus α·||w||_1; because the L1 term drives some coefficients exactly to zero, LASSO also performs feature selection.

from sklearn.linear_model import Lasso
lasso = Lasso(alpha=0.01)
lasso.fit(X_train[features_without_ones],y_train)
print(lasso.coef_)
print(lasso.intercept_)
[  0.           6.37435514   0.           4.46703234 -13.44947667
  -0.          11.85934842   0.98908791   0.93313403]
6.500338023591298
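
To make the sparsity explicit, each coefficient can be paired with its feature name. This is a small sketch, not part of the original notebook, reusing the lasso model fitted above:

# features with non-zero coefficients are the ones LASSO kept
lasso_coef = pd.Series(lasso.coef_, index=features_without_ones)
print(lasso_coef[lasso_coef != 0])
print(lasso_coef[lasso_coef == 0].index.tolist())   # features dropped by the L1 penalty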

The LASSO Regularization Path

coef = pd.DataFrame()
for alpha in np.linspace(0.0001, 0.2, 20):
    lasso_clf = Lasso(alpha=alpha)
    lasso_clf.fit(X_train[features_without_ones], y_train)
    df = pd.DataFrame([lasso_clf.coef_], columns=X_train[features_without_ones].columns)
    df['alpha'] = alpha
    coef = pd.concat([coef, df], ignore_index=True)   # DataFrame.append was removed in pandas 2.0
coef.head()
# plot the LASSO path
plt.figure(figsize=(9,6), dpi=600)
for feature in X_train.columns[:-1]:
    plt.plot('alpha', feature, data=coef)
plt.legend(loc='upper right')
plt.xlabel(r'$\alpha$', fontsize=15)
plt.ylabel('coefficient', fontsize=15)
plt.show()

[Figure: LASSO regularization path — coefficient values versus α]

coef
      length  diameter    height  whole weight  shucked weight  viscera weight  shell weight     sex_F     sex_M     alpha
0  -0.568043  9.392752  0.390041      9.542038      -19.995972      -11.900326      6.635352  0.881496  0.875132  0.000100
1   0.000000  6.025730  0.000000      4.375754      -13.127223       -0.000000     11.897189  0.995137  0.934129  0.010621
2   0.384927  0.000000  0.000000      2.797815       -7.702209       -0.000000     12.478541  1.093479  0.948281  0.021142
3   0.000000  0.000000  0.000000      0.884778       -2.749504        0.000000     11.705974  1.098990  0.897673  0.031663
4   0.000000  0.000000  0.000000      0.322742       -0.000000        0.000000      9.225919  1.072991  0.834021  0.042184
5   0.000000  0.000000  0.000000      1.555502       -0.000000        0.000000      4.610425  1.013824  0.757891  0.052705
6   0.000000  0.000000  0.000000      2.786784       -0.000000        0.000000      0.000000  0.954710  0.681821  0.063226
7   0.000000  0.000000  0.000000      2.797514       -0.000000        0.000000      0.000000  0.848412  0.581613  0.073747
8   0.000000  0.000000  0.000000      2.807843       -0.000000        0.000000      0.000000  0.742529  0.481711  0.084268
9   0.000000  0.000000  0.000000      2.818184       -0.000000        0.000000      0.000000  0.636632  0.381799  0.094789
10  0.000000  0.000000  0.000000      2.828630       -0.000000        0.000000      0.000000  0.530615  0.281801  0.105311
11  0.000000  0.000000  0.000000      2.838944       -0.000000        0.000000      0.000000  0.424750  0.181912  0.115832
12  0.000000  0.000000  0.000000      2.849325       -0.000000        0.000000      0.000000  0.318807  0.081967  0.126353
13  0.000000  0.000000  0.000000      2.851851       -0.000000        0.000000      0.000000  0.225024  0.000000  0.136874
14  0.000000  0.000000  0.000000      2.819079       -0.000000        0.000000      0.000000  0.186157  0.000000  0.147395
15  0.000000  0.000000  0.000000      2.786307       -0.000000        0.000000      0.000000  0.147290  0.000000  0.157916
16  0.000000  0.000000  0.000000      2.753535        0.000000        0.000000      0.000000  0.108422  0.000000  0.168437
17  0.000000  0.000000  0.000000      2.720762        0.000000        0.000000      0.000000  0.069555  0.000000  0.178958
18  0.000000  0.000000  0.000000      2.687990        0.000000        0.000000      0.000000  0.030688  0.000000  0.189479
19  0.000000  0.000000  0.000000      2.652940        0.000000        0.000000      0.000000  0.000000  0.000000  0.200000

As α increases, LASSO drives coefficients to exactly zero one after another; by α = 0.2 only whole weight remains in the model.

Model Evaluation

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
# MAE (mean absolute error)
y_test_pred_lr = lr.predict(X_test.iloc[:,:-1])   # iloc[:,:-1] drops the 'ones' column, which lr was fitted without
print(round(mean_absolute_error(y_test,y_test_pred_lr),4))
1.6016
y_test_pred_ridge = ridge.predict(X_test[features_without_ones])
print(round(mean_absolute_error(y_test,y_test_pred_ridge),4))
1.5984
y_test_pred_lasso = lasso.predict(X_test[features_without_ones])
print(round(mean_absolute_error(y_test,y_test_pred_lasso),4))
1.6402
#MSE
y_test_pred_lr = lr.predict(X_test.iloc[:,:-1])
print(round(mean_squared_error(y_test,y_test_pred_lr),4))
5.3009
y_test_pred_ridge = ridge.predict(X_test[features_without_ones])
print(round(mean_squared_error(y_test,y_test_pred_ridge),4))
4.959
y_test_pred_lasso = lasso.predict(X_test[features_without_ones])
print(round(mean_squared_error(y_test,y_test_pred_lasso),4))
5.1
# R2 score
print(round(r2_score(y_test,y_test_pred_lr),4))
print(round(r2_score(y_test,y_test_pred_ridge),4))
print(round(r2_score(y_test,y_test_pred_lasso),4))
0.5257
0.5563
0.5437
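
The three metric blocks above can also be computed in one loop. The following is a minimal sketch (not part of the original notebook); it reuses the fitted lr, ridge and lasso models and the same test split:

# evaluate all three fitted models on the same test features
models = {"LinearRegression": lr, "Ridge": ridge, "Lasso": lasso}
for name, model in models.items():
    y_pred = model.predict(X_test[features_without_ones])
    print(name,
          "MAE:", round(mean_absolute_error(y_test, y_pred), 4),
          "MSE:", round(mean_squared_error(y_test, y_pred), 4),
          "R2:", round(r2_score(y_test, y_pred), 4))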

Residual Plot

plt.figure(figsize=(9,6),dpi=600)
y_train_pred_ridge = ridge.predict(X_train[features_without_ones])
plt.scatter(y_train_pred_ridge,y_train_pred_ridge - y_train,c="g",alpha=0.6)
plt.scatter(y_test_pred_ridge,y_test_pred_ridge - y_test,c="r",alpha=0.6)
plt.hlines(y=0,xmin=0,xmax=30,color="b",alpha=0.6)
plt.ylabel("Residuals")
plt.xlabel("Predict")
Text(0.5, 0, 'Predict')

[Figure: residuals versus predicted age for the ridge model (green: training set, red: test set)]


