Linear Regression Review: Ordinary Linear Regression, KNN Regression, Ridge Regression, Lasso Regression


Disclaimer: This content is not original; it is a summary of course material. Copyright belongs to 姜老师 (Teacher Jiang).

Introduction to Linear Regression

# Linear regression imports
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
import numpy as np 
import pandas as pd
from pandas import Series,DataFrame

import matplotlib.pyplot as plt
%matplotlib inline
# Generate toy data: x evenly spaced over [0, 2*pi], y = sin(x)
x = np.linspace(0, 2*np.pi, 60)
y = np.sin(x)
plt.scatter(x,y)
<matplotlib.collections.PathCollection at 0x1facb5abdc0>

[Figure: scatter plot of the clean sine data y = sin(x)]

# Generate 30 noise values uniformly distributed in (-0.5, 0.5)
bias = np.random.random(30) - 0.5
bias
array([-0.39502105,  0.24340303,  0.08361402, -0.00268972, -0.03143423,
       -0.41971947,  0.2567082 , -0.07361341, -0.14754526, -0.41319023,
        0.21895744, -0.31080432,  0.21599872, -0.08704963, -0.1380524 ,
       -0.23046208,  0.3592808 , -0.16425466, -0.00648164,  0.24217655,
        0.43796813,  0.21696034,  0.4143489 ,  0.22052109,  0.25845568,
       -0.35609606, -0.00906317, -0.24740159, -0.21426299, -0.13506899])
# Add the noise to every second element of y (30 points, matching the 30 noise values)
y[::2] += bias
plt.scatter(x,y)
<matplotlib.collections.PathCollection at 0x1facc2489a0>

[Figure: scatter plot of the sine data with noise added to every second point]

x.shape
(60,)
# Reshape into a 2-D feature matrix of shape (n_samples, n_features), as scikit-learn expects
X = x.reshape(-1,1)
X.shape
(60, 1)
X
array([[0.        ],
       [0.10649467],
       [0.21298933],
       [0.319484  ],
       [0.42597866],
       [0.53247333],
       [0.638968  ],
       [0.74546266],
       [0.85195733],
       [0.958452  ],
       [1.06494666],
       [1.17144133],
       [1.27793599],
       [1.38443066],
       [1.49092533],
       [1.59741999],
       [1.70391466],
       [1.81040933],
       [1.91690399],
       [2.02339866],
       [2.12989332],
       [2.23638799],
       [2.34288266],
       [2.44937732],
       [2.55587199],
       [2.66236666],
       [2.76886132],
       [2.87535599],
       [2.98185065],
       [3.08834532],
       [3.19483999],
       [3.30133465],
       [3.40782932],
       [3.51432399],
       [3.62081865],
       [3.72731332],
       [3.83380798],
       [3.94030265],
       [4.04679732],
       [4.15329198],
       [4.25978665],
       [4.36628132],
       [4.47277598],
       [4.57927065],
       [4.68576531],
       [4.79225998],
       [4.89875465],
       [5.00524931],
       [5.11174398],
       [5.21823864],
       [5.32473331],
       [5.43122798],
       [5.53772264],
       [5.64421731],
       [5.75071198],
       [5.85720664],
       [5.96370131],
       [6.07019597],
       [6.17669064],
       [6.28318531]])
# 1. Instantiate a KNN regressor
knn = KNeighborsRegressor(n_neighbors=3)
# 2. Fit the instantiated KNN regressor
knn.fit(X, y)
KNeighborsRegressor(n_neighbors=3)
# Generate test data; remember it must have the same 2-D structure as X
# Note: any count other than 60 works, because the training samples were generated
# as an evenly spaced sequence over the same range
X_test = np.linspace(0, 2*np.pi, 45).reshape(-1,1)
X_test
array([[0.        ],
       [0.14279967],
       [0.28559933],
       [0.428399  ],
       [0.57119866],
       [0.71399833],
       [0.856798  ],
       [0.99959766],
       [1.14239733],
       [1.28519699],
       [1.42799666],
       [1.57079633],
       [1.71359599],
       [1.85639566],
       [1.99919533],
       [2.14199499],
       [2.28479466],
       [2.42759432],
       [2.57039399],
       [2.71319366],
       [2.85599332],
       [2.99879299],
       [3.14159265],
       [3.28439232],
       [3.42719199],
       [3.56999165],
       [3.71279132],
       [3.85559098],
       [3.99839065],
       [4.14119032],
       [4.28398998],
       [4.42678965],
       [4.56958931],
       [4.71238898],
       [4.85518865],
       [4.99798831],
       [5.14078798],
       [5.28358764],
       [5.42638731],
       [5.56918698],
       [5.71198664],
       [5.85478631],
       [5.99758598],
       [6.14038564],
       [6.28318531]])
# 3. Predict, using X_test as the features
y1_ = knn.predict(X_test)
y1_
array([ 0.01827837,  0.01827837,  0.42913475,  0.41198503,  0.60033228,
        0.61071011,  0.59043822,  0.52817234,  0.74745077,  0.91117407,
        1.05438104,  1.00544205,  0.87894106,  0.60410825,  0.58399583,
        0.78779728,  0.74957241,  0.86489298,  0.75704464,  0.76885007,
        0.24759279,  0.04078003, -0.21986071, -0.2357875 , -0.18423447,
       -0.53796589, -0.53252107, -0.53902763, -0.64776802, -0.58808059,
       -0.77527639, -0.9766071 , -0.94502224, -0.94463684, -0.85647279,
       -0.75180678, -0.83833303, -0.76975748, -0.66825555, -0.6397318 ,
       -0.43908767, -0.50822301, -0.47612273, -0.11130414, -0.11130414])
# Plot the KNN regression predictions
plt.figure(figsize=(12,6))
plt.scatter(x, y ,label='True Data', color ='blue')
plt.plot(X_test,y1_, label='KNN prediction Line',color = 'green')
plt.legend()
plt.show()

[Figure: true data (scatter) with the KNN prediction line]

# Use linear regression for prediction
# 1. Instantiate the LinearRegression model
linear = LinearRegression()
# 2. Fit the model on X and y
linear.fit(X,y)
LinearRegression()
# 3. Predict using X_test
y2_ = linear.predict(X_test)
y2_
array([ 8.35573602e-01,  7.97617275e-01,  7.59660948e-01,  7.21704620e-01,
        6.83748293e-01,  6.45791966e-01,  6.07835639e-01,  5.69879311e-01,
        5.31922984e-01,  4.93966657e-01,  4.56010330e-01,  4.18054002e-01,
        3.80097675e-01,  3.42141348e-01,  3.04185021e-01,  2.66228693e-01,
        2.28272366e-01,  1.90316039e-01,  1.52359712e-01,  1.14403384e-01,
        7.64470572e-02,  3.84907299e-02,  5.34402699e-04, -3.74219245e-02,
       -7.53782518e-02, -1.13334579e-01, -1.51290906e-01, -1.89247234e-01,
       -2.27203561e-01, -2.65159888e-01, -3.03116215e-01, -3.41072542e-01,
       -3.79028870e-01, -4.16985197e-01, -4.54941524e-01, -4.92897851e-01,
       -5.30854179e-01, -5.68810506e-01, -6.06766833e-01, -6.44723160e-01,
       -6.82679488e-01, -7.20635815e-01, -7.58592142e-01, -7.96548469e-01,
       -8.34504797e-01])
# Plot the predictions of both models
plt.scatter(x,y,label='TRUE',color = 'skyblue')
plt.plot(X_test,y1_,label='KNN',color='pink')
plt.plot(X_test,y2_,label='Linear',color='yellow')
plt.legend()
plt.show()

[Figure: true data with the KNN and linear prediction lines]

KNN regression (diabetes dataset)

import numpy as np
import pandas as pd
from pandas import Series,DataFrame

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.datasets import load_diabetes
# Load the diabetes dataset
diabetes = load_diabetes()
diabetes
{'data': array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
          0.01990842, -0.01764613],
        [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
         -0.06832974, -0.09220405],
        [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
          0.00286377, -0.02593034],
        ...,
        [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
         -0.04687948,  0.01549073],
        [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
          0.04452837, -0.02593034],
        [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
         -0.00421986,  0.00306441]]),
 'target': array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
         69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
         68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
         87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
        259.,  53., 190., 142.,  75., 142., 155., 225.,  59., 104., 182.,
        128.,  52.,  37., 170., 170.,  61., 144.,  52., 128.,  71., 163.,
        150.,  97., 160., 178.,  48., 270., 202., 111.,  85.,  42., 170.,
        200., 252., 113., 143.,  51.,  52., 210.,  65., 141.,  55., 134.,
         42., 111.,  98., 164.,  48.,  96.,  90., 162., 150., 279.,  92.,
         83., 128., 102., 302., 198.,  95.,  53., 134., 144., 232.,  81.,
        104.,  59., 246., 297., 258., 229., 275., 281., 179., 200., 200.,
        173., 180.,  84., 121., 161.,  99., 109., 115., 268., 274., 158.,
        107.,  83., 103., 272.,  85., 280., 336., 281., 118., 317., 235.,
         60., 174., 259., 178., 128.,  96., 126., 288.,  88., 292.,  71.,
        197., 186.,  25.,  84.,  96., 195.,  53., 217., 172., 131., 214.,
         59.,  70., 220., 268., 152.,  47.,  74., 295., 101., 151., 127.,
        237., 225.,  81., 151., 107.,  64., 138., 185., 265., 101., 137.,
        143., 141.,  79., 292., 178.,  91., 116.,  86., 122.,  72., 129.,
        142.,  90., 158.,  39., 196., 222., 277.,  99., 196., 202., 155.,
         77., 191.,  70.,  73.,  49.,  65., 263., 248., 296., 214., 185.,
         78.,  93., 252., 150.,  77., 208.,  77., 108., 160.,  53., 220.,
        154., 259.,  90., 246., 124.,  67.,  72., 257., 262., 275., 177.,
         71.,  47., 187., 125.,  78.,  51., 258., 215., 303., 243.,  91.,
        150., 310., 153., 346.,  63.,  89.,  50.,  39., 103., 308., 116.,
        145.,  74.,  45., 115., 264.,  87., 202., 127., 182., 241.,  66.,
         94., 283.,  64., 102., 200., 265.,  94., 230., 181., 156., 233.,
         60., 219.,  80.,  68., 332., 248.,  84., 200.,  55.,  85.,  89.,
         31., 129.,  83., 275.,  65., 198., 236., 253., 124.,  44., 172.,
        114., 142., 109., 180., 144., 163., 147.,  97., 220., 190., 109.,
        191., 122., 230., 242., 248., 249., 192., 131., 237.,  78., 135.,
        244., 199., 270., 164.,  72.,  96., 306.,  91., 214.,  95., 216.,
        263., 178., 113., 200., 139., 139.,  88., 148.,  88., 243.,  71.,
         77., 109., 272.,  60.,  54., 221.,  90., 311., 281., 182., 321.,
         58., 262., 206., 233., 242., 123., 167.,  63., 197.,  71., 168.,
        140., 217., 121., 235., 245.,  40.,  52., 104., 132.,  88.,  69.,
        219.,  72., 201., 110.,  51., 277.,  63., 118.,  69., 273., 258.,
         43., 198., 242., 232., 175.,  93., 168., 275., 293., 281.,  72.,
        140., 189., 181., 209., 136., 261., 113., 131., 174., 257.,  55.,
         84.,  42., 146., 212., 233.,  91., 111., 152., 120.,  67., 310.,
         94., 183.,  66., 173.,  72.,  49.,  64.,  48., 178., 104., 132.,
        220.,  57.]),
 'frame': None,
 'DESCR': '.. _diabetes_dataset:\n\nDiabetes dataset\n----------------\n\nTen baseline variables, age, sex, body mass index, average blood\npressure, and six blood serum measurements were obtained for each of n =\n442 diabetes patients, as well as the response of interest, a\nquantitative measure of disease progression one year after baseline.\n\n**Data Set Characteristics:**\n\n  :Number of Instances: 442\n\n  :Number of Attributes: First 10 columns are numeric predictive values\n\n  :Target: Column 11 is a quantitative measure of disease progression one year after baseline\n\n  :Attribute Information:\n      - age     age in years\n      - sex\n      - bmi     body mass index\n      - bp      average blood pressure\n      - s1      tc, total serum cholesterol\n      - s2      ldl, low-density lipoproteins\n      - s3      hdl, high-density lipoproteins\n      - s4      tch, total cholesterol / HDL\n      - s5      ltg, possibly log of serum triglycerides level\n      - s6      glu, blood sugar level\n\nNote: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. the sum of squares of each column totals 1).\n\nSource URL:\nhttps://www4.stat.ncsu.edu/~boos/var.select/diabetes.html\n\nFor more information see:\nBradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004) "Least Angle Regression," Annals of Statistics (with discussion), 407-499.\n(https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)',
 'feature_names': ['age',
  'sex',
  'bmi',
  'bp',
  's1',
  's2',
  's3',
  's4',
  's5',
  's6'],
 'data_filename': 'D:\\software\\anaconda\\lib\\site-packages\\sklearn\\datasets\\data\\diabetes_data.csv.gz',
 'target_filename': 'D:\\software\\anaconda\\lib\\site-packages\\sklearn\\datasets\\data\\diabetes_target.csv.gz'}
print(diabetes.DESCR)
.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, total serum cholesterol
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, total cholesterol / HDL
      - s5      ltg, possibly log of serum triglycerides level
      - s6      glu, blood sugar level

Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. the sum of squares of each column totals 1).

Source URL:
https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html

For more information see:
Bradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004) "Least Angle Regression," Annals of Statistics (with discussion), 407-499.
(https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)
diabetes.feature_names
['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']
# Put the data into a DataFrame
dataSets = DataFrame(data=diabetes.data,columns=diabetes.feature_names)
dataSets.head()
        age       sex       bmi        bp        s1        s2        s3        s4        s5        s6
0  0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401 -0.002592  0.019908 -0.017646
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163  0.074412 -0.039493 -0.068330 -0.092204
2  0.085299  0.050680  0.044451 -0.005671 -0.045599 -0.034194 -0.032356 -0.002592  0.002864 -0.025930
3 -0.089063 -0.044642 -0.011595 -0.036656  0.012191  0.024991 -0.036038  0.034309  0.022692 -0.009362
4  0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596  0.008142 -0.002592 -0.031991 -0.046641
dataSets.describe().T
     count          mean       std       min       25%       50%       75%       max
age  442.0 -3.634285e-16  0.047619 -0.107226 -0.037299  0.005383  0.038076  0.110727
sex  442.0  1.308343e-16  0.047619 -0.044642 -0.044642 -0.044642  0.050680  0.050680
bmi  442.0 -8.045349e-16  0.047619 -0.090275 -0.034229 -0.007284  0.031248  0.170555
bp   442.0  1.281655e-16  0.047619 -0.112400 -0.036656 -0.005671  0.035644  0.132044
s1   442.0 -8.835316e-17  0.047619 -0.126781 -0.034248 -0.004321  0.028358  0.153914
s2   442.0  1.327024e-16  0.047619 -0.115613 -0.030358 -0.003819  0.029844  0.198788
s3   442.0 -4.574646e-16  0.047619 -0.102307 -0.035117 -0.006584  0.029312  0.181179
s4   442.0  3.777301e-16  0.047619 -0.076395 -0.039493 -0.002592  0.034309  0.185234
s5   442.0 -3.830854e-16  0.047619 -0.126097 -0.033249 -0.001948  0.032433  0.133599
s6   442.0 -3.412882e-16  0.047619 -0.137767 -0.033179 -0.001078  0.027917  0.135612
# Split the sample set
from sklearn.model_selection import train_test_split
# Grab the labels
target = diabetes.target
# Split into training and test sets
X_train,X_test,y_train,y_test = train_test_split(dataSets, target,test_size=0.2, random_state=2)
# 1. Instantiate the KNN model
knn = KNeighborsRegressor()
# 2. Fit the model
knn.fit(X_train,y_train)
KNeighborsRegressor()
# Evaluation metrics: MSE and MAE
from sklearn.metrics import mean_squared_error, mean_absolute_error
# Generalization error: mean squared error (MSE)
mean_squared_error(y_test, knn.predict(X_test))
3941.039550561798
# Mean absolute error (MAE)
mean_absolute_error(y_test, knn.predict(X_test))
50.06966292134831
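For reference, the standard definitions of these two metrics are:

$$\mathrm{MSE}=\frac{1}{n}\sum_{i=1}^{n}\left(y_i-\hat{y}_i\right)^2 \qquad \mathrm{MAE}=\frac{1}{n}\sum_{i=1}^{n}\left|y_i-\hat{y}_i\right|$$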
target
array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
        69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
      ......
        94., 183.,  66., 173.,  72.,  49.,  64.,  48., 178., 104., 132.,
       220.,  57.])
# Remove the scale (dimensional differences) of the data
# 1. Standardization: z-score via StandardScaler - mean 0, standard deviation 1
# 2. Range scaling: MinMaxScaler rescales to [0, 1]
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# First, look at the distribution of target
sns.distplot(target)
<AxesSubplot:ylabel='Density'>

[Figure: distribution plot (distplot) of the raw target]

# 1. Instantiate
ss = StandardScaler()
# 2. Transform
ss_target = ss.fit_transform(target.reshape(-1,1))
ss_target
array([[-1.47194752e-02],
       [-1.00165882e+00],
       [-1.44579915e-01],
       [ 6.99512942e-01],
       [-2.22496178e-01],
       [-7.15965848e-01],
       [-1.83538046e-01],
       [-1.15749134e+00],
       [-5.47147277e-01],
       [ 2.05006151e+00],
     ......
       [-2.61454310e-01],
       [ 8.81317557e-01],
       [-1.23540761e+00]])
sns.distplot(ss_target)
<AxesSubplot:ylabel='Density'>

[Figure: distribution plot of the standardized target]
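For comparison, here is a minimal sketch of the MinMaxScaler imported above (it is not used in the rest of this section); it rescales each column linearly onto [0, 1]:

```python
# Sketch: MinMaxScaler applied to the same target column
mm = MinMaxScaler()
mm_target = mm.fit_transform(target.reshape(-1, 1))
mm_target.min(), mm_target.max()   # (0.0, 1.0)
```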

# Rebuild the model using the standardized target (ss_target)
# [Note] random_state must match the earlier split, so that both models use exactly the same training and test rows
X_train, X_test, y_train, y_test = train_test_split(dataSets, ss_target, test_size=0.2, random_state=2)
# Retrain the same k=5 regressor
knn.fit(X_train,y_train)
KNeighborsRegressor()
# New evaluation after retraining
mean_squared_error(y_test,knn.predict(X_test))
0.6646064163260871
mean_absolute_error(y_test,knn.predict(X_test))
0.6502068430871709

Finding a better model (KNN tuning, evaluation with MAE and MSE)

# First, try plain linear regression
# 1. Instantiate
linear = LinearRegression()
# 2. Fit
linear.fit(X_train,y_train)
# 3. Evaluate
mean_squared_error(y_test,linear.predict(X_test))
0.5218363683134858

Problems:

  1. KNN has hyperparameters that can be tuned
  2. A single score reflects one random train/test split
  3. A regression model's quality must be judged on both the generalization (test) error and the empirical (training) error
# Define a helper function to evaluate a model over repeated random splits
def cal_score(model, X, y, count, test_size):
    train_mse_list = []
    test_mse_list = []
    
    for i in range(count):
        X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=test_size)
        model.fit(X_train, y_train)  # fit on the training split only
        train_mse_list.append(mean_squared_error(y_train, model.predict(X_train)))
        test_mse_list.append(mean_squared_error(y_test, model.predict(X_test)))
    return np.array(train_mse_list),np.array(test_mse_list)
knn = KNeighborsRegressor(n_neighbors=5)
train_score_array,test_score_array = cal_score(model=knn,X=dataSets.values,y=target,count=10,test_size=0.3)
train_score_array.mean() ,train_score_array.std()
(2336.3235339805824, 87.16520782728838)
test_score_array.mean() ,test_score_array.std()
(2357.03569924812, 202.51164826039167)
linear = LinearRegression()
train_score_array,test_score_array = cal_score(model=linear,X=dataSets.values,y=target,count=10,test_size=0.3)
train_score_array.mean() ,train_score_array.std()
(2807.125616206974, 100.88450030074503)
test_score_array.mean() ,test_score_array.std()
(2981.8145928385716, 234.3857939318062)
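As an aside, scikit-learn's built-in cross-validation can replace the hand-rolled loop above; a sketch (note that scikit-learn reports MSE as a negative score by convention):

```python
from sklearn.model_selection import cross_val_score

knn = KNeighborsRegressor(n_neighbors=5)
scores = cross_val_score(knn, dataSets.values, target,
                         scoring='neg_mean_squared_error', cv=10)
-scores.mean(), scores.std()   # mean MSE over 10 folds, and its spread
```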
# Compare KNN performance across different values of k
k_list = np.arange(1,21,2)

cols = []
for k in k_list:
    knn = KNeighborsRegressor(n_neighbors=k)
    train_scores,test_scores = cal_score(model=knn,X=dataSets.values,y=target,count=10,test_size=0.2)
    row = [k,
           train_scores.mean().tolist(),
           test_scores.mean().tolist(),
           train_scores.std().tolist(),
           test_scores.std().tolist()   
    ]
    cols.append(row)
result= DataFrame(data=np.array(cols),columns=['k','Train_mean','Test_mean','Train_std','Test_std'])
result
      k   Train_mean    Test_mean   Train_std    Test_std
0   1.0     0.000000     0.000000    0.000000    0.000000
1   3.0  1937.297136  2021.645443   72.113122  286.021709
2   5.0  2333.390946  2378.906921  111.330407  441.568918
3   7.0  2635.195207  2542.229420   52.116173  206.707967
4   9.0  2731.609684  2615.471952   72.131433  286.094334
5  11.0  2724.262086  2998.797892   59.500027  235.994490
6  13.0  2829.240362  2869.213197   40.296168  159.826374
7  15.0  2794.546657  3125.196095   68.837757  273.030655
8  17.0  2898.627282  2875.761736   69.569247  275.931959
9  19.0  2868.687365  2924.798718   66.040720  261.936788
result.set_index('k').plot()
plt.xticks(result['k'].values)
plt.show()

[Figure: mean and std of train/test MSE versus k]

# 1. The empirical and generalization errors should be close to each other, and both should be small
# 2. Their standard deviations should also be small; smaller means more stable


# What else can we do?
# We can also do feature processing.
# Understand the data
dataSets.describe().T
     count          mean       std       min       25%       50%       75%       max
age  442.0 -3.634285e-16  0.047619 -0.107226 -0.037299  0.005383  0.038076  0.110727
sex  442.0  1.308343e-16  0.047619 -0.044642 -0.044642 -0.044642  0.050680  0.050680
bmi  442.0 -8.045349e-16  0.047619 -0.090275 -0.034229 -0.007284  0.031248  0.170555
bp   442.0  1.281655e-16  0.047619 -0.112400 -0.036656 -0.005671  0.035644  0.132044
s1   442.0 -8.835316e-17  0.047619 -0.126781 -0.034248 -0.004321  0.028358  0.153914
s2   442.0  1.327024e-16  0.047619 -0.115613 -0.030358 -0.003819  0.029844  0.198788
s3   442.0 -4.574646e-16  0.047619 -0.102307 -0.035117 -0.006584  0.029312  0.181179
s4   442.0  3.777301e-16  0.047619 -0.076395 -0.039493 -0.002592  0.034309  0.185234
s5   442.0 -3.830854e-16  0.047619 -0.126097 -0.033249 -0.001948  0.032433  0.133599
s6   442.0 -3.412882e-16  0.047619 -0.137767 -0.033179 -0.001078  0.027917  0.135612
# Visualize the distribution of each feature with plots
for col_name in dataSets.columns:
    col_data = dataSets[col_name]
    sns.distplot(col_data)
    plt.title(col_name)
    plt.show()

[Figures: distribution plot of each of the 10 features (age, sex, bmi, bp, s1-s6)]

# The goal of these transformations is to make the feature values as close to normally distributed as possible
# Benefits: 1. the algorithm needs fewer iterations  2. noise reduction
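As one concrete example of such a transformation (a sketch; PowerTransformer is one common scikit-learn tool for pushing a skewed feature toward normality, and is not used elsewhere in this post):

```python
from sklearn.preprocessing import PowerTransformer

pt = PowerTransformer()                    # Yeo-Johnson transform by default
s1_t = pt.fit_transform(dataSets[['s1']])  # transform a single feature column
sns.distplot(s1_t)
plt.title('s1 after PowerTransformer')
plt.show()
```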

Ridge Regression

Consider the linear system $XW = y$:

$$\begin{pmatrix}1 & 2 & 3\\ 2 & 3 & 4\\ 2 & 4 & 6\end{pmatrix}\begin{pmatrix}w_1\\ w_2\\ w_3\end{pmatrix}=\begin{pmatrix}1\\ 3\\ 2\end{pmatrix}$$

The first row expands to $1\cdot w_1 + 2\cdot w_2 + 3\cdot w_3 = 1$. Note that the third row is exactly twice the first row: the rows are linearly dependent.

Any real sample data inevitably has the following characteristics:
- 1. it contains noise
- 2. multicollinearity
- and both may be impossible to avoid

How can these problems be addressed? Angle one: from the algorithm side (a regularization term). Angle two: from the dataset side (the main line of attack, and the most effective one).

Ordinary least squares needs to invert $X^{T}X$, which fails when the data is collinear. Ridge regression adds $\lambda$ to the diagonal, which restores invertibility:

$$\begin{pmatrix}1+\lambda & 2 & 3\\ 2 & 3+\lambda & 4\\ 2 & 4 & 6+\lambda\end{pmatrix}\begin{pmatrix}w_1\\ w_2\\ w_3\end{pmatrix}=\begin{pmatrix}1\\ 3\\ 2\end{pmatrix}$$

Conceptually, $y = f(x) + \text{bias} + \text{regularization term}$, much like writing $y = f(x) + 1 - 1$: the correction perturbs the system without changing what we are modeling.

Shrinkage algorithms: shrink the coefficients $w$ of useless features toward 0.

Overfitting: caused by the model paying excessive attention to local features of the training set.

For example, a set of coefficients such as income 100, height 4, region 1, phone number 0.0005, education 0.01 makes it obvious at a glance which features actually drive the prediction.
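A minimal numpy sketch of the point above, using the toy matrix from this section: $X^{T}X$ is singular because of the duplicated direction, while $X^{T}X + \lambda I$ is invertible, which is exactly what the closed-form ridge solution $w = (X^{T}X + \lambda I)^{-1}X^{T}y$ relies on:

```python
import numpy as np

X = np.array([[1, 2, 3],
              [2, 3, 4],
              [2, 4, 6]])                 # third row = 2 x first row
y = np.array([1, 3, 2])

XtX = X.T @ X
print(np.linalg.matrix_rank(XtX))         # 2, not 3: X^T X cannot be inverted

lam = 1.0                                 # the ridge penalty lambda
w = np.linalg.solve(XtX + lam * np.eye(3), X.T @ y)
print(w)                                  # a unique, finite solution
```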

Basic usage of ridge regression

import numpy as np
X = np.array([[1,2,3,4],[1,3,8,5]])
X
array([[1, 2, 3, 4],
       [1, 3, 8, 5]])
y = np.array([3,5])
y
array([3, 5])
from sklearn.linear_model import LinearRegression,Ridge
lr = LinearRegression()
lr.fit(X, y)
LinearRegression()
# Using ridge regression
ridge = Ridge()
ridge.fit(X,y)
Ridge()

Get the regression coefficients

lr.coef_
array([5.55111512e-17, 7.40740741e-02, 3.70370370e-01, 7.40740741e-02])
ridge.coef_
array([0.        , 0.06896552, 0.34482759, 0.06896552])

Get the regression intercepts

lr.intercept_
1.4444444444444446
ridge.intercept_
1.6206896551724141

Regression analysis on the diabetes dataset

from sklearn.datasets import load_diabetes
import pandas as pd
from pandas import Series,DataFrame
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
# Pull out the data
diabetes = load_diabetes()
train = diabetes.data # X
target = diabetes.target # y
feature_names = diabetes.feature_names
feature_names
['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']
# 1. Instantiate ridge regression
ridge = Ridge(alpha=1.0)
# 2. Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(train,target,test_size=0.2)
# 3. Fit the model
ridge.fit(X_train,y_train)
Ridge()

The coefficients express how much each feature influences the target, so the magnitude of that influence can be used for feature selection.

ridge.coef_
array([  36.01468243,  -90.75889609,  287.47200232,  180.86788548,
         18.90897004,  -23.19316941, -137.06157738,  123.45193373,
        260.76150489,  110.61743971])
feature_names
['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']
import seaborn as sns
sns.set()
# Plot the influence of each feature
importances =  Series(data=ridge.coef_,index=feature_names)
importances.plot(kind='bar')
plt.xticks(rotation=0)
plt.show()

[Figure: bar chart of the raw ridge coefficients per feature]

# In feature-importance ranking, negative coefficients affect the result just as much, so we rank by absolute value
importances =  Series(data=np.abs(ridge.coef_),index=feature_names).sort_values(ascending=False)
importances.plot(kind='bar')
plt.xticks(rotation=0)
plt.show()

[Figure: bar chart of absolute coefficient values, sorted descending]

Based on the feature importances, the top 7 features matter most, so we can select those 7 and rebuild the model.

importances
bmi    287.472002
s5     260.761505
bp     180.867885
s3     137.061577
s4     123.451934
s6     110.617440
sex     90.758896
age     36.014682
s2      23.193169
s1      18.908970
dtype: float64
importances_columns = ['bmi','s5','bp','s3','s4','s6','sex']
importances_columns
['bmi', 's5', 'bp', 's3', 's4', 's6', 'sex']
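The same list can also be derived programmatically from the sorted importances instead of being typed by hand (a small sketch):

```python
importances_columns = importances.index[:7].tolist()
importances_columns   # ['bmi', 's5', 'bp', 's3', 's4', 's6', 'sex']
```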
dataSets = DataFrame(data=train,columns=feature_names)
dataSets.head()
        age       sex       bmi        bp        s1        s2        s3        s4        s5        s6
0  0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401 -0.002592  0.019908 -0.017646
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163  0.074412 -0.039493 -0.068330 -0.092204
2  0.085299  0.050680  0.044451 -0.005671 -0.045599 -0.034194 -0.032356 -0.002592  0.002864 -0.025930
3 -0.089063 -0.044642 -0.011595 -0.036656  0.012191  0.024991 -0.036038  0.034309  0.022692 -0.009362
4  0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596  0.008142 -0.002592 -0.031991 -0.046641
X = dataSets[importances_columns]
X.head()
        bmi        s5        bp        s3        s4        s6       sex
0  0.061696  0.019908  0.021872 -0.043401 -0.002592 -0.017646  0.050680
1 -0.051474 -0.068330 -0.026328  0.074412 -0.039493 -0.092204 -0.044642
2  0.044451  0.002864 -0.005671 -0.032356 -0.002592 -0.025930  0.050680
3 -0.011595  0.022692 -0.036656 -0.036038  0.034309 -0.009362 -0.044642
4 -0.036385 -0.031991  0.021872  0.008142 -0.002592 -0.046641 -0.044642
y = target
# Re-split the feature-selected data
X_train, X_test, y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)
# Build a model on the feature-selected data
ridge = Ridge(alpha=1.0)
ridge.fit(X_train,y_train)
Ridge()
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test,ridge.predict(X_test))
3276.750176885635
linear = LinearRegression()
linear.fit(X_train,y_train)
LinearRegression()
mean_squared_error(y_test,linear.predict(X_test))
3080.257052972186
# For comparison: the sample set without feature selection
X_train1,X_test1,y_train1,y_test1 = train_test_split(train,target,test_size=0.2,random_state=1)
linear = LinearRegression()
linear.fit(X_train1,y_train1)

mean_squared_error(y_test1, linear.predict(X_test1))
2992.5576814529445
# The coefficients of both Ridge and LinearRegression can be used for feature selection
The core of modeling:
data quality (feature engineering) comes first;
algorithms and parameter tuning come second.
# Feature engineering (goal: give the model a better dataset)
# Feature selection (algorithm-based; commonly with linear models, decision trees, ...)
# Feature extraction (based on domain experience)
# Dimension/scale removal (e.g., toward a standard normal distribution)
# Binning
# Null-value filling
# Outlier filtering
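As a quick illustration of three items from this checklist, here is a sketch on a small hypothetical DataFrame (the column names and values are made up for the example):

```python
df = DataFrame({'age': [3, 25, None, 61], 'income': [1, 2, 3, 400]})
df['age'] = df['age'].fillna(df['age'].median())            # null-value filling
df['age_bin'] = pd.cut(df['age'], bins=[0, 18, 60, 120])    # binning
df = df[df['income'] <= df['income'].quantile(0.75)]        # crude outlier filtering
```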

Lasso Regression

from sklearn.linear_model import Lasso,Ridge,LinearRegression
import numpy as np 

X = np.array([[1,2,3,4,5],[2,3,2,1,7]])
X
array([[1, 2, 3, 4, 5],
       [2, 3, 2, 1, 7]])
y = np.array([1,2])
y
array([1, 2])
# Build a model with lasso regression
# 1. Instantiate
lasso = Lasso(alpha=0.001)
# 2. Fit the model
lasso.fit(X,y)
Lasso(alpha=0.001)
lasso.coef_
array([ 0.        ,  0.        , -0.        , -0.33288889,  0.        ])
ridge = Ridge()
ridge.fit(X,y)
ridge.coef_
array([ 0.05555556,  0.05555556, -0.05555556, -0.16666667,  0.11111111])
linear = LinearRegression()
linear.fit(X,y)
linear.coef_
array([ 0.0625,  0.0625, -0.0625, -0.1875,  0.125 ])

Comparing the coefficients of the three regressions

a = np.array([1,2,3,4,5])
a
array([1, 2, 3, 4, 5])
a[[0,2]]= 0
a
array([0, 2, 0, 4, 5])
# Fancy-indexing preview (W itself is constructed in the next cell): pick a random 190 of its 200 entries
W[np.random.permutation(200)[10:]]
array([9.96516152e-01, 5.32391471e-01, 4.93905634e-02, 9.85577787e-01,
       8.99366918e-01, 7.54742781e-01, 3.64525327e-01, 9.63422337e-01,
      ......
       2.96119799e-01, 2.78872800e-01])
# Construct the data
samples = 50
features = 200


# Suppose that in the true data, only 10 features actually affect the result
X = np.random.random(size=(samples,features))
W = np.random.random(features)

# Of the 200 coefficients in W, a random 10 stay non-zero; the rest become 0
W[np.random.permutation(200)[10:]] = 0 

y = np.dot(X,W)

W
array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.4271518 , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.92177717,
       0.        , 0.        , 0.        , 0.        , 0.31777111,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.55356324,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.56237781, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.51434962, 0.        , 0.        ,
       0.        , 0.        , 0.56461673, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.18319368,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.65038963, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.92661222, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ])
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(W)
[<matplotlib.lines.Line2D at 0x1c8ea5fcf40>]

[Figure: line plot of W; only the 10 non-zero coefficients rise above 0]

# A practical comparison of the three regressions
# 1. Instantiate
linear = LinearRegression()
ridge = Ridge(alpha=10)
lasso = Lasso(alpha=0.01)

# 2. Fit
linear.fit(X,y)
ridge.fit(X,y)
lasso.fit(X,y)

# Plot
plt.figure(figsize=(16,4))

ax1 = plt.subplot(1, 3, 1)
plt.plot(linear.coef_,label='linear')
plt.legend()

ax2 = plt.subplot(1,3,2)
plt.plot(ridge.coef_,label ='ridge')
plt.legend()

ax3 = plt.subplot(1,3,3)
plt.plot(lasso.coef_,label='lasso')
plt.legend()
<matplotlib.legend.Legend at 0x1c8ea6b7b50>

[Figure: side-by-side coefficient plots for linear, ridge, and lasso]
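Lasso's L1 penalty ($\alpha\lVert w\rVert_1$) is what drives coefficients exactly to zero, whereas ridge only shrinks them. A quick sketch to check how sparse each fitted solution actually is:

```python
for name, model in [('linear', linear), ('ridge', ridge), ('lasso', lasso)]:
    n_nonzero = (np.abs(model.coef_) > 1e-6).sum()
    print(name, n_nonzero, 'coefficients above 1e-6 in magnitude')
```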

