机器学习回归算法汇总_加利福尼亚房价

一.探索数据

%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
plt.style.use('fivethirtyeight')
import warnings
warnings.filterwarnings('ignore')
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
plt.rcParams['font.serif'] = ['Microsoft YaHei']
from sklearn.datasets.california_housing import fetch_california_housing
housing = fetch_california_housing()
X = housing.data
y = housing.target
X.shape
(20640, 8)
y.shape
(20640,)
housing.feature_names
['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']
MedInc:街区收入中位数(下文简称"人均收入")
HouseAge:街区房龄中位数(下文简称"房龄")
AveRooms:每户平均房间数
AveBedrms:每户平均卧室数
Population:街区(小区)人口数
AveOccup:每户平均居住人数
Longitude:小区经度
Latitude:小区纬度
# Translate the feature names to Chinese, keeping the dataset's column order.
# The last two columns are Latitude then Longitude (see the feature_names
# output above), so '小区纬度' (latitude) must come before '小区经度'
# (longitude); the previous order labelled the two columns the wrong way round.
housing.feature_names = ['人均收入', '房龄', '房间数', '卧室数', '小区人口数', '房屋居住人数', '小区纬度', '小区经度']
housing.feature_names
['人均收入', '房龄', '房间数', '卧室数', '小区人口数', '房屋居住人数', '小区纬度', '小区经度']
X
array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]])
X = pd.DataFrame(X,columns=housing.feature_names)
X.head(10)
人均收入房龄房间数卧室数小区人口数房屋居住人数小区经度小区纬度
08.325241.06.9841271.023810322.02.55555637.88-122.23
18.301421.06.2381370.9718802401.02.10984237.86-122.22
27.257452.08.2881361.073446496.02.80226037.85-122.24
35.643152.05.8173521.073059558.02.54794537.85-122.25
43.846252.06.2818531.081081565.02.18146737.85-122.25
54.036852.04.7616581.103627413.02.13989637.85-122.25
63.659152.04.9319070.9513621094.02.12840537.84-122.25
73.120052.04.7975271.0618241157.01.78825337.84-122.25
82.080442.04.2941181.1176471206.02.02689137.84-122.26
93.691252.04.9705880.9901961551.02.17226937.84-122.25
# Wrap the target in a one-column DataFrame so it can be concatenated with X
# below.
# NOTE(review): this makes y two-dimensional (n rows x 1 column) and the models
# later are fitted on it as-is; any shape warnings would be hidden by
# warnings.filterwarnings('ignore') — confirm a 1-D Series is not preferable.
y = pd.DataFrame(y,columns =[ '房价'])
y
房价
04.526
13.585
23.521
33.413
43.422
......
206350.781
206360.771
206370.923
206380.847
206390.894

20640 rows × 1 columns

data = pd.concat([X,y],axis = 1)
data.head()
人均收入房龄房间数卧室数小区人口数房屋居住人数小区经度小区纬度房价
08.325241.06.9841271.023810322.02.55555637.88-122.234.526
18.301421.06.2381370.9718802401.02.10984237.86-122.223.585
27.257452.08.2881361.073446496.02.80226037.85-122.243.521
35.643152.05.8173521.073059558.02.54794537.85-122.253.413
43.846252.06.2818531.081081565.02.18146737.85-122.253.422
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
人均收入      20640 non-null float64
房龄        20640 non-null float64
房间数       20640 non-null float64
卧室数       20640 non-null float64
小区人口数     20640 non-null float64
房屋居住人数    20640 non-null float64
小区经度      20640 non-null float64
小区纬度      20640 non-null float64
房价        20640 non-null float64
dtypes: float64(9)
memory usage: 1.4 MB
data.describe()

在这里插入图片描述

data.hist(bins=40,figsize=(12,12))
plt.show()

在这里插入图片描述

# Annotated heatmap of the pairwise correlations between the columns of X
# (the target column is not part of X).
ax = sns.heatmap(X.corr(),annot=True,linewidths=0.2,annot_kws={'size':8})
fig=plt.gcf()
fig.set_size_inches(8,8)
plt.xticks(fontsize=8)
plt.yticks(fontsize=8)
# Shift both y-limits by half a unit.
# NOTE(review): this looks like the usual workaround for heatmap rows being
# clipped under one matplotlib release — confirm it is still needed with the
# installed version.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
plt.show()

在这里插入图片描述
数据探索:在这一部分中,我们不会修改任何数据,只是通过各种统计手段对这组数据建立一个直观的了解,为之后的处理提取信息。

data.isnull().sum()
人均收入      0
房龄        0
房间数       0
卧室数       0
小区人口数     0
房屋居住人数    0
小区经度      0
小区纬度      0
房价        0
dtype: int64

二.数据预处理

  • 缺失值
  • 异常值
  • 重复值
  • 数据变换 标准化
  • 编码转换
  • 主成分分析

三.算法建模

划分训练集 测试集

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
from sklearn.model_selection import train_test_split
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=40,
)

1.线性回归

# Linear regression with default settings, scored on the held-out split.
from sklearn.linear_model import LinearRegression
lr = LinearRegression().fit(Xtrain, Ytrain)
lr.score(Xtest, Ytest)
0.6075794091011189

2.决策树

# Single decision tree (seeded), scored on the held-out split.
from sklearn.tree import DecisionTreeRegressor
dtr = DecisionTreeRegressor(random_state=40).fit(Xtrain, Ytrain)
dtr.score(Xtest, Ytest)
0.615018515749356

3.随机森林

# Random forest with default size (seeded), scored on the held-out split.
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(random_state=40).fit(Xtrain, Ytrain)
rfr.score(Xtest, Ytest)
0.7748884071228732

4.回归提升树

# Gradient-boosted trees with 100 boosting stages (seeded).
from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor(random_state=40, n_estimators=100).fit(Xtrain, Ytrain)
gbr.score(Xtest, Ytest)
0.7835836333713624

5.AdaBoostRegressor

# AdaBoost with 300 estimators and a 0.1 learning rate (seeded).
from sklearn.ensemble import AdaBoostRegressor
abr = AdaBoostRegressor(
    n_estimators=300,
    learning_rate=0.1,
    random_state=40,
).fit(Xtrain, Ytrain)
abr.score(Xtest, Ytest)
0.4957214730873699

6.KNN

# k-nearest-neighbours regressor with the library's default settings.
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor().fit(Xtrain, Ytrain)
knn.score(Xtest, Ytest)
0.6972888927183265

7.BaggingRegressor

# Bagging regressor with default settings.
# NOTE(review): unlike the other ensembles in this file no random_state is
# passed here, so the score may vary between runs — confirm this is intended.
from sklearn.ensemble import BaggingRegressor
bgr = BaggingRegressor().fit(Xtrain, Ytrain)
bgr.score(Xtest, Ytest)
0.7885598734862586

8.Xgboost

# Import the estimator class directly. The previous form aliased the package
# as `xgb` and then rebound that same name to the model instance, which
# destroys the module alias for any later code in the session.
from xgboost import XGBRegressor
xgb = XGBRegressor(
    n_estimators=120,
    max_depth=7,
    learning_rate=0.3,
    objective='reg:squarederror',
)
xgb = xgb.fit(Xtrain, Ytrain)
xgb.score(Xtest, Ytest)
0.8305799952865707
def try_different_method(model, method):
    """Fit ``model`` on the training split and plot its test predictions.

    Reads the module-level ``Xtrain``, ``Ytrain``, ``Xtest`` and ``Ytest``.
    The true test targets and the predictions are drawn against their
    positional index, and the figure title shows ``method`` together with
    the value of ``model.score`` on the test split. Returns nothing.
    """
    model.fit(Xtrain, Ytrain)
    test_score = model.score(Xtest, Ytest)
    predicted = model.predict(Xtest)

    # Both curves share the same x positions: 0 .. number of predictions - 1.
    positions = np.arange(len(predicted))

    plt.figure()
    plt.plot(positions, Ytest, "go-", label="True value")
    plt.plot(positions, predicted, "ro-", label="Predict value")
    plt.title(f"method:{method}---score:{test_score}")
    plt.legend(loc="best")
    plt.show()
# Build fresh, unfitted instances of every model to be compared.
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

from sklearn.tree import DecisionTreeRegressor
dtr = DecisionTreeRegressor(random_state=40)

from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(random_state=40)

from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor(n_estimators=100, random_state=40)

from sklearn.ensemble import AdaBoostRegressor
abr = AdaBoostRegressor(random_state=40, n_estimators=300, learning_rate=0.1)

# NOTE(review): n_neighbors=6 here differs from the default used in the
# single-model KNN cell earlier — confirm which setting is intended.
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(n_neighbors=6)

from sklearn.ensemble import BaggingRegressor
bgr = BaggingRegressor(random_state=40)

# Import the class directly so that assigning the model to `xgb` no longer
# overwrites a module alias of the same name.
from xgboost import XGBRegressor
xgb = XGBRegressor(
    n_estimators=120,
    max_depth=7,
    learning_rate=0.3,
    objective='reg:squarederror',
    random_state=40,
)

regressor = [lr, dtr, rfr, gbr, abr, knn, bgr, xgb]
def different_model(model,name):
    """Fit ``model`` on the training split and print its test-set score.

    Reads the module-level ``Xtrain``, ``Ytrain``, ``Xtest`` and ``Ytest``.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``score(X, y)``.
        name: Label printed in front of the score.

    Returns:
        The value returned by ``model.score(Xtest, Ytest)``.
    """
    model.fit(Xtrain, Ytrain)
    score = model.score(Xtest, Ytest)
    # No predict() call here: its result was never used, so it only cost time.
    print(name, score)
    return score
# Short labels, in the same order as the `regressor` list.
name = ['lr', 'dtr', 'rfr', 'gbr', 'abr', 'knn', 'bgr', 'xgb']
for estimator, label in zip(regressor, name):
    different_model(estimator, label)
lr 0.6075794091011189
dtr 0.615018515749356
rfr 0.7748884071228732
gbr 0.7835836333713624
abr 0.4957214730873699
knn 0.14199477279929695
bgr 0.7735241405394527
xgb 0.8305799952865707

模型调参

from sklearn import tree
dtr = tree.DecisionTreeRegressor(random_state=40)
dtr = dtr.fit(Xtrain, Ytrain)
dtr.score(Xtest,Ytest)
# Sweep max_depth with min_samples_leaf held fixed and record the score of
# each fitted tree on the test split.
# NOTE(review): the depth is selected using the same test split that is used
# to report performance — confirm whether a validation split / CV is wanted.
score = []
C = range(1,20)
for i in C:
    dtr = tree.DecisionTreeRegressor(
        max_depth=i,            # maximum depth of the tree
        min_samples_leaf=13,    # minimum number of samples required at a leaf
        random_state=40,
    )
    dtr = dtr.fit(Xtrain, Ytrain)
    score.append(dtr.score(Xtest,Ytest))
plt.plot(C,score)
best_score = max(score)
# Look the winning depth up in C instead of assuming the sweep starts at 1.
print('max score:', best_score, 'max_depth:', C[score.index(best_score)])
max score: 0.7269488014943908 max_depth: 12

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-IupvFiyM-1592571954638)(output_42_1.png)]

from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(random_state=40).fit(Xtrain, Ytrain)
rfr.score(Xtest, Ytest)
# Sweep the forest size and record each model's score on the test split.
score = []
C = range(140, 240, 20)
for i in C:
    rfr = RandomForestRegressor(
        n_estimators=i,     # number of base estimators
        random_state=40,    # random seed
        # n_jobs=-1         # n_jobs sets how many CPU cores are used; more
        #                   # cores run faster, and -1 uses all of them
    )
    rfr = rfr.fit(Xtrain, Ytrain)
    score.append(rfr.score(Xtest, Ytest))
plt.plot(C, score)
print('max score:', max(score))
max score: 0.80334868183468

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-7I1Z9r2B-1592571954641)(output_43_1.png)]

# Refit with 200 trees and score on the held-out split.
rfr = RandomForestRegressor(random_state=40, n_estimators=200).fit(Xtrain, Ytrain)
rfr.score(Xtest, Ytest)
0.80334868183468

网格搜索

from sklearn.model_selection import GridSearchCV
# 3 x 3 grid over min_samples_split and n_estimators, evaluated with cv=5.
# NOTE(review): the RandomForestRegressor passed in is unseeded, unlike the
# other forests in this file, so the search result may vary between runs.
tree_param_grid = {
    'min_samples_split': [3, 6, 9],
    'n_estimators': [10, 50, 100],
}
grid = GridSearchCV(RandomForestRegressor(), param_grid=tree_param_grid, cv=5)
grid.fit(Xtrain, Ytrain)
GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'min_samples_split': [3, 6, 9],
                         'n_estimators': [10, 50, 100]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)
grid.best_score_
0.7992116401881113
grid.best_params_
{'min_samples_split': 6, 'n_estimators': 100}

是否一定要标准化

from sklearn.preprocessing import StandardScaler 
scaler = StandardScaler()
# Replace X with its scaled version.
# NOTE(review): the scaler is fitted on all rows of X before the train/test
# split that follows, so test rows influence the scaling statistics — confirm
# whether it should be fitted on the training split only.
X=scaler.fit_transform(X)
X
array([[ 2.34476576,  0.98214266,  0.62855945, ..., -0.04959654,
         1.05254828, -1.32783522],
       [ 2.33223796, -0.60701891,  0.32704136, ..., -0.09251223,
         1.04318455, -1.32284391],
       [ 1.7826994 ,  1.85618152,  1.15562047, ..., -0.02584253,
         1.03850269, -1.33282653],
       ...,
       [-1.14259331, -0.92485123, -0.09031802, ..., -0.0717345 ,
         1.77823747, -0.8237132 ],
       [-1.05458292, -0.84539315, -0.04021111, ..., -0.09122515,
         1.77823747, -0.87362627],
       [-0.78012947, -1.00430931, -0.07044252, ..., -0.04368215,
         1.75014627, -0.83369581]])
# Re-split the current X with the same seed and refit the default seeded tree.
from sklearn.model_selection import train_test_split
from sklearn import tree
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X, y, test_size=0.2, random_state=40
)
dtr = tree.DecisionTreeRegressor(random_state=40).fit(Xtrain, Ytrain)
dtr.score(Xtest, Ytest)
0.6145069972322226
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import tree

# Start again from the raw feature matrix.
X = housing.data
# Split first, then fit the scaler on the training rows only. Fitting it on
# the full matrix (as before) lets the test rows influence the min/max used
# for scaling.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, test_size=0.2, random_state=40)
scaler = MinMaxScaler()
Xtrain = scaler.fit_transform(Xtrain)
Xtest = scaler.transform(Xtest)

dtr = tree.DecisionTreeRegressor(random_state=40)
dtr = dtr.fit(Xtrain, Ytrain)
dtr.score(Xtest, Ytest)
0.6161125643854056
  • 3
    点赞
  • 11
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值