3.洛杉矶房价预测的Baseline

房价预测的Baseline

结构化数据的 回归问题

import pandas as pd
import numpy as np

# Load the training set.
train = pd.read_csv('datas/los_data.csv')
# get_dtype_counts() was deprecated in pandas 0.25 and removed in 1.0;
# dtypes.value_counts() yields the same per-dtype column counts.
train.dtypes.value_counts()
float64     3
int64      35
object     43
dtype: int64
train.head(3)
IdMSSubClassMSZoningLotFrontageLotAreaStreetAlleyLotShapeLandContourUtilities...PoolAreaPoolQCFenceMiscFeatureMiscValMoSoldYrSoldSaleTypeSaleConditionSalePrice
0160RL65.08450PaveNaNRegLvlAllPub...0NaNNaNNaN022008WDNormal208500
1220RL80.09600PaveNaNRegLvlAllPub...0NaNNaNNaN052007WDNormal181500
2360RL68.011250PaveNaNIR1LvlAllPub...0NaNNaNNaN092008WDNormal223500

3 rows × 81 columns

train.info() # per-column dtype and non-null counts
train.describe() # summary statistics of the numeric columns only
IdMSSubClassLotFrontageLotAreaOverallQualOverallCondYearBuiltYearRemodAddMasVnrAreaBsmtFinSF1...WoodDeckSFOpenPorchSFEnclosedPorch3SsnPorchScreenPorchPoolAreaMiscValMoSoldYrSoldSalePrice
count1460.0000001460.0000001201.0000001460.0000001460.0000001460.0000001460.0000001460.0000001452.0000001460.000000...1460.0000001460.0000001460.0000001460.0000001460.0000001460.0000001460.0000001460.0000001460.0000001460.000000
mean730.50000056.89726070.04995810516.8280826.0993155.5753421971.2678081984.865753103.685262443.639726...94.24452146.66027421.9541103.40958915.0609592.75890443.4890416.3219182007.815753180921.195890
std421.61000942.30057124.2847529981.2649321.3829971.11279930.20290420.645407181.066207456.098091...125.33879466.25602861.11914929.31733155.75741540.177307496.1230242.7036261.32809579442.502883
min1.00000020.00000021.0000001300.0000001.0000001.0000001872.0000001950.0000000.0000000.000000...0.0000000.0000000.0000000.0000000.0000000.0000000.0000001.0000002006.00000034900.000000
25%365.75000020.00000059.0000007553.5000005.0000005.0000001954.0000001967.0000000.0000000.000000...0.0000000.0000000.0000000.0000000.0000000.0000000.0000005.0000002007.000000129975.000000
50%730.50000050.00000069.0000009478.5000006.0000005.0000001973.0000001994.0000000.000000383.500000...0.00000025.0000000.0000000.0000000.0000000.0000000.0000006.0000002008.000000163000.000000
75%1095.25000070.00000080.00000011601.5000007.0000006.0000002000.0000002004.000000166.000000712.250000...168.00000068.0000000.0000000.0000000.0000000.0000000.0000008.0000002009.000000214000.000000
max1460.000000190.000000313.000000215245.00000010.0000009.0000002010.0000002010.0000001600.0000005644.000000...857.000000547.000000552.000000508.000000480.000000738.00000015500.00000012.0000002010.000000755000.000000

8 rows × 38 columns

查看数据DataFrame的数据类型与空值的表现

如何统计数据的缺失率?

train.isnull().head()
IdMSSubClassMSZoningLotFrontageLotAreaStreetAlleyLotShapeLandContourUtilities...PoolAreaPoolQCFenceMiscFeatureMiscValMoSoldYrSoldSaleTypeSaleConditionSalePrice
0FalseFalseFalseFalseFalseFalseTrueFalseFalseFalse...FalseTrueTrueTrueFalseFalseFalseFalseFalseFalse
1FalseFalseFalseFalseFalseFalseTrueFalseFalseFalse...FalseTrueTrueTrueFalseFalseFalseFalseFalseFalse
2FalseFalseFalseFalseFalseFalseTrueFalseFalseFalse...FalseTrueTrueTrueFalseFalseFalseFalseFalseFalse
3FalseFalseFalseFalseFalseFalseTrueFalseFalseFalse...FalseTrueTrueTrueFalseFalseFalseFalseFalseFalse
4FalseFalseFalseFalseFalseFalseTrueFalseFalseFalse...FalseTrueTrueTrueFalseFalseFalseFalseFalseFalse

5 rows × 81 columns

  1. 那些列有缺失?
  2. 缺失了多少?
  3. 缺失率是多少
  4. 可否按缺失率由大到小排序?
# Per-column count of missing values — compute once (the original repeated
# train.isnull().sum() three times).
nullSum = train.isnull().sum()
tmp = nullSum
# Keep only the columns that actually contain missing entries.
nullSum_data = tmp[tmp>0]
# 19 of the 81 columns have at least one null.
nullSum_data.shape
(19,)
nullSum_data
LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64
[i for i in nullSum_data]
[259,
 1369,
 8,
 8,
 37,
 37,
 38,
 37,
 38,
 1,
 690,
 81,
 81,
 81,
 81,
 81,
 1453,
 1179,
 1406]
# Per-column missing rate (null count / number of rows);
# equivalent to the vectorized form nullSum_data / len(train).
[i/len(train) for i in nullSum_data]
[0.1773972602739726,
 0.9376712328767123,
 0.005479452054794521,
 0.005479452054794521,
 0.025342465753424658,
 0.025342465753424658,
 0.026027397260273973,
 0.025342465753424658,
 0.026027397260273973,
 0.0006849315068493151,
 0.4726027397260274,
 0.05547945205479452,
 0.05547945205479452,
 0.05547945205479452,
 0.05547945205479452,
 0.05547945205479452,
 0.9952054794520548,
 0.8075342465753425,
 0.963013698630137]
# Sort missing counts ascending, then show only columns with any nulls.
tmp=train.isnull().sum().sort_values()
tmp[tmp>0]
Electrical         1
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtFinType1      37
BsmtFinType2      38
BsmtExposure      38
GarageQual        81
GarageFinish      81
GarageYrBlt       81
GarageType        81
GarageCond        81
LotFrontage      259
FireplaceQu      690
Fence           1179
Alley           1369
MiscFeature     1406
PoolQC          1453
dtype: int64

对连续型数据进行均值 | 中位数填充

# Copy so later imputation cannot mutate the raw frame
# (plain assignment `train1 = train` would only create an alias).
train1 = train.copy()
# Column means of the numeric features — candidates for imputation.
# numeric_only=True is required on pandas >= 2.0, where mean() no longer
# silently skips object columns.
train1.mean(numeric_only=True).head()
# train1.median(numeric_only=True).head()
Id               730.500000
MSSubClass        56.897260
LotFrontage       70.049958
LotArea        10516.828082
OverallQual        6.099315
dtype: float64
# fillna aligns on column labels: each numeric column is filled with its own
# mean (swap in train1.median() for median imputation).
# numeric_only=True keeps this working on pandas >= 2.0, where DataFrame.mean()
# raises on object columns instead of silently dropping them.
train1 = train1.fillna(train1.mean(numeric_only=True))
# Null-audit helper.
def getNullCount(train):
    """Report how many columns of *train* still contain missing values.

    Prints the shape tuple (as the original notebook cell did) and also
    returns the count so callers can use it programmatically.

    Parameters
    ----------
    train : pd.DataFrame
        Frame to inspect.

    Returns
    -------
    int
        Number of columns with at least one missing value.
    """
    tmp = train.isnull().sum()
    null_cols = tmp[tmp > 0]
    print(null_cols.shape)
    return null_cols.shape[0]
getNullCount(train1)
(16,)

对于离散型数据进行填充

朴素办法:将NaN这种空类型变成“None”或者是“NA”的字段

train1.head()
IdMSSubClassMSZoningLotFrontageLotAreaStreetAlleyLotShapeLandContourUtilities...PoolAreaPoolQCFenceMiscFeatureMiscValMoSoldYrSoldSaleTypeSaleConditionSalePrice
0160RL65.08450PaveNaNRegLvlAllPub...0NaNNaNNaN022008WDNormal208500
1220RL80.09600PaveNaNRegLvlAllPub...0NaNNaNNaN052007WDNormal181500
2360RL68.011250PaveNaNIR1LvlAllPub...0NaNNaNNaN092008WDNormal223500
3470RL60.09550PaveNaNIR1LvlAllPub...0NaNNaNNaN022006WDAbnorml140000
4560RL84.014260PaveNaNIR1LvlAllPub...0NaNNaNNaN0122008WDNormal250000

5 rows × 81 columns

# After numeric imputation, the remaining nulls are all in object columns;
# replace NaN with the literal string "None" so it becomes its own category.
train1=train1.fillna("None")
train1.head()
IdMSSubClassMSZoningLotFrontageLotAreaStreetAlleyLotShapeLandContourUtilities...PoolAreaPoolQCFenceMiscFeatureMiscValMoSoldYrSoldSaleTypeSaleConditionSalePrice
0160RL65.08450PaveNoneRegLvlAllPub...0NoneNoneNone022008WDNormal208500
1220RL80.09600PaveNoneRegLvlAllPub...0NoneNoneNone052007WDNormal181500
2360RL68.011250PaveNoneIR1LvlAllPub...0NoneNoneNone092008WDNormal223500
3470RL60.09550PaveNoneIR1LvlAllPub...0NoneNoneNone022006WDAbnorml140000
4560RL84.014260PaveNoneIR1LvlAllPub...0NoneNoneNone0122008WDNormal250000

5 rows × 81 columns

getNullCount(train1) # filling complete — no columns with nulls remain
(0,)

制作训练集

1.y标签

2.数据里面有Id?pandas伪造出来的

3.答案分出来了,还把答案放进去??

y = train1['SalePrice']# label: the regression target
# Drop the row-number 'Id' (no predictive value) and the target itself.
train1 = train1.drop(['Id', 'SalePrice'],axis=1)
train1.head()
MSSubClassMSZoningLotFrontageLotAreaStreetAlleyLotShapeLandContourUtilitiesLotConfig...ScreenPorchPoolAreaPoolQCFenceMiscFeatureMiscValMoSoldYrSoldSaleTypeSaleCondition
060RL65.08450PaveNoneRegLvlAllPubInside...00NoneNoneNone022008WDNormal
120RL80.09600PaveNoneRegLvlAllPubFR2...00NoneNoneNone052007WDNormal
260RL68.011250PaveNoneIR1LvlAllPubInside...00NoneNoneNone092008WDNormal
370RL60.09550PaveNoneIR1LvlAllPubCorner...00NoneNoneNone022006WDAbnorml
460RL84.014260PaveNoneIR1LvlAllPubFR2...00NoneNoneNone0122008WDNormal

5 rows × 79 columns

# One-hot encode every object column; numeric columns pass through unchanged.
X = pd.get_dummies(train1) # fully numeric (structured) feature matrix
X.head()
MSSubClassLotFrontageLotAreaOverallQualOverallCondYearBuiltYearRemodAddMasVnrAreaBsmtFinSF1BsmtFinSF2...SaleType_ConLwSaleType_NewSaleType_OthSaleType_WDSaleCondition_AbnormlSaleCondition_AdjLandSaleCondition_AllocaSaleCondition_FamilySaleCondition_NormalSaleCondition_Partial
06065.084507520032003196.07060...0001000010
12080.0960068197619760.09780...0001000010
26068.0112507520012002162.04860...0001000010
37060.0955075191519700.02160...0001100000
46084.0142608520002000350.06550...0001000010

5 rows × 303 columns

X.shape
(1460, 303)

数据切分

分一些数据做评测

from sklearn.model_selection import train_test_split
# Hold out 20% for evaluation; fixed random_state for reproducibility.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=123)
X_train.shape
(1168, 303)
X_test.shape
(292, 303)

BaseLine模型的训练

import xgboost as xgb

# Baseline model. Two fixes versus the original call:
#  * 'reg:linear' was renamed 'reg:squarederror' (deprecated since XGBoost 0.83;
#    identical squared-error loss).
#  * 'sum_sample' was a typo — XGBoost silently ignored the unknown kwarg (the
#    echoed params showed subsample=1), so the intended row subsampling never
#    happened; the correct name is 'subsample'.
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.6,
                          learning_rate=0.01, max_depth=8, alpha=10,
                          n_estimators=700, subsample=0.7)
xg_reg.fit(X_train, y_train)
XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0, importance_type='gain',
       learning_rate=0.01, max_delta_step=0, max_depth=8,
       min_child_weight=1, missing=None, n_estimators=700, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1, sum_sample=0.7)

进行模型评测

from sklearn.metrics import mean_squared_error
pred=xg_reg.predict(X_test)
# RMSE in absolute price units on the 20% hold-out split.
rmse=np.sqrt(mean_squared_error(y_test,pred))
rmse
26348.6994479421
# RMSE of log-prices (the Kaggle House Prices metric) — scale-free, so cheap
# and expensive houses contribute comparably.
logrmse=np.sqrt(mean_squared_error(np.log(y_test),np.log(pred)))
logrmse
0.11996319328290858

调整模型参数

# Tuned model: shallower trees, many more boosting rounds, row subsampling.
# 'reg:linear' → 'reg:squarederror' (renamed objective; identical loss).
xg_reg2 = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.6,
                           learning_rate=0.01, max_depth=5, alpha=10,
                           n_estimators=3000, subsample=0.7, random_state=123)
xg_reg2.fit(X_train, y_train)
XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0, importance_type='gain',
       learning_rate=0.01, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=3000, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=123, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=0.7)
pred2=xg_reg2.predict(X_test)
# Log-RMSE of the tuned model — improves on the baseline's ~0.120.
logrmse=np.sqrt(mean_squared_error(np.log(y_test),np.log(pred2)))
logrmse
0.10447489682428332
pd.get_dummies(train1).shape
(1460, 303)

GridSearch

# from sklearn.model_selection import GridSearchCV
# gs=GridSearchCV(xg_reg,{
#     "n_estimators":[100,500,1000,3000],
#     "alpha":[0.01,0.1,1.0,10],
#     "lambda":[...]
# })

Randomized Search

m*n 不重要的点过滤掉

# from sklearn.model_selection import RandomizedSearchCV
# from scipy.stats import norm
# rs=RandomizedSearchCV(xg_reg,{
#     "n_estimators":[100,500,1000,3000],
#     "alpha":norm(1.0,0.7),   # sample from a distribution instead of a grid
#     "lambda":[...]
# })

交叉验证

# Hyper-parameters for cross-validation.
# 'reg:linear' was renamed 'reg:squarederror' in XGBoost 0.83 (same loss).
params = {"objective": "reg:squarederror", 'colsample_bytree': 0.7,
          'learning_rate': 0.1, 'max_depth': 5, 'alpha': 10}
# DMatrix is XGBoost's own compressed data structure (comparable to .lmdb/.h5);
# it optimizes reads/writes of the training set and speeds up training.
matrix=xgb.DMatrix(data=X,label=y)
# The dataset is small, so use 10-fold cross-validation instead of a separate
# test set: train and evaluate on all the data, then average the fold scores.
X.shape
(1460, 303)
cv_results=xgb.cv(dtrain=matrix,params=params,nfold=10,num_boost_round=500,
                  metrics='rmse',as_pandas=True, verbose_eval=False)
# Tree learning has two phases — splitting and pruning (pre- and post-pruning);
# both are defenses against overfitting.
# verbose_eval=False silences the per-round training log.
type(cv_results)
pandas.core.frame.DataFrame
X=pd.get_dummies(train1) # Convert categorical variable into dummy/indicator variables
# (re-run of the earlier encoding; shape is unchanged at (1460, 303))
cv_results.tail()
test-rmse-meantest-rmse-stdtrain-rmse-meantrain-rmse-std
49526897.3742196992.0567931675.321265103.741632
49626897.4697276991.4168211667.588147104.618191
49726897.0671886991.0731681661.767676105.295643
49826897.8111336991.1319301656.616650105.007822
49926897.2966806991.3623841650.411279106.048964
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值