【kaggle prices-advanced-regression-techniques】房价预测 - 前14%

得分:0.11569 前14%

import numpy as np 
import pandas as pd
#import matplotlib.pyplot as plt
#import seaborn as sns

import os
# List every file under the Kaggle input directory so the available data
# files are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import warnings
# NOTE(review): this silences *all* warnings for the rest of the session,
# including pandas/xgboost deprecation notices.
warnings.filterwarnings("ignore")
# NOTE(review): the CSV is read from the working directory, not from the
# /kaggle/input tree listed above — confirm where the file actually lives.
train = pd.read_csv("train.csv")
train.head()
IdMSSubClassMSZoningLotFrontageLotAreaStreetAlleyLotShapeLandContourUtilities...PoolAreaPoolQCFenceMiscFeatureMiscValMoSoldYrSoldSaleTypeSaleConditionSalePrice
0160RL65.08450PaveNaNRegLvlAllPub...0NaNNaNNaN022008WDNormal208500
1220RL80.09600PaveNaNRegLvlAllPub...0NaNNaNNaN052007WDNormal181500
2360RL68.011250PaveNaNIR1LvlAllPub...0NaNNaNNaN092008WDNormal223500
3470RL60.09550PaveNaNIR1LvlAllPub...0NaNNaNNaN022006WDAbnorml140000
4560RL84.014260PaveNaNIR1LvlAllPub...0NaNNaNNaN0122008WDNormal250000

5 rows × 81 columns

# Remove the training rows labelled 523 and 1298.
# NOTE(review): presumably these are outliers; the reason is not recorded here.
train=train.drop(index=[523,1298],axis=0)
test = pd.read_csv("test.csv")
# Report the size of both frames (train has one more column than test).
print('th train data has {} rows and {} features'.format(train.shape[0],train.shape[1]))
print('the test data has {} rows and {} features'.format(test.shape[0],test.shape[1]))
th train data has 1458 rows and 81 features
the test data has 1459 rows and 80 features
# Stack train (minus its last column, SalePrice) on top of test so that
# cleaning and encoding are applied to both sets identically.
# The original row labels are kept, so labels repeat across the two halves.
data=pd.concat([train.iloc[:,:-1],test],axis=0)
print('tha data has {} rows and {} features'.format(data.shape[0],data.shape[1]))
tha data has 2917 rows and 80 features
# Display the feature names of the combined frame.
data.columns
Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',
       'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
       'SaleCondition'],
      dtype='object')
# Print dtype and non-null count per column to spot missing values.
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2917 entries, 0 to 1458
Data columns (total 80 columns):
Id               2917 non-null int64
MSSubClass       2917 non-null int64
MSZoning         2913 non-null object
LotFrontage      2431 non-null float64
LotArea          2917 non-null int64
Street           2917 non-null object
Alley            198 non-null object
LotShape         2917 non-null object
LandContour      2917 non-null object
Utilities        2915 non-null object
LotConfig        2917 non-null object
LandSlope        2917 non-null object
Neighborhood     2917 non-null object
Condition1       2917 non-null object
Condition2       2917 non-null object
BldgType         2917 non-null object
HouseStyle       2917 non-null object
OverallQual      2917 non-null int64
OverallCond      2917 non-null int64
YearBuilt        2917 non-null int64
YearRemodAdd     2917 non-null int64
RoofStyle        2917 non-null object
RoofMatl         2917 non-null object
Exterior1st      2916 non-null object
Exterior2nd      2916 non-null object
MasVnrType       2893 non-null object
MasVnrArea       2894 non-null float64
ExterQual        2917 non-null object
ExterCond        2917 non-null object
Foundation       2917 non-null object
BsmtQual         2836 non-null object
BsmtCond         2835 non-null object
BsmtExposure     2835 non-null object
BsmtFinType1     2838 non-null object
BsmtFinSF1       2916 non-null float64
BsmtFinType2     2837 non-null object
BsmtFinSF2       2916 non-null float64
BsmtUnfSF        2916 non-null float64
TotalBsmtSF      2916 non-null float64
Heating          2917 non-null object
HeatingQC        2917 non-null object
CentralAir       2917 non-null object
Electrical       2916 non-null object
1stFlrSF         2917 non-null int64
2ndFlrSF         2917 non-null int64
LowQualFinSF     2917 non-null int64
GrLivArea        2917 non-null int64
BsmtFullBath     2915 non-null float64
BsmtHalfBath     2915 non-null float64
FullBath         2917 non-null int64
HalfBath         2917 non-null int64
BedroomAbvGr     2917 non-null int64
KitchenAbvGr     2917 non-null int64
KitchenQual      2916 non-null object
TotRmsAbvGrd     2917 non-null int64
Functional       2915 non-null object
Fireplaces       2917 non-null int64
FireplaceQu      1497 non-null object
GarageType       2760 non-null object
GarageYrBlt      2758 non-null float64
GarageFinish     2758 non-null object
GarageCars       2916 non-null float64
GarageArea       2916 non-null float64
GarageQual       2758 non-null object
GarageCond       2758 non-null object
PavedDrive       2917 non-null object
WoodDeckSF       2917 non-null int64
OpenPorchSF      2917 non-null int64
EnclosedPorch    2917 non-null int64
3SsnPorch        2917 non-null int64
ScreenPorch      2917 non-null int64
PoolArea         2917 non-null int64
PoolQC           9 non-null object
Fence            571 non-null object
MiscFeature      105 non-null object
MiscVal          2917 non-null int64
MoSold           2917 non-null int64
YrSold           2917 non-null int64
SaleType         2916 non-null object
SaleCondition    2917 non-null object
dtypes: float64(11), int64(26), object(43)
memory usage: 1.8+ MB
# Split the columns by dtype for separate summaries: int64/float64 columns
# on one side, object (string) columns on the other.
num_features=data.select_dtypes(include=['int64','float64'])
categorical_features=data.select_dtypes(include='object')
# Summary statistics of the numeric columns.
num_features.describe()
IdMSSubClassLotFrontageLotAreaOverallQualOverallCondYearBuiltYearRemodAddMasVnrAreaBsmtFinSF1...GarageAreaWoodDeckSFOpenPorchSFEnclosedPorch3SsnPorchScreenPorchPoolAreaMiscValMoSoldYrSold
count2917.0000002917.0000002431.0000002917.0000002917.0000002917.0000002917.0000002917.0000002894.0000002916.000000...2916.0000002917.0000002917.0000002917.0000002917.0000002917.0000002917.0000002917.0000002917.0000002917.000000
mean1460.37607157.13575669.18058410139.4391506.0863905.5649641971.2879671984.248200101.733587439.015432...472.40946593.62941447.28008223.1141582.60404516.0733632.08879050.8608166.2135762007.792938
std842.89245642.53214022.7917197807.0365121.4067041.11341430.28699120.892257178.510291444.182329...214.620878126.53264367.11896564.26342425.19671456.20205434.561371567.5951982.7130701.315328
min1.00000020.00000021.0000001300.0000001.0000001.0000001872.0000001950.0000000.0000000.000000...0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000001.0000002006.000000
25%731.00000020.00000059.0000007476.0000005.0000005.0000001953.0000001965.0000000.0000000.000000...320.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000004.0000002007.000000
50%1461.00000050.00000068.0000009452.0000006.0000005.0000001973.0000001993.0000000.000000368.000000...480.0000000.00000026.0000000.0000000.0000000.0000000.0000000.0000006.0000002008.000000
75%2190.00000070.00000080.00000011556.0000007.0000006.0000002001.0000002004.000000164.000000733.000000...576.000000168.00000070.0000000.0000000.0000000.0000000.0000000.0000008.0000002009.000000
max2919.000000190.000000313.000000215245.00000010.0000009.0000002010.0000002010.0000001600.0000004010.000000...1488.0000001424.000000742.0000001012.000000508.000000576.000000800.00000017000.00000012.0000002010.000000

8 rows × 37 columns

# Count / unique / most frequent value for each object column.
categorical_features.describe()
MSZoningStreetAlleyLotShapeLandContourUtilitiesLotConfigLandSlopeNeighborhoodCondition1...GarageTypeGarageFinishGarageQualGarageCondPavedDrivePoolQCFenceMiscFeatureSaleTypeSaleCondition
count291329171982917291729152917291729172917...27602758275827582917957110529162917
unique52244253259...6355334496
topRLPaveGrvlRegLvlAllPubInsideGtlNAmesNorm...AttchdUnfTATAYExMnPrvShedWDNormal
freq22632905120185926222914213227764432511...1722123026022652263943299525252402

4 rows × 43 columns

# Rank columns by number of missing values and show the top 34.
data.isnull().sum().sort_values(ascending=False)[:34]
# Per-dtype variants of the same ranking, kept for reference:
#print(categorical_features.isnull().sum().sort_values(ascending=False)[:23])
#num_features.isnull().sum().sort_values(ascending=False)[:11]
PoolQC          2908
MiscFeature     2812
Alley           2719
Fence           2346
FireplaceQu     1420
LotFrontage      486
GarageCond       159
GarageQual       159
GarageYrBlt      159
GarageFinish     159
GarageType       157
BsmtCond          82
BsmtExposure      82
BsmtQual          81
BsmtFinType2      80
BsmtFinType1      79
MasVnrType        24
MasVnrArea        23
MSZoning           4
BsmtHalfBath       2
Utilities          2
Functional         2
BsmtFullBath       2
BsmtFinSF1         1
Exterior1st        1
Exterior2nd        1
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
SaleType           1
Electrical         1
KitchenQual        1
GarageArea         1
GarageCars         1
dtype: int64
# Read the field glossary that ships with the data set. The context manager
# closes the handle; a bare open() left it open for the whole session.
with open("data_description.txt", "r") as f:
    data_description = f.read()
#print(data_description)

# Drop the row identifier and three columns that are removed from modelling.
# NOTE(review): the summaries above show Street/Utilities are almost constant
# and PoolQC is almost entirely missing — presumably the reason for dropping.
data = data.drop(columns=['Id','Street','PoolQC','Utilities'])

# Impute LotFrontage with the median of the same Neighborhood rather than a
# single global mean (the earlier approach:
# data['LotFrontage'].fillna(int(data['LotFrontage'].mean()))).
data['LotFrontage'] = data.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))
# Sanity check: number of LotFrontage values still missing.
data['LotFrontage'].isnull().sum()
0
# Missing values in these categorical columns become their own level 'Other'.
features=['Electrical','KitchenQual','SaleType','Exterior2nd','Exterior1st','Alley','Fence', 'MiscFeature','FireplaceQu','GarageCond','GarageQual','GarageFinish','GarageType','BsmtCond','BsmtExposure','BsmtQual','BsmtFinType2','BsmtFinType1','MasVnrType']
# Assign the filled columns back to the frame. Calling
# data[name].fillna(..., inplace=True) acts on a temporary Series and does
# not update `data` when pandas Copy-on-Write is active.
data[features] = data[features].fillna('Other')
# Sanity check: every listed column should now report 0 missing values.
data[features].isnull().sum()
Electrical      0
KitchenQual     0
SaleType        0
Exterior2nd     0
Exterior1st     0
Alley           0
Fence           0
MiscFeature     0
FireplaceQu     0
GarageCond      0
GarageQual      0
GarageFinish    0
GarageType      0
BsmtCond        0
BsmtExposure    0
BsmtQual        0
BsmtFinType2    0
BsmtFinType1    0
MasVnrType      0
dtype: int64
# MSZoning: fill gaps with the most frequent zoning among rows that share the
# same MSSubClass.
data['MSZoning'] = data.groupby('MSSubClass')['MSZoning'].transform(lambda x: x.fillna(x.mode()[0]))
# Functional: fill with the existing "typical" level. The dataset spells it
# 'Typ'; the previous lowercase 'typ' created a separate category and hence
# an extra one-hot column (shape outputs recorded further down were produced
# with the old value).
data['Functional']=data['Functional'].fillna('Typ')
# Numeric columns where a missing value is treated as 0.
# ('BsmtHalfBath' used to be listed twice; the duplicate is removed.)
zero=['GarageYrBlt','GarageArea','MasVnrArea','BsmtHalfBath','BsmtFullBath','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','GarageCars']
# Assign back instead of the chained data[name].fillna(0, inplace=True),
# which does not modify `data` under pandas Copy-on-Write.
data[zero] = data[zero].fillna(0)
# Sanity check: total number of missing cells left in the frame.
data.isnull().sum().sum()
0
# Collapse the MSSubClass codes into seven coarse groups (0-6).
# Each entry is (new group code, original MSSubClass codes in that group).
# The new codes never coincide with an original code, so applying the
# groups one after another cannot re-map an already converted row.
# NOTE(review): the rationale for this particular grouping is not recorded.
subclass_groups = (
    (0, [60]),
    (1, [20, 120]),
    (2, [75]),
    (3, [40, 70, 80]),
    (4, [50, 85, 90, 160, 190]),
    (5, [30, 45, 180]),
    (6, [150]),
)
for group_code, original_codes in subclass_groups:
    in_group = data['MSSubClass'].isin(original_codes)
    data.loc[in_group, 'MSSubClass'] = group_code

# Names of the columns that still hold strings and need one-hot encoding.
object_features = data.select_dtypes(include='object').columns
object_features
Index(['MSZoning', 'Alley', 'LotShape', 'LandContour', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
       'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional',
       'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition'],
      dtype='object')
def dummies(d):
    """One-hot encode every object-dtype column of ``d``.

    Each object column is expanded into indicator columns named
    ``<column>_<value>``; all levels are kept (no level is dropped).
    Non-object columns are ignored and do not appear in the result.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame whose object-dtype columns should be encoded.

    Returns
    -------
    pandas.DataFrame
        Indicator columns only, in the order of the source columns, with
        the same row index as ``d``. If ``d`` has no object columns the
        result has zero columns.
    """
    object_columns = d.select_dtypes(include='object').columns
    # Encode column by column, then join once: concatenating inside the loop
    # re-copies the growing frame on every iteration and needed an empty
    # seed frame whose index differs from d's.
    encoded = [
        pd.get_dummies(d[name], prefix=name, drop_first=False)
        for name in object_columns
    ]
    if not encoded:
        # pd.concat refuses an empty list; return a column-less frame.
        return pd.DataFrame(index=d.index)
    return pd.concat(encoded, axis=1)
# One-hot encode all remaining string columns of the combined frame.
dummies_data=dummies(data)
dummies_data.shape
(2917, 263)
# Remove the raw string columns; their one-hot versions live in dummies_data.
data=data.drop(columns=object_features,axis=1)
data.columns
Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold'],
      dtype='object')
# Final design matrix: numeric columns followed by the indicator columns.
final_data=pd.concat([data,dummies_data],axis=1)
final_data.shape
(2917, 299)
# Re-split the combined matrix into its train and test parts.
# The train rows were stacked first, so they occupy the first len(train)
# positions. Deriving the boundary from `train` keeps the split correct if
# the set of dropped training rows ever changes (it was hard-coded as 1458).
n_train = train.shape[0]
train_data=final_data.iloc[:n_train,:]
test_data=final_data.iloc[n_train:,:]
print(train_data.shape)
test_data.shape
(1458, 299)





(1459, 299)
# Design matrix and target for every model below.
X=train_data
y=train.loc[:,'SalePrice']
from sklearn.linear_model import Ridge, RidgeCV, LassoCV, ElasticNet

# Candidate regularisation strengths searched by the cross-validated models.
lasso_alphas = (0.0001, 0.0005, 0.001, 0.01, 0.05, 0.1, 0.3, 1, 3, 5, 10)
ridge_alphas = (0.01, 0.05, 0.1, 0.3, 1, 3, 5, 10)

# Lasso with cross-validated alpha.
model_las_cv = LassoCV(alphas=lasso_alphas)
model_las_cv.fit(X, y)
las_cv_preds = model_las_cv.predict(test_data)

# Ridge with cross-validated alpha.
model_ridge_cv = RidgeCV(alphas=ridge_alphas)
model_ridge_cv.fit(X, y)
ridge_cv_preds = model_ridge_cv.predict(test_data)

# Ridge with a fixed alpha.
model_ridge = Ridge(alpha=10, solver='auto')
model_ridge.fit(X, y)
ridge_preds = model_ridge.predict(test_data)

# Elastic net with a fixed alpha.
model_en = ElasticNet(random_state=1, alpha=0.00065, max_iter=3000)
model_en.fit(X, y)
en_preds = model_en.predict(test_data)
import xgboost as xgb
# Gradient-boosted trees (XGBoost) with a small learning rate and many
# shallow trees.
# 'reg:squarederror' is the current name of the squared-error objective;
# the old alias 'reg:linear' is deprecated (see the warning this cell
# printed). n_jobs / random_state likewise replace the deprecated
# nthread / seed arguments of the scikit-learn wrapper.
model_xgb = xgb.XGBRegressor(learning_rate=0.01,n_estimators=3460,
                                     max_depth=3, min_child_weight=0,
                                     gamma=0, subsample=0.7,
                                     colsample_bytree=0.7,
                                     objective='reg:squarederror', n_jobs=-1,
                                     scale_pos_weight=1, random_state=27,
                                     reg_alpha=0.00006)
model_xgb.fit(X, y)
xgb_preds=model_xgb.predict(test_data)
[09:54:40] WARNING: C:/Jenkins/workspace/xgboost-win64_release_0.90/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
from sklearn.ensemble import GradientBoostingRegressor

# scikit-learn gradient boosting with Huber loss; hyper-parameters are
# collected in one mapping so they can be read at a glance.
gbr_params = {
    'n_estimators': 3000,
    'learning_rate': 0.05,
    'max_depth': 4,
    'max_features': 'sqrt',
    'min_samples_leaf': 15,
    'min_samples_split': 10,
    'loss': 'huber',
    'random_state': 42,
}
model_gbr = GradientBoostingRegressor(**gbr_params)
model_gbr.fit(X, y)
gbr_preds = model_gbr.predict(test_data)
from lightgbm import LGBMRegressor

# LightGBM regressor: many very small trees with row and feature subsampling.
lgbm_params = {
    'objective': 'regression',
    'num_leaves': 4,
    'learning_rate': 0.01,
    'n_estimators': 5000,
    'max_bin': 200,
    'bagging_fraction': 0.75,
    'bagging_freq': 5,
    'bagging_seed': 7,
    'feature_fraction': 0.2,
    'feature_fraction_seed': 7,
    'verbose': -1,
    # Not used: 'min_data_in_leaf': 2, 'min_sum_hessian_in_leaf': 11
}
model_lgbm = LGBMRegressor(**lgbm_params)
model_lgbm.fit(X, y)
lgbm_preds = model_lgbm.predict(test_data)
# Weighted blend of four of the fitted models (weights sum to 1.0).
# The Lasso, fixed-alpha Ridge and ElasticNet predictions are not part of
# the blend.
blend = (
    (0.3, lgbm_preds),
    (0.3, gbr_preds),
    (0.1, xgb_preds),
    (0.3, ridge_cv_preds),
)
final_predictions = sum(weight * preds for weight, preds in blend)
# Display the first 5 predicted sale prices.
final_predictions[:5]
array([120991.7547464 , 160872.32864741, 186243.01436333, 194394.84973628,
       192268.91952743])
# Build the submission table: one row per test Id with its blended prediction.
submission = dict(Id=test.Id.values, SalePrice=final_predictions)
solution = pd.DataFrame(submission)
solution.head()
IdSalePrice
01461120991.754746
11462160872.328647
21463186243.014363
31464194394.849736
41465192268.919527
# Write the submission file; index=False keeps the row index out of the CSV
# so the file contains only the Id and SalePrice columns.
solution.to_csv('submission3.csv',index=False)
  • 2
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值