【kaggle prices-advanced-regression-techniques】房价预测 - 前14%

一只小金毛zy

于 2019-12-17 08:56:03 发布

阅读量398

点赞数 2

分类专栏： Kaggle+阿里天池比赛

本文链接：https://blog.csdn.net/qq_39071739/article/details/99622836

版权

Kaggle+阿里天池比赛专栏收录该内容

8 篇文章 4 订阅

订阅专栏

得分：0.11569 前14%

import numpy as np 
import pandas as pd
#import matplotlib.pyplot as plt
#import seaborn as sns

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for entry in names:
        print(os.path.join(root, entry))

import warnings
# Suppress every warning category for the rest of the session.
warnings.simplefilter("ignore")

# Load the training set and preview its first rows.
train = pd.read_csv("train.csv")
train.head()

	Id	MSSubClass	MSZoning	LotFrontage	LotArea	Street	Alley	LotShape	LandContour	Utilities	...	PoolQC	Fence	MiscFeature	MoSold	YrSold	SaleType	SaleCondition	SalePrice
0	1	60	RL	65.0	8450	Pave	NaN	Reg	Lvl	AllPub	...	NaN	NaN	NaN	2	2008	WD	Normal	208500
1	2	20	RL	80.0	9600	Pave	NaN	Reg	Lvl	AllPub	...	NaN	NaN	NaN	5	2007	WD	Normal	181500
2	3	60	RL	68.0	11250	Pave	NaN	IR1	Lvl	AllPub	...	NaN	NaN	NaN	9	2008	WD	Normal	223500
3	4	70	RL	60.0	9550	Pave	NaN	IR1	Lvl	AllPub	...	NaN	NaN	NaN	2	2006	WD	Abnorml	140000
4	5	60	RL	84.0	14260	Pave	NaN	IR1	Lvl	AllPub	...	NaN	NaN	NaN	12	2008	WD	Normal	250000

5 rows × 81 columns

# Remove the training rows labelled 523 and 1298 (presumably outliers — the
# post does not say why these two were chosen).
train = train.drop(index=[523, 1298])

# Load the test set.
test = pd.read_csv("test.csv")

# Report (rows, columns) of both frames.
print('th train data has {} rows and {} features'.format(*train.shape))
print('the test data has {} rows and {} features'.format(*test.shape))

th train data has 1458 rows and 81 features
the test data has 1459 rows and 80 features

# Stack the training columns (all but the last one) on top of the test set so
# that every preprocessing step is applied to both at once.
train_features = train.iloc[:, :-1]
data = pd.concat([train_features, test], axis=0)
print('tha data has {} rows and {} features'.format(*data.shape))

tha data has 2917 rows and 80 features

# Display the column labels of the combined frame.
data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',
       'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
       'SaleCondition'],
      dtype='object')

# Print dtype and non-null count for every column of the combined frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2917 entries, 0 to 1458
Data columns (total 80 columns):
Id               2917 non-null int64
MSSubClass       2917 non-null int64
MSZoning         2913 non-null object
LotFrontage      2431 non-null float64
LotArea          2917 non-null int64
Street           2917 non-null object
Alley            198 non-null object
LotShape         2917 non-null object
LandContour      2917 non-null object
Utilities        2915 non-null object
LotConfig        2917 non-null object
LandSlope        2917 non-null object
Neighborhood     2917 non-null object
Condition1       2917 non-null object
Condition2       2917 non-null object
BldgType         2917 non-null object
HouseStyle       2917 non-null object
OverallQual      2917 non-null int64
OverallCond      2917 non-null int64
YearBuilt        2917 non-null int64
YearRemodAdd     2917 non-null int64
RoofStyle        2917 non-null object
RoofMatl         2917 non-null object
Exterior1st      2916 non-null object
Exterior2nd      2916 non-null object
MasVnrType       2893 non-null object
MasVnrArea       2894 non-null float64
ExterQual        2917 non-null object
ExterCond        2917 non-null object
Foundation       2917 non-null object
BsmtQual         2836 non-null object
BsmtCond         2835 non-null object
BsmtExposure     2835 non-null object
BsmtFinType1     2838 non-null object
BsmtFinSF1       2916 non-null float64
BsmtFinType2     2837 non-null object
BsmtFinSF2       2916 non-null float64
BsmtUnfSF        2916 non-null float64
TotalBsmtSF      2916 non-null float64
Heating          2917 non-null object
HeatingQC        2917 non-null object
CentralAir       2917 non-null object
Electrical       2916 non-null object
1stFlrSF         2917 non-null int64
2ndFlrSF         2917 non-null int64
LowQualFinSF     2917 non-null int64
GrLivArea        2917 non-null int64
BsmtFullBath     2915 non-null float64
BsmtHalfBath     2915 non-null float64
FullBath         2917 non-null int64
HalfBath         2917 non-null int64
BedroomAbvGr     2917 non-null int64
KitchenAbvGr     2917 non-null int64
KitchenQual      2916 non-null object
TotRmsAbvGrd     2917 non-null int64
Functional       2915 non-null object
Fireplaces       2917 non-null int64
FireplaceQu      1497 non-null object
GarageType       2760 non-null object
GarageYrBlt      2758 non-null float64
GarageFinish     2758 non-null object
GarageCars       2916 non-null float64
GarageArea       2916 non-null float64
GarageQual       2758 non-null object
GarageCond       2758 non-null object
PavedDrive       2917 non-null object
WoodDeckSF       2917 non-null int64
OpenPorchSF      2917 non-null int64
EnclosedPorch    2917 non-null int64
3SsnPorch        2917 non-null int64
ScreenPorch      2917 non-null int64
PoolArea         2917 non-null int64
PoolQC           9 non-null object
Fence            571 non-null object
MiscFeature      105 non-null object
MiscVal          2917 non-null int64
MoSold           2917 non-null int64
YrSold           2917 non-null int64
SaleType         2916 non-null object
SaleCondition    2917 non-null object
dtypes: float64(11), int64(26), object(43)
memory usage: 1.8+ MB

# Split the combined frame by dtype: int64/float64 columns vs. object columns.
num_features=data.select_dtypes(include=['int64','float64'])
categorical_features=data.select_dtypes(include='object')

# Summary statistics of the numeric columns.
num_features.describe()

	Id	MSSubClass	LotFrontage	LotArea	OverallQual	OverallCond	YearBuilt	YearRemodAdd	MasVnrArea	BsmtFinSF1	...	GarageArea	WoodDeckSF	OpenPorchSF	EnclosedPorch	3SsnPorch	ScreenPorch	PoolArea	MiscVal	MoSold	YrSold
count	2917.000000	2917.000000	2431.000000	2917.000000	2917.000000	2917.000000	2917.000000	2917.000000	2894.000000	2916.000000	...	2916.000000	2917.000000	2917.000000	2917.000000	2917.000000	2917.000000	2917.000000	2917.000000	2917.000000	2917.000000
mean	1460.376071	57.135756	69.180584	10139.439150	6.086390	5.564964	1971.287967	1984.248200	101.733587	439.015432	...	472.409465	93.629414	47.280082	23.114158	2.604045	16.073363	2.088790	50.860816	6.213576	2007.792938
std	842.892456	42.532140	22.791719	7807.036512	1.406704	1.113414	30.286991	20.892257	178.510291	444.182329	...	214.620878	126.532643	67.118965	64.263424	25.196714	56.202054	34.561371	567.595198	2.713070	1.315328
min	1.000000	20.000000	21.000000	1300.000000	1.000000	1.000000	1872.000000	1950.000000	0.000000	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000	2006.000000
25%	731.000000	20.000000	59.000000	7476.000000	5.000000	5.000000	1953.000000	1965.000000	0.000000	0.000000	...	320.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	4.000000	2007.000000
50%	1461.000000	50.000000	68.000000	9452.000000	6.000000	5.000000	1973.000000	1993.000000	0.000000	368.000000	...	480.000000	0.000000	26.000000	0.000000	0.000000	0.000000	0.000000	0.000000	6.000000	2008.000000
75%	2190.000000	70.000000	80.000000	11556.000000	7.000000	6.000000	2001.000000	2004.000000	164.000000	733.000000	...	576.000000	168.000000	70.000000	0.000000	0.000000	0.000000	0.000000	0.000000	8.000000	2009.000000
max	2919.000000	190.000000	313.000000	215245.000000	10.000000	9.000000	2010.000000	2010.000000	1600.000000	4010.000000	...	1488.000000	1424.000000	742.000000	1012.000000	508.000000	576.000000	800.000000	17000.000000	12.000000	2010.000000

8 rows × 37 columns

# Count, number of distinct levels, most frequent level and its frequency
# for each object column.
categorical_features.describe()

	MSZoning	Street	Alley	LotShape	LandContour	Utilities	LotConfig	LandSlope	Neighborhood	Condition1	...	GarageType	GarageFinish	GarageQual	GarageCond	PavedDrive	PoolQC	Fence	MiscFeature	SaleType	SaleCondition
count	2913	2917	198	2917	2917	2915	2917	2917	2917	2917	...	2760	2758	2758	2758	2917	9	571	105	2916	2917
unique	5	2	2	4	4	2	5	3	25	9	...	6	3	5	5	3	3	4	4	9	6
top	RL	Pave	Grvl	Reg	Lvl	AllPub	Inside	Gtl	NAmes	Norm	...	Attchd	Unf	TA	TA	Y	Ex	MnPrv	Shed	WD	Normal
freq	2263	2905	120	1859	2622	2914	2132	2776	443	2511	...	1722	1230	2602	2652	2639	4	329	95	2525	2402

4 rows × 43 columns

# Missing-value count per column, largest first; keep the top 34 entries.
data.isnull().sum().sort_values(ascending=False)[:34]
# Alternative views (disabled): the same ranking restricted to the
# categorical or to the numeric columns.
#print(categorical_features.isnull().sum().sort_values(ascending=False)[:23])
#num_features.isnull().sum().sort_values(ascending=False)[:11]

PoolQC          2908
MiscFeature     2812
Alley           2719
Fence           2346
FireplaceQu     1420
LotFrontage      486
GarageCond       159
GarageQual       159
GarageYrBlt      159
GarageFinish     159
GarageType       157
BsmtCond          82
BsmtExposure      82
BsmtQual          81
BsmtFinType2      80
BsmtFinType1      79
MasVnrType        24
MasVnrArea        23
MSZoning           4
BsmtHalfBath       2
Utilities          2
Functional         2
BsmtFullBath       2
BsmtFinSF1         1
Exterior1st        1
Exterior2nd        1
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
SaleType           1
Electrical         1
KitchenQual        1
GarageArea         1
GarageCars         1
dtype: int64

# Read the data dictionary that ships with the competition. A context manager
# is used so the file handle is closed right away instead of staying open for
# the rest of the session.
with open("data_description.txt", "r") as f:
    data_description = f.read()
# Inspect the column definitions with: print(data_description)

# Drop the row identifier and three columns that carry almost no information
# in the combined frame: Street and Utilities are near-constant and PoolQC is
# almost entirely missing.
data = data.drop(columns=['Id', 'Street', 'PoolQC', 'Utilities'])

# Fill missing LotFrontage with the median LotFrontage of the same Neighborhood.
data['LotFrontage'] = data.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))

# Number of LotFrontage values still missing after the fill.
data['LotFrontage'].isnull().sum()

# Categorical columns whose missing values become their own level, 'Other'.
features=['Electrical','KitchenQual','SaleType','Exterior2nd','Exterior1st','Alley','Fence', 'MiscFeature','FireplaceQu','GarageCond','GarageQual','GarageFinish','GarageType','BsmtCond','BsmtExposure','BsmtQual','BsmtFinType2','BsmtFinType1','MasVnrType']
for name in features:
    # Assign the filled column back. Calling fillna(inplace=True) on a column
    # selection is chained assignment and, under pandas Copy-on-Write, leaves
    # `data` untouched.
    data[name] = data[name].fillna('Other')

# Confirm none of the listed columns has missing values left.
data[features].isnull().sum()

Electrical      0
KitchenQual     0
SaleType        0
Exterior2nd     0
Exterior1st     0
Alley           0
Fence           0
MiscFeature     0
FireplaceQu     0
GarageCond      0
GarageQual      0
GarageFinish    0
GarageType      0
BsmtCond        0
BsmtExposure    0
BsmtQual        0
BsmtFinType2    0
BsmtFinType1    0
MasVnrType      0
dtype: int64

# Fill missing MSZoning with the most frequent MSZoning of the same MSSubClass.
data['MSZoning'] = data.groupby('MSSubClass')['MSZoning'].transform(lambda x: x.fillna(x.mode()[0]))

# Treat missing Functional as typical. The competition's data description
# spells this level 'Typ'; the previous lowercase 'typ' introduced a separate
# category (and an extra one-hot column) just for the filled rows.
data['Functional'] = data['Functional'].fillna('Typ')

# Numeric columns whose missing values are replaced by 0 (each listed once).
zero = ['GarageYrBlt', 'GarageArea', 'MasVnrArea', 'BsmtHalfBath', 'BsmtFullBath', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'GarageCars']
for name in zero:
    # Assign back rather than fillna(inplace=True) on a column selection,
    # which is chained assignment and a no-op under pandas Copy-on-Write.
    data[name] = data[name].fillna(0)

# Total number of missing values remaining in the whole frame.
data.isnull().sum().sum()

# Collapse the MSSubClass codes into seven groups labelled 0-6. The groups are
# applied in ascending label order; no group label coincides with an original
# code, so an already recoded row is never matched again.
subclass_groups = {
    0: [60],
    1: [20, 120],
    2: [75],
    3: [40, 70, 80],
    4: [50, 85, 90, 160, 190],
    5: [30, 45, 180],
    6: [150],
}
for group_label, original_codes in subclass_groups.items():
    data.loc[data['MSSubClass'].isin(original_codes), 'MSSubClass'] = group_label

# Names of the object-dtype columns that remain to be one-hot encoded.
object_features = data.select_dtypes(include='object').columns
object_features

Index(['MSZoning', 'Alley', 'LotShape', 'LandContour', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
       'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional',
       'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition'],
      dtype='object')

def dummies(d):
    """One-hot encode every object-dtype column of ``d``.

    Each object column is expanded with ``pd.get_dummies`` (all levels are
    kept) and the indicator columns are named ``<column>_<level>``.

    Parameters
    ----------
    d : pandas.DataFrame
        Input frame. Columns whose dtype is not ``object`` are ignored.

    Returns
    -------
    pandas.DataFrame
        The indicator columns only, grouped in the column order of ``d`` and
        sharing the index of ``d``. When ``d`` has no object columns the
        result has that index and no columns.
    """
    object_columns = d.select_dtypes(include='object').columns

    # Encode each column separately and concatenate once at the end; growing
    # the result with a concat per column re-copies everything accumulated so
    # far on every iteration.
    encoded = [
        pd.get_dummies(d[name], drop_first=False).add_prefix("{}_".format(name))
        for name in object_columns
    ]

    if not encoded:
        # pd.concat rejects an empty list; return a column-less frame that can
        # still be concatenated side by side with ``d``.
        return pd.DataFrame(index=d.index)

    return pd.concat(encoded, axis=1)

# One-hot encode all object columns of the combined frame and show the
# resulting (rows, columns).
dummies_data=dummies(data)
dummies_data.shape

(2917, 263)

# The object columns now live in dummies_data in encoded form, so remove the
# raw versions and list what is left.
data = data.drop(columns=object_features)
data.columns

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold'],
      dtype='object')

# Place the remaining numeric columns and the one-hot columns side by side.
final_data = pd.concat([data, dummies_data], axis='columns')
final_data.shape

(2917, 299)

# Re-split the combined frame into its train and test parts.
# The combined frame was built with the (already filtered) training rows first,
# so the boundary is the current number of training rows. Deriving it from
# `train` instead of hard-coding 1458 keeps the features aligned with the
# target if the set of dropped training rows ever changes.
n_train = train.shape[0]
train_data = final_data.iloc[:n_train, :]
test_data = final_data.iloc[n_train:, :]
print(train_data.shape)
test_data.shape

(1458, 299)





(1459, 299)

# X: independent variables and y: target variable.
# X and y are matched by position: both come from the filtered training rows.
X = train_data
y = train.loc[:, 'SalePrice']

from sklearn.linear_model import Ridge, RidgeCV, LassoCV, ElasticNet

# NOTE: las_cv_preds, ridge_preds and en_preds are computed here but the final
# blend only uses ridge_cv_preds from this group of linear models.

# Lasso with the regularisation strength chosen by cross-validation.
model_las_cv = LassoCV(alphas=(0.0001, 0.0005, 0.001, 0.01, 0.05, 0.1, 0.3, 1, 3, 5, 10))
model_las_cv.fit(X, y)
las_cv_preds = model_las_cv.predict(test_data)

# Ridge with the regularisation strength chosen by cross-validation.
model_ridge_cv = RidgeCV(alphas=(0.01, 0.05, 0.1, 0.3, 1, 3, 5, 10))
model_ridge_cv.fit(X, y)
ridge_cv_preds = model_ridge_cv.predict(test_data)

# Ridge with a fixed regularisation strength.
model_ridge = Ridge(alpha=10, solver='auto')
model_ridge.fit(X, y)
ridge_preds = model_ridge.predict(test_data)

# Elastic net with a fixed regularisation strength.
model_en = ElasticNet(random_state=1, alpha=0.00065, max_iter=3000)
model_en.fit(X, y)
en_preds = model_en.predict(test_data)

import xgboost as xgb

# 'reg:linear' is deprecated in favour of 'reg:squarederror' (XGBoost emits a
# warning for the old name); the new name selects the same squared-error loss.
model_xgb = xgb.XGBRegressor(learning_rate=0.01, n_estimators=3460,
                             max_depth=3, min_child_weight=0,
                             gamma=0, subsample=0.7,
                             colsample_bytree=0.7,
                             objective='reg:squarederror', nthread=-1,
                             scale_pos_weight=1, seed=27,
                             reg_alpha=0.00006)
model_xgb.fit(X, y)
xgb_preds = model_xgb.predict(test_data)

[09:54:40] WARNING: C:/Jenkins/workspace/xgboost-win64_release_0.90/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.

from sklearn.ensemble import GradientBoostingRegressor

# scikit-learn gradient boosting trained with the Huber loss.
gbr_params = dict(
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    loss='huber',
    random_state=42,
)
model_gbr = GradientBoostingRegressor(**gbr_params)
model_gbr.fit(X, y)
gbr_preds = model_gbr.predict(test_data)

from lightgbm import LGBMRegressor

# LightGBM with very small trees (4 leaves) and many boosting rounds.
# min_data_in_leaf=2 and min_sum_hessian_in_leaf=11 were tried and left disabled.
lgbm_params = dict(
    objective='regression',
    num_leaves=4,
    learning_rate=0.01,
    n_estimators=5000,
    max_bin=200,
    bagging_fraction=0.75,
    bagging_freq=5,
    bagging_seed=7,
    feature_fraction=0.2,
    feature_fraction_seed=7,
    verbose=-1,
)
model_lgbm = LGBMRegressor(**lgbm_params)
model_lgbm.fit(X, y)
lgbm_preds = model_lgbm.predict(test_data)

# Weighted blend of four models; the weights add up to 1.0.
final_predictions = 0.3 * lgbm_preds + 0.3 * gbr_preds + 0.1 * xgb_preds + 0.3 * ridge_cv_preds
# Display the first 5 predictions of sale price.
final_predictions[:5]

array([120991.7547464 , 160872.32864741, 186243.01436333, 194394.84973628,
       192268.91952743])

# Assemble the submission table: the test-set Ids next to the blended predictions.
submission = dict(Id=test['Id'].values, SalePrice=final_predictions)
solution = pd.DataFrame(submission)
solution.head()

	Id	SalePrice
0	1461	120991.754746
1	1462	160872.328647
2	1463	186243.014363
3	1464	194394.849736
4	1465	192268.919527

# Write the submission table to submission3.csv without the index column.
solution.to_csv('submission3.csv',index=False)

一只小金毛zy

关注

2
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
【kaggle prices-advanced-regression-techniques】房价预测 - 前14%

得分：0.11569 前14% import numpy as np import pandas as pd #import matplotlib.pyplot as plt #import seaborn as sns import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename...
复制链接

扫一扫

专栏目录