Score: 0.11569 (top 14%)
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
#import seaborn as sns
import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
import warnings
# Suppress every warning category for the rest of the run.
warnings.filterwarnings("ignore")
# Load the training set from the working directory and preview its first rows.
train = pd.read_csv("train.csv")
train.head()
Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
1 | 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
2 | 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
3 | 4 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
4 | 5 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
5 rows × 81 columns
# Remove the rows labelled 523 and 1298 from the training frame
# (presumably outliers -- TODO confirm the reason for these two labels).
train=train.drop(index=[523,1298],axis=0)
# Load the test set from the working directory.
test = pd.read_csv("test.csv")
# Report the row and column counts of both frames.
print('th train data has {} rows and {} features'.format(train.shape[0],train.shape[1]))
print('the test data has {} rows and {} features'.format(test.shape[0],test.shape[1]))
th train data has 1458 rows and 81 features
the test data has 1459 rows and 80 features
# Stack the training features on top of the test rows so that imputation and
# encoding are applied to both at once. The target is removed by name rather
# than by position, so the result does not depend on SalePrice being the last
# column. The original row labels are kept (the index is not reset).
data=pd.concat([train.drop(columns='SalePrice'),test],axis=0)
print('tha data has {} rows and {} features'.format(data.shape[0],data.shape[1]))
tha data has 2917 rows and 80 features
# Display the column labels of the combined frame.
data.columns
Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',
'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
'SaleCondition'],
dtype='object')
# Print per-column non-null counts and dtypes of the combined frame.
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2917 entries, 0 to 1458
Data columns (total 80 columns):
Id 2917 non-null int64
MSSubClass 2917 non-null int64
MSZoning 2913 non-null object
LotFrontage 2431 non-null float64
LotArea 2917 non-null int64
Street 2917 non-null object
Alley 198 non-null object
LotShape 2917 non-null object
LandContour 2917 non-null object
Utilities 2915 non-null object
LotConfig 2917 non-null object
LandSlope 2917 non-null object
Neighborhood 2917 non-null object
Condition1 2917 non-null object
Condition2 2917 non-null object
BldgType 2917 non-null object
HouseStyle 2917 non-null object
OverallQual 2917 non-null int64
OverallCond 2917 non-null int64
YearBuilt 2917 non-null int64
YearRemodAdd 2917 non-null int64
RoofStyle 2917 non-null object
RoofMatl 2917 non-null object
Exterior1st 2916 non-null object
Exterior2nd 2916 non-null object
MasVnrType 2893 non-null object
MasVnrArea 2894 non-null float64
ExterQual 2917 non-null object
ExterCond 2917 non-null object
Foundation 2917 non-null object
BsmtQual 2836 non-null object
BsmtCond 2835 non-null object
BsmtExposure 2835 non-null object
BsmtFinType1 2838 non-null object
BsmtFinSF1 2916 non-null float64
BsmtFinType2 2837 non-null object
BsmtFinSF2 2916 non-null float64
BsmtUnfSF 2916 non-null float64
TotalBsmtSF 2916 non-null float64
Heating 2917 non-null object
HeatingQC 2917 non-null object
CentralAir 2917 non-null object
Electrical 2916 non-null object
1stFlrSF 2917 non-null int64
2ndFlrSF 2917 non-null int64
LowQualFinSF 2917 non-null int64
GrLivArea 2917 non-null int64
BsmtFullBath 2915 non-null float64
BsmtHalfBath 2915 non-null float64
FullBath 2917 non-null int64
HalfBath 2917 non-null int64
BedroomAbvGr 2917 non-null int64
KitchenAbvGr 2917 non-null int64
KitchenQual 2916 non-null object
TotRmsAbvGrd 2917 non-null int64
Functional 2915 non-null object
Fireplaces 2917 non-null int64
FireplaceQu 1497 non-null object
GarageType 2760 non-null object
GarageYrBlt 2758 non-null float64
GarageFinish 2758 non-null object
GarageCars 2916 non-null float64
GarageArea 2916 non-null float64
GarageQual 2758 non-null object
GarageCond 2758 non-null object
PavedDrive 2917 non-null object
WoodDeckSF 2917 non-null int64
OpenPorchSF 2917 non-null int64
EnclosedPorch 2917 non-null int64
3SsnPorch 2917 non-null int64
ScreenPorch 2917 non-null int64
PoolArea 2917 non-null int64
PoolQC 9 non-null object
Fence 571 non-null object
MiscFeature 105 non-null object
MiscVal 2917 non-null int64
MoSold 2917 non-null int64
YrSold 2917 non-null int64
SaleType 2916 non-null object
SaleCondition 2917 non-null object
dtypes: float64(11), int64(26), object(43)
memory usage: 1.8+ MB
# Split the combined frame by dtype: int64/float64 columns vs. object columns.
num_features=data.select_dtypes(include=['int64','float64'])
categorical_features=data.select_dtypes(include='object')
# Summary statistics of the numeric columns.
num_features.describe()
Id | MSSubClass | LotFrontage | LotArea | OverallQual | OverallCond | YearBuilt | YearRemodAdd | MasVnrArea | BsmtFinSF1 | ... | GarageArea | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | MiscVal | MoSold | YrSold | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 2917.000000 | 2917.000000 | 2431.000000 | 2917.000000 | 2917.000000 | 2917.000000 | 2917.000000 | 2917.000000 | 2894.000000 | 2916.000000 | ... | 2916.000000 | 2917.000000 | 2917.000000 | 2917.000000 | 2917.000000 | 2917.000000 | 2917.000000 | 2917.000000 | 2917.000000 | 2917.000000 |
mean | 1460.376071 | 57.135756 | 69.180584 | 10139.439150 | 6.086390 | 5.564964 | 1971.287967 | 1984.248200 | 101.733587 | 439.015432 | ... | 472.409465 | 93.629414 | 47.280082 | 23.114158 | 2.604045 | 16.073363 | 2.088790 | 50.860816 | 6.213576 | 2007.792938 |
std | 842.892456 | 42.532140 | 22.791719 | 7807.036512 | 1.406704 | 1.113414 | 30.286991 | 20.892257 | 178.510291 | 444.182329 | ... | 214.620878 | 126.532643 | 67.118965 | 64.263424 | 25.196714 | 56.202054 | 34.561371 | 567.595198 | 2.713070 | 1.315328 |
min | 1.000000 | 20.000000 | 21.000000 | 1300.000000 | 1.000000 | 1.000000 | 1872.000000 | 1950.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 2006.000000 |
25% | 731.000000 | 20.000000 | 59.000000 | 7476.000000 | 5.000000 | 5.000000 | 1953.000000 | 1965.000000 | 0.000000 | 0.000000 | ... | 320.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 4.000000 | 2007.000000 |
50% | 1461.000000 | 50.000000 | 68.000000 | 9452.000000 | 6.000000 | 5.000000 | 1973.000000 | 1993.000000 | 0.000000 | 368.000000 | ... | 480.000000 | 0.000000 | 26.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 6.000000 | 2008.000000 |
75% | 2190.000000 | 70.000000 | 80.000000 | 11556.000000 | 7.000000 | 6.000000 | 2001.000000 | 2004.000000 | 164.000000 | 733.000000 | ... | 576.000000 | 168.000000 | 70.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 8.000000 | 2009.000000 |
max | 2919.000000 | 190.000000 | 313.000000 | 215245.000000 | 10.000000 | 9.000000 | 2010.000000 | 2010.000000 | 1600.000000 | 4010.000000 | ... | 1488.000000 | 1424.000000 | 742.000000 | 1012.000000 | 508.000000 | 576.000000 | 800.000000 | 17000.000000 | 12.000000 | 2010.000000 |
8 rows × 37 columns
# Summary (count / unique / top / freq) of the object columns.
categorical_features.describe()
MSZoning | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | ... | GarageType | GarageFinish | GarageQual | GarageCond | PavedDrive | PoolQC | Fence | MiscFeature | SaleType | SaleCondition | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 2913 | 2917 | 198 | 2917 | 2917 | 2915 | 2917 | 2917 | 2917 | 2917 | ... | 2760 | 2758 | 2758 | 2758 | 2917 | 9 | 571 | 105 | 2916 | 2917 |
unique | 5 | 2 | 2 | 4 | 4 | 2 | 5 | 3 | 25 | 9 | ... | 6 | 3 | 5 | 5 | 3 | 3 | 4 | 4 | 9 | 6 |
top | RL | Pave | Grvl | Reg | Lvl | AllPub | Inside | Gtl | NAmes | Norm | ... | Attchd | Unf | TA | TA | Y | Ex | MnPrv | Shed | WD | Normal |
freq | 2263 | 2905 | 120 | 1859 | 2622 | 2914 | 2132 | 2776 | 443 | 2511 | ... | 1722 | 1230 | 2602 | 2652 | 2639 | 4 | 329 | 95 | 2525 | 2402 |
4 rows × 43 columns
# Show the 34 largest per-column missing-value counts, largest first.
data.isnull().sum().sort_values(ascending=False)[:34]
#print(categorical_features.isnull().sum().sort_values(ascending=False)[:23])
#num_features.isnull().sum().sort_values(ascending=False)[:11]
PoolQC 2908
MiscFeature 2812
Alley 2719
Fence 2346
FireplaceQu 1420
LotFrontage 486
GarageCond 159
GarageQual 159
GarageYrBlt 159
GarageFinish 159
GarageType 157
BsmtCond 82
BsmtExposure 82
BsmtQual 81
BsmtFinType2 80
BsmtFinType1 79
MasVnrType 24
MasVnrArea 23
MSZoning 4
BsmtHalfBath 2
Utilities 2
Functional 2
BsmtFullBath 2
BsmtFinSF1 1
Exterior1st 1
Exterior2nd 1
BsmtFinSF2 1
BsmtUnfSF 1
TotalBsmtSF 1
SaleType 1
Electrical 1
KitchenQual 1
GarageArea 1
GarageCars 1
dtype: int64
# Read the data dictionary inside a context manager so the file handle is
# closed immediately instead of staying open for the rest of the session.
with open("data_description.txt", "r") as f:
    description_text = f.read()
#print(description_text)
# Drop four columns from the combined frame.
# NOTE(review): presumably removed as uninformative -- confirm the rationale.
data = data.drop(columns=['Id','Street','PoolQC','Utilities'],axis=1)
#data['LotFrontage'].fillna(int(data['LotFrontage'].mean()),inplace=True)
# Fill each missing LotFrontage with the median LotFrontage of its Neighborhood.
data['LotFrontage'] = data.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))
# Count the LotFrontage values that are still missing.
data['LotFrontage'].isnull().sum()
0
#create a new class 'other'
# Missing entries in these categorical columns become their own 'Other' level.
features=['Electrical','KitchenQual','SaleType','Exterior2nd','Exterior1st','Alley','Fence', 'MiscFeature','FireplaceQu','GarageCond','GarageQual','GarageFinish','GarageType','BsmtCond','BsmtExposure','BsmtQual','BsmtFinType2','BsmtFinType1','MasVnrType']
# Assign the filled columns back explicitly. Calling fillna(inplace=True) on
# data[name] is chained assignment, which can operate on a temporary copy and
# leave `data` unchanged on recent pandas versions.
data[features] = data[features].fillna('Other')
# Confirm that none of these columns has a missing value left.
data[features].isnull().sum()
Electrical 0
KitchenQual 0
SaleType 0
Exterior2nd 0
Exterior1st 0
Alley 0
Fence 0
MiscFeature 0
FireplaceQu 0
GarageCond 0
GarageQual 0
GarageFinish 0
GarageType 0
BsmtCond 0
BsmtExposure 0
BsmtQual 0
BsmtFinType2 0
BsmtFinType1 0
MasVnrType 0
dtype: int64
# Fill each missing MSZoning with the most frequent MSZoning among rows that
# share the same MSSubClass.
data['MSZoning'] = data.groupby('MSSubClass')['MSZoning'].transform(lambda x: x.fillna(x.mode()[0]))
#data.MSZoning = data.groupby(['MSSubClass'])['MSZoning'].transform(lambda x: x.fillna(x.value_counts()[0]))
# Use the dataset's own level name 'Typ' (typical functionality, per the Ames
# data dictionary). A lowercase 'typ' would be treated as a separate category
# and produce an extra indicator column when the column is one-hot encoded.
data['Functional']=data['Functional'].fillna('Typ')
# Numeric columns whose missing values are replaced by 0.
zero=['GarageYrBlt','GarageArea','MasVnrArea','BsmtHalfBath','BsmtFullBath','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','GarageCars']
# Assign back explicitly rather than relying on chained fillna(inplace=True).
data[zero] = data[zero].fillna(0)
# Total number of missing values remaining in the combined frame.
data.isnull().sum().sum()
0
# Recode MSSubClass into the groups 0-6. Each key is the new code and each
# value lists the original codes merged into it; groups are applied in order.
subclass_groups = {
    0: [60],
    1: [20, 120],
    2: [75],
    3: [40, 70, 80],
    4: [50, 85, 90, 160, 190],
    5: [30, 45, 180],
    6: [150],
}
for new_code, old_codes in subclass_groups.items():
    data.loc[data['MSSubClass'].isin(old_codes), 'MSSubClass'] = new_code
# Labels of the object-dtype columns of the combined frame.
object_features = data.select_dtypes(include='object').columns
object_features
Index(['MSZoning', 'Alley', 'LotShape', 'LandContour', 'LotConfig',
'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional',
'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
'PavedDrive', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition'],
dtype='object')
def dummies(d):
    """One-hot encode every object-dtype column of ``d``.

    Each object column is expanded with ``pd.get_dummies`` (no level is
    dropped) and the resulting indicator columns are named
    ``<column>_<level>``.

    Args:
        d: DataFrame to encode. Columns that are not object-dtype are ignored.

    Returns:
        A DataFrame containing only the indicator columns, grouped in the
        order the object columns appear in ``d``. An empty DataFrame is
        returned when ``d`` has no object columns.
    """
    object_columns = d.select_dtypes(include='object').columns
    encoded = [
        pd.get_dummies(d[name], drop_first=False).add_prefix("{}_".format(name))
        for name in object_columns
    ]
    if not encoded:
        return pd.DataFrame()
    # Concatenate once instead of re-copying a growing frame on every column.
    return pd.concat(encoded, axis=1)
# One-hot encode the object columns of the combined frame and check the shape.
dummies_data=dummies(data)
dummies_data.shape
(2917, 263)
# Remove the original object columns; their encoded form is in dummies_data.
data=data.drop(columns=object_features,axis=1)
data.columns
Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
'MoSold', 'YrSold'],
dtype='object')
# Place the remaining columns and the indicator columns side by side.
final_data=pd.concat([data,dummies_data],axis=1)
final_data.shape
(2917, 299)
#Re-splitting the data into train and test datasets
# The training rows come first in final_data, so split at the current number
# of training rows instead of a hard-coded count that silently goes stale when
# rows are added to or removed from `train`.
n_train = train.shape[0]
train_data=final_data.iloc[:n_train,:]
test_data=final_data.iloc[n_train:,:]
print(train_data.shape)
test_data.shape
(1458, 299)
(1459, 299)
# X: independent variables and y: target variable
# y is taken from `train`, the same frame whose rows form the top of final_data.
X=train_data
y=train.loc[:,'SalePrice']
from sklearn.linear_model import Ridge, RidgeCV, LassoCV, ElasticNet
# Lasso with its alpha selected by cross-validation from the listed candidates.
model_las_cv = LassoCV(alphas=(0.0001, 0.0005, 0.001, 0.01, 0.05, 0.1, 0.3, 1, 3, 5, 10))
model_las_cv.fit(X,y)
las_cv_preds=model_las_cv.predict(test_data)
# Ridge with its alpha selected by cross-validation from the listed candidates.
model_ridge_cv = RidgeCV(alphas=(0.01, 0.05, 0.1, 0.3, 1, 3, 5, 10))
model_ridge_cv.fit(X, y)
ridge_cv_preds=model_ridge_cv.predict(test_data)
# Ridge with a fixed alpha of 10.
model_ridge = Ridge(alpha=10, solver='auto')
model_ridge.fit(X, y)
ridge_preds=model_ridge.predict(test_data)
# Elastic net with a fixed alpha and max_iter=3000.
# NOTE(review): in the visible file only ridge_cv_preds from this group is
# used in the final blend -- confirm whether the other three are still needed.
model_en = ElasticNet(random_state=1, alpha=0.00065, max_iter=3000)
model_en.fit(X, y)
en_preds=model_en.predict(test_data)
import xgboost as xgb
# Gradient-boosted trees fitted on the same X / y as the other models.
# 'reg:squarederror' is the current name of the squared-error objective that
# older XGBoost releases called 'reg:linear' (deprecated). n_jobs and
# random_state are the scikit-learn-style replacements for nthread and seed.
model_xgb = xgb.XGBRegressor(
    learning_rate=0.01,
    n_estimators=3460,
    max_depth=3,
    min_child_weight=0,
    gamma=0,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='reg:squarederror',
    n_jobs=-1,
    scale_pos_weight=1,
    random_state=27,
    reg_alpha=0.00006,
)
model_xgb.fit(X, y)
xgb_preds=model_xgb.predict(test_data)
[09:54:40] WARNING: C:/Jenkins/workspace/xgboost-win64_release_0.90/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
from sklearn.ensemble import GradientBoostingRegressor
# Gradient boosting regressor with Huber loss and a fixed random seed;
# fit on X / y, then predict the test rows.
model_gbr = GradientBoostingRegressor(n_estimators=3000,
learning_rate=0.05,
max_depth=4,
max_features='sqrt',
min_samples_leaf=15,
min_samples_split=10,
loss='huber',
random_state =42)
model_gbr.fit(X, y)
gbr_preds=model_gbr.predict(test_data)
from lightgbm import LGBMRegressor
# LightGBM regressor with 4 leaves per tree, 5000 trees at learning rate 0.01,
# and fixed bagging / feature-fraction seeds; fit on X / y, then predict the
# test rows.
model_lgbm = LGBMRegressor(objective='regression',
num_leaves=4,
learning_rate=0.01,
n_estimators=5000,
max_bin=200,
bagging_fraction=0.75,
bagging_freq=5,
bagging_seed=7,
feature_fraction=0.2,
feature_fraction_seed=7,
verbose=-1,
#min_data_in_leaf=2,
#min_sum_hessian_in_leaf=11
)
model_lgbm.fit(X, y)
lgbm_preds=model_lgbm.predict(test_data)
# Blend four models' test predictions with fixed weights
# (0.3 + 0.3 + 0.1 + 0.3 = 1.0).
final_predictions = 0.3 * lgbm_preds + 0.3 * gbr_preds + 0.1 * xgb_preds + 0.3 * ridge_cv_preds
#display the first 5 predictions of sale price
final_predictions[:5]
array([120991.7547464 , 160872.32864741, 186243.01436333, 194394.84973628,
192268.91952743])
#make the submission data frame
# Pair each test Id with its blended SalePrice prediction.
submission = {
'Id': test.Id.values,
'SalePrice': final_predictions
}
solution = pd.DataFrame(submission)
solution.head()
Id | SalePrice | |
---|---|---|
0 | 1461 | 120991.754746 |
1 | 1462 | 160872.328647 |
2 | 1463 | 186243.014363 |
3 | 1464 | 194394.849736 |
4 | 1465 | 192268.919527 |
#make the submission file
# Write the Id / SalePrice table to CSV without the DataFrame index column.
solution.to_csv('submission3.csv',index=False)