预测房价模型
数据导入
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import norm, skew
# Load the training and test tables from the local ./house-prices directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./house-prices/train.csv', './house-prices/test.csv')
)
# Column name -> short (Chinese) description, used to label columns in summary
# tables. 'SalePrice' is the prediction target.
#
# Every key appears exactly once: the original literal repeated 'MoSold' and
# '3SsnPorch', which silently discarded the earlier value. The resulting mapping
# (values and key order) is unchanged.
#
# NOTE(review): 'RemodAdd', 'Bedroom', 'Kitchen', 'SsnPorch', 'SoldYrSold' and
# 'SoldSaleType' do not seem to match any dataset column (compare 'YearRemodAdd',
# 'BedroomAbvGr', 'KitchenAbvGr', '3SsnPorch', 'YrSold', 'SaleType' below).
# They are kept for backward compatibility -- confirm before removing.
dict_name = {
    'Id': 'Id',
    'SalePrice': '预测目标房价',
    'MSSubClass': '建筑种类(20,30,40等等)',
    'MSZoning': '分区分类(5类:RH,C(all),RL等等)',
    'LotFrontage': '与财产相连的街道的直线尺(234+NA)',
    'LotArea': '面积(8450,11200...)',
    'Street': '道路通道类别(Grvi Pave两类)',
    'Alley': '通道类型(Grvi Pave两类+NA)',
    'LotShape': '财产形状(4类:IR1,IR2...)',
    'LandContour': '平坦性(4类:Lvl...)',
    'Utilities': '可用的工具种类(2类:Allpub...)',
    'LotConfig': ' 配置(5类:corner...)',
    'LandSlope': '倾斜属性(3类:Gtl...)',
    'Neighborhood': '埃姆斯市范围内的物理位置:好多类(dlegg...)',
    'Condition1': '靠近主干路或铁路(Norm,Alidg...)',
    'Condition2': '靠近主干路或铁路(如果有第二条)',
    'BldgType': '住宅类型(5类)',
    'HouseStyle': '住宅风格(7类左右)',
    'OverallQual': '整体材料和成品质量:1-10之间',
    'OverallCond': '总体状况评级:1-9之间',
    'YearBuilt': '建于某某年:2031',
    'RemodAdd': '改变日期2001',
    'RoofStyle': '屋顶风格(5类:Gable...)',
    'RoofMatl': '屋顶材料(8类左右)',
    'Exterior1st': '房屋外墙:(好几类呢也)',
    'Exterior2nd': '第二种材料,多了个None,有些就一种材料)',
    'MasVnrType': '表层砌体类型:(4类+None)',
    'MasVnrArea': '砖石面积:(0,各种数字1,300,还有NA)',
    'ExterQual': '外部材质:(4类,四个看不懂的英文缩写)',
    'ExterCond': '外观材料的现状:由高到低,5类',
    'Foundation': '地基类型:几类看不懂的类型(wood...)',
    'BsmtQual': '地下室的高度:由高到低:5类',
    'BsmtCond': '地下室现状:同上,5类',
    'BsmtExposure': '户外或花园水平的地下室墙壁,同上,5类',
    'BsmtFinType1': '地下室装修区域质量:好几类',
    'BsmtFinSF1': '1型面积',
    'BsmtFinType2': '第二种质量',
    'BsmtFinSF2': '2型面积',
    'BsmtUnfSF': '未完工的地下室面积',
    'TotalBsmtSF': '地下面积的总面积',
    'Heating': '加热方式:好几种',
    'HeatingQC': '加热质量及条件:5种',
    'CentralAir': '中央空调(N/Y)有或者没有yes or no',
    'Electrical': '电气系统:好几种',
    '1stFlrSF': '第一层面积',
    '2ndFlrSF': '第二层面积',
    'LowQualFinSF': '低品质成品平方英尺(所有楼层):各种数字',
    'GrLivArea': '地面以上居住面积平方英尺',
    'BsmtFullBath': '地下室全浴室(0-3)',
    'BsmtHalfBath': '半地下室卫生间(0-2)',
    'FullBath': '高档全浴室(0-3)',
    'HalfBath': '半浴缸以上(0-2)',
    'Bedroom': '地下室以上的卧室数:0-8',
    'Kitchen': '厨房数量',
    'KitchenQual': '厨房质量:5个',
    'TotRmsAbvGrd': '房间总数(不含卫生间)',
    'Functional': '家庭功能评级(min,mod...)',
    'Fireplaces': '壁炉数量',
    'FireplaceQu': '壁炉质量',
    'GarageType': '车库位置(好几类)',
    'GarageYrBlt': '车库建造年份',
    'GarageFinish': '车库内部装修',
    'GarageCars': '车库容量的大小0-4',
    'GarageArea': '车库面积',
    'GarageQual': '车库质量',
    'GarageCond': '车库条件,好几类',
    'PavedDrive': '道路车道(N/P/Y)',
    'WoodDeckSF': '木甲板面积',
    'BedroomAbvGr': '地下室以上的卧室数量,类型:数值',
    'OpenPorchSF': '开放式门廊面积(平方英尺)',
    'EnclosedPorch': '3英尺的封闭式玄关区域',
    'SsnPorch': '三季门廊面积平方英尺',
    'ScreenPorch': '屏风门廊面积(平方英尺)',
    'PoolArea': '游泳池面积(平方英尺)',
    'PoolQC': '泳池质量',
    'Fence': '栅栏质量',
    'KitchenAbvGr': '厨房数量,类型:数值',
    '3SsnPorch': '三个季节门廊面积',
    'MiscFeature': '其他类别中未涉及的杂项功能',
    'MiscVal': '杂项特征值0-15500',
    # Was listed twice ('Month', then '卖出月份'); only the later value ever took effect.
    'MoSold': '卖出月份',
    'SoldYrSold': 'Year',
    'YearRemodAdd': '修复年份',
    'SoldSaleType': '销售类型',
    'YrSold': '卖出年份',
    'SaleType': '交易类型',
    'SaleCondition': '销售质量',
}
# Preview the first five rows of the training table.
train_df.head(n=5)
Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
1 | 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
2 | 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
3 | 4 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
4 | 5 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
5 rows × 81 columns
数据预处理
# Count the missing values (NaN) in every column of the training set.
rows = train_df.isnull().sum()
indexs = rows.index
# Build a per-column summary: column name, description, NaN count and NaN share.
tlen = len(train_df)
names = [dict_name[column] for column in indexs]
percents = ['{:.2%}'.format(count / tlen) for count in rows.tolist()]  # NaN count as a share of all rows
missing_data = pd.concat([pd.Series(indexs.tolist()), pd.Series(names),
                          pd.Series(rows.tolist()), pd.Series(percents)],
                         axis=1, keys=['name', 'desc', 'na', 'percents'])
# Sort on the numeric count. 'percents' holds formatted strings, so sorting on it
# compares text and ranks '5.55%' above '47.26%'. mergesort is a stable algorithm,
# which keeps the result deterministic.
missing_data = missing_data.sort_values(by='na', ascending=False, kind='mergesort')
missing_data.head(20)
name | desc | na | percents | |
---|---|---|---|---|
72 | PoolQC | 泳池质量 | 1453 | 99.52% |
74 | MiscFeature | 其他类别中未涉及的杂项功能 | 1406 | 96.30% |
6 | Alley | 通道类型(Grvi Pave两类+NA) | 1369 | 93.77% |
73 | Fence | 栅栏质量 | 1179 | 80.75% |
64 | GarageCond | 车库条件,好几类 | 81 | 5.55% |
58 | GarageType | 车库位置(好几类) | 81 | 5.55% |
63 | GarageQual | 车库质量 | 81 | 5.55% |
60 | GarageFinish | 车库内部装修 | 81 | 5.55% |
59 | GarageYrBlt | 车库建造年份 | 81 | 5.55% |
57 | FireplaceQu | 壁炉质量 | 690 | 47.26% |
35 | BsmtFinType2 | 第二种质量 | 38 | 2.60% |
32 | BsmtExposure | 户外或花园水平的地下室墙壁,同上,5类 | 38 | 2.60% |
30 | BsmtQual | 地下室的高度:由高到低:5类 | 37 | 2.53% |
31 | BsmtCond | 地下室现状:同上,5类 | 37 | 2.53% |
33 | BsmtFinType1 | 地下室装修区域质量:好几类 | 37 | 2.53% |
3 | LotFrontage | 与财产相连的街道的直线尺(234+NA) | 259 | 17.74% |
25 | MasVnrType | 表层砌体类型:(4类+None) | 8 | 0.55% |
26 | MasVnrArea | 砖石面积:(0,各种数字1,300,还有NA) | 8 | 0.55% |
42 | Electrical | 电气系统:好几种 | 1 | 0.07% |
0 | Id | Id | 0 | 0.00% |
# Missing-value handling for LotFrontage, MasVnrArea, GarageYrBlt and Electrical.
# Report the mean and median of LotFrontage (NaN skipped). The labels previously
# said "Age", which is not the column being summarised.
print('平均值 "LotFrontage" is %.2f' % train_df["LotFrontage"].mean(skipna=True))
print('中位数 "LotFrontage" is %.2f' % train_df["LotFrontage"].median(skipna=True))
# Replace NaN in these columns with the column mean.
for mean_filled_col in ('LotFrontage', 'MasVnrArea', 'GarageYrBlt'):
    train_df[mean_filled_col] = train_df[mean_filled_col].fillna(train_df[mean_filled_col].mean())
# Electrical: replace NaN with the most frequent value of the column.
train_df["Electrical"] = train_df["Electrical"].fillna(train_df['Electrical'].value_counts().idxmax())
# Show the 20 columns that still have the most missing values.
train_df.isnull().sum().sort_values(ascending=False).head(20)
平均值 "Age" is 70.05
中位数 "Age" is 69.00
PoolQC 1453
MiscFeature 1406
Alley 1369
Fence 1179
FireplaceQu 690
GarageFinish 81
GarageType 81
GarageQual 81
GarageCond 81
BsmtExposure 38
BsmtFinType2 38
BsmtFinType1 37
BsmtCond 37
BsmtQual 37
MasVnrType 8
BedroomAbvGr 0
KitchenAbvGr 0
HalfBath 0
KitchenQual 0
FullBath 0
dtype: int64
# BsmtQual: frequency of each category, as a table and as a bar chart.
bsmt_qual_counts = train_df['BsmtQual'].value_counts()
print(bsmt_qual_counts)
# One bar per category, bar height = number of rows in that category.
sns.countplot(x='BsmtQual', data=train_df, palette='Set2')
plt.show()
# Report the most frequent category.
print('地下室的高度 %s.' % bsmt_qual_counts.idxmax())
TA 649
Gd 618
Ex 121
Fa 35
Name: BsmtQual, dtype: int64
![png](house-price_files/house-price_6_1.png)
地下室的高度 TA.
# NaN count per test-set column; show the 20 columns with the most missing values.
test_na_counts = test_df.isnull().sum()
test_na_counts.sort_values(ascending=False).head(20)
PoolQC 1456
MiscFeature 1408
Alley 1352
Fence 1169
FireplaceQu 730
LotFrontage 227
GarageYrBlt 78
GarageQual 78
GarageFinish 78
GarageCond 78
GarageType 76
BsmtCond 45
BsmtQual 44
BsmtExposure 44
BsmtFinType1 42
BsmtFinType2 42
MasVnrType 16
MasVnrArea 15
MSZoning 4
BsmtHalfBath 2
dtype: int64
# Columns used for modelling; the last entry ('SalePrice') is the target,
# so cols[:-1] is the feature list.
cols = [
    'LotFrontage', 'OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
    'BsmtFinSF1', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea',
    'FullBath', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars',
    'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'SalePrice',
]
# Replace NaN in these test-set columns with the mean of the same test-set column.
# NOTE(review): the means come from test_df itself, not from train_df -- confirm
# this is intended.
for test_col in ('GarageYrBlt', 'MasVnrArea', 'GarageCars', 'GarageArea',
                 'BsmtFinSF1', 'TotalBsmtSF', 'LotFrontage'):
    test_df[test_col] = test_df[test_col].fillna(test_df[test_col].mean())
# Check the remaining NaN counts for the feature columns.
test_df[cols[:-1]].isnull().sum().sort_values(ascending=False).head(20)
LotFrontage 0
OverallQual 0
WoodDeckSF 0
GarageArea 0
GarageCars 0
GarageYrBlt 0
Fireplaces 0
TotRmsAbvGrd 0
FullBath 0
GrLivArea 0
2ndFlrSF 0
1stFlrSF 0
TotalBsmtSF 0
BsmtFinSF1 0
MasVnrArea 0
YearRemodAdd 0
YearBuilt 0
OpenPorchSF 0
dtype: int64
特征选择
# Preview the first five rows of the training table.
train_df.head(n=5)
Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
1 | 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
2 | 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
3 | 4 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
4 | 5 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
5 rows × 81 columns
# Pairwise correlation (DataFrame.corr, Pearson by default) of the selected
# columns, drawn as an annotated heatmap.
corr_matrix = train_df[cols].corr()
_, heatmap_ax = plt.subplots(figsize=(16, 10))
sns.heatmap(corr_matrix, annot=True, cmap="RdYlGn", ax=heatmap_ax)
plt.show()
![png](house-price_files/house-price_12_0.png)
决策树
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from matplotlib import pyplot as plt
# Training data: every entry of cols except the last ('SalePrice') is a feature.
feature_cols = cols[:-1]
X_train = train_df[feature_cols]
y_train = train_df['SalePrice'].values
# Fixed random_state so repeated runs build the same tree.
dtreg = DecisionTreeRegressor(random_state=100)
# Optional: tree.plot_tree(dtreg) after fitting draws the tree with matplotlib.
dtreg.fit(X_train, y_train)
DecisionTreeRegressor(random_state=100)
from sklearn import metrics
# Predict on the test-set features and keep the predictions as an (n, 1) array.
X_test = test_df[cols[:-1]]
dtr_pred = dtreg.predict(X_test).reshape(-1, 1)
# Reference prices the predictions are scored against.
# NOTE(review): sample_submission.csv is presumably Kaggle's example submission
# rather than true sale prices -- confirm before trusting these metrics.
y_test_df = pd.read_csv("./house-prices/sample_submission.csv")
y_test = y_test_df['SalePrice'].values
# Mean absolute error and root mean squared error, rounded to 4 decimals.
mae = metrics.mean_absolute_error(y_test, dtr_pred)
rmse = np.sqrt(metrics.mean_squared_error(y_test, dtr_pred))
print('MAE:', round(mae, 4))
print('RMSE:', round(rmse, 4))
MAE: 56025.5318
RMSE: 74419.8462
# Render the fitted tree as a PNG via Graphviz.
# Requires the Graphviz executables on PATH; pydotplus raises InvocationException
# in create_png() when they are not found.
import pydotplus
from IPython.display import Image

dot_data = tree.export_graphviz(
    dtreg,
    out_file=None,  # return the DOT source as a string instead of writing a file
    feature_names=cols[:-1],
    filled=True,
    impurity=False,
    rounded=True,
)
graph = pydotplus.graph_from_dot_data(dot_data)
# Give the node at position 7 of the node list a custom fill colour.
highlighted_node = graph.get_nodes()[7]
highlighted_node.set_fillcolor('#FFF2DD')
Image(graph.create_png())
---------------------------------------------------------------------------
InvocationException Traceback (most recent call last)
<ipython-input-16-93409461a4a2> in <module>
9
10 from IPython.display import Image
---> 11 Image(graph.create_png())
~/anaconda3/lib/python3.7/site-packages/pydotplus/graphviz.py in <lambda>(f, prog)
1795 self.__setattr__(
1796 'create_' + frmt,
-> 1797 lambda f=frmt, prog=self.prog: self.create(format=f, prog=prog)
1798 )
1799 f = self.__dict__['create_' + frmt]
~/anaconda3/lib/python3.7/site-packages/pydotplus/graphviz.py in create(self, prog, format)
1958 if self.progs is None:
1959 raise InvocationException(
-> 1960 'GraphViz\'s executables not found')
1961
1962 if prog not in self.progs:
InvocationException: GraphViz's executables not found
# Scatter the first 100 predictions against the reference values, with a dashed
# y = x line between 120000 and 260000 for comparison.
n_points = 100
diagonal_ends = [120000, 260000]
plt.figure(figsize=(12, 8))
plt.scatter(y_test[:n_points], dtr_pred[:n_points], c='green')
plt.plot(diagonal_ends, diagonal_ends, 'k--')
plt.xlabel('Y Test')
plt.ylabel('Predicted Y')
plt.show()