机器学习-DecisionTreeRegressor

最新推荐文章于 2024-04-29 11:23:04 发布

the uzi

最新推荐文章于 2024-04-29 11:23:04 发布

阅读量356

点赞数

文章标签：机器学习 python 人工智能

本文链接：https://blog.csdn.net/albert__einstein/article/details/129409231

版权

数据挖掘专栏收录该内容

34 篇文章 8 订阅

订阅专栏

预测房价模型

数据导入

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 
from scipy import stats
from scipy.stats import norm, skew

train_df = pd.read_csv('./house-prices/train.csv')
test_df = pd.read_csv('./house-prices/test.csv')

 # SalePrice: 预测目标房价
dict_name = {
 'Id':'Id',
 'SalePrice': '预测目标房价',
 'MSSubClass':'建筑种类（20，30，40等等）',
 'MSZoning':'分区分类（5类：RH,C(all),RL等等）',
 'LotFrontage':'与财产相连的街道的直线尺（234+NA)',
 'LotArea':'面积（8450，11200...)',
 'Street':'道路通道类别（Grvi Pave两类）',
 'Alley':'通道类型（Grvi Pave两类+NA）',
 'LotShape':'财产形状（4类：IR1,IR2...)',
 'LandContour':'平坦性（4类：Lvl...）',
 'Utilities':'可用的工具种类（2类：Allpub...）',
 'LotConfig':' 配置（5类：corner...)',
 'LandSlope':'倾斜属性(3类：Gtl...）',
 'Neighborhood':'埃姆斯市范围内的物理位置：好多类（dlegg...)',
 'Condition1':'靠近主干路或铁路（Norm,Alidg...)',
 'Condition2':'靠近主干路或铁路（如果有第二条）',
 'BldgType':'住宅类型（5类）',
 'HouseStyle':'住宅风格（7类左右）',
 'OverallQual': '整体材料和成品质量：1-10之间',
 'OverallCond':'总体状况评级：1-9之间',
 'YearBuilt':'建于某某年：2031',
 'RemodAdd':'改变日期2001',
 'RoofStyle':'屋顶风格（5类:Gable...）',
 'RoofMatl': '屋顶材料（8类左右)',
 'Exterior1st': '房屋外墙：（好几类呢也）',
 'Exterior2nd': '第二种材料，多了个None，有些就一种材料）',
 'MasVnrType': '表层砌体类型:(4类+None）',
 'MasVnrArea': '砖石面积:(0，各种数字1，300，还有NA）',
 'ExterQual': '外部材质:(4类，四个看不懂的英文缩写）',
 'ExterCond': '外观材料的现状：由高到低，5类',
 'Foundation': '地基类型:几类看不懂的类型(wood...)',
 'BsmtQual': '地下室的高度:由高到低：5类',
 'BsmtCond': '地下室现状：同上，5类',
 'BsmtExposure': '户外或花园水平的地下室墙壁，同上，5类',
 'BsmtFinType1': '地下室装修区域质量：好几类',
 'BsmtFinSF1':'1型面积',
 'BsmtFinType2': '第二种质量',
 'BsmtFinSF2': '2型面积',
 'BsmtUnfSF': '未完工的地下室面积',
 'TotalBsmtSF': '地下面积的总面积',
 'Heating': '加热方式：好几种',
 'HeatingQC': '加热质量及条件：5种',
 'CentralAir': '中央空调（N/Y)有或者没有yes or no',
 'Electrical': '电气系统:好几种',
 '1stFlrSF': '第一层面积',
 '2ndFlrSF': '第二层面积',
 'LowQualFinSF': '低品质成品平方英尺(所有楼层)：各种数字',
 'GrLivArea': '地面以上居住面积平方英尺',
 'BsmtFullBath':'地下室全浴室（0-3）',
 'BsmtHalfBath':'半地下室卫生间（0-2）',
 'FullBath': '高档全浴室（0-3）',
 'HalfBath': '半浴缸以上（0-2）',
 'Bedroom': '地下室以上的卧室数：0-8',
 'Kitchen': '厨房数量',
 'KitchenQual': '厨房质量：5个',
 'TotRmsAbvGrd': '房间总数(不含卫生间)',
 'Functional': '家庭功能评级（min,mod...）',
 'Fireplaces': '壁炉数量',
 'FireplaceQu': '壁炉质量',
 'GarageType':'车库位置（好几类)',
 'GarageYrBlt':'车库建造年份',
 'GarageFinish': '车库内部装修',
 'GarageCars': '车库容量的大小0-4',
 'GarageArea': '车库面积',
 'GarageQual': '车库质量',
 'GarageCond': '车库条件，好几类',
 'PavedDrive': '道路车道（N/P/Y)',
 'WoodDeckSF': '木甲板面积',
 'BedroomAbvGr': '地下室以上的卧室数量，类型：数值',
 'OpenPorchSF':'开放式门廊面积(平方英尺)',
 'EnclosedPorch':'3英尺的封闭式玄关区域',
 'SsnPorch':'三季门廊面积平方英尺',
 'ScreenPorch':'屏风门廊面积(平方英尺)',
 'PoolArea': '游泳池面积(平方英尺)',
 'PoolQC': '泳池质量',
 'Fence': '栅栏质量',
 'KitchenAbvGr': '厨房数量，类型：数值',
 '3SsnPorch': '三个季节门廊面积',
 'MiscFeature':'其他类别中未涉及的杂项功能',
 'MiscVal': '杂项特征值0-15500',
 'MoSold': 'Month',
 'SoldYrSold': 'Year',
 '3SsnPorch': '三个季节门廊面积',
 'YearRemodAdd': '修复年份',
 'SoldSaleType': '销售类型',
 'MoSold': '卖出月份',
 'YrSold': '卖出年份',
 'SaleType': '交易类型',
 'SaleCondition': '销售质量'
}
train_df.head()

	Id	MSSubClass	MSZoning	LotFrontage	LotArea	Street	Alley	LotShape	LandContour	Utilities	...	PoolQC	Fence	MiscFeature	MoSold	YrSold	SaleType	SaleCondition	SalePrice
0	1	60	RL	65.0	8450	Pave	NaN	Reg	Lvl	AllPub	...	NaN	NaN	NaN	2	2008	WD	Normal	208500
1	2	20	RL	80.0	9600	Pave	NaN	Reg	Lvl	AllPub	...	NaN	NaN	NaN	5	2007	WD	Normal	181500
2	3	60	RL	68.0	11250	Pave	NaN	IR1	Lvl	AllPub	...	NaN	NaN	NaN	9	2008	WD	Normal	223500
3	4	70	RL	60.0	9550	Pave	NaN	IR1	Lvl	AllPub	...	NaN	NaN	NaN	2	2006	WD	Abnorml	140000
4	5	60	RL	84.0	14260	Pave	NaN	IR1	Lvl	AllPub	...	NaN	NaN	NaN	12	2008	WD	Normal	250000

5 rows × 81 columns

数据预处理

# 判断空值数量
rows = train_df.isnull().sum()
indexs = rows.index

# 显示存在异常值的数据维度
tlen = len(train_df)
names =  [dict_name[indexs[i]] for i in range(len(indexs))]
percents = [ '{:.2%}'.format(x / tlen ) for x in rows.tolist() ] # 计算百分比

missing_data = pd.concat([pd.Series(indexs.tolist()), pd.Series(names),
                          pd.Series(rows.tolist()), pd.Series(percents) ], 
                          axis=1, keys=['name','desc','na','percents']
                        ).sort_values(by = ['percents'], ascending = [False])

missing_data.head(20)

	name	desc	na	percents
72	PoolQC	泳池质量	1453	99.52%
74	MiscFeature	其他类别中未涉及的杂项功能	1406	96.30%
6	Alley	通道类型（Grvi Pave两类+NA）	1369	93.77%
73	Fence	栅栏质量	1179	80.75%
64	GarageCond	车库条件，好几类	81	5.55%
58	GarageType	车库位置（好几类)	81	5.55%
63	GarageQual	车库质量	81	5.55%
60	GarageFinish	车库内部装修	81	5.55%
59	GarageYrBlt	车库建造年份	81	5.55%
57	FireplaceQu	壁炉质量	690	47.26%
35	BsmtFinType2	第二种质量	38	2.60%
32	BsmtExposure	户外或花园水平的地下室墙壁，同上，5类	38	2.60%
30	BsmtQual	地下室的高度:由高到低：5类	37	2.53%
31	BsmtCond	地下室现状：同上，5类	37	2.53%
33	BsmtFinType1	地下室装修区域质量：好几类	37	2.53%
3	LotFrontage	与财产相连的街道的直线尺（234+NA)	259	17.74%
25	MasVnrType	表层砌体类型:(4类+None）	8	0.55%
26	MasVnrArea	砖石面积:(0，各种数字1，300，还有NA）	8	0.55%
42	Electrical	电气系统:好几种	1	0.07%
0	Id	Id	0	0.00%

# LotFrontage, MasVnrArea 异常值处理
#  LotFrontage 平均值
print('平均值 "Age" is %.2f' %(train_df["LotFrontage"].mean(skipna=True)))

# LotFrontage 中位数
print('中位数 "Age" is %.2f' %(train_df["LotFrontage"].median(skipna=True)))

# 平均值替换缺失值
train_df["LotFrontage"] = train_df['LotFrontage'].fillna(train_df['LotFrontage'].mean())
train_df["MasVnrArea"] = train_df['MasVnrArea'].fillna(train_df['MasVnrArea'].mean())

train_df['GarageYrBlt'] = train_df['GarageYrBlt'].fillna(train_df['GarageYrBlt'].mean())
train_df["Electrical"] = train_df["Electrical"].fillna(train_df['Electrical'].value_counts().idxmax())

train_df.isnull().sum().sort_values(ascending=False).head(20)

平均值 "Age" is 70.05
中位数 "Age" is 69.00





PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
FireplaceQu      690
GarageFinish      81
GarageType        81
GarageQual        81
GarageCond        81
BsmtExposure      38
BsmtFinType2      38
BsmtFinType1      37
BsmtCond          37
BsmtQual          37
MasVnrType         8
BedroomAbvGr       0
KitchenAbvGr       0
HalfBath           0
KitchenQual        0
FullBath           0
dtype: int64

# 地下室的高度-类型
print(train_df['BsmtQual'].value_counts())
sns.countplot(x='BsmtQual', data=train_df, palette='Set2') # 使用条形显示每个分箱器中的观察计数
plt.show()

print('地下室的高度 %s.' %train_df['BsmtQual'].value_counts().idxmax())

TA    649
Gd    618
Ex    121
Fa     35
Name: BsmtQual, dtype: int64

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-vwCIWkMW-1678272393937)(house-price_files/house-price_6_1.png)]

地下室的高度 TA.

test_df.isnull().sum().sort_values(ascending=False).head(20)

PoolQC          1456
MiscFeature     1408
Alley           1352
Fence           1169
FireplaceQu      730
LotFrontage      227
GarageYrBlt       78
GarageQual        78
GarageFinish      78
GarageCond        78
GarageType        76
BsmtCond          45
BsmtQual          44
BsmtExposure      44
BsmtFinType1      42
BsmtFinType2      42
MasVnrType        16
MasVnrArea        15
MSZoning           4
BsmtHalfBath       2
dtype: int64

cols = ['LotFrontage', 'OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
       'BsmtFinSF1', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea',
       'FullBath', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars',
       'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'SalePrice']

test_df['GarageYrBlt'] = test_df['GarageYrBlt'].fillna(test_df['GarageYrBlt'].mean())
test_df['MasVnrArea'] = test_df['MasVnrArea'].fillna(test_df['MasVnrArea'].mean())
test_df['GarageCars'] = test_df['GarageCars'].fillna(test_df['GarageCars'].mean())
test_df['GarageArea'] = test_df['GarageArea'].fillna(test_df['GarageArea'].mean())
test_df['BsmtFinSF1'] = test_df['BsmtFinSF1'].fillna(test_df['BsmtFinSF1'].mean())
test_df['TotalBsmtSF'] = test_df['TotalBsmtSF'].fillna(test_df['TotalBsmtSF'].mean())
test_df['LotFrontage'] = test_df['LotFrontage'].fillna(test_df['LotFrontage'].mean())

test_df[cols[:-1]].isnull().sum().sort_values(ascending=False).head(20)

LotFrontage     0
OverallQual     0
WoodDeckSF      0
GarageArea      0
GarageCars      0
GarageYrBlt     0
Fireplaces      0
TotRmsAbvGrd    0
FullBath        0
GrLivArea       0
2ndFlrSF        0
1stFlrSF        0
TotalBsmtSF     0
BsmtFinSF1      0
MasVnrArea      0
YearRemodAdd    0
YearBuilt       0
OpenPorchSF     0
dtype: int64

特征选择

train_df.head()

	Id	MSSubClass	MSZoning	LotFrontage	LotArea	Street	Alley	LotShape	LandContour	Utilities	...	PoolQC	Fence	MiscFeature	MoSold	YrSold	SaleType	SaleCondition	SalePrice
0	1	60	RL	65.0	8450	Pave	NaN	Reg	Lvl	AllPub	...	NaN	NaN	NaN	2	2008	WD	Normal	208500
1	2	20	RL	80.0	9600	Pave	NaN	Reg	Lvl	AllPub	...	NaN	NaN	NaN	5	2007	WD	Normal	181500
2	3	60	RL	68.0	11250	Pave	NaN	IR1	Lvl	AllPub	...	NaN	NaN	NaN	9	2008	WD	Normal	223500
3	4	70	RL	60.0	9550	Pave	NaN	IR1	Lvl	AllPub	...	NaN	NaN	NaN	2	2006	WD	Abnorml	140000
4	5	60	RL	84.0	14260	Pave	NaN	IR1	Lvl	AllPub	...	NaN	NaN	NaN	12	2008	WD	Normal	250000

5 rows × 81 columns

# 计算皮尔森系数 corr
plt.subplots(figsize=(16, 10))
sns.heatmap(train_df[cols].corr(), annot=True, cmap="RdYlGn")
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-KuPkAH6Z-1678272393938)(house-price_files/house-price_12_0.png)]

决策树

from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from matplotlib import pyplot as plt

X_train = train_df[cols[:-1]] # 训练集
y_train = train_df['SalePrice'].values

# train_df[cols].isnull().sum().sort_values(ascending=False).head(20)
dtreg = DecisionTreeRegressor(random_state = 100)
dtreg.fit(X_train, y_train)

# tree.plot_tree(dtreg)

DecisionTreeRegressor(random_state=100)

from sklearn import metrics

# test 
X_test = test_df[cols[:-1]] # 测试集
dtr_pred = dtreg.predict(X_test)
dtr_pred= dtr_pred.reshape(-1,1) # to ndarray

y_test_df = pd.read_csv("./house-prices/sample_submission.csv")
y_test = y_test_df['SalePrice'].values # test label 

print('MAE:', round(metrics.mean_absolute_error(y_test, dtr_pred),4))
print('RMSE:', round(np.sqrt(metrics.mean_squared_error(y_test, dtr_pred)),4))

MAE: 56025.5318
RMSE: 74419.8462

# 可视化显示
dot_data = tree.export_graphviz(dtreg, out_file=None,
                                feature_names=cols[:-1], 
                                filled=True, impurity=False, rounded=True)

import pydotplus
graph = pydotplus.graph_from_dot_data(dot_data)
graph.get_nodes()[7].set_fillcolor('#FFF2DD')

from IPython.display import Image
Image(graph.create_png())

---------------------------------------------------------------------------

InvocationException                       Traceback (most recent call last)

<ipython-input-16-93409461a4a2> in <module>
      9 
     10 from IPython.display import Image
---> 11 Image(graph.create_png())


~/anaconda3/lib/python3.7/site-packages/pydotplus/graphviz.py in <lambda>(f, prog)
   1795             self.__setattr__(
   1796                 'create_' + frmt,
-> 1797                 lambda f=frmt, prog=self.prog: self.create(format=f, prog=prog)
   1798             )
   1799             f = self.__dict__['create_' + frmt]


~/anaconda3/lib/python3.7/site-packages/pydotplus/graphviz.py in create(self, prog, format)
   1958             if self.progs is None:
   1959                 raise InvocationException(
-> 1960                     'GraphViz\'s executables not found')
   1961 
   1962         if prog not in self.progs:


InvocationException: GraphViz's executables not found

plt.figure(figsize=(12,8))
plt.scatter(y_test[:100],dtr_pred[:100],c='green')
plt.plot([120000, 260000], [120000, 260000], 'k--')

plt.xlabel('Y Test')
plt.ylabel('Predicted Y')
plt.show()