房价数据处理与分析

最新推荐文章于 2022-10-10 22:48:13 发布

哈喽朝龙

最新推荐文章于 2022-10-10 22:48:13 发布

阅读量1.2k

点赞数 2

分类专栏：学习经验　文章标签：python、机器学习、数据分析、大数据

本文链接：https://blog.csdn.net/weixin_44309097/article/details/106708241

版权

学习经验专栏收录该内容

18 篇文章 2 订阅

订阅专栏

1、直接上代码

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt #导入matplotlib模块
from warnings import simplefilter #使用 warnings 抑制第三方警告
# Ignore FutureWarning messages from here on.
simplefilter('ignore', FutureWarning)

# Matplotlib text settings: SimHei so Chinese labels display correctly, and
# axes.unicode_minus off so minus signs display correctly.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Load the data file from drive E:.
data = pd.read_csv('E:\\house.csv', encoding='utf-8')
data.head()  # bare expression: the preview is neither stored nor printed here

''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
# Data preprocessing (exploratory analysis)
# Basic summary statistics via describe().
data.describe()
# info() prints the per-column overview used to spot missing values.
data.info()

# Ratio of missing values per column.
data.isnull().mean()

# Price per unit of Size.
data['PerPrice'] = data['Price'].div(data['Size'])
data.head(10)

# Relationship between administrative region and price.
plt.figure(figsize=(15, 8))
sns.boxplot(data=data, x='Region', y='Price').set_title('北京各大区二手房房屋总价')
plt.show()

def _plot_size_overview(frame):
    """Draw the Size distribution next to the Size-vs-Price regression.

    Args:
        frame: DataFrame providing the 'Size' and 'Price' columns.
    """
    figure, (dist_ax, reg_ax) = plt.subplots(1, 2, figsize=(12, 5))
    # Left panel: distribution of Size with a shaded density curve on top.
    # NOTE(review): distplot and kdeplot(shade=...) are deprecated in newer
    # seaborn releases; kept as-is because the seaborn version in use is not
    # visible here -- confirm before migrating to histplot / fill=True.
    sns.distplot(frame['Size'], bins=20, ax=dist_ax, color='r')
    sns.kdeplot(frame['Size'], shade=True, ax=dist_ax)
    # Right panel: Price against Size with a fitted regression line.
    sns.regplot(x='Size', y='Price', data=frame, ax=reg_ax)
    plt.show()


# Size distribution and Size-vs-Price relationship on the unfiltered data.
_plot_size_overview(data)

# Look at the rows at both extremes of Size.
data.loc[data['Size']<10]
data.loc[data['Size']>1000]

# Drop the '叠拼别墅' layout rows and every row whose Size is 1000 or more.
# .copy() makes the result an independent frame, so the in-place column
# assignments performed later in the script do not trigger pandas'
# SettingWithCopyWarning.
data = data[(data['Layout']!="叠拼别墅")&(data['Size']<1000)].copy()

# Same two plots after the Size outliers were handled.
_plot_size_overview(data)
data.head()
# Distribution of layouts.
figure, ax1 = plt.subplots(figsize=(10, 10))
sns.countplot(data=data, y='Layout', ax=ax1)
ax1.set_title('房屋户型')
plt.show()

# Renovation: category counts, then Price per renovation category.
data['Renovation'].value_counts()
plt.figure(figsize=(10, 8))
sns.boxplot(data=data, x='Renovation', y='Price')
plt.show()

# Elevator: first rows, category counts including NaN, and the per-column
# missing ratio before filling.
data['Elevator'].head(10)
data['Elevator'].value_counts(dropna=False)
data.isnull().mean()
data.head(10)

# Fill missing Elevator values from Floor: rows with Floor above 6 get
# '有电梯', rows with Floor of 6 or below get '无电梯'.
elevator_missing = data['Elevator'].isnull()
data.loc[elevator_missing & (data['Floor'] > 6), 'Elevator'] = '有电梯'
data.loc[elevator_missing & (data['Floor'] <= 6), 'Elevator'] = '无电梯'
data['Elevator'].value_counts(dropna=False)
data.head(10)

# Distribution of floors.
plt.figure(figsize=(20, 8))
sns.countplot(data=data, x='Floor')
plt.title('房屋楼层')
plt.show()

# Facing direction: category counts before cleaning.
data['Direction'].value_counts()

''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
# Canonical spellings of the facing directions, grouped by character count.
d_list_one = ['东','西','南','北']
d_list_two = ['东西','东南','东北','西南','西北','南北']
d_list_three = ['东西南','东西北','东南北','西南北']
d_list_four = ['东西南北']


# Helper that normalises one Direction value.
def direct_func(x):
    """Normalise a facing-direction string to a canonical spelling.

    The input is stripped of surrounding whitespace and then handled by its
    length:

    * any repeated character -> ``'no'``;
    * 2 characters not in ``d_list_two`` -> the two characters swapped;
    * 3 characters not in ``d_list_three`` -> the entry of ``d_list_three``
      that contains all three characters, or ``'no'`` when no entry does;
    * 4 characters not in ``d_list_four`` -> ``d_list_four[0]``;
    * anything else -> the stripped input unchanged.

    Args:
        x: Direction text.

    Returns:
        The normalised string, or ``'no'`` when the value cannot be used.

    Raises:
        TypeError: If ``x`` is not a ``str``.
    """
    if not isinstance(x, str):
        raise TypeError
    x = x.strip()
    x_len = len(x)
    # Characters in first-seen order without repeats. dict.fromkeys replaces
    # pd.unique on a plain list, which newer pandas releases deprecate.
    x_list = list(dict.fromkeys(x))
    if x_len != len(x_list):
        return 'no'

    if x_len == 2 and x not in d_list_two:
        return x[1] + x[0]
    if x_len == 3 and x not in d_list_three:
        for n in d_list_three:
            if all(ch in n for ch in x_list):
                return n
        # No canonical entry holds all three characters: mark the value as
        # invalid explicitly instead of falling through and returning None.
        return 'no'
    if x_len == 4 and x not in d_list_four:
        return d_list_four[0]
    return x


# Convert the Direction column with apply().
data['Direction'] = data['Direction'].apply(direct_func)
# Keep only rows whose Direction is neither the 'no' marker nor the literal
# string 'nan'. .copy() detaches the filtered result from the original frame
# so the new columns assigned to it afterwards do not trigger pandas'
# SettingWithCopyWarning.
data = data.loc[(data['Direction']!='no')&(data['Direction']!='nan')].copy()
data['Direction'].value_counts()
data.head()
# Done so far:
# - filled the missing Elevator values (elevator / no elevator)
# - handled the Size outliers
# - tidied the Direction values

# Next: feature engineering (combining columns, numeric encoding, ...).
# New features: room count and hall count.
# The first pattern captures the leading digit of Layout; the second captures
# the first digit that follows it. Raw strings keep '\d' as a regex escape
# rather than an invalid Python string escape (which newer Python versions
# warn about); the pattern text itself is unchanged.
data['Layout_room_num'] = data['Layout'].str.extract(r'(^\d).*', expand=False).astype('int64')
data['Layout_hall_num'] = data['Layout'].str.extract(r'^\d.*?(\d).*', expand=False).astype('int64')
data.head()
data['Layout_room_num'].value_counts()
data['Layout_hall_num'].value_counts()

# Discretise Year (construction year) into 8 quantile bins.
data['Year'][:10]
data['Year'] = pd.qcut(data['Year'],8).astype('object')
data['Year'].value_counts()
data.head()

# New features: total room count and average Size per room.
data['Layout_total_num'] = data['Layout_room_num'] + data['Layout_hall_num']
data['Size_room_ratio'] = data['Size']/data['Layout_total_num']
data.head()

# Drop the columns that are not used as model inputs.
data = data.drop(['Layout','Id','Garden','PerPrice'],axis=1)
data.head()

# One-hot encoding with pd.get_dummies.
# Elevator is encoded first and joined back onto the frame in place of the
# original column.
dummies = pd.get_dummies(data['Elevator'],prefix='Elevator')
data = pd.concat([data,dummies],axis=1)
data = data.drop(['Elevator'],axis=1)
data['Renovation'].value_counts()
# Encode the remaining columns in one pass.
data = pd.get_dummies(data)
data.head()
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
# Model building
# train_test_split from sklearn splits the data into training and test sets;
# R^2 is used to evaluate the models.
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

y = data['Price']
# drop() already returns a new frame, so no explicit copy is needed first.
x = data.drop(['Price'], axis=1)

# (1) Split into 80% training / 20% test data.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,random_state = 33)

# (2) First model: a decision tree (regression tree). It is seeded like the
#     split and the random forest so the reported score is reproducible.
model = DecisionTreeRegressor(random_state=1)

# (3) Train the model.
model.fit(x_train,y_train)

# (4) Predict on the test set.
y_predict = model.predict(x_test)

# (5) Evaluate. The score is computed on the test split, so the label says
#     "test" (it previously said "train").
print('决策树模型 test r2:')
print(r2_score(y_test, y_predict))

# Leaves of a regression tree hold continuous values, whereas leaves of a
# classification tree hold discrete ones.
# A regression leaf is a concrete number; a classification leaf is the class
# predicted from the training samples' labels.
# A regression leaf returns the mean of the group of training samples that
# reach it, not an individually fitted continuous prediction.
rfr = RandomForestRegressor(random_state=1)
rfr.fit(x_train,y_train)
y_predict_rfr = rfr.predict(x_test)
rfr
print('随机森林模型 r2:')
print(r2_score(y_test, y_predict_rfr))
# Next: tune the model parameters to see whether the score improves.
# Hyper-parameter tuning with grid search and K-fold cross-validation:
# - grid search fixes a set of candidate values per parameter up front and
#   tries every combination;
# - K-fold cross-validation splits the data into K equal groups, training on
#   K-1 of them and validating on the remaining one.
from sklearn.model_selection import GridSearchCV
tuned_parameters = {'n_estimators': [100,150], 'max_depth': [25,40]}
rfr = RandomForestRegressor(random_state=1)
clf = GridSearchCV(rfr, tuned_parameters, cv=5, scoring='r2', n_jobs=-1, verbose=1)
clf.fit(x_train, y_train)
print(clf.best_estimator_)

# The tuned model.
clf.best_estimator_

# Predict with the tuned model. GridSearchCV uses refit=True by default, so
# best_estimator_ has already been fitted on x_train / y_train with the best
# parameters; fitting it a second time would only repeat that work.
rfr_after = clf.best_estimator_
y_predict_after = rfr_after.predict(x_test)
print('调参数后的随机森林模型 r2:')
print(r2_score(y_test, y_predict_after))

# Compare the first predicted prices with the first actual prices.
y_predict_after[:10].astype(int)
y_test.values[:10].astype(int)

2、结果展示
在这里插入图片描述

哈喽朝龙

关注

2
点赞
踩
15

收藏

觉得还不错? 一键收藏
打赏
2
评论
房价数据处理与分析

1、直接上代码import pandas as pdimport seaborn as snsimport matplotlib.pyplot as plt #导入matplotlib模块from warnings import simplefilter #使用 warnings 抑制第三方警告simplefilter(action='ignore', category=FutureWarning)plt.rcParams['font.sans-serif']=['SimHei'] ## 用于
复制链接

扫一扫