河南郑州二手房房价预测和分析_郑州市二手房房价预测python-CSDN博客

本文链接：https://blog.csdn.net/moasad/article/details/122343625

本文深入探讨了河南郑州二手房市场的房价走势，涉及数据爬取、预处理、异常值处理和可视化分析。内容包括房价分布、区域房价对比、热门社区街道排行及影响房价的关键因素研究，为房产投资者提供决策依据。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

课程大作业

河南郑州二手房房价预测和分析

爬取数据
加载库
查看数据
数据预处理
数据可视化分析
对各个区域房价影响最大的因素

爬取数据

这里参照别人的博客
我对代码做了一些改动，因为不同的地方，网页节点会有细微的变化。所以我改了下xpath，以及其他地方
代码：

import pandas as pd # 数据存储
import requests # 网页内容获取
import re # 解析数据
from lxml import etree # 解析数据
import random
import time # 反反爬
from fastprogress import master_bar,progress_bar # 进度条显示


def ua():
    """Build request headers carrying a randomly chosen browser User-Agent.

    return: dict with the single key ``'User-Agent'``
    """
    candidates = (
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
        'Opera/9.25 (Windows NT 5.1; U; en)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
        'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
        'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
        'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7',
        'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0',
    )
    # A different identity per request makes the crawler harder to fingerprint.
    return {'User-Agent': random.choice(candidates)}


def get(url, timeout=10):
    """
    Download a page and return its source.

    url: address of the page to fetch
    timeout: seconds to wait for the server before giving up; without a
        timeout ``requests`` may block forever on a stalled connection
    return: page source as text
    """
    res = requests.get(url=url, headers=ua(), timeout=timeout)
    return res.text


def get_url(res_text):
    """
    Collect the detail-page links found in a listing page.

    res_text: source of a listing page
    return: list of detail-page URLs in document order (the original author
        notes 30 per page)
    """
    # Only anchors with an empty class attribute are captured.
    link_pattern = re.compile('<a class="" href="(.*?)" target="_blank"')
    return link_pattern.findall(res_text)


def get_else_data(res_text):
    """Extract only the listing title from a detail page.

    res_text: source of a detail page
    return: dict ``{'标题': <list of matched title attributes>}``; the value is
        the raw XPath result list, not a single string
    """
    tree = etree.HTML(res_text)
    return {'标题': tree.xpath("//div[@class='sellDetailHeader']//h1/@title")}


def get_data(res_text):
    """Parse a listing detail page into a flat dict of its attributes.

    res_text: source of a detail page
    return: dict mapping field name to value; the fixed fields come first,
        followed by the "base" and "transaction" label/value pairs as they
        appear on the page
    raises: IndexError when one of the mandatory nodes is missing
    """
    tree = etree.HTML(res_text)

    # Title is kept as the raw XPath result list (not indexed).
    title = tree.xpath("//div[@class='sellDetailHeader']//h1/@title")
    # Total price and price per square metre.
    total_price = tree.xpath("//div[@class='overview']//div[@class='price ']/span/text()")[0]
    price = tree.xpath("//div[@class='overview']//div[@class='unitPrice']/span/text()")[0]

    # Community name, then the two location links (district, street).
    communityName = tree.xpath("//div[@class='aroundInfo']//div/a/text()")[0]
    location_links = tree.xpath("//div[@class='areaName']//span/a/text()")
    regionName = location_links[0]
    areaName = location_links[1]

    # Basic-information section: labels and their values.
    base_labels = tree.xpath("//div[@class='base']//span/text()")
    base_values = tree.xpath("//div[@class='base']//li/text()")

    # Transaction section: labels and their values.
    trans_labels = tree.xpath("//div[@class='transaction']//span[1]//text()")
    trans_values = tree.xpath("//div[@class='transaction']//span[2]//text()")

    field_names = ['标题', '总价格', '单价', '小区', '房屋所属市辖区', '房屋地址（街道）']
    field_values = [title, total_price, price, communityName, regionName, areaName]

    # zip pairs labels with values positionally and stops at the shorter list.
    return dict(zip(field_names + base_labels + trans_labels,
                    field_values + base_values + trans_values))


def main(qu, start_pg=1, end_pg=100, download_times=1):
    """Crawl Lianjia second-hand listings for the given Zhengzhou districts.

    Each district is written to its own file
    ``./二手房-<district>第<download_times>次下载.csv``. The file is rewritten
    after every 5th page and once more when the district is finished, so the
    trailing pages of a district are no longer lost.

    qu: list of district slugs (pinyin) as they appear in the Lianjia URL
    start_pg: first listing page to crawl
    end_pg: last listing page to crawl, capped at the district's known page count
    download_times: run counter, only used in the output file name
    """
    # Districts already downloaded in an earlier run (the network dropped
    # midway); they are skipped to avoid crawling them twice.
    finish_city = {
        'erqiqu', 'zhengdongxinqu', 'xingyangshi', 'xinzhengshi', 'shangjiequ', 'gongyishi',
        'xinmishi', 'dengfengshi', 'zhongmuxian', 'jingkaiqu', 'gaoxinqu18', 'hangkonggangqu',
    }
    # Listing pages per district. Keyed by slug so the lookup stays correct
    # when only a subset of districts is passed in.
    page_count = {
        'erqiqu': 100, 'zhengdongxinqu': 100, 'xingyangshi': 82, 'xinzhengshi': 100,
        'shangjiequ': 23, 'gongyishi': 1, 'xinmishi': 1, 'dengfengshi': 1,
        'zhongmuxian': 59, 'jingkaiqu': 54, 'gaoxinqu18': 100, 'hangkonggangqu': 42,
        'zhongyuanqu1': 100, 'guanchenghuizuqu': 100, 'huijiqu': 100, 'jinshuiqu1': 100,
    }
    city = 'zz'  # Zhengzhou sub-domain
    for q in qu:
        if q in finish_city:
            continue
        last_pg = min(end_pg, page_count.get(q, end_pg))
        # Listing index of this district; the page number is appended below.
        url = 'https://' + city + '.lianjia.com/ershoufang/' + q + '/pg'
        # Rows collected for this district.
        data = []
        # Output file for this district.
        filename = './二手房-' + q + '第' + str(download_times) + '次下载.csv'

        print('二手房-' + q + '第' + str(download_times) + '次下载')
        mb = master_bar(range(start_pg, last_pg + 1))

        for i in mb:
            new_url = url + str(i) + '/'
            print(new_url)

            # Detail-page links on this listing page; a failed listing request
            # skips the page instead of aborting the whole crawl.
            try:
                url_list = get_url(get(new_url))
            except Exception as e:
                mb.write('列表页获取失败，已跳过: ' + new_url + ' ' + repr(e))
                url_list = []

            for l in progress_bar(range(len(url_list)), parent=mb):
                # Anti-blocking: pause at irregular intervals.
                a = random.randint(2, 5)
                if l % a == 0:
                    time.sleep(2 * random.random())
                try:
                    data.append(get_data(get(url_list[l])))
                except Exception:
                    # Back off, then retry once; a second failure skips the
                    # listing rather than discarding everything collected.
                    time.sleep(100 * random.random())
                    try:
                        data.append(get_data(get(url_list[l])))
                    except Exception as e:
                        mb.write('详情页获取失败，已跳过: ' + url_list[l] + ' ' + repr(e))

                time.sleep(5 * random.random())
                mb.child.comment = '正在爬取第' + str(l + 1) + '条数据!!'
            mb.main_bar.comment = '正在爬取第' + str(i + 1) + '页数据!!'

            time.sleep(5 * random.random())

            if i % 5 == 0:
                # Checkpoint every 5 pages.
                pd.DataFrame(data).to_csv(filename)
                mb.write('前' + str(i) + '页数据已保存')

        # Final save for this district, covering pages after the last checkpoint.
        if data:
            pd.DataFrame(data).to_csv(filename)


# Display names of the districts, listed in the same order as the slugs below.
area = [
    '二七区', '郑东新区', '荥阳市', '新郑市',
    '上街区', '巩义市', '新密市', '登封市',
    '中牟县', '经开区', '高新区', '航空港区',
    '中原区', '管城回族区', '惠济区', '金水区',
]
# District slugs used in the Lianjia URLs.
area1 = [
    'erqiqu', 'zhengdongxinqu', 'xingyangshi', 'xinzhengshi',
    'shangjiequ', 'gongyishi', 'xinmishi', 'dengfengshi',
    'zhongmuxian', 'jingkaiqu', 'gaoxinqu18', 'hangkonggangqu',
    'zhongyuanqu1', 'guanchenghuizuqu', 'huijiqu', 'jinshuiqu1',
]
# Arguments: districts, first page, last page, run counter (used in the file name).
# main(['xingyangshi'],70,100,2)
main(area1, start_pg=1, end_pg=100, download_times=1)

在这里插入图片描述

加载库

#首先加载相关的库
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore') # suppress all warnings
plt.rcParams['font.sans-serif'] = 'SimHei'  # font used so Chinese text renders in the plots
# plt.rcParams['font.sans-serif']=['simhei']
# plt.rcParams['axes.unicode_minus'] = False

查看数据

# Load the data set and inspect its overall structure and dtypes.
data_all = pd.read_csv('total_data.csv')

# sns.set_style('whitegrid') # optional plot theme
data_all.head() # first 5 rows

# Summary of columns, non-null counts and dtypes.
data_all.info()

在这里插入图片描述

数据预处理

删除不需要分析的列

部分列对二手房房价的影响不大，因此删除这些不需要分析的列

# Columns that are not used in the analysis below.
drop_col = [
    'id','house_book','house_listing_time','house_property','house_mortgage_info',
    'house_inner_area'
]
data_all.drop(columns=drop_col, axis=1, inplace=True)
# Summary after dropping the columns.
data_all.info()

在这里插入图片描述

对数据进行去重

# Count fully duplicated rows (True) versus unique rows (False).
data_all.duplicated().value_counts()

在这里插入图片描述

# Remove duplicated rows in place, then renumber the rows 0..n-1 so the index
# has no gaps left by the removed rows.
data_all.drop_duplicates(inplace=True)
data_all.reset_index(drop=True, inplace=True)

# Re-check: only False should remain.
data_all.duplicated().value_counts()

处理缺失值

将‘暂无数据’和‘空白’给赋值为np.nan，否则处理连续值会有问题
离散值中空值处理为暂无数据

# Number of missing values per column.
data_all.isna().sum()

在这里插入图片描述

# (house_inner_area was dropped above, so it needs no treatment here.)

# Placeholder written into each categorical column where the value is missing.
missing_placeholder = {
    'house_structure': "暂无数据",
    'house_building_type': "暂无数据",
    'house_building_structure': "未知结构",
    'house_decoration': "其他",
    'house_elevator_sytle': "暂无数据",
    'house_heating_type': "暂无数据",
    'elevator': "暂无数据",
    'house_layout': "暂无数据",
}
for column_name, placeholder in missing_placeholder.items():
    data_all.loc[data_all[column_name].isna(), column_name] = placeholder

# Last-transaction dates marked "暂无数据" get the sentinel 2050/1/1 so the
# later date conversion does not fail on them.
no_date = data_all.house_last_time == "暂无数据"
data_all.loc[no_date, 'house_last_time'] = "2050/1/1"

data_all.isna().sum()

文本数据清理

处理面积后面带㎡的问题，并且转化成float类型
处理楼层，把所处楼层和总楼层分开，并且加入到原数据中
将几室几厅几卫分开存储
将朝向描述（去掉空格后）长度大于等于3个字符的记录统一标记为“暂无数据”
上次交易日期只保留年份，并剔除2000—2021年以外的记录

这里可以先看一下数据的具体情况

# data_all.house_area.value_counts()
# data_all.house_region.value_counts()
# data_all.house_address.value_counts()
# data_all.house_layout.value_counts()
# data_all.house_rental_area.value_counts()
# data_all.house_structure.value_counts()
# data_all.house_building_type.value_counts()
# data_all.house_orientation.value_counts()
# data_all.house_building_structure.value_counts()
# data_all.house_decoration.value_counts()
# data_all.house_elevator_sytle.value_counts()
# data_all.house_heating_type.value_counts()
# data_all.elevator.value_counts()
# data_all.house_transaction_type.value_counts()
# data_all.house_last_time.value_counts()
# data_all.house_useage.value_counts()
# data_all.house_years.value_counts()
# data_all.house_mortgage_info.value_counts()

# Strip the trailing "㎡" from the floor area and convert it to float.
data_all['house_rental_area'] = data_all['house_rental_area'].str.replace('㎡', '', regex=False)
data_all['house_rental_area'] = data_all['house_rental_area'].astype('float')

# Split the floor description at "(" into the position part and the
# total-floor part, and store both as new columns.
# expand=True returns the pieces as separate columns instead of a list per row.
temp0 = data_all['house_floor'].str.split('(', expand=True)
data_all.insert(loc=21, column='house_floor_position', value=temp0[0])
data_all.insert(loc=22, column='house_total_floor', value=temp0[1])
data_all.house_floor_position = data_all.house_floor_position.str.replace(" ", "", regex=False)
# regex=False: ")" must be treated as a literal character, never as a pattern.
data_all.house_total_floor = (
    data_all.house_total_floor
    .str.replace("共", "", regex=False)
    .str.replace(")", "", regex=False)
    .str.replace("层", "", regex=False)
    .str.replace(" ", "", regex=False)
)
data_all.drop(['house_floor'], axis=1, inplace=True)
# Total number of floors as float.
data_all.house_total_floor = data_all.house_total_floor.astype("float")

# 这里后面没有使用了，所以注释掉了
# 将几室几厅几卫分开存储
# temp1 = data_all.house_layout.str.split("室", expand=True)
# temp2 = temp1[1].str.split("厅", expand=True)
# temp3 = temp2[1].str.split("厨", expand=True)
# temp4 = temp3[1].str.replace("卫", "")
# print(temp1)
# print(temp2)
# print(temp3)
# print(temp4)
# data_all.insert(loc=25, column='house_layout_room', value=temp1[0])
# data_all.insert(loc=26, column='house_layout_livingroom', value=temp2[0])
# data_all.insert(loc=27, column='house_layout_kitchen', value=temp3[0])
# data_all.insert(loc=28, column='house_layout_toilet', value=temp4)
#转换为float
# data_all.house_layout_room = data_all.house_layout_room.astype("float")
# data_all.house_layout_livingroom = data_all.house_layout_livingroom.astype("float")
# data_all.house_layout_kitchen = data_all.house_layout_kitchen.astype("float")
# data_all.house_layout_toilet = data_all.house_layout_toilet.astype("float")

# Remove spaces from the orientation text; entries that are still 3 or more
# characters long are replaced with the placeholder "暂无数据".
orientation_compact = data_all['house_orientation'].str.replace(' ', '')
data_all['house_orientation'] = orientation_compact
too_long = data_all.house_orientation.str.len() >= 3
data_all.loc[too_long, "house_orientation"] = "暂无数据"

# Keep only rows whose last transaction year lies in 2000-2021, then reduce the
# date to its year.
# The year range is tested numerically: the previous hand-written list of year
# strings was missing two commas, which fused '2004'+'2005' and '2014'+'2015'
# into single strings and therefore dropped every row from those four years.
last_trade_year = pd.to_datetime(data_all['house_last_time']).dt.year
# Rows with an unparseable-as-date (NaT) value or the 2050 sentinel fall outside the range.
ind = data_all.index[~last_trade_year.between(2000, 2021)]
data_all.drop(ind, axis=0, inplace=True)
data_all.index = range(len(data_all))
# Keep the year only.
data_all['house_last_time'] = pd.to_datetime(data_all.house_last_time).dt.to_period('Y')


data_all.info()

在这里插入图片描述
查看数据describe

# Print the descriptive statistics of every column, one column at a time.
for item in data_all.columns:
    data_all_describe = data_all[item].describe()
    print(f'列名: {item}')
    print(data_all_describe, end='\n\n')

在这里插入图片描述

异常值处理

# One box plot per measure, grouped by district, to spot outliers visually.
for measure, heading in (
        ("unit_price", "郑州各区域的单位房价箱线图"),
        ("house_rental_area", "郑州各区域的建筑面积箱线图"),
):
    plt.figure(figsize=(10, 10), facecolor='white')  # white canvas background
    sns.boxplot(x="house_region", y=measure, data=data_all)
    plt.title(heading)
    plt.tight_layout()

在这里插入图片描述

删除房价过高的异常值
这些值大概率是豪宅，但是豪宅数量太少，如果保留会对总体的趋势造成较大的影响
同理也删除面积过大的数据

city = ['二七区','高新区','管城回族区','航空港区','惠济区','金水区','经开区',
        '上街区','新郑市','荥阳市','郑东新区','中牟县','中原区']
# Per-district upper limits, chosen by eye from the box plots; entries are
# aligned with `city` by position.
max_unit_price = [40000,30000,26000,20000,40000,38000,32000,18000,30000,30000,52000,30000,38000]
max_house_rental_area = [260,220,250,210,360,320,310,210,360,300,300,300,270]

# First remove listings above the district's unit-price limit ...
for region, price_cap in zip(city, max_unit_price):
    over_price = (data_all['house_region'] == region) & (data_all['unit_price'] > price_cap)
    data_all.drop(data_all[over_price].index, inplace=True)
    data_all.index = range(len(data_all))

# ... then listings above the district's floor-area limit.
for region, area_cap in zip(city, max_house_rental_area):
    over_area = (data_all['house_region'] == region) & (data_all['house_rental_area'] > area_cap)
    data_all.drop(data_all[over_area].index, inplace=True)
    data_all.index = range(len(data_all))

data_all.describe()

在这里插入图片描述

数据可视化分析

房价分布情况

plt.figure(figsize=(16, 8),frameon = True,facecolor='white')

# Left pie: distribution of the total price.
plt.subplot(1, 2, 1)
bins = [0, 30, 60, 90, 120, 150, 180, 300, 500,1200]
labels = ['< 30','< 60', '< 90','< 120','< 150','< 180','< 300','< 500','< 1200']
# right=False makes the intervals closed on the left and open on the right
# (the default is open-left / closed-right); each value gets its bin's label.
data_hourse_price = pd.cut(data_all.total_price, bins, right=False, labels=labels)
# sort=False keeps the counts in bin order. The default sorts by frequency,
# which would pair the wedges with the wrong entries of `labels`/`explode`.
values = data_hourse_price.value_counts(sort=False)
explode = [0, 0, 0,0,0,0,0,0.1,0.2]
plt.pie(values, labels=labels, autopct='%1.1f%%',explode=explode)
plt.title('总房价分布情况',fontsize=30)
plt.legend(labels,loc="upper left")

# Right pie: distribution of the unit price.
plt.subplot(1, 2, 2)
bins = [0, 6000, 12000, 18000, 24000, 40000,50000]
labels = ['< 6000','< 12000', '< 18000','< 24000','< 40000','< 50000']
data_unit_price = pd.cut(data_all.unit_price, bins, right=False, labels=labels)
values = data_unit_price.value_counts(sort=False)  # bin order, see above
explode = [0,0,0,0,0.1,0.2]
plt.pie(values, labels=labels, autopct='%1.1f%%',explode=explode)
plt.legend(labels,loc="upper left")
plt.title("单位房价分布情况", fontsize=30)

plt.show()

在这里插入图片描述

# Histogram combined with a kernel-density curve for each price measure,
# drawn side by side; bins=50 sets the number of histogram bars.
plt.figure(figsize=(12, 4), facecolor='white')
for position, (measure, heading) in enumerate(
        [('unit_price', "单位房价分布"), ('total_price', "总房价分布")], start=1):
    plt.subplot(1, 2, position)
    sns.distplot(data_all[measure], bins=50)
    sns.despine()  # by default removes the top and right spines
    plt.title(heading)
    plt.ylabel("Frequency")
plt.tight_layout()  # adjust spacing between the subplots and their labels

# The plots show in which price range the second-hand listings concentrate.

在这里插入图片描述

各区域的整体情况

# Districts ranked by mean total price (top) and mean unit price (bottom).
# The price column is selected before averaging: averaging every column would
# fail on the text columns in pandas >= 2.0.
plt.figure(figsize=(10, 8),facecolor='white')
region_groups = data_all.groupby("house_region")
plt.subplot(2, 1, 1)
region_groups["total_price"].mean().sort_values(ascending=False).plot(kind="bar")
sns.despine()
plt.title('各区平均总房价排名')
plt.subplot(2, 1, 2)
region_groups["unit_price"].mean().sort_values(ascending=False).plot(kind="bar")
sns.despine()
plt.title('各区平均单位房价排名')
plt.tight_layout()

在这里插入图片描述

各区域房价的分布

# Box plots of unit price (top) and total price (bottom) per district.
plt.figure(figsize=(15, 20), facecolor='white')
for row, (measure, heading) in enumerate(
        [("unit_price", "郑州各区域的单位房价分布情况"),
         ("total_price", "郑州各区域的总房价分布情况")], start=1):
    plt.subplot(2, 1, row)
    sns.boxplot(x="house_region", y=measure, data=data_all)
    plt.title(heading)
plt.tight_layout()

在这里插入图片描述

各区域单位房价前五名的社区

# Mean of the numeric columns per (district, community), then the 5 communities
# with the highest mean unit price in each district.
# numeric_only=True: the frame also holds text columns, which cannot be averaged.
data_all.groupby(['house_region', 'house_area']).mean(numeric_only=True).reset_index().groupby('house_region').apply(lambda x:x.sort_values('unit_price',ascending=False)[0:5])

在这里插入图片描述

各区域单位房价前五街道

# Mean of the numeric columns per (district, street), then the 5 streets with
# the highest mean unit price in each district.
# numeric_only=True: the frame also holds text columns, which cannot be averaged.
data_all.groupby(['house_region', 'house_address']).mean(numeric_only=True).reset_index().groupby('house_region').apply(lambda x:x.sort_values('unit_price',ascending=False)[0:5])

在这里插入图片描述

郑州市房价前20的社区

# Top-20 communities by mean total price (top) and mean unit price (bottom).
plt.figure(figsize=(10, 8),facecolor='white')
community_groups = data_all.groupby("house_area")
plt.subplot(2, 1, 1)
community_groups["total_price"].mean().sort_values(ascending=False)[0:20].plot(kind="bar")
# The unit was previously assigned to an unused variable and never shown.
plt.ylabel('万元/套')
sns.despine()
plt.title('郑州市总房价前20名社区')

plt.subplot(2, 1, 2)
community_groups["unit_price"].mean().sort_values(ascending=False)[0:20].plot(kind="bar")
plt.ylabel('元/平方米')
sns.despine()
plt.title('郑州市单位房价前20名社区')
plt.tight_layout()

在这里插入图片描述

郑州市房价前20的街道

# Top-20 streets by mean total price (top) and mean unit price (bottom).
plt.figure(figsize=(10, 8),facecolor='white')
street_groups = data_all.groupby("house_address")
plt.subplot(2, 1, 1)
street_groups["total_price"].mean().sort_values(ascending=False)[0:20].plot(kind="bar")
# The unit was previously assigned to an unused variable and never shown.
plt.ylabel('万元/套')
sns.despine()
plt.title('郑州市总房价前20名街道')


plt.subplot(2, 1, 2)
street_groups["unit_price"].mean().sort_values(ascending=False)[0:20].plot(kind="bar")
plt.ylabel('元/平方米')
sns.despine()
plt.title('郑州市单位房价前20名街道')
plt.tight_layout()

在这里插入图片描述

各区域房源数量分析

# Number of listings per district, bars ordered from most to fewest listings.
plt.figure(figsize=(15, 8), facecolor='white')
region_order = data_all['house_region'].value_counts().sort_values(ascending=False).index
sns.countplot(x='house_region', data=data_all, order=region_order)
plt.title('各区域房源数量')
sns.despine()

在这里插入图片描述

不同房型的房源数量与房价分析

# Per layout: number of listings plus mean unit price, total price and area,
# sorted by the number of listings; the 10 most common layouts are plotted.
plt.figure(figsize=(10, 10),facecolor='white')
# house_address only serves as a column to count rows with; it is renamed on a
# fresh copy rather than in place on a slice of data_all.
temp = data_all[['house_layout','unit_price','total_price','house_rental_area','house_address']].rename(
    columns={'house_address':'house_temp'})
# String aggregation names avoid the deprecated use of NumPy callables in agg.
temp1 = temp.groupby('house_layout').agg(
    {"house_temp": "count", "unit_price": "mean", "total_price": "mean", "house_rental_area": "mean"}
).sort_values(by='house_temp', ascending=False)
temp1['house_temp'].iloc[0:10].plot(kind = 'bar')
plt.title('房源中各房型数量前10名')
plt.ylabel("数量/套")
sns.despine()
plt.tight_layout()

在这里插入图片描述

# Mean unit price (line, top) and mean total price (bars, bottom) of the
# 10 most common layouts.
top10_layouts = temp1.iloc[0:10]
plt.figure(figsize=(10, 10), facecolor='white')

plt.subplot(2, 1, 1)
plt.plot(top10_layouts['unit_price'])
plt.title('房源中各房型数量前10名的平均单位房价')
plt.ylabel("平均单位房价(元/㎡)")
sns.despine()

plt.subplot(2, 1, 2)
top10_layouts['total_price'].plot(kind='bar')
plt.title('房源中各房型数量前10名的平均总房价')
plt.ylabel("平均总房价/万元")
sns.despine()
plt.tight_layout()

在这里插入图片描述

# Mean floor area of the 10 most common layouts.
plt.figure(figsize=(10, 10), facecolor='white')
top10_area = temp1['house_rental_area'].iloc[0:10]
top10_area.plot(kind='bar')
plt.title('房源中各房型数量前10名的平均建筑面积')
plt.ylabel("建筑面积/㎡")
sns.despine()
plt.tight_layout()

在这里插入图片描述

各区域房源平均面积分析

# Mean floor area per district, largest first.
# The column is selected before averaging: averaging every column would fail
# on the text columns in pandas >= 2.0.
plt.figure(figsize=(10, 10),facecolor='white')
data_all.groupby('house_region')['house_rental_area'].mean().sort_values(ascending=False).plot(kind='bar')
plt.ylabel("面积/㎡")
sns.despine()
plt.title('各区域房源平均面积')

在这里插入图片描述

# Number of villa listings per district, most first.
plt.figure(figsize=(10, 8), facecolor='white')
villas = data_all[data_all.house_useage == '别墅']
villas_per_region = villas.groupby('house_region')['title'].count()
villas_per_region.sort_values(ascending=False).plot(kind='bar')
sns.despine()
plt.ylabel("房屋数量/套")
plt.title("各区域别墅数量")
plt.tight_layout()

在这里插入图片描述

# Number of listings smaller than 80 square metres per district, most first.
plt.figure(figsize=(10, 10), facecolor='white')
small_homes = data_all[data_all['house_rental_area'] < 80]
small_per_region = small_homes.groupby('house_region')['title'].count()
small_per_region.sort_values(ascending=False).plot(kind='bar')
plt.ylabel("房屋数量/套")
plt.title("各区域小户型数量")
sns.despine()

在这里插入图片描述

房屋朝向对单位房价的影响

朝南房子采光会好一些,朝北房子则相反

# Listings count, mean unit price and mean total price for the eight basic
# orientations. The filter and grouping are built once and reused.
plt.figure(figsize=(12, 12),facecolor='white')
main_orientations = ['东', '南', '西', '北', '东南', '东北', '西南', '西北']
orientation_groups = data_all[data_all.house_orientation.isin(main_orientations)].groupby('house_orientation')

plt.subplot(3, 1, 1)
orientation_groups['house_region'].count().plot(kind='bar')
plt.title("各类朝向房子的数量")
plt.ylabel("数量/套")

plt.subplot(3, 1, 2)
orientation_groups['unit_price'].mean().plot(kind='bar')
plt.title("各类朝向房子的平均单位房价")
plt.ylabel('单位房价(元/㎡)')

plt.subplot(3, 1, 3)
orientation_groups['total_price'].mean().plot(kind='bar')
plt.title("各类朝向房子的总价")
# This panel shows total price; it previously carried the unit-price label.
plt.ylabel('总房价/万元')

sns.despine()
plt.tight_layout()

在这里插入图片描述

二手房中房屋用途的比例

# Number of listings for each kind of property usage.
plt.figure(figsize=(8, 6), facecolor='white')
plt.subplot(2, 1, 1)
usage_counts = data_all.groupby('house_useage')['title'].count()
usage_counts.plot(kind='bar')
plt.ylabel("数量/套")
plt.title("各类房屋用途的房子数量")

在这里插入图片描述

装修对房价的影响

# Listings count (top) and mean unit price (bottom) per decoration level.
# The price column is selected before averaging: averaging every column would
# fail on the text columns in pandas >= 2.0.
plt.figure(figsize=(8, 6),facecolor='white')
decoration_groups = data_all.groupby('house_decoration')
plt.subplot(2, 1, 1)
decoration_groups['title'].count().plot(kind='bar')
plt.ylabel("数量/套")
plt.title("各类装修的房子数量")

plt.subplot(2, 1, 2)
decoration_groups['unit_price'].mean().plot(kind='bar')
plt.ylabel("平均单位房价(元/㎡)")
plt.title("各类装修的平均单位房价")
sns.despine()
plt.tight_layout()

# train.groupby("house_region").agg(np.mean).total_price.sort_values(ascending=False).plot(kind="bar", cmap='rainbow')

在这里插入图片描述

有无电梯对房价的影响

# Listings count (top) and mean unit price (bottom) by elevator availability.
# The price column is selected before averaging: averaging every column would
# fail on the text columns in pandas >= 2.0.
plt.figure(figsize=(8, 6),facecolor='white')
elevator_groups = data_all.groupby('elevator')
plt.subplot(2, 1, 1)
elevator_groups['title'].count().plot(kind='bar')
plt.ylabel("数量/套")
plt.title("有无电梯的房子数量")

plt.subplot(2, 1, 2)
elevator_groups['unit_price'].mean().plot(kind='bar')
plt.ylabel("平均每单位房价（元/㎡）")
plt.title("有无电梯的平均单位房价")

sns.despine()
plt.tight_layout()

在这里插入图片描述

住宅类型对房价的影响

# Listings count (top) and mean unit price (bottom) per house_transaction_type.
# The price column is selected before averaging: averaging every column would
# fail on the text columns in pandas >= 2.0.
plt.figure(figsize=(8, 6),facecolor='white')
transaction_groups = data_all.groupby('house_transaction_type')
plt.subplot(2, 1, 1)
transaction_groups['title'].count().plot(kind='bar')
plt.title("各住宅类型的房子数量")
plt.ylabel("数量/套")
plt.subplot(2, 1, 2)
transaction_groups['unit_price'].mean().plot(kind='bar')
plt.title("各住宅类型的平均单位房价")
plt.ylabel("平均单位房价(元/㎡)")
sns.despine()
plt.tight_layout()

在这里插入图片描述

房屋结构对房价的影响

# Listings count (top) and mean unit price (bottom) per building structure.
# The price column is selected before averaging: averaging every column would
# fail on the text columns in pandas >= 2.0.
plt.figure(figsize=(10, 8),facecolor='white')
structure_groups = data_all.groupby('house_building_structure')
plt.subplot(2, 1, 1)
structure_groups['title'].count().plot(kind='bar')
plt.title("各类房屋结构的房子数量")
plt.ylabel("数量/套")

plt.subplot(2, 1, 2)
structure_groups['unit_price'].mean().plot(kind='bar')
plt.title("各类房屋结构的平均单位房价")
plt.ylabel("平均单位房价(元/㎡)")
sns.despine()
plt.tight_layout()

在这里插入图片描述

房子所在楼层位置对房价的影响

# Listings count (top) and mean unit price (bottom) per floor position.
# The price column is selected before averaging: averaging every column would
# fail on the text columns in pandas >= 2.0.
plt.figure(figsize=(8, 6),facecolor='white')
position_groups = data_all.groupby('house_floor_position')
plt.subplot(2, 1, 1)
position_groups['title'].count().plot(kind='bar')
plt.title("房子所处楼层位置各类别的数量")
plt.ylabel("数量/套")
plt.subplot(2, 1, 2)

position_groups['unit_price'].mean().plot(kind='bar')
plt.title("房子所处楼层位置的平均单位房价")
plt.ylabel("平均单位房价(元/㎡)")
sns.despine()
plt.tight_layout()

在这里插入图片描述

上次交易时间和房价的关系

# Listings count (bars, top) and mean unit price (line, bottom) per year of the
# last transaction.
# The price column is selected before averaging: averaging every column would
# fail on the text columns in pandas >= 2.0.
plt.figure(figsize=(8, 6),facecolor='white')
year_groups = data_all.groupby('house_last_time')
plt.subplot(2, 1, 1)
year_groups['title'].count().plot(kind='bar')
plt.title("不同年份房套子发布的数量")
plt.ylabel("数量/套")
plt.subplot(2, 1, 2)
year_groups['unit_price'].mean().plot()
plt.title("不同年份的平均单位房价")
plt.ylabel("平均单位房价(元/㎡)")
sns.despine()
plt.tight_layout()

在这里插入图片描述

房价与楼层总数之间的关系

# Listings count (bars, top) and mean unit price (line, bottom) per total
# number of floors of the building.
# The price column is selected before averaging: averaging every column would
# fail on the text columns in pandas >= 2.0.
plt.figure(figsize=(12, 6),facecolor='white')
floor_groups = data_all.groupby('house_total_floor')
plt.subplot(2, 1, 1)
floor_groups['title'].count().plot(kind='bar')
plt.title("各楼层总数的房子数量")
plt.ylabel("数量/套")

plt.subplot(2, 1, 2)
floor_groups['unit_price'].mean().plot()
plt.title("各楼层总数的房子平均单位房价")
plt.ylabel("平均单位房价(元/㎡)")
sns.despine()
plt.tight_layout()

在这里插入图片描述

对各个区域房价影响最大的因素

对每一个区域分别进行分析：对某一个特征，先计算其各个类别的平均单位房价，再用其中的最大值减去最小值得到该特征的极差。依次计算完所有特征的极差后，极差最大的特征即为对该区域房价影响最大的因素。
查看每一个区域的极差值，可以发现影响最大因素为：
上街区 : 楼层位置
中原区 : 建筑结构
中牟县 : 房屋用途
二七区 : 朝向
惠济区 : 建筑结构
依此类推。

# Columns needed for the per-district factor analysis: the price, the district
# and the six categorical factors.
data_local = data_all[[
    'unit_price', 'house_region',
    'house_orientation', 'house_decoration', 'house_floor_position',
    'elevator', 'house_useage', 'house_building_structure',
]]
# Result column names: best category, worst category and spread for each
# factor, followed by the winning factor and its spread.
list_name = [
    factor_name + suffix
    for factor_name in ('house_orientation', 'house_decoration', 'house_floor_position',
                        'elevator', 'house_useage', 'house_building_structure')
    for suffix in ('_best', '_worst', '_diff')
] + ['max_factor', 'max_poor']

def solve_max_factor(data):
    """Find the categorical factor with the largest spread in mean unit price.

    For each factor, the mean ``unit_price`` of every category is computed;
    the spread is the highest category mean minus the lowest one.

    data: DataFrame containing ``unit_price`` and the six factor columns
    return: one-row DataFrame with ``<factor>_best``, ``<factor>_worst`` and
        ``<factor>_diff`` for each factor, then ``max_factor`` and ``max_poor``
        (the factor with the largest spread and that spread). Spreads are kept
        as numbers rather than being converted to strings.
    """
    factor_list = ['house_orientation', 'house_decoration',
                   'house_floor_position', 'elevator',
                   'house_useage', 'house_building_structure']
    row = {}
    max_poor = 0
    max_factor = ''

    for factor in factor_list:
        # Group once per factor; only the price column is averaged so text
        # columns never reach the aggregation.
        mean_price = data.groupby(factor)['unit_price'].mean()
        spread = mean_price.max() - mean_price.min()
        row[factor + '_best'] = mean_price.idxmax()
        row[factor + '_worst'] = mean_price.idxmin()
        row[factor + '_diff'] = spread
        # Strict comparison: on a tie the earlier factor wins.
        if max_poor < spread:
            max_poor = spread
            max_factor = factor

    row['max_factor'] = max_factor
    row['max_poor'] = max_poor
    # Building the frame from a dict keeps each column's own dtype.
    return pd.DataFrame([row])

    
# Run the factor analysis separately for every district.
data_local.groupby("house_region").apply(solve_max_factor)