关于链家全网房价数据分析挖掘项目

**关于链家全网房价数据分析挖掘项目**

数据说明

  1. 数据信息:
  • 数据量:40多万条观测,20多个列变量
  • 时间:2018年5月前
  2. 数据来源

项目目标

  • 建立单位面积房价的预测模型

内容目录

数据导入

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
# Render sans-serif text with the SimHei font so Chinese labels display.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['font.family'] = 'sans-serif'
# With a CJK font active the Unicode minus sign can render as an empty box;
# use the ASCII hyphen-minus for negative numbers instead.
plt.rcParams['axes.unicode_minus'] = False
%matplotlib notebook

import seaborn as sns
# Store seaborn's current color palette in `color`.
color = sns.color_palette()
# Switch seaborn to its 'darkgrid' plot style.
sns.set_style('darkgrid')
import warnings
def ignore_warn(*args, **kwargs):
    """Accept any arguments and do nothing.

    Used as a no-op stand-in for ``warnings.warn``.
    """
    return None


# Replace warnings.warn so that calls to it are silently ignored.
warnings.warn = ignore_warn

import re
from scipy import stats
from scipy.stats import norm, skew


# Show floats with three decimal places.
pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x))
# Show up to 40 columns before pandas truncates the display.
pd.set_option('display.max_columns', 40)

dataset = pd.read_csv('./houseInfo.csv')
# `.ix` was removed in pandas 1.0; `.loc` performs the same label-based
# lookup here (row label 1, column 'info_cluster').
a = dataset.loc[1, 'info_cluster']
dataset.head(5)
introduction_house community_house href_house unit_house size_house direction_house decoration_house elevator_house type_house years_house area_house interests_house watch_times submit_period years_period tax_free total_price smeter_price region info_cluster info_flood info_follow
0 电梯花园洋房,开发商精装修带家具家电,小区人车分流 麓山国际帕萨迪纳3组 https://cd.lianjia.com/ershoufang/106101085290... NaN NaN NaN NaN NaN NaN NaN 麓山 NaN NaN NaN NaN NaN 250.000 单价25492元/平米 cd | 2室2厅 | 98.07平米 | 南 | 其他 | 有电梯 高楼层(共9层)2008年建板塔结合 - 3人关注 / 共0次带看 / 2个月以前发布
1 天府新区麓山国际跃层洋房纯清水出售 麓山国际塞尔维蒙 https://cd.lianjia.com/ershoufang/106101067528... NaN NaN NaN NaN NaN NaN NaN 麓山 NaN NaN NaN NaN NaN 420.000 单价20389元/平米 cd | 叠拼别墅 | 5室1厅 | 206平米 | 南 | 其他 | 无电梯 上叠(共4层)2008年建暂无数据 - 36人关注 / 共2次带看 / 2个月以前发布
2 麓山国际半月湾跃层,户型通透采光良好楼距开阔视野好 麓山国际半月湾 https://cd.lianjia.com/ershoufang/106101136261... NaN NaN NaN NaN NaN NaN NaN 麓山 NaN NaN NaN NaN NaN 275.000 单价24512元/平米 cd | 2室2厅 | 112.19平米 | 东南 | 其他 高楼层(共16层)2013年建板楼 - 43人关注 / 共1次带看 / 1个月以前发布
3 中丝园 装修 套三单卫 带车位 ! 心怡中丝园 https://cd.lianjia.com/ershoufang/106101229408... NaN NaN NaN NaN NaN NaN NaN 麓山 NaN NaN NaN NaN NaN 193.000 单价22043元/平米 cd | 3室2厅 | 87.56平米 | 南 | 其他 | 有电梯 高楼层(共33层)2015年建板塔结合 - 1人关注 / 共0次带看 / 12天以前发布
4 麓山国际因特拉肯A区+套三双卫+对中庭+看湖带装修 麓山国际茵特拉肯A https://cd.lianjia.com/ershoufang/106101233740... NaN NaN NaN NaN NaN NaN NaN 麓山 NaN NaN NaN NaN NaN 300.000 单价23303元/平米 cd | 3室2厅 | 128.74平米 | 西南 | 其他 中楼层(共11层)2016年建板楼 - 0人关注 / 共0次带看 / 10天以前发布

数据探索:

  1. 查看数据集中的变量情况
dataset.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 474301 entries, 0 to 474300
Data columns (total 22 columns):
introduction_house    474301 non-null object
community_house       474301 non-null object
href_house            474301 non-null object
unit_house            38137 non-null object
size_house            38137 non-null object
direction_house       38137 non-null object
decoration_house      38109 non-null object
elevator_house        37093 non-null object
type_house            38137 non-null object
years_house           38100 non-null object
area_house            474301 non-null object
interests_house       38137 non-null object
watch_times           38137 non-null object
submit_period         38137 non-null object
years_period          30543 non-null object
tax_free              35260 non-null object
total_price           474301 non-null float64
smeter_price          474301 non-null object
region                474301 non-null object
info_cluster          436164 non-null object
info_flood            436164 non-null object
info_follow           436164 non-null object
dtypes: float64(1), object(21)
memory usage: 79.6+ MB
dataset.describe()
total_price
count 474301.000
mean 329.913
std 371.062
min 4.000
25% 143.000
50% 235.000
75% 390.000
max 60000.000
# Check the dimensions of the data
print("训练集特征前的size:",dataset.shape)

训练集特征前的size: (474301, 22)

数据处理:

def size_help_func(x):
    """Convert a raw size value into a number.

    The value is stringified and the unit suffixes '平方米', '平米' and '米'
    are removed, in that order.  If the remaining text is 'nan' or contains
    any of '室', '厅', '车位', '房' or '墅', the result is 0; otherwise the
    text is parsed with ``float`` (which raises ``ValueError`` when the text
    is not numeric).
    """
    text = str(x)
    for unit in ('平方米', '平米', '米'):
        text = text.replace(unit, '')

    non_area_markers = ('室', '厅', '车位', '房', '墅')
    if text == 'nan' or any(marker in text for marker in non_area_markers):
        return 0
    return float(text)

def info_func(x):
    """Pull the figure that precedes '平米' out of a '|'-separated string.

    Takes the text before the first '平米', keeps what follows the last '|'
    in it, and strips whitespace.  That text is returned (as a string) when
    it is longer than one character; otherwise, and when '平米' does not
    occur in ``str(x)`` at all, 0.0 is returned.
    """
    if '平米' not in str(x):
        return 0.0

    before_unit = x.split('平米')[0]
    candidate = before_unit.split('|')[-1].strip()
    return candidate if len(candidate) > 1 else 0.0
    
def size_func(x, y):
    """Return a size for a row, falling back from ``x`` to ``y``.

    ``x`` is parsed with ``size_help_func``.  When that yields a non-zero
    value it is returned.  Otherwise the value extracted from ``y`` by
    ``info_func`` is converted with ``float`` and returned, unless that
    value mentions '车位', in which case the zero from ``x`` is kept.
    """
    primary = size_help_func(x)
    fallback = info_func(y)

    if primary != 0.0 or '车位' in str(fallback):
        return primary
    return float(fallback)

def size_addcata_func(a):
    """Return '1' when ``a`` is at most 10, otherwise '0'."""
    return '1' if a <= 10 else '0'


def watch_time_func(x):
    """Parse the integer that precedes '次' in ``x``.

    Returns -1 when ``str(x)`` is 'nan'.  Otherwise the text before the
    first '次' is stripped and converted with ``int``.
    """
    if str(x) == 'nan':
        return -1
    count_text, _, _ = x.partition('次')
    return int(count_text.strip())

def watch_time_addcata_func(x):
    """Return '1' when ``x`` equals the sentinel -1, otherwise '0'."""
    is_sentinel = x == -1
    return '1' if is_sentinel else '0'

def interests_house_func(x):
    """Parse the integer that precedes '人' in ``x``.

    Returns -1 when ``str(x)`` is 'nan'.  Otherwise the text before the
    first '人' is stripped and converted with ``int``.
    """
    if str(x) == 'nan':
        return -1
    head, _, _ = x.partition('人')
    return int(head.strip())

def interests_house_addcata_func(x):
    """Return '1' when ``x`` equals the sentinel -1, otherwise '0'."""
    if x == -1:
        return '1'
    return '0'

def submit_period_func(x):
    """Turn a textual period into a number, with sentinel values.

    Checked in this order:
      * ``str(x)`` is 'nan'       -> -1
      * contains '刚刚'           -> 0
      * contains '年'             -> 365 for '一', 730 for '二', else 1000
      * contains '个月'           -> leading integer * 30
      * contains '天'             -> leading integer
      * anything else             -> -2
    """
    text = str(x)
    if text == 'nan':
        return -1
    if '刚刚' in text:
        return 0
    if '年' in text:
        years = x.split('年')[0].strip()
        # Only '一' and '二' are recognised; every other prefix maps to 1000.
        return {'一': 365, '二': 730}.get(years, 1000)
    if '个月' in x:
        months = x.split('个月')[0].strip()
        return int(months) * 30
    if '天' in x:
        days = x.split('天')[0].strip()
        return int(days)
    return -2

def submit_period_addcata_func(x):
    """Map the result of ``submit_period_func(x)`` to a category string.

    -2 -> '3', -1 -> '2', 1000 -> '1', any other value -> '0'.
    """
    days = submit_period_func(x)
    special_codes = {-2: '3', -1: '2', 1000: '1'}
    return special_codes.get(days, '0')

def years_period_func(x):
    """Return '0' when ``str(x)`` is 'nan', otherwise '1'."""
    return '0' if str(x) == 'nan' else '1'
# def tax_free_func(x):
#     if str(x) == 'nan':
#         label = str(0)
#     else:
#         label = str(1)
#     return label

def smeter_price_func(x):
    """Parse the integer price out of a string such as '单价25492元/平米'.

    The text before the first '元' is taken and '单价' is removed from it.
    If what remains has three characters or fewer, -1 is returned;
    otherwise it is converted with ``int``.
    """
    digits = x.split('元')[0].replace('单价', '')
    if len(digits) > 3:
        return int(digits)
    return -1

def direction_func(x, y, z):
    """Pick an orientation label from three candidate values.

    Each argument is stringified.  The first of ``x`` then ``y`` that
    contains one of '东', '西', '南', '北' is returned, stripped.  Failing
    that, ``z`` is split on '|' and the matching segment is returned,
    stripped.  'nodata' is returned when nothing matches.

    The original loop over the segments of ``z`` reset the label to
    'nodata' on every non-matching segment, so a match was discarded
    unless it happened to be the final segment.
    """
    directions = ('东', '西', '南', '北')

    def has_direction(text):
        return any(char in text for char in directions)

    x = str(x)
    y = str(y)
    z = str(z)

    if has_direction(x):
        return x.strip()
    if has_direction(y):
        return y.strip()

    label = 'nodata'
    for segment in z.split('|'):
        # Presumably only one segment names an orientation; if several
        # match, the last one is kept -- confirm against the data.
        if has_direction(segment):
            label = segment.strip()
    return label

def decoration_func(x, y, z):
    """Pick a decoration label from three candidate values.

    Each argument is stringified.  The first of ``x`` then ``y`` that
    contains one of '精装', '其他', '毛坯', '简装' is returned, stripped.
    Failing that, ``z`` is split on '|' and the matching segment is
    returned, stripped.  'nodata' is returned when nothing matches.

    The original loop over the segments of ``z`` reset the label to
    'nodata' on every non-matching segment, so a match was discarded
    unless it happened to be the final segment.
    """
    keywords = ('精装', '其他', '毛坯', '简装')

    def mentions_decoration(text):
        return any(keyword in text for keyword in keywords)

    x = str(x)
    y = str(y)
    z = str(z)

    if mentions_decoration(x):
        return x.strip()
    if mentions_decoration(y):
        return y.strip()

    label = 'nodata'
    for segment in z.split('|'):
        # If several segments match, the last one is kept.
        if mentions_decoration(segment):
            label = segment.strip()
    return label

def elevator_func(x, y, z):
    """Pick an elevator label ('有电梯' / '无电梯') from three candidates.

    Arguments:
        x: value from decoration_house
        y: value from elevator_house
        z: value from info_cluster

    Each argument is stringified.  The first of ``x`` then ``y`` that
    contains '有电梯' or '无电梯' is returned, stripped.  Failing that,
    ``z`` is split on '|' and the matching segment is returned, stripped.
    'nodata' is returned when nothing matches.

    The original loop over the segments of ``z`` reset the label to
    'nodata' on every non-matching segment, so a match was discarded
    unless it happened to be the final segment.
    """
    keywords = ('有电梯', '无电梯')

    def mentions_elevator(text):
        return any(keyword in text for keyword in keywords)

    x = str(x)
    y = str(y)
    z = str(z)

    if mentions_elevator(x):
        return x.strip()
    if mentions_elevator(y):
        return y.strip()

    label = 'nodata'
    for segment in z.split('|'):
        # If several segments match, the last one is kept.
        if mentions_elevator(segment):
            label = segment.strip()
    return label

def floor_type_func(x):
    """Classify a floor description into a floor-level label.

    ``x`` is stringified, then checked in this order:
      * contains '共'  -> the text before the first '(' (e.g.
        '高楼层(共9层)' gives '高楼层')
      * contains '层'  -> the text before the first '层' is parsed as an
        integer floor number and bucketed:
            <= 1   -> '底层'
            2..5   -> '低楼层'
            6..14  -> '中楼层'
            >= 15  -> '高楼层'
      * contains '平房' -> '底层'
      * anything else (including 'nan') -> 'nodata'

    Raises:
        ValueError: if the text before '层' is not an integer.

    The original bucketing joined its range checks with ``|`` instead of
    ``&``, so every floor above 1 was labelled '低楼层' and the two higher
    buckets were unreachable.
    """
    x = str(x)
    if '共' in x:
        return x.split('(')[0]
    if '层' in x:
        floor = int(x.split('层')[0])
        if floor <= 1:
            return '底层'
        if floor < 6:
            return '低楼层'
        if floor < 15:
            return '中楼层'
        return '高楼层'
    if '平房' in x:
        return '底层'
    return 'nodata'

def years_house_type_func(x, y):
    """Derive a building-type label from two candidate values.

    Both arguments are stringified and searched for keywords in priority
    order; the first keyword found in either one decides the label:
    '板塔' -> '板塔', '板' -> '板', '塔' -> '塔', '平房' -> '平房',
    '叠' -> '别墅'.  'nodata' is returned when no keyword is present.
    """
    candidates = (str(x), str(y))
    # Order matters: '板塔' must be tested before its substrings '板' and '塔'.
    keyword_labels = (
        ('板塔', '板塔'),
        ('板', '板'),
        ('塔', '塔'),
        ('平房', '平房'),
        ('叠', '别墅'),
    )
    for keyword, label in keyword_labels:
        if any(keyword in text for text in candidates):
            return label
    return 'nodata'

def years_house_year_func(x,y):
    x = str(x)
    y = str(y)
    if ('年' in x):
        a = x.split('年')[0].replace('\'','').strip()
        num = int(a)
    elif ('年' in y):
        a = y.split('年')[0].replace('\'','').strip()
        num = int(a)
    
  • 5
    点赞
  • 31
    收藏
    觉得还不错? 一键收藏
  • 4
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值