数据说明
- 数据信息:
- 数据量:40多万条观测,20多个列变量
- 时间:2018年5月前
- 数据来源
- 作者:田昕峣
- 获取方式:https://github.com/XinyaoTian/lianjia_Spider
项目目标
- 建立单位面积房价的预测模型
内容目录
数据导入
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['font.family']='sans-serif' # 解决负号是方块
%matplotlib notebook
import seaborn as sns
color = sns.color_palette()
sns.set_style('darkgrid')
import warnings
def ignore_warn(*args, **kwargs):
pass
warnings.warn = ignore_warn
import re
from scipy import stats
from scipy.stats import norm, skew
pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x)) #Limiting floats output
pd.set_option('display.max_columns',40) # 显示隐藏
dataset = pd.read_csv('./houseInfo.csv')
a = dataset.ix[1, 'info_cluster']
dataset.head(5)
introduction_house | community_house | href_house | unit_house | size_house | direction_house | decoration_house | elevator_house | type_house | years_house | area_house | interests_house | watch_times | submit_period | years_period | tax_free | total_price | smeter_price | region | info_cluster | info_flood | info_follow | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 电梯花园洋房,开发商精装修带家具家电,小区人车分流 | 麓山国际帕萨迪纳3组 | https://cd.lianjia.com/ershoufang/106101085290... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 麓山 | NaN | NaN | NaN | NaN | NaN | 250.000 | 单价25492元/平米 | cd | | 2室2厅 | 98.07平米 | 南 | 其他 | 有电梯 | 高楼层(共9层)2008年建板塔结合 - | 3人关注 / 共0次带看 / 2个月以前发布 |
1 | 天府新区麓山国际跃层洋房纯清水出售 | 麓山国际塞尔维蒙 | https://cd.lianjia.com/ershoufang/106101067528... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 麓山 | NaN | NaN | NaN | NaN | NaN | 420.000 | 单价20389元/平米 | cd | | 叠拼别墅 | 5室1厅 | 206平米 | 南 | 其他 | 无电梯 | 上叠(共4层)2008年建暂无数据 - | 36人关注 / 共2次带看 / 2个月以前发布 |
2 | 麓山国际半月湾跃层,户型通透采光良好楼距开阔视野好 | 麓山国际半月湾 | https://cd.lianjia.com/ershoufang/106101136261... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 麓山 | NaN | NaN | NaN | NaN | NaN | 275.000 | 单价24512元/平米 | cd | | 2室2厅 | 112.19平米 | 东南 | 其他 | 高楼层(共16层)2013年建板楼 - | 43人关注 / 共1次带看 / 1个月以前发布 |
3 | 中丝园 装修 套三单卫 带车位 ! | 心怡中丝园 | https://cd.lianjia.com/ershoufang/106101229408... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 麓山 | NaN | NaN | NaN | NaN | NaN | 193.000 | 单价22043元/平米 | cd | | 3室2厅 | 87.56平米 | 南 | 其他 | 有电梯 | 高楼层(共33层)2015年建板塔结合 - | 1人关注 / 共0次带看 / 12天以前发布 |
4 | 麓山国际因特拉肯A区+套三双卫+对中庭+看湖带装修 | 麓山国际茵特拉肯A | https://cd.lianjia.com/ershoufang/106101233740... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 麓山 | NaN | NaN | NaN | NaN | NaN | 300.000 | 单价23303元/平米 | cd | | 3室2厅 | 128.74平米 | 西南 | 其他 | 中楼层(共11层)2016年建板楼 - | 0人关注 / 共0次带看 / 10天以前发布 |
数据探索:
- 查看数据集中的变量情况
# Print each column's dtype and non-null count.
dataset.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 474301 entries, 0 to 474300
Data columns (total 22 columns):
introduction_house 474301 non-null object
community_house 474301 non-null object
href_house 474301 non-null object
unit_house 38137 non-null object
size_house 38137 non-null object
direction_house 38137 non-null object
decoration_house 38109 non-null object
elevator_house 37093 non-null object
type_house 38137 non-null object
years_house 38100 non-null object
area_house 474301 non-null object
interests_house 38137 non-null object
watch_times 38137 non-null object
submit_period 38137 non-null object
years_period 30543 non-null object
tax_free 35260 non-null object
total_price 474301 non-null float64
smeter_price 474301 non-null object
region 474301 non-null object
info_cluster 436164 non-null object
info_flood 436164 non-null object
info_follow 436164 non-null object
dtypes: float64(1), object(21)
memory usage: 79.6+ MB
# Summary statistics; only total_price is numeric at this point.
dataset.describe()
total_price | |
---|---|
count | 474301.000 |
mean | 329.913 |
std | 371.062 |
min | 4.000 |
25% | 143.000 |
50% | 235.000 |
75% | 390.000 |
max | 60000.000 |
# Check the dataset dimensions (rows, columns) before feature processing.
print("训练集特征前的size:",dataset.shape)
训练集特征前的size: (474301, 22)
数据处理:
def size_help_func(x):
    """Parse a floor-area value into a number.

    The value is stringified and the unit suffixes '平方米', '平米' and '米'
    are removed (in that order). If the remaining text is 'nan' or contains
    any of '室', '厅', '车位', '房' or '墅', the result is 0; otherwise the
    text is converted with ``float`` (which raises ``ValueError`` for text
    that is not a number).
    """
    text = str(x)
    for unit in ('平方米', '平米', '米'):
        text = text.replace(unit, '')
    non_area_markers = ('室', '厅', '车位', '房', '墅')
    if text == 'nan' or any(marker in text for marker in non_area_markers):
        return 0
    return float(text)
def info_func(x):
    """Extract the area text that precedes '平米' in a '|'-separated string.

    Returns the stripped text of the last '|' segment before the first
    '平米' when that text is longer than one character. Returns 0.0 when
    '平米' is absent or the extracted text has at most one character.
    Note the mixed return type: a string on success, a float otherwise.
    """
    if '平米' not in str(x):
        return 0.0
    before_unit = x.split('平米')[0]
    area_text = before_unit.split('|')[-1].strip()
    return area_text if len(area_text) > 1 else 0.0
def size_func(x,y):
    """Resolve an area from a primary value, falling back to an info string.

    ``x`` is parsed with ``size_help_func`` and ``y`` with ``info_func``.
    When the primary parse is 0 and the fallback text does not contain
    '车位', the fallback is converted with ``float`` and returned; in every
    other case the primary parse is returned.
    """
    primary = size_help_func(x)
    fallback = info_func(y)
    if primary == 0.0 and '车位' not in str(fallback):
        return float(fallback)
    return primary
def size_addcata_func(a):
    """Return '1' when ``a`` is at most 10, otherwise '0'."""
    return '1' if a <= 10 else '0'
def watch_time_func(x):
    """Parse the integer that precedes '次' in ``x``.

    Returns -1 when ``x`` stringifies to 'nan'. Otherwise the text before
    the first '次' is stripped and converted with ``int``.
    """
    if str(x) == 'nan':
        return -1
    count_text, _, _ = x.partition('次')
    return int(count_text.strip())
def watch_time_addcata_func(x):
    """Return '1' when ``x`` equals the missing-value sentinel -1, else '0'."""
    return '1' if x == -1 else '0'
def interests_house_func(x):
    """Parse the integer that precedes '人' in ``x``.

    Returns -1 when ``x`` stringifies to 'nan'. Otherwise the text before
    the first '人' is stripped and converted with ``int``.
    """
    if str(x) == 'nan':
        return -1
    leading = x.split('人', 1)[0]
    return int(leading.strip())
def interests_house_addcata_func(x):
    """Return '1' when ``x`` equals the missing-value sentinel -1, else '0'."""
    if x == -1:
        return '1'
    return '0'
def submit_period_func(x):
    """Convert a "posted ... ago" text into an approximate number of days.

    Rules, checked in order on the stringified input:
      * 'nan'              -> -1 (missing value)
      * contains '刚刚'     -> 0
      * contains '年'       -> 365 for '一', 730 for '二', 1000 for any
                              other text before the first '年'
      * contains '个月'     -> integer before '个月' times 30
      * contains '天'       -> integer before '天'
      * anything else      -> -2 (unrecognized format)

    The input is stringified once up front so that non-string values
    (numbers, None) reach the -2 fallback instead of raising TypeError
    from an ``in`` test against a non-iterable.

    Raises:
        ValueError: if the text before '个月' or '天' is not an integer.
    """
    text = str(x)
    if text == 'nan':
        return -1
    if '刚刚' in text:
        return 0
    if '年' in text:
        years = text.split('年')[0].strip()
        # Only the Chinese numerals for one and two are mapped exactly;
        # every other year prefix collapses to the 1000 sentinel.
        return {'一': 365, '二': 730}.get(years, 1000)
    if '个月' in text:
        return int(text.split('个月')[0].strip()) * 30
    if '天' in text:
        return int(text.split('天')[0].strip())
    return -2
def submit_period_addcata_func(x):
    """Classify the result of ``submit_period_func(x)`` as a string label.

    '3' for the -2 sentinel, '2' for the -1 sentinel, '1' for the 1000
    sentinel, and '0' for every other value.
    """
    days = submit_period_func(x)
    sentinel_labels = {-2: 3, -1: 2, 1000: 1}
    return str(sentinel_labels.get(days, 0))
def years_period_func(x):
    """Return '0' when ``x`` stringifies to 'nan', otherwise '1'."""
    return '0' if str(x) == 'nan' else '1'
# def tax_free_func(x):
# if str(x) == 'nan':
# label = str(0)
# else:
# label = str(1)
# return label
def smeter_price_func(x):
    """Parse the integer unit price out of text such as '单价25492元/平米'.

    The text before the first '元' has '单价' removed. If what remains is
    three characters or fewer the result is -1; otherwise it is converted
    with ``int``.
    """
    digits = x.split('元')[0].replace('单价', '')
    if len(digits) > 3:
        return int(digits)
    return -1
def direction_func(x,y,z):
    """Pick the orientation label from three candidate fields.

    Each argument is stringified. The first of ``x`` then ``y`` that
    contains any of '东', '西', '南', '北' is returned unchanged. Failing
    that, ``z`` is split on '|' and the last segment containing one of
    those characters is returned with surrounding whitespace removed.
    Returns 'nodata' when nothing matches.

    The previous loop over the segments of ``z`` reset the label to
    'nodata' whenever a later segment did not match, so a direction was
    lost unless it happened to be the final segment.
    """
    compass_chars = ('东', '西', '南', '北')

    def mentions_direction(text):
        return any(ch in text for ch in compass_chars)

    x, y, z = str(x), str(y), str(z)
    if mentions_direction(x):
        return x
    if mentions_direction(y):
        return y
    # Scan from the end so the last matching segment wins, mirroring the
    # overwrite order of the original loop without the 'nodata' reset.
    for segment in reversed(z.split('|')):
        if mentions_direction(segment):
            # Segments carry padding from the ' | ' separator.
            return segment.strip()
    return 'nodata'
def decoration_func(x,y,z):
    """Pick the decoration label from three candidate fields.

    Each argument is stringified. The first of ``x`` then ``y`` that
    contains any of '精装', '其他', '毛坯', '简装' is returned stripped.
    Failing that, ``z`` is split on '|' and the last segment containing
    one of those keywords is returned stripped. Returns 'nodata' when
    nothing matches.

    The previous loop over the segments of ``z`` reset the label to
    'nodata' whenever a later segment did not match, so a decoration value
    followed by another segment (e.g. the elevator flag) was lost.
    """
    keywords = ('精装', '其他', '毛坯', '简装')

    def mentions_decoration(text):
        return any(word in text for word in keywords)

    x, y, z = str(x), str(y), str(z)
    if mentions_decoration(x):
        return x.strip()
    if mentions_decoration(y):
        return y.strip()
    # Scan from the end so the last matching segment wins, mirroring the
    # overwrite order of the original loop without the 'nodata' reset.
    for segment in reversed(z.split('|')):
        if mentions_decoration(segment):
            return segment.strip()
    return 'nodata'
def elevator_func(x,y,z):
    """Pick the elevator label from three candidate fields.

    Args:
        x: value from the decoration_house column.
        y: value from the elevator_house column.
        z: value from the info_cluster column ('|'-separated text).

    Each argument is stringified. The first of ``x`` then ``y`` that
    contains '有电梯' or '无电梯' is returned stripped. Failing that, ``z``
    is split on '|' and the last segment containing one of those keywords
    is returned stripped. Returns 'nodata' when nothing matches.

    The previous loop over the segments of ``z`` reset the label to
    'nodata' whenever a later segment did not match, so the elevator flag
    was only found when it was the final segment.
    """
    keywords = ('有电梯', '无电梯')

    def mentions_elevator(text):
        return any(word in text for word in keywords)

    x, y, z = str(x), str(y), str(z)
    if mentions_elevator(x):
        return x.strip()
    if mentions_elevator(y):
        return y.strip()
    # Scan from the end so the last matching segment wins, mirroring the
    # overwrite order of the original loop without the 'nodata' reset.
    for segment in reversed(z.split('|')):
        if mentions_elevator(segment):
            return segment.strip()
    return 'nodata'
def floor_type_func(x):
    """Map a floor description to a floor-band label.

    Rules, checked in order on the stringified input:
      * contains '共'  -> the text before the first '(' (e.g. '高楼层')
      * contains '层'  -> the integer before the first '层' is banded:
                          <= 1 '底层', 2-5 '低楼层', 6-14 '中楼层',
                          15 and above '高楼层'
      * contains '平房' -> '底层'
      * anything else (including 'nan') -> 'nodata'

    The band tests previously joined their bounds with OR, which made the
    '低楼层' test true for every floor above 1 and left '中楼层' and
    '高楼层' unreachable; the bounds are now proper ranges.

    Raises:
        ValueError: if the text before '层' is not an integer.
    """
    text = str(x)
    if '共' in text:
        return text.split('(')[0]
    if '层' in text:
        floor = int(text.split('层')[0])
        if floor <= 1:
            return '底层'
        if floor < 6:
            return '低楼层'
        if floor < 15:
            return '中楼层'
        return '高楼层'
    if '平房' in text:
        return '底层'
    return 'nodata'
def years_house_type_func(x,y):
    """Classify the building type from two stringified text fields.

    The keywords are tried in order, and the first one found in either
    field decides the label: '板塔' -> '板塔', '板' -> '板', '塔' -> '塔',
    '平房' -> '平房', '叠' -> '别墅'. Returns 'nodata' when none is found.
    The combined '板塔' keyword must be tested before '板' and '塔' because
    it contains both.
    """
    first, second = str(x), str(y)
    keyword_to_label = (
        ('板塔', '板塔'),
        ('板', '板'),
        ('塔', '塔'),
        ('平房', '平房'),
        ('叠', '别墅'),
    )
    for keyword, label in keyword_to_label:
        if keyword in first or keyword in second:
            return label
    return 'nodata'
def years_house_year_func(x,y):
x = str(x)
y = str(y)
if ('年' in x):
a = x.split('年')[0].replace('\'','').strip()
num = int(a)
elif ('年' in y):
a = y.split('年')[0].replace('\'',''