Python数据分析-北京房价分析

#!/usr/bin/env python
# coding: utf-8
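# Goal: understand Beijing housing prices in recent years to guide a home purchase.
# Per district: number of listings, average area, average unit price
# Per district: mean total price, with vs. without subway access
# Per district, with subway access: average unit price by elevator (yes/no)
# 2017, 2-room/1-living-room/1-kitchen/1-bathroom homes: per-district average price by elevator (yes/no) and subway (yes/no)
# Daily average-price trend: mean unit price of all listings for each day
# 2017: share of listings with total price 2-4 million CNY and unit price 40-70 thousand CNY
# Import the libraries used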
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load the data file.
# A plain pd.read_csv('./beijing_houst_price.csv') emits:
#   DtypeWarning: Columns (0,6,7,9) have mixed types. Specify dtype option on import or set low_memory=False.
# so id, tradeTime and the room-count columns are read as strings explicitly.
df = pd.read_csv(
    './beijing_houst_price.csv',
    dtype=dict.fromkeys(
        ['id', 'tradeTime', 'livingRoom', 'drawingRoom', 'bathRoom'],
        'str',
    ),
)
# Peek at the first rows to see which columns are present.
df.head()
id tradeTime followers totalPrice price square livingRoom drawingRoom kitchen bathRoom floor buildingType buildingStructure ladderRatio elevator fiveYearsProperty subway district communityAverage
0 101084782030 2016-08-09 106 415.0 31680 131.00 2 1 1 1 高 26 1.0 6 0.217 1.0 0.0 1.0 7 56021.0
1 101086012217 2016-07-28 126 575.0 43436 132.38 2 2 1 2 高 22 1.0 6 0.667 1.0 1.0 0.0 7 71539.0
2 101086041636 2016-12-11 48 1030.0 52021 198.00 3 2 1 3 中 4 4.0 6 0.500 1.0 0.0 0.0 7 48160.0
3 101086406841 2016-09-30 138 297.5 22202 134.00 3 1 1 1 底 21 1.0 6 0.273 1.0 0.0 0.0 6 51238.0
4 101086920653 2016-08-28 286 392.0 48396 81.00 2 1 1 1 中 6 4.0 2 0.333 0.0 1.0 1.0 1 62588.0
# Show the number of columns plus each column's non-null count and dtype.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318851 entries, 0 to 318850
Data columns (total 19 columns):
id                   318851 non-null object
tradeTime            318851 non-null object
followers            318851 non-null int64
totalPrice           318851 non-null float64
price                318851 non-null int64
square               318851 non-null float64
livingRoom           318851 non-null object
drawingRoom          318851 non-null object
kitchen              318851 non-null int64
bathRoom             318851 non-null object
floor                318851 non-null object
buildingType         316830 non-null float64
buildingStructure    318851 non-null int64
ladderRatio          318851 non-null float64
elevator             318819 non-null float64
fiveYearsProperty    318819 non-null float64
subway               318819 non-null float64
district             318851 non-null int64
communityAverage     318388 non-null float64
dtypes: float64(8), int64(5), object(6)
memory usage: 46.2+ MB
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()
followers totalPrice price square kitchen buildingType buildingStructure ladderRatio elevator fiveYearsProperty subway district communityAverage
count 318851.000000 318851.000000 318851.000000 318851.000000 318851.000000 316830.000000 318851.000000 3.188510e+05 318819.000000 318819.000000 318819.000000 318851.000000 318388.000000
mean 16.731508 349.030201 43530.436379 83.240597 0.994599 3.009790 4.451026 6.316486e+01 0.577055 0.645601 0.601112 6.763564 63682.446305
std 34.209185 230.780778 21709.024204 37.234661 0.109609 1.269857 1.901753 2.506851e+04 0.494028 0.478331 0.489670 2.812616 22329.215447
min 0.000000 0.100000 1.000000 6.900000 0.000000 0.048000 0.000000 0.000000e+00 0.000000 0.000000 0.000000 1.000000 10847.000000
25% 0.000000 205.000000 28050.000000 57.900000 1.000000 1.000000 2.000000 2.500000e-01 0.000000 0.000000 0.000000 6.000000 46339.000000
50% 5.000000 294.000000 38737.000000 74.260000 1.000000 4.000000 6.000000 3.330000e-01 1.000000 1.000000 1.000000 7.000000 59015.000000
75% 18.000000 425.500000 53819.500000 98.710000 1.000000 4.000000 6.000000 5.000000e-01 1.000000 1.000000 1.000000 8.000000 75950.000000
max 1143.000000 18130.000000 156250.000000 1745.500000 4.000000 4.000000 6.000000 1.000940e+07 1.000000 1.000000 1.000000 13.000000 183109.000000
# Number of non-null values in each column.
df.count()
id                   318851
tradeTime            318851
followers            318851
totalPrice           318851
price                318851
square               318851
livingRoom           318851
drawingRoom          318851
kitchen              318851
bathRoom             318851
floor                318851
buildingType         316830
buildingStructure    318851
ladderRatio          318851
elevator             318819
fiveYearsProperty    318819
subway               318819
district             318851
communityAverage     318388
dtype: int64
# Begin data cleaning.
# Select every row that repeats an earlier row in all columns.
df.loc[df.duplicated()]
# --> the selection is empty: no fully duplicated rows
id tradeTime followers totalPrice price square livingRoom drawingRoom kitchen bathRoom floor buildingType buildingStructure ladderRatio elevator fiveYearsProperty subway district communityAverage
# Select every row whose id value already appeared in an earlier row.
df.loc[df['id'].duplicated()]
# --> the selection is empty: no repeated ids
id tradeTime followers totalPrice price square livingRoom drawingRoom kitchen bathRoom floor buildingType buildingStructure ladderRatio elevator fiveYearsProperty subway district communityAverage
# Based on the analysis goals, keep only the columns we need
# 'id', 'tradeTime', 'totalPrice', 'price', 'square', 'livingRoom', 'drawingRoom', 'kitchen', 'bathRoom', 'floor', 'elevator', 'subway','district', 'communityAverage'
df = df[['id', 'tradeTime', 'totalPrice', 'price', 'square', 'livingRoom'
  • 1
    点赞
  • 7
    收藏
    觉得还不错? 一键收藏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值