import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load the listings CSV. The id, trade date and three of the room-count
# columns are forced to strings so pandas does not infer numeric types for them.
df = pd.read_csv(
    './beijing_houst_price.csv',
    dtype=dict.fromkeys(
        ['id', 'tradeTime', 'livingRoom', 'drawingRoom', 'bathRoom'],
        'str',
    ),
)
# Preview the first rows of the loaded frame.
df.head()
|
id |
tradeTime |
followers |
totalPrice |
price |
square |
livingRoom |
drawingRoom |
kitchen |
bathRoom |
floor |
buildingType |
buildingStructure |
ladderRatio |
elevator |
fiveYearsProperty |
subway |
district |
communityAverage |
0 |
101084782030 |
2016-08-09 |
106 |
415.0 |
31680 |
131.00 |
2 |
1 |
1 |
1 |
高 26 |
1.0 |
6 |
0.217 |
1.0 |
0.0 |
1.0 |
7 |
56021.0 |
1 |
101086012217 |
2016-07-28 |
126 |
575.0 |
43436 |
132.38 |
2 |
2 |
1 |
2 |
高 22 |
1.0 |
6 |
0.667 |
1.0 |
1.0 |
0.0 |
7 |
71539.0 |
2 |
101086041636 |
2016-12-11 |
48 |
1030.0 |
52021 |
198.00 |
3 |
2 |
1 |
3 |
中 4 |
4.0 |
6 |
0.500 |
1.0 |
0.0 |
0.0 |
7 |
48160.0 |
3 |
101086406841 |
2016-09-30 |
138 |
297.5 |
22202 |
134.00 |
3 |
1 |
1 |
1 |
底 21 |
1.0 |
6 |
0.273 |
1.0 |
0.0 |
0.0 |
6 |
51238.0 |
4 |
101086920653 |
2016-08-28 |
286 |
392.0 |
48396 |
81.00 |
2 |
1 |
1 |
1 |
中 6 |
4.0 |
2 |
0.333 |
0.0 |
1.0 |
1.0 |
1 |
62588.0 |
# Print the frame summary: index range, per-column non-null counts and dtypes,
# and memory usage. In the recorded run this reports 318851 rows and 19
# columns, with missing values in buildingType, elevator, fiveYearsProperty,
# subway and communityAverage.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318851 entries, 0 to 318850
Data columns (total 19 columns):
id 318851 non-null object
tradeTime 318851 non-null object
followers 318851 non-null int64
totalPrice 318851 non-null float64
price 318851 non-null int64
square 318851 non-null float64
livingRoom 318851 non-null object
drawingRoom 318851 non-null object
kitchen 318851 non-null int64
bathRoom 318851 non-null object
floor 318851 non-null object
buildingType 316830 non-null float64
buildingStructure 318851 non-null int64
ladderRatio 318851 non-null float64
elevator 318819 non-null float64
fiveYearsProperty 318819 non-null float64
subway 318819 non-null float64
district 318851 non-null int64
communityAverage 318388 non-null float64
dtypes: float64(8), int64(5), object(6)
memory usage: 46.2+ MB
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns.
# NOTE(review): in the recorded output ladderRatio peaks at about 1.0e7 while
# its 75th percentile is 0.5, and price / totalPrice have minimums of 1 and
# 0.1 -- these look like outliers or bad records; confirm before analysis.
df.describe()
|
followers |
totalPrice |
price |
square |
kitchen |
buildingType |
buildingStructure |
ladderRatio |
elevator |
fiveYearsProperty |
subway |
district |
communityAverage |
count |
318851.000000 |
318851.000000 |
318851.000000 |
318851.000000 |
318851.000000 |
316830.000000 |
318851.000000 |
3.188510e+05 |
318819.000000 |
318819.000000 |
318819.000000 |
318851.000000 |
318388.000000 |
mean |
16.731508 |
349.030201 |
43530.436379 |
83.240597 |
0.994599 |
3.009790 |
4.451026 |
6.316486e+01 |
0.577055 |
0.645601 |
0.601112 |
6.763564 |
63682.446305 |
std |
34.209185 |
230.780778 |
21709.024204 |
37.234661 |
0.109609 |
1.269857 |
1.901753 |
2.506851e+04 |
0.494028 |
0.478331 |
0.489670 |
2.812616 |
22329.215447 |
min |
0.000000 |
0.100000 |
1.000000 |
6.900000 |
0.000000 |
0.048000 |
0.000000 |
0.000000e+00 |
0.000000 |
0.000000 |
0.000000 |
1.000000 |
10847.000000 |
25% |
0.000000 |
205.000000 |
28050.000000 |
57.900000 |
1.000000 |
1.000000 |
2.000000 |
2.500000e-01 |
0.000000 |
0.000000 |
0.000000 |
6.000000 |
46339.000000 |
50% |
5.000000 |
294.000000 |
38737.000000 |
74.260000 |
1.000000 |
4.000000 |
6.000000 |
3.330000e-01 |
1.000000 |
1.000000 |
1.000000 |
7.000000 |
59015.000000 |
75% |
18.000000 |
425.500000 |
53819.500000 |
98.710000 |
1.000000 |
4.000000 |
6.000000 |
5.000000e-01 |
1.000000 |
1.000000 |
1.000000 |
8.000000 |
75950.000000 |
max |
1143.000000 |
18130.000000 |
156250.000000 |
1745.500000 |
4.000000 |
4.000000 |
6.000000 |
1.000940e+07 |
1.000000 |
1.000000 |
1.000000 |
13.000000 |
183109.000000 |
# Non-null value count per column; columns whose count is below the row total
# contain missing values.
df.count()
id 318851
tradeTime 318851
followers 318851
totalPrice 318851
price 318851
square 318851
livingRoom 318851
drawingRoom 318851
kitchen 318851
bathRoom 318851
floor 318851
buildingType 316830
buildingStructure 318851
ladderRatio 318851
elevator 318819
fiveYearsProperty 318819
subway 318819
district 318851
communityAverage 318388
dtype: int64
# Select rows that exactly repeat an earlier row across every column
# (the first occurrence is not flagged).
df.loc[df.duplicated()]
|
id |
tradeTime |
followers |
totalPrice |
price |
square |
livingRoom |
drawingRoom |
kitchen |
bathRoom |
floor |
buildingType |
buildingStructure |
ladderRatio |
elevator |
fiveYearsProperty |
subway |
district |
communityAverage |
# Select rows whose 'id' value already appeared in an earlier row
# (the first occurrence of each id is not flagged).
df[df.duplicated(subset=['id'])]
|
id |
tradeTime |
followers |
totalPrice |
price |
square |
livingRoom |
drawingRoom |
kitchen |
bathRoom |
floor |
buildingType |
buildingStructure |
ladderRatio |
elevator |
fiveYearsProperty |
subway |
district |
communityAverage |
df = df[['id', 'tradeTime', 'totalPrice', 'price', 'square', 'livingRoom'