接上一章内容:
经过了一下午的爬取,也算是终于搞到一份还算像样的数据:
去重之后还剩 8.3 万条,可以开始玩耍了。出现重复的原因可能是很多房源本身就被发布了很多次;这 8.3 万条数据里应该仍然有重复,所以我们把 title 列删除,再 drop_duplicates 一下:
还有7w多,这样的数据就比较真实了,数据到手,搞起:
# Normalise the raw columns of df in place.
# rent: strip the last three characters of every value (presumably a unit
# suffix -- verify against the scraped data), then cast float32 -> int32.
df["rent"] = (
    df["rent"]
    .map(lambda raw: raw[:-3])
    .astype("float32")
    .astype("int32")
)
# floor_area: cast to int32.
df["floor_area"] = df["floor_area"].astype("int32")
# floor: keep only the first two characters of every value.
df["floor"] = df["floor"].map(lambda raw: raw[:2])
df.head()
获取面积和租金:
新建一列 mean_rent,表示每平方米的租金:
# Rent divided by floor area, stored as int32 (the cast drops the fraction).
df["mean_rent"] = df["rent"].div(df["floor_area"]).astype("int32")
# The sorted frame is not assigned back, so df itself keeps its row order.
df.sort_values(by="mean_rent")
只保留 mean_rent 在 (20, 200) 区间内(不含两端)的房源:
# Keep the rows whose mean_rent lies strictly between 20 and 200
# (both bounds are excluded).
df2 = df.loc[df["mean_rent"].gt(20) & df["mean_rent"].lt(200)]
df2.head()
然后再去掉面积不小于400平米的房源(只保留小于400平米的):
# Keep only the rows whose floor_area is below 400.  The explicit copy makes
# df2 an independent frame, so the column assignment performed on df2 further
# down does not raise pandas' SettingWithCopyWarning.
df2 = df2[df2.floor_area < 400].copy()
再对装修情况进行处理:
# Raw decoration labels and the normalised label each one is replaced with.
_DECORATION_ALIASES = {
    " ": "暂无资料",
    "中等装修": "中装修",
    "简单装修": "简装修",
}


def change_decoration(x):
    """Return the normalised form of one decoration label.

    A single-space label becomes "暂无资料", "中等装修" becomes "中装修" and
    "简单装修" becomes "简装修".  Any other value is returned unchanged.

    Args:
        x: One hashable cell value from the decoration column.

    Returns:
        The replacement label, or ``x`` itself when no replacement applies.
    """
    return _DECORATION_ALIASES.get(x, x)
# Run every value of the decoration column through change_decoration.
df2["decoration"] = df2["decoration"].map(change_decoration)
df2.head()
获取不同装修情况的房源数量看一下:
# Count listings per decoration label and shape the result for display.
decoration = (
    df2.groupby("decoration")
    .count()
    # Skip the first group and keep only the first count column.
    # NOTE(review): which label sorts first depends on the data -- confirm
    # that the dropped group is the one intended.
    .iloc[1:, [0]]
    # Takes effect only if that first column is named "area" -- verify.
    .rename(columns={"area": "number"})
    # Turn the group label back into an ordinary "decoration" column.
    .reset_index()
)
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
# Build a figure-factory table from the decoration frame.
figure = ff.create_table(
    table_text=decoration,
    height_constant=60,
)
trace1 = go.Bar(
x=decoration.decoration,
y=decoration.number,
text=decoration.number,
textposition = 'auto',
xaxis='x2',
yaxis='y2',
marker=dict(
color='rgba(132, 112, 255 ,0.5)',