短租数据集分析

短租数据集分析

数据集链接:爱彼迎短租数据集

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签
plt.rcParams['axes.unicode_minus']=False #用来正常显示负号
%matplotlib inline

查看listings,calendar,reviews,neighbourhoods各表的基本内容

# Basic listing information: listing, host, location, room type, price,
# review counts, yearly availability, etc.
listings=pd.read_csv('listings.csv')
listings.head()
idnamehost_idhost_nameneighbourhood_groupneighbourhoodlatitudelongituderoom_typepriceminimum_nightsnumber_of_reviewslast_reviewreviews_per_monthcalculated_host_listings_countavailability_365
044054Modern and Comfortable Living in CBD192875East ApartmentsNaN朝阳区 / Chaoyang39.89503116.45163Entire home/apt7921892019-03-040.859341
1100213The Great Wall Box Deluxe Suite A团园长城小院东院套房527062JoeNaN密云县 / Miyun40.68434117.17231Private room1201122017-10-080.1040
2128496Heart of Beijing: House with View 2467520CindyNaN东城区39.93213116.42200Entire home/apt38932592019-02-052.70193
3161902cozy studio in center of Beijing707535RobertNaN东城区39.93357116.43577Entire home/apt3761262016-12-030.285290
4162144nice studio near subway, sleep 4707535RobertNaN朝阳区 / Chaoyang39.93668116.43798Entire home/apt5371372018-08-010.405352
# Per-listing calendar: date, availability flag, price, adjusted price
# and minimum/maximum nights.
calendar=pd.read_csv('calendar_detail.csv')
calendar.head()
listing_iddateavailablepriceadjusted_priceminimum_nightsmaximum_nights
011650402019-04-17f$511.00$511.001.01125.0
111650402019-04-18t$511.00$511.001.01125.0
211650402019-04-19t$511.00$511.001.01125.0
311650402019-04-20t$511.00$511.001.01125.0
411650402019-04-21t$511.00$511.001.01125.0
# Administrative districts of Beijing.
neighbour=pd.read_csv('neighbourhoods.csv')
neighbour.head()
neighbourhood_groupneighbourhood
0NaN东城区
1NaN丰台区 / Fengtai
2NaN大兴区 / Daxing
3NaN密云县 / Miyun
4NaN平谷区 / Pinggu
# Review records: listing_id, review date, reviewer id/name and comment text.
reviews=pd.read_csv('reviews_detail.csv')
reviews.head()
listing_ididdatereviewer_idreviewer_namecomments
044054847482010-08-25207019JarrodSev was very helpful. Sev showed us where to ...
1440541183842010-10-13218723KimberlyWe arrived in Beijing very early in the mornin...
2440544369782011-08-11609177EmmaIt is a really massive apartment and really co...
34405411186572012-04-121787536AndreynaSev was incredibly helpful, showed us around t...
44405421406502012-08-301179565FrancesThe appartment was ideal for our party of 6 ad...

对listings表数据清洗

listings.head()

idnamehost_idhost_nameneighbourhood_groupneighbourhoodlatitudelongituderoom_typepriceminimum_nightsnumber_of_reviewslast_reviewreviews_per_monthcalculated_host_listings_countavailability_365
044054Modern and Comfortable Living in CBD192875East ApartmentsNaN朝阳区 / Chaoyang39.89503116.45163Entire home/apt7921892019-03-040.859341
1100213The Great Wall Box Deluxe Suite A团园长城小院东院套房527062JoeNaN密云县 / Miyun40.68434117.17231Private room1201122017-10-080.1040
2128496Heart of Beijing: House with View 2467520CindyNaN东城区39.93213116.42200Entire home/apt38932592019-02-052.70193
3161902cozy studio in center of Beijing707535RobertNaN东城区39.93357116.43577Entire home/apt3761262016-12-030.285290
4162144nice studio near subway, sleep 4707535RobertNaN朝阳区 / Chaoyang39.93668116.43798Entire home/apt5371372018-08-010.405352
#对listings表分析
listings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28452 entries, 0 to 28451
Data columns (total 16 columns):
id                                28452 non-null int64
name                              28451 non-null object
host_id                           28452 non-null int64
host_name                         28452 non-null object
neighbourhood_group               0 non-null float64
neighbourhood                     28452 non-null object
latitude                          28452 non-null float64
longitude                         28452 non-null float64
room_type                         28452 non-null object
price                             28452 non-null int64
minimum_nights                    28452 non-null int64
number_of_reviews                 28452 non-null int64
last_review                       17294 non-null object
reviews_per_month                 17294 non-null float64
calculated_host_listings_count    28452 non-null int64
availability_365                  28452 non-null int64
dtypes: float64(4), int64(7), object(5)
memory usage: 3.5+ MB

# 观察发现listings表有几个问题:
# 1.neighbourhood_group列存在很多空值,查看统计信息
# 2.neighbourhood列有中文有英文,决定删掉“/ 英文名”后缀,仅保留neighbourhood列中文部分
# 3.查看经纬度是否有异常值
# 4.查看房屋类型有多少种
# 5.查看价格是否存在异常值
# 6.查看最小入住天数是否有异常值
# 7.查看评论数前10的id
# 8.查看每月评论数前十的id
# 9.查看365天中天数是否有异常值
# 10.name,last_review和reviews_per_month中都存在空值,不过影响不大

#发现neighbourhood_group列有很多空值,查看neighbourhood_group列的统计信息
listings['neighbourhood_group'].describe()

count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: neighbourhood_group, dtype: float64

由以上neighbourhood_group列的统计信息可知,neighbourhood_group列全为空值,无意义
所以决定在listings表中删除neighbourhood_group列

# Remove the all-empty neighbourhood_group column and preview the result.
listings = listings.drop(columns=['neighbourhood_group'])
listings.head()

idnamehost_idhost_nameneighbourhoodlatitudelongituderoom_typepriceminimum_nightsnumber_of_reviewslast_reviewreviews_per_monthcalculated_host_listings_countavailability_365
044054Modern and Comfortable Living in CBD192875East Apartments朝阳区 / Chaoyang39.89503116.45163Entire home/apt7921892019-03-040.859341
1100213The Great Wall Box Deluxe Suite A团园长城小院东院套房527062Joe密云县 / Miyun40.68434117.17231Private room1201122017-10-080.1040
2128496Heart of Beijing: House with View 2467520Cindy东城区39.93213116.42200Entire home/apt38932592019-02-052.70193
3161902cozy studio in center of Beijing707535Robert东城区39.93357116.43577Entire home/apt3761262016-12-030.285290
4162144nice studio near subway, sleep 4707535Robert朝阳区 / Chaoyang39.93668116.43798Entire home/apt5371372018-08-010.405352
#查看neighbourhood列有哪几种不同元素
listings['neighbourhood'].unique()

array(['朝阳区 / Chaoyang', '密云县 / Miyun', '东城区', '西城区', '海淀区',
       '顺义区 / Shunyi', '房山区', '怀柔区 / Huairou', '昌平区', '通州区 / Tongzhou',
       '丰台区 / Fengtai', '大兴区 / Daxing', '延庆县 / Yanqing', '石景山区',
       '门头沟区 / Mentougou', '平谷区 / Pinggu'], dtype=object)

# Keep only the Chinese district name: drop everything from the "/" onwards
# (e.g. "朝阳区 / Chaoyang" -> "朝阳区") and trim surrounding whitespace.
# Vectorised string methods replace the original per-row loop, which did one
# chained lookup plus one .loc write per listing and relied on the index
# being exactly 0..n-1.
listings['neighbourhood'] = (
    listings['neighbourhood'].str.split('/').str[0].str.strip()
)

listings['neighbourhood'].unique()

array(['朝阳区', '密云县', '东城区', '西城区', '海淀区', '顺义区', '房山区', '怀柔区', '昌平区',
       '通州区', '丰台区', '大兴区', '延庆县', '石景山区', '门头沟区', '平谷区'], dtype=object)

# 查看经纬度是否有异常值
listings['longitude'].describe()

count    28452.000000
mean       116.442000
std          0.204796
min        115.473390
25%        116.355283
50%        116.434665
75%        116.491122
max        117.495270
Name: longitude, dtype: float64

# Side-by-side box plots of longitude (left) and latitude (right).
fig, (lon_axis, lat_axis) = plt.subplots(1, 2)
listings.boxplot(column='longitude', ax=lon_axis)
listings.boxplot(column='latitude', ax=lat_axis)
plt.show()

[外链图片转存失败(img-lm4nPlmK-1569078306325)(output_18_0.png)]

#从经纬度的箱型图看,考虑到北京城区面积大,异常点的误差都在1°内,
# 所以把所有经纬度数据认为是正常值范围。

# 查看房屋类型有多少种
#房屋类型有三种,分别是Entire home/apt,Private room,Shared room
listings['room_type'].unique()

array(['Entire home/apt', 'Private room', 'Shared room'], dtype=object)

# 查看价格是否存在异常值
listings['price'].describe()

count    28452.000000
mean       611.203325
std       1623.535077
min          0.000000
25%        235.000000
50%        389.000000
75%        577.000000
max      68983.000000
Name: price, dtype: float64



# One price box plot per room type, laid out in a single row of three.
fig = plt.figure()
for position, room in enumerate(
        ['Entire home/apt', 'Private room', 'Shared room'], start=1):
    axis = fig.add_subplot(1, 3, position)
    listings[listings.room_type == room].boxplot(column='price', ax=axis)
    axis.set_title(room)
plt.show()

[外链图片转存失败(img-2PSdDcwj-1569078306326)(output_24_0.png)]

#查看价格为0的房源基本信息,
listings[listings['price']==0]

idnamehost_idhost_nameneighbourhoodlatitudelongituderoom_typepriceminimum_nightsnumber_of_reviewslast_reviewreviews_per_monthcalculated_host_listings_countavailability_365
508520670843【胡同老宅~轻语竹林房】旅游绝佳地段】步行即到雍和宫、近故宫天安门、南锣鼓巷、美食簋街129840905Jing东城区39.94292116.41323Entire home/apt02812019-03-314.09827
580621246510限时 北京二环四合院别墅拍摄聚会 商务会议 娱乐同仁堂老宅 近簋街、雍和宫、东直门、南锣鼓巷...83233661Eva东城区39.93677116.42076Entire home/apt010NaNNaN6167
2823433895187测试房源mm2185140389Ning Host朝阳区39.98147116.47109Private room010NaNNaN2359
# Judging by the listing names, a real price of 0 is implausible,
# so every price equal to 0 is replaced with NaN (treated as missing).
listings.loc[listings['price']==0,'price']=np.nan


listings['price'].describe()

count    28449.000000
mean       611.267777
std       1623.608547
min         27.000000
25%        235.000000
50%        389.000000
75%        577.000000
max      68983.000000
Name: price, dtype: float64

#查看了一下50000以上的房源的基本信息
#我认为因为北京有很多四合院,所以50000以上应该也是存在的吧
#这里不认为50000以上是异常值了
listings[listings['price']>50000]

idnamehost_idhost_nameneighbourhoodlatitudelongituderoom_typepriceminimum_nightsnumber_of_reviewslast_reviewreviews_per_monthcalculated_host_listings_countavailability_365
106712689987Artistic apartment with culture68973377晨斌朝阳区39.92300116.57996Entire home/apt67104.0122016-06-030.062365
201215488817Hotel apartment close to huge Mall68973377晨斌朝阳区39.91962116.59173Entire home/apt63346.01162017-04-170.552180
516720748712大望路/九龙山大床房141070198朝阳区39.88798116.47667Entire home/apt59997.0112017-09-050.05391
661221942314【温馨小窝窝】近地铁一号线五棵松/万寿路,距离北京西站3站地,15分钟。48178909Qing海淀区39.89523116.28252Shared room59997.0142018-03-230.241180
1017024994830良乡大学城两室温馨小屋188806180房山区39.72157116.15182Entire home/apt68828.0112018-09-280.151181
1366827587044房源已下架208158466昌平区40.08912116.29895Private room66667.0300NaNNaN191
1469728134193此房不能租,不要询问了212328505海淀区39.94947116.36246Entire home/apt68983.0112018-09-100.14190
1620728803519【北京站地铁3分钟.故宫周边最优惠.王府井商圈】溪流到家静雅民宿216392612东城区39.90583116.42199Entire home/apt65970.0112018-10-290.1810
1708329138170全A小筑74938348朝阳区39.89685116.45925Entire home/apt59997.010NaNNaN10
2180931535043水立方,鸟巢附近六人间男神床位236331220王林昌平区40.07817116.42163Shared room67909.010NaNNaN2180
listings['minimum_nights'].describe()

count    28452.000000
mean         2.729685
std         17.920932
min          1.000000
25%          1.000000
50%          1.000000
75%          1.000000
max       1125.000000
Name: minimum_nights, dtype: float64

#查看最小入住天数的箱型图
listings.boxplot(column='minimum_nights')
plt.show()

[外链图片转存失败(img-NiB0o4PZ-1569078306327)(output_30_0.png)]

# 查阅入住最小天数为400天以上的情况
#经查,结合地理位置和房型等信息,入住最小天数为400天以上是合理的
listings[listings['minimum_nights']>400]

idnamehost_idhost_nameneighbourhoodlatitudelongituderoom_typepriceminimum_nightsnumber_of_reviewslast_reviewreviews_per_monthcalculated_host_listings_countavailability_365
117513183350鸟巢旁欧式罗曼蒂克两居56183837Jack朝阳区39.99724116.40110Entire home/apt463.0100022016-07-250.0610
5609210985786号14号线金台路青旅燕儿窝上下铺国贸CBD129486966朝阳区39.91924116.48459Shared room121.0112442018-08-030.336362
650521841908鸟巢水立方朝南大主卧159278266小梦海淀区40.03241116.36671Private room255.010000NaNNaN10
1998330752723中关村新东方北大清华五道口颐和园圆明园魏公村220133868未知海淀区39.98541116.31650Entire home/apt9998.0112512019-01-290.3820
2566433297102姚家园西里小区250785867子豪朝阳区39.94690116.51349Shared room2798.05000NaNNaN1365
#查看availability_365列
#由统计信息可知,availability_365的最大值是365,最小值是0是合理的
#所以判定availability_365列无异常值
listings['availability_365'].describe()

count    28452.000000
mean       220.342120
std        138.430677
min          0.000000
25%         87.000000
50%        209.000000
75%        361.000000
max        365.000000
Name: availability_365, dtype: float64

对listings表进行数据分析

# Scatter every listing by longitude/latitude to show the geographic spread.
plt.scatter(x=listings['longitude'], y=listings['latitude'], alpha=0.1)
# Mark the mean coordinate with a red dot.
longitude_mean = listings['longitude'].mean()
latitude_mean = listings['latitude'].mean()
plt.scatter(x=longitude_mean, y=latitude_mean, c='r')
# The label is formatted as one string; the original passed a 2-tuple of
# strings as the text argument instead of a str.
plt.text(longitude_mean, latitude_mean - 0.1,
         '(%1.3f, %1.3f)' % (longitude_mean, latitude_mean),
         ha='center', va='bottom', fontsize=12)
plt.title('北京房源的地理位置分布情况')
plt.xlabel('经度')
plt.ylabel('纬度')

plt.savefig('北京房源的地理位置分布散点图.png', dpi=500, bbox_inches='tight')
plt.show()



[外链图片转存失败(img-wnd30ozn-1569078306327)(output_34_0.png)]

由散点图可看出北京房源的地理位置分布情况
其中经纬度的均值点为(116.442,39.983),该点为北京市朝阳区西坝河路附近
北京朝阳区的房源密度最高

# Bar chart of the number of listings per district, largest first.
# value_counts() is computed once and reused for heights, ticks and labels.
district_counts = listings['neighbourhood'].value_counts()
plt.bar(district_counts.index, district_counts)
plt.title('北京各城区房源数量')
# rotation must be a number (or 'vertical'/'horizontal'); the string '45'
# is rejected by current matplotlib releases.
plt.xticks(district_counts.index, district_counts.index, rotation=45)

plt.savefig('北京各城区房源数量.png', dpi=500, bbox_inches='tight')
plt.show()



[外链图片转存失败(img-TNNfZQMX-1569078306328)(output_36_0.png)]

listings['neighbourhood'].value_counts()

朝阳区     10810
东城区      3346
海淀区      3197
丰台区      1758
西城区      1701
通州区      1290
昌平区      1034
密云县       935
顺义区       920
怀柔区       833
大兴区       823
延庆县       718
房山区       579
石景山区      213
门头沟区      152
平谷区       143
Name: neighbourhood, dtype: int64

由北京各城区房源数量条形图可看出,北京朝阳区的房源数量最多,超过10000套,远远高于其他行政区,北京平谷区房源最少.
房源数量排名前三位的分别是朝阳区,东城区和海淀区.

listings['neighbourhood'].value_counts()

朝阳区     10810
东城区      3346
海淀区      3197
丰台区      1758
西城区      1701
通州区      1290
昌平区      1034
密云县       935
顺义区       920
怀柔区       833
大兴区       823
延庆县       718
房山区       579
石景山区      213
门头沟区      152
平谷区       143
Name: neighbourhood, dtype: int64

# Listing count per room type (bars, left axis) together with the mean price
# per room type (red line, right axis).
fig, ax1 = plt.subplots()
data1 = listings['room_type'].value_counts()
t = data1.index
# Mean prices are looked up by room type and ordered like the bars. The
# original hard-coded the order Entire/Private/Shared, which is only correct
# when value_counts() happens to return the room types in that same order.
data2 = listings.groupby('room_type')['price'].mean().reindex(t).tolist()
ax1.bar(t, data1, width=0.3)
ax1.set_ylabel('房源数量')

# Write the count above each bar.
for x, y in enumerate(data1):
    plt.text(x, y + 200, y, ha='center')

ax2 = ax1.twinx()
# Mean price per room type on the secondary y axis.
ax2.plot(t, data2, c='r')
ax2.set_ylabel('平均价格')
plt.show()

[外链图片转存失败(img-yWhxoEAv-1569078306328)(output_40_0.png)]

# Grouped bar chart: number of listings per district, split by room type.
# Districts are ordered by their total number of listings.
labels = listings['neighbourhood'].value_counts().index.tolist()
room_types = ['Entire home/apt', 'Private room', 'Shared room']
x = np.arange(len(labels))
# One row per district (in `labels` order), one column per room type.
# The original took value_counts().values separately for each room type;
# each of those is sorted by its own counts, so bars were not guaranteed to
# sit under the right district label, and a district lacking a room type
# would shorten the array. reindex() fixes the order and fills gaps with 0.
by_type = pd.crosstab(listings['neighbourhood'], listings['room_type']).reindex(
    index=labels, columns=room_types, fill_value=0)
y1 = by_type['Entire home/apt'].values
y2 = by_type['Private room'].values
y3 = by_type['Shared room'].values
plt.bar(x, y1, width=0.25, label='Entire home/apt')
plt.bar(x + 0.25, y2, width=0.25, label='Private room')
plt.bar(x + 0.5, y3, width=0.25, label='Shared room')
plt.title('北京不同地区不同房型的房源数量')
# rotation must be numeric; the string '60' is rejected by current matplotlib.
plt.xticks(x, labels, rotation=60)
plt.legend()

plt.savefig('北京各城区不同房型的房源数量.png', dpi=500, bbox_inches='tight')
plt.show()


[外链图片转存失败(img-CB99ABrO-1569078306328)(output_41_0.png)]

# Line chart of the mean price per district, one line per room type.
room_types = ['Entire home/apt', 'Private room', 'Shared room']
mm = listings.groupby(['neighbourhood', 'room_type'])['price'].mean()
# Pivot to a district x room-type table instead of indexing mm[3*i + k]:
# positional access on a MultiIndex Series silently misaligns as soon as one
# (district, room type) combination is absent. Missing cells become NaN.
price_table = mm.unstack('room_type').reindex(columns=room_types)
labels = price_table.index.tolist()
xx = np.arange(len(labels))
y11 = price_table['Entire home/apt'].tolist()
y22 = price_table['Private room'].tolist()
y33 = price_table['Shared room'].tolist()
plt.plot(xx, y11, label='Entire home/apt')
plt.plot(xx, y22, label='Private room')
plt.plot(xx, y33, label='Shared room')
plt.legend()
# rotation must be numeric; the string '60' is rejected by current matplotlib.
plt.xticks(xx, labels, rotation=60)
plt.show()

[外链图片转存失败(img-nnLBtSZJ-1569078306329)(output_42_0.png)]

# 考虑到这个图有部分城区的Shared room比Entire home/apt的平均价格还高,显然是不合理的
# 所以我从实际出发,取每种房型25%-75%分位之间的租金,然后取平均数

# Grouped bar chart of the mean price per district, one bar per room type.
room_types = ['Entire home/apt', 'Private room', 'Shared room']
mm = listings.groupby(['neighbourhood', 'room_type'])['price'].mean()
# Pivot to a district x room-type table instead of indexing mm[3*i + k]:
# positional access on a MultiIndex Series silently misaligns as soon as one
# (district, room type) combination is absent. Missing cells become NaN.
price_table = mm.unstack('room_type').reindex(columns=room_types)
labels = price_table.index.tolist()
xx = np.arange(len(labels))
y11 = price_table['Entire home/apt'].tolist()
y22 = price_table['Private room'].tolist()
y33 = price_table['Shared room'].tolist()
plt.bar(xx, y11, width=0.1, label='Entire home/apt')
plt.bar(xx + 0.1, y22, width=0.1, label='Private room')
plt.bar(xx + 0.2, y33, width=0.1, label='Shared room')
plt.legend()
# rotation must be numeric; the string '60' is rejected by current matplotlib.
plt.xticks(xx, labels, rotation=60)
plt.show()


[外链图片转存失败(img-nWVXDVbX-1569078306329)(output_44_0.png)]

考虑到这个图有部分城区的Shared room比Entire home/apt的平均价格还高,显然是不合理的
所以我从实际出发,取箱型图的中位数作为各城区不同房型的价格参考标准

# Price box plots for the three room types, drawn side by side and saved.
fig = plt.figure()
panel_titles = ('Entire home/apt', 'Private room', 'Shared room')
for slot, room in zip((131, 132, 133), panel_titles):
    panel = fig.add_subplot(slot)
    subset = listings[listings.room_type == room]
    subset.boxplot(column='price', ax=panel)
    panel.set_title(room)

plt.savefig('不同房型价格分布箱型图.png', dpi=500, bbox_inches='tight')
plt.show()



[外链图片转存失败(img-yV66TVHx-1569078306329)(output_46_0.png)]

#不同城区Entire home/apt的中位数
listings[listings.room_type=='Entire home/apt'].groupby('neighbourhood')['price'].describe()['50%']

neighbourhood
东城区      530.0
丰台区      396.0
大兴区      379.0
密云县      799.0
平谷区      819.0
延庆县     1000.0
怀柔区     1678.0
房山区      282.0
昌平区      537.0
朝阳区      470.0
海淀区      490.0
石景山区     429.0
西城区      497.0
通州区      336.0
门头沟区     289.0
顺义区      396.0
Name: 50%, dtype: float64

#不同城区Private room的中位数
listings[listings.room_type=='Private room'].groupby('neighbourhood')['price'].describe()['50%']

neighbourhood
东城区     336.0
丰台区     188.0
大兴区     177.5
密云县     356.0
平谷区     382.0
延庆县     497.0
怀柔区     537.0
房山区     201.0
昌平区     201.0
朝阳区     215.0
海淀区     242.0
石景山区    251.5
西城区     302.0
通州区     188.0
门头沟区    899.0
顺义区     255.0
Name: 50%, dtype: float64

#不同城区Shared room的中位数
listings[listings.room_type=='Shared room'].groupby('neighbourhood')['price'].describe()['50%']


neighbourhood
东城区      107.0
丰台区      127.0
大兴区      148.0
密云县      148.0
平谷区      101.0
延庆县     1188.0
怀柔区      886.0
房山区      174.0
昌平区      107.0
朝阳区      101.0
海淀区      101.0
石景山区     140.5
西城区      107.0
通州区       94.0
门头沟区      94.0
顺义区      157.5
Name: 50%, dtype: float64

# Line chart of the median price per district and room type
# (the author considered this version too ugly and discarded it).
room_types = ['Entire home/apt', 'Private room', 'Shared room']
# A single district x room-type table keeps the three series aligned with the
# same district labels; the original grouped each room type separately and
# assumed every result had the same 16 districts in the same order.
median_table = (
    listings.groupby(['neighbourhood', 'room_type'])['price']
    .median()
    .unstack('room_type')
    .reindex(columns=room_types)
)
labels = median_table.index.tolist()
xx = np.arange(len(labels))
# Median price per district for each room type.
y11 = median_table['Entire home/apt'].values
y22 = median_table['Private room'].values
y33 = median_table['Shared room'].values
plt.plot(xx, y11, label='Entire home/apt')
plt.plot(xx, y22, label='Private room')
plt.plot(xx, y33, label='Shared room')
plt.legend()
# rotation must be numeric; the string '60' is rejected by current matplotlib.
plt.xticks(xx, labels, rotation=60)
plt.show()

[外链图片转存失败(img-xiRIpoJM-1569078306330)(output_50_0.png)]

# Grouped bar chart of the median price per district and room type.
room_types = ['Entire home/apt', 'Private room', 'Shared room']
# A single district x room-type table keeps the three series aligned with the
# same district labels; the original grouped each room type separately and
# assumed every result had the same 16 districts in the same order.
median_table = (
    listings.groupby(['neighbourhood', 'room_type'])['price']
    .median()
    .unstack('room_type')
    .reindex(columns=room_types)
)
labels = median_table.index.tolist()
xx = np.arange(len(labels))
# Median price per district for each room type.
y11 = median_table['Entire home/apt'].values
y22 = median_table['Private room'].values
y33 = median_table['Shared room'].values
plt.bar(xx, y11, width=0.2, label='Entire home/apt')
plt.bar(xx + 0.2, y22, width=0.2, label='Private room')
plt.bar(xx + 0.4, y33, width=0.2, label='Shared room')
plt.legend()
# rotation must be numeric; the string '60' is rejected by current matplotlib.
plt.xticks(xx, labels, rotation=60)
plt.grid(axis='y', alpha=0.2)
plt.title('北京地区不同城区不同房型中位数价格分布')

plt.savefig('北京地区各城区不同房型中位数价格分布.png', dpi=500, bbox_inches='tight')
plt.show()



[外链图片转存失败(img-tqB1XZ1n-1569078306330)(output_51_0.png)]

怀柔区,延庆县,平谷区,密云县的Entire home/apt的中位数价格远高于北京主城区,可能是因为这些行政区内有多处度假村。
延庆县,怀柔区的Shared room的价格高于Private room,可能是因为他们Shared room的样本数据太少,均只有三组数据。
门头沟区的Private room价格远高于Entire home/apt,可能是因为门头沟区的Private room的样本数据较少,只有39组,且提供的数据中高租金价格占比较多。

len(listings[(listings.neighbourhood=='密云县') & (listings.room_type=='Entire home/apt')]['price'])

496

# Median (50th percentile) price for every (district, room type) pair,
# exported to Excel as a price reference table.
jiagecanzhaobiao=listings.groupby(['neighbourhood','room_type'])['price'].describe()['50%']
jiagecanzhaobiao.to_excel('价格参照表.xlsx')

len(listings[(listings.neighbourhood=='密云县') & (listings.room_type=='Shared room')]['price'])

5

收集大客户房东信息

# Rank hosts by their number of listings and keep the 20 largest
# (the slice is [:20], not ten) as the "key account" hosts; export to Excel.
dakehufangdong=listings.groupby(['host_id','host_name']).agg({'id':'count'}).sort_values(by='id',axis=0,ascending=False)[:20]
dakehufangdong.to_excel('大客户房东信息表.xlsx')

# 查看第一名大客户"美婷"的房源分布情况
listings[listings.host_id==209669028]['neighbourhood'].value_counts()

朝阳区    178
东城区     44
Name: neighbourhood, dtype: int64

# 查看第二名大客户"兴伟"的房源分布情况
listings[listings.host_id==54436429]['neighbourhood'].value_counts()

海淀区    137
朝阳区     57
东城区     15
丰台区      1
Name: neighbourhood, dtype: int64

# 查看第三名大客户"海梅"的房源分布情况
listings[listings.host_id==156249912]['neighbourhood'].value_counts()

朝阳区    113
海淀区      2
Name: neighbourhood, dtype: int64

# 查看第四名大客户"Cathy"的房源分布情况
# listings[listings.host_id==17619297]
listings[listings.host_id==17619297]['neighbourhood'].value_counts()

海淀区    47
朝阳区    45
西城区     4
昌平区     1
东城区     1
Name: neighbourhood, dtype: int64

# District distribution of the listings of host "金桔精品民宿" (host_id 156143513).
# NOTE(review): the original comment called this the third-ranked host, but it
# is listed fifth in the top-host chart labels — confirm the rank.
# listings[listings.host_id==156143513]
listings[listings.host_id==156143513]['neighbourhood'].value_counts()

通州区    54
顺义区    19
朝阳区     4
Name: neighbourhood, dtype: int64

# Grouped bar chart of where the five largest hosts have their listings.
# Each entry is (bar offset, district, listings per host in `labels` order);
# the numbers are the value_counts results printed for each host.
labels = ['美婷', '兴伟', '海梅', 'Cathy', '金桔精品民宿']
x = np.arange(5)
district_series = [
    (0, '海淀区', [0, 137, 2, 47, 0]),
    (0.1, '朝阳区', [178, 57, 113, 45, 4]),
    (0.2, '东城区', [44, 15, 0, 1, 0]),
    (0.3, '丰台区', [0, 1, 0, 0, 0]),
    (0.4, '昌平区', [0, 0, 0, 1, 0]),
    (0.5, '西城区', [0, 0, 0, 4, 0]),
    (0.6, '通州区', [0, 0, 0, 0, 54]),
    (0.7, '顺义区', [0, 0, 0, 0, 19]),
]
for shift, district, per_host in district_series:
    plt.bar(x + shift, per_host, width=0.1, label=district)

plt.xticks(np.arange(5), labels)
plt.legend()
plt.title('前五名大房东的房源分布图')

plt.savefig('前五名大房东的房源分布图.png', dpi=500, bbox_inches='tight')
plt.show()

[外链图片转存失败(img-UlqCKdXy-1569078306330)(output_63_0.png)]

由上图可以看出,房源数量排名前五的大房东中朝阳区和海淀区的房源数量最多,而且这些大房东的房源分布通常在两个到三个行政区.

对calendar表数据清洗

calendar.head()

listing_iddateavailablepriceadjusted_priceminimum_nightsmaximum_nights
011650402019-04-17f$511.00$511.001.01125.0
111650402019-04-18t$511.00$511.001.01125.0
211650402019-04-19t$511.00$511.001.01125.0
311650402019-04-20t$511.00$511.001.01125.0
411650402019-04-21t$511.00$511.001.01125.0
# Convert the price columns from strings such as "$511.00" to floats:
# strip the dollar sign, drop thousands separators, then cast.
# Whole-column string operations replace the original per-row loop, which
# wrote through `.values[i]` (relying on that array sharing memory with the
# frame) and left the columns with object dtype.
for price_column in ('price', 'adjusted_price'):
    calendar[price_column] = (
        calendar[price_column]
        .str.strip('$')
        .str.replace(',', '', regex=False)
        .astype(float)
    )

#查看修改后的calendar的
calendar.head()

listing_iddateavailablepriceadjusted_priceminimum_nightsmaximum_nights
011650402019-04-17f5115111.01125.0
111650402019-04-18t5115111.01125.0
211650402019-04-19t5115111.01125.0
311650402019-04-20t5115111.01125.0
411650402019-04-21t5115111.01125.0

查找calendar表指定数据

# Scenario: an ordinary traveller with their mother wants a private room in
# Chaoyang for roughly 300-1000 per night, three nights, checking in on
# 2019-10-01 and out on 2019-10-04.
# `want` holds the listings that match these requirements.
is_private_room = listings.room_type == 'Private room'
within_budget = (listings.price > 300) & (listings.price < 1000)
has_availability = listings.availability_365 > 0
short_minimum_stay = listings.minimum_nights < 4
in_chaoyang = listings.neighbourhood == '朝阳区'
want = listings[is_private_room & within_budget & has_availability
                & short_minimum_stay & in_chaoyang]
len(want)

688

# Narrow the calendar down date by date: start from the ids in `want`, and
# for each date from 2019-10-01 to 2019-10-04 keep only rows that are
# available ('t') and whose listing survived the previous date.
surviving_ids = want['id'].values
daily_frames = []
for stay_date in ('2019-10-01', '2019-10-02', '2019-10-03', '2019-10-04'):
    on_date = calendar.date == stay_date
    is_free = calendar.available == 't'
    still_candidate = calendar['listing_id'].isin(surviving_ids)
    day_frame = calendar[on_date & is_free & still_candidate]
    daily_frames.append(day_frame)
    surviving_ids = day_frame['listing_id'].values
new1, new2, new3, new4 = daily_frames


#查看calendar建成的表new4的统计信息
new4[['minimum_nights','maximum_nights']].describe()

minimum_nightsmaximum_nights
count513.000000513.000000
mean1.222222939.087719
std1.168154391.935275
min1.0000001.000000
25%1.0000001125.000000
50%1.0000001125.000000
75%1.0000001125.000000
max24.0000001125.000000
# Keep the rows of new4 whose stay-length limits allow a 3-night stay:
# minimum_nights must be at most 3 and maximum_nights at least 3.
# The original used minimum_nights <= 4, which also admitted listings that
# require four nights and therefore cannot be booked for three.
new5 = new4[(new4.minimum_nights <= 3) & (new4.maximum_nights >= 3)]
# new5 is the calendar information of the listings that meet the requirements.

对reviews表数据清洗

reviews.head()

listing_ididdatereviewer_idreviewer_namecomments
044054847482010-08-25207019JarrodSev was very helpful. Sev showed us where to ...
1440541183842010-10-13218723KimberlyWe arrived in Beijing very early in the mornin...
2440544369782011-08-11609177EmmaIt is a really massive apartment and really co...
34405411186572012-04-121787536AndreynaSev was incredibly helpful, showed us around t...
44405421406502012-08-301179565FrancesThe appartment was ideal for our party of 6 ad...
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 202099 entries, 0 to 202098
Data columns (total 6 columns):
listing_id       202099 non-null int64
id               202099 non-null int64
date             202099 non-null object
reviewer_id      202099 non-null int64
reviewer_name    202093 non-null object
comments         201983 non-null object
dtypes: int64(3), object(3)
memory usage: 9.3+ MB

# 虽然reviewer_name有部分缺失,但是reviewer_id没有缺失,所以没有关系
# comments缺失也是可以接受的

# The 20 reviewer_ids with the most reviews, most active first.
review_counts = reviews['reviewer_id'].value_counts()
top20_reviewers = review_counts.head(20)
top20_reviewers

186684246    43
21067785     35
158695647    34
99325050     32
149769588    26
140955472    26
213893643    24
6532783      23
229832388    23
196283240    23
28903457     23
117241519    21
104082034    21
3671922      21
165536239    20
16660997     20
10684339     20
228835331    20
43905550     19
50995265     19
Name: reviewer_id, dtype: int64

收集大客户住户信息

# Build `topreviewer`: one row per top reviewer with reviewer id, reviewer
# name and total number of reviews, in the order of `top20_reviewers`.
#
# The name is the first non-missing reviewer_name recorded for each id. The
# original assigned the whole array returned by unique() into a single cell,
# which only works when a reviewer has exactly one distinct name, and it
# hard-coded 20 rows regardless of the length of top20_reviewers.
top_ids = top20_reviewers.index
top_rows = reviews[reviews.reviewer_id.isin(top_ids)]
first_names = top_rows.groupby('reviewer_id')['reviewer_name'].first()
topreviewer = pd.DataFrame({
    'top_reviewer_id': top_ids,
    # reindex() puts the names in ranking order; ids without a name get NaN.
    'top_reviewer_name': first_names.reindex(top_ids).values,
    'sum_reviews': top20_reviewers.values,
})

topreviewer

top_reviewer_idtop_reviewer_namesum_reviews
0186684246Tomm43
121067785Jasmine35
215869564734
399325050新月32
4149769588金龙26
5140955472Marines26
6213893643赛亚24
76532783Dee23
8229832388星河23
9196283240羊阳23
1028903457Yan23
11117241519兰兰21
12104082034Jonmiae21
133671922Kum Hong21
14165536239Holm20
1516660997Tao20
1610684339Mia20
17228835331Y20
1843905550Salome19
1950995265Bitong19
# 把topreviewer保存成表格topreviewer
topreviewer.to_excel('topreviewer.xlsx',index = False)

最受欢迎民宿特点

制作评论词云

# Write every review comment to comments1.txt as input for the word cloud,
# removing embedded '\r\n' line breaks from each comment.
# Changes from the original loop:
#   * `with` closes the file even if a write fails;
#   * missing comments are skipped instead of being written as the text "nan";
#   * each comment ends with a newline, so the last word of one comment is
#     no longer glued to the first word of the next.
comment_texts = reviews['comments'].dropna().astype(str)
with open('comments1.txt', 'w', encoding='utf-8') as file:
    for comment in comment_texts:
        file.write(comment.replace('\r\n', ''))
        file.write('\n')

#导入词云相关的库
from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator
import numpy as np
import  jieba
import wordcloud

# Draw the word cloud of all review comments, shaped and coloured by the
# image in beijingmap.jpg.
# The text is read inside `with` so the file handle is closed; the original
# opened the file and never closed it.
with open('comments1.txt', 'r', encoding='utf-8') as file:
    data = file.read()
path_img = 'beijingmap.jpg'
background_image = np.array(Image.open(path_img))
w = wordcloud.WordCloud(font_path='./fonts/simhei.ttf', max_words=100,
                        background_color="white",
                        mask=background_image).generate(data)
image_colors = ImageColorGenerator(background_image)
# Show the cloud recoloured with the colours of the mask image.
plt.imshow(w.recolor(color_func=image_colors))
plt.axis("off")


plt.show()

w.to_file('comments.png')

[外链图片转存失败(img-vIn6IWtk-1569078306331)(output_87_0.png)]

<wordcloud.wordcloud.WordCloud at 0x22c544f7e48>

从评论的词云图里可看出,旅客最看重交通便利,房东热情,房屋干净这三点。
此外部分旅客还会关注设施齐全,床舒服,离地铁站近,有家的感觉等。

reviews.head()

listing_ididdatereviewer_idreviewer_namecomments
044054847482010-08-25207019JarrodSev was very helpful. Sev showed us where to ...
1440541183842010-10-13218723KimberlyWe arrived in Beijing very early in the mornin...
2440544369782011-08-11609177EmmaIt is a really massive apartment and really co...
34405411186572012-04-121787536AndreynaSev was incredibly helpful, showed us around t...
44405421406502012-08-301179565FrancesThe appartment was ideal for our party of 6 ad...
len(reviews)

202099

#评论表中不同房源出现次数
#np.array实现把index转换为数组
np.array(reviews['listing_id'].value_counts().index)


array([ 6622351,  6596814, 11911698, ..., 33781069, 28482595, 33261981],
      dtype=int64)

# Keep the ids of the 865 most-reviewed listings (the author's "top 5%").
# The count 865 is hard-coded here and reused by the loops that follow.
haofangzi_id=np.array(reviews['listing_id'].value_counts().index)[:865]


listings.head()

idnamehost_idhost_nameneighbourhoodlatitudelongituderoom_typepriceminimum_nightsnumber_of_reviewslast_reviewreviews_per_monthcalculated_host_listings_countavailability_365
044054Modern and Comfortable Living in CBD192875East Apartments朝阳区39.89503116.45163Entire home/apt792.01892019-03-040.859341
1100213The Great Wall Box Deluxe Suite A团园长城小院东院套房527062Joe密云县40.68434117.17231Private room1201.0122017-10-080.1040
2128496Heart of Beijing: House with View 2467520Cindy东城区39.93213116.42200Entire home/apt389.032592019-02-052.70193
3161902cozy studio in center of Beijing707535Robert东城区39.93357116.43577Entire home/apt376.01262016-12-030.285290
4162144nice studio near subway, sleep 4707535Robert朝阳区39.93668116.43798Entire home/apt537.01372018-08-010.405352
# District and room type of each of the most-reviewed listings, in the order
# of haofangzi_id.
# A single indexed lookup replaces the original loop, which scanned the whole
# listings table twice per id and hard-coded the number of ids (865).
# drop_duplicates keeps the first row per id, matching the original's
# `.values[0]`.
top_listings = listings.drop_duplicates('id').set_index('id').loc[haofangzi_id]
aneighbour = top_listings['neighbourhood'].tolist()
broomtype = top_listings['room_type'].tolist()

# Count how often each district / room type occurs among the top listings.
# np.unique(values, return_counts=True) returns (unique values, counts);
# collections.Counter would be an alternative way to get the same counts.
aaneighbour = np.unique(aneighbour, return_counts=True)
bbroomtype = np.unique(broomtype, return_counts=True)

# Bar chart of the district distribution.
plt.bar(aaneighbour[0], aaneighbour[1])

# rotation must be numeric; the string '60' is rejected by current matplotlib.
plt.xticks(rotation=60)
plt.grid(axis='y', alpha=0.2)
plt.title('入住次数前5%的房源地区分布')
plt.savefig('入住次数前5%的房源地区分布.png', dpi=500, bbox_inches='tight')
plt.show()


[外链图片转存失败(img-BSjHOqar-1569078306332)(output_97_0.png)]

# 从上图可以看出,朝阳区和东城区的民宿入住需求最高。

# Bar chart of the room-type distribution among the most-reviewed listings
# (bbroomtype[0] holds the room types, bbroomtype[1] their counts).
plt.bar(bbroomtype[0],bbroomtype[1],width=0.25)

# plt.xticks(rotation='60')
plt.grid(axis='y',alpha=0.2)
plt.title('入住次数前5%的房型分布')
plt.savefig('入住次数前5%的房型分布.png',dpi=500,bbox_inches = 'tight')
plt.show()

[外链图片转存失败(img-74gelpwG-1569078306332)(output_99_0.png)]

# Pie chart of the room-type distribution among the most-reviewed listings.
plt.figure(figsize=(5, 5))
values = bbroomtype[1].tolist()
labels = bbroomtype[0].tolist()
# Offset of each wedge from the centre, as a fraction of the radius.
# Sized from the data: the original hard-coded three entries, which fails
# whenever the number of room types present is not exactly three.
explode = [0.01] * len(values)
plt.pie(values, explode=explode, labels=labels, autopct='%1.1f%%', startangle=261)
plt.title('入住次数前5%的房型分布饼图')  # chart title
plt.savefig('入住次数前5%的房型分布饼图', dpi=500, bbox_inches='tight')  # save the figure
plt.show()

[外链图片转存失败(img-mEcm9b8L-1569078306332)(output_100_0.png)]

# Prices of the most-reviewed listings, in the order of haofangzi_id.
# A single indexed lookup replaces the original loop, which scanned the whole
# listings table once per id and hard-coded the number of ids (865).
# drop_duplicates keeps the first row per id, matching the original's
# `.values[0]`.
pprice = (
    listings.drop_duplicates('id')
    .set_index('id')
    .loc[haofangzi_id, 'price']
    .tolist()
)


pprice

# Box plot of the prices of the most-reviewed listings.
dandan=pd.DataFrame(pprice)
dandan.plot.box(title="入住次数前5%的价格分布")
plt.grid(linestyle="--", alpha=0.3)
# NOTE(review): the output file name ends in "饼图" (pie chart) although this
# figure is a box plot — consider renaming the file.
plt.savefig('入住次数前5%的价格分布饼图',dpi=500,bbox_inches = 'tight')
plt.show()

[外链图片转存失败(img-urzMcN8y-1569078306333)(output_103_0.png)]

# Scatter plot for the most-reviewed listings: price on the x axis against
# the listing's position in pprice on the y axis.
plt.scatter(x=pprice,y=np.arange(len(pprice)))
plt.show()

[外链图片转存失败(img-YLRZC1YC-1569078306333)(output_104_0.png)]

dandan.describe()

0
count864.000000
mean383.391204
std259.259195
min67.000000
25%201.000000
50%329.000000
75%483.000000
max2221.000000
  • 5
    点赞
  • 30
    收藏
    觉得还不错? 一键收藏
  • 8
    评论
评论 8
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值