airbnb运营状况分析
airbnb北京民宿运营状况分析
导入数据
import numpy as np
import matplotlib.pyplot as plt
from scipy.special import jn
from IPython.display import display, clear_output
import time
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
plt.rcParams['font.sans-serif'] = [u'SimHei']
plt.rcParams['axes.unicode_minus'] = False
listing = r'C:\Users\Administrator\Desktop\数据分析\listings.csv'
neighbour = r'C:\Users\Administrator\Desktop\数据分析\neighbourhoods.csv'
review = r'C:\Users\Administrator\Desktop\数据分析\reviews_detail.csv'
neigh_geo = r'C:\Users\Administrator\Desktop\数据分析\neighbourhoods.geojson'
reviews = pd.read_csv(review)
listing = pd.read_csv(listing)
neighbour = pd.read_csv(neighbour)
step1查看数据结构,处理缺失值#查看各个数据结构
#查看各特征丢失缺失情况
#listing.isnull().sum()
#neighbourhood_group 整列缺失,删除
#listing = listing.drop(‘neighbourhood_group’, axis=1)
#删除name为空的记录,只有1条
#listing = listing[-listing[‘name’].isnull()]
print('listing->数据量: %d, 维度: %d' %(listing.shape[0], listing.shape[1]))
print('reviews->数据量: %d, 维度: %d' %(reviews.shape[0], reviews.shape[1]))
print('neighbour->数据量: %d, 维度: %d' %(neighbour.shape[0], neighbour.shape[1]))
listing->数据量: 28452, 维度: 16
reviews->数据量: 202099, 维度: 6
neighbour->数据量: 16, 维度: 2
#listing.info() #查看具体信息, 发现neighbourhood_group字段整列缺失,后续可以删除
#neighbour neighbour表包含北京各区名字
#reviews.info() # reviews包含200k+ 的评论信息, 中英文都有,后续可以考虑做情感分析判断民宿的满意度
listing = listing.drop(['neighbourhood_group'],axis=1)
step2 统计民宿分布情况
res = listing.groupby('neighbourhood') #按区域分组
area_list = neighbour['neighbourhood'].to_list()
neighb_counts = {}
for area in area_list:
counts = res.get_group(area).shape[0]
neighb_counts[area] = counts
#发现东城+海淀+朝阳的民宿数量占比超过60%
plt.figure(figsize=(11,11))
plt.title('各区民宿分布情况')
label = [i[0] for i in neighb_counts.items()]
data = [i[1] for i in neighb_counts.items()]
max_area = sorted(neighb_counts.items(), key=lambda x:x[1],reverse=True)[:3] #选择民宿最多的三个区突出显示
max_area = [i[0] for i in max_area]
explode = [0.1 if i in max_area else 0 for i in label]
color = sns.color_palette("RdBu", len(label))
plt.pie(data, labels = label, explode = explode, autopct='%.2f%%', colors=color)
plt.legend(loc='best', fontsize=6.5)
plt.show()
可以看到,超过60%的房源都集中在朝阳,海淀,东城三个核心区域
step3 去除离群值,做相关性分析
#通过观察数据head 5行发现room_type值有重复, 说明这应该是一个分类字段, 并且是文字信息,后续处理时应该map 到数字编码
listi