机器学习:sklearn实现基于Airbnb数据集的实验

数据集链接:https://pan.baidu.com/s/1YS-XMUEdTl8feML88nJ7Dw

提取码:1234

在这里插入图片描述

Airbnb数据集-价格因素分析

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the calendar table of the Madrid Airbnb dataset and preview the first rows.
calendar = pd.read_csv(
    filepath_or_buffer="./data/madrid-airbnb-data/calendar.csv",
)
calendar.head(n=5)

在这里插入图片描述

# Convert the price columns from text such as "$1,200.00" to float32 by
# stripping the dollar sign and the thousands separator.
# The method chain is wrapped in parentheses so it can span several lines;
# a line ending in a bare ".str." is a syntax error.
for price_column in ("price", "adjusted_price"):
    calendar[price_column] = (
        calendar[price_column]
        .str.replace(r"[$,]", "", regex=True)
        .astype(np.float32)
    )

# Parse the date column into datetimes so the .dt accessor can be used on it.
calendar['date'] = pd.to_datetime(calendar['date'])
calendar['date'].head()

在这里插入图片描述

# Derive the weekday and month of every calendar row from its parsed date.
date_parts = calendar['date'].dt
calendar['weekday'] = date_parts.weekday
calendar['month'] = date_parts.month
calendar['month'].head()

在这里插入图片描述

# Mean listed price per calendar month, drawn as a bar chart.
month_price = calendar.groupby("month")['price'].mean()
# x/y are passed by keyword: newer seaborn releases no longer accept them
# as positional arguments.
sns.barplot(x=month_price.index, y=month_price.values)

在这里插入图片描述

# Mean listed price per weekday, drawn as a bar chart.
weekday_price = calendar.groupby("weekday")['price'].mean()
# x/y are passed by keyword: newer seaborn releases no longer accept them
# as positional arguments.
sns.barplot(x=weekday_price.index, y=weekday_price.values)

在这里插入图片描述

# Distribution of prices below 300 (histogram plus density curve).
# sns.distplot is deprecated; histplot with kde=True and stat="density" is
# its documented replacement.
prices_under_300 = calendar.loc[calendar['price'] < 300, 'price']
sns.histplot(prices_under_300, kde=True, stat="density")

在这里插入图片描述

Airbnb数据集-房屋数据预处理

# Load the detailed listings table and show every available column name.
listings_detailed = pd.read_csv(
    filepath_or_buffer="./data/madrid-airbnb-data/listings_detailed.csv",
)
listings_detailed.columns.tolist()

在这里插入图片描述

# Convert the money columns from text such as "$1,200.00" to float32 by
# stripping the dollar sign and the thousands separator.
for money_column in ("price", "cleaning_fee"):
    listings_detailed[money_column] = (
        listings_detailed[money_column]
        .str.replace(r"[$,]", "", regex=True)
        .astype(np.float32)
    )

# A missing cleaning fee is treated as 0. The result is assigned back instead
# of calling fillna(inplace=True) on the column selection, because that chained
# in-place form may operate on a temporary copy and leave the frame unchanged.
listings_detailed['cleaning_fee'] = listings_detailed['cleaning_fee'].fillna(0)
listings_detailed['cleaning_fee'].head()

在这里插入图片描述

# Add a derived column: the minimum cost, computed as
# (price + cleaning fee) * minimum number of nights.
# NOTE(review): this multiplies the cleaning fee by the number of nights as
# well; confirm whether the fee is meant to be charged per night or per stay.
cost_per_night = listings_detailed['price'] + listings_detailed['cleaning_fee']
listings_detailed['minimum_cost'] = cost_per_night * listings_detailed['minimum_nights']
listings_detailed['minimum_cost'].head()

在这里插入图片描述

# Number of amenities: strip the first and last character of the amenities
# text, split on commas and count the pieces.
# The right-hand side is wrapped in parentheses; an assignment whose line ends
# right after "=" is a syntax error.
# NOTE(review): a value with nothing between its first and last character
# still yields one (empty) piece and is counted as 1.
listings_detailed['n_amenities'] = (
    listings_detailed['amenities'].str[1:-1].str.split(",").apply(len)
)

# Bucket listings by capacity. With right=False the bins are left-closed:
# [1, 2) -> Single, [2, 3) -> Couple, [3, 5) -> Family, [5, 100) -> Group.
listings_detailed['accommodates_type'] = pd.cut(
    listings_detailed['accommodates'],
    bins=[1, 2, 3, 5, 100],
    include_lowest=True,
    right=False,
    labels=['Single', 'Couple', 'Family', 'Group'],
)
listings_detailed['neighbourhood_group_cleansed'].head()

在这里插入图片描述

# Preview the first rows of the review score column.
listings_detailed['review_scores_rating'].head(n=5)

在这里插入图片描述

# Narrow the listings table down to the columns used by the analyses below.
analysis_columns = [
    'id',
    'host_id',
    'listing_url',
    'room_type',
    'neighbourhood_group_cleansed',
    'price',
    'cleaning_fee',
    'amenities',
    'n_amenities',
    'accommodates',
    'accommodates_type',
    'minimum_nights',
    'minimum_cost',
]
listings_detailed_df = listings_detailed[analysis_columns]
listings_detailed_df.head()

在这里插入图片描述

Airbnb数据集-房间类型和社区分析

# Room type breakdown: pie chart of shares on the left axes, bar chart of
# counts on the right axes.
room_type_counts = listings_detailed_df['room_type'].value_counts()
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
axes[0].pie(room_type_counts.values, autopct="%.2f%%", labels=room_type_counts.index)
# x/y are passed by keyword (newer seaborn rejects positional data arguments)
# and the target axes is named explicitly instead of relying on the
# "current axes" happening to be the second subplot.
sns.barplot(x=room_type_counts.index, y=room_type_counts.values, ax=axes[1])
plt.tight_layout()

在这里插入图片描述

# Number of listings per district, drawn as a horizontal bar chart.
district_column = listings_detailed_df['neighbourhood_group_cleansed']
neighbourhood_counts = district_column.value_counts()
sns.barplot(
    x=neighbourhood_counts.values,
    y=neighbourhood_counts.index,
    orient='h',
)

在这里插入图片描述

Airbnb数据集-房间类型和社区对比分析

# Share of each room type within every district.
# Step 1: count listings per (district, room type) and pivot room types into
# columns; combinations that never occur become 0.
room_type_table = (
    listings_detailed_df
    .groupby(['neighbourhood_group_cleansed', 'room_type'])
    .size()
    .unstack('room_type')
    .fillna(0)
)
# Step 2: divide every row by its own total so each district sums to 1, then
# order districts by their share of entire homes/apartments.
neighbour_room_type = (
    room_type_table
    .div(room_type_table.sum(axis=1), axis=0)
    .sort_values("Entire home/apt", ascending=True)
)
neighbour_room_type

在这里插入图片描述

# Hand-built stacked horizontal bar chart: one bar segment per room type,
# each starting where the previous segment ended.
columns = neighbour_room_type.columns
index = neighbour_room_type.index
plt.figure(figsize=(10, 5))

# Running left edge of the next segment, one value per district.
# Looping over the columns works for any number of room types instead of
# assuming there are exactly four.
left = pd.Series(0.0, index=index)
for column in columns:
    plt.barh(index, neighbour_room_type[column], left=left)
    left = left + neighbour_room_type[column]
plt.legend(columns)

在这里插入图片描述

# The same stacked horizontal bar chart, drawn through pandas' plotting API.
fig, ax = plt.subplots(figsize=(10, 5))
neighbour_room_type.plot.barh(stacked=True, width=0.75, ax=ax)

在这里插入图片描述

Airbnb数据集-房东房源数量分析

# Number of listings owned by each host.
host_number = listings_detailed_df.groupby('host_id').size()
# Distribution for hosts with fewer than 10 listings.
# sns.distplot is deprecated; histplot with kde=True and stat="density" is
# its documented replacement.
sns.histplot(host_number[host_number < 10], kde=True, stat="density")

在这里插入图片描述

# Bucket hosts by how many listings they own. With right=False the bins are
# left-closed: [1, 2) -> '1', [2, 3) -> '2', [3, 5) -> '3-4', [5, inf) -> '5+'.
# The last edge is infinity so that no host, however large, falls outside the
# bins and silently disappears from the chart.
host_number_bins = pd.cut(
    host_number,
    bins=[1, 2, 3, 5, np.inf],
    include_lowest=True,
    right=False,
    labels=['1', '2', '3-4', '5+'],
).value_counts()
plt.pie(host_number_bins, autopct="%.2f%%", labels=host_number_bins.index)

在这里插入图片描述

Airbnb数据集-评论数量与时间分析

# Load the detailed reviews table, parsing the 'date' column as datetimes,
# and preview the first rows.
reviews = pd.read_csv(
    filepath_or_buffer="./data/madrid-airbnb-data/reviews_detailed.csv",
    parse_dates=['date'],
)
reviews.head(n=5)

在这里插入图片描述

# Split the review date into year and month columns.
reviews['year'] = reviews['date'].dt.year
reviews['month'] = reviews['date'].dt.month

# Number of reviews per year, drawn as a bar chart.
n_reviews_year = reviews.groupby("year").size()
# x/y are passed by keyword: newer seaborn releases no longer accept them
# as positional arguments.
sns.barplot(x=n_reviews_year.index, y=n_reviews_year.values)

在这里插入图片描述

# Number of reviews per calendar month (all years combined), as a bar chart.
n_reviews_month = reviews.groupby("month").size()
# x/y are passed by keyword: newer seaborn releases no longer accept them
# as positional arguments.
sns.barplot(x=n_reviews_month.index, y=n_reviews_month.values)

在这里插入图片描述

Airbnb数据集-评论数量与时间综合分析

# Review counts as a year x month table: one row per year, one column per
# month, with 0 where a year has no reviews in a month.
reviews_per_year_month = reviews.groupby(['year', 'month']).size()
year_month_reviews = reviews_per_year_month.unstack("month").fillna(0)
year_month_reviews

在这里插入图片描述

# One line per year showing the number of reviews in each month.
fig, ax = plt.subplots(figsize=(10, 5))
for _, monthly_counts in year_month_reviews.iterrows():
    sns.lineplot(x=monthly_counts.index, y=monthly_counts.values, ax=ax)

# Label the lines with the years, in the order the rows were drawn.
ax.legend(labels=year_month_reviews.index)
ax.grid()
# One tick per month; the result is bound to "_" to keep it out of the
# notebook output.
_ = ax.set_xticks(list(range(1, 13)))

在这里插入图片描述

# Column names considered for the price model. These bare string expressions
# have no effect when executed; the actual selection is the column list passed
# to listings_detailed in the price-prediction section that follows.
'host_is_superhost',
'host_identity_verified',
'neighbourhood_group_cleansed',
'latitude',
'longitude',
'property_type',
'room_type',
'accommodates',
'bathrooms',
'bedrooms',
'cleaning_fee',
'minimum_nights',
'maximum_nights',
'availability_90',
'number_of_reviews',
'review_scores_rating',
'is_business_travel_ready',
'n_amenities',
'price'

Airbnb数据集-房屋价格预测(1)

from sklearn.preprocessing import StandardScaler
# Columns used for price prediction: the model inputs plus the 'price' target.
# 'review_scores_rating' is deliberately left out of the selection.
model_columns = [
    'host_is_superhost',
    'host_identity_verified',
    'neighbourhood_group_cleansed',
    'latitude',
    'longitude',
    'property_type',
    'room_type',
    'accommodates',
    'bathrooms',
    'bedrooms',
    'cleaning_fee',
    'minimum_nights',
    'maximum_nights',
    'availability_90',
    'number_of_reviews',
    'is_business_travel_ready',
    'n_amenities',
    'price',
]

# Keep only listings priced below 300.
ml_listings = listings_detailed.loc[listings_detailed['price'] < 300, model_columns]

# Drop rows that contain missing values. The result is reassigned rather than
# modified with inplace=True, because ml_listings is a selection taken from
# listings_detailed and in-place edits on such a selection trigger pandas'
# SettingWithCopyWarning.
ml_listings = ml_listings.dropna(axis=0)

# Split the inputs from the target.
features = ml_listings.drop(columns=['price'])
targets = ml_listings['price']

# One-hot encode the categorical columns.
disperse_columns = [
    'host_is_superhost',
    'host_identity_verified',
    'neighbourhood_group_cleansed',
    'property_type',
    'room_type',
    'is_business_travel_ready',
]
disperse_features = pd.get_dummies(features[disperse_columns])

# Standardize the remaining (continuous) columns.
continuouse_features = features.drop(columns=disperse_columns)
scaler = StandardScaler()
continuouse_features = scaler.fit_transform(continuouse_features)

# Combine both groups into one matrix: one-hot columns first, then the
# standardized continuous columns.
feature_array = np.hstack([disperse_features, continuouse_features])

Airbnb数据集-房屋价格预测(2)

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error,r2_score

# Hold out 25% of the listings for evaluation. Both the split and the forest
# are seeded so the printed metrics are the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    feature_array, targets, test_size=0.25, random_state=42
)
regressor = RandomForestRegressor(random_state=42)
regressor.fit(X_train, y_train)
y_predict = regressor.predict(X_test)

# Report the mean absolute error and the R2 score on the held-out listings.
print("平均误差:",mean_absolute_error(y_test,y_predict))
print("R2评分:",r2_score(y_test,y_predict))

在这里插入图片描述

Airbnb数据集-评论数量预测

# Number of reviews per (year, month) as a flat table with a 'count' column.
ym_reviews = reviews.groupby(['year', 'month']).size().reset_index(name="count")

features = ym_reviews[['year', 'month']]
targets = ym_reviews['count']

# Fit on every available month: no hold-out set is kept in this section.
regressor = RandomForestRegressor(n_estimators=100)
regressor.fit(features, targets)

# Predict October-December 2019. The inputs are given as a DataFrame with the
# same column names the model was fitted on, which avoids scikit-learn's
# feature-name mismatch warning for bare lists.
# NOTE(review): tree ensembles do not extrapolate trends beyond the range of
# their training data, so treat these predictions as rough estimates.
future_months = pd.DataFrame(
    [
        [2019, 10],
        [2019, 11],
        [2019, 12],
    ],
    columns=features.columns,
)
y_predict = regressor.predict(future_months)

y_predict

在这里插入图片描述

Airbnb数据集-预测结果可视化

# Turn the predictions into rows shaped like ym_reviews: consecutive months
# of 2019 starting at October, one per predicted value.
predict_reviews = pd.DataFrame({
    'year': 2019,
    'month': list(range(10, 10 + len(y_predict))),
    'count': y_predict,
})

# Append the predicted months to the observed ones. drop=True discards the
# old row labels instead of keeping them as a stray 'index' column.
final_reviews = pd.concat([ym_reviews, predict_reviews]).reset_index(drop=True)
years = final_reviews['year'].unique()

# One line per year of monthly review counts, predicted months included.
fig, ax = plt.subplots(figsize=(10, 5))
for year in years:
    df = final_reviews[final_reviews['year'] == year]
    # Draw on this figure's axes explicitly rather than on whichever axes
    # happens to be current.
    sns.lineplot(x="month", y='count', data=df, ax=ax)

# Label the lines with the years that were actually plotted, in plot order.
ax.legend(labels=[str(year) for year in years])
ax.grid()
# One tick per month; the result is bound to "_" to keep it out of the
# notebook output.
_ = ax.set_xticks(list(range(1, 13)))

在这里插入图片描述

美国著名共享民宿网站 Airbnb 开放的民宿信息和住客评价数据,包括民宿的位置、房间、配置、价格、住客的评分和自然语言评论等。目前 Airbnb 开放数据的城市如下表所示。

| 城市名称 | 省份和地区 | 所在国家 |
| --- | --- | --- |
| Amsterdam | North Holland | The Netherlands |
| Antwerp | Flemish Region | Belgium |
| Asheville | North Carolina | United States |
| Athens | Attica | Greece |
| Austin | Texas | United States |
| Barcelona | Catalonia | Spain |
| Berlin | Berlin | Germany |
| Boston | Massachusetts | United States |
| Brussels | Brussels | Belgium |
| Chicago | Illinois | United States |
| Copenhagen | Hovedstaden | Denmark |
| Denver | Colorado | United States |
| Dublin | Leinster | Ireland |
| Edinburgh | Scotland | United Kingdom |
| Geneva | Geneva | Switzerland |
| Hong Kong | Hong Kong | China |
| London | England | United Kingdom |
| Los Angeles | California | United States |
| Madrid | Comunidad de Madrid | Spain |
| Mallorca | Islas Baleares | Spain |
| Manchester | England | United Kingdom |
| Melbourne | Victoria | Australia |
| Montreal | Quebec | Canada |
| Nashville | Tennessee | United States |
| New Orleans | Louisiana | United States |
| New York City | New York | United States |
| Northern Rivers | New South Wales | Australia |
| Oakland | California | United States |
| Paris | France | France |
| Portland | Oregon | United States |
| Quebec City | Quebec | Canada |
| San Diego | California | United States |
| San Francisco | California | United States |
| Santa Cruz County | California | United States |
| Seattle | Washington | United States |
| Sydney | New South Wales | Australia |
| Toronto | Ontario | Canada |
| Trentino | Trentino-Alto Adige/Südtirol | Italy |
| Vancouver | British Columbia | Canada |
| Venice | Veneto | Italy |
| Victoria | British Columbia | Canada |
| Vienna | Vienna | Austria |
| Washington D.C. | District of Columbia | United States |
评论 7
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值