提取码:1234
Airbnb数据集-价格因素分析
import pandas as pd
import numpy as np
import matplotlib. pyplot as plt
import seaborn as sns
calendar = pd. read_csv( "./data/madrid-airbnb-data/calendar.csv" )
calendar. head( )
calendar[ 'price' ] = calendar[ 'price' ] . str .
replace( r"[$,]" , "" , regex= True ) . astype( np. float32)
calendar[ 'adjusted_price' ] = calendar[ 'adjusted_price' ] . str .
replace( r"[$,]" , "" , regex= True ) . astype( np. float32)
calendar[ 'date' ] = pd. to_datetime( calendar[ 'date' ] )
calendar[ 'date' ] . head( )
calendar[ 'weekday' ] = calendar[ 'date' ] . dt. weekday
calendar[ 'month' ] = calendar[ 'date' ] . dt. month
calendar[ 'month' ] . head( )
month_price = calendar. groupby( "month" ) [ 'price' ] . mean( )
sns. barplot( month_price. index, month_price. values)
weekday_price = calendar. groupby( "weekday" ) [ 'price' ] . mean( )
sns. barplot( weekday_price. index, weekday_price. values)
sns. distplot( calendar[ calendar[ 'price' ] < 300 ] [ 'price' ] )
Airbnb数据集-房屋数据预处理
listings_detailed = pd. read_csv( "./data/madrid-airbnb-data/listings_detailed.csv" )
listings_detailed. columns. values. tolist( )
listings_detailed[ 'price' ] = listings_detailed[ 'price' ] . str . replace( r"[$,]" , "" , regex= True ) . astype( np. float32)
listings_detailed[ 'cleaning_fee' ] = listings_detailed[ 'cleaning_fee' ] . str . replace( r"[$,]" , "" , regex= True ) . astype( np. float32)
listings_detailed[ 'cleaning_fee' ] . fillna( 0 , inplace= True )
listings_detailed[ 'cleaning_fee' ] . head( )
listings_detailed[ 'minimum_cost' ] = ( listings_detailed[ 'price' ] + listings_detailed[ 'cleaning_fee' ] ) * listings_detailed[ 'minimum_nights' ]
listings_detailed[ 'minimum_cost' ] . head( )
listings_detailed[ 'n_amenities' ] =
listings_detailed[ 'amenities' ] . str [ 1 : - 1 ] . str . split( "," ) . apply ( len )
listings_detailed[ 'accommodates_type' ] = pd. cut( listings_detailed[ 'accommodates' ] , bins= [ 1 , 2 , 3 , 5 , 100 ] , include_lowest= True , right= False , labels= [ 'Single' , 'Couple' , 'Family' , 'Group' ] )
listings_detailed[ 'neighbourhood_group_cleansed' ] . head( )
listings_detailed[ 'review_scores_rating' ] . head( )
listings_detailed_df = listings_detailed[ [ 'id' , 'host_id' , 'listing_url' , 'room_type' ,
'neighbourhood_group_cleansed' , 'price' , 'cleaning_fee' , 'amenities' , 'n_amenities' ,
'accommodates' , 'accommodates_type' , 'minimum_nights' , 'minimum_cost' ] ]
listings_detailed_df. head( )
Airbnb数据集-房间类型和社区分析
room_type_counts = listings_detailed_df[ 'room_type' ] . value_counts( )
fig, axes = plt. subplots( 1 , 2 , figsize= ( 10 , 5 ) )
axes[ 0 ] . pie( room_type_counts. values, autopct= "%.2f%%" , labels= room_type_counts. index)
sns. barplot( room_type_counts. index, room_type_counts. values)
plt. tight_layout( )
neighbourhood_counts = listings_detailed_df[ 'neighbourhood_group_cleansed' ] . value_counts( )
sns. barplot( y= neighbourhood_counts. index, x= neighbourhood_counts. values, orient= 'h' )
Airbnb数据集-房间类型和社区对比分析
neighbour_room_type = listings_detailed_df. groupby( [ 'neighbourhood_group_cleansed' , 'room_type' ] ) \
. size( ) \
. unstack( 'room_type' ) \
. fillna( 0 ) \
. apply ( lambda row: row/ row. sum ( ) , axis= 1 ) \
. sort_values( "Entire home/apt" , ascending= True )
neighbour_room_type
columns = neighbour_room_type. columns
plt. figure( figsize= ( 10 , 5 ) )
index = neighbour_room_type. index
plt. barh( index, neighbour_room_type[ columns[ 0 ] ] )
left = neighbour_room_type[ columns[ 0 ] ] . copy( )
plt. barh( index, neighbour_room_type[ columns[ 1 ] ] , left= left)
left += neighbour_room_type[ columns[ 1 ] ] . copy( )
plt. barh( index, neighbour_room_type[ columns[ 2 ] ] , left= left)
left += neighbour_room_type[ columns[ 2 ] ] . copy( )
plt. barh( index, neighbour_room_type[ columns[ 3 ] ] , left= left)
plt. legend( columns)
fig, ax = plt. subplots( figsize= ( 10 , 5 ) )
neighbour_room_type. plot( kind= "barh" , stacked= True , width= 0.75 , ax= ax)
Airbnb数据集-房东房源数量分析
host_number = listings_detailed_df. groupby( 'host_id' ) . size( )
sns. distplot( host_number[ host_number< 10 ] )
host_number_bins = pd. cut( host_number, bins= [ 1 , 2 , 3 , 5 , 1000 ] , include_lowest= True , right= False , labels= [ '1' , '2' , '3-4' , '5+' ] ) . value_counts( )
plt. pie( host_number_bins, autopct= "%.2f%%" , labels= host_number_bins. index)
Airbnb数据集-评论数量与时间分析
reviews = pd. read_csv( "./data/madrid-airbnb-data/reviews_detailed.csv" , parse_dates= [ 'date' ] )
reviews. head( )
reviews[ 'year' ] = reviews[ 'date' ] . dt. year
reviews[ 'month' ] = reviews[ 'date' ] . dt. month
n_reviews_year = reviews. groupby( "year" ) . size( )
sns. barplot( n_reviews_year. index, n_reviews_year. values)
n_reviews_month = reviews. groupby( "month" ) . size( )
sns. barplot( n_reviews_month. index, n_reviews_month. values)
Airbnb数据集-评论数量与时间综合分析
year_month_reviews = reviews. groupby( [ 'year' , 'month' ] ) . size( ) . unstack( "month" ) . fillna( 0 )
year_month_reviews
fig, ax = plt. subplots( figsize= ( 10 , 5 ) )
for index in year_month_reviews. index:
series = year_month_reviews. loc[ index]
sns. lineplot( x= series. index, y= series. values, ax= ax)
ax. legend( labels= year_month_reviews. index)
ax. grid( )
_ = ax. set_xticks( list ( range ( 1 , 13 ) ) )
'host_is_superhost' ,
'host_identity_verified' ,
'neighbourhood_group_cleansed' ,
'latitude' ,
'longitude' ,
'property_type' ,
'room_type' ,
'accommodates' ,
'bathrooms' ,
'bedrooms' ,
'cleaning_fee' ,
'minimum_nights' ,
'maximum_nights' ,
'availability_90' ,
'number_of_reviews' ,
'review_scores_rating' ,
'is_business_travel_ready' ,
'n_amenities' ,
'price'
Airbnb数据集-房屋价格预测(1)
from sklearn. preprocessing import StandardScaler
ml_listings = listings_detailed[ listings_detailed[ 'price' ] < 300 ] [ [
'host_is_superhost' ,
'host_identity_verified' ,
'neighbourhood_group_cleansed' ,
'latitude' ,
'longitude' ,
'property_type' ,
'room_type' ,
'accommodates' ,
'bathrooms' ,
'bedrooms' ,
'cleaning_fee' ,
'minimum_nights' ,
'maximum_nights' ,
'availability_90' ,
'number_of_reviews' ,
'is_business_travel_ready' ,
'n_amenities' ,
'price'
] ]
ml_listings. dropna( axis= 0 , inplace= True )
features = ml_listings. drop( columns= [ 'price' ] )
targets = ml_listings[ 'price' ]
disperse_columns = [
'host_is_superhost' ,
'host_identity_verified' ,
'neighbourhood_group_cleansed' ,
'property_type' ,
'room_type' ,
'is_business_travel_ready'
]
disperse_features = features[ disperse_columns]
disperse_features = pd. get_dummies( disperse_features)
continuouse_features = features. drop( columns= disperse_columns)
scaler = StandardScaler( )
continuouse_features = scaler. fit_transform( continuouse_features)
feature_array = np. hstack( [ disperse_features, continuouse_features] )
Airbnb数据集-房屋价格预测(2)
from sklearn. model_selection import train_test_split
from sklearn. linear_model import LinearRegression
from sklearn. ensemble import RandomForestRegressor
from sklearn. metrics import mean_absolute_error, r2_score
X_train, X_test, y_train, y_test = train_test_split( feature_array, targets, test_size= 0.25 )
regressor = RandomForestRegressor( )
regressor. fit( X_train, y_train)
y_predict = regressor. predict( X_test)
print ( "平均误差:" , mean_absolute_error( y_test, y_predict) )
print ( "R2评分:" , r2_score( y_test, y_predict) )
Airbnb数据集-评论数量预测
ym_reviews = reviews. groupby( [ 'year' , 'month' ] ) . size( ) . reset_index( ) . rename( columns= { 0 : "count" } )
features = ym_reviews[ [ 'year' , 'month' ] ]
targets = ym_reviews[ 'count' ]
regressor = RandomForestRegressor( n_estimators= 100 )
regressor. fit( features, targets)
y_predict = regressor. predict( [
[ 2019 , 10 ] ,
[ 2019 , 11 ] ,
[ 2019 , 12 ]
] )
y_predict
Airbnb数据集-预测结果可视化
predict_reviews = pd. DataFrame( [ [ 2019 , 10 + index, x] for index, x in enumerate ( y_predict) ] , columns= [ 'year' , 'month' , 'count' ] )
final_reviews = pd. concat( [ ym_reviews, predict_reviews] ) . reset_index( )
years = final_reviews[ 'year' ] . unique( )
fig, ax = plt. subplots( figsize= ( 10 , 5 ) )
for year in years:
df = final_reviews[ final_reviews[ 'year' ] == year]
sns. lineplot( x= "month" , y= 'count' , data= df)
ax. legend( labels= year_month_reviews. index)
ax. grid( )
_ = ax. set_xticks( list ( range ( 1 , 13 ) ) )