Airbnb数据字典
import pandas as pd
import seaborn as sns
import matplotlib. pyplot as plt
% matplotlib inline
# Load the Airbnb user table from the CSV file.
airbnb = pd.read_csv('w3_airbnb.csv')
# Print column names, non-null counts and dtypes.
airbnb.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6752 entries, 0 to 6751
Data columns (total 14 columns):
age 6752 non-null int64
date_account_created 6752 non-null object
date_first_booking 6752 non-null object
gender 6752 non-null object
Language_EN 6752 non-null int64
Language_ZH 6752 non-null int64
Country_US 6752 non-null int64
Country_EUR 6752 non-null int64
android 6752 non-null int64
moweb 6752 non-null int64
web 6752 non-null int64
ios 6752 non-null int64
Married 6752 non-null int64
Children 6752 non-null int64
dtypes: int64(11), object(3)
memory usage: 738.6+ KB
# Preview the first rows of the raw table.
airbnb.head()
age date_account_created date_first_booking gender Language_EN Language_ZH Country_US Country_EUR android moweb web ios Married Children 0 33 1/7/2010 1/8/2010 F 1 0 0 0 1 0 1 0 1 1 1 30 1/10/2010 1/11/2010 M 1 0 1 0 1 0 1 0 1 2 2 30 1/19/2010 1/21/2010 F 1 0 1 0 1 0 1 0 1 1 3 30 2/3/2010 2/4/2010 F 1 0 1 0 1 0 1 0 1 1 4 32 2/7/2010 2/7/2010 F 1 0 1 0 1 0 1 0 1 2
# Summary statistics for every numeric column.
airbnb.describe()
age Language_EN Language_ZH Country_US Country_EUR android moweb web ios Married Children count 6752.000000 6752.000000 6752.000000 6752.000000 6752.000000 6752.000000 6752.000000 6752.000000 6752.000000 6752.000000 6752.000000 mean 47.791321 0.972156 0.006961 0.713270 0.162767 0.658472 0.340640 0.900770 0.064425 0.796949 1.535841 std 146.177746 0.164537 0.083147 0.452268 0.369180 0.474257 0.473959 0.298993 0.245527 0.402300 0.841394 min 2.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 25% 28.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 1.000000 1.000000 50% 33.000000 1.000000 0.000000 1.000000 0.000000 1.000000 0.000000 1.000000 0.000000 1.000000 1.000000 75% 42.000000 1.000000 0.000000 1.000000 0.000000 1.000000 1.000000 1.000000 0.000000 1.000000 2.000000 max 2014.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 3.000000
# Keep only rows whose age lies in [18, 80]; the raw column ranges from 2 to
# 2014, so values outside this window are presumably data-entry errors.
airbnb = airbnb[(airbnb['age'] >= 18) & (airbnb['age'] <= 80)]
airbnb['age'].describe()
count 6607.000000
mean 35.982443
std 10.896507
min 18.000000
25% 28.000000
50% 33.000000
75% 41.000000
max 80.000000
Name: age, dtype: float64
# Parse the account-creation strings (e.g. '1/7/2010') into datetime values.
airbnb['date_account_created'] = pd.to_datetime(airbnb.date_account_created)
airbnb.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6607 entries, 0 to 6751
Data columns (total 14 columns):
age 6607 non-null int64
date_account_created 6607 non-null datetime64[ns]
date_first_booking 6607 non-null object
gender 6607 non-null object
Language_EN 6607 non-null int64
Language_ZH 6607 non-null int64
Country_US 6607 non-null int64
Country_EUR 6607 non-null int64
android 6607 non-null int64
moweb 6607 non-null int64
web 6607 non-null int64
ios 6607 non-null int64
Married 6607 non-null int64
Children 6607 non-null int64
dtypes: datetime64[ns](1), int64(11), object(2)
memory usage: 774.3+ KB
# Whole calendar years between account creation and the fixed reference year
# 2019. The vectorized .dt accessor replaces a per-row Python lambda; it
# requires the column to already be datetime-typed.
airbnb['year_since_account_created'] = 2019 - airbnb['date_account_created'].dt.year
airbnb['year_since_account_created'].describe()
count 6607.000000
mean 6.034812
std 0.961253
min 5.000000
25% 5.000000
50% 6.000000
75% 7.000000
max 9.000000
Name: year_since_account_created, dtype: float64
# Parse the first-booking strings into datetime values.
airbnb['date_first_booking'] = pd.to_datetime(airbnb['date_first_booking'])
# Whole calendar years between the first booking and the fixed reference year
# 2019, computed with the vectorized .dt accessor instead of a per-row lambda.
airbnb['year_since_first_booking'] = 2019 - airbnb['date_first_booking'].dt.year
airbnb['year_since_first_booking'].describe()
count 6607.000000
mean 5.910095
std 0.990769
min 4.000000
25% 5.000000
50% 6.000000
75% 6.000000
max 9.000000
Name: year_since_first_booking, dtype: float64
# One-hot encode the object-typed column(s); other columns pass through unchanged.
airbnb = pd.get_dummies(airbnb)
airbnb.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6607 entries, 0 to 6751
Data columns (total 18 columns):
age 6607 non-null int64
date_account_created 6607 non-null datetime64[ns]
date_first_booking 6607 non-null datetime64[ns]
Language_EN 6607 non-null int64
Language_ZH 6607 non-null int64
Country_US 6607 non-null int64
Country_EUR 6607 non-null int64
android 6607 non-null int64
moweb 6607 non-null int64
web 6607 non-null int64
ios 6607 non-null int64
Married 6607 non-null int64
Children 6607 non-null int64
year_since_account_created 6607 non-null int64
year_since_first_booking 6607 non-null int64
gender_F 6607 non-null uint8
gender_M 6607 non-null uint8
gender_U 6607 non-null uint8
dtypes: datetime64[ns](2), int64(13), uint8(3)
memory usage: 845.2 KB
# Drop the datetime columns in place. Pass the column labels explicitly rather
# than relying on drop() iterating over a DataFrame to obtain them.
airbnb.drop(airbnb.select_dtypes(['datetime64']).columns, axis=1, inplace=True)
# Feature subset used for clustering. .copy() makes airbnb_5 an independent
# frame, so adding a column to it later does not raise SettingWithCopyWarning.
airbnb_5 = airbnb[['age', 'web', 'moweb', 'ios', 'android']].copy()
from sklearn.preprocessing import scale
# Scale the selected feature columns and wrap the resulting array in a DataFrame.
x = pd.DataFrame(scale(airbnb_5))
from sklearn import cluster
# K-Means with three clusters; the seed is fixed so reruns give the same result.
model = cluster.KMeans(random_state=10, n_clusters=3)
model.fit(x)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
n_clusters=3, n_init=10, n_jobs=1, precompute_distances='auto',
random_state=10, tol=0.0001, verbose=0)
# Store the fitted model's per-row cluster label as a new 'cluster' column.
# NOTE(review): pandas emits SettingWithCopyWarning here when airbnb_5 is a
# slice of another DataFrame rather than an independent copy.
airbnb_5[ 'cluster' ] = model. labels_
C:\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
"""Entry point for launching an IPython kernel.
# Inspect the first 20 rows together with their assigned cluster.
airbnb_5.head(n=20)
age web moweb ios android cluster 0 33 1 0 0 1 1 1 30 1 0 0 1 1 2 30 1 0 0 1 1 3 30 1 0 0 1 1 4 32 1 0 0 1 1 5 46 1 1 0 0 0 6 30 1 0 0 1 1 7 46 1 0 0 1 1 9 33 1 0 0 1 1 10 45 1 0 0 1 1 11 32 1 1 0 0 0 12 46 1 0 0 1 1 13 29 1 0 0 1 1 14 29 1 0 0 1 1 16 33 1 1 0 0 0 17 37 1 0 0 1 1 18 28 1 0 0 1 1 19 41 1 0 0 1 1 21 30 1 1 0 0 0 22 35 1 0 0 1 1
# Plot age against the ios flag, coloured by cluster label.
sns.scatterplot(data=airbnb_5, x='age', y='ios', hue='cluster')
<matplotlib.axes._subplots.AxesSubplot at 0x15e7a28c940>
# Age distribution within each cluster.
airbnb_5.groupby(['cluster'])['age'].describe()
count mean std min 25% 50% 75% max cluster 0 2108.0 34.911290 9.866273 18.0 28.0 32.0 39.0 78.0 1 4072.0 36.871316 11.519153 18.0 29.0 34.0 43.0 80.0 2 427.0 32.793911 8.263822 18.0 27.0 31.0 36.0 70.0
# Distribution of the ios flag within each cluster.
airbnb_5.groupby(['cluster'])['ios'].describe()
count mean std min 25% 50% 75% max cluster 0 2108.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1 4072.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2 427.0 1.0 0.0 1.0 1.0 1.0 1.0 1.0
from sklearn import metrics
# Reuse the labels of the already-fitted model; calling fit_predict(x) again
# would rerun K-Means on the same data with the same seed just to get them.
x_cluster = model.labels_
# Mean silhouette coefficient of the clustering over the scaled features.
score = metrics.silhouette_score(x, x_cluster)
print(score)
0.6359835014766492
# Save the centroids of the current model, one row per cluster, to CSV.
centers = pd.DataFrame(model.cluster_centers_)
centers.to_csv('center_3.csv')
# Fit a second model on the same data with five clusters and the same seed.
model = cluster.KMeans(random_state=10, n_clusters=5)
model.fit(x)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
n_clusters=5, n_init=10, n_jobs=1, precompute_distances='auto',
random_state=10, tol=0.0001, verbose=0)
# Save the centroids of the five-cluster model, one row per cluster, to CSV.
centers = pd.DataFrame(model.cluster_centers_)
centers.to_csv('center_5.csv')