w4_聚类分析_airbnb_参考代码

最新推荐文章于 2021-01-22 22:03:39 发布

Luara_lyy

最新推荐文章于 2021-01-22 22:03:39 发布

阅读量304

点赞数

分类专栏：商业数据分析师

本文链接：https://blog.csdn.net/weixin_40801364/article/details/100765872

版权

商业数据分析师专栏收录该内容

12 篇文章 1 订阅

订阅专栏

Airbnb数据字典

在这里插入图片描述

#调包
import pandas as pd
import seaborn as sns#更方便直接视图，查看结果
import matplotlib.pyplot as plt#调参更加灵活
%matplotlib inline#用于jupter视图语句

#数据导入
airbnb=pd.read_csv('w3_airbnb.csv')

# Inspect column dtypes
# Variable groups: user demographics, the user's relationship with Airbnb,
# app language, destination country, and booking channel
# There are two date columns here; they are processed later
airbnb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6752 entries, 0 to 6751
Data columns (total 14 columns):
age                     6752 non-null int64
date_account_created    6752 non-null object
date_first_booking      6752 non-null object
gender                  6752 non-null object
Language_EN             6752 non-null int64
Language_ZH             6752 non-null int64
Country_US              6752 non-null int64
Country_EUR             6752 non-null int64
android                 6752 non-null int64
moweb                   6752 non-null int64
web                     6752 non-null int64
ios                     6752 non-null int64
Married                 6752 non-null int64
Children                6752 non-null int64
dtypes: int64(11), object(3)
memory usage: 738.6+ KB

# Peek at the first few user records
airbnb.head()

	age	date_account_created	date_first_booking	gender	Language_EN	Country_US	android	web	Married	Children
0	33	1/7/2010	1/8/2010	F	1	0	1	1	1	1
1	30	1/10/2010	1/11/2010	M	1	1	1	1	1	2
2	30	1/19/2010	1/21/2010	F	1	1	1	1	1	1
3	30	2/3/2010	2/4/2010	F	1	1	1	1	1	1
4	32	2/7/2010	2/7/2010	F	1	1	1	1	1	2

# Univariate analysis

# Summary statistics for the numeric columns
airbnb.describe()

	age	Language_EN	Language_ZH	Country_US	Country_EUR	android	moweb	web	ios	Married	Children
count	6752.000000	6752.000000	6752.000000	6752.000000	6752.000000	6752.000000	6752.000000	6752.000000	6752.000000	6752.000000	6752.000000
mean	47.791321	0.972156	0.006961	0.713270	0.162767	0.658472	0.340640	0.900770	0.064425	0.796949	1.535841
std	146.177746	0.164537	0.083147	0.452268	0.369180	0.474257	0.473959	0.298993	0.245527	0.402300	0.841394
min	2.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	28.000000	1.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000	0.000000	1.000000	1.000000
50%	33.000000	1.000000	0.000000	1.000000	0.000000	1.000000	0.000000	1.000000	0.000000	1.000000	1.000000
75%	42.000000	1.000000	0.000000	1.000000	0.000000	1.000000	1.000000	1.000000	0.000000	1.000000	2.000000
max	2014.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	3.000000

# The minimum age is 2 and the maximum is 2014, which are clearly invalid;
# clean the data by keeping only users aged 18 to 80 (both ends inclusive)

age_in_range = airbnb['age'].between(18, 80)
airbnb = airbnb[age_in_range]

airbnb.age.describe()

count    6607.000000
mean       35.982443
std        10.896507
min        18.000000
25%        28.000000
50%        33.000000
75%        41.000000
max        80.000000
Name: age, dtype: float64

# Adjusting the categorical (date) variables

# Goal: compute how long ago, relative to 2019, each user registered

# Step 1: convert the registration date to datetime

airbnb['date_account_created']=pd.to_datetime(airbnb['date_account_created'])

airbnb.info()
# date_account_created changed from object to datetime64

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6607 entries, 0 to 6751
Data columns (total 14 columns):
age                     6607 non-null int64
date_account_created    6607 non-null datetime64[ns]
date_first_booking      6607 non-null object
gender                  6607 non-null object
Language_EN             6607 non-null int64
Language_ZH             6607 non-null int64
Country_US              6607 non-null int64
Country_EUR             6607 non-null int64
android                 6607 non-null int64
moweb                   6607 non-null int64
web                     6607 non-null int64
ios                     6607 non-null int64
Married                 6607 non-null int64
Children                6607 non-null int64
dtypes: datetime64[ns](1), int64(11), object(2)
memory usage: 774.3+ KB

# Step 2: extract the registration year and subtract it from 2019, storing the
# result in a new column year_since_account_created.
# The .dt accessor does this for the whole column at once instead of calling a
# Python lambda per row.

airbnb['year_since_account_created'] = 2019 - airbnb['date_account_created'].dt.year

airbnb.year_since_account_created.describe()
# The shortest time since registration is 5 years, the longest is 9 years

count    6607.000000
mean        6.034812
std         0.961253
min         5.000000
25%         5.000000
50%         6.000000
75%         7.000000
max         9.000000
Name: year_since_account_created, dtype: float64

# Compute how long ago, relative to 2019, each user made their first booking

# Step 1: convert the first-booking date to datetime

airbnb['date_first_booking'] = pd.to_datetime(airbnb['date_first_booking'])

# Step 2: extract the first-booking year and subtract it from 2019, storing the
# result in a new column year_since_first_booking.
# The .dt accessor does this for the whole column at once instead of calling a
# Python lambda per row.

airbnb['year_since_first_booking'] = 2019 - airbnb['date_first_booking'].dt.year

airbnb.year_since_first_booking.describe()
# The shortest time since the first booking is 4 years, the longest is 9 years

count    6607.000000
mean        5.910095
std         0.990769
min         4.000000
25%         5.000000
50%         6.000000
75%         6.000000
max         9.000000
Name: year_since_first_booking, dtype: float64

# Convert the categorical variable (gender) into dummy variables

airbnb=pd.get_dummies(airbnb)

airbnb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6607 entries, 0 to 6751
Data columns (total 18 columns):
age                           6607 non-null int64
date_account_created          6607 non-null datetime64[ns]
date_first_booking            6607 non-null datetime64[ns]
Language_EN                   6607 non-null int64
Language_ZH                   6607 non-null int64
Country_US                    6607 non-null int64
Country_EUR                   6607 non-null int64
android                       6607 non-null int64
moweb                         6607 non-null int64
web                           6607 non-null int64
ios                           6607 non-null int64
Married                       6607 non-null int64
Children                      6607 non-null int64
year_since_account_created    6607 non-null int64
year_since_first_booking      6607 non-null int64
gender_F                      6607 non-null uint8
gender_M                      6607 non-null uint8
gender_U                      6607 non-null uint8
dtypes: datetime64[ns](2), int64(13), uint8(3)
memory usage: 845.2 KB

# Drop the two raw date columns, picking them out by dtype
date_columns = airbnb.select_dtypes(['datetime64']).columns
airbnb.drop(columns=date_columns, inplace=True)

# Data preparation is complete

# Pick five variables as the clustering dimensions
# NOTE: from here on the working frame is airbnb_5, not airbnb
# .copy() makes airbnb_5 an independent frame, so adding a column to it later
# does not raise SettingWithCopyWarning

feature_columns = ['age', 'web', 'moweb', 'ios', 'android']
airbnb_5 = airbnb[feature_columns].copy()

# Standardize the features with sklearn's preprocessing.scale; keep the row
# index and column names so the rows and columns of x line up with airbnb_5

from sklearn.preprocessing import scale
x = pd.DataFrame(scale(airbnb_5), index=airbnb_5.index, columns=airbnb_5.columns)

# Build the model

# K-means lives in sklearn.cluster
from sklearn import cluster

# First attempt: three clusters.
# n_init is pinned to 10 because newer scikit-learn releases changed the
# default number of initializations, which would otherwise change the result
# obtained for the same random_state.
model = cluster.KMeans(n_clusters=3, n_init=10, random_state=10)
model.fit(x)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=3, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=10, tol=0.0001, verbose=0)

# Attach the cluster labels to the feature frame to inspect the result

# assign() builds a new frame instead of writing into an object pandas may
# treat as a slice of airbnb, which avoids SettingWithCopyWarning
airbnb_5 = airbnb_5.assign(cluster=model.labels_)

C:\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.

# Show the first 20 rows of the clustering frame
airbnb_5.head(20)

	age	web	moweb	android	cluster
0	33	1	0	1	1
1	30	1	0	1	1
2	30	1	0	1	1
3	30	1	0	1	1
4	32	1	0	1	1
5	46	1	1	0	0
6	30	1	0	1	1
7	46	1	0	1	1
9	33	1	0	1	1
10	45	1	0	1	1
11	32	1	1	0	0
12	46	1	0	1	1
13	29	1	0	1	1
14	29	1	0	1	1
16	33	1	1	0	0
17	37	1	0	1	1
18	28	1	0	1	1
19	41	1	0	1	1
21	30	1	1	0	0
22	35	1	0	1	1

# Scatter plot to inspect the clustering result

# x-axis: age; y-axis: ios (whether the iOS client is used); color: cluster label
sns.scatterplot(x='age',y='ios',hue='cluster',data=airbnb_5)

<matplotlib.axes._subplots.AxesSubplot at 0x15e7a28c940>

在这里插入图片描述

# Model evaluation and tuning

# Use groupby to compare each variable's distribution across clusters

airbnb_5.groupby(['cluster'])['age'].describe()

	count	mean	std	min	25%	50%	75%	max
cluster
0	2108.0	34.911290	9.866273	18.0	28.0	32.0	39.0	78.0
1	4072.0	36.871316	11.519153	18.0	29.0	34.0	43.0	80.0
2	427.0	32.793911	8.263822	18.0	27.0	31.0	36.0	70.0

# Per-cluster summary statistics of the ios column
airbnb_5.groupby(['cluster'])['ios'].describe()

	count	mean	std	min	25%	50%	75%	max
cluster
0	2108.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
1	4072.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
2	427.0	1.0	0.0	1.0	1.0	1.0	1.0	1.0

# Evaluate the clustering with the silhouette score

from sklearn import metrics
# Reuse the labels of the model that is already fitted instead of refitting
# with fit_predict(); these are cluster assignments, not distances
x_cluster = model.labels_
# Mean silhouette coefficient, in [-1, 1]: higher means samples sit closer to
# their own cluster and farther from the neighbouring clusters
score = metrics.silhouette_score(x, x_cluster)

print(score)

0.6359835014766492

# Cluster centers of the fitted model, one row per cluster
centers=pd.DataFrame(model.cluster_centers_)

# Save the centers to a CSV file
centers.to_csv('center_3.csv')

# Split the users into five clusters.
# n_init is pinned to 10 because newer scikit-learn releases changed the
# default number of initializations, which would otherwise change the result
# obtained for the same random_state.
model = cluster.KMeans(n_clusters=5, n_init=10, random_state=10)
model.fit(x)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=5, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=10, tol=0.0001, verbose=0)

# Cluster centers of the refitted model, one row per cluster
centers=pd.DataFrame(model.cluster_centers_)

# Save the centers to a CSV file
centers.to_csv('center_5.csv')

Luara_lyy

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫

专栏目录

	age	web	moweb	android	cluster
0	33	1	0	1	1
1	30	1	0	1	1
2	30	1	0	1	1
3	30	1	0	1	1
4	32	1	0	1	1
5	46	1	1	0	0
6	30	1	0	1	1
7	46	1	0	1	1
9	33	1	0	1	1
10	45	1	0	1	1
11	32	1	1	0	0
12	46	1	0	1	1
13	29	1	0	1	1
14	29	1	0	1	1
16	33	1	1	0	0
17	37	1	0	1	1
18	28	1	0	1	1
19	41	1	0	1	1
21	30	1	1	0	0
22	35	1	0	1	1

	age	web	moweb	android	cluster
0	33	1	0	1	1
1	30	1	0	1	1
2	30	1	0	1	1
3	30	1	0	1	1
4	32	1	0	1	1
5	46	1	1	0	0
6	30	1	0	1	1
7	46	1	0	1	1
9	33	1	0	1	1
10	45	1	0	1	1
11	32	1	1	0	0
12	46	1	0	1	1
13	29	1	0	1	1
14	29	1	0	1	1
16	33	1	1	0	0
17	37	1	0	1	1
18	28	1	0	1	1
19	41	1	0	1	1
21	30	1	1	0	0
22	35	1	0	1	1

	age	web	moweb	android	cluster
0	33	1	0	1	1
1	30	1	0	1	1
2	30	1	0	1	1
3	30	1	0	1	1
4	32	1	0	1	1
5	46	1	1	0	0
6	30	1	0	1	1
7	46	1	0	1	1
9	33	1	0	1	1
10	45	1	0	1	1
11	32	1	1	0	0
12	46	1	0	1	1
13	29	1	0	1	1
14	29	1	0	1	1
16	33	1	1	0	0
17	37	1	0	1	1
18	28	1	0	1	1
19	41	1	0	1	1
21	30	1	1	0	0
22	35	1	0	1	1