目录
背景
1.引用库
import pandas as pd
import numpy as np
from pandas import Series,DataFrame
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import plotly.express as px
import os
import folium
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from mpl_toolkits import mplot3d
# Use the KaiTi font so the Chinese titles/labels in the charts below render,
# and turn off the Unicode minus sign for axis tick labels.
plt.rcParams.update({
    'font.sans-serif': ['KaiTi'],
    'axes.unicode_minus': False,
})
2.查看数据
# Switch into the data directory so the CSV can be read by its bare file name.
data_dir = r'E:\workSpace\dataProject\customer-segmentation-tutorial-in-python\data'
os.chdir(data_dir)
df = pd.read_csv('Mall_Customers.csv')
df.head()
- 字段含义:ID(顾客编号)、Gender(性别)、Age(年龄)、Annual Income (k$)(年收入,单位:千美元)、Spending Score (1-100)(商场根据顾客行为和消费性质分配的消费分数,取值1-100)
# Summary statistics: numeric columns first, then the object (string) columns,
# followed by the dtype / non-null overview.
df.describe()
df.describe(include='object')
df.info()

# Strip the unit suffixes from the two long column names so later code can
# refer to them as 'Annual Income' and 'Spending Score'.
short_names = {
    'Annual Income (k$)': 'Annual Income',
    'Spending Score (1-100)': 'Spending Score',
}
df.rename(columns=short_names, inplace=True)
df.describe()
3.异常值处理
# One box plot per numeric column, laid out side by side in a 1x3 grid.
for position, column in enumerate(['Age', 'Annual Income', 'Spending Score'], start=1):
    plt.subplot(1, 3, position)
    df[column].plot(kind='box')
- 从箱线图可以看出 Annual Income 存在偏大的异常值,下面按 IQR 规则(上限 = Q3 + 1.5×IQR)找出这些异常值。
# Upper fence for 'Annual Income': Q3 + 1.5 * (Q3 - Q1).
income_q1 = df['Annual Income'].quantile(0.25)
income_q3 = df['Annual Income'].quantile(0.75)
income_q1, income_q3
# Interquartile range; the original variable name IDR is kept so that any
# later cell referring to it still works.
IDR = income_q3 - income_q1
income_max = income_q3 + 1.5 * IDR
income_max
# Replace values above the fence with the column median.
# The previous form `df[mask]['Annual Income'] = ...` assigned into a temporary
# copy produced by the boolean filter, so `df` itself was never modified.
# `Series.mask` builds the corrected column in one step and the result is
# written back with a single assignment.
income_median = df['Annual Income'].median()
df['Annual Income'] = df['Annual Income'].mask(
    df['Annual Income'] > income_max, income_median
)
df.head()
4.数据可视化
4.1.性别分布
# Bar chart of the number of customers per gender.
# Removed from the original cell: the IPython help query `plt.plot?` (not valid
# Python outside IPython) and a leftover title from the age chart that was
# immediately overwritten by the correct one.
df['Gender'].value_counts().plot(kind='bar')
plt.legend(loc='best')
plt.title('性别分布')
plt.show()
- 客户主要是女性
4.2.年龄段分布
# Bucket ages into 10-year intervals with edges 0, 10, ..., 90 and count the
# customers in each interval, ordered by interval.
age_bins = list(range(0, 100, 10))
age_bucket = pd.cut(df['Age'], bins=age_bins).rename('Age_range')
df_age = age_bucket.value_counts().sort_index()
df_age

df_age.plot(kind='bar')
plt.xlabel('age')
plt.legend(loc='best')
plt.title('年龄段分布')
plt.show()
- 年龄段主要分布在20-40岁
4.3.收入分布
# Interactive histogram of the 'Annual Income' column.
px.histogram(df,x='Annual Income')
- 收入主要分布在40-80K$
4.4.消费分数分布
# Interactive histogram of the 'Spending Score' column.
px.histogram(df,x='Spending Score')
- 消费分数在40-60,70-80的人数较多
4.5.收入与消费分数关系
# Two side-by-side panels for income vs. spending score.
fig, (left_ax, right_ax) = plt.subplots(1, 2, figsize=(16, 6))
# Left: raw points coloured by gender.
sns.scatterplot(x='Annual Income', y='Spending Score', hue='Gender', data=df, ax=left_ax)
# Right: the same points with a fitted linear-regression line.
sns.regplot(x='Annual Income', y='Spending Score', data=df, ax=right_ax)
plt.show()
4.6.年龄与消费分数的关系
# Two side-by-side panels for age vs. spending score.
fig, (left_ax, right_ax) = plt.subplots(1, 2, figsize=(16, 6))
# Left: raw points coloured by gender.
sns.scatterplot(x='Age', y='Spending Score', hue='Gender', data=df, ax=left_ax)
# Right: the same points with a fitted linear-regression line.
sns.regplot(x='Age', y='Spending Score', data=df, ax=right_ax)
plt.show()
5.数据标准化
df.head()
# Standardize every column from position 2 onward and wrap the resulting array
# back into a DataFrame with the three feature names.
# NOTE(review): assumes the first two columns are the ID and gender and that
# exactly three numeric columns follow - confirm against the CSV.
feature_names = ['Age', 'Annual Income', 'Spending Score']
scaled_values = StandardScaler().fit_transform(df.iloc[:, 2:])
new_df = pd.DataFrame(scaled_values, columns=feature_names)
new_df.head()
6.根据收入、消费分数分群
# Feature matrix (NumPy array) of standardized income and spending score.
X1 = new_df[['Annual Income', 'Spending Score']].to_numpy()
6.1.Kmeans肘部法则:K值选择优化
def show_n_clusters(X_values, max_clusters=14, random_state=None):
    """Plot the K-Means elbow curve (inertia against number of clusters).

    Args:
        X_values: 2-D array-like of samples, passed unchanged to ``KMeans.fit``.
        max_clusters: Largest number of clusters to try (inclusive). The
            default of 14 reproduces the previously hard-coded range 1..14.
            It is capped at the number of samples because K-Means cannot
            form more clusters than there are samples.
        random_state: Optional seed forwarded to ``KMeans``. ``None`` keeps
            the original, unseeded behaviour.
    """
    upper = min(max_clusters, len(X_values))
    cluster_counts = list(range(1, upper + 1))
    # inertia_ is the within-cluster sum of squared distances of the fitted model.
    Error = [
        KMeans(n_clusters=k, random_state=random_state).fit(X_values).inertia_
        for k in cluster_counts
    ]
    # Draw the points as markers and again as a connecting line.
    plt.plot(cluster_counts, Error, 'x')
    plt.plot(cluster_counts, Error, '-')
    plt.xlabel('n_clusters')
    plt.ylabel('Error')
    plt.title('Elbow method')
    plt.show()
# Elbow curve for the standardized income / spending-score features.
show_n_clusters(X1)
- n_clusters取值5
6.2.KMeans分群
def show_groups(df, cols, n_clusters=1, random_state=None):
    """Cluster the selected columns with K-Means, plot the result, return labels.

    Args:
        df: DataFrame containing the feature columns.
        cols: List of two or three column names to cluster on. Two columns
            produce a 2-D scatter plot with the cluster centres overlaid;
            three columns produce a 3-D scatter plot. Any other length
            clusters without drawing points.
        n_clusters: Number of clusters to form.
        random_state: Optional seed forwarded to ``KMeans``. ``None`` keeps
            the original, unseeded behaviour.

    Returns:
        Array with the cluster index assigned to each row.
    """
    features = df.loc[:, cols]
    X_values = features.values
    # Fit once: the original called .fit() and then .fit_predict(), which
    # trained the model twice and discarded the first result.
    kmeans_new = KMeans(n_clusters=n_clusters, random_state=random_state)
    y_predict = kmeans_new.fit_predict(X_values)
    f1 = kmeans_new.cluster_centers_
    if len(cols) == 2:
        sns.scatterplot(data=features, x=cols[0], y=cols[1], hue=y_predict)
        # Overlay the cluster centres on the same axes.
        sns.scatterplot(x=f1[:, 0], y=f1[:, 1])
    elif len(cols) == 3:
        ax = plt.axes(projection='3d')
        # The second column goes on the x axis and the first on the y axis;
        # the axis labels below follow the same order.
        zdata = X_values[:, 2]
        xdata = X_values[:, 1]
        ydata = X_values[:, 0]
        ax.scatter3D(xdata, ydata, zdata, c=y_predict)
        ax.set_xlabel(cols[1])
        ax.set_ylabel(cols[0])
        ax.set_zlabel(cols[2])
    plt.show()
    return y_predict
# Cluster on standardized income and spending score with 5 clusters and keep
# the per-row cluster labels.
y_predict1=show_groups(new_df,['Annual Income','Spending Score'],n_clusters=5)
y_predict1
7.根据年龄、消费分数分群
# Feature matrix (NumPy array) of standardized age and spending score.
X2 = new_df[['Age', 'Spending Score']].to_numpy()
7.1.Kmeans肘部法则:K值选择优化
# Elbow curve for the standardized age / spending-score features.
show_n_clusters(X2)
- n_clusters取值5
7.2.KMeans分群
# Cluster on standardized age and spending score with 5 clusters and keep the
# per-row cluster labels.
y_predict2=show_groups(new_df,['Age','Spending Score'],n_clusters=5)
y_predict2
8.根据收入、年龄与消费分数分群
# Feature matrix (NumPy array) of all standardized columns in new_df.
X3 = new_df.to_numpy()
8.1.Kmeans肘部法则:K值选择优化
# Elbow curve for all standardized features together.
show_n_clusters(X3)
- n_clusters取值6
8.2.KMeans分群
# Cluster on age, income and spending score with 6 clusters; three columns
# select the 3-D scatter branch of show_groups.
show_groups(new_df,['Age','Annual Income','Spending Score'],n_clusters=6)