section8

Identifying the target user group with KMeans and principal component analysis
This post aims to identify the target user group so as to better serve existing users. It covers plotting, database operations, and batch file reading; uses groupby() and agg() together to aggregate users' monthly spending and logins; applies KMeans to split users into heavy, medium, and light tiers; and performs a principal component analysis for dimensionality reduction, discussing how preprocessing affects the clustering.

The goal of this section is to identify the target user group, in order to better serve existing users.

Key points

1. Plotting

  • Displaying Chinese characters

plt.rcParams['font.sans-serif'] = ['SimHei']  # step 1: use a font that contains Chinese glyphs
plt.rcParams['axes.unicode_minus'] = False    # step 2: fix the minus sign on negative axis ticks

2. Database operations

  • The sqlalchemy engine

engine = create_engine('mysql+pymysql://root:123456@localhost:3306/datascience')

3. Reading files in batch

  • Usage of os.walk() and os.path.join()

frames = []
for root, dirs, files in os.walk(path):
    for file in files:
        rfile = os.path.join(root, file)
        if rfile.split('.')[-1] == 'tsv':
            frames.append(pd.read_csv(rfile, sep='\t'))
df = pd.concat(frames)

4. Combining groupby() and agg() to apply different functions to different columns

  • Monthly aggregation

affc = {'payment':'sum', 'log_date':'count'}
dfm = df.groupby(['log_month', 'user_id']).agg(affc).reset_index()

  • Renaming columns

renam = {'log_date':'access_days'}
dfm.rename(columns=renam, inplace=True)

5. Using KMeans clustering

  • Clustering a single column (reshape it into a single-column array with .values.reshape(-1, 1))

from sklearn.cluster import KMeans
a47 = action['A47'].values.reshape(-1, 1)
kms = KMeans(n_clusters=3).fit(a47)

  • The labels_ attribute holds the cluster label of each sample

cluster = kms.labels_

  • Append the labels to the source data and inspect the groups with groupby()

action['cluster'] = cluster
action.groupby(['cluster'])['user_id'].count()

  • Visualizing the groups
snsdf = action[['user_id','A47','cluster']].sort_values(by='A47',ascending=False)
plt.figure(figsize=(8,5))
snsdf1 = snsdf.reset_index()
snsdf1[snsdf1['cluster']==2]['A47'].plot(color='r',label='2: heavy users')
snsdf1[snsdf1['cluster']==1]['A47'].plot(color='g',label='1: medium users')
snsdf1[snsdf1['cluster']==0]['A47'].plot(color='b',label='0: light users')
plt.legend()
plt.xlabel('user distribution')
plt.ylabel('ranking score')

6. Principal component analysis

  • Data preprocessing

    • Extract the columns for the PCA
      paction = acc.iloc[:,3:(len(acc.columns)-1)]
    • Drop columns that are mostly zeros
      cc = paction[paction==0].count(axis=0)/len(paction)
      cc.plot()
      dd = cc[cc<.9]          # drop columns where 90% or more of the values are 0
      paction = paction[dd.index]
      paction.head()
    • Drop strongly correlated columns

      # Data overview
      corp = paction.corr()
      sns.heatmap(corp)
      mask = np.array(corp)
      mask[np.tril_indices_from(mask)] = False        # trick for drawing a lower-triangle heatmap
      sns.heatmap(corp,mask=mask)
      
      # Drop strongly correlated columns via the lower-triangle matrix
      coll = corp.columns
      corp = pd.DataFrame(np.tril(corp, -1))         # np.tril(m, -1) keeps the lower triangle and zeroes everything above it
      corp.columns = coll
      pac2 = paction.loc[:,(corp.abs()<.8).all()]      # all(): keep columns where every entry is below 0.8
      pac2.head()
    • Run the principal component analysis

      from sklearn.decomposition import PCA
      pca = PCA()
      pca.fit(pac2)
      
      redio = pca.explained_variance_ratio_          # explained variance ratio of each component after the PCA
      print(redio) 
      print(pca.singular_values_)                # singular_values_ holds the singular values
    • Cumulative explained-variance curve of the components

      recu = redio.cumsum()                     # cumsum() accumulates the ratios one by one
      plt.plot(recu)
    • Get the dimension-reduced data for the next step

      pca.set_params(n_components=10)              # keep 10 components
      pac3 = pd.DataFrame(pca.fit_transform(pac2))     # fit_transform() fits and returns the reduced data
      pac3.head()
    • Apply KMeans again to cluster all users, then average every behavior column over the users of each cluster (see the sketch after this list)
    • Apply the correlation filter again to drop strongly correlated columns, leaving the final key indicators
    • Present the key indicators on a radar chart

      # First, standardize the data
      from sklearn.preprocessing import scale
      ccccc = pd.DataFrame(scale(cccc))
      ccccc.columns = cccc.columns
      
      # Plot
      plt.figure(figsize=(8,8))                  
      N = ccccc.shape[1]                      # number of polar sectors
      angles = np.linspace(0, 2*np.pi, N, endpoint=False)    # angles that split the circle evenly
      angles = np.concatenate((angles,[angles[0]]))   # close the loop of the radar chart
      for i in range(len(ccccc)):
          values = ccccc.loc[i,:]              # one cluster's values
          values = np.concatenate((values,[values[0]]))     # close the loop
          plt.polar(angles, values, 'o-', linewidth=2)      # draw
      plt.legend(ccccc.index, loc='lower right')
      plt.thetagrids(angles * 180/np.pi, labels=list(ccccc.columns))    # polar tick labels
      plt.title('Radar chart of key indicators')
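The two bullets without code above (cluster the reduced data and average per cluster, then filter correlated columns again) are implemented in full later in this post. As a minimal forward sketch, using the pac2 and pac3 names the walkthrough below defines:

from sklearn.cluster import KMeans
import numpy as np
import pandas as pd

# Cluster the PCA-reduced matrix, then average every behavior column per cluster
km = KMeans(n_clusters=5).fit(pac3)
pac4 = pac2.copy()
pac4['cluster'] = km.labels_
clu_means = pac4.groupby('cluster').mean()

# Lower-triangle trick again: keep one column of each highly correlated pair
tri = pd.DataFrame(np.tril(clu_means.corr(), -1), columns=clu_means.columns)
key_metrics = clu_means.loc[:, (tri.abs() < .95).all()]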
      

I. Library imports and displaying Chinese in matplotlib

import pandas as pd
import numpy as np
import pymysql
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import os

plt.rcParams['font.sans-serif'] = ['SimHei'] # step 1: use a font that contains Chinese glyphs
plt.rcParams['axes.unicode_minus'] = False   # step 2: fix the minus sign on negative axis ticks
%matplotlib inline

The database engine

engine = create_engine('mysql+pymysql://root:123456@localhost:3306/datascience')

II. Reading the files in batch

def read_files(path):
    # Walk the directory tree and concatenate every .tsv file into one DataFrame
    # (pd.concat replaces the long-deprecated df.append; behavior is identical)
    frames = []
    for root, dirs, files in os.walk(path):
        for file in files:
            rfile = os.path.join(root, file)
            if rfile.split('.')[-1] == 'tsv':
                frames.append(pd.read_csv(rfile, sep='\t'))
    return pd.concat(frames)
action_path  = 'data/sample-data/section8/daily/action/'
dau_path = 'data/sample-data/section8/daily/dau/'
dpu_path = 'data/sample-data/section8/daily/dpu/'

action = read_files(action_path)
dau = read_files(dau_path)
dpu = read_files(dpu_path)

Check data completeness and preview the head

print(action.isnull().sum().sum())
print(action.shape)
# print(action.info())
action.head()
0
(2653, 57)
[action.head() output: 5 rows × 57 columns (log_date, app_name, user_id, A1 … A54)]

print(dau.isnull().sum().sum())
print(dau.shape)
print(dau.info())
dau.head()
0
(509754, 3)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 509754 entries, 0 to 2410
Data columns (total 3 columns):
log_date    509754 non-null object
app_name    509754 non-null object
user_id     509754 non-null int64
dtypes: int64(1), object(2)
memory usage: 15.6+ MB
None
     log_date app_name  user_id
0  2013-05-01  game-01   608801
1  2013-05-01  game-01   712453
2  2013-05-01  game-01   776853
3  2013-05-01  game-01   823486
4  2013-05-01  game-01   113600
print(dpu.isnull().sum().sum())
print(dpu.shape)
print(dpu.info())
dpu.head()
0
(3532, 4)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3532 entries, 0 to 7
Data columns (total 4 columns):
log_date    3532 non-null object
app_name    3532 non-null object
user_id     3532 non-null int64
payment     3532 non-null int64
dtypes: int64(2), object(2)
memory usage: 138.0+ KB
None
[dpu.head() output: 5 rows × 4 columns (log_date, app_name, user_id, payment); five 2013-05-01 game-01 purchase rows]
# Write to the database

# action.to_sql('s8_action', engine, index=False)
# dau.to_sql('s8_dau', engine, index=False)
# dpu.to_sql('s8_dpu', engine, index=False)
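Once written, later sessions can skip the slow file walk and pull the tables straight from MySQL. A small sketch using the same engine, with the table names from the commented calls above:

# Read the persisted tables back from MySQL instead of re-walking the tsv files
action = pd.read_sql('s8_action', engine)
dau = pd.read_sql('s8_dau', engine)
dpu = pd.read_sql('s8_dpu', engine)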

III. Data preprocessing

1. Merge DAU and DPU

df = pd.merge(dau, dpu[['log_date','user_id','payment']], how='left', on=['user_id','log_date'])
df.head()
     log_date app_name  user_id  payment
0  2013-05-01  game-01   608801      NaN
1  2013-05-01  game-01   712453      NaN
2  2013-05-01  game-01   776853      NaN
3  2013-05-01  game-01   823486      NaN
4  2013-05-01  game-01   113600      NaN
# Set payment to 0 for users with no purchase record
print(df.payment.isnull().sum())
df['payment'].fillna(0, inplace=True)
print(df.payment.isnull().sum())
507151
0
# Add a paying-user flag
df['is_pay'] = df['payment'].apply( lambda x: 1 if x>0 else 0 )
df.head()
     log_date app_name  user_id  payment  is_pay
0  2013-05-01  game-01   608801      0.0       0
1  2013-05-01  game-01   712453      0.0       0
2  2013-05-01  game-01   776853      0.0       0
3  2013-05-01  game-01   823486      0.0       0
4  2013-05-01  game-01   113600      0.0       0

2. Monthly aggregation

# Add a month column
df['log_month'] = df['log_date'].apply(lambda x: x[0:7])
df.head()
     log_date app_name  user_id  payment  is_pay log_month
0  2013-05-01  game-01   608801      0.0       0   2013-05
1  2013-05-01  game-01   712453      0.0       0   2013-05
2  2013-05-01  game-01   776853      0.0       0   2013-05
3  2013-05-01  game-01   823486      0.0       0   2013-05
4  2013-05-01  game-01   113600      0.0       0   2013-05

Using groupby and agg together, compute each user's monthly spending and number of active days.

# aggregate by month
affc = {'payment':'sum', 'log_date':'count'}
dfm = df.groupby(['log_month', 'user_id']).agg(affc).reset_index()
# rename the count column
renam = {'log_date':'access_days'}
dfm.rename(columns=renam, inplace=True)
dfm.head()
  log_month  user_id  payment  access_days
0   2013-05       65      0.0            1
1   2013-05      115      0.0            1
2   2013-05      194      0.0            1
3   2013-05      426      0.0            4
4   2013-05      539      0.0            1

3. Use KMeans to classify users and find the top-ranked ones: heavy / medium / light users

Column A47 is the ranking score. The distribution shows that most users score very low, a heavy-tailed, power-law-like shape.

action['A47'].hist(bins=50, figsize=(6,4))
<matplotlib.axes._subplots.AxesSubplot at 0x1c21d894240>

png

sns.distplot(action['A47'],bins=50,kde=True)
<matplotlib.axes._subplots.AxesSubplot at 0x1c21af07a58>

png
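The heavy tail is easier to confirm with a log-scaled count axis, where the sparse high-score bins stay visible; a quick sketch:

# Log-scale the counts: a power-law-like tail decays slowly instead of vanishing
action['A47'].hist(bins=50, log=True, figsize=(6,4))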

Cluster column A47 into 3 groups
from sklearn.cluster import KMeans

a47 = action['A47'].values.reshape(-1, 1)

kms = KMeans(n_clusters=3).fit(a47)
cluster = kms.labels_
kms.cluster_centers_
array([[  9359.84787792],
       [ 69386.11297071],
       [185857.17948718]])
action['cluster'] = cluster
action.head()
[action.head() output: 5 rows × 58 columns, the original 57 columns plus the new cluster column]

action.groupby(['cluster'])['user_id'].count()
cluster
0    2096
1     479
2      78
Name: user_id, dtype: int64

The clustering splits users into 3 groups: 0 are light users with the lowest ranking scores; 1 are medium users with mid-range scores; 2 are heavy users with high scores and, as expected, the smallest headcount.
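One caveat: KMeans assigns cluster ids arbitrarily, so a rerun may hand out 0/1/2 in a different order. A hedged sketch (the argsort mapping is an addition, not part of the original post) that ties the tiers to the center values instead:

import numpy as np

# Rank cluster ids by center value so the tier mapping survives re-runs
order = np.argsort(kms.cluster_centers_.ravel())   # cluster ids, ascending by center
tiers = {order[0]: 'light', order[1]: 'medium', order[2]: 'heavy'}
print(tiers)   # for the centers printed above this gives {0: 'light', 1: 'medium', 2: 'heavy'}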

snsdf = action[['user_id','A47','cluster']].sort_values(by='A47',ascending=False)
snsdf['user'] = range(len(snsdf))
sns.scatterplot(x='user',y='A47',hue='cluster',data=snsdf, palette='rainbow', alpha=.2)
<matplotlib.axes._subplots.AxesSubplot at 0x1c21b9bf898>

png

snsdf = action[['user_id','A47','cluster']].sort_values(by='A47',ascending=False)
snsdf['user'] = range(len(snsdf))

plt.figure(figsize=(8,5))
snsdf1 = snsdf.reset_index()
snsdf1[snsdf1['cluster']==2]['A47'].plot(color='r',label='2: heavy users')
snsdf1[snsdf1['cluster']==1]['A47'].plot(color='g',label='1: medium users')
snsdf1[snsdf1['cluster']==0]['A47'].plot(color='b',label='0: light users')
plt.legend()
plt.xlabel('user distribution')
plt.ylabel('ranking score')
Text(0,0.5,'ranking score')

png

Keep only the top-ranked users, i.e., the higher-scoring heavy and medium users, for the analysis that follows
acc = action[action['cluster']>=1]
acc.head()
[acc.head() output: 5 rows × 58 columns, the heavy- and medium-user rows of action]

4. Principal component analysis

Extract the key feature columns

paction = acc.iloc[:,3:(len(acc.columns)-1)]
paction.index=acc.user_id
paction.head()
[paction.head() output: 5 rows × 54 columns (A1 … A54), indexed by user_id]

1. Drop columns that are mostly zeros
cc = paction[paction==0].count(axis=0)/len(paction)
print(cc.head())
cc.plot()
A1    1.000000
A2    0.926391
A3    1.000000
A4    0.994614
A5    0.055655
dtype: float64





<matplotlib.axes._subplots.AxesSubplot at 0x1c21bbb1470>

png

# cc[cc>.8]
dd = cc[cc<.95]
paction = paction[dd.index]
paction.head()
[paction.head() output: 5 rows × 32 columns, the zero-heavy columns removed]

2. Drop strongly correlated columns
corp = paction.corr()
plt.figure(figsize=(15,8))
sns.heatmap(corp)
<matplotlib.axes._subplots.AxesSubplot at 0x1c21bc094a8>

png

Functions used to draw a lower-triangle heatmap

mask = np.array(corp)
mask[np.tril_indices_from(mask)] = False
fig,ax = plt.subplots()
fig.set_size_inches(15,8)
sns.heatmap(corp,mask=mask)
<matplotlib.axes._subplots.AxesSubplot at 0x1c21bc09400>

png

Take the lower triangle of the matrix with np.tril(m, -1); to take the upper triangle instead, use np.triu(m, 1)

coll = corp.columns
corp = pd.DataFrame(np.tril(corp, -1))
corp.columns = coll
corp.head()
[corp.head() output: 5 rows × 32 columns; zeros on and above the diagonal, pairwise correlations below it]
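Why the lower-triangle trick keeps exactly one column of each correlated pair: a column's lower-triangle entries are its correlations with the columns that come after it, so the earlier member of a pair fails the filter while the later one survives. A toy check with made-up values:

import numpy as np
import pandas as pd

# Hypothetical 3-column correlation matrix: 'a' and 'b' are highly correlated (0.9)
toy = pd.DataFrame(np.tril([[1.0, 0.9, 0.1],
                            [0.9, 1.0, 0.2],
                            [0.1, 0.2, 1.0]], -1),
                   columns=['a', 'b', 'c'])
# 'a' fails (its 0.9 with the later column 'b' sits in column 'a'); 'b' and 'c' pass
print((toy.abs() < .8).all())   # a: False, b: True, c: True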

pac2 = paction.loc[:,(corp.abs()<.7).all()]      # keep columns whose every correlation entry is below 0.7
pac2.head()
[pac2.head() output: the 16 surviving columns A2, A11, A12, A13, A20, A23, A24, A43, A44, A46, A48, A49, A50, A51, A53, A54, indexed by user_id]

Run the principal component analysis

from sklearn.decomposition import PCA
pca = PCA()
pca.fit(pac2)
PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)
redio = pca.explained_variance_ratio_
print(redio) 
print(pca.singular_values_)  
[9.97843804e-01 1.92024564e-03 1.20120771e-04 5.57014208e-05
 2.67905481e-05 1.54533752e-05 9.31262940e-06 4.38846214e-06
 3.02317261e-06 8.36725295e-07 1.31874979e-07 9.78197162e-08
 3.86464536e-08 2.94647596e-08 1.82272465e-08 7.54580333e-09]
[3.96183910e+04 1.73797668e+03 4.34684952e+02 2.96004755e+02
 2.05284590e+02 1.55911168e+02 1.21032418e+02 8.30848288e+01
 6.89599635e+01 3.62791414e+01 1.44027941e+01 1.24044853e+01
 7.79687146e+00 6.80796010e+00 5.35458829e+00 3.44523057e+00]
recu = redio.cumsum()
print(recu)
x = np.arange(len(recu))
plt.plot(recu, color='r')
[0.9978438  0.99976405 0.99988417 0.99993987 0.99996666 0.99998212
 0.99999143 0.99999582 0.99999884 0.99999968 0.99999981 0.99999991
 0.99999994 0.99999997 0.99999999 1.        ]





[<matplotlib.lines.Line2D at 0x1c21dadada0>]

png

Get the reduced data
pca.set_params(n_components=10)
pac3 = pd.DataFrame(pca.fit_transform(pac2))
pacsse = pac3.copy()
pac3.head()
             0           1          2          3          4          5         6         7         8         9
0  2706.266005 -100.824346  -1.874787  -1.577536  12.481591  -2.394320  9.770878  7.807535  0.021273 -2.169596
1  2373.811140  147.314930 -16.386795  -8.428655  10.019577  -3.004725  6.009771  0.961469 -1.598531  2.144615
2 -1171.733361   -5.493081   0.744995   0.542033  -0.785251  -5.756412 -1.012336 -1.778067  7.256884  0.343277
3 -2738.903900  -50.468487   2.328491   2.965415  -5.794347  11.891289  2.965366 -1.182413  0.065619  1.245358
4 -1493.642618   58.686385 -10.807612  11.777973   7.664692   9.312968  4.376429  1.994214 -1.568050  0.426246

5. KMeans clustering

from sklearn.cluster import KMeans

km = KMeans(n_clusters=5)
km.fit(pac3)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=5, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)
clu = km.labels_
pac3['clu'] = clu
pac3.head()
[pac3.head() output: the 10 components as above, plus the new clu column = 0, 0, 1, 4, 1]
pac3.groupby('clu')[2].count()
clu
0     90
1    113
2    122
3    109
4    123
Name: 2, dtype: int64

Palette color styles:
Accent, Accent_r, Blues, Blues_r, BrBG, BrBG_r, BuGn, BuGn_r, BuPu, BuPu_r, CMRmap, CMRmap_r, Dark2, Dark2_r, GnBu, GnBu_r, Greens, Greens_r, Greys, Greys_r, OrRd, OrRd_r, Oranges, Oranges_r, PRGn, PRGn_r, Paired, Paired_r, Pastel1, Pastel1_r, Pastel2, Pastel2_r, PiYG, PiYG_r, PuBu, PuBuGn, PuBuGn_r, PuBu_r, PuOr, PuOr_r, PuRd, PuRd_r, Purples, Purples_r, RdBu, RdBu_r, RdGy, RdGy_r, RdPu, RdPu_r, RdYlBu, RdYlBu_r, RdYlGn, RdYlGn_r, Reds, Reds_r, Set1, Set1_r, Set2, Set2_r, Set3, Set3_r, Spectral, Spectral_r, Vega10, Vega10_r, Vega20, Vega20_r, Vega20b, Vega20b_r, Vega20c, Vega20c_r, Wistia, Wistia_r, YlGn, YlGnBu, YlGnBu_r, YlGn_r, YlOrBr, YlOrBr_r, YlOrRd, YlOrRd_r, afmhot, afmhot_r, autumn, autumn_r, binary, binary_r, bone, bone_r, brg, brg_r, bwr, bwr_r, cool, cool_r, coolwarm, coolwarm_r, copper, copper_r, cubehelix, cubehelix_r, flag, flag_r, gist_earth, gist_earth_r, gist_gray, gist_gray_r, gist_heat, gist_heat_r, gist_ncar, gist_ncar_r, gist_rainbow, gist_rainbow_r, gist_stern, gist_stern_r, gist_yarg, gist_yarg_r, gnuplot, gnuplot2, gnuplot2_r, gnuplot_r, gray, gray_r, hot, hot_r, hsv, hsv_r, icefire, icefire_r, inferno, inferno_r, jet, jet_r, magma, magma_r, mako, mako_r, nipy_spectral, nipy_spectral_r, ocean, ocean_r, pink, pink_r, plasma, plasma_r, prism, prism_r, rainbow, rainbow_r, rocket, rocket_r, seismic, seismic_r, spectral, spectral_r, spring, spring_r, summer, summer_r, tab10, tab10_r, tab20, tab20_r, tab20b, tab20b_r, tab20c, tab20c_r, terrain, terrain_r, viridis, viridis_r, vlag, vlag_r, winter, winter_r

plt.figure(figsize=(13,7))
sns.scatterplot(x=0, y=1, data=pac3,style='clu',hue='clu', palette='autumn')
<matplotlib.axes._subplots.AxesSubplot at 0x1c21db35438>

png

Append the cluster labels to the original data

pac4 = pac2.copy()
pac4['cluster'] = list(pac3.clu)
pac4.head()
[pac4.head() output: the 16 pac2 columns plus the new cluster column = 0, 0, 1, 4, 1]
# mean of every column per cluster
clu5 = pac4.groupby('cluster').mean()
# drop the highly correlated column A53
clu5.drop(columns='A53',inplace=True)
c5cor = clu5.corr()
plt.figure(figsize=(15,8))
sns.heatmap(c5cor,annot=True)
<matplotlib.axes._subplots.AxesSubplot at 0x1c21d92a780>

png

ccrp = pd.DataFrame(np.tril(c5cor,-1))
ccrp.columns = clu5.columns
cccc = clu5.loc[:,(ccrp.abs()<.95).all()]
cccc
               A2       A20       A23       A24       A44        A46        A50       A51         A54
cluster
0        0.022222  0.322222  0.655556  0.167691  0.858193  27.600000  10.666667  2.011111  166.711111
1        0.079646  0.274336  0.362832  0.095231  0.844027  20.159292   3.008850  1.469027  102.106195
2        0.073770  0.377049  0.336066  0.070628  0.849343  24.737705   4.286885  1.844262  121.909836
3        0.018349  0.229358  0.284404  0.098252  0.845981  24.119266   5.266055  1.733945  146.871560
4        0.203252  0.292683  0.243902  0.063686  0.775076  18.983740   2.130081  0.975610   84.032520
from sklearn.preprocessing import scale

ccccc = pd.DataFrame(scale(cccc))
ccccc.columns = cccc.columns
ccccc
         A2       A20       A23       A24       A44       A46       A50       A51       A54
0 -0.855590  0.468859  1.918400  1.862020  0.785882  1.422970  1.867773  1.118457  1.424282
1  0.002962 -0.503392 -0.094337 -0.104961  0.315530 -0.940402 -0.688647 -0.381093 -0.746672
2 -0.084884  1.582038 -0.278379 -0.772826  0.492038  0.513827 -0.261998  0.656909 -0.081200
3 -0.913505 -1.416613 -0.633601 -0.022944  0.380387  0.317394  0.064879  0.351742  0.757602
4  1.851016 -0.130892 -0.912083 -0.961289 -1.973837 -1.313789 -0.982007 -1.746015 -1.354012
plt.figure(figsize=(8,8))
# number of polar sectors
N = ccccc.shape[1]
# angles that split the circle evenly
angles = np.linspace(0, 2*np.pi, N, endpoint=False)
# close the loop of the radar chart
angles = np.concatenate((angles,[angles[0]]))
for i in range(len(ccccc)):
    # one cluster's values
    values = ccccc.loc[i,:]
    # close the loop
    values = np.concatenate((values,[values[0]]))
    # draw
    plt.polar(angles, values, 'o-', linewidth=2)
plt.legend(ccccc.index, loc='lower right')
# polar tick labels
plt.thetagrids(angles * 180/np.pi, labels=list(ccccc.columns))
plt.title('Radar chart of key indicators')
Text(0.5,1.05,'Radar chart of key indicators')

png

Dimensionality reduction without preprocessing

dfp = acc.iloc[:,3:(len(acc.columns)-1)]
dfp.index=acc.user_id
dfp.head()
[dfp.head() output: 5 rows × 54 columns (A1 … A54), indexed by user_id]

from sklearn.decomposition import PCA

pca = PCA(whiten=False)
pca.fit(dfp)
PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)
retio = pca.explained_variance_ratio_
# print(retio) 
# print(pca.singular_values_)  

rec = retio.cumsum()
print(rec)
x = np.arange(len(rec))
plt.plot(rec, color='r')
[0.9996008  0.99995245 0.99997489 0.99999016 0.9999933  0.99999564
 0.99999759 0.99999838 0.99999897 0.9999995  0.99999962 0.99999972
 0.99999979 0.99999986 0.9999999  0.99999993 0.99999996 0.99999997
 0.99999997 0.99999998 0.99999998 0.99999999 0.99999999 0.99999999
 0.99999999 1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.        ]





[<matplotlib.lines.Line2D at 0x1c21f406780>]

png

pca.set_params(n_components=10)
pacsse = pd.DataFrame(pca.fit_transform(dfp))
pacsse.head()
              0            1           2           3          4          5           6           7          8          9
0  94938.293061  -342.891655 -161.442878 -199.616210   1.830692  73.107938  153.124982  124.440657 -34.371612  46.548951
1  56613.313155  -960.580156  -38.560364  -45.836571  13.670166  90.767620 -145.846645  -40.255134  10.508203  16.287863
2 -31060.195159   388.005529   -6.932692   -0.948812  -5.332728  18.237293   11.393467   14.689011  -7.994909  32.398532
3 -45806.252443  1579.357883  -81.812845  -96.488345 -18.477649 -90.059217   31.377291  -22.865193 -19.724837  16.293640
4 -34963.135693   611.858506  -18.187490  -16.454233  -5.597209  -9.722257  -63.112236   -3.943266   7.222725 -10.889839

Finding the optimal K with the elbow method

from sklearn.cluster import KMeans
 
df_features = pacsse # the data to cluster
# choose k by SSE
SSE = []  # sum of squared errors for each k
for k in range(1,9):
    estimator = KMeans(n_clusters=k)  # build the clusterer
    estimator.fit(df_features)
    SSE.append(estimator.inertia_)
X = range(1,9)
plt.xlabel('k')
plt.ylabel('SSE')
plt.plot(X,SSE,'o-')
[<matplotlib.lines.Line2D at 0x1c2211cac50>]

png

For comparison, repeat the elbow search on standardized data; clearly, standardizing first is not appropriate here

df_features = pd.DataFrame(scale(pacsse)) 

SSE = []  
for k in range(1,9):
    estimator = KMeans(n_clusters=k) 
    estimator.fit(df_features)
    SSE.append(estimator.inertia_)
X = range(1,9)
plt.xlabel('k')
plt.ylabel('SSE')
plt.plot(X,SSE,'o-')
[<matplotlib.lines.Line2D at 0x1c2213bc438>]

png

km = KMeans(n_clusters=4)
km.fit(pacsse)
clu = km.labels_
pacsse['clu'] = clu
pacsse.head()
[pacsse.head() output: the 10 components as above, plus the new clu column = 2, 0, 1, 1, 1]
pacsse.groupby('clu')[2].count()
clu
0    153
1    344
2     54
3      6
Name: 2, dtype: int64
plt.figure(figsize=(13,7))
sns.scatterplot(x=0, y=1, data=pacsse,style='clu',hue='clu', palette='autumn')
<matplotlib.axes._subplots.AxesSubplot at 0x1c22118b668>

png

Clearly, clustering without preprocessing is problematic: the split runs almost entirely along the first principal component, which alone explains 99.96% of the variance, so the remaining components contribute almost nothing.
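A quick way to see this in numbers (an added check, not in the original): compare the spread of the first two components.

# Component 0's spread dwarfs component 1's, so KMeans distances are
# dominated by component 0 and the clusters split along it
print(pacsse[0].std(), pacsse[1].std())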

pac4 = pac2.copy()
pac4['cluster'] = list(pacsse.clu)
pac4.head()

clu5 = pac4.groupby('cluster').mean()
clu5.drop(columns='A53',inplace=True)
c5cor = clu5.corr()
plt.figure(figsize=(15,8))
sns.heatmap(c5cor,annot=True)
<matplotlib.axes._subplots.AxesSubplot at 0x1c22145a4e0>

png

ccrp = pd.DataFrame(np.tril(c5cor,-1))
ccrp.columns = clu5.columns
cccc = clu5.loc[:,(ccrp.abs()<.95).all()]
cccc
              A12       A20       A51         A54
cluster
0        3.398693  0.228758  1.810458  146.287582
1        1.938953  0.316860  1.433140  101.531977
2        4.592593  0.407407  1.870370  169.777778
3        2.166667  0.166667  1.666667  213.833333
from sklearn.preprocessing import scale

ccccc = pd.DataFrame(scale(cccc))

ccccc.columns = cccc.columns
ccccc
        A12       A20       A51       A54
0  0.352533 -0.562784  0.684599 -0.285229
1 -1.021705  0.406288 -1.555764 -1.388557
2  1.476502  1.402249  1.040338  0.293858
3 -0.807330 -1.245753 -0.169173  1.379928
plt.figure(figsize=(8,8))
# number of polar sectors
N = ccccc.shape[1]
# angles that split the circle evenly
angles = np.linspace(0, 2*np.pi, N, endpoint=False)
# close the loop of the radar chart
angles = np.concatenate((angles,[angles[0]]))
for i in range(len(ccccc)):
    # one cluster's values
    values = ccccc.loc[i,:]
    # close the loop
    values = np.concatenate((values,[values[0]]))
    # draw
    plt.polar(angles, values, 'o-', linewidth=2)
plt.legend(ccccc.index, loc='lower right')
# polar tick labels
plt.thetagrids(angles * 180/np.pi, labels=list(ccccc.columns))
plt.title('Radar chart of key indicators')
Text(0.5,1.05,'Radar chart of key indicators')

png

Reposted from: https://www.cnblogs.com/cvlas/p/9537532.html
