建模1_数据绘制

该分析基于会员消费数据,通过数据预处理、分组和聚合操作,揭示了不同年龄段、性别与消费金额的关联。使用pandas库进行数据清洗和转换,创建了年龄区间,计算了各区间内的消费频次和总金额,并进行了可视化展示,揭示了消费模式的性别和年龄分布特点。
摘要由CSDN通过智能技术生成
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.options.display.max_rows= 10
import datetime
import matplotlib
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family']='sans-serif'
fec1 = pd.read_excel(r'C:\Users\Administrator\Desktop\c题\c附件\会员消费明细查询2016.xlsx')
#fec2 = pd.read_excel(r'C:\Users\Administrator\Desktop\c题\c附件\会员消费明细查询2017.xlsx')
fec1['date'] = pd.to_datetime(fec1['消费时间'])
#fec2['date'] = pd.to_datetime(fec2['消费时间'])
fec1['month']=fec1['date'].apply(lambda x:x.month)
#fec2['month']=fec2['date'].apply(lambda x:x.month+12)
fec1['month']=fec1['date'].apply(lambda x:x.month)
#fec2['month']=fec2['date'].apply(lambda x:x.month+12)
会员卡号卡类型会员编号性别年龄单据号消费时间商铺编码消费金额消费积分datemonth
08088001625白金卡MN2015030719815942585048022016/12/31 21:58:52112238.02382016-12-31 21:58:5212
18088012005白金卡MN201404301665713360512016123100762016/12/31 21:38:43605-11436.014362016-12-31 21:38:4312
28086102716VIP卡MN2013121014989363585036112016/12/31 21:21:44112200.02002016-12-31 21:21:4412
38086232983VIP卡MN2013122215249146585035022016/12/31 21:20:3711242.8422016-12-31 21:20:3712
48088001625白金卡MN2015030719815942585034222016/12/31 21:18:50112150.01502016-12-31 21:18:5012
.......................................
7364288086129227VIP卡MN2012123104100241B1242016010121986722016/1/1 9:56:09B124164.0492016-01-01 09:56:091
7364298086223710VIP卡MN2015040719923929B1242016010121668472016/1/1 9:56:09B12484.0252016-01-01 09:56:091
7364308086226933VIP卡MN20130727107860337082016010100432016/1/1 9:56:0970894.0942016-01-01 09:56:091
7364318086233209VIP卡MN20131212150274697082016010100362016/1/1 9:56:09708181.01812016-01-01 09:56:091
7364328086227574VIP卡MN2013090111804539B1242016011122018122016/1/1 0:00:00B12422.062016-01-01 00:00:001

736433 rows × 12 columns

#fec = pd.concat([fec1,fec2])
#fec['date'] = pd.to_datetime(fec['消费时间'])
#fec['month'] = fec['date'].apply(lambda x: x.month)
#fec
fec1.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 736433 entries, 0 to 736432
Data columns (total 12 columns):
会员卡号     736433 non-null object
卡类型      736433 non-null object
会员编号     736433 non-null object
性别       731993 non-null object
年龄       736433 non-null int64
单据号      736433 non-null object
消费时间     736433 non-null object
商铺编码     736433 non-null object
消费金额     736433 non-null float64
消费积分     736433 non-null int64
date     736433 non-null datetime64[ns]
month    736433 non-null int64
dtypes: datetime64[ns](1), float64(1), int64(3), object(7)
memory usage: 67.4+ MB
bins=np.array([0,10,20,30,40,50,60,70,80,90,100,120])#出资额分组
fec1['年龄段']=pd.cut(fec1['年龄'],bins,labels=[1,2,3,4,5,6,7,8,9,10,11])#年龄分段
fec1.corr(method='spearman')
fec1
会员卡号卡类型会员编号性别年龄单据号消费时间商铺编码消费金额消费积分datemonth年龄段
08088001625白金卡MN2015030719815942585048022016/12/31 21:58:52112238.02382016-12-31 21:58:52125
18088012005白金卡MN201404301665713360512016123100762016/12/31 21:38:43605-11436.014362016-12-31 21:38:43124
28086102716VIP卡MN2013121014989363585036112016/12/31 21:21:44112200.02002016-12-31 21:21:44127
38086232983VIP卡MN2013122215249146585035022016/12/31 21:20:3711242.8422016-12-31 21:20:37125
48088001625白金卡MN2015030719815942585034222016/12/31 21:18:50112150.01502016-12-31 21:18:50125
..........................................
7364288086129227VIP卡MN2012123104100241B1242016010121986722016/1/1 9:56:09B124164.0492016-01-01 09:56:0915
7364298086223710VIP卡MN2015040719923929B1242016010121668472016/1/1 9:56:09B12484.0252016-01-01 09:56:0913
7364308086226933VIP卡MN20130727107860337082016010100432016/1/1 9:56:0970894.0942016-01-01 09:56:0914
7364318086233209VIP卡MN20131212150274697082016010100362016/1/1 9:56:09708181.01812016-01-01 09:56:0917
7364328086227574VIP卡MN2013090111804539B1242016011122018122016/1/1 0:00:00B12422.062016-01-01 00:00:0014

736433 rows × 13 columns

unique_cands = fec1['会员编号'].unique()#求解会员编号有多少不同, 返回会员编号这一列所有的唯一值
len(unique_cands)
fec1['会员编号'].nunique()
unique_cands
array(['MN20150307198159', 'MN20140430166571', 'MN20131210149893', ...,
       'MN2015091900255', 'MN2015071000008', 'MN20130122046159'], dtype=object)
fec1.isnull().any() #判断是否有缺失
会员卡号     False
卡类型      False
会员编号     False
性别        True
年龄       False
         ...  
消费金额     False
消费积分     False
date     False
month    False
年龄段       True
Length: 13, dtype: bool

取需要的列dataframe[[],[]…]

data_1 = fec1[['会员编号','卡类型','性别','年龄','单据号','商铺编码','消费金额','date']]
data_1
会员编号卡类型性别年龄单据号商铺编码消费金额date
0MN20150307198159白金卡4258504802112238.02016-12-31 21:58:52
1MN20140430166571白金卡336051201612310076605-11436.02016-12-31 21:38:43
2MN20131210149893VIP卡6358503611112200.02016-12-31 21:21:44
3MN20131222152491VIP卡465850350211242.82016-12-31 21:20:37
4MN20150307198159白金卡4258503422112150.02016-12-31 21:18:50
...........................
736428MN20121231041002VIP卡41B124201601012198672B124164.02016-01-01 09:56:09
736429MN20150407199239VIP卡29B124201601012166847B12484.02016-01-01 09:56:09
736430MN20130727107860VIP卡3370820160101004370894.02016-01-01 09:56:09
736431MN20131212150274VIP卡69708201601010036708181.02016-01-01 09:56:09
736432MN20130901118045VIP卡39B124201601112201812B12422.02016-01-01 00:00:00

736433 rows × 8 columns

#bins=np.array([0,20,30,40,50,60,70,80,90,100])#出资额分组
bins=np.array([0,10,20,30,40,50,60,70,80,90,100,120])#出资额分组,取值范围左开右臂

def rfm_convert1(x):
    rfm_dict = {0:'R',
                2:'M'}
    for i in (3):
        if i == 3:
            labels = [10,20,30,40,50,60,70,80,90,100]
            x[rfm_dict[i]] = pd.cut(x.iloc[:,i],10,labels=labels)
        else:
            labels = np.arange(1,6)    
            x[rfm_dict[i]] = pd.qcut(x.iloc[:,i],q=np.linspace(0,1,num=6),labels=labels)
    return x

labels1 = [10,20,30,40,50,60,70,80,90,100,120]
data_1['年龄段']=pd.cut(data_1['年龄'],bins,labels=labels1,include_lowest=True)  #include_lowest=True左边闭合
data_1.sort_values(by='年龄段',ascending=False)  #ascending表示从大到小排列

D:\anaconda3\lib\site-packages\ipykernel_launcher.py:17: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
会员编号卡类型性别年龄单据号商铺编码消费金额date年龄段
319589MN20130109042454VIP卡92813201607271407813120.02016-07-27 13:19:57100
415883MN20130109042454VIP卡92209201606121519209135.02016-06-12 15:51:35100
595627MN20130324060927VIP卡98B135201603153647B13545.02016-03-15 18:56:11100
595626MN20130324060927VIP卡98B135201603153699B13518.02016-03-15 18:56:11100
595625MN20130324060927VIP卡98B135201603153698B13525.02016-03-15 18:56:11100
..............................
534275MN2015090100295VIP卡06031201604170015603-1308.02016-04-17 13:22:5010
201409MN2016090200005普通卡0B124201609232091130B12437.02016-09-23 13:38:1510
534276MN2015122700201VIP卡0CB001201604170097025CB001649.02016-04-17 13:22:5010
201407MN2016062300093普通卡0B124201609232168665B12419.02016-09-23 13:38:1510
368216MN2015120500197VIP卡0701201607040608002701205.02016-07-04 18:03:5510

736433 rows × 9 columns

df.pivot_table(index=‘student’, values=‘grades’, aggfunc=fun)

转换各个维度去观察数据, aggfunc就是在数据转换过程中的过程函数

by_occupation_1= data_1.pivot_table('消费金额',index=['年龄段'],aggfunc='count')
print(by_occupation_1)
by_occupation_1.plot.pie(subplots = True,autopct='%.2f', fontsize=10, figsize=(5,5))
plt.show()
       消费金额
年龄段        
10   171482
20     1461
30    62698
40   252688
50   146145
..      ...
70    36302
80     8516
90      849
100     211
120       0

[11 rows x 1 columns]

在这里插入图片描述

聚合操作 .groupby

print(data_1)
f_value = data_1.groupby('年龄段')['会员编号'].agg([('2018年消费频次','count')])
print(f_value)
f_value.plot.pie(subplots = True,autopct='%.2f', fontsize=20, figsize=(8, 8))
plt.show()
                    会员编号   卡类型 性别  年龄                  单据号   商铺编码    消费金额  \
0       MN20150307198159   白金卡  女  42             58504802    112   238.0   
1       MN20140430166571   白金卡  男  33     6051201612310076  605-1  1436.0   
2       MN20131210149893  VIP卡  女  63             58503611    112   200.0   
3       MN20131222152491  VIP卡  女  46             58503502    112    42.8   
4       MN20150307198159   白金卡  女  42             58503422    112   150.0   
...                  ...   ... ..  ..                  ...    ...     ...   
736428  MN20121231041002  VIP卡  男  41  B124201601012198672   B124   164.0   
736429  MN20150407199239  VIP卡  男  29  B124201601012166847   B124    84.0   
736430  MN20130727107860  VIP卡  女  33      708201601010043    708    94.0   
736431  MN20131212150274  VIP卡  男  69      708201601010036    708   181.0   
736432  MN20130901118045  VIP卡  女  39  B124201601112201812   B124    22.0   

                      date 年龄段  
0      2016-12-31 21:58:52  50  
1      2016-12-31 21:38:43  40  
2      2016-12-31 21:21:44  70  
3      2016-12-31 21:20:37  50  
4      2016-12-31 21:18:50  50  
...                    ...  ..  
736428 2016-01-01 09:56:09  50  
736429 2016-01-01 09:56:09  30  
736430 2016-01-01 09:56:09  40  
736431 2016-01-01 09:56:09  70  
736432 2016-01-01 00:00:00  40  

[736433 rows x 9 columns]
     2018年消费频次
年龄段           
10      171482
20        1461
30       62698
40      252688
50      146145
..         ...
70       36302
80        8516
90         849
100        211
120          0

[11 rows x 1 columns]

在这里插入图片描述

xiangguan = by_occupation_1.join(f_value)
xiangguan.corr(method='spearman')
消费金额2018年消费频次
消费金额1.01.0
2018年消费频次1.01.0
f_value
2018年消费频次
年龄段
10171482
201461
3062698
40252688
50146145
......
7036302
808516
90849
100211
1200

11 rows × 1 columns

by_occupation_1= fec1.pivot_table('消费金额',index=['卡类型'],columns='性别',aggfunc='count')
by_occupation_1
性别
卡类型
VIP卡330876201053
南昌普通卡496287
普通卡5543155664
白金卡6355824628
by_occupation_1.plot(kind='barh')#横向条形
plt.show()

在这里插入图片描述

]

bins=np.array([0,10,20,30,40,50,60,70,80,90,100,120])#出资额分组
labels=pd.cut(fec1['年龄'],bins)
0         (40, 50]
1         (30, 40]
2         (60, 70]
3         (40, 50]
4         (40, 50]
            ...   
736428    (40, 50]
736429    (20, 30]
736430    (30, 40]
736431    (60, 70]
736432    (30, 40]
Name: 年龄, Length: 736433, dtype: category
Categories (11, interval[int64]): [(0, 10] < (10, 20] < (20, 30] < (30, 40] ... (70, 80] < (80, 90] < (90, 100] < (100, 120]]

grouped.size().unstack(0)的用法

stack()即“堆叠”,作用是将列旋转到行

unstack()即stack()的反操作,将行旋转到列

grouped=fec1.groupby(['卡类型',labels])
grouped.size().unstack(0)
卡类型VIP卡南昌普通卡普通卡白金卡
年龄
(0, 10]637.0NaN209.0108.0
(10, 20]1088.0NaN159.0214.0
(20, 30]49024.02.05643.08029.0
(30, 40]199018.0618.017393.035659.0
(40, 50]112961.0159.07788.025237.0
(50, 60]44441.02.02991.08647.0
(60, 70]26276.02.0828.09196.0
(70, 80]7037.0NaN273.01206.0
(80, 90]788.0NaN3.058.0
(90, 100]210.0NaN1.0NaN
#by_occupation_4= fec.pivot_table('消费金额',index=labels,columns=['卡类型'],aggfunc='count')
#by_occupation_4.plot.pie(subplots = True,autopct='%.2f', fontsize=30, figsize=(50, 50))
by_occupation_5= fec1.pivot_table('消费金额',index=labels,columns=['卡类型'],aggfunc='sum')
by_occupation_5.plot.pie(subplots = True,autopct='%.2f', fontsize=10, figsize=(20, 20))
plt.show()

在这里插入图片描述

bins=np.array([0,10,20,30,40,50,60,70,80,90,100,120])#出资额分组
labels=pd.cut(fec1['年龄'],bins)
by_occupation_4= fec1.pivot_table('消费金额',index=labels,aggfunc='sum')
by_occupation_4
消费金额
年龄
(0, 10]3.319693e+05
(10, 20]6.612430e+05
(20, 30]1.941047e+07
(30, 40]7.539194e+07
(40, 50]4.943245e+07
......
(60, 70]1.047164e+07
(70, 80]1.649638e+06
(80, 90]1.621774e+05
(90, 100]2.897200e+04
(100, 120]NaN

11 rows × 1 columns

by_occupation_4.plot(kind='barh')#横向条形
plt.show()

在这里插入图片描述

by_occupation_4.plot.pie(subplots = True,autopct='%.2f', fontsize=20, figsize=(10, 10))
plt.show()

在这里插入图片描述

fec1['date'] = pd.to_datetime(fec1['消费时间'])
#fec2['date'] = pd.to_datetime(fec2['消费时间'])
fec1['month'] = fec1['date'].apply(lambda x: x.month)
#fec2['month'] = fec2['date'].apply(lambda x: x.month)
fec1
会员卡号卡类型会员编号性别年龄单据号消费时间商铺编码消费金额消费积分datemonth
08088001625白金卡MN2015030719815942585048022016/12/31 21:58:52112238.02382016-12-31 21:58:5212
18088012005白金卡MN201404301665713360512016123100762016/12/31 21:38:43605-11436.014362016-12-31 21:38:4312
28086102716VIP卡MN2013121014989363585036112016/12/31 21:21:44112200.02002016-12-31 21:21:4412
38086232983VIP卡MN2013122215249146585035022016/12/31 21:20:3711242.8422016-12-31 21:20:3712
48088001625白金卡MN2015030719815942585034222016/12/31 21:18:50112150.01502016-12-31 21:18:5012
.......................................
7364288086129227VIP卡MN2012123104100241B1242016010121986722016/1/1 9:56:09B124164.0492016-01-01 09:56:091
7364298086223710VIP卡MN2015040719923929B1242016010121668472016/1/1 9:56:09B12484.0252016-01-01 09:56:091
7364308086226933VIP卡MN20130727107860337082016010100432016/1/1 9:56:0970894.0942016-01-01 09:56:091
7364318086233209VIP卡MN20131212150274697082016010100362016/1/1 9:56:09708181.01812016-01-01 09:56:091
7364328086227574VIP卡MN2013090111804539B1242016011122018122016/1/1 0:00:00B12422.062016-01-01 00:00:001

736433 rows × 12 columns

data_3= fec1.pivot_table('消费金额',index='month',aggfunc='sum')
data_3.plot(kind='bar')#横向条形
plt.show()

在这里插入图片描述

#data_33= fec2.pivot_table('消费金额',index='month',aggfunc='sum')
#data_33.plot(kind='bar')#横向条形

统计卡类型变换

car_label = {'VIP卡':'a','南昌普通卡':'b','普通卡':'c','白金卡':'d'}
fec1['party'] = fec1['卡类型'].map(car_label)
data2 = fec1.groupby('会员编号')['party'].unique()
data2
会员编号
MN20120928000004       [a]
MN20120928000005       [d]
MN20120928000007       [a]
MN20120928000016       [a]
MN20120928000017       [a]
                     ...  
MN2018022100100     [c, a]
MN2018032400067     [c, a]
MN2018051900017        [c]
MN2018051900057        [c]
MN2018051900133        [c]
Name: party, Length: 82391, dtype: object
data33 = data2[data2.apply(len)==2]
data33
会员编号
MN20120928000020    [d, a]
MN20120928000040    [d, a]
MN20120928000046    [d, a]
MN20120928000047    [d, a]
MN20120928000067    [d, a]
                     ...  
MN2017102100160     [c, a]
MN2017122300006     [c, a]
MN2018011800118     [c, a]
MN2018022100100     [c, a]
MN2018032400067     [c, a]
Name: party, Length: 14218, dtype: object
def f(x):
    if (x[0] is 'a'):
        if x[1] is 'b':
            x = 1
        elif x[1] is 'c':
            x=2
        else:
            x = 3
            
    elif x[0] is 'b':
        if x[1] is 'a':
            x = 4
        elif x[1] is 'c':
            x=5
        else:
            x=6
            
    elif x[0] is 'c':
        if x[1] is 'a':
            x = 7
        elif x[1] is 'b':
            x=8
        else:
            x=9
            
    elif x[0] is 'd':
        if x[1] is 'a':
            x = 10
        elif x[1] is 'b':
            x=11
        else:
            x=12
    return x
data43 = data33.apply(f)
data6 = pd.DataFrame(list(zip(data33, data43)))
data6
01
0[d, a]10
1[d, a]10
2[d, a]10
3[d, a]10
4[d, a]10
.........
14213[c, a]7
14214[c, a]7
14215[c, a]7
14216[c, a]7
14217[c, a]7

14218 rows × 2 columns

data6[1].value_counts()
7     12800
2       709
10      617
3        43
9        27
12       18
1         4
Name: 1, dtype: int64
pd.to_numeric(data6[1])
0        10
1        10
2        10
3        10
4        10
         ..
14213     7
14214     7
14215     7
14216     7
14217     7
Name: 1, Length: 14218, dtype: int64
33/(134629-25450)
0.00030225592833786716
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值