import pandas as pd
import numpy as np
import matplotlib. pyplot as plt
from datetime import datetime
% matplotlib inline
plt. style. use( "ggplot" )
df= pd. read_csv( "D:/2018_BigData/Python/Python_files_Notebook/theme_practice/student_consumption.csv" )
df. head( )
DealTime MonDeal bf_StudentID AccName PerSex 0 2018/7/1 06:32:51 -4.5 14877 张某某 女 1 2018/7/1 11:43:05 -11.0 14917 高某某 男 2 2018/7/1 14:21:42 -9.2 14921 胡某某 男 3 2018/7/1 14:33:06 -6.9 14898 牛某某 男 4 2018/7/1 15:05:45 -7.0 14917 高某某 男
df[ "DealTime" ] = pd. to_datetime( df. DealTime)
df. head( 2 )
DealTime MonDeal bf_StudentID AccName PerSex 0 2018-07-01 06:32:51 -4.5 14877 张某某 女 1 2018-07-01 11:43:05 -11.0 14917 高某某 男
df[ "DealTime" ] = df[ "DealTime" ] . dt. strftime( '%Y-%m-%d' )
df. head( 2 )
DealTime MonDeal bf_StudentID AccName PerSex 0 2018-07-01 -4.5 14877 张某某 女 1 2018-07-01 -11.0 14917 高某某 男
print ( df. head( 2 ) )
print ( df. tail( 2 ) )
DealTime MonDeal bf_StudentID AccName PerSex
0 2018-07-01 -4.5 14877 张某某 女
1 2018-07-01 -11.0 14917 高某某 男
DealTime MonDeal bf_StudentID AccName PerSex
463902 2019-01-27 -3.5 16075 王某某 女
463903 2019-01-27 -7.8 16103 邱某某 女
df. describe( )
MonDeal bf_StudentID count 463904.000000 463904.000000 mean -8.371863 15099.442035 std 5.878407 718.459919 min -404.200000 13012.000000 25% -10.900000 14459.000000 50% -8.000000 14942.000000 75% -4.450000 15784.000000 max -0.010000 16162.000000
DealTime_counts= df[ 'DealTime' ] . value_counts( )
print ( "\n" , DealTime_counts. shape)
print ( "\n" )
print ( DealTime_counts. head( 6 ) )
print ( "\n" )
print ( DealTime_counts. tail( 6 ) )
(161,)
2018-11-26 5245
2018-09-04 5182
2018-11-20 5167
2018-11-19 5121
2018-11-29 5116
2018-09-05 5108
Name: DealTime, dtype: int64
2018-08-13 19
2018-08-19 14
2018-10-05 13
2018-11-17 5
2018-09-08 4
2019-01-30 1
Name: DealTime, dtype: int64
df1= df. groupby( [ 'DealTime' , "bf_StudentID" , "AccName" , "PerSex" ] ) [ "MonDeal" ] . agg( [ "sum" , "mean" , "count" ] )
df1. head( 12 )
sum mean count DealTime bf_StudentID AccName PerSex 2018-07-01 13983 裘某某 男 -3.7 -3.70 1 14018 虞某某 男 -9.5 -9.50 1 14073 刘某某 男 -8.0 -8.00 1 14074 周某某 男 -14.3 -7.15 2 14097 毛某某 男 -10.0 -10.00 1 14099 李某某 男 -10.5 -10.50 1 14139 敖某某 男 -3.5 -3.50 1 14140 王某某 男 -11.5 -11.50 1 14169 查某某 女 -10.0 -10.00 1 14187 叶某某 男 -4.5 -4.50 1 14200 王某某 男 -3.5 -3.50 1 14208 方某某 男 -15.5 -7.75 2
df2= pd. pivot_table( df, index= [ "DealTime" , "bf_StudentID" , "AccName" , "PerSex" ] , values= [ "MonDeal" ] , aggfunc= [ sum , np. mean, len ] )
df2. head( 12 )
sum mean len MonDeal MonDeal MonDeal DealTime bf_StudentID AccName PerSex 2018-07-01 13983 裘某某 男 -3.7 -3.70 1.0 14018 虞某某 男 -9.5 -9.50 1.0 14073 刘某某 男 -8.0 -8.00 1.0 14074 周某某 男 -14.3 -7.15 2.0 14097 毛某某 男 -10.0 -10.00 1.0 14099 李某某 男 -10.5 -10.50 1.0 14139 敖某某 男 -3.5 -3.50 1.0 14140 王某某 男 -11.5 -11.50 1.0 14169 查某某 女 -10.0 -10.00 1.0 14187 叶某某 男 -4.5 -4.50 1.0 14200 王某某 男 -3.5 -3.50 1.0 14208 方某某 男 -15.5 -7.75 2.0
df1. describe( )
sum mean count count 164422.000000 164422.000000 164422.000000 mean -23.620565 -8.702236 2.821423 std 14.832558 3.876227 1.487303 min -439.250000 -170.000000 1.000000 25% -30.150000 -10.000000 2.000000 50% -21.000000 -8.000000 3.000000 75% -13.200000 -6.550000 4.000000 max -0.090000 -0.090000 14.000000
df1. info( )
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 164422 entries, (2018-07-01, 13983, 裘某某, 男) to (2019-01-30, 15472, 陈某某, 男)
Data columns (total 3 columns):
sum 164422 non-null float64
mean 164422 non-null float64
count 164422 non-null int64
dtypes: float64(2), int64(1)
memory usage: 4.9+ MB
df2. info( )
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 164422 entries, (2018-07-01, 13983, 裘某某, 男) to (2019-01-30, 15472, 陈某某, 男)
Data columns (total 3 columns):
(sum, MonDeal) 164422 non-null float64
(mean, MonDeal) 164422 non-null float64
(len, MonDeal) 164422 non-null float64
dtypes: float64(3)
memory usage: 4.9+ MB
df. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 463904 entries, 0 to 463903
Data columns (total 5 columns):
DealTime 463904 non-null object
MonDeal 463904 non-null float64
bf_StudentID 463904 non-null int64
AccName 463904 non-null object
PerSex 463904 non-null object
dtypes: float64(1), int64(1), object(3)
memory usage: 17.7+ MB
df[ "month" ] = df. DealTime. values. astype( "datetime64[M]" )
print ( df. head( 2 ) )
print ( df. tail( 2 ) )
DealTime MonDeal bf_StudentID AccName PerSex month
0 2018-07-01 -4.5 14877 张某某 女 2018-07-01
1 2018-07-01 -11.0 14917 高某某 男 2018-07-01
DealTime MonDeal bf_StudentID AccName PerSex month
463902 2019-01-27 -3.5 16075 王某某 女 2019-01-01
463903 2019-01-27 -7.8 16103 邱某某 女 2019-01-01
student_group= df. groupby( [ "bf_StudentID" , "AccName" , "PerSex" ] ) . agg( [ "sum" , "mean" , "count" ] )
print ( student_group. head( 3 ) )
print ( student_group. tail( 3 ) )
MonDeal
sum mean count
bf_StudentID AccName PerSex
13012 张某某 女 -381.1 -7.472549 51
13564 吴某某 男 -3272.8 -8.590026 381
13599 曹某某 男 -1686.8 -9.319337 181
MonDeal
sum mean count
bf_StudentID AccName PerSex
16160 周某某 女 -2135.25 -9.490000 225
16161 韩某某 女 -1519.60 -6.970642 218
16162 陈某某 男 -1754.65 -9.588251 183
type ( student_group)
pandas.core.frame.DataFrame
student_group. columns
MultiIndex(levels=[['MonDeal'], ['sum', 'mean', 'count']],
labels=[[0, 0, 0], [0, 1, 2]])
df1. columns
Index(['sum', 'mean', 'count'], dtype='object')
df2. columns
MultiIndex(levels=[['sum', 'mean', 'len'], ['MonDeal']],
labels=[[0, 1, 2], [0, 0, 0]])
df. columns
Index(['DealTime', 'MonDeal', 'bf_StudentID', 'AccName', 'PerSex', 'month'], dtype='object')
student_group0= student_group. reset_index( )
print ( student_group0. head( ) )
print ( '\n' * 2 )
print ( student_group0. columns)
bf_StudentID AccName PerSex MonDeal
sum mean count
0 13012 张某某 女 -381.10 -7.472549 51
1 13564 吴某某 男 -3272.80 -8.590026 381
2 13599 曹某某 男 -1686.80 -9.319337 181
3 13685 毛某某 男 -1854.93 -10.847544 171
4 13947 李某某 女 -3009.40 -7.982493 377
MultiIndex(levels=[['MonDeal', 'PerSex', 'AccName', 'bf_StudentID'], ['sum', 'mean', 'count', '']],
labels=[[3, 2, 1, 0, 0, 0], [3, 3, 3, 0, 1, 2]])
df3= df1. reset_index( )
print ( df3. head( 3 ) )
print ( df3. tail( 3 ) )
DealTime bf_StudentID AccName PerSex sum mean count
0 2018-07-01 13983 裘某某 男 -3.7 -3.7 1
1 2018-07-01 14018 虞某某 男 -9.5 -9.5 1
2 2018-07-01 14073 刘某某 男 -8.0 -8.0 1
DealTime bf_StudentID AccName PerSex sum mean count
164419 2019-01-27 16149 李某某 女 -4.0 -4.0 1
164420 2019-01-27 16150 陈某某 男 -4.5 -4.5 1
164421 2019-01-30 15472 陈某某 男 -5.8 -5.8 1
df3. columns
Index(['DealTime', 'bf_StudentID', 'AccName', 'PerSex', 'sum', 'mean',
'count'],
dtype='object')
df3. rename( columns= { 'sum' : 'MonDeal' , 'mean' : 'avgMonDeal' , 'count' : 'transaction_times' } , inplace = True )
df3. head( 5 )
DealTime bf_StudentID AccName PerSex MonDeal avgMonDeal transaction_times 0 2018-07-01 13983 裘某某 男 -3.7 -3.70 1 1 2018-07-01 14018 虞某某 男 -9.5 -9.50 1 2 2018-07-01 14073 刘某某 男 -8.0 -8.00 1 3 2018-07-01 14074 周某某 男 -14.3 -7.15 2 4 2018-07-01 14097 毛某某 男 -10.0 -10.00 1
df3[ "month" ] = df3. DealTime. values. astype( "datetime64[M]" )
df3. tail( 2 )
DealTime bf_StudentID AccName PerSex MonDeal avgMonDeal transaction_times month 164420 2019-01-27 16150 陈某某 男 -4.5 -4.5 1 2019-01-01 164421 2019-01-30 15472 陈某某 男 -5.8 -5.8 1 2019-01-01
student_group1= df3. groupby( [ "bf_StudentID" , "AccName" , "PerSex" ] ) [ "MonDeal" ] . agg( [ "sum" , "mean" , "count" ] )
print ( student_group1. head( 3 ) )
print ( '\n' * 1 )
print ( student_group1. columns)
sum mean count
bf_StudentID AccName PerSex
13012 张某某 女 -381.1 -11.909375 32
13564 吴某某 男 -3272.8 -26.393548 124
13599 曹某某 男 -1686.8 -17.755789 95
Index(['sum', 'mean', 'count'], dtype='object')
student_group1. columns
Index(['sum', 'mean', 'count'], dtype='object')
student_group2= student_group1. reset_index( )
print ( student_group2. head( 3 ) )
print ( '\n' * 1 )
print ( student_group2. columns)
print ( '\n' * 1 )
print ( student_group2. shape)
bf_StudentID AccName PerSex sum mean count
0 13012 张某某 女 -381.1 -11.909375 32
1 13564 吴某某 男 -3272.8 -26.393548 124
2 13599 曹某某 男 -1686.8 -17.755789 95
Index(['bf_StudentID', 'AccName', 'PerSex', 'sum', 'mean', 'count'], dtype='object')
(1730, 6)
student_group2. describe( )
bf_StudentID sum mean count count 1730.000000 1730.000000 1730.000000 1730.000000 mean 15010.860116 -2244.936717 -23.293647 95.041618 std 710.216323 860.371730 7.495143 18.722765 min 13012.000000 -6239.950000 -100.000000 1.000000 25% 14386.250000 -2790.475000 -27.545710 91.000000 50% 14834.500000 -2216.500000 -22.793650 98.000000 75% 15721.750000 -1698.650000 -18.324731 105.000000 max 16162.000000 -9.000000 -4.950000 146.000000
student_group3= student_group2. sort_values( by= "sum" , ascending= True )
print ( student_group3. head( 5 ) )
print ( '\n' * 1 )
print ( student_group3. tail( 5 ) )
bf_StudentID AccName PerSex sum mean count
1132 15556 陶某某 男 -6239.95 -66.382447 94
176 14123 汤某某 男 -5698.20 -53.756604 106
1593 16024 余某某 男 -5125.69 -50.251863 102
1607 16038 王某某 男 -5080.40 -49.807843 102
1113 15537 潘某某 男 -4798.80 -47.047059 102
bf_StudentID AccName PerSex sum mean count
922 14892 林某某 男 -32.7 -16.35 2
12 13956 陆某某 男 -11.0 -11.00 1
581 14546 俞某某 男 -10.0 -10.00 1
412 14363 钱某某 女 -9.5 -9.50 1
22 13967 权某某 女 -9.0 -9.00 1
import seaborn as sns
import warnings
warnings. filterwarnings( "ignore" )
from matplotlib. font_manager import FontProperties
myfont= FontProperties( fname= r'C:\Windows\Fonts\simhei.ttf' , size= 14 )
sns. set ( font= myfont. get_name( ) )
x= student_group3[ "count" ]
sns. distplot( x)
plt. hist( x, bins= 15 , color= sns. desaturate( "indianred" , .9 ) , alpha= .8 )
plt. xlabel( "在校天数(消费天数)" , fontproperties= "SimHei" , fontsize= 15 )
plt. ylabel( "人数" , fontsize= 15 )
plt. title( "2018.7-2019.1学生在校时长分布" , fontsize= 18 )
plt. show( )
x1= student_group3[ "sum" ]
x2= student_group3[ "mean" ]
kwargs = dict ( histtype= 'stepfilled' , alpha= 0.8 , bins= 15 )
plt. hist( x1, ** kwargs)
plt. xlabel( "各学生总消费金额" , fontproperties= "SimHei" , fontsize= 15 )
plt. ylabel( "人数" , fontsize= 15 )
plt. title( "2018.7-2019.1学生总消费情况" , fontsize= 18 )
plt. show( )
plt. hist( x2, bins= 18 )
plt. xlabel( "各学生日均消费金额" , fontproperties= "SimHei" , fontsize= 15 )
plt. ylabel( "人数" , fontsize= 15 )
plt. title( "2018.7-2019.1学生日均消费情况" , fontsize= 18 )
plt. show( )
counts, bin_edges = np. histogram( x2, bins= 18 )
print ( counts)
[ 1 0 0 0 0 0 2 0 2 5 12 51 131 339 508 453 192 34]
df3. groupby( "month" ) . MonDeal. sum ( ) . plot( )
plt. show( )
df3. groupby( "month" ) . transaction_times. sum ( ) . plot( )
plt. show( )
df3. groupby( "bf_StudentID" ) . sum ( ) . plot. scatter( x= "MonDeal" , y= "transaction_times" )
plt. show( )
plt. figure( figsize= ( 12 , 4 ) )
plt. subplot( 121 )
df3. groupby( "bf_StudentID" ) . MonDeal. sum ( ) . hist( bins= 30 )
plt. xlabel( "学生消费总金额" , fontsize= 15 )
plt. title( "2018.7-2019.1学生消费总金额分布" , fontsize= 18 )
plt. subplot( 122 )
df3. groupby( "bf_StudentID" ) . transaction_times. sum ( ) . hist( bins= 30 )
plt. ylabel( "学生消费总次数" , fontsize= 15 )
plt. title( "2018.7-2019.1学生消费总次数分布" , fontsize= 18 )
plt. show( )
df3. groupby( "bf_StudentID" ) . month. min ( ) . value_counts( )
2018-07-01 918
2018-08-01 570
2018-09-01 231
2018-12-01 4
2018-11-01 4
2018-10-01 3
Name: month, dtype: int64
df3. groupby( "bf_StudentID" ) . month. max ( ) . value_counts( )
2019-01-01 1692
2018-12-01 13
2018-10-01 9
2018-11-01 8
2018-09-01 5
2018-07-01 2
2018-08-01 1
Name: month, dtype: int64
pivoted_counts= df3. pivot_table( index= "bf_StudentID" , columns= "month" , values= "transaction_times" , aggfunc= "sum" ) . fillna( 0 )
columns_month= df3. month. sort_values( ) . astype( "str" ) . unique( )
pivoted_counts. columns= columns_month
pivoted_counts. head( )
2018-07-01 2018-08-01 2018-09-01 2018-10-01 2018-11-01 2018-12-01 2019-01-01 bf_StudentID 13012 0.0 0.0 10.0 10.0 15.0 7.0 9.0 13564 17.0 25.0 82.0 63.0 69.0 75.0 50.0 13599 8.0 10.0 39.0 33.0 34.0 31.0 26.0 13685 8.0 12.0 28.0 33.0 34.0 39.0 17.0 13947 9.0 22.0 81.0 64.0 66.0 72.0 63.0
pivoted_counts_transf= pivoted_counts. applymap( lambda x: 1 if x> 59 else np. NaN if x== 0 else 0 )
pivoted_counts_transf. head( )
2018-07-01 2018-08-01 2018-09-01 2018-10-01 2018-11-01 2018-12-01 2019-01-01 bf_StudentID 13012 NaN NaN 0.0 0.0 0.0 0.0 0.0 13564 0.0 0.0 1.0 1.0 1.0 1.0 0.0 13599 0.0 0.0 0.0 0.0 0.0 0.0 0.0 13685 0.0 0.0 0.0 0.0 0.0 0.0 0.0 13947 0.0 0.0 1.0 1.0 1.0 1.0 1.0
( pivoted_counts_transf. sum ( ) / pivoted_counts_transf. count( ) ) . plot( figsize= ( 10 , 4 ) )
plt. xlabel( '时间(月)' , fontsize= 18 )
plt. ylabel( '百分比(%)' , fontsize= 18 )
plt. title( '各月学生爱饭堂率' , fontsize= 18 )
plt. show( )
pivoted_MonDeal= df3. pivot_table(
index= "bf_StudentID" , columns= "month" , values= "MonDeal" , aggfunc= "mean" ) . fillna( 0 )
columns_month= df3. month. sort_values( ) . astype( "str" ) . unique( )
pivoted_MonDeal. columns= columns_month
pivoted_MonDeal. head( )
2018-07-01 2018-08-01 2018-09-01 2018-10-01 2018-11-01 2018-12-01 2019-01-01 bf_StudentID 13012 0.000000 0.000000 -12.720000 -11.483333 -11.844444 -11.016667 -12.650000 13564 -30.080000 -20.180000 -27.912000 -28.309524 -27.842500 -27.668750 -21.442105 13599 -26.450000 -13.333333 -20.478947 -16.734375 -16.116667 -16.238235 -19.833333 13685 -26.375000 -15.122222 -16.628235 -20.656250 -20.747059 -23.277778 -28.556250 13947 -19.566667 -15.550000 -28.981818 -22.380000 -29.767500 -24.147826 -31.069444
pivoted_purchase= pivoted_MonDeal. applymap( lambda x: 0 if x== 0 else 1 )
pivoted_purchase. head( )
2018-07-01 2018-08-01 2018-09-01 2018-10-01 2018-11-01 2018-12-01 2019-01-01 bf_StudentID 13012 0 0 1 1 1 1 1 13564 1 1 1 1 1 1 1 13599 1 1 1 1 1 1 1 13685 1 1 1 1 1 1 1 13947 1 1 1 1 1 1 1
def purchase_return ( data) :
status = [ ]
for i in range ( 6 ) :
if data[ i] == 1 :
if data[ i+ 1 ] == 1 :
status. append( 1 )
if data[ i+ 1 ] == 0 :
status. append( 0 )
else :
status. append( np. NaN)
status. append( np. NaN)
return status
pivoted_purchase_return = pivoted_purchase. apply ( purchase_return, axis= 1 )
pivoted_purchase_return. head( 5 )
bf_StudentID
13012 [nan, nan, 1, 1, 1, 1, nan]
13564 [1, 1, 1, 1, 1, 1, nan]
13599 [1, 1, 1, 1, 1, 1, nan]
13685 [1, 1, 1, 1, 1, 1, nan]
13947 [1, 1, 1, 1, 1, 1, nan]
dtype: object
pivoted_purchase_return_mean = ( pivoted_purchase_return. sum ( ) / pivoted_purchase_return. count( ) )
pivoted_purchase_return_mean. plot( figsize= ( 12 , 4 ) )
plt. xlabel( '时间(月)' , fontsize= 18 )
plt. ylabel( '百分比(%)' , fontsize= 18 )
plt. title( '各月学生复购率' , fontsize= 18 )
plt. show( )
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-163-45678b38adb8> in <module>()
1 pivoted_purchase_return_mean = (pivoted_purchase_return.sum()/pivoted_purchase_return.count())
----> 2 pivoted_purchase_return_mean.plot()
3
4 plt.xlabel('时间(月)', fontsize=18)
5 plt.ylabel('百分比(%)', fontsize=18)
AttributeError: 'numpy.ndarray' object has no attribute 'plot'
下篇续:用Python分析用户消费行为 Student Comsumption Analysis ② https://blog.csdn.net/weixin_44216391/article/details/89329804