** 【续上篇】 用Python分析用户消费行为 Student Comsumption Analysis ① https://blog.csdn.net/weixin_44216391/article/details/89309643
import pandas as pd
import numpy as np
import matplotlib. pyplot as plt
from datetime import datetime
% matplotlib inline
plt. style. use( "ggplot" )
import warnings
warnings. filterwarnings( "ignore" )
import seaborn as sns
from matplotlib. font_manager import FontProperties
myfont= FontProperties( fname= r'C:\Windows\Fonts\simhei.ttf' , size= 14 )
sns. set ( font= myfont. get_name( ) )
df= pd. read_csv( "D:/2018_BigData/Python/Python_files_Notebook/theme_practice/student_consumption_day.csv" )
df. head( )
DealTime bf_StudentID AccName PerSex MonDeal avgMonDeal transaction_times month 0 2018-07-01 13983 裘某某 男 -3.7 -3.70 1 2018-07-01 1 2018-07-01 14018 虞某某 男 -9.5 -9.50 1 2018-07-01 2 2018-07-01 14073 刘某某 男 -8.0 -8.00 1 2018-07-01 3 2018-07-01 14074 周某某 男 -14.3 -7.15 2 2018-07-01 4 2018-07-01 14097 毛某某 男 -10.0 -10.00 1 2018-07-01
pivoted_counts= df. pivot_table( index= "bf_StudentID" , columns= "month" , values= "transaction_times" , aggfunc= "sum" ) . fillna( 0 )
columns_month= df. month. sort_values( ) . astype( "str" ) . unique( )
pivoted_counts. columns= columns_month
pivoted_counts. head( )
2018-07-01 2018-08-01 2018-09-01 2018-10-01 2018-11-01 2018-12-01 2019-01-01 bf_StudentID 13012 0.0 0.0 10.0 10.0 15.0 7.0 9.0 13564 17.0 25.0 82.0 63.0 69.0 75.0 50.0 13599 8.0 10.0 39.0 33.0 34.0 31.0 26.0 13685 8.0 12.0 28.0 33.0 34.0 39.0 17.0 13947 9.0 22.0 81.0 64.0 66.0 72.0 63.0
pivoted_purchase = pivoted_counts. applymap( lambda x: 1 if x > 30 else 0 )
pivoted_purchase. head( )
2018-07-01 2018-08-01 2018-09-01 2018-10-01 2018-11-01 2018-12-01 2019-01-01 bf_StudentID 13012 0 0 0 0 0 0 0 13564 0 0 1 1 1 1 1 13599 0 0 1 1 1 1 0 13685 0 0 0 1 1 1 0 13947 0 0 1 1 1 1 1
def active_status ( data) :
status= [ ]
for i in range ( 7 ) :
if data[ i] == 0 :
if len ( status) > 0 :
if status[ i- 1 ] == "unreg" :
status. append( "unreg" )
else :
status. append( "unlike_canteen" )
else :
status. append( "unreg" )
else :
if len ( status) == 0 :
status. append( "new" )
else :
if status[ i- 1 ] == "unlike_canteen" :
status. append( "occasionally_like_canteen" )
elif status[ i- 1 ] == "unreg" :
status. append( "new" )
else :
status. append( "love_canteen" )
return status
pivoted_purchase. loc[ 13012 ]
2018-07-01 0
2018-08-01 0
2018-09-01 0
2018-10-01 0
2018-11-01 0
2018-12-01 0
2019-01-01 0
Name: 13012, dtype: int64
pivoted_purchase. loc[ 13564 ]
2018-07-01 0
2018-08-01 0
2018-09-01 1
2018-10-01 1
2018-11-01 1
2018-12-01 1
2019-01-01 1
Name: 13564, dtype: int64
pivoted_purchase. shape
(1730, 7)
pivoted_purchase. dtypes
2018-07-01 int64
2018-08-01 int64
2018-09-01 int64
2018-10-01 int64
2018-11-01 int64
2018-12-01 int64
2019-01-01 int64
dtype: object
type ( pivoted_purchase)
pandas.core.frame.DataFrame
pivoted_purchase_status = pivoted_purchase. apply ( lambda x: active_status( x) , axis = 1 )
pivoted_purchase_status. head( 8 )
bf_StudentID
13012 [unreg, unreg, unreg, unreg, unreg, unreg, unreg]
13564 [unreg, unreg, new, love_canteen, love_canteen...
13599 [unreg, unreg, new, love_canteen, love_canteen...
13685 [unreg, unreg, unreg, new, love_canteen, love_...
13947 [unreg, unreg, new, love_canteen, love_canteen...
13948 [unreg, unreg, new, love_canteen, love_canteen...
13949 [unreg, unreg, new, love_canteen, love_canteen...
13950 [unreg, unreg, new, love_canteen, love_canteen...
dtype: object
type ( pivoted_purchase_status)
pandas.core.series.Series
pivoted_purchase_status= pd. DataFrame( pivoted_purchase_status)
pivoted_purchase_status. head( )
0 bf_StudentID 13012 [unreg, unreg, unreg, unreg, unreg, unreg, unreg] 13564 [unreg, unreg, new, love_canteen, love_canteen... 13599 [unreg, unreg, new, love_canteen, love_canteen... 13685 [unreg, unreg, unreg, new, love_canteen, love_... 13947 [unreg, unreg, new, love_canteen, love_canteen...
type ( pivoted_purchase_status)
pandas.core.frame.DataFrame
pivoted_purchase_status. columns
RangeIndex(start=0, stop=1, step=1)
pivoted_purchase_status. index
Int64Index([13012, 13564, 13599, 13685, 13947, 13948, 13949, 13950, 13951,
13952,
...
16153, 16154, 16155, 16156, 16157, 16158, 16159, 16160, 16161,
16162],
dtype='int64', name='bf_StudentID', length=1730)
pivoted_purchase_status= pivoted_purchase_status. reset_index( )
pivoted_purchase_status. head( )
bf_StudentID 0 0 13012 [unreg, unreg, unreg, unreg, unreg, unreg, unreg] 1 13564 [unreg, unreg, new, love_canteen, love_canteen... 2 13599 [unreg, unreg, new, love_canteen, love_canteen... 3 13685 [unreg, unreg, unreg, new, love_canteen, love_... 4 13947 [unreg, unreg, new, love_canteen, love_canteen...
pivoted_purchase_status. columns
Index(['bf_StudentID', 0], dtype='object')
month= pivoted_purchase_status[ 0 ] . str . split( ',' , expand= True )
month. columns= [ 'month' ]
pivoted_purchase_status= pivoted_purchase_status. join( month)
pivoted_purchase_status. head( )
bf_StudentID 0 month 0 13012 [unreg, unreg, unreg, unreg, unreg, unreg, unreg] NaN 1 13564 [unreg, unreg, new, love_canteen, love_canteen... NaN 2 13599 [unreg, unreg, new, love_canteen, love_canteen... NaN 3 13685 [unreg, unreg, unreg, new, love_canteen, love_... NaN 4 13947 [unreg, unreg, new, love_canteen, love_canteen... NaN
pd. concat( [ pivoted_purchase_status, pivoted_purchase_status[ 0 ] . str . split( ',' , expand= True ) ] , axis= 1 ) . head( 2 )
bf_StudentID 0 month 0 0 13012 [unreg, unreg, unreg, unreg, unreg, unreg, unreg] NaN NaN 1 13564 [unreg, unreg, new, love_canteen, love_canteen... NaN NaN
pivoted_purchase_status = pivoted_purchase_status. drop( [ "month" ] , axis= 1 )
pivoted_purchase_status. columns = [ "bf_StudentID" , "User_Hierarchy" ]
pivoted_purchase_status. head( 2 )
bf_StudentID User_Hierarchy 0 13012 [unreg, unreg, unreg, unreg, unreg, unreg, unreg] 1 13564 [unreg, unreg, new, love_canteen, love_canteen...
pd. concat( [ pivoted_purchase_status, pivoted_purchase_status[ "User_Hierarchy" ] . str . split( ',' , expand= True ) ] , axis= 1 ) . head( 2 )
bf_StudentID User_Hierarchy 0 0 13012 [unreg, unreg, unreg, unreg, unreg, unreg, unreg] NaN 1 13564 [unreg, unreg, new, love_canteen, love_canteen... NaN
a= pivoted_purchase_status. head( 5 )
a
bf_StudentID User_Hierarchy 0 13012 [unreg, unreg, unreg, unreg, unreg, unreg, unreg] 1 13564 [unreg, unreg, new, love_canteen, love_canteen... 2 13599 [unreg, unreg, new, love_canteen, love_canteen... 3 13685 [unreg, unreg, unreg, new, love_canteen, love_... 4 13947 [unreg, unreg, new, love_canteen, love_canteen...
a[ "User_Hierarchy" ] . str . split( ( " " ) , expand= True )
pivoted_purchase_status. to_csv( r"D:/2018_BigData/Python/Python_files_Notebook/Theme_Practice/student_consumption_pivoted_purchase_status.csv" , index= False )
pivoted_purchase_status= pd. read_csv( "D:/2018_BigData/Python/Python_files_Notebook/theme_practice/student_consumption_pivoted_purchase_status_splited_above30per month.csv" )
pivoted_purchase_status. tail( 4 )
bf_StudentID User_Hierarchy Unnamed: 2 Unnamed: 3 Unnamed: 4 Unnamed: 5 Unnamed: 6 Unnamed: 7 1726 16159 unreg unreg new love_canteen love_canteen love_canteen love_canteen 1727 16160 unreg unreg new love_canteen love_canteen love_canteen unlike_canteen 1728 16161 unreg unreg new love_canteen love_canteen love_canteen love_canteen 1729 16162 unreg unreg new love_canteen love_canteen love_canteen unlike_canteen
columns= [ "bf_StudentID" , '201807' , '201808' , '201809' , '201810' , '201811' , '201812' , '201901' ]
pivoted_purchase_status. columns= columns
pivoted_purchase_status. tail( 4 )
bf_StudentID 201807 201808 201809 201810 201811 201812 201901 1726 16159 unreg unreg new love_canteen love_canteen love_canteen love_canteen 1727 16160 unreg unreg new love_canteen love_canteen love_canteen unlike_canteen 1728 16161 unreg unreg new love_canteen love_canteen love_canteen love_canteen 1729 16162 unreg unreg new love_canteen love_canteen love_canteen unlike_canteen
purchase_status_counts = pivoted_purchase_status. replace( "unreg" , np. NaN) . apply ( lambda x: pd. value_counts( x) )
purchase_status_counts. tail( 10 )
bf_StudentID 201807 201808 201809 201810 201811 201812 201901 14938 1.0 NaN NaN NaN NaN NaN NaN NaN 14940 1.0 NaN NaN NaN NaN NaN NaN NaN 14942 1.0 NaN NaN NaN NaN NaN NaN NaN 14944 1.0 NaN NaN NaN NaN NaN NaN NaN 14946 1.0 NaN NaN NaN NaN NaN NaN NaN 14337 1.0 NaN NaN NaN NaN NaN NaN NaN new NaN NaN 7.0 1498.0 31.0 33.0 11.0 2.0 love_canteen NaN NaN NaN 7.0 1345.0 1324.0 1350.0 1120.0 unlike_canteen NaN NaN NaN NaN 160.0 124.0 171.0 433.0 occasionally_like_canteen NaN NaN NaN NaN NaN 88.0 48.0 27.0
purchase_status_counts= purchase_status_counts. tail( 4 )
purchase_status_counts
bf_StudentID 201807 201808 201809 201810 201811 201812 201901 new NaN NaN 7.0 1498.0 31.0 33.0 11.0 2.0 love_canteen NaN NaN NaN 7.0 1345.0 1324.0 1350.0 1120.0 unlike_canteen NaN NaN NaN NaN 160.0 124.0 171.0 433.0 occasionally_like_canteen NaN NaN NaN NaN NaN 88.0 48.0 27.0
purchase_status_counts. fillna( 0 ) . T. plot. area( figsize= ( 12 , 6 ) )
plt. show( )
purchase_status_counts = purchase_status_counts. fillna( 0 ) . drop( [ "bf_StudentID" ] , axis= 1 )
purchase_status_counts
201807 201808 201809 201810 201811 201812 201901 new 0.0 7.0 1498.0 31.0 33.0 11.0 2.0 love_canteen 0.0 0.0 7.0 1345.0 1324.0 1350.0 1120.0 unlike_canteen 0.0 0.0 0.0 160.0 124.0 171.0 433.0 occasionally_like_canteen 0.0 0.0 0.0 0.0 88.0 48.0 27.0
return_rata = purchase_status_counts. apply ( lambda x: x/ x. sum ( ) , axis= 0 )
return_rata
201807 201808 201809 201810 201811 201812 201901 new NaN 1.0 0.995349 0.020182 0.021033 0.006962 0.001264 love_canteen NaN 0.0 0.004651 0.875651 0.843850 0.854430 0.707965 unlike_canteen NaN 0.0 0.000000 0.104167 0.079031 0.108228 0.273704 occasionally_like_canteen NaN 0.0 0.000000 0.000000 0.056087 0.030380 0.017067
return_rata1 = return_rata. apply ( lambda x: x* 100 , axis= 0 )
return_rata1. loc[ "occasionally_like_canteen" ] . plot( figsize = ( 12 , 6 ) )
plt. xlabel( '时间(月)' , fontsize= 18 )
plt. ylabel( '百分比(%)' , fontsize= 18 )
plt. title( '各月偶尔爱饭堂学生的占比' , fontsize= 18 )
plt. show( )
return_rata2 = return_rata. apply ( lambda x: x* 100 , axis= 0 )
return_rata2. loc[ "love_canteen" ] . plot( figsize = ( 12 , 6 ) )
plt. xlabel( '时间(月)' , fontsize= 18 )
plt. ylabel( '百分比(%)' , fontsize= 18 )
plt. title( '各月超爱饭堂学生的比例' , fontsize= 18 )
plt. show( )
df. head( 1 )
DealTime bf_StudentID AccName PerSex MonDeal avgMonDeal transaction_times month 0 2018-07-01 13983 裘某某 男 -3.7 -3.7 1 2018-07-01
user_amount = df. groupby( "bf_StudentID" ) . MonDeal. sum ( ) . sort_values( ) . reset_index( )
user_amount. head( 3 )
bf_StudentID MonDeal 0 15556 -6239.95 1 14123 -5698.20 2 16024 -5125.69
user_amount[ "amount_cumsum" ] = user_amount. MonDeal. cumsum( )
user_amount. tail( )
bf_StudentID MonDeal amount_cumsum 1725 14892 -32.7 -3883701.02 1726 13956 -11.0 -3883712.02 1727 14546 -10.0 -3883722.02 1728 14363 -9.5 -3883731.52 1729 13967 -9.0 -3883740.52
user_amount[ "MonDeal" ] = user_amount[ "MonDeal" ] * ( - 1 )
user_amount[ "amount_cumsum" ] = user_amount[ "amount_cumsum" ] * ( - 1 )
user_amount. tail( )
bf_StudentID MonDeal amount_cumsum 1725 14892 32.7 3883701.02 1726 13956 11.0 3883712.02 1727 14546 10.0 3883722.02 1728 14363 9.5 3883731.52 1729 13967 9.0 3883740.52
amount_total= user_amount. amount_cumsum. max ( )
amount_total
3883740.520000005
user_amount[ "prop" ] = user_amount. apply ( lambda x: x. amount_cumsum/ amount_total, axis= 1 )
user_amount. tail( )
bf_StudentID MonDeal amount_cumsum prop 1725 14892 32.7 3883701.02 0.999990 1726 13956 11.0 3883712.02 0.999993 1727 14546 10.0 3883722.02 0.999995 1728 14363 9.5 3883731.52 0.999998 1729 13967 9.0 3883740.52 1.000000
user_amount. prop. plot( )
plt. show( )
plt. figure( figsize= ( 12 , 4 ) )
plt. subplot( 121 )
df. groupby( "bf_StudentID" ) . MonDeal. sum ( ) . hist( bins= 30 )
plt. xlabel( "学生消费总金额" , fontsize= 15 )
plt. title( "2018.7-2019.1学生消费总金额分布" , fontsize= 18 )
plt. subplot( 122 )
df. groupby( "bf_StudentID" ) . transaction_times. sum ( ) . hist( bins= 30 )
plt. ylabel( "学生消费总次数" , fontsize= 15 )
plt. title( "2018.7-2019.1学生消费总次数分布" , fontsize= 18 )
plt. show( )
MonDealSum = df. groupby( "bf_StudentID" ) . MonDeal. sum ( ) . reset_index( )
x1 = MonDealSum[ "MonDeal" ]
counts, bin_edges = np. histogram( x1, bins= 30 )
print ( counts)
[ 1 0 1 0 0 2 3 6 8 8 11 34 39 59 76 97 159 153
156 163 181 164 110 87 61 70 29 20 11 21]
user_amount. head( 2 )
bf_StudentID MonDeal amount_cumsum prop 0 15556 6239.95 6239.95 0.001607 1 14123 5698.20 11938.15 0.003074
array = user_amount[ "MonDeal" ] . astype( int )
np. sum ( array[ 1 : 346 ] , axis= 0 )
1188167
np. sum ( array[ 347 : 1384 ] , axis= 0 )
2320956
print ( user_amount. loc[ 346 ] . astype( int ) )
print ( "" )
print ( user_amount. loc[ 1384 ] . astype( int ) )
bf_StudentID 16123
MonDeal 2913
amount_cumsum 1197484
prop 0
Name: 346, dtype: int32
bf_StudentID 15971
MonDeal 1572
amount_cumsum 3520532
prop 0
Name: 1384, dtype: int32
print ( user_amount. loc[ 210 ] . astype( int ) )
print ( "" )
print ( user_amount. loc[ 1220 ] . astype( int ) )
bf_StudentID 15901
MonDeal 3216
amount_cumsum 781881
prop 0
Name: 210, dtype: int32
bf_StudentID 14873
MonDeal 1798
amount_cumsum 3242862
prop 0
Name: 1220, dtype: int32
array = user_amount[ "MonDeal" ]
np. sum ( array, axis= 0 )
3883740.52
np. sum ( user_amount, axis= 0 ) . astype( int )
bf_StudentID 25968788
MonDeal 3883740
amount_cumsum -2147483648
prop 1050
dtype: int32