import pandas as pd
import numpy as np
from sqlalchemy import create_engine
def reader ( query, db= 'data' ) :
sql = query
engine = create_engine( 'mysql+pymysql://root:messi123@localhost:3306/{0}?charset=utf8' . format ( db) )
df = pd. read_sql( sql, engine)
return df
reader(
"""
select
data(paidTime) as order_dt,
userId as user_id,
sum(price) as order_amount,
count(orderId) as order_products
from data.orderinfo
where isPaid = '已支付' and paidTime > '0000-00-00 00:00:00'
group by data(paidTime),userId
""" )
df[ 'order_dt' ] = pd. to_datetime( df. order_dt, format = "%Y-%m-%d" )
df[ 'month' ] = df. order_dt. values. astype( 'datetime64[M]' )
user_id order_dt order_products order_amount month 0 1 1997-01-01 1 11.77 1997-01-01 1 2 1997-01-12 1 12.00 1997-01-01
columns = [ 'user_id' , 'order_dt' , 'order_products' , 'order_amount' ]
df = pd. read_table( 'D:\\data\CDNOW_master.txt' , names= columns, sep= '\s+' )
user_id order_dt order_products order_amount 0 1 1997-01-01 1 11.77 1 2 1997-01-12 1 12.00
df[ 'order_dt' ] = pd. to_datetime( df. order_dt, format = '%Y%m%d' )
df[ 'month' ] = df. order_dt. values. astype( 'datetime64[M]' )
df. head( 2 )
user_id order_dt order_products order_amount month 0 1 1997-01-01 1 11.77 1997-01-01 1 2 1997-01-12 1 12.00 1997-01-01
df. describe( )
user_id order_products order_amount count 69659.000000 69659.000000 69659.000000 mean 11470.854592 2.410040 35.893648 std 6819.904848 2.333924 36.281942 min 1.000000 1.000000 0.000000 25% 5506.000000 1.000000 14.490000 50% 11410.000000 2.000000 25.980000 75% 17273.000000 3.000000 43.700000 max 23570.000000 99.000000 1286.010000
1.进行用户消费趋势的分析(按月)
每月的消费总金额 每月的消费次数 每月的产品购买数 每月的消费人数
grouped_month= df. groupby( 'month' )
order_month_amount = grouped_month. order_amount. sum ( )
month 1997-01-01 299060.17 1997-02-01 379590.03 1997-03-01 393155.27 1997-04-01 142824.49 1997-05-01 107933.30 1997-06-01 108395.87 1997-07-01 122078.88 1997-08-01 88367.69 1997-09-01 81948.80 1997-10-01 89780.77 1997-11-01 115448.64 1997-12-01 95577.35 1998-01-01 76756.78 1998-02-01 77096.96 1998-03-01 108970.15 1998-04-01 66231.52 1998-05-01 70989.66 1998-06-01 76109.30 Name: order_amount, dtype: float64
import matplotlib. pyplot as plt
% matplotlib inline
plt. style. use( 'ggplot' )
order_month_amount. plot( )
order_num = grouped_month. user_id. count( )
order_num. plot( )
grouped_month. order_products. sum ( ) . plot( )
df. groupby( [ 'month' , 'user_id' ] ) . count( ) . reset_index( ) . groupby( 'month' ) . count( ) . user_id
month 1997-01-01 7846 1997-02-01 9633 1997-03-01 9524 1997-04-01 2822 1997-05-01 2214 1997-06-01 2339 1997-07-01 2180 1997-08-01 1772 1997-09-01 1739 1997-10-01 1839 1997-11-01 2028 1997-12-01 1864 1998-01-01 1537 1998-02-01 1551 1998-03-01 2060 1998-04-01 1437 1998-05-01 1488 1998-06-01 1506 Name: user_id, dtype: int64
grouped_month. user_id. apply ( lambda x: len ( x. drop_duplicates( ) ) ) . plot( )
df. pivot_table( index = 'month' ,
values = [ 'order_products' , 'order_amount' , 'user_id' ] ,
aggfunc= { 'order_products' : 'sum' ,
'order_amount' : 'sum' ,
'user_id' : 'count' } ) . reset_index( )
month order_amount order_products user_id 0 1997-01-01 299060.17 19416 8928 1 1997-02-01 379590.03 24921 11272 2 1997-03-01 393155.27 26159 11598
user_num = grouped_month. user_id. apply ( lambda x: len ( x. drop_duplicates( ) ) )
average_amount = order_month_amount/ user_num
average_amount. plot( )
average_num = order_num/ user_num
average_num. plot( )
2.用户个体消费分析
用户消费金额、消费次数的描述统计 用户消费金额和消费次数的散点图 用户消费金额的分布图 用户消费次数的分布图 用户累计消费金额占比(百分之多少的用户占了百分之多少的消费额)
grouped_user = df. groupby( 'user_id' )
grouped_user. sum ( ) . describe( )
grouped_user. sum ( ) . query( 'order_amount<4000' ) . plot. scatter( x= 'order_products' , y= 'order_amount' )
plt. style. use( 'ggplot' )
grouped_user. sum ( ) . query( 'order_amount<300' ) . order_amount. plot. hist( bins= 20 )
user_cumsum= grouped_user. sum ( ) . sort_values( 'order_amount' ) . apply ( lambda x: x. cumsum( ) / x. sum ( ) )
user_cumsum. reset_index( ) . order_amount. plot( )