保洁业务数据概况分析

#调包
import pandas as pd
#数据读取#
#index_col=0 ,去除Unnamed=0数据
store=pd.read_csv('w2_store_rev.csv',index_col=0)
#数据的基本信息
#发现local_tv有50多个空值
#发现event是object,即类别型变量
store.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 985 entries, 845 to 26
Data columns (total 7 columns):
revenue     985 non-null float64
reach       985 non-null int64
local_tv    929 non-null float64
online      985 non-null int64
instore     985 non-null int64
person      985 non-null int64
event       985 non-null object
dtypes: float64(2), int64(4), object(1)
memory usage: 61.6+ KB
#统计各个列哪些是空值
#发现local_tv有56个空值
store.isnull().sum()
revenue      0
reach        0
local_tv    56
online       0
instore      0
person       0
event        0
dtype: int64
#了解数据的分布
#判断数据是否符合业务场景
store.describe()
revenuereachlocal_tvonlineinstoreperson
count985.000000985.000000929.000000985.000000985.000000985.000000
mean38357.3550253.39593931324.0611091596.5279193350.96243711.053807
std11675.6038831.0119133970.934733496.131586976.5463813.041740
min5000.0000000.00000020000.0000000.0000000.0000000.000000
25%30223.6000003.00000028733.8300001253.0000002690.0000009.000000
50%38159.1100003.00000031104.5200001607.0000003351.00000011.000000
75%45826.5200004.00000033972.4100001921.0000004011.00000013.000000
max79342.0700007.00000043676.9000003280.0000006489.00000024.000000
#了解event的具体值
store.event.unique()
array(['non_event', 'special', 'cobranding', 'holiday'], dtype=object)
#这些类别对应的revenue(销售额)是怎样的
store.groupby(['event'])['revenue'].describe()
countmeanstdmin25%50%75%max
event
cobranding398.038277.66449711879.0973247146.9930472.152537864.15546333.560079342.07
holiday103.037791.89058311942.3691365000.0029644.525038432.78046036.130073377.15
non_event192.037903.08156211186.4367406874.4329852.377537937.17544611.637569429.39
special292.038964.13643811648.61688210207.9630325.812539197.87045897.040071757.50
#这几个类别对应的local_tv(本地电视广告投入)是怎样的
store.groupby(['event'])['local_tv'].describe()
countmeanstdmin25%50%75%max
event
cobranding376.031424.5901863951.04956621252.3528746.972531336.57033839.020042162.64
holiday96.030860.5248964448.71936421792.8427769.600030564.70533595.597541047.01
non_event182.031415.1975273952.15538320000.0029222.587531238.23534386.082542069.84
special275.031288.1109823842.41212821428.2028668.710030921.79034105.625043676.90
#将类别变量转化为哑变量
store=pd.get_dummies(store)
#生成event的4个标签,每个标签取值0/1
store.head(10)
revenuereachlocal_tvonlineinstorepersonevent_cobrandingevent_holidayevent_non_eventevent_special
84545860.28231694.912115329680010
48363588.23235040.1718262501140001
51323272.69430992.821851252460001
59945911.23229417.7824373049120001
12036644.23235611.1111221142131000
86736172.81422372.5920011881171000
84743797.03331443.7416671846151000
95041629.80435775.7511552715120001
94221303.48224888.311853367740010
55020746.15426623.481497307590100
#确认类别变量已经转换成数字变量
store.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 985 entries, 845 to 26
Data columns (total 10 columns):
revenue             985 non-null float64
reach               985 non-null int64
local_tv            929 non-null float64
online              985 non-null int64
instore             985 non-null int64
person              985 non-null int64
event_cobranding    985 non-null uint8
event_holiday       985 non-null uint8
event_non_event     985 non-null uint8
event_special       985 non-null uint8
dtypes: float64(2), int64(4), uint8(4)
memory usage: 57.7 KB
#数据准备完成#
#相关分析#
#所有变量,任意两个变量相关分析
#local_tv,person,instore是比较好的指标,与revenue相关度高
store.corr()
revenuereachlocal_tvonlineinstorepersonevent_cobrandingevent_holidayevent_non_eventevent_special
revenue1.000000-0.1553140.6021140.1712270.3117390.559208-0.005623-0.016559-0.0191550.033752
reach-0.1553141.000000-0.034039-0.0251410.0356350.0614170.0438090.020398-0.043128-0.023330
local_tv0.602114-0.0340391.0000000.006775-0.0468250.0486640.020886-0.0396500.011335-0.005874
online0.171227-0.0251410.0067751.000000-0.0263990.036662-0.024646-0.018596-0.0205870.056799
instore0.3117390.035635-0.046825-0.0263991.000000-0.007482-0.0577250.0459630.0154950.017788
person0.5592080.0614170.0486640.036662-0.0074821.0000000.002439-0.025692-0.0255680.036771
event_cobranding-0.0056230.0438090.020886-0.024646-0.0577250.0024391.000000-0.281389-0.405169-0.534499
event_holiday-0.0165590.020398-0.039650-0.0185960.045963-0.025692-0.2813891.000000-0.168151-0.221824
event_non_event-0.019155-0.0431280.011335-0.0205870.015495-0.025568-0.405169-0.1681511.000000-0.319403
event_special0.033752-0.023330-0.0058740.0567990.0177880.036771-0.534499-0.221824-0.3194031.000000
#其他变量与revenue的相关分析
#sort_values 将revenue排序,ascending默认升序,False为降序排列
#看到前3个相关变量为local_tv,person,instore
store.corr()[['revenue']].sort_values('revenue',ascending=False)
revenue
revenue1.000000
local_tv0.602114
person0.559208
instore0.311739
online0.171227
event_special0.033752
event_cobranding-0.005623
event_holiday-0.016559
event_non_event-0.019155
reach-0.155314
#可视化分析
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
#线性关系可视化
#斜率与相关系数有关
sns.regplot('local_tv','revenue',store)
<matplotlib.axes._subplots.AxesSubplot at 0x1f2206a22b0>

[外链图片转存失败(img-xyg3ngFU-1567847898797)(output_16_1.png)]

#线性关系可视化
sns.regplot('person','revenue',store)
<matplotlib.axes._subplots.AxesSubplot at 0x1f2207de8d0>

[外链图片转存失败(img-CfxQK9iP-1567847898797)(output_17_1.png)]

#线性关系可视化
sns.regplot('instore','revenue',store)
<matplotlib.axes._subplots.AxesSubplot at 0x1f220855978>

[外链图片转存失败(img-62uC8x3m-1567847898798)(output_18_1.png)]

#线性回归分析 
#调包
from sklearn.linear_model import LinearRegression
#建模
model=LinearRegression()
#设定自变量和因变量
y=store['revenue']
#第一次三个 x=store[['local_tv','person','instore']
#第二次四个 x=store[['local_tv','person','instore','online']]
model.fit(x,y)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
#缺失值处理,填充0
store=store.fillna(0)
#缺失值处理,均值填充
store=store.fillna(store.local_tv.mean())
store.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 985 entries, 845 to 26
Data columns (total 10 columns):
revenue             985 non-null float64
reach               985 non-null int64
local_tv            985 non-null float64
online              985 non-null int64
instore             985 non-null int64
person              985 non-null int64
event_cobranding    985 non-null uint8
event_holiday       985 non-null uint8
event_non_event     985 non-null uint8
event_special       985 non-null uint8
dtypes: float64(2), int64(4), uint8(4)
memory usage: 57.7 KB
#自变量系数
model.coef_
array([4.01736340e-01, 2.10959413e+03, 3.61050584e+00, 3.79371728e+00])
#模型的截距
model.intercept_
-8967.736870300454
#模型的评估,x为'local_tv','person','instore'
score=model.score(x,y)#x和y打分
predictions=model.predict(x)#计算y预测值
error=predictions-y#计算误差

rmse=(error**2).mean()**.5#计算rmse
mae=abs(error).mean()#计算mae

print(rmse)
print(mae)
8321.491623472051
6556.036999600779
#模型的评估,x为'local_tv','person','instore','online'
#发现模型误差略有调整
score=model.score(x,y)#x和y打分
predictions=model.predict(x)#计算y预测值
error=predictions-y#计算误差

rmse=(error**2).mean()**.5#计算rmse
mae=abs(error).mean()#计算mae

print(rmse)
print(mae)
8106.512169325369
6402.202883441895
#模型的评估,x为'local_tv','person','instore','online',local_tv用均值填充
#发现误差大幅下降,预测效果提升
score=model.score(x,y)#x和y打分
predictions=model.predict(x)#计算y预测值
error=predictions-y#计算误差

rmse=(error**2).mean()**.5#计算rmse
mae=abs(error).mean()#计算mae

print(rmse)
print(mae)
5591.764749668997
4485.506383110859
#如果各位希望能看到标准的模型输出表
from statsmodels.formula.api import ols
x=store[['local_tv','person','instore']]   #生成自变量
y=store['revenue']  
model=ols('y~x',store).fit()
#观察coef-系数,P值显著性
print(model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.746
Model:                            OLS   Adj. R-squared:                  0.745
Method:                 Least Squares   F-statistic:                     959.2
Date:                Wed, 19 Jun 2019   Prob (F-statistic):          4.09e-291
Time:                        15:24:38   Log-Likelihood:                -9947.5
No. Observations:                 985   AIC:                         1.990e+04
Df Residuals:                     981   BIC:                         1.992e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept  -5.288e+04   1804.489    -29.305      0.000   -5.64e+04   -4.93e+04
x[0]           1.7515      0.049     35.857      0.000       1.656       1.847
x[1]        2050.5749     61.866     33.146      0.000    1929.171    2171.979
x[2]           4.0903      0.193     21.229      0.000       3.712       4.468
==============================================================================
Omnibus:                        0.352   Durbin-Watson:                   2.056
Prob(Omnibus):                  0.839   Jarque-Bera (JB):                0.402
Skew:                           0.043   Prob(JB):                        0.818
Kurtosis:                       2.951   Cond. No.                     3.05e+05
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.05e+05. This might indicate that there are
strong multicollinearity or other numerical problems.

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值