import pandas as pd
store=pd.read_csv('w2_store_rev.csv',index_col=0)
store.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 985 entries, 845 to 26
Data columns (total 7 columns):
revenue 985 non-null float64
reach 985 non-null int64
local_tv 929 non-null float64
online 985 non-null int64
instore 985 non-null int64
person 985 non-null int64
event 985 non-null object
dtypes: float64(2), int64(4), object(1)
memory usage: 61.6+ KB
store.isnull().sum()
revenue 0
reach 0
local_tv 56
online 0
instore 0
person 0
event 0
dtype: int64
store.describe()
| revenue | reach | local_tv | online | instore | person |
---|
count | 985.000000 | 985.000000 | 929.000000 | 985.000000 | 985.000000 | 985.000000 |
---|
mean | 38357.355025 | 3.395939 | 31324.061109 | 1596.527919 | 3350.962437 | 11.053807 |
---|
std | 11675.603883 | 1.011913 | 3970.934733 | 496.131586 | 976.546381 | 3.041740 |
---|
min | 5000.000000 | 0.000000 | 20000.000000 | 0.000000 | 0.000000 | 0.000000 |
---|
25% | 30223.600000 | 3.000000 | 28733.830000 | 1253.000000 | 2690.000000 | 9.000000 |
---|
50% | 38159.110000 | 3.000000 | 31104.520000 | 1607.000000 | 3351.000000 | 11.000000 |
---|
75% | 45826.520000 | 4.000000 | 33972.410000 | 1921.000000 | 4011.000000 | 13.000000 |
---|
max | 79342.070000 | 7.000000 | 43676.900000 | 3280.000000 | 6489.000000 | 24.000000 |
---|
store.event.unique()
array(['non_event', 'special', 'cobranding', 'holiday'], dtype=object)
store.groupby(['event'])['revenue'].describe()
| count | mean | std | min | 25% | 50% | 75% | max |
---|
event | | | | | | | | |
---|
cobranding | 398.0 | 38277.664497 | 11879.097324 | 7146.99 | 30472.1525 | 37864.155 | 46333.5600 | 79342.07 |
---|
holiday | 103.0 | 37791.890583 | 11942.369136 | 5000.00 | 29644.5250 | 38432.780 | 46036.1300 | 73377.15 |
---|
non_event | 192.0 | 37903.081562 | 11186.436740 | 6874.43 | 29852.3775 | 37937.175 | 44611.6375 | 69429.39 |
---|
special | 292.0 | 38964.136438 | 11648.616882 | 10207.96 | 30325.8125 | 39197.870 | 45897.0400 | 71757.50 |
---|
store.groupby(['event'])['local_tv'].describe()
| count | mean | std | min | 25% | 50% | 75% | max |
---|
event | | | | | | | | |
---|
cobranding | 376.0 | 31424.590186 | 3951.049566 | 21252.35 | 28746.9725 | 31336.570 | 33839.0200 | 42162.64 |
---|
holiday | 96.0 | 30860.524896 | 4448.719364 | 21792.84 | 27769.6000 | 30564.705 | 33595.5975 | 41047.01 |
---|
non_event | 182.0 | 31415.197527 | 3952.155383 | 20000.00 | 29222.5875 | 31238.235 | 34386.0825 | 42069.84 |
---|
special | 275.0 | 31288.110982 | 3842.412128 | 21428.20 | 28668.7100 | 30921.790 | 34105.6250 | 43676.90 |
---|
store=pd.get_dummies(store)
store.head(10)
| revenue | reach | local_tv | online | instore | person | event_cobranding | event_holiday | event_non_event | event_special |
---|
845 | 45860.28 | 2 | 31694.91 | 2115 | 3296 | 8 | 0 | 0 | 1 | 0 |
---|
483 | 63588.23 | 2 | 35040.17 | 1826 | 2501 | 14 | 0 | 0 | 0 | 1 |
---|
513 | 23272.69 | 4 | 30992.82 | 1851 | 2524 | 6 | 0 | 0 | 0 | 1 |
---|
599 | 45911.23 | 2 | 29417.78 | 2437 | 3049 | 12 | 0 | 0 | 0 | 1 |
---|
120 | 36644.23 | 2 | 35611.11 | 1122 | 1142 | 13 | 1 | 0 | 0 | 0 |
---|
867 | 36172.81 | 4 | 22372.59 | 2001 | 1881 | 17 | 1 | 0 | 0 | 0 |
---|
847 | 43797.03 | 3 | 31443.74 | 1667 | 1846 | 15 | 1 | 0 | 0 | 0 |
---|
950 | 41629.80 | 4 | 35775.75 | 1155 | 2715 | 12 | 0 | 0 | 0 | 1 |
---|
942 | 21303.48 | 2 | 24888.31 | 1853 | 3677 | 4 | 0 | 0 | 1 | 0 |
---|
550 | 20746.15 | 4 | 26623.48 | 1497 | 3075 | 9 | 0 | 1 | 0 | 0 |
---|
store.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 985 entries, 845 to 26
Data columns (total 10 columns):
revenue 985 non-null float64
reach 985 non-null int64
local_tv 929 non-null float64
online 985 non-null int64
instore 985 non-null int64
person 985 non-null int64
event_cobranding 985 non-null uint8
event_holiday 985 non-null uint8
event_non_event 985 non-null uint8
event_special 985 non-null uint8
dtypes: float64(2), int64(4), uint8(4)
memory usage: 57.7 KB
store.corr()
| revenue | reach | local_tv | online | instore | person | event_cobranding | event_holiday | event_non_event | event_special |
---|
revenue | 1.000000 | -0.155314 | 0.602114 | 0.171227 | 0.311739 | 0.559208 | -0.005623 | -0.016559 | -0.019155 | 0.033752 |
---|
reach | -0.155314 | 1.000000 | -0.034039 | -0.025141 | 0.035635 | 0.061417 | 0.043809 | 0.020398 | -0.043128 | -0.023330 |
---|
local_tv | 0.602114 | -0.034039 | 1.000000 | 0.006775 | -0.046825 | 0.048664 | 0.020886 | -0.039650 | 0.011335 | -0.005874 |
---|
online | 0.171227 | -0.025141 | 0.006775 | 1.000000 | -0.026399 | 0.036662 | -0.024646 | -0.018596 | -0.020587 | 0.056799 |
---|
instore | 0.311739 | 0.035635 | -0.046825 | -0.026399 | 1.000000 | -0.007482 | -0.057725 | 0.045963 | 0.015495 | 0.017788 |
---|
person | 0.559208 | 0.061417 | 0.048664 | 0.036662 | -0.007482 | 1.000000 | 0.002439 | -0.025692 | -0.025568 | 0.036771 |
---|
event_cobranding | -0.005623 | 0.043809 | 0.020886 | -0.024646 | -0.057725 | 0.002439 | 1.000000 | -0.281389 | -0.405169 | -0.534499 |
---|
event_holiday | -0.016559 | 0.020398 | -0.039650 | -0.018596 | 0.045963 | -0.025692 | -0.281389 | 1.000000 | -0.168151 | -0.221824 |
---|
event_non_event | -0.019155 | -0.043128 | 0.011335 | -0.020587 | 0.015495 | -0.025568 | -0.405169 | -0.168151 | 1.000000 | -0.319403 |
---|
event_special | 0.033752 | -0.023330 | -0.005874 | 0.056799 | 0.017788 | 0.036771 | -0.534499 | -0.221824 | -0.319403 | 1.000000 |
---|
store.corr()[['revenue']].sort_values('revenue',ascending=False)
| revenue |
---|
revenue | 1.000000 |
---|
local_tv | 0.602114 |
---|
person | 0.559208 |
---|
instore | 0.311739 |
---|
online | 0.171227 |
---|
event_special | 0.033752 |
---|
event_cobranding | -0.005623 |
---|
event_holiday | -0.016559 |
---|
event_non_event | -0.019155 |
---|
reach | -0.155314 |
---|
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.regplot('local_tv','revenue',store)
<matplotlib.axes._subplots.AxesSubplot at 0x1f2206a22b0>
sns.regplot('person','revenue',store)
<matplotlib.axes._subplots.AxesSubplot at 0x1f2207de8d0>
sns.regplot('instore','revenue',store)
<matplotlib.axes._subplots.AxesSubplot at 0x1f220855978>
from sklearn.linear_model import LinearRegression
model=LinearRegression()
y=store['revenue']
model.fit(x,y)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
store=store.fillna(0)
store=store.fillna(store.local_tv.mean())
store.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 985 entries, 845 to 26
Data columns (total 10 columns):
revenue 985 non-null float64
reach 985 non-null int64
local_tv 985 non-null float64
online 985 non-null int64
instore 985 non-null int64
person 985 non-null int64
event_cobranding 985 non-null uint8
event_holiday 985 non-null uint8
event_non_event 985 non-null uint8
event_special 985 non-null uint8
dtypes: float64(2), int64(4), uint8(4)
memory usage: 57.7 KB
model.coef_
array([4.01736340e-01, 2.10959413e+03, 3.61050584e+00, 3.79371728e+00])
model.intercept_
-8967.736870300454
score=model.score(x,y)
predictions=model.predict(x)
error=predictions-y
rmse=(error**2).mean()**.5
mae=abs(error).mean()
print(rmse)
print(mae)
8321.491623472051
6556.036999600779
score=model.score(x,y)
predictions=model.predict(x)
error=predictions-y
rmse=(error**2).mean()**.5
mae=abs(error).mean()
print(rmse)
print(mae)
8106.512169325369
6402.202883441895
score=model.score(x,y)
predictions=model.predict(x)
error=predictions-y
rmse=(error**2).mean()**.5
mae=abs(error).mean()
print(rmse)
print(mae)
5591.764749668997
4485.506383110859
from statsmodels.formula.api import ols
x=store[['local_tv','person','instore']]
y=store['revenue']
model=ols('y~x',store).fit()
print(model.summary())
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.746
Model: OLS Adj. R-squared: 0.745
Method: Least Squares F-statistic: 959.2
Date: Wed, 19 Jun 2019 Prob (F-statistic): 4.09e-291
Time: 15:24:38 Log-Likelihood: -9947.5
No. Observations: 985 AIC: 1.990e+04
Df Residuals: 981 BIC: 1.992e+04
Df Model: 3
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept -5.288e+04 1804.489 -29.305 0.000 -5.64e+04 -4.93e+04
x[0] 1.7515 0.049 35.857 0.000 1.656 1.847
x[1] 2050.5749 61.866 33.146 0.000 1929.171 2171.979
x[2] 4.0903 0.193 21.229 0.000 3.712 4.468
==============================================================================
Omnibus: 0.352 Durbin-Watson: 2.056
Prob(Omnibus): 0.839 Jarque-Bera (JB): 0.402
Skew: 0.043 Prob(JB): 0.818
Kurtosis: 2.951 Cond. No. 3.05e+05
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.05e+05. This might indicate that there are
strong multicollinearity or other numerical problems.