import pandas as pd
import numpy as np
import pymysql
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
%matplotlib inline
# MySQL connection, set up for optional database access; the analysis below reads from CSV
engine = create_engine('mysql+pymysql://root:123456@localhost:3306/datascience')
## Load the data
data = 'data/section7-dau.csv'
dau = pd.read_csv(data)
dau.head()
  region_month  region_day app_name   user_id device
0      2013-01  2013-01-01  game-02  10061580     FP
1      2013-01  2013-01-01  game-02  10154440     FP
2      2013-01  2013-01-01  game-02  10164762     SP
3      2013-01  2013-01-01  game-02  10165615     FP
4      2013-01  2013-01-01  game-02  10321356     FP
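The SQLAlchemy engine created above is not otherwise used; the data comes from the CSV. As a sketch of the alternative (assuming the same records live in a MySQL table named `dau`, which is an assumption about the schema, not something shown in this notebook), the read could go through the engine instead:

```python
# Hypothetical: pull the same columns from MySQL instead of the CSV.
# The table name 'dau' is assumed, not taken from the original notebook.
dau = pd.read_sql('SELECT region_month, region_day, app_name, user_id, device FROM dau', engine)
```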
dau.info()
print(dau.region_month.value_counts())
print(dau.region_day.unique())
print(dau.device.value_counts())
2013-01    25847
2013-02    23141
Name: region_month, dtype: int64

['2013-01-01' '2013-01-02' '2013-01-03' '2013-01-04' '2013-01-05'
 '2013-01-06' '2013-01-07' '2013-01-08' '2013-01-09' '2013-01-10'
 '2013-01-11' '2013-01-12' '2013-01-13' '2013-01-14' '2013-01-15'
 '2013-01-16' '2013-01-17' '2013-01-18' '2013-01-19' '2013-01-20'
 '2013-01-21' '2013-01-22' '2013-01-23' '2013-01-24' '2013-01-25'
 '2013-01-26' '2013-01-27' '2013-01-28' '2013-01-29' '2013-01-30'
 '2013-01-31' '2013-02-01' '2013-02-02' '2013-02-03' '2013-02-04'
 '2013-02-05' '2013-02-06' '2013-02-07' '2013-02-08' '2013-02-09'
 '2013-02-10' '2013-02-11' '2013-02-12' '2013-02-13' '2013-02-14'
 '2013-02-15' '2013-02-16' '2013-02-17' '2013-02-18' '2013-02-19'
 '2013-02-20' '2013-02-21' '2013-02-22' '2013-02-23' '2013-02-24'
 '2013-02-25' '2013-02-26' '2013-02-27' '2013-02-28']

FP    30331
SP    18657
Name: device, dtype: int64

## Preparing the account-migration data

#### Select the needed columns and drop duplicates, giving each user's logins by month and device
mau = dau[['region_month','user_id','device']]
mau.head()
  region_month   user_id device
0      2013-01  10061580     FP
1      2013-01  10154440     FP
2      2013-01  10164762     SP
3      2013-01  10165615     FP
4      2013-01  10321356     FP
print(mau.duplicated().sum())
mau.drop_duplicates(inplace=True)
print(mau.duplicated().sum())
46007
0

D:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
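The warning appears because `mau` is a slice of `dau`, and the in-place `drop_duplicates` mutates that slice. A minimal sketch of the usual fix, with identical results, is to take an explicit copy first:

```python
# Copy the slice so pandas operates on an independent frame;
# this removes the SettingWithCopyWarning without changing the result.
mau = dau[['region_month', 'user_id', 'device']].copy()
mau.drop_duplicates(inplace=True)
```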
#### Separate feature-phone (FP) and smartphone (SP) users

fp = dau[dau['device']=='FP'][['region_month','user_id','device']].drop_duplicates()
sp = dau[dau['device']=='SP'][['region_month','user_id','device']].drop_duplicates()
print(fp.info())
print(sp.info())
#### Get the January and February subsets for each device
fp_m1 = fp[fp['region_month']=='2013-01']
fp_m2 = fp[fp['region_month']=='2013-02']
sp_m1 = sp[sp['region_month']=='2013-01']
sp_m2 = sp[sp['region_month']=='2013-02']
#### Did January's feature-phone users visit at all in February?
mau['is_access'] = 1
fp_m1 = pd.merge(fp_m1, mau[mau['region_month']=='2013-02'][['user_id','is_access']], how='left', on='user_id')
fp_m1['is_access'].fillna(0, inplace=True)
fp_m1.head()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
  region_month   user_id device  is_access
0      2013-01  10061580     FP        1.0
1      2013-01  10154440     FP        0.0
2      2013-01  10165615     FP        1.0
3      2013-01  10321356     FP        1.0
4      2013-01  10447112     FP        1.0
#### Did January's feature-phone players keep visiting via feature phone in February?
fp_m2['is_fp'] = 1
fp_m1 = pd.merge(fp_m1, fp_m2[['user_id','is_fp']], how='left', on='user_id')
fp_m1['is_fp'].fillna(0, inplace=True)
fp_m1.head()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
  region_month   user_id device  is_access  is_fp
0      2013-01  10061580     FP        1.0    1.0
1      2013-01  10154440     FP        0.0    0.0
2      2013-01  10165615     FP        1.0    1.0
3      2013-01  10321356     FP        1.0    1.0
4      2013-01  10447112     FP        1.0    1.0
#### Did January's feature-phone players visit via smartphone in February (i.e. migrate)?
sp_m2['is_sp'] = 1
fp_m1 = pd.merge(fp_m1, sp_m2[['user_id','is_sp']], how='left', on='user_id')
fp_m1['is_sp'].fillna(0, inplace=True)
fp_m1.head()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
  region_month   user_id device  is_access  is_fp  is_sp
0      2013-01  10061580     FP        1.0    1.0    0.0
1      2013-01  10154440     FP        0.0    0.0    0.0
2      2013-01  10165615     FP        1.0    1.0    0.0
3      2013-01  10321356     FP        1.0    1.0    0.0
4      2013-01  10447112     FP        1.0    1.0    0.0
#### Keep users who visited via feature phone in January but either did not visit in February, or visited via smartphone
fp_m1 = fp_m1[(fp_m1['is_access']==0) | (fp_m1['is_sp']==1)]
fp_m1.head()
   region_month   user_id device  is_access  is_fp  is_sp
1       2013-01  10154440     FP        0.0    0.0    0.0
7       2013-01  10528830     FP        0.0    0.0    0.0
20      2013-01   1163733     FP        1.0    0.0    1.0
21      2013-01  11727630     FP        0.0    0.0    0.0
43      2013-01  13401362     FP        1.0    0.0    1.0

#### The frame above supplies the label (`is_sp`) for the logistic regression
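For reference, the same label can be built without the three merges, using membership tests against the February user sets. This is only a sketch equivalent to the steps above (it reuses `fp`, `mau`, and `sp_m2` as already defined, and omits the `is_fp` flag since it is not used afterwards):

```python
# Users active in February at all, and users active via smartphone.
feb_users = set(mau.loc[mau['region_month'] == '2013-02', 'user_id'])
feb_sp_users = set(sp_m2['user_id'])

fp_jan = fp[fp['region_month'] == '2013-01'].copy()
fp_jan['is_access'] = fp_jan['user_id'].isin(feb_users).astype(int)
fp_jan['is_sp'] = fp_jan['user_id'].isin(feb_sp_users).astype(int)
# Keep churned users (no February access) and migrated users (SP access).
fp_jan = fp_jan[(fp_jan['is_access'] == 0) | (fp_jan['is_sp'] == 1)]
```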
## Preparing the per-day access data
fp_dau = dau[(dau['device']=='FP') & (dau['region_month']=='2013-01')]
fp_dau['is_access'] = 1
fp_dau.head()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
  region_month  region_day app_name   user_id device  is_access
0      2013-01  2013-01-01  game-02  10061580     FP          1
1      2013-01  2013-01-01  game-02  10154440     FP          1
3      2013-01  2013-01-01  game-02  10165615     FP          1
4      2013-01  2013-01-01  game-02  10321356     FP          1
6      2013-01  2013-01-01  game-02  10447112     FP          1
# Column labels X1day ... X31day, one per day of January
b = ['X' + str(a) + 'day' for a in np.arange(1, 32)]
# One row per user, one 0/1 column per day of January
fp_dau_pivot = pd.pivot_table(fp_dau, values='is_access', columns='region_day', index='user_id', fill_value=0)
fp_dau_pivot.columns = b
fp_dau_pivot.reset_index(inplace=True)
fp_dau_pivot.head()
   user_id  X1day  X2day  X3day  X4day  X5day  X6day  X7day  X8day  X9day  ...  X22day  X23day  X24day  X25day  X26day  X27day  X28day  X29day  X30day  X31day
0   397286      1      1      1      1      1      1      1      1      1  ...       1       1       1       1       1       1       1       1       1       1
1   471341      1      1      1      1      0      0      0      0      0  ...       0       0       0       0       0       0       0       0       0       0
2   503874      1      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0       0       0       0
3   512250      1      1      1      1      1      1      1      1      1  ...       1       1       1       1       1       1       1       1       1       1
4   513811      0      0      0      0      0      0      0      0      0  ...       1       0       0       0       0       0       1       1       0       1
5 rows × 32 columns
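Hard-coding 31 names assumes the January pivot always yields exactly one column per day. A slightly more defensive sketch derives the labels from the pivoted dates themselves:

```python
# Build the X{n}day labels from the actual date columns rather than
# assuming there are exactly 31 of them.
pivot = pd.pivot_table(fp_dau, values='is_access', columns='region_day',
                       index='user_id', fill_value=0)
pivot.columns = ['X{}day'.format(pd.Timestamp(c).day) for c in pivot.columns]
pivot.reset_index(inplace=True)
```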
fp_dau_m = pd.merge(fp_dau_pivot, fp_m1[['user_id','is_sp']], how='inner', on='user_id')
fp_dau_m.head()
   user_id  X1day  X2day  X3day  X4day  X5day  X6day  X7day  X8day  X9day  ...  X23day  X24day  X25day  X26day  X27day  X28day  X29day  X30day  X31day  is_sp
0   471341      1      1      1      1      0      0      0      0      0  ...       0       0       0       0       0       0       0       0       0    1.0
1   503874      1      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0       0       0    0.0
2  1073544      0      0      0      0      0      0      0      0      0  ...       1       1       1       0       0       0       0       0       0    0.0
3  1073864      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0       0       0    0.0
4  1163733      1      1      0      0      0      0      0      0      0  ...       1       1       1       1       1       1       0       0       0    1.0
5 rows × 33 columns
fp_dau_m.isna().sum().sum()
0
fp_dau_m.is_sp.value_counts()
0.0    190
1.0     62
Name: is_sp, dtype: int64

#### In the output above, is_sp = 1 marks users who accessed the game via smartphone in February, and is_sp = 0 marks churned users: 190 users churned in February, while 62 switched to a smartphone!

## Logistic regression

### 1. sklearn

#### Tuning the solver and the penalty strength C can push the model's accuracy on the training data toward 100%
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver='lbfgs', C=10)
x = fp_dau_m.iloc[:,1:-1]
y = fp_dau_m.iloc[:,-1]
lr.fit(x, y)
print('Coefficients:', lr.coef_)
print('Intercept:', lr.intercept_)
print('Score:', lr.score(x, y))
Coefficients: [[ 1.64264315  0.38232509  0.27375659  1.77818234 -1.2604587  -0.62425027
   1.64964331  0.94366796 -0.30971957 -2.45689215  1.05453162 -0.49567095
   1.37452985 -0.79198757 -1.39648934  0.18038175 -0.34026571  1.01401641
  -0.49919155 -0.25791649  0.98296119  1.03952236 -1.03446927  1.53177282
  -0.12212919  0.30942289  0.31267693 -0.08203749  1.32893163  1.57890364
   1.29380472]]
Intercept: [-3.9031072]
Score: 0.9047619047619048
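The score above is measured on the very data the model was fitted to, so it overstates predictive power. A quick cross-validated estimate (an addition, not in the original notebook) would look like this:

```python
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy; expect a lower, more honest figure
# than the in-sample score printed above.
scores = cross_val_score(LogisticRegression(solver='lbfgs', C=10), x, y, cv=5)
print('CV accuracy: %.3f +/- %.3f' % (scores.mean(), scores.std()))
```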
yp = lr.predict_proba(x)[:,1]
df = fp_dau_m.copy()
df['prob'] = yp
df['pred'] = df['prob'].apply(lambda x: 1 if x > 0.5 else 0)
df.head(15)
     user_id  X1day  X2day  X3day  X4day  X5day  X6day  X7day  X8day  X9day  ...  X25day  X26day  X27day  X28day  X29day  X30day  X31day  is_sp      prob  pred
0     471341      1      1      1      1      0      0      0      0      0  ...       0       0       0       0       0       0       0    1.0  0.543341     1
1     503874      1      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.094451     0
2    1073544      0      0      0      0      0      0      0      0      0  ...       1       0       0       0       0       0       0    0.0  0.002510     0
3    1073864      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.025567     0
4    1163733      1      1      0      0      0      0      0      0      0  ...       1       1       1       1       0       0       0    1.0  0.849838     1
5    1454629      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.073879     0
6    1557628      0      0      0      0      0      0      0      0      1  ...       0       0       0       0       0       0       1    0.0  0.051221     0
7    2241462      1      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    1.0  0.094451     0
8    2313236      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.085385     0
9    2477685      0      0      0      0      0      0      0      0      0  ...       1       0       0       0       0       0       0    0.0  0.017546     0
10   2541741      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.001726     0
11   2628661      0      0      0      0      0      1      0      0      0  ...       0       1       0       0       0       0       0    0.0  0.014515     0
12   3509436      0      1      0      1      1      1      0      1      1  ...       1       1       1       1       1       1       1    1.0  0.987940     1
13   3509436      0      1      0      1      1      1      0      1      1  ...       1       1       1       1       1       1       1    1.0  0.987940     1
14   3955950      1      1      1      1      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.543341     1
15 rows × 35 columns
df.groupby(['is_sp','pred'])['user_id'].count().reset_index()
   is_sp  pred  user_id
0    0.0     0      181
1    0.0     1        9
2    1.0     0       15
3    1.0     1       47
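The same breakdown can be cross-checked with sklearn's confusion_matrix (a convenience addition, not in the original):

```python
from sklearn.metrics import confusion_matrix

# Rows are the actual is_sp values (0, 1); columns the predicted labels.
print(confusion_matrix(df['is_sp'], df['pred']))
```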
len(df[df['is_sp']==df['pred']])/len(df)
0.9047619047619048

#### The next model needs no manual parameter tuning: LogisticRegressionCV selects the penalty strength C by cross-validation itself. As before, the decisive settings are the solver and C.
from sklearn.linear_model import LogisticRegressionCV
lr = LogisticRegressionCV(cv=10)
x = fp_dau_m.iloc[:,1:-1]
y = fp_dau_m.iloc[:,-1]
lr.fit(x, y)
print('Coefficients:', lr.coef_)
print('Intercept:', lr.intercept_)
print('-----------------------------------------------')
print('Score:', lr.score(x, y))
Coefficients: [[ 0.66247469  0.39566209  0.12089587  0.72621501 -0.14485039 -0.11496137
   0.50433275  0.25667173  0.11561233 -0.48159577  0.23713178 -0.12897139
   0.31542595 -0.16714406 -0.1914315  -0.09390318 -0.05036135  0.0924934
  -0.14949742 -0.05918408  0.52355482  0.58543392  0.0882812   0.39783666
   0.07477356  0.14874974  0.39921228  0.38402639  0.68729765  0.6331324
   0.55885631]]
Intercept: [-2.95546571]
-----------------------------------------------
Score: 0.8928571428571429
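To see which penalty strength the cross-validation actually settled on, the fitted model exposes it as an attribute:

```python
# C_ holds the C value selected by cross-validation (one entry per class).
print('Chosen C:', lr.C_)
```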
### 2. statsmodels

import statsmodels.api as sm
import statsmodels.formula.api as fsm
x = fp_dau_m.iloc[:,1:-1]
x['intercept'] = 1.0
y = fp_dau_m.iloc[:,-1]
logit = sm.Logit(y, x)
result = logit.fit(method='bfgs', maxiter=100)
Warning: Maximum number of iterations has been exceeded.
         Current function value: 0.222887
         Iterations: 100
         Function evaluations: 101
         Gradient evaluations: 101

C:\Users\sylva\AppData\Roaming\Python\Python36\site-packages\statsmodels\base\model.py:508: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)
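BFGS hit the iteration cap before converging. A minimal sketch of the usual remedies: raise `maxiter`, or drop `method='bfgs'` and let statsmodels use its default Newton-Raphson solver:

```python
# Give BFGS more iterations ...
result = logit.fit(method='bfgs', maxiter=1000)
# ... or fall back to the default Newton-Raphson solver:
# result = logit.fit()
```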
result.pred_table()
array([[180.,  10.],
       [ 14.,  48.]])
print(result.summary2())
                          Results: Logit
=================================================================
Model:              Logit            Pseudo R-squared: 0.601
Dependent Variable: is_sp            AIC:              176.3352
Date:               2018-08-24 12:07 BIC:              289.2770
No. Observations:   252              Log-Likelihood:   -56.168
Df Model:           31               LL-Null:          -140.60
Df Residuals:       220              LLR p-value:      6.6358e-21
Converged:          0.0000           Scale:            1.0000
-----------------------------------------------------------------
              Coef.   Std.Err.     z     P>|z|   [0.025   0.975]
-----------------------------------------------------------------
X1day         1.9894    0.8047  2.4720  0.0134   0.4121   3.5666
X2day         0.3311    1.0705  0.3093  0.7571  -1.7671   2.4293
X3day         0.3793    0.9406  0.4033  0.6867  -1.4641   2.2227
X4day         2.0422    0.8359  2.4430  0.0146   0.4038   3.6805
X5day        -1.7597    1.1991 -1.4675  0.1422  -4.1100   0.5906
X6day        -0.6679    1.1717 -0.5701  0.5686  -2.9643   1.6285
X7day         2.0157    1.1176  1.8036  0.0713  -0.1747   4.2061
X8day         1.2119    1.3505  0.8974  0.3695  -1.4350   3.8589
X9day        -0.4495    1.1874 -0.3786  0.7050  -2.7768   1.8778
X10day       -3.2374    1.5580 -2.0779  0.0377  -6.2911  -0.1837
X11day        1.4392    1.2234  1.1764  0.2394  -0.9586   3.8370
X12day       -0.6389    1.5297 -0.4176  0.6762  -3.6370   2.3592
X13day        1.7797    1.1424  1.5579  0.1193  -0.4594   4.0188
X14day       -1.1242    1.2455 -0.9026  0.3668  -3.5653   1.3170
X15day       -1.8115    1.3050 -1.3881  0.1651  -4.3694   0.7463
X16day        0.4940    1.1666  0.4234  0.6720  -1.7925   2.7804
X17day       -0.4448    1.2234 -0.3636  0.7162  -2.8427   1.9531
X18day        1.4321    1.1465  1.2491  0.2116  -0.8150   3.6791
X19day       -0.6132    1.1990 -0.5114  0.6091  -2.9632   1.7369
X20day       -0.3130    1.4007 -0.2235  0.8232  -3.0585   2.4324
X21day        0.9587    1.2558  0.7634  0.4452  -1.5027   3.4201
X22day        1.1954    1.1238  1.0637  0.2875  -1.0072   3.3980
X23day       -1.5371    1.2303 -1.2494  0.2115  -3.9486   0.8743
X24day        1.8445    1.1038  1.6710  0.0947  -0.3190   4.0080
X25day        0.1292    1.5317  0.0844  0.9328  -2.8727   3.1312
X26day        0.3131    1.4280  0.2192  0.8265  -2.4858   3.1119
X27day        0.3365    1.2965  0.2596  0.7952  -2.2045   2.8776
X28day       -0.3918    1.8515 -0.2116  0.8324  -4.0207   3.2372
X29day        1.5941    1.0565  1.5088  0.1314  -0.4767   3.6648
X30day        1.9943    1.2117  1.6459  0.0998  -0.3806   4.3692
X31day        1.5214    1.1798  1.2896  0.1972  -0.7908   3.8337
intercept    -4.2502    0.5904 -7.1985  0.0000  -5.4074  -3.0930
=================================================================
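Each coefficient is a contribution to the log-odds of migrating; exponentiating gives odds ratios, which are easier to read (an illustrative addition, not in the original):

```python
# e.g. exp(1.9894) ≈ 7.3: a visit on January 1st multiplies the odds
# of migrating to a smartphone by roughly 7.
print(np.exp(result.params).head())
```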
xx = fp_dau_m.iloc[:,1:-1]
xx['intercept'] = 1.0
y_p = result.predict(xx)
ydf = fp_dau_m.copy()
ydf['prob'] = y_p
ydf['pred'] = ydf['prob'].apply(lambda x: 1 if x > 0.5 else 0)
ydf.head(15)
     user_id  X1day  X2day  X3day  X4day  X5day  X6day  X7day  X8day  X9day  ...  X25day  X26day  X27day  X28day  X29day  X30day  X31day  is_sp      prob  pred
0     471341      1      1      1      1      0      0      0      0      0  ...       0       0       0       0       0       0       0    1.0  0.620506     1
1     503874      1      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.094416     0
2    1073544      0      0      0      0      0      0      0      0      0  ...       1       0       0       0       0       0       0    0.0  0.000866     0
3    1073864      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.019167     0
4    1163733      1      1      0      0      0      0      0      0      0  ...       1       1       1       1       0       0       0    1.0  0.870576     1
5    1454629      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.077951     0
6    1557628      0      0      0      0      0      0      0      0      1  ...       0       0       0       0       0       0       1    0.0  0.039991     0
7    2241462      1      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    1.0  0.094416     0
8    2313236      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.082739     0
9    2477685      0      0      0      0      0      0      0      0      0  ...       1       0       0       0       0       0       0    0.0  0.015969     0
10   2541741      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.000560     0
11   2628661      0      0      0      0      0      1      0      0      0  ...       0       1       0       0       0       0       0    0.0  0.009902     0
12   3509436      0      1      0      1      1      1      0      1      1  ...       1       1       1       1       1       1       1    1.0  0.992456     1
13   3509436      0      1      0      1      1      1      0      1      1  ...       1       1       1       1       1       1       1    1.0  0.992456     1
14   3955950      1      1      1      1      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.620506     1
15 rows × 35 columns
ydf.groupby(['is_sp','pred'])['user_id'].count().reset_index()
   is_sp  pred  user_id
0    0.0     0      180
1    0.0     1       10
2    1.0     0       14
3    1.0     1       48
len(ydf[ydf['is_sp']==ydf['pred']])/len(ydf)
0.9047619047619048

### Inspecting the results

According to the sklearn predictions, 9 users were predicted as 1, i.e. expected to have migrated their accounts, but in fact had not. Judging from their past access patterns these users looked like migrators, yet they actually belong to the churned group.
df.head(10)
    user_id  X1day  X2day  X3day  X4day  X5day  X6day  X7day  X8day  X9day  ...  X25day  X26day  X27day  X28day  X29day  X30day  X31day  is_sp      prob  pred
0    471341      1      1      1      1      0      0      0      0      0  ...       0       0       0       0       0       0       0    1.0  0.543341     1
1    503874      1      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.094451     0
2   1073544      0      0      0      0      0      0      0      0      0  ...       1       0       0       0       0       0       0    0.0  0.002510     0
3   1073864      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.025567     0
4   1163733      1      1      0      0      0      0      0      0      0  ...       1       1       1       1       0       0       0    1.0  0.849838     1
5   1454629      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.073879     0
6   1557628      0      0      0      0      0      0      0      0      1  ...       0       0       0       0       0       0       1    0.0  0.051221     0
7   2241462      1      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    1.0  0.094451     0
8   2313236      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.085385     0
9   2477685      0      0      0      0      0      0      0      0      0  ...       1       0       0       0       0       0       0    0.0  0.017546     0
10 rows × 35 columns
df1 = df[(df['is_sp']==1) & (df['pred']==1)]
df1.sort_values(by='prob', ascending=True).head(15)
      user_id  X1day  X2day  X3day  X4day  X5day  X6day  X7day  X8day  X9day  ...  X25day  X26day  X27day  X28day  X29day  X30day  X31day  is_sp      prob  pred
228  52776438      1      1      1      1      1      1      1      1      1  ...       0       0       0       0       0       0       0    1.0  0.512293     1
171  32762652      1      1      1      1      1      1      1      1      1  ...       0       0       0       0       0       0       0    1.0  0.512293     1
155  27800629      1      1      1      1      0      0      0      0      0  ...       0       0       0       0       0       0       0    1.0  0.543341     1
0      471341      1      1      1      1      0      0      0      0      0  ...       0       0       0       0       0       0       0    1.0  0.543341     1
36    8645980      0      0      0      1      0      0      0      0      0  ...       0       0       0       0       1       0       0    1.0  0.551574     1
37    8645980      0      0      0      1      0      0      0      0      0  ...       0       0       0       0       1       0       0    1.0  0.551574     1
169  32500332      1      1      1      1      1      1      1      1      1  ...       1       0       0       0       0       0       0    1.0  0.587923     1
55   11600349      0      1      1      1      1      1      1      1      1  ...       0       0       0       1       1       1       1    1.0  0.684198     1
56   11600349      0      1      1      1      1      1      1      1      1  ...       0       0       0       1       1       1       1    1.0  0.684198     1
146  25787360      0      0      0      0      1      0      1      1      1  ...       0       0       1       0       0       0       0    1.0  0.696295     1
145  25787360      0      0      0      0      1      0      1      1      1  ...       0       0       1       0       0       0       0    1.0  0.696295     1
4     1163733      1      1      0      0      0      0      0      0      0  ...       1       1       1       1       0       0       0    1.0  0.849838     1
48   10406653      0      1      1      1      1      1      1      1      0  ...       1       0       1       1       1       1       1    1.0  0.865393     1
49   10406653      0      1      1      1      1      1      1      1      0  ...       1       0       1       1       1       1       1    1.0  0.865393     1
165  31066299      0      1      1      1      0      1      1      1      1  ...       1       1       1       0       1       1       0    1.0  0.951970     1
15 rows × 35 columns
df2 = df[(df['is_sp']==1) & (df['pred']==1)]
df2.sort_values(by='prob', ascending=False).head(15)
      user_id  X1day  X2day  X3day  X4day  X5day  X6day  X7day  X8day  X9day  ...  X25day  X26day  X27day  X28day  X29day  X30day  X31day  is_sp      prob  pred
136  24791702      1      1      0      1      0      1      1      1      1  ...       1       1       1       1       1       1       1    1.0  0.998618     1
137  24791702      1      1      0      1      0      1      1      1      1  ...       1       1       1       1       1       1       1    1.0  0.998618     1
44    9567562      1      1      1      1      1      1      1      1      1  ...       1       1       1       1       1       1       1    1.0  0.996302     1
43    9567562      1      1      1      1      1      1      1      1      1  ...       1       1       1       1       1       1       1    1.0  0.996302     1
139  24900784      1      1      1      1      1      1      1      1      1  ...       1       1       1       1       1       1       1    1.0  0.993923     1
124  23113079      1      1      1      1      1      1      1      1      1  ...       1       1       1       1       1       1       1    1.0  0.993923     1
133  24581383      1      1      1      1      1      1      1      1      1  ...       1       1       1       1       1       1       1    1.0  0.993923     1
134  24581383      1      1      1      1      1      1      1      1      1  ...       1       1       1       1       1       1       1    1.0  0.993923     1
138  24900784      1      1      1      1      1      1      1      1      1  ...       1       1       1       1       1       1       1    1.0  0.993923     1
123  23113079      1      1      1      1      1      1      1      1      1  ...       1       1       1       1       1       1       1    1.0  0.993923     1
114  21551429      1      1      1      1      1      1      1      1      1  ...       1       1       1       1       1       1       1    1.0  0.993923     1
147  27003770      1      1      1      1      1      1      1      1      1  ...       1       1       1       1       1       1       1    1.0  0.993923     1
148  27003770      1      1      1      1      1      1      1      1      1  ...       1       1       1       1       1       1       1    1.0  0.993923     1
150  27602710      1      1      1      1      1      1      1      1      1  ...       1       1       1       1       1       1       1    1.0  0.993923     1
151  27602710      1      1      1      1      1      1      1      1      1  ...       1       1       1       1       1       1       1    1.0  0.993923     1
15 rows × 35 columns
df3 = df[(df['is_sp']==0) & (df['pred']==1)]
df3.sort_values(by='prob', ascending=False).head(15)
      user_id  X1day  X2day  X3day  X4day  X5day  X6day  X7day  X8day  X9day  ...  X25day  X26day  X27day  X28day  X29day  X30day  X31day  is_sp      prob  pred
194  41590801      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       1       0       1    0.0  0.677458     1
108  19432099      1      1      1      1      0      1      1      1      1  ...       0       0       0       0       0       0       0    0.0  0.643061     1
203  43451947      1      1      1      1      1      0      1      1      1  ...       1       0       0       1       1       0       0    0.0  0.599921     1
197  42276142      1      1      1      1      1      1      0      1      1  ...       1       1       1       1       1       0       0    0.0  0.577420     1
209  46285446      0      0      0      0      1      1      1      1      1  ...       1       1       1       0       1       0       0    0.0  0.576873     1
14    3955950      1      1      1      1      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.543341     1
158  28391896      1      1      1      1      1      1      1      1      1  ...       0       0       0       0       0       0       0    0.0  0.512293     1
240  59561276      1      1      1      1      1      1      1      1      1  ...       0       0       0       0       0       0       0    0.0  0.512293     1
27    6147878      1      0      0      1      1      1      1      1      1  ...       1       1       0       0       0       0       0    0.0  0.502182     1
9 rows × 35 columns
df4 = df[(df['is_sp']==0) & (df['pred']==1)]
df4.sort_values(by='prob', ascending=True).head(15)
      user_id  X1day  X2day  X3day  X4day  X5day  X6day  X7day  X8day  X9day  ...  X25day  X26day  X27day  X28day  X29day  X30day  X31day  is_sp      prob  pred
27    6147878      1      0      0      1      1      1      1      1      1  ...       1       1       0       0       0       0       0    0.0  0.502182     1
158  28391896      1      1      1      1      1      1      1      1      1  ...       0       0       0       0       0       0       0    0.0  0.512293     1
240  59561276      1      1      1      1      1      1      1      1      1  ...       0       0       0       0       0       0       0    0.0  0.512293     1
14    3955950      1      1      1      1      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.543341     1
209  46285446      0      0      0      0      1      1      1      1      1  ...       1       1       1       0       1       0       0    0.0  0.576873     1
197  42276142      1      1      1      1      1      1      0      1      1  ...       1       1       1       1       1       0       0    0.0  0.577420     1
203  43451947      1      1      1      1      1      0      1      1      1  ...       1       0       0       1       1       0       0    0.0  0.599921     1
108  19432099      1      1      1      1      0      1      1      1      1  ...       0       0       0       0       0       0       0    0.0  0.643061     1
194  41590801      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       1       0       1    0.0  0.677458     1
9 rows × 35 columns
df5 = df[(df['is_sp']==0) & (df['pred']==0)]
df5.sort_values(by='prob', ascending=True).head(15)
      user_id  X1day  X2day  X3day  X4day  X5day  X6day  X7day  X8day  X9day  ...  X25day  X26day  X27day  X28day  X29day  X30day  X31day  is_sp      prob  pred
149  27249550      0      0      0      1      1      1      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.000946     0
10    2541741      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.001726     0
242  60725457      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.001726     0
101  18408297      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.001745     0
172  33766090      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.002257     0
2     1073544      0      0      0      0      0      0      0      0      0  ...       1       0       0       0       0       0       0    0.0  0.002510     0
227  52612953      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.003087     0
63   12582684      0      0      0      1      1      0      1      0      0  ...       0       0       0       0       0       0       0    0.0  0.004780     0
208  46056688      0      0      0      0      0      1      1      0      0  ...       0       0       0       0       0       0       0    0.0  0.004799     0
66   13157777      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.004969     0
190  40654033      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.004969     0
120  22437652      0      0      0      0      1      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.005689     0
87   16601600      0      0      0      0      1      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.005689     0
70   13967453      0      0      0      0      1      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.005689     0
112  20955934      0      0      0      0      1      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.005689     0
15 rows × 35 columns
df6 = df[(df['is_sp']==1) & (df['pred']==0)]
df6.sort_values(by='prob', ascending=False).head(15)
      user_id  X1day  X2day  X3day  X4day  X5day  X6day  X7day  X8day  X9day  ...  X25day  X26day  X27day  X28day  X29day  X30day  X31day  is_sp      prob  pred
198  42438713      1      1      1      1      1      1      1      0      0  ...       0       0       0       0       0       0       0    1.0  0.484688     0
127  23689923      1      1      0      1      1      1      1      1      1  ...       0       0       0       0       0       0       0    1.0  0.359100     0
213  47332069      0      0      0      0      0      0      0      0      0  ...       1       1       0       0       0       0       0    1.0  0.281079     0
140  24914421      1      1      1      0      0      0      0      1      0  ...       0       1       0       0       0       0       0    1.0  0.278119     0
226  52131958      0      0      1      1      1      1      1      1      1  ...       1       1       1       0       0       0       0    1.0  0.259709     0
212  47266966      1      0      0      1      0      1      1      1      1  ...       0       0       0       0       0       0       0    1.0  0.232730     0
236  57869405      0      0      0      0      0      0      1      1      0  ...       0       0       0       0       0       0       0    1.0  0.212521     0
161  29698758      1      1      1      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    1.0  0.167370     0
30    7177251      1      1      1      1      1      1      0      0      0  ...       0       0       0       0       0       0       0    1.0  0.153046     0
7     2241462      1      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    1.0  0.094451     0
67   13401362      1      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    1.0  0.094451     0
80   15569351      0      0      0      0      0      0      1      0      1  ...       0       0       0       0       0       0       0    1.0  0.071546     0
93   17388480      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       1       0       0    1.0  0.070819     0
94   17388480      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       1       0       0    1.0  0.070819     0
163  30103279      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    1.0  0.028795     0
15 rows × 35 columns
## The copy problem appears! `=` only binds another name to the same object in memory; when you need an independent DataFrame, use the `.copy()` method
fp_dau_m.head()
   user_id  X1day  X2day  X3day  X4day  X5day  X6day  X7day  X8day  X9day  ...  X23day  X24day  X25day  X26day  X27day  X28day  X29day  X30day  X31day  is_sp
0   471341      1      1      1      1      0      0      0      0      0  ...       0       0       0       0       0       0       0       0       0    1.0
1   503874      1      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0       0       0    0.0
2  1073544      0      0      0      0      0      0      0      0      0  ...       1       1       1       0       0       0       0       0       0    0.0
3  1073864      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0       0       0    0.0
4  1163733      1      1      0      0      0      0      0      0      0  ...       1       1       1       1       1       1       0       0       0    1.0
5 rows × 33 columns
df.equals(fp_dau_m)
False
df.equals(ydf)
False
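A minimal illustration of the point: plain assignment binds a second name to the same DataFrame, so mutations show through, while `.copy()` yields an independent object. That is why `df` and `ydf` above, built with `.copy()` and then given extra columns, no longer equal `fp_dau_m`:

```python
a = pd.DataFrame({'x': [1, 2, 3]})

alias = a            # same object: mutating `alias` also mutates `a`
alias['y'] = 0
print('y' in a.columns)   # True

b = a.copy()         # independent object: `a` is unaffected
b['z'] = 0
print('z' in a.columns)   # False
print(a.equals(b))        # False: b carries an extra column
```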