import pandas as pd
import numpy as np
import pymysql
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
%matplotlib inline
# MySQL connection, set up for optional database access; the analysis below reads from CSV
engine = create_engine('mysql+pymysql://root:123456@localhost:3306/datascience')
## Load the data
data = 'data/section7-dau.csv'
dau = pd.read_csv(data)
dau.head()
  region_month  region_day app_name   user_id device
0      2013-01  2013-01-01  game-02  10061580     FP
1      2013-01  2013-01-01  game-02  10154440     FP
2      2013-01  2013-01-01  game-02  10164762     SP
3      2013-01  2013-01-01  game-02  10165615     FP
4      2013-01  2013-01-01  game-02  10321356     FP
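The SQLAlchemy engine created above is not otherwise used; the data comes from the CSV. As a sketch of the alternative (assuming the same records live in a MySQL table named `dau`, which is an assumption about the schema, not something shown in this notebook), the read could go through the engine instead:

```python
# Hypothetical: pull the same columns from MySQL instead of the CSV.
# The table name 'dau' is assumed, not taken from the original notebook.
dau = pd.read_sql('SELECT region_month, region_day, app_name, user_id, device FROM dau', engine)
```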
dau.info()
print(dau.region_month.value_counts())
print(dau.region_day.unique())
print(dau.device.value_counts())
2013-01    25847
2013-02    23141
Name: region_month, dtype: int64

['2013-01-01' '2013-01-02' '2013-01-03' '2013-01-04' '2013-01-05'
 '2013-01-06' '2013-01-07' '2013-01-08' '2013-01-09' '2013-01-10'
 '2013-01-11' '2013-01-12' '2013-01-13' '2013-01-14' '2013-01-15'
 '2013-01-16' '2013-01-17' '2013-01-18' '2013-01-19' '2013-01-20'
 '2013-01-21' '2013-01-22' '2013-01-23' '2013-01-24' '2013-01-25'
 '2013-01-26' '2013-01-27' '2013-01-28' '2013-01-29' '2013-01-30'
 '2013-01-31' '2013-02-01' '2013-02-02' '2013-02-03' '2013-02-04'
 '2013-02-05' '2013-02-06' '2013-02-07' '2013-02-08' '2013-02-09'
 '2013-02-10' '2013-02-11' '2013-02-12' '2013-02-13' '2013-02-14'
 '2013-02-15' '2013-02-16' '2013-02-17' '2013-02-18' '2013-02-19'
 '2013-02-20' '2013-02-21' '2013-02-22' '2013-02-23' '2013-02-24'
 '2013-02-25' '2013-02-26' '2013-02-27' '2013-02-28']

FP    30331
SP    18657
Name: device, dtype: int64

## Preparing the account-migration data

#### Select the needed columns and drop duplicates, giving each user's logins by month and device
mau = dau[['region_month','user_id','device']]
mau.head()
  region_month   user_id device
0      2013-01  10061580     FP
1      2013-01  10154440     FP
2      2013-01  10164762     SP
3      2013-01  10165615     FP
4      2013-01  10321356     FP
print(mau.duplicated().sum())
mau.drop_duplicates(inplace=True)
print(mau.duplicated().sum())
46007
0

D:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
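The warning appears because `mau` is a slice of `dau`, and the in-place `drop_duplicates` mutates that slice. A minimal sketch of the usual fix, with identical results, is to take an explicit copy first:

```python
# Copy the slice so pandas operates on an independent frame;
# this removes the SettingWithCopyWarning without changing the result.
mau = dau[['region_month', 'user_id', 'device']].copy()
mau.drop_duplicates(inplace=True)
```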
#### Separate feature-phone (FP) and smartphone (SP) users

fp = dau[dau['device']=='FP'][['region_month','user_id','device']].drop_duplicates()
sp = dau[dau['device']=='SP'][['region_month','user_id','device']].drop_duplicates()
print(fp.info())
print(sp.info())
#### Get the January and February subsets for each device
fp_m1 = fp[fp['region_month']=='2013-01']
fp_m2 = fp[fp['region_month']=='2013-02']
sp_m1 = sp[sp['region_month']=='2013-01']
sp_m2 = sp[sp['region_month']=='2013-02']
#### Did January's feature-phone users visit at all in February?
mau['is_access'] = 1
fp_m1 = pd.merge(fp_m1, mau[mau['region_month']=='2013-02'][['user_id','is_access']], how='left', on='user_id')
fp_m1['is_access'].fillna(0, inplace=True)
fp_m1.head()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
  region_month   user_id device  is_access
0      2013-01  10061580     FP        1.0
1      2013-01  10154440     FP        0.0
2      2013-01  10165615     FP        1.0
3      2013-01  10321356     FP        1.0
4      2013-01  10447112     FP        1.0
#### Did January's feature-phone players keep visiting via feature phone in February?
fp_m2['is_fp'] = 1
fp_m1 = pd.merge(fp_m1, fp_m2[['user_id','is_fp']], how='left', on='user_id')
fp_m1['is_fp'].fillna(0, inplace=True)
fp_m1.head()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
  region_month   user_id device  is_access  is_fp
0      2013-01  10061580     FP        1.0    1.0
1      2013-01  10154440     FP        0.0    0.0
2      2013-01  10165615     FP        1.0    1.0
3      2013-01  10321356     FP        1.0    1.0
4      2013-01  10447112     FP        1.0    1.0
#### Did January's feature-phone players visit via smartphone in February (i.e. migrate)?
sp_m2['is_sp'] = 1
fp_m1 = pd.merge(fp_m1, sp_m2[['user_id','is_sp']], how='left', on='user_id')
fp_m1['is_sp'].fillna(0, inplace=True)
fp_m1.head()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
  region_month   user_id device  is_access  is_fp  is_sp
0      2013-01  10061580     FP        1.0    1.0    0.0
1      2013-01  10154440     FP        0.0    0.0    0.0
2      2013-01  10165615     FP        1.0    1.0    0.0
3      2013-01  10321356     FP        1.0    1.0    0.0
4      2013-01  10447112     FP        1.0    1.0    0.0
#### Keep users who visited via feature phone in January but either did not visit in February, or visited via smartphone
fp_m1 = fp_m1[(fp_m1['is_access']==0) | (fp_m1['is_sp']==1)]
fp_m1.head()
   region_month   user_id device  is_access  is_fp  is_sp
1       2013-01  10154440     FP        0.0    0.0    0.0
7       2013-01  10528830     FP        0.0    0.0    0.0
20      2013-01   1163733     FP        1.0    0.0    1.0
21      2013-01  11727630     FP        0.0    0.0    0.0
43      2013-01  13401362     FP        1.0    0.0    1.0

#### The frame above supplies the label (`is_sp`) for the logistic regression
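For reference, the same label can be built without the three merges, using membership tests against the February user sets. This is only a sketch equivalent to the steps above (it reuses `fp`, `mau`, and `sp_m2` as already defined, and omits the `is_fp` flag since it is not used afterwards):

```python
# Users active in February at all, and users active via smartphone.
feb_users = set(mau.loc[mau['region_month'] == '2013-02', 'user_id'])
feb_sp_users = set(sp_m2['user_id'])

fp_jan = fp[fp['region_month'] == '2013-01'].copy()
fp_jan['is_access'] = fp_jan['user_id'].isin(feb_users).astype(int)
fp_jan['is_sp'] = fp_jan['user_id'].isin(feb_sp_users).astype(int)
# Keep churned users (no February access) and migrated users (SP access).
fp_jan = fp_jan[(fp_jan['is_access'] == 0) | (fp_jan['is_sp'] == 1)]
```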
## Preparing the per-day access data
fp_dau = dau[(dau['device']=='FP') & (dau['region_month']=='2013-01')]
fp_dau['is_access'] = 1
fp_dau.head()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
  region_month  region_day app_name   user_id device  is_access
0      2013-01  2013-01-01  game-02  10061580     FP          1
1      2013-01  2013-01-01  game-02  10154440     FP          1
3      2013-01  2013-01-01  game-02  10165615     FP          1
4      2013-01  2013-01-01  game-02  10321356     FP          1
6      2013-01  2013-01-01  game-02  10447112     FP          1
# Column labels X1day ... X31day, one per day of January
b = ['X' + str(a) + 'day' for a in np.arange(1, 32)]
# One row per user, one 0/1 column per day of January
fp_dau_pivot = pd.pivot_table(fp_dau, values='is_access', columns='region_day', index='user_id', fill_value=0)
fp_dau_pivot.columns = b
fp_dau_pivot.reset_index(inplace=True)
fp_dau_pivot.head()
   user_id  X1day  X2day  X3day  X4day  X5day  X6day  X7day  X8day  X9day  ...  X22day  X23day  X24day  X25day  X26day  X27day  X28day  X29day  X30day  X31day
0   397286      1      1      1      1      1      1      1      1      1  ...       1       1       1       1       1       1       1       1       1       1
1   471341      1      1      1      1      0      0      0      0      0  ...       0       0       0       0       0       0       0       0       0       0
2   503874      1      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0       0       0       0
3   512250      1      1      1      1      1      1      1      1      1  ...       1       1       1       1       1       1       1       1       1       1
4   513811      0      0      0      0      0      0      0      0      0  ...       1       0       0       0       0       0       1       1       0       1
5 rows × 32 columns
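Hard-coding 31 names assumes the January pivot always yields exactly one column per day. A slightly more defensive sketch derives the labels from the pivoted dates themselves:

```python
# Build the X{n}day labels from the actual date columns rather than
# assuming there are exactly 31 of them.
pivot = pd.pivot_table(fp_dau, values='is_access', columns='region_day',
                       index='user_id', fill_value=0)
pivot.columns = ['X{}day'.format(pd.Timestamp(c).day) for c in pivot.columns]
pivot.reset_index(inplace=True)
```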
fp_dau_m = pd.merge(fp_dau_pivot, fp_m1[['user_id','is_sp']], how='inner', on='user_id')
fp_dau_m.head()
   user_id  X1day  X2day  X3day  X4day  X5day  X6day  X7day  X8day  X9day  ...  X23day  X24day  X25day  X26day  X27day  X28day  X29day  X30day  X31day  is_sp
0   471341      1      1      1      1      0      0      0      0      0  ...       0       0       0       0       0       0       0       0       0    1.0
1   503874      1      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0       0       0    0.0
2  1073544      0      0      0      0      0      0      0      0      0  ...       1       1       1       0       0       0       0       0       0    0.0
3  1073864      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0       0       0    0.0
4  1163733      1      1      0      0      0      0      0      0      0  ...       1       1       1       1       1       1       0       0       0    1.0
5 rows × 33 columns
fp_dau_m.isna().sum().sum()
0
fp_dau_m.is_sp.value_counts()
0.0    190
1.0     62
Name: is_sp, dtype: int64

#### In the output above, is_sp = 1 marks users who accessed the game via smartphone in February, and is_sp = 0 marks churned users: 190 users churned in February, while 62 switched to a smartphone!

## Logistic regression

### 1. sklearn

#### Tuning the solver and the penalty strength C can push the model's accuracy on the training data toward 100%
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver='lbfgs', C=10)
x = fp_dau_m.iloc[:,1:-1]
y = fp_dau_m.iloc[:,-1]
lr.fit(x, y)
print('Coefficients:', lr.coef_)
print('Intercept:', lr.intercept_)
print('Score:', lr.score(x, y))
Coefficients: [[ 1.64264315  0.38232509  0.27375659  1.77818234 -1.2604587  -0.62425027
   1.64964331  0.94366796 -0.30971957 -2.45689215  1.05453162 -0.49567095
   1.37452985 -0.79198757 -1.39648934  0.18038175 -0.34026571  1.01401641
  -0.49919155 -0.25791649  0.98296119  1.03952236 -1.03446927  1.53177282
  -0.12212919  0.30942289  0.31267693 -0.08203749  1.32893163  1.57890364
   1.29380472]]
Intercept: [-3.9031072]
Score: 0.9047619047619048
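The score above is measured on the very data the model was fitted to, so it overstates predictive power. A quick cross-validated estimate (an addition, not in the original notebook) would look like this:

```python
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy; expect a lower, more honest figure
# than the in-sample score printed above.
scores = cross_val_score(LogisticRegression(solver='lbfgs', C=10), x, y, cv=5)
print('CV accuracy: %.3f +/- %.3f' % (scores.mean(), scores.std()))
```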
yp = lr.predict_proba(x)[:,1]
df = fp_dau_m.copy()
df['prob'] = yp
df['pred'] = df['prob'].apply(lambda x: 1 if x > 0.5 else 0)
df.head(15)
     user_id  X1day  X2day  X3day  X4day  X5day  X6day  X7day  X8day  X9day  ...  X25day  X26day  X27day  X28day  X29day  X30day  X31day  is_sp      prob  pred
0     471341      1      1      1      1      0      0      0      0      0  ...       0       0       0       0       0       0       0    1.0  0.543341     1
1     503874      1      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.094451     0
2    1073544      0      0      0      0      0      0      0      0      0  ...       1       0       0       0       0       0       0    0.0  0.002510     0
3    1073864      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.025567     0
4    1163733      1      1      0      0      0      0      0      0      0  ...       1       1       1       1       0       0       0    1.0  0.849838     1
5    1454629      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.073879     0
6    1557628      0      0      0      0      0      0      0      0      1  ...       0       0       0       0       0       0       1    0.0  0.051221     0
7    2241462      1      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    1.0  0.094451     0
8    2313236      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.085385     0
9    2477685      0      0      0      0      0      0      0      0      0  ...       1       0       0       0       0       0       0    0.0  0.017546     0
10   2541741      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.001726     0
11   2628661      0      0      0      0      0      1      0      0      0  ...       0       1       0       0       0       0       0    0.0  0.014515     0
12   3509436      0      1      0      1      1      1      0      1      1  ...       1       1       1       1       1       1       1    1.0  0.987940     1
13   3509436      0      1      0      1      1      1      0      1      1  ...       1       1       1       1       1       1       1    1.0  0.987940     1
14   3955950      1      1      1      1      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.543341     1
15 rows × 35 columns
df.groupby(['is_sp','pred'])['user_id'].count().reset_index()
   is_sp  pred  user_id
0    0.0     0      181
1    0.0     1        9
2    1.0     0       15
3    1.0     1       47
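The same breakdown can be cross-checked with sklearn's confusion_matrix (a convenience addition, not in the original):

```python
from sklearn.metrics import confusion_matrix

# Rows are the actual is_sp values (0, 1); columns the predicted labels.
print(confusion_matrix(df['is_sp'], df['pred']))
```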
len(df[df['is_sp']==df['pred']])/len(df)
0.9047619047619048

#### The next model needs no manual parameter tuning: LogisticRegressionCV selects the penalty strength C by cross-validation itself. As before, the decisive settings are the solver and C.
from sklearn.linear_model import LogisticRegressionCV
lr = LogisticRegressionCV(cv=10)
x = fp_dau_m.iloc[:,1:-1]
y = fp_dau_m.iloc[:,-1]
lr.fit(x, y)
print('Coefficients:', lr.coef_)
print('Intercept:', lr.intercept_)
print('-----------------------------------------------')
print('Score:', lr.score(x, y))
Coefficients: [[ 0.66247469  0.39566209  0.12089587  0.72621501 -0.14485039 -0.11496137
   0.50433275  0.25667173  0.11561233 -0.48159577  0.23713178 -0.12897139
   0.31542595 -0.16714406 -0.1914315  -0.09390318 -0.05036135  0.0924934
  -0.14949742 -0.05918408  0.52355482  0.58543392  0.0882812   0.39783666
   0.07477356  0.14874974  0.39921228  0.38402639  0.68729765  0.6331324
   0.55885631]]
Intercept: [-2.95546571]
-----------------------------------------------
Score: 0.8928571428571429
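To see which penalty strength the cross-validation actually settled on, the fitted model exposes it as an attribute:

```python
# C_ holds the C value selected by cross-validation (one entry per class).
print('Chosen C:', lr.C_)
```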
### 2. statsmodels

import statsmodels.api as sm
import statsmodels.formula.api as fsm
x = fp_dau_m.iloc[:,1:-1]
x['intercept'] = 1.0
y = fp_dau_m.iloc[:,-1]
logit = sm.Logit(y, x)
result = logit.fit(method='bfgs', maxiter=100)
Warning: Maximum number of iterations has been exceeded.
         Current function value: 0.222887
         Iterations: 100
         Function evaluations: 101
         Gradient evaluations: 101

C:\Users\sylva\AppData\Roaming\Python\Python36\site-packages\statsmodels\base\model.py:508: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)
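BFGS hit the iteration cap before converging. A minimal sketch of the usual remedies: raise `maxiter`, or drop `method='bfgs'` and let statsmodels use its default Newton-Raphson solver:

```python
# Give BFGS more iterations ...
result = logit.fit(method='bfgs', maxiter=1000)
# ... or fall back to the default Newton-Raphson solver:
# result = logit.fit()
```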
result.pred_table()
array([[180.,  10.],
       [ 14.,  48.]])
print(result.summary2())
                          Results: Logit
=================================================================
Model:              Logit            Pseudo R-squared: 0.601
Dependent Variable: is_sp            AIC:              176.3352
Date:               2018-08-24 12:07 BIC:              289.2770
No. Observations:   252              Log-Likelihood:   -56.168
Df Model:           31               LL-Null:          -140.60
Df Residuals:       220              LLR p-value:      6.6358e-21
Converged:          0.0000           Scale:            1.0000
-----------------------------------------------------------------
              Coef.   Std.Err.     z     P>|z|   [0.025   0.975]
-----------------------------------------------------------------
X1day         1.9894    0.8047  2.4720  0.0134   0.4121   3.5666
X2day         0.3311    1.0705  0.3093  0.7571  -1.7671   2.4293
X3day         0.3793    0.9406  0.4033  0.6867  -1.4641   2.2227
X4day         2.0422    0.8359  2.4430  0.0146   0.4038   3.6805
X5day        -1.7597    1.1991 -1.4675  0.1422  -4.1100   0.5906
X6day        -0.6679    1.1717 -0.5701  0.5686  -2.9643   1.6285
X7day         2.0157    1.1176  1.8036  0.0713  -0.1747   4.2061
X8day         1.2119    1.3505  0.8974  0.3695  -1.4350   3.8589
X9day        -0.4495    1.1874 -0.3786  0.7050  -2.7768   1.8778
X10day       -3.2374    1.5580 -2.0779  0.0377  -6.2911  -0.1837
X11day        1.4392    1.2234  1.1764  0.2394  -0.9586   3.8370
X12day       -0.6389    1.5297 -0.4176  0.6762  -3.6370   2.3592
X13day        1.7797    1.1424  1.5579  0.1193  -0.4594   4.0188
X14day       -1.1242    1.2455 -0.9026  0.3668  -3.5653   1.3170
X15day       -1.8115    1.3050 -1.3881  0.1651  -4.3694   0.7463
X16day        0.4940    1.1666  0.4234  0.6720  -1.7925   2.7804
X17day       -0.4448    1.2234 -0.3636  0.7162  -2.8427   1.9531
X18day        1.4321    1.1465  1.2491  0.2116  -0.8150   3.6791
X19day       -0.6132    1.1990 -0.5114  0.6091  -2.9632   1.7369
X20day       -0.3130    1.4007 -0.2235  0.8232  -3.0585   2.4324
X21day        0.9587    1.2558  0.7634  0.4452  -1.5027   3.4201
X22day        1.1954    1.1238  1.0637  0.2875  -1.0072   3.3980
X23day       -1.5371    1.2303 -1.2494  0.2115  -3.9486   0.8743
X24day        1.8445    1.1038  1.6710  0.0947  -0.3190   4.0080
X25day        0.1292    1.5317  0.0844  0.9328  -2.8727   3.1312
X26day        0.3131    1.4280  0.2192  0.8265  -2.4858   3.1119
X27day        0.3365    1.2965  0.2596  0.7952  -2.2045   2.8776
X28day       -0.3918    1.8515 -0.2116  0.8324  -4.0207   3.2372
X29day        1.5941    1.0565  1.5088  0.1314  -0.4767   3.6648
X30day        1.9943    1.2117  1.6459  0.0998  -0.3806   4.3692
X31day        1.5214    1.1798  1.2896  0.1972  -0.7908   3.8337
intercept    -4.2502    0.5904 -7.1985  0.0000  -5.4074  -3.0930
=================================================================
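Each coefficient is a contribution to the log-odds of migrating; exponentiating gives odds ratios, which are easier to read (an illustrative addition, not in the original):

```python
# e.g. exp(1.9894) ≈ 7.3: a visit on January 1st multiplies the odds
# of migrating to a smartphone by roughly 7.
print(np.exp(result.params).head())
```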
xx = fp_dau_m.iloc[:,1:-1]
xx['intercept'] = 1.0
y_p = result.predict(xx)
ydf = fp_dau_m.copy()
ydf['prob'] = y_p
ydf['pred'] = ydf['prob'].apply(lambda x: 1 if x > 0.5 else 0)
ydf.head(15)
     user_id  X1day  X2day  X3day  X4day  X5day  X6day  X7day  X8day  X9day  ...  X25day  X26day  X27day  X28day  X29day  X30day  X31day  is_sp      prob  pred
0     471341      1      1      1      1      0      0      0      0      0  ...       0       0       0       0       0       0       0    1.0  0.620506     1
1     503874      1      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.094416     0
2    1073544      0      0      0      0      0      0      0      0      0  ...       1       0       0       0       0       0       0    0.0  0.000866     0
3    1073864      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.019167     0
4    1163733      1      1      0      0      0      0      0      0      0  ...       1       1       1       1       0       0       0    1.0  0.870576     1
5    1454629      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.077951     0
6    1557628      0      0      0      0      0      0      0      0      1  ...       0       0       0       0       0       0       1    0.0  0.039991     0
7    2241462      1      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    1.0  0.094416     0
8    2313236      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.082739     0
9    2477685      0      0      0      0      0      0      0      0      0  ...       1       0       0       0       0       0       0    0.0  0.015969     0
10   2541741      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.000560     0
11   2628661      0      0      0      0      0      1      0      0      0  ...       0       1       0       0       0       0       0    0.0  0.009902     0
12   3509436      0      1      0      1      1      1      0      1      1  ...       1       1       1       1       1       1       1    1.0  0.992456     1
13   3509436      0      1      0      1      1      1      0      1      1  ...       1       1       1       1       1       1       1    1.0  0.992456     1
14   3955950      1      1      1      1      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.620506     1
15 rows × 35 columns
ydf.groupby(['is_sp','pred'])['user_id'].count().reset_index()
   is_sp  pred  user_id
0    0.0     0      180
1    0.0     1       10
2    1.0     0       14
3    1.0     1       48
len(ydf[ydf['is_sp']==ydf['pred']])/len(ydf)
0.9047619047619048

### Inspecting the results

According to the sklearn predictions, 9 users were predicted as 1, i.e. expected to have migrated their accounts, but in fact had not. Judging from their past access patterns these users looked like migrators, yet they actually belong to the churned group.
df.head(10)
    user_id  X1day  X2day  X3day  X4day  X5day  X6day  X7day  X8day  X9day  ...  X25day  X26day  X27day  X28day  X29day  X30day  X31day  is_sp      prob  pred
0    471341      1      1      1      1      0      0      0      0      0  ...       0       0       0       0       0       0       0    1.0  0.543341     1
1    503874      1      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.094451     0
2   1073544      0      0      0      0      0      0      0      0      0  ...       1       0       0       0       0       0       0    0.0  0.002510     0
3   1073864      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.025567     0
4   1163733      1      1      0      0      0      0      0      0      0  ...       1       1       1       1       0       0       0    1.0  0.849838     1
5   1454629      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.073879     0
6   1557628      0      0      0      0      0      0      0      0      1  ...       0       0       0       0       0       0       1    0.0  0.051221     0
7   2241462      1      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    1.0  0.094451     0
8   2313236      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.085385     0
9   2477685      0      0      0      0      0      0      0      0      0  ...       1       0       0       0       0       0       0    0.0  0.017546     0
10 rows × 35 columns
df1 = df[(df['is_sp']==1) & (df['pred']==1)]
df1.sort_values(by='prob', ascending=True).head(15)
      user_id  X1day  X2day  X3day  X4day  X5day  X6day  X7day  X8day  X9day  ...  X25day  X26day  X27day  X28day  X29day  X30day  X31day  is_sp      prob  pred
228  52776438      1      1      1      1      1      1      1      1      1  ...       0       0       0       0       0       0       0    1.0  0.512293     1
171  32762652      1      1      1      1      1      1      1      1      1  ...       0       0       0       0       0       0       0    1.0  0.512293     1
155  27800629      1      1      1      1      0      0      0      0      0  ...       0       0       0       0       0       0       0    1.0  0.543341     1
0      471341      1      1      1      1      0      0      0      0      0  ...       0       0       0       0       0       0       0    1.0  0.543341     1
36    8645980      0      0      0      1      0      0      0      0      0  ...       0       0       0       0       1       0       0    1.0  0.551574     1
37    8645980      0      0      0      1      0      0      0      0      0  ...       0       0       0       0       1       0       0    1.0  0.551574     1
169  32500332      1      1      1      1      1      1      1      1      1  ...       1       0       0       0       0       0       0    1.0  0.587923     1
55   11600349      0      1      1      1      1      1      1      1      1  ...       0       0       0       1       1       1       1    1.0  0.684198     1
56   11600349      0      1      1      1      1      1      1      1      1  ...       0       0       0       1       1       1       1    1.0  0.684198     1
146  25787360      0      0      0      0      1      0      1      1      1  ...       0       0       1       0       0       0       0    1.0  0.696295     1
145  25787360      0      0      0      0      1      0      1      1      1  ...       0       0       1       0       0       0       0    1.0  0.696295     1
4     1163733      1      1      0      0      0      0      0      0      0  ...       1       1       1       1       0       0       0    1.0  0.849838     1
48   10406653      0      1      1      1      1      1      1      1      0  ...       1       0       1       1       1       1       1    1.0  0.865393     1
49   10406653      0      1      1      1      1      1      1      1      0  ...       1       0       1       1       1       1       1    1.0  0.865393     1
165  31066299      0      1      1      1      0      1      1      1      1  ...       1       1       1       0       1       1       0    1.0  0.951970     1
15 rows × 35 columns
df2 = df[(df['is_sp']==1) & (df['pred']==1)]
df2.sort_values(by='prob', ascending=False).head(15)
      user_id  X1day  X2day  X3day  X4day  X5day  X6day  X7day  X8day  X9day  ...  X25day  X26day  X27day  X28day  X29day  X30day  X31day  is_sp      prob  pred
136  24791702      1      1      0      1      0      1      1      1      1  ...       1       1       1       1       1       1       1    1.0  0.998618     1
137  24791702      1      1      0      1      0      1      1      1      1  ...       1       1       1       1       1       1       1    1.0  0.998618     1
44    9567562      1      1      1      1      1      1      1      1      1  ...       1       1       1       1       1       1       1    1.0  0.996302     1
43    9567562      1      1      1      1      1      1      1      1      1  ...       1       1       1       1       1       1       1    1.0  0.996302     1
139  24900784      1      1      1      1      1      1      1      1      1  ...       1       1       1       1       1       1       1    1.0  0.993923     1
124  23113079      1      1      1      1      1      1      1      1      1  ...       1       1       1       1       1       1       1    1.0  0.993923     1
133  24581383      1      1      1      1      1      1      1      1      1  ...       1       1       1       1       1       1       1    1.0  0.993923     1
134  24581383      1      1      1      1      1      1      1      1      1  ...       1       1       1       1       1       1       1    1.0  0.993923     1
138  24900784      1      1      1      1      1      1      1      1      1  ...       1       1       1       1       1       1       1    1.0  0.993923     1
123  23113079      1      1      1      1      1      1      1      1      1  ...       1       1       1       1       1       1       1    1.0  0.993923     1
114  21551429      1      1      1      1      1      1      1      1      1  ...       1       1       1       1       1       1       1    1.0  0.993923     1
147  27003770      1      1      1      1      1      1      1      1      1  ...       1       1       1       1       1       1       1    1.0  0.993923     1
148  27003770      1      1      1      1      1      1      1      1      1  ...       1       1       1       1       1       1       1    1.0  0.993923     1
150  27602710      1      1      1      1      1      1      1      1      1  ...       1       1       1       1       1       1       1    1.0  0.993923     1
151  27602710      1      1      1      1      1      1      1      1      1  ...       1       1       1       1       1       1       1    1.0  0.993923     1
15 rows × 35 columns
df3 = df[(df['is_sp']==0) & (df['pred']==1)]
df3.sort_values(by='prob', ascending=False).head(15)
      user_id  X1day  X2day  X3day  X4day  X5day  X6day  X7day  X8day  X9day  ...  X25day  X26day  X27day  X28day  X29day  X30day  X31day  is_sp      prob  pred
194  41590801      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       1       0       1    0.0  0.677458     1
108  19432099      1      1      1      1      0      1      1      1      1  ...       0       0       0       0       0       0       0    0.0  0.643061     1
203  43451947      1      1      1      1      1      0      1      1      1  ...       1       0       0       1       1       0       0    0.0  0.599921     1
197  42276142      1      1      1      1      1      1      0      1      1  ...       1       1       1       1       1       0       0    0.0  0.577420     1
209  46285446      0      0      0      0      1      1      1      1      1  ...       1       1       1       0       1       0       0    0.0  0.576873     1
14    3955950      1      1      1      1      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.543341     1
158  28391896      1      1      1      1      1      1      1      1      1  ...       0       0       0       0       0       0       0    0.0  0.512293     1
240  59561276      1      1      1      1      1      1      1      1      1  ...       0       0       0       0       0       0       0    0.0  0.512293     1
27    6147878      1      0      0      1      1      1      1      1      1  ...       1       1       0       0       0       0       0    0.0  0.502182     1
9 rows × 35 columns
df4 = df[(df['is_sp']==0) & (df['pred']==1)]
df4.sort_values(by='prob', ascending=True).head(15)
      user_id  X1day  X2day  X3day  X4day  X5day  X6day  X7day  X8day  X9day  ...  X25day  X26day  X27day  X28day  X29day  X30day  X31day  is_sp      prob  pred
27    6147878      1      0      0      1      1      1      1      1      1  ...       1       1       0       0       0       0       0    0.0  0.502182     1
158  28391896      1      1      1      1      1      1      1      1      1  ...       0       0       0       0       0       0       0    0.0  0.512293     1
240  59561276      1      1      1      1      1      1      1      1      1  ...       0       0       0       0       0       0       0    0.0  0.512293     1
14    3955950      1      1      1      1      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.543341     1
209  46285446      0      0      0      0      1      1      1      1      1  ...       1       1       1       0       1       0       0    0.0  0.576873     1
197  42276142      1      1      1      1      1      1      0      1      1  ...       1       1       1       1       1       0       0    0.0  0.577420     1
203  43451947      1      1      1      1      1      0      1      1      1  ...       1       0       0       1       1       0       0    0.0  0.599921     1
108  19432099      1      1      1      1      0      1      1      1      1  ...       0       0       0       0       0       0       0    0.0  0.643061     1
194  41590801      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       1       0       1    0.0  0.677458     1
9 rows × 35 columns
df5 = df[(df['is_sp']==0) & (df['pred']==0)]
df5.sort_values(by='prob', ascending=True).head(15)
      user_id  X1day  X2day  X3day  X4day  X5day  X6day  X7day  X8day  X9day  ...  X25day  X26day  X27day  X28day  X29day  X30day  X31day  is_sp      prob  pred
149  27249550      0      0      0      1      1      1      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.000946     0
10    2541741      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.001726     0
242  60725457      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.001726     0
101  18408297      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.001745     0
172  33766090      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.002257     0
2     1073544      0      0      0      0      0      0      0      0      0  ...       1       0       0       0       0       0       0    0.0  0.002510     0
227  52612953      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.003087     0
63   12582684      0      0      0      1      1      0      1      0      0  ...       0       0       0       0       0       0       0    0.0  0.004780     0
208  46056688      0      0      0      0      0      1      1      0      0  ...       0       0       0       0       0       0       0    0.0  0.004799     0
66   13157777      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.004969     0
190  40654033      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.004969     0
120  22437652      0      0      0      0      1      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.005689     0
87   16601600      0      0      0      0      1      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.005689     0
70   13967453      0      0      0      0      1      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.005689     0
112  20955934      0      0      0      0      1      0      0      0      0  ...       0       0       0       0       0       0       0    0.0  0.005689     0
15 rows × 35 columns
df6 = df[(df['is_sp']==1) & (df['pred']==0)]
df6.sort_values(by='prob', ascending=False).head(15)
      user_id  X1day  X2day  X3day  X4day  X5day  X6day  X7day  X8day  X9day  ...  X25day  X26day  X27day  X28day  X29day  X30day  X31day  is_sp      prob  pred
198  42438713      1      1      1      1      1      1      1      0      0  ...       0       0       0       0       0       0       0    1.0  0.484688     0
127  23689923      1      1      0      1      1      1      1      1      1  ...       0       0       0       0       0       0       0    1.0  0.359100     0
213  47332069      0      0      0      0      0      0      0      0      0  ...       1       1       0       0       0       0       0    1.0  0.281079     0
140  24914421      1      1      1      0      0      0      0      1      0  ...       0       1       0       0       0       0       0    1.0  0.278119     0
226  52131958      0      0      1      1      1      1      1      1      1  ...       1       1       1       0       0       0       0    1.0  0.259709     0
212  47266966      1      0      0      1      0      1      1      1      1  ...       0       0       0       0       0       0       0    1.0  0.232730     0
236  57869405      0      0      0      0      0      0      1      1      0  ...       0       0       0       0       0       0       0    1.0  0.212521     0
161  29698758      1      1      1      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    1.0  0.167370     0
30    7177251      1      1      1      1      1      1      0      0      0  ...       0       0       0       0       0       0       0    1.0  0.153046     0
7     2241462      1      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    1.0  0.094451     0
67   13401362      1      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    1.0  0.094451     0
80   15569351      0      0      0      0      0      0      1      0      1  ...       0       0       0       0       0       0       0    1.0  0.071546     0
93   17388480      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       1       0       0    1.0  0.070819     0
94   17388480      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       1       0       0    1.0  0.070819     0
163  30103279      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0    1.0  0.028795     0
15 rows × 35 columns
## The copy problem appears! `=` only binds another name to the same object in memory; when you need an independent DataFrame, use the `.copy()` method
fp_dau_m.head()
   user_id  X1day  X2day  X3day  X4day  X5day  X6day  X7day  X8day  X9day  ...  X23day  X24day  X25day  X26day  X27day  X28day  X29day  X30day  X31day  is_sp
0   471341      1      1      1      1      0      0      0      0      0  ...       0       0       0       0       0       0       0       0       0    1.0
1   503874      1      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0       0       0    0.0
2  1073544      0      0      0      0      0      0      0      0      0  ...       1       1       1       0       0       0       0       0       0    0.0
3  1073864      0      0      0      0      0      0      0      0      0  ...       0       0       0       0       0       0       0       0       0    0.0
4  1163733      1      1      0      0      0      0      0      0      0  ...       1       1       1       1       1       1       0       0       0    1.0
5 rows × 33 columns
df.equals(fp_dau_m)
False
df.equals(ydf)
False
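A minimal illustration of the point: plain assignment binds a second name to the same DataFrame, so mutations show through, while `.copy()` yields an independent object. That is why `df` and `ydf` above, built with `.copy()` and then given extra columns, no longer equal `fp_dau_m`:

```python
a = pd.DataFrame({'x': [1, 2, 3]})

alias = a            # same object: mutating `alias` also mutates `a`
alias['y'] = 0
print('y' in a.columns)   # True

b = a.copy()         # independent object: `a` is unaffected
b['z'] = 0
print('z' in a.columns)   # False
print(a.equals(b))        # False: b carries an extra column
```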