逻辑回归建模案例
% matplotlib inline
import numpy as np
import pandas as pd
import matplotlib. pyplot as plt
pd. set_option( 'display.max_columns' , None )
字段名与中文含义:
- ID: 客户编号
- Suc_flag: 成功入网标识
- ARPU: 入网后ARPU
- PromCnt12: 12个月内的营销次数
- PromCnt36: 36个月内的营销次数
- PromCntMsg12: 12个月内发短信的次数
- PromCntMsg36: 36个月内发短信的次数
- Class: 客户重要性等级(根据前运营商消费情况)
- Age: 年龄
- Gender: 性别
- HomeOwner: 是否拥有住房
- AvgARPU: 当地平均ARPU
- AvgHomeValue: 当地房屋均价
- AvgIncome: 当地人均收入
# Load the telecom marketing-campaign dataset; skipinitialspace strips any
# whitespace that follows the delimiter in the raw CSV.
telecom = pd.read_csv('teleco_camp.csv', skipinitialspace=True)
telecom.head()
ID Suc_flag ARPU PromCnt12 PromCnt36 PromCntMsg12 PromCntMsg36 Class Age Gender HomeOwner AvgARPU AvgHomeValue AvgIncome 0 12 1 50.0 5.65 9.50 1.6 3.0 4 79.0 M H 49.894904 33400 39460 1 53 0 NaN 4.50 9.00 1.4 3.6 3 71.0 M H 48.574742 37600 33545 2 67 1 25.0 6.40 11.00 2.0 3.6 1 79.0 F H 49.272646 100400 42091 3 71 1 80.0 7.15 10.25 2.4 3.6 1 63.0 F H 47.334953 39900 39313 4 142 1 15.0 5.90 10.50 2.0 3.8 1 NaN F U 47.827404 47500 0
telecom. describe( include= 'all' )
ID Suc_flag ARPU PromCnt12 PromCnt36 PromCntMsg12 PromCntMsg36 Class Age Gender HomeOwner AvgARPU AvgHomeValue AvgIncome count 9686.000000 9686.000000 4843.000000 9686.000000 9686.000000 9686.000000 9686.000000 9686.000000 7279.000000 9686 9686 9686.000000 9686.000000 9686.000000 unique NaN NaN NaN NaN NaN NaN NaN NaN NaN 3 2 NaN NaN NaN top NaN NaN NaN NaN NaN NaN NaN NaN NaN F H NaN NaN NaN freq NaN NaN NaN NaN NaN NaN NaN NaN NaN 5223 5377 NaN NaN NaN mean 97975.474086 0.500000 78.121722 3.447212 7.337059 1.178402 2.390935 2.424530 59.150845 NaN NaN 52.905156 110986.299814 40491.444249 std 56550.171120 0.500026 62.225686 1.231890 1.952436 0.287226 0.914314 1.049047 16.516400 NaN NaN 4.993775 98670.855450 28707.494146 min 12.000000 0.000000 5.000000 0.750000 1.000000 0.200000 0.400000 1.000000 0.000000 NaN NaN 46.138968 0.000000 0.000000 25% 48835.500000 0.000000 50.000000 2.900000 6.250000 1.000000 1.400000 2.000000 47.000000 NaN NaN 49.760116 52300.000000 24464.000000 50% 99106.000000 0.500000 65.000000 3.250000 7.750000 1.200000 2.600000 2.000000 60.000000 NaN NaN 50.876672 76900.000000 43100.000000 75% 148538.750000 1.000000 100.000000 3.650000 8.250000 1.400000 3.200000 3.000000 73.000000 NaN NaN 54.452822 128175.000000 56876.000000 max 191779.000000 1.000000 1000.000000 15.150000 19.500000 3.600000 5.600000 4.000000 87.000000 NaN NaN 99.444787 600000.000000 200001.000000
数据清洗
# --- Data cleaning: encode categoricals and flag bogus zeros as missing ---
from sklearn.preprocessing import LabelEncoder

# Integer-encode Gender (the describe() output shows 3 distinct values).
le = LabelEncoder()
telecom['Gender'] = le.fit_transform(telecom['Gender'])

# Map home-ownership flags to 0/1. Assign the result back instead of using
# inplace=True on a selected column: that form is a chained assignment,
# which is unreliable and deprecated in recent pandas.
telecom['HomeOwner'] = telecom['HomeOwner'].replace({'H': 0, 'U': 1})

# In these columns 0 denotes "unknown" rather than a real measurement,
# so convert it to NaN for imputation. np.nan replaces the np.NaN alias,
# which was removed in NumPy 2.0.
for col in ['AvgIncome', 'Age', 'AvgHomeValue']:
    telecom[col] = telecom[col].replace({0: np.nan})
# Mean-impute the remaining missing values. sklearn.preprocessing.Imputer
# was removed in scikit-learn 0.22; SimpleImputer is its replacement and
# takes np.nan (not the string 'NaN') as the missing-value marker. It
# always works column-wise, matching the old axis=0 behaviour.
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
telecom1 = pd.DataFrame(imputer.fit_transform(telecom), columns=telecom.columns)
def blk(floor, root):
    """Build a winsorizing function that clamps values into [floor, root].

    Values below ``floor`` are raised to ``floor``; values above ``root``
    are lowered to ``root``; everything in between passes through unchanged.
    """
    def clamp(value):
        if value < floor:
            return floor
        if value > root:
            return root
        return value
    return clamp
# Winsorize Age at its 1st/99th percentiles to tame outliers.
age_low, age_high = telecom1['Age'].quantile([0.01, 0.99])
telecom1['Age'] = telecom1['Age'].map(blk(floor=age_low, root=age_high))
telecom1.describe()
ID Suc_flag ARPU PromCnt12 PromCnt36 PromCntMsg12 PromCntMsg36 Class Age Gender HomeOwner AvgARPU AvgHomeValue AvgIncome count 9686.000000 9686.000000 9686.000000 9686.000000 9686.000000 9686.000000 9686.000000 9686.000000 9686.000000 9686.000000 9686.000000 9686.000000 9686.000000 9686.000000 mean 97975.474086 0.500000 78.121722 3.447212 7.337059 1.178402 2.390935 2.424530 59.230106 0.516312 0.444869 52.905156 112179.202755 53513.457361 std 56550.171120 0.500026 43.997933 1.231890 1.952436 0.287226 0.914314 1.049047 14.046835 0.600716 0.496977 4.993775 97997.592632 17227.468161 min 12.000000 0.000000 5.000000 0.750000 1.000000 0.200000 0.400000 1.000000 21.000000 0.000000 0.000000 46.138968 7500.000000 2499.000000 25% 48835.500000 0.000000 65.000000 2.900000 6.250000 1.000000 1.400000 2.000000 51.000000 0.000000 0.000000 49.760116 53500.000000 42775.000000 50% 99106.000000 0.500000 78.121722 3.250000 7.750000 1.200000 2.600000 2.000000 59.158972 0.000000 0.000000 50.876672 78450.000000 53513.457361 75% 148538.750000 1.000000 78.121722 3.650000 8.250000 1.400000 3.200000 3.000000 69.000000 1.000000 1.000000 54.452822 128175.000000 56876.000000 max 191779.000000 1.000000 1000.000000 15.150000 19.500000 3.600000 5.600000 4.000000 86.000000 2.000000 1.000000 99.444787 600000.000000 200001.000000
变量筛选
使用模型进行筛选
# --- Feature screening: rank predictors with a random forest ---
from sklearn import ensemble

X = telecom1.loc[:, 'PromCnt12':]   # every candidate predictor column
y = telecom1['Suc_flag']            # target: successful sign-up flag

crf = ensemble.RandomForestClassifier()
crf.fit(X=X, y=y)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
oob_score=False, random_state=None, verbose=0,
warm_start=False)
pd. Series( X. columns, index= crf. feature_importances_) . sort_index( ascending= False )
0.356981 PromCnt12
0.304656 PromCntMsg12
0.073231 PromCnt36
0.069373 PromCntMsg36
0.052378 AvgARPU
0.042860 AvgHomeValue
0.037396 AvgIncome
0.031021 Age
0.015768 Class
0.009526 Gender
0.006809 HomeOwner
dtype: object
def IV_between(y, x):
    """Information Value (IV) of a candidate predictor for a binary target.

    Parameters
    ----------
    y : pandas.Series of 0/1 labels (1 = event / "bad").
    x : grouping values aligned with ``y`` — raw categories or binned values.

    Returns
    -------
    float
        Sum over groups of ``(p1 - p0) * WOE`` where ``p1``/``p0`` are each
        group's share of events/non-events. A 1e-5 epsilon keeps the log
        finite when a group has zero events or zero non-events.
    """
    grouped = y.groupby(x)       # single pass instead of two separate groupbys
    all_i = grouped.count()
    bad_i = grouped.sum()
    good_i = all_i - bad_i
    p1 = bad_i / bad_i.sum()
    p0 = good_i / good_i.sum()
    woe = np.log((p1 + 1e-5) / (p0 + 1e-5))
    return ((p1 - p0) * woe).sum()
# Compute IV for every candidate predictor. Continuous variables (more than
# 10 distinct values) are quantile-binned into 5 groups first; pd.qcut raises
# ValueError when the bin edges are not unique, in which case we fall back to
# equal-width bins. Fixes vs. the original: Series.append was removed in
# pandas 2.0 (use pd.concat), np.object was removed in NumPy 1.24 (use the
# builtin object), and the bare except now catches only ValueError.
iv_parts = []
for col in X.columns:
    if len(X[col].unique()) > 10 and X[col].dtype != object:
        try:
            binned = pd.qcut(X[col], 5)
        except ValueError:
            binned = pd.cut(X[col], 5)
        iv_parts.append(pd.Series([col], index=[IV_between(y, binned)]))
    else:
        iv_parts.append(pd.Series([col], index=[IV_between(y, X[col])]))
IV = pd.concat(iv_parts)
sorted_IV = IV.sort_index(ascending=False)
sorted_IV
0.479255 PromCnt12
0.326262 PromCntMsg12
0.040798 Class
0.032807 AvgARPU
0.031443 PromCntMsg36
0.016009 PromCnt36
0.014777 AvgHomeValue
0.013139 Age
0.004153 AvgIncome
0.000263 HomeOwner
0.000030 Gender
dtype: object
变量筛选有多种方法,本例选取IV值最大的前8个变量作为示例
# Keep the target plus the eight predictors with the highest IV
# (sorted_IV stores column names as values, IV scores as the index).
selected_features = ['Suc_flag', *sorted_IV.iloc[:8]]
telecom2 = telecom1[selected_features]
telecom2.head()
Suc_flag PromCnt12 PromCntMsg12 Class AvgARPU PromCntMsg36 PromCnt36 AvgHomeValue Age 0 1.0 5.65 1.6 4.0 49.894904 3.0 9.50 33400.0 79.000000 1 0.0 4.50 1.4 3.0 48.574742 3.6 9.00 37600.0 71.000000 2 1.0 6.40 2.0 1.0 49.272646 3.6 11.00 100400.0 79.000000 3 1.0 7.15 2.4 1.0 47.334953 3.6 10.25 39900.0 63.000000 4 1.0 5.90 2.0 1.0 47.827404 3.8 10.50 47500.0 59.158972
划分训练集和测试集
# One-hot encode the Class level (columns named 1-4) and drop the raw column.
# NOTE(review): all four dummy levels are kept; for an intercept-bearing
# logistic model, drop_first=True would avoid the dummy-variable trap —
# confirm before changing, as it alters the fitted coefficients.
class_dummies = pd.get_dummies(telecom['Class'])
telecom3 = telecom2.join(class_dummies).drop('Class', axis=1)
telecom3.head()
Suc_flag PromCnt12 PromCntMsg12 AvgARPU PromCntMsg36 PromCnt36 AvgHomeValue Age 1 2 3 4 0 1.0 5.65 1.6 49.894904 3.0 9.50 33400.0 79.000000 0 0 0 1 1 0.0 4.50 1.4 48.574742 3.6 9.00 37600.0 71.000000 0 0 1 0 2 1.0 6.40 2.0 49.272646 3.6 11.00 100400.0 79.000000 1 0 0 0 3 1.0 7.15 2.4 47.334953 3.6 10.25 39900.0 63.000000 1 0 0 0 4 1.0 5.90 2.0 47.827404 3.8 10.50 47500.0 59.158972 1 0 0 0
# --- Train/test split ---
from sklearn.model_selection import train_test_split

data = telecom3.iloc[:, 1:]          # predictors: every column after the flag
target = telecom3['Suc_flag']        # binary target

# 60/40 split. train_size is implied by test_size, so passing both was
# redundant; random_state fixes the shuffle for reproducibility.
train_data, test_data, train_target, test_target = train_test_split(
    data, target, test_size=0.4, random_state=123)
标准化
# Scale features to [0, 1]. Fit on the training fold only so the test fold
# cannot leak into the scaling parameters; fit_transform replaces the
# separate fit + transform calls on the training data.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled_train_data = scaler.fit_transform(train_data)
scaled_test_data = scaler.transform(test_data)
建立逻辑回归模型
# Fit a logistic regression (default L2 penalty) on the scaled training data.
from sklearn.linear_model import LogisticRegression

logistic_model = LogisticRegression()
logistic_model.fit(scaled_train_data, train_target)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
verbose=0, warm_start=False)
预测
预测分类标签
# Hard 0/1 class predictions for both folds.
test_predict = logistic_model.predict(scaled_test_data)
train_predict = logistic_model.predict(scaled_train_data)
预测概率
# Positive-class probabilities (second column of predict_proba) per fold.
test_proba = logistic_model.predict_proba(scaled_test_data)[:, 1]
train_proba = logistic_model.predict_proba(scaled_train_data)[:, 1]
评估
logistic_model. score( scaled_test_data, test_target)
0.7685161290322581
from sklearn import metrics

# Confusion matrix (rows = true 0/1, columns = predicted 0/1) and the
# precision/recall/F1 summary for the test fold.
cm = metrics.confusion_matrix(test_target, test_predict, labels=[0, 1])
print(cm)
print(metrics.classification_report(test_target, test_predict))
[[1510 395]
[ 502 1468]]
precision recall f1-score support
0.0 0.75 0.79 0.77 1905
1.0 0.79 0.75 0.77 1970
avg / total 0.77 0.77 0.77 3875
# ROC curves for the test (blue) and train (red) folds; curves that lie
# close together suggest the model is not overfitting. The original plot
# had no legend, axis labels, or chance diagonal, making the two unlabeled
# lines ambiguous.
fpr_test, tpr_test, th_test = metrics.roc_curve(test_target, test_proba)
fpr_train, tpr_train, th_train = metrics.roc_curve(train_target, train_proba)

plt.figure(figsize=[6, 6])
plt.plot(fpr_test, tpr_test, 'b-', label='test')
plt.plot(fpr_train, tpr_train, 'r-', label='train')
plt.plot([0, 1], [0, 1], 'k--', linewidth=0.5)  # chance diagonal
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='lower right')

print('AUC = %6.4f' % metrics.auc(fpr_test, tpr_test))
AUC = 0.8304
![ROC curve](output_43_1.png)
交叉验证优化
# Tune the inverse-regularization strength C with 4-fold cross-validation
# over a grid of 10 candidate values; scores_ holds per-fold accuracies.
from sklearn.linear_model import LogisticRegressionCV

lrcv = LogisticRegressionCV(Cs=10, cv=4)
lrcv.fit(scaled_train_data, train_target)
lrcv.scores_
{1.0: array([[0.50550206, 0.54470426, 0.59009629, 0.6781293 , 0.76753783,
0.78129298, 0.78404402, 0.78404402, 0.78404402, 0.78404402],
[0.50584997, 0.55333792, 0.60289057, 0.68891948, 0.75430145,
0.76944253, 0.76324845, 0.76324845, 0.76256022, 0.76256022],
[0.50550964, 0.54063361, 0.58333333, 0.64049587, 0.74104683,
0.75550964, 0.76033058, 0.76033058, 0.76033058, 0.76033058],
[0.50550964, 0.54752066, 0.58402204, 0.70661157, 0.75757576,
0.76584022, 0.76033058, 0.76101928, 0.76101928, 0.76101928]])}
lrcv. C_
array([2.7825594])
# Test-fold AUC for the cross-validated model.
test_cv_proba = lrcv.predict_proba(scaled_test_data)[:, 1]
fpr_test, tpr_test, th_test = metrics.roc_curve(test_target, test_cv_proba)
print('AUC = %6.4f' % metrics.auc(fpr_test, tpr_test))
AUC = 0.8311