目录
学习视频
读取数据并查看基本信息
数据来源:https://github.com/ibm/telco-customer-churn-on-icp4d
已上传至资源
import pandas as pd  # data loading and manipulation
# Load the Telco customer-churn CSV and preview the first five rows.
df = pd.read_csv('Telco-Customer-Churn.csv')
df.head()
customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7590-VHVEG | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
1 | 5575-GNVDE | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | ... | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
2 | 3668-QPYBK | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | ... | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
3 | 7795-CFOCW | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes | ... | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
4 | 9237-HQITU | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
5 rows × 21 columns
# (rows, columns) of the raw table.
df.shape
(7043, 21)
# Per-column dtypes; TotalCharges is loaded as object rather than a numeric type.
df.dtypes
customerID object
gender object
SeniorCitizen int64
Partner object
Dependents object
tenure int64
PhoneService object
MultipleLines object
InternetService object
OnlineSecurity object
OnlineBackup object
DeviceProtection object
TechSupport object
StreamingTV object
StreamingMovies object
Contract object
PaperlessBilling object
PaymentMethod object
MonthlyCharges float64
TotalCharges object
Churn object
dtype: object
# Remove the customerID column in place before any features are built.
df.drop(columns='customerID', inplace=True)
缺失值处理
# TotalCharges cannot be converted to a numeric dtype directly:
#     pd.to_numeric(df['TotalCharges'])
# fails with: Unable to parse string " " at position 488
# The column therefore contains single-space strings where a value is missing.
# Select those rows with a boolean mask and count them.
miss_data = df[df['TotalCharges'].eq(' ')]
len(miss_data)
11
# Substitute 0 for the blank (' ') entries, then convert the whole column to a
# numeric dtype in one step.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'].replace(' ', 0))
df.dtypes
gender object
SeniorCitizen int64
Partner object
Dependents object
tenure int64
PhoneService object
MultipleLines object
InternetService object
OnlineSecurity object
OnlineBackup object
DeviceProtection object
TechSupport object
StreamingTV object
StreamingMovies object
Contract object
PaperlessBilling object
PaymentMethod object
MonthlyCharges float64
TotalCharges float64
Churn object
dtype: object
将标签转换成数值型
# Encode the Churn label as 0 ('No') / 1 ('Yes'). Any other value is passed
# through untouched, so pd.to_numeric sees exactly what it saw before.
df['Churn'] = pd.to_numeric(
    df['Churn'].map(lambda label: {'No': 0, 'Yes': 1}.get(label, label))
)
df.dtypes
gender object
SeniorCitizen int64
Partner object
Dependents object
tenure int64
PhoneService object
MultipleLines object
InternetService object
OnlineSecurity object
OnlineBackup object
DeviceProtection object
TechSupport object
StreamingTV object
StreamingMovies object
Contract object
PaperlessBilling object
PaymentMethod object
MonthlyCharges float64
TotalCharges float64
Churn int64
dtype: object
将数据和标签分开
# Features: every column except the Churn label.
X = df.drop(columns='Churn').copy()
# Target: the Churn column on its own.
y = df['Churn'].copy()
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
X.shape
y.shape
(7043, 19)
(7043,)
用one-hot编码将object类型转换成int
检查一下xgboost支持的数据类型
# Columns holding categorical (object) values that need one-hot encoding.
categorical_cols = [
    'gender', 'Partner', 'Dependents',
    'PhoneService', 'MultipleLines', 'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies',
    'Contract', 'PaperlessBilling', 'PaymentMethod',
]
# Expand each listed column into one indicator column per category; the
# remaining (numeric) columns are carried over as they are.
X_encoded = pd.get_dummies(X, columns=categorical_cols)
X_encoded.shape
X_encoded.dtypes
X_encoded.head()
(7043, 45)
SeniorCitizen int64
tenure int64
MonthlyCharges float64
TotalCharges float64
gender_Female uint8
gender_Male uint8
Partner_No uint8
Partner_Yes uint8
Dependents_No uint8
Dependents_Yes uint8
PhoneService_No uint8
PhoneService_Yes uint8
MultipleLines_No uint8
MultipleLines_No phone service uint8
MultipleLines_Yes uint8
InternetService_DSL uint8
InternetService_Fiber optic uint8
InternetService_No uint8
OnlineSecurity_No uint8
OnlineSecurity_No internet service uint8
OnlineSecurity_Yes uint8
OnlineBackup_No uint8
OnlineBackup_No internet service uint8
OnlineBackup_Yes uint8
DeviceProtection_No uint8
DeviceProtection_No internet service uint8
DeviceProtection_Yes uint8
TechSupport_No uint8
TechSupport_No internet service uint8
TechSupport_Yes uint8
StreamingTV_No uint8
StreamingTV_No internet service uint8
StreamingTV_Yes uint8
StreamingMovies_No uint8
StreamingMovies_No internet service uint8
StreamingMovies_Yes uint8
Contract_Month-to-month uint8
Contract_One year uint8
Contract_Two year uint8
PaperlessBilling_No uint8
PaperlessBilling_Yes uint8
PaymentMethod_Bank transfer (automatic) uint8
PaymentMethod_Credit card (automatic) uint8
PaymentMethod_Electronic check uint8
PaymentMethod_Mailed check uint8
dtype: object
SeniorCitizen | tenure | MonthlyCharges | TotalCharges | gender_Female | gender_Male | Partner_No | Partner_Yes | Dependents_No | Dependents_Yes | ... | StreamingMovies_Yes | Contract_Month-to-month | Contract_One year | Contract_Two year | PaperlessBilling_No | PaperlessBilling_Yes | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 29.85 | 29.85 | 1 | 0 | 0 | 1 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
1 | 0 | 34 | 56.95 | 1889.50 | 0 | 1 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
2 | 0 | 2 | 53.85 | 108.15 | 0 | 1 | 1 | 0 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
3 | 0 | 45 | 42.30 | 1840.75 | 0 | 1 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
4 | 0 | 2 | 70.70 | 151.65 | 1 | 0 | 1 | 0 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
5 rows × 45 columns
划分训练集和测试集
# Fraction of positive (Churn == 1) samples in the full label vector.
y.sum() / y.size
0.2653698707936959
from sklearn.model_selection import train_test_split
# Stratify on y so the training and test sets keep the same class imbalance.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    stratify=y,
    random_state=40,
)
# Check that both splits ended up with (nearly) the same positive rate.
y_train.sum() / y_train.size
y_test.sum() / y_test.size
0.2654297614539947
0.26519023282226006
用XGBoost 训练
import xgboost as xgb
# Baseline classifier: default hyper-parameters with a fixed seed.
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, seed=40)
# eval_metric is passed explicitly. XGBoost otherwise warns that the default
# metric for 'binary:logistic' changed to 'logloss'; naming that same metric
# keeps the training behaviour and removes the warning.
# NOTE(review): the test split doubles as the early-stopping evaluation set, so
# scores measured on X_test afterwards are optimistic. Consider carving a
# separate validation split out of the training data.
xgb_clf.fit(X_train,
            y_train,
            verbose=1,
            eval_metric='logloss',
            early_stopping_rounds=10,  # stop after 10 rounds without improvement
            eval_set=[(X_test, y_test)])
[11:50:18] WARNING: ..\src\learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[0] validation_0-logloss:0.58053
[1] validation_0-logloss:0.51903
[2] validation_0-logloss:0.48186
[3] validation_0-logloss:0.45798
[4] validation_0-logloss:0.44383
[5] validation_0-logloss:0.43397
[6] validation_0-logloss:0.42840
[7] validation_0-logloss:0.42540
[8] validation_0-logloss:0.42333
[9] validation_0-logloss:0.42243
[10] validation_0-logloss:0.42135
[11] validation_0-logloss:0.42205
[12] validation_0-logloss:0.42202
[13] validation_0-logloss:0.42304
[14] validation_0-logloss:0.42223
[15] validation_0-logloss:0.42301
[16] validation_0-logloss:0.42425
[17] validation_0-logloss:0.42480
[18] validation_0-logloss:0.42576
[19] validation_0-logloss:0.42654
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
importance_type='gain', interaction_constraints='',
learning_rate=0.300000012, max_delta_step=0, max_depth=6,
min_child_weight=1, missing=nan, monotone_constraints='()',
n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=40,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=40,
subsample=1, tree_method='exact', use_label_encoder=False,
validate_parameters=1, verbosity=None)
画混淆矩阵
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
# Plot the confusion matrix of the baseline model on the test split.
# NOTE(review): plot_confusion_matrix is deprecated/removed in newer
# scikit-learn releases, where ConfusionMatrixDisplay.from_estimator replaces
# it. Confirm against the installed version before upgrading.
plot_confusion_matrix(xgb_clf, X_test, y_test)
D:\Anaconda\lib\site-packages\xgboost\data.py:112: UserWarning: Use subset (sliced data) of np.ndarray is not recommended because it will generate extra copies and increase memory consumption
warnings.warn(
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0xb1ef790>
交叉验证 和 网格搜索 优化XGBoost
from sklearn.model_selection import GridSearchCV
# Hyper-parameter grid explored by the search.
param_grid = {'max_depth': [4],
              'learning_rate': [0.1, 0.5, 1],
              'gamma': [0.25],
              'reg_lambda': [10.0, 20, 100],
              # Must be spelled 'scale_pos_weight': a misspelled key is not a
              # recognised XGBoost parameter, so the class weighting would
              # silently never be applied.
              'scale_pos_weight': [3]}
optimal_param = GridSearchCV(
    estimator=xgb.XGBClassifier(use_label_encoder=False,
                                verbosity=0,  # XGBoost's silence switch is `verbosity`, not `verbose`
                                seed=40,
                                # Use 90% of the rows and 50% of the features
                                # per tree to speed up the search.
                                subsample=0.9,
                                colsample_bytree=0.5),
    param_grid=param_grid,
    cv=3,
)
# Run the 3-fold search over the grid; X_test is only monitored with AUC.
optimal_param.fit(X_train,
                  y_train,
                  verbose=0,
                  eval_metric='auc',
                  eval_set=[(X_test, y_test)])
print(optimal_param.best_params_)
GridSearchCV(cv=3,
estimator=XGBClassifier(base_score=None, booster=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=0.5, gamma=None,
gpu_id=None, importance_type='gain',
interaction_constraints=None,
learning_rate=None, max_delta_step=None,
max_depth=None, min_child_weight=None,
missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None,
num_parallel_tree=None, random_state=None,
reg_alpha=None, reg_lambda=None,
scale_pos_weight=None, seed=40,
subsample=0.9, tree_method=None,
use_label_encoder=False,
validate_parameters=None, verbose=0,
verbosity=None),
param_grid={'gamma': [0.25], 'learning_rate': [0.1, 0.5, 1],
'max_depth': [4], 'reg_lambda': [10.0, 20, 100],
'scale_pos_weigth': [3]})
{'gamma': 0.25, 'learning_rate': 0.1, 'max_depth': 4, 'reg_lambda': 100, 'scale_pos_weigth': 3}
用最优参数重新构建XGBoost
# Rebuild the classifier with the best hyper-parameters found by the search.
xgb_clf_optimal = xgb.XGBClassifier(use_label_encoder=False,
                                    gamma=0.25,
                                    learning_rate=0.1,
                                    max_depth=4,
                                    reg_lambda=100,
                                    # Correct spelling is required: with a
                                    # misspelled name the model keeps the
                                    # default scale_pos_weight=1 and the
                                    # positive class is not up-weighted.
                                    scale_pos_weight=3,
                                    seed=40,
                                    subsample=0.9,
                                    colsample_bytree=0.5,
                                    verbosity=0)  # `verbosity`, not `verbose`, is the XGBoost parameter
# Fit on the training split, monitoring area under the PR curve on X_test.
xgb_clf_optimal.fit(X_train,
                    y_train,
                    verbose=0,
                    eval_metric='aucpr',
                    eval_set=[(X_test, y_test)])
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=0.5, gamma=0.25, gpu_id=-1,
importance_type='gain', interaction_constraints='',
learning_rate=0.1, max_delta_step=0, max_depth=4,
min_child_weight=1, missing=nan, monotone_constraints='()',
n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=40,
reg_alpha=0, reg_lambda=100, scale_pos_weight=1,
scale_pos_weigth=3, seed=40, subsample=0.9, tree_method='exact',
use_label_encoder=False, validate_parameters=1, verbose=0,
verbosity=None)
# Confusion matrix of the tuned model on the test split.
plot_confusion_matrix(xgb_clf_optimal, X_test, y_test)
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0xc08eaf0>
# Optional (disabled): print the feature scores for every importance type.
#bst = xgb_clf_optimal.get_booster()
#for importance_type in ('weight', 'gain', 'cover', 'total_gain', 'total_cover'):
# print('%s:' % importance_type, bst.get_score(importance_type=importance_type))
# Render the first tree (index 0) of the tuned model.
xgb.to_graphviz(xgb_clf_optimal, num_trees=0, size="10,10")