1、导入所需的库
import pandas as pd
import numpy as np
from sklearn. tree import DecisionTreeClassifier
from sklearn. model_selection import train_test_split
from sklearn. model_selection import GridSearchCV
from sklearn. model_selection import cross_val_score
import matplotlib. pyplot as plt
2、加载数据,探索数据
# Load the Titanic dataset and take a first look at it.
data = pd.read_csv('./data.csv')
data.head()   # preview the first few rows
data.info()   # column dtypes and non-null counts (Age/Cabin/Embarked have nulls)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 891 non-null int64
1 Survived 891 non-null int64
2 Pclass 891 non-null int64
3 Name 891 non-null object
4 Sex 891 non-null object
5 Age 714 non-null float64
6 SibSp 891 non-null int64
7 Parch 891 non-null int64
8 Ticket 891 non-null object
9 Fare 891 non-null float64
10 Cabin 204 non-null object
11 Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
3、数据预处理
# ---- Data preprocessing --------------------------------------------------
# Drop identifier-like columns (PassengerId / Name / Ticket) and Cabin,
# which is mostly null (204/891 non-null per the info() output above).
data.drop(["PassengerId", "Cabin", "Name", "Ticket"], inplace=True, axis=1)

# Age is 714/891 non-null: impute the missing values with the column mean.
data["Age"] = data["Age"].fillna(data["Age"].mean())

# Embarked has only 2 missing rows (889/891): simply drop them.
data = data.dropna()

# Encode Sex as an integer flag: male -> 1, female -> 0.
data["Sex"] = (data["Sex"] == "male").astype("int")

# Encode Embarked as integer codes in order of first appearance.
# A dict built once + Series.map produces exactly the same codes as the
# original apply(lambda x: labels.index(x)) but avoids an O(len(labels))
# list scan on every row.
labels = data["Embarked"].unique().tolist()
data["Embarked"] = data["Embarked"].map({label: code for code, label in enumerate(labels)})
data.head()
Survived Pclass Sex Age SibSp Parch Fare Embarked 0 0 3 1 22.0 1 0 7.2500 0 1 1 1 0 38.0 1 0 71.2833 1 2 1 3 0 26.0 0 0 7.9250 0 3 1 1 0 35.0 1 0 53.1000 0 4 0 3 1 35.0 0 0 8.0500 0
4、提取标签和特征矩阵,分测试集和训练集
# ---- Extract label / feature matrix and split train/test -----------------
y = data["Survived"]
X = data.drop("Survived", axis=1)

# random_state pins the split so every run (and every score reported below)
# is reproducible — consistent with the random_state=20 used for all the
# DecisionTreeClassifier instances in this walkthrough.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, test_size=0.3, random_state=20)
Xtrain.head()

# The split keeps the original shuffled row labels; reset them to 0..n-1.
# reset_index(drop=True, inplace=True) replaces the manual
# `i.index = range(i.shape[0])` loop with the idiomatic pandas call.
for part in (Xtrain, Xtest, Ytrain, Ytest):
    part.reset_index(drop=True, inplace=True)
Xtrain.head()
5、导入模型,粗略跑一下查看结果
# Baseline: fit one tree on the training split, score on the held-out split.
clf = DecisionTreeClassifier(random_state=20)
clf.fit(Xtrain, Ytrain)  # fit() returns self, so reassignment is unnecessary
score = clf.score(Xtest, Ytest)
score
0.7752808988764045
# Same baseline evaluated with 10-fold cross-validation on the full data.
clf = DecisionTreeClassifier(random_state=20)
cv_scores = cross_val_score(clf, X, y, cv=10)
score = cv_scores.mean()
score
0.7784473953013279
6、在不同max_depth下观察模型的拟合状况
# Sweep max_depth = 1..10 and compare training accuracy against the mean
# 10-fold cross-validation accuracy to see where the tree starts to overfit.
tr = []
te = []
for depth in range(1, 11):
    clf = DecisionTreeClassifier(
        random_state=20,
        max_depth=depth,
        criterion="entropy",
    )
    clf.fit(Xtrain, Ytrain)
    tr.append(clf.score(Xtrain, Ytrain))
    # NOTE: the "test" curve here is the mean 10-fold CV score on the full
    # data (X, y), not the held-out test split.
    te.append(cross_val_score(clf, X, y, cv=10).mean())
print(max(te))
plt.plot(range(1, 11), tr, color="red", label="train")
plt.plot(range(1, 11), te, color="blue", label="test")
plt.xticks(range(1, 11))
plt.legend()
plt.show()
0.8200331971399386
7、用网格搜索调整参数
# ---- Tune hyper-parameters with an exhaustive grid search ----------------
# Grid size: 2 * 2 * 9 * 10 * 20 = 7,200 candidates x 10 CV folds = 72,000
# fits. n_jobs=-1 runs the fits on all CPU cores — identical results,
# dramatically less wall-clock time.
parameters = {
    'criterion': ('gini', 'entropy'),
    'splitter': ('best', 'random'),
    'max_depth': [*range(1, 10)],
    'min_samples_leaf': [*range(1, 50, 5)],
    'min_impurity_decrease': [*np.linspace(0, 0.5, 20)],
}
clf = DecisionTreeClassifier(random_state=20)
GS = GridSearchCV(clf, parameters, cv=10, n_jobs=-1)
GS.fit(Xtrain, Ytrain)
GS.best_params_   # best parameter combination found on the training split
{'criterion': 'gini',
'max_depth': 7,
'min_impurity_decrease': 0.0,
'min_samples_leaf': 6,
'splitter': 'best'}
# Mean cross-validated accuracy of the best parameter combination.
GS.best_score_
0.8167434715821813
本文参考自该 up 主的教程：https://space.bilibili.com/2932207