data.drop(['特征1','特征2'],axis=1,inplace=True)
axis=1 删除列
inplace=True 删除后的数据替换原来的
data.drop(['特征1','特征2'],axis=1,inplace=True)
axis=1 删除列
inplace=True 删除后的数据替换原来的
#导入所需要的库
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
# Load the dataset and take a first look at it.
data = pd.read_csv("./dataset/data.csv")
data.info()
data.head()

# Feature selection: drop the columns that are not used as model inputs.
data.drop(columns=["Name", "Ticket", "Cabin"], inplace=True)

# Missing values: fill the gaps in Age with the column mean, then discard
# every row that still has a missing value in any column.
data["Age"] = data["Age"].fillna(data["Age"].mean())
data = data.dropna(axis=0)

# Encode the categorical columns as integers.
# Embarked: each distinct value is replaced by its position in `labels`.
labels = data["Embarked"].unique().tolist()
data["Embarked"] = data["Embarked"].map(labels.index)
# Sex: 1 where the value is 'male', 0 otherwise.
data["Sex"] = data["Sex"].eq("male").astype("int")
# Features: every column except Survived.
x = data.iloc[:, data.columns != 'Survived']
# Target: the Survived column (kept as a one-column DataFrame).
y = data.iloc[:, data.columns == 'Survived']

# Hold out 30% of the rows for testing. No random_state is passed, so the
# split (and every score derived from it) differs from run to run.
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3)

# Renumber the index of each piece from 0. For a single frame this would be
# `xtrain.index = range(xtrain.shape[0])`; the loop applies it to all four.
for part in (xtrain, xtest, ytrain, ytest):
    part.index = range(part.shape[0])
# Fit a baseline decision tree on the training split and score it on the
# held-out test split.
clf = DecisionTreeClassifier(random_state=30)
clf.fit(xtrain, ytrain)
score = clf.score(xtest, ytest)
# Cross-validation: score a fresh tree with 10 folds over the full x / y and
# keep the mean of the per-fold scores.
from sklearn.model_selection import cross_val_score

clf = DecisionTreeClassifier(random_state=30)
fold_scores = cross_val_score(clf, x, y, cv=10)
score = fold_scores.mean()
score
# Search for the best max_depth by comparing, for each depth from 1 to 10,
# the accuracy on the training split with the mean 10-fold CV accuracy.
depths = range(1, 11)
tr = []  # training-split accuracy, one entry per depth
te = []  # mean cross-validation accuracy on the full x / y, one per depth
for depth in depths:
    clf = DecisionTreeClassifier(
        random_state=30,
        max_depth=depth,
        criterion='entropy',
    )
    clf = clf.fit(xtrain, ytrain)
    score_tr = clf.score(xtrain, ytrain)
    # cross_val_score refits clones of clf on each fold of the full data.
    score_te = cross_val_score(clf, x, y, cv=10).mean()
    tr.append(score_tr)
    te.append(score_te)

# Best cross-validated accuracy over all depths tried.
print(max(te))

# Plot both curves against depth; a widening gap between them indicates
# overfitting. The 'test' curve is the cross-validation score.
plt.plot(depths, tr, c='r', label='train')
plt.plot(depths, te, c='b', label='test')
plt.xticks(depths)
plt.legend()
plt.show()
# Grid search: a technique that tunes several parameters at once by
# exhaustively enumerating every combination of the candidate values.
import numpy as np
from sklearn.model_selection import GridSearchCV

# Candidate thresholds kept from the original notes (which also suggested
# np.linspace(0, 1, 50) for entropy); not referenced by the grid below.
gini_threholds = np.linspace(0, 0.5, 50)

# Parameter names mapped to the values the search should try. Every key
# must be a real DecisionTreeClassifier parameter, otherwise fit() raises
# "Invalid parameter" -- hence 'min_samples_leaf', not 'min_samples'.
parameters = {
    'criterion': ('gini', 'entropy'),
    'splitter': ('best', 'random'),
    'max_depth': [*range(1, 10)],
    'min_samples_leaf': [*range(1, 50, 5)],
    'min_impurity_decrease': [*np.linspace(0, 0.5, 20)],
}

clf = DecisionTreeClassifier(random_state=30)
# 10-fold cross-validation for every combination in the grid
# (2 * 2 * 9 * 10 * 20 = 7200 combinations, so this is slow).
GS = GridSearchCV(clf, parameters, cv=10)
GS.fit(xtrain, ytrain)

# Best parameter combination found, and its mean cross-validated score.
GS.best_params_
GS.best_score_
寻找最优参数的绘图:
网格搜索得到的准确率不一定比手动调参得到的准确率高。
决策树的优缺点: