Still following: https://www.kesci.com/home/project/5b02bf4b57e80961e67c3b06
First reproduce the original article; changes can come later.
Prediction Models
Algorithms used:
1)Logistic Regression
2)Support Vector Machines (linear and radial)
3)Random Forest
4)K-Nearest Neighbours
5)Naive Bayes
6)Decision Tree
Step 1: import the libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn import metrics
from sklearn.metrics import confusion_matrix
Split the dataset
target = 'Legendary' # the prediction target
X_columns = [x for x in data.columns if x not in [target]] # everything except the target is a feature
X = data[X_columns]
Y = data[target]
X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=0.4)
print("X_train's shape is {}:".format(X_train.shape))
print("y_train's shape is {}:".format(y_train.shape))
print("X_test's shape is {}:".format(X_test.shape))
print("y_test's shape is {}:".format(y_test.shape))
Out:
X_train's shape is (478, 8):
y_train's shape is (478,):
X_test's shape is (320, 8):
y_test's shape is (320,):
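Side note: Legendary is a heavily imbalanced target (only a small fraction of Pokémon are legendary), and the split above is unstratified, so the class ratio can drift between train and test. A minimal sketch of a stratified, reproducible split (stratify and random_state are my additions, not in the original notebook):

from sklearn.model_selection import train_test_split

# Stratified split keeps the Legendary ratio identical in train and test;
# random_state (arbitrary value) makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.4, stratify=Y, random_state=42)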
rbf SVM
model = svm.SVC(kernel='rbf',C=1,gamma=0.1)
model.fit(X_train,y_train)
y_svmpred = model.predict(X_test)
print('Accuracy for rbf SVM is {:.2%}'.format(metrics.accuracy_score(y_svmpred,y_test)))
Out:
Accuracy for rbf SVM is 93.44%
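SVMs (and KNN) compare samples by distance, so stat columns with larger ranges dominate the RBF kernel unless the features are scaled. A hedged sketch of the same model behind a StandardScaler (the Pipeline is my addition; the original runs on raw features):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize every column to zero mean / unit variance before the RBF
# kernel sees it, so no single stat dominates the distance computation.
scaled_svm = make_pipeline(StandardScaler(), svm.SVC(kernel='rbf', C=1, gamma=0.1))
scaled_svm.fit(X_train, y_train)
print('Accuracy for scaled rbf SVM is {:.2%}'.format(scaled_svm.score(X_test, y_test)))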
linear SVM
model = svm.SVC(kernel='linear',C=0.1,gamma=0.1)
model.fit(X_train,y_train)
y_lsvmpred = model.predict(X_test)
print('Accuracy for linear SVM is {:.2%}'.format(metrics.accuracy_score(y_lsvmpred,y_test)))
Out:
Accuracy for linear SVM is 94.38%
LogisticRegression
model = LogisticRegression()
model.fit(X_train,y_train)
y_LRpred = model.predict(X_test)
print('Accuracy for LogisticRegression is {:.2%}'.format(metrics.accuracy_score(y_LRpred,y_test)))
Out:
Accuracy for LogisticRegression is 91.25%
DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(X_train,y_train)
y_Dtreepred = model.predict(X_test)
print('Accuracy for DecisionTreeClassifier is {:.2%}'.format(metrics.accuracy_score(y_Dtreepred,y_test)))
Out:
Accuracy for DecisionTreeClassifier is 92.81%
KNeighborsClassifier
model = KNeighborsClassifier()
model.fit(X_train,y_train)
y_KNCpred = model.predict(X_test)
print('Accuracy for KNeighborsClassifier is {:.2%}'.format(metrics.accuracy_score(y_KNCpred,y_test)))
Out:
Accuracy for KNeighborsClassifier is 94.69%
Find the best accuracy across values of a single hyper-parameter (n_neighbors for KNN)
a_index = list(range(1,11))
accuracies = []
for i in a_index:
    model = KNeighborsClassifier(n_neighbors=i)
    model.fit(X_train,y_train)
    prediction = model.predict(X_test)
    accuracies.append(metrics.accuracy_score(prediction,y_test))
a = pd.Series(accuracies, index=a_index)  # collect in a list; Series.append is deprecated
plt.plot(a_index,a)
plt.xticks(a_index)
fig=plt.gcf()
fig.set_size_inches(12,6)
plt.show()
print('Accuracies for different values of n are:{} with the max value as {}'.format(a.values,a.values.max()))
Note:
plt.gcf()
returns the current Figure object; it is typically used to iterate over a figure's Axes (plt.gcf().get_axes()). The alternative is to index into the Axes array returned by plt.subplots to pick out one subplot's Axes.
Quoted from: Python数据处理学习笔记 - matplotlib API篇
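A quick toy sketch of what that note describes (my addition), iterating the current figure's Axes:

import matplotlib.pyplot as plt

fig, axes = plt.subplots(2, 2)
# plt.gcf() returns the Figure just created; get_axes() yields every
# subplot, so all of them can be styled without indexing `axes` directly.
for ax in plt.gcf().get_axes():
    ax.set_xlabel('x')
plt.show()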
Out:
Accuracies for different values of n are:[0.9375 0.9375 0.940625 0.93125 0.928125 0.93125 0.934375 0.940625
0.9375 0.940625] with the max value as 0.940625
GaussianNB
model = GaussianNB()
model.fit(X_train,y_train)
y_Gpred = model.predict(X_test)
print('Accuracy for GaussianNB is {:.2%}'.format(metrics.accuracy_score(y_Gpred,y_test)))
Out:
Accuracy for GaussianNB is 93.75%
RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train,y_train)
y_RFpred = model.predict(X_test)
print('Accuracy for RandomForestClassifier is {:.2%}'.format(metrics.accuracy_score(y_RFpred,y_test)))
Out:
Accuracy for RandomForestClassifier is 93.44%
Comparing model performance with cross-validation
classifiers=['Linear Svm','Radial Svm','Logistic Regression','KNN','Decision Tree','Naive Bayes','Random Forest']
models = [svm.SVC(kernel='linear'),svm.SVC(kernel='rbf'),LogisticRegression(),KNeighborsClassifier(n_neighbors=9),DecisionTreeClassifier(),GaussianNB(),RandomForestClassifier(n_estimators=100)]
kfold = KFold(n_splits=10)  # the CV splitter the loop below relies on
xyz = []       # mean CV accuracy per model
std = []       # standard deviation of the CV accuracy per model
accuracy = []  # the per-fold accuracy arrays, kept for the boxplot later
for model in models:
    cv_result = cross_val_score(model,X,Y,cv=kfold,scoring='accuracy')
    xyz.append(cv_result.mean())
    std.append(cv_result.std())
    accuracy.append(cv_result)
new_models_dataframe2 = pd.DataFrame({'CV mean':xyz,'Std':std},index=classifiers)
print(new_models_dataframe2)
Out:
CV mean Std
Linear Svm 0.937247 0.056415
Radial Svm 0.918402 0.079402
Logistic Regression 0.925934 0.069352
KNN 0.934747 0.058036
Decision Tree 0.918497 0.051709
Naive Bayes 0.927326 0.041804
Random Forest 0.943497 0.058382
Per-fold accuracy of each algorithm
plt.subplots(figsize = (12,6))
box = pd.DataFrame(accuracy,index=classifiers)  # index=[classifiers] would create a needless MultiIndex
box.T.boxplot()
plt.show()
Comparing the mean CV accuracies
new_models_dataframe2['CV mean'].plot.barh(width = 0.8)
plt.title('Average CV Mean Accuracy')
fig = plt.gcf()
fig.set_size_inches(8,5)
plt.show()
Validation with confusion matrices
from sklearn.model_selection import cross_val_predict
f,ax = plt.subplots(3,3,figsize=(12,10))
y_pred = cross_val_predict(svm.SVC(kernel='rbf'),X,Y,cv=10)
sns.heatmap(confusion_matrix(Y,y_pred),ax=ax[0,0],annot=True,fmt='2.0f',cmap='coolwarm')
ax[0,0].set_title('Matrix for rbf-SVM')
y_pred = cross_val_predict(svm.SVC(kernel='linear'),X,Y,cv=10)
sns.heatmap(confusion_matrix(Y,y_pred),ax=ax[0,1],annot=True,fmt='2.0f',cmap='coolwarm')
ax[0,1].set_title('Matrix for Linear-SVM')
y_pred = cross_val_predict(KNeighborsClassifier(n_neighbors=9),X,Y,cv=10)
sns.heatmap(confusion_matrix(Y,y_pred),ax=ax[0,2],annot=True,fmt='2.0f',cmap='coolwarm')
ax[0,2].set_title('Matrix for KNN')
y_pred = cross_val_predict(RandomForestClassifier(n_estimators=100),X,Y,cv=10)
sns.heatmap(confusion_matrix(Y,y_pred),ax=ax[1,0],annot=True,fmt='2.0f',cmap='coolwarm')
ax[1,0].set_title('Matrix for Random-Forests')
y_pred = cross_val_predict(LogisticRegression(),X,Y,cv=10)
sns.heatmap(confusion_matrix(Y,y_pred),ax=ax[1,1],annot=True,fmt='2.0f',cmap='coolwarm')
ax[1,1].set_title('Matrix for Logistic Regression')
y_pred = cross_val_predict(DecisionTreeClassifier(),X,Y,cv=10)
sns.heatmap(confusion_matrix(Y,y_pred),ax=ax[1,2],annot=True,fmt='2.0f',cmap='coolwarm')
ax[1,2].set_title('Matrix for Decision Tree')
y_pred = cross_val_predict(GaussianNB(),X,Y,cv=10)
sns.heatmap(confusion_matrix(Y,y_pred),ax=ax[2,0],annot=True,fmt='2.0f',cmap='coolwarm')
ax[2,0].set_title('Matrix for Naive Bayes')
ax[2,1].axis('off')  # only seven models, so blank out the two unused panels
ax[2,2].axis('off')
plt.subplots_adjust(hspace=0.3,wspace=0.5)
plt.show()
Out: (a 3x3 grid of confusion-matrix heatmaps, one per model)
My completionist streak insisted on grinding through every single one.
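With an imbalanced target, accuracy alone can look flattering, so the matrices above are worth reducing to per-class numbers. A hedged follow-up (my addition) that turns the rbf-SVM's cross-validated predictions into per-class precision/recall:

from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict

# Per-class precision/recall/F1 for the cross-validated predictions;
# Legendary=True is the rare class, so its recall is the number to watch.
y_pred = cross_val_predict(svm.SVC(kernel='rbf'), X, Y, cv=10)
print(classification_report(Y, y_pred))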
Hyper-Parameter Tuning
SVM
from sklearn.model_selection import GridSearchCV
C= [0.05,0.1,0.2,0.25,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]
gamma=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
kernel=['rbf','linear']
hyper={'kernel':kernel,'C':C,'gamma':gamma}
gd = GridSearchCV(estimator=svm.SVC(),param_grid=hyper,verbose=True)
gd.fit(X_train,y_train)
print(gd.best_score_)
print(gd.best_estimator_)
Out:
0.9414225941422594
SVC(C=0.2, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma=0.1, kernel='linear',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False)
Plug the best parameters back in
model = svm.SVC(C=0.2, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma=0.1, kernel='linear',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False)
model.fit(X_train,y_train)
y_svmpred = model.predict(X_test)
print('Accuracy for tuned SVM is {:.2%}'.format(metrics.accuracy_score(y_svmpred,y_test)))
Out:
Accuracy for tuned SVM is 95.31%
A clear improvement over the untuned result.
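Copying the printed parameters back by hand is error-prone; the fitted GridSearchCV object already holds a refit best model. An equivalent sketch:

# With refit=True (the default), gd.best_estimator_ is already retrained on
# all of X_train with the winning parameters; gd.best_params_ holds just the
# grid entries that won (here C, gamma, kernel).
best_model = gd.best_estimator_
y_svmpred = best_model.predict(X_test)
print('Accuracy for tuned SVM is {:.2%}'.format(metrics.accuracy_score(y_test, y_svmpred)))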
Random Forests
from sklearn.model_selection import GridSearchCV
n_estimators=range(100,1000,100)
hyper={'n_estimators':n_estimators}
gd=GridSearchCV(estimator=RandomForestClassifier(random_state=0),param_grid=hyper,verbose=True)
gd.fit(X_train,y_train)
print(gd.best_score_)
print(gd.best_estimator_)
Out:
0.9497907949790795
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=0, verbose=0,
warm_start=False)
Plug them in:
model = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=0, verbose=0,
warm_start=False)
model.fit(X_train,y_train)
y_RFpred = model.predict(X_test)
print('Accuracy for RandomForestClassifier is {:.2%}'.format(metrics.accuracy_score(y_RFpred,y_test)))
Out:
Accuracy for RandomForestClassifier is 95.62%
Ensemble methods
- Voting Classifier
- Bagging
- Boosting
Voting Classifier
Reference: 机器学习:集成学习(Soft Voting Classifier) (Machine Learning: Ensemble Learning, Soft Voting Classifier)
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
ensemble_lin_rbf=VotingClassifier(estimators=[('KNN',KNeighborsClassifier(n_neighbors=10)),
('RBF',svm.SVC(probability=True,kernel='rbf',C=0.05,gamma=0.1)),
('RFor',RandomForestClassifier(n_estimators=200,random_state=0)),
('LR',LogisticRegression(C=0.05)),
('DT',DecisionTreeClassifier(random_state=0)),
('NB',GaussianNB()),
('svm',svm.SVC(kernel='linear',gamma=0.1,probability=True))
], voting='soft')
ensemble_lin_rbf.fit(X_train,y_train)
out = ensemble_lin_rbf.score(X_test,y_test)
print('The accuracy for ensembled model is: {}'.format(out))
cross = cross_val_score(ensemble_lin_rbf,X_train,y_train,cv=10,scoring='accuracy')
print('The cross validated score is {}'.format(cross.mean()))
Out:
The accuracy for ensembled model is: 0.934375
The cross validated score is 0.9415544941380809
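voting='soft' averages the members' predicted class probabilities (uniform weights by default) and takes the argmax, which is why the two SVCs need probability=True. The same computation done by hand, as a sketch (my addition):

import numpy as np

# Average the seven members' probability estimates, then pick the most
# probable class; this mirrors what voting='soft' does internally.
probas = np.mean([clf.predict_proba(X_test) for clf in ensemble_lin_rbf.estimators_], axis=0)
y_soft = ensemble_lin_rbf.classes_[probas.argmax(axis=1)]
print('Manual soft-vote accuracy: {:.2%}'.format(np.mean(y_soft == y_test)))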
Hyper-Parameter Tuning for XGBoost
from xgboost import XGBClassifier
model = XGBClassifier(max_depth=5,subsample=0.8,objective='binary:logistic',n_estimators=1000,learning_rate=0.01)
eval_set = [(X_train,y_train),(X_test,y_test)]
model.fit(X_train,y_train,eval_metric=["error","logloss"],early_stopping_rounds=15,eval_set=eval_set,verbose=True)
Out:
Stopping. Best iteration:
[547] validation_0-error:0.006276 validation_0-logloss:0.03823 validation_1-error:0.059375 validation_1-logloss:0.119033
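The per-round metrics recorded through eval_set can be pulled back out with evals_result(), which makes the early-stopping point visible. A plotting sketch (my addition; the keys follow the eval_metric list above):

# evals_result() returns {'validation_0': {'error': [...], 'logloss': [...]},
#                         'validation_1': {...}} for the two eval_set entries.
results = model.evals_result()
rounds = range(len(results['validation_0']['logloss']))
plt.plot(rounds, results['validation_0']['logloss'], label='train logloss')
plt.plot(rounds, results['validation_1']['logloss'], label='test logloss')
plt.legend()
plt.show()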
from xgboost import XGBClassifier
n_estimators=list(range(400,600,50))
learn_rate=[0.05,0.1,0.2,0.25,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]
hyper={'n_estimators':n_estimators,'learning_rate':learn_rate}
gd=GridSearchCV(estimator=XGBClassifier(),param_grid=hyper,verbose=True)
gd.fit(X_train,y_train)
print(gd.best_score_)
print(gd.best_estimator_)
Out:
0.9560669456066946
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, gamma=0,
learning_rate=0.05, max_delta_step=0, max_depth=3,
min_child_weight=1, missing=None, n_estimators=400, n_jobs=1,
nthread=None, objective='binary:logistic', random_state=0,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
silent=None, subsample=1, verbosity=1)
from xgboost import XGBClassifier
model = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, gamma=0,
learning_rate=0.05, max_delta_step=0, max_depth=3,
min_child_weight=1, missing=None, n_estimators=400, n_jobs=1,
objective='binary:logistic', random_state=0,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
subsample=1, verbosity=1)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
print('Accuracy for XGBClassifier is {:.2%}'.format(metrics.accuracy_score(y_pred,y_test)))
Out:
Accuracy for XGBClassifier is 94.69%
Plot the confusion matrix
from sklearn.model_selection import cross_val_predict
result = cross_val_predict(model,X,Y,cv=10)
sns.heatmap(confusion_matrix(Y,result),cmap='coolwarm',annot=True,fmt='2.0f')
plt.show()
Feature importance
f,ax = plt.subplots(figsize=(5,5))
pd.Series(model.feature_importances_,X_train.columns).sort_values(ascending=True).plot.barh(width=0.8,color='#FD0F00')
ax.set_title('Feature Importance in XGBoost')
plt.show()
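xgboost also ships a built-in importance plot; a one-line alternative to the pandas bar chart (importance_type='weight' counts how often a feature is split on, so the ordering may differ from feature_importances_ above, whose importance type depends on the xgboost version):

from xgboost import plot_importance

# Built-in importance plot; 'weight' = number of times a feature is used to split.
plot_importance(model, importance_type='weight')
plt.show()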