Mushroom_CART

最新推荐文章于 2024-05-27 23:02:33 发布

*Major*

最新推荐文章于 2024-05-27 23:02:33 发布

阅读量459

点赞数

本文链接：https://blog.csdn.net/qq_41375318/article/details/108131066

版权

Mushroom CART

Importing all the libraries

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import accuracy_score

Reading the file

还是蘑菇数据集，直接采用Kaggle竞赛中22维特征
https://www.kaggle.com/uciml/mushroom-classification

# Load the Kaggle mushroom table from a path relative to the working directory.
data = pd.read_csv("./data/mushrooms.csv")
# Preview the first six rows.
data.head(6)

在这里插入图片描述

Let us check whether there are any null values

该数据没有空值／缺失数据

# Count the missing (null) entries in each column.
data.isnull().sum()

在这里插入图片描述

Check that we have exactly two classes: the mushroom is either poisonous or edible.

# List the distinct values that occur in the label column 'class'.
data['class'].unique()

在这里插入图片描述

# Show the dtype pandas assigned to every column.
print(data.dtypes)

在这里插入图片描述

Check that the table has 23 columns (the first is the label, the remaining 22 are features) and 8124 instances

# (number of rows, number of columns) of the loaded table.
data.shape

在这里插入图片描述

All values in the dataset are strings. We need to convert every unique value to an integer, so we apply label encoding to each column.

from sklearn.preprocessing import LabelEncoder

# One encoder object is re-fitted for each column in turn, so after the loop
# it only remembers the category mapping of the last column processed.
labelencoder = LabelEncoder()
for col in data.columns:
    encoded_column = labelencoder.fit_transform(data[col])
    data[col] = encoded_column

# Preview the integer-encoded table.
data.head()

在这里插入图片描述

# Column 0 holds the label; columns 1..22 hold the features.
y = data.iloc[:, 0]
X = data.iloc[:, 1:23]

# Quick look at both pieces.
X.head()
y.head()

在这里插入图片描述

Splitting the data into training and testing dataset

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)

default Logistic Regression

from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression with every hyperparameter left at its
# library default.
model_LR = LogisticRegression()

# Fit on the training portion only.
model_LR.fit(X_train, y_train)

在这里插入图片描述

# Probability assigned to the second class in model_LR.classes_ for each test row.
y_prob = model_LR.predict_proba(X_test)[:, 1]
# Threshold the probabilities at 0.5 to obtain hard 0/1 class predictions.
y_pred = np.where(y_prob > 0.5, 1, 0)
# Accuracy has to be measured against the true labels. Scoring against y_pred
# would compare the model with its own output and report ~1.0 regardless of
# how good the model actually is.
model_LR.score(X_test, y_test)

在这里插入图片描述

accuracy

# ROC AUC is a ranking metric, so feed it the predicted probabilities rather
# than the 0/1 labels obtained after thresholding (which discard the ranking).
auc_roc = metrics.roc_auc_score(y_test, y_prob)
auc_roc

在这里插入图片描述

Logistic Regression(Tuned model)

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn import metrics

# The grid below tries both L1 and L2 penalties. scikit-learn's default
# 'lbfgs' solver does not support penalty='l1', so every L1 candidate would
# fail to fit; 'liblinear' handles both penalties.
LR_model = LogisticRegression(solver='liblinear')

# Candidate values: seven regularisation settings for C crossed with the two
# penalty types (14 combinations in total).
tuned_parameters = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'penalty': ['l1', 'l2'],
}

CV

from sklearn.model_selection import GridSearchCV

# Exhaustive search over `tuned_parameters`, each candidate evaluated with
# 10-fold cross-validation.
LR = GridSearchCV(estimator=LR_model, param_grid=tuned_parameters, cv=10)

# Run the search on the training portion only.
LR.fit(X_train, y_train)

在这里插入图片描述

# Parameter combination selected by the grid search.
print(LR.best_params_)

在这里插入图片描述

# Probability assigned to the second class for each test row, taken from the
# best estimator found by the grid search.
y_prob = LR.predict_proba(X_test)[:, 1]
# Threshold the probabilities at 0.5 to obtain hard 0/1 class predictions.
y_pred = np.where(y_prob > 0.5, 1, 0)
# Score against the true labels; scoring against y_pred would only compare
# the model with its own predictions and always look perfect.
LR.score(X_test, y_test)

在这里插入图片描述

# ROC AUC is a ranking metric, so feed it the predicted probabilities rather
# than the 0/1 labels obtained after thresholding (which discard the ranking).
auc_roc = metrics.roc_auc_score(y_test, y_prob)
auc_roc

在这里插入图片描述

Default Decision Tree model

from sklearn.tree import DecisionTreeClassifier

# Baseline decision tree with every hyperparameter left at its library default.
model_tree = DecisionTreeClassifier()

# Fit on the training portion only.
model_tree.fit(X_train, y_train)

在这里插入图片描述

# Probability assigned to the second class in model_tree.classes_ for each test row.
y_prob = model_tree.predict_proba(X_test)[:, 1]
# Threshold the probabilities at 0.5 to obtain hard 0/1 class predictions.
y_pred = np.where(y_prob > 0.5, 1, 0)
# Score against the true labels; scoring against y_pred would only compare
# the model with its own predictions and always look perfect.
model_tree.score(X_test, y_test)

在这里插入图片描述

# ROC AUC is a ranking metric, so feed it the predicted probabilities rather
# than the 0/1 labels obtained after thresholding (which discard the ranking).
auc_roc = metrics.roc_auc_score(y_test, y_prob)
auc_roc

在这里插入图片描述

Let us tune the hyperparameters of the Decision tree model

from sklearn.tree import DecisionTreeClassifier

model_DD = DecisionTreeClassifier()

# Search grid for the tree. "auto" is intentionally left out: for this
# classifier it was only an alias of "sqrt" (max_features=sqrt(n_features)),
# so it duplicated work, and current scikit-learn releases reject it.
# The grid is still large (2 * 99 * 49 candidates, each fitted 10 times),
# so expect this cell to take a while.
tuned_parameters = {
    'max_features': ["sqrt", "log2"],
    'min_samples_leaf': range(1, 100, 1),
    'max_depth': range(1, 50, 1),
}

from sklearn.model_selection import GridSearchCV
DD = GridSearchCV(model_DD, tuned_parameters, cv=10)

DD.fit(X_train, y_train)

# `grid_scores_` no longer exists on GridSearchCV; `cv_results_` carries the
# same per-candidate information. Show the parameters with their mean and
# standard deviation of the cross-validation score.
cv_summary = pd.DataFrame(DD.cv_results_)[
    ['params', 'mean_test_score', 'std_test_score']
]
print(cv_summary)

print(DD.best_score_)

print(DD.best_params_)

# Probability assigned to the second class for each test row, taken from the
# best estimator found by the grid search.
y_prob = DD.predict_proba(X_test)[:, 1]
# Threshold the probabilities at 0.5 to obtain hard 0/1 class predictions.
y_pred = np.where(y_prob > 0.5, 1, 0)
# Score against the true labels; scoring against y_pred would only compare
# the model with its own predictions and always look perfect.
DD.score(X_test, y_test)

# NOTE: despite the variable name this holds the text classification report
# (precision / recall / F1 per class), not an AUC value. It is printed so the
# table is rendered with real line breaks.
auc_roc = metrics.classification_report(y_test, y_pred)
print(auc_roc)

Default Random Forest

from sklearn.ensemble import RandomForestClassifier

# Baseline random forest with every hyperparameter left at its library default.
model_RR = RandomForestClassifier()

model_RR.fit(X_train, y_train)

# Probability assigned to the second class in model_RR.classes_ for each test row.
y_prob = model_RR.predict_proba(X_test)[:, 1]
# Threshold the probabilities at 0.5 to obtain hard 0/1 class predictions.
y_pred = np.where(y_prob > 0.5, 1, 0)
# Score against the true labels; scoring against y_pred would only compare
# the model with its own predictions and always look perfect.
model_RR.score(X_test, y_test)

# ROC AUC is a ranking metric, so it is computed from the probabilities
# rather than from the thresholded 0/1 labels.
auc_roc = metrics.roc_auc_score(y_test, y_prob)
auc_roc

Let us tune the hyperparameters of the Random Forest, just for the sake of learning

1) max_features 2) n_estimators 3) min_samples_leaf

from sklearn.ensemble import RandomForestClassifier

model_RR = RandomForestClassifier()

# Search grid for the forest. 'auto' is intentionally left out: for this
# classifier it was only an alias of 'sqrt', so it duplicated work, and
# current scikit-learn releases reject it.
tuned_parameters = {
    'min_samples_leaf': range(10, 100, 10),
    'n_estimators': range(10, 100, 10),
    'max_features': ['sqrt', 'log2'],
}

from sklearn.model_selection import GridSearchCV
RR = GridSearchCV(model_RR, tuned_parameters, cv=10)

RR.fit(X_train, y_train)

# `grid_scores_` no longer exists on GridSearchCV; `cv_results_` carries the
# same per-candidate information. Show the parameters with their mean and
# standard deviation of the cross-validation score.
cv_summary = pd.DataFrame(RR.cv_results_)[
    ['params', 'mean_test_score', 'std_test_score']
]
print(cv_summary)

print(RR.best_score_)

print(RR.best_params_)

# Probability assigned to the second class for each test row, taken from the
# best estimator found by the grid search.
y_prob = RR.predict_proba(X_test)[:, 1]
# Threshold the probabilities at 0.5 to obtain hard 0/1 class predictions.
y_pred = np.where(y_prob > 0.5, 1, 0)
# The fitted search object is `RR` (the name `RR_model` was never defined and
# raised NameError). Score against the true labels, not the model's own output.
RR.score(X_test, y_test)

# ROC AUC is a ranking metric, so it is computed from the probabilities
# rather than from the thresholded 0/1 labels.
auc_roc = metrics.roc_auc_score(y_test, y_prob)
auc_roc

Default XGBoost

from xgboost import XGBClassifier

# Baseline gradient-boosted model with every hyperparameter left at its
# library default.
model_XGB = XGBClassifier()

model_XGB.fit(X_train, y_train)

# Probability assigned to the second class for each test row.
y_prob = model_XGB.predict_proba(X_test)[:, 1]
# Threshold the probabilities at 0.5 to obtain hard 0/1 class predictions.
y_pred = np.where(y_prob > 0.5, 1, 0)
# Score against the true labels; scoring against y_pred would only compare
# the model with its own predictions and always look perfect.
model_XGB.score(X_test, y_test)

# ROC AUC is a ranking metric, so it is computed from the probabilities
# rather than from the thresholded 0/1 labels.
auc_roc = metrics.roc_auc_score(y_test, y_prob)
auc_roc

特征重要性

在XGBoost中特征重要性已经自动算好，存放在feature_importances_

from matplotlib import pyplot

# Importance scores of the fitted model, one entry per feature, in the same
# order as the feature columns.
importances = model_XGB.feature_importances_
print(importances)

# Bar chart: x is the feature position, height is its importance.
bar_positions = range(len(importances))
pyplot.bar(bar_positions, importances)
pyplot.show()

上述表是按特征顺序打印，还可以使用XGBoost内嵌的函数，按特征重要性排序

# Plot feature importance with XGBoost's built-in helper, which draws the
# features ordered by their importance score.
from xgboost import plot_importance
plot_importance(model_XGB)
# Render the figure created by plot_importance.
pyplot.show()

可以根据特征重要性进行特征选择

from numpy import sort
from sklearn.feature_selection import SelectFromModel

# Use every importance value of the fitted model, in ascending order, as a
# selection cut-off. Each pass keeps the features whose importance reaches the
# cut-off, so the number of retained features shrinks as the loop proceeds.
thresholds = sort(model_XGB.feature_importances_)
for thresh in thresholds:
    # Reuse the already-fitted model (prefit=True) purely as a feature filter
    # and apply the same filter to the training and the test features.
    selection = SelectFromModel(model_XGB, threshold=thresh, prefit=True)
    select_X_train = selection.transform(X_train)
    select_X_test = selection.transform(X_test)

    # Train a fresh default model on the reduced feature set.
    selection_model = XGBClassifier()
    selection_model.fit(select_X_train, y_train)

    # Evaluate on the held-out rows and report accuracy as a percentage.
    y_pred = selection_model.predict(select_X_test)
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test, predictions)
    n_selected = select_X_train.shape[1]
    print("Thresh=%.3f, n=%d, Accuracy: %.2f%%" % (thresh, n_selected,
          accuracy*100.0))