Mushroom_CART


Importing all the libraries

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import accuracy_score

Reading the file

We again use the mushroom dataset, taking the 22 categorical features directly from the Kaggle competition:
https://www.kaggle.com/uciml/mushroom-classification

# path to where the data lies
data = pd.read_csv("./data/mushrooms.csv")
data.head(6)


Let us check whether there are any null values.

The dataset has no null or missing values:

data.isnull().sum()


Check that there are exactly two classes: each mushroom is either poisonous (p) or edible (e).

data['class'].unique()


print(data.dtypes)


Check the shape: 8124 instances and 23 columns (the class label in the first column, followed by 22 features).

data.shape


The dataset's values are strings, so we need to convert each column's unique values to integers. We therefore perform label encoding on the data:

from sklearn.preprocessing import LabelEncoder
labelencoder=LabelEncoder()
for col in data.columns:
    data[col] = labelencoder.fit_transform(data[col])
 
data.head()

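One caveat with the loop above: it reuses a single LabelEncoder, so the fitted mapping for each earlier column is overwritten. A minimal alternative sketch (an addition, not in the original post) keeps one encoder per column so the integer codes can be decoded back later:

# alternative to the loop above (run one or the other, not both)
encoders = {}
for col in data.columns:
    encoders[col] = LabelEncoder()
    data[col] = encoders[col].fit_transform(data[col])

# e.g. recover the original class letters ('e' = edible, 'p' = poisonous)
print(encoders['class'].inverse_transform([0, 1]))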

X = data.iloc[:,1:23]  # all rows, all the features and no labels
y = data.iloc[:, 0]  # all rows, label only
X.head()
y.head()


Splitting the data into training and testing dataset

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=4)
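As a quick sanity check (an addition, not in the original post): the two classes are roughly balanced in this dataset, so a plain random split is acceptable; stratify=y could also be passed to train_test_split to preserve the exact class ratio in both splits.

# class proportions in each split
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))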

Default Logistic Regression

from sklearn.linear_model import LogisticRegression
model_LR= LogisticRegression()
model_LR.fit(X_train,y_train)


y_prob = model_LR.predict_proba(X_test)[:,1]  # positive-class prediction probabilities
y_pred = np.where(y_prob > 0.5, 1, 0)  # threshold the probabilities to get class predictions
model_LR.score(X_test, y_test)  # accuracy against the true labels


auc_roc = metrics.roc_auc_score(y_test, y_pred)  # ROC AUC from the hard class predictions
auc_roc

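Beyond the single AUC number, the full ROC curve shows the trade-off across all thresholds; a minimal sketch (an addition, not in the original post) built from the y_prob computed above:

# ROC curve from the predicted probabilities
fpr, tpr, _ = metrics.roc_curve(y_test, y_prob)
plt.plot(fpr, tpr, label='LogisticRegression (AUC = %.3f)' % metrics.roc_auc_score(y_test, y_prob))
plt.plot([0, 1], [0, 1], linestyle='--')  # chance line
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend()
plt.show()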

Logistic Regression (tuned model)

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn import metrics

# the liblinear solver supports both the 'l1' and 'l2' penalties in the grid below
LR_model = LogisticRegression(solver='liblinear')

tuned_parameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                    'penalty': ['l1', 'l2']
                   }

Grid search with 10-fold cross-validation

from sklearn.model_selection import GridSearchCV

LR= GridSearchCV(LR_model, tuned_parameters,cv=10)
LR.fit(X_train,y_train)


print(LR.best_params_)


y_prob = LR.predict_proba(X_test)[:,1]  # positive-class prediction probabilities
y_pred = np.where(y_prob > 0.5, 1, 0)  # threshold the probabilities to get class predictions
LR.score(X_test, y_test)  # accuracy against the true labels


auc_roc=metrics.roc_auc_score(y_test,y_pred)
auc_roc

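To see what the tuned model learned, the refit best estimator's coefficients can be inspected; a minimal sketch (an addition, not in the original post), keeping in mind that coefficients over label-encoded categories are only a rough guide:

# coefficients of the best estimator found by the grid search
best_lr = LR.best_estimator_
for name, coef in zip(X.columns, best_lr.coef_[0]):
    print('%-25s % .3f' % (name, coef))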

Default Decision Tree model

from sklearn.tree import DecisionTreeClassifier

model_tree = DecisionTreeClassifier()
model_tree.fit(X_train, y_train)


y_prob = model_tree.predict_proba(X_test)[:,1]  # positive-class prediction probabilities
y_pred = np.where(y_prob > 0.5, 1, 0)  # threshold the probabilities to get class predictions
model_tree.score(X_test, y_test)  # accuracy against the true labels


auc_roc=metrics.roc_auc_score(y_test,y_pred)
auc_roc

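Since this post is about CART, it may also help to look at the fitted tree itself; a minimal sketch (an addition, not in the original post), assuming scikit-learn 0.21+ where plot_tree is available, truncated to depth 3 for readability:

from sklearn.tree import plot_tree

plt.figure(figsize=(14, 8))
plot_tree(model_tree, max_depth=3, feature_names=list(X.columns),
          class_names=['edible', 'poisonous'], filled=True)
plt.show()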

Let us tune the hyperparameters of the Decision tree model

from sklearn.tree import DecisionTreeClassifier

model_DD = DecisionTreeClassifier()


tuned_parameters = {'max_features': ['sqrt', 'log2'],  # 'auto' (= sqrt for classifiers) has been removed in recent scikit-learn
                    'min_samples_leaf': range(1, 100, 1),
                    'max_depth': range(1, 50, 1)
                   }
# note: this grid has several thousand candidate settings, so 10-fold CV over it is slow
from sklearn.model_selection import GridSearchCV
DD = GridSearchCV(model_DD, tuned_parameters,cv=10)
DD.fit(X_train, y_train)
print(DD.cv_results_)  # grid_scores_ was removed in scikit-learn 0.20
print(DD.best_score_)
print(DD.best_params_)
y_prob = DD.predict_proba(X_test)[:,1]  # positive-class prediction probabilities
y_pred = np.where(y_prob > 0.5, 1, 0)  # threshold the probabilities to get class predictions
DD.score(X_test, y_test)  # accuracy against the true labels
report = metrics.classification_report(y_test, y_pred)  # per-class precision, recall and F1
print(report)
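cross_val_score was imported at the top but never used; a minimal sketch (an addition, not in the original post) of how it could double-check the tuned tree on the training split:

# 10-fold CV accuracy of the best tree found by the grid search
scores = cross_val_score(DD.best_estimator_, X_train, y_train, cv=10, scoring='accuracy')
print('%.4f +/- %.4f' % (scores.mean(), scores.std()))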

Default Random Forest

from sklearn.ensemble import RandomForestClassifier

model_RR=RandomForestClassifier()
model_RR.fit(X_train,y_train)
y_prob = model_RR.predict_proba(X_test)[:,1]  # positive-class prediction probabilities
y_pred = np.where(y_prob > 0.5, 1, 0)  # threshold the probabilities to get class predictions
model_RR.score(X_test, y_test)  # accuracy against the true labels
auc_roc=metrics.roc_auc_score(y_test,y_pred)
auc_roc
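Random forests also come with a built-in out-of-bag accuracy estimate, which avoids touching the test set; a minimal sketch (an addition, not in the original post):

# out-of-bag accuracy estimate
model_oob = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=4)
model_oob.fit(X_train, y_train)
print(model_oob.oob_score_)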

Let us also tune the Random Forest hyperparameters, for illustration:

  1) max_features, 2) n_estimators, 3) min_samples_leaf
from sklearn.ensemble import RandomForestClassifier

model_RR=RandomForestClassifier()

tuned_parameters = {'min_samples_leaf': range(10,100,10), 'n_estimators' : range(10,100,10),
                    'max_features':['auto','sqrt','log2']
                    }
from sklearn.model_selection import GridSearchCV
RR = GridSearchCV(model_RR, tuned_parameters,cv=10)
RR.fit(X_train,y_train)
print(RR.cv_results_)  # grid_scores_ was removed in scikit-learn 0.20
print(RR.best_score_)
print(RR.best_params_)
y_prob = RR.predict_proba(X_test)[:,1]  # positive-class prediction probabilities
y_pred = np.where(y_prob > 0.5, 1, 0)  # threshold the probabilities to get class predictions
RR.score(X_test, y_test)  # accuracy against the true labels
auc_roc=metrics.roc_auc_score(y_test,y_pred)
auc_roc

Default XGBoost

from xgboost import XGBClassifier
model_XGB=XGBClassifier()
model_XGB.fit(X_train,y_train)
y_prob = model_XGB.predict_proba(X_test)[:,1]  # positive-class prediction probabilities
y_pred = np.where(y_prob > 0.5, 1, 0)  # threshold the probabilities to get class predictions
model_XGB.score(X_test, y_test)  # accuracy against the true labels
auc_roc=metrics.roc_auc_score(y_test,y_pred)
auc_roc

Feature importance

XGBoost computes feature importances automatically during training and stores them in the feature_importances_ attribute:

print(model_XGB.feature_importances_)
# plot the importances in feature order
plt.bar(range(len(model_XGB.feature_importances_)), model_XGB.feature_importances_)
plt.show()

The chart above follows feature order; XGBoost's built-in plot_importance function instead plots the features sorted by importance:

# plot feature importance using XGBoost's built-in function
from xgboost import plot_importance
plot_importance(model_XGB)
plt.show()

Feature importance can also be used for feature selection:

from numpy import sort
from sklearn.feature_selection import SelectFromModel

# fit a model using each importance value as a selection threshold
thresholds = sort(model_XGB.feature_importances_)
for thresh in thresholds:
    # select the features whose importance is >= the current threshold
    selection = SelectFromModel(model_XGB, threshold=thresh, prefit=True)
    select_X_train = selection.transform(X_train)
    # train a fresh model on the selected features
    selection_model = XGBClassifier()
    selection_model.fit(select_X_train, y_train)
    # evaluate it on the test set, reduced to the same features
    select_X_test = selection.transform(X_test)
    y_pred = selection_model.predict(select_X_test)
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test, predictions)
    print("Thresh=%.3f, n=%d, Accuracy: %.2f%%" % (thresh, select_X_train.shape[1], accuracy * 100.0))
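Note that prefit=True reuses model_XGB, which was fitted above on all 22 features; selection.transform then keeps only the columns whose importance meets the current threshold, and a fresh XGBClassifier is trained on that reduced feature set.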