2022高教社杯全国大学生数学建模竞赛C题 问题二(1) Python代码

问题 2

2.1 依据附件数据分析高钾玻璃、铅钡玻璃的分类规律


# Remove the 'rowSum' column from d12.
d12 = d12.drop(columns=['rowSum'])
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

# Encode the data for modelling.
# Object-dtype (categorical) columns are integer-coded with LabelEncoder;
# every other column is carried over unchanged.
label_encoder = LabelEncoder()
categorical_part = d12.select_dtypes(include=['object'])
numerical_part = d12.select_dtypes(exclude=['object'])

# `apply` calls fit_transform once per column, so the encoder is re-fitted for
# each column and afterwards only remembers the classes of the last one.
x_categorical = categorical_part.apply(label_encoder.fit_transform)
x_numerical = numerical_part.values

# Concatenate the two DataFrame slices directly. Both keep d12's row index and
# column names, so rows stay aligned even when d12's index is not 0..n-1
# (a bare pd.DataFrame(x_numerical) gets a fresh RangeIndex, and concat would
# then pad the mismatched rows with NaN). It also removes the need for
# hard-coded column positions to restore the column names.
df_encode = pd.concat([numerical_part, x_categorical], axis=1)

# Kept for any later code that reads `colnames`: non-object columns first,
# followed by the encoded categorical columns.
colnames = list(df_encode.columns)

5 rows × 21 columns

from sklearn.model_selection import train_test_split
# Separate the target column '类型' from the predictor columns.
y = df_encode['类型']
X = df_encode.drop(columns=['类型'])

# Hold out 20% of the rows as the test split; the fixed seed keeps the split
# reproducible.
# NOTE(review): the split is not stratified on y — consider whether
# stratify=y is wanted for this imbalanced target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)
铅钡    49
高钾    18
Name: count, dtype: int64
0    49
1    18
Name: count, dtype: int64
from imblearn.over_sampling import SMOTE

# Oversample with SMOTE on the training split only; X_test / y_test are not
# resampled.
# A fixed random_state makes the synthetic samples (and therefore every model
# trained on them) reproducible; the seed matches the one passed to
# train_test_split.
oversample = SMOTE(random_state=1)
X_train_smote, y_train_smote = oversample.fit_resample(X_train, y_train)
0    37
1    37
Name: count, dtype: int64


# Load libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics 
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve
# Build a decision tree with default settings and train it on the
# oversampled training data (fit returns the fitted estimator).
clf = DecisionTreeClassifier().fit(X_train_smote, y_train_smote)

# Predict labels for the held-out test split.
y_pred = clf.predict(X_test)

# Report overall accuracy, then the per-class precision / recall / F1 table.
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Confusion matrix of the current test-set predictions (y_test vs. y_pred).
cf_mat = confusion_matrix(y_test, y_pred)

# NOTE(review): the ConfusionMatrixDisplay(...) call below is cut off after its
# first argument in this file; the remaining arguments and the closing
# parenthesis must be restored before it can run.
disp = ConfusionMatrixDisplay(confusion_matrix=cf_mat,
[[12  0]
 [ 0  2]]



from sklearn import tree
# Render the fitted decision tree `clf` as plain-text rules.
text_representation = tree.export_text(clf)
|--- feature_9 <= 5.46
|   |--- class: 1
|--- feature_9 >  5.46
|   |--- class: 0
# Large canvas for drawing the decision tree.
fig = plt.figure(figsize=(25,20))
# NOTE(review): the plot_tree(...) call below is cut off after its first
# argument in this file; restore the remaining arguments and the closing
# parenthesis before running.
my_plot = tree.plot_tree(clf, 


# ROC analysis of the decision tree `clf` on the test split.
# Imported here because the file only imports pyplot further down.
import matplotlib.pyplot as plt

# Define metrics: the score for each test sample is the predicted probability
# of class 1 (second column of predict_proba).
y_pred_proba = clf.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

# Configure resolution and font sizes BEFORE the figure and its labels are
# created; text objects pick up the font size when they are constructed, so
# setting it afterwards would not affect the axis labels.
plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300
plt.rc('font', size=20)
plt.rc('figure', titlesize=20)
fig = plt.figure(figsize=(10,8))

# Create ROC curve. The plot call was missing, so fpr / tpr / auc were
# computed but nothing was drawn.
plt.plot(fpr, tpr, label="AUC=" + str(auc))
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc=4)


import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn import tree
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Random forest trained on the oversampled training data.
# NOTE(review): the RandomForestClassifier(...) call below is cut off after
# n_estimators=500 in this file; the remaining arguments and the closing
# parenthesis must be restored before it can run.
rf = RandomForestClassifier(n_estimators=500,
rf.fit(X_train_smote, y_train_smote)

# Predict the response for the test dataset with the random forest `rf`.
# (This previously called clf.predict, which re-scored the decision tree
# instead of the forest fitted just above.)
y_pred = rf.predict(X_test)

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
# Column names of the training features, used to label the split nodes.
fn = X_train_smote.columns.tolist()

# Small canvas rendered at a very high dpi.
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), dpi=800)

# Draw the estimator at index 200 of the fitted forest with filled nodes.
my_plot = tree.plot_tree(
    rf.estimators_[200],
    feature_names=fn,
    filled=True,
)


import xgboost as xgb

# Use "hist" for constructing the trees, with early stopping enabled.
# NOTE: this rebinds the name `xgb` from the xgboost module to the model
# object; the name is kept because later code may refer to the model as `xgb`.
xgb = xgb.XGBClassifier(tree_method="hist", early_stopping_rounds=2)

# Fit the model. Early stopping needs at least one evaluation set to monitor,
# so the test set is passed as eval_set (as the original comment intended);
# without it there is nothing for early stopping to act on.
# NOTE(review): early stopping on the test split leaks test information into
# training — a separate validation split would be cleaner.
xgb.fit(
    X_train_smote,
    y_train_smote,
    eval_set=[(X_test, y_test)],
    verbose=False,  # do not print the per-round evaluation log
)

# Predict the response for the test dataset.
y_pred = xgb.predict(X_test)

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
# plot single tree


from lightgbm import LGBMClassifier

# LightGBM classifier with default hyper-parameters, trained on the
# oversampled training data (fit returns the fitted estimator).
gbm = LGBMClassifier().fit(X_train_smote, y_train_smote)

# Score the held-out test split.
y_pred = gbm.predict(X_test)

# Overall accuracy, then the per-class report.
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
from catboost import CatBoostClassifier
# CatBoost classifier: 100 estimators, training log silenced.
cat = CatBoostClassifier(
    verbose=0,
    n_estimators=100,
)
cat.fit(X_train_smote, y_train_smote)

# Score the held-out test split and print accuracy plus the per-class report.
y_pred = cat.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
基于直方图的梯度提升Histogram-Based Gradient Boosting
from sklearn.ensemble import HistGradientBoostingClassifier

# Histogram-based gradient boosting with default settings, trained on the
# oversampled training data (fit returns the fitted estimator).
hbg = HistGradientBoostingClassifier().fit(X_train_smote, y_train_smote)

# Score the held-out test split and print accuracy plus the per-class report.
y_pred = hbg.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
梯度提升树Gradient Boosting Tree
from sklearn.ensemble import GradientBoostingClassifier
# Gradient-boosted trees with default settings, trained on the oversampled
# training data (fit returns the fitted estimator).
gb = GradientBoostingClassifier().fit(X_train_smote, y_train_smote)

# Score the held-out test split and print accuracy plus the per-class report.
y_pred = gb.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
from sklearn import linear_model
import numpy

# Logistic regression with default settings, trained on the oversampled
# training data.
logr = linear_model.LogisticRegression()
logr.fit(X_train_smote, y_train_smote)

# Evaluate the logistic-regression model itself. (This previously called
# gb.predict, which re-reported the gradient-boosting results.)
y_pred = logr.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
# Coefficients of the fitted logistic regression, treated here as log-odds.
log_odds = logr.coef_
# Exponentiate the coefficients to express them as odds.
odds = numpy.exp(log_odds)

array([[0.82909358, 1.21679202, 1.00751889, 1.15839779, 1.06619743,
        0.99198439, 0.96850246, 0.99970948, 1.01100367, 0.75457187,
        0.91357586, 0.99337601, 0.99650844, 1.00080449, 1.00000987,
        1.020977  , 1.02869626, 0.99088894, 0.8292335 , 0.98475949]])
朴素贝叶斯Naive Bayes
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings, trained on the oversampled
# training data.
gnb = GaussianNB()
# fit() returns the estimator, not predictions, so its result is no longer
# stored in y_pred.
gnb.fit(X_train_smote, y_train_smote)

# Score the held-out test split and print accuracy plus the per-class report.
y_pred = gnb.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
from sklearn import svm
# Support-vector classifier with default settings, trained on the oversampled
# training data.
# NOTE: this rebinds the name `svm` from the sklearn module to the fitted
# model instance (fit returns the estimator).
svm = svm.SVC().fit(X_train_smote, y_train_smote)

# Score the held-out test split and print accuracy plus the per-class report.
y_pred = svm.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
神经网络Neural network
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron: L-BFGS solver, two hidden layers of 5 and 2 units,
# alpha of 1e-5, and a fixed seed.
nn = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(5, 2),
    random_state=1,
)
nn.fit(X_train_smote, y_train_smote)

# Score the held-out test split and print accuracy plus the per-class report.
y_pred = nn.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14






