Contents
Problem 2
2.1 Analyzing the classification patterns of high-potassium and lead-barium glass from the attachment data
Encoding categorical variables
# Drop the row-sum helper column from the earlier preprocessing step
d12 = d12.drop('rowSum', axis=1)
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Encode the categorical (object-dtype) columns as integers and
# recombine them with the numeric columns
label_encoder = LabelEncoder()
x_categorical = d12.select_dtypes(include=['object']).apply(label_encoder.fit_transform)
x_numerical = d12.select_dtypes(exclude=['object']).values
df_encode = pd.concat([pd.DataFrame(x_numerical), x_categorical], axis=1)
# Restore readable column names: artifact ID plus the 14 composition columns
# from d12, then the six encoded categorical columns (which kept their names)
colnames = [d12.columns[0]] + list(d12.columns[6:20]) + list(df_encode.columns[15:21])
df_encode.columns = colnames
df_encode.head()
| | 文物编号 | 二氧化硅(SiO2) | 氧化钠(Na2O) | 氧化钾(K2O) | 氧化钙(CaO) | 氧化镁(MgO) | 氧化铝(Al2O3) | 氧化铁(Fe2O3) | 氧化铜(CuO) | 氧化铅(PbO) | ... | 五氧化二磷(P2O5) | 氧化锶(SrO) | 氧化锡(SnO2) | 二氧化硫(SO2) | 纹饰 | 类型 | 颜色 | 表面风化 | 文物采样点 | 风化标记 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 69.33 | 0.0 | 9.99 | 6.32 | 0.87 | 3.93 | 1.74 | 3.87 | 0.00 | ... | 1.17 | 0.00 | 0.0 | 0.39 | 2 | 1 | 6 | 0 | 0 | 1 |
| 1 | 2.0 | 36.28 | 0.0 | 1.05 | 2.34 | 1.18 | 5.73 | 1.86 | 0.26 | 47.43 | ... | 3.57 | 0.19 | 0.0 | 0.00 | 0 | 0 | 1 | 1 | 1 | 1 |
| 2 | 3.0 | 87.05 | 0.0 | 5.19 | 2.01 | 0.00 | 4.06 | 0.00 | 0.78 | 0.25 | ... | 0.66 | 0.00 | 0.0 | 0.00 | 0 | 1 | 6 | 0 | 2 | 1 |
| 3 | 3.0 | 61.71 | 0.0 | 12.37 | 5.87 | 1.11 | 5.50 | 2.16 | 5.09 | 1.41 | ... | 0.70 | 0.10 | 0.0 | 0.00 | 0 | 1 | 6 | 0 | 3 | 1 |
| 4 | 4.0 | 65.88 | 0.0 | 9.67 | 7.12 | 1.56 | 6.44 | 2.06 | 2.18 | 0.00 | ... | 0.79 | 0.00 | 0.0 | 0.36 | 0 | 1 | 6 | 0 | 4 | 1 |
5 rows × 21 columns
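Because LabelEncoder was refit on every object column through apply, the category-to-integer mapping is never displayed. Refitting an encoder on the 类型 column alone exposes it (codes are assigned in sorted order); a minimal check:
from sklearn.preprocessing import LabelEncoder

# Refit an encoder on the type column alone to expose its mapping
le = LabelEncoder().fit(d12['类型'])
print(dict(zip(le.classes_, le.transform(le.classes_))))
# {'铅钡': 0, '高钾': 1}, consistent with the value counts shown below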
from sklearn.model_selection import train_test_split
X = df_encode.drop('类型', axis=1)
y = df_encode['类型']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
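With only 67 samples and an 18-sample minority class, an unstratified 80/20 split can leave very few minority samples in the test set (here just 2). A stratified split is a common alternative; the sketch below is illustrative only and was not used for the results that follow:
# Alternative split that preserves the class ratio (not used below)
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=1)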
Handling class imbalance
d12['类型'].value_counts()
类型
铅钡 49
高钾 18
Name: count, dtype: int64
df_encode['类型'].value_counts()
类型
0 49
1 18
Name: count, dtype: int64
from imblearn.over_sampling import SMOTE

# Balance the training set by synthesizing minority-class (高钾) samples
oversample = SMOTE()
X_train_smote, y_train_smote = oversample.fit_resample(X_train, y_train)
y_train_smote.value_counts()
类型
0 37
1 37
Name: count, dtype: int64
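Since the held-out test set contains just 14 samples, the accuracies reported below are coarse. A more stable protocol applies SMOTE inside each cross-validation fold, because oversampling before splitting leaks synthetic neighbors into validation; a sketch using imblearn's pipeline:
from imblearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier

# SMOTE runs only on the training folds; each validation fold stays untouched
pipe = Pipeline([('smote', SMOTE(random_state=0)),
                 ('clf', DecisionTreeClassifier(random_state=0))])
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
scores = cross_val_score(pipe, X, y, cv=cv, scoring='accuracy')
print(scores.mean(), scores.std())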
Classification models
Decision tree classification
Model evaluation reference: https://www.statology.org/sklearn-classification-report/
# Load libraries
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import classification_report

# Create and train a decision tree classifier on the SMOTE-balanced training set
clf = DecisionTreeClassifier()
clf = clf.fit(X_train_smote, y_train_smote)

# Predict the response for the test set
y_pred = clf.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

cf_mat = confusion_matrix(y_test, y_pred)
print('Confusion matrix')
print(cf_mat)
disp = ConfusionMatrixDisplay(confusion_matrix=cf_mat,
                              display_labels=clf.classes_)
disp.plot()
plt.show()
Confusion matrix
[[12  0]
 [ 0  2]]
Decision tree visualization
Reference: https://mljar.com/blog/visualize-decision-tree/
from sklearn import tree
text_representation = tree.export_text(clf)
print(text_representation)
|--- feature_9 <= 5.46
| |--- class: 1
|--- feature_9 > 5.46
| |--- class: 0
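The exported rules refer to feature_9 because export_text was not given column names; looking the index up shows the tree separates the two glass types on lead oxide content alone:
print(X.columns[9])  # 氧化铅(PbO): PbO <= 5.46 -> 高钾 (1), PbO > 5.46 -> 铅钡 (0)
print(tree.export_text(clf, feature_names=list(X.columns)))
This matches the chemistry of the problem: lead-barium glass is characterized by its high PbO content.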
fig = plt.figure(figsize=(25, 20))
# class_names must follow the encoded order: 0 = 铅钡, 1 = 高钾 (see the value counts above)
my_plot = tree.plot_tree(clf,
                         feature_names=list(X.columns),
                         class_names=['铅钡', '高钾'],
                         filled=True)
# Compute ROC metrics from the predicted probability of class 1
y_pred_proba = clf.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300
plt.rc('font', size=20)
plt.rc('figure', titlesize=20)

# Plot the ROC curve
fig = plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, label="AUC=" + str(auc))
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc=4)
plt.show()
Random forest classification
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=500,
                            random_state=0)
rf.fit(X_train_smote, y_train_smote)

# Predict the response for the test set with the forest
y_pred = rf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
fn = list(X_train_smote.columns)
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), dpi=800)
# Plot one of the 500 fitted trees; class_names again in encoded order (0 = 铅钡, 1 = 高钾)
my_plot = tree.plot_tree(rf.estimators_[200],
                         feature_names=fn,
                         class_names=['铅钡', '高钾'],
                         filled=True)
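A single tree out of 500 is only an anecdote; the forest's impurity-based feature importances summarize which components drive the classification overall:
# Mean decrease in impurity averaged over all 500 trees, largest first
importances = pd.Series(rf.feature_importances_, index=fn).sort_values(ascending=False)
print(importances.head(10))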
XGBoost classification
import xgboost as xgb

# Use "hist" for constructing the trees, with early stopping enabled.
# xgboost requires an eval_set when early_stopping_rounds is set.
xgb_model = xgb.XGBClassifier(tree_method="hist", early_stopping_rounds=2)

# Fit the model; the test set serves as the early-stopping validation set
# (convenient here, but it does leak test data into training decisions)
xgb_model.fit(X_train_smote, y_train_smote,
              eval_set=[(X_test, y_test)], verbose=False)

# Predict the response for the test set
y_pred = xgb_model.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
# To visualize a single boosted tree (requires the graphviz package):
# xgb.plot_tree(xgb_model)
# plt.show()
LightGBM classification
from lightgbm import LGBMClassifier
gbm = LGBMClassifier()
gbm.fit(X_train_smote, y_train_smote)
[LightGBM] [Info] Number of positive: 37, number of negative: 37
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003718 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 292
[LightGBM] [Info] Number of data points in the train set: 74, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
(the message above is repeated 100 times, once per boosting iteration; output truncated)
LGBMClassifier()
# Predict the response for the test set
y_pred = gbm.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
CatBoost classification
from catboost import CatBoostClassifier
cat = CatBoostClassifier(verbose=0, n_estimators=100)
cat.fit(X_train_smote, y_train_smote)
# Predict the response for the test set
y_pred = cat.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
Histogram-based gradient boosting
from sklearn.ensemble import HistGradientBoostingClassifier
hbg = HistGradientBoostingClassifier()
hbg.fit(X_train_smote, y_train_smote)
# Predict the response for the test set
y_pred = hbg.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
Gradient boosting trees
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier()
gb.fit(X_train_smote, y_train_smote)
y_pred = gb.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
Logistic regression
from sklearn import linear_model
import numpy as np

logr = linear_model.LogisticRegression()
logr.fit(X_train_smote, y_train_smote)

# Predict the response for the test set with the logistic model
y_pred = logr.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
# Exponentiating the coefficients gives odds ratios (one per feature)
print('Odds ratios (exp of the regression coefficients):')
log_odds = logr.coef_
odds = np.exp(log_odds)
odds
Odds ratios (exp of the regression coefficients):
array([[0.82909358, 1.21679202, 1.00751889, 1.15839779, 1.06619743,
0.99198439, 0.96850246, 0.99970948, 1.01100367, 0.75457187,
0.91357586, 0.99337601, 0.99650844, 1.00080449, 1.00000987,
1.020977 , 1.02869626, 0.99088894, 0.8292335 , 0.98475949]])
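The bare array is hard to read; pairing each odds ratio with its feature name shows which components shift a prediction toward class 1 (高钾):
# Odds ratio > 1 pushes the prediction toward class 1 (高钾), < 1 toward class 0 (铅钡)
odds_by_feature = pd.Series(odds[0], index=X.columns).sort_values()
print(odds_by_feature)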
Naive Bayes
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train_smote, y_train_smote)
y_pred = gnb.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
Support vector machine (SVM)
from sklearn import svm

# Name the estimator svc so it does not shadow the svm module
svc = svm.SVC()
svc.fit(X_train_smote, y_train_smote)
y_pred = svc.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
Neural network
from sklearn.neural_network import MLPClassifier
nn = MLPClassifier(solver='lbfgs', alpha=1e-5,
hidden_layer_sizes=(5, 2), random_state=1)
nn.fit(X_train_smote, y_train_smote)
y_pred = nn.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
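Every model above scores 100% on the same 14 test samples, which says more about the tiny test set and the near-perfect separability on PbO than about differences between the models. A compact comparison loop over the estimators fitted in this section (variable names as defined above):
models = {'DecisionTree': clf, 'RandomForest': rf, 'XGBoost': xgb_model,
          'LightGBM': gbm, 'CatBoost': cat, 'HistGB': hbg,
          'GradientBoosting': gb, 'LogisticRegression': logr,
          'GaussianNB': gnb, 'SVM': svc, 'MLP': nn}
for name, model in models.items():
    acc = metrics.accuracy_score(y_test, model.predict(X_test))
    print(f'{name}: {acc:.3f}')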
Further reading: