import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
数据分析与处理
data=pd.read_excel('quanbushuju.xlsx')
data.head()
MOF | Metal 1 Al | Metal 2 Cr | Metal 3 Hf | Metal 4 Y | Metal 5 Zr | Metal 6 Zn | Metal 7 Ni | Metal 8 Eu | Metal 9 Co | ... | metal node 31 [Ba(BDPD)(H2O)] | metal node 32 [(CH3)2NH2][M3(BTC)(HCOO)4(H2O)] | metal node 33 M6(TATAB)4(DABCO)3(H20)3 | metal node 34 Zn(PZDC)(ATZ) | sub PO | sub ECH | sub BO | sub SO | sub EBP | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | MOF-5 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
1 | USTC-253-TFA | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
2 | En-ZIF-8 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
3 | ZIF-8 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
4 | ZIF-68 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
5 rows × 128 columns
data.shape
(105, 128)
data.info(verbose=True)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105 entries, 0 to 104
Data columns (total 128 columns):
# Column Dtype
--- ------ -----
0 MOF object
1 Metal 1 Al int64
2 Metal 2 Cr int64
3 Metal 3 Hf int64
4 Metal 4 Y int64
5 Metal 5 Zr int64
6 Metal 6 Zn int64
7 Metal 7 Ni int64
8 Metal 8 Eu int64
9 Metal 9 Co int64
10 Metal 10 Cu int64
11 Metal 11 Ti int64
12 Metal 12 Cd int64
13 Metal 13 Li int64
14 Metal 14 Na int64
15 Metal 15 K int64
16 Metal 16 Rb int64
17 Metal 17 Cs int64
18 Metal 18 Mg int64
19 Metal 19 In int64
20 Metal 20 W int64
21 Metal 21 V int64
22 Metal 22 Ba int64
23 Metal 23 Mn int64
24 Anion 1 I- int64
25 Anion 2 Br- int64
26 Anion 3 -OH int64
27 Anion 4 Cl- int64
28 Anion 5 -COOH int64
29 Anion 6 -NH- int64
30 organic linker1 BDC int64
31 organic linker2 sbpdc int64
32 organic linker3 TFA int64
33 organic linker4 cbIM int64
34 organic linker5 MeIM int64
35 organic linker6 bIM int64
36 organic linker7 nIM int64
37 organic linker8 H4TBAPy int64
38 organic linker9 BTB int64
39 organic linker10 Me2-BPDC int64
40 organic linker11 NDC int64
41 organic linker12 TCPE int64
42 organic linker13 muco int64
43 organic linker14 bpa int64
44 organic linker15 1,2,4-btc int64
45 organic linker16 phen int64
46 organic linker17 H3L int64
47 organic linker18 BTC int64
48 organic linker19 AIP int64
49 organic linker20 NIP int64
50 organic linker21 2-F-BIM int64
51 organic linker22 TATAB int64
52 organic linker23 DABCO int64
53 organic linker24 salphen int64
54 organic linker25 saldpen int64
55 organic linker26 TCPP int64
56 organic linker27 TBPP int64
57 organic linker28 tdcbpp int64
58 organic linker29 tactmb int64
59 organic linker30 H8L int64
60 organic linker31 bpH2 int64
61 organic linker32 bpyH2 int64
62 organic linker33 NH2-BDC int64
63 organic linker34 NH2-BPY int64
64 organic linker35 MTTP int64
65 organic linker36 Glu int64
66 organic linker37 NH2-L int64
67 organic linker38 EDS int64
68 organic linker39 H4DHBDC int64
69 organic linker40 Etim-H2BDC int64
70 organic linker41 BDC-NHx(Me)3-x (I-) int64
71 organic linker42 ICA int64
72 organic linker43 F-ICA int64
73 organic linker44 IL int64
74 organic linker45 BDC-N(n-Bu)3Br int64
75 organic linker46 Im-BDC int64
76 organic linker47 BDC-P(n-Bu)3B int64
77 organic linker48 Meim-BDC int64
78 organic linker49 Allylim-2-bp int64
79 organic linker50 H4BDPD int64
80 organic linker 51 4,4`-bipy int64
81 organic linker 52 ZnTCPP int64
82 organic linker 53 citric acid int64
83 organic linker 54 MA int64
84 organic linker 55 H2L int64
85 organic linker 56 H2tzpa int64
86 organic linker 57 PZDC int64
87 organic linker 58 ATZ int64
88 metal node1 Zn4O(CO2)6 int64
89 metal node2 [Al-OH] chain int64
90 metal node3 [Cr3O(CO2)6] int64
91 metal node4 [Fe3O(CO2)6] int64
92 metal node5 [Hf6] int64
93 metal node6 [Y9(μ3-OH)8(μ2-OH)3(O2C)18] int64
94 metal node7 [Zr6O4(OH)4(CO2)8] int64
95 metal node8 [In-OH-In]n chain int64
96 metal node9 [Zn2(CO2)4] int64
97 metal node10 [Co2(CO2)4] int64
98 metal node11 [Cu2(CO2)4] int64
99 metal node12 [Cd2] int64
100 metal node13 [Cd4] int64
101 metal node14 [Zr6O4(OH)4(CO2)6] int64
102 metal node15 [Zr-O-P-O] chain int64
103 metal node16[Zr6O4(OH)4(CO2)7] int64
104 metal node17 [Zr6O4(OH)4(CO2)12] int64
105 metal node18 [Zn4O] int64
106 metal node19 [Co2x(OH2)2x(CO2)x] int64
107 metal node20 Mg2x(OH2)2x(CO2)x int64
108 metal node21 [Zr6O4(OH)4(CO2)12-x] int64
109 metal node22 Cu24(BDPO)12(H2O)12 int64
110 metal node23 Zr6O4(OH)4 int64
111 metal node 24 (Br-)Etim-Zr6 int64
112 metal node 25 Co(2)6 int64
113 metal node 26 Zn2MA(H2L) int64
114 metal node 27 {[Co(µ3-L)(H2O)]0.5H2O} int64
115 metal node 28 [Cu2(Cu-tactmb)(H2O)3(NO3)2] int64
116 metal node 29 In2(OH)(btc)(Hbtc)0.4(L)0.6·3H2O int64
117 metal node 30 [Co2(tzpa)(OH)(H2O)2 int64
118 metal node 31 [Ba(BDPD)(H2O)] int64
119 metal node 32 [(CH3)2NH2][M3(BTC)(HCOO)4(H2O)] int64
120 metal node 33 M6(TATAB)4(DABCO)3(H20)3 int64
121 metal node 34 Zn(PZDC)(ATZ) int64
122 sub PO int64
123 sub ECH int64
124 sub BO int64
125 sub SO int64
126 sub EBP int64
127 target int64
dtypes: int64(127), object(1)
memory usage: 105.1+ KB
特征数大于样本,需要降维
X=data.iloc[:,1:-1].values
X.shape
(105, 126)
y=data.iloc[:,-1].values
y.shape
(105,)
pd.value_counts(y==1)
True 76
False 29
dtype: int64
样本不平衡,提升0类数量,使用smote方法
from imblearn.over_sampling import SMOTE
sm=SMOTE()
X_res,y_res=sm.fit_resample(X,y)
print('numbers:',pd.value_counts(y_res==1))
numbers: True 76
False 76
dtype: int64
X_res.shape,y_res.shape
((152, 126), (152,))
划分数据
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X_res,y_res,random_state=0,
test_size=0.25)
降维
from sklearn.decomposition import PCA
pca=PCA(n_components=0.6)
X_train_pca=pca.fit_transform(X_train,y_train)
X_test_pca=pca.transform(X_test)
X_train_pca.shape
(114, 10)
训练模型并优化
from sklearn.tree import DecisionTreeClassifier
dtc=DecisionTreeClassifier(splitter='best')
from sklearn.model_selection import GridSearchCV
param={"max_depth":np.arange(3,15,1),
"min_samples_leaf":np.arange(3,16,1),
"min_samples_split":np.arange(5,12,1),
"max_leaf_nodes":np.arange(3,10,1)}
gs=GridSearchCV(estimator=dtc,
param_grid=param,
cv=5,
n_jobs=-1)
gs.fit(X_train_pca,y_train)
#获取最好模型
best_dtc=gs.best_estimator_
best_dtc.fit(X_train_pca,y_train)
#计数测试集和训练集平均准确率
print('{}{:.3f}'.format('train accuracy:',best_dtc.score(X_train_pca,y_train)))
print('{}{:.3f}'.format('test accuracy:',best_dtc.score(X_test_pca,y_test)))
train accuracy:0.825
test accuracy:0.763
gs.best_params_
{'max_depth': 4,
'max_leaf_nodes': 5,
'min_samples_leaf': 7,
'min_samples_split': 5}
性能评定
#混淆矩阵
from sklearn.metrics import confusion_matrix
y_pred=best_dtc.predict(X_test_pca)
confmat=confusion_matrix(y_test,y_pred)
plt.matshow(confmat,cmap=plt.cm.inferno,alpha=0.2)
for i in range(confmat.shape[0]):
for j in range(confmat.shape[1]):
plt.text(x=j,y=i,s=confmat[i,j],
va='center',ha='center')
plt.xlabel('predicted label')
plt.ylabel('true label')
plt.show()
#准确率和召回率
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score,f1_score
print('{}{:.3f}'.format('precsion:',precision_score(y_true=y_test,y_pred=y_pred)))
print('-------')
print('{}{:.3f}'.format('recall:',recall_score(y_true=y_test,y_pred=y_pred)))
print('-------')
print('{}{:.3f}'.format('F1:',f1_score(y_true=y_test,y_pred=y_pred)))
precsion:0.933
-------
recall:0.636
-------
F1:0.757
mean_fpr=np.linspace(0,1,100)
fig=plt.figure(figsize=(7,5))
probas=best_dtc.fit(X_train_pca,
y_train).predict_proba(X_test_pca)#得到每样本属于每类的概率
fpr,tpr,thresholds=roc_curve(y_test,probas[:,1],pos_label=1)
mean_tpr+=interp(mean_fpr,fpr,tpr)#插值得到fpr
roc_auc=auc(fpr,tpr)
plt.plot(fpr,
tpr)
#画random guessing
plt.plot([0,1],
[0,1],
linestyle='--',
color=(0.6,0.6,0.6),
label='random guesing')
#画出完美分类器
plt.plot([0,0,1],
[0,1,1],
linestyle=':',
color='k',
label='perfect performance')
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.legend(loc='best')
plt.show()
C:\Users\lenovo\AppData\Local\Temp\ipykernel_3720\755020257.py:8: DeprecationWarning: scipy.interp is deprecated and will be removed in SciPy 2.0.0, use numpy.interp instead
mean_tpr+=interp(mean_fpr,fpr,tpr)#插值得到fpr