KNN预测QSAR生物浓缩类别
import numpy
import pandas  # used to read the CSV dataset into a DataFrame
from sklearn.neighbors import KNeighborsClassifier  # classic machine-learning library (no deep-learning algorithms)

# Load the dataset and report how many rows and columns it has.
shen = pandas.read_csv(r"D:\Python\代码\Machine-Learn\1-KNN\data\shenwu.csv")
print("总数据条数:{};列数:{}".format(shen.shape[0], shen.shape[1]))
shen.head()
总数据条数:779;列数:14
CASSMILESSetnHMpiPC09PCDX2AvMLOGPON1VN-072B02[C-N]F04[C-O]ClasslogBCF
0
100-02-7
O=[N+](c1ccc(cc1)O)[O-]
Train
0
0.0
1.49
0.14
1.35
0.72
0
1
5
1
0.74
1
100-17-4
O=[N+](c1ccc(cc1)OC)[O-]
Train
0
0.0
1.47
0.14
1.70
0.88
0
1
5
1
0.93
2
100-18-5
c1cc(ccc1C(C)C)C(C)C
Train
0
0.0
1.20
0.25
4.14
2.06
0
0
0
3
3.24
3
100-25-4
O=[N+]([O-])c1ccc(cc1)[N+](=O)[O-]
Train
0
0.0
1.69
0.13
1.89
0.79
0
1
8
3
-0.40
4
100-40-3
C=CC1CCC=CC1
Train
0
0.0
0.52
0.25
2.65
1.31
0
0
0
1
2.24
# Split the rows into training and test subsets according to the "Set" column.
shen_train = shen[shen.Set.isin(["Train"])]
shen_test = shen[shen.Set.isin(["Test"])]
print("训练数据:{}个\n测试数据:{}个".format(shen_train.shape[0], shen_test.shape[0]))
shen_test.head()
训练数据:584个
测试数据:195个
CASSMILESSetnHMpiPC09PCDX2AvMLOGPON1VN-072B02[C-N]F04[C-O]ClasslogBCF
5
100-42-5
C=Cc1ccccc1
Test
0
0.000
1.40
0.18
2.85
0.86
0
0
0
3
1.13
12
101-53-1
Oc1ccc(cc1)Cc1ccccc1
Test
0
5.768
2.21
0.18
3.40
1.47
0
0
1
3
1.40
15
101-84-8
O(c1ccccc1)c1ccccc1
Test
0
5.614
2.21
0.16
3.40
1.31
0
0
2
1
2.57
16
102-06-7
N=C(Nc1ccccc1)Nc1ccccc1
Test
0
5.030
2.07
0.16
3.09
1.54
0
1
0
2
1.05
19
10315-98-7
O1CCN(CC1)CC(C)C
Test
0
0.000
0.00
0.28
1.00
1.80
0
1
1
1
0.23
从训练集和测试集中分别提取特征数据(x)和结果数据(y)
# Select the target columns: the last two columns of each subset.
# .copy() makes the frames independent of `shen`, because change_type()
# later overwrites their columns in place.
y_train = shen_train.iloc[:, [-2, -1]].copy()
y_test = shen_test.iloc[:, [-2, -1]].copy()
print("训练数据结果:\n{}\n测试数据结果:\n{}\n".format(y_train.head(), y_test.head()))
训练数据结果:
Class logBCF
0 1 0.74
1 1 0.93
2 3 3.24
3 3 -0.40
4 1 2.24
测试数据结果:
Class logBCF
5 3 1.13
12 3 1.40
15 1 2.57
16 2 1.05
19 1 0.23
# Select the feature columns: positions 3 through 11 (nine columns).
# .copy() makes the frames independent of `shen`, because change_type()
# later overwrites their columns in place.
feature_positions = [3, 4, 5, 6, 7, 8, 9, 10, 11]
x_train = shen_train.iloc[:, feature_positions].copy()
x_test = shen_test.iloc[:, feature_positions].copy()
print("训练数据:\n{}\n测试数据:\n{}\n".format(x_train.head(), x_test.head()))
训练数据:
nHM piPC09 PCD X2Av MLOGP ON1V N-072 B02[C-N] F04[C-O]
0 0 0.0 1.49 0.14 1.35 0.72 0 1 5
1 0 0.0 1.47 0.14 1.70 0.88 0 1 5
2 0 0.0 1.20 0.25 4.14 2.06 0 0 0
3 0 0.0 1.69 0.13 1.89 0.79 0 1 8
4 0 0.0 0.52 0.25 2.65 1.31 0 0 0
测试数据:
nHM piPC09 PCD X2Av MLOGP ON1V N-072 B02[C-N] F04[C-O]
5 0 0.000 1.40 0.18 2.85 0.86 0 0 0
12 0 5.768 2.21 0.18 3.40 1.47 0 0 1
15 0 5.614 2.21 0.16 3.40 1.31 0 0 2
16 0 5.030 2.07 0.16 3.09 1.54 0 1 0
19 0 0.000 0.00 0.28 1.00 1.80 0 1 1
# Inspect the dtypes of the training targets; when a column's dtype is not int, it has to be mapped to numbers before training.
y_train.info()
Int64Index: 584 entries, 0 to 776
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Class 584 non-null int64
1 logBCF 584 non-null float64
dtypes: float64(1), int64(1)
memory usage: 13.7 KB
将非int类型的数据量化为整数
def change_type(values):
    """Replace every value of every column in *values* with an integer code, in place.

    Within each column, distinct values are numbered 0, 1, 2, ... in the
    order returned by ``Series.unique()`` (order of first appearance), and
    each cell is replaced by the number of its value.

    Args:
        values: pandas DataFrame to encode. It is modified in place.

    Returns:
        None.

    NOTE(review): the mapping is rebuilt from the frame it is given, so two
    frames encoded by separate calls do not share a mapping.
    """
    for col in values.columns:
        # Build the value -> code table once per column; a dict lookup per
        # cell replaces scanning the unique-value array for every cell.
        codes = {value: index for index, value in enumerate(values[col].unique())}
        values[col] = values[col].map(codes)
# Integer-encode the feature and target frames in place.
# NOTE(review): each frame is encoded by its own call, so the same raw value
# may receive different codes in the training and test frames - confirm this
# is intended.
for frame in (x_train, x_test, y_train, y_test):
    change_type(frame)
y_train
584 rows × 2 columns
ClasslogBCF
0
0
0
1
0
1
2
1
2
3
1
3
4
0
4
...
...
...
771
0
333
772
0
334
773
0
41
774
0
142
776
0
335
# Baseline model: 5 nearest neighbours with distance-weighted voting.
knn = KNeighborsClassifier(n_neighbors=5, weights="distance", n_jobs=-1)
knn.fit(x_train, y_train)
y_ = knn.predict(x_test)
# Per-column fraction of test rows whose prediction equals the true value.
acc = (y_ == y_test).mean()
# .iloc selects by position explicitly; plain acc[0] / acc[1] on a
# label-indexed Series relies on a deprecated positional fallback.
print("预测生物富集因子准确率:{};预测生物富集等级准确率:{}".format(acc.iloc[1], acc.iloc[0]))
预测生物富集因子准确率:0.041025641025641026;预测生物富集等级准确率:0.4153846153846154
提高算法准确率
1,修改算法参数
# Tuned parameters: 3 neighbours, distance-weighted voting, p=1.
knn = KNeighborsClassifier(n_neighbors=3, weights="distance", p=1, n_jobs=-1)
knn.fit(x_train, y_train)
y_ = knn.predict(x_test)
# Per-column fraction of test rows whose prediction equals the true value.
acc = (y_ == y_test).mean()
# .iloc selects by position explicitly; plain acc[0] / acc[1] on a
# label-indexed Series relies on a deprecated positional fallback.
print("预测生物富集因子准确率:{};预测生物富集等级准确率:{}".format(acc.iloc[1], acc.iloc[0]))
预测生物富集因子准确率:0.06666666666666667;预测生物富集等级准确率:0.441025641025641
2,修改训练数据
# Min-max normalisation (helps when feature ranges differ widely, since it
# removes the scale differences between attributes).
x_train_min = x_train.min()
x_train_max = x_train.max()
# A column that is constant in the training set has range 0; use 1 instead so
# the division yields 0 rather than NaN.
x_train_range = (x_train_max - x_train_min).replace(0, 1)
x2_train = (x_train - x_train_min) / x_train_range
x_test_min = x_test.min()
x_test_max = x_test.max()
# Scale the test set with the TRAINING statistics so both sets live on the
# same scale; normalising it with its own min/max would make the distances
# computed by KNN between train and test rows inconsistent.
x2_test = (x_test - x_train_min) / x_train_range
x2_test.head()
nHMpiPC09PCDX2AvMLOGPON1VN-072B02[C-N]F04[C-O]
5
0.0
0.000000
0.000000
0.000000
0.000000
0.000000
0.0
0.0
0.000000
12
0.0
0.008929
0.009524
0.000000
0.006849
0.007874
0.0
0.0
0.058824
15
0.0
0.017857
0.009524
0.029412
0.006849
0.015748
0.0
0.0
0.117647
16
0.0
0.026786
0.019048
0.029412
0.013699
0.023622
0.0
1.0
0.000000
19
0.0
0.000000
0.028571
0.058824
0.020548
0.031496
0.0
1.0
0.058824
# Same model, trained and evaluated on the min-max normalised features.
knn = KNeighborsClassifier(n_neighbors=3, weights="distance", p=1, n_jobs=-1)
knn.fit(x2_train, y_train)
y_ = knn.predict(x2_test)
# Per-column fraction of test rows whose prediction equals the true value.
acc = (y_ == y_test).mean()
# .iloc selects by position explicitly; plain acc[0] / acc[1] on a
# label-indexed Series relies on a deprecated positional fallback.
print("预测生物富集因子准确率:{};预测生物富集等级准确率:{}".format(acc.iloc[1], acc.iloc[0]))
预测生物富集因子准确率:0.02564102564102564;预测生物富集等级准确率:0.4358974358974359
# Z-score standardisation.
# Per-column mean of the training features.
x_train_mean = x_train.mean()
# Per-column standard deviation of the training features; a zero deviation
# (constant column) is replaced by 1 so the division yields 0 rather than NaN.
x_train_std = x_train.std().replace(0, 1)
x3_train = (x_train - x_train_mean) / x_train_std
x3_train.head()
# Per-column mean and standard deviation of the test features.
x_test_mean = x_test.mean()
x_test_std = x_test.std()
# Standardise the test set with the TRAINING mean/std so both sets share one
# scale; using the test set's own statistics would shift it relative to the
# data the model was fitted on.
x3_test = (x_test - x_train_mean) / x_train_std
x3_test.head()
nHMpiPC09PCDX2AvMLOGPON1VN-072B02[C-N]F04[C-O]
5
-0.858971
-0.961536
-1.433213
-1.308189
-1.621471
-1.571899
-0.37945
-0.885971
-0.901314
12
-0.858971
-0.934067
-1.399267
-1.308189
-1.597619
-1.544272
-0.37945
-0.885971
-0.706463
15
-0.858971
-0.906599
-1.399267
-1.184954
-1.597619
-1.516645
-0.37945
-0.885971
-0.511611
16
-0.858971
-0.879131
-1.365321
-1.184954
-1.573767
-1.489018
-0.37945
1.122917
-0.901314
19
-0.858971
-0.961536
-1.331375
-1.061719
-1.549914
-1.461391
-0.37945
1.122917
-0.706463
# Same model, trained and evaluated on the z-score standardised features.
knn = KNeighborsClassifier(n_neighbors=3, weights="distance", p=1, n_jobs=-1)
knn.fit(x3_train, y_train)
y_ = knn.predict(x3_test)
# Per-column fraction of test rows whose prediction equals the true value.
acc = (y_ == y_test).mean()
# .iloc selects by position explicitly; plain acc[0] / acc[1] on a
# label-indexed Series relies on a deprecated positional fallback.
print("预测生物富集因子准确率:{};预测生物富集等级准确率:{}".format(acc.iloc[1], acc.iloc[0]))
预测生物富集因子准确率:0.015384615384615385;预测生物富集等级准确率:0.41025641025641024
# scikit-learn's ready-made scalers for the same two normalisations.
from sklearn.preprocessing import StandardScaler, MinMaxScaler

s = StandardScaler()
x4_train = s.fit_transform(x_train)  # x4_*: z-score standardised data
# transform(), not fit_transform(): reuse the statistics learned from the
# training set instead of re-fitting the scaler on the test set.
x4_test = s.transform(x_test)
m = MinMaxScaler()
x5_train = m.fit_transform(x_train)  # x5_*: min-max normalised data
x5_test = m.transform(x_test)
保存算法模型
# sklearn.externals.joblib is gone from current scikit-learn releases; prefer
# the standalone joblib package and fall back to the old location.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Save the model. compress=9 requests the strongest compression (smallest
# file); the former cache_size=9 argument does not control compression.
joblib.dump(knn, './model', compress=9)
# Load the model back from disk.
model = joblib.load('./model')
# NOTE(review): read top to bottom, `knn` was last fitted on x3_train, but the
# reloaded model is evaluated on x2_test - confirm which feature set is meant.
x = model.predict(x2_test)
# Score the reloaded model's own predictions (previously the stale `y_` from
# an earlier cell was scored, so the loaded model was never evaluated).
acc = (x == y_test).mean()
print("预测生物富集因子准确率:{};预测生物富集等级准确率:{}".format(acc.iloc[1], acc.iloc[0]))
预测生物富集因子准确率:0.020512820512820513;预测生物富集等级准确率:0.41025641025641024