from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
import joblib
import time
4. 载入数据集
# Load the breast cancer dataset that ships with scikit-learn and pull out
# the feature array and the label array.
cancer = load_breast_cancer()
X = cancer.data
Y = cancer.target
# The two reports must be separate statements; fused on one line they are a
# SyntaxError.
print(f'The shape of cancer dataset is {X.shape}')
print(f'The shape of label is {Y.shape}')
5. 数据预处理
导入数据和标签
# Import the data and the labels.
# NOTE: this repeats the dataset loading already done in the "load dataset"
# step above; it is kept so that this section can be run on its own.
cancer = load_breast_cancer()
X = cancer.data
Y = cancer.target
# The two reports must be separate statements; fused on one line they are a
# SyntaxError.
print(f'The shape of cancer dataset is {X.shape}')
print(f'The shape of label is {Y.shape}')
运行结果为
6. 特征工程
由于此数据集的特征是构造好的,我们无需构造,所以此步骤省略
7. 建模
划分训练集和测试集,比例为 8:2
# Split into a training set and a test set at an 8:2 ratio (test_size=0.2).
# The call has to live on its own line: when it is glued onto the comment it
# never executes and X_train / X_test / y_train / y_test stay undefined.
# No random_state is passed, so the split differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
print(f'The shape of X_train is {X_train.shape}')
print(f'The shape of y_train is {y_train.shape}')
print(f'The shape of X_test is {X_test.shape}')
print(f'The shape of y_test is {y_test.shape}')
# Train the baseline model.
# NOTE(review): the script evaluates clf_svc but never defines it, which
# raises NameError. A default-configured SVC fitted on the training split is
# assumed here -- confirm the intended hyperparameters.
clf_svc = SVC()
clf_svc.fit(X_train, y_train)

# Model evaluation: accuracy on the held-out test split, followed by the
# per-class precision / recall / F1 report.
accuracy = clf_svc.score(X_test, y_test)
print(f'The accuracy of the SVC model is {accuracy*100}%')
y_pred = clf_svc.predict(X_test)
print(f'The report of the SVC model is \n {classification_report(y_test, y_pred)}')
# Hyperparameter tuning: grid-search over the kernel type and the C value,
# fitting on the training split only.
parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
svc = SVC()
clf = GridSearchCV(svc, parameters)
clf.fit(X_train, y_train)
print(f'The best parameters are {clf.best_params_}')

# Evaluate the tuned search object on the held-out test split.
accuracy = clf.score(X_test, y_test)
print(f'The accuracy of the SVC model is {accuracy*100}%')
y_pred = clf.predict(X_test)
print(f'The report of the SVC model is \n {classification_report(y_test, y_pred)}')

# Save the model: the whole fitted GridSearchCV object is written to disk.
joblib.dump(clf, "svc_breast_cancer.m")
# Simulate online deployment: reload the persisted model and score the test
# samples one at a time, printing the prediction next to the true label.
clf_online = joblib.load("svc_breast_cancer.m")
for i, e in enumerate(X_test):
    # predict() expects a 2-D batch, so the single sample is wrapped in a list.
    result_SVM = clf_online.predict([e])
    print(f"--------------The {i+1}th data result is: ---------------------- ")
    print(f"The Result of Model is {result_SVM[0]}")
    print(f"The True value is {y_test[i]} \n")
    # Pause one second between samples to mimic requests arriving over time.
    time.sleep(1)