import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('whitegrid')
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
col_names = list(cancer.feature_names)
col_names.append('target')
df = pd.DataFrame(np.c_[cancer.data, cancer.target], columns=col_names)
df.head()
mean radius | mean texture | mean perimeter | mean area | mean smoothness | mean compactness | mean concavity | mean concave points | mean symmetry | mean fractal dimension | ... | worst texture | worst perimeter | worst area | worst smoothness | worst compactness | worst concavity | worst concave points | worst symmetry | worst fractal dimension | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.3001 | 0.14710 | 0.2419 | 0.07871 | ... | 17.33 | 184.60 | 2019.0 | 0.1622 | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.11890 | 0.0 |
1 | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.0869 | 0.07017 | 0.1812 | 0.05667 | ... | 23.41 | 158.80 | 1956.0 | 0.1238 | 0.1866 | 0.2416 | 0.1860 | 0.2750 | 0.08902 | 0.0 |
2 | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.1974 | 0.12790 | 0.2069 | 0.05999 | ... | 25.53 | 152.50 | 1709.0 | 0.1444 | 0.4245 | 0.4504 | 0.2430 | 0.3613 | 0.08758 | 0.0 |
3 | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.2414 | 0.10520 | 0.2597 | 0.09744 | ... | 26.50 | 98.87 | 567.7 | 0.2098 | 0.8663 | 0.6869 | 0.2575 | 0.6638 | 0.17300 | 0.0 |
4 | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.1980 | 0.10430 | 0.1809 | 0.05883 | ... | 16.67 | 152.20 | 1575.0 | 0.1374 | 0.2050 | 0.4000 | 0.1625 | 0.2364 | 0.07678 | 0.0 |
5 rows × 31 columns
df.describe()
mean radius | mean texture | mean perimeter | mean area | mean smoothness | mean compactness | mean concavity | mean concave points | mean symmetry | mean fractal dimension | ... | worst texture | worst perimeter | worst area | worst smoothness | worst compactness | worst concavity | worst concave points | worst symmetry | worst fractal dimension | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | ... | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 |
mean | 14.127292 | 19.289649 | 91.969033 | 654.889104 | 0.096360 | 0.104341 | 0.088799 | 0.048919 | 0.181162 | 0.062798 | ... | 25.677223 | 107.261213 | 880.583128 | 0.132369 | 0.254265 | 0.272188 | 0.114606 | 0.290076 | 0.083946 | 0.627417 |
std | 3.524049 | 4.301036 | 24.298981 | 351.914129 | 0.014064 | 0.052813 | 0.079720 | 0.038803 | 0.027414 | 0.007060 | ... | 6.146258 | 33.602542 | 569.356993 | 0.022832 | 0.157336 | 0.208624 | 0.065732 | 0.061867 | 0.018061 | 0.483918 |
min | 6.981000 | 9.710000 | 43.790000 | 143.500000 | 0.052630 | 0.019380 | 0.000000 | 0.000000 | 0.106000 | 0.049960 | ... | 12.020000 | 50.410000 | 185.200000 | 0.071170 | 0.027290 | 0.000000 | 0.000000 | 0.156500 | 0.055040 | 0.000000 |
25% | 11.700000 | 16.170000 | 75.170000 | 420.300000 | 0.086370 | 0.064920 | 0.029560 | 0.020310 | 0.161900 | 0.057700 | ... | 21.080000 | 84.110000 | 515.300000 | 0.116600 | 0.147200 | 0.114500 | 0.064930 | 0.250400 | 0.071460 | 0.000000 |
50% | 13.370000 | 18.840000 | 86.240000 | 551.100000 | 0.095870 | 0.092630 | 0.061540 | 0.033500 | 0.179200 | 0.061540 | ... | 25.410000 | 97.660000 | 686.500000 | 0.131300 | 0.211900 | 0.226700 | 0.099930 | 0.282200 | 0.080040 | 1.000000 |
75% | 15.780000 | 21.800000 | 104.100000 | 782.700000 | 0.105300 | 0.130400 | 0.130700 | 0.074000 | 0.195700 | 0.066120 | ... | 29.720000 | 125.400000 | 1084.000000 | 0.146000 | 0.339100 | 0.382900 | 0.161400 | 0.317900 | 0.092080 | 1.000000 |
max | 28.110000 | 39.280000 | 188.500000 | 2501.000000 | 0.163400 | 0.345400 | 0.426800 | 0.201200 | 0.304000 | 0.097440 | ... | 49.540000 | 251.200000 | 4254.000000 | 0.222600 | 1.058000 | 1.252000 | 0.291000 | 0.663800 | 0.207500 | 1.000000 |
8 rows × 31 columns
特征选择
df.columns
Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
'mean smoothness', 'mean compactness', 'mean concavity',
'mean concave points', 'mean symmetry', 'mean fractal dimension',
'radius error', 'texture error', 'perimeter error', 'area error',
'smoothness error', 'compactness error', 'concavity error',
'concave points error', 'symmetry error', 'fractal dimension error',
'worst radius', 'worst texture', 'worst perimeter', 'worst area',
'worst smoothness', 'worst compactness', 'worst concavity',
'worst concave points', 'worst symmetry', 'worst fractal dimension',
'target'],
dtype='object')
sns.countplot(x = 'target', label = "Count",data=df)
<matplotlib.axes._subplots.AxesSubplot at 0x7f8baae71f60>
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-UAFlq8RK-1678272580557)(svm_files/svm_5_1.png)]
plt.figure(figsize=(10, 8))
sns.scatterplot(x = 'mean area', y = 'mean smoothness', hue = 'target', data = df)
<matplotlib.axes._subplots.AxesSubplot at 0x7f8b882cbe10>
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-pGFIfh4O-1678272580558)(svm_files/svm_6_1.png)]
# 皮尔森系数
plt.figure(figsize=(20,10))
sns.heatmap(df.corr(), annot=True)
<matplotlib.axes._subplots.AxesSubplot at 0x7f8b882b2128>
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-hfY2Ize2-1678272580558)(svm_files/svm_7_1.png)]
2. 2. 模型训练
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
X = df.drop('target', axis=1)
y = df.target
print(f"'X' shape: {X.shape}")
print(f"'y' shape: {y.shape}")
pipeline = Pipeline([
('min_max_scaler', MinMaxScaler()),
('std_scaler', StandardScaler())
])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
'X' shape: (569, 30)
'y' shape: (569,)
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
def print_score(clf, X_train, y_train, X_test, y_test, train=True):
if train: # 训练集
pred = clf.predict(X_train)
clf_report = pd.DataFrame(classification_report(y_train, pred, output_dict=True))
print("Train Result:\n================================================")
print(f"Accuracy Score: {accuracy_score(y_train, pred) * 100:.2f}%")
print("_______________________________________________")
print(f"CLASSIFICATION REPORT:\n{clf_report}")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_train, pred)}\n")
elif train==False: # 测试集
pred = clf.predict(X_test)
clf_report = pd.DataFrame(classification_report(y_test, pred, output_dict=True))
print("Test Result:\n================================================")
print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print(f"CLASSIFICATION REPORT:\n{clf_report}")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")
多项式核
C:C-SVC的惩罚参数C?默认值是1.0
C越大,相当于惩罚松弛变量,希望松弛变量接近0,即对误分类的惩罚增大,趋向于对训练集全分对的情况,这样对训练集测试时准确率很高,但泛化能力弱。C值小,对误分类的惩罚减小,允许容错,将他们当成噪声点,泛化能力较强。
kernel:核函数,默认是rbf,可以是‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’
0 – 线性:u’v
1 – 多项式:(gamma*u’v + coef0)^degree
2 – RBF函数:exp(-gamma|u-v|^2)
3 –sigmoid:tanh(gammau’*v + coef0)
degree :多项式poly函数的维度,默认是3,选择其他核函数时会被忽略。
gamma : ‘rbf’,‘poly’ 和‘sigmoid’的核函数参数。默认是’auto’,则会选择1/n_features
coef0 :核函数的常数项。对于‘poly’和 ‘sigmoid’有用。
from sklearn.svm import SVC
linear_model = SVC(kernel='linear')
linear_model.fit(X_train, y_train)
print_score(linear_model, X_train, y_train, X_test, y_test, train=True)
print_score(linear_model, X_train, y_train, X_test, y_test, train=False)
/Users/gaoguli/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py:1692: FutureWarning: Feature names only support names that are all strings. Got feature names with dtypes: ['str_']. An error will be raised in 1.2.
FutureWarning,
Train Result:
================================================
Accuracy Score: 96.92%
_______________________________________________
CLASSIFICATION REPORT:
0.0 1.0 accuracy macro avg weighted avg
precision 0.975460 0.965753 0.969231 0.970607 0.969359
recall 0.940828 0.986014 0.969231 0.963421 0.969231
f1-score 0.957831 0.975779 0.969231 0.966805 0.969112
support 169.000000 286.000000 0.969231 455.000000 455.000000
_______________________________________________
Confusion Matrix:
[[159 10]
[ 4 282]]
Test Result:
================================================
Accuracy Score: 95.61%
_______________________________________________
CLASSIFICATION REPORT:
0.0 1.0 accuracy macro avg weighted avg
precision 0.975000 0.945946 0.95614 0.960473 0.956905
recall 0.906977 0.985915 0.95614 0.946446 0.956140
f1-score 0.939759 0.965517 0.95614 0.952638 0.955801
support 43.000000 71.000000 0.95614 114.000000 114.000000
_______________________________________________
Confusion Matrix:
[[39 4]
[ 1 70]]
/Users/gaoguli/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py:1692: FutureWarning: Feature names only support names that are all strings. Got feature names with dtypes: ['str_']. An error will be raised in 1.2.
FutureWarning,
/Users/gaoguli/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py:1692: FutureWarning: Feature names only support names that are all strings. Got feature names with dtypes: ['str_']. An error will be raised in 1.2.
FutureWarning,
from sklearn.svm import SVC
poly_model = SVC(kernel='poly', degree=2, gamma='auto', coef0=1, C=5)
poly_model.fit(X_train, y_train)
print_score(poly_model, X_train, y_train, X_test, y_test, train=True)
print_score(poly_model, X_train, y_train, X_test, y_test, train=False)
/Users/gaoguli/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py:1692: FutureWarning: Feature names only support names that are all strings. Got feature names with dtypes: ['str_']. An error will be raised in 1.2.
FutureWarning,
Train Result:
================================================
Accuracy Score: 97.14%
_______________________________________________
CLASSIFICATION REPORT:
0.0 1.0 accuracy macro avg weighted avg
precision 0.987500 0.962712 0.971429 0.975106 0.971919
recall 0.934911 0.993007 0.971429 0.963959 0.971429
f1-score 0.960486 0.977625 0.971429 0.969056 0.971259
support 169.000000 286.000000 0.971429 455.000000 455.000000
_______________________________________________
Confusion Matrix:
[[158 11]
[ 2 284]]
Test Result:
================================================
Accuracy Score: 94.74%
_______________________________________________
CLASSIFICATION REPORT:
0.0 1.0 accuracy macro avg weighted avg
precision 0.974359 0.933333 0.947368 0.953846 0.948808
recall 0.883721 0.985915 0.947368 0.934818 0.947368
f1-score 0.926829 0.958904 0.947368 0.942867 0.946806
support 43.000000 71.000000 0.947368 114.000000 114.000000
_______________________________________________
Confusion Matrix:
[[38 5]
[ 1 70]]
/Users/gaoguli/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py:1692: FutureWarning: Feature names only support names that are all strings. Got feature names with dtypes: ['str_']. An error will be raised in 1.2.
FutureWarning,
/Users/gaoguli/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py:1692: FutureWarning: Feature names only support names that are all strings. Got feature names with dtypes: ['str_']. An error will be raised in 1.2.
FutureWarning,
高斯核函数
rbf_model = SVC(kernel='rbf', gamma=0.1, C= 1)
rbf_model.fit(X_train, y_train)
print_score(rbf_model, X_train, y_train, X_test, y_test, train=True)
print_score(rbf_model, X_train, y_train, X_test, y_test, train=False)
Train Result:
================================================
Accuracy Score: 100.00%
_______________________________________________
CLASSIFICATION REPORT:
0.0 1.0 accuracy macro avg weighted avg
precision 1.0 1.0 1.0 1.0 1.0
recall 1.0 1.0 1.0 1.0 1.0
f1-score 1.0 1.0 1.0 1.0 1.0
support 169.0 286.0 1.0 455.0 455.0
_______________________________________________
Confusion Matrix:
[[169 0]
[ 0 286]]
Test Result:
================================================
Accuracy Score: 62.28%
_______________________________________________
CLASSIFICATION REPORT:
0.0 1.0 accuracy macro avg weighted avg
precision 0.0 0.622807 0.622807 0.311404 0.387889
recall 0.0 1.000000 0.622807 0.500000 0.622807
f1-score 0.0 0.767568 0.622807 0.383784 0.478046
support 43.0 71.000000 0.622807 114.000000 114.000000
_______________________________________________
Confusion Matrix:
[[ 0 43]
[ 0 71]]
/Users/gaoguli/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py:1692: FutureWarning: Feature names only support names that are all strings. Got feature names with dtypes: ['str_']. An error will be raised in 1.2.
FutureWarning,
/Users/gaoguli/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py:1692: FutureWarning: Feature names only support names that are all strings. Got feature names with dtypes: ['str_']. An error will be raised in 1.2.
FutureWarning,
/Users/gaoguli/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py:1692: FutureWarning: Feature names only support names that are all strings. Got feature names with dtypes: ['str_']. An error will be raised in 1.2.
FutureWarning,
/Users/gaoguli/anaconda3/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1318: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, msg_start, len(result))
/Users/gaoguli/anaconda3/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1318: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, msg_start, len(result))
/Users/gaoguli/anaconda3/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1318: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, msg_start, len(result))