import pandas as pd
import numpy as np
# Location of the UCI "breast-cancer-wisconsin" data file (fetched over HTTP).
path = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"

# Column labels are supplied explicitly through `names=` below.
# The first column is a sample identifier and the last one is the class label.
column_name = [
    'Sample code number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]

# Missing measurements are written as "?" in the raw file. Declaring that
# marker to the parser converts it to NaN at read time, so a column that
# contains it is parsed as numeric instead of being left as a column of
# strings (which is what replacing "?" after parsing would produce).
data = pd.read_csv(path, names=column_name, na_values="?")

# Discard every sample that has at least one missing value.
data = data.dropna()

# Sanity check: per-column flag telling whether any NaN is left. As a bare
# expression its value is only displayed when run interactively.
data.isnull().any()
Sample code number False
Clump Thickness False
Uniformity of Cell Size False
Uniformity of Cell Shape False
Marginal Adhesion False
Single Epithelial Cell Size False
Bare Nuclei False
Bland Chromatin False
Normal Nucleoli False
Mitoses False
Class False
dtype: bool
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Features are every column between the leading sample identifier and the
# trailing label; the target is the "Class" column.
x = data[column_name[1:-1]]
y = data.loc[:, "Class"]

# Split with the library's default proportions. No seed is fixed, so the
# partition (and every number derived from it) varies from run to run.
x_train, x_test, y_train, y_test = train_test_split(x, y)

# Standardise the features. The scaler is fitted on the training part only
# and then applied unchanged to both parts.
transfer = StandardScaler()
transfer.fit(x_train)
x_train = transfer.transform(x_train)
x_test = transfer.transform(x_test)

# Fit a logistic-regression classifier with default hyper-parameters.
estimator = LogisticRegression().fit(x_train, y_train)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=100,
multi_class='auto', n_jobs=None, penalty='l2',
random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
warm_start=False)
# Inspect the fitted model's coefficient array. This is a bare expression,
# so the value is only displayed when run interactively.
estimator. coef_
array([[ 1.34866956, 0.39407465, 0.95414338, 0.52087229, -0.30587845,
1.18209596, 0.94562307, 0.95800325, 0.58231319]])
# Inspect the fitted model's intercept term. This is a bare expression,
# so the value is only displayed when run interactively.
estimator. intercept_
array([-1.14018688])
# Evaluate on the held-out samples: predicted labels plus the value returned
# by the estimator's own `score` method.
y_predict = estimator.predict(x_test)
score = estimator.score(x_test, y_test)

# Report, in order: the raw predictions, an element-wise comparison of the
# true labels with the predictions, and the score (labelled "accuracy").
print("y_predict\n", y_predict)
print("直接比对真实值和预测值:\n", y_test == y_predict)
print("准确率:\n", score)
y_predict
[2 2 4 2 2 2 4 4 4 2 4 4 2 4 4 4 2 2 2 4 2 2 2 4 2 2 2 2 2 2 2 4 2 2 2 2 4
2 4 4 2 2 2 2 2 4 2 2 2 4 2 2 2 4 2 2 2 4 4 2 2 4 2 4 4 2 2 4 2 4 4 2 4 2
4 4 4 4 4 2 4 4 4 2 2 2 2 4 2 2 2 2 2 4 2 4 2 2 2 2 2 4 4 2 2 4 2 2 2 4 2
4 4 4 2 2 4 2 4 2 4 4 4 4 4 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 4 4 4 2 2 2
4 2 4 4 4 2 4 4 2 4 2 4 4 2 2 4 2 2 2 2 2 2 2]
直接比对真实值和预测值:
93 True
165 True
670 True
380 True
385 True
...
291 True
370 True
332 True
632 True
492 True
Name: Class, Length: 171, dtype: bool
准确率:
0.9707602339181286
from sklearn.metrics import classification_report

# (class code, display name) pairs in the order the report rows should
# appear; the names are Chinese for "benign" and "malignant".
report_classes = [(2, '良性'), (4, '恶性')]

# Per-class precision / recall / F1 on the held-out samples.
report = classification_report(
    y_test,
    y_predict,
    labels=[code for code, _ in report_classes],
    target_names=[name for _, name in report_classes],
)
print(report)
precision recall f1-score support
良性 0.96 0.99 0.98 102
恶性 0.98 0.94 0.96 69
accuracy 0.97 171
macro avg 0.97 0.97 0.97 171
weighted avg 0.97 0.97 0.97 171