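# Classifies tumor samples as benign or malignant using a LogisticRegression
# classifier and a stochastic gradient descent (SGD) classifier.
# The code comes from the book "Python机器学习及实践" (Python Machine Learning and Practice).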
# pandas is used to load and clean the data
import pandas as pd
import numpy as np
# splits the data into a training set and a test set
# (train_test_split lives in sklearn.model_selection, not the sklearn top level)
from sklearn.model_selection import train_test_split
# used to standardize the features
from sklearn.preprocessing import StandardScaler
# a linear classification technique
from sklearn.linear_model import LogisticRegression
# a linear classification technique
from sklearn.linear_model import SGDClassifier
# used to report per-class precision / recall / f1
from sklearn.metrics import classification_report
# Column names for the CSV: the sample id, nine feature columns, and the
# class label, in file order.
colum_names = [
    'Sample code number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]

# Read the data from the CSV file. Because `names` is given without `header`,
# pandas treats every line of the file (including the first) as data.
data = pd.read_csv('data.csv', names=colum_names)

# Replace the '?' placeholder with NaN so those cells count as missing ...
data = data.replace(to_replace='?', value=np.nan)
# ... then drop every row that has at least one missing value.
data = data.dropna(how='any')

# Features are colum_names[1:10] (indices 1-9, half-open slice); the label is
# colum_names[10] ('Class'). 25% of the rows are held out as the test set, and
# the random seed is fixed at 33 so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data[colum_names[1:10]],
    data[colum_names[10]],
    test_size=0.25,
    random_state=33,
)

# Standardize the features. The scaler is fitted on the training set only and
# the same fitted statistics are then applied to the test set; re-fitting on
# the test set would scale the two sets inconsistently and leak test-set
# statistics into the evaluation.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

lr = LogisticRegression()
sgdc = SGDClassifier()

# Train each classifier on the training set, then predict the test set.
lr.fit(X_train, y_train)
lr_y_predict = lr.predict(X_test)

sgdc.fit(X_train, y_train)
sgdc_y_predict = sgdc.predict(X_test)

# NOTE(review): target_names are matched to the class labels in sorted order;
# this assumes the smaller label value means benign -- confirm against the data.

# Accuracy of the logistic-regression model on the test set, followed by the
# per-class comparison of its predictions against y_test.
print('Accuracy of LR Classifier:', lr.score(X_test, y_test))
print(classification_report(y_test, lr_y_predict, target_names=['Benign', 'Malignant']))

# Same two reports for the SGD model.
# NOTE: the 'Accuarcy' spelling is kept as-is so the program output is unchanged.
print('Accuarcy of SGD Classifier:', sgdc.score(X_test, y_test))
print(classification_report(y_test, sgdc_y_predict, target_names=['Benign', 'Malignant']))