import pandas as pd
import requests
from io import StringIO
import numpy as np
from sklearn.model_selection import train_test_split #分割数据
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
# Benign/malignant breast-cancer classification on the UCI "Breast Cancer
# Wisconsin (Original)" data set: download the data, drop incomplete rows,
# standardize the features, then train several classifiers and print their
# accuracy on a held-out test set.

# Download the raw data file.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as CSV below.
response = requests.get(url, timeout=30)
response.raise_for_status()
raw_text = response.text

# Column names: sample id, nine cytological features, then the class label.
columns = ["样本编号", "肿块厚度", "细胞大小均匀性", "细胞形状均匀性", "边缘黏性",
           "单上皮细胞大小", "裸核", "染色体", "正常核", "有丝分裂", "肿瘤性质"]
brdata = pd.read_csv(StringIO(raw_text), names=columns)
print(brdata.shape)

# Missing values are encoded as '?' in the file: turn them into NaN, drop every
# row containing one, then convert to numeric dtypes (a column that contained
# '?' is read as strings and would otherwise stay that way).
data = (
    brdata.replace('?', np.nan)
    .dropna(how='any')
    .apply(pd.to_numeric)
)
# Size of the cleaned data set.
print(data.shape)

# 80% training / 20% test split. The sample id (first column) is not a feature.
# NOTE: no random_state is given, so the split differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    data[columns[1:10]], data[columns[10]], test_size=0.2
)
# Sanity-check the sizes of both partitions.
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

# Standardize every feature to zero mean / unit variance. The scaler is fitted
# on the training set only and then re-used on the test set, so no test-set
# statistics leak into training.
scaler = StandardScaler()
X_train_scalered = scaler.fit_transform(X_train)
X_test_scalered = scaler.transform(X_test)

# Candidate classifiers.
lr = LogisticRegression(solver='liblinear')
sgdc = SGDClassifier(max_iter=1000)  # cap on the number of training epochs
knnc = KNeighborsClassifier(n_neighbors=6)
svc = SVC(kernel='linear', C=0.025)
gnb = GaussianNB()
dtc = DecisionTreeClassifier(max_depth=5)
rfc = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0)

# Fit each model on the standardized training data and report its accuracy on
# the standardized test data.
for _label, _model in (
    ("LogisticRegression", lr),
    ("SGDClassifier", sgdc),
    ("KNeighborsClassifier", knnc),
    ("SVC", svc),
    ("GaussianNB", gnb),
    ("DecisionTreeClassifier", dtc),
    ("RandomForestClassifier", rfc),
):
    _model.fit(X_train_scalered, y_train)
    print(f"{_label}模型准确度为:R model accuracy on test set: ",
          _model.score(X_test_scalered, y_test))

# Per-class precision/recall/F1 for the random forest.
rfc_y = rfc.predict(X_test_scalered)
# The data set encodes the class as 2 = benign, 4 = malignant.
# classification_report matches target_names to the labels in the given order,
# so the labels are listed explicitly and the names follow that order
# (the previous ['Malignant', 'Benign'] order mislabelled both rows).
print(classification_report(
    y_test,
    rfc_y,
    labels=[2, 4],
    target_names=['Benign', 'Malignant'],
))
# Benign/malignant breast cancer tumor prediction
# First published 2024-05-08 14:42:10