加载数据,并给数据中的每一列设置列名称
# Column names to attach to the data set; they are assigned manually below
# instead of being read from the file.
columns = [
    'Sample code number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]

# header=None: do not treat the first row as a header, because the column
# names are set by hand right after loading.
data = pd.read_csv('./breast-cancer-wisconsin.data', header=None)
data.columns = columns
把数据中的问号('?')替换成 NaN,然后删除含有缺失值的行
# Turn every '?' placeholder into NaN so pandas can recognise it as missing.
data.replace(to_replace='?', value=np.nan, inplace=True)

# Optional check of the missing-value count per column: data.isnull().sum()

# Drop every row that contains at least one missing value.
data.dropna(axis='index', how='any', inplace=True)
筛选特征值和目标值
# Feature selection: discard the first column (the sample code number).
data = data.iloc[:, 1:]

# Everything except the last column is a feature; the last column is the target.
feature, target = data.iloc[:, :-1].values, data.iloc[:, -1].values
异常值处理---没有异常值
拆分数据集和特征值标准化
# Split the data set; four arrays are returned. 30% of the samples are held
# out for testing and random_state pins the shuffle so runs are repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    feature, target, test_size=0.3, random_state=1
)

# Standardise the feature values only; the target values are not standardised.
stand = StandardScaler()
# Learn the per-feature mean / standard deviation from the training split.
x_train = stand.fit_transform(x_train)
# Apply the statistics learned on the training split to the test split.
# Calling fit_transform here would re-fit the scaler on the test data, so the
# two splits would be scaled differently and the model would be evaluated on
# inputs that do not match what it was trained on.
x_test = stand.transform(x_test)
利用逻辑回归对数据进行分类
# Classify with logistic regression (compare with the SGD-based approach):
# build the estimator and train it on the training split in one step.
lr = LogisticRegression().fit(x_train, y_train)

# Predicted class labels for the test split.
y_predict = lr.predict(x_test)

# Accuracy on the test split.
score = lr.score(x_test, y_test)

# Learned weights and bias.
weight, bias = lr.coef_, lr.intercept_

# To inspect the results, print weight, bias, score and y_predict.
计算召回率
# Compute recall: the higher the recall the better.
# The higher the F1-score the better; a high F1-score means a more robust model.
# labels=[2, 4] together with target_names replaces the row names 2 and 4 in
# the report with the two names given below (benign and malignant).
res_report = classification_report(
    y_true=y_test,
    y_pred=y_predict,
    labels=[2, 4],
    target_names=['良性', '恶性'],
)
# To view the report, print res_report.
计算auc指标
AUC 指标主要针对样本不均衡的情况;这里样本基本均衡,可以不使用 AUC,下面的代码只作为参考示例
# Note: if the samples are imbalanced, rebalance them first.

# roc_auc_score needs binary ground truth, so map the labels 2 / 4 to 0 / 1.
y_test = np.where(y_test > 3, 1, 0)

# Score every test sample with the predicted probability of the positive
# class. lr.classes_ is sorted, so column 1 belongs to the larger label (4).
# Passing the thresholded predictions (y_predict) instead would reduce the
# ROC curve to a single operating point and misreport the AUC.
y_score = lr.predict_proba(x_test)[:, 1]

# AUC is mainly intended for imbalanced data; here it is computed for
# reference only.
auc = roc_auc_score(y_test, y_score)
print(auc)
完整代码如下
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report  # precision / recall / F1 report
from sklearn.metrics import roc_auc_score  # AUC metric
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# ---- Load the data ---------------------------------------------------------
# header=None: do not treat the first row as a header, because the column
# names are assigned by hand below.
data = pd.read_csv('./breast-cancer-wisconsin.data', header=None)

# Attach the column names.
columns = [
    'Sample code number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]
data.columns = columns

# ---- Missing values --------------------------------------------------------
# Turn every '?' placeholder into NaN so pandas can recognise it as missing,
# then drop every row that contains at least one missing value.
data.replace('?', np.nan, inplace=True)
data.dropna(axis=0, how='any', inplace=True)

# ---- Features and target ---------------------------------------------------
# Feature selection: discard the first column (the sample code number).
data = data.iloc[:, 1:]
# Everything except the last column is a feature; the last column is the target.
feature = data.iloc[:, :-1].values
target = data.iloc[:, -1].values

# Outlier handling: there are no outliers to handle.

# ---- Split and standardise -------------------------------------------------
# Four arrays are returned; 30% of the samples are held out for testing and
# random_state pins the shuffle so runs are repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    feature, target, test_size=0.3, random_state=1
)

# Standardise the feature values only; the target values are not standardised.
stand = StandardScaler()
x_train = stand.fit_transform(x_train)
# Apply the statistics learned on the training split to the test split.
# Re-fitting on the test data (fit_transform) would scale the two splits
# differently, so the model would be evaluated on mismatched inputs.
x_test = stand.transform(x_test)

# ---- Logistic regression ---------------------------------------------------
# Classify with logistic regression (compare with the SGD-based approach).
lr = LogisticRegression()
lr.fit(x_train, y_train)
# Predicted class labels for the test split.
y_predict = lr.predict(x_test)
# Accuracy on the test split.
score = lr.score(x_test, y_test)
# Learned weights and bias.
weight = lr.coef_
bias = lr.intercept_

# ---- Recall / F1 report ----------------------------------------------------
# The higher the recall the better; the higher the F1-score the more robust
# the model. labels=[2, 4] together with target_names replaces the row names
# 2 and 4 in the report with the two names given (benign and malignant).
res_report = classification_report(
    y_test, y_predict, labels=[2, 4], target_names=['良性', '恶性']
)

# ---- AUC -------------------------------------------------------------------
# roc_auc_score needs binary ground truth, so map the labels 2 / 4 to 0 / 1.
y_test = np.where(y_test > 3, 1, 0)
# Score every test sample with the predicted probability of the positive
# class. lr.classes_ is sorted, so column 1 belongs to the larger label (4).
# Passing the thresholded predictions (y_predict) instead would reduce the
# ROC curve to a single operating point and misreport the AUC.
y_score = lr.predict_proba(x_test)[:, 1]
# AUC is mainly intended for imbalanced data; here it is computed for
# reference only.
auc = roc_auc_score(y_test, y_score)
print(auc)

# If the samples are imbalanced, rebalance them first.