预测癌症状况
# Load the dataset. '?' is the file's placeholder for a missing value, so it is
# declared as an NA marker at parse time: a column whose only non-numeric
# entries are '?' is then read as numbers instead of being left as strings.
# Every row that contains a missing value is dropped.
data = pd.read_csv('./data.csv', na_values='?').dropna()
对数据进行划分
# Features are every column except 'Class'; the target is the 'Class' column.
# The target is taken as a 1-D Series: selecting it with a boolean column mask
# would yield a single-column DataFrame, which scikit-learn estimators accept
# only with a column-vector conversion warning.
x = data.drop(columns='Class')
y = data['Class']
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=1)
绘制数据的分布图像
# Count the samples in each class, ordered by class label.
# Series.value_counts is used because the top-level pd.value_counts helper
# is deprecated in pandas.
class_counts = data['Class'].value_counts().sort_index()
class_counts.plot(kind='bar')
# The chart shows tumor classes, so the title no longer says "Fraud".
plt.title('Tumor class histogram')
plt.xlabel('class')
plt.ylabel('frequency')
plt.show()

# Share of each class among all rows (label 2 = benign, label 4 = malignant).
total_rows = len(data)
benign_rows = int((data['Class'] == 2).sum())
malignant_rows = int((data['Class'] == 4).sum())
print("良性的比例为:", benign_rows / total_rows)  # about 0.66 on the original data
print("恶性的比例为:", malignant_rows / total_rows)  # about 0.34 on the original data
良性与恶性样本的数量比约为 2:1,不平衡程度不高,可以直接对数据建模。下面先用过采样(SMOTE)扩充训练集中的少数类,再对数据进行建模
# Oversample with SMOTE on the training split only; the test split is left
# untouched. fit_resample is the current method name: the older fit_sample
# alias was deprecated and later removed from imbalanced-learn.
os_x_train, os_y_train = SMOTE(random_state=1).fit_resample(x_train, y_train)

# Standardize the features. The scaler is fitted on the resampled training
# data and the same fitted scaler is then applied to the test data.
std = StandardScaler()
os_x_train = std.fit_transform(os_x_train)
x_test = std.transform(x_test)

# Fit a logistic-regression classifier. np.ravel flattens the target to 1-D,
# so this works whether the target arrives as a Series, an array, or a
# single-column DataFrame.
lr = LogisticRegression()
lr.fit(os_x_train, np.ravel(os_y_train))

# Predict the test labels (computed once; the original repeated this call).
y_pred = lr.predict(x_test)

# lr.score returns the mean accuracy on the given test data and labels.
print("模型分类的精确度为:{}".format(lr.score(x_test, y_test)))
# Per-class precision, recall and F1 for labels 2 and 4.
print(classification_report(y_test, y_pred, labels=[2,4], target_names=['良性',"恶性"]))
定义绘制混淆矩阵的函数,用于可视化分类结果
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    The function only draws; it does not create a figure and does not call
    ``plt.show()``.

    Args:
        cm: 2-D array-like of cell values (a NumPy array or nested lists).
        classes: Sequence of tick labels, used for both the x and y axes.
        title: Title placed above the plot.
        cmap: Matplotlib colormap used for the heat map.
    """
    # Accept nested lists as well as arrays: the code below relies on
    # ndarray features (.max(), .shape and cm[i, j] indexing).
    cm = np.asarray(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells above half of the largest value get white text, the rest black.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # matplotlib's text() takes (x, y), i.e. (column, row).
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so the axis labels are taken into account.
    plt.tight_layout()
调用函数并绘制矩阵
# Build the confusion matrix from the test labels and the predictions, then
# plot it. Passing labels=class_names fixes the row/column order to the same
# list that is used for the tick labels, so the matrix and its ticks stay
# aligned even if one class is missing from the test labels or predictions.
class_names = [2, 4]
cnf_matrix = confusion_matrix(y_test, y_pred, labels=class_names)
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()