from sklearn.metrics import confusion_matrix
import seaborn as sns
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from glob import glob
# Dataset root directory
base_dir = "D:/dataset"

# Build a DataFrame of image paths and labels, inferred from the 0/1 subdirectories
def create_dataframe(dataset_path):
    data = []
    # Negative patches live under <patient_id>/0/, positive patches under <patient_id>/1/
    for img_file in glob(dataset_path + r'/*/0/*.png'):
        data.append([img_file, 0])
    for img_file in glob(dataset_path + r'/*/1/*.png'):
        data.append([img_file, 1])
    return pd.DataFrame(data, columns=['path', 'label'])
# Build the dataset DataFrame
df = create_dataframe(base_dir)
print("Total samples:", len(df))
print(df['label'].value_counts())
print(df.head())

# Check the class distribution
print(f"Negative samples (0): {len(df[df['label'] == 0])}")
print(f"Positive samples (1): {len(df[df['label'] == 1])}")
# Split into training and test sets (80% train, 20% test)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])
# Custom data generator - loads images directly from file paths
class CustomDataGenerator(tf.keras.utils.Sequence):
    def __init__(self, df, batch_size=32, img_size=(50, 50), shuffle=True, augment=False):
        super().__init__()
        self.df = df
        self.batch_size = batch_size
        self.img_size = img_size
        self.shuffle = shuffle
        self.augment = augment
        self.on_epoch_end()
        # Augmentation pipeline (only built when augment=True)
        self.augmenter = ImageDataGenerator(
            rotation_range=20,
            width_shift_range=0.2,
            height_shift_range=0.2,
            shear_range=0.2,
            zoom_range=0.2,
            horizontal_flip=True,
            fill_mode='nearest'
        ) if augment else None

    def __len__(self):
        return int(np.ceil(len(self.df) / self.batch_size))

    def __getitem__(self, index):
        batch_paths = self.paths[index * self.batch_size:(index + 1) * self.batch_size]
        batch_labels = self.labels[index * self.batch_size:(index + 1) * self.batch_size]
        batch_images = []
        for path in batch_paths:
            img = load_img(path, target_size=self.img_size)
            img_array = img_to_array(img) / 255.0  # scale pixels to [0, 1]
            if self.augment and self.augmenter:
                # Apply a random augmentation to the training image
                img_array = self.augmenter.random_transform(img_array)
            batch_images.append(img_array)
        return np.array(batch_images), np.array(batch_labels)

    def on_epoch_end(self):
        # Reshuffle the path/label arrays at the end of every epoch
        self.paths = self.df['path'].values
        self.labels = self.df['label'].values
        if self.shuffle:
            indices = np.arange(len(self.paths))
            np.random.shuffle(indices)
            self.paths = self.paths[indices]
            self.labels = self.labels[indices]
# Image size (matches the 50x50 patches in the Kaggle dataset)
img_width, img_height = 50, 50
batch_size = 32

# Create the data generators
train_generator = CustomDataGenerator(
    train_df,
    batch_size=batch_size,
    img_size=(img_width, img_height),
    augment=True  # augment the training set only
)
test_generator = CustomDataGenerator(
    test_df,
    batch_size=batch_size,
    img_size=(img_width, img_height),
    shuffle=False  # keep test order fixed for evaluation
)
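# Optional sanity check: pull one batch from the custom generator and confirm the
# shapes match what the CNN below expects, i.e. (batch_size, 50, 50, 3) images
# and a matching vector of 0/1 labels.
sample_images, sample_labels = train_generator[0]
print("Sample batch:", sample_images.shape, sample_labels.shape)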
# Build the CNN model
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(img_width, img_height, 3)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])
# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy',
                       tf.keras.metrics.Precision(name='precision'),
                       tf.keras.metrics.Recall(name='recall'),
                       tf.keras.metrics.AUC(name='auc')])
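# Optionally print the layer-by-layer architecture and parameter counts before training.
model.summary()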
# Early stopping callback
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
# Train the model
history = model.fit(
    train_generator,
    epochs=30,
    validation_data=test_generator,
    callbacks=[early_stop]
)
# Evaluate on the test set
test_results = model.evaluate(test_generator)
print(
    f"Test accuracy: {test_results[1]:.4f}, precision: {test_results[2]:.4f}, "
    f"recall: {test_results[3]:.4f}, AUC: {test_results[4]:.4f}")
# Save the model
model.save('breast_cancer_cnn.h5')
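# Minimal inference sketch: reload the saved model and score a single patch the
# same way the generator does (resize to 50x50, scale to [0, 1], add a batch
# dimension). The patch path used here is just the first test-set image as an example.
reloaded_model = tf.keras.models.load_model('breast_cancer_cnn.h5')
sample_path = test_df['path'].iloc[0]
sample_img = img_to_array(load_img(sample_path, target_size=(img_width, img_height))) / 255.0
sample_prob = reloaded_model.predict(np.expand_dims(sample_img, axis=0))[0][0]
print(f"IDC(+) probability for {sample_path}: {sample_prob:.4f}")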
# Plot the training history
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Training accuracy')
plt.plot(history.history['val_accuracy'], label='Validation accuracy')
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Training loss')
plt.plot(history.history['val_loss'], label='Validation loss')
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend()
plt.savefig('training_history.png')
plt.show()
# Collect the true test-set labels batch by batch
test_labels = []
for i in range(len(test_generator)):
    _, labels = test_generator[i]
    test_labels.extend(labels)
test_labels = np.array(test_labels)
# Model predictions (probabilities)
pred_probs = model.predict(test_generator)
# Convert to binary labels (threshold 0.5)
pred_labels = (pred_probs > 0.5).astype(int).flatten()
# Compute the confusion matrix
cm = confusion_matrix(test_labels, pred_labels)
# Plot and save the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['IDC(-)', 'IDC(+)'],
            yticklabels=['IDC(-)', 'IDC(+)'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.savefig('confusion_matrix.png')
plt.show()
# === ROC curve plotting ===
from sklearn.metrics import roc_curve, auc
# Compute the ROC curve points
fpr, tpr, thresholds = roc_curve(test_labels, pred_probs)
roc_auc = auc(fpr, tpr)
# Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2,
         label=f'ROC curve (AUC = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.grid(True, alpha=0.3)
# Save the ROC curve figure
plt.savefig('roc_curve.png')
plt.show()
# === Additional evaluation metrics ===
from sklearn.metrics import classification_report
# Generate the classification report
report = classification_report(test_labels, pred_labels,
                               target_names=['IDC(-)', 'IDC(+)'])
print("Classification report:\n", report)
# Reported test-set results for this run:
# Testing dataset accuracy: 87.11%
# Precision: 78.79%
# Recall: 74.70%
# AUC: 92.24%