# Li Mu's "Kaggle leaf classification" walkthrough (李沐 kaggle 树叶分类)

import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import os
import cv2
from collections import defaultdict
import matplotlib.pyplot as plt
import torchvision.models as models
from torch.optim.lr_scheduler import CosineAnnealingLR
# This is for the progress bar.
from tqdm import tqdm
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold, cross_val_score
import albumentations
from albumentations.pytorch.transforms import ToTensorV2
import timm
from sklearn.metrics import f1_score, accuracy_score

# Select GPU when available; models and batches are moved to this device later.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using device: {device}')

# Dataset root on disk; train.csv lists image paths + string labels,
# test.csv lists image paths only.
path = '/mnt/datab/home/yuanwenzheng/kaggle/树叶分类'
train_file_path = os.path.join(path, 'train.csv')
test_file_path = os.path.join(path, 'test.csv')

df = pd.read_csv(train_file_path)
sub_df = pd.read_csv(test_file_path)

# Encode string class labels as integers.
le = LabelEncoder()
# le.fit() populates le.classes_: a numpy array of the unique labels in
# sorted (alphabetical) order, each mapped to its integer index.
le.fit(df['label'])
df['label'] = le.transform(df['label'])
# class name -> integer id
classes_to_num = dict(zip(le.classes_, le.transform(le.classes_)))
# Inverse mapping, integer id -> class name (note the deliberately swapped
# loop variables: each (name, id) item is unpacked as v=name, k=id).
num_to_classes = {k : v for v, k in classes_to_num.items()}

# Data augmentation pipelines
def get_train_transforms():
    """Augmentation pipeline for training images.

    Resize to 240x240, random flips/rotations, brightness-contrast jitter,
    random shift/scale, then ImageNet-statistics normalization and conversion
    to a CHW torch tensor.
    """
    steps = [
        albumentations.Resize(240, 240),
        albumentations.HorizontalFlip(p=0.5),
        albumentations.VerticalFlip(p=0.5),
        albumentations.Rotate(limit=180, p=0.7),
        albumentations.RandomBrightnessContrast(),
        albumentations.ShiftScaleRotate(shift_limit=0.25, scale_limit=0.1, rotate_limit=0),
        albumentations.Normalize(
            [0.485, 0.456, 0.406],
            [0.229, 0.224, 0.225],
            max_pixel_value=255.0,
            always_apply=True,
        ),
        ToTensorV2(p=1.0),
    ]
    return albumentations.Compose(steps)

def get_valid_test_transforms():
    """Deterministic pipeline for validation/test images: resize,
    ImageNet normalization, tensor conversion — no random augmentation."""
    steps = [
        albumentations.Resize(240, 240),
        albumentations.Normalize(
            [0.485, 0.456, 0.406],
            [0.229, 0.224, 0.225],
            max_pixel_value=255.0,
            always_apply=True,
        ),
        ToTensorV2(p=1.0),
    ]
    return albumentations.Compose(steps)

# Dataset: loads image files and applies the augmentation pipeline
class LeafDataset(Dataset):
    """Map-style dataset yielding (image, label) pairs.

    Args:
        images_file_names: sequence of image file paths.
        labels: sequence of integer class labels aligned with the paths.
        transform: optional albumentations transform. When given, the image is
            converted to a numpy array and the transform's "image" output
            (a tensor after ToTensorV2) is returned; when None, the raw PIL
            image is returned.
    """

    def __init__(self, images_file_names, labels, transform=None):
        self.images_file_names = images_file_names
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.images_file_names)

    def __getitem__(self, idx):
        image_file_name = self.images_file_names[idx]
        # Bug fix: force 3-channel RGB. PIL loads files in their native mode,
        # so a grayscale/palette/RGBA image would break the 3-channel Normalize
        # in the transform (this was the intent of the removed cv2 BGR->RGB
        # conversion; .convert is a no-op for images already in RGB).
        image = Image.open(image_file_name).convert('RGB')
        label = self.labels[idx]
        if self.transform is not None:
            image = np.array(image)
            image = self.transform(image=image)["image"]

        return image, label

# Running-average metric tracking
class MetricMonitor:
    """Accumulates named metric values and tracks their running averages.

    str() renders every metric as "name: avg" (fixed precision), joined
    with " | " in insertion order.
    """

    def __init__(self, float_precision=3):
        self.float_precision = float_precision
        self.reset()

    def reset(self):
        """Discard all accumulated metrics."""
        self.metrics = defaultdict(lambda: {"val": 0, "count": 0, "avg": 0})

    def update(self, metric_name, val):
        """Record one observation of `metric_name` and refresh its average."""
        entry = self.metrics[metric_name]
        entry["val"] += val
        entry["count"] += 1
        entry["avg"] = entry["val"] / entry["count"]

    def __str__(self):
        rendered = [
            f"{name}: {entry['avg']:.{self.float_precision}f}"
            for name, entry in self.metrics.items()
        ]
        return " | ".join(rendered)

# Hyper-parameters shared by the train / validate / test pipelines.
params = {
    'model':'resnet50',
    'device': device,
    'lr': 1e-4,
    'batch_size': 64,
    'num_workers': 0,
    'epochs': 50,
    # nunique() is the number of distinct labels (176 classes for this
    # dataset); it sizes the classifier's output layer.
    'out_features': df['label'].nunique(),
    'weight_decay': 4e-5
}
# 8 to 0.824

class LeafNet(nn.Module):
    """torchvision backbone with the final FC layer replaced for leaf classes.

    Args:
        model_name: torchvision constructor name (e.g. 'resnet50'); must be a
            model exposing a `.fc` classifier head.
        out_features: number of output classes.
        pretrained: whether to load ImageNet-pretrained weights.
    """

    def __init__(self, model_name=params['model'], out_features=params['out_features'], pretrained=True):
        super().__init__()
        # Bug fix: the `pretrained` argument was previously ignored —
        # pretrained=True was hard-coded regardless of what callers passed.
        self.model = models.__dict__[model_name](pretrained=pretrained)
        n_features = self.model.fc.in_features
        # Swap the ImageNet classifier head for one sized to this task.
        self.model.fc = nn.Linear(n_features, out_features)

    def forward(self, x):
        return self.model(x)


class LabelSmoothingLoss(nn.Module):
    """Cross-entropy with label smoothing.

    The target distribution places `1 - smoothing` probability mass on the
    true class and spreads `smoothing` uniformly over the remaining
    `classes - 1` classes; the loss is the mean KL-style cross term against
    the log-softmax of the predictions.
    """

    def __init__(self, classes, smoothing=0.0, dim=-1):
        super().__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.classes = classes
        self.dim = dim

    def forward(self, pred, target):
        log_probs = pred.log_softmax(dim=self.dim)
        with torch.no_grad():
            # Build the smoothed one-hot target: uniform floor everywhere,
            # then the confidence mass scattered onto the true class.
            smoothed = torch.full_like(log_probs, self.smoothing / (self.classes - 1))
            smoothed.scatter_(1, target.data.unsqueeze(1), self.confidence)
        return torch.mean(torch.sum(-smoothed * log_probs, dim=self.dim))


def accuracy(output, target):
    """Fraction of samples whose arg-max class in `output` (logits, [N, C])
    equals `target` ([N] integer labels). Returns a Python float.

    The previous softmax-before-argmax was redundant — softmax is monotonic
    per row, so argmax over raw logits gives identical predictions — and the
    sklearn round-trip is replaced by an equivalent tensor mean.
    """
    y_pred = torch.argmax(output, dim=1).cpu()
    return (y_pred == target.cpu()).float().mean().item()


def calculate_f1_macro(output, target):
    """Macro-averaged F1 of the arg-max predictions of `output` (logits,
    [N, C]) against integer `target` ([N]); tensors are moved to CPU for
    sklearn.

    The previous softmax-before-argmax was redundant — softmax is monotonic
    per row, so argmax over raw logits gives identical predictions.
    """
    y_pred = torch.argmax(output, dim=1).cpu()
    return f1_score(target.cpu(), y_pred, average='macro')


def train(train_loader, model, smoothing_loss, optimizer, epoch, params):
    """Run one training epoch and return the epoch's average batch accuracy."""
    metric_monitor = MetricMonitor()
    # NOTE(review): a fresh CosineAnnealingLR is constructed on every call
    # (i.e. every epoch) and stepped once per *batch*, so the cosine schedule
    # restarts each epoch and completes its T_max=25 cycle over 25 batches,
    # not 25 epochs. Likely unintended — consider building the scheduler once,
    # outside the epoch loop.
    scheduler = CosineAnnealingLR(optimizer, T_max=25)
    model.train()
    # Number of batches in this epoch (currently unused).
    Batch = len(train_loader)
    stream = tqdm(train_loader)
    # start=1: batch indices are 1-based.
    for i, (images, labels) in enumerate(stream, start=1):
        # non_blocking=True lets the host-to-device copy overlap with compute
        # (effective when the DataLoader uses pinned memory).
        images = images.to(params['device'], non_blocking=True)
        labels = labels.to(params['device'], non_blocking=True)
        predict = model(images)
        loss = smoothing_loss(predict, labels)
        f1_macro = calculate_f1_macro(predict, labels)
        acc = accuracy(predict, labels)
        metric_monitor.update('Loss', loss.item())
        metric_monitor.update('F1', f1_macro)
        metric_monitor.update('Accuracy', acc)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()  # advance the LR schedule (per batch — see note above)
        stream.set_description(
            "Epoch: {epoch}. Train.      {metric_monitor}".format(
                epoch=epoch,
                metric_monitor=metric_monitor)
        )
    # Average of the per-batch "Accuracy" values accumulated this epoch.
    return metric_monitor.metrics['Accuracy']["avg"]


def validate(val_loader, model, smoothing_loss, epoch, params):
    """Evaluate `model` over one pass of `val_loader` with gradients disabled
    and return the average per-batch accuracy."""
    monitor = MetricMonitor()
    model.eval()
    progress = tqdm(val_loader)
    with torch.no_grad():
        for step, (images, labels) in enumerate(progress, start=1):
            images = images.to(params['device'], non_blocking=True)
            labels = labels.to(params['device'], non_blocking=True)
            logits = model(images)
            batch_loss = smoothing_loss(logits, labels)
            monitor.update('Loss', batch_loss.item())
            monitor.update('F1', calculate_f1_macro(logits, labels))
            monitor.update('Accuracy', accuracy(logits, labels))
            progress.set_description(
                "Epoch: {epoch}. Validation. {metric_monitor}".format(
                    epoch=epoch,
                    metric_monitor=monitor)
            )

    return monitor.metrics['Accuracy']["avg"]


def __train__():
    """Train with 5-fold stratified cross-validation, checkpointing whenever a
    fold's validation accuracy improves."""
    # StratifiedKFold keeps the class distribution similar across splits; each
    # validation split is 1/5 of the data, iterated 5 times. enumerate() gives
    # the fold index k plus the train/validation row indices.
    kf = StratifiedKFold(n_splits=5)
    for k, (train_index, valid_index) in enumerate(kf.split(df['image'], df['label'])):
        train_img, valid_img = df['image'][train_index], df['image'][valid_index]
        train_labels, valid_labels = df['label'][train_index], df['label'][valid_index]

        # df['image'] holds paths relative to the dataset root.
        train_image_paths = '/mnt/datab/home/yuanwenzheng/kaggle/树叶分类/' + train_img
        valid_image_paths = '/mnt/datab/home/yuanwenzheng/kaggle/树叶分类/' + valid_img
        train_dataset = LeafDataset(images_file_names=train_image_paths.values,
                                    labels=train_labels.values,
                                    transform=get_train_transforms())
        valid_dataset = LeafDataset(images_file_names=valid_image_paths.values,
                                    labels=valid_labels.values,
                                    transform=get_valid_test_transforms())

        train_loader = DataLoader(
            dataset=train_dataset,
            batch_size=params['batch_size'],
            shuffle=True,
            num_workers=params['num_workers'],
            # pin_memory keeps batches in page-locked host memory,
            # speeding host-to-GPU copies (pairs with non_blocking=True).
            pin_memory=True,
        )

        val_loader = DataLoader(
            dataset=valid_dataset,
            batch_size=params['batch_size'],
            shuffle=False,
            num_workers=params['num_workers'],
            # Same pinned-memory rationale as the train loader.
            pin_memory=True,
        )

        # Build a fresh model for each fold.
        model = LeafNet()
        # Enable only on a multi-GPU machine.
        # model = nn.DataParallel(model)
        model = model.to(params['device'])
        smoothing_loss = LabelSmoothingLoss(classes=params['out_features'], smoothing=0.1)
        optimizer = torch.optim.AdamW(model.parameters(), lr=params['lr'], weight_decay=params['weight_decay'])

        best_acc = 0.0
        for epoch in range(1, params['epochs'] + 1):
            train(train_loader, model, smoothing_loss, optimizer, epoch, params)
            valid_acc = validate(val_loader, model, smoothing_loss, epoch, params)
            if valid_acc > best_acc:
                best_acc = valid_acc
                # NOTE: "flod" in the checkpoint filename is a typo for "fold",
                # but __test__ loads files with the same spelling — keep in sync.
                torch.save(model.state_dict(),
                           f"/mnt/datab/home/yuanwenzheng/kaggle/树叶分类/checkpoints/{params['model']}_{k}flod_{epoch}epochs_accuracy{valid_acc:.5f}_weights.pth")
                print('saving model with acc {:.3f}'.format(best_acc))


# Inference / submission generation
def __test__():
    """Run ensemble inference over the test set and write submission.csv.

    Averages the logits of the saved per-fold checkpoints, maps each arg-max
    index back to its class name, and writes the predictions into sub_df.
    """
    test_paths = '/mnt/datab/home/yuanwenzheng/kaggle/树叶分类/' + sub_df['image']

    # Test images have no ground truth; dummy zero labels satisfy the
    # Dataset interface and are otherwise ignored.
    labels = np.zeros(len(test_paths))
    test_dataset = LeafDataset(images_file_names=test_paths.values,
                               labels=labels,
                               transform=get_valid_test_transforms())
    test_loader = DataLoader(
        dataset=test_dataset,
        batch_size=128,
        shuffle=False,
        num_workers=5,
        # pin_memory keeps batches in page-locked host memory,
        # speeding host-to-GPU copies.
        pin_memory=True,
    )

    # Best checkpoints saved by __train__'s K-fold loop ("flod" matches the
    # typo in the saved filenames).
    model_path_list = [
        '/mnt/datab/home/yuanwenzheng/kaggle/树叶分类/checkpoints/resnet50_0flod_38epochs_accuracy0.96585_weights.pth',
        '/mnt/datab/home/yuanwenzheng/kaggle/树叶分类/checkpoints/resnet50_2flod_42epochs_accuracy0.96767_weights.pth',
        '/mnt/datab/home/yuanwenzheng/kaggle/树叶分类/checkpoints/resnet50_3flod_50epochs_accuracy0.96740_weights.pth',
        '/mnt/datab/home/yuanwenzheng/kaggle/树叶分类/checkpoints/resnet50_4flod_33epochs_accuracy0.96606_weights.pth',
        '/mnt/datab/home/yuanwenzheng/kaggle/树叶分类/checkpoints/resnet50_4flod_43epochs_accuracy0.96902_weights.pth',
    ]

    model_list = []
    for i in range(len(model_path_list)):
        model_list.append(LeafNet())
        # Enable only on a multi-GPU machine.
        # model_list[i] = nn.DataParallel(model_list[i])
        model_list[i] = model_list[i].to(params['device'])
        init = torch.load(model_path_list[i])
        model_list[i].load_state_dict(init)
        model_list[i].eval()
        model_list[i].cuda()

    # (predicted_labels / pred_string are unused leftovers.)
    predicted_labels = []
    pred_string = []
    preds = []

    with torch.no_grad():
        for (images, labels) in tqdm(test_loader):
            images = images.cuda()
            # Ensemble: average the raw model outputs across checkpoints.
            onehots = sum([model(images) for model in model_list]) / len(model_list)
            # `labels` here are the dummy zeros standing in for image names.
            for oh, name in zip(onehots, labels):
                # Map the winning class index back to its string label.
                lbs = num_to_classes[torch.argmax(oh).item()]
                preds.append(dict(image=name, labels=lbs))
    df_preds = pd.DataFrame(preds)
    sub_df['label'] = df_preds['labels']
    saveFileName = '/mnt/datab/home/yuanwenzheng/kaggle/树叶分类/submission.csv'
    sub_df.to_csv(saveFileName, index=False)
    print("Done!")

if __name__ == "__main__":
    # Flag selects the pipeline stage: 0 runs training, anything else inference.
    Flag = 0
    (__train__ if Flag == 0 else __test__)()

# (Trailing non-code residue removed: CSDN blog page chrome scraped along with
# the article — like/favorite/comment counters, feedback-survey buttons, and
# virtual "red packet" payment-widget text. None of it was program content.)