SETI Breakthrough Listen - E.T. Signal Search

1.比赛简介

1.1 链接

https://www.kaggle.com/c/seti-breakthrough-listen

1.2 简介

“宇宙中只有我们吗?”

这是人类最深刻和永恒的问题之一。随着技术的进步,我们正在寻找新的、更强大的方法来寻找答案。加州大学伯克利分校的 Breakthrough Listen 团队利用世界上最强大的望远镜来扫描数百万颗恒星,寻找技术活动的迹象。现在该团队希望 Kaggle 社区帮助解释他们接收到的信号。

Listen团队是搜寻外星智能(SETI)的一部分,使用地球上最大的可操纵碟形望远镜,直径100米的格林班克望远镜。与任何SETI搜索一样,沟通的动机也是主要的挑战。人类已经制造了大量的无线电设备。在现代科技探测的大海捞针中,很难找到外星传播的微弱信号。

当前的方法使用两个过滤器在草堆中搜索。首先,Listen团队将目标恒星的扫描与天空其他区域的扫描穿插在一起。两组扫描中出现的任何信号都可能不是来自目标恒星的方向。其次,管道会丢弃不改变频率的信号,因为这意味着它们可能在望远镜附近。运动中的震源应该有一个指示移动的信号,类似于经过的消防车警报器的音调变化。这两个过滤器相当有效,但我们知道它们可以改进。毫无疑问,管道漏掉了感兴趣的信号,特别是那些具有复杂时间或频率结构的信号,以及那些在频谱中存在大量干扰的区域的信号。

在本次比赛中,使用您的数据科学技能帮助识别突破性目标扫描中的异常信号。由于目前还没有确定的外星信号用于训练机器学习算法的例子,研究小组在望远镜的大量数据中加入了一些模拟信号(他们称之为“针”)。他们已经确定了一些隐藏的针,这样你就可以训练你的模型来发现更多。数据由二维阵列组成,因此可能有计算机视觉的方法很有前途,还有数字信号处理、异常检测等。成功识别最多针头的算法将获得现金奖励,但也有可能帮助回答科学中最大的问题之一。

2.数据下载

链接: https://pan.baidu.com/s/1wr8m9ZOo7PpafZYKHjrw5g 提取码: 4wdj
| id | target | img_path |
| -------- | ------ | -------- |
| 0000799a2b2c42d | 0 | ../input/seti-breakthrough-listen/train/0/0000... |
| 00042890562ff68 | 0 | ../input/seti-breakthrough-listen/train/0/0004... |
| 0005364cdcb8e5b | 0 | ../input/seti-breakthrough-listen/train/0/0005... |
| 0007a5a46901c56 | 0 | ../input/seti-breakthrough-listen/train/0/0007... |
| 0009283e145448e | 0 | ../input/seti-breakthrough-listen/train/0/0009... |

3.baseline代码

3.1 EfficientNet pytorch

import os
import sys
sys.path = ['../input/efficientnet-pytorch/EfficientNet-PyTorch/EfficientNet-PyTorch-master',] + sys.path
import pandas as pd
import glob
import numpy as np
import matplotlib.pyplot as plt
import cv2
from sklearn.model_selection import train_test_split
from sklearn import metrics
from tqdm import tqdm
import torch
import torchvision.models as models
import torch.nn as nn
from efficientnet_pytorch import model as enet
import random
from sklearn.model_selection import KFold, StratifiedKFold
import albumentations as A
import torch.optim as optim
import torch.nn.functional as F

"""
设置显卡的使用数量
"""
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'
if torch.cuda.device_count() > 1:
    print("We can use", torch.cuda.device_count(), "GPUs!")

"""
设置是否并行
"""
parallel=False

"""
选择使用gpu或者cpu运行
"""
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

"""
设置随即种子,达到可以复现的效果
"""
def set_seed(seed = 0):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)
    random_state = np.random.RandomState(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)
    return random_state

seed = 666
random_state = set_seed(seed)

"""
对图像进行增强,在dataload的时候进行使用
"""
# img_sz = 256
img_sz = 512
ttransform = A.Compose([
    A.Resize(img_sz, img_sz, cv2.INTER_NEAREST),
    #A.VerticalFlip(p=0.5),
    #A.HorizontalFlip(p=0.5),
])
vtransform = A.Compose([
    A.Resize(img_sz, img_sz, cv2.INTER_NEAREST),
    #A.VerticalFlip(p=0.5),
    #A.HorizontalFlip(p=0.5),
])


class ClassificationDataset:
    """Dataset over cadence ``.npy`` files for the binary classifier.

    :param image_paths: paths to the ``.npy`` cadence arrays.
    :param targets: binary label for each path.
    :param tr: albumentations-style transform: ``tr(image=arr)['image']``.
    """

    def __init__(self, image_paths, targets, tr):
        self.image_paths = image_paths
        self.targets = targets
        self.tr = tr

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, item):
        # Stack the cadence channels vertically into a single 2-D image,
        # run the augmentation pipeline, then prepend a channel axis.
        raw = np.vstack(np.load(self.image_paths[item])).astype(float)
        augmented = self.tr(image=raw)["image"]
        image = augmented[np.newaxis, ]
        return {
            "image": torch.tensor(image, dtype=torch.float),
            "targets": torch.tensor(self.targets[item], dtype=torch.long),
        }


class FocalLoss(nn.Module):
    """Binary focal loss (Lin et al., 2017) for the imbalanced target.

    :param alpha: global scaling factor applied to the loss.
    :param gamma: focusing parameter; down-weights well-classified examples.
    """

    def __init__(self, alpha=1, gamma=2):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = 1e-12  # prevent training from Nan-loss error

    def forward(self, logits, target):
        """
        logits & target should be tensors with shape [batch_size, num_classes]
        """
        # torch.sigmoid: F.sigmoid has been deprecated since PyTorch 1.0.
        probs = torch.sigmoid(logits)
        one_subtract_probs = 1.0 - probs
        # add epsilon so log() never sees an exact zero
        probs_new = probs + self.epsilon
        one_subtract_probs_new = one_subtract_probs + self.epsilon
        # log-likelihood of the true class, then the focal modulation term
        log_pt = target * torch.log(probs_new) + (1.0 - target) * torch.log(one_subtract_probs_new)
        pt = torch.exp(log_pt)
        focal_loss = -1.0 * (self.alpha * (1 - pt) ** self.gamma) * log_pt
        return torch.mean(focal_loss)

class enetv2(nn.Module):
    """EfficientNet backbone with a 1->3 channel adapter and a linear head.

    ``backbone`` must be a key of the module-level ``pretrained_model`` dict
    (defined later in this script), which maps backbone names to local
    checkpoint file paths.
    """
    def __init__(self, backbone, out_dim):
        """
        :param backbone: EfficientNet variant name, e.g. ``'efficientnet-b4'``.
        :param out_dim: size of the final linear output (1 for this binary task).
        """
        super(enetv2, self).__init__()
        self.enet = enet.EfficientNet.from_name(backbone)
        self.enet.load_state_dict(torch.load(pretrained_model[backbone]))# load the locally stored pretrained weights
        # Replace the stock classifier with our own head and keep the
        # EfficientNet trunk as a pure feature extractor.
        self.myfc = nn.Linear(self.enet._fc.in_features, out_dim)
        self.enet._fc = nn.Identity()
        # Adapter so the single-channel spectrogram fits the 3-channel
        # EfficientNet stem. NOTE(review): padding=3 with a 3x3 kernel
        # slightly enlarges the feature map — presumably intentional; confirm.
        self.conv1 = nn.Conv2d(1, 3, kernel_size=3, stride=1, padding=3, bias=False)


    def extract(self, x):
        # Features from the EfficientNet trunk (classifier is Identity).
        return self.enet(x)

    def forward(self, x):
        x = self.conv1(x)
        x = self.extract(x)
        x = self.myfc(x)
        return x

def mixup_data(x, y, alpha=1.0, use_cuda=True):
    """Apply mixup augmentation to a batch.

    :param x: batch of inputs.
    :param y: batch of labels.
    :param alpha: Beta(alpha, alpha) parameter; ``alpha <= 0`` disables mixing.
    :param use_cuda: place the shuffling index on the GPU.
    :return: ``(mixed_x, y_a, y_b, lam)`` where
        ``mixed_x = lam * x + (1 - lam) * x[perm]``.
    """
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1

    n = x.size()[0]
    index = torch.randperm(n)
    if use_cuda:
        index = index.cuda()

    mixed_x = lam * x + (1 - lam) * x[index, :]
    return mixed_x, y, y[index], lam


def mixup_criterion(criterion, pred, y_a, y_b, lam):
    """Loss for a mixup batch: convex combination of the two target losses.

    :param criterion: base loss function, called as ``criterion(pred, target)``.
    :param pred: model predictions for the mixed inputs.
    :param y_a: first set of targets.
    :param y_b: second (permuted) set of targets.
    :param lam: mixing coefficient drawn in ``mixup_data``.
    :return: ``lam * loss(pred, y_a) + (1 - lam) * loss(pred, y_b)``.
    """
    loss_a = criterion(pred, y_a)
    loss_b = criterion(pred, y_b)
    return lam * loss_a + (1 - lam) * loss_b


def train(data_loader, model, optimizer, device):
    """Run one epoch of mixup training with focal loss.

    :param data_loader: yields dicts with ``'image'`` and ``'targets'``.
    :param model: network producing one logit per sample.
    :param optimizer: optimizer stepping the model's parameters.
    :param device: torch device the model lives on.
    :return: None (the model is updated in place).
    """
    criterion = FocalLoss()
    model.train()

    for data in tqdm(data_loader, position=0, leave=True, desc='Training'):
        # Move the batch to the model's device *before* mixup so that the
        # permutation index created inside mixup_data lives on the same
        # device as the inputs. The previous order hard-coded use_cuda=True
        # and mixed CPU batch tensors with a CUDA index, which crashes on
        # CPU-only machines.
        inputs1 = data["image"].to(device, dtype=torch.float)
        targets = data['targets'].to(device, dtype=torch.float).view(-1, 1)

        inputs1, targets_a, targets_b, lam = mixup_data(
            inputs1, targets, use_cuda=inputs1.is_cuda)

        optimizer.zero_grad()
        outputs = model(inputs1)
        loss = mixup_criterion(criterion, outputs, targets_a, targets_b, lam)
        loss.backward()
        optimizer.step()
        
def evaluate(data_loader, model, device):
    """Run inference over a loader and collect outputs and targets.

    :param data_loader: yields dicts with ``'image'`` and ``'targets'``.
    :param model: trained network.
    :param device: torch device to run on.
    :return: ``(outputs, targets)`` as two plain Python lists.
    """
    model.eval()

    final_targets, final_outputs = [], []

    with torch.no_grad():
        for data in tqdm(data_loader, position=0, leave=True, desc='Evaluating'):
            inputs = data["image"].to(device, dtype=torch.float)
            targets = data["targets"].to(device, dtype=torch.float)

            output = model(inputs)

            # Accumulate plain lists for the sklearn metrics used downstream.
            final_targets.extend(targets.detach().cpu().numpy().tolist())
            final_outputs.extend(output.detach().cpu().numpy().tolist())

    return final_outputs, final_targets

"""
设置官方预训练模型
"""
paths = [
 'efficientnet-b0-08094119.pth',
 'efficientnet-b1-dbc7070a.pth',
 'efficientnet-b2-27687264.pth',
 'efficientnet-b3-c8376fa2.pth',
 'efficientnet-b4-e116e8b3.pth',
 'efficientnet-b5-586e6cc6.pth',
 'efficientnet-b6-c76e70fd.pth',
 'efficientnet-b7-dcc49843.pth',
]

"""设置一些超参数"""
use_mode_index="4"
baseline_name = 'efficientnet-b{}'.format(use_mode_index)
pretrained_model = {
    'efficientnet-b{}'.format(use_mode_index): '../input/efficientnet-pytorch/' + paths[int(use_mode_index)]
}
models = []
device = "cuda"
# df = df.sample(n = 1000).reset_index(drop=True)
epochs = 10
Batch_Size = 32
skf = StratifiedKFold(n_splits=5)
fold = 0
# criterion = nn.BCEWithLogitsLoss()
best_roc_auc=0

"""读取数据"""
df = pd.read_csv('../input/seti-breakthrough-listen/train_labels.csv')
print (df.shape)
df['img_path'] = df['id'].apply(lambda x: f'../input/seti-breakthrough-listen/train/{x[0]}/{x}.npy')
# df = pd.read_csv('../input/seti-breakthrough-listen/old_leaky_data/train_labels_old.csv')
# print (df.shape)
# df['img_path'] = df['id'].apply(lambda x: f'../input/seti-breakthrough-listen/old_leaky_data/train_old/{x[0]}/{x}.npy')
X = df.img_path.values
Y = df.target.values


for train_index, test_index in skf.split(X, Y):
    
    # A fresh model per fold, warm-started from an existing checkpoint
    # ("efficientnet-b4_0.pt" must be present in the working directory).
    model = enetv2(baseline_name, out_dim=1)
    if parallel == True:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = nn.DataParallel(model)
        model.to(device)
        model.module.load_state_dict(torch.load("efficientnet-b4_0.pt"))
    else:
        model.to(device)
        model.load_state_dict(torch.load("efficientnet-b4_0.pt"))
    
    train_images, valid_images = X[train_index], X[test_index]
    train_targets, valid_targets = Y[train_index], Y[test_index]

    train_dataset = ClassificationDataset(image_paths=train_images, targets=train_targets, tr=ttransform)
    valid_dataset = ClassificationDataset(image_paths=valid_images, targets=valid_targets, tr=vtransform)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=Batch_Size,shuffle=True, num_workers=4)
    valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=Batch_Size,shuffle=False, num_workers=4)

    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4)
#     optimizer = optim.AdamW(model.parameters(), weight_decay=1e-02)
#     Scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=1e-3,epochs=20, pct_start=0.1,
#                                                    anneal_strategy='cos',div_factor=1e+3, final_div_factor=1e+3, steps_per_epoch=len(train_loader))
    

    for epoch in range(epochs):
        train(train_loader, model, optimizer, device=device)
        predictions, valid_targets = evaluate(valid_loader, model, device=device)
        roc_auc = metrics.roc_auc_score(valid_targets, predictions)
        print(f"Epoch={epoch}, Valid ROC AUC={roc_auc}")
        
        # Checkpoint whenever validation AUC improves.
        # NOTE(review): best_roc_auc is never reset between folds, so later
        # folds only save when they beat every previous fold's best score.
        if roc_auc > best_roc_auc:
            if parallel == True:
                torch.save(model.module.state_dict(), baseline_name + '_new_data' + str(fold) + '.pt')
            else:
                torch.save(model.state_dict(), baseline_name + '_new_data' + str(fold) + '.pt')
            best_roc_auc = roc_auc
    
    fold += 1

"""
结果推理
"""
submission = pd.read_csv('../input/seti-breakthrough-listen/sample_submission.csv')
submission['img_path'] = submission['id'].apply(lambda x: f'../input/seti-breakthrough-listen/test/{x[0]}/{x}.npy')
"""推理时将数据进行预处理,即tta Test Time Augmentation"""
test_dataset = ClassificationDataset(image_paths=submission.img_path.values, targets=submission.target.values, tr=ttransform)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False, num_workers=4)
test_predictions, test_targets = evaluate(test_loader, model, device=device)
"""保存推理结果"""
test_predictions = np.array(test_predictions)
submission.target = test_predictions[:, 0]
submission.drop(['img_path'], axis=1, inplace=True)
submission.to_csv('submission_B4_new_1v1.csv', index=False)

3.2 EfficientNet keras

import numpy as np
import pandas as pd
from pathlib import Path
import math
import tensorflow as tf
from tensorflow import keras
from sklearn import model_selection
import efficientnet.tfkeras as efn


"""
数据读取以及参数设置
"""
data_dir = Path('../input/seti-breakthrough-listen/')
train_data_dir = data_dir / 'train'
test_data_dir = data_dir / 'test'
train_label_file = data_dir / 'train_labels.csv'
sample_file = data_dir / 'sample_submission.csv'

label = pd.read_csv(train_label_file, index_col='id')
sub = pd.read_csv(sample_file, index_col='id')

def id_to_path(s, train=True):
    data_dir = train_data_dir if train else test_data_dir
    return data_dir / s[0] / f'{s}.npy'

input_size = (273, 256, 3)
batch_size = 32
n_epoch = 2
seed = 42

"""
定义数据加载
"""
class DataGenerator(keras.utils.Sequence):
    def __init__(self, x_set, y_set=None, batch_size=32):
        self.x , self.y = x_set, y_set
        self.batch_size = batch_size
        self.is_train = False if y_set is None else True
        
    def __len__(self):
        return math.ceil(len(self.x) / self.batch_size)
    
    def __getitem__(self, idx):
        batch_ids = self.x[idx * self.batch_size: (idx + 1) * self.batch_size]
        if self.y is not None:
            batch_y = self.y[idx * self.batch_size: (idx + 1) * self.batch_size]
        
        list_x = [np.load(id_to_path(x, self.is_train))[::2] for x in batch_ids]
        batch_x = np.moveaxis(list_x,1,-1)
        batch_x = batch_x.astype("float") / 255
        
        if self.is_train:
            return batch_x, batch_y
        else:
            return batch_x

"""
定义模型
"""
model = tf.keras.Sequential([
        efn.EfficientNetB3(input_shape=input_size,weights='imagenet',include_top=False),
        keras.layers.GlobalAveragePooling2D(),
        keras.layers.Dense(1, activation='sigmoid')
        ])

model.summary()
model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-4),
              loss='binary_crossentropy', metrics=[keras.metrics.AUC()])
"""
训练模型
"""
x0 = label.index.values
y0 = label['target'].values

x1 = sub.index.values

x_train, x_val, y_train, y_val = model_selection.train_test_split(x0, y0, test_size=.2, random_state=seed)

train = DataGenerator(x_train, y_train, batch_size=batch_size)
val = DataGenerator(x_val, y_val, batch_size=batch_size)
test = DataGenerator(x1, batch_size=batch_size)

model.fit(train, validation_data=val, epochs=n_epoch)

"""
模型推理与结果生成
"""
prediction = model.predict(test).flatten()

sub['target'] = prediction
sub.to_csv('submission.csv')

3.3 Resnet18 pytorch

'''Load librarires'''
import pickle
import time
import random
import glob
import os
from copy import deepcopy
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm
from pathlib import Path
from collections import defaultdict

import matplotlib.pyplot as plt
import plotly.express as px

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader
import torchvision
from torchvision import models, transforms, utils

import albumentations as A
from albumentations.pytorch import ToTensorV2
from albumentations import ImageOnlyTransform

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, roc_curve, roc_auc_score

class CFG:

    '''Store all hyperparameters here.''' 
    
#     DEBUG=False

    SEED = 420
#     TEST_SIZE = 0.2
    VAL_SIZE = 0.1
    CLASSES = None #Need to update manually
    OUTPUT_FEATURES = None #Need to update manually
    
    #Transforms (resize only; ToTensorV2 converts the numpy image to a tensor)
    TRAIN_TRANSFORMS = A.Compose([
        A.Resize(224, 224),
        ToTensorV2(),
        ])
    VAL_TRANSFORMS = A.Compose([
        A.Resize(224, 224),
        ToTensorV2(),
    ])
    TEST_TRANSFORMS = A.Compose([
        A.Resize(224, 224),
        ToTensorV2(),
    ])
    
    #model configuration dict consumed by Net.__init__ via __dict__.update
    #NOTE(review): models.resnet18(pretrained=True) runs (and may download
    #weights) at class-definition time, i.e. on import.
    MODEL1 = {
        'name': 'resnet18',
        'transfer': True,
        'architecture': models.resnet18(pretrained=True), # ResNet18
        'criterion': nn.CrossEntropyLoss(),
        'optimizer': optim.Adam,
        'weight_decay': 1e-6,
        'lr': 1e-4,
        'history': None
    }

    BATCH_SIZE = 192
    EPOCHS = 3
    
    DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # This print executes once, when the class body is evaluated.
    print('You are using ->', DEVICE)    


def seed_everything(seed):
    '''Seed the python, numpy and torch RNGs so results are reproducible.

    :param seed: integer seed applied to every RNG.
    '''
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark must be False for reproducibility: cuDNN auto-tuning can
    # select different kernels between runs, defeating determinism (this
    # also matches the set_seed() helper used earlier in this file).
    torch.backends.cudnn.benchmark = False

seed_everything(CFG.SEED)

train = pd.read_csv('../input/seti-breakthrough-listen/train_labels.csv')
sub = pd.read_csv('../input/seti-breakthrough-listen/sample_submission.csv')

'''Store data paths and their labels in pandas dataframe. Will be used to create pytorch datasets. '''

train_paths = glob.glob('../input/seti-breakthrough-listen/train/*/*' )
meta = pd.DataFrame(sorted(train_paths),columns=['path'])

#assign id and target from the train df
#NOTE(review): this pairs the *sorted* glob paths row-by-row with train.id /
#train.target — only correct if train_labels.csv is itself sorted by id; verify.
meta['id'], meta['target'] = train.id, train.target

# get class mappings (category code -> original target value)
classes = dict(enumerate(meta.target.astype('category').cat.categories))
CFG.CLASSES = classes
CFG.OUTPUT_FEATURES = len(CFG.CLASSES)

meta.head()

#get test data paths
test_paths = glob.glob('../input/seti-breakthrough-listen/test/*/*' )
test = pd.DataFrame(sorted(test_paths),columns=['path'])
test['target'] = 0 #dummy targets

test.head()

'''Split data into train, validation and test sets'''

X = list(meta.path)
y = list(meta.target)

# 90/10 stratified train/validation split (VAL_SIZE = 0.1).
X_train, X_val, y_train, y_val = train_test_split(X,
                                                  y,
                                                  test_size=CFG.VAL_SIZE,
                                                  random_state=CFG.SEED,
                                                  stratify=y) #stratified split

X_test,y_test = test.path, test.target


print(f'Train length -> {len(X_train)}')
print(f'Val length -> {len(X_val)}')
print(f'Test length -> {len(X_test)}')

'''Custom pytorch dataset implementation.'''
class SETIDataset(Dataset):
    '''Dataset of cadence ``.npy`` files and their integer labels.'''

    def __init__(self, X, y, transform=None):
        '''
        Args:
            X: list of paths to ``.npy`` cadence arrays.
            y: list of integer labels, same length as ``X``.
            transform: optional albumentations transform.
        '''
        self.X = X
        self.y = y
        self.transform = transform
        assert len(self.X) == len(self.y), f'X and y have different lengths -> {len(self.X)} != {len(self.y)} '

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        # Load the (channels, h, w) cadence, stack the channels vertically,
        # then transpose to put the long axis first.
        arr = np.load(self.X[idx]).astype(np.float32)
        img = np.vstack(arr).transpose((1, 0))
        if self.transform is not None:
            img = self.transform(image=img)['image']
        label = torch.tensor(self.y[idx], dtype=torch.long)
        return (img, label)

    def show_img(self, idx):
        '''Plot the sample at ``idx`` with its class name as the title.'''
        img, label = self.__getitem__(idx)
        img = img.numpy().transpose((1, 2, 0))
        plt.figure(figsize=(16, 8))
        plt.axis('off')
        plt.imshow(img)
        plt.title(CFG.CLASSES[int(label)])  # class-id -> name mapping from CFG
        plt.pause(0.001)

'''Instantiate pytorch train, validation and test sets'''
TRAIN = SETIDataset(X_train,y_train, CFG.TRAIN_TRANSFORMS)
VAL = SETIDataset(X_val,y_val, CFG.VAL_TRANSFORMS)
TEST = SETIDataset(X_test,y_test, CFG.TEST_TRANSFORMS)

'''Instantiate Dataloaders'''
# NOTE(review): the second positional DataLoader arg is batch_size; shuffle
# is left at its default (False), so the *training* loader is not shuffled.
TRAIN_LOADER = DataLoader(TRAIN,CFG.BATCH_SIZE)
VAL_LOADER = DataLoader(VAL,CFG.BATCH_SIZE)
TEST_LOADER = DataLoader(TEST,CFG.BATCH_SIZE)

class Net(nn.Module):
    '''
    ========================
          NEURAL NET
    ========================

    Wraps a configurable backbone (supplied via a CFG dict) together with
    its own training, evaluation, prediction and persistence helpers.

    Args:
        model_dict(dict): configuration dict containing the model architecture
        output_features(int): length of output tensor; for classification equals to number of classes
    '''
    def __init__(self, model_dict, output_features):
        super().__init__()
        # Unpack name/architecture/criterion/optimizer/lr/... straight into
        # attributes of this instance.
        self.__dict__.update(model_dict)

        if self.transfer:
            model = self.architecture
            # replace the classifier head to match our number of classes
            num_ftrs = model.fc.in_features
            model.fc = nn.Linear(num_ftrs, output_features)
            # single-channel stem: the spectrogram inputs are grayscale
            model.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
            self.model = model
        else:
            # NOTE(review): non-transfer path is unimplemented; the optimizer
            # line below would fail because self.model is never assigned.
            pass
        self.output_features = output_features
        # instantiate the optimizer class from the config with our parameters
        self.optimizer = self.optimizer(self.model.parameters(),lr = self.lr, weight_decay = self.weight_decay,amsgrad=False)
        # directory where checkpoints and history files are written
        self.save_path = 'models'

    def forward(self, x):
        return self.model(x)

    def fit(self,
            train_loader,
            val_loader,
            epochs = 5,
            batch_size = 32,
            device = 'cpu'):
        '''
        =============================
            OPTIMIZATION LOOP
        =============================

        Trains the wrapped model, tracks per-epoch metrics in self.history,
        checkpoints on best validation accuracy and saves the history.

        Args:
            train_loader(torch dataloader)
            val_loader(torch dataloader)
            epochs(int)
            batch_size(int)
            device(str)


        Output style inspired by skorch fit() method
        https://skorch.readthedocs.io/en/stable/net.html?highlight=fit#skorch.net.NeuralNet.fit

        '''
        #may be changed if lrscheduler is used???
        lr = deepcopy(self.lr)

        #get model training history (resume when the model was trained before)
        history = self.history
        if history is None:
            history = defaultdict(list)
        #get train and val sizes
        train_size = len(train_loader.dataset)
        val_size = len(val_loader.dataset)
        #stuff for printing epoch metrics as a beautiful table
        headers = ['epoch','train_loss','val_loss','val_acc','cp','lr','dur']
        template = '{:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10}'
        print(template.format(*headers))
        print(template.replace(':', ':-').format('','','','','','',''))
        cyan = "\033[96m{:<10}\033[00m" #cyan
        purple = "\033[95m{:<10}\033[00m" #purple
        green = "\033[92m{:<10}\033[00m" #green
        white = "\033[0m{:<10}\033[0m" #white
        #set model into train mode
        self.model.train()
        #send model to device
        self.model.to(device)
        #training loop
        for epoch in range(epochs):
            start_time = time.time()
            train_loss = 0
            val_loss, val_acc = 0, 0
            #optimization  loop
            time.sleep(.2)
            for X,y in tqdm(train_loader, desc ="Train batches"):
                #Send training data to device
                X,y = X.to(device), y.to(device)
                #Forward propagation
                pred = self.model(X)
                loss = self.criterion(pred,y)
                #Backpropagation
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                #accumulate the batch loss (it was previously overwritten,
                #so only the last batch's loss was reported per epoch)
                train_loss += loss.item()
            #average the train loss over the epoch's batches
            train_loss /= len(train_loader)

            #validation loop
            with torch.no_grad():
                for X, y in tqdm(val_loader, desc='Validation Batches'):
                    X,y = X.to(device),y.to(device)
                    pred = self.model(X)
                    #accumulate (was previously overwritten per batch)
                    val_loss += self.criterion(pred,y).item()
                    val_acc += (pred.argmax(1) == y).type(torch.float).sum().item()
            #average validation loss / accuracy after the epoch
            val_loss /= len(val_loader)
            val_acc /= val_size
            #append epoch results
            history['epoch'].append(epoch+1)
            history['train_loss'].append(train_loss)
            history['val_loss'].append(val_loss)
            history['val_acc'].append(val_acc)
            
            #colorize epoch's output if it improves
            colortemp = template.split(' ')
            # colorize train loss if it decreases
            if history['train_loss'][-1] == min(history['train_loss']):
                colortemp[1] = cyan
            else:
                colortemp[1] = white
            #colorize validation loss if it decreases
            if history['val_loss'][-1] == min(history['val_loss']):
                colortemp[2] = purple
            else:
                colortemp[2] = white
            # colorize validation accuracy & save best weights if it increases
            if history['val_acc'][-1] == max(history['val_acc']):
                #colorize       
                colortemp[3] = green
                #checkpoint
                cp = '+'
                if not os.path.exists(self.save_path):
                    os.mkdir(self.save_path)
                torch.save(self.model.state_dict(), Path(self.save_path,f'best_{self.name}.pth'))
            else:
                colortemp[3] = white
                cp = '-'
            colortemp = ' '.join(colortemp)

            #calculate epoch duration (in seconds)
            end_time = time.time()
            dur = end_time - start_time
            #append the rest of epoch results
            history['cp'].append(cp)
            history['lr'].append(lr)
            history['dur'].append(dur)
            #display the epoch results
            print(colortemp.format(*f'{epoch+1}/{epochs} {train_loss:.4f} {val_loss:.4f} {val_acc:.2f} {cp} {lr} {dur:.2f}'.split(' ')))
        #update epoch number of the entire training history
        history['epoch'] = [e+1 for e in range(len(history['epoch']))]
        #update model's training history
        self.history = history
        #save training history as csv
        self.save_history()
        
    def predict(self,dataloader,device ='cpu'):
        '''
        ===============
           Predict
        ===============

        Returns a list of per-batch tensors of argmax class ids.
        '''
        #set model to evaluation mode
        self.model.eval()
        #model to device, default cpu
        self.model.to(device)

        preds = []
        with torch.no_grad():
            for X, y in tqdm(dataloader):
                X,y = X.to(device),y.to(device)
                pred = self.model(X)
                pred = pred.argmax(1)
                preds.append(pred)
        return preds
    
    def eval_model(self,dataloader,avg=None,device ='cpu'):
        '''
        ==================================
           ACCURACY PRECISION RECALL F1
        ==================================

        Prints accuracy, precision, recall, F1 and ROC-AUC on a loader.
        Precision/recall/F1/ROC are averaged over batches.
        '''
        labels = [l for l in range(self.output_features)]
        loader_size = len(dataloader)
        dataset_size = len(dataloader.dataset)

        acc = 0
        precision = 0
        recall = 0
        f1 = 0
        roc = 0

        #set model to evaluation mode
        self.model.eval()
        #model to device, default cpu
        self.model.to(device)

        with torch.no_grad():
            for X, y in tqdm(dataloader, desc = 'Evaluating the model'):
                X,y = X.to(device),y.to(device)
                pred = self.model(X)
                #accuracy
                acc += (pred.argmax(1) == y).type(torch.float).sum().item()
                pred = pred.argmax(1)
                
                # to int 
                pred,y = list(pred), list(y)
                pred = [int(p) for p in pred]
                y = [int(p) for p in y]

                #precision
                p = precision_score(y, pred, labels = labels, zero_division = 1, average = avg)
                precision+=p
                #recall
                r = recall_score(y, pred, labels = labels, zero_division = 1,  average = avg)
                recall+=r
                #f1 score
                f = f1_score(y, pred, labels = labels, zero_division = 1,  average = avg)
                f1 += f
                #roc — NOTE(review): raises if a batch contains only one class
                roc += roc_auc_score(y, pred)

        acc /= dataset_size
        precision /= loader_size
        recall /= loader_size
        f1 /= loader_size
        roc /= loader_size
            
            
        print(f" Accuracy: {(100*acc):>0.1f}%")
        print(f"Precision: {(100*np.mean(precision)):>0.1f}%")
        print(f"   Recall: {(100*np.mean(recall)):>0.1f}%")
        print(f" F1 Score: {(100*np.mean(f1)):>0.1f}%")
        #bug fix: this line previously printed the F1 value a second time
        print(f"      ROC: {(100*np.mean(roc)):>0.1f}%")

        
    def plot_loss_history(self):
        '''
        Plot train/validation loss history with plotly.
        '''
        assert self.history is not None, 'No history to plot -> the model has not been trained yet!'
        
        df = pd.DataFrame(self.history)
        fig = px.line(x = df.epoch,
                    y = [df.train_loss, df.val_loss],
                    title = 'Loss History',
                    labels={'x':'epoch','value': 'loss', 'variable': 'loss'})
        fig.data[0].name = 'train'
        fig.data[1].name = 'val'
        fig.show()

    def save_history(self):
        '''Save model's training history as csv and pickle.'''
        assert self.history is not None, 'No history to save -> the model has not been trained yet!'
        #save as csv
        pd.DataFrame(self.history).to_csv(Path(f'models/{self.name}_history.csv')) 
        #save as pickle file
        with open(Path(f'models/{self.name}_history.pkl'), 'wb') as f:
            pickle.dump(self.history, f, protocol=pickle.HIGHEST_PROTOCOL)

    def save_model(self):
        '''Save the current (latest) weights to the models directory.'''
        torch.save(self.model.state_dict(), Path(self.save_path,f'latest_{self.name}.pth'))

    def load_model(self,path = 'models', device = 'cpu'):
        '''Load the best checkpoint weights and the pickled training history.'''
        try:
            #load model weights
            p = Path(path,f'best_{self.name}.pth')
            self.model.load_state_dict(torch.load(p, map_location=torch.device(device)))
            #load model training history
            with open(Path(path,f'{self.name}_history.pkl'), 'rb') as h: 
                self.history = pickle.load(h)
        except (OSError, RuntimeError, pickle.UnpicklingError):
            #narrowed from a bare except so programming errors surface
            print('No model to load!')

"""
开始训练
"""
#instantiate model and send to device
Resnet18 = Net(CFG.MODEL1,CFG.OUTPUT_FEATURES)
Resnet18.load_model()
Resnet18.fit(TRAIN_LOADER,
             VAL_LOADER,
             CFG.EPOCHS,
             CFG.BATCH_SIZE,
             CFG.DEVICE)
#画出训练的loss图像
Resnet18.plot_loss_history()

#推理
#evaluate best model's performance on the VAL set在验证集上验证模型效果
print('Best Model:')
print('-'*20)
time.sleep(0.1)
best_model = deepcopy(Resnet18)
best_model.load_model(device = CFG.DEVICE)
best_model.eval_model(VAL_LOADER, avg = 'binary',device = CFG.DEVICE)
#evaluate current model's performance on the VAL set
print()
print('Current Model:')
time.sleep(0.1)
print('-'*20)
Resnet18.eval_model(VAL_LOADER,avg = 'binary', device = CFG.DEVICE)


'''
    Get predictions on the TEST set在测试集上直接进行结果的推理并保存结果
'''

TEST_PREDS = Resnet18.predict(TEST_LOADER, CFG.DEVICE)
preds = [int(i) for p in TEST_PREDS  for i in list(p)]
sub.target = preds
sub.to_csv('submission.csv', index=False)
sub.head()

3.4 GRU pytorch

# Libraries
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import os
import glob
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import cv2
from tqdm import tqdm

from plotly.offline import iplot
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff

import tensorflow as tf
from tensorflow.keras import layers

import torch
import torchvision.models as models
import torch.nn as nn

from tensorflow.keras.optimizers import Adam
from tensorflow.keras import models
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

from skimage.io import imshow, imread, imsave
from skimage.transform import rotate, AffineTransform, warp,rescale, resize, downscale_local_mean
from skimage import color,data
from skimage.exposure import adjust_gamma
from skimage.util import random_noise

"""
数据预处理
"""
data_dir = '../input/seti-breakthrough-listen'
train_merger = os.path.join(data_dir,'train_labels.csv')
train_labels = pd.read_csv(train_merger)
print('train_label_csv : ' +str(train_labels.shape[0]))
#adding the path for each id for easier processing
train_labels['path'] = train_labels['id'].apply(lambda x: f'../input/seti-breakthrough-listen/train/{x[0]}/{x}.npy')
train_labels.head()

"""
数据增强
"""
import albumentations
from albumentations.pytorch.transforms import ToTensorV2
from typing import *

class Transform:
    """Bundle a set of albumentations augmentations into one callable.

    Built from a mapping of ``{AlbumentationsClassName: constructor_kwargs}``;
    a ``ToTensorV2`` step is always appended, so calling the instance takes a
    numpy image and returns a torch tensor.
    """

    def __init__(self, aug_kwargs: Dict):
        steps = []
        for name, kwargs in aug_kwargs.items():
            steps.append(getattr(A, name)(**kwargs))
        steps.append(ToTensorV2(p=1))
        self.transform = A.Compose(steps)

    def __call__(self, image):
        return self.transform(image=image)['image']
class ModeTransform():
    """Dataset-style wrapper that loads SETI cadence .npy files and reshapes
    them according to a channel-layout mode.

    Args:
        df_frame: DataFrame with at least 'path' and 'target' columns.
        config: configuration mapping (stored for reference, not used here).
        channel_mode: 'spatial_6ch', 'spatial_3ch', '6_channel' or '3_channel'.
        mode: 'test' makes __getitem__ return the image only; any other value
            returns an (image, label) pair.
        target: flag kept for interface compatibility with the caller.
        transform: optional callable applied to the raw numpy image; when
            absent, the image is converted to a float torch tensor instead.
    """

    def __init__(self, df_frame, config, channel_mode, mode, target, transform):
        self.df_frame = df_frame
        self.channel_mode = channel_mode
        self.config = config
        self.target = target
        self.file_names = df_frame['path'].values
        self.labels = df_frame['target'].values
        self.transform = transform
        self.mode = mode

    def __len__(self):
        return len(self.df_frame)

    def __getitem__(self, idx):
        # Raw cadence array, e.g. (6, 273, 256): six scans of 273x256 each.
        raw = np.load(self.file_names[idx])

        layout = self.channel_mode
        if layout == 'spatial_6ch':
            # Stack all six scans vertically into one tall 2-D image.
            raw = np.vstack(raw.astype(np.float32))
        elif layout == 'spatial_3ch':
            # Keep every other scan (the on-target ones), stack, transpose.
            raw = np.vstack(raw[::2].astype(np.float32)).transpose((1, 0))
        elif layout == '6_channel':
            # Channels-last layout: (H, W, 6).
            raw = np.transpose(raw.astype(np.float32), (1, 2, 0))
        elif layout == '3_channel':
            raw = np.transpose(raw[::2].astype(np.float32), (1, 2, 0))

        if self.transform:
            image = self.transform(raw)
        else:
            image = torch.from_numpy(raw).float()

        if self.mode == 'test':
            return image
        label = torch.tensor(self.labels[idx]).float()
        return image, label

import albumentations as A
# Augmentation recipe consumed by Transform: each key is an albumentations
# class name, each value the kwargs handed to its constructor.
CONFIG = {
    "TRAIN_TRANSFORMS": {
        "VerticalFlip": {"p": 0.5},
        "HorizontalFlip": {"p": 0.5},
        "Resize": {"height": 640, "width": 640, "p": 1},
    },
}
config = CONFIG

# Parameters for the dataset below; 'spatial_6ch' vertically stacks all six
# cadence scans into one 2-D image (see ModeTransform.__getitem__).
params_train  = {'mode'            :  'train',
                 'channel_mode'    : 'spatial_6ch',
                 'target'          : True}

train_dset = ModeTransform(train_labels,config,
                           **params_train,
                           transform=Transform(config["TRAIN_TRANSFORMS"]))

# Sanity check: visualize the first two samples with their labels.
for i in range(2):
    image, label = train_dset[i]
    plt.imshow(image[0])
    plt.title(f'label: {label}')
    plt.show()
image.shape  # notebook-style inspection of the last transformed sample


"""
数据划分等参数设置
"""
class SETIDataset(tf.keras.utils.Sequence):
    """Keras Sequence that streams SETI cadence .npy files in batches.

    Each batch loads the files listed in a DataFrame slice, stacks them and
    swaps the last two axes so the result is (batch, 6, 256, 273) — the input
    shape expected by the GRU model. When ``target`` is True the labels are
    returned alongside the signals.
    """

    def __init__(self, df, directory, batch_size, random_state, shuffle, target):
        np.random.seed(random_state)
        self.directory = directory
        self.df = df
        self.target = target
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.ext = '.npy'
        self.on_epoch_end()

    def __len__(self):
        # Number of batches, counting a final partial batch.
        return np.ceil(self.df.shape[0] / self.batch_size).astype(int)

    def __getitem__(self, idx):
        lo = idx * self.batch_size
        rows = self.df[lo: lo + self.batch_size]

        # Files are sharded under <directory>/<first-char-of-id>/<id>.npy
        stacked = np.stack([
            np.load(os.path.join(self.directory, name[0], name + self.ext))
            for name in rows.id
        ])
        signals = np.transpose(stacked, (0, 1, 3, 2)).astype('float32')

        if self.target:
            return signals, rows.target.values
        return signals

    def on_epoch_end(self):
        # Reshuffle rows between epochs when requested.
        if self.shuffle:
            self.df = self.df.sample(frac=1).reset_index(drop=True)

train = pd.read_csv('../input/seti-breakthrough-listen/train_labels.csv')
sub = pd.read_csv('../input/seti-breakthrough-listen/sample_submission.csv')

# Shuffle once, then take a simple 80/20 train/validation split.
sample_df = train.sample(frac=1).reset_index(drop=True)

split = int(sample_df.shape[0] * 0.8)
train_df = sample_df[:split]
valid_df = sample_df[split:]

# Parameters for the three SETIDataset instances below; only the training
# set is reshuffled between epochs, and the test set yields no labels.
params_train  = {'batch_size'   : 64,
                'shuffle'       : True,
                'random_state'  : 42,
                'target'        : True}

params_valid  = {'batch_size'   : 64,
                 'shuffle'      : False,
                 'random_state' : 42,
                 'target'       : True}

params_test   = {'batch_size'   : 64,
                'shuffle'       : False,
                'random_state'  : 42,
                'target'        : False}

train_dset = SETIDataset(
    train_df, "../input/seti-breakthrough-listen/train", **params_train )

valid_dset = SETIDataset(
    valid_df, "../input/seti-breakthrough-listen/train", **params_valid)

test_dset = SETIDataset(
    sub, "../input/seti-breakthrough-listen/test", **params_test)

"""
定义模型
"""
def build_model(unit):
    """Build and compile a TimeDistributed bidirectional-GRU classifier.

    The input is a cadence of 6 scans, each 256x273; each scan is run through
    two stacked bidirectional GRUs and average-pooled, then the six pooled
    vectors are flattened into a dense head with a sigmoid output.

    Args:
        unit: hidden size of each GRU direction.
    Returns:
        A compiled tf.keras Model (binary cross-entropy, AUC metric).
    """
    inputs = layers.Input(shape=(6, 256, 273))

    x = layers.TimeDistributed(
        layers.Bidirectional(layers.GRU(unit, return_sequences=True)),
        name="bi_gru_1")(inputs)
    x = layers.TimeDistributed(
        layers.Bidirectional(layers.GRU(unit, return_sequences=True)),
        name="bi_gru_2")(x)
    x = layers.TimeDistributed(layers.GlobalAveragePooling1D(), name="pool")(x)

    x = layers.Flatten()(x)
    x = layers.Dense(128, activation="relu")(x)
    outputs = layers.Dense(1, activation="sigmoid", name="sigmoid")(x)

    net = models.Model(inputs=inputs, outputs=outputs)
    net.compile("adam",
                loss="binary_crossentropy",
                metrics=[tf.keras.metrics.AUC()])
    net.summary()

    return net

"""
训练模型
"""
model = build_model(unit = 128)
model_save = ModelCheckpoint("model_weights.h5", 
                             save_best_only=True, 
                             save_weights_only=True)

history = model.fit(train_dset, 
                    use_multiprocessing=True, 
                    workers=4, 
                    epochs=10,
                    validation_data=valid_dset,
                    callbacks=[model_save])

"""
训练过程loss可视化
"""
acc = history.history['auc']
val_acc = history.history['val_auc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
sns.set_style("white")
plt.suptitle('Train history', size = 15)

ax1.plot(epochs, acc, "bo", label = "Training acc")
ax1.plot(epochs, val_acc, "b", label = "Validation acc")
ax1.set_title("Training and validation acc")
ax1.legend()

ax2.plot(epochs, loss, "bo", label = "Training loss", color = 'red')
ax2.plot(epochs, val_loss, "b", label = "Validation loss", color = 'red')
ax2.set_title("Training and validation loss")
ax2.legend()

plt.show()


"""
模型推理与结果生成
"""
model.load_weights('model_weights.h5')
y_pred = model.predict(
    test_dset, 
    use_multiprocessing=True, 
    workers=4, 
    verbose=1)

sub['target'] = y_pred
sub.to_csv('submission.csv', index=False)
sub.head()

"""
参考
https://keras.io/api/layers/recurrent_layers/gru/
"""

4. 进阶代码

5. trick

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值