1. 题目
目标是利用常规静息心电图的电压信号,预测“正常心电图”和“异常心电图”。根据心血管医生对心电图的诊断结果作为金标准,我们将那些没有明显异常的心电图作为“正常心电图”,并将包含一种或更多异常或疾病的心电图作为“异常心电图”。各团队需要在初赛规定时间内,利用训练集中常规心电图的电压信号,设计并实现可预测正常和异常等两类心电图的算法。
2. 数据
完整的训练集和测试集,共1000例常规心电图,其中训练集中包含600例,测试集中共400例。该数据是从多个公开数据集中获取。参赛团队需要利用有正常/异常两类标签的训练集数据设计和实现算法,并在没有标签的测试集上做出预测。
该心电数据的采样率为500 Hz。为了方便参赛团队用不同编程语言都能读取数据,所有心电数据的存储格式为MAT格式。该文件中存储了12个导联的电压信号。训练数据对应的标签存储在txt文件中,其中0代表正常,1代表异常。
3. 分析
- 数据集共有1000个样本,其中训练集包括600例,测试集400例。训练集是具有label的,用于训练模型;测试集没有label,需要我们用训练好的模型进行预测。
- 其实就是一个二分类问题
- 流程包括:数据加载与预处理,模型搭建,模型训练,模型测试
4. pytorch实现
(1)数据加载与处理 (dataset.py)
from scipy.io import loadmat
import os
from torch.utils import data
import pandas as pd
import numpy as np
def convert2oneHot(index, Lens):
    """Return a one-hot vector of length ``Lens`` with entry ``index`` set to 1.

    The result is a 1-D float NumPy array; every other entry is 0.
    """
    one_hot = np.zeros(shape=(Lens,), dtype=float)
    one_hot[index] = 1.0
    return one_hot
def normalize(v):
    """Center every row of ``v`` on zero and scale it by that row's maximum.

    ``v`` is a 2-D array; statistics are taken along axis 1. The divisor is
    the maximum of the *uncentered* row plus 2e-12 (so an all-zero row does
    not divide by exactly zero), not the peak of the centered signal.
    """
    n_rows = v.shape[0]
    row_mean = v.mean(axis=1).reshape((n_rows, 1))
    row_max = v.max(axis=1).reshape((n_rows, 1))
    return (v - row_mean) / (row_max + 2e-12)
class MyDataset(data.Dataset):
    """Dataset of labelled ECG recordings listed in ``reference.csv``.

    ``reference.csv`` is read from ``data_path``; column 0 holds the MAT file
    name and column 1 the integer label. The recordings themselves are loaded
    lazily from ``data_path/TRAIN``.

    Args:
        mode: ``"train"`` or ``"valid"`` (case-insensitive). ``"train"`` keeps
            the first ``train_size`` rows of the CSV, ``"valid"`` the rest.
        data_path: Directory containing ``reference.csv`` and ``TRAIN``.
        train_size: Row index at which the CSV is split. Defaults to 500,
            the split that was previously hard-coded.

    Raises:
        ValueError: If ``mode`` is neither ``"train"`` nor ``"valid"``.
    """

    def __init__(self, mode, data_path, train_size=500):
        super(MyDataset, self).__init__()
        # Reject a bad mode before any file is read, so the caller gets the
        # ValueError instead of an unrelated I/O error or wasted work.
        self.mode = mode.lower()
        if self.mode not in ('train', 'valid'):
            raise ValueError('mode must be "train" or "valid"!')

        self.csv_path = os.path.join(data_path, "reference.csv")
        self.data_path = os.path.join(data_path, "TRAIN")
        self.temp_list = []  # rows of (mat file name, label)
        self._parse_dataset()

        if self.mode == 'train':
            self.temp_list = self.temp_list[:train_size]
        else:
            self.temp_list = self.temp_list[train_size:]

    def __getitem__(self, item):
        """Return ``(feature, label)`` for row ``item``.

        ``feature`` is the normalized signal from ``get_feature`` and
        ``label`` is a length-2 one-hot vector.
        """
        feature = self.get_feature(self.temp_list[item, 0])
        label = convert2oneHot(self.temp_list[item, 1], 2)
        return feature, label

    def __len__(self):
        """Return the number of recordings in this split."""
        return len(self.temp_list)

    def get_feature(self, name):
        """Load MAT file ``name`` and return its first 12 rows, normalized.

        The array is kept channels-first; the original author notes its
        shape as (12, 5000).
        """
        mat = loadmat(os.path.join(self.data_path, name))
        leads = mat['data'][0:12]
        return normalize(leads)

    def _parse_dataset(self):
        """Read the reference CSV into ``self.temp_list`` as a NumPy array."""
        self.temp_list = np.array(pd.read_csv(self.csv_path))
(2)网络模型搭建(model.py)
from torch import nn
class Net(nn.Module):
    """1-D convolutional classifier for 12-channel ECG recordings.

    Input is a float tensor shaped (batch, 12, samples); the original author
    used (20, 12, 5000). ``forward`` returns a (batch, 2) tensor holding one
    unnormalized score (logit) per class.

    The network deliberately ends without a softmax: ``nn.CrossEntropyLoss``
    applies log-softmax itself, so feeding it probabilities would squash the
    loss and its gradients. ``argmax`` over the logits selects the same class
    as ``argmax`` over their softmax; apply ``torch.softmax(out, dim=1)``
    when probabilities are needed.
    """

    def __init__(self):
        super(Net, self).__init__()
        # Four convolutional stages; each is two Conv1d+ReLU pairs followed
        # by a max-pool that halves the temporal length.
        self.layer1 = nn.Sequential(
            nn.Conv1d(in_channels=12, out_channels=16, kernel_size=16, stride=2, padding=8),
            nn.ReLU(),
            nn.Conv1d(in_channels=16, out_channels=16, kernel_size=16, stride=2, padding=8),
            nn.ReLU(),
            nn.MaxPool1d(2),
        )
        self.layer2 = nn.Sequential(
            nn.Conv1d(in_channels=16, out_channels=64, kernel_size=8, stride=2, padding=4),
            nn.ReLU(),
            nn.Conv1d(in_channels=64, out_channels=64, kernel_size=8, stride=2, padding=4),
            nn.ReLU(),
            nn.MaxPool1d(2),
        )
        self.layer3 = nn.Sequential(
            nn.Conv1d(in_channels=64, out_channels=128, kernel_size=4, stride=2, padding=2),
            nn.ReLU(),
            nn.Conv1d(in_channels=128, out_channels=128, kernel_size=4, stride=2, padding=2),
            nn.ReLU(),
            nn.MaxPool1d(2),
        )
        self.layer4 = nn.Sequential(
            nn.Conv1d(in_channels=128, out_channels=256, kernel_size=2, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv1d(in_channels=256, out_channels=256, kernel_size=2, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(2),
        )
        # Collapse the temporal axis to 2 steps and flatten to 256 * 2
        # features. Dropout regularizes these features *before* the
        # classifier; placed after the final Linear it would randomly zero
        # the class scores themselves. It lives in this parameter-free stage
        # so the classifier keeps its ``layer6.0`` state_dict key.
        self.layer5 = nn.Sequential(
            nn.AdaptiveAvgPool1d(2),
            nn.Flatten(),
            nn.Dropout(0.3),
        )
        # Linear classifier producing one logit per class.
        self.layer6 = nn.Sequential(
            nn.Linear(in_features=256 * 2, out_features=2),
        )

    def forward(self, x):
        """Run ``x`` through all stages and return (batch, 2) class logits."""
        for stage in (self.layer1, self.layer2, self.layer3,
                      self.layer4, self.layer5, self.layer6):
            x = stage(x)
        return x
(3)模型训练与测试(train.py)
import time
from torch import optim, nn
from torch.utils import data
from dataset import MyDataset
from model import Net
from function import *
# Load the data. Path components are joined one by one so the script does not
# depend on the Windows "\\" separator.
cur_path = os.getcwd()
data_path = os.path.join(cur_path, "data", "preliminary")
train_dataset = MyDataset(mode='train', data_path=data_path)
train_dataloader = data.DataLoader(train_dataset, batch_size=20, shuffle=True)
valid_dataset = MyDataset(mode='valid', data_path=data_path)
valid_dataloader = data.DataLoader(valid_dataset, batch_size=20)

# Instantiate the model
model = Net()
# Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.0003)
# Loss function
criterion = nn.CrossEntropyLoss()
# Number of training epochs
epochs = 50
# Directory where checkpoints are written
save_path = 'checkpoint/'
os.makedirs(save_path, exist_ok=True)

# Training & validation loop
train_acc, valid_acc, train_losses, valid_losses = [], [], [], []
best_acc = 0.0
n_train_batches = len(train_dataloader)
n_valid_batches = len(valid_dataloader)
for epoch in range(epochs):
    epoch_start = time.time()
    train_loss = 0.0
    train_a = 0.0
    valid_loss = 0.0
    valid_a = 0.0

    model.train()
    for inputs, labels in train_dataloader:
        # The DataLoader already yields tensors; only the dtype needs
        # changing (re-wrapping with torch.tensor() copies and warns).
        inputs, labels = inputs.float(), labels.float()
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        # Compare predicted and true class indices; the mean of the float
        # mask is the batch accuracy regardless of integer-division rules.
        predictions = outputs.argmax(dim=1)
        targets = labels.argmax(dim=1)
        train_a += (predictions == targets).float().mean().item()

    model.eval()
    with torch.no_grad():
        for inputs, labels in valid_dataloader:
            inputs, labels = inputs.float(), labels.float()
            outputs = model(inputs)
            valid_loss += criterion(outputs, labels).item()
            predictions = outputs.argmax(dim=1)
            targets = labels.argmax(dim=1)
            valid_a += (predictions == targets).float().mean().item()

    # Average the per-batch sums over the number of batches.
    train_loss = train_loss / n_train_batches
    train_a = train_a / n_train_batches
    valid_loss = valid_loss / n_valid_batches
    valid_a = valid_a / n_valid_batches
    train_acc.append(train_a)
    valid_acc.append(valid_a)
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    epoch_end = time.time()

    # Save the weights with the best validation accuracy so far, and the
    # weights from the last epoch.
    if valid_a > best_acc:
        torch.save(model.state_dict(), os.path.join(save_path, 'best_model'))
        best_acc = valid_a
    if epoch == epochs - 1:
        torch.save(model.state_dict(), os.path.join(save_path, 'final_model'))
    print("Epoch: {}/{}, Training:\tLoss: {:.4f}, Accuracy: {:.2f}%, "
          "\t\tValidation:\tLoss: {:.4f}, Accuracy: {:.2f}%, Time: {:.4f}s".format(
              epoch + 1, epochs, train_loss, train_a * 100, valid_loss, valid_a * 100,
              epoch_end - epoch_start))
print("training end.best_model save to checkpoint.")

# The plotting helpers write into results/, so make sure it exists.
os.makedirs('results', exist_ok=True)
plot_acc(train_acc, valid_acc)
plot_loss(train_losses, valid_losses)
# The last argument is the validation loss curve (train_losses was
# previously passed twice by mistake).
plot_results(epochs, train_acc, train_losses, valid_acc, valid_losses)
(4)模型预测 (demo.py)
from torch.autograd import Variable
from glob import glob
from model import Net
from function import *
from scipy.io import loadmat
def normalize(v):
    """Subtract each row's mean from ``v`` and divide by that row's maximum.

    Operates along axis 1 of a 2-D array. The divisor is the maximum of the
    uncentered row plus 2e-12, which keeps an all-zero row from dividing by
    exactly zero.
    """
    column = (v.shape[0], 1)
    centered = v - v.mean(axis=1).reshape(column)
    scale = v.max(axis=1).reshape(column) + 2e-12
    return centered / scale
def load_data(path):
    """Read the MAT file at ``path`` and return its first 12 rows, normalized.

    The signal is taken from the file's ``'data'`` entry; the original author
    notes the resulting shape as (12, 5000).
    """
    leads = loadmat(path)['data'][0:12]
    return normalize(leads)
def save_txt(files, predictions, txt_path):
    """Write one ``"<file> <prediction>"`` line per recording to ``txt_path``.

    Args:
        files: Sequence of file names.
        predictions: Sequence of the same length as ``files``; each element
            must provide ``.item()`` (e.g. a one-element tensor).
        txt_path: Destination text file; it is overwritten if it exists.

    Raises:
        ValueError: If ``files`` and ``predictions`` differ in length.
    """
    # Check up front so a mismatch never leaves a half-written file behind.
    if len(files) != len(predictions):
        raise ValueError('files and predictions must have the same length')
    # The context manager closes the file even if a write fails.
    with open(txt_path, "w", encoding='UTF-8') as out:
        out.writelines(
            f'{name} {pred.item()}\n' for name, pred in zip(files, predictions)
        )
if __name__ == '__main__':
    # Locate the unlabeled test recordings. Path components are joined one by
    # one so the script is not tied to the Windows "\\" separator; sorting
    # makes the output order reproducible (glob order is arbitrary).
    cur_path = os.getcwd()
    data_path = os.path.join(cur_path, "data", "preliminary", "TEST")
    files = sorted(glob(os.path.join(data_path, '*.mat')))

    # Load the trained weights onto the CPU and switch to inference mode.
    model = Net()
    checkpoint = os.path.join(cur_path, 'checkpoint', 'best_model')
    model.load_state_dict(torch.load(checkpoint, map_location='cpu'))
    model.eval()

    predictions = []
    file_list = []
    # No gradients are needed for prediction.
    with torch.no_grad():
        for file in files:
            # Add a leading batch axis of size 1.
            inputs = torch.tensor(load_data(file), dtype=torch.float).unsqueeze(0)
            pred = model(inputs)
            pred_ = pred.argmax(dim=1)  # index of the highest-scoring class
            name = os.path.basename(file)
            file_list.append(name)
            predictions.append(pred_)
            print(f'{name} \t {pred_.item()}')
    save_txt(file_list, predictions, 'pred_result.txt')
(5)功能函数(function.py)
# 解决中文显示问题
import os
import pandas as pd
import numpy as np
import torch
from matplotlib import pyplot as plt
# Matplotlib setup so Chinese text in the figures is displayed: SimHei is used
# for sans-serif text, and 'axes.unicode_minus' is switched off.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
# Plotting helpers
def plot_loss(train_loss, val_loss):
    """Plot training and validation loss per epoch.

    The figure is saved to ``results/loss.png`` and then shown.

    Args:
        train_loss: Sequence of training loss values, one per epoch.
        val_loss: Sequence of validation loss values, one per epoch.
    """
    # savefig does not create missing directories.
    os.makedirs('results', exist_ok=True)
    # Draw on a fresh figure so curves from an earlier plot never end up on
    # the same axes (show() does not clear them on non-interactive backends).
    fig = plt.figure()
    plt.plot(train_loss, label='train_loss')
    plt.plot(val_loss, label='val_loss')
    plt.legend(loc='best')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.title("训练集和验证集loss值得对比图")
    plt.savefig('results/loss.png')
    plt.show()
    plt.close(fig)
def plot_acc(train_acc, val_acc):
    """Plot training and validation accuracy per epoch.

    The figure is saved to ``results/acc.png`` and then shown.

    Args:
        train_acc: Sequence of training accuracy values, one per epoch.
        val_acc: Sequence of validation accuracy values, one per epoch.
    """
    # savefig does not create missing directories.
    os.makedirs('results', exist_ok=True)
    # Draw on a fresh figure so curves from an earlier plot never end up on
    # the same axes (show() does not clear them on non-interactive backends).
    fig = plt.figure()
    plt.plot(train_acc, label='train_acc')
    plt.plot(val_acc, label='val_acc')
    plt.legend(loc='best')
    plt.ylabel('acc')
    plt.xlabel('epoch')
    plt.title("训练集和验证集acc值得对比图")
    plt.savefig('results/acc.png')
    plt.show()
    plt.close(fig)
def plot_results(epochs, train_acc, train_loss, test_acc, test_loss):
    """Plot accuracy and loss curves for both splits on one set of axes.

    The figure is saved to ``results/result.png`` and then shown.

    Args:
        epochs: Number of epochs; the x axis is ``0 .. epochs - 1``, so each
            curve must contain exactly ``epochs`` values.
        train_acc: Training accuracy per epoch.
        train_loss: Training loss per epoch.
        test_acc: Validation/test accuracy per epoch.
        test_loss: Validation/test loss per epoch.
    """
    # savefig does not create missing directories.
    os.makedirs('results', exist_ok=True)
    # Draw on a fresh figure so curves from an earlier plot never end up on
    # the same axes (show() does not clear them on non-interactive backends).
    fig = plt.figure()
    x = np.arange(epochs)
    curves = (
        (train_acc, 'train_acc'),
        (train_loss, 'train_loss'),
        (test_acc, 'test_acc'),
        (test_loss, 'test_loss'),
    )
    for values, label in curves:
        plt.plot(x, values, label=label)
    plt.title("Results", fontsize=15)
    plt.xlabel("X", fontsize=13)
    plt.ylabel("Y", fontsize=13)
    plt.legend()
    plt.savefig('results/result.png')
    plt.show()
    plt.close(fig)
def create_csv(txt_path, csv_path, random_state=None):
    """Shuffle the rows of a tab-separated reference file and save them as CSV.

    The input has no header row. The output keeps pandas' default integer
    column names as its header row and omits the row index.

    Args:
        txt_path: Path of the tab-separated input file.
        csv_path: Path of the CSV file to write.
        random_state: Optional seed forwarded to ``DataFrame.sample`` for a
            reproducible shuffle. ``None`` (the default) shuffles randomly.
    """
    # A literal tab is a one-character separator, so pandas can use its fast
    # C parser; the two-character pattern r"\t" is treated as a regex and
    # forces the slower python engine with a ParserWarning.
    rows = pd.read_csv(txt_path, sep="\t", header=None)
    rows = rows.sample(frac=1, random_state=random_state)
    rows.to_csv(csv_path, index=False)
    print("Finish save csv")