import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
# Data preprocessing:
# convert each sequence string into a one-hot encoding
def seq2onehot(seq, length=50):
    """One-hot encode a collection of DNA sequences.

    Each base maps to a 4-vector in T/C/G/A order (case-insensitive);
    any other character (e.g. 'N') maps to all zeros.  Sequences shorter
    than ``length`` are zero-padded on the right; longer ones are truncated
    (the original version hard-coded 50 and would raise an IndexError on
    longer input).

    Args:
        seq: iterable of sequence strings.
        length: output sequence length (default 50, matching the original).

    Returns:
        float32 ndarray of shape (len(seq), length, 1, 4).
    """
    base_to_vec = {
        't': [1, 0, 0, 0], 'T': [1, 0, 0, 0],
        'c': [0, 1, 0, 0], 'C': [0, 1, 0, 0],
        'g': [0, 0, 1, 0], 'G': [0, 0, 1, 0],
        'a': [0, 0, 0, 1], 'A': [0, 0, 0, 1],
    }
    unknown = [0, 0, 0, 0]
    data = np.zeros((len(seq), length, 1, 4), dtype=np.float32)
    for i, s in enumerate(seq):
        # Truncate at `length` so overlong sequences cannot overflow `data`.
        for j, base in enumerate(s[:length]):
            data[i, j, 0, :] = base_to_vec.get(base, unknown)
    return data
# random perm the sequence and expression data
def random_perm(seq, exp, shuffle_flag):
    """Shuffle feature and label arrays in unison.

    Seeding NumPy's global RNG with ``shuffle_flag`` makes the permutation
    reproducible: the same flag always yields the same ordering.

    Args:
        seq: array of features, first axis indexes samples.
        exp: array of labels, aligned with ``seq``.
        shuffle_flag: integer seed for the permutation.

    Returns:
        (shuffled_seq, shuffled_exp) — both permuted by the same index order.
    """
    order = np.arange(seq.shape[0])
    np.random.seed(shuffle_flag)
    np.random.shuffle(order)
    return seq[order], exp[order]
# --- Load and prepare data ---
# promoter: promoter sequences; expression: matching expression values
# (one per sequence) — both saved as .npy files.
promoter = np.load('./seq/promoter.npy')
expression = np.load('./seq/gene_expression.npy')
data = seq2onehot(promoter)
# Convert expression entries to float in one vectorized step (replaces the
# original element-by-element while-loop), then log2-transform the targets.
expression = np.log2(expression.astype(np.float64))
data, expression = random_perm(data, expression, shuffle_flag=3)
# (N, 50, 1, 4) -> (N, 50, 4, 1); -1 infers N from the data instead of the
# original hard-coded 11884, so the script survives a dataset size change.
data = data.reshape([-1, 50, 4, 1])
# Fixed split: first 9000 samples train, next 1000 eval, remainder test.
r = 10000
train_feature = data[0:9000]
eval_feature = data[9000:r]
test_feature = data[r:]
train_label = expression[0:9000]
eval_label = expression[9000:r]
test_label = expression[r:]
# Persist the held-out test split for later evaluation scripts.
np.save('test_feature.npy', test_feature)
np.save('test_label.npy', test_label)
class train_dataset(Dataset):
    """Dataset wrapping training features and labels for a DataLoader.

    ``__getitem__`` returns ``{'feature': FloatTensor, 'label': FloatTensor}``
    for a single sample.
    """

    def __init__(self, feature, labels):
        # feature: array-like of per-sample feature arrays.
        # labels: array-like of per-sample scalar targets.
        self.feature = feature
        self.labels = labels

    def __getitem__(self, ix):
        # Use locals instead of the original's per-item self.fea / self.lab
        # attribute writes: that shared mutable state serves no purpose and
        # is unsafe if the DataLoader ever uses worker processes.
        # Converting straight to float32 also avoids the original's
        # redundant float64 -> float32 double conversion.
        fea = np.asarray(self.feature[ix], dtype=np.float32)
        lab = np.asarray(self.labels[ix], dtype=np.float32)
        return {
            'feature': torch.from_numpy(fea),
            'label': torch.from_numpy(lab),
        }

    def __len__(self):
        return len(self.labels)
# NOTE(review): this rebinds the name `train_dataset` from the class to an
# instance, shadowing the class for the rest of the file (it happens not to
# be referenced again, but the shadowing is easy to trip over).
train_dataset = train_dataset(train_feature, train_label)
train_data = DataLoader(train_dataset, batch_size=128)
class eval_dataset(Dataset):
    """Dataset wrapping validation features and labels for a DataLoader.

    ``__getitem__`` returns ``{'feature': FloatTensor, 'label': FloatTensor}``
    for a single sample.
    """

    def __init__(self, feature, labels):
        # feature: array-like of per-sample feature arrays.
        # labels: array-like of per-sample scalar targets.
        self.feature = feature
        self.labels = labels

    def __getitem__(self, ix):
        # Locals instead of the original's per-item self.fea / self.lab
        # attribute writes (useless shared state, unsafe with DataLoader
        # workers); direct float32 conversion instead of float64 -> float32.
        fea = np.asarray(self.feature[ix], dtype=np.float32)
        lab = np.asarray(self.labels[ix], dtype=np.float32)
        return {
            'feature': torch.from_numpy(fea),
            'label': torch.from_numpy(lab),
        }

    def __len__(self):
        return len(self.labels)
# NOTE(review): as with train_dataset above, this rebinds the class name
# `eval_dataset` to an instance.
eval_dataset = eval_dataset(eval_feature, eval_label)
eval_data = DataLoader(eval_dataset, batch_size=128)
class PREDICT(nn.Module):
    """CNN regressor: conv-ReLU-pool, conv-ReLU, conv-ReLU-pool, FC-ReLU-FC.

    Expects input of shape (batch, 50, 4, 1): the 50 sequence positions act
    as input channels and the convolutions slide along the 4-wide base axis.
    Produces a (batch,) tensor of predicted (log2) expression values.
    """

    def __init__(self):
        super().__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(50, 100, kernel_size=(6, 1), padding=(3, 0)),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 1)),
        )
        self.layer2 = nn.Sequential(
            nn.Conv2d(100, 200, kernel_size=(5, 1), padding=(3, 0)),
            nn.ReLU(),
        )
        self.layer3 = nn.Sequential(
            nn.Conv2d(200, 200, kernel_size=(6, 1), padding=(3, 0)),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 1)),
            nn.Flatten(),
        )
        # 400 = 200 channels * 2 * 1 spatial after layer3 (see the commented
        # shape probe further down in the file).
        self.fc = nn.Sequential(
            nn.Linear(400, 1024),
            nn.ReLU(),
            nn.Linear(1024, 1),
        )

    def forward(self, x):
        # Run the stages in order, then drop the trailing singleton:
        # (batch, 1) -> (batch,).
        for stage in (self.layer1, self.layer2, self.layer3, self.fc):
            x = stage(x)
        return x.squeeze(-1)
# # Shape probe used to determine the fully-connected layer's input size:
# image = torch.randn(1000, 50, 4, 1)
# net = PREDICT()
# out = net.layer1(image)
# out = net.layer2(out)
# out = net.layer3(out)
# # out = net.fc(out)
# a = out.size()
# print(a)
#
# --- Training setup ---
torch.manual_seed(101)  # reproducible weight initialization
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = PREDICT()
model.to(device)
# print(model)
# Loss function: mean squared error on the log2 expression targets.
criterion = nn.MSELoss()
criterion = criterion.to(device)
# # Optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=0.0005)
# # Training-loop configuration
EPOCHS = 56
Loss = []      # per-epoch mean training loss
Val_loss = []  # per-epoch mean validation loss
for epoch in range(EPOCHS):
    # --- Training pass ---
    model.train()
    epoch_train_loss = 0.0
    train_steps = 0
    for train_batch, data in enumerate(train_data):
        seq = data['feature'].to(device)
        exp = data['label'].to(device)
        optimizer.zero_grad()                          # clear accumulated gradients
        train_prediction = model(seq)
        batch_train_loss = criterion(train_prediction, exp)
        batch_train_loss.backward()
        optimizer.step()
        epoch_train_loss += batch_train_loss.item()    # accumulate epoch loss
        train_steps += 1
    # --- Validation pass (no gradient tracking) ---
    model.eval()
    epoch_test_loss = 0.0
    eval_steps = 0
    with torch.no_grad():
        for eval_batch, data in enumerate(eval_data):
            seq = data['feature'].to(device)
            exp = data['label'].to(device)
            eval_prediction = model(seq)
            eval_loss = criterion(eval_prediction, exp)
            epoch_test_loss += eval_loss.item()
            eval_steps += 1
    # Average per batch using the actual batch counts. The original divided
    # by hard-coded 71 and 8, which only hold for exactly 9000/1000 samples
    # at batch_size=128 — and left train_steps computed but unused.
    mean_train_loss = epoch_train_loss / max(train_steps, 1)
    mean_val_loss = epoch_test_loss / max(eval_steps, 1)
    Loss.append(mean_train_loss)
    Val_loss.append(mean_val_loss)
    # Save after every epoch so progress survives an interrupted run.
    np.save('Loss.npy', Loss)
    np.save('Val_loss.npy', Val_loss)
    print(f'Epoch: {epoch + 1:2} Loss: {mean_train_loss} Val_loss: {mean_val_loss}')
#
# for seq, exp in zip(train_feature, train_label):
# # 每次更新参数前都梯度归零和初始化
# optimizer.zero_grad()
# seq = torch.tensor(seq)
# seq = seq.unsqueeze(0)
# seq = seq.to(device)
# exp = torch.tensor(exp).to(torch.float32)
# exp = exp.to(device)
# train_prediction = model(seq).to(torch.float32)
# loss = criterion(train_prediction, exp)
# loss.backward()
# optimizer.step()
# print(f'Epoch: {epoch + 1:2} Loss: {loss.item():10.8f}')
# NOTE(review): this pickles the entire module object (loading depends on
# the class import path); saving model.state_dict() is the more portable
# convention — confirm downstream loaders before changing.
torch.save(model, 'CNN_train.pth')
# Reproduction of "CNNpredictor" (blog-scrape footer; original post
# published 2023-09-19 10:41:03). Kept as a comment — the bare text was a
# Python syntax error.