Preface
For my notes on the GCN paper, see my blog post GCN图卷积神经网络论文阅读. Today we will build a graph convolutional network by hand.
Data
The data I use is the CATH data introduced in my blog post A protein classification neural network based on Convolution and Attention, but this time we use protein data that carries spatial structure and extra connectivity.
As before, seq is the amino-acid sequence; node_pos holds the 3D coordinates of five atoms per residue (the nitrogen, the α-carbon, the carboxyl carbon, the carboxyl oxygen, and the β-carbon); node_idx is used to slice seq and node_pos per protein; edge_nho stores the connections beyond the backbone links between consecutive residues; edge_idx slices edge_nho per protein; lab is the same as before, and we one-hot encode its dimension 0.
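To make the index arrays concrete, here is a minimal sketch that prints the shapes and slices out a single protein. It assumes the dataset names used by load() below and the file path used in the training section; both are taken from the code in this post, not from the HDF5 file itself.

    import h5py

    # A quick look at the HDF5 layout described above (dataset names assumed from load() below).
    with h5py.File('../cath/hdf5/struct256.hdf5', 'r') as f:
        for name in ('node_seq', 'node_pos', 'node_idx', 'edge_nho', 'edge_idx', 'label'):
            print(name, f[name].shape)
        # protein i owns rows node_idx[i] .. node_idx[i+1] of node_seq/node_pos,
        # and columns edge_idx[i] .. edge_idx[i+1] of edge_nho
        i = 0
        node_idx = f['node_idx'][()]
        edge_idx = f['edge_idx'][()]
        seq_i = f['node_seq'][node_idx[i]:node_idx[i+1]]     # amino-acid codes of protein 0
        pos_i = f['node_pos'][node_idx[i]:node_idx[i+1]]     # (len_seq, 5, 3) atom coordinates
        nho_i = f['edge_nho'][:, edge_idx[i]:edge_idx[i+1]]  # extra (hydrogen-bond) edges
        print(len(seq_i), pos_i.shape, nho_i.shape)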
The main data-processing code is shown below:
import h5py
import torch as pt
from torch_geometric.data import Data, Dataset

def load(fn):
    with h5py.File(fn, 'r') as f:
        seq = f['node_seq'][()]
        node_pos = f['node_pos'][()]
        node_idx = f['node_idx'][()]
        edge_nho = f['edge_nho'][()]
        edge_idx = f['edge_idx'][()]
        lab = f['label'][()]
    return seq, node_pos, node_idx, edge_nho, edge_idx, lab
class ProteinDataset(Dataset):
    def __init__(self, dataset, mapping=None):
        super(ProteinDataset, self).__init__()
        if isinstance(dataset, tuple):  # raw data
            self.seq = pt.tensor(dataset[0], dtype=pt.int32)
            self.node_pos = pt.tensor(dataset[1], dtype=pt.float32)
            self.node_idx = pt.tensor(dataset[2])
            self.edge_nho = pt.tensor(dataset[3], dtype=pt.int32)
            self.edge_idx = pt.tensor(dataset[4])
            self.lab = pt.tensor(dataset[5])
            self.map = pt.arange(len(self.lab), dtype=pt.int32)  # identity mapping
            assert len(self.seq) == self.node_idx[-1]
            assert len(self.lab) == len(self.node_idx) - 1
            assert len(self.lab) == len(self.edge_idx) - 1
        else:  # structured data
            self.seq = dataset.seq
            self.node_pos = dataset.node_pos
            self.node_idx = dataset.node_idx
            self.edge_nho = dataset.edge_nho
            self.edge_idx = dataset.edge_idx
            self.lab = dataset.lab
            self.map = mapping
            assert self.map is not None
            assert pt.max(self.map) < len(self.lab)
        # self.map defines a subset of the data; the underlying arrays always hold the full dataset

    def get(self, idx):
        idx_ = self.map[idx]
        seq = self.seq[self.node_idx[idx_] : self.node_idx[idx_+1]]
        len_seq = len(seq)
        seq_ = pt.zeros(len_seq, 21, dtype=pt.float32)
        seq_[pt.arange(len_seq), seq[:] - 1] = 1.0
        # shape: (len_seq, 5, 3)  0: N, 1: α-carbon, 2: carboxyl C, 3: carboxyl O, 4: β-carbon
        node_pos = self.node_pos[self.node_idx[idx_] : self.node_idx[idx_+1]]
        # The N-Cα-Cβ angle reflects the residue's spatial structure; with bond lengths fixed,
        # the N-Cβ distance determines this angle
        seq_[:, 20] = pt.sqrt(pt.sum((node_pos[:, 0] - node_pos[:, 4])**2, dim=1))
        # connectivity (hydrogen bonds)
        edge_nho = pt.stack((self.edge_nho[0][self.edge_idx[idx_] : self.edge_idx[idx_+1]],
                             self.edge_nho[1][self.edge_idx[idx_] : self.edge_idx[idx_+1]]), dim=0)
        # connectivity (peptide bonds)
        edge_tai = pt.stack((pt.arange(0, len_seq-1), pt.arange(1, len_seq)), dim=0)
        # combined connectivity
        edge_idx = pt.cat((edge_tai, edge_nho), dim=1)
        # edge lengths
        nho_attr = pt.empty(edge_nho.shape[1])
        # hydrogen bond: attraction between the amide hydrogen and the carboxyl oxygen;
        # the amide nitrogen's coordinates stand in for the hydrogen's
        nho_attr[:] = pt.sqrt(pt.sum((node_pos[edge_nho[0][:], 0] - node_pos[edge_nho[1][:], 3])**2, dim=1))
        tai_attr = pt.empty(edge_tai.shape[1])
        # carboxyl carbon bonded to the next residue's amide nitrogen
        tai_attr[:] = pt.sqrt(pt.sum((node_pos[edge_tai[0][:], 2] - node_pos[edge_tai[1][:], 0])**2, dim=1))
        edge_attr = pt.cat((tai_attr, nho_attr))[:, None]
        lab = self.lab[idx_]
        one_hot = pt.zeros(6630, dtype=pt.float32)
        one_hot[lab[0]] = 1.0
        graph = Data(x=seq_, edge_index=edge_idx, edge_attr=edge_attr, y=one_hot)
        return graph

    def len(self):
        return len(self.map)  # the size of the subset is the size of map
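As a quick sanity check, we can build the dataset and inspect one graph. This is only a sketch: it assumes the same HDF5 file path that the training section below uses.

    # Minimal sanity check of the Dataset above (file path taken from the training section below).
    data_ = load('../cath/hdf5/struct256.hdf5')
    dataset = ProteinDataset(data_)
    g = dataset.get(0)
    print(g)  # roughly: Data(x=[len_seq, 21], edge_index=[2, E], edge_attr=[E, 1], y=[6630])
    print(g.x.shape, g.edge_index.shape, g.edge_attr.shape, g.y.shape)
    print(g.num_nodes, g.num_edges, int(g.y.argmax()))  # residue count, edge count, class index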
Model
Here I define a custom GCN convolution. For the PyG (PyTorch Geometric) APIs used below, see the official documentation: https://pytorch-geometric.readthedocs.io/en/latest/.
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric.nn as gnn
from torch_geometric.utils import add_self_loops, degree

class GCNConv(gnn.MessagePassing):
    def __init__(self, emb_dim: int):
        super().__init__(aggr='add')
        self.bias = nn.Parameter(pt.Tensor(emb_dim))
        self.reset_parameters()

    def reset_parameters(self):
        self.bias.data.zero_()

    def forward(self, x, edge_index, edge_attr):
        # Step 1: Add self-loops to the adjacency matrix.
        edge_index, _ = add_self_loops(edge_index, edge_attr, fill_value=.0, num_nodes=x.size(0))
        # Step 2: Compute the symmetric normalization.
        row, col = edge_index
        deg = degree(row, x.size(0), dtype=x.dtype)
        deg_inv_sqrt = deg.pow(-0.5)
        deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
        norm = deg_inv_sqrt[row] * deg_inv_sqrt[col]
        # Step 3: Start propagating messages.
        out = self.propagate(edge_index, x=x, edge_attr=edge_attr, norm=norm) + self.bias
        return out

    def message(self, x_j, norm):
        # x_j has shape [E, out_channels]
        # Step 4: Normalize the neighbor features.
        return norm.view(-1, 1) * x_j
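Step 2 is the standard GCN normalization: after self-loops are added, the message sent from node j to node i is scaled by 1/sqrt(deg(i) * deg(j)). A tiny sketch on a 3-node path graph (not part of the protein data, just an illustration reusing the imports above) makes the numbers concrete:

    # Illustration of the normalization in Step 2 on a toy path graph 0-1-2.
    edge_index = pt.tensor([[0, 1, 1, 2],
                            [1, 0, 2, 1]])                  # both directions of each edge
    edge_index, _ = add_self_loops(edge_index, num_nodes=3)
    row, col = edge_index
    deg = degree(row, 3, dtype=pt.float32)
    norm = deg.pow(-0.5)[row] * deg.pow(-0.5)[col]
    print(deg)   # tensor([2., 3., 2.]) after adding self-loops
    print(norm)  # e.g. the 0 -> 1 message is scaled by 1/sqrt(2*3) ≈ 0.408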
class HeteroGCNConv(nn.Module):
    def __init__(self, in_channels, out_channels) -> None:
        super(HeteroGCNConv, self).__init__()
        self.lin = nn.Linear(in_channels, out_channels, bias=False)
        self.conv1 = GCNConv(out_channels)
        self.conv2 = GCNConv(out_channels)

    def forward(self, x, edge_index, edge_attr):
        # edge_index: [2, N-1+num_nho]  edge_attr: [N-1+num_nho, 1]
        N = x.size(0)
        # the first N-1 edges are peptide bonds; make them bidirectional
        tai_idx = pt.stack((edge_index[0, :N-1], edge_index[1, :N-1]), dim=0)
        tai_idx = pt.cat((tai_idx, tai_idx.flip(0)), dim=1)
        # the remaining edges are hydrogen bonds; make them bidirectional as well
        nho_idx = pt.stack((edge_index[0, N-1:], edge_index[1, N-1:]), dim=0)
        nho_idx = pt.cat((nho_idx, nho_idx.flip(0)), dim=1)
        tai_attr = edge_attr[:N-1]
        tai_attr = pt.cat((tai_attr, tai_attr), dim=0)
        nho_attr = edge_attr[N-1:]
        nho_attr = pt.cat((nho_attr, nho_attr), dim=0)
        # project the node features, then convolve over the two edge types in turn
        x = self.lin(x)
        x = F.relu(x)
        x = self.conv1(x, tai_idx, tai_attr)
        x = F.relu(x)
        x = self.conv2(x, nho_idx, nho_attr)
        return x
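For a single graph the layer can be exercised directly. This is only a sketch reusing the dataset object from the sanity check above; the edge-type split relies on get() putting the N-1 peptide-bond edges before the hydrogen-bond edges, which holds for one graph from the dataset.

    # Single-graph check of HeteroGCNConv (uses `dataset` from the data section above).
    g = dataset.get(0)
    conv = HeteroGCNConv(in_channels=21, out_channels=128)
    h = conv(g.x, g.edge_index, g.edge_attr)
    print(g.x.shape, '->', h.shape)  # [len_seq, 21] -> [len_seq, 128]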
class ProteinGCN(nn.Module):
    def __init__(self, in_channels: int, batch_size: int, out_channels=6630) -> None:
        super(ProteinGCN, self).__init__()
        self.batch_size = batch_size
        self.out_channels = out_channels
        self.conv1 = HeteroGCNConv(in_channels, 128)
        self.conv2 = HeteroGCNConv(128, 256)
        self.conv3 = HeteroGCNConv(256, 512)
        self.conv4 = HeteroGCNConv(512, 1024)
        self.JK = gnn.JumpingKnowledge('cat', [128, 256, 512, 1024])
        self.lin = nn.Linear(1920, out_channels)  # 1920 = 128 + 256 + 512 + 1024

    def forward(self, data):
        x, edge_index, edge_attr, batch = data.x, data.edge_index, data.edge_attr, data.batch
        x1 = self.conv1(x, edge_index, edge_attr)
        x2 = self.conv2(x1, edge_index, edge_attr)
        x3 = self.conv3(x2, edge_index, edge_attr)
        x4 = self.conv4(x3, edge_index, edge_attr)
        x = self.JK([x1, x2, x3, x4])       # concatenate the outputs of all four layers
        x = gnn.global_mean_pool(x, batch)  # one vector per graph
        x = self.lin(x)
        return x
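The JumpingKnowledge module in 'cat' mode simply concatenates the per-layer node embeddings, which is why self.lin expects 128 + 256 + 512 + 1024 = 1920 input channels. A quick shape check on a single-graph batch (a sketch; it reuses the dataset object from above, and in_channels=21 follows from the 20 one-hot amino-acid channels plus the distance feature in the data code):

    # Shape check of ProteinGCN on a batch holding one graph.
    from torch_geometric.data import Batch

    batch = Batch.from_data_list([dataset.get(0)])
    model = ProteinGCN(in_channels=21, batch_size=1, out_channels=6630)
    out = model(batch)
    print(out.shape)                                   # torch.Size([1, 6630])
    print(sum(p.numel() for p in model.parameters()))  # total trainable parameters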
Training
Training uses the AdamW optimizer (Adam with decoupled weight decay). Again, only the main code is shown:
import logging
from sklearn.model_selection import train_test_split
from torch_geometric.loader import DataLoader

data_ = load('../cath/hdf5/struct256.hdf5')
dataset = ProteinDataset(data_)
datamap = pt.arange(len(dataset), dtype=pt.int32)
trainmap, validmap = train_test_split(datamap, test_size=20480, random_state=7)
validmap, testmap = train_test_split(validmap, test_size=10240, random_state=7)
trainset = ProteinDataset(dataset, trainmap)
validset = ProteinDataset(dataset, validmap)
testset = ProteinDataset(dataset, testmap)

batchsize = 256
trainloader = DataLoader(trainset, batch_size=batchsize, shuffle=True, drop_last=True, num_workers=6)
validloader = DataLoader(validset, batch_size=batchsize, shuffle=False, drop_last=False, num_workers=6)
testloader = DataLoader(testset, batch_size=batchsize, shuffle=False, drop_last=False, num_workers=6)

# node features are 21-dimensional: 20 one-hot amino-acid channels plus the N-Cβ distance
prot_model = ProteinGCN(in_channels=21, batch_size=batchsize, out_channels=6630).cuda()
criterion = pt.nn.CrossEntropyLoss()
learning_rate = 0.01
optimizer = pt.optim.AdamW(prot_model.parameters(), lr=learning_rate)
num_epochs = 15
logging.basicConfig(level=logging.INFO)

def main():
    for epoch in range(num_epochs):
        train_loss = []
        prot_model.train()
        for g in trainloader:
            g = g.cuda()
            lab = g.y.view(batchsize, -1)
            output = prot_model(g)
            loss = criterion(output, lab)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss.append(loss.item())
        # scheduler.step()
        valid_acc = []
        prot_model.eval()  # evaluation mode: disables dropout and similar training-only behavior
        with pt.no_grad():  # no gradients needed, saves memory
            for g in validloader:
                g = g.cuda()
                lab = g.y.view(-1, prot_model.out_channels)  # the last batch may be smaller than batchsize
                output = prot_model(g)  # [num_graphs, num_class]
                output = pt.argmax(output, dim=1, keepdim=True)  # [num_graphs, 1]
                acc = float(pt.sum(output == pt.argmax(lab, dim=1, keepdim=True))) / lab.size(0)
                valid_acc.append(acc)
        logging.info("epoch: %d\n train loss: %.4f \n valid acc: %.2f%%" % (epoch + 1,
                     pt.mean(pt.tensor(train_loss)).numpy(),
                     pt.mean(pt.tensor(valid_acc)).numpy() * 100))
    test_acc = []
    prot_model.eval()
    with pt.no_grad():
        for g in testloader:
            g = g.cuda()
            lab = g.y.view(-1, prot_model.out_channels)
            output = prot_model(g)
            output = pt.argmax(output, dim=1, keepdim=True)
            acc = float(pt.sum(output == pt.argmax(lab, dim=1, keepdim=True))) / lab.size(0)
            test_acc.append(acc)
    logging.info("test acc: %.2f%%" % (pt.mean(pt.tensor(test_acc)).numpy() * 100))

if __name__ == '__main__':
    main()
Conclusion
The complete code is on GitHub: bioinfo.