DGL教程
1 构建图graph
/data文件夹下
方法1:dgl.graph图构造器
pandas读取原始数据
import pandas as pd  # used to read the raw data
df = pd.read_csv("/data/onlyknowndiseasemirnainteraction.txt",header=None, sep='\t')
# read line by line; fields are separated by the tab character ('\t'); no header row
利用dgl.graph图构造器构造无特征的图数据
dgl图构造器来构造图,需要知道
- 节点的个数:本例中(383+495=878)
- 边列表;dgl中的节点序号是从0开始,要用pandas把数据处理成从0开头
- 依次添加节点特征和边特征
import dgl
import numpy as np
import torch
df = df-1 # node ids in the raw data start at 1, so shift every id by -1 (DGL node ids start at 0)
g = dgl.graph((df[0].tolist(),df[1].tolist()),num_nodes = 878)
# df[0].tolist() turns column 0 (source node ids) into a plain Python list; column 1 holds the targets
print(g) #print the graph to inspect its basic information
print(g.edges()) #print the edge list of the graph
Graph(num_nodes=878, num_edges=5430,
ndata_schemes={}
edata_schemes={})
此时图中没有节点和边的特征
(tensor([ 0, 1, 2, …, 13, 115, 147]),
tensor([495, 495, 495, …, 877, 877, 877]))
为dgl.graph图添加特征
注意:往图中添加的特征必须都是tensor格式
# Use the similarity matrices as node features.
import torch
import numpy as np
# We treat the graph as homogeneous here, so both node types need feature
# vectors of the same width: the narrower disease features (383-dim) are
# zero-padded up to the miRNA feature width (495).
# 1) read the miRNA functional-similarity features (495 x 495)
df_m = pd.read_csv("~/data/hmddv20/4.miNA functional simialrity/functional similarity matrix.txt",sep='\t',header=None)
# 2) read the two disease semantic-similarity matrices and average them
df_d2 = pd.read_csv("~/data/hmddv20/3.disease semantic similarity 2/disease semantic similarity matrix 2.txt", sep='\t', header=None)
df_d1 = pd.read_csv("~/data/hmddv20/2.disease semantic similarity 1/disease semantic similarity matrix 1.txt", sep='\t', header=None)
df_d = ((df_d1+df_d2)/2).values
# Derive the sizes from the data instead of hard-coding 495/383/878,
# so the snippet keeps working if the dataset changes.
n_mirna = df_m.shape[0]    # 495 in this dataset
n_disease = df_d.shape[0]  # 383 in this dataset
features = np.zeros((g.num_nodes(), n_mirna), dtype=float)
# rows 0..n_mirna-1 hold the miRNA features
features[0:n_mirna, 0:n_mirna] = df_m.values
# rows n_mirna.. hold the (zero-padded) disease features
features[n_mirna:n_mirna + n_disease, 0:n_disease] = df_d
# Everything attached to the graph must be a torch tensor.
torch_features = torch.Tensor(features)
# one feature vector per node
g.ndata['feat'] = torch_features
# One-dimensional all-ones vector per edge, used as a label;
# g.num_edges() replaces the hard-coded 5430.
g.edata['edges'] = torch.ones(g.num_edges(), 1)
查看dgl.graph图中的基本信息
print(g.num_nodes())
print(g.num_edges())
# out-degree of node 0 (the original comment said "node 1", but the call queries id 0)
print(g.out_degrees(0))
# in-degree of node 0
print(g.in_degrees(0))
878
5430
27
0
dgl.graph中的实例化子图、图保存和加载等操作
- 从graph中实例化子图-基于节点
# Induce a subgraph from nodes 0, 1 and 3 of the original graph
sg1 = g.subgraph([0, 1, 3])
- 从graph中实例化子图-基于边
# Induce a subgraph from edges 0, 1 and 3 of the original graph
sg2 = g.edge_subgraph([0, 1, 3])
- 给每条边添加反向边,原始边的特征会删除,但原始节点的特征会保留
# Add a reverse edge for every edge; add_reverse_edges drops the original
# edge features but keeps the node features.
newg = dgl.add_reverse_edges(g)
newg.edges()  # inspect the doubled edge list
# Re-create the per-edge feature; newg.num_edges() replaces the
# hard-coded 10860 so this works for any graph size.
newg.edata['edges'] = torch.ones(newg.num_edges(), 1)
- 图保存和加载
# save graph
dgl.save_graphs('data/graph_md_xsd.dgl',newg) # relative path; NOTE(review): original note claims a missing folder is created automatically — confirm for your DGL version
# load graph
hmdd20_g = dgl.load_graphs('data/graph_md_xsd.dgl')
print(hmdd20_g)  # load_graphs returns a (graph_list, labels) tuple, so the list wrapper is printed
([Graph(num_nodes=878, num_edges=10860,
ndata_schemes={‘feat’: Scheme(shape=(495,), dtype=torch.float32)}
edata_schemes={‘edges’: Scheme(shape=(1,), dtype=torch.float32)})])
方法2:dgl.DGLgraph()异质图构造器
# 2数据准备
# g为miRNA和disease的同质(一种关联类型)关联图,g0为miRNA、disease异质关联图(多种关联类型),所有样本的疾病节点向量list,所有样本的基因节点向量
# ndarray: ID【383,383】,IM【495,495】
g, g0,disease_vertices, mirna_vertices, ID, IM, samples = build_graph(
    directory, random_seed)  # build the association graphs
# 返回二元异质图dgl,三元异质图dgl,所有样本的疾病节点向量list,所有样本的的基因节点向量list,ndarray疾病特征,基因特征,l基因特征,ndarray所有样本,ndarray [19,3]:ml样本,[677,3]:ld样本
# return g, g0, sample_disease_vertices, sample_mirna_vertices, ID, IM, samples
# DGLHeteroGraph:
#g:Graph(
# num_nodes=878,
# num_edges=21720,
# ndata_schemes={'type': Scheme(shape=(), dtype=torch.int64),
# 'd_sim': Scheme(shape=(383,), dtype=torch.float32),
# 'm_sim': Scheme(shape=(495,), dtype=torch.float32)}
# edata_schemes = {'label': Scheme(shape=(), dtype=torch.float32)}
# )
#g0:Graph(
# num_nodes=878, num_edges=25078,
# ndata_schemes={'type': Scheme(shape=(), dtype=torch.int64), 'd_sim': Scheme(shape=(383,), dtype=torch.float32), 'm_sim': Scheme(shape=(495,), dtype=torch.float32)}
# edata_schemes={'dm': Scheme(shape=(), dtype=torch.float32), 'md': Scheme(shape=(), dtype=torch.float32), 'multi_label': Scheme(shape=(), dtype=torch.float32)}
# )
# list : disease_vertices[10860],mirna_vertices[10860]
# narray : samples[10860*3]
def load_data(directory, random_seed):
    """Load similarity matrices and balanced miRNA-disease association samples.

    Parameters
    ----------
    directory : str
        Folder holding the similarity matrices (``*.txt``) and the
        association CSV files.
    random_seed : int
        Seed for the negative-sample draw, so the split is reproducible.

    Returns
    -------
    ID : ndarray [n_disease, n_disease]
        Integrated disease similarity: the averaged semantic similarity
        with zero entries filled from the Gaussian similarity.
    IM : ndarray [n_mirna, n_mirna]
        Integrated miRNA similarity: functional similarity with zero
        entries filled from the Gaussian similarity.
    multi_md_associations : ndarray [n_multi, 3]
        Multi-class (miRNA, disease, label) associations.
    samples : ndarray [2*n_pos, 3]
        All known associations plus an equal number of randomly drawn
        unknown (label 0) associations; positives come first.
    """
    # --- read the feature matrices -------------------------------------
    D_SSM1 = np.loadtxt(directory + '/D_SSM1.txt')  # disease semantic similarity 1 (np.loadtxt defaults to float)
    D_SSM2 = np.loadtxt(directory + '/D_SSM2.txt')  # disease semantic similarity 2
    D_GSM = np.loadtxt(directory + '/D_GSM.txt')    # disease Gaussian similarity
    M_FSM = np.loadtxt(directory + '/M_FSM.txt')    # miRNA functional similarity
    M_GSM = np.loadtxt(directory + '/M_GSM.txt')    # miRNA Gaussian similarity
    # multi-class miRNA-disease associations (positives only)
    multi_md_associations1 = pd.read_csv(directory + '/wfy_multi_all_mirna_disease_pairs_without_negative.csv',
                                         names=['miRNA', 'disease', 'label'])
    # all miRNA-disease pairs (label 0 = unknown/negative, non-zero = known)
    all_associations = pd.read_csv(directory + '/all_mirna_disease_pairs.csv',
                                   names=['miRNA', 'disease', 'label'])
    # --- integrate the similarities (Eqs. 16/17 of the source paper) ---
    # np.where replaces the original O(n^2) Python double loop and, unlike
    # the old in-place assignment, does not mutate the source matrices.
    D_SSM = (D_SSM1 + D_SSM2) / 2
    ID = np.where(D_SSM == 0, D_GSM, D_SSM)  # fill zero entries from the Gaussian similarity
    IM = np.where(M_FSM == 0, M_GSM, M_FSM)
    # --- balanced positive/negative sampling ---------------------------
    # label != 0 keeps all known associations (supports multi-class labels,
    # which is why "== 1" is not used)
    known_associations = all_associations.loc[all_associations['label'] != 0]
    unknown_associations = all_associations.loc[all_associations['label'] == 0]
    # draw as many negatives as there are positives, reproducibly
    random_negative = unknown_associations.sample(n=known_associations.shape[0],
                                                  random_state=random_seed, axis=0)
    # DataFrame.append was removed in pandas 2.0 -> use pd.concat instead
    sample_df = pd.concat([known_associations, random_negative])
    # the selected rows keep their original (sparse, shuffled) index;
    # reset it to a clean 0..N-1 range
    sample_df.reset_index(drop=True, inplace=True)
    multi_md_associations1.reset_index(drop=True, inplace=True)
    multi_md_associations = multi_md_associations1.values
    samples = sample_df.values  # plain numpy array of the re-indexed samples
    return ID, IM, multi_md_associations, samples
def build_graph(directory, random_seed):
    """Build a binary miRNA-disease graph ``g`` and a multi-relation graph ``g0``.

    Parameters
    ----------
    directory : str
        Data folder, passed straight through to load_data().
    random_seed : int
        Seed forwarded to load_data() for the negative-sample draw.

    Returns
    -------
    (g, g0, sample_disease_vertices, sample_mirna_vertices, ID, IM, samples)
        g  : DGLGraph with one 'label' value per edge.
        g0 : DGLGraph carrying 'dm'/'md' edge data for the sampled pairs
             plus 'multi_label' edges from the multi-class table.
        sample_disease_vertices / sample_mirna_vertices : list of 0-based
             node ids, one per sampled association.
        ID, IM : integrated disease / miRNA similarity matrices.
        samples : ndarray [n, 3] of sampled (miRNA, disease, label) rows.
    """
    # Load feature matrices ID [383,383] and IM [495,495], the multi-class
    # association samples, and the balanced positive/negative samples.
    ID,IM,multi_md_associations, samples = load_data(directory,random_seed)
    # ---- binary miRNA-disease heterogeneous graph ----------------------
    # 1.1 create the graph and load node features.
    # dgl.DGLGraph() instantiates a mutable DGLGraph with the default
    # constructor (NOTE(review): this mutable-graph API is deprecated in
    # newer DGL releases — confirm the installed version supports it).
    g = dgl.DGLGraph()
    # g = dgl.graph([]) # alternative: dgl.graph([]) builds a DGLGraph via the graph-construction op
    g.add_nodes(ID.shape[0] + IM.shape[0])  # node count = #diseases + #miRNAs (383 + 495)
    # DGL expects tensors; start with every node typed 0 (= miRNA)
    node_type = torch.zeros(g.number_of_nodes(), dtype=torch.int64)
    node_type[: ID.shape[0]] = 1  # first ID.shape[0] nodes are diseases (type 1); the remaining nodes stay type 0 (miRNA)
    g.ndata['type'] = node_type  # node type stored as node data
    # Disease node features: fill the disease rows, leave the rest zero.
    # When different node types get different features, each feature
    # tensor covers all nodes and the unused rows are zero-padded.
    d_sim = torch.zeros(g.number_of_nodes(), ID.shape[1])
    d_sim[: ID.shape[0], :] = torch.from_numpy(ID.astype('float32'))
    g.ndata['d_sim'] = d_sim  # disease node features
    # miRNA node features for nodes ID.shape[0] .. ID.shape[0]+IM.shape[0]-1
    m_sim = torch.zeros(g.number_of_nodes(), IM.shape[1])
    m_sim[ID.shape[0]: ID.shape[0]+IM.shape[0], :] = torch.from_numpy(IM.astype('float32'))
    g.ndata['m_sim'] = m_sim
    # 1.2 build the edge lists.
    # Raw ids start at 1; build maps shifting them to 0-based node ids.
    disease_ids = list(range(1, ID.shape[0]+1))  # raw disease ids 1..383
    mirna_ids = list(range(1, IM.shape[0]+1))    # raw miRNA ids 1..495
    disease_ids_invmap = {id_: i for i, id_ in enumerate(disease_ids)}  # raw disease id -> 0-based index
    mirna_ids_invmap = {id_: i for i, id_ in enumerate(mirna_ids)}      # raw miRNA id -> 0-based index
    # endpoints of every sampled association edge
    sample_disease_vertices = [disease_ids_invmap[id_] for id_ in samples[:, 1]]  # disease column -> node ids 0..382
    sample_mirna_vertices = [mirna_ids_invmap[id_] + ID.shape[0] for id_ in samples[:, 0]]  # miRNA ids offset past the disease block
    # Add labelled edges; labels and features must be tensors.
    g.add_edges(sample_disease_vertices, sample_mirna_vertices,
                data={'label': torch.from_numpy(samples[:, 2].astype('float32'))})
    # Adding the reverse of every edge makes the DGL graph effectively undirected.
    g.add_edges(sample_mirna_vertices, sample_disease_vertices,
                data={'label': torch.from_numpy(samples[:, 2].astype('float32'))})
    g.readonly()  # freeze the graph against further mutation
    # g2.readonly()
    # ---- multi-relation miRNA-disease heterogeneous graph --------------
    # 1.1 build the graph, add nodes, set node types
    g0 = dgl.DGLGraph()
    # g0.add_nodes(ID.shape[0] + IM.shape[0] + IL.shape[0])
    g0.add_nodes(ID.shape[0] + IM.shape[0])
    node_type = torch.zeros(g0.number_of_nodes(), dtype=torch.int64)  # all zeros initially
    node_type[: ID.shape[0]] = 1  # diseases marked as type 1
    node_type[ID.shape[0] + IM.shape[0]:] = 2  # no-op here: the graph has exactly ID+IM nodes (leftover from a 3-node-type variant)
    g0.ndata['type'] = node_type
    # 1.2 node features, same layout as for g
    d_sim = torch.zeros(g0.number_of_nodes(), ID.shape[1])
    d_sim[: ID.shape[0], :] = torch.from_numpy(ID.astype('float32'))
    g0.ndata['d_sim'] = d_sim
    m_sim = torch.zeros(g0.number_of_nodes(), IM.shape[1])
    m_sim[ID.shape[0]: ID.shape[0]+IM.shape[0], :] = torch.from_numpy(IM.astype('float32'))
    g0.ndata['m_sim'] = m_sim  # one row per node; the miRNA rows hold the miRNA features
    # endpoints of the multi-class association edges (node/edge ids are 0-based)
    multi_disease_vertices = [disease_ids_invmap[id_] for id_ in multi_md_associations[:, 1]]
    multi_mirna_vertices = [mirna_ids_invmap[id_] + ID.shape[0] for id_ in multi_md_associations[:, 0]]
    # add the sampled association edges in both directions (d->m and m->d)
    g0.add_edges(sample_disease_vertices, sample_mirna_vertices,  # first batch of edge ids; vertices are Python lists
                 data={'dm': torch.from_numpy(samples[:, 2].astype('float32'))})
    g0.add_edges(sample_mirna_vertices, sample_disease_vertices,  # second batch: reverse direction
                 data={'md': torch.from_numpy(samples[:, 2].astype('float32'))})
    # add the multi-class edges; labels must be tensors
    g0.add_edges(multi_disease_vertices, multi_mirna_vertices,
                 data={'multi_label': torch.from_numpy(multi_md_associations[:, 2].astype('float32'))})
    g0.add_edges(multi_mirna_vertices, multi_disease_vertices,  # reverse edges for an undirected graph
                 data={'multi_label': torch.from_numpy(multi_md_associations[:, 2].astype('float32'))})
    g0.readonly()
    # return g, g0,g2, sample_disease_vertices, sample_mirna_vertices, ID, IM, samples
    return g, g0, sample_disease_vertices, sample_mirna_vertices, ID, IM, samples
2构建数据集并划分
/main.py内构建数据集并划分
方法1:划分边的顶点列表
代码参考DEMLP
划分训练/测试集——就是划分边所对应的顶点列表(官方例子中的划分方式)
构建正负样本的顶点对,用顶点列表的形式构建训练集合和测试集。
import scipy.sparse as sp  # was used below but never imported in the snippet

# dgl.load_graphs returns a (graph_list, labels) tuple; take the first
# graph. The original code assigned the whole tuple to g, so the later
# g.edges() call would fail with an AttributeError.
g = dgl.load_graphs('data/graph_md_xsd.dgl')[0][0]
# Split the edge set for training and testing
u, v = g.edges()
eids = np.arange(g.number_of_edges())  # edge ids 0..E-1
eids = np.random.permutation(eids)     # shuffle the edge ids
test_size = int(len(eids) * 0.1)       # 10% of edges go to the test set
train_size = g.number_of_edges() - test_size  # remaining 90% for training
test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]
# Find all negative (non-existing) edges and split them for training/testing
adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())))
adj_neg = 1 - adj.todense() - np.eye(g.number_of_nodes())  # 1 where no edge and not a self-loop
neg_u, neg_v = np.where(adj_neg != 0)
# sample as many negative edges as there are positive ones
neg_eids = np.random.choice(len(neg_u), g.number_of_edges())
test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]
构建用于训练的子图 train_g
在训练的时候,需要在原图中移除测试边集合。你可以通过dgl.remove_edges来移除它,得到划分后的数据集
# Remove the test edges from the original graph to obtain the training graph.
train_g = dgl.remove_edges(g, eids[:test_size])
方法2:同样是划分边,然后生成训练子图
只不过具体的实现方式有所区别,而且用了五折交叉验证。
代码参考MSHGANMDA