Cora数据集介绍
代码读取数据集
import numpy as np
import pickle as pkl
import networkx as nx
import scipy.sparse as sp
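# from scipy.sparse.linalg.eigen.arpack import eigsh  # fails on newer SciPy, where this private module path is no longer importable; eigsh is publicly available as `from scipy.sparse.linalg import eigsh`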
from scipy.sparse.linalg.eigen import arpack
import sys
def parse_index_file(filename):
    """Parse an index file into a list of integers.

    Args:
        filename: Path to a text file holding one integer per line.

    Returns:
        list: The integers, in the order they appear in the file.

    Raises:
        ValueError: If a non-blank line cannot be parsed as an integer.
    """
    # A context manager closes the handle deterministically instead of
    # leaving it to the garbage collector.
    with open(filename) as index_file:
        # Blank lines (e.g. a trailing newline at end of file) carry no index
        # and would make int() fail, so they are skipped.
        return [int(line) for line in index_file if line.strip()]
def sample_mask(idx, l):
    """Create a boolean mask that is True exactly at the given positions.

    Args:
        idx: Index or sequence of indices (list, range, ndarray) to mark.
        l: Length (or shape) of the mask to create.

    Returns:
        numpy.ndarray: Array of dtype ``bool`` with ``mask[idx] == True`` and
        every other entry ``False``.
    """
    # The builtin ``bool`` is used as dtype: the ``np.bool`` alias was
    # deprecated in NumPy 1.20 and removed in 1.24.
    mask = np.zeros(l, dtype=bool)
    mask[idx] = True
    return mask
def _print_raw_object(handle, name, data):
    """Print a debugging dump of one unpickled raw object.

    Args:
        handle: The open file object the data was read from (printed as-is).
        name: File suffix of the object ('x', 'y', ..., 'graph').
        data: The unpickled object.
    """
    print(handle)
    if name.find('graph') == -1:
        # Not the .graph file: a matrix-like object with a shape.
        # Shapes noted by the original author for Cora:
        #   x:    (140, 1433)  140 training nodes, 1433-dim vector per node
        #   y:    (140, 7)     7-dim one-hot training targets
        #   tx:   (1000, 1433) 1000 test nodes
        #   ty:   (1000, 7)
        #   allx: (1708, 1433)
        #   ally: (1708, 7)
        print(data.shape)
        print(type(data))
        # e.g. <class 'scipy.sparse._csr.csr_matrix'>
        print(type(data[0]))
        # e.g. <class 'scipy.sparse._csr.csr_matrix'>
        for j in range(data.shape[0]):  # one iteration per matrix row
            # x-like files: data[j] is the feature vector of node j
            # y-like files: data[j] is the label of node j, shape (7,)
            print('********', name, j, data[j].shape, '**********')
            print(data[j])
            print('\n')
    else:
        print(type(data))
        # e.g. <class 'collections.defaultdict'>
        print(data)


def load_data(dataset_str, data_dir="Cora/Cora/raw", verbose=True):
    """Load a Planetoid-format citation dataset and build the train/val/test split.

    The following files are read from ``data_dir``:

    ind.dataset_str.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.allx => the feature vectors of both labeled and unlabeled training instances
        (a superset of ind.dataset_str.x) as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.y => the one-hot labels of the labeled training instances as numpy.ndarray object;
    ind.dataset_str.ty => the one-hot labels of the test instances as numpy.ndarray object;
    ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object;
    ind.dataset_str.graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict
        object;
    ind.dataset_str.test.index => the indices of test instances in graph, for the inductive setting as list object.

    All objects above must be saved using python pickle module.

    :param dataset_str: Dataset name
    :param data_dir: Directory holding the ``ind.*`` files. Defaults to the
        location that used to be hard-coded, ``"Cora/Cora/raw"``.
    :param verbose: When True (the default) print the debugging dump of every
        loaded object, as the original tutorial code did. Pass False to load
        silently.
    :return: Tuple ``(adj, features, y_train, y_val, y_test, train_mask,
        val_mask, test_mask)``.
    """
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for name in names:  # read each raw file in turn
        path = "{}/ind.{}.{}".format(data_dir, dataset_str, name)
        # NOTE(security): pickle can execute arbitrary code while loading;
        # only point this function at dataset files from a trusted source.
        with open(path, 'rb') as f:
            if sys.version_info > (3, 0):  # running on Python 3
                data = pkl.load(f, encoding='latin1')
                if verbose:
                    _print_raw_object(f, name, data)
                objects.append(data)
            else:
                objects.append(pkl.load(f))
    x, y, tx, ty, allx, ally, graph = tuple(objects)

    if verbose:
        # Training set: x is a sparse matrix (stores positions of the 1s),
        # 140 instances with 1433-dim feature vectors -> (140, 1433)
        print(x[0][0], x.shape, type(x))
        # y holds the label vectors, 7 classes, 140 instances -> (140, 7)
        print(y[0], y.shape)
        # Test set: tx is a sparse matrix, 1000 instances -> (1000, 1433)
        print(tx[0][0], tx.shape, type(tx))
        # ty holds the label vectors, 7 classes, 1000 instances -> (1000, 7)
        print(ty[0], ty.shape)
        # allx / ally have the same layout as above:
        # allx is a sparse matrix, 1708 instances -> (1708, 1433)
        print(allx[0][0], allx.shape, type(allx))
        # ally holds the label vectors, 7 classes, 1708 instances -> (1708, 7)
        print(ally[0], ally.shape)
        # graph is a dict; the full graph has 2708 nodes in total
        for i in graph:
            print(i, graph[i])

    test_idx_reorder = parse_index_file(
        "{}/ind.{}.test.index".format(data_dir, dataset_str))
    test_idx_range = np.sort(test_idx_reorder)
    if verbose:
        print(test_idx_range.size)
        print(type(test_idx_range))
        print(test_idx_range)

    if dataset_str == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range-min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range-min(test_idx_range), :] = ty
        ty = ty_extended

    # Stack train+test rows, then move each test row to the position given by
    # the (unsorted) test index file.
    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    # Split: the first len(y) rows train, the next 500 validate, and the rows
    # listed in the test index file test.
    idx_test = test_idx_range.tolist()
    idx_train = range(len(y))
    idx_val = range(len(y), len(y)+500)

    train_mask = sample_mask(idx_train, labels.shape[0])
    val_mask = sample_mask(idx_val, labels.shape[0])
    test_mask = sample_mask(idx_test, labels.shape[0])

    # Each y_* array has the full label shape but is zero outside its split.
    y_train = np.zeros(labels.shape)
    y_val = np.zeros(labels.shape)
    y_test = np.zeros(labels.shape)
    y_train[train_mask, :] = labels[train_mask, :]
    y_val[val_mask, :] = labels[val_mask, :]
    y_test[test_mask, :] = labels[test_mask, :]

    return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask
def sparse_to_tuple(sparse_mx):
    """Convert a sparse matrix (or a list of them) to tuple representation.

    Each matrix becomes ``(coords, values, shape)`` where ``coords`` is an
    ``(nnz, 2)`` array of (row, col) pairs. When a list is passed, its
    entries are replaced in place and the same list object is returned.
    """
    def _as_triplet(matrix):
        # COO format exposes the row/col/data arrays directly.
        coo = matrix if sp.isspmatrix_coo(matrix) else matrix.tocoo()
        positions = np.vstack((coo.row, coo.col)).transpose()
        return positions, coo.data, coo.shape

    if not isinstance(sparse_mx, list):
        return _as_triplet(sparse_mx)

    for position, matrix in enumerate(sparse_mx):
        sparse_mx[position] = _as_triplet(matrix)
    return sparse_mx
def preprocess_features(features):
    """Row-normalize feature matrix and convert to tuple representation.

    Every row is divided by its sum; rows that sum to zero are left as zeros.

    Args:
        features: Sparse feature matrix (one row per node).

    Returns:
        The ``(coords, values, shape)`` tuple produced by ``sparse_to_tuple``
        for the row-normalized matrix.
    """
    rowsum = np.array(features.sum(1))
    # np.power refuses negative integer exponents on integer arrays, so
    # integer/boolean feature matrices are promoted to float first. Inexact
    # (float/complex) inputs keep their dtype.
    if not np.issubdtype(rowsum.dtype, np.inexact):
        rowsum = rowsum.astype(np.float64)
    # Zero rows produce inf here on purpose; they are zeroed right below, so
    # the divide-by-zero warning is only noise.
    with np.errstate(divide='ignore'):
        r_inv = np.power(rowsum, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)
    # Left-multiplying by diag(1 / rowsum) scales each row.
    features = r_mat_inv.dot(features)
    return sparse_to_tuple(features)
def normalize_adj(adj):
    """Symmetrically normalize adjacency matrix.

    Scales the matrix on both sides by the inverse square root of its row
    sums; rows with a zero sum get a scaling factor of 0. The result is
    returned in COO format.
    """
    coo_adj = sp.coo_matrix(adj)
    degrees = np.array(coo_adj.sum(1))
    inv_sqrt_deg = np.power(degrees, -0.5).flatten()
    # A zero row sum yields inf; treat it as "no scaling contribution".
    inv_sqrt_deg[np.isinf(inv_sqrt_deg)] = 0.
    scaling = sp.diags(inv_sqrt_deg)
    column_scaled = coo_adj.dot(scaling)
    return column_scaled.transpose().dot(scaling).tocoo()
def preprocess_adj(adj):
    """Prepare an adjacency matrix for the simple GCN model.

    Adds the identity (self-loops), normalizes the result with
    ``normalize_adj`` and returns it in tuple representation.
    """
    with_self_loops = adj + sp.eye(adj.shape[0])
    return sparse_to_tuple(normalize_adj(with_self_loops))
def construct_feed_dict(features, support, labels, labels_mask, placeholders):
    """Build the feed dictionary that maps placeholders to their values.

    ``placeholders`` is indexed by the keys 'labels', 'labels_mask',
    'features', 'support' (a sequence, one entry per support matrix) and
    'num_features_nonzero'; the latter is fed ``features[1].shape``.
    """
    feed_dict = {
        placeholders['labels']: labels,
        placeholders['labels_mask']: labels_mask,
        placeholders['features']: features,
    }
    # Pair the i-th support matrix with the i-th support placeholder.
    for position, matrix in enumerate(support):
        feed_dict[placeholders['support'][position]] = matrix
    feed_dict[placeholders['num_features_nonzero']] = features[1].shape
    return feed_dict
def chebyshev_polynomials(adj, k):
    """Calculate Chebyshev polynomials up to order k.

    Builds the Laplacian ``I - normalize_adj(adj)``, rescales it by its
    largest-magnitude eigenvalue, and applies the recurrence
    ``T_i = 2 * L_scaled * T_{i-1} - T_{i-2}`` starting from ``T_0 = I`` and
    ``T_1 = L_scaled``.

    Args:
        adj: Adjacency matrix of the graph.
        k: Highest polynomial order. ``T_0`` and ``T_1`` are always returned,
            even for ``k < 1``.

    Returns:
        list: The polynomials as sparse matrices in tuple representation
        (see ``sparse_to_tuple``).
    """
    # Imported locally because the file-level imports only bind ``arpack``,
    # so the name ``eigsh`` is otherwise undefined when this function runs.
    from scipy.sparse.linalg import eigsh

    print("Calculating Chebyshev polynomials up to order {}...".format(k))

    identity = sp.eye(adj.shape[0])
    laplacian = identity - normalize_adj(adj)
    # which='LM' requests the eigenvalue of largest magnitude.
    largest_eigval, _ = eigsh(laplacian, 1, which='LM')
    scaled_laplacian = (2. / largest_eigval[0]) * laplacian - identity

    # The CSR copy used for the products is the same on every iteration, so
    # it is built once instead of once per recurrence step.
    s_lap = sp.csr_matrix(scaled_laplacian, copy=True)

    t_k = [identity, scaled_laplacian]
    for _ in range(2, k + 1):
        t_k.append(2 * s_lap.dot(t_k[-1]) - t_k[-2])

    return sparse_to_tuple(t_k)
# Run the loader on the Cora dataset; the returned tuple is not bound to a name here.
load_data('cora')
打印结果及分析:
1) .x文件
.x 文件存储用来训练的节点的特征向量。训练集中有140个节点,每个节点的特征向量为1433维,因为Cora数据集中,为每篇论文选择了1433个关键词,通过关键词对论文进行分类。特征向量中用0/1表示是否存在该关键词。使用类scipy.sparse._csr.csr_matrix进行存储,(0, 19) 1.0 表示第0行(即第一篇论文)中列索引为19的关键词存在(索引从0开始,即第20个关键词)。
<_io.BufferedReader name='Cora/Cora/raw/ind.cora.x'>
(140, 1433)
<class 'scipy.sparse._csr.csr_matrix'>
<class 'scipy.sparse._csr.csr_matrix'>
******** x 0 (1, 1433) **********
(0, 19) 1.0
(0, 81) 1.0
(0, 146) 1.0
(0, 315) 1.0
(0, 774) 1.0
(0, 877) 1.0
(0, 1194) 1.0
(0, 1247) 1.0
(0, 1274) 1.0
csr_matrix
scipy.sparse._csr.csr_matrix 该数据类型适用于稀疏矩阵
import scipy.sparse as sp
row = [0, 1, 2]
col = [0, 0, 1]
value = [1, 2, 3]
s = sp.csr_matrix((value, (row, col)), shape=[3, 3])
print(s)
>>> (0, 0) 1
(1, 0) 2
(2, 1) 3
2) .y文件
.y文件存储.x文件中节点对应的标签。论文主题被分为7类,使用独热编码(one-hot) 表示7种标签。
<_io.BufferedReader name='Cora/Cora/raw/ind.cora.y'>
(140, 7)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
******** y 0 (7,) **********
[0 0 0 1 0 0 0]
******** y 1 (7,) **********
[0 0 0 0 1 0 0]
******** y 2 (7,) **********
[0 0 0 0 1 0 0]
3) .graph文件
存储每个节点的邻居节点的索引。
defaultdict(<class 'list'>, {0: [633, 1862, 2582], 1: [2, 652, 654], 2: [1986, 332, 1666, 1, 1454], 3: [2544], 4: [2176, 1016, 2176, 1761, 1256, 2175], 5: [1629, 2546, 1659, 1659], 6: [1416, 1602, 1042, 373], 7: [208], 8: [281, 1996, 269], 9: [2614, 723, 723], 10: [476, 2545], 11: [1655, 1839], 12: [2661, 1001, 1318, 2662], 13: [1810, 1701], 14: [2034, 2075, 158, 2077, 2668]...
4) .test.index
1000
<class 'numpy.ndarray'>
[1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721
1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735
1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749
1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763
1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777
5) 输出(load_data 的返回值,依次为 adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask):
(<2708x2708 sparse matrix of type '<class 'numpy.intc'>'
with 10556 stored elements in Compressed Sparse Row format>,
<2708x1433 sparse matrix of type '<class 'numpy.float32'>'
with 49216 stored elements in List of Lists format>,
array([[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 1., 0., 0.],
[0., 0., 0., ..., 1., 0., 0.],
...,
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.]]),
array([[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
...,
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.]]),
array([[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
...,
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.]]),
array([ True, True, True, ..., False, False, False]),
array([False, False, False, ..., False, False, False]),
array([False, False, False, ..., True, True, True]))