GNN学习笔记（四）：Cora数据集读取与分析

花锄

已于 2022-03-25 21:34:49 修改

阅读量4.3k

点赞数 4

分类专栏： GNN学习笔记　文章标签： anaconda python

于 2022-03-25 21:30:27 首次发布

本文链接：https://blog.csdn.net/weixin_45795602/article/details/123745513

版权

GNN学习笔记专栏收录该内容

4 篇文章 3 订阅

订阅专栏

Cora数据集介绍

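（此处原为数据集介绍的图片，未能显示。）Cora数据集是一个论文引用网络：全图共2708个节点（论文），每个节点用1433维的0/1关键词向量表示，论文被分为7个类别；下文代码使用的划分为140个训练节点、500个验证节点和1000个测试节点。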

代码读取数据集

import numpy as np
import pickle as pkl
import networkx as nx
import scipy.sparse as sp
#from scipy.sparse.linalg.eigen.arpack import eigsh 不知道为什么这个报错
from scipy.sparse.linalg.eigen import arpack
import sys


def parse_index_file(filename):
    """Parse an index file containing one integer per line.

    :param filename: Path of the text file to read.
    :return: List of the integers found, in file order.
    :raises ValueError: If a non-blank line is not a valid integer.
    """
    index = []
    # The context manager closes the handle even if int() raises part-way.
    with open(filename) as f:
        for line in f:
            stripped = line.strip()
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if stripped:
                index.append(int(stripped))
    return index


def sample_mask(idx, l):
    """Create a boolean mask of length ``l`` that is True at positions ``idx``.

    :param idx: Index or sequence of indices (list, range, ndarray) to mark.
    :param l: Length of the mask.
    :return: 1-D ``numpy.ndarray`` of dtype ``bool``.
    """
    # Use the builtin ``bool``: the ``np.bool`` alias was removed from NumPy,
    # so the previous ``dtype=np.bool`` raises AttributeError there.
    mask = np.zeros(l, dtype=bool)
    mask[idx] = True
    return mask

def _print_loaded_object(name, f, data):
    """Print a diagnostic dump of one object loaded by ``load_data``."""
    print(f)
    if name.find('graph') == -1:  # every file except the .graph file
        # Shapes reported by the article for Cora:
        #   x:    (140, 1433)   y:    (140, 7)
        #   tx:   (1000, 1433)  ty:   (1000, 7)
        #   allx: (1708, 1433)  ally: (1708, 7)
        print(data.shape)
        print(type(data))
        print(type(data[0]))
        for j in range(data.shape[0]):  # one iteration per matrix row
            # For feature files data[j] is the feature vector of node j,
            # for label files it is the label row of node j.
            print('********', name, j, data[j].shape, '**********')
            print(data[j])
            print('\n')
    else:
        print(type(data))
        print(data)


def load_data(dataset_str, data_dir="Cora/Cora/raw", verbose=True):
    """
    Load a pickled citation dataset from ``data_dir`` and build GCN inputs.

    ind.dataset_str.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.allx => the feature vectors of both labeled and unlabeled training instances
        (a superset of ind.dataset_str.x) as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.y => the one-hot labels of the labeled training instances as numpy.ndarray object;
    ind.dataset_str.ty => the one-hot labels of the test instances as numpy.ndarray object;
    ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object;
    ind.dataset_str.graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict
        object;
    ind.dataset_str.test.index => the indices of test instances in graph, for the inductive setting as list object.

    All objects above must be saved using python pickle module.

    NOTE: the files are deserialized with pickle, which can execute arbitrary
    code while loading. Only point this function at files from a trusted source.

    :param dataset_str: Dataset name (used to build the ``ind.<name>.*`` file names).
    :param data_dir: Directory that contains the dataset files. The default is the
        location that used to be hard-coded.
    :param verbose: When True (the default) print a dump of every loaded object,
        as the original script did. Pass False to load silently.
    :return: Tuple ``(adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask)``.
    """
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for name in names:  # read the files one by one
        with open("{}/ind.{}.{}".format(data_dir, dataset_str, name), 'rb') as f:
            if sys.version_info > (3, 0):  # Python 3: the pickles need latin1 decoding
                data = pkl.load(f, encoding='latin1')
                if verbose:
                    _print_loaded_object(name, f, data)
            else:
                data = pkl.load(f)
        objects.append(data)

    x, y, tx, ty, allx, ally, graph = tuple(objects)

    test_idx_reorder = parse_index_file("{}/ind.{}.test.index".format(data_dir, dataset_str))
    test_idx_range = np.sort(test_idx_reorder)

    if verbose:
        # Training set: x is a sparse feature matrix, y the one-hot labels.
        print(x[0][0], x.shape, type(x))
        print(y[0], y.shape)

        # Test set: same layout as the training set.
        print(tx[0][0], tx.shape, type(tx))
        print(ty[0], ty.shape)

        # allx / ally: same layout again (labeled + unlabeled training instances).
        print(allx[0][0], allx.shape, type(allx))
        print(ally[0], ally.shape)

        # graph maps every node index to the list of its neighbours.
        for i in graph:
            print(i, graph[i])

        print(test_idx_range.size)
        print(type(test_idx_range))
        print(test_idx_range)

    if dataset_str == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range-min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range-min(test_idx_range), :] = ty
        ty = ty_extended

    # Stack training and test rows, then move the test rows (stored in sorted
    # order) to the positions listed in the index file.
    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    idx_test = test_idx_range.tolist()
    idx_train = range(len(y))
    idx_val = range(len(y), len(y)+500)  # the 500 rows after the training rows

    train_mask = sample_mask(idx_train, labels.shape[0])
    val_mask = sample_mask(idx_val, labels.shape[0])
    test_mask = sample_mask(idx_test, labels.shape[0])

    # Per-split label matrices: rows outside the split stay all-zero.
    y_train = np.zeros(labels.shape)
    y_val = np.zeros(labels.shape)
    y_test = np.zeros(labels.shape)
    y_train[train_mask, :] = labels[train_mask, :]
    y_val[val_mask, :] = labels[val_mask, :]
    y_test[test_mask, :] = labels[test_mask, :]

    return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask


def sparse_to_tuple(sparse_mx):
    """Turn a sparse matrix (or a list of them) into ``(coords, values, shape)``.

    A list argument is converted element by element *in place* and the same
    list object is returned; any other argument is converted and returned as
    a single tuple.
    """
    def _as_triplet(matrix):
        # COO format exposes the row/col/data arrays needed below.
        coo = matrix if sp.isspmatrix_coo(matrix) else matrix.tocoo()
        index_pairs = np.vstack((coo.row, coo.col)).transpose()
        return index_pairs, coo.data, coo.shape

    if not isinstance(sparse_mx, list):
        return _as_triplet(sparse_mx)

    for pos, matrix in enumerate(sparse_mx):
        sparse_mx[pos] = _as_triplet(matrix)
    return sparse_mx


def preprocess_features(features):
    """Scale every row of ``features`` by the inverse of its sum, then return
    the result in the tuple representation produced by ``sparse_to_tuple``."""
    row_totals = np.array(features.sum(1))
    inverse_totals = np.power(row_totals, -1).flatten()
    # Rows that sum to zero yield inf; leave such rows as all zeros.
    inverse_totals[np.isinf(inverse_totals)] = 0.
    scaled = sp.diags(inverse_totals).dot(features)
    return sparse_to_tuple(scaled)


def normalize_adj(adj):
    """Scale the adjacency matrix on both sides by the inverse square root of
    its row sums and return the result in COO format."""
    coo = sp.coo_matrix(adj)
    degrees = np.array(coo.sum(1))
    inv_sqrt_deg = np.power(degrees, -0.5).flatten()
    # Rows that sum to zero yield inf; use 0 so they contribute nothing.
    inv_sqrt_deg[np.isinf(inv_sqrt_deg)] = 0.
    scaling = sp.diags(inv_sqrt_deg)
    right_scaled = coo.dot(scaling)
    return right_scaled.transpose().dot(scaling).tocoo()


def preprocess_adj(adj):
    """Add the identity to ``adj``, normalize the sum with ``normalize_adj``
    and return it in the tuple representation (input for the simple GCN model)."""
    node_count = adj.shape[0]
    with_self_loops = adj + sp.eye(node_count)
    return sparse_to_tuple(normalize_adj(with_self_loops))


def construct_feed_dict(features, support, labels, labels_mask, placeholders):
    """Build the feed dictionary that maps each placeholder to its value.

    ``placeholders`` is indexed with the keys 'labels', 'labels_mask',
    'features', 'support' (a sequence, one entry per element of ``support``)
    and 'num_features_nonzero'.
    """
    feed_dict = {
        placeholders['labels']: labels,
        placeholders['labels_mask']: labels_mask,
        placeholders['features']: features,
    }
    for position, matrix in enumerate(support):
        feed_dict[placeholders['support'][position]] = matrix
    # features[1] is the second element of the features tuple; its shape is fed as-is.
    feed_dict[placeholders['num_features_nonzero']] = features[1].shape
    return feed_dict


def chebyshev_polynomials(adj, k):
    """Calculate Chebyshev polynomials up to order k.

    :param adj: Adjacency matrix (anything ``normalize_adj`` accepts).
    :param k: Highest polynomial order. The first two terms are always
        produced, so the result has ``max(k, 1) + 1`` entries.
    :return: List of sparse matrices in the tuple representation produced by
        ``sparse_to_tuple``.
    """
    # Imported here because the file never brings ``eigsh`` into scope (its
    # module-level import is commented out), which made this function fail
    # with NameError. ``scipy.sparse.linalg`` is the public location.
    from scipy.sparse.linalg import eigsh

    print("Calculating Chebyshev polynomials up to order {}...".format(k))

    identity = sp.eye(adj.shape[0])
    laplacian = identity - normalize_adj(adj)
    # Rescale by the largest-magnitude eigenvalue before shifting by the identity.
    largest_eigval, _ = eigsh(laplacian, 1, which='LM')
    scaled_laplacian = (2. / largest_eigval[0]) * laplacian - identity

    t_k = [identity, scaled_laplacian]

    # The CSR copy does not change between iterations, so build it once.
    s_lap = sp.csr_matrix(scaled_laplacian, copy=True)
    for _ in range(2, k + 1):
        # Recurrence: T_k = 2 * L_scaled * T_{k-1} - T_{k-2}
        t_k.append(2 * s_lap.dot(t_k[-1]) - t_k[-2])

    return sparse_to_tuple(t_k)

# Load the Cora dataset; the call prints the diagnostic dump produced inside
# load_data and evaluates to the 8-tuple that load_data returns.
load_data('cora')

打印结果及分析：

1) .x文件
.x 文件存储用来训练的节点的特征向量。训练集中有140个节点，每个节点的特征向量为1433维，因为Cora数据集中，为每个论文选择了1433个关键词，通过关键词对论文进行分类。特征向量中用0/1表示是否存在该关键词。使用类scipy.sparse._csr.csr_matrix进行存储，（0，19） 1.0 表示第0行（即第一篇论文）中列索引为19的关键词存在；由于索引从0开始，它对应第20个关键词。

<_io.BufferedReader name='Cora/Cora/raw/ind.cora.x'>
(140, 1433)
<class 'scipy.sparse._csr.csr_matrix'>
<class 'scipy.sparse._csr.csr_matrix'>
******** x 0 (1, 1433) **********
  (0, 19)	1.0
  (0, 81)	1.0
  (0, 146)	1.0
  (0, 315)	1.0
  (0, 774)	1.0
  (0, 877)	1.0
  (0, 1194)	1.0
  (0, 1247)	1.0
  (0, 1274)	1.0

csr_matrix

scipy.sparse._csr.csr_matrix  该数据类型适用于稀疏矩阵
import scipy.sparse as sp
# Triplet description of the matrix: value[i] is stored at (row[i], col[i]).
row = [0, 1, 2]
col = [0, 0, 1]
value = [1, 2, 3]
# Build a 3x3 CSR matrix from the (value, (row, col)) triplets.
s = sp.csr_matrix((value, (row, col)), shape=[3, 3])
# Printing a sparse matrix lists only the stored entries as "(row, col) value".
print(s)
>>>   (0, 0)        1
	  (1, 0)        2
	  (2, 1)        3

2) .y文件
.y文件存储.x文件中节点对应的标签。论文主题被分为7类，使用独热编码（one-hot）表示7种标签。

<_io.BufferedReader name='Cora/Cora/raw/ind.cora.y'>
(140, 7)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
******** y 0 (7,) **********
[0 0 0 1 0 0 0]

******** y 1 (7,) **********
[0 0 0 0 1 0 0]

******** y 2 (7,) **********
[0 0 0 0 1 0 0]

3) .graph文件
存储每个节点的邻居节点的索引。

defaultdict(<class 'list'>, {0: [633, 1862, 2582], 1: [2, 652, 654], 2: [1986, 332, 1666, 1, 1454], 3: [2544], 4: [2176, 1016, 2176, 1761, 1256, 2175], 5: [1629, 2546, 1659, 1659], 6: [1416, 1602, 1042, 373], 7: [208], 8: [281, 1996, 269], 9: [2614, 723, 723], 10: [476, 2545], 11: [1655, 1839], 12: [2661, 1001, 1318, 2662], 13: [1810, 1701], 14: [2034, 2075, 158, 2077, 2668]...

4) .test.index

1000
<class 'numpy.ndarray'>
[1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721
 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735
 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749
 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763
 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777

5) 输出：

(<2708x2708 sparse matrix of type '<class 'numpy.intc'>'
 	with 10556 stored elements in Compressed Sparse Row format>,
 <2708x1433 sparse matrix of type '<class 'numpy.float32'>'
 	with 49216 stored elements in List of Lists format>,
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 1., 0., 0.],
        [0., 0., 0., ..., 1., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([ True,  True,  True, ..., False, False, False]),
 array([False, False, False, ..., False, False, False]),
 array([False, False, False, ...,  True,  True,  True]))