THU-HyperG超图生成部分

最新推荐文章于 2024-09-02 09:17:30 发布

umbrellazg

最新推荐文章于 2024-09-02 09:17:30 发布

阅读量426

点赞数 7

文章标签：前端 javascript linux

本文链接：https://blog.csdn.net/m0_51576139/article/details/134648490

版权

__init__.py：

from .neighbors import *
from .clustering import *
from .sparse import *
from .grid import *
from .fusion import *
from .attribute import *

# Names exported when a caller does `from <this package> import *`.
# NOTE(review): `fuse_mutli_sub_hg` (defined in fusion.py) is not listed
# here -- confirm whether leaving it out of the public API is intentional.
__all__ = [
    'gen_knn_hg',
    'gen_epsilon_ball_hg',
    'gen_clustering_hg',
    'gen_l1_hg',
    'gen_grid_neigh_hg',
    'concat_multi_hg',
    'gen_attribute_hg'
]

attribute.py

# coding=utf-8
import numpy as np
import scipy.sparse as sparse

from hyperg.hyperg import HyperG
from hyperg.utils import print_log


def gen_attribute_hg(n_nodes, attr_dict, X=None):
    """Build a hypergraph with one hyperedge per attribute.

    :param n_nodes: int, number of nodes (rows of the incidence matrix)
    :param attr_dict: dict, {'attri_1': [node_idx_1, node_idx_2, ...], 'attri_2': [...]};
        each key is an attribute and its value lists the indices of the nodes
        that carry this attribute
    :param X: numpy array, shape = (n_samples, n_features) (optional)
    :return: instance of HyperG
    """
    if X is not None:
        # The feature matrix must provide exactly one row per node.
        assert n_nodes == X.shape[0]

    # Every distinct attribute becomes one hyperedge (one column of H).
    n_edges = len(attr_dict)

    rows = []
    cols = []
    for edge_id, members in enumerate(attr_dict.values()):
        members = sorted(members)
        rows += members
        cols += [edge_id] * len(members)

    rows = np.asarray(rows)
    cols = np.asarray(cols)

    # Each (node, attribute) membership is stored as a 1 in the COO matrix.
    ones = np.ones(rows.shape[0])
    H = sparse.coo_matrix((ones, (rows, cols)), shape=(n_nodes, n_edges))

    return HyperG(H, X=X)

clustering.py

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
import scipy.sparse as sparse

from hyperg.hyperg import HyperG
from hyperg.utils import print_log

def gen_clustering_hg(X, n_clusters, method="kmeans", with_feature=False, random_state=None):
    """Build a hypergraph whose hyperedges are the clusters of the samples.

    :param X: numpy array, shape = (n_samples, n_features)
    :param n_clusters: int, number of clusters (= number of hyperedges)
    :param method: str, clustering method; only "kmeans" is supported
    :param with_feature: bool, optional(default=False), attach X to the
        returned hypergraph as node features
    :param random_state: int, optional(default=None), forwarded to KMeans
    :return: instance of HyperG
    :raises ValueError: if `method` is not "kmeans"
    """
    if method != "kmeans":
        raise ValueError("{} method is not supported".format(method))
    labels = KMeans(n_clusters=n_clusters, random_state=random_state).fit(X).labels_

    assert n_clusters >= 1

    n_nodes = X.shape[0]

    # Node i belongs to exactly one hyperedge: the cluster it was assigned to.
    membership = (np.arange(n_nodes), labels)
    H = sparse.coo_matrix((np.ones(n_nodes), membership), shape=(n_nodes, n_clusters))

    # All hyperedges get unit weight.
    w = np.ones(n_clusters)

    return HyperG(H, w=w, X=X) if with_feature else HyperG(H, w=w)

fusion.py

import numpy as np
import scipy.sparse as sparse

from hyperg.hyperg import HyperG

# Concatenate several hypergraphs into a single hypergraph.
def concat_multi_hg(hg_list):
    """Concatenate multiple hypergraphs into one hypergraph.

    The incidence matrices are stacked horizontally and the hyperedge weight
    vectors are joined in the same order. Node features are taken from the
    first hypergraph that has them; any later hypergraph with features must
    carry identical ones.

    :param hg_list: list, list of HyperG instances
    :return: instance of HyperG
    """
    H = sparse.hstack([hg.incident_matrix() for hg in hg_list])
    w = np.hstack([hg.hyperedge_weights() for hg in hg_list])

    X = None
    for hg in hg_list:
        feats = hg.node_features()
        if feats is None:
            continue
        if X is None:
            X = feats
        else:
            # Features of all inputs have to agree element-wise.
            assert (X == feats).all()

    return HyperG(H, X=X, w=w)


# Fuse several small hypergraphs into one large hypergraph.
def fuse_mutli_sub_hg(hg_list):
    """Fuse disjoint sub-hypergraphs into one block-diagonal hypergraph.

    The nodes and hyperedges of each sub-hypergraph are renumbered by adding
    the total node / hyperedge count of all preceding sub-hypergraphs, so
    the index ranges of different sub-hypergraphs never overlap. Only the
    incidence structure is carried over to the result.

    :param hg_list: list, list of HyperG instances
    :return: instance of HyperG
    """
    rows, cols, data = [], [], []

    # Running offsets: number of nodes / hyperedges already placed.
    node_offset = 0
    edge_offset = 0

    for hg in hg_list:
        # Convert once to COO so that .row/.col/.data exist whatever sparse
        # format the sub-hypergraph stores its incidence matrix in.
        H_sub = hg.incident_matrix().tocoo()

        rows.append(H_sub.row + node_offset)
        cols.append(H_sub.col + edge_offset)
        data.append(H_sub.data)

        node_offset += hg.num_nodes()
        edge_offset += hg.num_edges()

    # Join the shifted coordinates of all sub-hypergraphs.
    rows = np.concatenate(rows)
    cols = np.concatenate(cols)
    data = np.concatenate(data)

    # After the loop the offsets equal the total node / hyperedge counts.
    H = sparse.coo_matrix((data, (rows, cols)),
                          shape=(node_offset, edge_offset))

    return HyperG(H)

grid.py

import numpy as np
import scipy.sparse as sparse

from hyperg.hyperg import HyperG

# Build a hypergraph that encodes the neighbourhood relation of a 2-D grid.
def gen_grid_neigh_hg(input_size):
    """Generate a grid-neighbourhood hypergraph.

    Grid cells are numbered row by row (node = row * w + col). Every cell
    defines one hyperedge that contains the cell itself and those of its
    eight surrounding cells that lie inside the grid.

    :param input_size: array-like with two elements, (height, width) of the grid
    :return: instance of HyperG
    """
    input_size = np.array(input_size).reshape(-1)
    assert input_size.shape[0] == 2

    h, w = input_size
    n_nodes = w * h

    node_set = np.arange(n_nodes)

    # Candidate neighbour index for each of the 3x3 offsets
    # (upper row, same row, lower row).
    neigh_idx = [
        node_set - w - 1,
        node_set - w,
        node_set - w + 1,

        node_set - 1,
        node_set,
        node_set + 1,

        node_set + w - 1,
        node_set + w,
        node_set + w + 1,
    ]

    # Cells lying on each border of the grid.
    on_top = node_set // w == 0
    on_bottom = node_set // w == h - 1
    on_left = node_set % w == 0
    on_right = node_set % w == w - 1

    # True where the corresponding candidate neighbour falls outside the grid.
    neigh_mask = [
        on_top | on_left,
        on_top,
        on_top | on_right,

        on_left,
        # The cell itself is always valid. The builtin `bool` is used because
        # the `np.bool` alias no longer exists in current NumPy releases.
        np.zeros_like(node_set, dtype=bool),
        on_right,

        on_bottom | on_left,
        on_bottom,
        on_bottom | on_right,
    ]

    # Mark out-of-grid neighbours with the sentinel -1.
    for idx, mask in zip(neigh_idx, neigh_mask):
        idx[mask] = -1

    # Flatten: entry k of the i-th offset belongs to hyperedge k.
    node_idx = np.hstack(neigh_idx)
    edge_idx = np.tile(node_set.reshape(1, -1), [len(neigh_idx), 1]).reshape(-1)
    values = np.ones_like(node_idx)

    # Drop the sentinel entries, i.e. neighbours outside the grid.
    inside = np.where(node_idx != -1)

    node_idx = node_idx[inside]
    edge_idx = edge_idx[inside]
    values = values[inside]

    # One hyperedge per grid cell.
    n_edges = n_nodes
    H = sparse.coo_matrix((values, (node_idx, edge_idx)), shape=(n_nodes, n_edges))

    return HyperG(H)


# Smoke run when the module is executed as a script: build the hypergraph
# of a 4 x 5 grid (the result is discarded).
if __name__ == "__main__":
    gen_grid_neigh_hg((4, 5))

neighbors.py

import numpy as np
from sklearn.metrics import pairwise_distances
import scipy.sparse as sparse

from hyperg.hyperg import HyperG
from hyperg.utils import print_log

# Generate a k-nearest-neighbour hypergraph.
def gen_knn_hg(X, n_neighbors, is_prob=True, with_feature=False):
    """Build a hypergraph with one hyperedge per sample and its nearest neighbours.

    Hyperedge i contains sample i together with `n_neighbors` of its closest
    samples, so every hyperedge has `n_neighbors + 1` members.

    :param X: numpy array or list, shape = (n_samples, n_features)
    :param n_neighbors: int, number of neighbours per hyperedge
    :param is_prob: bool, optional(default=True); if True the incidence value
        of a member is exp(-d^2 / avg_dist^2), where d is its distance to the
        hyperedge's centre sample and avg_dist the mean of all pairwise
        distances; if False every incidence value is 1
    :param with_feature: bool, optional(default=False), attach X to the
        returned hypergraph as node features
    :return: instance of HyperG
    :raises ValueError: if there are fewer than `n_neighbors + 1` samples
    """

    assert isinstance(X, (np.ndarray, list))
    assert n_neighbors > 0

    X = np.array(X)
    n_nodes = X.shape[0]
    n_edges = n_nodes
    edge_size = n_neighbors + 1

    # Pairwise distance matrix between all samples.
    m_dist = pairwise_distances(X)

    if n_nodes < edge_size:
        raise ValueError(
            "n_neighbors ({}) must be smaller than the number of samples ({})".format(
                n_neighbors, n_nodes))

    # Partition every row so that its first `edge_size` columns hold the
    # closest samples. kth is clamped to the last valid column so that the
    # case n_samples == n_neighbors + 1 is accepted as well; for larger
    # inputs kth is unchanged (n_neighbors + 1).
    kth = min(edge_size, n_nodes - 1)
    m_neighbors = np.argpartition(m_dist, kth=kth, axis=1)[:, :edge_size]
    m_neighbors_val = np.take_along_axis(m_dist, m_neighbors, axis=1)

    # Make sure every hyperedge contains its own centre sample: where it is
    # missing, overwrite the last slot with the sample itself at distance 0.
    self_idx = np.arange(n_nodes)
    lacks_self = ~(m_neighbors == self_idx[:, None]).any(axis=1)
    m_neighbors[lacks_self, -1] = self_idx[lacks_self]
    m_neighbors_val[lacks_self, -1] = 0.

    # Coordinates of the sparse incidence matrix: row = member node,
    # column = hyperedge (index of the centre sample).
    node_idx = m_neighbors.reshape(-1)
    edge_idx = np.tile(np.arange(n_edges).reshape(-1, 1), (1, edge_size)).reshape(-1)

    if not is_prob:
        values = np.ones(node_idx.shape[0])
    else:
        avg_dist = np.mean(m_dist)
        m_neighbors_val = m_neighbors_val.reshape(-1)
        values = np.exp(-np.power(m_neighbors_val, 2.) / np.power(avg_dist, 2.))

    H = sparse.coo_matrix((values, (node_idx, edge_idx)), shape=(n_nodes, n_edges))

    # All hyperedges get unit weight.
    w = np.ones(n_edges)

    if with_feature:
        return HyperG(H, w=w, X=X)

    return HyperG(H, w=w)


def gen_epsilon_ball_hg(X, ratio, is_prob=True, with_feature=False):
    """Build a hypergraph with one hyperedge per sample and its epsilon-ball.

    Hyperedge i contains every sample whose distance to sample i does not
    exceed `ratio` times the mean of all pairwise distances.

    :param X: numpy array or list, shape = (n_samples, n_features)
    :param ratio: float, fraction of the mean pairwise distance used as radius
    :param is_prob: bool, optional(default=True); if True the incidence value
        of a member is exp(-d^2 / avg_dist^2), otherwise it is 1
    :param with_feature: bool, optional(default=False), attach X to the
        returned hypergraph as node features
    :return: instance of HyperG
    """
    assert isinstance(X, (np.ndarray, list))
    assert ratio > 0

    X = np.array(X)
    n_nodes = n_edges = X.shape[0]

    # Pairwise distances and the radius derived from their mean.
    m_dist = pairwise_distances(X)
    avg_dist = np.mean(m_dist)
    radius = ratio * avg_dist

    # Index pairs of all samples lying within the radius of each other.
    inside = np.where(m_dist <= radius)
    edge_idx, node_idx = inside

    if is_prob:
        dist_inside = m_dist[inside]
        values = np.exp(-np.power(dist_inside, 2.) / np.power(avg_dist, 2.))
    else:
        values = np.ones(node_idx.shape[0])

    H = sparse.coo_matrix((values, (node_idx, edge_idx)), shape=(n_nodes, n_edges))

    # All hyperedges get unit weight.
    w = np.ones(n_edges)

    return HyperG(H, w=w, X=X) if with_feature else HyperG(H, w=w)

sparse.py

import numpy as np
from sklearn.metrics import pairwise_distances
import scipy.sparse as sparse
import cvxpy as cp
from cvxpy.error import SolverError

from hyperg.hyperg import HyperG
from hyperg.utils import print_log

# TODO: 1. elastic net hypergraph


def gen_l1_hg(X, gamma, n_neighbors, log=False, with_feature=False):
    """Build a hypergraph from L1-regularised reconstruction of each sample.

    For every sample, a non-negative coefficient vector over its nearest
    neighbours is obtained by minimising
    ``||P^T x - v||_2 + gamma * ||x||_1`` with cvxpy, where ``v`` is the
    sample and the rows of ``P`` are its neighbours. Hyperedge i contains
    sample i with incidence value 1 and each neighbour with its coefficient.

    :param X: numpy array, shape = (n_samples, n_features)
    :param gamma: float, weight of the L1 penalty on the coefficients
    :param n_neighbors: int, number of neighbours used for the reconstruction
    :param log: bool, optional(default=False), log the edge being processed
    :param with_feature: bool, optional(default=False), attach X to the
        returned hypergraph as node features
    :return: instance of HyperG
    """

    assert n_neighbors >= 1.
    assert isinstance(X, np.ndarray)
    assert X.ndim == 2

    n_nodes = X.shape[0]
    n_edges = n_nodes
    edge_size = n_neighbors + 1

    # Closest samples of every sample, nearest first.
    m_dist = pairwise_distances(X)
    m_neighbors = np.argsort(m_dist)[:, 0:edge_size]

    # Every hyperedge contributes `edge_size` consecutive entries.
    edge_idx = np.tile(np.arange(n_edges).reshape(-1, 1), (1, edge_size)).reshape(-1)
    node_idx = []
    values = []

    for i_edge in range(n_edges):
        if log:
            print_log("processing edge {} ".format(i_edge))

        # Neighbour candidates without the centre sample itself; if the
        # centre is not among them, the last candidate is dropped instead.
        neighbors = m_neighbors[i_edge].tolist()
        try:
            neighbors.remove(i_edge)
        except ValueError:
            neighbors = neighbors[:-1]

        P = X[neighbors, :]
        v = X[i_edge, :]

        # Non-negative reconstruction coefficients, solved with cvxpy.
        x = cp.Variable(P.shape[0], nonneg=True)
        residual = (P.T@x).T-v
        objective = cp.Minimize(cp.norm(residual, 2) + gamma * cp.norm(x, 1))
        prob = cp.Problem(objective)
        try:
            prob.solve()
        except SolverError:
            # Retry with the SCS solver when the default solver fails.
            prob.solve(solver='SCS', verbose=False)

        node_idx.extend([i_edge] + neighbors)
        values.extend([1.] + x.value.tolist())

    node_idx = np.array(node_idx)
    values = np.array(values)

    H = sparse.coo_matrix((values, (node_idx, edge_idx)), shape=(n_nodes, n_edges))

    return HyperG(H, X=X) if with_feature else HyperG(H)