生成有feature的pattern-graph数据集

本文介绍了如何生成Pattern并应用于Graph数据集的流程。首先生成Pattern,然后在graph_generator.py中设置pattern文件路径。尽管该代码包含与特征相关的参数,但实际上它们并不影响结果,因为Graph的特征维度与Pattern保持一致。为了保持代码结构,这些参数并未删除。
摘要由CSDN通过智能技术生成

生成有feature的pattern-graph synthetic dataset


在做有关子图与图的图学习时,面临的一个困境是缺少数据集,因此目前常用的方法是用代码生成pattern-graph synthetic dataset,例如Subgraph Isomorphism Counting的作者所做的那样。
但此前的synthetic dataset的一个缺陷是没有feature(只有label,相当于只有类似cora数据集中的class,而没有feature)。
所以我们要做的就是在这原有基础上,建立有feature的synthetic dataset。
在Subgraph Isomorphism Counting的数据生成代码基础上进行修改,从而生成有feature的pattern-graph数据集(原数据集只有node、edge、label)。

运行方法

先运行pattern_generator.py生成pattern,然后将graph_generator.py中的pattern路径指向生成的pattern文件所在路径。需要注意的是,graph_generator.py中虽然有feature相关的参数,但它们实际上并不生效,因为graph的feature维度和feature取值个数都与pattern保持一致;这两个参数是笔者写代码时设计进去的,为了保持代码结构而保留,没有直接删掉。

python pattern_generator.py
python graph_generator.py

代码

utils.py

import numpy as np
import igraph as ig
import json
from itertools import chain, combinations
import math
import random

def generate_png(dot_filename, png_filename=None, prog="neato"):
    """Render a Graphviz ``.dot`` file to a PNG image.

    Args:
        dot_filename: Path of the input ``.dot`` file.
        png_filename: Path of the image to write. When ``None`` it is derived
            from ``dot_filename`` by replacing ``".dot"`` with ``".png"``.
        prog: Base name of the layout executable; ``".exe"`` is appended to
            it, as the original command line did.

    Raises:
        OSError: If the output file cannot be opened or the executable
            cannot be started (e.g. ``FileNotFoundError``).
    """
    # Imported here because this module's import list does not provide
    # ``os``/``subprocess``; the former ``os.system`` call raised NameError.
    import subprocess

    if png_filename is None:
        png_filename = dot_filename.replace(".dot", ".png")
    # An argv list (no shell) keeps file names with spaces or shell
    # metacharacters intact; stdout is redirected like the old ``> file``.
    with open(png_filename, "wb") as png_file:
        subprocess.run(
            [prog + ".exe", "-T", "png", dot_filename],
            stdout=png_file,
            check=False,  # the exit status was ignored before as well
        )

def generate_labels(number_of_items, number_of_labels):
    """Return a shuffled list of ``number_of_items`` labels.

    Labels are integers in ``range(number_of_labels)``.  When there are at
    least as many items as labels, every label appears at least once and the
    remaining slots are filled with uniformly drawn labels.  Otherwise a
    random subset of distinct labels is returned.
    """
    pool = list(range(number_of_labels))
    if number_of_items >= number_of_labels:
        surplus = number_of_items - number_of_labels
        # One scalar draw per extra item, then a single shuffle.
        pool.extend(np.random.randint(number_of_labels) for _ in range(surplus))
        np.random.shuffle(pool)
        return pool
    np.random.shuffle(pool)
    return pool[:number_of_items]


def generate_tree(number_of_vertices, directed=True):
    """Build a random tree on ``number_of_vertices`` vertices as an igraph Graph.

    Follows Alexey S. Rodionov and Hyunseung Choo, "On Generating Random
    Network Structures: Trees", ICCS 2003, LNCS 2658, pp. 879-887: the
    vertices are shuffled, and each vertex after the first is attached to a
    uniformly chosen vertex that precedes it in the shuffled order.  The
    orientation of every edge is decided by ``get_direction()``.
    """
    # order[:k] is the connected part, order[k:] the still-unconnected part.
    order = list(range(number_of_vertices))
    np.random.shuffle(order)

    edges = []
    for position in range(1, number_of_vertices):
        attached = order[np.random.randint(0, position)]
        newcomer = order[position]
        # Truthy result: edge points from the connected vertex to the new one.
        edges.append((attached, newcomer) if get_direction() else (newcomer, attached))

    tree = ig.Graph(directed=directed)
    tree.add_vertices(number_of_vertices)
    tree.add_edges(edges)
    return tree

def get_direction():
    """Return 0 or 1, drawn uniformly from NumPy's global random state."""
    coin = np.random.randint(2)
    return coin

def retrieve_multiple_edges(graph, source=-1, target=-1):
    """Return an ``ig.EdgeSeq`` of the edges selected by ``source``/``target``.

    ``-1`` means "not specified".  With only ``source`` the outgoing edges of
    that vertex are selected, with only ``target`` the incoming edges, with
    both the edges leaving ``source`` and entering ``target``, and with
    neither an empty selection.
    """
    has_source = source != -1
    has_target = target != -1

    if has_source and has_target:
        outgoing = graph.incident(source, mode=ig.OUT)
        edge_ids = set(outgoing).intersection(graph.incident(target, mode=ig.IN))
    elif has_source:
        edge_ids = graph.incident(source, mode=ig.OUT)
    elif has_target:
        edge_ids = graph.incident(target, mode=ig.IN)
    else:
        edge_ids = list()
    return ig.EdgeSeq(graph, edge_ids)

def str2bool(x):
    """Parse a string flag: true for "true", "yes" or "t" (case-insensitive)."""
    return x.lower() in ("true", "yes", "t")

def sample_element(s):
    """Return one element of the indexable sequence ``s``, chosen uniformly."""
    return s[np.random.randint(len(s))]

def powerset(iterable, min_size=0, max_size=-1):
    """Iterate over subsets of ``iterable`` as tuples, smallest sizes first.

    powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)

    The elements are sorted first.  Only subsets whose size lies in
    ``[min_size, max_size]`` are produced; ``max_size=-1`` means "up to the
    number of elements".
    """
    ordered = sorted(iterable)
    upper = len(ordered) if max_size == -1 else max_size
    sizes = range(min_size, upper + 1)
    return chain.from_iterable(combinations(ordered, size) for size in sizes)



def ArrayToNHot(array):
    """Pack each row of a 2-D integer array into a single decimal integer.

    Row ``[a0, a1, ..., a(d-1)]`` becomes
    ``a0 * 10**(d-1) + a1 * 10**(d-2) + ... + a(d-1)``, so ``[1, 0, 1]``
    yields ``101``.

    The computation uses Python integers throughout.  The previous version
    accumulated through ``math.pow`` (floating point), which silently lost
    exactness once a code exceeded 2**53 (e.g. 17 dimensions).

    Args:
        array: 2-D array-like of integers, one row per item.
            NOTE(review): the codes are only unambiguous when every entry is
            a single decimal digit (0-9) -- confirm against callers.

    Returns:
        A list of Python ints, one per row.
    """
    array = np.asarray(array)
    num_of_items = np.size(array, 0)
    dim_of_feature = np.size(array, 1)
    results = []
    for i in range(num_of_items):
        code = 0
        # Horner's scheme: equivalent to summing digit * 10**(d-1-j).
        for j in range(dim_of_feature):
            code = code * 10 + int(array[i, j])
        results.append(code)
    return results

#-----------------------------------------------------------------------------------------------------------------------
#----------------------------------------------------generate features--------------------------------------------------
#-----------------------------------------------------------------------------------------------------------------------
#pattern_feature用于判断是为pattern还是graph生成feature。如果是为pattern生成,则pattern_feature=None;
#如果是为graph生成feature,则只利用pattern_feature中的值生成graph的feature,否则产生同构
def generate_features(number_of_items, dim_of_features, num_of_features, pattern_vertex_features=None):
    """Generate one feature value per item, for a pattern or for a graph.

    Args:
        number_of_items: How many feature values to produce.
        dim_of_features: Number of dimensions of a freshly generated feature
            (only used when ``pattern_vertex_features`` is ``None``).
        num_of_features: Number of admissible values per dimension; values
            are drawn from ``0 .. num_of_features - 1`` (only used when
            ``pattern_vertex_features`` is ``None``).
        pattern_vertex_features: ``None`` to generate new features (pattern
            case).  Otherwise a sequence of existing feature values (graph
            case); every returned feature is sampled from this sequence and
            the two size arguments are ignored.

    Returns:
        A list with ``number_of_items`` feature values.

    Raises:
        IndexError: If items are requested from an empty
            ``pattern_vertex_features``.
    """
    # ``is None`` instead of ``== None``: the comparison must stay a plain
    # bool even when a NumPy array is passed in.
    if pattern_vertex_features is None:
        features = np.zeros([number_of_items, dim_of_features], dtype=int)
        # Element-wise scalar draws keep the RNG call sequence (and hence the
        # output for a given seed) the same as before.
        for i in range(number_of_items):
            for j in range(dim_of_features):
                features[i, j] = np.random.randint(num_of_features)
        return ArrayToNHot(features)

    pool = list(pattern_vertex_features)
    if number_of_items > 0 and not pool:
        raise IndexError("cannot sample features from an empty pattern_vertex_features")
    # Sample through np.random rather than the stdlib ``random`` module so
    # that ``np.random.seed`` alone makes the result reproducible.
    return [pool[np.random.randint(len(pool))] for _ in range(number_of_items)]




pattern_generator.py

# import networkx as nx
import igraph as ig
import argparse
import numpy as np
import os
from utils import generate_labels, generate_tree, get_direction, str2bool, generate_features, ArrayToNHot
from collections import Counter, defaultdict
from time import time

def generate_patterns(number_of_vertices, number_of_edges, number_of_vertex_labels, number_of_edge_labels, number_of_patterns,
                      dim_of_vertex_features, number_of_vertex_features):
    """Generate random labelled, featured, directed pattern graphs.

    Each pattern starts as a random tree (so it is connected) and then gets
    random extra edges until it has ``number_of_edges`` edges.  Parallel
    edges between the same ordered vertex pair are allowed only when their
    labels differ.

    Args:
        number_of_vertices: Vertices per pattern.
        number_of_edges: Edges per pattern; must be at least
            ``number_of_vertices - 1`` (the size of the initial tree).
        number_of_vertex_labels: Number of distinct vertex labels.
        number_of_edge_labels: Number of distinct edge labels.
        number_of_patterns: How many patterns to generate.
        dim_of_vertex_features: Passed to ``generate_features`` as the
            feature dimension.
        number_of_vertex_features: Passed to ``generate_features`` as the
            number of values per dimension.

    Returns:
        A list of igraph graphs with vertex attributes ``label`` and
        ``feature`` and edge attributes ``label`` and ``key`` (``key`` is
        the number of edges the ordered vertex pair already held when the
        edge was added; tree edges get 0).

    Raises:
        ValueError: If the requested edge count cannot be realised.
    """
    patterns = []
    # Every ordered pair (self-loops included) can carry a label only once.
    ordered_pairs = number_of_vertices * number_of_vertices

    for _ in range(number_of_patterns):
        # The tree alone consumes one edge label per tree edge; fewer labels
        # than tree edges used to surface as an IndexError further down.
        if number_of_edges < number_of_vertices - 1:
            raise ValueError(
                "number_of_edges (%d) must be at least number_of_vertices - 1 (%d)"
                % (number_of_edges, number_of_vertices - 1))

        # The order of these calls fixes the RNG stream; keep it unchanged.
        vertex_labels = generate_labels(number_of_vertices, number_of_vertex_labels)
        edge_labels = generate_labels(number_of_edges, number_of_edge_labels)
        vertex_features = generate_features(number_of_vertices, dim_of_vertex_features, number_of_vertex_features)

        # A label that occurs more often than there are ordered vertex pairs
        # can never be placed; the rejection loop below would spin forever.
        if edge_labels and max(Counter(edge_labels).values()) > ordered_pairs:
            raise ValueError(
                "cannot place %d edges on %d vertices without repeating an "
                "edge label on the same vertex pair"
                % (number_of_edges, number_of_vertices))

        # first, generate a tree
        pattern = generate_tree(number_of_vertices, directed=True)
        edge_label_mapping = defaultdict(set)
        for e, edge in enumerate(pattern.es):
            edge_label_mapping[edge.tuple].add(edge_labels[e])
        edge_keys = [0] * (number_of_vertices-1)

        # second, randomly add edges
        ecount = pattern.ecount()
        new_edges = list()
        while ecount < number_of_edges:
            u = np.random.randint(0, number_of_vertices)
            v = np.random.randint(0, number_of_vertices)
            src_tgt = (u, v)
            edge_label = edge_labels[ecount]
            # never add a second edge with the same label between the same
            # ordered pair of vertices; draw another pair instead
            if edge_label in edge_label_mapping[src_tgt]:
                continue
            new_edges.append(src_tgt)
            edge_keys.append(len(edge_label_mapping[src_tgt]))
            edge_label_mapping[src_tgt].add(edge_label)
            ecount += 1
        pattern.add_edges(new_edges)

        pattern.vs["label"] = vertex_labels
        pattern.es["label"] = edge_labels
        pattern.vs["feature"] = vertex_features
        pattern.es["key"] = edge_keys

        patterns.append(pattern)
    return patterns

if __name__ == "__main__":
    # Command-line entry point: generate patterns and store each one as a
    # .gml file (optionally with a .png rendering) in --save_dir.
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--number_of_vertices", type=int, default=3)
    parser.add_argument("--number_of_edges", type=int, default=3)
    parser.add_argument("--number_of_vertex_labels", type=int, default=2)
    parser.add_argument("--number_of_edge_labels", type=int, default=2)

    parser.add_argument("--number_of_vertex_features", type=int, default=2)
    # "--dim_of_vertex_featrues" is the historical (misspelled) flag and stays
    # valid; the correctly spelled alias stores into the same attribute.
    parser.add_argument("--dim_of_vertex_featrues", "--dim_of_vertex_features",
                        dest="dim_of_vertex_featrues", type=int, default=5)

    parser.add_argument("--number_of_patterns", type=int, default=2)
    parser.add_argument("--save_dir", type=str, default="patterns")
    parser.add_argument("--save_png", type=str2bool, default=True)
    args = parser.parse_args()

    np.random.seed(args.seed)

    patterns = generate_patterns(args.number_of_vertices, args.number_of_edges,
        args.number_of_vertex_labels, args.number_of_edge_labels,
        args.number_of_patterns, args.dim_of_vertex_featrues, args.number_of_vertex_features)

    if args.save_dir:
        os.makedirs(args.save_dir, exist_ok=True)
        for p, pattern in enumerate(patterns):

            print(pattern.vs["feature"])

            # The file name records every generation parameter plus the
            # index of the pattern within this run.
            pattern_id = "P_N%d_E%d_NL%d_EL%d_NFD%d_NF%d_%d" % (
                args.number_of_vertices, args.number_of_edges, args.number_of_vertex_labels, args.number_of_edge_labels,
                args.dim_of_vertex_featrues, args.number_of_vertex_features, p)
            filename = os.path.join(args.save_dir, pattern_id)
            pattern.write(filename + ".gml")
            if args.save_png:
                ig.plot(pattern, filename + ".png")



pattern_checker.py

import numpy as np
import igraph as ig
from collections import Counter
from utils import retrieve_multiple_edges

INF = float("inf")

class PatternChecker(object):
    def __init__(self):
        pass
    
    @classmethod
    def node_compat_fn(cls, g1, g2, v1, v2):
        vertex1 = g1.vs[v1]
        vertex2 = g2.vs[v2]

        #feature
        return vertex1["label"] == vertex2["label"] and vertex1["feature"] == vertex2["feature"]
    
    @classmethod
    def edge_compat_fn(cls, g1, g2, e1, e2):
        """Tell whether edge ``e1`` of ``g1`` is compatible with ``e2`` of ``g2``.

        The edges must agree on being a self-loop or not.  Beyond that, the
        whole bundle of parallel edges sharing ``e2``'s endpoints in ``g2``
        is compared with the bundle sharing ``e1``'s endpoints in ``g1``:
        the ``g1`` bundle must be at least as large and must contain every
        label that occurs in the ``g2`` bundle.
        """
        first = g1.es[e1]
        second = g2.es[e2]
        if first.is_loop() != second.is_loop():
            return False

        # Multi-edges: gather all parallel edges between the same endpoints.
        bundle1 = retrieve_multiple_edges(g1, first.source, first.target)
        bundle2 = retrieve_multiple_edges(g2, second.source, second.target)
        if len(bundle1) < len(bundle2):
            return False

        available = set(bundle1["label"])
        return all(label in available for label in bundle2["label"])
    
    @classmethod
    def get_vertex_color_vectors(cls, g1, g2, seed_v1=-1, seed_v2=-1):
        N1 = g1.vcount()
        N2 = g2.vcount()
        color_vectors = list()
        if seed_v1 == -1 and seed_v2 == -1:
            color_vectors.append((None, None))
        elif seed_v1 == -1 and seed_v2 != -1:
            vertex = g1.vs[seed_v1]
            seed_label = vertex["label"]

            #featrue
            seed_feature=vertex["feature"]

            for seed_v1, vertex in enumerate(g1.vs):

                #feature
                if vertex["label"] == seed_label and vertex["feature"] == seed_feature:

                    color1 = [0] * N1
                    color1[seed_v1] = 1
                    color2 = [0] * N2
                    color2[seed_v2] = 1
                    color_vectors.append((color1, color2))
        elif seed_v1 != -1 and seed_v2 == -1:
            seed_label = g1.vs[seed_v1]["label"]

            #feature
            seed_feature = g1.vs[
  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值