gSpan 是一种高效的频繁子图挖掘算法,参考
http://www.cs.ucsb.edu/~xyan/software/gSpan.htm 。
/* * gSpan algorithm implemented by coolypf * http://blog.csdn.net/coolypf */ #define _CRT_SECURE_NO_WARNINGS 1 #include <stdio.h> #include <stdlib.h> #include <string.h> #include <assert.h> #include <time.h> #include <vector> #include <map> #include <set> using namespace std; const int LABEL_MAX = 100; int min_support; int nr_graph; struct Graph { vector<int> node_label; vector<int> *edge_next, *edge_label; vector<int> gs; void removeEdge(int x, int a, int y) { for (size_t i = 0; i < node_label.size(); ++i) { int t; if (node_label[i] == x) t = y; else if (node_label[i] == y) t = x; else continue; for (size_t j = 0; j < edge_next[i].size(); ++j) { if (edge_label[i][j] == a && node_label[edge_next[i][j]] == t) { /* remove edge */ edge_label[i][j] = edge_label[i].back(); edge_label[i].pop_back(); edge_next[i][j] = edge_next[i].back(); edge_next[i].pop_back(); j--; } } } } bool hasEdge(int x, int a, int y) { for (size_t i = 0; i < node_label.size(); ++i) { int t; if (node_label[i] == x) t = y; else if (node_label[i] == y) t = x; else continue; for (size_t j = 0; j < edge_next[i].size(); ++j) if (edge_label[i][j] == a && node_label[edge_next[i][j]] == t) return true; } return false; } } *GS; struct GraphData { vector<int> nodel; vector<bool> nodev; vector<int> edgex; vector<int> edgey; vector<int> edgel; vector<bool> edgev; }; class EdgeFrequency { int *array; int u, v; public: void init(int max_node_label, int max_edge_label) { u = max_node_label + 1; v = u * (max_edge_label + 1); array = new int[u * v]; } int& operator()(int x, int a, int y) { return array[x * v + a * u + y]; } int operator()(int x, int a, int y) const { return array[x * v + a * u + y]; } } EF; struct Edge { int ix, iy; int x, a, y; Edge(int _ix, int _iy, int _x, int _a, int _y) : ix(_ix), iy(_iy), x(_x), a(_a), y(_y) {} bool operator<(const Edge &e) const { if (ix > iy) { if (e.ix < e.iy) return true; if (iy < e.iy || (iy == e.iy && a < e.a)) return true; } else if (e.ix < e.iy) { if (ix > e.ix) return true; if (ix == e.ix) { if (x < e.x) return true; if (x == e.x && (a < e.a || (a == e.a && y < e.y))) return true; } } return false; } }; struct GraphCode { vector<const Edge *> seq; vector<int> gs; }; vector<Graph *> S; // graph mining result void subgraph_mining(GraphCode &gc, int next); int main(int argc, char **argv) { clock_t clk = clock(); /* parse command line options */ assert(argc == 5); int num = atoi(argv[3]), denom = atoi(argv[4]); assert(num && denom && num <= denom); /* read graph data */ FILE *fp = fopen(argv[1], "r"); assert(fp); bool occ_node_label[LABEL_MAX + 1], occ_edge_label[LABEL_MAX + 1]; int freq_node_label[LABEL_MAX + 1], freq_edge_label[LABEL_MAX + 1]; memset(freq_node_label, 0, sizeof(freq_node_label)); memset(freq_edge_label, 0, sizeof(freq_edge_label)); GraphData *gd = NULL; vector<GraphData *> v_gd; while (1) { static char dummy[10]; if (fscanf(fp, "%s", dummy) <= 0) { if (gd) { v_gd.push_back(gd); for (int i = 0; i <= LABEL_MAX; ++i) { if (occ_node_label[i]) freq_node_label[i]++; if (occ_edge_label[i]) freq_edge_label[i]++; } } break; } if (*dummy == 't') { int id; fscanf(fp, "%s%d", dummy, &id); if (gd) { v_gd.push_back(gd); for (int i = 0; i <= LABEL_MAX; ++i) { if (occ_node_label[i]) freq_node_label[i]++; if (occ_edge_label[i]) freq_edge_label[i]++; } } if (id < 0) break; assert(id == (int)v_gd.size()); gd = new GraphData; memset(occ_node_label, 0, sizeof(occ_node_label)); memset(occ_edge_label, 0, sizeof(occ_edge_label)); } else if (*dummy == 'v') { int id, label; fscanf(fp, "%d%d", &id, &label); assert(id == (int)gd->nodel.size() && label <= LABEL_MAX); gd->nodel.push_back(label); gd->nodev.push_back(true); occ_node_label[label] = true; } else if (*dummy == 'e') { int x, y, label; fscanf(fp, "%d%d%d", &x, &y, &label); assert(x < (int)gd->nodel.size() && y < (int)gd->nodel.size() && label <= LABEL_MAX); gd->edgex.push_back(x); gd->edgey.push_back(y); gd->edgel.push_back(label); gd->edgev.push_back(true); occ_edge_label[label] = true; } else assert(0); } fclose(fp); min_support = v_gd.size() * num / denom; if (min_support < 1) min_support = 1; printf("%d graphs with minSup = %d\n", (int)v_gd.size(), min_support); /* sort labels of vertices and edges in GS by their frequency */ int rank_node_label[LABEL_MAX + 1], rank_edge_label[LABEL_MAX + 1]; for (int i = 0; i <= LABEL_MAX; ++i) { rank_node_label[i] = i; rank_edge_label[i] = i; } for (int i = LABEL_MAX; i > 0; --i) { for (int j = 0; j < i; ++j) { int tmp; if (freq_node_label[rank_node_label[j]] < freq_node_label[rank_node_label[j + 1]]) { tmp = rank_node_label[j]; rank_node_label[j] = rank_node_label[j + 1]; rank_node_label[j + 1] = tmp; } if (freq_edge_label[rank_edge_label[j]] < freq_edge_label[rank_edge_label[j + 1]]) { tmp = rank_edge_label[j]; rank_edge_label[j] = rank_edge_label[j + 1]; rank_edge_label[j + 1] = tmp; } } } /* remove infrequent vertices and edges */ /* ralabel the remaining vertices and edges in descending frequency */ int max_node_label = 0, max_edge_label = 0; for (int i = 0; i <= LABEL_MAX; ++i) { if (freq_node_label[rank_node_label[i]] >= min_support) max_node_label = i; if (freq_edge_label[rank_edge_label[i]] >= min_support) max_edge_label = i; } printf("remaining max vertex, edge label: %d, %d\n", max_node_label, max_edge_label); int rank_to_node_label[LABEL_MAX + 1], rank_to_edge_label[LABEL_MAX + 1]; memcpy(rank_to_node_label, rank_node_label, sizeof(rank_to_node_label)); for (int i = 0; i <= LABEL_MAX; ++i) rank_node_label[rank_to_node_label[i]] = i; memcpy(rank_to_edge_label, rank_edge_label, sizeof(rank_to_edge_label)); for (int i = 0; i <= LABEL_MAX; ++i) rank_edge_label[rank_to_edge_label[i]] = i; for (size_t i = 0; i < v_gd.size(); ++i) { GraphData &gd = *v_gd[i]; for (size_t j = 0; j < gd.nodel.size(); ++j) { if (freq_node_label[gd.nodel[j]] < min_support) gd.nodev[j] = false; else gd.nodel[j] = rank_node_label[gd.nodel[j]]; } for (size_t j = 0; j < gd.edgel.size(); ++j) { if (!gd.nodev[gd.edgex[j]] || !gd.nodev[gd.edgey[j]]) { gd.edgev[j] = false; continue; } if (freq_edge_label[gd.edgel[j]] < min_support) gd.edgev[j] = false; else gd.edgel[j] = rank_edge_label[gd.edgel[j]]; } /* re-map vertex index */ map<int, int> m; int cur = 0; for (size_t j = 0; j < gd.nodel.size(); ++j) { if (!gd.nodev[j]) continue; m[j] = cur++; } for (size_t j = 0; j < gd.edgel.size(); ++j) { if (!gd.edgev[j]) continue; gd.edgex[j] = m[gd.edgex[j]]; gd.edgey[j] = m[gd.edgey[j]]; } } /* build graph set */ nr_graph = (int)v_gd.size(); GS = new Graph[nr_graph]; for (int i = 0; i < nr_graph; ++i) { Graph &g = GS[i]; GraphData &gd = *v_gd[i]; for (size_t j = 0; j < gd.nodel.size(); ++j) if (gd.nodev[j]) g.node_label.push_back(gd.nodel[j]); g.edge_next = new vector<int>[g.node_label.size()]; g.edge_label = new vector<int>[g.node_label.size()]; for (size_t j = 0; j < gd.edgel.size(); ++j) { if (!gd.edgev[j]) continue; g.edge_label[gd.edgex[j]].push_back(gd.edgel[j]); g.edge_label[gd.edgey[j]].push_back(gd.edgel[j]); g.edge_next[gd.edgex[j]].push_back(gd.edgey[j]); g.edge_next[gd.edgey[j]].push_back(gd.edgex[j]); } } /* find all subgraphs with only one vertex for reference */ for (int i = 0; i <= max_node_label; ++i) { Graph *g = new Graph; g->node_label.push_back(i); for (int j = 0; j < nr_graph; ++j) { const Graph &G = GS[j]; for (size_t k = 0; k < G.node_label.size(); ++k) { if (G.node_label[k] == i) { g->gs.push_back(j); break; } } } S.push_back(g); } /* enumerate all frequent 1-edge graphs in GS */ EF.init(max_node_label, max_edge_label); for (int x = 0; x <= max_node_label; ++x) { for (int a = 0; a <= max_edge_label; ++a) { for (int y = x; y <= max_node_label; ++y) { int count = 0; for (int i = 0; i < nr_graph; ++i) if (GS[i].hasEdge(x, a, y)) count++; EF(x, a, y) = count; EF(y, a, x) = count; } } } for (int x = 0; x <= max_node_label; ++x) { for (int a = 0; a <= max_edge_label; ++a) { for (int y = x; y <= max_node_label; ++y) { if (EF(x, a, y) < min_support) continue; GraphCode gc; for (int i = 0; i < nr_graph; ++i) if (GS[i].hasEdge(x, a, y)) gc.gs.push_back(i); Edge e(0, 1, x, a, y); gc.seq.push_back(&e); subgraph_mining(gc, 2); /* GS <- GS - e */ for (int i = 0; i < nr_graph; ++i) GS[i].removeEdge(x, a, y); } } } /* output mining result */ printf("Found %d frequent subgraphs\n", (int)S.size()); fp = fopen(argv[2], "w"); assert(fp); for (int i = 0; i < (int)S.size(); ++i) { const Graph &g = *S[i]; fprintf(fp, "t # %d * %d\nx", i, (int)g.gs.size()); for (size_t j = 0; j < g.gs.size(); ++j) fprintf(fp, " %d", g.gs[j]); fprintf(fp, "\n"); for (size_t j = 0; j < g.node_label.size(); ++j) fprintf(fp, "v %d %d\n", (int)j, rank_to_node_label[g.node_label[j]]); if (g.node_label.size() < 2) { fprintf(fp, "\n"); continue; } for (int j = 0; j < (int)g.node_label.size(); ++j) { for (size_t k = 0; k < g.edge_label[j].size(); ++k) if (j < g.edge_next[j][k]) fprintf(fp, "e %d %d %d\n", j, g.edge_next[j][k], rank_to_edge_label[g.edge_label[j][k]]); } fprintf(fp, "\n"); } fclose(fp); printf("Total time: (%d / %d) seconds\n", (int)(clock() - clk), (int)CLOCKS_PER_SEC); return 0; } /* traverse a graph and find its minimal DFS code */ class Traveler { const vector<const Edge *> &s; const Graph &g; bool is_min; vector<int> g2s; vector<vector<bool> > f; void DFS(vector<int> &v, int c, int next) { if (c >= (int)s.size()) return; vector<int> bak; while (!v.empty()) { bool flag = false; int x = v.back(); for (size_t i = 0; i < g.edge_next[x].size(); ++i) { int y = g.edge_next[x][i]; if (f[x][y]) continue; flag = true; if (g2s[y] < 0) { Edge e(g2s[x], next, g.node_label[x], g.edge_label[x][i], g.node_label[y]); if (*s[c] < e) continue; if (e < *s[c]) { is_min = false; return; } g2s[y] = next; v.push_back(y); f[x][y] = true; f[y][x] = true; DFS(v, c + 1, next + 1); if (!is_min) return; f[x][y] = false; f[y][x] = false; v.pop_back(); g2s[y] = -1; } else { Edge e(g2s[x], g2s[y], g.node_label[x], g.edge_label[x][i], g.node_label[y]); if (*s[c] < e) continue; if (e < *s[c]) { is_min = false; return; } f[x][y] = true; f[y][x] = true; DFS(v, c + 1, next); if (!is_min) return; f[x][y] = false; f[y][x] = false; } } if (flag) break; bak.push_back(v.back()); v.pop_back(); } while (!bak.empty()) { v.push_back(bak.back()); bak.pop_back(); } } public: Traveler(const vector<const Edge *> &_s, const Graph &_g) : s(_s), g(_g), is_min(true) {} bool isMin() const { return is_min; } void travel() { g2s.resize(0); g2s.resize(g.node_label.size(), -1); f.resize(g.node_label.size()); for (size_t i = 0; i < g.node_label.size(); ++i) { f[i].resize(0); f[i].resize(g.node_label.size(), false); } for (size_t i = 0; i < g.node_label.size(); ++i) { int x = g.node_label[i]; if (x > s[0]->x) continue; assert(x == s[0]->x); vector<int> v(1, i); g2s[i] = 0; DFS(v, 0, 1); g2s[i] = -1; } } }; /* traverse a subgraph and find its children in graph */ class SubTraveler { const vector<const Edge *> &s; const Graph &g; set<Edge> &c; int next; vector<int> rm; // rightmost vector<int> s2g; vector<int> g2s; vector<vector<bool> > f; void DFS(int p) { int x, y; if (p >= (int)s.size()) { /* grow from rightmost path, pruning stage (3) */ int at = 0; while (at >= 0) { x = s2g[at]; for (size_t i = 0; i < g.edge_label[x].size(); ++i) { y = g.edge_next[x][i]; if (f[x][y]) continue; int ix = g2s[x], iy = g2s[y]; assert(ix == at); Edge ne(0, 1, g.node_label[x], g.edge_label[x][i], g.node_label[y]); /* pruning */ if (EF(ne.x, ne.a, ne.y) < min_support) continue; /* pruning stage (1) */ if (ne < *s[0]) continue; /* pruning stage (2) */ if (iy >= 0) { if (ix <= iy) continue; bool flag = false; for (size_t j = 0; j < g.edge_label[y].size(); ++j) { int z = g.edge_next[y][j], w = g.edge_label[y][j]; if (!f[y][z]) continue; if (w > ne.a || (w == ne.a && g.node_label[z] > ne.x)) { flag = true; break; } } if (flag) continue; } else iy = next; ne.ix = ix; ne.iy = iy; c.insert(ne); } at = rm[at]; } return; } const Edge &e = *s[p]; x = s2g[e.ix]; assert(g.node_label[x] == e.x); for (size_t i = 0; i < g.edge_label[x].size(); ++i) { if (g.edge_label[x][i] != e.a) continue; y = g.edge_next[x][i]; if (f[x][y]) continue; if (g.node_label[y] != e.y) continue; if (s2g[e.iy] < 0 && g2s[y] < 0) { s2g[e.iy] = y; g2s[y] = e.iy; f[x][y] = true; f[y][x] = true; DFS(p + 1); f[x][y] = false; f[y][x] = false; g2s[y] = -1; s2g[e.iy] = -1; } else { if (y != s2g[e.iy]) continue; if (e.iy != g2s[y]) continue; f[x][y] = true; f[y][x] = true; DFS(p + 1); f[x][y] = false; f[y][x] = false; } } } public: SubTraveler(const vector<const Edge *> &_s, const Graph &_g, set<Edge> &_c) : s(_s), g(_g), c(_c) {} void travel(int _next) { next = _next; s2g.resize(0); s2g.resize(next, -1); g2s.resize(0); g2s.resize(g.node_label.size(), -1); f.resize(g.node_label.size()); for (size_t i = 0; i < g.node_label.size(); ++i) { f[i].resize(0); f[i].resize(g.node_label.size(), false); } /* find rightmost path */ rm.resize(0); rm.resize(next, -1); for (size_t i = 0; i < s.size(); ++i) if (s[i]->iy > s[i]->ix && s[i]->iy > rm[s[i]->ix]) rm[s[i]->ix] = s[i]->iy; for (size_t i = 0; i < g.node_label.size(); ++i) { if (g.node_label[i] != s[0]->x) continue; s2g[s[0]->ix] = (int)i; g2s[i] = s[0]->ix; DFS(0); g2s[i] = -1; s2g[s[0]->ix] = -1; } } }; void subgraph_mining(GraphCode &gc, int next) { /* construct graph from DFS code */ Graph *g = new Graph; vector<const Edge *> &s = gc.seq; g->node_label.resize(next); g->edge_label = new vector<int>[next]; g->edge_next = new vector<int>[next]; for (size_t i = 0; i < s.size(); ++i) { int ix = s[i]->ix, iy = s[i]->iy, a = s[i]->a; g->node_label[ix] = s[i]->x; g->node_label[iy] = s[i]->y; g->edge_label[ix].push_back(a); g->edge_label[iy].push_back(a); g->edge_next[ix].push_back(iy); g->edge_next[iy].push_back(ix); } /* minimum DFS code pruning stage (4) */ Traveler t(s, *g); t.travel(); if (!t.isMin()) { delete[] g->edge_label; delete[] g->edge_next; delete g; return; } S.push_back(g); /* enumerate potential children with 1-edge growth */ map<Edge, vector<int> > m; for (size_t i = 0; i < gc.gs.size(); ++i) { set<Edge> c; SubTraveler st(s, GS[gc.gs[i]], c); st.travel(next); for (set<Edge>::const_iterator j = c.begin(); j != c.end(); ++j) m[*j].push_back(gc.gs[i]); } /* mining subgraph children */ for (map<Edge, vector<int> >::iterator i = m.begin(); i != m.end(); ++i) { const Edge *e = &i->first; vector<int> &spp = i->second; if ((int)spp.size() < min_support) continue; GraphCode child_gc; child_gc.gs.swap(spp); child_gc.seq = s; child_gc.seq.push_back(e); if (e->iy == next) subgraph_mining(child_gc, next + 1); else subgraph_mining(child_gc, next); } g->gs.swap(gc.gs); }