实验二 :聚类技术—复杂网络社团检测
实验内容
-
导入karate.gml中的空手道网络数据;
-
根据网络结构特征给出节点相似性度量指标;
-
采用层次聚类过程对网络数据进行聚类;
-
计算模块性指标Q值,当Q值最大时输出聚类结果;
-
采用Cytoscape工具,可视化聚类结果。
分析及设计
-
导入数据包:
- 用 Python 的 networkx 包中的 read_gml 方法读取“图”的数据;
- 观察图的信息:34个顶点、78条边的无向图;
-
构建节点相似度矩阵:
-
无向图的节点相似度矩阵
$$S_{i,j} = \frac{|N_i \cap N_j|}{|N_i \cup N_j|}$$
- $N_i$ 代表与节点 $i$ 相连的全部节点的集合;
- $N_j$ 代表与节点 $j$ 相连的全部节点的集合;
-
-
用平均相似度定义聚类密度:
- $density = \frac{1}{|C|(|C|-1)} \sum_{i,j \in C,\, i \neq j} S_{i,j}$,其中 $|C|$ 为簇 $C$ 中的节点数,$|C|(|C|-1)$ 即簇内满足 $i \neq j$ 的有序节点对的个数;
-
划分簇集:
- 贪心算法:
- 随机取一个点作为簇开始;
- 遍历所有剩余的点,每次加入一个使簇密度变化最小、且加入后簇密度不低于设定的密度阈值 threshold 的点;
- 重复上述过程,直到所有点都有划分。
- 贪心算法:
-
模块度Q值:
-
设置不同的密度阈值,划分簇集后计算Q值,输出Q最大的簇集,并用 matplotlib.pyplot 绘图;
详细实现
-
导入数据包.py
"""Load the karate-club graph and expose it, together with its size, as module globals."""
import networkx as nx
import matplotlib.pyplot as plt

# Path of the GML file, relative to the working directory.
gml_path = './数据包/karate.gml'

# label='id' asks networkx to key the nodes by the `id` attribute stored in
# the GML file; the other scripts use these keys to index a matrix.
G = nx.read_gml(gml_path, label='id')

# Node and edge counts of the loaded graph.
G_nn = G.number_of_nodes()
G_en = G.number_of_edges()

# Uncomment to preview the raw graph:
# nx.draw(G, with_labels=True, alpha=0.7)
# plt.show()
-
相似度矩阵.py
"""Pairwise similarity between the neighbourhoods of all graph nodes."""
from 实验二复杂网络社团检测 import 导入数据包 as shuju
import numpy as np

G = shuju.G

# Q maps every node to the set of nodes adjacent to it.
Q = {i: set(G[i]) for i in G}

# Similarity of two nodes in the undirected graph:
#     Sim(i, j) = |Q[i] & Q[j]| / |Q[i] | Q[j]|
# i.e. the number of shared neighbours divided by the number of nodes adjacent
# to either one.  The more neighbours two nodes share, the more alike they
# are; dividing by the union discounts a node that simply has many neighbours.
#
# Sim is indexed by node id on BOTH axes, so both dimensions are sized from
# the node count (the second axis used to be sized from the edge count).
# The extra row/column is kept from the original allocation -- presumably
# because node ids start at 1; confirm against the data file.
Sim = np.zeros((shuju.G_nn + 1, shuju.G_nn + 1))

for i in G:
    for j in G:
        union_size = len(Q[i] | Q[j])
        # Two nodes without any neighbours have an empty union; score them 0
        # instead of dividing by zero.
        Sim[i, j] = len(Q[i] & Q[j]) / union_size if union_size else 0.0
-
密度函数.py
"""Cluster density defined as the average pairwise similarity of its members."""
from 实验二复杂网络社团检测 import 相似度矩阵 as sim

Sim = sim.Sim


def density_avg(club):
    """Return the mean of Sim[a, b] over all ordered pairs a != b taken from `club`.

    `club` is a sequence of node ids used to index `Sim`.  A cluster holding
    exactly one node is given density 1.0.
    """
    size = len(club)
    if size == 1:
        return 1.0

    total = 0.0
    for a in club:
        for b in club:
            if a == b:
                # A node's similarity to itself is not part of the average.
                continue
            total += Sim[a, b]

    # size**2 - size is the number of (a, b) combinations with a != b.
    return total / (size ** 2 - size)
-
划分簇集.py
"""Greedy partition of the graph nodes into clusters bounded by a density threshold."""
import random
from 实验二复杂网络社团检测 import 密度函数 as md, 导入数据包 as shuju

d_avg = md.density_avg
G = shuju.G


def find_clubs(threshold=0.20, density_fun=d_avg, rng=None):
    """Split the nodes of `G` into clusters with a greedy, density-driven scan.

    A cluster starts from a randomly chosen unassigned node.  At every step
    the unassigned node whose addition changes the cluster density the least
    (equivalently: leaves the cluster with the highest density) is considered.
    It is added when the resulting density is at least `threshold`; otherwise
    the cluster is closed and a new one is started.  This repeats until every
    node has been assigned.

    Args:
        threshold: minimum density a cluster must keep after a node is added.
        density_fun: callable taking a list of nodes and returning its density.
        rng: optional object with a ``choice`` method (e.g. ``random.Random(0)``)
            used to pick the starting nodes; defaults to the ``random`` module.
            Pass a seeded generator to make the partition reproducible.

    Returns:
        A list of clusters, each a list of nodes.
    """
    picker = random if rng is None else rng

    clubs = []
    candidate = list(G.nodes)  # nodes not yet assigned to any cluster
    one_club = []              # cluster currently being grown

    while candidate:
        if not one_club:
            # Seed a new cluster with a random unassigned node.
            picked = picker.choice(candidate)
            candidate.remove(picked)
            one_club.append(picked)
            if not candidate:
                clubs.append(one_club)
            continue

        # Among the remaining nodes, find the one that keeps the density
        # highest.  Ties keep the first node encountered.
        best_node = None
        best_density = float('-inf')
        for node in candidate:
            new_density = density_fun(one_club + [node])
            if new_density > best_density:
                best_density, best_node = new_density, node

        if best_node is None or best_density < threshold:
            # Even the best remaining node would break the threshold:
            # the current cluster is complete.
            clubs.append(one_club)
            one_club = []
        else:
            candidate.remove(best_node)
            one_club.append(best_node)
            if not candidate:
                # Every node has been assigned; flush the last cluster.
                clubs.append(one_club)
                one_club = []

    return clubs
-
模块度Q.py
"""Modularity (Q) of a partition of an undirected graph."""
from 实验二复杂网络社团检测 import 相似度矩阵 as sim
import numpy as np


def Qf(club, E):
    """Return the modularity Q of the partition `club` for the graph `E`.

    With m the number of edges, e_c the number of edges inside community c
    and d_c the summed degree of its nodes:

        Q = sum_c e_c / m  -  sum_c d_c**2 / (2 * m)**2

    Args:
        club: list of communities, each a collection of (hashable) nodes.
        E: adjacency mapping ``node -> set of neighbour nodes``.  Every
            undirected edge is counted once through the ``key < neighbour``
            test, so self-loops are ignored.

    Returns:
        The modularity as a float; 0.0 when the graph has no edges.

    Raises:
        ValueError: if an edge endpoint does not belong to any community.

    Example (three triangles hanging off node 10):
        club = [[1, 2, 3], [4, 5, 6], [7, 8, 9, 10]]
        E = {1: {2, 3, 10}, 2: {1, 3}, 3: {1, 2}, 4: {5, 6, 10}, 5: {4, 6},
             6: {4, 5}, 7: {8, 9, 10}, 8: {7, 9}, 9: {7, 8}, 10: {1, 4, 7}}
        Qf(club, E)  # -> 0.4895833...
    """
    n_clubs = len(club)

    # One pass over the partition instead of rescanning every community for
    # every edge.  If a node is listed twice, the later community wins.
    community_of = {
        node: idx for idx, members in enumerate(club) for node in members
    }

    # C[a][b] = number of edges between communities a and b (symmetric);
    # C[a][a] = number of edges inside community a.
    C = np.zeros((n_clubs, n_clubs))
    for key, value in E.items():
        for other in value:
            if key < other:
                try:
                    ai = community_of[key]
                    aj = community_of[other]
                except KeyError as err:
                    raise ValueError(
                        "node %r is not assigned to any community" % (err.args[0],)
                    ) from err
                C[ai, aj] += 1
                C[aj, ai] = C[ai, aj]

    # The upper triangle including the diagonal counts every edge exactly once.
    intra_edges = np.trace(C)
    edges_num = np.triu(C).sum()
    if edges_num == 0:
        return 0.0

    # Degree sum of a community: edges to other communities count once,
    # internal edges count twice (both endpoints are inside).
    degree = C.sum(axis=0) + np.diag(C)

    return float(intra_edges / edges_num - (degree ** 2).sum() / (2 * edges_num) ** 2)
-
绘图.py
"""Search for the density threshold with the highest modularity and plot the result."""
from 实验二复杂网络社团检测 import 划分簇集 as cj, 模块度Q as mk, 相似度矩阵 as sim
import matplotlib.pyplot as plt
import networkx as nx
import random
import numpy as np

G = cj.G


def getRandomColor():
    """Return a random colour code: '#' followed by six characters from 1-9 / A-F."""
    # NOTE(review): the palette has no '0'; kept as in the original.
    colorArr = ['1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F']
    return "#" + "".join(random.choice(colorArr) for _ in range(6))


def draw_clubs(clubs):
    """Print every cluster and draw it in its own random colour, titled with the Q value."""
    # A single layout is computed up front so all draw calls share node positions.
    pos = nx.spring_layout(G)
    Q = mk.Qf(clubs, sim.Q)
    plt.title('Q=%.2f' % Q)
    for club in clubs:
        print(club)
        nx.draw(G, pos=pos, nodelist=club, node_color=getRandomColor(), with_labels=True)


def find_the_best_Qclubs(density_fun):
    """Cluster once per threshold in [0.00, 1.00) step 0.01 and return the best partition.

    "Best" means the highest modularity Q.  The first partition is always
    accepted as the initial best, so a partition is returned even when no Q
    value compares greater than the starting bound (e.g. when Q is NaN).
    """
    thresholds = np.arange(0.00, 1.00, 0.01)
    Qmax = -1
    best_clubs = None
    for threshold in thresholds:
        clubs = cj.find_clubs(threshold, density_fun)
        Q = mk.Qf(clubs, sim.Q)
        if best_clubs is None or Q > Qmax:
            Qmax = Q
            best_clubs = clubs
    return best_clubs


def show(density_fun):
    """Find the best partition for `density_fun` and display it in a 12x10 figure."""
    plt.figure(figsize=(12, 10))
    clubs = find_the_best_Qclubs(density_fun)
    draw_clubs(clubs)
    plt.show()


show(cj.d_avg)
实验结果