1.构造数据集
将facebook无向图的边的50%随机删除作为训练集,再将其中50%随机删除作为测试集。针对训练集未来会产生边的点构成一个source集,source里面的每个点构建相应的positive(将产生边)和negative(不产生边)样本集。
2.构造特征
边的特征:基于图结构(公共neighbor等)
点的特征:性别等个体特征
3.计算边strength,作为状态转移矩阵的初始化值
边的strength a 由特征权重系数和特征进行点乘获得
def calStrength(features, beta):
    """Edge strength: logistic function of the feature/weight dot product.

    Parameters
    ----------
    features : array-like
        Feature vector of one edge.
    beta : array-like
        Feature weight coefficients (same length as ``features``).

    Returns
    -------
    float
        Strength in (0, 1).
    """
    x = np.dot(features, beta)
    # The logistic form was chosen (over plain exp) to avoid floating-point
    # overflow/underflow, but the naive 1/(1+exp(-x)) still overflows np.exp
    # for large negative x.  Use the numerically stable identity
    #   sigmoid(x) = exp(-logaddexp(0, -x))
    # which is exact and safe for any magnitude of x.
    return np.exp(-np.logaddexp(0.0, -x))
def genTrans(nnodes, g, features, s, alpha, beta):
    """Build one teleporting transition matrix per source node.

    Parameters
    ----------
    nnodes : int
        Number of nodes in the graph.
    g : sequence of (u, v) pairs
        Edge list; the graph is undirected, so both directions are filled.
    features : mapping
        Per-edge features, indexed as ``features[u][v]``.
    s : sequence of int
        Source nodes; one transition matrix is produced per source.
    alpha : float
        Teleportation (restart) probability back to the source.
    beta : array-like
        Feature weight coefficients forwarded to ``calStrength``.

    Returns
    -------
    list of np.ndarray
        ``len(s)`` matrices of shape ``(nnodes, nnodes)``.
    """
    trans = np.zeros((nnodes, nnodes))
    for u, v in g:
        # Strength is symmetric because the graph is undirected.
        strength = calStrength(features[u][v], beta)
        trans[u, v] = strength
        trans[v, u] = strength
    # Row-normalize so each row is a probability distribution
    # (rows of isolated nodes stay all-zero).
    for i in range(nnodes):
        rowsum = trans[i].sum()
        if rowsum > 0:
            # BUGFIX: the original assigned map(...) into the row; in
            # Python 3 map() returns a lazy iterator, which NumPy cannot
            # broadcast into a row.  Divide in place instead.
            trans[i] /= rowsum
    # One matrix per source: blend the normalized walk matrix with a
    # teleport matrix that jumps from every node back to the source.
    trans_multi = []
    for src in s:
        one = np.zeros((nnodes, nnodes))
        one[:, src] = 1.0
        trans_multi.append((1 - alpha) * trans + alpha * one)
    return trans_multi
4.求source中所有点的page score
采用 power iteration:
def iterPageRank(pp, trans):
    """Power iteration: repeatedly multiply the distribution by `trans`
    until successive iterates agree to np.allclose tolerance, then return
    the converged PageRank vector."""
    current, following = pp, np.dot(pp, trans)
    while not np.allclose(current, following):
        current = following
        following = np.dot(current, trans)
    return following
5. 构建目标方程
我们希望negative样本的pagerank score 越小越好,而positive样本的pagerank score 越大越好,所以目标方程最小化两者之差,再加上正则项。
def minObj(Dset, Lset, offset, lam, nnodes, g, features, source, alpha, beta):
    """Objective value for the current weight vector `beta`.

    For every source node, runs PageRank on its teleporting transition
    matrix and accumulates costFunc over all (d, l) pairs drawn from
    Dset[i] x Lset[i], then adds the L2 penalty lam * ||beta||^2.
    """
    # Per-edge features in matrix form, then one transition matrix per source.
    feat_matrix = genFeatures(nnodes, g, features)
    trans_list = genTrans(nnodes, g, feat_matrix, source, alpha, beta)
    total = 0
    for idx in range(len(source)):
        # Power iteration starts from the uniform distribution.
        pp = np.repeat(1.0 / nnodes, nnodes)
        scores = iterPageRank(pp, trans_list[idx])
        # Pairwise cost between the two sample sets for this source.
        total += sum(
            costFunc(scores[l], scores[d], offset)
            for d in Dset[idx]
            for l in Lset[idx]
        )
    # L2 regularization on the feature weights.
    return total + lam * np.dot(beta, beta)
6.梯度下降优化
其中第二个式子求导可以采用 power iteration
def objDiff(Dset, Lset, offset, lam, nnodes, g, features, source, alpha, beta):
    """Gradient of the minObj objective with respect to beta.

    Mirrors minObj: rebuilds the feature matrix and the per-source
    transition matrices, runs PageRank per source, then for each beta
    component k propagates d(trans)/d(beta[k]) through the PageRank fixed
    point via iterPageDiff and accumulates the chain-rule derivative of
    costFunc over all (d, l) pairs.  The L2-penalty derivative
    2*lam*beta[k] is added once per component at the end.

    Returns a numpy array of length len(beta).
    """
    diffVec = [0] * len(beta)
    # calculate PageRank according to features and beta values
    # transform input features into matrix form
    features_m = genFeatures(nnodes, g, features)
    ###########################################################
    ### trans_p and transDiff are independent of source node ##
    ###########################################################
    # trans_p is the original transition matrix
    # (without teleportation and varying strength)
    # this is used to calculate gradient of transition matrix
    trans_p = genTrans_plain(nnodes, g, [0], 0)[0]
    # a list of matrices is returned by diffQ function
    # (presumably one matrix per beta component, i.e. d(trans)/d(beta[k])
    #  -- confirm against diffQ)
    transDiff = diffQ(features_m, beta, trans_p, alpha)
    ###########################################################
    ###########################################################
    # compute transition matrices for sources
    trans = genTrans(nnodes, g, features_m, source, alpha, beta)
    # calculate gradient for every selected sources nodes
    for i in range(len(source)):
        # power iteration starts from the uniform distribution
        pp = np.repeat(1.0/nnodes, nnodes)
        pgrank = iterPageRank(pp, trans[i])
        for k in range(len(beta)):
            tempObjDiff = 0
            # NOTE(review): pDiff starts with shape (1, nnodes) but is
            # indexed below as pDiff[l] with a scalar node index, which
            # assumes iterPageDiff returns a flat length-nnodes vector --
            # confirm against iterPageDiff.
            pDiff = np.zeros((1, nnodes))
            pDiff = iterPageDiff(pDiff, pgrank, trans[i], transDiff[k])
            for d in Dset[i]:
                for l in Lset[i]:
                    # chain rule: cost'(p_l, p_d) * (dp_l/dbeta_k - dp_d/dbeta_k)
                    tempObjDiff += costDiff(pgrank[l], pgrank[d], offset)*(pDiff[l] - pDiff[d])
            # penalty term
            #tempObjDiff += 2.0 * lam * beta[k]
            #diffVec.append(tempObjDiff)
            diffVec[k] += tempObjDiff
    # penalty term: derivative of lam * ||beta||^2
    for k in range(len(beta)):
        diffVec[k] += 2.0 * lam * beta[k]
    return np.asarray(diffVec)