标准PageRank
原理
$$PR(v_j^k) = d \cdot \sum_{v_i \in in(v_j)} \frac{PR(v_i^{k-1})}{|OUT(v_i)|} + (1-d)$$
代码
#include <iostream>
#include <fstream>
#include <vector>
#include <set>
using namespace std;
// Damping factor d of the PageRank formula: every contribution sent along an
// out-link is scaled by d, and every page additionally receives (1 - d).
constexpr double d = 0.85;
// Convergence tolerance compared against the per-page change between two
// consecutive iterations.
constexpr double threshold = 1e-7;
// Per-page state used by the standard PageRank iteration.
struct Page
{
    std::vector<int> outPage; // ids of the pages this page links to (out-neighbours)
    double oldPR{0.0};        // rank at the end of the previous iteration
    double newPR{0.0};        // rank being accumulated during the current iteration
};
// Reads the edge-list file and returns how many distinct vertex ids occur in it.
// Every successfully parsed pair "u v" is taken as one directed edge u -> v.
// If the file cannot be opened nothing is parsed and the result is 0.
int getN()
{
    std::ifstream edgeFile("/Users/joy/git-repo/PageRank-cpp/a.txt");
    std::set<int> vertexIds;
    for (int from = 0, to = 0; edgeFile >> from >> to; )
    {
        vertexIds.insert(from);
        vertexIds.insert(to);
    }
    return static_cast<int>(vertexIds.size());
}
//把每个页面的信息(outPage,oldPR,newPR)存到向量里
void getPagesVec(int N, vector<Page>& pages)
{
ifstream inFile("/Users/joy/git-repo/PageRank-cpp/a.txt");
int u, v;
while(inFile >> u >> v)
pages[u].outPage.push_back(v);
}
int main()
{
int N = getN(); // N是页面总数
vector<Page> pages(N); //pages存放页面向量,大小为N
getPagesVec(N, pages);
int cnt = 0; //统计迭代几轮
int shouldStop = 0; //根据oldPR与newPR的差值 判断是否停止迭代
// 开始pr值的迭代更新
while(!shouldStop)
{
shouldStop = 1;
//对于i->j,根据公式,页面j能接收到的PR值为:PR(j) = d * sum(PR(i)/OUT(i)) + (1-d)
//因此每个顶点i向出页面发送的PR值为:oldPR/outDegree
for(auto &page: pages)
{
int outDegree = page.outPage.size();
double tmpPR = page.oldPR / outDegree;
for(int j = 0; j < outDegree; j++)
pages[page.outPage[j]].newPR += tmpPR * d;
}
for(auto &page: pages)
{
page.newPR += 1-d;
if(abs(page.newPR - page.oldPR) < threshold)
shouldStop = 0;
page.oldPR = page.newPR;
page.newPR = 0;
}
cnt++;
}
printf("%s%d%s\n", "共迭代", cnt, "轮");
for(auto page: pages)
printf("%.9lf\n", page.oldPR/N);
return 0;
}
使用的数据集
0 2
0 1
0 3
1 0
1 3
2 0
3 1
3 2
运行结果
共迭代90轮
0.324561278
0.225146115
0.225146115
0.225146115
增量PageRank
原理
$$\begin{aligned} delta(v_j^k) &= d \cdot \sum_{v_i \in in(v_j)} \frac{delta(v_i^{k-1})}{|OUT(v_i)|} \\ value(v_j^k) &\mathrel{+}= delta(v_j^k) \end{aligned}$$
代码
#include <iostream>
#include <fstream>
#include <vector>
#include <set>
using namespace std;
// Damping factor d: every delta forwarded along an out-link is scaled by d,
// and (1 - d) is the initial delta each page starts with.
constexpr double d = 0.85;
// Convergence tolerance compared against the delta a page receives in one
// iteration.
constexpr double threshold = 1e-7;
struct Page
{
vector<int> outPage; //出邻居
double value = 0;
double oldDelta= 0;
double recvDelta = 1-d;
};
// Counts the distinct vertex ids that appear in the edge-list file.
// The file is read as whitespace-separated integer pairs "u v", each pair being
// one directed edge u -> v. Returns 0 when nothing could be read.
int getN()
{
    std::set<int> seen;
    std::ifstream edges("/Users/joy/git-repo/PageRank-cpp/a.txt");
    int src = 0;
    int dst = 0;
    while (true)
    {
        if (!(edges >> src >> dst))
            break;
        seen.insert(src);
        seen.insert(dst);
    }
    const int vertexCount = static_cast<int>(seen.size());
    return vertexCount;
}
//把每个页面的信息(outPage,value,oldDelta,recvDelta)存到向量里
void getPagesVec(int N, vector<Page>& pages)
{
ifstream inFile("/Users/joy/git-repo/PageRank-cpp/a.txt");
int u, v;
while(inFile >> u >> v)
pages[u].outPage.push_back(v);
}
int main()
{
int N = getN(); // N是页面总数
vector<Page> pages(N); //pages存放页面向量,大小为N
getPagesVec(N, pages);
int cnt = 0; //统计迭代几轮
int shouldStop = 0; //根据oldPR与newPR的差值 判断是否停止迭代
// 开始迭代
while(!shouldStop)
{
shouldStop = 1;
//对于i->j,根据公式,页面j能接收到的delta值为:
//recvDelta(j) = oldDelta(i) / outDegree
//因此每个顶点i向出邻居发送的delta值为:oldDelta(i)
for(auto &page: pages)
{
int outDegree = page.outPage.size();
double tmpDelta = page.oldDelta / outDegree;
for(int j = 0; j < outDegree; j++)
pages[page.outPage[j]].recvDelta += tmpDelta * d;
}
for(auto &page: pages)
{
if(page.recvDelta < threshold)
shouldStop = 0;
page.value += page.recvDelta;
page.oldDelta = page.recvDelta;
page.recvDelta = 0;
}
cnt++;
}
printf("%s%d%s\n", "共迭代", cnt, "轮");
for(auto page: pages)
printf("%.9lf\n", page.value/N);
return 0;
}
使用的数据集
0 2
0 1
0 3
1 0
1 3
2 0
3 1
3 2
运行结果
共迭代90轮
0.324561278
0.225146115
0.225146115
0.225146115
使用networkx进行验证
代码
以下这个利用python的第三方库networkx实现的pagerank算法可以用来测试自己代码正确性:
import networkx as nx

# Edge-list file: one directed edge "head tail" per line.
filename = 'dataset/a.txt'

G = nx.DiGraph()
with open(filename) as file:
    for line in file:
        fields = line.split()
        if not fields:
            continue  # tolerate blank lines, e.g. a trailing empty line at EOF
        head, tail = [int(x) for x in fields]
        G.add_edge(head, tail)

# pagerank() returns a dict mapping each node to its score.
pr = nx.pagerank(G)
print(pr)
运行结果
{0: 0.3245609358176832,
2: 0.22514635472743894,
1: 0.22514635472743894,
3: 0.22514635472743896}