1. A Brief Introduction to PageRank
PageRank is Google's proprietary algorithm for measuring how important a given web page is relative to the other pages in the search engine's index. It was invented by Larry Page and Sergey Brin in the late 1990s. PageRank turned the notion of link value into a ranking factor: it is the core algorithm Google uses to score every page, the key to how Google "finds gold in the garbage", and the algorithm that made Google what it is today.
PageRank has two key properties:
- PR value is transferred: when page A links to page B, part of A's PR value is passed on to B.
- Importance is transferred: an important page passes more weight to its targets than an unimportant one does.
To keep things simple, the example in this post avoids pages that link to themselves.
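In formula form, a page $A$ with $L(A)$ out-links passes $PR(A)/L(A)$ to each page it links to: a page with PR 0.3 and three out-links, for example, contributes 0.1 to each target. This division by the out-link count is exactly what the probability-matrix job below performs.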
2. The Algorithm
The computation proceeds as follows: first compute the transition probability matrix, then update the PR values, then normalize. Normalizing simply means dividing every element of the PR vector by the sum of all PR values. Finally, the update step is iterated (ten times in this post).
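Concretely, the update rule that the reducer in step 2 implements is the standard damped PageRank iteration, here with damping factor $d = 0.85$ and $N = 4$ users:

$$PR_{t+1}(i) = d \sum_{j \to i} \frac{PR_t(j)}{L(j)} + \frac{1-d}{N}$$

The sum runs over every page $j$ that links to $i$, and $L(j)$ is $j$'s out-link count. The probability matrix stores exactly the $1/L(j)$ factors, so one update is a matrix-vector multiplication followed by the damping correction.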
3. Computation Flow
Generate the probability matrix once, run the PR update ten times, then normalize the final vector. Each of these steps is one MapReduce job, shown in order below.
4. Implementation
1. Generating the probability matrix
Note that this job writes the matrix out column by column: the line for user j holds the probabilities of j linking to each of the users, i.e. column j of the transition matrix used in the multiplication. For example:
1 0.0,0.33333334,0.33333334,0.33333334
2 0.0,0.0,0.5,0.5
3 0.0,0.0,0.0,1.0
4 0.0,1.0,0.0,0.0
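For reference, the edge list that yields this matrix would be the following seven source-target pairs (the exact delimiter in the author's page.csv is not shown here; HadoopUtils.SPARATOR just needs to split each line into the two ids). The initial pagerank.csv starts every user at PR 1.0, which is the starting value that reproduces the final numbers at the end of this post:

page.csv:
1	2
1	3
1	4
2	3
2	4
3	4
4	2

pagerank.csv:
1	1.0
2	1.0
3	1.0
4	1.0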
code:
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class AdjacencyMatrix {

    /**
     * Emits the adjacency list.
     */
    public static class AdjacencyMapper extends Mapper<LongWritable, Text, Text, Text> {

        Text k = new Text();
        Text v = new Text();

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
            System.out.println("AdjacencyMapper input:");
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // print the current input line
            System.out.println(value.toString());
            String[] strArr = HadoopUtils.SPARATOR.split(value.toString());
            // source user id as the key, target user id as the value
            k.set(strArr[0]);
            v.set(strArr[1]);
            context.write(k, v);
        }
    }

    /**
     * Input: the adjacency list.
     * Output: the adjacency probability matrix.
     */
    public static class AdjacencyReducer extends Reducer<Text, Text, Text, Text> {

        Text v = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // total number of users; each output line has one entry per user
            int nums = 4;
            // this user's 0/1 row of the adjacency matrix
            float[] U = new float[nums];
            // this user's out-link count
            int out = 0;
            StringBuilder printSb = new StringBuilder();
            for (Text value : values) {
                // the target user id from the value
                int targetUserIndex = Integer.parseInt(value.toString());
                // every linked target gets a 1 in the adjacency matrix, the rest stay 0
                U[targetUserIndex - 1] = 1;
                out++;
                printSb.append(",").append(value.toString());
            }
            // print the reducer input
            System.out.println("AdjacencyReducer input:");
            System.out.println(key.toString() + ":" + printSb.toString().replaceFirst(",", ""));
            // dividing by the out-link count turns the 0/1 row into probabilities
            StringBuilder stringBuilder = new StringBuilder();
            for (int i = 0; i < nums; i++) {
                stringBuilder.append(",").append(U[i] / out);
            }
            v.set(stringBuilder.toString().replaceFirst(",", ""));
            System.out.println("AdjacencyReducer output:");
            System.out.println(key.toString() + ":" + v.toString());
            System.out.println();
            context.write(key, v);
        }
    }

    public static void run() throws InterruptedException, IOException, ClassNotFoundException {
        Configuration conf = new Configuration();
        String inPath = "/pagerank/page.csv";
        String outPath = "/pagerank/probility-matrix";
        Job job = Job.getInstance(conf, "AdjacencyMatrix");
        HDFSUtils hdfs = new HDFSUtils(conf);
        hdfs.deleteDir(outPath);
        job.setJarByClass(AdjacencyMatrix.class);
        job.setMapperClass(AdjacencyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setReducerClass(AdjacencyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(inPath));
        FileOutputFormat.setOutputPath(job, new Path(outPath));
        job.waitForCompletion(true);
    }

    public static void main(String[] args) throws Exception {
        AdjacencyMatrix.run();
    }
}
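Walking through one reduce call: user 1 arrives with targets 2, 3 and 4, so U becomes [0, 1, 1, 1] and out is 3; dividing each entry by 3 produces the line 1 0.0,0.33333334,0.33333334,0.33333334 shown above.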
2. Updating the PR values
code:
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CalcPageRank {

    /**
     * Input: the adjacency probability matrix and the PR matrix.
     * Following the matrix multiplication formula, routes the values that
     * must be multiplied together to the same reduce call.
     */
    public static class CalcPeopleRankMapper extends Mapper<LongWritable, Text, Text, Text> {

        Text k = new Text();
        Text v = new Text();
        String flag = "";

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
            // the file name tells us which of the two inputs this split comes from
            FileSplit fileSplit = (FileSplit) context.getInputSplit();
            flag = fileSplit.getPath().getName();
            System.out.println("CalcPeopleRankMapper input type:");
            System.out.println(flag);
        }

        /**
         * The key k lines the PR vector up with the matching part of the
         * adjacency matrix: the new PR of user i needs column i of the matrix
         * together with the whole current PR vector, so both are sent to
         * reducer i.
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            System.out.println(value.toString());
            int nums = 4;
            // a line of the PR matrix
            if (flag.startsWith("pagerank")) {
                String[] strArr = HadoopUtils.SPARATOR.split(value.toString());
                // strArr[0] is the user id; every entry of the PR vector is
                // needed by every reducer, so emit it under all nums keys
                for (int i = 1; i <= nums; i++) {
                    k.set(String.valueOf(i));
                    // "pr" is a tag, strArr[0] the entry's index, strArr[1] its value
                    v.set("pr:" + strArr[0] + "," + strArr[1]);
                    context.write(k, v);
                }
            }
            // a line of the adjacency probability matrix
            else {
                // HadoopUtils.SPARATOR (the author's helper) splits on the tab
                // and the commas, giving [user id, p1, p2, ..., p_nums]
                String[] strArr = HadoopUtils.SPARATOR.split(value.toString());
                System.out.println("strArr.length " + strArr.length);
                for (int i = 1; i < strArr.length; i++) {
                    k.set(String.valueOf(i));
                    // "matrix" is a tag, strArr[0] the source user id, i the
                    // column, strArr[i] the probability of strArr[0] linking to i
                    v.set("matrix:" + strArr[0] + "," + strArr[i]);
                    context.write(k, v);
                }
            }
        }
    }

    /**
     * Each reduce call receives the values for one entry of the new PR
     * vector: one column of the matrix and the full current PR vector.
     */
    public static class CalcPeopleRankReducer extends Reducer<Text, Text, Text, Text> {

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            System.out.println("CalcPeopleRankReducer input:");
            StringBuilder printStr = new StringBuilder();
            Text v = new Text();
            // total number of users
            int nums = 4;
            // damping factor
            float d = 0.85f;
            // accumulator for the new PR value
            float pr = 0f;
            // the current PR vector, keyed by user id
            Map<Integer, Float> prMap = new HashMap<Integer, Float>();
            // one column of the matrix, keyed by source user id
            Map<Integer, Float> matrixMap = new HashMap<Integer, Float>();
            // sort the incoming values into the two maps
            for (Text value : values) {
                String valueStr = value.toString();
                String[] kv = HadoopUtils.SPARATOR.split(valueStr.split(":")[1]);
                if (valueStr.startsWith("pr")) {
                    prMap.put(Integer.parseInt(kv[0]), Float.valueOf(kv[1]));
                } else {
                    matrixMap.put(Integer.parseInt(kv[0]), Float.valueOf(kv[1]));
                }
                printStr.append(",").append(valueStr);
            }
            System.out.println(printStr.toString().replaceFirst(",", ""));
            // the dot product: sum over all users j of P(j -> i) * PR(j)
            for (Map.Entry<Integer, Float> entry : matrixMap.entrySet()) {
                pr += entry.getValue() * prMap.get(entry.getKey());
            }
            // apply the damping factor
            pr = pr * d + (1 - d) / nums;
            v.set(String.valueOf(pr));
            System.out.println("CalcPeopleRankReducer output:");
            System.out.println(key.toString() + ":" + v.toString());
            System.out.println();
            context.write(key, v);
        }
    }

    public static void run() throws InterruptedException, IOException, ClassNotFoundException {
        Configuration conf = new Configuration();
        String inPath1 = "/pagerank/probility-matrix/part-*";
        String inPath2 = "/pagerank/pagerank.csv";
        String outPath = "/pagerank/pr/";
        Job job = Job.getInstance(conf, "CalcPageRank");
        HDFSUtils hdfs = new HDFSUtils(conf);
        hdfs.deleteDir(outPath);
        job.setJarByClass(CalcPageRank.class);
        job.setMapperClass(CalcPeopleRankMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setReducerClass(CalcPeopleRankReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(inPath1));
        FileInputFormat.addInputPath(job, new Path(inPath2));
        FileOutputFormat.setOutputPath(job, new Path(outPath));
        job.waitForCompletion(true);
        // replace pagerank.csv with this round's output so the next
        // iteration reads the updated PR values
        hdfs.deleteDir(inPath2);
        hdfs.rename(outPath + "/part-r-00000", inPath2);
    }

    public static void main(String[] args) throws Exception {
        CalcPageRank.run();
    }
}
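To make the shuffle concrete: in the first iteration (every PR still 1.0), the reduce call for key 2 receives the full PR vector plus column 2 of the matrix, i.e. the values

pr:1,1.0  pr:2,1.0  pr:3,1.0  pr:4,1.0  matrix:1,0.33333334  matrix:2,0.0  matrix:3,0.0  matrix:4,1.0

The dot product is 0.33333334 * 1.0 + 0.0 + 0.0 + 1.0 * 1.0 = 1.3333334, and damping gives the new PR of user 2: 1.3333334 * 0.85 + 0.15 / 4 ≈ 1.1708.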
3. Normalization
The code is straightforward: the map phase simply emits each input line under the single key "finally", with the user id and PR value as the value, so that one reduce call sees the whole vector.
code:
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Standardization {

    public static class FinallyResultMapper extends Mapper<LongWritable, Text, Text, Text> {

        // every line goes to the same key so that a single reduce call
        // sees the whole PR vector
        Text k = new Text("finally");

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
            System.out.println("Standardization input:");
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            System.out.println(value.toString());
            context.write(k, value);
        }
    }

    public static class FinallyResultReducer extends Reducer<Text, Text, Text, Text> {

        Text k = new Text();
        Text v = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            System.out.println("Standardization input:");
            StringBuilder printStr = new StringBuilder();
            // sum of all PR values
            float totalPr = 0f;
            List<String> list = new ArrayList<String>();
            // first pass: cache the lines and accumulate the total
            for (Text value : values) {
                String valueStr = value.toString();
                list.add(valueStr);
                String[] strArr = HadoopUtils.SPARATOR.split(valueStr);
                totalPr += Float.parseFloat(strArr[1]);
                printStr.append(",").append(valueStr);
            }
            System.out.println(printStr.toString().replaceFirst(",", ""));
            // second pass: divide every PR value by the total
            for (String s : list) {
                String[] strArr = HadoopUtils.SPARATOR.split(s);
                k.set(strArr[0]);
                v.set(String.valueOf(Float.parseFloat(strArr[1]) / totalPr));
                context.write(k, v);
                System.out.println("Standardization output:");
                System.out.println(k.toString() + ":" + v.toString());
                System.out.println();
            }
        }
    }

    public static void run() throws InterruptedException, IOException, ClassNotFoundException {
        Configuration conf = new Configuration();
        String inPath = "/pagerank/pagerank.csv";
        String outPath = "/pagerank/finally-result";
        Job job = Job.getInstance(conf, "Standardization");
        HDFSUtils hdfs = new HDFSUtils(conf);
        hdfs.deleteDir(outPath);
        job.setJarByClass(Standardization.class);
        job.setMapperClass(FinallyResultMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setReducerClass(FinallyResultReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(inPath));
        FileOutputFormat.setOutputPath(job, new Path(outPath));
        job.waitForCompletion(true);
    }

    public static void main(String[] args) throws Exception {
        Standardization.run();
    }
}
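As a quick sanity check with this example's numbers: after the ten update rounds the unnormalized PR vector is roughly (0.0375, 0.6123, 0.3233, 0.6176), which sums to about 1.5906. Dividing each entry by that total gives the final values listed at the end of this post, e.g. 0.0375 / 1.5906 ≈ 0.0236 for user 1.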
4. Finally, the iterative computation
code:
import java.io.IOException;

public class PageRankDriver {

    public static void main(String[] args) throws InterruptedException, IOException, ClassNotFoundException {
        // 1. generate the probability matrix
        AdjacencyMatrix.run();
        // 2. iterate the PR update
        for (int i = 0; i < 10; i++) {
            CalcPageRank.run();
        }
        // 3. normalize
        Standardization.run();
    }
}
Finally, here are the PR values after ten iterations:
4 0.3882488
3 0.2032348
2 0.3849407
1 0.023575656
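These numbers are small enough to verify without a cluster. The following standalone sketch is not part of the original project: it hard-codes the four-user graph from this example and assumes each user starts with PR = 1.0 (the starting value that matches the numbers above), then runs the same ten damped updates and the final normalization in plain Java:

public class LocalPageRankCheck {

    public static void main(String[] args) {
        int n = 4;
        float d = 0.85f;
        // m[i][j]: probability that user i+1 links to user j+1,
        // i.e. the lines printed by AdjacencyMatrix
        float[][] m = {
                {0f, 1f / 3, 1f / 3, 1f / 3}, // 1 -> 2, 3, 4
                {0f, 0f, 0.5f, 0.5f},         // 2 -> 3, 4
                {0f, 0f, 0f, 1f},             // 3 -> 4
                {0f, 1f, 0f, 0f}              // 4 -> 2
        };
        float[] pr = {1f, 1f, 1f, 1f};        // assumed initial pagerank.csv
        for (int iter = 0; iter < 10; iter++) {
            float[] next = new float[n];
            for (int j = 0; j < n; j++) {
                float sum = 0f;
                // the same dot product CalcPeopleRankReducer computes for key j+1
                for (int i = 0; i < n; i++) {
                    sum += m[i][j] * pr[i];
                }
                next[j] = sum * d + (1 - d) / n;
            }
            pr = next;
        }
        float total = 0f;
        for (float p : pr) {
            total += p;
        }
        // normalize, as the Standardization job does
        for (int i = 0; i < n; i++) {
            System.out.println((i + 1) + "\t" + pr[i] / total);
        }
    }
}

Up to float rounding, this prints the same four values, which makes for a handy cross-check of the MapReduce pipeline.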
The complete code is on my GitHub.