使用MapReduce实现PageRank算法

最新推荐文章于 2021-03-31 17:01:01 发布

红豆和绿豆

最新推荐文章于 2021-03-31 17:01:01 发布

阅读量3.3k

点赞数

分类专栏： hadoop 文章标签： mapreduce 算法 PageRank

本文链接：https://blog.csdn.net/u011955252/article/details/50535294

版权

hadoop 专栏收录该内容

93 篇文章 2 订阅

订阅专栏

首先简单的介绍一下PageRank的算法

PageRank 最初是 Google 公司提出的网页排名算法，其核心思想是把网页之间的链接关系（入度和出度）表示成一个矩阵，然后对这个排名矩阵不断迭代，直到收敛为止。

主要的公式

$$PR(u) = \sum_{v \in B_u} \frac{PR(v)}{L(v)}$$

其中 $B_u$ 是所有指向网页 $u$ 的网页集合，$v$ 是该集合中的一个网页，$L(v)$ 是网页 $v$ 的出度（出链数）。

这个公式会造成排名下沉（rank sink）以及排名泄露（rank leak）的问题，因此引入阻尼系数 $d$ 和网页总数 $N$ 对公式进行修正：

$$PR(u) = \frac{1-d}{N} + d \sum_{v \in B_u} \frac{PR(v)}{L(v)}$$

为了编程的方便，我们可以简化上面的公式

$$PR(u) = (1-d) + d \sum_{v \in B_u} \frac{PR(v)}{L(v)}$$

具体的MapReduce的算法如下：

第一个 MapReduce 作业（GraphBuilder）主要负责构建网页链接图

Mapper

<标题，（初始化PR值，具体的链接）>

Reducer 不需要做任何处理，直接原样输出

第二个 MapReduce 作业（PageRankIter）的 Mapper

对当前网页的每一个出链，输出 <被链接网页的标题，当前网页标题\t当前网页PR值/出链数>

同时输出 <当前网页标题，| 当前网页的出链列表>，用于把图结构传递给下一轮迭代

Reducer

输出 <标题，新的PR值，当前网页的出链列表>

第三个MapReduce

根据PR值从大到小排序，输出 <PR值，标题>

第四个程序就是调度，设置每一个MapReduce，以及迭代的次数

package pagerank;

import java.util.StringTokenizer;

/**
 * First stage of the PageRank pipeline: turns raw wiki page records into a link graph.
 *
 * <p>For every page the job emits one line of the form
 * {@code title <TAB> 1.0 <TAB> link1,link2,...}: the page title, the initial rank and the
 * comma-separated titles of the pages it links to.
 */
public class GraphBuilder {

    /** Emits {@code <FromPage, "1.0<TAB>ToPage1,ToPage2,...">} for each input record. */
    public static class GraphBuilderMapper extends
            Mapper<LongWritable, Text, Text, Text> {

        /**
         * Matches one pair of square brackets and its content. The content contains no
         * line break and at least one character.
         */
        private static final Pattern WIKI_LINK_PATTERN = Pattern
                .compile("\\[.+?\\]");

        // Markers delimiting the title and the body text inside an input record.
        // They are pure ASCII, so String.length() equals their length in bytes.
        private static final String TITLE_OPEN = "&lttitle&gt";
        private static final String TITLE_CLOSE = "&lt/title&gt";
        private static final String TEXT_OPEN = "&lttext xml:space";
        private static final String TEXT_CLOSE = "&lt/text&gt";

        /** Rank assigned to every page before the first iteration. */
        private static final String INITIAL_RANK = "1.0";

        /**
         * Parses one page record and writes its title together with the initial rank and
         * the list of internal wiki links found in the page text.
         *
         * <p>Records whose title or text markers cannot be found are skipped.
         *
         * @param key     input key supplied by the input format; not used
         * @param value   one raw page record
         * @param context job context receiving the {@code <title, rank + links>} pair
         */
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] titleAndText = parseTitleAndText(value);
            String pageName = titleAndText[0];
            if (pageName.isEmpty()) {
                // Nothing usable in this record; an empty key would only add noise.
                return;
            }

            // Link targets get their whitespace replaced by '_' in getWikiPageFromLink,
            // so the title must be normalised the same way or the two never match
            // in the later stages. The comma is the link-list separator.
            Text page = new Text(pageName.replace(',', '_').replaceAll("\\s", "_"));

            StringBuilder rankAndLinks = new StringBuilder(INITIAL_RANK).append('\t');
            boolean first = true;
            Matcher matcher = WIKI_LINK_PATTERN.matcher(titleAndText[1]);
            while (matcher.find()) {
                // Keep only links that point to other pages of the wiki.
                String otherPage = getWikiPageFromLink(matcher.group());
                if (otherPage == null || otherPage.isEmpty()) {
                    continue;
                }
                if (!first) {
                    rankAndLinks.append(',');
                }
                rankAndLinks.append(otherPage);
                first = false;
            }
            context.write(page, new Text(rankAndLinks.toString()));
        }

        /**
         * Extracts the title and the body text from a page record.
         *
         * @param value the raw page record
         * @return a two-element array {@code {title, text}}; both elements are empty
         *         strings when any of the four markers is missing
         * @throws IOException if the extracted bytes cannot be decoded
         */
        private String[] parseTitleAndText(Text value) throws IOException {
            String title = extractBetween(value, TITLE_OPEN, TITLE_CLOSE);
            String text = extractBetween(value, TEXT_OPEN, TEXT_CLOSE);
            if (title == null || text == null) {
                return new String[] { "", "" };
            }
            return new String[] { title, text };
        }

        /**
         * Returns the content between the first {@code open} marker and the next
         * {@code close} marker, or {@code null} when either marker is missing.
         */
        private static String extractBetween(Text value, String open, String close)
                throws IOException {
            int openPos = value.find(open);
            // Test for "not found" BEFORE skipping over the marker; adding the marker
            // length first would hide the -1.
            if (openPos == -1) {
                return null;
            }
            int start = openPos + open.length();
            int end = value.find(close, start);
            if (end == -1) {
                return null;
            }
            return Text.decode(value.getBytes(), start, end - start);
        }

        /**
         * Converts a bracketed link such as {@code [[Some page|label]} into the
         * normalised title of the target page.
         *
         * @param aLink the text matched by the link pattern, brackets included
         * @return the target title with whitespace replaced by {@code '_'}, an empty
         *         string when the target part is empty, or {@code null} when the link
         *         is not an internal wiki link
         */
        private String getWikiPageFromLink(String aLink) {
            if (isNotWikiLink(aLink)) {
                return null;
            }

            int start = aLink.startsWith("[[") ? 2 : 1;
            int endLink = aLink.indexOf(']');

            // The target ends at whichever of ']', '|' (label) or '#' (section anchor)
            // comes first; a '#' inside the label must not extend the target.
            int pipePosition = aLink.indexOf('|');
            if (pipePosition > 0) {
                endLink = Math.min(endLink, pipePosition);
            }
            int part = aLink.indexOf('#');
            if (part > 0) {
                endLink = Math.min(endLink, part);
            }
            if (endLink < start) {
                // No closing bracket after the opening one: not a usable link.
                return null;
            }

            String target = aLink.substring(start, endLink);
            target = target.replaceAll("\\s", "_"); // spaces, tabs, ... become '_'
            // isNotWikiLink already rejects links containing ',' or '&', so the two
            // replacements below are purely defensive.
            target = target.replace(",", "");
            target = target.replace("&amp", "&");
            return target;
        }

        /**
         * Tells whether a bracketed link should be ignored.
         *
         * @param aLink the text matched by the link pattern, brackets included
         * @return {@code true} when the link is too short, longer than 100 characters,
         *         starts with one of {@code # , . & ' - {} or contains
         *         {@code ':'}, {@code ','} or {@code '&'}
         */
        private boolean isNotWikiLink(String aLink) {
            int start = aLink.startsWith("[[") ? 2 : 1;
            if (aLink.length() < start + 2 || aLink.length() > 100) {
                return true;
            }
            char firstChar = aLink.charAt(start);
            if ("#,.&'-{".indexOf(firstChar) >= 0) {
                return true;
            }
            return aLink.contains(":") || aLink.contains(",") || aLink.contains("&");
        }
    }

    /** Identity reducer: writes every {@code <title, rank + links>} pair unchanged. */
    public static class GraphBuilderReducer extends
            Reducer<Text, Text, Text, Text> {

        /**
         * Overrides {@code Reducer.reduce(KEYIN, Iterable<VALUEIN>, Context)}. The
         * second parameter must be an {@code Iterable}, otherwise the framework never
         * calls this method.
         */
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            for (Text value : values) {
                context.write(key, value);
            }
        }
    }

    /**
     * Configures and runs the graph-building job.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws IllegalArgumentException if fewer than two arguments are given
     * @throws IllegalStateException    if the job does not complete successfully
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            throw new IllegalArgumentException(
                    "Usage: GraphBuilder <input path> <output path>");
        }
        Configuration conf = new Configuration();
        Job job1 = new Job(conf, "Graph Builder");
        job1.setJarByClass(GraphBuilder.class);
        job1.setOutputKeyClass(Text.class);
        job1.setOutputValueClass(Text.class);
        job1.setMapperClass(GraphBuilderMapper.class);
        job1.setReducerClass(GraphBuilderReducer.class);
        FileInputFormat.addInputPath(job1, new Path(args[0]));
        FileOutputFormat.setOutputPath(job1, new Path(args[1]));
        // Surface a failed job to the caller instead of returning as if it had worked.
        if (!job1.waitForCompletion(true)) {
            throw new IllegalStateException("Graph Builder job failed");
        }
    }
}

package pagerank;

import java.io.IOException;

/**
 * One PageRank iteration.
 *
 * <p>Input and output lines share the format {@code title <TAB> rank <TAB> link1,link2,...},
 * so the output of one run can be fed directly into the next one.
 */
public class PageRankIter {
    /** Damping factor {@code d} of the formula {@code PR = (1 - d) + d * sum}. */
    private static final double DAMPING = 0.85;

    /**
     * Distributes the rank of a page evenly over its outgoing links and forwards the
     * link list of the page itself.
     */
    public static class PRIterMapper extends
            Mapper<LongWritable, Text, Text, Text> {

        /**
         * For a line {@code title <TAB> rank <TAB> links} emits
         * {@code <link, "title<TAB>rank/linkCount">} for every link, plus
         * {@code <title, "|links">} so that the reducer can rebuild the graph line.
         * Lines without a link list produce no output; lines without a rank column
         * are skipped.
         *
         * @param key     input key supplied by the input format; not used
         * @param value   one line of the previous stage's output
         * @param context job context receiving the emitted pairs
         */
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] tuple = value.toString().split("\t");
            if (tuple.length < 2) {
                // Blank or truncated line: there is no rank column to read.
                return;
            }
            if (tuple.length == 2) {
                // A page without outgoing links has nothing to distribute.
                return;
            }

            String pageKey = tuple[0];
            double pr = Double.parseDouble(tuple[1]);
            String[] linkPages = tuple[2].split(",");

            // Every link receives the same share, so build the value once.
            Text share = new Text(pageKey + "\t"
                    + String.valueOf(pr / linkPages.length));
            for (String linkPage : linkPages) {
                context.write(new Text(linkPage), share);
            }
            // The leading '|' marks this value as the link list, not a rank share.
            context.write(new Text(pageKey), new Text("|" + tuple[2]));
        }
    }

    /** Sums the rank shares received by a page and applies the damping formula. */
    public static class PRIterReducer extends Reducer<Text, Text, Text, Text> {

        /**
         * Writes {@code <title, "newRank<TAB>links">}; the link part is omitted when no
         * {@code '|'}-prefixed value was received for the page.
         *
         * @param key     the page title
         * @param values  rank shares ({@code "source<TAB>share"}) and at most one
         *                {@code '|'}-prefixed link list
         * @param context job context receiving the updated line
         */
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            String links = "";
            double received = 0;
            for (Text value : values) {
                String tmp = value.toString();
                if (tmp.startsWith("|")) {
                    links = "\t" + tmp.substring(1);
                    continue;
                }
                String[] tuple = tmp.split("\t");
                if (tuple.length > 1) {
                    received += Double.parseDouble(tuple[1]);
                }
            }
            // PageRank update formula.
            double pagerank = (1 - DAMPING) + DAMPING * received;
            context.write(key, new Text(String.valueOf(pagerank) + links));
        }
    }

    /**
     * Configures and runs a single iteration.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws IllegalArgumentException if fewer than two arguments are given
     * @throws IllegalStateException    if the job does not complete successfully
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            throw new IllegalArgumentException(
                    "Usage: PageRankIter <input path> <output path>");
        }
        Configuration conf = new Configuration();
        Job job2 = new Job(conf, "PageRankIter");
        job2.setJarByClass(PageRankIter.class);
        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(Text.class);
        job2.setMapperClass(PRIterMapper.class);
        job2.setReducerClass(PRIterReducer.class);
        FileInputFormat.addInputPath(job2, new Path(args[0]));
        FileOutputFormat.setOutputPath(job2, new Path(args[1]));
        // A caller chaining iterations must not continue on top of a failed one.
        if (!job2.waitForCompletion(true)) {
            throw new IllegalStateException("PageRankIter job failed");
        }
    }
}

package pagerank;

import java.io.IOException;

/**
 * Final stage of the PageRank pipeline: sorts the pages by rank, highest first.
 *
 * <p>Each output line has the form {@code rank <TAB> title}.
 */
public class PageRankViewer {

    /** Swaps title and rank so that the shuffle sorts the records by rank. */
    public static class PageRankViewerMapper extends
            Mapper<LongWritable, Text, FloatWritable, Text> {
        private Text outPage = new Text();
        private FloatWritable outPr = new FloatWritable();

        /**
         * Reads a line {@code title <TAB> rank [<TAB> links]} and emits
         * {@code <rank, title>}. Lines without a rank column are skipped.
         *
         * @param key     input key supplied by the input format; not used
         * @param value   one line of the last iteration's output
         * @param context job context receiving the {@code <rank, title>} pair
         */
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] line = value.toString().split("\t");
            if (line.length < 2) {
                // Blank or truncated line: there is no rank column to read.
                return;
            }
            outPage.set(line[0]);
            outPr.set(Float.parseFloat(line[1]));
            context.write(outPr, outPage);
        }
    }

    /**
     * Key comparator that reverses the natural order of {@code FloatWritable}, so that
     * the keys leave the shuffle and sort phase in descending order.
     */
    public static class DescFloatComparator extends FloatWritable.Comparator {

        /**
         * Object-level comparison in reverse order. The parameter types must be the raw
         * {@code WritableComparable} so that this overrides the inherited method.
         */
        @SuppressWarnings("rawtypes")
        public int compare(WritableComparable a, WritableComparable b) {
            // Swapping the operands reverses the order without negating the result.
            return super.compare(b, a);
        }

        /** Byte-level comparison in reverse order. */
        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
            return super.compare(b2, s2, l2, b1, s1, l1);
        }
    }

    /**
     * Configures and runs the sorting job.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws IllegalArgumentException if fewer than two arguments are given
     * @throws IllegalStateException    if the job does not complete successfully
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            throw new IllegalArgumentException(
                    "Usage: PageRankViewer <input path> <output path>");
        }
        Configuration conf = new Configuration();
        Job job3 = new Job(conf, "PageRankViewer");
        job3.setJarByClass(PageRankViewer.class);
        job3.setOutputKeyClass(FloatWritable.class);
        job3.setSortComparatorClass(DescFloatComparator.class);
        job3.setOutputValueClass(Text.class);
        job3.setMapperClass(PageRankViewerMapper.class);
        // Keys are only sorted within one reducer; a single reducer yields one
        // globally ordered result regardless of the cluster's default setting.
        job3.setNumReduceTasks(1);
        FileInputFormat.addInputPath(job3, new Path(args[0]));
        FileOutputFormat.setOutputPath(job3, new Path(args[1]));
        if (!job3.waitForCompletion(true)) {
            throw new IllegalStateException("PageRankViewer job failed");
        }
    }
}

package pagerank;

/**
 * Runs the complete PageRank pipeline: graph construction, a fixed number of rank
 * iterations and the final sort by rank.
 *
 * <p>Intermediate results are written to {@code <output dir>/Data0},
 * {@code <output dir>/Data1}, ... and the sorted result to
 * {@code <output dir>/FinalRank}.
 */
public class PageRankDriver {
    /** Number of iterations used when none is given on the command line. */
    private static final int DEFAULT_ITERATIONS = 10;

    /**
     * Chains the three stages.
     *
     * @param args {@code args[0]} is the raw input path, {@code args[1]} the output
     *             directory and the optional {@code args[2]} the number of iterations
     *             (defaults to 10)
     * @throws IllegalArgumentException if fewer than two arguments are given or the
     *                                  iteration count is negative
     * @throws NumberFormatException    if {@code args[2]} is not an integer
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            throw new IllegalArgumentException(
                    "Usage: PageRankDriver <input path> <output dir> [iterations]");
        }
        String input = args[0];
        String dataPrefix = args[1] + "/Data";
        int iterations = args.length > 2
                ? Integer.parseInt(args[2])
                : DEFAULT_ITERATIONS;
        if (iterations < 0) {
            throw new IllegalArgumentException(
                    "iterations must not be negative: " + iterations);
        }

        // Stage 1: build the link graph.
        GraphBuilder.main(new String[] { input, dataPrefix + 0 });

        // Stage 2: iteration i reads Data<i> and writes Data<i+1>.
        for (int i = 0; i < iterations; i++) {
            PageRankIter.main(new String[] { dataPrefix + i, dataPrefix + (i + 1) });
        }

        // Stage 3: sort the result of the last iteration by rank.
        PageRankViewer.main(new String[] { dataPrefix + iterations,
                args[1] + "/FinalRank" });
    }
}

红豆和绿豆

关注

0
点赞
踩
3

收藏

觉得还不错? 一键收藏
0
评论
使用MapReduce实现PageRank算法

首先简单的介绍一下PageRank的算法PageRank首先是Google 公司提出的网页排名的算法，主要就是将网页的中的连接看成入度和出度的这么一个矩阵。然后对这个排名的矩阵不断的迭代，知道达到收敛为止主要的公式PR(u)=求和v 属于Bu （PR（v）/L(v)）Bu是所有网网页指向u集合，v是其中的一个指向u页面的元素，L（v）页面v的出度这个公式会造成网页的排名下降以
复制链接

扫一扫

专栏目录