使用MapReduce实现PageRank算法

首先简单的介绍一下PageRank的算法

PageRank最初是Google公司提出的网页排名算法,主要思想是把网页之间的链接关系(入度和出度)表示成一个矩阵,然后对这个排名矩阵不断地迭代,直到收敛为止。

主要的公式

$$PR(u)=\sum_{v\in B_u}\frac{PR(v)}{L(v)}$$

其中 $B_u$ 是所有指向网页 u 的网页的集合,v 是其中一个指向 u 的网页,$L(v)$ 是网页 v 的出度。

这个公式会造成排名下沉(rank sink)以及排名泄露(rank leak)的问题,因此引入阻尼系数 d(N 为网页总数):

$$PR(u)=\frac{1-d}{N}+d\sum_{v\in B_u}\frac{PR(v)}{L(v)}$$

为了编程的方便,我们可以简化上面的公式(省略 1/N:所有PR值都相当于放大了 N 倍,不再是概率分布,但排名顺序不变)

$$PR(u)=(1-d)+d\sum_{v\in B_u}\frac{PR(v)}{L(v)}$$


具体的MapReduce的算法如下:

第一个MapReduce主要就是创建一个图

Mapper

<标题,(初始化PR值,具体的链接)>

Reducer 不需要做任何处理,直接输出

第二个Map 

对当前网页的每一个出链,输出<被链接网页的标题,当前网页的标题\t PR值/出度>

输出<标题,| 当前网页的具体链接>


Reducer

<标题,pr值,当前网页的链接>

第三个MapReduce

根据pr值排序,输出标题

第四个程序就是调度,设置每一个MapReduce,以及迭代的次数


package pagerank;


import java.util.StringTokenizer;


public class GraphBuilder {
/** 得到输出 <FromPage, <1.0 ,ToPage1,ToPage2...>> */
public static class GraphBuilderMapper extends
Mapper<LongWritable, Text, Text, Text> {
// 正则表达式,匹配出一对方括号”[]“及其所包含的内容,注意方括号内的内容不含换行符并且至少含有一个字符
private static final Pattern wikiLinksPatern = Pattern
.compile("\\[.+?\\]");


public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String pagerank = "1.0\t";
boolean first = true;
String[] titleAndText = parseTitleAndText(value);
String pageName = titleAndText[0];
Text page = new Text(pageName.replace(',', '_')); // 得到网页的title
Matcher matcher = wikiLinksPatern.matcher(titleAndText[1]);
while (matcher.find()) {
String otherPage = matcher.group();


// 过滤出只含有wiki内部链接的网页链接//
otherPage = getWikiPageFromLink(otherPage);
if (otherPage == null || otherPage.isEmpty())
continue;
StringTokenizer itr = new StringTokenizer(otherPage.toString(),
"\n");
for (; itr.hasMoreTokens();) {
if (!first)
pagerank += ",";
pagerank += itr.nextToken();
first = false;
}


}
context.write(page, new Text(pagerank));
}


private String[] parseTitleAndText(Text value) throws IOException {
String[] titleAndText = new String[2];
int start = value.find("&lttitle&gt");
start += 11; // 加上start字符串的长度
int end = value.find("&lt/title&gt", start);
if (start == -1 || end == -1)
return new String[] { "", "" };
titleAndText[0] = Text.decode(value.getBytes(), start, end - start); // getBytes()方法得到字符编码方式


start = value.find("&lttext xml:space");
start += 17; // 加上start字符串的长度
end = value.find("&lt/text&gt", start);
if (start == -1 || end == -1)
return new String[] { "", "" };
titleAndText[1] = Text.decode(value.getBytes(), start, end - start);
return titleAndText;
}


private String getWikiPageFromLink(String aLink) {
if (isNotWikiLink(aLink))
return null;


int start = aLink.startsWith("[[") ? 2 : 1;
int endLink = aLink.indexOf("]");


int pipePosition = aLink.indexOf("|");
if (pipePosition > 0) {
endLink = pipePosition;
}


int part = aLink.indexOf("#");
if (part > 0) {
endLink = part;
}


aLink = aLink.substring(start, endLink);
aLink = aLink.replaceAll("\\s", "_"); // 将空白字符(换行、空格等)转换为"_"
aLink = aLink.replaceAll(",", "");
if (aLink.contains("&amp;"))
aLink.replaceAll("&amp", "&");
return aLink;
}


/** 判断是否是wiki百科内部的链接 **/
private boolean isNotWikiLink(String aLink) {
int start = aLink.startsWith("[[") ? 2 : 1;
if (aLink.length() < start + 2 || aLink.length() > 100)
return true;
char firstChar = aLink.charAt(start);


if (firstChar == '#')
return true;
if (firstChar == ',')
return true;
if (firstChar == '.')
return true;
if (firstChar == '&')
return true;
if (firstChar == '\'')
return true;
if (firstChar == '-')
return true;
if (firstChar == '{')
return true;
if (aLink.contains(":"))
return true;
if (aLink.contains(","))
return true;
if (aLink.contains("&"))
return true;
return false;
}
}


public static class GraphBuilderReducer extends
Reducer<Text, Text, Text, Text> {
public void reduce(Text key, Text value, Context context)
throws IOException, InterruptedException {
context.write(key, value);
}
}


public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job1 = new Job(conf, "Graph Builder");
job1.setJarByClass(GraphBuilder.class);
job1.setOutputKeyClass(Text.class);
job1.setOutputValueClass(Text.class);
job1.setMapperClass(GraphBuilderMapper.class);
job1.setReducerClass(GraphBuilderReducer.class);
FileInputFormat.addInputPath(job1, new Path(args[0]));
FileOutputFormat.setOutputPath(job1, new Path(args[1]));
job1.waitForCompletion(true);
}
}


package pagerank;


import java.io.IOException;


public class PageRankIter {
private static final double damping = 0.85;


public static class PRIterMapper extends
Mapper<LongWritable, Text, Text, Text> {
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString();
String[] tuple = line.split("\t");
String pageKey = tuple[0];
double pr = Double.parseDouble(tuple[1]);


if (tuple.length > 2) {
String[] linkPages = tuple[2].split(",");
for (String linkPage : linkPages) {
String prValue = pageKey + "\t"
+ String.valueOf(pr / linkPages.length);
context.write(new Text(linkPage), new Text(prValue));
}
context.write(new Text(pageKey), new Text("|" + tuple[2]));
}
}
}


public static class PRIterReducer extends Reducer<Text, Text, Text, Text> {
public void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
String links = "";
double pagerank = 0;
for (Text value : values) {
String tmp = value.toString();


if (tmp.startsWith("|")) {
links = "\t" + tmp.substring(tmp.indexOf("|") + 1);// index从0开始
continue;
}


String[] tuple = tmp.split("\t");
if (tuple.length > 1)
pagerank += Double.parseDouble(tuple[1]);
}
pagerank = (double) (1 - damping) + damping * pagerank; // PageRank的计算迭代公式
context.write(new Text(key), new Text(String.valueOf(pagerank)
+ links));
}


}


public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job2 = new Job(conf, "PageRankIter");
job2.setJarByClass(PageRankIter.class);
job2.setOutputKeyClass(Text.class);
job2.setOutputValueClass(Text.class);
job2.setMapperClass(PRIterMapper.class);
job2.setReducerClass(PRIterReducer.class);
FileInputFormat.addInputPath(job2, new Path(args[0]));
FileOutputFormat.setOutputPath(job2, new Path(args[1]));
job2.waitForCompletion(true);
}
}


package pagerank;


import java.io.IOException;


/*
 * 主要负责排序根据pr值得大小
 * */
public class PageRankViewer {

public static class PageRankViewerMapper extends
Mapper<LongWritable, Text, FloatWritable, Text> {
private Text outPage = new Text();
private FloatWritable outPr = new FloatWritable();


public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String[] line = value.toString().split("\t");
String page = line[0];
float pr = Float.parseFloat(line[1]);
outPage.set(page);
outPr.set(pr);
context.write(outPr, outPage);
}
}


/** 重载key的比较函数,使其经过shuffle和sort后反序(从大到小)输出 **/
public static class DescFloatComparator extends FloatWritable.Comparator {
// @Override
public float compare(WritableComparator a,
WritableComparable<FloatWritable> b) {
return -super.compare(a, b);
}


public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
return -super.compare(b1, s1, l1, b2, s2, l2);
}
}


public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job3 = new Job(conf, "PageRankViewer");
job3.setJarByClass(PageRankViewer.class);
job3.setOutputKeyClass(FloatWritable.class);
job3.setSortComparatorClass(DescFloatComparator.class);
job3.setOutputValueClass(Text.class);
job3.setMapperClass(PageRankViewerMapper.class);
FileInputFormat.addInputPath(job3, new Path(args[0]));
FileOutputFormat.setOutputPath(job3, new Path(args[1]));
job3.waitForCompletion(true);
}
}


package pagerank;


/**
 * Runs the whole PageRank pipeline: graph construction, a fixed number of
 * rank iterations, and the final sort.
 *
 * <p>Intermediate results are written to {@code <work dir>/Data0},
 * {@code Data1}, ... and the ranking to {@code <work dir>/FinalRank}.
 */
public class PageRankDriver {
    /** Number of iterations used when none is given on the command line. */
    private static final int DEFAULT_ITERATIONS = 10;

    /**
     * @param args {@code args[0]} input path, {@code args[1]} working/output
     *             directory, optional {@code args[2]} number of iterations
     *             (defaults to 10)
     * @throws IllegalArgumentException if the arguments are missing or the
     *                                  iteration count is not a non-negative
     *                                  integer
     */
    public static void main(String[] args) throws Exception {
        if (args == null || args.length < 2) {
            throw new IllegalArgumentException(
                    "Usage: PageRankDriver <input path> <output dir> [iterations]");
        }
        String input = args[0];
        String workDir = args[1];
        int iterations = DEFAULT_ITERATIONS;
        if (args.length > 2) {
            try {
                iterations = Integer.parseInt(args[2]);
            } catch (NumberFormatException e) {
                throw new IllegalArgumentException(
                        "iterations must be an integer: " + args[2], e);
            }
            if (iterations < 0) {
                throw new IllegalArgumentException(
                        "iterations must not be negative: " + iterations);
            }
        }

        // Step 1: build the link graph with initial ranks.
        GraphBuilder.main(new String[] { input, workDir + "/Data0" });

        // Step 2: each iteration reads DataN and writes DataN+1.
        for (int i = 0; i < iterations; i++) {
            String[] forItr = { workDir + "/Data" + i,
                    workDir + "/Data" + (i + 1) };
            PageRankIter.main(forItr);
        }

        // Step 3: sort the last iteration's output by rank.
        String[] forRV = { workDir + "/Data" + iterations,
                workDir + "/FinalRank" };
        PageRankViewer.main(forRV);
    }
}

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值