一、算法描述:
。。。
二、代码实现:
package com.hadoop.mr.pagerank;
import java.io.IOException;
import java.util.Arrays;
import org.apache.commons.lang.StringUtils;
/**
 * One page of the PageRank graph: its current PageRank value plus the names of
 * the pages it links to.
 *
 * <p>The serialized form is the PageRank value followed by the adjacent page
 * names, all separated by {@link #fieldSeparator}, e.g. {@code "1.0\tB\tD"}.
 * A record that carries only a value (e.g. {@code "0.5"}) has no adjacency list.
 */
public class Node {
    /** Current PageRank value; 1.0 until explicitly set. */
    private double pageRank = 1.0;
    /** Names of the pages this page links to; {@code null} when no list was recorded. */
    private String[] adjacentNodeNames;
    /** Separator between the fields of the serialized form. */
    public static final char fieldSeparator = '\t';

    /** Returns the current PageRank value. */
    public double getPageRank() {
        return pageRank;
    }

    /**
     * Sets the PageRank value.
     *
     * @return this node, to allow call chaining
     */
    public Node setPageRank(double pageRank) {
        this.pageRank = pageRank;
        return this;
    }

    /** Returns the stored adjacency array (not a copy); may be {@code null}. */
    public String[] getAdjacentNodeNames() {
        return adjacentNodeNames;
    }

    /**
     * Stores the given adjacency array (the reference is kept, not copied).
     *
     * @return this node, to allow call chaining
     */
    public Node setAdjacentNodeNames(String[] adjacentNodeNames) {
        this.adjacentNodeNames = adjacentNodeNames;
        return this;
    }

    /** Returns {@code true} when at least one adjacent page name is recorded. */
    public boolean containsAdjacentNodes() {
        return adjacentNodeNames != null && adjacentNodeNames.length > 0;
    }

    /**
     * Serializes this node as the PageRank value followed, when an adjacency
     * array is present, by a separator and the separator-joined page names
     * (e.g. {@code "1.0\tB\tD"}). A {@code null} name is written as an empty field.
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append(pageRank);
        String[] names = getAdjacentNodeNames();
        if (names != null) {
            sb.append(fieldSeparator);
            for (int i = 0; i < names.length; i++) {
                if (i > 0) {
                    sb.append(fieldSeparator);
                }
                if (names[i] != null) {
                    sb.append(names[i]);
                }
            }
        }
        return sb.toString();
    }

    /**
     * Parses a serialized node such as {@code "1.0\tB\tD"}.
     *
     * <p>The first field is the PageRank value; every following non-empty field
     * is an adjacent page name. Empty fields (for example the one produced by a
     * trailing separator) are ignored so they never become a page named
     * {@code ""}. When the input contains at least one separator the adjacency
     * array is set (possibly empty); otherwise it stays {@code null}.
     *
     * @param value the serialized node
     * @return the parsed node
     * @throws IOException if {@code value} is {@code null} or empty
     * @throws NumberFormatException if the first field is not a parsable double
     */
    public static Node fromMR(String value) throws IOException {
        if (value == null) {
            throw new IOException("Expected 1 or more parts but received null input");
        }
        if (value.length() == 0) {
            throw new IOException("Expected 1 or more parts but received 0");
        }
        // limit -1 keeps trailing empty fields so the presence of a separator is not lost
        String[] parts = value.split(String.valueOf(fieldSeparator), -1);
        Node node = new Node().setPageRank(Double.parseDouble(parts[0]));
        if (parts.length > 1) {
            node.setAdjacentNodeNames(
                    dropEmptyNames(Arrays.copyOfRange(parts, 1, parts.length)));
        }
        return node;
    }

    /**
     * Parses a node from a PageRank field and a separator-joined adjacency list,
     * e.g. {@code fromMR("1.0", "B\tD")}.
     *
     * @param v1 the PageRank value as text
     * @param v2 the adjacent page names joined by {@link #fieldSeparator};
     *           {@code null} is treated like an empty list
     * @throws IOException if the combined record cannot be split into fields
     */
    public static Node fromMR(String v1, String v2) throws IOException {
        // A null list must not be concatenated as the literal text "null".
        String links = (v2 == null) ? "" : v2;
        return fromMR(v1 + fieldSeparator + links);
    }

    /** Returns a copy of {@code names} without its empty entries, order preserved. */
    private static String[] dropEmptyNames(String[] names) {
        int kept = 0;
        for (int i = 0; i < names.length; i++) {
            if (names[i].length() > 0) {
                names[kept++] = names[i];
            }
        }
        return Arrays.copyOf(names, kept);
    }
}
package com.hadoop.mr.pagerank;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class RunJob {
public static enum Mycounter {
my
}
public static void main(String[] args) {
Configuration conf = new Configuration(true);
conf.set("mapreduce.app-submission.corss-paltform", "true");
// 如果分布式运行,必须打jar包
// 且,client在集群外非hadoop jar 这种方式启动,client中必须配置jar的位置
conf.set("mapreduce.framework.name", "local");
// 这个配置,只属于,切换分布式到本地单进程模拟运行的配置
// 这种方式不是分布式,所以不用打jar包
double d = 0.0000001;
int i = 0;
while (true) {
i++;
try {
conf.setInt("runCount", i);
FileSystem fs = FileSystem.get(conf);
Job job = Job.getInstance(conf);
job.setJarByClass(RunJob.class);
job.setJobName("pr" + i);
job.setMapperClass(PageRankMapper.class);
job.setReducerClass(PageRankReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
// A\tB\tD key:A value:B\tD
// 使用了新的输入格式化类,读取一行数据,按照\t将数据分为key和value
job.setInputFormatClass(KeyValueTextInputFormat.class);
Path inputPath = new Path("/data/pagerank/input/");
if (i > 1) {
inputPath = new Path("/data/pagerank/output/pr" + (i - 1));
}
FileInputFormat.addInputPath(job, inputPath);
Path outpath = new Path("/data/pagerank/output/pr" + i);
if (fs.exists(outpath)) {
fs.delete(outpath, true);
}
FileOutputFormat.setOutputPath(job, outpath);
boolean f = job.waitForCompletion(true);
if (f) {
System.out.println("success.");
long sum = job.getCounters().findCounter(Mycounter.my).getValue();
System.out.println(sum);
// 乘以1000除以4000相当于除以4,因为是四个页面,当然,这个值看实际情况来设置。
// 该算法并不影响收敛性
double avgd = sum / 4000.0;
if (avgd < d) {
break;
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
}
static class PageRankMapper extends Mapper<Text, Text, Text, Text> {
protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
int runCount = context.getConfiguration().getInt("runCount", 1);
// A\tB\tD
// page = "A" "B\tD"
String page = key.toString();
Node node = null;
if (runCount == 1) {
// PR初始值: 1.0 "B\tD"
node = Node.fromMR("1.0", value.toString());
} else {
node = Node.fromMR(value.toString());
}
// A\t0.5\tB\tD,记录下当前页面的PR值,同时记录下出链关系
context.write(new Text(page), new Text(node.toString()));
//对每个出链指向的页面,它们都会得到当前页面的PR值,这个值对于公式来讲,就是∑求和的元素
// A\t1.0\tB\tD
if (node.containsAdjacentNodes()) {
// 计算出链的价值 = PR/出链数量
double outValue = node.getPageRank() / node.getAdjacentNodeNames().length;
for (int i = 0; i < node.getAdjacentNodeNames().length; i++) {
String outPage = node.getAdjacentNodeNames()[i];
// 由于当前页面指向B,所以B得到PR值0.5
// B\t0.5 <"B", 0.5> <"D", 0.5>
context.write(new Text(outPage), new Text(outValue + ""));
}
}
//最终中间结果包括两类键值对数据,一个是单纯的某个页面得到的PR值列表,一个是每个页面当前PR值和出链指向页面关系
// B\t0.5 B\t1.0\tC
}
}
static class PageRankReducer extends Reducer<Text, Text, Text, Text> {
protected void reduce(Text key, Iterable<Text> iterable, Context context)
throws IOException, InterruptedException {
// 相同的key为一组
// key:页面名称比如B
// 包含两类数据
// B:1.0 C //页面对应关系及老的pr值
// B:0.5 //投票值
// B:0.5
double sum = 0.0;
//表示提供出链的页面
//B\t0.5 B\t1.0\tC
Node sourceNode = null;
for (Text i : iterable) {
Node node = Node.fromMR(i.toString());
// B\t0.5和B\t1.0\tC
if (node.containsAdjacentNodes()) {
// 在实际中,只有一条记录表示了该页面出链到其他页面的关系,就是这个sourceNode
sourceNode = node;
} else {
// 计算出从所有页面得到的PR值总和
sum = sum + node.getPageRank();
}
}
// A B D
//
// A 0.5 B D
// B 1.5 C
// 按照公式进行计算,得到新的PR值
double newPR = (0.15 / 4.0) + (0.85 * sum);
System.out.println("*********** new pageRank value is " + newPR);
// 把新的pr值和计算之前的pr比较,sourceNode中有记录原来的PR值:B\t1.0\tC
double d = newPR - sourceNode.getPageRank();
int j = (int) (d * 1000.0);
j = Math.abs(j);
System.out.println(j + "___________");
//MyCounter用于记录所有两次计算之间PR值差值的总和
context.getCounter(Mycounter.my).increment(j);
//给当前页面设置新的PR值
sourceNode.setPageRank(newPR);
//将当前页面信息写到reducer输出中。B\t1.5\tC
context.write(key, new Text(sourceNode.toString()));
}
}
}