PageRank算法

别和硬盘比记忆

于 2019-07-09 13:04:06 发布

阅读量231

点赞数

分类专栏： hadoop

本文链接：https://blog.csdn.net/qq_21705851/article/details/95185826

版权

hadoop 专栏收录该内容

18 篇文章 0 订阅

订阅专栏

一、算法描述：
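PageRank 通过多轮迭代计算每个页面的权重：$PR(p) = \frac{1-d}{N} + d \sum_{q \in M(p)} \frac{PR(q)}{L(q)}$，其中 $d$ 为阻尼系数（下文代码取 0.85），$N$ 为页面总数（下文代码取 4），$M(p)$ 为所有链接到页面 $p$ 的页面集合，$L(q)$ 为页面 $q$ 的出链数量。每一轮 MapReduce 作业完成一次迭代，当所有页面新旧 PR 值的差值之和足够小时停止迭代。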
二、代码实现：

package com.hadoop.mr.pagerank;

import java.io.IOException;
import java.util.Arrays;

import org.apache.commons.lang.StringUtils;

/**
 * One page of the PageRank graph: its current rank plus the names of the pages
 * it links to.
 * <p>
 * The text form is the rank followed by each adjacent page name, all separated
 * by {@link #fieldSeparator}, e.g. {@code 1.0<TAB>B<TAB>D}. A node without
 * adjacent pages is serialized as the bare rank, e.g. {@code 0.5}.
 */
public class Node {

	private double pageRank = 1.0;
	private String[] adjacentNodeNames;

	/** Separator between the rank and each adjacent page name in the text form. */
	public static final char fieldSeparator = '\t';

	/** @return the current PageRank value (1.0 until set) */
	public double getPageRank() {
		return pageRank;
	}

	/**
	 * @param pageRank the new PageRank value
	 * @return this node, to allow call chaining
	 */
	public Node setPageRank(double pageRank) {
		this.pageRank = pageRank;
		return this;
	}

	/** @return the internal array of adjacent page names, or {@code null} if none was set */
	public String[] getAdjacentNodeNames() {
		return adjacentNodeNames;
	}

	/**
	 * @param adjacentNodeNames names of the pages this page links to; stored as-is (not copied)
	 * @return this node, to allow call chaining
	 */
	public Node setAdjacentNodeNames(String[] adjacentNodeNames) {
		this.adjacentNodeNames = adjacentNodeNames;
		return this;
	}

	/** @return {@code true} if the adjacency array is non-null and non-empty */
	public boolean containsAdjacentNodes() {
		return adjacentNodeNames != null && adjacentNodeNames.length > 0;
	}

	/**
	 * Serializes this node as {@code rank[<sep>name]*}.
	 * <p>
	 * An empty adjacency array is written exactly like a missing one (no
	 * trailing separator), so that the text parses back to an equivalent node.
	 * {@code null} names are written as empty fields.
	 */
	@Override
	public String toString() {
		StringBuilder sb = new StringBuilder();
		sb.append(pageRank);

		if (containsAdjacentNodes()) {
			// e.g. 1.0<TAB>B<TAB>D
			for (String name : adjacentNodeNames) {
				sb.append(fieldSeparator);
				if (name != null) {
					sb.append(name);
				}
			}
		}
		return sb.toString();
	}

	/**
	 * Parses the text form produced by {@link #toString()}.
	 * <p>
	 * The first field is the rank; every following field, including empty ones,
	 * becomes an adjacent page name. For example {@code "1.0\tB\tD"} yields rank
	 * 1.0 with adjacent pages {@code {"B", "D"}}.
	 *
	 * @param value serialized node, e.g. {@code 1.0<TAB>B<TAB>D}
	 * @return the parsed node
	 * @throws IOException if {@code value} is null or empty, or if its first
	 *         field is not a parsable double
	 */
	public static Node fromMR(String value) throws IOException {
		if (value == null) {
			throw new IOException("Expected a serialized node but received null");
		}
		if (value.isEmpty()) {
			throw new IOException("Expected 1 or more parts but received 0");
		}
		// Limit -1 keeps trailing empty fields: "1.0\t" -> {"1.0", ""}.
		String[] parts = value.split(String.valueOf(fieldSeparator), -1);

		double rank;
		try {
			rank = Double.parseDouble(parts[0]);
		} catch (NumberFormatException e) {
			// Report malformed input through the declared exception type, keeping the cause.
			throw new IOException("Invalid PageRank value '" + parts[0] + "' in record: " + value, e);
		}

		Node node = new Node().setPageRank(rank);
		if (parts.length > 1) {
			// Everything after the rank is the list of pages this page links to.
			node.setAdjacentNodeNames(Arrays.copyOfRange(parts, 1, parts.length));
		}
		return node;
	}

	/**
	 * Parses a node from a rank field and a separately supplied adjacency field.
	 *
	 * @param v1 the rank, e.g. {@code "1.0"}
	 * @param v2 the separator-joined adjacent page names, e.g. {@code B<TAB>D};
	 *           {@code null} means no adjacency field at all
	 * @return the parsed node
	 * @throws IOException if the combined text cannot be parsed
	 */
	public static Node fromMR(String v1, String v2) throws IOException {
		if (v2 == null) {
			// Avoid concatenating the literal text "null" as a page name.
			return fromMR(v1);
		}
		// e.g. 1.0<TAB>B<TAB>D
		return fromMR(v1 + fieldSeparator + v2);
	}
}

package com.hadoop.mr.pagerank;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class RunJob {

	public static enum Mycounter {
		my
	}

	public static void main(String[] args) {

		Configuration conf = new Configuration(true);
		conf.set("mapreduce.app-submission.corss-paltform", "true");
		// 如果分布式运行,必须打jar包
		// 且,client在集群外非hadoop jar 这种方式启动,client中必须配置jar的位置
		conf.set("mapreduce.framework.name", "local");
		// 这个配置,只属于,切换分布式到本地单进程模拟运行的配置
		// 这种方式不是分布式,所以不用打jar包

		double d = 0.0000001;
		int i = 0;
		while (true) {
			i++;
			try {
				conf.setInt("runCount", i);

				FileSystem fs = FileSystem.get(conf);
				Job job = Job.getInstance(conf);
				job.setJarByClass(RunJob.class);
				job.setJobName("pr" + i);
				job.setMapperClass(PageRankMapper.class);
				job.setReducerClass(PageRankReducer.class);
				job.setMapOutputKeyClass(Text.class);
				job.setMapOutputValueClass(Text.class);
				
				// A\tB\tD  key:A   value:B\tD
				// 使用了新的输入格式化类，读取一行数据，按照\t将数据分为key和value
				job.setInputFormatClass(KeyValueTextInputFormat.class);

				Path inputPath = new Path("/data/pagerank/input/");

				if (i > 1) {
					inputPath = new Path("/data/pagerank/output/pr" + (i - 1));
				}
				FileInputFormat.addInputPath(job, inputPath);

				Path outpath = new Path("/data/pagerank/output/pr" + i);
				if (fs.exists(outpath)) {
					fs.delete(outpath, true);
				}
				FileOutputFormat.setOutputPath(job, outpath);

				boolean f = job.waitForCompletion(true);
				if (f) {
					System.out.println("success.");
					long sum = job.getCounters().findCounter(Mycounter.my).getValue();

					System.out.println(sum);
					// 乘以1000除以4000相当于除以4，因为是四个页面，当然，这个值看实际情况来设置。
					// 该算法并不影响收敛性
					double avgd = sum / 4000.0;
					if (avgd < d) {
						break;
					}
				}
			} catch (Exception e) {
				e.printStackTrace();
			}
		}
	}

	static class PageRankMapper extends Mapper<Text, Text, Text, Text> {
		protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {

			int runCount = context.getConfiguration().getInt("runCount", 1);

			// A\tB\tD
			// page = "A"  "B\tD"
			String page = key.toString();
			Node node = null;
			if (runCount == 1) {
				//          PR初始值：  1.0      "B\tD"
				node = Node.fromMR("1.0", value.toString());
			} else {
				node = Node.fromMR(value.toString());
			}
			// A\t0.5\tB\tD，记录下当前页面的PR值，同时记录下出链关系
			context.write(new Text(page), new Text(node.toString()));
			
			//对每个出链指向的页面，它们都会得到当前页面的PR值，这个值对于公式来讲，就是∑求和的元素
			// A\t1.0\tB\tD
			if (node.containsAdjacentNodes()) {
				// 计算出链的价值 = PR/出链数量
				double outValue = node.getPageRank() / node.getAdjacentNodeNames().length;
				for (int i = 0; i < node.getAdjacentNodeNames().length; i++) {
					String outPage = node.getAdjacentNodeNames()[i];
					// 由于当前页面指向B，所以B得到PR值0.5
					// B\t0.5   <"B", 0.5> <"D", 0.5>
					context.write(new Text(outPage), new Text(outValue + ""));
				}
			}
			//最终中间结果包括两类键值对数据，一个是单纯的某个页面得到的PR值列表，一个是每个页面当前PR值和出链指向页面关系
			//                               B\t0.5                  B\t1.0\tC
		}
	}

	static class PageRankReducer extends Reducer<Text, Text, Text, Text> {
		protected void reduce(Text key, Iterable<Text> iterable, Context context)
				throws IOException, InterruptedException {

			// 相同的key为一组
			// key：页面名称比如B
			// 包含两类数据
			// B:1.0 C //页面对应关系及老的pr值

			// B:0.5 //投票值
			// B:0.5

			double sum = 0.0;

			//表示提供出链的页面
			//B\t0.5                  B\t1.0\tC
			Node sourceNode = null;
			for (Text i : iterable) {
				Node node = Node.fromMR(i.toString());
				// B\t0.5和B\t1.0\tC
				if (node.containsAdjacentNodes()) {
					// 在实际中，只有一条记录表示了该页面出链到其他页面的关系，就是这个sourceNode
					sourceNode = node;
				} else {
					// 计算出从所有页面得到的PR值总和
					sum = sum + node.getPageRank();
				}
			}

			
//			A	B	D
//			
//			A 0.5	B	D
//			B	1.5	C
			
			
			
			// 按照公式进行计算，得到新的PR值
			double newPR = (0.15 / 4.0) + (0.85 * sum);
			System.out.println("*********** new pageRank value is " + newPR);

			// 把新的pr值和计算之前的pr比较，sourceNode中有记录原来的PR值：B\t1.0\tC
			double d = newPR - sourceNode.getPageRank();

			int j = (int) (d * 1000.0);
			j = Math.abs(j);
			System.out.println(j + "___________");
			//MyCounter用于记录所有两次计算之间PR值差值的总和
			context.getCounter(Mycounter.my).increment(j);
			//给当前页面设置新的PR值
			sourceNode.setPageRank(newPR);
			//将当前页面信息写到reducer输出中。B\t1.5\tC
			context.write(key, new Text(sourceNode.toString()));
		}
	}
}