- Initialization: append an initial PageRank value of 1.0 to the end of every page line in the original document.
- Iteration: after the Map step, each line is stored in the format page_name \t list_page_name (comma-separated) \t pagerank. Map computes the rank contribution each page sends to its outlinks, and Reduce sums, for each page, all the contributions produced by Map to obtain that page's new PageRank (the exact update formula is given after this list).
- Final sorting and output: after the two steps above, the file is still in the page_name \t list_page_name (comma-separated) \t pagerank format. The list_page_name column is no longer needed at this point, so this step strips it and sorts the pages by PageRank value in descending order.
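The iteration step implements the damped PageRank update with damping factor d = 0.85; as the Reduce code below shows, it applies (1 - d) directly rather than dividing it by the total page count, a common simplification:

PR(p) = (1 - d) + d * Σ_{q -> p} PR(q) / C(q)

where the sum runs over every page q that links to p, and C(q) is the number of outlinks of q.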
1. PageRank_Initialzation: the initialization class, which appends the initial PageRank value of 1.0 to every page line.
package org.apache.hadoop.examples;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class PageRank_Initialzation {
    public static class Map extends Mapper<Object, Text, Text, Text> {
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            // emit the whole input line as the key and "1.0" as the value;
            // TextOutputFormat joins them with a tab
            String pr = "1.0";
            context.write(value, new Text(pr));
        }
    }
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // check the input and output arguments
        if (args.length != 2) {
            System.err.println("Invalid input/output paths");
            System.exit(2);
        }
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://10.102.0.197:9000");
        final String OUTPUT_PATH = args[1];
        Path path = new Path(OUTPUT_PATH);
        // get the file system from the configuration
        FileSystem fileSystem = path.getFileSystem(conf);
        // delete the output directory if it already exists
        if (fileSystem.exists(path)) {
            fileSystem.delete(path, true);
        }
        // job setup
        Job job = Job.getInstance(conf, "PageRank_Initialzation");
        job.setJarByClass(PageRank_Initialzation.class);
        job.setMapperClass(Map.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.waitForCompletion(true);
    }
}
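For a hypothetical input line A \t B,C (a page A whose outlinks are B and C), this job emits the unchanged line as the key and "1.0" as the value, so TextOutputFormat's tab separator produces A \t B,C \t 1.0, exactly the format the iteration job expects.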
2. PageRankIter: the iteration class; each run performs one round of the PageRank update over all pages.
package org.apache.hadoop.examples;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class PageRankIter {
    private static double d = 0.85; // damping factor
    public static class Map extends Mapper<Object, Text, Text, Text> {
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            // input line: page_name \t list_page_name (comma-separated) \t pagerank
            String page[] = value.toString().split("\t");
            String page_name = page[0];
            Text prValue = new Text();
            // pages with no outlink list are skipped here, so a dangling page's
            // rank mass is simply dropped (a simplification of standard PageRank)
            if (page.length > 2) {
                String page_list[] = page[1].split(",");
                double pr = Double.parseDouble(page[2]);
                // distribute this page's rank evenly among its outlinks
                for (String list : page_list) {
                    if (list.isEmpty()) {
                        continue;
                    }
                    prValue.set(String.valueOf(pr / page_list.length));
                    context.write(new Text(list), prValue);
                }
                // re-emit the adjacency list, tagged with '|' so Reduce can tell
                // it apart from the numeric rank contributions
                context.write(new Text(page_name), new Text("|" + page[1]));
            }
        }
    }
    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            String list = "";
            double pr = 0;
            for (Text val : values) {
                if (val.toString().startsWith("|")) {
                    // the tagged value carries the adjacency list
                    list += val.toString().substring(1);
                } else {
                    // everything else is a rank contribution from an inlink
                    pr += Double.parseDouble(val.toString());
                }
            }
            // damped update: PR(p) = (1 - d) + d * sum of contributions
            pr = pr * d + (1 - d);
            String v = String.valueOf(pr);
            context.write(key, new Text(list + "\t" + v));
        }
    }
    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Invalid input/output paths");
            System.exit(2);
        }
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://10.102.0.197:9000");
        final String OUTPUT_PATH = args[1];
        Path path = new Path(OUTPUT_PATH);
        FileSystem fileSystem = path.getFileSystem(conf);
        // delete the output directory if it already exists
        if (fileSystem.exists(path)) {
            fileSystem.delete(path, true);
        }
        Job job = Job.getInstance(conf, "PageRank_Iter");
        job.setJarByClass(PageRankIter.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.waitForCompletion(true);
    }
}
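To sanity-check what ten rounds of this job compute, here is a minimal local sketch (not part of the Hadoop code; the class name PageRankLocalCheck and the three-page graph are invented for illustration) that applies the same Map/Reduce logic in memory:

import java.util.*;

public class PageRankLocalCheck {
    public static void main(String[] args) {
        double d = 0.85;
        // adjacency lists: A -> B,C ; B -> C ; C -> A (illustrative graph)
        Map<String, List<String>> links = new LinkedHashMap<>();
        links.put("A", Arrays.asList("B", "C"));
        links.put("B", Arrays.asList("C"));
        links.put("C", Arrays.asList("A"));
        // same initialization as PageRank_Initialzation: every rank starts at 1.0
        Map<String, Double> pr = new LinkedHashMap<>();
        for (String p : links.keySet()) pr.put(p, 1.0);
        // same round count as PageRankDriver
        for (int i = 0; i < 10; i++) {
            Map<String, Double> next = new LinkedHashMap<>();
            for (String p : links.keySet()) next.put(p, 0.0);
            for (Map.Entry<String, List<String>> e : links.entrySet()) {
                // Map step: split this page's rank evenly over its outlinks
                double share = pr.get(e.getKey()) / e.getValue().size();
                // Reduce step: sum the incoming contributions per page
                for (String target : e.getValue())
                    next.put(target, next.get(target) + share);
            }
            // damped update, as in Reduce: pr = d * sum + (1 - d)
            for (String p : next.keySet())
                next.put(p, d * next.get(p) + (1 - d));
            pr = next;
        }
        System.out.println(pr);
    }
}

For the same graph and ten rounds, its final values should match the ranks the chained Hadoop jobs produce.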
3. PageRankViewer: the output class; it drops the adjacency list and sorts the pages by PageRank in descending order.
package org.apache.hadoop.examples;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class PageRankViewer {
    public static class Map extends Mapper<Object, Text, DoubleWritable, Text> {
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            // input line: page_name \t list_page_name \t pagerank;
            // emit the rank as the key so the shuffle sorts by it
            String line[] = value.toString().split("\t");
            DoubleWritable pr = new DoubleWritable();
            pr.set(Double.parseDouble(line[2]));
            context.write(pr, new Text(line[0]));
        }
    }
    // sort keys in descending order by negating the default comparison
    public static class DescFloatComparator extends DoubleWritable.Comparator {
        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            return -super.compare(a, b);
        }
        @Override
        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
            return -super.compare(b1, s1, l1, b2, s2, l2);
        }
    }
    public static class Reduce extends Reducer<DoubleWritable, Text, Text, Text> {
        public void reduce(DoubleWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // format each page as (page_name,rank); the comma between key and
            // value comes from the output separator configured in main
            // (note: pages with exactly equal ranks arrive in one reduce call
            // and are concatenated into a single key)
            String out_key = "(";
            String out_val = "";
            for (Text val : values) {
                out_key += val.toString();
            }
            out_val = String.format("%.10f", key.get()) + ")";
            context.write(new Text(out_key), new Text(out_val));
        }
    }
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        if (args.length != 2) {
            System.err.println("Invalid input/output paths");
            System.exit(2);
        }
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://10.102.0.197:9000");
        // use ',' instead of the default tab between the output key and value
        conf.set("mapred.textoutputformat.ignoreseparator", "true");
        conf.set("mapred.textoutputformat.separator", ",");
        final String OUTPUT_PATH = args[1];
        Path path = new Path(OUTPUT_PATH);
        FileSystem fileSystem = path.getFileSystem(conf);
        if (fileSystem.exists(path)) {
            fileSystem.delete(path, true);
        }
        Job job = Job.getInstance(conf, "PageRankViewer");
        job.setJarByClass(PageRankViewer.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setSortComparatorClass(DescFloatComparator.class);
        job.setMapOutputKeyClass(DoubleWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.waitForCompletion(true);
    }
}
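With the comma separator configured in main, each output line has the form (page_name,rank), for example (A,1.4250000000) for a hypothetical page A. Note that the job never calls setNumReduceTasks, so the default single reducer produces one globally sorted file; with several reducers, each part file would only be sorted internally.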
4. PageRankDriver: the driver class, which runs the main methods of the three PageRank steps in sequence.
package org.apache.hadoop.examples;
public class PageRankDriver {
    public static void main(String[] args) throws Exception {
        // the input and final output directories are hard-coded here
        String[] otherArgs = new String[]{"/Experiment_3", "Experiment_3_Hadoop"};
        if (otherArgs.length != 2) {
            System.err.println("Invalid input/output paths");
            System.exit(2);
        }
        // step 1: append the initial rank of 1.0, writing to temp0
        String temp = "temp";
        String[] PR_Ini = { otherArgs[0], temp + "0" };
        PageRank_Initialzation.main(PR_Ini);
        // step 2: run ten rounds of the iteration job, chaining temp(i) -> temp(i+1)
        String[] temp_PRIter_args = { "", "" };
        int times = 10;
        for (int i = 0; i < times; i++) {
            temp_PRIter_args[0] = temp + i;
            temp_PRIter_args[1] = temp + (i + 1);
            PageRankIter.main(temp_PRIter_args);
        }
        // step 3: sort the last round's output in descending rank order
        String[] final_PR = { temp + times, otherArgs[1] };
        PageRankViewer.main(final_PR);
    }
}
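The driver chains the jobs through HDFS directories: /Experiment_3 -> temp0 (initialization), temp0 -> temp1 -> ... -> temp10 (ten iterations), and finally temp10 -> Experiment_3_Hadoop (sorted output). The intermediate temp directories are not cleaned up afterwards; since each job deletes its own output directory before running, repeated runs of the driver still work.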