package package1.pagerank;
import java.io.IOException;
import java.util.ArrayList;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class MyPageRank {
public static class PageRankMap extends Mapper<Object, Text, IntWritable, Text> {
private IntWritable id;
private String pr;
private int count;
private float average_pr;
public void map(Object key, Text value, Context context)
{
StringTokenizer str = new StringTokenizer(value.toString());
if(str.hasMoreTokens())
{
id = new IntWritable(Integer.parseInt(str.nextToken()));
}else{
return;
}
pr = str.nextToken();
count = str.countTokens();
average_pr = Float.parseFloat(pr)/count;
while(str.hasMoreTokens())
{
try{
String nextId = str.nextToken();
IntWritable linid = new IntWritable(Integer.parseInt(nextId));
//将网页向外链接的ID以“pr+得到贡献值”格式输出
Text avpr = new Text("pr" + average_pr);
context.write(linid, avpr);
// 将网页ID和PR值输出
Text ids = new Text("id" + nextId);
context.write(id, ids);
}catch(IOException e)
{
e.printStackTrace();
}catch (InterruptedException e) {
e.printStackTrace();
}
}
}
}
public static class PageRankReducer extends Reducer<IntWritable, Text, IntWritable, Text>{
public void reduce(IntWritable key, Iterable<Text> values,
Context context) {
// 定义一个存储网页链接ID的队列
ArrayList<String> ids = new ArrayList<String>();
// 将所有的链接ID以String格式保存
String strid = " ";
// 定义一个保存网页PR值的变量
float pr = 0;
//遍历
System.out.println(key.get());
for(Text txt : values) {
String str = txt.toString();
//判断value是贡献值还是向外部的链接
if (str.startsWith("pr")) {
// 贡献值
pr += Float.parseFloat(str.substring(2));
System.out.println(pr);
} else if (str.startsWith("id")) {
// 链接id
String id = str.substring(2);
ids.add(id);
}
}
// 得到所有链接ID的String形式
for (int i = 0; i < ids.size(); i++) {
strid += ids.get(i) + " ";
}
// 组合pr+lianjie成原文件的格式类型
String strpr = String.format("%.5f", pr);
String result = strpr + strid;
try {
context.write(key, new Text(result));
} catch (IOException e) {
e.printStackTrace();
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
public static void main(String[] args) throws IOException,
InterruptedException, ClassNotFoundException {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
String paths= otherArgs[0];
String path1= paths;
String path2="";
for (int i = 1; i <= 10; i++) {
Job job = new Job(conf, "MapReduce pagerank");
path2 = paths + i;
job.setJarByClass(MyPageRank.class);
job.setMapperClass(PageRankMap.class);
job.setReducerClass(PageRankReducer.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(path1));
FileOutputFormat.setOutputPath(job, new Path(path2));
path1 = path2;
job.waitForCompletion(true);
}
}
}
import java.io.IOException;
import java.util.ArrayList;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
// NOTE(review): this entire class is an accidental duplicate of the
// MyPageRank class defined earlier in this file, and the import block
// directly above it sits after a class body, which is illegal in Java.
// The file cannot compile with both copies present — this second copy
// (together with its import block) should be deleted.
public class MyPageRank {
// Mapper: parses a line "<id> <pr> <link1> <link2> ..." and emits rank
// contributions to each outgoing link plus this page's adjacency records.
public static class PageRankMap extends Mapper<Object, Text, IntWritable, Text> {
private IntWritable id;
private String pr;
private int count;
private float average_pr;
public void map(Object key, Text value, Context context)
{
StringTokenizer str = new StringTokenizer(value.toString());
if(str.hasMoreTokens())
{
// first token: the page id
id = new IntWritable(Integer.parseInt(str.nextToken()));
}else{
return;
}
// second token: the page's current PR value
// NOTE(review): unguarded nextToken() — throws NoSuchElementException on a
// line that contains only an id; confirm input always has a PR field
pr = str.nextToken();
// remaining tokens are the outgoing links; count is the out-degree
count = str.countTokens();
// each linked page gets an equal share of this page's PR
// NOTE(review): divides by zero when the page has no outlinks
average_pr = Float.parseFloat(pr)/count;
while(str.hasMoreTokens())
{
try{
String nextId = str.nextToken();
IntWritable linid = new IntWritable(Integer.parseInt(nextId));
// emit the rank contribution to the linked page, tagged with "pr"
Text avpr = new Text("pr" + average_pr);
context.write(linid, avpr);
// emit this page's outgoing-link record, tagged with "id"
Text ids = new Text("id" + nextId);
context.write(id, ids);
}catch(IOException e)
{
// NOTE(review): swallowing the exception silently drops records;
// map() should declare and propagate these instead
e.printStackTrace();
}catch (InterruptedException e) {
e.printStackTrace();
}
}
}
}
// Reducer: sums the "pr"-tagged contributions into the new rank and
// re-attaches the "id"-tagged links in the mapper's input format.
public static class PageRankReducer extends Reducer<IntWritable, Text, IntWritable, Text>{
public void reduce(IntWritable key, Iterable<Text> values,
Context context) {
// collects this page's outgoing-link ids
ArrayList<String> ids = new ArrayList<String>();
// accumulates the link ids as a single space-separated string
String strid = " ";
// the page's new PR value (sum of incoming contributions)
float pr = 0;
// debug output of the page id being reduced
System.out.println(key.get());
for(Text txt : values) {
String str = txt.toString();
// the tag prefix distinguishes contributions from adjacency records
if (str.startsWith("pr")) {
// rank contribution from a page linking to this one
pr += Float.parseFloat(str.substring(2));
System.out.println(pr);
} else if (str.startsWith("id")) {
// one of this page's outgoing links
String id = str.substring(2);
ids.add(id);
}
}
// join all link ids into one string (note the trailing space)
for (int i = 0; i < ids.size(); i++) {
strid += ids.get(i) + " ";
}
// rebuild "<pr> <links...>" in the same format the mapper consumes
String strpr = String.format("%.5f", pr);
String result = strpr + strid;
try {
context.write(key, new Text(result));
} catch (IOException e) {
// NOTE(review): should propagate rather than swallow — see mapper
e.printStackTrace();
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
// Driver: chains 10 PageRank iterations; iteration i reads the previous
// iteration's output and writes to "<input>" + i.
public static void main(String[] args) throws IOException,
InterruptedException, ClassNotFoundException {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
String paths= otherArgs[0];
String path1= paths;
String path2="";
for (int i = 1; i <= 10; i++) {
// NOTE(review): Job(Configuration, String) is deprecated — prefer Job.getInstance
Job job = new Job(conf, "MapReduce pagerank");
path2 = paths + i;
job.setJarByClass(MyPageRank.class);
job.setMapperClass(PageRankMap.class);
job.setReducerClass(PageRankReducer.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(path1));
FileOutputFormat.setOutputPath(job, new Path(path2));
// next iteration consumes this iteration's output
path1 = path2;
// NOTE(review): return value ignored — a failed iteration does not stop the chain
job.waitForCompletion(true);
}
}
}