学习Hadoop已经有段时间了,自己尝试写了一下PageRank算法。
1.mapper
package myPageRank;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
import java.util.StringTokenizer;
/**
* @author tulie
* @create $(YEAR)-$(MONTH)-$(DAY)-$(TIME)
*/
public class PRMapper extends Mapper<Object, Text, IntWritable, Text> {
private IntWritable id;
private String pr;
private int count;
private float average_pr;
//重写map方法
public void map(Object key, Text value, Context context)
{
//构造一个用来解析 str 的 StringTokenizer 对象。java 默认的分隔符是空格("")、制表符(\t)、换行符(\n)、回车符(\r)。
StringTokenizer str = new StringTokenizer(value.toString()); //|1 6 2 4 5
if(str.hasMoreTokens())//返回是否还有分隔符
{
id = new IntWritable(Integer.parseInt(str.nextToken()));//1 |6 2 4 5
}else{
return;
}
pr = str.nextToken();//返回从当前位置到下一个分隔符的字符串 1 6 |2 4 5
count = str.countTokens();//3
average_pr = Float.parseFloat(pr)/count;
while(str.hasMoreTokens())
{
try{
String nextId = str.nextToken();//1 6 2| 4 5
IntWritable linid = new IntWritable(Integer.parseInt(nextId));
//将网页向外链接的ID以“pr+得到贡献值”格式输出
Text avpr = new Text("pr" + average_pr);
context.write(linid, avpr);
// 将网页ID和PR值输出
Text ids = new Text("id" + nextId);
context.write(id, ids);
}catch(IOException e)
{
e.printStackTrace();
}catch (InterruptedException e) {
e.printStackTrace();
}
}
}
}
2.reducer
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
* @author tulie
* @create $(YEAR)-$(MONTH)-$(DAY)-$(TIME)
*/
public class PRReducer extends Reducer<IntWritable, Text, IntWritable, Text> {
//重写reduce方法
public void reduce(IntWritable key, Iterable<Text> values,
Context context) {
// 定义一个存储网页链接ID的队列
ArrayList<String> ids = new ArrayList<String>();
// 将所有的链接ID以String格式保存
String strid = " ";
// 定义一个保存网页PR值的变量
float pr = 0;
//遍历
System.out.println(key.get());
for(Text txt : values) {
String str = txt.toString();
//判断value是贡献值还是向外部的链接
if (str.startsWith("pr")) {
// 贡献值
pr += Float.parseFloat(str.substring(2));
System.out.println(pr);
} else if (str.startsWith("id")) {
// 链接id
String id = str.substring(2);
ids.add(id);
}
}
pr = 0.85f*pr + 0.15f;
// 得到所有链接ID的String形式
for (int i = 0; i < ids.size(); i++) {
strid += ids.get(i) + " ";
}
// 组合pr+lianjie成原文件的格式类型
String strpr = String.format("%.5f", pr);
String result = strpr + strid;
try {
context.write(key, new Text(result));
} catch (IOException e) {
e.printStackTrace();
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
3.Driver
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.lib.ChainMapper;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
* @author tulie
* @create $(YEAR)-$(MONTH)-$(DAY)-$(TIME)
*/
public class PRDriver {
public static void main(String[] args) throws Exception{
Configuration conf = new Configuration();
String pathIn1 = “D:/PageRank/input/input.txt”;
String pathOut=“D:/PageRank/output1/PageRankOutput0”;
for(int i=1;i<41;i++){ //加入for循环
Job job = Job.getInstance(conf, “MapReduce pagerank”);
job.setJarByClass(PRDriver.class);
job.setMapperClass(PRMapper.class);
job.setReducerClass(PRReducer.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(pathIn1));
FileOutputFormat.setOutputPath(job, new Path(pathOut));
pathIn1 = pathOut;//把输出的地址改成下一次迭代的输入地址
pathOut = pathOut+i;//把下一次的输出设置成一个新地址。
job.waitForCompletion(true);//把System.exit()去掉
}
}