package package1.pagerank;
import java.io.IOException;
import java.util.ArrayList;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class MyPageRank {
public static class PageRankMap extends Mapper<Object, Text, IntWritable, Text> {
private IntWritable id;
private String pr;
private int count;
private float average_pr;
public void map(Object key, Text value, Context context)
{
StringTokenizer str = new StringTokenizer(value.toString());
if(str.hasMoreTokens())
{
id = new IntWritable(Integer.parseInt(str.nextToken()));
}else{
return;
}
pr = str.nextToken();
count = str.countTokens();
average_pr = Float.parseFloat(pr)/count;
while(str.hasMoreTokens())
{
try{
String nextId = str.nextToken();
IntWritable linid = new IntWritable(Integer.parseInt(nextId));
//将网页向外链接的ID以“pr+得到贡献值”格式输出
Text avpr = new Text("pr" + average_pr);
context.write(linid, avpr);
// 将网页ID和PR值输出
Text ids = new Text("id" + nextId);
context.write(id, ids);
}catch(IOException e)
{
e.printStackTrace();
}catch (InterruptedException e) {
e.printStackTrace();
}
}
}
}
public static class PageRankReducer extends Reducer<IntWritable, Text, IntWritable, Text>{
public void reduce(IntWritable key, Iterable<Text> values,
Context context) {
// 定义一个存储网页链接ID的队列
ArrayList<String> ids = new ArrayList<String>();
// 将所有的链接ID以String格式保存
String strid = " ";
// 定义一个保存网页PR值的变量
float pr = 0;
//遍历
System.out.println(key.get());
for(Text txt : values) {
String str = txt.toString();
//判断value是贡献值还是向外部的链接
if (str.startsWith("pr")) {
// 贡献值
pr += Float.parseFloat(str.substring(2));
System.out.println(pr);
} else if (str.startsWith("id")) {
// 链接id
String id = str.substring(2);
ids.add(id);
}
}
// 得到所有链接ID的String形式
for (int i = 0; i < ids.size(); i++) {
strid += ids.get(i) + " ";
}
// 组合pr+lianjie成原文件的格式类型
String strpr = String.format("%.5f", pr);
String result = strpr + strid;
try {
context.write(key, new Text(result));
} catch (IOException e) {
e.printStackTrace();
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
public static void main(String[] args) throws IOException,
InterruptedException, ClassNotFoundException {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
String paths= otherArgs[0];
String path1= paths;
String path2="";
for (int i = 1; i <= 10; i++) {
Job job = new Job(conf, "MapReduce pagerank");
path2 = paths + i;
job.setJarByClass(MyPageRank.class);
job.setMapperClass(PageRankMap.class);
job.setReducerClass(PageRankReducer.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(path1));
FileOutputFormat.setOutputPath(job, new Path(path2));
path1 = path2;
job.waitForCompletion(true);
}
}
}
import java.io.IOException;
import java.util.ArrayList;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
// NOTE(review): this entire class is an accidental duplicate of the
// MyPageRank class defined earlier in this file, and the import block
// directly above it sits after a class body, which is illegal in Java.
// The file cannot compile with both copies present — this second copy
// (together with its import block) should be deleted.
public class MyPageRank {
// Mapper: parses a line "<id> <pr> <link1> <link2> ..." and emits rank
// contributions to each outgoing link plus this page's adjacency records.
public static class PageRankMap extends Mapper<Object, Text, IntWritable, Text> {
private IntWritable id;
private String pr;
private int count;
private float average_pr;
public void map(Object key, Text value, Context context)
{
StringTokenizer str = new StringTokenizer(value.toString());
if(str.hasMoreTokens())
{
// first token: the page id
id = new IntWritable(Integer.parseInt(str.nextToken()));
}else{
return;
}
// second token: the page's current PR value
// NOTE(review): unguarded nextToken() — throws NoSuchElementException on a
// line that contains only an id; confirm input always has a PR field
pr = str.nextToken();
// remaining tokens are the outgoing links; count is the out-degree
count = str.countTokens();
// each linked page gets an equal share of this page's PR
// NOTE(review): divides by zero when the page has no outlinks
average_pr = Float.parseFloat(pr)/count;
while(str.hasMoreTokens())
{
try{
String nextId = str.nextToken();
IntWritable linid = new IntWritable(Integer.parseInt(nextId));
// emit the rank contribution to the linked page, tagged with "pr"
Text avpr = new Text("pr" + average_pr);
context.write(linid, avpr);
// emit this page's outgoing-link record, tagged with "id"
Text ids = new Text("id" + nextId);
context.write(id, ids);
}catch(IOException e)
{
// NOTE(review): swallowing the exception silently drops records;
// map() should declare and propagate these instead
e.printStackTrace();
}catch (InterruptedException e) {
e.printStackTrace();
}
}
}
}
// Reducer: sums the "pr"-tagged contributions into the new rank and
// re-attaches the "id"-tagged links in the mapper's input format.
public static class PageRankReducer extends Reducer<IntWritable, Text, IntWritable, Text>{
public void reduce(IntWritable key, Iterable<Text> values,
Context context) {
// collects this page's outgoing-link ids
ArrayList<String> ids = new ArrayList<String>();
// accumulates the link ids as a single space-separated string
String strid = " ";
// the page's new PR value (sum of incoming contributions)
float pr = 0;
// debug output of the page id being reduced
System.out.println(key.get());
for(Text txt : values) {
String str = txt.toString();
// the tag prefix distinguishes contributions from adjacency records
if (str.startsWith("pr")) {
// rank contribution from a page linking to this one
pr += Float.parseFloat(str.substring(2));
System.out.println(pr);
} else if (str.startsWith("id")) {
// one of this page's outgoing links
String id = str.substring(2);
ids.add(id);
}
}
// join all link ids into one string (note the trailing space)
for (int i = 0; i < ids.size(); i++) {
strid += ids.get(i) + " ";
}
// rebuild "<pr> <links...>" in the same format the mapper consumes
String strpr = String.format("%.5f", pr);
String result = strpr + strid;
try {
context.write(key, new Text(result));
} catch (IOException e) {
// NOTE(review): should propagate rather than swallow — see mapper
e.printStackTrace();
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
// Driver: chains 10 PageRank iterations; iteration i reads the previous
// iteration's output and writes to "<input>" + i.
public static void main(String[] args) throws IOException,
InterruptedException, ClassNotFoundException {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
String paths= otherArgs[0];
String path1= paths;
String path2="";
for (int i = 1; i <= 10; i++) {
// NOTE(review): Job(Configuration, String) is deprecated — prefer Job.getInstance
Job job = new Job(conf, "MapReduce pagerank");
path2 = paths + i;
job.setJarByClass(MyPageRank.class);
job.setMapperClass(PageRankMap.class);
job.setReducerClass(PageRankReducer.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(path1));
FileOutputFormat.setOutputPath(job, new Path(path2));
// next iteration consumes this iteration's output
path1 = path2;
// NOTE(review): return value ignored — a failed iteration does not stop the chain
job.waitForCompletion(true);
}
}
}