Hadoop 2.8.0: Implementing Google's PageRank Algorithm with MapReduce

1. A Brief Introduction to PageRank

PageRank is Google's proprietary algorithm for measuring how important a given web page is relative to the other pages in the search engine's index. It was invented by Larry Page and Sergey Brin in the late 1990s. PageRank turned the value of links into a ranking factor: it is Google's core algorithm for scoring every web page, the key to finding "gold in the garbage", and the algorithm that made Google what it is today.

PageRank has two key properties:
PR values propagate: when page A links to page B, part of A's PR value is passed on to B.
Importance propagates: an important page passes on more weight than an unimportant one.

To keep things simple, this example avoids the case of a page linking to itself.

2. The Algorithm

The update formula, with damping factor d, page count N, and transition-probability matrix M (entry M[i][j] is the probability of jumping from page j to page i):

PR' = d * M * PR + (1 - d) / N

First, compute the probability matrix: each page's out-links become a uniform probability distribution over its targets.

Then update the PR values by multiplying the probability matrix with the current PR vector and applying the damping factor.

Next comes normalization: divide every element of the PR vector by the sum of all PR values.

Finally, iterate the update step; this example simply runs a fixed number of iterations.
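To make the computation concrete before moving to MapReduce, here is a minimal in-memory sketch of the same iteration in plain Java. It assumes the 4-node example graph used in the rest of this post and a uniform starting PR of 1 per page (the actual initial values in pagerank.csv are not shown):

public class PageRankSketch {
    public static void main(String[] args) {
        int n = 4;
        double d = 0.85;
        // M[j] is the out-link probability distribution of page j+1
        // (one column of the transition matrix), matching the sample
        // matrix output shown in section 4
        double[][] M = {
                {0, 1.0 / 3, 1.0 / 3, 1.0 / 3}, // page 1 links to 2, 3, 4
                {0, 0, 0.5, 0.5},               // page 2 links to 3, 4
                {0, 0, 0, 1.0},                 // page 3 links to 4
                {0, 1.0, 0, 0}                  // page 4 links to 2
        };
        double[] pr = {1, 1, 1, 1}; // assumed uniform initial PR values

        // iterate the update PR <- d * M * PR + (1 - d) / n
        for (int iter = 0; iter < 10; iter++) {
            double[] next = new double[n];
            for (int i = 0; i < n; i++) {
                double sum = 0;
                for (int j = 0; j < n; j++) {
                    sum += M[j][i] * pr[j]; // P(j -> i) * PR(j)
                }
                next[i] = d * sum + (1 - d) / n;
            }
            pr = next;
        }

        // normalize: divide each PR value by the sum of all PR values
        double total = 0;
        for (double p : pr) {
            total += p;
        }
        for (int i = 0; i < n; i++) {
            System.out.println((i + 1) + "\t" + pr[i] / total);
        }
    }
}

Because the damped update contracts toward a unique fixed point, the normalized result is essentially independent of the starting vector.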

3. Computation Flow

The flow is: generate the probability matrix, iterate the PR update, then normalize (the original flowchart is omitted).

4. Implementation

1. Generating the probability matrix

Note that this MapReduce job outputs the matrix column by column: the line for user j holds user j's out-link probabilities, which form column j of the transition matrix. For example:

1       0.0,0.33333334,0.33333334,0.33333334
2       0.0,0.0,0.5,0.5
3       0.0,0.0,0.0,1.0
4       0.0,1.0,0.0,0.0
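For reference, an edge list that produces exactly this matrix would look as follows (an assumption: the actual page.csv is not shown in the post; each line holds a source and a target user id, split by HadoopUtils.SPARATOR):

1,2
1,3
1,4
2,3
2,4
3,4
4,2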

code:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// HadoopUtils and HDFSUtils are the author's helper classes (see the GitHub
// link at the end); a minimal sketch of both is given after this class.

public class AdjacencyMatrix {

    /**
     * Emits the adjacency list: (source user id, target user id)
     */
    public static class AdjacencyMapper extends Mapper<LongWritable, Text, Text, Text> {
        Text k = new Text();
        Text v = new Text();

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
            System.out.println("AdjacencyMapper input:");
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // print the current input line
            System.out.println(value.toString());
            String[] strArr = HadoopUtils.SPARATOR.split(value.toString());
            // source user id as key, target user id as value
            k.set(strArr[0]);
            v.set(strArr[1]);
            context.write(k, v);
        }
    }

    /**
     * Input: the adjacency list
     * Output: the adjacency probability matrix, one column per source user
     */
    public static class AdjacencyReducer extends Reducer<Text, Text, Text, Text> {

        Text v = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // each key produces one column of the probability matrix,
            // whose length equals the total number of users
            // number of users (hard-coded for this 4-node example)
            int nums = 4;

            // adjacency vector for this user
            float[] U = new float[nums];
            // number of out-links for this user
            int out = 0;
            StringBuilder printSb = new StringBuilder();
            for (Text value : values) {
                // target user id from the value
                int targetUserIndex = Integer.parseInt(value.toString());
                // mark each linked target with 1 in the adjacency vector; the rest stay 0
                U[targetUserIndex - 1] = 1;
                out++;
                printSb.append(",").append(value.toString());
            }
            // print the reducer's input
            System.out.println("AdjacencyReducer input:");
            System.out.println(key.toString() + ":" + printSb.toString().replaceFirst(",", ""));

            StringBuilder stringBuilder = new StringBuilder();
            for (int i = 0; i < nums; i++) {
                stringBuilder.append(",").append(U[i] / out);
            }
            v.set(stringBuilder.toString().replaceFirst(",", ""));
            System.out.println("AdjacencyReducer output:");
            System.out.println(key.toString() + ":" + v.toString());
            System.out.println();
            context.write(key, v);
        }
    }

    public static void run() throws InterruptedException, IOException, ClassNotFoundException {
        Configuration conf = new Configuration();
        String inPath = "/pagerank/page.csv";
        String outPath = "/pagerank/probility-matrix";

        Job job = Job.getInstance(conf, "AdjacencyMatrix");
        HDFSUtils hdfs = new HDFSUtils(conf);
        hdfs.deleteDir(outPath);
        job.setJarByClass(AdjacencyMatrix.class);
        job.setMapperClass(AdjacencyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setReducerClass(AdjacencyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(inPath));
        FileOutputFormat.setOutputPath(job, new Path(outPath));
        job.waitForCompletion(true);
    }

    public static void main(String[] args) throws Exception {
        AdjacencyMatrix.run();
    }
}
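Neither HadoopUtils nor HDFSUtils is shown in the post; minimal stand-ins consistent with how they are used might look like the following (an assumption: the author's real helpers are in the GitHub repo linked at the end; in particular, SPARATOR is assumed to split on tabs and commas, which matches the edge list, the matrix lines, and the PR lines alike):

import java.io.IOException;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

class HadoopUtils {
    // hypothetical: split fields on tabs or commas
    public static final Pattern SPARATOR = Pattern.compile("[\t,]");
}

class HDFSUtils {
    private final FileSystem fs;

    public HDFSUtils(Configuration conf) throws IOException {
        this.fs = FileSystem.get(conf);
    }

    // recursively delete an output directory if it already exists
    public void deleteDir(String path) throws IOException {
        fs.delete(new Path(path), true);
    }

    // move a file, e.g. the reducer output back over pagerank.csv
    public void rename(String src, String dst) throws IOException {
        fs.rename(new Path(src), new Path(dst));
    }
}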

2. Updating the PR values

code:


import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CalcPageRank {

    /**
     * Input: the adjacency probability matrix and the PR vector
     * Following the matrix-vector multiplication formula, routes each pair of
     * matching entries to the reducer that computes the corresponding PR value
     */
    public static class CalcPeopleRankMapper extends Mapper<LongWritable, Text, Text, Text> {

        Text k = new Text();
        Text v = new Text();
        String flag = "";

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
            FileSplit fileSplit = (FileSplit) context.getInputSplit();
            flag = fileSplit.getPath().getName();
            System.out.println("CalcPeopleRankMapper input type:");
            System.out.println(flag);
        }

        /**
         * The key k pairs each row of the matrix with the PR vector:
         * e.g. row 1 of the matrix must be multiplied with the PR vector
         * to produce the new PR of page 1, so both are sent to reducer 1
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

            System.out.println(value.toString());
            // number of users (hard-coded for this example)
            int nums = 4;
            // handle a line of the PR vector
            if (flag.startsWith("pagerank")) {
                String[] strArr = HadoopUtils.SPARATOR.split(value.toString());
                // strArr[0] is the user id, strArr[1] its current PR value;
                // broadcast this entry to every reducer 1..nums

                for (int i = 1; i <= nums; i++) {
                    k.set(String.valueOf(i));
                    // "pr" is a tag; strArr[0] is the entry's index in the PR vector, strArr[1] its value
                    v.set("pr:" + strArr[0] + "," + strArr[1]);
                    context.write(k, v);
                }
            }
            // handle a line (one column) of the adjacency probability matrix
            else {
                String[] strArr = HadoopUtils.SPARATOR.split(value.toString());
                // strArr[0] is the source user id; its line holds that user's
                // out-probabilities, i.e. one column of the matrix, and entry i
                // of the column belongs to row i, so it is sent to reducer i
                System.out.println("strArr.length " + strArr.length);
                for (int i = 1; i < strArr.length; i++) {
                    k.set(String.valueOf(i));
                    // "matrix" is a tag; strArr[0] is the source user (column index), strArr[i] the probability of linking to user i
                    v.set("matrix:" + strArr[0] + "," + strArr[i]);
                    context.write(k, v);
                }
            }
        }
    }

    /**
     * Each reducer receives the paired values for one entry of the
     * matrix-vector product, e.g. row i of the matrix (P(j -> i) for all j)
     * together with the full PR vector
     */
    public static class CalcPeopleRankReducer extends Reducer<Text, Text, Text, Text> {



        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            System.out.println("CalcPeopleRankReducer input:");
            StringBuilder printStr = new StringBuilder();

            Text v = new Text();
            // number of users
            int nums = 4;
            // damping factor
            float d = 0.85f;

            // accumulator for the new PR value
            float pr = 0f;
            // PR vector entries: user id -> current PR value
            Map<Integer, Float> prMap = new HashMap<Integer, Float>();
            // matrix entries for this key: source user id -> P(source -> key)
            Map<Integer, Float> matrixMap = new HashMap<Integer, Float>();
            // sort the tagged values into the two maps
            for (Text value : values) {
                String valueStr = value.toString();
                String[] kv = HadoopUtils.SPARATOR.split(valueStr.split(":")[1]);
                if (valueStr.startsWith("pr")) {
                    prMap.put(Integer.parseInt(kv[0]), Float.valueOf(kv[1]));
                } else {
                    matrixMap.put(Integer.parseInt(kv[0]), Float.valueOf(kv[1]));
                }
                printStr.append(",").append(valueStr);
            }
            System.out.println(printStr.toString().replaceFirst(",", ""));
            // dot product of the matrix row and the PR vector
            for (Map.Entry<Integer, Float> entry : matrixMap.entrySet()) {
                pr += entry.getValue() * prMap.get(entry.getKey());
            }
            pr = pr * d + (1 - d) / nums;
            v.set(String.valueOf(pr));
            System.out.println("CalcPeopleRankReducer output:");
            System.out.println(key.toString() + ":" + v.toString());
            System.out.println();
            context.write(key, v);
        }
    }

    public static void run() throws InterruptedException, IOException, ClassNotFoundException {
        Configuration conf = new Configuration();
        String inPath1 = "/pagerank/probility-matrix/part-*";
        String inPath2 = "/pagerank/pagerank.csv";
        String outPath = "/pagerank/pr/";
        Job job = Job.getInstance(conf, "CalcPageRank");
        HDFSUtils hdfs = new HDFSUtils(conf);
        hdfs.deleteDir(outPath);
        job.setJarByClass(CalcPageRank.class);
        job.setMapperClass(CalcPeopleRankMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setReducerClass(CalcPeopleRankReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(inPath1));
        FileInputFormat.addInputPath(job, new Path(inPath2));
        FileOutputFormat.setOutputPath(job, new Path(outPath));
        job.waitForCompletion(true);

        hdfs.deleteDir(inPath2);
        hdfs.rename(outPath + "/part-r-00000", inPath2);
    }

    public static void main(String[] args) throws Exception{
        CalcPageRank.run();
    }
}
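As a concrete trace of one reduce call: assuming a uniform initial PR of 0.25 per page (an assumption; the contents of pagerank.csv are not shown in the post), the reducer for key 2 receives the matrix entries P(1 -> 2) = 0.33333334 and P(4 -> 2) = 1.0 (the entries from pages 2 and 3 are 0) together with all four PR entries, and computes

PR(2) = 0.85 * (0.33333334 * 0.25 + 1.0 * 0.25) + 0.15 / 4 ≈ 0.3208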

3. Normalization

The code here is simple: the map phase emits each input line unchanged under the constant key "finally", so a single reducer sees every (user id, PR value) pair and can divide each PR value by the total.
code:


import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Standardization {

    public static class FinallyResultMapper extends Mapper<LongWritable, Text, Text, Text> {

        Text k = new Text("finally");

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
            System.out.println("Standardization input:");
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            System.out.println(value.toString());
            context.write(k, value);
        }
    }

    public static class FinallyResultReducer extends Reducer<Text, Text, Text, Text> {
        Text k = new Text();
        Text v = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            System.out.println("Standardization input:");
            StringBuilder printStr = new StringBuilder();
            float totalPr = 0f;
            List<String> list = new ArrayList<String>();
            for (Text value : values) {
                String valueStr = value.toString();
                list.add(valueStr);

                String[] strArr = HadoopUtils.SPARATOR.split(valueStr);
                totalPr += Float.parseFloat(strArr[1]);

                printStr.append(",").append(valueStr);
            }
            System.out.println(printStr.toString().replaceFirst(",", ""));

            for (String s : list) {
                String[] strArr = HadoopUtils.SPARATOR.split(s);
                k.set(strArr[0]);
                v.set(String.valueOf(Float.parseFloat(strArr[1]) / totalPr));
                context.write(k, v);
                System.out.println("Standardization output:");
                System.out.println(k.toString() + ":" + v.toString());
                System.out.println();
            }
        }
    }

    public static void run() throws InterruptedException, IOException, ClassNotFoundException {
        Configuration conf = new Configuration();
        String inPath = "/pagerank/pagerank.csv";
        String outPath = "/pagerank/finally-result";

        Job job = Job.getInstance(conf, "Standardization");
        HDFSUtils hdfs = new HDFSUtils(conf);
        hdfs.deleteDir(outPath);
        job.setJarByClass(Standardization.class);
        job.setMapperClass(FinallyResultMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setReducerClass(FinallyResultReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(inPath));
        FileOutputFormat.setOutputPath(job, new Path(outPath));
        job.waitForCompletion(true);
    }

    public static void main(String[] args) throws Exception{
        Standardization.run();
    }
}

4. Finally, the iterative computation

import java.io.IOException;

public class PageRankDriver {
    public static void main(String[] args) throws InterruptedException, IOException, ClassNotFoundException {
        // 1. generate the probability matrix
        AdjacencyMatrix.run();
        // 2. iterate the PR update ten times
        for (int i = 0; i < 10; i++) {
            CalcPageRank.run();
        }
        // 3. normalize
        Standardization.run();
    }
}
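To run the whole pipeline on a cluster, package the classes into a jar (jar name hypothetical) and launch the driver:

hadoop jar pagerank.jar PageRankDriver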

Finally, the PR values after ten iterations (normalized, so they sum to about 1):

4   0.3882488
3   0.2032348
2   0.3849407
1   0.023575656

The complete code is on my GitHub.
