MR 矩阵相乘

最新推荐文章于 2024-07-18 20:21:46 发布

Oasen

最新推荐文章于 2024-07-18 20:21:46 发布

阅读量1.4k

点赞数

分类专栏： Hadoop

本文链接：https://blog.csdn.net/dec_sun/article/details/93753805

版权

Hadoop 专栏收录该内容

8 篇文章 0 订阅

订阅专栏

文章目录

矩阵乘法:
- 第一种方法
- 第二种方法

矩阵乘法:

A 矩阵	B 矩阵
1, 2, 3	7, 9
4, 5, 6	4, 6
7, 8, 9	1, 3
10, 11, 12

一般矩阵乘法为：
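设 A 为 m×n 矩阵，B 为 n×p 矩阵，则 C = A·B 为 m×p 矩阵，其中每个元素为：

$$c_{ij} = \sum_{k=1}^{n} a_{ik}\, b_{kj}, \qquad 1 \le i \le m,\ 1 \le j \le p$$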

MR 实现分析
因为分布式计算的特点，需要找到相互独立的计算过程，以便能够在不同的节点上进行计算而不会彼此影响。根据矩阵乘法的公式（设 A 为 m×n 矩阵、B 为 n×p 矩阵），C 中各个元素的计算都是相互独立的，即各个 cij 在计算过程中彼此不影响。这样的话，在 Map 阶段可以把计算所需要的元素都集中到同一个 key 中，然后在 Reduce 阶段就可以从中解析出各个元素来计算 cij。另外，以 a11 为例，它将会在 c11、c12……c1p 的计算中使用。也就是说，在 Map 阶段，当我们从 HDFS 取出一行记录时，如果该记录是 A 的元素，则需要存储成 p 个 <key,value> 对，并且这 p 个 key 互不相同；如果该记录是 B 的元素，则需要存储成 m 个 <key,value> 对，同样的，这 m 个 key 也应互不相同；但同时，用于计算 cij 的 ai1、ai2……ain 和 b1j、b2j……bnj 所在的 <key,value> 对的 key 应该都是相同的，这样它们才能被传递到同一个 Reduce 中。

在这里插入图片描述

第一种方法

切割。
key 由两个值组成：<结果元素所在的行号（即矩阵 A 的行号）, 结果元素所在的列号（即矩阵 B 的列号）>
value 由三个值组成：<矩阵标志符（a 或 b）, 该元素在 A 中的列号 / 在 B 中的行号, 该元素的值>
重组。
以 key 相同的记录为一组，将 <a,偏移量,value1> 与 <b,偏移量,value2> 中偏移量相同的 value1 与 value2 相乘，然后把同一组 key 下各个偏移量的乘积 (value1 * value2) 累加得到 sum，最终输出 <key, sum>。

代码

public class MatrixMapReduce extends Configured implements Tool{ 
    public static class MatrixMapper extends 
            Mapper<LongWritable, Text, Text, Text> { 
        private String flag = null;// 数据集名称 
        private int rowNum = 4;// 矩阵A的行数 
        private int colNum = 2;// 矩阵B的列数 
        private int rowIndexA = 1; // 矩阵A，当前在第几行 
        private int rowIndexB = 1; // 矩阵B，当前在第几行 
 
        @Override 
        protected void setup(Context context) throws IOException, 
                InterruptedException { 
            flag = ((FileSplit) context.getInputSplit()).getPath().getName();
        } 
 
        @Override 
        protected void map(LongWritable key, Text value, Context context) 
                throws IOException, InterruptedException { 
            String[] tokens = value.toString().split(","); 
            if ("test1".equals(flag)) { 
                for (int i = 1; i <= colNum; i++) { 
                    Text k = new Text(rowIndexA + "," + i);  //A 数组, 构建 key,
                    for (int j = 0; j < tokens.length; j++) { 
                        Text v = new Text("a," + (j + 1) + "," + tokens[j]);  //A数组, 构建value
                        context.write(k, v); 
                    } 
                } 
                rowIndexA++;// 每执行一次map方法，矩阵向下移动一行 
            } else if ("test2".equals(flag)) { 
                for (int i = 1; i <= rowNum; i++) { 
                    for (int j = 0; j < tokens.length; j++) { 
                        Text k = new Text(i + "," + (j + 1));  //B数组, 构建 key.
                        Text v = new Text("b," + rowIndexB + "," + tokens[j]);  B数组, 构建 value
                        context.write(k, v); 
                    } 
                } 
                rowIndexB++;// 每执行一次map方法，矩阵向下移动一行 
            } 
        } 
    } 
 
    public static class MatrixReducer extends 
            Reducer<Text, Text, Text, IntWritable> { 
        @Override 
        protected void reduce(Text key, Iterable<Text> values, Context context) 
                throws IOException, InterruptedException { 
            Map<String, String> mapA = new HashMap<String, String>(); 
            Map<String, String> mapB = new HashMap<String, String>(); 
 
            for (Text value : values) { 
                String[] val = value.toString().split(","); 
                if ("a".equals(val[0])) { 
                    mapA.put(val[1], val[2]); 
                } else if ("b".equals(val[0])) { 
                    mapB.put(val[1], val[2]); 
                } 
            } 
 
            int result = 0; 
            Iterator<String> mKeys = mapA.keySet().iterator(); 
            while (mKeys.hasNext()) { 
                String mkey = mKeys.next(); 
                if (mapB.get(mkey) == null) {// 因为mkey取的是mapA的key集合，所以只需要判断 mapB 是否存在即可。 
                    continue;
                    } 
                result += Integer.parseInt(mapA.get(mkey)) 
                        * Integer.parseInt(mapB.get(mkey)); 
            } 
            context.write(key, new IntWritable(result)); 
        } 
    } 
  
    // driver:任务相关设置
    public int run(String[] args) throws Exception {
        // 获取相关配置
        Configuration conf = this.getConf();
        Job job = Job.getInstance(conf, this.getClass().getSimpleName());
        job.setJarByClass(MatrixMapReduce.class);      
       
        FileInputFormat.setInputPaths(job, new Path(args[0]), new Path(args[1]));// 加载2个输入数据集 
        Path outputPath = new Path(args[2]); 
        outputPath.getFileSystem(conf).delete(outputPath, true);
        FileSystem fs = FileSystem.get(new URI(outputPath.toString()), conf);
        if (fs.exists(new Path(outputPath.toString()))) {
            fs.delete(new Path(outputPath.toString()), true);
        }
        FileOutputFormat.setOutputPath(job, outputPath);
        // 设置map
        job.setMapperClass(MatrixMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // 设置reduce
        job.setReducerClass(MatrixReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // 将job提交给Yarn
        boolean isSuccess = job.waitForCompletion(true);
        return isSuccess ? 0 : 1;
    }
    public static void main(String[] args) throws Exception { 
        String input1 = "hdfs://lee01.cniao5.com:8020/user/root/mapreduce/input/ma"; 
        String input2 = "hdfs://lee01.cniao5.com:8020/user/root/mapreduce/input/mb"; 
        String output = "hdfs://lee01.cniao5.com:8020/user/root/mapreduce/output"; 
 
        Configuration conf = new Configuration(); 
 
        args = new String[] {input1,input2,output};
 
        int statas = ToolRunner.run(conf, new MatrixMapReduce(), args);
 
        System.exit(statas);
    } 
}

第二种方法

第一种方法使用的数据结构是我们通常印象中的矩阵表达方式：同一行内不同列的数据通过 "," 分割，不同行通过换行分割。

通常，我们不采用这种方式，而是采用行列表示法，即文件中的每行数据有三个通过分隔符分割的元素：第一个元素表示行号，第二个元素表示列号，第三个元素表示该位置上的值。这种方式可以不列出值为 0 的元素，从而减少稀疏矩阵的数据量。例如：

行, 列, 值
1, 1, 1
1, 2, 2
1, 3, 3
2, 1, 4
…

public class MatrixMapReduce2 extends Configured implements Tool{ 
    public static class MatrixMapper extends 
            Mapper<LongWritable, Text, Text, Text> { 
        private String flag = null;// 数据集名称 
        private int rowNum = 4;// 矩阵A的行数 
        private int colNum = 2;// 矩阵B的列数 
 
        @Override 
        protected void setup(Context context) throws IOException, 
                InterruptedException { 
            flag = ((FileSplit) context.getInputSplit()).getPath().getName();
        } 
 
        @Override 
        protected void map(LongWritable key, Text value, Context context) 
                throws IOException, InterruptedException { 
            String[] tokens = value.toString().split(","); 
            if ("test3".equals(flag)) { 
                for (int i = 1; i <= colNum; i++) { 
                    context.write(new Text(tokens[0] + "," + i), new Text("a," 
                            + tokens[1] + "," + tokens[2])); 
                }
            } else if ("test4".equals(flag)) {  
                    for (int i = 1; i <= rowNum; i++) { 
                        context.write(new Text(i + "," + tokens[1]), new Text("b," 
                                + tokens[0] + "," + tokens[2])); 
                    }
                } 
            
        } 
    } 
 
    public static class MatrixReducer extends 
            Reducer<Text, Text, Text, IntWritable> { 
        @Override 
        protected void reduce(Text key, Iterable<Text> values, Context context) 
                throws IOException, InterruptedException { 
            Map<String, String> mapA = new HashMap<String, String>(); 
            Map<String, String> mapB = new HashMap<String, String>(); 
 
            for (Text value : values) { 
                String[] val = value.toString().split(","); 
                if ("a".equals(val[0])) { 
                    mapA.put(val[1], val[2]); 
                } else if ("b".equals(val[0])) { 
                    mapB.put(val[1], val[2]); 
                } 
            } 
 
            int result = 0; 
            Iterator<String> mKeys = mapA.keySet().iterator(); 
            while (mKeys.hasNext()) { 
                String mkey = mKeys.next(); 
                if (mapB.get(mkey) == null) {// 因为mkey取的是mapA的key集合，所以只需要判断 mapB 是否存在即可。 
                    continue; 
                } 
                result += Integer.parseInt(mapA.get(mkey)) 
                        * Integer.parseInt(mapB.get(mkey)); 
            } 
            context.write(key, new IntWritable(result)); 
        } 
    } 
   
    // driver:任务相关设置
    public int run(String[] args) throws Exception {
        // 获取相关配置
        Configuration conf = this.getConf();
        Job job = Job.getInstance(conf, this.getClass().getSimpleName());
        job.setJarByClass(MatrixMapReduce2.class);     
       
        FileInputFormat.setInputPaths(job, new Path(args[0]), new Path(args[1]));// 加载2个输入数据集 
       
        Path outputPath = new Path(args[2]); 
        outputPath.getFileSystem(conf).delete(outputPath, true);
        FileSystem fs = FileSystem.get(new URI(outputPath.toString()), conf);
        if (fs.exists(new Path(outputPath.toString()))) {
            fs.delete(new Path(outputPath.toString()), true);
        }
        FileOutputFormat.setOutputPath(job, outputPath);
        // 设置map
        job.setMapperClass(MatrixMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // 设置reduce
        job.setReducerClass(MatrixReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // 将job提交给Yarn
        boolean isSuccess = job.waitForCompletion(true);
        return isSuccess ? 0 : 1;
    }
 
    public static void main(String[] args) throws Exception { 
        String input1 = "hdfs://lee01.cniao5.com:8020/user/root/mapreduce/input/na"; 
        String input2 = "hdfs://lee01.cniao5.com:8020/user/root/mapreduce/input/nb"; 
        String output = "hdfs://lee01.cniao5.com:8020/user/root/mapreduce/output"; 
        Configuration conf = new Configuration(); 
        args = new String[] {input1,input2,output};
        int statas = ToolRunner.run(conf, new MatrixMapReduce2(), args);
 
        System.exit(statas);
    } 
}