CUHK-IEMS5730-HW1
Environment
- Google Cloud Platform
- Ubuntu 14.04 LTS
- Instance: 2 cores, 8 GB RAM, 50 GB storage
- Openjdk-7-jdk/jre
- Hadoop 2.9.2 with Yarn
1. MapReduce Source Code
I use Preprocess.java to add labels to the original dataset, and use two MapReduce jobs to perform the matrix multiplication: MatrixMultiplication1.java and MatrixMultiplication2.java.
Part I: Preprocess.java
import java.io.*;
/**
 * Pre-processing step for the two-pass MapReduce matrix multiplication.
 *
 * <p>Appends a matrix label to every line of the M and N data files so the
 * downstream mapper can tell the two matrices apart, and reports the maximum
 * index seen in each dimension.
 *
 * Input data format:
 *   matrix M: &lt;i&gt; TAB &lt;j&gt; TAB &lt;mij&gt;
 *   matrix N: &lt;j&gt; TAB &lt;k&gt; TAB &lt;njk&gt;
 * Output data format: the same line with TAB &lt;label&gt; appended
 * ("0" marks an M entry, "1" marks an N entry).
 */
public class Preprocess {

    /**
     * Copies {@code input} to {@code output} line by line, appending
     * {@code "\t" + label} to every line, and tracks the largest value seen
     * in two of the tab-separated columns.
     *
     * @param input  tab-separated source file
     * @param output destination file (overwritten if it exists)
     * @param idxCol column whose maximum goes in slot 0 of the result
     *               (row index i for M, column index k for N)
     * @param jCol   column whose maximum goes in slot 1 of the result
     *               (the shared dimension j)
     * @param label  label appended to each line ("0" for M, "1" for N)
     * @return a two-element array {maxIdx, maxJ}
     * @throws IOException if either file cannot be read or written
     */
    static int[] addLabel(File input, File output, int idxCol, int jCol, String label)
            throws IOException {
        int maxIdx = 0;
        int maxJ = 0;
        // try-with-resources guarantees both streams are closed even if a
        // read/write fails part-way through (the original leaked them on error).
        try (BufferedReader br = new BufferedReader(new FileReader(input));
             BufferedWriter bw = new BufferedWriter(new FileWriter(output))) {
            String line;
            while ((line = br.readLine()) != null) {
                String[] data = line.split("\t");
                maxIdx = Math.max(maxIdx, Integer.parseInt(data[idxCol]));
                maxJ = Math.max(maxJ, Integer.parseInt(data[jCol]));
                // Append the matrix label after a tab separator.
                bw.write(line + "\t" + label);
                bw.newLine();
            }
        }
        return new int[]{maxIdx, maxJ};
    }

    public static void main(String[] args) {
        File fileM = new File("src/main/resources/hw1-large-dataset/M_large.dat");
        File fileN = new File("src/main/resources/hw1-large-dataset/N_large.dat");
        File fileMwithLabel = new File("src/main/resources/hw1-large-dataset/M_large_labeled.dat");
        File fileNwithLabel = new File("src/main/resources/hw1-large-dataset/N_large_labeled.dat");
        try {
            // M: <i> <j> <mij> -> label "0"; track max i and max j.
            int[] statsM = addLabel(fileM, fileMwithLabel, 0, 1, "0");
            System.out.println("The number of rows in matrix M is " + statsM[0]);
            System.out.println("The number of J for M is " + statsM[1]);
            // N: <j> <k> <njk> -> label "1"; track max k and max j.
            int[] statsN = addLabel(fileN, fileNwithLabel, 1, 0, "1");
            System.out.println("The number of cols in matrix N is " + statsN[0]);
            System.out.println("The number of J for N is " + statsN[1]);
        } catch (IOException e) {
            // FileNotFoundException is a subclass of IOException, so a single
            // catch covers both cases the original handled separately.
            e.printStackTrace();
        }
    }
}
Part II: MatrixMultiplication1.java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import java.io.IOException;
import java.util.HashMap;
/*
matrix multiplication part 1
*/
/**
 * First MapReduce job of a two-pass sparse matrix multiplication M x N.
 *
 * <p>The mapper re-keys every labeled matrix entry by the shared dimension j;
 * the reducer then pairs every M entry with every N entry that share that j
 * and emits the partial product m[i][j] * n[j][k] keyed by "i,k".
 * The second job (MatrixMultiplication2) sums the partial products per cell.
 */
public class MatrixMultiplication1 {

    /**
     * Re-keys each labeled matrix entry by the shared dimension j.
     *
     * Input value format (produced by Preprocess):
     *   matrix M: &lt;i&gt; TAB &lt;j&gt; TAB &lt;mij&gt; TAB 0
     *   matrix N: &lt;j&gt; TAB &lt;k&gt; TAB &lt;njk&gt; TAB 1
     * Output: key = j, value = "&lt;i&gt;,&lt;mij&gt;,0" or "&lt;k&gt;,&lt;njk&gt;,1".
     */
    public static class CommonKeyMapper
            extends Mapper<Object, Text, Text, Text> {

        // Reused across map() calls to avoid a per-record allocation.
        private final Text commonKey = new Text();
        private final Text diffValue = new Text();

        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] data = value.toString().split("\t");
            // Guard: skip malformed records instead of throwing
            // ArrayIndexOutOfBoundsException on short lines.
            if (data.length < 4) {
                return;
            }
            if (data[3].equals("0")) {
                // M entry: key is <j>, value is <i>,<mij>,<0>
                commonKey.set(data[1]);
                diffValue.set(data[0] + "," + data[2] + "," + data[3]);
            } else if (data[3].equals("1")) {
                // N entry: key is <j>, value is <k>,<njk>,<1>
                commonKey.set(data[0]);
                diffValue.set(data[1] + "," + data[2] + "," + data[3]);
            } else {
                // BUG FIX: the original fell through and wrote an EMPTY
                // key/value pair for any unrecognized label, producing bogus
                // records downstream. Drop such records instead.
                return;
            }
            context.write(commonKey, diffValue);
        }
    }

    /**
     * For each shared index j, pairs every M entry (i, mij) with every
     * N entry (k, njk) and emits the partial product keyed by "i,k".
     */
    public static class GenPairReducer
            extends Reducer<Text, Text, Text, FloatWritable> {
        @Override
        public void reduce(Text key, Iterable<Text> values,
                           Context context
        ) throws IOException, InterruptedException {
            /*
            key format: <j>
            value format for matrix M: <i>,<mij>,<0>
            value format for matrix N: <k>,<njk>,<1>
            */
            // Separate the values for this j into the M side (keyed by i)
            // and the N side (keyed by k).
            // NOTE(review): assumes each (i, j) / (j, k) entry appears at most
            // once in the input; a duplicate would silently overwrite here.
            HashMap<Integer, Float> mapM = new HashMap<Integer, Float>();
            HashMap<Integer, Float> mapN = new HashMap<Integer, Float>();
            for (Text val : values) {
                String[] value = val.toString().split(",");
                if (value[2].equals("0")) {
                    mapM.put(Integer.parseInt(value[0]), Float.parseFloat(value[1]));
                } else if (value[2].equals("1")) {
                    mapN.put(Integer.parseInt(value[0]), Float.parseFloat(value[1]));
                }
            }
            // Cross product: one partial product per (i, k) pair sharing this j.
            Text outKey = new Text();
            for (Integer i : mapM.keySet()) {
                for (Integer k : mapN.keySet()) {
                    outKey.set(i + "," + k);
                    context.write(outKey, new FloatWritable(mapM.get(i) * mapN.get(k)));
                }
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: matrixMultiply <in> <out>");
            System.exit(2);
        }
        // Job.getInstance replaces the deprecated Job(Configuration, String)
        // constructor (deprecated since Hadoop 2.x).
        Job job = Job.getInstance(conf, "matrix multiply part 1");
        job.setJarByClass(MatrixMultiplication1.class);
        job.setMapperClass(CommonKeyMapper.class);
        job.setReducerClass(GenPairReducer.class);
        // To run with multiple reduce tasks, call job.setNumReduceTasks(n);
        // records are partitioned by j, so any n is correct.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FloatWritable.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
Part III: MatrixMultiplication2.java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import java.io.IOException;
/*
matrix multiplication part 2
*/
/**
 * Second MapReduce job of the two-pass matrix multiplication.
 *
 * <p>Consumes the partial products emitted by MatrixMultiplication1
 * (lines of the form "&lt;i&gt;,&lt;k&gt; TAB &lt;partial product&gt;") and sums
 * them per output cell "i,k".
 */
public class MatrixMultiplication2 {

    /**
     * Identity-style mapper: parses each "i,k TAB value" line back into a
     * (Text, FloatWritable) pair so the shuffle groups partial products by cell.
     */
    public static class DoNothingMapper
            extends Mapper<Object, Text, Text, FloatWritable> {
        @Override
        public void map(Object key, Text value, Context context
        ) throws IOException, InterruptedException {
            String[] data = value.toString().split("\t");
            // Guard: skip blank or malformed lines instead of throwing
            // ArrayIndexOutOfBoundsException.
            if (data.length < 2) {
                return;
            }
            context.write(new Text(data[0]), new FloatWritable(Float.parseFloat(data[1])));
        }
    }

    /**
     * Sums the partial products for each output cell "i,k".
     */
    public static class FloatSumReducer
            extends Reducer<Text, FloatWritable, Text, FloatWritable> {
        @Override
        public void reduce(Text key, Iterable<FloatWritable> values,
                           Context context
        ) throws IOException, InterruptedException {
            float result = 0.0f;
            for (FloatWritable val : values) {
                result += val.get();
            }
            // Keep the output sparse: cells whose sum is exactly 0.0f are
            // omitted, matching the sparse input representation.
            if (result != 0.0f) {
                context.write(key, new FloatWritable(result));
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: matrixMultiply <in> <out>");
            System.exit(2);
        }
        // Job.getInstance replaces the deprecated Job(Configuration, String)
        // constructor (deprecated since Hadoop 2.x).
        Job job = Job.getInstance(conf, "matrix multiply part 2");
        job.setJarByClass(MatrixMultiplication2.class);
        job.setMapperClass(DoNothingMapper.class);
        job.setReducerClass(FloatSumReducer.class);
        // To run with multiple reduce tasks, call job.setNumReduceTasks(n);
        // summation per key is independent, so any n is correct.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FloatWritable.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}