CUHK-IEMS5730-HW1
Environment
- Google Cloud Platform
- Ubuntu 14.04 LTS
- Instance: 2 cores, 8 GB RAM, 50 GB storage
- Openjdk-7-jdk/jre
- Hadoop 2.9.2 with Yarn
1. MapReduce Source Code
I use Preprocess.java to add labels to the original dataset, and use two MapReduce jobs to perform the matrix multiplication: MatrixMultiplication1.java and MatrixMultiplication2.java.
Part I: Preprocess.java
import java.io.*;
/**
 * Pre-processing step for the two-pass MapReduce matrix multiplication.
 *
 * <p>Appends a matrix label to every line of the M and N data files so the
 * downstream mapper can tell the two matrices apart, and reports the maximum
 * index seen in each dimension.
 *
 * Input data format:
 *   matrix M: &lt;i&gt; TAB &lt;j&gt; TAB &lt;mij&gt;
 *   matrix N: &lt;j&gt; TAB &lt;k&gt; TAB &lt;njk&gt;
 * Output data format: the same line with TAB &lt;label&gt; appended
 * ("0" marks an M entry, "1" marks an N entry).
 */
public class Preprocess {

    /**
     * Copies {@code input} to {@code output} line by line, appending
     * {@code "\t" + label} to every line, and tracks the largest value seen
     * in two of the tab-separated columns.
     *
     * @param input  tab-separated source file
     * @param output destination file (overwritten if it exists)
     * @param idxCol column whose maximum goes in slot 0 of the result
     *               (row index i for M, column index k for N)
     * @param jCol   column whose maximum goes in slot 1 of the result
     *               (the shared dimension j)
     * @param label  label appended to each line ("0" for M, "1" for N)
     * @return a two-element array {maxIdx, maxJ}
     * @throws IOException if either file cannot be read or written
     */
    static int[] addLabel(File input, File output, int idxCol, int jCol, String label)
            throws IOException {
        int maxIdx = 0;
        int maxJ = 0;
        // try-with-resources guarantees both streams are closed even if a
        // read/write fails part-way through (the original leaked them on error).
        try (BufferedReader br = new BufferedReader(new FileReader(input));
             BufferedWriter bw = new BufferedWriter(new FileWriter(output))) {
            String line;
            while ((line = br.readLine()) != null) {
                String[] data = line.split("\t");
                maxIdx = Math.max(maxIdx, Integer.parseInt(data[idxCol]));
                maxJ = Math.max(maxJ, Integer.parseInt(data[jCol]));
                // Append the matrix label after a tab separator.
                bw.write(line + "\t" + label);
                bw.newLine();
            }
        }
        return new int[]{maxIdx, maxJ};
    }

    public static void main(String[] args) {
        File fileM = new File("src/main/resources/hw1-large-dataset/M_large.dat");
        File fileN = new File("src/main/resources/hw1-large-dataset/N_large.dat");
        File fileMwithLabel = new File("src/main/resources/hw1-large-dataset/M_large_labeled.dat");
        File fileNwithLabel = new File("src/main/resources/hw1-large-dataset/N_large_labeled.dat");
        try {
            // M: <i> <j> <mij> -> label "0"; track max i and max j.
            int[] statsM = addLabel(fileM, fileMwithLabel, 0, 1, "0");
            System.out.println("The number of rows in matrix M is " + statsM[0]);
            System.out.println("The number of J for M is " + statsM[1]);
            // N: <j> <k> <njk> -> label "1"; track max k and max j.
            int[] statsN = addLabel(fileN, fileNwithLabel, 1, 0, "1");
            System.out.println("The number of cols in matrix N is " + statsN[0]);
            System.out.println("The number of J for N is " + statsN[1]);
        } catch (IOException e) {
            // FileNotFoundException is a subclass of IOException, so a single
            // catch covers both cases the original handled separately.
            e.printStackTrace();
        }
    }
}
Part II: MatrixMultiplication1.java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import java.io.IOException;
import java.util.HashMap;
/*
matrix multiplication part 1
*/
/**
 * First MapReduce job of a two-pass sparse matrix multiplication M x N.
 *
 * <p>The mapper re-keys every labeled matrix entry by the shared dimension j;
 * the reducer then pairs every M entry with every N entry that share that j
 * and emits the partial product m[i][j] * n[j][k] keyed by "i,k".
 * The second job (MatrixMultiplication2) sums the partial products per cell.
 */
public class MatrixMultiplication1 {

    /**
     * Re-keys each labeled matrix entry by the shared dimension j.
     *
     * Input value format (produced by Preprocess):
     *   matrix M: &lt;i&gt; TAB &lt;j&gt; TAB &lt;mij&gt; TAB 0
     *   matrix N: &lt;j&gt; TAB &lt;k&gt; TAB &lt;njk&gt; TAB 1
     * Output: key = j, value = "&lt;i&gt;,&lt;mij&gt;,0" or "&lt;k&gt;,&lt;njk&gt;,1".
     */
    public static class CommonKeyMapper
            extends Mapper<Object, Text, Text, Text> {

        // Reused across map() calls to avoid a per-record allocation.
        private final Text commonKey = new Text();
        private final Text diffValue = new Text();

        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] data = value.toString().split("\t");
            // Guard: skip malformed records instead of throwing
            // ArrayIndexOutOfBoundsException on short lines.
            if (data.length < 4) {
                return;
            }
            if (data[3].equals("0")) {
                // M entry: key is <j>, value is <i>,<mij>,<0>
                commonKey.set(data[1]);
                diffValue.set(data[0] + "," + data[2] + "," + data[3]);
            } else if (data[3].equals("1")) {
                // N entry: key is <j>, value is <k>,<njk>,<1>
                commonKey.set(data[0]);
                diffValue.set(data[1] + "," + data[2] + "," + data[3]);
            } else {
                // BUG FIX: the original fell through and wrote an EMPTY
                // key/value pair for any unrecognized label, producing bogus
                // records downstream. Drop such records instead.
                return;
            }
            context.write(commonKey, diffValue);
        }
    }

    /**
     * For each shared index j, pairs every M entry (i, mij) with every
     * N entry (k, njk) and emits the partial product keyed by "i,k".
     */
    public static class GenPairReducer
            extends Reducer<Text, Text, Text, FloatWritable> {
        @Override
        public void reduce(Text key, Iterable<Text> values,
                           Context context
        ) throws IOException, InterruptedException {
            /*
            key format: <j>
            value format for matrix M: <i>,<mij>,<0>
            value format for matrix N: <k>,<njk>,<1>
            */
            // Separate the values for this j into the M side (keyed by i)
            // and the N side (keyed by k).
            // NOTE(review): assumes each (i, j) / (j, k) entry appears at most
            // once in the input; a duplicate would silently overwrite here.
            HashMap<Integer, Float> mapM = new HashMap<Integer, Float>();
            HashMap<Integer, Float> mapN = new HashMap<Integer, Float>();
            for (Text val : values) {
                String[] value = val.toString().split(",");
                if (value[2].equals("0")) {
                    mapM.put(Integer.parseInt(value[0]), Float.parseFloat(value[1]));
                } else if (value[2].equals("1")) {
                    mapN.put(Integer.parseInt(value[0]), Float.parseFloat(value[1]));
                }
            }
            // Cross product: one partial product per (i, k) pair sharing this j.
            Text outKey = new Text();
            for (Integer i : mapM.keySet()) {
                for (Integer k : mapN.keySet()) {
                    outKey.set(i + "," + k);
                    context.write(outKey, new FloatWritable(mapM.get(i) * mapN.get(k)));
                }
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: matrixMultiply <in> <out>");
            System.exit(2);
        }
        // Job.getInstance replaces the deprecated Job(Configuration, String)
        // constructor (deprecated since Hadoop 2.x).
        Job job = Job.getInstance(conf, "matrix multiply part 1");
        job.setJarByClass(MatrixMultiplication1.class);
        job.setMapperClass(CommonKeyMapper.class);
        job.setReducerClass(GenPairReducer.class);
        // To run with multiple reduce tasks, call job.setNumReduceTasks(n);
        // records are partitioned by j, so any n is correct.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FloatWritable.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
Part III: MatrixMultiplication2.java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import java.io.IOException;
/*
matrix multiplication part 2
*/
/**
 * Second MapReduce job of the two-pass matrix multiplication.
 *
 * <p>Consumes the partial products emitted by MatrixMultiplication1
 * (lines of the form "&lt;i&gt;,&lt;k&gt; TAB &lt;partial product&gt;") and sums
 * them per output cell "i,k".
 */
public class MatrixMultiplication2 {

    /**
     * Identity-style mapper: parses each "i,k TAB value" line back into a
     * (Text, FloatWritable) pair so the shuffle groups partial products by cell.
     */
    public static class DoNothingMapper
            extends Mapper<Object, Text, Text, FloatWritable> {
        @Override
        public void map(Object key, Text value, Context context
        ) throws IOException, InterruptedException {
            String[] data = value.toString().split("\t");
            // Guard: skip blank or malformed lines instead of throwing
            // ArrayIndexOutOfBoundsException.
            if (data.length < 2) {
                return;
            }
            context.write(new Text(data[0]), new FloatWritable(Float.parseFloat(data[1])));
        }
    }

    /**
     * Sums the partial products for each output cell "i,k".
     */
    public static class FloatSumReducer
            extends Reducer<Text, FloatWritable, Text, FloatWritable> {
        @Override
        public void reduce(Text key, Iterable<FloatWritable> values,
                           Context context
        ) throws IOException, InterruptedException {
            float result = 0.0f;
            for (FloatWritable val : values) {
                result += val.get();
            }
            // Keep the output sparse: cells whose sum is exactly 0.0f are
            // omitted, matching the sparse input representation.
            if (result != 0.0f) {
                context.write(key, new FloatWritable(result));
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: matrixMultiply <in> <out>");
            System.exit(2);
        }
        // Job.getInstance replaces the deprecated Job(Configuration, String)
        // constructor (deprecated since Hadoop 2.x).
        Job job = Job.getInstance(conf, "matrix multiply part 2");
        job.setJarByClass(MatrixMultiplication2.class);
        job.setMapperClass(DoNothingMapper.class);
        job.setReducerClass(FloatSumReducer.class);
        // To run with multiple reduce tasks, call job.setNumReduceTasks(n);
        // summation per key is independent, so any n is correct.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FloatWritable.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}