Problem
让m * n矩阵A和n * p 的矩阵B相乘
则相乘之后的结果矩阵AB是一个m * p的矩阵,其元素为 $(AB)_{ik} = \sum_{j=0}^{n-1} a_{ij}\, b_{jk}$
Input
输入文件含有多行,每行的格式如下(以逗号分隔):矩阵名M,下标i,j,非0元素m(i,j)
<M>,<i>,<j>,<m_ij>
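假设A、B如下所示(下标从0开始):
$$A = \begin{pmatrix} 0 & 1 & 2 & 3 & 4 \\ 5 & 6 & 7 & 8 & 9 \end{pmatrix}, \quad B = \begin{pmatrix} 0 & 1 & 2 \\ 3 & 4 & 5 \\ 6 & 7 & 8 \\ 9 & 10 & 11 \\ 12 & 13 & 14 \end{pmatrix}$$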
则输入的文件内容如下:
A,0,1,1.0
A,0,2,2.0
A,0,3,3.0
A,0,4,4.0
A,1,0,5.0
A,1,1,6.0
A,1,2,7.0
A,1,3,8.0
A,1,4,9.0
B,0,1,1.0
B,0,2,2.0
B,1,0,3.0
B,1,1,4.0
B,1,2,5.0
B,2,0,6.0
B,2,1,7.0
B,2,2,8.0
B,3,0,9.0
B,3,1,10.0
B,3,2,11.0
B,4,0,12.0
B,4,1,13.0
B,4,2,14.0
Output
输出文件格式是(以逗号分隔):结果矩阵AB的下标i,j以及其非0元素m(i,j)
<i>,<j>,<m_ij>
上述AB的输出结果大致如下:
0,0,90.0
0,1,100.0
0,2,110.0
1,0,240.0
1,1,275.0
1,2,310.0
伪代码如下:
map(key, value):
// value is ("A", i, j, a_ij) or ("B", j, k, b_jk)
if value[0] == "A":
i = value[1]
j = value[2]
a_ij = value[3]
for k = 0 to p - 1:
emit((i, k), (A, j, a_ij))
else:
j = value[1]
k = value[2]
b_jk = value[3]
for i = 0 to m - 1:
emit((i, k), (B, j, b_jk))
reduce(key, values):
// key is (i, k)
// values is a list of ("A", j, a_ij) and ("B", j, b_jk)
hash_A = {j: a_ij for (x, j, a_ij) in values if x == A}
hash_B = {j: b_jk for (x, j, b_jk) in values if x == B}
result = 0
for j = 0 to n - 1:
// 输入只包含非0元素,缺失的下标按0处理
result += hash_A.get(j, 0) * hash_B.get(j, 0)
emit(key, result)
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class OneStepMatrixMultiplication {
public static class Map extends Mapper<LongWritable, Text, Text, Text> {
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
int m = Integer.parseInt(conf.get("m"));
int p = Integer.parseInt(conf.get("p"));
String line = value.toString();
String[] indicesAndValue = line.split(",");
Text outputKey = new Text();
Text outputValue = new Text();
if (indicesAndValue[0].equals("A")) {
for (int k = 0; k < p; k++) {
outputKey.set(indicesAndValue[1] + "," + k);
outputValue.set("A," + indicesAndValue[2] + "," + indicesAndValue[3]);
context.write(outputKey, outputValue);
}
} else {
for (int i = 0; i < m; i++) {
outputKey.set(i + "," + indicesAndValue[2]);
outputValue.set("B," + indicesAndValue[1] + "," + indicesAndValue[3]);
context.write(outputKey, outputValue);
}
}
}
}
public static class Reduce extends Reducer<Text, Text, Text, Text> {
public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
String[] value;
HashMap<Integer, Float> hashA = new HashMap<Integer, Float>();
HashMap<Integer, Float> hashB = new HashMap<Integer, Float>();
for (Text val : values) {
value = val.toString().split(",");
if (value[0].equals("A")) {
hashA.put(Integer.parseInt(value[1]), Float.parseFloat(value[2]));
} else {
hashB.put(Integer.parseInt(value[1]), Float.parseFloat(value[2]));
}
}
int n = Integer.parseInt(context.getConfiguration().get("n"));
float result = 0.0f;
float a_ij;
float b_jk;
for (int j = 0; j < n; j++) {
a_ij = hashA.containsKey(j) ? hashA.get(j) : 0.0f;
b_jk = hashB.containsKey(j) ? hashB.get(j) : 0.0f;
result += a_ij * b_jk;
}
if (result != 0.0f) {
context.write(null, new Text(key.toString() + "," + Float.toString(result)));
}
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
// A is an m-by-n matrix; B is an n-by-p matrix.
conf.set("m", "2");
conf.set("n", "5");
conf.set("p", "3");
Job job = new Job(conf, "MatrixMatrixMultiplicationOneStep");
job.setJarByClass(OneStepMatrixMultiplication.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.waitForCompletion(true);
}
}
记得要设置m,n和p的值
Analysis
你会发现,总共有mp个reduce任务,每个reduce接收来自矩阵A的含有n个元素的一行,以及来自矩阵B的含有n个元素的一列,然后输出一行
也就是说:
1、reduce job的数量和发送到reduce任务的unique keys是相等的,即mp
2、发送到每个reduce的矩阵A元素的个数是n
3、发送到每个reduce的矩阵B元素的个数是n
4、每个reduce的输出只有一行
而且
1、map端的计算复杂度为O(mn + np)
2、reduce端的计算复杂度为O(2mnp)
如果矩阵A和矩阵B很大,reduce的节点可能没有足够的内存来容纳所有来自矩阵A的行和来自矩阵B的列。
下次将尝试另一个方法