Project repository: https://github.com/tudoupaisimalingshu/hadoop_matrix
Matrix Multiplication
I. Theoretical Background
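For reference, this is the definition the code below follows: the product of an M*N matrix A and an N*P matrix B is the M*P matrix C with

C[i][j] = A[i][0]*B[0][j] + A[i][1]*B[1][j] + ... + A[i][N-1]*B[N-1][j],   0 <= i < M, 0 <= j < P,

i.e. C[i][j] is the dot product of row i of A and column j of B.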
II. How to Implement It in a Program? A[M][N] * B[N][P]
import java.util.Arrays;

public class Matrix {
    public static void main(String[] args) {
        int[][] matrix1 = {
                {1, 2, -2, 0},
                {3, 3, 4, -3},
                {-2, 0, 2, 3},
                {5, 3, -1, 2},
                {-4, 2, 0, 2}};          // left matrix, 5*4
        int[][] matrix2 = {
                {0, 3, -1, 2, -3},
                {1, 3, 5, -2, -1},
                {0, 1, 4, -1, 2},
                {-2, 2, -1, 1, 2}};      // right matrix, 4*5
        int[][] matrix3 = new int[5][5]; // result matrix, 5*5
        for (int i = 0; i < 5; i++)      // compute each row of the result matrix
        {
            int[] row = matrix1[i];      // row i of the left matrix
            System.out.println("row=" + Arrays.toString(row));
            for (int j = 0; j < 5; j++)  // compute each column of the result matrix
            {
                int[] line = new int[4]; // column j of the right matrix
                for (int k = 0; k < 4; k++)
                {
                    line[k] = matrix2[k][j];
                }                        // the column is stored vertically, so its elements are collected in a loop
                System.out.println("line=" + Arrays.toString(line));
                int result_i_j = 0;      // dot product of the row and the column
                for (int m = 0; m < 4; m++)
                {
                    result_i_j += row[m] * line[m]; // accumulate the products
                }
                System.out.println("result_i_j=" + result_i_j);
                System.out.println("--------------------");
                matrix3[i][j] = result_i_j; // store the value at the corresponding position of the result matrix
            }
        }
        // print the result matrix
        for (int i = 0; i < 5; i++)
        {
            for (int j = 0; j < 5; j++)
            {
                System.out.print(matrix3[i][j] + "\t");
            }
            System.out.println();
        }
    }
}
Output:
row=[1, 2, -2, 0]
line=[0, 1, 0, -2]
result_i_j=2
--------------------
line=[3, 3, 1, 2]
result_i_j=7
--------------------
line=[-1, 5, 4, -1]
result_i_j=1
--------------------
line=[2, -2, -1, 1]
result_i_j=0
--------------------
line=[-3, -1, 2, 2]
result_i_j=-9
--------------------
row=[3, 3, 4, -3]
line=[0, 1, 0, -2]
result_i_j=9
--------------------
line=[3, 3, 1, 2]
result_i_j=16
--------------------
line=[-1, 5, 4, -1]
result_i_j=31
--------------------
line=[2, -2, -1, 1]
result_i_j=-7
--------------------
line=[-3, -1, 2, 2]
result_i_j=-10
--------------------
row=[-2, 0, 2, 3]
line=[0, 1, 0, -2]
result_i_j=-6
--------------------
line=[3, 3, 1, 2]
result_i_j=2
--------------------
line=[-1, 5, 4, -1]
result_i_j=7
--------------------
line=[2, -2, -1, 1]
result_i_j=-3
--------------------
line=[-3, -1, 2, 2]
result_i_j=16
--------------------
row=[5, 3, -1, 2]
line=[0, 1, 0, -2]
result_i_j=-1
--------------------
line=[3, 3, 1, 2]
result_i_j=27
--------------------
line=[-1, 5, 4, -1]
result_i_j=4
--------------------
line=[2, -2, -1, 1]
result_i_j=7
--------------------
line=[-3, -1, 2, 2]
result_i_j=-16
--------------------
row=[-4, 2, 0, 2]
line=[0, 1, 0, -2]
result_i_j=-2
--------------------
line=[3, 3, 1, 2]
result_i_j=-2
--------------------
line=[-1, 5, 4, -1]
result_i_j=12
--------------------
line=[2, -2, -1, 1]
result_i_j=-10
--------------------
line=[-3, -1, 2, 2]
result_i_j=14
--------------------
2 7 1 0 -9
9 16 31 -7 -10
-6 2 7 -3 16
-1 27 4 7 -16
-2 -2 12 -10 14
III. Problems with the Traditional Program
1. It cannot run concurrently: the work is carried out one loop iteration at a time.
2. If the matrices are so large that they do not fit in memory, they have to be kept in files. That is tolerable for the left matrix: each iteration only needs to read one row into memory, and the next iteration reads the next row. But as the program above shows, the right matrix is accessed by column vectors, which means scanning every row of the file and taking one element from each row just to assemble a single column; when the file is large, this is far too slow.
IV. Solution
1. For problem 1, introduce the parallel execution framework Hadoop, whose Map and Reduce operations can run concurrently.
2. For problem 2, transpose the right matrix, so that its column vectors become row vectors (see the sketch below).
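To see why the transpose helps, here is a minimal plain-Java sketch (not part of the project; the class and method names are invented for illustration). Once the right matrix B is stored as its transpose BT, every result entry becomes the dot product of a row of A and a row of BT, so both matrices can be read one row at a time:

public class MultiplyByTranspose {
    // Multiply A (m*n) by B (n*p), where bt is the transpose of B (p*n).
    static int[][] multiply(int[][] a, int[][] bt) {
        int m = a.length, p = bt.length, n = a[0].length;
        int[][] c = new int[m][p];
        for (int i = 0; i < m; i++) {
            for (int j = 0; j < p; j++) {
                int sum = 0;
                for (int k = 0; k < n; k++) {
                    sum += a[i][k] * bt[j][k]; // row i of A  dot  row j of BT (= column j of B)
                }
                c[i][j] = sum;
            }
        }
        return c;
    }
}

The MapReduce implementation below applies the same idea at file scale: Step1 first produces the transposed right matrix, so that later steps only ever need to read rows.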
V. Matrix Multiplication with Hadoop MapReduce
1. Storage format of the matrices
Why write all the columns of a row on a single line?
The matrix file may be very large, and Hadoop's HDFS will then split it. If the columns belonging to one row were not written together on one line, elements of the same row could end up in different splits, and extra time and space would be spent finding and stitching them back together; in other words, an additional reduce step would be needed just to reassemble the rows.
Why tag each element of a row with its column index?
Because Hadoop runs in parallel, there is no guarantee that the elements of a row stay in order through the map split and the subsequent reduce merge. Tagging every element with its column index, together with the row number (which is unique), is what keeps the result correct under parallel processing.
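As a concrete example, the 4*5 right matrix from section II can be stored one row per line in the form rowNumber<TAB>column_value,column_value,... (this layout is inferred from the map comments in the code below; the exact file shipped with the project may differ slightly):

1	1_0,2_3,3_-1,4_2,5_-3
2	1_1,2_3,3_5,4_-2,5_-1
3	1_0,2_1,3_4,4_-1,5_2
4	1_-2,2_2,3_-1,4_1,5_2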
2. MapReduce implementation of the matrix transpose
package hadoop;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
public class Step1 {
    public static class Mapper1 extends Mapper<LongWritable, Text, Text, Text>
    {
        private Text outKey = new Text();
        private Text outValue = new Text();
        /*
        Matrix to be transposed:
        0   3   -1  2   -3
        1   3   5   -2  -1
        0   1   4   -1  2
        -2  2   -1  1   2
        */
        /*
        Target (transposed) matrix:
        0   1   0   -2
        3   3   1   2
        -1  5   4   -1
        2   -2  -1  1
        -3  -1  2   2
        */
        // For each input line; take the first row as an example:
        // key   : the byte offset of the line (not used here)
        // value : "1\t1_0,2_3,3_-1,4_2,5_-3"
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            String[] rowAndLine = value.toString().split("\t");
            // rowAndLine : {"1", "1_0,2_3,3_-1,4_2,5_-3"}
            String row = rowAndLine[0];
            // row : "1"
            String[] lines = rowAndLine[1].split(",");
            // lines : {"1_0", "2_3", "3_-1", "4_2", "5_-3"}
            for (String line : lines) // for each column; take the first one, line = "1_0", as an example
            {
                String column = line.split("_")[0];
                // column : 1
                String valueStr = line.split("_")[1];
                // valueStr : 0
                outKey.set(column);
                // the column index becomes the row index of the output
                outValue.set(row + "_" + valueStr);
                // the row index becomes the column index of the output
                context.write(outKey, outValue);
                // emits ("1", "1_0")
            }
            // After the loop, for {"1_0","2_3","3_-1","4_2","5_-3"} the map emits
            // ("1","1_0") ("2","1_3") ("3","1_-1") ("4","1_2") ("5","1_-3"),
            // i.e. (transposed row index, originalRow_value).
            /*
            Target transposed matrix:
            0   1   0   -2
            3   3   1   2
            -1  5   4   -1
            2   -2  -1  1
            -3  -1  2   2
            */
            // These pairs correspond exactly to the first column of the transposed matrix.
        }
        /*
        All map calls together produce:
        ("1","1_0")  ("2","1_3")  ("3","1_-1") ("4","1_2")  ("5","1_-3")
        ("1","2_1")  ("2","2_3")  ("3","2_5")  ("4","2_-2") ("5","2_-1")
        ("1","3_0")  ("2","3_1")  ("3","3_4")  ("4","3_-1") ("5","3_2")
        ("1","4_-2") ("2","4_2")  ("3","4_-1") ("4","4_1")  ("5","4_2")
        */
    }
    /*
    The reduce task merges all key/value pairs produced by the map tasks and builds
    the stored representation of the transposed matrix.
    Values with the same key are grouped into one collection, e.g. for key "1":
    values : {"3_0", "1_0", "4_-2", "2_1"}
    Note: this is exactly why the column index has to be written into each value; the
    order of the values is not necessarily the original column order of the matrix.
    */
    public static class Reducer1 extends Reducer<Text, Text, Text, Text>
    {
        private Text outKey = new Text();
        private Text outValue = new Text();
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            StringBuilder sb = new StringBuilder();
            for (Text text : values)
            {
                sb.append(text + ",");
            }
            // sb : "3_0,1_0,4_-2,2_1,"  (note the trailing comma)
            String line = "";
            if (sb.toString().endsWith(","))
            {
                line = sb.substring(0, sb.length() - 1);
            }
            // trailing comma removed
            // line : "3_0,1_0,4_-2,2_1"
            outKey.set(key);
            outValue.set(line);
            // emits ("1", "3_0,1_0,4_-2,2_1")
            context.write(outKey, outValue);
        }
    }
    private static final String INPATH = "input/matrix.txt";  // input file path
    private static final String OUTPATH = "output/step1";     // output directory path
    private static final String HDFS = "hdfs://pc1:9000";     // HDFS address

    public void run() throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        // the input and output paths (on HDFS) are configured here
        String[] otherArgs = {"input/matrix.txt", "hdfs://pc1:9000/output/step1"};
        if (otherArgs.length != 2) {
            System.err.println("Usage: step1 <in> <out>");
            System.exit(2);
        }
        Job job = Job.getInstance(conf, "step1");   // set the configuration and the job name
        job.setJarByClass(Step1.class);
        job.setMapperClass(Mapper1.class);          // set the Mapper class for the job
        job.setReducerClass(Reducer1.class);        // set the Reducer class for the job
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);          // set the output key type
        job.setOutputValueClass(Text.class);        // set the output value type
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));    // set the input path
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));  // set the output path
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
    public static void main(String[] args)
    {
        try {
            new Step1().run();
        } catch (ClassNotFoundException | IOException | InterruptedException e) {
            e.printStackTrace();
        }
    }
}
View the transposed result with hadoop fs -text <file path>:
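For example (the part file name below is Hadoop's default naming and is shown only for illustration):

hadoop fs -text hdfs://pc1:9000/output/step1/part-r-00000

Each output record should be one row of the transposed matrix in the same rowNumber<TAB>column_value,... format, e.g. a line such as 1	3_0,1_0,4_-2,2_1 for the first transposed row; the order of the column_value entries within a line may vary, which is exactly why the column indices were recorded.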