Hadoop-实现矩阵相乘

最新推荐文章于 2021-08-08 01:22:38 发布

chunliangdu8082

最新推荐文章于 2021-08-08 01:22:38 发布

阅读量964

点赞数

文章标签：大数据

原文链接：https://my.oschina.net/u/3494913/blog/1561015

版权

在慕课网学习的hadoop进阶视频的小练习

需求：实现两个矩阵的相乘

矩阵相乘的条件是：左矩阵的列 = 右矩阵的行

相乘结果：左行 X 右，然后依次累加得到一个结果

结果坐标：行：左行标，列：右列标

矩阵格式：行号 tab 列号_值，列号_值，列号_值，列号_值

1   1_1,2_2,3_-2,4_0
2   1_3,2_3,3_4,4_-3
3   1_-2,2_0,3_2,4_3
4   1_5,2_3,3_-1,4_2
5   1_4,2_2,3_0,4_2

--------------------------------------------------------------------------------------------------

代码实现：先将右矩阵转置，即行变成列，列变成行，然后与左矩阵相乘

1、将右矩阵转置

Mapper1.java

package step1;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class Mapper1 extends Mapper<LongWritable, Text, Text, Text>{
	
	private Text outKey = new Text();
	private Text outValue = new Text();
	
	/**
	 * key 行号
	 * value 行字符串  1 1_0,2_3,3_-1,4_2,5_-3
	 * 
	 * 将矩阵的行号变成列号，列号变成行号
	 */
	@Override
	protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
			throws IOException, InterruptedException {
		String[] rowAndLine = value.toString().split("\t");
		
		//矩阵的行号
		String row = rowAndLine[0];
		String[] lines = rowAndLine[1].split(",");
		
		//["1_0","2_3","3_-1","4_2","5_-3"]
		for(int i = 0;i<lines.length; i++){
			String column = lines[i].split("_")[0];
			String valueStr = lines[i].split("_")[1];
			//key:列号	value:行号_值
			outKey.set(column);
			outValue.set(row + "_" + valueStr);
			context.write(outKey,  outValue);
		}
	}

}

Reducer1.java

package step1;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class Reducer1 extends Reducer<Text, Text, Text, Text>{

	private Text outKey = new Text();
	private Text outValue = new Text();
	
	//key:列号	value:[行号_值,行号_值,行号_值,行号_值,行号_值...]
	@Override
	protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, Text>.Context context)
			throws IOException, InterruptedException {
		StringBuilder sb = new StringBuilder();
		for(Text text : values){
			//text:行号_值
			sb.append(text + ",");
		}
		String line = null;
		if(sb.toString().endsWith(",")){
			line = sb.substring(0, sb.length() - 1);
		}
		
		outKey.set(key);
		outValue.set(line);
		
		context.write(outKey, outValue);
	}
}

MR1.java（main函数，这里要注意把项目打包jar放到D：\\jar\\test2.jar）

package step1;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


public class MR1 {
	//输入文件的相对路径
	private static String inPath = "/matrix/step1_input/matrix2.txt";
	//输出文件的相对路径
	private static String outPath = "/matrix/step1_output";
	//hdfs地址
	private static String hdfs = "hdfs://192.168.77.128:9000";
	
	public int run(){
		try {
			//创建job配置类
			Configuration conf = new Configuration();
			//设置hdfs的地址
			//conf.set("fs.defaultFS", hdfs);
			conf.set("mapred.jar", "D:\\jar\\test2.jar");
			//创建一个job实例
			Job job = Job.getInstance(conf, "step1");
			
			//设置job的主类
			job.setJarByClass(MR1.class);
			//设置job的Mapper类和Reducer类
			job.setMapperClass(Mapper1.class);
			job.setReducerClass(Reducer1.class);
		
			//设置Mapper输出的类型
			job.setOutputKeyClass(Text.class);
			job.setOutputValueClass(Text.class);
		
			FileSystem fs = FileSystem.get(conf);
			//设置输入和输出的路径
			Path inputPath = new Path(hdfs + inPath);
			if(fs.exists(inputPath)){
				FileInputFormat.addInputPath(job, inputPath);
			}
			
			Path outputPath = new Path(hdfs + outPath);
			fs.delete(outputPath, true);
			
			FileOutputFormat.setOutputPath(job, outputPath);
			
			return job.waitForCompletion(true)?1:-1;
		
		} catch (IOException e) {
				e.printStackTrace();
		    } catch (ClassNotFoundException e) {
			e.printStackTrace();
		} catch (InterruptedException e) {
			e.printStackTrace();
		}
		return -1;
	}
	
	public static void main(String[] args){
		int result = -1;
		result = new MR1().run();
		if(result == 1){
			System.out.println("step1运行成功。。。");
		}else if(result == -1){
			System.out.println("step1运行失败。。。");
		}
		
	}
}

2、矩阵相乘

Mapper2.java

package step2;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class Mapper2 extends Mapper<LongWritable, Text, Text, Text>{
	
	private Text outKey = new Text();
	private Text outValue = new Text();
	
	private List<String> cacheList = new ArrayList<String>();
	
	@Override
	protected void setup(Context context)
			throws IOException, InterruptedException {
		
		URI[] uris = DistributedCache.getCacheFiles(context.getConfiguration());
		FileSystem fs = FileSystem.get(URI.create("hdfs://192.168.77.128:9000"), context.getConfiguration());
		
		FSDataInputStream in = fs.open(new Path(uris[0].getPath()));
		
//		Scanner input_A = new Scanner(in);
//		while(input_A.hasNext()){
//			System.out.println(input_A.next());
//			cacheList.add(input_A.next());
//		}
//		input_A.close();
		
		//通过输入流将全局缓存中的 右侧矩阵 读入List<String>中
		//FileReader fr = new FileReader("matrix2");
		BufferedReader br = new BufferedReader(new InputStreamReader(in));
		
		//每一行的格式是 ： 行 tab 列_值,列_值,列_值,列_值
		String line = null;
		while((line = br.readLine()) != null ){
			cacheList.add(line);
			System.out.println(line);
		}
		
		in.close();
		br.close();
	}
	
	/**
	 * key:行号
	 * value：行 tab 列_值,列_值,列_值,列_值
	 */
	@Override
	protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
			throws IOException, InterruptedException {
		//行
		String row_matrix1 = value.toString().split("\t")[0];
		//列值（数组）
		String[] column_value_array_matrix1 = value.toString().split("\t")[1].split(",");
		
		for(String line : cacheList){
			//右侧矩阵的行line
			//格式 ： 行 tab 列_值,列_值,列_值,列_值
			String row_matrix2 = line.toString().split("\t")[0];
			String[] column_value_array_matrix2 = line.toString().split("\t")[1].split(",");
			
			//矩阵两行相乘得到结果
			int result = 0;
			//遍历左矩阵的第一行的每一列
			for(String columm_value_matrix1 : column_value_array_matrix1){
				String column_matrix1 = columm_value_matrix1.split("_")[0];
				String value_matrix1 = columm_value_matrix1.split("_")[1];
				
				//遍历右矩阵的每一行每一列
				for(String columm_value_matrix2 : column_value_array_matrix2){
					if(columm_value_matrix2.startsWith(column_matrix1 + "_")){
						String value_matrix2 = columm_value_matrix2.split("_")[1];
						//将两列的值相乘并累加
						System.out.println(value_matrix1 + "---" + value_matrix2);
						result = result + Integer.valueOf(value_matrix1) * Integer.valueOf(value_matrix2);
					}
					
				}
			}
			//result是结果矩阵中的某元素，坐标为 行：row_matrix1       列：row_matrix2（因为右矩阵已经转置）
			outKey.set(row_matrix1);
			outValue.set(row_matrix2 + "_" + result);
			//输出格式key：行	value：列_值
			context.write(outKey, outValue);
		}
	}
}

Reducer2.java

package step2;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class Reducer2 extends Reducer<Text, Text, Text, Text>{

	private Text outKey = new Text();
	private Text outValue = new Text();
	
	@Override
	protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, Text>.Context context)
			throws IOException, InterruptedException {
		StringBuilder sb = new StringBuilder();
		for(Text value : values){
			sb.append(value + ",");
		}
		
		String result = null;
		if(sb.toString().endsWith(",")){
			result = sb.substring(0, sb.length() - 1);
		}
		
		//outKey:行号	outValue:列_值,列_值,列_值,列_值
		outKey.set(key);
		outValue.set(result);
		
		context.write(outKey, outValue);
	}
}

MR2.java

package step2;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


public class MR2 {
	//输入文件的相对路径
	private static String inPath = "/matrix/step2_input/matrix1.txt";
	//输出文件的相对路径
	private static String outPath = "/matrix/output";
	//将step1输出的转置矩阵作为全局缓存
	private static String cache = "/matrix/step1_output/part-r-00000";
	//hdfs地址
	private static String hdfs = "hdfs://192.168.77.128:9000";
	
	public int run(){
		try {
			//创建job配置类
			Configuration conf = new Configuration();
			//设置hdfs的地址
			//conf.set("fs.defaultFS", hdfs);
			conf.set("mapred.jar", "D:\\jar\\test2.jar");
			
			//添加分布式缓存文件
			DistributedCache.addCacheFile(new URI(hdfs + cache), conf);
			
			//创建一个job实例
			Job job = Job.getInstance(conf, "step2");
			
			//设置job的主类
			job.setJarByClass(MR2.class);
			//设置job的Mapper类和Reducer类
			job.setMapperClass(Mapper2.class);
			job.setReducerClass(Reducer2.class);
		
			//设置Mapper输出的类型
			job.setOutputKeyClass(Text.class);
			job.setOutputValueClass(Text.class);
		
			FileSystem fs = FileSystem.get(conf);
			//设置输入和输出的路径
			Path inputPath = new Path(hdfs + inPath);
			if(fs.exists(inputPath)){
				FileInputFormat.addInputPath(job, inputPath);
			}
			
			Path outputPath = new Path(hdfs + outPath);
			fs.delete(outputPath, true);
			
			FileOutputFormat.setOutputPath(job, outputPath);
			
			return job.waitForCompletion(true)?1:-1;
		
		} catch (IOException e) {
				e.printStackTrace();
		    } catch (ClassNotFoundException e) {
			e.printStackTrace();
		} catch (InterruptedException e) {
			e.printStackTrace();
		} catch (URISyntaxException e) {
			e.printStackTrace();
		}
		return -1;
	}
	
	public static void main(String[] args){
		int result = 0;
		result = new MR2().run();
		if(result == 1){
			System.out.println("step2运行成功。。。");
		}else if(result == -1){
			System.out.println("step2运行失败。。。");
		}
		
	}
}

HDFS文件夹截图

出现问题解决：

1、Win环境下本地运行hadoop1版本出现访问权限问题，会遇到如下报错：
Exception in thread "main" java.io.IOException: Failed to set permissions of path: \tmp\hadoop-Administrator\mapred\staging\Administrator-519341271\.staging to 0700

此时只需要修改org.apache.hadoop.fs.FileUtil文件并重新编译即可，简单解决步骤如下：

1.eclipse中新建java工程

2.将hadoop相关jar包都导入工程

3.到源码中拷贝src/core/org/apache/hadoop/fs/FileUtil.java文件，粘贴到eclipse工程的src目录下

4.找到以下部分，注释掉checkReturnValue方法中的代码

private static void checkReturnValue(boolean rv, File p,

                                      FsPermission permission

                                      ) throws IOException {

    /*

//win7 connect to linux hadoop

if (!rv) {

      throw new IOException("Failed to set permissions of path: " + p +

                            " to " +

                            String.format("%04o", permission.toShort()));

    }

*/

  }

5.到工程的输出目录找到class文件，会有两个class文件，因为FileUtil.java有内部类

6.将该class文件添加到hadoop-core-1.2.1.jar中对应的目录，覆盖原文件

7.将更新过的hadoop-core-1.2.1.jar拷贝到Hadoop集群，覆盖原有文件，重启Hadoop集群

8.将更新过的hadoop-core-1.2.1.jar添加到项目的依赖中，如果用到Maven可能涉及覆盖Maven库中的对应文件

9.运行程序，万事大吉！！！

===============================================

如果不会编译，那就下载一个吧！！！

http://download.csdn.net/detail/echoqun/6198467#comment

转载于:https://my.oschina.net/u/3494913/blog/1561015

chunliangdu8082

关注

0
点赞
踩
8

收藏

觉得还不错? 一键收藏
0
评论
Hadoop-实现矩阵相乘

在慕课网学习的hadoop进阶视频的小练习需求：实现两个矩阵的相乘矩阵相乘的条件是：左矩阵的列 = 右矩阵的行相乘结果：左行 X 右，然后依次累加得到一个结果结果坐标：行：左行标，列：右列标矩阵格式：行号 tab 列号_值，列号_值，列号_值，列号_值 1...
复制链接

扫一扫