hadoop实现表连接算法

常见的单表,多表连接可以用SQL很简单的表示出来,但是对于hadoop就有点复杂了,这里简单介绍下如何用hadoop的key/value实现表之间的连接。

现在有两张表emp, dept分别显示员工信息和部门信息,需要实现以下的需求
1. 求每个部门的总工资
2. 求每个部门的平均工资和人数

SQL> select * from emp;

EMPNO ENAME JOB MGR HIREDATE SAL COMM DEPTNO
---------- ---------- --------- ---------- -------------- ---------- ---------- ----------
7369 SMITH CLERK 7902 17-12月-80 800 20
7499 ALLEN SALESMAN 7698 20-2 -81 1600 300 30
7521 WARD SALESMAN 7698 22-2 -81 1250 500 30
7566 JONES MANAGER 7839 02-4 -81 2975 20
7654 MARTIN SALESMAN 7698 28-9 -81 1250 1400 30
7698 BLAKE MANAGER 7839 01-5 -81 2850 30
7782 CLARK MANAGER 7839 09-6 -81 2450 10
7839 KING PRESIDENT 17-11月-81 5000 10
7844 TURNER SALESMAN 7698 08-9 -81 1500 0 30
7900 JAMES CLERK 7698 03-12月-81 950 30
7902 FORD ANALYST 7566 03-12月-81 3000 20
7934 MILLER CLERK 7782 23-1 -82 1300 10

已选择12行。

SQL> select * from dept;

DEPTNO DNAME LOC
---------- -------------- -------------
10 ACCOUNTING NEW YORK
20 RESEARCH DALLAS
30 SALES CHICAGO
40 OPERATIONS BOSTON

代码如下:

package homework;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class Exercise_4 extends Configured implements Tool{

enum Counter{
LINESKIP;
}

public static class Map extends Mapper<LongWritable, Text, Text, Text>
{

public void map ( LongWritable key, Text value, Context context ) throws IOException, InterruptedException
{
String line = value.toString(); //读取源数据

try
{
if(line.substring(0,10).trim().length() == 4){
String salary = line.substring(59, 69).trim();
String deptno = line.substring(78, 88).trim();
context.write(new Text(deptno), new Text("1" + salary));

}else if (line.substring(0,10).trim().length() == 2){
String deptno = line.substring(0,10).trim();
String dname = line.substring(11, 25).trim();
context.write(new Text(deptno), new Text("2" + dname));
}

}
catch ( java.lang.ArrayIndexOutOfBoundsException e )
{
context.getCounter(Counter.LINESKIP).increment(1); //出错令计数器+1
return;
}
}
}

public static class Reduce extends Reducer<Text, Text, Text, Text>
{
public void reduce ( Text key, Iterable<Text> values, Context context ) throws IOException, InterruptedException
{
String valueString;
Integer sumSalary = 0;
Integer perCount = 0;
Integer avgSalary = 0;

for ( Text value : values )
{
valueString = value.toString();
Long flag = Long.parseLong(valueString.substring(0, 1));

if(flag == 1){
perCount += 1;
sumSalary = sumSalary + Integer.parseInt(((valueString.substring(1))));

}else if(flag == 2){
key = new Text(valueString.substring(1));
}

}

if(perCount == 0){
avgSalary = 0;
}else{
avgSalary = sumSalary / perCount;
}

context.write( key, new Text(sumSalary.toString()+ " " + avgSalary.toString() + " " + perCount.toString()));
//context.write( key, new Text(sumSalary.toString()));
}
}

@Override
public int run(String[] args) throws Exception
{

Job job = new Job();
job.setJarByClass(Exercise_4.class); //指定Class

FileInputFormat.addInputPath( job, new Path(args[0]) ); //输入路径
FileOutputFormat.setOutputPath( job, new Path(args[1]) ); //输出路径

job.setMapperClass( Map.class ); //调用上面Map类作为Map任务代码
job.setReducerClass ( Reduce.class ); //调用上面Reduce类作为Reduce任务代码
job.setOutputFormatClass( TextOutputFormat.class );
job.setOutputKeyClass( Text.class ); //指定输出的KEY的格式
job.setOutputValueClass( Text.class ); //指定输出的VALUE的格式

job.waitForCompletion(true);

return job.isSuccessful() ? 0 : 1;
}

/**
* 设置系统说明
* 设置MapReduce任务
*/
public static void main(String[] args) throws Exception
{

//判断参数个数是否正确
//如果无参数运行则显示以作程序说明
if ( args.length != 2 )
{
System.err.println("");
System.err.println("Usage: Exercise_4 < input path > < output path > ");
System.err.println("Example: hadoop jar ~/Exercise_4.jar hdfs://localhost:9000/home/james/Exercise_4 hdfs://localhost:9000/home/james/output");
System.err.println("Counter:");
System.err.println("\t"+"LINESKIP"+"\t"+"Lines which are too short");
System.exit(-1);
}


//运行任务
int res = ToolRunner.run(new Configuration(), new Exercise_4(), args);

System.exit(res);
}

}


发现写MapReduce程序单元测试很重要啊,不然调试起来会很麻烦的,这里贴下MRUnit单元测试的代码

package homework;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
import org.junit.Before;
import org.junit.Test;

/**
 * MRUnit tests for the {@link Exercise_4} mapper and reducer.
 *
 * Each driver feeds a single (key, value(s)) pair through the corresponding
 * task class in isolation and compares against the declared expected output.
 */
public class Exercise_4Test {
    MapDriver<LongWritable, Text, Text, Text> mapDriver;
    ReduceDriver<Text, Text, Text, Text> reduceDriver;

    @Before
    public void setUp() {
        Exercise_4.Map mapper = new Exercise_4.Map();
        Exercise_4.Reduce reducer = new Exercise_4.Reduce();

        // FIX: removed the stray double semicolon (empty statement) that
        // followed the mapDriver assignment in the original.
        mapDriver = MapDriver.newMapDriver(mapper);
        reduceDriver = ReduceDriver.newReduceDriver(reducer);
    }

    /**
     * A single emp row should be emitted keyed by DEPTNO ("20") with the
     * value tagged "1" followed by the salary ("1" + "800").
     * NOTE(review): the literal below must match the fixed-width column
     * offsets the mapper expects — verify against real input before trusting
     * a failure here.
     */
    @Test
    public void testMapper() throws IOException {
        Text value1 = new Text(" 7369 SMITH CLERK 7902 17-12-80 800 20");
        //Text value2 = new Text(" 10 ACCOUNTING NEW YORK");

        mapDriver.withInput(new LongWritable(), value1);
        mapDriver.withOutput(new Text("20"), new Text("1800"));
        mapDriver.runTest();
    }

    /**
     * Three salaries (800 + 1600 + 3000) plus one dept-name record should
     * yield key "ACCOUNTING" and value "5400 1800 3" (sum, average, count).
     * FIX: re-enabled the @Test annotation that was commented out in the
     * original — the expected values are consistent with the reducer logic,
     * so the test was simply dead.
     */
    @Test
    public void testReducer() throws IOException {
        List<Text> values = new ArrayList<Text>();
        values.add(new Text("2ACCOUNTING"));
        values.add(new Text("1800"));
        values.add(new Text("11600"));
        values.add(new Text("13000"));
        reduceDriver.withInput(new Text("20"), values);
        reduceDriver.withOutput(new Text("ACCOUNTING"), new Text("5400" + " " + "1800" + " " + "3"));
        reduceDriver.runTest();
    }

}


运行结果截图:
hadoop实现表连接算法 - spring8743 - 我的博客
 
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值