hadoop实现表连接算法

常见的单表,多表连接可以用SQL很简单的表示出来,但是对于hadoop就有点复杂了,这里简单介绍下如何用hadoop的key/value实现表之间的连接。

现在有两张表emp, dept分别显示员工信息和部门信息,需要实现以下的需求
1. 求每个部门的总工资
2. 求每个部门的平均工资和人数

SQL> select * from emp;

EMPNO ENAME JOB MGR HIREDATE SAL COMM DEPTNO
---------- ---------- --------- ---------- -------------- ---------- ---------- ----------
7369 SMITH CLERK 7902 17-12月-80 800 20
7499 ALLEN SALESMAN 7698 20-2 -81 1600 300 30
7521 WARD SALESMAN 7698 22-2 -81 1250 500 30
7566 JONES MANAGER 7839 02-4 -81 2975 20
7654 MARTIN SALESMAN 7698 28-9 -81 1250 1400 30
7698 BLAKE MANAGER 7839 01-5 -81 2850 30
7782 CLARK MANAGER 7839 09-6 -81 2450 10
7839 KING PRESIDENT 17-11月-81 5000 10
7844 TURNER SALESMAN 7698 08-9 -81 1500 0 30
7900 JAMES CLERK 7698 03-12月-81 950 30
7902 FORD ANALYST 7566 03-12月-81 3000 20
7934 MILLER CLERK 7782 23-1 -82 1300 10

已选择12行。

SQL> select * from dept;

DEPTNO DNAME LOC
---------- -------------- -------------
10 ACCOUNTING NEW YORK
20 RESEARCH DALLAS
30 SALES CHICAGO
40 OPERATIONS BOSTON

代码如下:

package homework;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class Exercise_4 extends Configured implements Tool{

enum Counter{
LINESKIP;
}

public static class Map extends Mapper<LongWritable, Text, Text, Text>
{

public void map ( LongWritable key, Text value, Context context ) throws IOException, InterruptedException
{
String line = value.toString(); //读取源数据

try
{
if(line.substring(0,10).trim().length() == 4){
String salary = line.substring(59, 69).trim();
String deptno = line.substring(78, 88).trim();
context.write(new Text(deptno), new Text("1" + salary));

}else if (line.substring(0,10).trim().length() == 2){
String deptno = line.substring(0,10).trim();
String dname = line.substring(11, 25).trim();
context.write(new Text(deptno), new Text("2" + dname));
}

}
catch ( java.lang.ArrayIndexOutOfBoundsException e )
{
context.getCounter(Counter.LINESKIP).increment(1); //出错令计数器+1
return;
}
}
}

public static class Reduce extends Reducer<Text, Text, Text, Text>
{
public void reduce ( Text key, Iterable<Text> values, Context context ) throws IOException, InterruptedException
{
String valueString;
Integer sumSalary = 0;
Integer perCount = 0;
Integer avgSalary = 0;

for ( Text value : values )
{
valueString = value.toString();
Long flag = Long.parseLong(valueString.substring(0, 1));

if(flag == 1){
perCount += 1;
sumSalary = sumSalary + Integer.parseInt(((valueString.substring(1))));

}else if(flag == 2){
key = new Text(valueString.substring(1));
}

}

if(perCount == 0){
avgSalary = 0;
}else{
avgSalary = sumSalary / perCount;
}

context.write( key, new Text(sumSalary.toString()+ " " + avgSalary.toString() + " " + perCount.toString()));
//context.write( key, new Text(sumSalary.toString()));
}
}

@Override
public int run(String[] args) throws Exception
{

Job job = new Job();
job.setJarByClass(Exercise_4.class); //指定Class

FileInputFormat.addInputPath( job, new Path(args[0]) ); //输入路径
FileOutputFormat.setOutputPath( job, new Path(args[1]) ); //输出路径

job.setMapperClass( Map.class ); //调用上面Map类作为Map任务代码
job.setReducerClass ( Reduce.class ); //调用上面Reduce类作为Reduce任务代码
job.setOutputFormatClass( TextOutputFormat.class );
job.setOutputKeyClass( Text.class ); //指定输出的KEY的格式
job.setOutputValueClass( Text.class ); //指定输出的VALUE的格式

job.waitForCompletion(true);

return job.isSuccessful() ? 0 : 1;
}

/**
* 设置系统说明
* 设置MapReduce任务
*/
public static void main(String[] args) throws Exception
{

//判断参数个数是否正确
//如果无参数运行则显示以作程序说明
if ( args.length != 2 )
{
System.err.println("");
System.err.println("Usage: Exercise_4 < input path > < output path > ");
System.err.println("Example: hadoop jar ~/Exercise_4.jar hdfs://localhost:9000/home/james/Exercise_4 hdfs://localhost:9000/home/james/output");
System.err.println("Counter:");
System.err.println("\t"+"LINESKIP"+"\t"+"Lines which are too short");
System.exit(-1);
}


//运行任务
int res = ToolRunner.run(new Configuration(), new Exercise_4(), args);

System.exit(res);
}

}


发现写MapReduce程序单元测试很重要啊,不然调试起来会很麻烦的,这里贴下MRUnit单元测试的代码

package homework;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
import org.junit.Before;
import org.junit.Test;

/**
 * MRUnit tests for the {@link Exercise_4} mapper and reducer.
 *
 * Each driver feeds a single (key, value(s)) pair through the corresponding
 * task class in isolation and compares against the declared expected output.
 */
public class Exercise_4Test {
    MapDriver<LongWritable, Text, Text, Text> mapDriver;
    ReduceDriver<Text, Text, Text, Text> reduceDriver;

    @Before
    public void setUp() {
        Exercise_4.Map mapper = new Exercise_4.Map();
        Exercise_4.Reduce reducer = new Exercise_4.Reduce();

        // FIX: removed the stray double semicolon (empty statement) that
        // followed the mapDriver assignment in the original.
        mapDriver = MapDriver.newMapDriver(mapper);
        reduceDriver = ReduceDriver.newReduceDriver(reducer);
    }

    /**
     * A single emp row should be emitted keyed by DEPTNO ("20") with the
     * value tagged "1" followed by the salary ("1" + "800").
     * NOTE(review): the literal below must match the fixed-width column
     * offsets the mapper expects — verify against real input before trusting
     * a failure here.
     */
    @Test
    public void testMapper() throws IOException {
        Text value1 = new Text(" 7369 SMITH CLERK 7902 17-12-80 800 20");
        //Text value2 = new Text(" 10 ACCOUNTING NEW YORK");

        mapDriver.withInput(new LongWritable(), value1);
        mapDriver.withOutput(new Text("20"), new Text("1800"));
        mapDriver.runTest();
    }

    /**
     * Three salaries (800 + 1600 + 3000) plus one dept-name record should
     * yield key "ACCOUNTING" and value "5400 1800 3" (sum, average, count).
     * FIX: re-enabled the @Test annotation that was commented out in the
     * original — the expected values are consistent with the reducer logic,
     * so the test was simply dead.
     */
    @Test
    public void testReducer() throws IOException {
        List<Text> values = new ArrayList<Text>();
        values.add(new Text("2ACCOUNTING"));
        values.add(new Text("1800"));
        values.add(new Text("11600"));
        values.add(new Text("13000"));
        reduceDriver.withInput(new Text("20"), values);
        reduceDriver.withOutput(new Text("ACCOUNTING"), new Text("5400" + " " + "1800" + " " + "3"));
        reduceDriver.runTest();
    }

}


运行结果截图:
hadoop实现表连接算法 - spring8743 - 我的博客
 
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值