MapReduce之Join多表查询实现
一、需求说明
- 要求:实现dept部门表和emp员工表关联查询即等值查询,实现如下SQL功能:
select b.deptname,a.ename from emp a join dept b on a.deptno=b.deptno
二、测试数据
- 员工信息表:下载地址
- 表字段说明:
三、编程思路
- 思路:
1、在map阶段注意区分读取进来的数据所属哪张表,需做判断进行区分
2、在reduce阶段注意对相同key的value进行处理,分别取出哪些是部门表和员工表的信息
3、编写Job类,设置mapper及输入输出
4、注意将emp.csv和dept.csv放在同一个目录下
四、实现步骤
-
在Idea或eclipse中创建maven项目
-
在pom.xml中添加hadoop依赖
<dependency> <groupId>org.apache.hadoop</groupId> <artifactId>hadoop-common</artifactId> <version>2.7.3</version> </dependency> <dependency> <groupId>org.apache.hadoop</groupId> <artifactId>hadoop-hdfs</artifactId> <version>2.7.3</version> </dependency> <dependency> <groupId>org.apache.hadoop</groupId> <artifactId>hadoop-mapreduce-client-common</artifactId> <version>2.7.3</version> </dependency> <dependency> <groupId>org.apache.hadoop</groupId> <artifactId>hadoop-mapreduce-client-core</artifactId> <version>2.7.3</version> </dependency>
-
添加log4j.properties文件在资源目录下即resources,文件内容如下:
### 配置根 ### log4j.rootLogger = debug,console,fileAppender ## 配置输出到控制台 ### log4j.appender.console = org.apache.log4j.ConsoleAppender log4j.appender.console.Target = System.out log4j.appender.console.layout = org.apache.log4j.PatternLayout log4j.appender.console.layout.ConversionPattern = %d{ABSOLUTE} %5p %c:%L - %m%n ### 配置输出到文件 ### log4j.appender.fileAppender = org.apache.log4j.FileAppender log4j.appender.fileAppender.File = logs/logs.log log4j.appender.fileAppender.Append = false log4j.appender.fileAppender.Threshold = DEBUG log4j.appender.fileAppender.layout = org.apache.log4j.PatternLayout log4j.appender.fileAppender.layout.ConversionPattern = %-d{yyyy-MM-dd HH:mm:ss} [ %t:%r ] - [ %p ] %m%n
-
编写mapper类
import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Mapper; import java.io.IOException; public class EqualJoinMapper extends Mapper<LongWritable, Text, IntWritable,Text> { IntWritable key2 = new IntWritable(); Text value2 = new Text(); protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { /**数据结构: * emp:7499,ALLEN,SALESMAN,7698,1981/2/20,1600,300,30 * dept:20,RESEARCH,DALLAS */ System.out.println("偏移量:" + key + ",value : " + value.toString()); //1、分词 String[] splits = value.toString().split(","); //2、区别emp和dept if (splits.length >= 8){ //读取的是emp表数据 String empName = splits[1]; String empDept = splits[7]; key2.set(Integer.parseInt(empDept)); value2.set(empName); }else{ //读取是dept表的数据 String detpNo = splits[0]; String deptName = "*" + splits[1];//加*的目的是标识当前的数据是属于部门表里面的 key2.set(Integer.parseInt(detpNo)); value2.set(deptName); } //3、通过context写出去 context.write(key2,value2); } }
-
编写reducer类
import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Reducer; import java.io.IOException; public class EqualJoinReducer extends Reducer<IntWritable,Text,Text,Text> { Text key4 = new Text(); Text value4 = new Text(); protected void reduce(IntWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException { //从values中取出部门名称和员工的名称 String empNameList = ""; String deptName = ""; for (Text v: values) { String data = v.toString(); int deptFlag = data.indexOf("*"); if (deptFlag != -1 ){//找到包含有*号的数据:部门名称 如*RESEARCH deptName = data.substring(1); }else{ empNameList = data + ";" + empNameList; } } key4.set(deptName); value4.set(empNameList); // 写出去 context.write(key4,value4); } }
-
编写Driver类
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import java.util.Random; public class EqualJoinJob { public static void main(String[] args) throws Exception { Job job = Job.getInstance(new Configuration()); job.setJarByClass(EqualJoinJob.class); //设置Mapper job.setMapperClass(EqualJoinMapper.class); job.setMapOutputKeyClass(IntWritable.class);//key2 job.setMapOutputValueClass(Text.class);//value2 //设置Reducer job.setReducerClass(EqualJoinReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); //先使用本地文件做测试 FileInputFormat.setInputPaths(job,new Path("F:\\NIIT\\hadoopOnWindow\\equaljoin\\")); FileOutputFormat.setOutputPath(job,new Path(getOutputDir())); boolean result = job.waitForCompletion(true); System.out.println("result:" + result); } //用于产生随机输出目录 public static String getOutputDir(){ String prefix = "F:\\NIIT\\hadoopOnWindow\\output\\"; long time = System.currentTimeMillis(); int random = new Random().nextInt(); return prefix + "result_" + time + "_" + random; } }
-
本地运行代码,测试下结果正确与否
五、打包上传到集群中运行
-
上传emp.csv和dept.csv到hdfs中的datas目录下(join需要两张表的数据都在输入目录中)
-
本地运行测试结果正确后,需要对Driver类输入输出部分代码进行修改,具体修改如下:
FileInputFormat.setInputPaths(job,new Path(args[0]));
FileOutputFormat.setOutputPath(job,new Path(args[1])); -
将程序打成jar包,需要在pom.xml中配置打包插件
<build> <plugins> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-assembly-plugin</artifactId> <configuration> <!-- 使用Maven预配置的描述符--> <descriptorRefs> <descriptorRef>jar-with-dependencies</descriptorRef> </descriptorRefs> </configuration> <executions> <execution> <id>make-assembly</id> <!-- 绑定到package生命周期 --> <phase>package</phase> <goals> <!-- 只运行一次 --> <goal>single</goal> </goals> </execution> </executions> </plugin> </plugins> </build>
按照如下图所示进行操作
-
提交集群运行,执行如下命令:
hadoop jar packagedemo-1.0-SNAPSHOT.jar com.niit.mr.EqualJoinJob /datas/ /output/emp/
至此,所有的步骤已经完成,大家可以试试,祝大家好运~~~~