hadoop-mapreduce map端多表合并

最新推荐文章于 2023-05-28 09:31:19 发布

大铁锤20

最新推荐文章于 2023-05-28 09:31:19 发布

阅读量869

点赞数

分类专栏： hadoop-mapreduce 文章标签： map-join

本文链接：https://blog.csdn.net/datiechui20/article/details/78824587

版权

hadoop-mapreduce 专栏收录该内容

2 篇文章 0 订阅

订阅专栏

package cn.nyzc.mapjoin;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Map-side join mapper.
 *
 * <p>Before any record is mapped, {@link #setup} loads a small tab-separated
 * lookup file into an in-memory map (first column -> second column). Each
 * input record is then extended with the value looked up by the record's
 * second tab-separated field and written out as the output key; the output
 * value is always {@link NullWritable}.
 *
 * Company: www.xnlc.cn
 * @author 黄庭华
 * @date 2016-07-17 12:46:53
 * @version 1.0
 */
public class MapJoinMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    // In-memory copy of the small lookup file: first column -> second column.
    Map<String, String> buf = new HashMap<>();

    // Reusable container for the output key, so no new Text is allocated per record.
    Text text = new Text();

    /**
     * Loads the small lookup file into {@link #buf} before any call to
     * {@code map}.
     *
     * <p>The file is read as UTF-8, one tab-separated record per line. Empty
     * lines and lines with fewer than two fields are skipped; reading
     * continues until end of file.
     *
     * @param context the task context (not used by this method)
     * @throws IOException if the lookup file cannot be opened or read
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // NOTE(review): the path is hard-coded; the original author's comment says the
        // full path is only needed on Windows and is unnecessary on Linux.
        // try-with-resources closes the reader even if reading or parsing fails.
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(new FileInputStream("e:/fortest/pd.txt"), "UTF-8"))) {
            String line;
            // Read to end of file (null); an empty line must not end the load early.
            while ((line = in.readLine()) != null) {
                if (StringUtils.isEmpty(line)) {
                    continue;
                }
                String[] values = line.split("\t");
                // A line without a second column cannot supply a lookup value.
                if (values.length < 2) {
                    continue;
                }
                buf.put(values[0], values[1]);
            }
        }
    }

    /**
     * Appends the looked-up value to one input record and emits it.
     *
     * <p>The record is split on tabs and its second field is used as the key
     * into {@link #buf}. The output key is the original line, a tab, and the
     * looked-up value. When the key is absent from the lookup table, the
     * string concatenation yields the literal text {@code null}. Records with
     * fewer than two fields are skipped.
     *
     * @param key     the input key supplied by the framework (not used)
     * @param value   one line of the large input
     * @param context the task context used to emit the joined record
     * @throws IOException          if writing the output fails
     * @throws InterruptedException if writing the output is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] values = line.split("\t");

        // Blank or single-field lines have no join key; skip them rather than fail the task.
        if (values.length < 2) {
            return;
        }

        String pname = buf.get(values[1]);
        text.set(line + "\t" + pname);

        context.write(text, NullWritable.get());
    }

}

//================================

package cn.nyzc.mapjoin;

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Job driver for the map-side join.
 *
 * <p>Configures a map-only job that runs {@link MapJoinMapper}, registers the
 * small lookup file as a cache file, and submits the job.
 *
 * Company: www.xnlc.cn
 * @author 黄庭华
 * @date 2016-07-17 13:04:16
 * @version 1.0
 */
public class MapJoinDriver {

    /**
     * Configures and submits the job, then exits the JVM with status 0 on
     * success and 1 on failure.
     *
     * @param args {@code args[0]} is the input path and {@code args[1]} the
     *             output path; when fewer than two arguments are given, the
     *             built-in defaults {@code e:/fortest/input03} and
     *             {@code e:/output01} are used
     * @throws Exception if job setup or submission fails
     */
    public static void main(String[] args) throws Exception {

        // Fall back to the built-in paths only when the caller supplied none,
        // so paths passed on the command line are honored.
        if (args == null || args.length < 2) {
            args = new String[] { "e:/fortest/input03", "e:/output01" };
        }

        // 1. Create the job object.
        Job job = Job.getInstance(new Configuration());

        // 2. Locate the jar through the driver class.
        job.setJarByClass(MapJoinDriver.class);

        // 3. Set the mapper class.
        job.setMapperClass(MapJoinMapper.class);
        // This simple case needs no reduce task.
        job.setNumReduceTasks(0);
        // Register the small file that should be loaded into the cache.
        job.addCacheFile(new URI("file:///e:/fortest/pd.txt"));

        // 4. Set the output types; per the original author's note, the mapper
        //    output types do not need to be set separately here.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // 5. Set the input and output paths.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 6. Submit the job and wait for it to finish.
        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }

}