前面说了个MapReduce之Reduce端Join实现,Reduce端做join,在数据量比较大的时候往往会内存不足,还在Map端白白做了很多工作,这样是不经济的。这回咱们也来说个在Map端的Join实现。在数据量较大的时候,如果一个表的数据很多,而另外一个表的数据较少,可以把数据较少的放到Cache中,在map的时候就把要关联的信息直接给带上,而不用到Reduce的时候再去找,这样当然会节省很多。下面来看看具体的实现代码,网上也有很多类似,我也是在别人的例子基础上,按照上篇的格式做了修改。
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class TestMapSideJoin {
public static class MyMapper extends Mapper<Object, Text, Text, Text> {
    /** Join key -> product field, loaded from the cached small table (pd.csv) in setup(). */
    private Map<String, String> pdDataMap = new HashMap<String, String>();

    /**
     * Loads the small-side table pd.csv from the distributed cache into memory,
     * so map() can perform the join locally without a reduce-side lookup.
     *
     * @throws IOException if no cache files are registered or pd.csv cannot be read;
     *         failing fast here is better than silently joining against an empty table
     *         (the original version swallowed the exception and printed a stack trace).
     */
    @Override
    public void setup(Context context) throws IOException {
        // Local paths of every file registered via DistributedCache.addCacheFile().
        Path[] paths = DistributedCache.getLocalCacheFiles(context.getConfiguration());
        if (paths == null) {
            throw new IOException("No distributed cache files found; expected pd.csv");
        }
        for (Path path : paths) {
            if (!path.toString().contains("pd.csv")) {
                continue;
            }
            // Explicit charset: FileReader would silently use the platform default.
            // try-with-resources replaces the original manual finally/close dance.
            try (BufferedReader in = new BufferedReader(new InputStreamReader(
                    new FileInputStream(path.toString()), StandardCharsets.UTF_8))) {
                String line;
                while ((line = in.readLine()) != null) {
                    String[] pdInfo = line.split(",");
                    // Cache file layout: join key in column 0, joined value in column 2.
                    // Guard against short/blank lines instead of throwing AIOOBE.
                    if (pdInfo.length >= 3) {
                        pdDataMap.put(pdInfo[0], pdInfo[2]);
                    }
                }
            }
        }
    }

    /**
     * Joins each sale record (from sale.csv) against the cached product table and
     * emits (join key, "cachedValue,quantity*price"). Input from any other file,
     * malformed records, and records with no match in the cache are skipped
     * (inner-join semantics; the original emitted the literal string "null" instead).
     */
    @Override
    public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        // Decide by source file which table this record belongs to.
        String pathName = ((FileSplit) context.getInputSplit()).getPath().toString();
        if (!pathName.endsWith("sale.csv")) {
            return;
        }
        String[] ss = value.toString().split(",");
        if (ss.length < 3) {
            return; // malformed sale record
        }
        String joinKey = ss[0];
        // Sale amount = quantity * unit price, truncated to int as in the original output.
        double amount = Double.parseDouble(ss[1]) * Double.parseDouble(ss[2]);
        String name = pdDataMap.get(joinKey); // the map-side join lookup
        if (name == null) {
            return; // no matching row in the small table
        }
        context.write(new Text(joinKey), new Text(name + "," + (int) amount));
    }
}
public static class MyReducer extends Reducer<Text, Text, Text, Text> {
    /** Reused output value object (standard Hadoop pattern to avoid per-record allocation). */
    private Text result = new Text();

    /**
     * Concatenates all values for a key (no separator — with a map-side join each
     * key normally carries exactly one already-joined value) and emits one record.
     * Uses StringBuilder instead of the original String += in a loop, which is O(n^2);
     * the debug println on every value has also been removed.
     */
    @Override
    public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        StringBuilder joined = new StringBuilder();
        for (Text val : values) {
            joined.append(val.toString());
        }
        result.set(joined.toString());
        context.write(key, result);
    }
}
/**
 * HDFS location of the small-side table distributed to every mapper.
 * NOTE(review): the field name has a typo ("Inpu" for "Input"); kept as-is because it
 * is public and renaming it would break external callers. The hard-coded host/port
 * ties this example to one cluster — consider passing it as an argument.
 */
public static String InpuCacheFile = "hdfs://192.168.202.128:9000/input/pd.csv";

/**
 * Configures and submits the map-side join job.
 * Arguments: one or more input directories followed by an output directory.
 * A timestamped subdirectory is created under the output path so repeated runs
 * do not collide with an existing output directory.
 */
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Register the small table BEFORE submitting the job so mappers see it in setup().
    DistributedCache.addCacheFile(new Path(InpuCacheFile).toUri(), conf);
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        // Fixed: the original message said "wordcount", left over from the example it was copied from.
        System.err.println("Usage: TestMapSideJoin <in> [<in>...] <out>");
        System.exit(2);
    }
    // Job.getInstance replaces the deprecated new Job(Configuration, String) constructor.
    Job job = Job.getInstance(conf, "testjoin");
    job.setJarByClass(TestMapSideJoin.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    // All arguments except the last are input paths.
    for (int i = 0; i < otherArgs.length - 1; ++i) {
        FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
    }
    // Unique output subdirectory per run, keyed by the current timestamp.
    long t = System.currentTimeMillis();
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1] + "/" + t));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
编译好之后,我在Eclipse插件中用Run on hadoop方式运行却失败,报错说找不到分布式缓存文件的路径。这种运行方式是行不通的;正确的做法是打包成jar文件后上传到hadoop集群上执行:
[hadoop@vm11 bin]$ ./hadoop jar ../testjoin.jar hdfs://192.168.202.128:9000/input/ hdfs://192.168.202.128:9000/result
这样就没有错误了,很容易看到结果正是想要的。