前面说了个MapReduce之Reduce端Join实现,Reduce端做join,在数据量比较大的时候往往会内存不足,还在Map端白白做了很多工作,这样是不经济的。这回咱们也来说个在Map端的Join实现。在数据量较大的时候,如果一个表的数据很多,而另外一个表的数据较少,可以把数据较少的放到Cache中,在map的时候就把要关联的信息直接给带上,而不用到Reduce的时候再去找,这样当然会节省很多。下面来看看具体的实现代码,网上也有很多类似,我也是在别人的例子基础上,按照上篇的格式做了修改。
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class TestMapSideJoin {
public static class MyMapper extends Mapper<Object, Text, Text, Text> {
    /** Join key -> product field, loaded from the cached small table (pd.csv) in setup(). */
    private Map<String, String> pdDataMap = new HashMap<String, String>();

    /**
     * Loads the small-side table pd.csv from the distributed cache into memory,
     * so map() can perform the join locally without a reduce-side lookup.
     *
     * @throws IOException if no cache files are registered or pd.csv cannot be read;
     *         failing fast here is better than silently joining against an empty table
     *         (the original version swallowed the exception and printed a stack trace).
     */
    @Override
    public void setup(Context context) throws IOException {
        // Local paths of every file registered via DistributedCache.addCacheFile().
        Path[] paths = DistributedCache.getLocalCacheFiles(context.getConfiguration());
        if (paths == null) {
            throw new IOException("No distributed cache files found; expected pd.csv");
        }
        for (Path path : paths) {
            if (!path.toString().contains("pd.csv")) {
                continue;
            }
            // Explicit charset: FileReader would silently use the platform default.
            // try-with-resources replaces the original manual finally/close dance.
            try (BufferedReader in = new BufferedReader(new InputStreamReader(
                    new FileInputStream(path.toString()), StandardCharsets.UTF_8))) {
                String line;
                while ((line = in.readLine()) != null) {
                    String[] pdInfo = line.split(",");
                    // Cache file layout: join key in column 0, joined value in column 2.
                    // Guard against short/blank lines instead of throwing AIOOBE.
                    if (pdInfo.length >= 3) {
                        pdDataMap.put(pdInfo[0], pdInfo[2]);
                    }
                }
            }
        }
    }

    /**
     * Joins each sale record (from sale.csv) against the cached product table and
     * emits (join key, "cachedValue,quantity*price"). Input from any other file,
     * malformed records, and records with no match in the cache are skipped
     * (inner-join semantics; the original emitted the literal string "null" instead).
     */
    @Override
    public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        // Decide by source file which table this record belongs to.
        String pathName = ((FileSplit) context.getInputSplit()).getPath().toString();
        if (!pathName.endsWith("sale.csv")) {
            return;
        }
        String[] ss = value.toString().split(",");
        if (ss.length < 3) {
            return; // malformed sale record
        }
        String joinKey = ss[0];
        // Sale amount = quantity * unit price, truncated to int as in the original output.
        double amount = Double.parseDouble(ss[1]) * Double.parseDouble(ss[2]);
        String name = pdDataMap.get(joinKey); // the map-side join lookup
        if (name == null) {
            return; // no matching row in the small table
        }
        context.write(new Text(joinKey), new Text(name + "," + (int) amount));
    }
}
public static class MyReducer extends Reducer<Text, Text, Text, Text> {
    /** Reused output value object (standard Hadoop pattern to avoid per-record allocation). */
    private Text result = new Text();

    /**
     * Concatenates all values for a key (no separator — with a map-side join each
     * key normally carries exactly one already-joined value) and emits one record.
     * Uses StringBuilder instead of the original String += in a loop, which is O(n^2);
     * the debug println on every value has also been removed.
     */
    @Override
    public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        StringBuilder joined = new StringBuilder();
        for (Text val : values) {
            joined.append(val.toString());
        }
        result.set(joined.toString());
        context.write(key, result);
    }
}
/**
 * HDFS location of the small-side table distributed to every mapper.
 * NOTE(review): the field name has a typo ("Inpu" for "Input"); kept as-is because it
 * is public and renaming it would break external callers. The hard-coded host/port
 * ties this example to one cluster — consider passing it as an argument.
 */
public static String InpuCacheFile = "hdfs://192.168.202.128:9000/input/pd.csv";

/**
 * Configures and submits the map-side join job.
 * Arguments: one or more input directories followed by an output directory.
 * A timestamped subdirectory is created under the output path so repeated runs
 * do not collide with an existing output directory.
 */
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Register the small table BEFORE submitting the job so mappers see it in setup().
    DistributedCache.addCacheFile(new Path(InpuCacheFile).toUri(), conf);
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        // Fixed: the original message said "wordcount", left over from the example it was copied from.
        System.err.println("Usage: TestMapSideJoin <in> [<in>...] <out>");
        System.exit(2);
    }
    // Job.getInstance replaces the deprecated new Job(Configuration, String) constructor.
    Job job = Job.getInstance(conf, "testjoin");
    job.setJarByClass(TestMapSideJoin.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    // All arguments except the last are input paths.
    for (int i = 0; i < otherArgs.length - 1; ++i) {
        FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
    }
    // Unique output subdirectory per run, keyed by the current timestamp.
    long t = System.currentTimeMillis();
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1] + "/" + t));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
编译好之后,我在Eclipse插件中用Run on hadoop方式运行却失败,报错说找不到分布式缓存文件的路径。这种运行方式是行不通的;正确的做法是打包成jar文件后上传到hadoop集群上执行:
[hadoop@vm11 bin]$ ./hadoop jar ../testjoin.jar hdfs://192.168.202.128:9000/input/ hdfs://192.168.202.128:9000/result
这样就没有错误了,很容易看到结果正是想要的。