hadoop 处理不同的输入文件，文件关联

最新推荐文章于 2022-04-19 23:57:25 发布

果冻还不错

最新推荐文章于 2022-04-19 23:57:25 发布

阅读量488

点赞数

类型一：一一对应

file1：

a 1
b 2
c 3

file2：

1 !
2 @
3 #

file1和file2进行关联，想要的结果：

a !

b @

c #

思路：

1、标记不同输入文件

2、将file1的key、value颠倒；file1和file2的key相同，file1的value做key，file2的value做value ，输出。

程序：

[java]view plaincopy 
   
 package smiple;  
   
 import java.io.IOException;  
 import java.util.StringTokenizer;  
   
 import org.apache.hadoop.conf.Configuration;  
 import org.apache.hadoop.fs.Path;  
 import org.apache.hadoop.io.LongWritable;  
 import org.apache.hadoop.io.Text;  
 import org.apache.hadoop.mapreduce.InputSplit;  
 import org.apache.hadoop.mapreduce.Job;  
 import org.apache.hadoop.mapreduce.Mapper;  
 import org.apache.hadoop.mapreduce.Reducer;  
 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;  
 import org.apache.hadoop.mapreduce.lib.input.FileSplit;  
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;  
 import org.apache.hadoop.util.GenericOptionsParser;  
   
 public class FileJoin {  
   
     public static class MyMap extends Mapper<LongWritable , Text, Text, Text> {  
   
         public void map(LongWritable key, Text value, Context context)throws IOException, InterruptedException {  
 //          String line = value.toString();  
             String line=new String(value.getBytes(),0,value.getLength(),"GBK");  
             StringTokenizer tokenizer = new StringTokenizer(line);  
             String keystr = tokenizer.nextToken();  
             String valuestr = tokenizer.nextToken();  
               
             //获取文件名  
             InputSplit inputSplit = context.getInputSplit();  
             String fileName = ((FileSplit) inputSplit).getPath().getName();  
               
               
             if("file1".equals(fileName)){//加标记  
                 context.write(new Text(valuestr),new Text("file1_"+keystr));  
             }else if("file2".equals(fileName)){  
                 context.write(new Text(keystr), new Text("file2_"+valuestr));  
             }  
               
         }  
     }  
   
     public static class MyReduce extends Reducer<Text, Text, Text, Text> {  
   
         public void reduce(Text key, Iterable<Text> values,Context context) throws IOException, InterruptedException {  
             Text resultKey = new Text("key0");  
             Text resultValue = new Text("value0");  
             for (Text val : values) {  
                 if("file1_".equals(val.toString().substring(0, 6))){  
                     resultKey = new Text(val.toString().substring(6));  
                 }else if("file2_".equals(val.toString().substring(0, 6))){  
                     resultValue = new Text(val.toString().substring(6));  
                 }  
             }  
             System.out.println(resultKey.toString()+"   " + resultValue.toString());  
             context.write(resultKey, resultValue);  
         }  
     }  
   
     public static void main(String[] args) throws Exception {  
         Configuration conf = new Configuration();  
         String[] ioArgs = new String[] { "hdfs://ip:port/mr/join/in","hdfs://ip:port/mr/join/out" };  
         String[] otherArgs = new GenericOptionsParser(conf, ioArgs).getRemainingArgs();  
   
         if (otherArgs.length != 2) {  
             System.err.println("Usage: Data Sort <in> <out>");  
             System.exit(2);  
         }  
         Job job = new Job(conf, "file join ");  
   
         job.setJarByClass(Sort.class);  
   
         // 设置Map和Reduce处理类  
         job.setMapperClass(MyMap.class);  
         job.setReducerClass(MyReduce.class);  
   
         // 设置输出类型  
         job.setOutputKeyClass(Text.class);  
         job.setOutputValueClass(Text.class);  
   
         // 设置输入和输出目录  
         FileInputFormat.addInputPath(job, new Path(otherArgs[0]));  
         FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));  
   
         System.exit(job.waitForCompletion(true) ? 0 : 1);  
   
     }  
   
 }  

结果：

果冻还不错

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
hadoop 处理不同的输入文件，文件关联

类型一：一一对应file1：a 1b 2c 3file2：1 ！2 @3 #file1和file2进行关联，想要的结果：a !b @3 #思路：1、标记不同输入文件2、将file1的key、value颠倒；file1和file2的key相同，file1
复制链接

扫一扫