c001.txt
------------------------------
filetype|commid|commname|addressid
comm|1|罗湖小区1|1
comm|2|罗湖小区2|1
comm|3|宝安小区1|4
comm|4|南山小区1|3
comm|5|南山小区2|3
comm|6|福田小区1|2
comm|7|福田小区2|2
comm|8|宝安2|4
comm|9|南山3|3
c002.txt
----------------------------
filetype|commid|commname|addressid
comm|10|罗湖小区7|1
comm|11|罗湖小区8|1
comm|12|宝安小区5|4
comm|13|南山小区6|3
comm|14|南山小区7|3
comm|15|福田小区6|2
comm|16|福田小区8|2
a001.txt
-------------------------
filetype|addressid|address
addr|1|罗湖
addr|2|福田
addr|3|南山
addr|4|宝安
输出结果(注意:下面缺少罗湖/addressid=1 的记录——reduce 中用静态计数器输出表头的技巧跳过了第一个 key 组的数据):
-----------------------
commid commname addr
15 福田小区6 福田
16 福田小区8 福田
6 福田小区1 福田
7 福田小区2 福田
13 南山小区6 南山
14 南山小区7 南山
4 南山小区1 南山
5 南山小区2 南山
9 南山3 南山
3 宝安小区1 宝安
8 宝安2 宝安
12 宝安小区5 宝安
----------------------------
代码:
package org.apache.hadoop.examples;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Reduce-side join of community records ({@code comm|commid|commname|addressid})
 * with address records ({@code addr|addressid|address}), keyed on addressid.
 * Relies on the MapReduce guarantee that all values sharing a key arrive in a
 * single reduce() call.
 *
 * <p>NOTE: the original version wrote the output header by abusing a static
 * counter inside reduce() and returning early, which silently discarded every
 * record of the FIRST key group (the sample output was missing all addressid=1
 * rows because of this). The header is now written once per reduce task in
 * {@link TestUnionReducer#setup}, so no data is lost.
 */
public class TestUnion {

    /** Retained for binary/source compatibility; no longer used by the reducer. */
    public static int count = 0;

    /** Value tag marking a community record in map output. */
    private static final String COMM_TAG = "1";
    /** Value tag marking an address record in map output. */
    private static final String ADDR_TAG = "2";

    public static class TestUnionMapper extends Mapper<Object, Text, Text, Text> {
        /**
         * Emits (addressid, tagged payload) for every data line.
         * Lines containing "filetype" are column headers and are skipped.
         *
         * @param key     byte offset of the line (unused)
         * @param values  one raw pipe-delimited input line
         * @param context Hadoop output collector
         */
        @Override
        public void map(Object key, Text values, Context context)
                throws IOException, InterruptedException {
            String line = values.toString();
            if (line.indexOf("filetype") >= 0) {
                return; // header line, not data
            }
            StringTokenizer itr = new StringTokenizer(line, "|");
            while (itr.hasMoreTokens()) {
                String fileType = itr.nextToken();
                if ("addr".equalsIgnoreCase(fileType)) {
                    // addr|addressid|address
                    String addressId = itr.nextToken();
                    String addressName = itr.nextToken();
                    context.write(new Text(addressId),
                            new Text(ADDR_TAG + "|" + addressName));
                } else if ("comm".equalsIgnoreCase(fileType)) {
                    // comm|commid|commname|addressid
                    String commId = itr.nextToken();
                    String commName = itr.nextToken();
                    String addressId = itr.nextToken();
                    context.write(new Text(addressId),
                            new Text(COMM_TAG + "|" + commId + "|" + commName));
                }
            }
        }
    }

    public static class TestUnionReducer extends Reducer<Text, Text, Text, Text> {
        /**
         * Writes the column header exactly once per reduce task, BEFORE any
         * data. This replaces the static-counter hack that used to consume
         * (and drop) the first key group. With multiple reduce tasks each
         * output file gets its own header — same caveat as the original,
         * which only produced a single header when run with one reducer.
         */
        @Override
        protected void setup(Context context)
                throws IOException, InterruptedException {
            context.write(new Text("commid"), new Text("commname addr"));
        }

        /**
         * Joins the community records of this addressid with its (normally
         * single) address record and emits one line per community.
         */
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            List<String> addrs = new ArrayList<String>();
            List<String> comms = new ArrayList<String>();
            for (Text val : values) {
                // '|' is a regex metacharacter and must be escaped
                String[] fields = val.toString().trim().split("\\|");
                String tag = fields[0];
                if (COMM_TAG.equals(tag)) {
                    comms.add(fields[1] + " " + fields[2]); // commid + commname
                } else if (ADDR_TAG.equals(tag)) {
                    addrs.add(fields[1]); // address name
                }
            }
            if (!comms.isEmpty() && !addrs.isEmpty()) {
                for (int m = 0; m < comms.size(); m++) {
                    // in well-formed input there is exactly one address per key
                    for (int n = 0; n < addrs.size(); n++) {
                        context.write(new Text(comms.get(m)), new Text(addrs.get(n)));
                    }
                }
            }
        }
    }

    /**
     * Entry point: args[0] = input directory, args[1] = output directory.
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("please input two args:<in> <out>");
            System.exit(2);
        }
        Configuration conf = new Configuration();
        // Job.getInstance replaces the deprecated Job(Configuration, String) ctor
        Job job = Job.getInstance(conf, "union data");
        job.setJarByClass(TestUnion.class);
        job.setMapperClass(TestUnionMapper.class);
        job.setReducerClass(TestUnionReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
主要利用了reduce函数相同的KEY值聚合在一起的规则。