My First M/R Program

Code:


import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class MyJob extends Configured implements Tool {
    
    // The mapper inverts each (key, value) pair, so that records end up
    // grouped by their original value.
    public static class MapClass extends MapReduceBase
        implements Mapper<Text, Text, Text, Text> {
        
        public void map(Text key, Text value,
                        OutputCollector<Text, Text> output,
                        Reporter reporter) throws IOException {
                        
            output.collect(value, key);
        }
    }
    
    // The reducer concatenates all original keys that share the same value
    // into a single comma-separated string.
    public static class Reduce extends MapReduceBase
        implements Reducer<Text, Text, Text, Text> {
        
        public void reduce(Text key, Iterator<Text> values,
                           OutputCollector<Text, Text> output,
                           Reporter reporter) throws IOException {
                           
            String csv = "";
            while (values.hasNext()) {
                if (csv.length() > 0) csv += ",";
                csv += values.next().toString();
            }
            output.collect(key, new Text(csv));
        }
    }
    
    public int run(String[] args) throws Exception {
        // getConf() returns the Configuration prepared by ToolRunner, with
        // any generic command-line options already applied.
        Configuration conf = getConf();
        
        JobConf job = new JobConf(conf, MyJob.class);
        
        Path in = new Path(args[0]);
        Path out = new Path(args[1]);
        FileInputFormat.setInputPaths(job, in);
        FileOutputFormat.setOutputPath(job, out);
        
        job.setJobName("MyJob");
        job.setMapperClass(MapClass.class);
        job.setReducerClass(Reduce.class);
        
        // KeyValueTextInputFormat splits each input line into a (key, value)
        // pair at the first occurrence of the configured separator.
        job.setInputFormat(KeyValueTextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.set("key.value.separator.in.input.line", ",");
        
        JobClient.runJob(job);
        
        return 0;
    }
    
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new MyJob(), args);
        
        System.exit(res);
    }
}
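
To see what the job actually does: with key.value.separator.in.input.line set to ",", KeyValueTextInputFormat splits each input line at the first comma into a (key, value) pair, the mapper inverts the pair, and the reducer joins every original key sharing the same value into one comma-separated list. On a hypothetical input (invented for illustration; the real contents of /user/hive/warehouse/qq are not shown here), and keeping in mind that the order of values within a group is not guaranteed, the result would look like this:

Input lines:

1,Alice
2,Alice
3,Bob

Output lines (TextOutputFormat separates key and value with a tab):

Alice	1,2
Bob	3

One caveat in the reducer: building the CSV with repeated String concatenation is quadratic in the number of values per key, which can get expensive for very large groups. A minimal sketch of the same reduce method using StringBuilder instead (my adjustment, not part of the original code):

        public void reduce(Text key, Iterator<Text> values,
                           OutputCollector<Text, Text> output,
                           Reporter reporter) throws IOException {
                           
            // Accumulate into a StringBuilder: linear instead of quadratic cost.
            StringBuilder csv = new StringBuilder();
            while (values.hasNext()) {
                if (csv.length() > 0) csv.append(',');
                csv.append(values.next().toString());
            }
            output.collect(key, new Text(csv.toString()));
        }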

Compile:

javac -classpath hadoop-0.20.2-core.jar -d ~/playground/classes/ MyJob.java

jar -cvf ~/hadoop/myjob.jar -C ~/playground/classes/ .
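
To sanity-check the jar before submitting it, standard JDK tooling can list its contents (this check is my addition, not part of the original post); it should show MyJob.class, MyJob$MapClass.class and MyJob$Reduce.class:

jar -tf ~/hadoop/myjob.jar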

Run:

hadoop jar myjob.jar MyJob /user/hive/warehouse/qq ~/hadoop/aa
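
Two notes on this command (my additions, not from the original post): the output path, although it looks like a local path, is resolved against the default filesystem, i.e. HDFS, which is why the results have to be fetched back with -getmerge afterwards; and FileOutputFormat refuses to start a job whose output directory already exists, so before a rerun the old output has to be deleted first, e.g. with the 0.20-era shell:

hadoop dfs -rmr ~/hadoop/aa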



Run log:

11/10/26 19:04:31 INFO mapred.FileInputFormat: Total input paths to process : 1
11/10/26 19:04:31 INFO net.NetworkTopology: Adding a new node: /default-rack/10.11.1.233:50010
11/10/26 19:04:31 INFO net.NetworkTopology: Adding a new node: /default-rack/10.11.1.232:50010
11/10/26 19:04:31 INFO net.NetworkTopology: Adding a new node: /default-rack/10.11.1.230:50010
11/10/26 19:04:31 INFO net.NetworkTopology: Adding a new node: /default-rack/10.11.1.236:50010
11/10/26 19:04:31 INFO net.NetworkTopology: Adding a new node: /default-rack/10.11.1.223:50010
11/10/26 19:04:32 INFO mapred.JobClient: Running job: job_201110151042_27056
11/10/26 19:04:33 INFO mapred.JobClient:  map 0% reduce 0%
11/10/26 19:04:45 INFO mapred.JobClient:  map 10% reduce 0%
11/10/26 19:04:46 INFO mapred.JobClient:  map 12% reduce 0%
11/10/26 19:04:47 INFO mapred.JobClient:  map 23% reduce 0%
11/10/26 19:04:48 INFO mapred.JobClient:  map 50% reduce 0%
11/10/26 19:04:49 INFO mapred.JobClient:  map 52% reduce 0%
11/10/26 19:04:50 INFO mapred.JobClient:  map 69% reduce 0%
11/10/26 19:04:51 INFO mapred.JobClient:  map 78% reduce 0%
11/10/26 19:04:53 INFO mapred.JobClient:  map 96% reduce 0%
11/10/26 19:04:54 INFO mapred.JobClient:  map 100% reduce 1%
11/10/26 19:04:55 INFO mapred.JobClient:  map 100% reduce 3%
11/10/26 19:04:57 INFO mapred.JobClient:  map 100% reduce 12%
11/10/26 19:04:58 INFO mapred.JobClient:  map 100% reduce 17%
11/10/26 19:04:59 INFO mapred.JobClient:  map 100% reduce 25%
11/10/26 19:05:00 INFO mapred.JobClient:  map 100% reduce 27%
11/10/26 19:05:01 INFO mapred.JobClient:  map 100% reduce 29%
11/10/26 19:05:02 INFO mapred.JobClient:  map 100% reduce 33%
11/10/26 19:05:04 INFO mapred.JobClient:  map 100% reduce 39%
11/10/26 19:05:05 INFO mapred.JobClient:  map 100% reduce 50%
11/10/26 19:05:06 INFO mapred.JobClient:  map 100% reduce 54%
11/10/26 19:05:07 INFO mapred.JobClient:  map 100% reduce 59%
11/10/26 19:05:08 INFO mapred.JobClient:  map 100% reduce 79%
11/10/26 19:05:09 INFO mapred.JobClient:  map 100% reduce 88%
11/10/26 19:05:10 INFO mapred.JobClient:  map 100% reduce 93%
11/10/26 19:05:11 INFO mapred.JobClient:  map 100% reduce 94%
11/10/26 19:05:12 INFO mapred.JobClient:  map 100% reduce 98%
11/10/26 19:05:13 INFO mapred.JobClient:  map 100% reduce 100%
11/10/26 19:05:15 INFO mapred.JobClient: Job complete: job_201110151042_27056
11/10/26 19:05:15 INFO mapred.JobClient: Counters: 19
11/10/26 19:05:15 INFO mapred.JobClient:   Job Counters
11/10/26 19:05:15 INFO mapred.JobClient:     Launched reduce tasks=50
11/10/26 19:05:15 INFO mapred.JobClient:     Rack-local map tasks=41
11/10/26 19:05:15 INFO mapred.JobClient:     Launched map tasks=50
11/10/26 19:05:15 INFO mapred.JobClient:     Data-local map tasks=9
11/10/26 19:05:15 INFO mapred.JobClient:   FileSystemCounters
11/10/26 19:05:15 INFO mapred.JobClient:     FILE_BYTES_READ=90810133
11/10/26 19:05:15 INFO mapred.JobClient:     HDFS_BYTES_READ=268556993
11/10/26 19:05:15 INFO mapred.JobClient:     FILE_BYTES_WRITTEN=195724036
11/10/26 19:05:15 INFO mapred.JobClient:     HDFS_BYTES_WRITTEN=71469833
11/10/26 19:05:15 INFO mapred.JobClient:   Map-Reduce Framework
11/10/26 19:05:15 INFO mapred.JobClient:     Reduce input groups=3258984
11/10/26 19:05:15 INFO mapred.JobClient:     Combine output records=0
11/10/26 19:05:15 INFO mapred.JobClient:     Map input records=16522439
11/10/26 19:05:15 INFO mapred.JobClient:     Reduce shuffle bytes=102906053
11/10/26 19:05:15 INFO mapred.JobClient:     Reduce output records=3258984
11/10/26 19:05:15 INFO mapred.JobClient:     Spilled Records=33044878
11/10/26 19:05:15 INFO mapred.JobClient:     Map output bytes=264075431
11/10/26 19:05:15 INFO mapred.JobClient:     Map input bytes=264075431
11/10/26 19:05:15 INFO mapred.JobClient:     Combine input records=0
11/10/26 19:05:15 INFO mapred.JobClient:     Map output records=16522439
11/10/26 19:05:15 INFO mapred.JobClient:     Reduce input records=16522439
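
A quick consistency check on these counters: Map input records = Map output records = Reduce input records = 16522439, as expected for a mapper that emits exactly one pair per input record, and Reduce input groups = Reduce output records = 3258984, i.e. one CSV output line per distinct key. The Combine counters are 0 because no combiner is configured.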


Finally, a working M/R job! To pull merged results from HDFS down to the local filesystem:

hadoop dfs -getmerge /user/hive/warehouse/xxx  ~/hadoop/aa.gz

gunzip aa.gz
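
-getmerge concatenates every file under the given HDFS directory into a single local file. The .gz suffix suggests the stored files are gzip-compressed; since concatenated gzip streams form a valid gzip stream themselves, gunzip can decompress the merged file in one pass.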
