代码:
import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class MyJob extends Configured implements Tool {
public static class MapClass extends MapReduceBase
implements Mapper<Text, Text, Text, Text> {
public void map(Text key, Text value,
OutputCollector<Text, Text> output,
Reporter reporter) throws IOException {
output.collect(value, key);
}
}
public static class Reduce extends MapReduceBase
implements Reducer<Text, Text, Text, Text> {
public void reduce(Text key, Iterator<Text> values,
OutputCollector<Text, Text> output,
Reporter reporter) throws IOException {
String csv = "";
while (values.hasNext()) {
if (csv.length() > 0) csv += ",";
csv += values.next().toString();
}
output.collect(key, new Text(csv));
}
}
public int run(String[] args) throws Exception {
Configuration conf = getConf();
JobConf job = new JobConf(conf, MyJob.class);
Path in = new Path(args[0]);
Path out = new Path(args[1]);
FileInputFormat.setInputPaths(job, in);
FileOutputFormat.setOutputPath(job, out);
job.setJobName("MyJob");
job.setMapperClass(MapClass.class);
job.setReducerClass(Reduce.class);
job.setInputFormat(KeyValueTextInputFormat.class);
job.setOutputFormat(TextOutputFormat.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.set("key.value.separator.in.input.line", ",");
JobClient.runJob(job);
return 0;
}
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new Configuration(), new MyJob(), args);
System.exit(res);
}
}
编译:
javac -classpath hadoop-0.20.2-core.jar -d ~/playground/classes/ MyJob.java
jar -cvf ~/hadoop/myjob.jar -C ~/playground/classes/ .
运行:
hadoop jar myjob.jar MyJob /user/hive/warehouse/qq ~/hadoop/aa
运行过程:
11/10/26 19:04:31 INFO mapred.FileInputFormat: Total input paths to process : 1
11/10/26 19:04:31 INFO net.NetworkTopology: Adding a new node: /default-rack/10.11.1.233:50010
11/10/26 19:04:31 INFO net.NetworkTopology: Adding a new node: /default-rack/10.11.1.232:50010
11/10/26 19:04:31 INFO net.NetworkTopology: Adding a new node: /default-rack/10.11.1.230:50010
11/10/26 19:04:31 INFO net.NetworkTopology: Adding a new node: /default-rack/10.11.1.236:50010
11/10/26 19:04:31 INFO net.NetworkTopology: Adding a new node: /default-rack/10.11.1.223:50010
11/10/26 19:04:32 INFO mapred.JobClient: Running job: job_201110151042_27056
11/10/26 19:04:33 INFO mapred.JobClient: map 0% reduce 0%
11/10/26 19:04:45 INFO mapred.JobClient: map 10% reduce 0%
11/10/26 19:04:46 INFO mapred.JobClient: map 12% reduce 0%
11/10/26 19:04:47 INFO mapred.JobClient: map 23% reduce 0%
11/10/26 19:04:48 INFO mapred.JobClient: map 50% reduce 0%
11/10/26 19:04:49 INFO mapred.JobClient: map 52% reduce 0%
11/10/26 19:04:50 INFO mapred.JobClient: map 69% reduce 0%
11/10/26 19:04:51 INFO mapred.JobClient: map 78% reduce 0%
11/10/26 19:04:53 INFO mapred.JobClient: map 96% reduce 0%
11/10/26 19:04:54 INFO mapred.JobClient: map 100% reduce 1%
11/10/26 19:04:55 INFO mapred.JobClient: map 100% reduce 3%
11/10/26 19:04:57 INFO mapred.JobClient: map 100% reduce 12%
11/10/26 19:04:58 INFO mapred.JobClient: map 100% reduce 17%
11/10/26 19:04:59 INFO mapred.JobClient: map 100% reduce 25%
11/10/26 19:05:00 INFO mapred.JobClient: map 100% reduce 27%
11/10/26 19:05:01 INFO mapred.JobClient: map 100% reduce 29%
11/10/26 19:05:02 INFO mapred.JobClient: map 100% reduce 33%
11/10/26 19:05:04 INFO mapred.JobClient: map 100% reduce 39%
11/10/26 19:05:05 INFO mapred.JobClient: map 100% reduce 50%
11/10/26 19:05:06 INFO mapred.JobClient: map 100% reduce 54%
11/10/26 19:05:07 INFO mapred.JobClient: map 100% reduce 59%
11/10/26 19:05:08 INFO mapred.JobClient: map 100% reduce 79%
11/10/26 19:05:09 INFO mapred.JobClient: map 100% reduce 88%
11/10/26 19:05:10 INFO mapred.JobClient: map 100% reduce 93%
11/10/26 19:05:11 INFO mapred.JobClient: map 100% reduce 94%
11/10/26 19:05:12 INFO mapred.JobClient: map 100% reduce 98%
11/10/26 19:05:13 INFO mapred.JobClient: map 100% reduce 100%
11/10/26 19:05:15 INFO mapred.JobClient: Job complete: job_201110151042_27056
11/10/26 19:05:15 INFO mapred.JobClient: Counters: 19
11/10/26 19:05:15 INFO mapred.JobClient: Job Counters
11/10/26 19:05:15 INFO mapred.JobClient: Launched reduce tasks=50
11/10/26 19:05:15 INFO mapred.JobClient: Rack-local map tasks=41
11/10/26 19:05:15 INFO mapred.JobClient: Launched map tasks=50
11/10/26 19:05:15 INFO mapred.JobClient: Data-local map tasks=9
11/10/26 19:05:15 INFO mapred.JobClient: FileSystemCounters
11/10/26 19:05:15 INFO mapred.JobClient: FILE_BYTES_READ=90810133
11/10/26 19:05:15 INFO mapred.JobClient: HDFS_BYTES_READ=268556993
11/10/26 19:05:15 INFO mapred.JobClient: FILE_BYTES_WRITTEN=195724036
11/10/26 19:05:15 INFO mapred.JobClient: HDFS_BYTES_WRITTEN=71469833
11/10/26 19:05:15 INFO mapred.JobClient: Map-Reduce Framework
11/10/26 19:05:15 INFO mapred.JobClient: Reduce input groups=3258984
11/10/26 19:05:15 INFO mapred.JobClient: Combine output records=0
11/10/26 19:05:15 INFO mapred.JobClient: Map input records=16522439
11/10/26 19:05:15 INFO mapred.JobClient: Reduce shuffle bytes=102906053
11/10/26 19:05:15 INFO mapred.JobClient: Reduce output records=3258984
11/10/26 19:05:15 INFO mapred.JobClient: Spilled Records=33044878
11/10/26 19:05:15 INFO mapred.JobClient: Map output bytes=264075431
11/10/26 19:05:15 INFO mapred.JobClient: Map input bytes=264075431
11/10/26 19:05:15 INFO mapred.JobClient: Combine input records=0
11/10/26 19:05:15 INFO mapred.JobClient: Map output records=16522439
11/10/26 19:05:15 INFO mapred.JobClient: Reduce input records=16522439
终于可以写成功一个M/R了
hadoop dfs -getmerge /user/hive/warehouse/xxx ~/hadoop/aa.gz
gunzip aa.gz